All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: "ACGAATTCCG". When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.
Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.
Example:
Input: s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT" Output: ["AAAAACCCCC", "CCCCCAAAAA"]
Approach #1: C++.
class Solution { public: vector<string> findRepeatedDnaSequences(string s) { vector<string> ans; vector<int> appear((1<<20)+1, 0); int len = s.length(); for (int i = 0, j = 9; j < len; ++i, ++j) { int value = 0; for (int k = i; k <= j; ++k) { value = (value << 2) + helper(s[k]); } appear[value]++; if (appear[value] == 2) { ans.push_back(s.substr(i, 10)); } } return ans; } private: int helper(char c) { if (c == 'A') return 0; else if (c == 'C') return 1; else if (c == 'G') return 2; else return 3; } };
Approach #2: Java.
class Solution { public List<String> findRepeatedDnaSequences(String s) { Set seen = new HashSet(), repeated = new HashSet(); for (int i = 0; i+9 <s.length(); ++i) { String ten = s.substring(i, i+10); if (!seen.add(ten)) repeated.add(ten); } return new ArrayList(repeated); } }
Approach #3: Python.
class Solution(object): def findRepeatedDnaSequences(self, s): """ :type s: str :rtype: List[str] """ sequences = collections.defaultdict(int) #set '0' as the default value for non-existing keys for i in range(len(s)): sequences[s[i:i+10]] += 1#add 1 to the count return [key for key, value in sequences.iteritems() if value > 1] #extract the relevant keys
Time Submitted | Status | Runtime | Language |
---|---|---|---|
a few seconds ago | Accepted | 92 ms | python |
9 minutes ago | Accepted | 39 ms | java |
12 minutes ago | Accepted | 56 ms | cpp |
Analysis:
hash[key] = value.
key represent hash key which don't have the repeated element, we can use value = (value << 2) + helper(s[i]) to generate.