-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy path__init__.py
More file actions
82 lines (62 loc) · 2.98 KB
/
__init__.py
File metadata and controls
82 lines (62 loc) · 2.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
from typing import List, Dict
def find_repeated_dna_sequences_naive(dna_sequence: str) -> List[str]:
"""
Finds all repeated DNA sequences in a given string.
A repeated DNA sequence is a subsequence that appears more than once in the given string.
The function returns a list of all repeated DNA sequences found in the given string.
Parameters:
dna_sequence (str): The string to search for repeated DNA sequences.
Returns:
List[str]
"""
if len(dna_sequence) <= 10:
return []
result_set = set()
seen = set()
for idx in range(len(dna_sequence)):
subsequence = dna_sequence[idx:idx+10]
if len(subsequence) < 10:
continue
if subsequence in seen:
result_set.add(subsequence)
else:
seen.add(subsequence)
return list(result_set)
def find_repeated_dna_sequences(dna_sequence: str) -> List[str]:
"""
Finds all repeated DNA sequences in a given string.
A repeated DNA sequence is a subsequence that appears more than once in the given string.
The function returns a list of all repeated DNA sequences found in the given string.
Parameters:
dna_sequence (str): The string to search for repeated DNA sequences.
Returns:
List[str]
"""
to_int = {"A": 0, "C": 1, "G": 2, "T": 3}
# Validate input contains only valid DNA bases
if not all(c in to_int for c in dna_sequence):
raise ValueError(f"DNA sequence contains invalid characters. Only A, C, G, T are allowed.")
encoded_sequence = [to_int[c] for c in dna_sequence]
dna_sequence_substr_length, dna_sequence_length = 10, len(dna_sequence) # Length of DNA sequence to check
if dna_sequence_length <= dna_sequence_substr_length:
return []
base_a_encoding = 4 # Base-4 encoding
rolling_hash_value = 0
seen_hashes, output = set(), set()
a_k = 1 # Stores a^k for hash updates
# # Compute the initial hash using base-4 multiplication
for i in range(dna_sequence_substr_length):
rolling_hash_value = rolling_hash_value * base_a_encoding + encoded_sequence[i]
a_k *= base_a_encoding # Precompute a^k for later use in rolling hash updates
seen_hashes.add(rolling_hash_value) # Store the initial hash
# Sliding window approach to update the hash efficiently
for start in range(1, dna_sequence_length - dna_sequence_substr_length + 1):
# Remove the leftmost character and add the new rightmost character
rolling_hash_value = rolling_hash_value * base_a_encoding - encoded_sequence[start - 1] * a_k + encoded_sequence[start + dna_sequence_substr_length - 1]
# If this hash has been seen_hashes before, add the corresponding substring to the output
if rolling_hash_value in seen_hashes:
output.add(dna_sequence[start: start + dna_sequence_substr_length])
else:
seen_hashes.add(rolling_hash_value)
# Convert set to list before returning
return list(output)