Skip to content

Commit 8af615d

Browse files
committed
dna round
Signed-off-by: Drew Wock <dwock@esri.com>
1 parent 5dc4655 commit 8af615d

1 file changed

Lines changed: 32 additions & 3 deletions

File tree

rounds/3_dna/solution.py

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,42 @@
55
own faster implementation.
66
"""
77

8-
from .baseline import find_matches as _baseline
9-
8+
import re
109

1110
def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:
1211
"""Find every FASTA record whose sequence contains ``pattern``.
1312
1413
Returns ``[(record_id, [positions...]), ...]`` in file order.
1514
"""
1615
# TODO: remove this delegation and write your own implementation here.
17-
return _baseline(fasta_path, pattern)
16+
# Step 1: read the whole FASTA file as text and decode the pattern so the
17+
# search below can use a single ``str`` API.
18+
pattern_str = pattern.decode("ascii")
19+
with open(fasta_path, "r") as f:
20+
text = f.read()
21+
22+
matches: list[tuple[str, list[int]]] = []
23+
pattern_str = pattern.decode('ascii')
24+
regex = re.compile(pattern_str)
25+
26+
# Step 2: split the file on '>' to peel off one record at a time. The
27+
# first element is the chunk before any header (empty for well-formed
28+
# files) and is skipped by the ``.strip()`` guard below.
29+
for record in text.split(">"):
30+
if not record.strip():
31+
continue
32+
33+
# Step 3: a record looks like ``"<id>\n<seq line 1>\n<seq line 2>\n..."``.
34+
# The id is the first line; the remaining lines are joined back into a
35+
# single contiguous sequence string.
36+
lines = record.split("\n")
37+
record_id = lines[0].strip()
38+
sequence = "".join(lines[1:]).replace(" ", "")
39+
40+
positions: list[int] = []
41+
print(sequence)
42+
for m in regex.finditer(sequence):
43+
positions.append(m.start())
44+
if positions:
45+
matches.append((record_id, positions))
46+
return matches

0 commit comments

Comments
 (0)