Skip to content

Commit 672a727

Browse files
author
Jacob Summerville
committed
Threaded
1 parent 4124a06 commit 672a727

1 file changed

Lines changed: 53 additions & 3 deletions

File tree

rounds/3_dna/solution.py

Lines changed: 53 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,63 @@
55
own faster implementation.
66
"""
77

8-
from .baseline import find_matches as _baseline
8+
from concurrent.futures import ThreadPoolExecutor
9+
10+
11+
def _scan_record(record: bytes, pattern: bytes) -> tuple[str, list[int]] | None:
12+
""" Scan one FASTA record for all occurrences of ``pattern``.
13+
14+
Returns the record id and every zero-based match position, or ``None`` if
15+
the record is empty or does not contain the pattern.
16+
"""
17+
18+
if not record.strip():
19+
return None
20+
21+
# Parition DNA record into header and DNA sequence
22+
header, _, body = record.partition(b'\n')
23+
record_id = header.strip().decode('ascii')
24+
25+
# Keep the hot path in bytes so we avoid decoding each whole sequence.
26+
# Whitespace is not part of the DNA sequence, so remove it before scanning.
27+
sequence = (
28+
body.replace(b'\n', b'')
29+
.replace(b'\r', b'')
30+
.replace(b' ', b'')
31+
)
32+
33+
positions: list[int] = []
34+
start = 0
35+
36+
# Advance by one after each hit so overlapping matches are included.
37+
while True:
38+
pos = sequence.find(pattern, start)
39+
if pos == -1:
40+
break
41+
positions.append(pos)
42+
start = pos + 1
43+
44+
if not positions:
45+
return None
46+
47+
return record_id, positions
948

1049

1150
def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:
    """Find every FASTA record whose sequence contains ``pattern``.

    Args:
        fasta_path: Path of the FASTA file to read.
        pattern: Byte string to search for in each record's sequence.

    Returns:
        ``[(record_id, [positions...]), ...]`` in file order; records
        without a match are omitted.
    """
    # Read once in binary mode so parsing and searching can stay on bytes.
    with open(fasta_path, 'rb') as f:
        text = f.read()

    # A record starts only at a '>' that begins a line. Prefixing a newline
    # lets the very first header match the same '\n>' delimiter; the first
    # chunk is then whatever preceded the first header (usually nothing) and
    # is dropped. Splitting on a bare '>' would also cut at '>' characters
    # inside header descriptions and misread any preamble as a record.
    chunks = (b'\n' + text).split(b'\n>')
    records = [chunk for chunk in chunks[1:] if chunk.strip()]
    if not records:
        return []

    # Scan records concurrently; ``map`` yields results in input order, so
    # file order is preserved.
    # NOTE(review): the per-record work is CPU-bound bytes scanning; on
    # GIL builds of CPython threads may not give a speed-up -- measure.
    with ThreadPoolExecutor() as executor:
        results = executor.map(lambda record: _scan_record(record, pattern), records)

    return [result for result in results if result is not None]

0 commit comments

Comments
 (0)