Skip to content

Commit df1dad0

Browse files
committed
Round 3 first try
1 parent 6d156e0 commit df1dad0

1 file changed

Lines changed: 40 additions & 9 deletions

File tree

rounds/3_dna/solution.py

Lines changed: 40 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,48 @@
1-
"""Your Round 3 solution — DNA sequence matcher.
1+
"""Your Round 3 solution — DNA sequence matcher."""
22

3-
**Edit this file.** It currently delegates to ``baseline.py`` so everything
4-
passes out of the box. Replace the body of ``find_matches`` with your
5-
own faster implementation.
6-
"""
7-
8-
from .baseline import find_matches as _baseline
3+
from __future__ import annotations
4+
import re
5+
from concurrent.futures import ThreadPoolExecutor, as_completed
96

107

118
def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:
    """Find every FASTA record whose sequence contains ``pattern``.

    The file is read once as bytes. A record starts at a ``>`` that is the
    first character of a line; the rest of that line (stripped) is the
    record id, and all following lines up to the next record are joined
    into one sequence with spaces, tabs and line endings removed.

    Args:
        fasta_path: Path of the FASTA file to scan.
        pattern: Byte string to search for; it is matched literally.

    Returns:
        ``[(record_id, [positions...]), ...]`` in file order. Positions are
        0-based offsets into the joined sequence and include overlapping
        occurrences. Records with no occurrence are omitted.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If a header line is not ASCII.
    """
    # Read as bytes — no decode overhead, pattern stays as bytes.
    with open(fasta_path, "rb") as f:
        data = f.read()

    # A zero-width lookahead lets finditer report overlapping occurrences
    # in a single pass over each sequence.
    regex = re.compile(b"(?=" + re.escape(pattern) + b")")

    # Split only where ">" begins a line, so a ">" inside a header
    # description does not start a bogus record.
    chunks = data.split(b"\n>")
    if chunks[0].startswith(b">"):
        chunks[0] = chunks[0][1:]
    else:
        # Anything before the first header line is not a record.
        chunks = chunks[1:]

    # CPython's ``re`` engine holds the GIL while matching, so a thread pool
    # would add scheduling overhead without parallelism. A plain loop also
    # yields results in file order with no re-sorting.
    results: list[tuple[str, list[int]]] = []
    for record in chunks:
        if not record.strip():
            continue
        header, _, body = record.partition(b"\n")
        record_id = header.strip().decode("ascii")
        # Drop "\r" as well as "\n" so CRLF files do not leave stray bytes
        # between sequence lines and hide matches that span a line break.
        sequence = body.translate(None, b" \t\r\n")
        positions = [m.start() for m in regex.finditer(sequence)]
        if positions:
            results.append((record_id, positions))
    return results

0 commit comments

Comments
 (0)