Skip to content

Commit 64e112c

Browse files
jrlouis21Copilot
andcommitted
more optimization
Co-authored-by: Copilot <copilot@github.com>
1 parent 74d46e4 commit 64e112c

1 file changed

Lines changed: 53 additions & 37 deletions

File tree

rounds/3_dna/solution.py

Lines changed: 53 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -7,52 +7,68 @@
77

88
from __future__ import annotations
99

10+
import os
1011
from concurrent.futures import ThreadPoolExecutor
1112

13+
_DELETE_TABLE = bytes.maketrans(b"", b"")
14+
_DELETE_CHARS = b"\n \r"
15+
_NUM_WORKERS = os.cpu_count() or 4
16+
17+
18+
def _search_chunk(
19+
data: bytes, pattern: bytes, records: list[tuple[int, int]]
20+
) -> list[tuple[str, list[int]]]:
21+
"""Process a batch of (header_start, next_record_start) pairs."""
22+
results: list[tuple[str, list[int]]] = []
23+
for rec_start, rec_end in records:
24+
nl = data.index(b"\n", rec_start)
25+
record_id = data[rec_start + 1 : nl].strip().decode("ascii")
26+
seq = data[nl + 1 : rec_end].translate(_DELETE_TABLE, _DELETE_CHARS)
27+
28+
positions: list[int] = []
29+
start = 0
30+
_find = seq.find
31+
while True:
32+
idx = _find(pattern, start)
33+
if idx == -1:
34+
break
35+
positions.append(idx)
36+
start = idx + 1
37+
38+
if positions:
39+
results.append((record_id, positions))
40+
return results
41+
1242

1343
def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:
1444
"""Find every FASTA record whose sequence contains ``pattern``.
1545
1646
Returns ``[(record_id, [positions...]), ...]`` in file order.
1747
"""
18-
# Step 1: read the whole FASTA file as text and decode the pattern so the
19-
# search below can use a single ``str`` API.
20-
pattern_str = pattern.decode("ascii")
21-
with open(fasta_path) as f:
22-
text = f.read()
48+
with open(fasta_path, "rb") as f:
49+
data = f.read()
2350

24-
matches: list[tuple[str, list[int]]] = []
51+
# Serial pass: locate all record boundaries (very fast — just scanning for '>')
52+
boundaries: list[tuple[int, int]] = []
53+
pos = data.find(b">")
54+
while pos != -1:
55+
nxt = data.find(b">", pos + 1)
56+
boundaries.append((pos, nxt if nxt != -1 else len(data)))
57+
pos = nxt
58+
59+
if not boundaries:
60+
return []
2561

26-
with ThreadPoolExecutor() as executor:
27-
# Step 2: split the file on '>' to peel off one record at a time. The
28-
# first element is the chunk before any header (empty for well-formed
29-
# files) and is skipped by the ``.strip()`` guard below.
30-
records = [record for record in text.split(">") if record.strip()]
31-
32-
def process_record(record: str) -> tuple[str, list[int]] | None:
33-
# Step 3: a record looks like ``"<id>\n<seq line 1>\n<seq line 2>\n..."``.
34-
# The id is the first line; the remaining lines are joined back into a
35-
# single contiguous sequence string.
36-
lines = record.split("\n")
37-
record_id = lines[0].strip()
38-
sequence = "".join(lines[1:]).replace(" ", "")
39-
40-
# Step 4: walk the sequence with ``str.find()``, advancing one byte
41-
# past each hit so overlapping matches are reported too.
42-
positions: list[int] = []
43-
start = 0
44-
while True:
45-
pos = sequence.find(pattern_str, start)
46-
if pos == -1:
47-
break
48-
positions.append(pos)
49-
start = pos + 1
50-
51-
if positions:
52-
return (record_id, positions)
53-
return None
54-
55-
results = list(executor.map(process_record, records))
56-
matches.extend(result for result in results if result is not None)
62+
# Partition records into roughly equal chunks for each worker thread.
63+
# With free-threaded Python, each thread runs truly in parallel.
64+
n = len(boundaries)
65+
chunk_size = max(1, n // _NUM_WORKERS)
66+
chunks = [boundaries[i : i + chunk_size] for i in range(0, n, chunk_size)]
67+
68+
matches: list[tuple[str, list[int]]] = []
69+
with ThreadPoolExecutor(max_workers=_NUM_WORKERS) as executor:
70+
futures = [executor.submit(_search_chunk, data, pattern, chunk) for chunk in chunks]
71+
for future in futures:
72+
matches.extend(future.result())
5773

5874
return matches

0 commit comments

Comments
 (0)