Skip to content

Commit 74d46e4

Browse files
committed
user threadpoolexecutor
1 parent 88416d1 commit 74d46e4

1 file changed

Lines changed: 38 additions & 32 deletions

File tree

rounds/3_dna/solution.py

Lines changed: 38 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -7,46 +7,52 @@
77

88
from __future__ import annotations
99

10+
from concurrent.futures import ThreadPoolExecutor
11+
1012

1113
def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:
1214
"""Find every FASTA record whose sequence contains ``pattern``.
1315
1416
Returns ``[(record_id, [positions...]), ...]`` in file order.
1517
"""
16-
# Read as bytes — skips the text-decode cost the baseline pays.
17-
with open(fasta_path, "rb") as f:
18-
data = f.read()
18+
# Step 1: read the whole FASTA file as text and decode the pattern so the
19+
# search below can use a single ``str`` API.
20+
pattern_str = pattern.decode("ascii")
21+
with open(fasta_path) as f:
22+
text = f.read()
1923

20-
plen = len(pattern)
21-
_find = bytes.find # local lookup
2224
matches: list[tuple[str, list[int]]] = []
2325

24-
# Skip the first (empty) chunk before the first ">".
25-
for record in data.split(b">")[1:]:
26-
# Header ends at the first newline.
27-
nl = record.index(b"\n")
28-
# Build the contiguous sequence by stripping newlines — a single
29-
# C-level bytes.replace() call instead of split-then-join.
30-
sequence = record[nl + 1 :].replace(b"\n", b"")
31-
32-
# Quick exit: most records do not contain the pattern at all.
33-
# ``in`` delegates to a fast C memchr/memmem scan.
34-
pos = _find(sequence, pattern)
35-
if pos == -1:
36-
continue
37-
38-
record_id = record[:nl].strip().decode("ascii")
39-
40-
# Collect all (overlapping) hit positions.
41-
positions: list[int] = [pos]
42-
start = pos + 1
43-
while True:
44-
pos = _find(sequence, pattern, start)
45-
if pos == -1:
46-
break
47-
positions.append(pos)
48-
start = pos + 1
49-
50-
matches.append((record_id, positions))
26+
with ThreadPoolExecutor() as executor:
27+
# Step 2: split the file on '>' to peel off one record at a time. The
28+
# first element is the chunk before any header (empty for well-formed
29+
# files) and is skipped by the ``.strip()`` guard below.
30+
records = [record for record in text.split(">") if record.strip()]
31+
32+
def process_record(record: str) -> tuple[str, list[int]] | None:
33+
# Step 3: a record looks like ``"<id>\n<seq line 1>\n<seq line 2>\n..."``.
34+
# The id is the first line; the remaining lines are joined back into a
35+
# single contiguous sequence string.
36+
lines = record.split("\n")
37+
record_id = lines[0].strip()
38+
sequence = "".join(lines[1:]).replace(" ", "")
39+
40+
# Step 4: walk the sequence with ``str.find()``, advancing one byte
41+
# past each hit so overlapping matches are reported too.
42+
positions: list[int] = []
43+
start = 0
44+
while True:
45+
pos = sequence.find(pattern_str, start)
46+
if pos == -1:
47+
break
48+
positions.append(pos)
49+
start = pos + 1
50+
51+
if positions:
52+
return (record_id, positions)
53+
return None
54+
55+
results = list(executor.map(process_record, records))
56+
matches.extend(result for result in results if result is not None)
5157

5258
return matches

0 commit comments

Comments
 (0)