Skip to content

Commit 24a1f06

Browse files
jrlouis21 and Copilot committed
user mmap
Co-authored-by: Copilot <copilot@github.com>
1 parent 64e112c commit 24a1f06

1 file changed

Lines changed: 18 additions & 9 deletions

File tree

rounds/3_dna/solution.py

Lines changed: 18 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77

88
from __future__ import annotations
99

10+
import mmap
1011
import os
1112
from concurrent.futures import ThreadPoolExecutor
1213

@@ -16,14 +17,21 @@
1617

1718

1819
def _search_chunk(
19-
data: bytes, pattern: bytes, records: list[tuple[int, int]]
20+
data: bytes | mmap.mmap,
21+
pattern: bytes,
22+
records: list[tuple[int, int]],
2023
) -> list[tuple[str, list[int]]]:
2124
"""Process a batch of (header_start, next_record_start) pairs."""
2225
results: list[tuple[str, list[int]]] = []
2326
for rec_start, rec_end in records:
2427
nl = data.index(b"\n", rec_start)
28+
raw = data[nl + 1 : rec_end]
29+
30+
if pattern not in raw:
31+
continue
32+
2533
record_id = data[rec_start + 1 : nl].strip().decode("ascii")
26-
seq = data[nl + 1 : rec_end].translate(_DELETE_TABLE, _DELETE_CHARS)
34+
seq = raw.translate(_DELETE_TABLE, _DELETE_CHARS)
2735

2836
positions: list[int] = []
2937
start = 0
@@ -46,21 +54,22 @@ def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]
4654
Returns ``[(record_id, [positions...]), ...]`` in file order.
4755
"""
4856
with open(fasta_path, "rb") as f:
49-
data = f.read()
57+
mm = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
5058

51-
# Serial pass: locate all record boundaries (very fast — just scanning for '>')
5259
boundaries: list[tuple[int, int]] = []
53-
pos = data.find(b">")
60+
pos = mm.find(b">")
5461
while pos != -1:
55-
nxt = data.find(b">", pos + 1)
56-
boundaries.append((pos, nxt if nxt != -1 else len(data)))
62+
nxt = mm.find(b">", pos + 1)
63+
boundaries.append((pos, nxt if nxt != -1 else mm.size()))
5764
pos = nxt
5865

5966
if not boundaries:
67+
mm.close()
6068
return []
6169

62-
# Partition records into roughly equal chunks for each worker thread.
63-
# With free-threaded Python, each thread runs truly in parallel.
70+
data = mm[:]
71+
mm.close()
72+
6473
n = len(boundaries)
6574
chunk_size = max(1, n // _NUM_WORKERS)
6675
chunks = [boundaries[i : i + chunk_size] for i in range(0, n, chunk_size)]

0 commit comments

Comments
 (0)