Skip to content

Commit daa65a5

Browse files
author
Jacob Summerville
committed
Add mmap
1 parent 672a727 commit daa65a5

1 file changed

Lines changed: 28 additions & 14 deletions

File tree

rounds/3_dna/solution.py

Lines changed: 28 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
"""
77

88
from concurrent.futures import ThreadPoolExecutor
9+
from mmap import mmap, ACCESS_READ
10+
from os import fstat
911

1012

1113
def _scan_record(record: bytes, pattern: bytes) -> tuple[str, list[int]] | None:
@@ -22,8 +24,7 @@ def _scan_record(record: bytes, pattern: bytes) -> tuple[str, list[int]] | None:
2224
header, _, body = record.partition(b'\n')
2325
record_id = header.strip().decode('ascii')
2426

25-
# Keep the hot path in bytes so we avoid decoding each whole sequence.
26-
# Whitespace is not part of the DNA sequence, so remove it before scanning.
27+
# Strip line-break bytes from the body so only sequence data is scanned
2728
sequence = (
2829
body.replace(b'\n', b'')
2930
.replace(b'\r', b'')
@@ -48,20 +49,33 @@ def _scan_record(record: bytes, pattern: bytes) -> tuple[str, list[int]] | None:
4849

4950

5051
def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:
51-
"""Find every FASTA record whose sequence contains ``pattern``.
52+
"""Find every FASTA record whose sequence contains ``pattern``.
5253
5354
Returns ``[(record_id, [positions...]), ...]`` in file order.
5455
"""
5556

56-
# Read once in binary mode so parsing and searching can stay on bytes.
5757
with open(fasta_path, 'rb') as f:
58-
text = f.read()
59-
60-
# Split into DNA sequences
61-
records = [record for record in text.split(b'>') if record.strip()]
62-
63-
# Scan records concurrently
64-
with ThreadPoolExecutor() as executor:
65-
results = executor.map(lambda record: _scan_record(record, pattern), records)
66-
67-
return [result for result in results if result is not None]
58+
if fstat(f.fileno()).st_size == 0:
59+
return []
60+
61+
with mmap(f.fileno(), 0, access=ACCESS_READ) as text:
62+
# Memory-map the file and split it into records at each '>' marker
63+
records: list[bytes] = []
64+
start = text.find(b'>')
65+
while start != -1:
66+
end = text.find(b'>', start + 1)
67+
if end == -1:
68+
record = text[start + 1:]
69+
start = -1
70+
else:
71+
record = text[start + 1:end]
72+
start = end
73+
74+
if record.strip():
75+
records.append(record)
76+
77+
# Scan records concurrently
78+
with ThreadPoolExecutor() as executor:
79+
results = executor.map(lambda record: _scan_record(record, pattern), records)
80+
81+
return [result for result in results if result is not None]

0 commit comments

Comments
 (0)