66"""
77
88from concurrent .futures import ThreadPoolExecutor
9+ from mmap import mmap , ACCESS_READ
10+ from os import fstat
911
1012
1113def _scan_record (record : bytes , pattern : bytes ) -> tuple [str , list [int ]] | None :
@@ -22,8 +24,7 @@ def _scan_record(record: bytes, pattern: bytes) -> tuple[str, list[int]] | None:
2224 header , _ , body = record .partition (b'\n ' )
2325 record_id = header .strip ().decode ('ascii' )
2426
25- # Keep the hot path in bytes so we avoid decoding each whole sequence.
26- # Whitespace is not part of the DNA sequence, so remove it before scanning.
27+ # Clean up data before parsing
2728 sequence = (
2829 body .replace (b'\n ' , b'' )
2930 .replace (b'\r ' , b'' )
@@ -48,20 +49,33 @@ def _scan_record(record: bytes, pattern: bytes) -> tuple[str, list[int]] | None:
4849
4950
def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:
    """Find every FASTA record whose sequence contains ``pattern``.

    The file is memory-mapped read-only and cut into records at each ``>``
    byte; every non-blank record is then handed to ``_scan_record`` on a
    thread pool.

    Args:
        fasta_path: Path of the FASTA file to scan.
        pattern: Byte pattern forwarded unchanged to ``_scan_record``.

    Returns:
        ``[(record_id, [positions...]), ...]`` in file order. Records for
        which ``_scan_record`` returns ``None`` are omitted. An empty file,
        or a file containing no ``>`` byte, yields ``[]``.

    Raises:
        OSError: If the file cannot be opened or mapped.
    """
    records: list[bytes] = []

    with open(fasta_path, 'rb') as f:
        # mmap refuses to map a zero-length file, so answer that case up front.
        if fstat(f.fileno()).st_size == 0:
            return []

        with mmap(f.fileno(), 0, access=ACCESS_READ) as text:
            # Walk from one '>' to the next. Anything before the first '>'
            # is ignored.
            # NOTE(review): every '>' byte is treated as a record delimiter,
            # including one that appears inside a header line -- confirm
            # that inputs never contain '>' in descriptions.
            start = text.find(b'>')
            while start != -1:
                end = text.find(b'>', start + 1)
                if end == -1:
                    # Last record: runs to the end of the mapping.
                    record = text[start + 1:]
                else:
                    record = text[start + 1:end]
                start = end

                # Slicing an mmap yields an independent ``bytes`` copy, so
                # the records stay valid after the mapping is closed.
                if record.strip():
                    records.append(record)

    # Scan records concurrently. Leaving the executor's ``with`` block waits
    # for all submitted work, and ``map`` yields results in input order.
    with ThreadPoolExecutor() as executor:
        results = executor.map(lambda record: _scan_record(record, pattern), records)

    return [result for result in results if result is not None]
0 commit comments