77
88from __future__ import annotations
99
10+ import mmap
1011import os
1112from concurrent .futures import ThreadPoolExecutor
1213
1617
1718
1819def _search_chunk (
19- data : bytes , pattern : bytes , records : list [tuple [int , int ]]
20+ data : bytes | mmap .mmap ,
21+ pattern : bytes ,
22+ records : list [tuple [int , int ]],
2023) -> list [tuple [str , list [int ]]]:
2124 """Process a batch of (header_start, next_record_start) pairs."""
2225 results : list [tuple [str , list [int ]]] = []
2326 for rec_start , rec_end in records :
2427 nl = data .index (b"\n " , rec_start )
28+ raw = data [nl + 1 : rec_end ]
29+
30+ if pattern not in raw :
31+ continue
32+
2533 record_id = data [rec_start + 1 : nl ].strip ().decode ("ascii" )
26- seq = data [ nl + 1 : rec_end ] .translate (_DELETE_TABLE , _DELETE_CHARS )
34+ seq = raw .translate (_DELETE_TABLE , _DELETE_CHARS )
2735
2836 positions : list [int ] = []
2937 start = 0
@@ -46,21 +54,22 @@ def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]
4654 Returns ``[(record_id, [positions...]), ...]`` in file order.
4755 """
4856 with open (fasta_path , "rb" ) as f :
49- data = f . read ( )
57+ mm = mmap . mmap ( f . fileno (), 0 , access = mmap . ACCESS_READ )
5058
51- # Serial pass: locate all record boundaries (very fast — just scanning for '>')
5259 boundaries : list [tuple [int , int ]] = []
53- pos = data .find (b">" )
60+ pos = mm .find (b">" )
5461 while pos != - 1 :
55- nxt = data .find (b">" , pos + 1 )
56- boundaries .append ((pos , nxt if nxt != - 1 else len ( data )))
62+ nxt = mm .find (b">" , pos + 1 )
63+ boundaries .append ((pos , nxt if nxt != - 1 else mm . size ( )))
5764 pos = nxt
5865
5966 if not boundaries :
67+ mm .close ()
6068 return []
6169
62- # Partition records into roughly equal chunks for each worker thread.
63- # With free-threaded Python, each thread runs truly in parallel.
70+ data = mm [:]
71+ mm .close ()
72+
6473 n = len (boundaries )
6574 chunk_size = max (1 , n // _NUM_WORKERS )
6675 chunks = [boundaries [i : i + chunk_size ] for i in range (0 , n , chunk_size )]
0 commit comments