77
88from __future__ import annotations
99
10- import mmap
1110import os
1211from concurrent .futures import ThreadPoolExecutor
1312
1716
1817
1918def _search_chunk (
20- data : bytes | mmap . mmap ,
19+ data : bytes ,
2120 pattern : bytes ,
2221 records : list [tuple [int , int ]],
2322) -> list [tuple [str , list [int ]]]:
2423 """Process a batch of (header_start, next_record_start) pairs."""
2524 results : list [tuple [str , list [int ]]] = []
2625 for rec_start , rec_end in records :
2726 nl = data .index (b"\n " , rec_start )
28- raw = data [nl + 1 : rec_end ]
27+ seq = data [nl + 1 : rec_end ]. translate ( _DELETE_TABLE , _DELETE_CHARS )
2928
30- seq = raw .translate (_DELETE_TABLE , _DELETE_CHARS )
31-
32- # Quick check: if the pattern isn't in the cleaned sequence, skip.
3329 if pattern not in seq :
3430 continue
3531
3632 record_id = data [rec_start + 1 : nl ].strip ().decode ("ascii" )
37- seq = raw .translate (_DELETE_TABLE , _DELETE_CHARS )
3833
3934 positions : list [int ] = []
4035 start = 0
@@ -57,22 +52,18 @@ def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]
5752 Returns ``[(record_id, [positions...]), ...]`` in file order.
5853 """
5954 with open (fasta_path , "rb" ) as f :
60- mm = mmap . mmap ( f . fileno (), 0 , access = mmap . ACCESS_READ )
55+ data = f . read ( )
6156
6257 boundaries : list [tuple [int , int ]] = []
63- pos = mm .find (b">" )
58+ pos = data .find (b">" )
6459 while pos != - 1 :
65- nxt = mm .find (b">" , pos + 1 )
66- boundaries .append ((pos , nxt if nxt != - 1 else mm . size ( )))
60+ nxt = data .find (b">" , pos + 1 )
61+ boundaries .append ((pos , nxt if nxt != - 1 else len ( data )))
6762 pos = nxt
6863
6964 if not boundaries :
70- mm .close ()
7165 return []
7266
73- data = mm [:]
74- mm .close ()
75-
7667 n = len (boundaries )
7768 chunk_size = max (1 , n // _NUM_WORKERS )
7869 chunks = [boundaries [i : i + chunk_size ] for i in range (0 , n , chunk_size )]
0 commit comments