66"""
77
88import re
9+ from concurrent .futures import ThreadPoolExecutor
10+
def find_match(
    args: "tuple[re.Pattern[str], str]",
) -> "tuple[str, list[int]] | None":
    r"""Locate every occurrence of a compiled pattern in one FASTA record.

    Args:
        args: A ``(regex, record)`` pair packed into a single tuple. ``regex``
            is a compiled pattern; ``record`` is the text of one record with
            the leading ``>`` already removed, i.e.
            ``"<id>\n<seq line 1>\n<seq line 2>\n..."``. The single-tuple
            parameter (and its name) is kept because the caller submits the
            work as ``find_match(args=(regex, record))``.

    Returns:
        ``(record_id, positions)`` where ``positions`` holds the 0-based start
        offset of each match within the joined sequence, or ``None`` when the
        pattern does not occur in the record.
    """
    regex, record = args

    # The id is the first line; everything after the first newline is
    # sequence data.
    header, _, body = record.partition("\n")
    record_id = header.strip()

    # Join the wrapped sequence lines into one contiguous string. Splitting
    # on any whitespace drops newlines, carriage returns (CRLF files) and
    # stray blanks, so a motif that straddles a line wrap is still found.
    sequence = "".join(body.split())

    positions = [m.start() for m in regex.finditer(sequence)]
    return (record_id, positions) if positions else None
26+
927
1028def find_matches (fasta_path : str , pattern : bytes ) -> list [tuple [str , list [int ]]]:
1129 """Find every FASTA record whose sequence contains ``pattern``.
@@ -22,22 +40,17 @@ def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]
2240 pattern_str = pattern .decode ('ascii' )
2341 regex = re .compile (pattern_str )
2442
25- # Step 2: split the file on '>' to peel off one record at a time. The
26- # first element is the chunk before any header (empty for well-formed
27- # files) and is skipped by the ``.strip()`` guard below.
28- for record in text .split (">" ):
29- if not record .strip ():
30- continue
31-
32- # Step 3: a record looks like ``"<id>\n<seq line 1>\n<seq line 2>\n..."``.
33- # The id is the first line; the remaining lines are joined back into a
34- # single contiguous sequence string.
35- lines = record .split ("\n " )
36- record_id = lines [0 ].strip ()
37- sequence = "" .join (lines [1 :]).replace (" " , "" )
38-
39- positions : list [int ] = []
40- positions = [m .start () for m in regex .finditer (sequence )]
41- if positions :
42- matches .append ((record_id , positions ))
43+ with ThreadPoolExecutor () as ex :
44+ futures = []
45+ for record in text .split (">" ):
46+ if not record .strip ():
47+ continue
48+
49+ t = ex .submit (find_match , args = (regex ,record ))
50+ futures .append (t )
51+
52+ for t in futures :
53+ result = t .result ()
54+ if result :
55+ matches .append (result )
4356 return matches
0 commit comments