|
5 | 5 | own faster implementation. |
6 | 6 | """ |
7 | 7 |
|
| 8 | +from concurrent.futures import ThreadPoolExecutor |
| 9 | +from itertools import repeat |
8 | 10 | from .baseline import find_matches as _baseline |
9 | 11 |
|
10 | 12 |
|
| 13 | +def _process_record(record: str, pattern_str: str) -> tuple[str, list[int]] | None: |
| 14 | + if not record.strip(): |
| 15 | + return None |
| 16 | + |
| 17 | + lines = record.split("\n") |
| 18 | + record_id = lines[0].strip() |
| 19 | + sequence = "".join(lines[1:]).replace(" ", "") |
| 20 | + |
| 21 | + positions: list[int] = [] |
| 22 | + start = 0 |
| 23 | + while True: |
| 24 | + pos = sequence.find(pattern_str, start) |
| 25 | + if pos == -1: |
| 26 | + break |
| 27 | + positions.append(pos) |
| 28 | + start = pos + 1 |
| 29 | + |
| 30 | + if positions: |
| 31 | + return (record_id, positions) |
| 32 | + return None |
| 33 | + |
| 34 | + |
def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:
    """Find every FASTA record whose sequence contains ``pattern``.

    Args:
        fasta_path: Path of the FASTA file to scan.
        pattern: ASCII byte string to search for in each record's sequence.

    Returns:
        ``[(record_id, [positions...]), ...]`` in file order. Records in
        which ``pattern`` does not occur are omitted.

    Raises:
        UnicodeDecodeError: If ``pattern`` is not ASCII or the file is not
            valid UTF-8.
        OSError: If the file cannot be opened or read.
    """
    pattern_str = pattern.decode("ascii")
    # Explicit encoding so the result does not depend on the platform locale.
    with open(fasta_path, encoding="utf-8") as f:
        text = f.read()

    # A record starts only where ">" is the first character of a line; a
    # ">" inside a header/description must not open a new record.
    records = text.split("\n>")
    if records[0].startswith(">"):
        records[0] = records[0][1:]
    else:
        # Anything before the first header line belongs to no record.
        del records[0]

    # Records are scanned sequentially: the search is CPU-bound string work,
    # which a thread pool cannot run in parallel on a GIL build of CPython
    # and which would only add per-record scheduling overhead.
    return [
        match
        for record in records
        if (match := _process_record(record, pattern_str)) is not None
    ]
| 50 | + |
0 commit comments