|
5 | 5 | own faster implementation. |
6 | 6 | """ |
7 | 7 |
|
8 | | -from .baseline import find_matches as _baseline |
| 8 | +from concurrent.futures import ThreadPoolExecutor |
| 9 | + |
| 10 | + |
| 11 | +def _find_matches(pattern_str: str, record: str) -> tuple[str, list[int]]: |
| 12 | + lines = record.split("\n") |
| 13 | + record_id = lines[0].strip() |
| 14 | + sequence = "".join(lines[1:]).replace(" ", "") |
| 15 | + |
| 16 | + positions: list[int] = [] |
| 17 | + start = 0 |
| 18 | + while True: |
| 19 | + pos = sequence.find(pattern_str, start) |
| 20 | + if pos == -1: |
| 21 | + break |
| 22 | + positions.append(pos) |
| 23 | + start = pos + 1 |
| 24 | + |
| 25 | + return (record_id, positions) |
9 | 26 |
|
10 | 27 |
|
def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:
    """Find every FASTA record whose sequence contains ``pattern``.

    The whole file is read into memory, split into records, and each
    record is searched by ``_find_matches`` on a thread pool.

    Args:
        fasta_path: Path of the FASTA file to scan.
        pattern: ASCII-encoded pattern to search for.

    Returns ``[(record_id, [positions...]), ...]`` in file order. Records
    without any match are omitted.

    Raises:
        UnicodeDecodeError: If ``pattern`` is not valid ASCII.
        OSError: If the file cannot be opened or read.
    """
    pattern_str = pattern.decode("ascii")
    with open(fasta_path, "r") as f:
        text = f.read()

    # A record begins only where ">" is the first character of a line.
    # Splitting on every ">" would cut headers such as ">seq1 A->B" in two
    # and report matches under the wrong id. Prefixing a newline lets the
    # very first header be recognised by the same "\n>" delimiter; whatever
    # precedes the first header is not a record and is discarded.
    _preamble, *records = ("\n" + text).split("\n>")

    # NOTE(review): the per-record search is CPU-bound string scanning, so
    # threads probably give no speedup on a GIL build -- measure before
    # relying on the pool for performance.
    with ThreadPoolExecutor(16) as ex:
        futures = [
            ex.submit(_find_matches, pattern_str, record)
            for record in records
            if record.strip()
        ]
        # Walking the futures in submission order preserves file order.
        return [res for future in futures if (res := future.result())[1]]
0 commit comments