|
1 | | -"""Your Round 3 solution — DNA sequence matcher. |
| 1 | +"""Your Round 3 solution — DNA sequence matcher.""" |
2 | 2 |
|
3 | | -**Edit this file.** It currently delegates to ``baseline.py`` so everything |
4 | | -passes out of the box. Replace the body of ``find_matches`` with your |
5 | | -own faster implementation. |
6 | | -""" |
7 | | - |
8 | | -from .baseline import find_matches as _baseline |
| 3 | +from __future__ import annotations |
| 4 | +import re |
| 5 | +from concurrent.futures import ThreadPoolExecutor, as_completed |
9 | 6 |
|
10 | 7 |
|
def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:
    """Find every FASTA record whose sequence contains ``pattern``.

    The file is read once in binary mode and never decoded as a whole; only
    the header line of a matching record is decoded to ``str``.

    Args:
        fasta_path: Path to the FASTA file to scan.
        pattern: Literal byte string to search for (regex metacharacters are
            escaped, so it is never interpreted as a regular expression).

    Returns:
        ``[(record_id, [positions...]), ...]`` in file order. ``record_id`` is
        the header line without the leading ``>`` and surrounding whitespace.
        ``positions`` are 0-based offsets into the record's sequence with line
        breaks and spaces removed; overlapping occurrences are all reported.
        Records without any occurrence are omitted.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the header of a matching record is not ASCII.
    """
    with open(fasta_path, "rb") as fasta_file:
        data = fasta_file.read()

    # A zero-width lookahead lets finditer() report overlapping occurrences
    # in a single pass over each sequence.
    matcher = re.compile(b"(?=" + re.escape(pattern) + b")")

    # A record starts only where '>' is the first byte of a line. Splitting on
    # every '>' would cut records apart at a '>' inside a header description.
    # Anything before the first header (empty for well-formed files) is dropped.
    chunks = re.split(rb"^>", data, flags=re.MULTILINE)[1:]

    # Records are scanned sequentially: CPython's ``re`` engine holds the GIL
    # while matching, so a thread pool would add scheduling overhead without
    # any parallelism. A plain loop also yields results in file order directly.
    results: list[tuple[str, list[int]]] = []
    for chunk in chunks:
        if not chunk.strip():
            continue
        header, _, body = chunk.partition(b"\n")
        # Drop line breaks (LF and CRLF) and spaces in one pass so matches can
        # span wrapped lines and positions index the contiguous sequence.
        sequence = body.translate(None, b" \r\n")
        positions = [match.start() for match in matcher.finditer(sequence)]
        if positions:
            results.append((header.strip().decode("ascii"), positions))
    return results
0 commit comments