|
1 | | -"""Your Round 3 solution — DNA sequence matcher. |
| 1 | +"""Fast Round 3 solution: DNA sequence matcher.""" |
2 | 2 |
|
3 | | -**Edit this file.** It currently delegates to ``baseline.py`` so everything |
4 | | -passes out of the box. Replace the body of ``find_matches`` with your |
5 | | -own faster implementation. |
6 | | -""" |
| 3 | +from __future__ import annotations |
7 | 4 |
|
8 | | -from .baseline import find_matches as _baseline |
# Line separator used when unwrapping FASTA sequence lines.
_NEWLINE = b"\n"


def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:
    """Find every FASTA record whose sequence contains ``pattern``.

    The whole file is read into memory as bytes. A record starts at a ``>``
    that is the first byte of a line; its id is the rest of that header line
    and its sequence is the remaining lines joined together. Both LF and
    CRLF line endings are accepted. No other whitespace is stripped from
    sequence lines.

    Args:
        fasta_path: Path of the FASTA file to scan.
        pattern: Byte string to search for (compared byte-for-byte).

    Returns:
        ``[(record_id, [positions...]), ...]`` in file order, listing only
        records with at least one hit. Positions are 0-based offsets into
        the unwrapped sequence and include overlapping occurrences. An empty
        ``pattern`` yields ``[]``.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the header of a matching record is not ASCII.
    """
    if not pattern:
        return []

    with open(fasta_path, "rb") as file:
        data = file.read()

    # Normalise CRLF so "\r" neither leaks into record ids nor splits a
    # match that spans two sequence lines. The membership test keeps the
    # common LF-only case to a single fast scan with no copy.
    if b"\r" in data:
        data = data.replace(b"\r\n", _NEWLINE)

    # Split only on ">" at the start of a line: a bare split on b">" would
    # also cut headers that merely contain ">" (e.g. "A>G") into bogus records.
    records = data.split(_NEWLINE + b">")
    if records[0].startswith(b">"):
        # The first record has no preceding newline; drop its marker by hand.
        records[0] = records[0][1:]
    else:
        # Anything before the first header line is not a record.
        del records[0]

    matches: list[tuple[str, list[int]]] = []
    for record in records:
        record_id, _, wrapped_sequence = record.partition(_NEWLINE)
        sequence = wrapped_sequence.replace(_NEWLINE, b"")

        positions: list[int] = []
        pos = sequence.find(pattern)
        while pos != -1:
            positions.append(pos)
            # Advance by one byte (not len(pattern)) to report overlaps.
            pos = sequence.find(pattern, pos + 1)

        if positions:
            matches.append((record_id.decode("ascii"), positions))

    return matches
0 commit comments