|
5 | 5 | own faster implementation. |
6 | 6 | """ |
7 | 7 |
|
8 | | -from .baseline import find_matches as _baseline |
| 8 | +from concurrent.futures import ThreadPoolExecutor |
| 9 | + |
| 10 | + |
| 11 | +def _scan_record(record: bytes, pattern: bytes) -> tuple[str, list[int]] | None: |
| 12 | + """ Scan one FASTA record for all occurrences of ``pattern``. |
| 13 | +
|
| 14 | + Returns the record id and every zero-based match position, or ``None`` if |
| 15 | + the record is empty or does not contain the pattern. |
| 16 | + """ |
| 17 | + |
| 18 | + if not record.strip(): |
| 19 | + return None |
| 20 | + |
| 21 | + # Parition DNA record into header and DNA sequence |
| 22 | + header, _, body = record.partition(b'\n') |
| 23 | + record_id = header.strip().decode('ascii') |
| 24 | + |
| 25 | + # Keep the hot path in bytes so we avoid decoding each whole sequence. |
| 26 | + # Whitespace is not part of the DNA sequence, so remove it before scanning. |
| 27 | + sequence = ( |
| 28 | + body.replace(b'\n', b'') |
| 29 | + .replace(b'\r', b'') |
| 30 | + .replace(b' ', b'') |
| 31 | + ) |
| 32 | + |
| 33 | + positions: list[int] = [] |
| 34 | + start = 0 |
| 35 | + |
| 36 | + # Advance by one after each hit so overlapping matches are included. |
| 37 | + while True: |
| 38 | + pos = sequence.find(pattern, start) |
| 39 | + if pos == -1: |
| 40 | + break |
| 41 | + positions.append(pos) |
| 42 | + start = pos + 1 |
| 43 | + |
| 44 | + if not positions: |
| 45 | + return None |
| 46 | + |
| 47 | + return record_id, positions |
9 | 48 |
|
10 | 49 |
|
def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:
    """Find every FASTA record whose sequence contains ``pattern``.

    Returns ``[(record_id, [positions...]), ...]`` in file order.

    The file at ``fasta_path`` is read fully into memory in binary mode.
    A record begins at a ``>`` that is the first byte of a line; a ``>``
    elsewhere (for example inside a header description) does not start a
    new record. Any text before the first header line is ignored.
    """
    # Read once in binary mode so parsing and searching can stay on bytes.
    with open(fasta_path, 'rb') as f:
        data = f.read()

    # Split only where '>' opens a line. Splitting on every '>' would cut a
    # header such as ">id A -> B" into two bogus records.
    chunks = data.split(b'\n>')
    if chunks[0].startswith(b'>'):
        # The first record has no preceding newline; drop its marker here.
        chunks[0] = chunks[0][1:]
    else:
        # Leading content that is not a header line belongs to no record.
        del chunks[0]
    records = [chunk for chunk in chunks if chunk and not chunk.isspace()]

    # Scan records concurrently; ``executor.map`` yields results in input
    # order, which keeps the output in file order.
    # NOTE(review): the scan is CPU-bound, so on a GIL build threads may not
    # give a speed-up -- measure before relying on this.
    with ThreadPoolExecutor() as executor:
        results = list(
            executor.map(lambda record: _scan_record(record, pattern), records)
        )

    return [result for result in results if result is not None]
0 commit comments