|
1 | 1 | """Your Round 3 solution — DNA sequence matcher. |
2 | 2 |
|
3 | | -**Edit this file.** It currently delegates to ``baseline.py`` so everything |
4 | | -passes out of the box. Replace the body of ``find_matches`` with your |
5 | | -own faster implementation. |
6 | | -""" |
| 3 | +#**Edit this file.** It currently delegates to ``baseline.py`` so everything |
| 4 | +#passes out of the box. Replace the body of ``find_matches`` with your |
| 5 | +#own faster implementation. |
| 6 | +
|
| 7 | +
|
| 8 | +#import numpy as np |
| 9 | +#import threading |
| 10 | +
|
| 11 | +#def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]: |
| 12 | + |
| 13 | +# Returns ``[(record_id, [positions...]), ...]`` in file order. |
| 14 | +# """ |
| 15 | +# # Step 1: read the whole FASTA file as text and decode the pattern so the |
| 16 | +# # search below can use a single ``str`` API. |
| 17 | +# pattern_str = pattern.decode("ascii") |
| 18 | + |
| 19 | + |
| 20 | +# data = np.loadtxt(fasta_path, dtype=str, delimiter="/n") |
| 21 | +# data = |
| 22 | + |
| 23 | +# data = {s.split(delimiter)[0].strip(): s.split('>')[1].strip() for s in sequencet} |
| 24 | + |
| 25 | +# positions: list[int] = [] |
| 26 | +# data = np.array(final_list) |
| 27 | + |
| 28 | +# mask = (data == pattern) |
| 29 | +# count = np.count_nonzero(mask) |
| 30 | + |
| 31 | + |
| 32 | +#from __future__ import annotations |
| 33 | + |
| 34 | +#"""Fast Round 3 solution: DNA sequence matcher.""" |
| 35 | + |
| 36 | + |
| 37 | + |
| 38 | +from __future__ import annotations |
| 39 | + |
| 40 | +import numpy as np |
| 41 | +import os |
| 42 | +from concurrent.futures import ThreadPoolExecutor |
| 43 | + |
| 44 | +_NEWLINE = b"\n" |
7 | 45 |
|
8 | | -from .baseline import find_matches as _baseline |
9 | 46 |
|
10 | 47 |
|
def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:
    """Find every FASTA record whose sequence contains ``pattern``.

    The file is read fully into memory as bytes. A record starts at a ``>``
    that is the first byte of a line; the remainder of that line is the
    record ID, and all following lines up to the next record are joined into
    a single sequence. Both LF and CRLF line endings are accepted.

    Args:
        fasta_path: Path of the FASTA file to scan.
        pattern: Byte string to search for. Matches may overlap and may span
            wrapped sequence lines.

    Returns:
        ``[(record_id, [positions...]), ...]`` in file order, where each
        position is a 0-based offset into the unwrapped sequence. Records
        without a match are omitted. An empty ``pattern`` yields ``[]``.

    Raises:
        OSError: If ``fasta_path`` cannot be opened or read.
        UnicodeDecodeError: If the header of a matching record is not ASCII.
    """
    if not pattern:
        return []

    pattern_len = len(pattern)
    pattern_values = np.frombuffer(pattern, dtype=np.uint8)

    with open(fasta_path, "rb") as file:
        data = file.read()

    # Only a ">" at the start of a line opens a record; a ">" inside a header
    # description must not split the record.
    records = data.split(b"\n>")
    if data.startswith(b">"):
        records[0] = records[0][1:]
    else:
        # Anything before the first header is not part of a record.
        del records[0]

    matches: list[tuple[str, list[int]]] = []
    for record in records:
        header, _, wrapped_sequence = record.partition(b"\n")
        # Unwrap the sequence in one pass, dropping CR as well so CRLF files
        # report the same positions as LF files.
        sequence = wrapped_sequence.translate(None, b"\r\n")

        # Number of start offsets at which the pattern still fits.
        window_count = len(sequence) - pattern_len + 1
        if window_count <= 0:
            continue

        sequence_values = np.frombuffer(sequence, dtype=np.uint8)
        # Start with offsets whose first byte matches, then narrow the mask
        # one pattern byte at a time using shifted views of the sequence.
        positions_mask = sequence_values[:window_count] == pattern_values[0]
        for offset in range(1, pattern_len):
            positions_mask &= (
                sequence_values[offset : offset + window_count]
                == pattern_values[offset]
            )

        positions = np.nonzero(positions_mask)[0]
        if positions.size:
            record_id = header.rstrip(b"\r").decode("ascii")
            matches.append((record_id, positions.tolist()))

    return matches
| 89 | + |
| 90 | + |
| 91 | + |
0 commit comments