Skip to content

Commit adea638

Browse files
author
fbrewer
committed
updating solution
1 parent 4f0ab3c commit adea638

1 file changed

Lines changed: 82 additions & 8 deletions

File tree

rounds/3_dna/solution.py

Lines changed: 82 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,91 @@
11
"""Your Round 3 solution — DNA sequence matcher.
22
3-
**Edit this file.** It currently delegates to ``baseline.py`` so everything
4-
passes out of the box. Replace the body of ``find_matches`` with your
5-
own faster implementation.
6-
"""
3+
#**Edit this file.** It currently delegates to ``baseline.py`` so everything
4+
#passes out of the box. Replace the body of ``find_matches`` with your
5+
#own faster implementation.
6+
7+
8+
#import numpy as np
9+
#import threading
10+
11+
#def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:
12+
13+
# Returns ``[(record_id, [positions...]), ...]`` in file order.
14+
# """
15+
# # Step 1: read the whole FASTA file as text and decode the pattern so the
16+
# # search below can use a single ``str`` API.
17+
# pattern_str = pattern.decode("ascii")
18+
19+
20+
# data = np.loadtxt(fasta_path, dtype=str, delimiter="/n")
21+
# data =
22+
23+
# data = {s.split(delimiter)[0].strip(): s.split('>')[1].strip() for s in sequencet}
24+
25+
# positions: list[int] = []
26+
# data = np.array(final_list)
27+
28+
# mask = (data == pattern)
29+
# count = np.count_nonzero(mask)
30+
31+
32+
#from __future__ import annotations
33+
34+
#"""Fast Round 3 solution: DNA sequence matcher."""
35+
36+
37+
38+
from __future__ import annotations
39+
40+
import numpy as np
41+
import os
42+
from concurrent.futures import ThreadPoolExecutor
43+
44+
_NEWLINE = b"\n"
745

8-
from .baseline import find_matches as _baseline
946

1047

1148
def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:
    """Find every FASTA record whose sequence contains ``pattern``.

    The file is read in binary mode in one go. A record starts at a ``>``
    that is the first byte of a line; a ``>`` appearing elsewhere (for
    example inside a header description) is treated as ordinary text.
    Sequence lines of a record are joined by removing ``\\n`` before
    searching, so matches may span line breaks and positions are offsets
    into the unwrapped sequence. Overlapping occurrences are all reported.

    This version assumes ASCII headers and sequence lines separated by
    ``\\n`` with no other whitespace inside the sequence.

    Args:
        fasta_path: Path of the FASTA file to scan.
        pattern: Byte string to look for in each record's sequence.

    Returns:
        ``[(record_id, [positions...]), ...]`` in file order, containing
        only records with at least one match. ``record_id`` is the full
        header line without the leading ``>``. An empty ``pattern`` yields
        an empty list.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If a matching record's header is not ASCII.
    """
    if not pattern:
        return []

    newline = b"\n"
    record_start = newline + b">"

    pattern_values = np.frombuffer(pattern, dtype=np.uint8)
    pattern_len = len(pattern)

    with open(fasta_path, "rb") as file:
        data = file.read()

    # Skip anything before the first record. Only a '>' at the start of a
    # line opens a record, so headers containing '>' stay in one piece.
    if data.startswith(b">"):
        body = data[1:]
    else:
        first_record = data.find(record_start)
        if first_record < 0:
            return []
        body = data[first_record + len(record_start):]

    matches: list[tuple[str, list[int]]] = []
    for record in body.split(record_start):
        record_id, _, wrapped_sequence = record.partition(newline)
        sequence = wrapped_sequence.replace(newline, b"")
        # Number of candidate start offsets in this sequence.
        window = len(sequence) - pattern_len + 1
        if window <= 0:
            continue

        sequence_values = np.frombuffer(sequence, dtype=np.uint8)
        # Start with every offset whose first byte matches, then AND in the
        # comparison of each following pattern byte against the sequence
        # shifted by that byte's index.
        positions_mask = sequence_values[:window] == pattern_values[0]
        for pattern_index in range(1, pattern_len):
            positions_mask &= (
                sequence_values[pattern_index:window + pattern_index]
                == pattern_values[pattern_index]
            )

        positions = np.nonzero(positions_mask)[0]
        if positions.size:
            matches.append((record_id.decode("ascii"), positions.tolist()))

    return matches
89+
90+
91+

0 commit comments

Comments
 (0)