Skip to content

Commit 1094bc5

Browse files
committed
switch to array, use mmap
skip an array access dna attempt
1 parent 08413d3 commit 1094bc5

2 files changed

Lines changed: 61 additions & 76 deletions

File tree

rounds/1_histogram/solution.py

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,25 +4,32 @@
44
passes out of the box. Replace the body of ``compute_histogram`` with your
55
own faster implementation.
66
"""
7+
from collections import defaultdict
8+
from mmap import mmap, ACCESS_READ
79

10+
def b2i(low: int, high: int) -> int:
    """Pack two byte values into a single integer.

    Note the naming quirk: ``low`` is shifted left by eight bits, so it ends
    up in the *upper* byte of the result, while ``high`` is added unshifted
    and occupies the lower byte.
    """
    shifted = low << 8
    return shifted + high
12+
13+
def i2b(x: int) -> bytes:
    """Split the low 16 bits of ``x`` into a two-byte, big-endian ``bytes``.

    Bits above the sixteenth are ignored.
    """
    upper = (x >> 8) & 0xFF
    lower = x & 0xFF
    return bytes((upper, lower))
815

916
def compute_histogram(path: str) -> dict[bytes, int]:
    """Frequency of every 2-byte bigram in the file at ``path``.

    Args:
        path: Location of the file to scan; it is opened in binary mode.

    Returns:
        A dict mapping each bigram that occurs at least once (as a 2-byte
        ``bytes`` object) to its number of occurrences. Files shorter than
        two bytes contain no bigram and yield an empty dict.
    """
    # Step 1: one counter per possible bigram. A bigram ``(first, second)``
    # lives at index ``(first << 8) + second``.
    counts = [0] * (1 << 16)

    with open(path, "rb", buffering=0) as source:
        # ``seek(0, 2)`` returns the file size. Fewer than two bytes means
        # there is nothing to count, and ``mmap`` rejects empty files anyway.
        if source.seek(0, 2) < 2:
            return {}

        with mmap(source.fileno(), 0, access=ACCESS_READ) as data:
            # Step 2: slide a 2-byte window across the mapping. For
            # ``b"ABCD"`` the windows are ``b"AB"``, ``b"BC"``, then
            # ``b"CD"``. The previous byte is carried along so each step
            # reads the mapping only once.
            previous = data[0]
            for i in range(1, len(data)):
                current = data[i]
                counts[(previous << 8) + current] += 1
                previous = current

    # Step 3: turn the non-zero buckets back into big-endian 2-byte keys.
    return {
        idx.to_bytes(2, "big"): value
        for idx, value in enumerate(counts)
        if value
    }

rounds/3_dna/solution.py

Lines changed: 41 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -5,80 +5,58 @@
55
own faster implementation.
66
"""
77

8-
from __future__ import annotations
8+
from mmap import mmap, ACCESS_READ
9+
from concurrent.futures import ThreadPoolExecutor, wait
910

10-
import os
11-
from concurrent.futures import ThreadPoolExecutor
11+
def _subsearch(raw, record_id_start: int, data_start: int, data_end: int, pattern: bytes):
    """Search the sequence of a single FASTA record for ``pattern``.

    Args:
        raw: Buffer holding the whole file; it is only sliced, never modified.
        record_id_start: Offset of the first header byte after ``>``.
        data_start: Offset of the first sequence byte, i.e. one past the
            newline that ends the header line.
        data_end: Offset one past the last byte that belongs to this record.
        pattern: Byte string to look for.

    Returns:
        ``(record_id, positions)`` where ``positions`` are offsets into the
        sequence with line breaks removed, or ``None`` if there is no match.
    """
    # Drop line breaks so that a match straddling two lines is still found.
    sequence = bytes(raw[data_start:data_end]).replace(b"\n", b"")

    locations = []
    loc = sequence.find(pattern)
    while loc != -1:
        locations.append(loc)
        # Step by one byte rather than by ``len(pattern)``: overlapping
        # occurrences are reported and an empty pattern cannot loop forever.
        loc = sequence.find(pattern, loc + 1)

    if not locations:
        return None

    # ``data_start - 1`` is the newline terminating the header line.
    record_id = bytes(raw[record_id_start : data_start - 1]).decode("ascii").strip()
    return (record_id, locations)


def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:
    """Find every FASTA record whose sequence contains ``pattern``.

    Args:
        fasta_path: Path of the FASTA file to scan.
        pattern: Byte string to search for inside each record's sequence.

    Returns:
        ``[(record_id, [positions...]), ...]`` in file order. Records without
        a match are omitted; an empty file yields an empty list.

    Raises:
        ValueError: If the file does not begin with a ``>`` header, or a
            header line is not terminated by a newline.
    """
    with open(fasta_path, "rb") as source:
        # ``seek(0, 2)`` returns the file size; ``mmap`` rejects empty files.
        if source.seek(0, 2) == 0:
            return []

        with mmap(source.fileno(), 0, access=ACCESS_READ) as data:
            if data[0:1] != b">":
                raise ValueError("expected greater than")

            # A record starts with ``>`` at offset 0 or right after a newline;
            # a ``>`` inside a header line does not start a record.
            starts = [0]
            boundary = data.find(b"\n>")
            while boundary != -1:
                starts.append(boundary + 1)
                boundary = data.find(b"\n>", boundary + 1)
            starts.append(len(data))  # sentinel: end of the last record

            with ThreadPoolExecutor(max_workers=16) as executor:
                pending = []
                for gt_pos, record_end in zip(starts, starts[1:]):
                    record_id_start = gt_pos + 1

                    # Only look for the header's newline inside this record.
                    nl_pos = data.find(b"\n", record_id_start, record_end)
                    if nl_pos == -1:
                        raise ValueError("expected new line")

                    pending.append(
                        executor.submit(
                            _subsearch, data, record_id_start, nl_pos + 1, record_end, pattern
                        )
                    )

                # Collect while the mapping is still open; futures were
                # submitted in file order, so the results are too.
                hits = [job.result() for job in pending]

    return [hit for hit in hits if hit is not None]

0 commit comments

Comments
 (0)