Skip to content

Commit 970e215

Browse files
committed
switch to array, use mmap
1 parent 244c1f8 commit 970e215

1 file changed

Lines changed: 20 additions & 3 deletions

File tree

rounds/1_histogram/solution.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,28 @@
44
passes out of the box. Replace the body of ``compute_histogram`` with your
55
own faster implementation.
66
"""
7+
import os
from collections import defaultdict
from mmap import mmap, ACCESS_READ
79

10+
def b2i(low: int, high: int) -> int:
    """Pack two byte values into one integer bucket index.

    Note the naming: ``low`` is shifted left by eight bits and therefore
    lands in the *upper* byte of the result, while ``high`` is added
    unshifted and occupies the lower byte. For inputs in ``0..255`` the
    result is in ``0..65535``.

    Args:
        low: Value placed in bits 8 and above of the result.
        high: Value added to the shifted ``low``.

    Returns:
        ``(low << 8) + high``.
    """
    upper_bits = low << 8
    return upper_bits + high
12+
13+
def i2b(x: int) -> bytes:
    """Turn an integer into its two-byte, most-significant-byte-first form.

    Only the lowest 16 bits of ``x`` are used; anything above them is
    masked off before conversion.

    Args:
        x: Integer whose low 16 bits encode a pair of bytes.

    Returns:
        A ``bytes`` object of length 2: bits 8-15 of ``x`` first, then
        bits 0-7.
    """
    low_sixteen_bits = x & 0xFFFF
    return low_sixteen_bits.to_bytes(2, "big")
815

916
def compute_histogram(path: str) -> dict[bytes, int]:
    """Frequency of every 2-byte bigram in the file at ``path``.

    The file is memory-mapped read-only and scanned with a sliding two-byte
    window, so ``b"ABCD"`` contributes ``b"AB"``, ``b"BC"`` and ``b"CD"``.

    Args:
        path: Location of the file to scan; it is opened in binary mode.

    Returns:
        A mapping from each bigram that occurs at least once (a 2-byte
        ``bytes`` key, first byte of the window first) to its number of
        occurrences. A file shorter than two bytes yields an empty dict.

    Raises:
        OSError: If the file cannot be opened or mapped.
    """
    # One counter per possible bigram, indexed by (first_byte << 8) | second_byte.
    counts = [0] * (1 << 16)

    # Context managers guarantee the descriptor and the mapping are released
    # even if the scan raises.
    with open(path, "rb", buffering=0) as source:
        # Fewer than two bytes means no bigrams at all; returning early also
        # avoids mmap(), which raises ValueError for an empty file.
        if os.fstat(source.fileno()).st_size < 2:
            return {}

        with mmap(source.fileno(), 0, access=ACCESS_READ) as data:
            # Carry the previous byte forward so each position is read from
            # the mapping only once.
            first = data[0]
            for i in range(1, len(data)):
                second = data[i]
                counts[(first << 8) | second] += 1
                first = second

    # Emit only the bigrams that were actually seen, in ascending key order.
    return {
        idx.to_bytes(2, "big"): value
        for idx, value in enumerate(counts)
        if value
    }

0 commit comments

Comments
 (0)