Skip to content

Commit 7a26897

Browse files
author
Jacob Summerville
committed
Solution 1
1 parent 841652e commit 7a26897

1 file changed

Lines changed: 35 additions & 6 deletions

File tree

rounds/1_histogram/solution.py

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,43 @@
11
"""Your Round 1 solution — byte-pair histogram.
22
3-
**Edit this file.** It currently delegates to ``baseline.py`` so everything
4-
passes out of the box. Replace the body of ``compute_histogram`` with your
5-
own faster implementation.
3+
This version keeps the same contract as ``baseline.py`` but replaces the
4+
per-bigram Python loop with NumPy operations over the whole byte buffer.
65
"""
76

7+
from __future__ import annotations
8+
from pathlib import Path
9+
10+
import numpy as np
11+
12+
DATA_DIR = Path(__file__).parent / "data"
13+
FIXTURE_PATH = DATA_DIR / "fixture_payload.bin"
14+
815

916
def compute_histogram(path: str) -> dict[bytes, int]:
    """Frequency of every 2-byte bigram in the file at ``path``.

    Bigrams are overlapping windows: a file of ``n`` bytes contributes
    ``n - 1`` of them, so a file shorter than two bytes yields an empty
    histogram.

    Args:
        path: Location of the file to read; it is opened in binary mode and
            read into memory in one go.

    Returns:
        A dict mapping every bigram that occurs at least once (as a 2-byte
        ``bytes`` key) to its occurrence count as a plain ``int``.  Keys are
        inserted in ascending byte order.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Read the whole file into memory as a single bytes object.
    with open(path, "rb") as f:
        data = f.read()

    # Fewer than two bytes means there is no complete window to count.
    if len(data) < 2:
        return {}

    # View the bytes object as uint8 values without copying it.
    byte_values = np.frombuffer(data, dtype=np.uint8)

    # Encode each overlapping window as (first << 8) | second.  The high
    # byte is widened to uint16 before shifting so it cannot overflow, and
    # the in-place operators avoid allocating extra temporaries.
    bigrams = byte_values[:-1].astype(np.uint16)
    bigrams <<= 8
    bigrams |= byte_values[1:]

    # One bin per possible 16-bit token.
    counts = np.bincount(bigrams, minlength=1 << 16)

    # Visit only the bins that were actually hit instead of looping over all
    # 65,536 in Python; ``tolist`` converts to native ints in a single pass.
    observed = np.flatnonzero(counts)
    return {
        token.to_bytes(2, "big"): count
        for token, count in zip(observed.tolist(), counts[observed].tolist())
    }
40+
41+
42+
if __name__ == '__main__':
    # Script entry point: histogram the bundled fixture file. The result is
    # discarded, so this only exercises the code path.
    compute_histogram(path=str(FIXTURE_PATH))

0 commit comments

Comments
 (0)