|
"""Your Round 1 solution — byte-pair histogram.

This version keeps the same contract as ``baseline.py`` but replaces the
per-bigram Python loop with NumPy operations over the whole byte buffer.
"""
7 | 6 |
|
| 7 | +from __future__ import annotations |
| 8 | +from pathlib import Path |
| 9 | + |
| 10 | +import numpy as np |
| 11 | + |
# Data directory that sits next to this module on disk.
DATA_DIR = Path(__file__).parent / "data"
# Payload file passed to ``compute_histogram`` by the ``__main__`` entry point.
FIXTURE_PATH = DATA_DIR / "fixture_payload.bin"
| 14 | + |
8 | 15 |
|
def compute_histogram(path: str) -> dict[bytes, int]:
    """Count every overlapping 2-byte bigram in the file at ``path``.

    The whole file is read into memory, each adjacent byte pair is packed
    into a 16-bit token (first byte in the high 8 bits), and the tokens are
    tallied with ``np.bincount``.

    Args:
        path: Location of the file to scan; opened in binary mode.

    Returns:
        A mapping from each 2-byte bigram that occurs at least once to its
        number of occurrences, ordered by ascending big-endian bigram value.
        Files shorter than two bytes yield an empty dict.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(path, "rb") as f:
        data = f.read()

    # Fewer than two bytes means there is no bigram at all; skip the
    # 65,536-slot count table entirely.
    if len(data) < 2:
        return {}

    # Zero-copy, read-only uint8 view over the bytes object.
    byte_values = np.frombuffer(data, dtype=np.uint8)

    # token = (first << 8) | second. Widen to uint16 *before* shifting so the
    # high byte is not lost.
    bigrams = byte_values[:-1].astype(np.uint16)
    bigrams <<= 8
    bigrams |= byte_values[1:]

    counts = np.bincount(bigrams, minlength=1 << 16)

    # Visit only the bins that were hit, and let ``tolist`` convert to native
    # Python ints in one call rather than once per element.
    seen = np.flatnonzero(counts)
    return {
        token.to_bytes(2, "big"): count
        for token, count in zip(seen.tolist(), counts[seen].tolist())
    }
| 40 | + |
| 41 | + |
if __name__ == "__main__":
    # Script entry point: run one pass over the bundled fixture payload.
    # The returned histogram is discarded — presumably this exists so the
    # module can be timed or profiled directly; confirm before relying on it
    # for any output.
    compute_histogram(str(FIXTURE_PATH))
0 commit comments