|
4 | 4 | passes out of the box. Replace the body of ``compute_histogram`` with your |
5 | 5 | own faster implementation. |
6 | 6 | """ |
| 7 | +from collections import defaultdict |
| 8 | +from mmap import mmap, ACCESS_READ |
7 | 9 |
|
def b2i(low: int, high: int) -> int:
    """Pack two byte values into one integer bucket index.

    Note the naming: ``low`` is shifted into the *upper* 8 bits and ``high``
    is added into the lower 8 bits, so ``b2i(0x41, 0x42) == 0x4142``.
    """
    upper_part = low << 8
    return upper_part + high
| 12 | + |
def i2b(x: int) -> bytes:
    """Split the low 16 bits of ``x`` into a 2-byte, big-endian ``bytes``.

    Bits 8-15 become the first byte and bits 0-7 the second, so
    ``i2b(0x4142) == b"AB"``.
    """
    first_byte = (x >> 8) & 0xFF
    second_byte = x & 0xFF
    return bytes((first_byte, second_byte))
8 | 15 |
|
def compute_histogram(path: str) -> dict[bytes, int]:
    """Frequency of every 2-byte bigram in the file at ``path``.

    The file is memory-mapped read-only and scanned with an overlapping
    2-byte window: ``b"ABCD"`` yields ``b"AB"``, ``b"BC"``, then ``b"CD"``.

    Args:
        path: Path of the file to scan; it is opened in binary mode.

    Returns:
        A dict mapping each bigram that occurs at least once to its count.
        Files shorter than two bytes contain no bigrams and yield ``{}``.

    Raises:
        OSError: If the file cannot be opened or mapped.
    """
    # One counter per possible bigram, indexed by (first_byte << 8) | second_byte.
    counts = [0] * (1 << 16)

    with open(path, "rb", buffering=0) as source:
        # Seeking to the end (whence=2) returns the file size. mmap refuses to
        # map an empty file, and a 1-byte file has no bigram, so stop early.
        if source.seek(0, 2) < 2:
            return {}

        with mmap(source.fileno(), 0, access=ACCESS_READ) as data:
            # Carry the previous byte forward so each byte is indexed once.
            previous = data[0]
            for position in range(1, len(data)):
                current = data[position]
                counts[(previous << 8) | current] += 1
                previous = current

    # Turn each non-empty bucket index back into its big-endian 2-byte key.
    return {
        index.to_bytes(2, "big"): count
        for index, count in enumerate(counts)
        if count
    }
0 commit comments