|
1 | | -"""Your Round 2 solution — corruption scanner. |
| 1 | +"""Your Round 2 solution - corruption scanner.""" |
2 | 2 |
|
3 | | -**Edit this file.** It currently delegates to ``baseline.py`` so everything |
4 | | -passes out of the box. Replace the body of ``find_corruptions`` with your |
5 | | -own faster implementation. |
6 | | -""" |
| 3 | +from __future__ import annotations |
7 | 4 |
|
8 | | -from .baseline import find_corruptions as _baseline |
| 5 | +import mmap |
| 6 | + |
| 7 | + |
| 8 | +_BLOCK_SIZE = 4096 |
9 | 9 |
|
10 | 10 |
|
11 | 11 | def find_corruptions(ref_path: str, cor_path: str) -> list[tuple[int, int]]: |
12 | | - """Return ``[(offset, length), ...]`` for every differing byte range.""" |
13 | | - # TODO: remove this delegation and write your own implementation here. |
14 | | - return _baseline(ref_path, cor_path) |
| 12 | + """ Return ``[(offset, length), ...]`` for every differing byte range. """ |
| 13 | + |
| 14 | + with open(ref_path, "rb") as ref_file, open(cor_path, "rb") as cor_file: |
| 15 | + # Use the file size as the single source of truth before mapping. |
| 16 | + size = ref_file.seek(0, 2) |
| 17 | + if size != cor_file.seek(0, 2): |
| 18 | + raise ValueError("reference and corrupted files differ in length") |
| 19 | + if size == 0: |
| 20 | + return [] |
| 21 | + |
| 22 | + ref_file.seek(0) |
| 23 | + cor_file.seek(0) |
| 24 | + |
| 25 | + with mmap.mmap(ref_file.fileno(), 0, access=mmap.ACCESS_READ) as ref: |
| 26 | + with mmap.mmap(cor_file.fileno(), 0, access=mmap.ACCESS_READ) as cor: |
| 27 | + ranges: list[tuple[int, int]] = [] |
| 28 | + # -1 means there is no currently open corruption range. |
| 29 | + run_start = -1 |
| 30 | + append = ranges.append |
| 31 | + block_size = _BLOCK_SIZE |
| 32 | + |
| 33 | + for block_start in range(0, size, block_size): |
| 34 | + block_end = min(block_start + block_size, size) |
| 35 | + |
| 36 | + # Most blocks are identical, so skip them with a C-level |
| 37 | + # bytes comparison instead of a Python loop over each byte. |
| 38 | + if ref[block_start:block_end] == cor[block_start:block_end]: |
| 39 | + if run_start != -1: |
| 40 | + append((run_start, block_start - run_start)) |
| 41 | + run_start = -1 |
| 42 | + continue |
| 43 | + |
| 44 | + # Only scan inside blocks that actually differ. Keeping |
| 45 | + # run_start outside this loop lets ranges cross block edges. |
| 46 | + for pos in range(block_start, block_end): |
| 47 | + if ref[pos] != cor[pos]: |
| 48 | + if run_start == -1: |
| 49 | + run_start = pos |
| 50 | + elif run_start != -1: |
| 51 | + append((run_start, pos - run_start)) |
| 52 | + run_start = -1 |
| 53 | + |
| 54 | + # Close a corruption range that reaches the end of the file. |
| 55 | + if run_start != -1: |
| 56 | + append((run_start, size - run_start)) |
| 57 | + |
| 58 | + return ranges |
0 commit comments