|
5 | 5 | own faster implementation. |
6 | 6 | """ |
7 | 7 |
|
8 | | -from .baseline import find_corruptions as _baseline |
| 8 | +from concurrent.futures import ThreadPoolExecutor |
| 9 | + |
| 10 | + |
def compare_bytes(ref: bytes, cor: bytes, offset: int) -> list[tuple[int, int]]:
    """Return the runs of positions at which ``ref`` and ``cor`` differ.

    Args:
        ref: Reference bytes; its length determines how many positions are
            compared.
        cor: Bytes compared against ``ref`` position by position. Bytes of
            ``cor`` beyond ``len(ref)`` are ignored.
        offset: Value added to every reported start, so that ranges found
            in a slice can be expressed relative to a larger buffer.

    Returns:
        ``(start + offset, length)`` tuples in ascending order, one for each
        maximal run of consecutive differing positions.

    Raises:
        IndexError: If ``cor`` is shorter than ``ref``.
    """
    # Fast path: a single whole-buffer equality test is far cheaper than
    # walking every byte in Python, and identical buffers have no ranges.
    if ref == cor:
        return []

    ranges: list[tuple[int, int]] = []
    start: int | None = None  # index where the currently open run began
    for i, ref_byte in enumerate(ref):
        if ref_byte != cor[i]:
            if start is None:
                start = i
        elif start is not None:
            # First matching byte after a run of differences closes the run.
            ranges.append((start + offset, i - start))
            start = None
    if start is not None:
        # The run extends to the end of the buffer.
        ranges.append((start + offset, len(ref) - start))
    return ranges
9 | 24 |
|
10 | 25 |
|
def find_corruptions(ref_path: str, cor_path: str) -> list[tuple[int, int]]:
    """Return ``[(offset, length), ...]`` for every differing byte range.

    Both files are read fully into memory and split into at most 16
    contiguous chunks. Each chunk is compared with ``compare_bytes`` on a
    thread pool, and the per-chunk results are stitched together so that a
    run of differences crossing a chunk boundary is reported as one range.

    Args:
        ref_path: Path of the reference file.
        cor_path: Path of the file to compare against the reference.

    Returns:
        ``(offset, length)`` tuples in ascending offset order; an empty list
        when the files are identical or empty.

    Raises:
        ValueError: If the two files differ in length.
    """
    with open(ref_path, "rb") as f:
        ref = f.read()
    with open(cor_path, "rb") as f:
        cor = f.read()
    if len(ref) != len(cor):
        raise ValueError("reference and corrupted files differ in length")

    total = len(ref)
    if total == 0:
        return []

    max_chunks = 16
    # Ceiling division: with floor division the trailing ``total % 16``
    # bytes (or the whole file when it is shorter than 16 bytes) would fall
    # outside every chunk and never be compared.
    chunk_size = -(-total // max_chunks)
    chunk_starts = range(0, total, chunk_size)

    # NOTE(review): the comparison is CPU-bound Python code, so on a
    # GIL-enabled interpreter these threads are unlikely to run in parallel.
    with ThreadPoolExecutor(len(chunk_starts)) as ex:
        futures = [
            ex.submit(
                compare_bytes,
                ref[begin : begin + chunk_size],
                cor[begin : begin + chunk_size],
                begin,
            )
            for begin in chunk_starts
        ]
        # Collect in submission order so ranges stay sorted by offset.
        results = [future.result() for future in futures]

    ranges: list[tuple[int, int]] = []
    for result in results:
        if not result:
            continue
        if ranges:
            prev_start, prev_len = ranges[-1]
            first_start, first_len = result[0]
            if prev_start + prev_len == first_start:
                # The previous run ended exactly where this chunk's first
                # run begins, so it is one run split by the chunk boundary.
                ranges[-1] = (prev_start, prev_len + first_len)
                result = result[1:]
        ranges.extend(result)

    return ranges
0 commit comments