round 2

Jacob Summerville · Jacob Summerville · commit 4124a064cb66 · 2026-05-13T10:32:10.000-07:00
diff --git a/rounds/2_corruption/solution.py b/rounds/2_corruption/solution.py
@@ -1,14 +1,58 @@
-"""Your Round 2 solution — corruption scanner.
+"""Your Round 2 solution - corruption scanner."""
 
-**Edit this file.** It currently delegates to ``baseline.py`` so everything
-passes out of the box. Replace the body of ``find_corruptions`` with your
-own faster implementation.
-"""
+from __future__ import annotations
 
-from .baseline import find_corruptions as _baseline
+import mmap
+
+
+_BLOCK_SIZE = 4096
 
 
 def find_corruptions(ref_path: str, cor_path: str) -> list[tuple[int, int]]:
-    """Return ``[(offset, length), ...]`` for every differing byte range."""
-    # TODO: remove this delegation and write your own implementation here.
-    return _baseline(ref_path, cor_path)
+    """Locate every maximal run of bytes where the two files disagree.
+
+    Returns ``[(offset, length), ...]`` in ascending offset order; raises
+    ``ValueError`` when the two files are not the same length.
+    """
+    with open(ref_path, "rb") as ref_file, open(cor_path, "rb") as cor_file:
+        # Seeking to the end yields each file's length without reading it.
+        total = ref_file.seek(0, 2)
+        if cor_file.seek(0, 2) != total:
+            raise ValueError("reference and corrupted files differ in length")
+        if total == 0:
+            return []
+        ref_file.seek(0)
+        cor_file.seek(0)
+
+        with mmap.mmap(ref_file.fileno(), 0, access=mmap.ACCESS_READ) as ref, \
+                mmap.mmap(cor_file.fileno(), 0, access=mmap.ACCESS_READ) as cor:
+            found: list[tuple[int, int]] = []
+            # None means no corruption run is currently open.
+            open_at = None
+
+            for lo in range(0, total, _BLOCK_SIZE):
+                hi = min(lo + _BLOCK_SIZE, total)
+                ref_chunk = ref[lo:hi]
+                cor_chunk = cor[lo:hi]
+
+                if ref_chunk == cor_chunk:
+                    # A clean block closes any run that reached its start.
+                    if open_at is not None:
+                        found.append((open_at, lo - open_at))
+                        open_at = None
+                    continue
+
+                # Walk only dirty blocks byte by byte. open_at lives outside
+                # this loop, so a run can continue across a block edge.
+                for pos, (a, b) in enumerate(zip(ref_chunk, cor_chunk), lo):
+                    if a == b:
+                        if open_at is not None:
+                            found.append((open_at, pos - open_at))
+                            open_at = None
+                    elif open_at is None:
+                        open_at = pos
+
+            # A run still open here extends to the final byte of the file.
+            if open_at is not None:
+                found.append((open_at, total - open_at))
+            return found