From 841652ecb49ae4cd1def3ee07f21b8a6940d76b2 Mon Sep 17 00:00:00 2001
From: Jacob Summerville <jsvikings97@gmail.com>
Date: Wed, 13 May 2026 09:27:48 -0700
Subject: [PATCH 1/5] Add jsummer10 to the README

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 44e0723..92c32ca 100644
--- a/README.md
+++ b/README.md
@@ -91,3 +91,4 @@ scripts/
 ```
 
 Each round's `data/` directory is generated locally and gitignored.
+This is jsummer10's PR

From 7a2689798305f93eb769d2e0952bfd9cda715f5e Mon Sep 17 00:00:00 2001
From: Jacob Summerville <jsvikings97@gmail.com>
Date: Wed, 13 May 2026 10:20:41 -0700
Subject: [PATCH 2/5] Round 1: count bigrams with NumPy bincount

---
 rounds/1_histogram/solution.py | 41 +++++++++++++++++++++++++++++-----
 1 file changed, 35 insertions(+), 6 deletions(-)

diff --git a/rounds/1_histogram/solution.py b/rounds/1_histogram/solution.py
index dffbee5..9aa5e7a 100644
--- a/rounds/1_histogram/solution.py
+++ b/rounds/1_histogram/solution.py
@@ -1,14 +1,43 @@
 """Your Round 1 solution — byte-pair histogram.
 
-**Edit this file.** It currently delegates to ``baseline.py`` so everything
-passes out of the box. Replace the body of ``compute_histogram`` with your
-own faster implementation.
+This version keeps the same contract as ``baseline.py`` but replaces the
+per-bigram Python loop with NumPy operations over the whole byte buffer.
 """
 
+from __future__ import annotations
+from pathlib import Path
+
+import numpy as np
+
+DATA_DIR = Path(__file__).parent / "data"
+FIXTURE_PATH = DATA_DIR / "fixture_payload.bin"
+
 
 def compute_histogram(path: str) -> dict[bytes, int]:
     """Frequency of every 2-byte bigram in the file at ``path``."""
-    # TODO: remove this delegation and write your own implementation here.
-    from .baseline import compute_histogram as _baseline
 
-    return _baseline(path)
+    # Read the whole file into memory as a single bytes object
+    with open(path, 'rb') as f:
+        data = f.read()
+
+    # Expose the bytes object as a uint8 NumPy array without copying
+    byte_values = np.frombuffer(data, dtype=np.uint8)
+
+    # Encode each overlapping 2-byte window as a uint16 token
+    bigrams = byte_values[:-1].astype(np.uint16)
+    bigrams <<= 8
+    bigrams |= byte_values[1:]
+
+    # Count the uint16 tokens directly
+    counts = np.bincount(bigrams, minlength=1 << 16)
+
+    # Convert back into the return format
+    return {
+        int(token).to_bytes(2, "big"): int(count)
+        for token, count in enumerate(counts)
+        if count
+    }
+
+
+if __name__ == '__main__':
+    compute_histogram(str(FIXTURE_PATH))

From 4124a064cb662cd9519bf55172a5fe38dfe47e0e Mon Sep 17 00:00:00 2001
From: Jacob Summerville <jsvikings97@gmail.com>
Date: Wed, 13 May 2026 10:32:10 -0700
Subject: [PATCH 3/5] Round 2: compare files block-wise via mmap

---
 rounds/2_corruption/solution.py | 62 ++++++++++++++++++++++++++++-----
 1 file changed, 53 insertions(+), 9 deletions(-)

diff --git a/rounds/2_corruption/solution.py b/rounds/2_corruption/solution.py
index a5b752a..0e18b81 100644
--- a/rounds/2_corruption/solution.py
+++ b/rounds/2_corruption/solution.py
@@ -1,14 +1,58 @@
-"""Your Round 2 solution — corruption scanner.
+"""Your Round 2 solution - corruption scanner."""
 
-**Edit this file.** It currently delegates to ``baseline.py`` so everything
-passes out of the box. Replace the body of ``find_corruptions`` with your
-own faster implementation.
-"""
+from __future__ import annotations
 
-from .baseline import find_corruptions as _baseline
+import mmap
+
+
+_BLOCK_SIZE = 4096
 
 
 def find_corruptions(ref_path: str, cor_path: str) -> list[tuple[int, int]]:
-    """Return ``[(offset, length), ...]`` for every differing byte range."""
-    # TODO: remove this delegation and write your own implementation here.
-    return _baseline(ref_path, cor_path)
+    """ Return ``[(offset, length), ...]`` for every differing byte range. """
+
+    with open(ref_path, "rb") as ref_file, open(cor_path, "rb") as cor_file:
+        # Use the file size as the single source of truth before mapping.
+        size = ref_file.seek(0, 2)
+        if size != cor_file.seek(0, 2):
+            raise ValueError("reference and corrupted files differ in length")
+        if size == 0:
+            return []
+
+        ref_file.seek(0)
+        cor_file.seek(0)
+
+        with mmap.mmap(ref_file.fileno(), 0, access=mmap.ACCESS_READ) as ref:
+            with mmap.mmap(cor_file.fileno(), 0, access=mmap.ACCESS_READ) as cor:
+                ranges: list[tuple[int, int]] = []
+                # -1 means there is no currently open corruption range.
+                run_start = -1
+                append = ranges.append
+                block_size = _BLOCK_SIZE
+
+                for block_start in range(0, size, block_size):
+                    block_end = min(block_start + block_size, size)
+
+                    # Most blocks are identical, so skip them with a C-level
+                    # bytes comparison instead of a Python loop over each byte.
+                    if ref[block_start:block_end] == cor[block_start:block_end]:
+                        if run_start != -1:
+                            append((run_start, block_start - run_start))
+                            run_start = -1
+                        continue
+
+                    # Only scan inside blocks that actually differ. Keeping
+                    # run_start outside this loop lets ranges cross block edges.
+                    for pos in range(block_start, block_end):
+                        if ref[pos] != cor[pos]:
+                            if run_start == -1:
+                                run_start = pos
+                        elif run_start != -1:
+                            append((run_start, pos - run_start))
+                            run_start = -1
+
+                # Close a corruption range that reaches the end of the file.
+                if run_start != -1:
+                    append((run_start, size - run_start))
+
+                return ranges

From 672a727782d3ff101124ea617ba4b0166d3c1a0a Mon Sep 17 00:00:00 2001
From: Jacob Summerville <jsvikings97@gmail.com>
Date: Wed, 13 May 2026 11:05:09 -0700
Subject: [PATCH 4/5] Round 3: scan FASTA records in a thread pool

---
 rounds/3_dna/solution.py | 56 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 53 insertions(+), 3 deletions(-)

diff --git a/rounds/3_dna/solution.py b/rounds/3_dna/solution.py
index 8b917da..7c2f93a 100644
--- a/rounds/3_dna/solution.py
+++ b/rounds/3_dna/solution.py
@@ -5,7 +5,46 @@
 own faster implementation.
 """
 
-from .baseline import find_matches as _baseline
+from concurrent.futures import ThreadPoolExecutor
+
+
+def _scan_record(record: bytes, pattern: bytes) -> tuple[str, list[int]] | None:
+    """ Scan one FASTA record for all occurrences of ``pattern``.
+
+    Returns the record id and every zero-based match position, or ``None`` if
+    the record is empty or does not contain the pattern.
+    """
+
+    if not record.strip():
+        return None
+
+    # Partition the record into its header line and DNA sequence
+    header, _, body = record.partition(b'\n')
+    record_id = header.strip().decode('ascii')
+
+    # Keep the hot path in bytes so we avoid decoding each whole sequence.
+    # Whitespace is not part of the DNA sequence, so remove it before scanning.
+    sequence = (
+        body.replace(b'\n', b'')
+            .replace(b'\r', b'')
+            .replace(b' ', b'')
+    )
+
+    positions: list[int] = []
+    start = 0
+
+    # Advance by one after each hit so overlapping matches are included.
+    while True:
+        pos = sequence.find(pattern, start)
+        if pos == -1:
+            break
+        positions.append(pos)
+        start = pos + 1
+
+    if not positions:
+        return None
+
+    return record_id, positions
 
 
 def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:
@@ -13,5 +52,16 @@ def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]
 
     Returns ``[(record_id, [positions...]), ...]`` in file order.
     """
-    # TODO: remove this delegation and write your own implementation here.
-    return _baseline(fasta_path, pattern)
+
+    # Read once in binary mode so parsing and searching can stay on bytes.
+    with open(fasta_path, 'rb') as f:
+        text = f.read()
+
+    # Split into DNA sequences
+    records = [record for record in text.split(b'>') if record.strip()]
+
+    # Scan records concurrently
+    with ThreadPoolExecutor() as executor:
+        results = executor.map(lambda record: _scan_record(record, pattern), records)
+
+    return [result for result in results if result is not None]

From daa65a5b3ca1663d2cd8f73e9b7dc3cff8c679e6 Mon Sep 17 00:00:00 2001
From: Jacob Summerville <jsvikings97@gmail.com>
Date: Wed, 13 May 2026 11:20:57 -0700
Subject: [PATCH 5/5] Round 3: split FASTA records from an mmap

---
 rounds/3_dna/solution.py | 42 ++++++++++++++++++++++++++--------------
 1 file changed, 28 insertions(+), 14 deletions(-)

diff --git a/rounds/3_dna/solution.py b/rounds/3_dna/solution.py
index 7c2f93a..da234a1 100644
--- a/rounds/3_dna/solution.py
+++ b/rounds/3_dna/solution.py
@@ -6,6 +6,8 @@
 """
 
 from concurrent.futures import ThreadPoolExecutor
+from mmap import mmap, ACCESS_READ
+from os import fstat
 
 
 def _scan_record(record: bytes, pattern: bytes) -> tuple[str, list[int]] | None:
@@ -22,8 +24,7 @@ def _scan_record(record: bytes, pattern: bytes) -> tuple[str, list[int]] | None:
     header, _, body = record.partition(b'\n')
     record_id = header.strip().decode('ascii')
 
-    # Keep the hot path in bytes so we avoid decoding each whole sequence.
-    # Whitespace is not part of the DNA sequence, so remove it before scanning.
+    # Whitespace is not part of the DNA sequence, so strip it before scanning.
     sequence = (
         body.replace(b'\n', b'')
             .replace(b'\r', b'')
@@ -48,20 +49,33 @@ def _scan_record(record: bytes, pattern: bytes) -> tuple[str, list[int]] | None:
 
 
 def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:
-    """Find every FASTA record whose sequence contains ``pattern``.
+    """ Find every FASTA record whose sequence contains ``pattern``.
 
     Returns ``[(record_id, [positions...]), ...]`` in file order.
     """
 
-    # Read once in binary mode so parsing and searching can stay on bytes.
     with open(fasta_path, 'rb') as f:
-        text = f.read()
-
-    # Split into DNA sequences
-    records = [record for record in text.split(b'>') if record.strip()]
-
-    # Scan records concurrently
-    with ThreadPoolExecutor() as executor:
-        results = executor.map(lambda record: _scan_record(record, pattern), records)
-
-    return [result for result in results if result is not None]
+        if fstat(f.fileno()).st_size == 0:
+            return []
+
+        with mmap(f.fileno(), 0, access=ACCESS_READ) as text:
+            # Split the mapped file into records at each '>' marker
+            records: list[bytes] = []
+            start = text.find(b'>')
+            while start != -1:
+                end = text.find(b'>', start + 1)
+                if end == -1:
+                    record = text[start + 1:]
+                    start = -1
+                else:
+                    record = text[start + 1:end]
+                    start = end
+
+                if record.strip():
+                    records.append(record)
+
+            # Scan records concurrently
+            with ThreadPoolExecutor() as executor:
+                results = executor.map(lambda record: _scan_record(record, pattern), records)
+
+            return [result for result in results if result is not None]