From 841652ecb49ae4cd1def3ee07f21b8a6940d76b2 Mon Sep 17 00:00:00 2001 From: Jacob Summerville Date: Wed, 13 May 2026 09:27:48 -0700 Subject: [PATCH 1/5] Add jsummer10 to the README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 44e0723..92c32ca 100644 --- a/README.md +++ b/README.md @@ -91,3 +91,4 @@ scripts/ ``` Each round's `data/` directory is generated locally and gitignored. +This is jsummer10's PR From 7a2689798305f93eb769d2e0952bfd9cda715f5e Mon Sep 17 00:00:00 2001 From: Jacob Summerville Date: Wed, 13 May 2026 10:20:41 -0700 Subject: [PATCH 2/5] Solution 1 --- rounds/1_histogram/solution.py | 41 +++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/rounds/1_histogram/solution.py b/rounds/1_histogram/solution.py index dffbee5..9aa5e7a 100644 --- a/rounds/1_histogram/solution.py +++ b/rounds/1_histogram/solution.py @@ -1,14 +1,43 @@ """Your Round 1 solution — byte-pair histogram. -**Edit this file.** It currently delegates to ``baseline.py`` so everything -passes out of the box. Replace the body of ``compute_histogram`` with your -own faster implementation. +This version keeps the same contract as ``baseline.py`` but replaces the +per-bigram Python loop with NumPy operations over the whole byte buffer. """ +from __future__ import annotations +from pathlib import Path + +import numpy as np + +DATA_DIR = Path(__file__).parent / "data" +FIXTURE_PATH = DATA_DIR / "fixture_payload.bin" + def compute_histogram(path: str) -> dict[bytes, int]: """Frequency of every 2-byte bigram in the file at ``path``.""" - # TODO: remove this delegation and write your own implementation here. 
- from .baseline import compute_histogram as _baseline - return _baseline(path) + # Read the whole file into memory as a single bytes object + with open(path, 'rb') as f: + data = f.read() + + # Expose the bytes object as a uint8 NumPy array without copying + byte_values = np.frombuffer(data, dtype=np.uint8) + + # Encode each overlapping 2-byte window as a uint16 token + bigrams = byte_values[:-1].astype(np.uint16) + bigrams <<= 8 + bigrams |= byte_values[1:] + + # Count the uint16 tokens directly + counts = np.bincount(bigrams, minlength=1 << 16) + + # Convert back into the return format + return { + int(token).to_bytes(2, "big"): int(count) + for token, count in enumerate(counts) + if count + } + + +if __name__ == '__main__': + compute_histogram(str(FIXTURE_PATH)) From 4124a064cb662cd9519bf55172a5fe38dfe47e0e Mon Sep 17 00:00:00 2001 From: Jacob Summerville Date: Wed, 13 May 2026 10:32:10 -0700 Subject: [PATCH 3/5] round 2 --- rounds/2_corruption/solution.py | 62 ++++++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 9 deletions(-) diff --git a/rounds/2_corruption/solution.py b/rounds/2_corruption/solution.py index a5b752a..0e18b81 100644 --- a/rounds/2_corruption/solution.py +++ b/rounds/2_corruption/solution.py @@ -1,14 +1,58 @@ -"""Your Round 2 solution — corruption scanner. +"""Your Round 2 solution - corruption scanner.""" -**Edit this file.** It currently delegates to ``baseline.py`` so everything -passes out of the box. Replace the body of ``find_corruptions`` with your -own faster implementation. -""" +from __future__ import annotations -from .baseline import find_corruptions as _baseline +import mmap + + +_BLOCK_SIZE = 4096 def find_corruptions(ref_path: str, cor_path: str) -> list[tuple[int, int]]: - """Return ``[(offset, length), ...]`` for every differing byte range.""" - # TODO: remove this delegation and write your own implementation here. 
- return _baseline(ref_path, cor_path) + """ Return ``[(offset, length), ...]`` for every differing byte range. """ + + with open(ref_path, "rb") as ref_file, open(cor_path, "rb") as cor_file: + # Use the file size as the single source of truth before mapping. + size = ref_file.seek(0, 2) + if size != cor_file.seek(0, 2): + raise ValueError("reference and corrupted files differ in length") + if size == 0: + return [] + + ref_file.seek(0) + cor_file.seek(0) + + with mmap.mmap(ref_file.fileno(), 0, access=mmap.ACCESS_READ) as ref: + with mmap.mmap(cor_file.fileno(), 0, access=mmap.ACCESS_READ) as cor: + ranges: list[tuple[int, int]] = [] + # -1 means there is no currently open corruption range. + run_start = -1 + append = ranges.append + block_size = _BLOCK_SIZE + + for block_start in range(0, size, block_size): + block_end = min(block_start + block_size, size) + + # Most blocks are identical, so skip them with a C-level + # bytes comparison instead of a Python loop over each byte. + if ref[block_start:block_end] == cor[block_start:block_end]: + if run_start != -1: + append((run_start, block_start - run_start)) + run_start = -1 + continue + + # Only scan inside blocks that actually differ. Keeping + # run_start outside this loop lets ranges cross block edges. + for pos in range(block_start, block_end): + if ref[pos] != cor[pos]: + if run_start == -1: + run_start = pos + elif run_start != -1: + append((run_start, pos - run_start)) + run_start = -1 + + # Close a corruption range that reaches the end of the file. 
+ if run_start != -1: + append((run_start, size - run_start)) + + return ranges From 672a727782d3ff101124ea617ba4b0166d3c1a0a Mon Sep 17 00:00:00 2001 From: Jacob Summerville Date: Wed, 13 May 2026 11:05:09 -0700 Subject: [PATCH 4/5] Threaded --- rounds/3_dna/solution.py | 56 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 3 deletions(-) diff --git a/rounds/3_dna/solution.py b/rounds/3_dna/solution.py index 8b917da..7c2f93a 100644 --- a/rounds/3_dna/solution.py +++ b/rounds/3_dna/solution.py @@ -5,7 +5,46 @@ own faster implementation. """ -from .baseline import find_matches as _baseline +from concurrent.futures import ThreadPoolExecutor + + +def _scan_record(record: bytes, pattern: bytes) -> tuple[str, list[int]] | None: + """ Scan one FASTA record for all occurrences of ``pattern``. + + Returns the record id and every zero-based match position, or ``None`` if + the record is empty or does not contain the pattern. + """ + + if not record.strip(): + return None + + # Partition DNA record into header and DNA sequence + header, _, body = record.partition(b'\n') + record_id = header.strip().decode('ascii') + + # Keep the hot path in bytes so we avoid decoding each whole sequence. + # Whitespace is not part of the DNA sequence, so remove it before scanning. + sequence = ( + body.replace(b'\n', b'') + .replace(b'\r', b'') + .replace(b' ', b'') + ) + + positions: list[int] = [] + start = 0 + + # Advance by one after each hit so overlapping matches are included. + while True: + pos = sequence.find(pattern, start) + if pos == -1: + break + positions.append(pos) + start = pos + 1 + + if not positions: + return None + + return record_id, positions def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]: @@ -13,5 +52,16 @@ def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]] Returns ``[(record_id, [positions...]), ...]`` in file order. 
""" - # TODO: remove this delegation and write your own implementation here. - return _baseline(fasta_path, pattern) + + # Read once in binary mode so parsing and searching can stay on bytes. + with open(fasta_path, 'rb') as f: + text = f.read() + + # Split into DNA sequences + records = [record for record in text.split(b'>') if record.strip()] + + # Scan records concurrently + with ThreadPoolExecutor() as executor: + results = executor.map(lambda record: _scan_record(record, pattern), records) + + return [result for result in results if result is not None] From daa65a5b3ca1663d2cd8f73e9b7dc3cff8c679e6 Mon Sep 17 00:00:00 2001 From: Jacob Summerville Date: Wed, 13 May 2026 11:20:57 -0700 Subject: [PATCH 5/5] Add mmap --- rounds/3_dna/solution.py | 42 ++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/rounds/3_dna/solution.py b/rounds/3_dna/solution.py index 7c2f93a..da234a1 100644 --- a/rounds/3_dna/solution.py +++ b/rounds/3_dna/solution.py @@ -6,6 +6,8 @@ """ from concurrent.futures import ThreadPoolExecutor +from mmap import mmap, ACCESS_READ +from os import fstat def _scan_record(record: bytes, pattern: bytes) -> tuple[str, list[int]] | None: @@ -22,8 +24,7 @@ def _scan_record(record: bytes, pattern: bytes) -> tuple[str, list[int]] | None: header, _, body = record.partition(b'\n') record_id = header.strip().decode('ascii') - # Keep the hot path in bytes so we avoid decoding each whole sequence. - # Whitespace is not part of the DNA sequence, so remove it before scanning. + # Clean up data before parsing sequence = ( body.replace(b'\n', b'') .replace(b'\r', b'') @@ -48,20 +49,33 @@ def _scan_record(record: bytes, pattern: bytes) -> tuple[str, list[int]] | None: def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]: - """Find every FASTA record whose sequence contains ``pattern``. + """ Find every FASTA record whose sequence contains ``pattern``. 
Returns ``[(record_id, [positions...]), ...]`` in file order. """ - # Read once in binary mode so parsing and searching can stay on bytes. with open(fasta_path, 'rb') as f: - text = f.read() - - # Split into DNA sequences - records = [record for record in text.split(b'>') if record.strip()] - - # Scan records concurrently - with ThreadPoolExecutor() as executor: - results = executor.map(lambda record: _scan_record(record, pattern), records) - - return [result for result in results if result is not None] + if fstat(f.fileno()).st_size == 0: + return [] + + with mmap(f.fileno(), 0, access=ACCESS_READ) as text: + # Read the file as an mmap and break it up into DNA records + records: list[bytes] = [] + start = text.find(b'>') + while start != -1: + end = text.find(b'>', start + 1) + if end == -1: + record = text[start + 1:] + start = -1 + else: + record = text[start + 1:end] + start = end + + if record.strip(): + records.append(record) + + # Scan records concurrently + with ThreadPoolExecutor() as executor: + results = executor.map(lambda record: _scan_record(record, pattern), records) + + return [result for result in results if result is not None]