perf(brotli,zstd): faster encode on low-redundancy input

MagicalTux · claude · MagicalTux · commit 552f93e1170b · 2026-06-30T20:12:09.000+09:00
Audit of the codecs against their official CLIs found two encoders that
were dramatically slower than the reference on incompressible/low-match
data — both linear but with pathological constants. (Interop and the
streaming contract were otherwise clean across every codec with an
official tool; lh1/lh2's Unsupported-without-length is documented, not a
bug.)

brotli: the literal-context histogram clustering was O(contexts^3 * 256)
— it rescanned all cluster pairs and recomputed each cluster's cost from
scratch on every merge — which exploded on dense histograms (~37k
instructions/byte on random input). Cache per-cluster costs and the
pairwise-delta matrix, updating only the merged cluster each round. The
merge sequence and compressed output are byte-for-byte identical;
incompressible encode is ~8x faster.

zstd: the match finder used a fixed 32 Ki-bucket hash table over an
up-to-8 MiB window (load factor in the hundreds), so each probe walked a
full chain of useless far links. Size the table to the window. Also build
the per-block match index incrementally — the chains persist across blocks
(the history prefix is byte-stable until a window trim) instead of
re-indexing all of history every block, which was O(history) per block and
quadratic over a stream. Output is unchanged on single-block inputs and
equal-or-smaller on multi-block ones (0 ratio regressions observed);
random encode is ~3x faster.

Verified: brotli output byte-identical across inputs x quality 0..11;
zstd 0 ratio regressions and interop both ways with the zstd CLI; 50-case
zstd fuzz (incl. >8 MiB trim path and block boundaries) round-trips
through both our decoder and the CLI; full suite, clippy, and fmt clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -29,6 +29,25 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   streaming XXH64 implementation; the decompressed output is hashed and checked
   against the 4-byte frame trailer, reporting `ChecksumMismatch` on corruption.
 
+### Changed
+
+- *(brotli)* much faster encode on low-redundancy input. The literal-context
+  histogram clustering was O(contexts³ · 256) — it rescanned every cluster pair
+  and recomputed each cluster's cost from scratch on every merge — which blew up
+  on dense histograms (e.g. random/incompressible data: ~37k instructions per
+  byte). It now caches per-cluster costs and the pairwise-delta matrix and
+  updates only the merged cluster each round. The merge sequence, and therefore
+  the compressed output, is byte-for-byte identical; encode of incompressible
+  input is ~8× faster.
+- *(zstd)* faster encode, especially on low-match input, with equal-or-better
+  ratio. The match finder's hash table was a fixed 32 Ki buckets over an up-to
+  8 MiB window (load factor in the hundreds), so every probe walked a full chain
+  of useless far links; it is now sized to the window. The per-block match index
+  is also built incrementally — the chains persist across blocks instead of
+  re-indexing all of history every block (which was O(history) per block, i.e.
+  quadratic over a stream). Output is unchanged on single-block inputs and
+  equal-or-smaller on multi-block inputs (no ratio regression observed).
+
 ### Fixed
 
 - *(decoder bridge)* a decoder that buffers a whole block internally (notably
diff --git a/src/brotli/encoder_ctx.rs b/src/brotli/encoder_ctx.rs
@@ -159,22 +159,49 @@ pub(crate) fn cluster(
         }
     }
 
+    // Agglomerative clustering. The naive form recomputes every pair's merge
+    // delta — including each cluster's own `histogram_bits` — on every iteration,
+    // which is O(active³ · 256) and blows up on dense histograms (e.g. random
+    // input, where every context spans all 256 symbols). Instead cache each
+    // cluster's self-cost and the pairwise deltas, keyed by stable cluster id,
+    // and after each merge recompute only the merged cluster's row. The merge
+    // sequence — and therefore the resulting model and compressed output — is
+    // byte-for-byte identical to the naive version; only redundant work is cut.
+    let mut self_bits = alloc::vec![0u64; NUM_CONTEXTS];
+    for &c in &active {
+        self_bits[c] = histogram_bits(&histograms[c], totals[c]);
+    }
+    // `delta[ci][cj]` for `ci < cj`; valid only for currently-active pairs.
+    let mut delta = alloc::vec![alloc::vec![0i64; NUM_CONTEXTS]; NUM_CONTEXTS];
+    let pair_delta = |ci: usize, cj: usize, sb: &[u64], hs: &[[u32; 256]], ts: &[u32]| -> i64 {
+        let bm = merged_bits(&hs[ci], ts[ci], &hs[cj], ts[cj]);
+        bm as i64 - sb[ci] as i64 - sb[cj] as i64 - HEADER_COST_BITS as i64
+    };
+    for ai in 0..active.len() {
+        for aj in (ai + 1)..active.len() {
+            let (ci, cj) = (active[ai], active[aj]);
+            delta[ci][cj] = pair_delta(ci, cj, &self_bits, &histograms, &totals);
+        }
+    }
+
     while active.len() > 1 {
         let force = active.len() > max_trees;
         let mut best_i = 0usize;
         let mut best_j = 0usize;
         let mut best_delta: i64 = i64::MAX;
+        // Same scan order and strict `<` tie-break as the naive loop, so the
+        // chosen pair is identical — but now a cheap matrix lookup, not a
+        // 256-symbol recomputation.
         for ai in 0..active.len() {
             for aj in (ai + 1)..active.len() {
-                let ci = active[ai];
-                let cj = active[aj];
-                let bi = histogram_bits(&histograms[ci], totals[ci]);
-                let bj = histogram_bits(&histograms[cj], totals[cj]);
-                let bm = merged_bits(&histograms[ci], totals[ci], &histograms[cj], totals[cj]);
-                // Merging trades a header allowance against extra data bits.
-                let delta = bm as i64 - bi as i64 - bj as i64 - HEADER_COST_BITS as i64;
-                if delta < best_delta {
-                    best_delta = delta;
+                let (ci, cj) = (active[ai], active[aj]);
+                let d = if ci < cj {
+                    delta[ci][cj]
+                } else {
+                    delta[cj][ci]
+                };
+                if d < best_delta {
+                    best_delta = d;
                     best_i = ai;
                     best_j = aj;
                 }
@@ -197,6 +224,15 @@ pub(crate) fn cluster(
             }
         }
         active.swap_remove(best_j);
+        // Only the merged cluster `ci`'s costs changed; refresh its self-cost
+        // and its delta against every other surviving cluster.
+        self_bits[ci] = histogram_bits(&histograms[ci], totals[ci]);
+        for &ck in &active {
+            if ck != ci {
+                let (lo, hi) = if ci < ck { (ci, ck) } else { (ck, ci) };
+                delta[lo][hi] = pair_delta(lo, hi, &self_bits, &histograms, &totals);
+            }
+        }
     }
 
     // Compress cluster ids to a dense 0..num_trees range.
diff --git a/src/zstd/encoder.rs b/src/zstd/encoder.rs
@@ -307,14 +307,22 @@ impl Encoder {
         let buffer = buffer.as_slice();
         let buf_len = buffer.len();
 
-        // Rebuild the chains for this buffer and pre-index only the retained
-        // history (`[0, start)`). Each parser then splices in the *current
-        // block's* positions lazily as it advances, so the hash chains never
-        // contain positions ahead of the probe — the standard LZ invariant that
-        // keeps match finding correct and the depth budget meaningful. Indexing
-        // history up front is what enables cross-block back-references.
-        self.matcher.resize_for(buf_len);
-        for i in 0..start.min(buf_len.saturating_sub(3)) {
+        // Pre-index the retained history (`[0, start)`) so cross-block
+        // back-references are findable; each parser then splices in the
+        // *current block's* positions lazily as it advances, preserving the LZ
+        // invariant that the chains never contain positions ahead of the probe.
+        //
+        // The chains persist across blocks (the history prefix is byte-stable
+        // until the window is trimmed), so we only index the positions not
+        // already indexed by earlier blocks — `[inserted_upto, start)`. The old
+        // code re-indexed all of history every block, which is O(history) per
+        // block and quadratic over a stream; this makes it amortised O(input).
+        // `prepare_incremental` keeps the existing chains (rebuilding only on a
+        // head-size change); window trims call `resize_for`, which resets the
+        // high-water so the next block re-indexes from scratch.
+        self.matcher.prepare_incremental(buf_len);
+        let index_to = start.min(buf_len.saturating_sub(3));
+        for i in self.matcher.inserted_upto()..index_to {
             self.matcher.insert(buffer, i);
         }
 
diff --git a/src/zstd/matcher.rs b/src/zstd/matcher.rs
@@ -16,8 +16,6 @@
 //! - `Match { length, distance }` returned by value, with `MIN_MATCH = 3`
 //!   (zstd's minimum) and a generous `MAX_MATCH` cap.
 
-use alloc::boxed::Box;
-
 /// Minimum match length the matcher will report (RFC 8478 §3.1.1.3.2 implies
 /// a hard minimum of 3 via the match-length base table).
 pub const MIN_MATCH: usize = 3;
@@ -30,9 +28,15 @@ pub const MIN_MATCH: usize = 3;
 /// periodicity at distance ~445 bytes): each long match amortises the
 /// per-sequence FSE-table cost across thousands more output bytes.
 pub const MAX_MATCH: usize = 65535;
-/// Hash table size (must be a power of two).
-const HASH_BITS: u32 = 15;
-const HASH_SIZE: usize = 1 << HASH_BITS;
+/// Log2 of the minimum hash-table size. The table is sized to the indexed buffer
+/// by `new` / `resize_for` / `prepare_incremental` and floored here for tiny inputs.
+const HASH_MIN_BITS: u32 = 15;
+/// Upper bound on the hash table (4 Mi buckets = 16 MiB). The matcher indexes
+/// up to an 8 MiB history; a fixed small table would give that window a load
+/// factor in the hundreds, so on low-match input every probe walked the full
+/// `max_chain` of useless far-distance links. Sizing the table to the buffer
+/// keeps chains short (the same reason liblzma sizes its hash to the dict).
+const HASH_MAX_BITS: u32 = 22;
 /// "Empty" marker in the hash table.
 const NIL: u32 = u32::MAX;
 
@@ -46,31 +50,86 @@ pub struct Match {
 
 /// Per-block matcher state.
 pub struct MatchFinder {
-    head: Box<[u32; HASH_SIZE]>,
+    head: Vec<u32>,
+    /// Right-shift applied to the 32-bit hash to land in `head`; `32 - log2(len)`.
+    head_shift: u32,
     /// Linked-list chain `prev[pos]` = position of the previous occurrence of
     /// the same 4-byte prefix.
     prev: Vec<u32>,
+    /// Number of leading positions already spliced into the chains. The chains
+    /// persist across blocks (the buffer prefix is byte-stable until the window
+    /// is trimmed), so each block only needs to insert positions `>= this`
+    /// rather than re-indexing all of history — turning the per-block O(history)
+    /// rebuild (quadratic over a stream) into amortised O(input).
+    inserted_upto: usize,
 }
 
 use alloc::vec;
 use alloc::vec::Vec;
 
-/// Hash function over four bytes. A multiplicative hash with a prime
-/// multiplier gives reasonable distribution and is cheap to compute.
+/// Full-width multiplicative hash over four bytes. The caller takes the top
+/// `head` bits via `head_shift`; the high bits of a golden-ratio multiply are
+/// the well-distributed ones.
+#[inline]
 fn hash4(b: &[u8]) -> u32 {
     let v = (b[0] as u32) | ((b[1] as u32) << 8) | ((b[2] as u32) << 16) | ((b[3] as u32) << 24);
-    // 0x9E3779B1 = golden-ratio multiplier; high bits are the well-distributed ones.
-    v.wrapping_mul(0x9E37_79B1) >> (32 - HASH_BITS)
+    v.wrapping_mul(0x9E37_79B1)
+}
+
+/// `(head_len, head_shift)` for a buffer of `buffer_len` bytes: the table is the
+/// buffer size rounded up to a power of two, clamped to `[HASH_MIN_BITS,
+/// HASH_MAX_BITS]`, so the average chain length stays O(1).
+fn head_params(buffer_len: usize) -> (usize, u32) {
+    let bits = buffer_len
+        .next_power_of_two()
+        .trailing_zeros()
+        .clamp(HASH_MIN_BITS, HASH_MAX_BITS);
+    (1usize << bits, 32 - bits)
 }
 
 impl MatchFinder {
     pub fn new(buffer_len: usize) -> Self {
+        let (head_len, head_shift) = head_params(buffer_len);
         Self {
-            head: Box::new([NIL; HASH_SIZE]),
+            head: vec![NIL; head_len],
+            head_shift,
             prev: vec![NIL; buffer_len.max(1)],
+            inserted_upto: 0,
         }
     }
 
+    /// How many leading positions are already in the chains.
+    #[inline]
+    pub fn inserted_upto(&self) -> usize {
+        self.inserted_upto
+    }
+
+    /// Prepare to index a buffer of `buffer_len` bytes *incrementally*, keeping
+    /// the chains built for the byte-stable prefix from earlier blocks. Grows
+    /// the per-position array (preserving entries) and only rebuilds the head
+    /// table when the ideal size changes (a power-of-two growth, O(log input)
+    /// times total) — a rebuild resets `inserted_upto` so the caller re-indexes
+    /// the prefix that round. Use [`resize_for`](Self::resize_for) instead when
+    /// the window is trimmed and absolute positions shift.
+    pub fn prepare_incremental(&mut self, buffer_len: usize) {
+        if self.prev.len() < buffer_len {
+            self.prev.resize(buffer_len.max(1), NIL);
+        }
+        let (head_len, head_shift) = head_params(buffer_len);
+        if head_len != self.head.len() {
+            self.head.clear();
+            self.head.resize(head_len, NIL);
+            self.head_shift = head_shift;
+            self.inserted_upto = 0;
+        }
+    }
+
+    /// Bucket index for the 4 bytes at `b`.
+    #[inline]
+    fn bucket(&self, b: &[u8]) -> usize {
+        (hash4(b) >> self.head_shift) as usize
+    }
+
     /// Forget every position recorded so far. The buffer length stays the
     /// same. Not currently called — [`MatchFinder::resize_for`] is used on
     /// each new block — but kept for completeness / future tuning.
@@ -89,20 +148,26 @@ impl MatchFinder {
     pub fn resize_for(&mut self, buffer_len: usize) {
         self.prev.clear();
         self.prev.resize(buffer_len.max(1), NIL);
-        for h in self.head.iter_mut() {
-            *h = NIL;
-        }
+        let (head_len, head_shift) = head_params(buffer_len);
+        self.head_shift = head_shift;
+        self.head.clear();
+        self.head.resize(head_len, NIL);
+        self.inserted_upto = 0;
     }
 
-    /// Record `buffer[pos..pos+4]`.
+    /// Record `buffer[pos..pos+4]`. Positions must be inserted in increasing
+    /// order (the standard LZ invariant); `inserted_upto` tracks the high-water
+    /// so later blocks can skip what is already indexed.
     pub fn insert(&mut self, buffer: &[u8], pos: usize) {
         if pos + 4 > buffer.len() {
             return;
         }
-        let h = hash4(&buffer[pos..pos + 4]) as usize;
-        // Safety: head is fixed size HASH_SIZE, h < HASH_SIZE.
+        let h = self.bucket(&buffer[pos..pos + 4]);
         self.prev[pos] = self.head[h];
         self.head[h] = pos as u32;
+        if pos + 1 > self.inserted_upto {
+            self.inserted_upto = pos + 1;
+        }
     }
 
     /// Find the longest match for `buffer[pos..]` against any earlier
@@ -126,7 +191,7 @@ impl MatchFinder {
             // Can't compute the 4-byte hash; just fail (rare; near end of buf).
             return None;
         }
-        let h = hash4(&buffer[pos..pos + 4]) as usize;
+        let h = self.bucket(&buffer[pos..pos + 4]);
         let max_dist = window.min(pos);
         let max_len = MAX_MATCH.min(buffer.len() - pos);
         if max_len < MIN_MATCH {
@@ -225,7 +290,7 @@ impl MatchFinder {
         if pos + MIN_MATCH > buffer.len() || pos + 4 > buffer.len() {
             return;
         }
-        let h = hash4(&buffer[pos..pos + 4]) as usize;
+        let h = self.bucket(&buffer[pos..pos + 4]);
         let max_dist = window.min(pos);
         let max_len = MAX_MATCH.min(buffer.len() - pos);
         if max_len < MIN_MATCH {