fix: V-norm in memory_stats, SeedSequence PRNG, MSE compressed_size_bits

brett · TheTom · commit b75813b0e180 · 2026-05-09T10:42:53.000-05:00
Subset of @brosequist's #90 commit 0fd5de9 — keeping the actual fixes and deferring the streaming + serialization API surface until a production caller exists.

Included:
- KVCacheCompressor.memory_stats() was omitting the float32 norm stored per V vector, inflating the reported compression ratio. Adds n_vectors * 32 bits to v_bits_total.
- TurboQuantMSE.compressed_size_bits() was missing (TurboQuant already had it); this commit adds it.
- Replaces the seed + 1000 (PolarQuant/QJL) and seed + 500 (K/V) magic offsets with np.random.SeedSequence(seed).spawn(2) for true PRNG independence between the PolarQuant and QJL stages, and between the K and V quantizers.

Deferred (not in this commit):
- compress_token() / get_compressed_cache() streaming API
- CompressedVector.to_bytes() / from_bytes() binary serialization
- CompressedKVCache.save() / load() npz serialization
diff --git a/tests/test_kv_cache.py b/tests/test_kv_cache.py
@@ -102,8 +102,9 @@ def test_memory_stats(self):
         compressor = KVCacheCompressor(head_dim=128, k_bits=3, v_bits=3)
         stats = compressor.memory_stats(seq_len=1024, num_layers=32, num_heads=32)
 
-        # K: 3 bits/val + norm overhead, V: 3 bits/val
-        # Ratio vs fp16 (16 bits): 16 / ((3+3)/2 + overhead) ≈ 2.5-3x
+        # K: 3 bits/val + 32-bit norm, V: 3 bits/val + 32-bit norm
+        # Both K and V include per-vector norm (float32) for rescaling.
+        # Ratio vs fp16 (16 bits/val): 16*128 / (128*3 + 32 + 128*3 + 32) ≈ 2.46x
         assert stats["compression_ratio"] > 2.0
         assert stats["compressed_mb"] < stats["original_mb"]
 
@@ -125,6 +126,7 @@ def test_metadata_stored(self):
         assert compressed.v_bit_width == 3
 
 
+
 def _softmax(x):
     """Simple softmax for testing."""
     e = np.exp(x - np.max(x, axis=-1, keepdims=True))
diff --git a/turboquant/kv_cache.py b/turboquant/kv_cache.py
@@ -47,9 +47,6 @@ class KVCacheCompressor:
 
         # Decompress
         k_hat, v_hat = compressor.decompress(compressed)
-
-        # Or compress streaming (one token at a time)
-        compressor.compress_token(k_vec, v_vec, layer=0, head=0)
     """
 
     def __init__(
@@ -71,14 +68,20 @@ def __init__(
         self.k_bits = k_bits
         self.v_bits = v_bits
 
+        # Spawn independent child seeds so K and V quantizers use statistically
+        # independent random streams without magic offset arithmetic.
+        # Accept either an int or an already-created SeedSequence.
+        ss = seed if isinstance(seed, np.random.SeedSequence) else np.random.SeedSequence(seed)
+        k_child, v_child = ss.spawn(2)
+
         # K cache uses full TurboQuant (inner product preservation)
         self.k_quantizer = TurboQuant(
-            head_dim, bit_width=k_bits, seed=seed, norm_correction=norm_correction,
+            head_dim, bit_width=k_bits, seed=k_child, norm_correction=norm_correction,
         )
 
         # V cache uses MSE-only PolarQuant (value reconstruction)
         self.v_quantizer = TurboQuantMSE(
-            head_dim, bit_width=v_bits, seed=seed + 500, norm_correction=norm_correction,
+            head_dim, bit_width=v_bits, seed=v_child, norm_correction=norm_correction,
         )
 
     def compress(self, k_cache: np.ndarray, v_cache: np.ndarray) -> CompressedKVCache:
@@ -160,8 +163,8 @@ def memory_stats(self, seq_len: int, num_layers: int, num_heads: int) -> dict:
 
         # K: b bits per coord + 32-bit norm
         k_bits_total = n_vectors * (self.head_dim * self.k_bits + 32)
-        # V: b bits per coord (no norm needed for MSE-only)
-        v_bits_total = n_vectors * self.head_dim * self.v_bits
+        # V: b bits per coord + 32-bit norm (PolarQuant stores per-vector norm for rescaling)
+        v_bits_total = n_vectors * self.head_dim * self.v_bits + n_vectors * 32
 
         compressed_bytes = (k_bits_total + v_bits_total) / 8
 
diff --git a/turboquant/turboquant.py b/turboquant/turboquant.py
@@ -19,11 +19,11 @@
 @dataclass
 class CompressedVector:
     """Container for a TurboQuant-compressed vector."""
-    mse_indices: np.ndarray   # (d,) or (batch, d) — PolarQuant indices, (b-1)-bit integers
-    vector_norms: np.ndarray  # scalar or (batch,) — original ||x||_2 for rescaling
-    qjl_signs: np.ndarray     # (d,) or (batch, d) — QJL sign bits, int8 {+1, -1}
-    residual_norms: np.ndarray # scalar or (batch,) — ||residual||_2
-    bit_width: int             # total bits per coordinate
+    mse_indices: np.ndarray    # (d,) or (batch, d) — PolarQuant indices, (b-1)-bit integers
+    vector_norms: np.ndarray   # scalar or (batch,) — original ||x||_2 for rescaling
+    qjl_signs: np.ndarray      # (d,) or (batch, d) — QJL sign bits, int8 {+1, -1}
+    residual_norms: np.ndarray  # scalar or (batch,) — ||residual||_2
+    bit_width: int              # total bits per coordinate
 
 
 class TurboQuant:
@@ -54,13 +54,19 @@ def __init__(self, d: int, bit_width: int, seed: int = 42, norm_correction: bool
         self.d = d
         self.bit_width = bit_width
 
+        # Spawn independent child seeds from a SeedSequence so PolarQuant and QJL
+        # use statistically independent random streams without magic offset arithmetic.
+        # Accept either an int or an already-created SeedSequence (e.g. from a parent spawner).
+        ss = seed if isinstance(seed, np.random.SeedSequence) else np.random.SeedSequence(seed)
+        pq_child, qjl_child = ss.spawn(2)
+
         # Stage 1: PolarQuant at (b-1) bits
         self.polar_quant = PolarQuant(
-            d, bit_width=bit_width - 1, seed=seed, norm_correction=norm_correction,
+            d, bit_width=bit_width - 1, seed=pq_child, norm_correction=norm_correction,
         )
 
-        # Stage 2: QJL for residual (uses different seed)
-        self.qjl = QJL(d, seed=seed + 1000)
+        # Stage 2: QJL for residual (independent seed stream)
+        self.qjl = QJL(d, seed=qjl_child)
 
     def quantize(self, x: np.ndarray) -> CompressedVector:
         """Quantize a vector or batch.
@@ -148,3 +154,14 @@ def quantize(self, x: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
 
     def dequantize(self, indices: np.ndarray, norms: np.ndarray) -> np.ndarray:
         return self.polar_quant.dequantize(indices, norms)
+
+    def compressed_size_bits(self, n_vectors: int) -> int:
+        """Compute total storage in bits for n_vectors compressed vectors.
+
+        Includes:
+        - PolarQuant indices: b bits per coordinate per vector
+        - Norms: 32 bits (float32) per vector (stored for per-vector rescaling)
+        """
+        per_vector = self.d * self.bit_width
+        norms = 32  # float32 per vector
+        return n_vectors * (per_vector + norms)