experiments/rht-k-sweep: tweet-screencap script

TheTom · claude · TheTom · commit 1e3d7a287612 · 2026-05-13T19:08:18.000-05:00
One-shot script that prints the production-codebook result in a
visually-formatted terminal output for X attachment. ANSI colors:
green baseline row, red +1/+2 RHT rows, bold-red emphasis on the
25.0% -&gt; 100.0% catastrophic rate line.

Self-contained, runs in ~5s on M5 Max. No GPU, no model load. Matches
production_codebook.py numbers exactly.

  python3 experiments/rht_k_sweep/screencap.py

Co-Authored-By: Claude Opus 4.7 (1M context) &lt;noreply@anthropic.com&gt;
diff --git a/experiments/rht_k_sweep/screencap.py b/experiments/rht_k_sweep/screencap.py
@@ -0,0 +1,147 @@
+"""Clean terminal output for screenshot. Production codebook only,
+formatted for visual impact."""
+
+import numpy as np
+from scipy import linalg, stats
+
+# ANSI
+R = "\033[0m"
+B = "\033[1m"
+RED = "\033[91m"
+GREEN = "\033[92m"
+YELLOW = "\033[93m"
+DIM = "\033[2m"
+CYAN = "\033[96m"
+
+d, T, n_trials, n_queries = 128, 512, 5, 64
+sigma = 1.0 / np.sqrt(d)
+
+# Production turbo4 codebook (ggml/src/ggml-cuda/turbo-quant.cuh)
+PROD = np.array([
+    -0.173926, -0.117195, -0.089527, -0.068756,
+    -0.051262, -0.035597, -0.020989, -0.006938,
+     0.006938,  0.020989,  0.035597,  0.051262,
+     0.068756,  0.089527,  0.117195,  0.173926
+])
+
+
+def gen_K(T, d, rng, nf):
+    bern = rng.choice([-1.0, 1.0], size=(T, d))
+    noise = rng.normal(0.0, nf, size=(T, d))
+    raw = bern + noise
+    return raw * (sigma / np.sqrt(np.var(raw)))
+
+
+H = linalg.hadamard(d) / np.sqrt(d)
+
+
+def apply_rht(X, k, seed):
+    rng = np.random.default_rng(seed)
+    Y, ss = X.copy(), []
+    for _ in range(k):
+        s = rng.choice([-1.0, 1.0], size=d).astype(np.float64)
+        ss.append(s)
+        Y = (Y * s) @ H.T
+    return Y, ss
+
+
+def invert_rht(Y, ss):
+    X = Y.copy()
+    for s in reversed(ss):
+        X = (X @ H) * s
+    return X
+
+
+def quantize_prod(X, cb):
+    norms = np.maximum(np.linalg.norm(X, axis=1, keepdims=True), 1e-12)
+    flat = (X / norms).flatten()
+    idx = np.argmin(np.abs(flat[:, None] - cb[None, :]), axis=1)
+    return cb[idx].reshape(X.shape) * norms
+
+
+def softmax(x, axis=-1):
+    x = x - x.max(axis=axis, keepdims=True)
+    e = np.exp(x)
+    return e / e.sum(axis=axis, keepdims=True)
+
+
+def attn_kl(K_ref, K_test, Q):
+    sd = np.sqrt(d)
+    p = softmax(Q @ K_ref.T / sd)
+    q = softmax(Q @ K_test.T / sd)
+    eps = 1e-12
+    return (p * (np.log(p + eps) - np.log(q + eps))).sum(axis=-1)
+
+
+# Tune to §3 layer-0 stats
+K_orig = gen_K(T, d, np.random.default_rng(42), 0.38)
+k0, ks0 = stats.kurtosis(K_orig.flatten()), stats.kstest(K_orig.flatten(), 'norm', args=(0, sigma))[0]
+
+
+def run(k_extra):
+    mses, kls = [], []
+    kurt_p, ks_p = None, None
+    for t in range(n_trials):
+        seed = 2000 + 100 * (k_extra + 1) + t
+        if k_extra == 0:
+            K_rot, ss = K_orig.copy(), []
+        else:
+            K_rot, ss = apply_rht(K_orig, k_extra, seed)
+        if t == 0:
+            kurt_p = stats.kurtosis(K_rot.flatten())
+            ks_p = stats.kstest(K_rot.flatten(), 'norm', args=(0, sigma))[0]
+        Kq = quantize_prod(K_rot, PROD)
+        Kr = invert_rht(Kq, ss) if ss else Kq
+        mses.append(np.mean((Kr - K_orig) ** 2))
+        q_rng = np.random.default_rng(seed + 50000)
+        Q = q_rng.normal(0.0, sigma, size=(n_queries, d))
+        kls.append(attn_kl(K_orig, Kr, Q))
+    return float(np.mean(mses)), np.concatenate(kls), kurt_p, ks_p
+
+
+print()
+print(f"{B}{CYAN}Basat 2026 (arxiv:2605.06014v1) — RHT-count prescription for KV cache{R}")
+print(f"{DIM}claim: more RHTs → Gaussian-marginal-recovery → better quantization{R}")
+print(f"{DIM}application cited: TurboQuant KV-cache compression{R}")
+print()
+print(f"{B}Test setup{R}")
+print(f"  source K       : sub-Gaussian, kurt={k0:+.3f}, KS-vs-N(0,1/d)={ks0:.3f}")
+print(f"                   {DIM}(matched to §3 layer-0 K of why-mse-fails-for-kv-quantization){R}")
+print(f"  codebook       : production TURBO_CENTROIDS_4BIT, 16 levels, ±0.174")
+print(f"                   {DIM}(from ggml/src/ggml-cuda/turbo-quant.cuh — ships in fork){R}")
+print(f"  block norm     : per-128-element L2 (QK_TURBO4 = 128 = head_dim)")
+print(f"  attn KL proxy  : softmax(Q K^T / √d), Q ~ N(0, 1/d), {n_queries*n_trials} queries")
+print()
+
+m0, kl0, kurt0, kspos0 = run(0)
+m1, kl1, kurt1, kspos1 = run(1)
+m2, kl2, kurt2, kspos2 = run(2)
+
+base_med = float(np.median(kl0))
+c0 = float(np.mean(kl0 >= 1.10 * base_med))
+c1 = float(np.mean(kl1 >= 1.10 * base_med))
+c2 = float(np.mean(kl2 >= 1.10 * base_med))
+
+
+def dpct(new, old):
+    return (new / old - 1) * 100
+
+
+print(f"{B}Result{R}")
+print(f"  {'k_extra':<10}{'post-kurt':<14}{'KS':<10}{'MSE':<14}{'Δ MSE':<10}{'KL mean':<14}{'Δ KL':<10}{'catastrophic':<14}")
+print(f"  {DIM}{'-'*96}{R}")
+print(f"  {GREEN}{'0  baseline':<10}{R}  {kurt0:<+12.3f}{kspos0:<10.3f}{m0:<14.3e}{'—':<10}{kl0.mean():<14.3e}{'—':<10}{GREEN}{c0:<14.1%}{R}")
+print(f"  {RED}{'1  +1 RHT':<10}{R}  {kurt1:<+12.3f}{kspos1:<10.3f}{m1:<14.3e}{RED}{dpct(m1,m0):<+10.1f}{R}{kl1.mean():<14.3e}{RED}{dpct(kl1.mean(),kl0.mean()):<+10.1f}{R}{RED}{B}{c1:<14.1%}{R}")
+print(f"  {RED}{'2  +2 RHT':<10}{R}  {kurt2:<+12.3f}{kspos2:<10.3f}{m2:<14.3e}{RED}{dpct(m2,m0):<+10.1f}{R}{kl2.mean():<14.3e}{RED}{dpct(kl2.mean(),kl0.mean()):<+10.1f}{R}{RED}{c2:<14.1%}{R}")
+print()
+print(f"  {B}{RED}catastrophic rate: {c0:.1%}  →  {c1:.1%}{R}  {DIM}(per-query KL > 1.10 × baseline median){R}")
+print()
+print(f"{B}Mechanism{R}")
+print(f"  Theorem holds: kurt drifts from {kurt0:+.2f} to {kurt1:+.2f} after 1 extra RHT, as proven.")
+print(f"  Application fails: production turbo4 centroids extend to ±0.174 ≈ ±2σ because")
+print(f"  real K post-WHT is bounded sub-Gaussian. +RHT Gaussianizes K → mass past ±2σ →")
+print(f"  saturation at the codebook extreme → 100% catastrophic.")
+print()
+print(f"{DIM}repro: experiments/rht_k_sweep/screencap.py  (pure numpy/scipy, ~5s, no GPU){R}")
+print(f"{DIM}branch: experiment/rht-k-sweep on TheTom/llama-cpp-turboquant fork (local){R}")
+print()