transformerless_lm: FibGen v2 — extended Fibonacci table + cross-frequency mode

claude · claude · commit b354c1170378 · 2026-05-21T00:28:48.000Z
Two follow-ups from the v1 result (100x compression at +19% loss floor):

1. FIBONACCI table extended from 16 to 32 entries. v1 silently clamped
   K to 16, so the "K=32" arm was actually K=16. K now scales up to 32;
   values above 32 are still silently clamped to the table length.

2. New "cross" generator mode. v1 used separable components, each of
   which uses the SAME Fibonacci frequency on both axes, giving K
   components for 4*K params per layer (rank at most 2K, since every
   component mixes cos and sin terms on each axis).
   Cross mode uses INDEPENDENT (F_{k_i}, F_{k_j}) frequency pairs, giving
   K^2 cross-frequency components for 4*K^2 params per layer.

Compression at d=128, n_blocks=4 (vs dense 800K params):
   K=16 separable:    8K params (100x), 16 separable components per layer
   K=16 cross:       25K params ( 32x), 256 cross-frequency components
   K=32 separable:    9K params ( 88x), 32 separable components
   K=32 cross:       81K params ( 10x), 1024 cross-frequency components

The benchmark sweeps the 6 default cells of (K, mode) — K in {8, 16, 32}
times mode in {separable, cross} — against the dense_crt baseline to find
the expressivity/compression Pareto frontier. If cross mode breaks through
the +19% loss wall at acceptable compression, the generator-from-seed
thesis has a real competitive case for inference.
diff --git a/experiments/transformerless_lm/models_fibgen.py b/experiments/transformerless_lm/models_fibgen.py
@@ -42,44 +42,66 @@
 import torch.nn.functional as F
 
 
-FIBONACCI = [1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597]
+# Extended unique-positive Fibonacci table — 32 entries.
+# Previous 16-entry version caused K>16 to silently clamp.
+FIBONACCI = [
+    1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987,
+    1597, 2584, 4181, 6765, 10946, 17711, 28657, 46368, 75025,
+    121393, 196418, 317811, 514229, 832040, 1346269, 2178309, 3524578,
+]
 
 
 class FibGenLinear(nn.Module):
     """Drop-in replacement for nn.Linear where W is generated from a seed.
 
+    Two generator modes:
+
+    "separable" (the original): each component uses the SAME Fibonacci
+        frequency on both axes. Generates K terms (rank ≤ 2K).
+            W[i,j] = Σ_k [a_k cos(F_k·i) cos(F_k·j) + ...]
+        Seed: 4·K params.
+
+    "cross" (new): each component uses INDEPENDENT Fibonacci frequencies
+        on the two axes. Generates a full K_i × K_j grid of frequency
+        pairs, so the matrix is a sum of K_i·K_j outer products of
+        single-frequency 1-D bases.
+            W[i,j] = Σ_{k_i, k_j} [a_{k_i k_j} cos(F_{k_i}·i) cos(F_{k_j}·j) + ...]
+        Seed: 4·K² params — the same seed size as separable at
+        K_separable = K², but with the substrate-canonical Fibonacci-coprime
+        structure that makes the basis non-degenerate (Fibonacci frequencies
+        are pairwise substrate-distinguishable). Rank of W is at most 2K.
+
     Args:
         in_features: input dim.
         out_features: output dim.
-        K: number of Fibonacci-frequency components in the generator.
-            Higher K = more capacity, more params. K=16 → 64 params
-            (vs in·out for a stored matrix).
+        K: number of Fibonacci frequencies per axis.
+        mode: "separable" or "cross".
         bias: whether to include a learnable bias vector.
-        init_scale: scales the seed initialization. The generated W has
-            magnitude ~ init_scale · sqrt(4K), so smaller init_scale
-            gives smaller initial weights.
+        init_scale: scales the seed initialization.
     """
 
     def __init__(self, in_features: int, out_features: int, K: int = 16,
+                 mode: str = "separable",
                  bias: bool = True, init_scale: float = 0.1):
         super().__init__()
         self.in_features = in_features
         self.out_features = out_features
         self.K = min(K, len(FIBONACCI))
-        # Seed: 4 coefficients per Fibonacci component (cc, sc, cs, ss).
+        if mode not in ("separable", "cross"):
+            raise ValueError(f"unknown mode: {mode}")
+        self.mode = mode
+        n_components = self.K if mode == "separable" else self.K * self.K
         self.seed = nn.Parameter(
-            torch.randn(self.K, 4) * (init_scale / max(1, math.sqrt(self.K)))
+            torch.randn(n_components, 4) * (init_scale / max(1, math.sqrt(n_components)))
         )
         if bias:
             self.bias = nn.Parameter(torch.zeros(out_features))
         else:
             self.register_parameter("bias", None)
-        # Precompute the cos/sin of position·Fibonacci-frequency for both
-        # axes. These are FIXED — no gradient flows through positions.
+        # Precompute cos/sin position·Fibonacci-frequency tables.
         i_idx = torch.arange(out_features).float()
         j_idx = torch.arange(in_features).float()
         freqs = torch.tensor(FIBONACCI[:self.K], dtype=torch.float)
-        # angles: [out, K], [in, K]
         a_i = 2 * math.pi * i_idx.unsqueeze(1) * freqs.unsqueeze(0) / max(out_features, 1)
         a_j = 2 * math.pi * j_idx.unsqueeze(1) * freqs.unsqueeze(0) / max(in_features, 1)
         self.register_buffer("cos_i", torch.cos(a_i))   # [out, K]
@@ -88,21 +110,24 @@ def __init__(self, in_features: int, out_features: int, K: int = 16,
         self.register_buffer("sin_j", torch.sin(a_j))
 
     def generate_W(self) -> torch.Tensor:
-        # seed: [K, 4] → split into 4 [K] tensors.
-        a, b, c, d = self.seed[:, 0], self.seed[:, 1], self.seed[:, 2], self.seed[:, 3]
-        # W = sum_k (
-        #   a_k · cos_i[:, k] · cos_j[:, k]^T +
-        #   b_k · sin_i[:, k] · cos_j[:, k]^T +
-        #   c_k · cos_i[:, k] · sin_j[:, k]^T +
-        #   d_k · sin_i[:, k] · sin_j[:, k]^T
-        # )
-        # Each term is an [out, in] outer product.
-        # Compose via einsum: [out, K] · [K] · [K, in] (with the diagonal)
-        # → [out, in].
-        W = torch.einsum("ok,k,jk->oj", self.cos_i, a, self.cos_j)
-        W = W + torch.einsum("ok,k,jk->oj", self.sin_i, b, self.cos_j)
-        W = W + torch.einsum("ok,k,jk->oj", self.cos_i, c, self.sin_j)
-        W = W + torch.einsum("ok,k,jk->oj", self.sin_i, d, self.sin_j)
+        if self.mode == "separable":
+            a, b, c, d = self.seed[:, 0], self.seed[:, 1], self.seed[:, 2], self.seed[:, 3]
+            W = torch.einsum("ok,k,jk->oj", self.cos_i, a, self.cos_j)
+            W = W + torch.einsum("ok,k,jk->oj", self.sin_i, b, self.cos_j)
+            W = W + torch.einsum("ok,k,jk->oj", self.cos_i, c, self.sin_j)
+            W = W + torch.einsum("ok,k,jk->oj", self.sin_i, d, self.sin_j)
+            return W
+        # mode == "cross": seed shape [K*K, 4], reshape to [K, K, 4]
+        K = self.K
+        seed = self.seed.view(K, K, 4)
+        a, b, c, d = seed[..., 0], seed[..., 1], seed[..., 2], seed[..., 3]
+        # W[i,j] = Σ_{k_i, k_j} [a · cos_i[i, k_i] cos_j[j, k_j] + ...]
+        # einsum: cos_i [out, k_i] @ a [k_i, k_j] -> [out, k_j], then
+        # · cos_j [in, k_j] -> [out, in].
+        W = torch.einsum("ol,lm,jm->oj", self.cos_i, a, self.cos_j)
+        W = W + torch.einsum("ol,lm,jm->oj", self.sin_i, b, self.cos_j)
+        W = W + torch.einsum("ol,lm,jm->oj", self.cos_i, c, self.sin_j)
+        W = W + torch.einsum("ol,lm,jm->oj", self.sin_i, d, self.sin_j)
         return W
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -127,11 +152,11 @@ def n_dense_equivalent_params(self) -> int:
 class FibGenAttention(nn.Module):
     """Single-head self-attention with all linear layers FibGen-generated."""
 
-    def __init__(self, d_model: int, K: int = 16):
+    def __init__(self, d_model: int, K: int = 16, mode: str = "separable"):
         super().__init__()
         self.d_model = d_model
-        self.qkv = FibGenLinear(d_model, 3 * d_model, K=K)
-        self.out = FibGenLinear(d_model, d_model, K=K)
+        self.qkv = FibGenLinear(d_model, 3 * d_model, K=K, mode=mode)
+        self.out = FibGenLinear(d_model, d_model, K=K, mode=mode)
 
     def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
         B, T, D = x.shape
@@ -148,21 +173,22 @@ def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
 class FibGenFeedForward(nn.Module):
     """FFN with FibGen-generated linear layers."""
 
-    def __init__(self, d_model: int, expansion: int = 4, K: int = 16):
+    def __init__(self, d_model: int, expansion: int = 4, K: int = 16,
+                 mode: str = "separable"):
         super().__init__()
         d_inner = d_model * expansion
-        self.w1 = FibGenLinear(d_model, d_inner, K=K)
-        self.w2 = FibGenLinear(d_inner, d_model, K=K)
+        self.w1 = FibGenLinear(d_model, d_inner, K=K, mode=mode)
+        self.w2 = FibGenLinear(d_inner, d_model, K=K, mode=mode)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         return self.w2(F.gelu(self.w1(x)))
 
 
 class FibGenBlock(nn.Module):
-    def __init__(self, d_model: int, K: int = 16):
+    def __init__(self, d_model: int, K: int = 16, mode: str = "separable"):
         super().__init__()
-        self.attn = FibGenAttention(d_model, K=K)
-        self.ff = FibGenFeedForward(d_model, K=K)
+        self.attn = FibGenAttention(d_model, K=K, mode=mode)
+        self.ff = FibGenFeedForward(d_model, K=K, mode=mode)
         self.ln1 = nn.LayerNorm(d_model)
         self.ln2 = nn.LayerNorm(d_model)
 
@@ -184,25 +210,20 @@ class FibGenLM(nn.Module):
     """
 
     def __init__(self, vocab_size: int, d_model: int, n_blocks: int,
-                 seq_len: int, K: int = 16):
+                 seq_len: int, K: int = 16, mode: str = "separable"):
         super().__init__()
         self.seq_len = seq_len
         self.K = K
-        # Embedding implemented as FibGen + index → FibGen produces a
-        # [vocab, d_model] table that we index into.
-        self.embed_gen = FibGenLinear(vocab_size, d_model, K=K, bias=False)
-        # Positional encoding stays CRT-Fibonacci (already substrate-aligned,
-        # and it's a buffer, not a learned weight).
+        self.mode = mode
+        self.embed_gen = FibGenLinear(vocab_size, d_model, K=K, mode=mode,
+                                        bias=False)
         pe = self._crt_pe(seq_len, d_model)
         self.register_buffer("pe", pe)
         self.blocks = nn.ModuleList([
-            FibGenBlock(d_model, K=K) for _ in range(n_blocks)
+            FibGenBlock(d_model, K=K, mode=mode) for _ in range(n_blocks)
         ])
         self.ln_f = nn.LayerNorm(d_model)
-        # Head: FibGen too (or tied with embed — but tied with a generator
-        # means head and embed share the SAME generator seed which forces
-        # a constraint. Pick untied for now to test capacity.)
-        self.head = FibGenLinear(d_model, vocab_size, K=K, bias=False)
+        self.head = FibGenLinear(d_model, vocab_size, K=K, mode=mode, bias=False)
         mask = torch.tril(torch.ones(seq_len, seq_len))
         self.register_buffer("mask", mask)
 
diff --git a/experiments/transformerless_lm/train_fibgen.py b/experiments/transformerless_lm/train_fibgen.py
@@ -106,6 +106,8 @@ def main():
     parser.add_argument("--distractor-frac", type=float, default=0.20)
     parser.add_argument("--K-sweep", type=str, default="8,16,32",
                         help="Comma-separated K values for FibGen.")
+    parser.add_argument("--modes", type=str, default="separable,cross",
+                        help="Comma-separated generator modes.")
     parser.add_argument("--out", type=str, default="results_fibgen.json")
     args = parser.parse_args()
 
@@ -135,16 +137,20 @@ def make_crt():
     results["dense_crt"] = train_one("dense_crt", vocab_size, train_split,
                                        val_split, args, fib_positions, make_crt)
 
-    # 2. FibGen at each K
+    # 2. FibGen at each K x mode
     K_values = [int(k) for k in args.K_sweep.split(",")]
-    for K in K_values:
-        def make_fibgen(K=K):
-            return FibGenLM(vocab_size=vocab_size, d_model=args.d_model,
-                             n_blocks=args.n_blocks, seq_len=args.seq_len, K=K)
-        results[f"fibgen_K{K}"] = train_one(
-            f"fibgen_K{K}", vocab_size, train_split, val_split, args,
-            fib_positions, make_fibgen,
-        )
+    modes = [m.strip() for m in args.modes.split(",")]
+    for mode in modes:
+        for K in K_values:
+            def make_fibgen(K=K, mode=mode):
+                return FibGenLM(vocab_size=vocab_size, d_model=args.d_model,
+                                 n_blocks=args.n_blocks, seq_len=args.seq_len,
+                                 K=K, mode=mode)
+            name = f"fibgen_K{K}_{mode}"
+            results[name] = train_one(
+                name, vocab_size, train_split, val_split, args,
+                fib_positions, make_fibgen,
+            )
 
     # Summary
     print()
@@ -166,24 +172,25 @@ def make_fibgen(K=K):
 
     # Verdict
     base_val = results["dense_crt"]["final_val"]
-    print(f"VERDICT (uniform-random floor: {uniform_floor:.4f}, dense_crt: {base_val:.4f}):")
-    for K in K_values:
-        r = results[f"fibgen_K{K}"]
-        if r["final_val"] < uniform_floor * 0.85:
-            tag = "LEARNED (≤85% of uniform floor)"
-        elif r["final_val"] < uniform_floor * 0.95:
-            tag = "WEAK LEARNING"
-        else:
-            tag = "FAILED (near uniform-random)"
-        # Compute compression
-        dense_eq = 0
-        stored = 0
-        m = FibGenLM(vocab_size=vocab_size, d_model=args.d_model,
-                      n_blocks=args.n_blocks, seq_len=args.seq_len, K=K)
-        ss = m.storage_summary()
-        compr = ss["compression"]
-        print(f"  K={K:>3}: val={r['final_val']:.4f}  "
-              f"compression={compr:.1f}x  → {tag}")
+    print(f"VERDICT (uniform-random floor: {uniform_floor:.4f}, "
+          f"dense_crt: {base_val:.4f}):")
+    for mode in modes:
+        for K in K_values:
+            r = results[f"fibgen_K{K}_{mode}"]
+            if r["final_val"] < uniform_floor * 0.85:
+                tag = "LEARNED"
+            elif r["final_val"] < uniform_floor * 0.95:
+                tag = "WEAK LEARNING"
+            else:
+                tag = "FAILED"
+            m = FibGenLM(vocab_size=vocab_size, d_model=args.d_model,
+                          n_blocks=args.n_blocks, seq_len=args.seq_len,
+                          K=K, mode=mode)
+            ss = m.storage_summary()
+            gap_pct = (r["final_val"] - base_val) / base_val * 100
+            print(f"  K={K:>3} mode={mode:<10}: val={r['final_val']:.4f}  "
+                  f"compr={ss['compression']:5.1f}x  vs_dense={gap_pct:+5.1f}%  "
+                  f"→ {tag}")
 
     out_path = Path(__file__).parent / args.out
     with open(out_path, "w") as f: