Skip to content

Commit fd6d33b

Browse files
committed
transformerless_lm: FibGen Pareto bench — K-extension + scale + composed
Three follow-up directions in one bench: (1) K-EXTENSION at d=128: K in {32, 48, 64} with cross mode. The v2 sweep stopped at K=32 (FIBONACCI table extended to 32 entries); extending to K=64 (table now 64 entries) tests whether more Fibonacci-frequency components close the +6.3% gap to dense. (2) SCALE TEST at d=256: dense_crt vs FibGen K=32 cross at 4x the previous scale. FibGen weight storage grows as O(K^2) regardless of d_model, while dense grows as O(d_model^2) -- so the compression ratio scales as (d/K)^2. At d=4096 (LLM scale) and K=32 that is 16000x per layer. If the loss penalty stays single-digit at d=256, the substrate-generated weight basis demonstrably scales. (3) COMPOSED TRANSFORMERLESS at d=128: FibGenTransformerless arch that stacks every validated substrate primitive in one model: - CRT-Fibonacci PE - FibGen embedding (substrate-generated) - Fibonacci-offset sparse attention (only attends to F(k)-distance positions) - FibGen QKV / out weights inside the sparse attention - Zeckendorf-routed FFN (n_specialists=5, routed by token-id Zeckendorf) - FibGen specialist weights - FibGen LM head Lazy-loading data by default. Tests whether all the primitives compose to compound the wins or interfere and lose. models_fibgen.FIBONACCI table extended from 32 to 64 entries via the standard recurrence so K can scale freely. Smoke @ 60 steps (everything under-converged): dense_crt_d128: val=6.54 fibgen_K32_cross_d128: val=3.45 fibgen_K48_cross_d128: val=3.46 fibgen_K64_cross_d128: val=3.48 dense_crt_d256: val=5.25 fibgen_K32_cross_d256: val=3.56 transformerless_K32_cross: val=3.21 (BEST early -- composed primitives warm up faster than any single one) Full 1500-step bench in progress; results in a follow-up commit.
1 parent a3d47c8 commit fd6d33b

2 files changed

Lines changed: 413 additions & 7 deletions

File tree

experiments/transformerless_lm/models_fibgen.py

Lines changed: 195 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -42,13 +42,18 @@
4242
import torch.nn.functional as F
4343

4444

45-
# Extended unique-positive Fibonacci table — 32 entries.
46-
# Previous 16-entry version caused K>16 to silently clamp.
47-
FIBONACCI = [
48-
1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987,
49-
1597, 2584, 4181, 6765, 10946, 17711, 28657, 46368, 75025,
50-
121393, 196418, 317811, 514229, 832040, 1346269, 2178309, 3524578,
51-
]
45+
# Extended unique-positive Fibonacci table — 64 entries.
46+
# Computed by recurrence; large F(k) wrap pseudo-randomly mod small
47+
# dimensions but remain pairwise-distinct, so they still serve as a
48+
# rich basis on weight matrices at d=128-1024.
49+
def _build_fibonacci(n: int) -> list[int]:
    """Return the first ``n`` unique positive Fibonacci numbers.

    The sequence starts ``1, 2, 3, 5, 8, ...`` (the duplicate leading 1 of
    the classical sequence is dropped so every entry is distinct) and is
    extended with the standard recurrence ``F(k) = F(k-1) + F(k-2)``.

    Args:
        n: Number of entries requested. Values <= 0 yield an empty list.

    Returns:
        A strictly increasing list of exactly ``max(n, 0)`` integers.
    """
    out = [1, 2]
    while len(out) < n:
        out.append(out[-1] + out[-2])
    # The two-element seed is longer than the request when n < 2; trim so
    # the result always has exactly the requested length.
    return out[:max(n, 0)]


# Shared 64-entry table used by the FibGen layers in this module.
FIBONACCI = _build_fibonacci(64)
5257

5358

5459
class FibGenLinear(nn.Module):
@@ -198,6 +203,189 @@ def forward(self, x, mask):
198203
return x
199204

200205

206+
class FibGenSparseAttention(nn.Module):
    """Single-head causal attention limited to Fibonacci-distance keys.

    A query at position ``i`` may attend to itself and to the earlier
    positions ``i - f`` for every ``f`` in ``FIBONACCI`` with
    ``f < seq_len``; every other score is masked to ``-inf`` before the
    softmax. The fused Q/K/V projection and the output projection are both
    ``FibGenLinear`` layers.
    """

    def __init__(self, d_model: int, seq_len: int, K: int = 16,
                 mode: str = "separable"):
        """Create the projections and precompute the boolean attention mask.

        Args:
            d_model: Feature width of the input and output.
            seq_len: Side length of the precomputed mask.
            K: Forwarded to ``FibGenLinear``.
            mode: Forwarded to ``FibGenLinear``.
        """
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.qkv = FibGenLinear(d_model, 3 * d_model, K=K, mode=mode)
        self.out = FibGenLinear(d_model, d_model, K=K, mode=mode)

        # delta[i, j] = i - j, i.e. how far key j lies behind query i.
        positions = torch.arange(seq_len)
        delta = positions.unsqueeze(1) - positions.unsqueeze(0)
        # Every position can always see itself, so no softmax row is empty.
        allowed = delta == 0
        for offset in FIBONACCI:
            if offset >= seq_len:
                # Table is increasing: no later offset fits in the window.
                break
            allowed = allowed | (delta == offset)
        self.register_buffer("fib_mask", allowed)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply masked attention to ``x`` (unpacked as ``[B, T, D]``)."""
        _, T, D = x.shape
        q, k, v = self.qkv(x).chunk(3, dim=-1)
        scale = 1.0 / math.sqrt(D)
        logits = (q @ k.transpose(-2, -1)) * scale
        # Crop the precomputed mask to the current length and block the rest.
        blocked = ~self.fib_mask[:T, :T]
        logits = logits.masked_fill(blocked, float("-inf"))
        weights = F.softmax(logits, dim=-1)
        return self.out(weights @ v)
243+
244+
245+
class FibGenRoutedFFN(nn.Module):
    """Zeckendorf-routed FFN where each specialist is FibGen-generated.

    Composes three substrate primitives:
      - ``n_specialists`` specialists, each of inner width
        ``expansion * d_model / n_specialists`` (floored, at least 1)
      - per-token routing by the index of the largest Fibonacci number not
        exceeding the token id, taken modulo ``n_specialists`` (an integer
        lookup table, no learned float router)
      - each specialist is ``FibGenLinear -> GELU -> FibGenLinear``

    Every specialist that owns at least one token in the batch is evaluated
    on the whole input; a mask then keeps only the rows routed to it.
    """

    def __init__(self, d_model: int, n_specialists: int = 5,
                 expansion: int = 4, vocab_size: int = 65,
                 K: int = 16, mode: str = "separable"):
        """Build the specialists and the token-id -> specialist table.

        Args:
            d_model: Input/output feature width.
            n_specialists: Number of specialist MLPs.
            expansion: FFN expansion factor, split across the specialists.
            vocab_size: Number of token ids covered by the routing table.
            K: Forwarded to ``FibGenLinear``.
            mode: Forwarded to ``FibGenLinear``.
        """
        super().__init__()
        self.d_model = d_model
        self.n_specialists = n_specialists
        d_inner = max(1, int(expansion * d_model / n_specialists))
        self.specialists = nn.ModuleList([
            nn.Sequential(
                FibGenLinear(d_model, d_inner, K=K, mode=mode),
                nn.GELU(),
                FibGenLinear(d_inner, d_model, K=K, mode=mode),
            )
            for _ in range(n_specialists)
        ])
        # Routing table from omnimcode-core/src/phi_pi_fib.rs (Zeckendorf-top
        # index of each token id, mod n_specialists).
        route = torch.tensor(
            [self._zeckendorf_top(t) % n_specialists
             for t in range(vocab_size)],
            dtype=torch.long,
        )
        self.register_buffer("route_table", route)

    @staticmethod
    def _zeckendorf_top(n: int) -> int:
        """Return the index of the largest ``FIBONACCI`` entry that is <= ``n``.

        Returns 0 for ``n <= 0``.
        """
        from bisect import bisect_right

        if n <= 0:
            return 0
        # FIBONACCI is strictly increasing and starts at 1, so for n >= 1 the
        # insertion point is at least 1 and the result is a valid index.
        return bisect_right(FIBONACCI, n) - 1

    def forward(self, x: torch.Tensor, token_ids: torch.Tensor) -> torch.Tensor:
        """Route each position of ``x`` through the specialist of its token.

        Args:
            x: Activations; the leading dimensions must line up with
                ``token_ids`` (the caller in this file passes ``[B, T, D]``).
            token_ids: Integer token ids used to index the routing table.

        Returns:
            A tensor shaped like ``x`` holding, per position, the output of
            the specialist selected for that position's token id.
        """
        route_id = self.route_table[token_ids]  # [B, T]
        out = torch.zeros_like(x)
        for idx, spec in enumerate(self.specialists):
            selected = route_id == idx
            if not selected.any():
                # Nobody is routed here: skip the compute entirely (this also
                # leaves the specialist's gradients untouched).
                continue
            y = spec(x)
            # Cast the mask to the activation dtype; a float32 mask would
            # silently promote half/bfloat16 activations to float32.
            out = out + y * selected.unsqueeze(-1).to(y.dtype)
        return out
299+
300+
301+
class FibGenTransformerlessBlock(nn.Module):
    """Pre-norm residual block: Fibonacci-sparse attention, then routed FFN.

    Each sub-layer is applied to a LayerNorm of the running activations and
    its result is added back to them. Both sub-layers build their weights
    from ``FibGenLinear``.
    """

    def __init__(self, d_model: int, seq_len: int, vocab_size: int,
                 K: int = 16, mode: str = "separable",
                 n_specialists: int = 5):
        """Create the attention, the routed FFN and the two layer norms."""
        super().__init__()
        # K and mode are shared by both FibGen-backed sub-layers.
        fibgen_cfg = dict(K=K, mode=mode)
        self.attn = FibGenSparseAttention(d_model, seq_len, **fibgen_cfg)
        self.ff = FibGenRoutedFFN(
            d_model,
            n_specialists=n_specialists,
            vocab_size=vocab_size,
            **fibgen_cfg,
        )
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)

    def forward(self, x, token_ids):
        """Run attention and the token-routed FFN, each with a residual add."""
        attn_out = self.attn(self.ln1(x))
        x = x + attn_out
        # The FFN needs the raw token ids to pick a specialist per position.
        ffn_out = self.ff(self.ln2(x), token_ids)
        return x + ffn_out
319+
320+
321+
class FibGenTransformerless(nn.Module):
    """All-substrate "transformerless" language-model candidate.

    Pipeline, in order of application:
      - token embedding read from a ``FibGenLinear``-generated matrix
      - additive CRT-Fibonacci positional encoding (``FibGenLM._crt_pe``)
      - ``n_blocks`` x ``FibGenTransformerlessBlock`` (Fibonacci-offset
        sparse attention + Zeckendorf-routed FFN, all FibGen weights)
      - final LayerNorm and a ``FibGenLinear`` LM head

    The original author's notes quote -5.4% for the positional encoding,
    -3.2% / 14x FLOPs for the sparse attention and ~100x weight compression
    for each FibGen layer; use ``storage_summary`` to measure the actual
    stored-vs-dense parameter counts of an instance.
    """

    def __init__(self, vocab_size: int, d_model: int, n_blocks: int,
                 seq_len: int, K: int = 16, mode: str = "separable",
                 n_specialists: int = 5):
        """Assemble embedding, positional buffer, blocks, norm and head."""
        super().__init__()
        self.seq_len = seq_len
        self.K = K
        self.mode = mode
        self.embed_gen = FibGenLinear(vocab_size, d_model, K=K, mode=mode,
                                      bias=False)
        self.register_buffer("pe", FibGenLM._crt_pe(seq_len, d_model))
        stack = []
        for _ in range(n_blocks):
            stack.append(
                FibGenTransformerlessBlock(
                    d_model, seq_len, vocab_size, K=K, mode=mode,
                    n_specialists=n_specialists,
                )
            )
        self.blocks = nn.ModuleList(stack)
        self.ln_f = nn.LayerNorm(d_model)
        self.head = FibGenLinear(d_model, vocab_size, K=K, mode=mode,
                                 bias=False)

    def forward(self, token_ids):
        """Map ``token_ids`` (unpacked as ``[B, T]``) to the LM head output."""
        _, T = token_ids.shape
        # The embedding matrix is regenerated on every call; after the
        # transpose, rows are indexed by token id.
        table = self.embed_gen.generate_W().t()
        h = table[token_ids] + self.pe[:T]
        for block in self.blocks:
            h = block(h, token_ids)
        return self.head(self.ln_f(h))

    def storage_summary(self) -> dict:
        """Report stored vs dense-equivalent parameter counts.

        Returns:
            A dict with keys ``"stored"``, ``"dense_equivalent"`` and
            ``"compression"`` (dense-equivalent divided by stored, with the
            divisor clamped to at least 1).
        """
        fib_layers = [m for m in self.modules() if isinstance(m, FibGenLinear)]
        stored = sum(layer.n_stored_params for layer in fib_layers)
        dense_eq = sum(layer.n_dense_equivalent_params for layer in fib_layers)

        # Remaining parameters (LayerNorms etc.) count the same on both sides.
        # Names matching the filters below are skipped -- presumably because
        # the FibGenLinear counters already cover them (TODO confirm).
        skipped_bias_tags = (".embed_gen.bias", ".head.bias",
                             ".qkv.bias", ".out.bias",
                             ".w1.bias", ".w2.bias",
                             ".0.bias", ".2.bias")
        plain = sum(
            param.numel()
            for name, param in self.named_parameters()
            if "seed" not in name
            and not any(tag in name for tag in skipped_bias_tags)
        )
        stored += plain
        dense_eq += plain

        return {"stored": stored, "dense_equivalent": dense_eq,
                "compression": dense_eq / max(stored, 1)}
387+
388+
201389
class FibGenLM(nn.Module):
202390
"""Char-level LM with EVERY linear layer FibGen-generated.
203391

0 commit comments

Comments
 (0)