Skip to content

Commit b66bae7

Browse files
committed
transformerless_lm: v3 — phi-power "fluid Fibonacci" tier basis
The V2 quantization sweep showed:

- per_row scale fixes ~0.24 nats (the standard quantization trick works).
- Reciprocal Fibonacci values barely help: geometric spacing near 0 is still too coarse, because F(k+1)/F(k) is 2 at small k, not phi.
- Both archs plateau around +0.6 to +0.7 nats vs. the 0.1 nat target.
- tied_substrate quantizes BETTER than dense_crt (+0.59 vs. +0.74): the tied weight constraint pushes W toward more substrate-compatible structure during training (Principle A x Principle B synergy).

V3 introduces the "fluid Fibonacci" tier basis the user gestured at: phi_power_tier_values(n) returns {0, ±phi^k} for k centered around 0. The adjacent ratio is EXACTLY phi (not approaching phi asymptotically, as discrete Fibonacci does at small k). This is Binet's continuous limit: F(k) ~= phi^k / sqrt(5), so phi^k is the natural "what Fibonacci wants to be" at all scales, including the small-k regime where discrete F(k) has integer jumps.

The bench now tests all three tier bases:

- fibonacci, no reciprocals (v1)
- fibonacci, with reciprocals (v2)
- phi_power (v3, new)

each at n_tiers in {4, 8, 16, 32} and scale in {per_tensor, per_row}.

Includes the v2 results JSON for reference.
1 parent c44ef19 commit b66bae7

3 files changed

Lines changed: 363 additions & 14 deletions

File tree

experiments/transformerless_lm/models_substrate.py

Lines changed: 38 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,30 @@
5252
FIB_POS_UNIQUE = sorted(set(f for f in FIBONACCI if f > 0))
5353

5454

55+
PHI = (1.0 + 5.0 ** 0.5) / 2.0  # golden ratio φ ≈ 1.61803


def phi_power_tier_values(n_tiers: int) -> list[float]:
    """Continuous Binet limit of Fibonacci tiers: {0, ±φ^k}.

    Since F(k+1)/F(k) → φ, Fibonacci's "true" continuous ratio is φ.
    Tier values {φ^k} have ADJACENT RATIO EXACTLY = φ (not approaching φ
    asymptotically like discrete Fibonacci does at small k).

    The exponent window is centered around φ^0 = 1 so we get both
    reciprocals (small values) and powers (large values) in a single
    smooth geometric series. Exponents run over
    ``range(-(n_tiers // 2), n_tiers - n_tiers // 2)``: for even
    ``n_tiers`` there is one more negative exponent than positive ones;
    for odd ``n_tiers`` the window is symmetric around k = 0.

    For n_tiers=8: positive values = {φ^-4, ..., φ^3}
        ≈ {0.146, 0.236, 0.382, 0.618, 1.0, 1.618, 2.618, 4.236}

    Args:
        n_tiers: number of distinct positive φ^k values. Must be >= 0;
            0 yields the degenerate tier set ``[0.0]``.

    Returns:
        Ascending list of ``2 * n_tiers + 1`` floats: the negated
        positive tiers, then 0.0, then the positive tiers.

    Raises:
        ValueError: if ``n_tiers`` is negative.
    """
    # A negative count would produce an empty range and silently fall
    # through to the zero-only tier set; reject it instead.
    if n_tiers < 0:
        raise ValueError(f"n_tiers must be non-negative, got {n_tiers}")

    half = n_tiers // 2
    k_lo = -half              # most negative exponent (smallest magnitude)
    k_hi = n_tiers - half     # exclusive upper bound on the exponent
    pos = [PHI ** k for k in range(k_lo, k_hi)]
    return sorted([-v for v in pos] + [0.0] + pos)
77+
78+
5579
def fibonacci_tier_values(n_tiers: int, reciprocals: bool = False) -> list[float]:
5680
"""Signed Fibonacci tier values.
5781
@@ -78,7 +102,8 @@ def fibonacci_tier_values(n_tiers: int, reciprocals: bool = False) -> list[float
78102

79103
def fibonacci_tier_snap(W: torch.Tensor, n_tiers: int = 8,
80104
scale: str = "per_tensor",
81-
reciprocals: bool = False) -> tuple[torch.Tensor, int]:
105+
reciprocals: bool = False,
106+
tier_basis: str = "fibonacci") -> tuple[torch.Tensor, int]:
82107
"""Snap each weight in W to its nearest signed-Fibonacci tier value.
83108
84109
Args:
@@ -95,10 +120,13 @@ def fibonacci_tier_snap(W: torch.Tensor, n_tiers: int = 8,
95120
Returns:
96121
(W_quantized, n_unique_values_actually_used_avg)
97122
"""
98-
tier_vals = torch.tensor(
99-
fibonacci_tier_values(n_tiers, reciprocals=reciprocals),
100-
dtype=W.dtype, device=W.device,
101-
) # [n_levels]
123+
if tier_basis == "fibonacci":
124+
tv_list = fibonacci_tier_values(n_tiers, reciprocals=reciprocals)
125+
elif tier_basis == "phi_power":
126+
tv_list = phi_power_tier_values(n_tiers)
127+
else:
128+
raise ValueError(f"unknown tier_basis: {tier_basis}")
129+
tier_vals = torch.tensor(tv_list, dtype=W.dtype, device=W.device) # [n_levels]
102130
max_tier = max(tier_vals.abs().max().item(), 1.0)
103131

104132
if scale == "per_tensor":
@@ -116,7 +144,8 @@ def fibonacci_tier_snap(W: torch.Tensor, n_tiers: int = 8,
116144
if scale == "per_row":
117145
if W.dim() != 2:
118146
# Fall back to per-tensor for 1-D / N-D parameters.
119-
return fibonacci_tier_snap(W, n_tiers, "per_tensor", reciprocals)
147+
return fibonacci_tier_snap(W, n_tiers, "per_tensor",
148+
reciprocals, tier_basis)
120149
abs_max_row = W.abs().max(dim=-1, keepdim=True).values.clamp(min=1e-12) # [out, 1]
121150
s_row = abs_max_row / max_tier # [out, 1]
122151
# For each row, scaled tier set is tier_vals * s_row. We need
@@ -135,6 +164,7 @@ def fibonacci_tier_snap(W: torch.Tensor, n_tiers: int = 8,
135164
def fibonacci_quantize_model(model: torch.nn.Module, n_tiers: int = 8,
136165
scale: str = "per_tensor",
137166
reciprocals: bool = False,
167+
tier_basis: str = "fibonacci",
138168
targets: list[str] = None) -> dict:
139169
"""In-place Fibonacci-tier-snap of model parameters matching `targets`."""
140170
if targets is None:
@@ -146,7 +176,8 @@ def fibonacci_quantize_model(model: torch.nn.Module, n_tiers: int = 8,
146176
continue
147177
with torch.no_grad():
148178
W_q, n_unique = fibonacci_tier_snap(
149-
p.data, n_tiers=n_tiers, scale=scale, reciprocals=reciprocals,
179+
p.data, n_tiers=n_tiers, scale=scale,
180+
reciprocals=reciprocals, tier_basis=tier_basis,
150181
)
151182
p.data.copy_(W_q)
152183
stats["params_quantized"] += p.numel()
Lines changed: 306 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,306 @@
1+
{
2+
"archs": {
3+
"dense_crt": {
4+
"n_params": 801664,
5+
"n_attn_params": 264192,
6+
"val_fp32": 2.439574755728245,
7+
"quantized": {
8+
"n4_nor_per_tensor": {
9+
"n_tiers": 4,
10+
"reciprocals": false,
11+
"scale": "per_tensor",
12+
"val": 7.848854899406433,
13+
"delta": 5.409280143678188,
14+
"params_quantized": 801664,
15+
"avg_unique_tier_values": 5.784313725490196
16+
},
17+
"n8_nor_per_tensor": {
18+
"n_tiers": 8,
19+
"reciprocals": false,
20+
"scale": "per_tensor",
21+
"val": 3.4171799793839455,
22+
"delta": 0.9776052236557007,
23+
"params_quantized": 801664,
24+
"avg_unique_tier_values": 12.254901960784315
25+
},
26+
"n16_nor_per_tensor": {
27+
"n_tiers": 16,
28+
"reciprocals": false,
29+
"scale": "per_tensor",
30+
"val": 3.404290087521076,
31+
"delta": 0.9647153317928314,
32+
"params_quantized": 801664,
33+
"avg_unique_tier_values": 19.96078431372549
34+
},
35+
"n32_nor_per_tensor": {
36+
"n_tiers": 32,
37+
"reciprocals": false,
38+
"scale": "per_tensor",
39+
"val": 3.4040319994091988,
40+
"delta": 0.964457243680954,
41+
"params_quantized": 801664,
42+
"avg_unique_tier_values": 24.235294117647058
43+
},
44+
"n4_nor_per_row": {
45+
"n_tiers": 4,
46+
"reciprocals": false,
47+
"scale": "per_row",
48+
"val": 4.788576230406761,
49+
"delta": 2.3490014746785164,
50+
"params_quantized": 801664,
51+
"avg_unique_tier_values": 5.823529411764706
52+
},
53+
"n8_nor_per_row": {
54+
"n_tiers": 8,
55+
"reciprocals": false,
56+
"scale": "per_row",
57+
"val": 3.1771102994680405,
58+
"delta": 0.7375355437397957,
59+
"params_quantized": 801664,
60+
"avg_unique_tier_values": 12.27450980392157
61+
},
62+
"n16_nor_per_row": {
63+
"n_tiers": 16,
64+
"reciprocals": false,
65+
"scale": "per_row",
66+
"val": 3.2084167823195457,
67+
"delta": 0.768842026591301,
68+
"params_quantized": 801664,
69+
"avg_unique_tier_values": 19.980392156862745
70+
},
71+
"n32_nor_per_row": {
72+
"n_tiers": 32,
73+
"reciprocals": false,
74+
"scale": "per_row",
75+
"val": 3.2085734754800797,
76+
"delta": 0.7689987197518349,
77+
"params_quantized": 801664,
78+
"avg_unique_tier_values": 23.84313725490196
79+
},
80+
"n4_rec_per_tensor": {
81+
"n_tiers": 4,
82+
"reciprocals": true,
83+
"scale": "per_tensor",
84+
"val": 4.2178787142038345,
85+
"delta": 1.7783039584755898,
86+
"params_quantized": 801664,
87+
"avg_unique_tier_values": 9.07843137254902
88+
},
89+
"n8_rec_per_tensor": {
90+
"n_tiers": 8,
91+
"reciprocals": true,
92+
"scale": "per_tensor",
93+
"val": 3.3867647871375084,
94+
"delta": 0.9471900314092636,
95+
"params_quantized": 801664,
96+
"avg_unique_tier_values": 18.07843137254902
97+
},
98+
"n16_rec_per_tensor": {
99+
"n_tiers": 16,
100+
"reciprocals": true,
101+
"scale": "per_tensor",
102+
"val": 3.4041296541690826,
103+
"delta": 0.9645548984408379,
104+
"params_quantized": 801664,
105+
"avg_unique_tier_values": 23.862745098039216
106+
},
107+
"n32_rec_per_tensor": {
108+
"n_tiers": 32,
109+
"reciprocals": true,
110+
"scale": "per_tensor",
111+
"val": 3.4040321484208107,
112+
"delta": 0.9644573926925659,
113+
"params_quantized": 801664,
114+
"avg_unique_tier_values": 24.372549019607842
115+
},
116+
"n4_rec_per_row": {
117+
"n_tiers": 4,
118+
"reciprocals": true,
119+
"scale": "per_row",
120+
"val": 4.287693277001381,
121+
"delta": 1.8481185212731361,
122+
"params_quantized": 801664,
123+
"avg_unique_tier_values": 9.117647058823529
124+
},
125+
"n8_rec_per_row": {
126+
"n_tiers": 8,
127+
"reciprocals": true,
128+
"scale": "per_row",
129+
"val": 3.2172485813498497,
130+
"delta": 0.7776738256216049,
131+
"params_quantized": 801664,
132+
"avg_unique_tier_values": 18.098039215686274
133+
},
134+
"n16_rec_per_row": {
135+
"n_tiers": 16,
136+
"reciprocals": true,
137+
"scale": "per_row",
138+
"val": 3.208352394402027,
139+
"delta": 0.7687776386737823,
140+
"params_quantized": 801664,
141+
"avg_unique_tier_values": 23.352941176470587
142+
},
143+
"n32_rec_per_row": {
144+
"n_tiers": 32,
145+
"reciprocals": true,
146+
"scale": "per_row",
147+
"val": 3.208573505282402,
148+
"delta": 0.7689987495541573,
149+
"params_quantized": 801664,
150+
"avg_unique_tier_values": 23.88235294117647
151+
}
152+
}
153+
},
154+
"tied_substrate": {
155+
"n_params": 668536,
156+
"n_attn_params": 131072,
157+
"val_fp32": 2.592747889459133,
158+
"quantized": {
159+
"n4_nor_per_tensor": {
160+
"n_tiers": 4,
161+
"reciprocals": false,
162+
"scale": "per_tensor",
163+
"val": 6.7923648953437805,
164+
"delta": 4.199617005884647,
165+
"params_quantized": 668536,
166+
"avg_unique_tier_values": 6.373831775700935
167+
},
168+
"n8_nor_per_tensor": {
169+
"n_tiers": 8,
170+
"reciprocals": false,
171+
"scale": "per_tensor",
172+
"val": 3.584361217916012,
173+
"delta": 0.9916133284568787,
174+
"params_quantized": 668536,
175+
"avg_unique_tier_values": 13.598130841121495
176+
},
177+
"n16_nor_per_tensor": {
178+
"n_tiers": 16,
179+
"reciprocals": false,
180+
"scale": "per_tensor",
181+
"val": 3.6512366607785225,
182+
"delta": 1.0584887713193893,
183+
"params_quantized": 668536,
184+
"avg_unique_tier_values": 22.289719626168225
185+
},
186+
"n32_nor_per_tensor": {
187+
"n_tiers": 32,
188+
"reciprocals": false,
189+
"scale": "per_tensor",
190+
"val": 3.6507833153009415,
191+
"delta": 1.0580354258418083,
192+
"params_quantized": 668536,
193+
"avg_unique_tier_values": 26.14018691588785
194+
},
195+
"n4_nor_per_row": {
196+
"n_tiers": 4,
197+
"reciprocals": false,
198+
"scale": "per_row",
199+
"val": 6.571510136127472,
200+
"delta": 3.9787622466683388,
201+
"params_quantized": 668536,
202+
"avg_unique_tier_values": 6.383177570093458
203+
},
204+
"n8_nor_per_row": {
205+
"n_tiers": 8,
206+
"reciprocals": false,
207+
"scale": "per_row",
208+
"val": 3.2470703125,
209+
"delta": 0.6543224230408669,
210+
"params_quantized": 668536,
211+
"avg_unique_tier_values": 13.607476635514018
212+
},
213+
"n16_nor_per_row": {
214+
"n_tiers": 16,
215+
"reciprocals": false,
216+
"scale": "per_row",
217+
"val": 3.183598607778549,
218+
"delta": 0.590850718319416,
219+
"params_quantized": 668536,
220+
"avg_unique_tier_values": 22.299065420560748
221+
},
222+
"n32_nor_per_row": {
223+
"n_tiers": 32,
224+
"reciprocals": false,
225+
"scale": "per_row",
226+
"val": 3.183070808649063,
227+
"delta": 0.59032291918993,
228+
"params_quantized": 668536,
229+
"avg_unique_tier_values": 25.49532710280374
230+
},
231+
"n4_rec_per_tensor": {
232+
"n_tiers": 4,
233+
"reciprocals": true,
234+
"scale": "per_tensor",
235+
"val": 3.734321117401123,
236+
"delta": 1.14157322794199,
237+
"params_quantized": 668536,
238+
"avg_unique_tier_values": 10.037383177570094
239+
},
240+
"n8_rec_per_tensor": {
241+
"n_tiers": 8,
242+
"reciprocals": true,
243+
"scale": "per_tensor",
244+
"val": 3.504885621368885,
245+
"delta": 0.9121377319097519,
246+
"params_quantized": 668536,
247+
"avg_unique_tier_values": 20.233644859813083
248+
},
249+
"n16_rec_per_tensor": {
250+
"n_tiers": 16,
251+
"reciprocals": true,
252+
"scale": "per_tensor",
253+
"val": 3.6509373784065247,
254+
"delta": 1.0581894889473915,
255+
"params_quantized": 668536,
256+
"avg_unique_tier_values": 25.53271028037383
257+
},
258+
"n32_rec_per_tensor": {
259+
"n_tiers": 32,
260+
"reciprocals": true,
261+
"scale": "per_tensor",
262+
"val": 3.650783285498619,
263+
"delta": 1.058035396039486,
264+
"params_quantized": 668536,
265+
"avg_unique_tier_values": 26.1588785046729
266+
},
267+
"n4_rec_per_row": {
268+
"n_tiers": 4,
269+
"reciprocals": true,
270+
"scale": "per_row",
271+
"val": 3.894965998828411,
272+
"delta": 1.302218109369278,
273+
"params_quantized": 668536,
274+
"avg_unique_tier_values": 10.046728971962617
275+
},
276+
"n8_rec_per_row": {
277+
"n_tiers": 8,
278+
"reciprocals": true,
279+
"scale": "per_row",
280+
"val": 3.2067508846521378,
281+
"delta": 0.6140029951930046,
282+
"params_quantized": 668536,
283+
"avg_unique_tier_values": 20.242990654205606
284+
},
285+
"n16_rec_per_row": {
286+
"n_tiers": 16,
287+
"reciprocals": true,
288+
"scale": "per_row",
289+
"val": 3.183425836265087,
290+
"delta": 0.590677946805954,
291+
"params_quantized": 668536,
292+
"avg_unique_tier_values": 24.94392523364486
293+
},
294+
"n32_rec_per_row": {
295+
"n_tiers": 32,
296+
"reciprocals": true,
297+
"scale": "per_row",
298+
"val": 3.183070831000805,
299+
"delta": 0.5903229415416718,
300+
"params_quantized": 668536,
301+
"avg_unique_tier_values": 25.49532710280374
302+
}
303+
}
304+
}
305+
}
306+
}

0 commit comments

Comments
 (0)