Skip to content

Commit c26ace8

Browse files
v0.8.8 four findings: Q6 post-train substrate alignment + JIT fix + 2 falsifications
Following the v0.8.7 sweep, four follow-up experiments produced one major positive finding, one infrastructure fix, and two honest negatives.

Finding 1 (POSITIVE) — Q6 training pushes attention 8.3× toward substrate positions, not just substrate magnitudes:
  baseline (no Q6) post-train: 4.82% mass / 6.84% cells = 0.70× (anti-correlated)
  Q6 fused post-train: 56.80% mass / 6.84% cells = 8.31×
This FLIPS the v0.8.7 #8 falsification. Sparse attention IS viable *after Q6 training* — a sparse kernel computing only substrate-close cells captures 56.8% of attention with 6.84% of compute. Real architecture-level "substrate is the architecture" claim, unlocked as a post-training inference optimization.

Finding 2 (NEGATIVE) — substrate-quant 6-seed verifies as noise:
  f32 baseline mean=2.337
  substrate-quant scale=4096 mean=2.365 (+1.2% worse)
The v0.8.7 single-seed promising loss was seed noise. Training-time substrate quantization is a marginal regression. This doesn't rule out inference-only weight encoding.

Finding 3 (NEGATIVE) — substrate-aware param init falsified:
  baseline uniform random mean=2.502
  substrate-snap scale=1024 mean=2.567 (+2.6%, wins 2/6)
  substrate-snap scale=4096 mean=2.620 (+4.7%, wins 1/6)
Starting on attractors gives less gradient info per step — training trajectory beats starting point.

Finding 4 (POSITIVE, infra) — JIT eligibility audit fix: fn_uses_collections() in omnimcode-codegen skips JIT for any fn whose bytecode touches NewArray/NewDict/ArrayIndex/ArrayLen or whose constant pool contains strings. Skipped fns get an unreachable body so accidental calls trap loudly. OMC_HBIT_JIT=1 runs Prometheus cleanly now. Wall-clock 0.674s vs 0.661s tree-walk (~0.013s JIT init overhead; no win because v0.8.4 already eliminated the orchestration overhead the JIT would have compressed). Bug fix only.

Also adds substrate_snap_matrix builtin and _prom_substrate_random_matrix helper (used by Finding 3). 1111/1111 OMC tests pass.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent dbfb19e commit c26ace8

7 files changed

Lines changed: 916 additions & 0 deletions

File tree

examples/lib/prometheus.omc

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,26 @@ fn _prom_random_matrix(rows, cols, bound, state) {
6767
return out;
6868
}
6969

70+
# v0.8.8 substrate-aware initializer. Builds a matrix exactly like
# _prom_random_matrix and then, when init_scale > 0, replaces the value
# stored on its "node" with substrate_snap_matrix(value, init_scale) —
# per the original author, snapping each cell to the nearest Fibonacci
# attractor at that scale. Hypothesis under test: substrate-aligned
# starting weights may give different (potentially better-regularized)
# training trajectories than uniform random init.
# Companion to the substrate_snap_matrix Rust builtin.
#
# Usage:
#   h W = _prom_substrate_random_matrix(rows, cols, bound, state, 1024.0);
# Any init_scale that is not > 0 (e.g. 0) skips snapping, so the
# _prom_random_matrix result is returned untouched.
fn _prom_substrate_random_matrix(rows, cols, bound, state, init_scale) {
    h result = _prom_random_matrix(rows, cols, bound, state);
    if init_scale > 0.0 {
        h weight_node = dict_get(result, "node");
        # Overwrite the node's value in place with its snapped version.
        tape_set_value(weight_node,
                       substrate_snap_matrix(tape_value(weight_node), init_scale));
    }
    return result;
}
89+
7090
# Same as above but produces a zero-initialized bias row vector.
7191
fn _prom_zeros_row(cols) {
7292
h row = [];
Lines changed: 245 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,245 @@
1+
# Post-training Q6 sparsity test (v0.8.7 #8 reformulation).
2+
#
3+
# v0.8.7 first-look measured attention concentration at random init:
4+
# 8.36% of softmax mass in 6.84% of substrate-close cells — essentially
5+
# uniform, hypothesis "sparse via substrate distance" FALSIFIED at init.
6+
#
7+
# Reformulation: train a Q6-fused model for N steps, then measure the
8+
# SAME concentration ratio on the trained q's attention scores. The Q6
9+
# modulation explicitly pushes q toward substrate magnitudes; does it
10+
# also push q toward substrate-aligned POSITIONS?
11+
#
12+
# If the post-training mass fraction exceeds the pre-training 8.36%, substrate
13+
# sparsity is viable after training. If it is equal or lower, the falsification holds at scale.
14+
15+
import "examples/lib/prometheus.omc";
16+
17+
# Builds a character vocabulary from `text`.
# Walks the string one str_slice(text, i, i + 1) unit at a time and gives
# each previously unseen unit the next free id (its first-appearance rank).
# Returns a dict with:
#   "chars"  — array of distinct units in first-appearance order
#   "lookup" — dict mapping unit -> id (its index in "chars")
fn build_vocab(text) {
    h index_of = dict_new();
    h symbols = [];
    h n = str_len(text);
    h pos = 0;
    while pos < n {
        h sym = str_slice(text, pos, pos + 1);
        if !dict_has(index_of, sym) {
            # The new id is the current vocabulary size.
            h next_id = arr_len(symbols);
            dict_set(index_of, sym, next_id);
            arr_push(symbols, sym);
        }
        pos = pos + 1;
    }
    h vocab = dict_new();
    dict_set(vocab, "chars", symbols);
    dict_set(vocab, "lookup", index_of);
    return vocab;
}
31+
32+
# Converts `text` into an array of token ids using vocab["lookup"]
# (as produced by build_vocab). One id is appended per
# str_slice(text, i, i + 1) unit, in order.
# NOTE(review): units missing from the lookup are pushed as whatever
# dict_get returns for an absent key — confirm that is acceptable.
fn encode(text, vocab) {
    h table = dict_get(vocab, "lookup");
    h token_ids = [];
    h n = str_len(text);
    h pos = 0;
    while pos < n {
        h sym = str_slice(text, pos, pos + 1);
        h id = dict_get(table, sym);
        arr_push(token_ids, id);
        pos = pos + 1;
    }
    return token_ids;
}
43+
44+
# Substrate distance between positions i and j: the sum, over the moduli
# 5, 8, 13 and 21, of |(i mod m) - (j mod m)|.
# The residue is written as x - (x / m) * m, which presumably relies on
# `/` truncating for integer operands — TODO confirm.
fn substrate_dist(i, j) {
    h bases = [5, 8, 13, 21];
    h n_bases = arr_len(bases);
    h total = 0;
    h idx = 0;
    while idx < n_bases {
        h base = arr_get(bases, idx);
        h ri = i - (i / base) * base;
        h rj = j - (j / base) * base;
        # Add the absolute difference of the two residues.
        if ri < rj {
            total = total + (rj - ri);
        } else {
            total = total + (ri - rj);
        }
        idx = idx + 1;
    }
    return total;
}
59+
60+
# Measures how much of an attention matrix's mass sits in
# "substrate-close" cells.
#
# Parameters:
#   attn_val  — array of rows; only the leading seq_len x seq_len block is read
#   seq_len   — number of rows/columns to scan
#   threshold — a cell (i, j) counts as near when substrate_dist(i, j) <= threshold
#
# Returns a dict with:
#   "mass_frac" — sum of near-cell values / sum of all scanned values
#   "cell_frac" — number of near cells / (seq_len * seq_len)
#
# Degenerate inputs: if the scanned values sum to exactly zero (including
# the seq_len <= 0 case where nothing is scanned) "mass_frac" is 0.0, and
# if seq_len is 0 "cell_frac" is 0.0, instead of dividing by zero.
fn measure_concentration(attn_val, seq_len, threshold) {
    h mass_near = 0.0;
    h mass_total = 0.0;
    h n_near = 0;
    h i = 0;
    while i < seq_len {
        h row = arr_get(attn_val, i);
        h j = 0;
        while j < seq_len {
            h p = arr_get(row, j);
            mass_total = mass_total + p;
            if substrate_dist(i, j) <= threshold {
                mass_near = mass_near + p;
                n_near = n_near + 1;
            }
            j = j + 1;
        }
        i = i + 1;
    }

    # Divide only when the denominator is non-zero; both signs are handled
    # so every input with a non-zero total keeps its original result.
    h mass_frac = 0.0;
    if mass_total > 0.0 {
        mass_frac = mass_near / mass_total;
    } else if mass_total < 0.0 {
        mass_frac = mass_near / mass_total;
    }

    # n_near * 1.0 forces a floating-point division, as in the original.
    h cell_frac = 0.0;
    if seq_len > 0 {
        cell_frac = n_near * 1.0 / (seq_len * seq_len);
    }

    h result = dict_new();
    dict_set(result, "mass_frac", mass_frac);
    dict_set(result, "cell_frac", cell_frac);
    return result;
}
84+
85+
# Assembles the single-block model used by this experiment and returns it
# as a dict with keys "emb", "attn", "ln1", "ff_up", "ff_down", "ln2", "head".
#
# Layers are created in that order. Each layer is seeded from the
# "rng_state" of an earlier one, with fixed offsets (+11 for attention,
# +13 for ff_up, +17 for head) exactly as in the original wiring.
# When q6_on is true the attention layer's "q6_mode" is set to "fused";
# otherwise it keeps whatever prom_attention_substrate_k_new produced.
fn build_model(q6_on, vocab_size, d_model, ff_dim, seq_len, seed) {
    h model = dict_new();

    h embedding = prom_embedding_new(vocab_size, d_model, seed);
    dict_set(model, "emb", embedding);
    h after_emb = dict_get(embedding, "rng_state");

    h attention = prom_attention_substrate_k_new(d_model, seq_len, after_emb + 11);
    if q6_on {
        dict_set(attention, "q6_mode", "fused");
    }
    # Stored only after the optional q6_mode update so the model sees it.
    dict_set(model, "attn", attention);
    h after_attn = dict_get(attention, "rng_state");

    h norm1 = prom_layernorm_new(d_model, after_attn);
    dict_set(model, "ln1", norm1);

    h feed_up = prom_linear_new(d_model, ff_dim, after_attn + 13);
    dict_set(model, "ff_up", feed_up);
    h after_up = dict_get(feed_up, "rng_state");

    h feed_down = prom_linear_new(ff_dim, d_model, after_up);
    dict_set(model, "ff_down", feed_down);
    h after_down = dict_get(feed_down, "rng_state");

    h norm2 = prom_layernorm_new(d_model, after_down);
    dict_set(model, "ln2", norm2);

    h out_head = prom_linear_new(d_model, vocab_size, after_down + 17);
    dict_set(model, "head", out_head);

    return model;
}
108+
109+
# Runs one forward pass of the model over a window of token ids and
# returns the output of the "head" linear layer.
#
# Pipeline: embedding + the first arr_len(token_ids) rows of pe_table
# -> substrate-K attention with a residual add -> layernorm ln1
# -> ff_up -> relu -> ff_down with a residual add -> layernorm ln2 -> head.
fn forward_window(model, token_ids, pe_table) {
    h embedded = prom_embedding_batch(dict_get(model, "emb"), token_ids);

    # One positional-encoding row per token position in the window.
    h pos_rows = [];
    h n_tokens = arr_len(token_ids);
    h t = 0;
    while t < n_tokens {
        arr_push(pos_rows, arr_get(pe_table, t));
        t = t + 1;
    }
    h pos_node = tape_const(pos_rows);
    h x = tape_add(embedded, pos_node);

    # Attention sub-block with residual connection.
    h attended = prom_attention_substrate_k_forward(dict_get(model, "attn"), x);
    h resid1 = tape_add(x, attended);

    # Feed-forward sub-block; its input is normalized, the residual is not.
    h normed1 = prom_layernorm_forward(dict_get(model, "ln1"), resid1);
    h hidden = prom_linear_forward(dict_get(model, "ff_up"), normed1);
    h activated = prom_relu(hidden);
    h projected = prom_linear_forward(dict_get(model, "ff_down"), activated);
    h resid2 = tape_add(resid1, projected);

    h normed2 = prom_layernorm_forward(dict_get(model, "ln2"), resid2);
    return prom_linear_forward(dict_get(model, "head"), normed2);
}
124+
125+
# Gathers every trainable parameter of the model into one flat array:
# the attention layer's parameters first (from
# prom_attention_substrate_k_params), followed by those that
# prom_collect_params_v2 returns for emb, ln1, ff_up, ff_down, ln2, head.
fn collect_all(model) {
    h attn_params = prom_attention_substrate_k_params(dict_get(model, "attn"));
    h layers = [
        dict_get(model, "emb"),
        dict_get(model, "ln1"),
        dict_get(model, "ff_up"),
        dict_get(model, "ff_down"),
        dict_get(model, "ln2"),
        dict_get(model, "head"),
    ];
    h layer_params = prom_collect_params_v2(layers);

    h merged = [];
    h a = 0;
    h n_attn = arr_len(attn_params);
    while a < n_attn {
        arr_push(merged, arr_get(attn_params, a));
        a = a + 1;
    }
    h b = 0;
    h n_layer = arr_len(layer_params);
    while b < n_layer {
        arr_push(merged, arr_get(layer_params, b));
        b = b + 1;
    }
    return merged;
}
142+
143+
# Computes the post-softmax attention matrix for the first seq_len tokens
# of `ids` with the given (trained) model and returns
# measure_concentration(...) of that matrix at the given threshold.
#
# The attention scores are rebuilt by hand here — intended to mirror
# prom_attention_substrate_k_forward — because the attention matrix
# itself is needed, not the final weighted-V output.
fn attn_concentration_for(model, ids, seq_len, pe_table, threshold) {
    # First window of the sequence.
    h tokens = [];
    h t = 0;
    while t < seq_len {
        arr_push(tokens, arr_get(ids, t));
        t = t + 1;
    }

    h embedded = prom_embedding_batch(dict_get(model, "emb"), tokens);

    # Matching positional-encoding rows, one per token.
    h pos_rows = [];
    h n_tokens = arr_len(tokens);
    h p = 0;
    while p < n_tokens {
        arr_push(pos_rows, arr_get(pe_table, p));
        p = p + 1;
    }
    h x = tape_add(embedded, tape_const(pos_rows));

    h layer = dict_get(model, "attn");
    h q_weights = dict_get(layer, "Q");
    h k_fixed = dict_get(layer, "K_const");

    # q -> Q6 modulation -> scores against the transposed constant K.
    h q_raw = tape_matmul(x, q_weights);
    h q_final = prom_q6_modulate(q_raw,
                                 dict_get(layer, "q6_scale"),
                                 dict_get(layer, "q6_gamma"),
                                 dict_get(layer, "q6_mode"));
    h k_transposed = tape_transpose(tape_const(k_fixed));
    h raw_scores = tape_matmul(q_final, k_transposed);

    h probs_node = prom_substrate_softmax(raw_scores, dict_get(layer, "smod_alpha"));
    return measure_concentration(tape_value(probs_node), seq_len, threshold);
}
172+
173+
# Trains a freshly built model for `steps` AdamW steps on sliding
# next-token windows of `ids`, then returns the attention-concentration
# dict from attn_concentration_for, measured with threshold 5.
#
# q6_on is forwarded to build_model. The optimizer uses betas 0.9 / 0.999,
# epsilon 1e-8 and a final argument of 0.0, as passed to prom_adamw_new.
#
# NOTE(review): n_windows = arr_len(ids) - seq_len - 1 means the largest
# index ever read is arr_len(ids) - 2, so the last id is never used as a
# target. Left unchanged so results stay reproducible.
# NOTE(review): if arr_len(ids) <= seq_len + 1, n_windows is <= 0 and the
# window-start computation divides by zero or goes negative.
fn train_and_measure(q6_on, vocab_size, ids, seq_len, d_model, ff_dim, lr, steps, seed) {
    tape_reset();
    h net = build_model(q6_on, vocab_size, d_model, ff_dim, seq_len, seed);
    h optimizer = prom_adamw_new(collect_all(net), lr, 0.9, 0.999, 1e-8, 0.0);
    h pe_table = prom_crt_pe_matrix(seq_len, d_model);
    h n_windows = arr_len(ids) - seq_len - 1;

    h step_idx = 0;
    while step_idx < steps {
        # Window start cycles through 0 .. n_windows - 1 (manual modulo).
        h offset = step_idx - (step_idx / n_windows) * n_windows;

        # Inputs are ids[offset ..], targets are the same span shifted by one.
        h inputs = [];
        h targets = [];
        h pos = 0;
        while pos < seq_len {
            h at = offset + pos;
            arr_push(inputs, arr_get(ids, at));
            arr_push(targets, arr_get(ids, at + 1));
            pos = pos + 1;
        }

        h logits = forward_window(net, inputs, pe_table);
        h loss = prom_cross_entropy_batch(logits, targets, vocab_size);
        tape_backward(loss);
        prom_adamw_step(optimizer);
        step_idx = step_idx + 1;
    }

    h near_threshold = 5;
    return attn_concentration_for(net, ids, seq_len, pe_table, near_threshold);
}
199+
200+
# Entry point for the post-training Q6 sparsity experiment.
# Trains a Q6-fused model and a no-Q6 baseline with identical
# hyperparameters and seed, prints each run's substrate-close mass
# fraction, cell fraction and their ratio, then prints a verdict based on
# the Q6 ratio (> 1.5 viable, > 1.1 marginal, otherwise falsified).
fn main() {
    print("=== post-training Q6 sparsity test (#8 reformulation) ===");
    h text = "the rain in spain falls mainly on the plain and the sun rises in the east while the moon hides behind the mountain peaks of distant lands";
    h vocab = build_vocab(text);
    h vocab_size = arr_len(dict_get(vocab, "chars"));
    h ids = encode(text, vocab);

    # Experiment hyperparameters.
    h seq_len = 32;
    h d_model = 32;
    h ff_dim = 64;
    h lr = 0.005;
    h steps = 1000;
    h seed = 42;

    # Two runs are executed here (Q6-trained and baseline-trained); the
    # untrained figure is only quoted from the v0.8.7 measurement.
    print("Random q (untrained) reference: ~8.36% mass in 6.84% cells (v0.8.7)");
    h n_cells = seq_len * seq_len;
    print(concat_many("Training ", to_string(steps), " steps each, seq_len=",
                      to_string(seq_len), " (", to_string(n_cells), " cells)"));
    print("");

    h q6_result = train_and_measure(true, vocab_size, ids, seq_len, d_model, ff_dim, lr, steps, seed);
    h base_result = train_and_measure(false, vocab_size, ids, seq_len, d_model, ff_dim, lr, steps, seed);

    h q6_mass = dict_get(q6_result, "mass_frac");
    h q6_cells = dict_get(q6_result, "cell_frac");
    h base_mass = dict_get(base_result, "mass_frac");
    h base_cells = dict_get(base_result, "cell_frac");

    h base_ratio = base_mass / base_cells;
    print(concat_many("baseline (no Q6) post-train: ",
                      to_string(base_mass * 100.0), "% mass in ",
                      to_string(base_cells * 100.0), "% cells (ratio ",
                      to_string(base_ratio), ")"));

    # Concentration ratio: share of mass divided by share of cells.
    h q6_ratio = q6_mass / q6_cells;
    print(concat_many("Q6 fused post-train: ",
                      to_string(q6_mass * 100.0), "% mass in ",
                      to_string(q6_cells * 100.0), "% cells (ratio ",
                      to_string(q6_ratio), ")"));

    if q6_ratio > 1.5 {
        print("→ Q6 IS pushing attention toward substrate positions; sparse kernel viable after training");
    } else if q6_ratio > 1.1 {
        print("→ mild substrate alignment after Q6 training; sparse kernel marginal");
    } else {
        print("→ no substrate position alignment even after Q6 training; reformulation falsified");
    }
}

main();

0 commit comments

Comments
 (0)