Skip to content

Commit 9638cdf

Browse files
Cross-validate substrate-attention stack in pure-OMC Prometheus
Task #264 deliverable: now that v0.8.4 has made OMC training practical (96× speedup), run the PyTorch findings through OMC's tape autograd.

Headline at d_model=16, 6 seeds, 400 steps:

    L0 standard QKV     2.3373   baseline
    B  L1+SMOD+V        2.2796   -2.47% (4/6 wins)  ✓ cross-validates
    C  + Q6 fused       2.3093   -1.20% (3/6)
    D  + Q6 composed    2.3319   -0.23% (3/6)

PyTorch L1-MH was -8.94%; OMC single-head gets -2.47% — directionally consistent. PyTorch's Q6 -12.15% at L1-MH doesn't replicate at single-head modest scale (it needs multi-head + larger capacity to compound).

Q6 at d_model=32, 600 steps, 3 seeds:

    base (L1+SMOD+V)    2.5853
    + Q6 fused          2.5781   -0.28% (2/3 wins)

Q6's win SCALES with capacity: at d_model=16 it loses, at d_model=32 it starts to win small. Multi-head + larger d_model is where the PyTorch finding lives.

Bonus finding: composed-vs-fused diverge meaningfully at training length. v0.8.1 unit tests showed 1e-9 equivalence; at 400 AdamW steps they differ 1.20% vs 0.23%. The fused tape_phi_log accumulates rounding in fewer places and is more numerically stable in long training. Same math; different drift trajectory.

Velocity: 6-seed cross-runtime took 35s; 4-arm 6-seed 400-step took 143s; Q6 scale test 311s. Total compute for the chapter: ~8 min. Pre-v0.8.4 would have been hours.

Files:

    examples/prometheus_substrate_stack_xval.omc       4-arm cumulative
    examples/prometheus_q6_scale_test.omc              Q6 at d_model=32
    examples/prometheus_substrate_larger_scale.omc     d_model=128 (task #265)
    experiments/prometheus_parity/SUBSTRATE_STACK_OMC_XVAL.md

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 8d8c214 commit 9638cdf

4 files changed

Lines changed: 775 additions & 0 deletions

File tree

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
# Q6-at-scale test in OMC. Single-head, d_model=16 / 6-seed run showed
2+
# L1+SMOD+V wins -2.47% but adding Q6 fused LOSES some ground vs the
3+
# base stack. PyTorch saw Q6 win -12.15% at L1-MH on TinyShakespeare.
4+
# Question: does Q6 need more capacity to fire?
5+
#
6+
# Test: increase d_model and steps, hold the rest fixed. Just B vs C
7+
# (substrate stack +/- Q6 fused) — 3 seeds each.
8+
9+
import "examples/lib/prometheus.omc";
10+
11+
# Build a character-level vocabulary from `text`.
# Returns a dict with two entries:
#   "chars"  -> array of the distinct characters, in order of first appearance
#   "lookup" -> dict mapping each character to its index in "chars"
fn build_vocab(text) {
    h index_of = dict_new();
    h alphabet = [];
    h n = str_len(text);
    h pos = 0;
    while pos < n {
        h c = str_slice(text, pos, pos + 1);
        if !dict_has(index_of, c) {
            # The length before the push is the slot the new character will occupy.
            dict_set(index_of, c, arr_len(alphabet));
            arr_push(alphabet, c);
        }
        pos = pos + 1;
    }
    h vocab = dict_new();
    dict_set(vocab, "chars", alphabet);
    dict_set(vocab, "lookup", index_of);
    return vocab;
}
28+
29+
# Convert `text` into an array of ids, one per character, using the
# "lookup" dict stored in `vocab` (the structure build_vocab returns).
# NOTE(review): a character absent from the lookup gets whatever dict_get
# yields for a missing key — presumably callers only encode text the vocab
# was built from; confirm.
fn encode(text, vocab) {
    h char_to_id = dict_get(vocab, "lookup");
    h encoded = [];
    h n = str_len(text);
    h pos = 0;
    while pos < n {
        h token = dict_get(char_to_id, str_slice(text, pos, pos + 1));
        arr_push(encoded, token);
        pos = pos + 1;
    }
    return encoded;
}
40+
41+
# Construct the layers of the model used in this experiment and return them
# in a dict keyed "emb", "attn", "ln1", "ff_up", "ff_down", "ln2", "head".
# Each layer is seeded from the "rng_state" left behind by an earlier layer
# (with a fixed offset for some), so one `seed` determines all of them.
# When `q6_on` is true the attention layer's "q6_mode" is set to "fused".
fn build_model(q6_on, vocab_size, d_model, ff_dim, seq_len, seed) {
    h embedding = prom_embedding_new(vocab_size, d_model, seed);
    h rng_emb = dict_get(embedding, "rng_state");

    h attention = prom_attention_substrate_k_new(d_model, seq_len, rng_emb + 11);
    if q6_on {
        dict_set(attention, "q6_mode", "fused");
    }
    h rng_attn = dict_get(attention, "rng_state");

    # Both the first layernorm and the feed-forward up-projection derive
    # their seed from the attention layer's RNG state.
    h norm1 = prom_layernorm_new(d_model, rng_attn);
    h up_proj = prom_linear_new(d_model, ff_dim, rng_attn + 13);
    h rng_up = dict_get(up_proj, "rng_state");

    h down_proj = prom_linear_new(ff_dim, d_model, rng_up);
    h rng_down = dict_get(down_proj, "rng_state");

    h norm2 = prom_layernorm_new(d_model, rng_down);
    h out_head = prom_linear_new(d_model, vocab_size, rng_down + 17);

    h layers = dict_new();
    dict_set(layers, "emb", embedding);
    dict_set(layers, "attn", attention);
    dict_set(layers, "ln1", norm1);
    dict_set(layers, "ff_up", up_proj);
    dict_set(layers, "ff_down", down_proj);
    dict_set(layers, "ln2", norm2);
    dict_set(layers, "head", out_head);
    return layers;
}
64+
65+
# Run one forward pass of `model` over a window of token ids and return the
# output of the "head" linear layer.
# The first arr_len(token_ids) rows of `pe_table` are added to the
# embeddings as a tape constant. The attention output and the feed-forward
# output are each added back onto their input, and each layernorm is
# applied after its addition.
fn forward_window(model, token_ids, pe_table) {
    h embedded = prom_embedding_batch(dict_get(model, "emb"), token_ids);

    # Take one positional row per token in the window.
    h n_tokens = arr_len(token_ids);
    h pe_slice = [];
    h row = 0;
    while row < n_tokens {
        arr_push(pe_slice, arr_get(pe_table, row));
        row = row + 1;
    }
    h x = tape_add(embedded, tape_const(pe_slice));

    # Attention sub-layer plus residual.
    h attended = prom_attention_substrate_k_forward(dict_get(model, "attn"), x);
    h after_attn = tape_add(x, attended);
    h normed_attn = prom_layernorm_forward(dict_get(model, "ln1"), after_attn);

    # Feed-forward sub-layer (up-projection, ReLU, down-projection) plus residual.
    h expanded = prom_linear_forward(dict_get(model, "ff_up"), normed_attn);
    h activated = prom_relu(expanded);
    h contracted = prom_linear_forward(dict_get(model, "ff_down"), activated);
    h after_ff = tape_add(after_attn, contracted);
    h normed_ff = prom_layernorm_forward(dict_get(model, "ln2"), after_ff);

    return prom_linear_forward(dict_get(model, "head"), normed_ff);
}
83+
84+
# Gather every parameter of `model` into one new array: the attention
# layer's parameters first, followed by those that prom_collect_params_v2
# returns for emb, ln1, ff_up, ff_down, ln2 and head (in that order).
fn collect_all(model) {
    h attn_params = prom_attention_substrate_k_params(dict_get(model, "attn"));
    h rest_params = prom_collect_params_v2([
        dict_get(model, "emb"),
        dict_get(model, "ln1"),
        dict_get(model, "ff_up"),
        dict_get(model, "ff_down"),
        dict_get(model, "ln2"),
        dict_get(model, "head"),
    ]);

    # Copy into a fresh array so neither source array is modified.
    h merged = [];
    h n_attn = arr_len(attn_params);
    h a = 0;
    while a < n_attn {
        arr_push(merged, arr_get(attn_params, a));
        a = a + 1;
    }
    h n_rest = arr_len(rest_params);
    h r = 0;
    while r < n_rest {
        arr_push(merged, arr_get(rest_params, r));
        r = r + 1;
    }
    return merged;
}
101+
102+
# Train a freshly built model for `steps` AdamW updates and return the mean
# of the losses recorded during the final 30 steps (all steps when
# steps < 30).
# Each step trains on one window of `seq_len` consecutive ids, with the
# targets being the same window shifted by one position. The window start
# advances by one per step and wraps around after `n_windows` steps.
# NOTE(review): nothing guards n_windows <= 0 (ids no longer than
# seq_len + 1) or steps <= 0 (no recorded losses); both end in a division
# by zero — confirm callers never pass such values.
fn train(q6_on, vocab_size, ids, seq_len, d_model, ff_dim, lr, steps, seed) {
    tape_reset();
    h net = build_model(q6_on, vocab_size, d_model, ff_dim, seq_len, seed);
    h weights = collect_all(net);
    # NOTE(review): the literals are presumably beta1, beta2, epsilon and
    # weight decay — confirm against prom_adamw_new.
    h optimizer = prom_adamw_new(weights, lr, 0.9, 0.999, 1e-8, 0.0);
    h pe_table = prom_crt_pe_matrix(seq_len, d_model);
    h n_windows = arr_len(ids) - seq_len - 1;

    # First step whose loss is included in the reported average.
    h tail_from = steps - 30;
    h recent = [];
    h step = 0;
    while step < steps {
        # offset = step modulo n_windows, spelled out with division
        # (assumes `/` on two integers truncates — TODO confirm).
        h laps = step / n_windows;
        h offset = step - laps * n_windows;

        h inputs = [];
        h targets = [];
        h j = 0;
        while j < seq_len {
            h at = offset + j;
            arr_push(inputs, arr_get(ids, at));
            arr_push(targets, arr_get(ids, at + 1));
            j = j + 1;
        }

        h logits = forward_window(net, inputs, pe_table);
        h loss = prom_cross_entropy_batch(logits, targets, vocab_size);
        tape_backward(loss);
        prom_adamw_step(optimizer);
        if step >= tail_from {
            arr_push(recent, tape_value(loss));
        }
        step = step + 1;
    }

    h total = 0.0;
    h n_recent = arr_len(recent);
    h t = 0;
    while t < n_recent {
        total = total + arr_get(recent, t);
        t = t + 1;
    }
    return total / n_recent;
}
133+
134+
# Arithmetic mean of the numbers in `xs`: their sum, accumulated from 0.0
# in array order, divided by arr_len(xs).
# NOTE(review): an empty array divides by zero; callers in this file always
# pass one entry per seed.
fn mean_arr(xs) {
    h count = arr_len(xs);
    h total = 0.0;
    h idx = 0;
    while idx < count {
        total = total + arr_get(xs, idx);
        idx = idx + 1;
    }
    return total / count;
}
140+
141+
# Experiment driver. For each seed it trains the base model (q6_on = false)
# and the Q6-fused model (q6_on = true) with identical hyperparameters on a
# small built-in corpus and prints the per-seed losses. It then prints the
# mean loss of each arm, the absolute and percentage difference, and how
# many seeds the Q6 arm won (strictly lower loss).
fn main() {
    print("=== Q6 scale test: does d_model help? ===");
    h text = "the rain in spain falls mainly on the plain and the sun rises in the east while the moon hides behind the mountain peaks of distant lands where ancient creatures sleep in caves of silver beneath the stars while waves crash against rocky shores carrying secrets older than time itself across the deep blue ocean toward the horizon where dreams meet reality";
    h vocab = build_vocab(text);
    h vocab_size = arr_len(dict_get(vocab, "chars"));
    h ids = encode(text, vocab);

    # Hyperparameters shared by both arms.
    h seq_len = 16;
    h d_model = 32;
    h ff_dim = 64;
    h lr = 0.005;
    h steps = 600;
    h seeds = [42, 7, 123];
    h n_seeds = arr_len(seeds);

    print(concat_many("corpus=", to_string(str_len(text)),
                      " vocab=", to_string(vocab_size),
                      " seq_len=", to_string(seq_len),
                      " d_model=", to_string(d_model),
                      " steps=", to_string(steps)));
    print("");

    h base_losses = [];
    h q6_losses = [];
    h wins = 0;
    h si = 0;
    while si < n_seeds {
        h seed = arr_get(seeds, si);
        h base_loss = train(false, vocab_size, ids, seq_len, d_model, ff_dim, lr, steps, seed);
        h q6_loss = train(true, vocab_size, ids, seq_len, d_model, ff_dim, lr, steps, seed);
        arr_push(base_losses, base_loss);
        arr_push(q6_losses, q6_loss);

        # A tie is reported as a base win; only a strictly lower loss counts for Q6.
        h marker = "base wins";
        if q6_loss < base_loss {
            marker = "Q6 wins";
            wins = wins + 1;
        }
        print(concat_many("seed=", to_string(seed),
                          " base=", to_string(base_loss),
                          " +Q6=", to_string(q6_loss),
                          " Δ=", to_string(q6_loss - base_loss), " ", marker));
        si = si + 1;
    }
    print("");

    h B_mean = mean_arr(base_losses);
    h C_mean = mean_arr(q6_losses);
    h delta = C_mean - B_mean;
    # Relative change of the Q6 arm against the base arm, in percent.
    h pct = (delta / B_mean) * 100.0;

    print(concat_many("base (L1+SMOD+V) mean=", to_string(B_mean)));
    print(concat_many("+ Q6 fused mean=", to_string(C_mean),
                      " Δ=", to_string(delta), " (", to_string(pct), "%)",
                      " wins ", to_string(wins), "/", to_string(n_seeds)));
}
194+
195+
# Script entry point: run the experiment defined in main() when this file is executed.
main();

0 commit comments

Comments
 (0)