RandomCoder-lab
diff --git a/‎examples/prometheus_mh_q6_compound.omc‎
Lines changed: 228 additions & 0 deletions b/‎examples/prometheus_mh_q6_compound.omc‎
Lines changed: 228 additions & 0 deletions
@@ -0,0 +1,228 @@
+# MH + Q6 compound test (#3 — validates v0.8.8 finding in multi-head setting).
+#
+# v0.8.5 saw MH at d_model=32 win -0.25% vs SH (single-head). v0.8.8 saw
+# Q6 push attention 8.31x toward substrate positions after training. If
+# Q6 sculpts attention per-head, the MH+Q6 combo should beat plain MH by
+# more than the SH+Q6 combo beat plain SH.
+#
+# Four arms at d_model=32, n_heads=4, 3 seeds, 250 steps:
+#   A. MH off  (substrate-K + S-MOD + V, Q6 off)
+#   B. MH+Q6 fused (same + Q6 fused)
+#   C. SH off  (single-head, Q6 off) — reference
+#   D. SH+Q6 fused — reference
+
+import "examples/lib/prometheus.omc";
+
+# Build a character-level vocabulary from `text`.
+#
+# Every distinct single-position slice of `text` receives an id equal to its
+# first-seen rank (0, 1, 2, ...). Returns a dict with two entries:
+#   "chars"  — array of the distinct slices, indexed by id
+#   "lookup" — dict mapping slice -> id
+fn build_vocab(text) {
+    h id_of = dict_new();
+    h alphabet = [];
+    h n = str_len(text);
+    h pos = 0;
+    while pos < n {
+        h ch = str_slice(text, pos, pos + 1);
+        if !dict_has(id_of, ch) {
+            # The next free id is the current size of the alphabet.
+            h next_id = arr_len(alphabet);
+            dict_set(id_of, ch, next_id);
+            arr_push(alphabet, ch);
+        }
+        pos = pos + 1;
+    }
+    h vocab = dict_new();
+    dict_set(vocab, "chars", alphabet);
+    dict_set(vocab, "lookup", id_of);
+    return vocab;
+}
+
+# Translate `text` into an array of token ids using the "lookup" table stored
+# in `vocab` (the shape build_vocab returns). One entry is emitted per
+# position of `text`; whatever dict_get yields for a slice missing from the
+# table is pushed unchanged.
+fn encode(text, vocab) {
+    h table = dict_get(vocab, "lookup");
+    h n = str_len(text);
+    h encoded = [];
+    h pos = 0;
+    while pos < n {
+        h piece = str_slice(text, pos, pos + 1);
+        h tok = dict_get(table, piece);
+        arr_push(encoded, tok);
+        pos = pos + 1;
+    }
+    return encoded;
+}
+
+# Assemble the layer dicts for one experimental arm.
+#
+# `arm` selects the attention variant:
+#   "SH"   — prom_attention_substrate_k_new (single-head)
+#   "SHQ6" — same, with "q6_mode" set to "fused"
+#   "MH"   — prom_attention_substrate_k_mh_new with n_heads heads
+#   other  — handled as "MHQ6": multi-head with "q6_mode" set to "fused"
+#
+# Seeds are chained through the "rng_state" entry read back from each
+# constructed layer, with fixed offsets (+11, +13, +17) at some stages.
+# Returns a dict with keys "arm", "emb", "attn", "ln1", "ff_up", "ff_down",
+# "ln2" and "head".
+fn build_model(arm, vocab_size, d_model, ff_dim, seq_len, n_heads, seed) {
+    # Split the arm name into two independent switches (1 = on, 0 = off).
+    # Anything that is not SH/SHQ6 is multi-head; anything that is not SH/MH
+    # gets the fused Q6 mode.
+    h multi_head = 1;
+    if arm == "SH" { multi_head = 0; }
+    if arm == "SHQ6" { multi_head = 0; }
+    h q6_fused = 1;
+    if arm == "SH" { q6_fused = 0; }
+    if arm == "MH" { q6_fused = 0; }
+
+    h emb = prom_embedding_new(vocab_size, d_model, seed);
+    h attn_seed = dict_get(emb, "rng_state") + 11;
+    h attn = null;
+    if multi_head == 1 {
+        attn = prom_attention_substrate_k_mh_new(d_model, seq_len, n_heads, attn_seed);
+    } else {
+        attn = prom_attention_substrate_k_new(d_model, seq_len, attn_seed);
+    }
+    if q6_fused == 1 {
+        dict_set(attn, "q6_mode", "fused");
+    }
+
+    # Everything after attention is seeded from the attention's rng_state.
+    h post_attn_seed = dict_get(attn, "rng_state");
+    h ln1 = prom_layernorm_new(d_model, post_attn_seed);
+    h ff_up = prom_linear_new(d_model, ff_dim, post_attn_seed + 13);
+    h ff_down = prom_linear_new(ff_dim, d_model, dict_get(ff_up, "rng_state"));
+    h tail_seed = dict_get(ff_down, "rng_state");
+    h ln2 = prom_layernorm_new(d_model, tail_seed);
+    h head = prom_linear_new(d_model, vocab_size, tail_seed + 17);
+
+    h model = dict_new();
+    dict_set(model, "arm", arm);
+    dict_set(model, "emb", emb);
+    dict_set(model, "attn", attn);
+    dict_set(model, "ln1", ln1);
+    dict_set(model, "ff_up", ff_up);
+    dict_set(model, "ff_down", ff_down);
+    dict_set(model, "ln2", ln2);
+    dict_set(model, "head", head);
+    return model;
+}
+
+# Run the attention forward pass that matches `arm`: the multi-head forward
+# for "MH" and "MHQ6", the single-head forward for every other arm value.
+fn attn_forward(arm, attn, x_id) {
+    h multi_head = 0;
+    if arm == "MH" { multi_head = 1; }
+    if arm == "MHQ6" { multi_head = 1; }
+    if multi_head == 1 {
+        return prom_attention_substrate_k_mh_forward(attn, x_id);
+    }
+    return prom_attention_substrate_k_forward(attn, x_id);
+}
+
+# Return the parameter list of the attention layer, using the multi-head
+# collector for "MH" and "MHQ6" and the single-head collector otherwise.
+fn attn_params(arm, attn) {
+    h multi_head = 0;
+    if arm == "MH" { multi_head = 1; }
+    if arm == "MHQ6" { multi_head = 1; }
+    if multi_head == 1 {
+        return prom_attention_substrate_k_mh_params(attn);
+    }
+    return prom_attention_substrate_k_params(attn);
+}
+
+# Forward pass of the model over a single window of token ids.
+#
+# Pipeline as written: embedding plus rows 0..len-1 of `pe_table` ->
+# attention with residual add -> layernorm -> linear / relu / linear with
+# residual add -> layernorm -> output linear. Returns the result of the final
+# prom_linear_forward on the "head" layer.
+fn forward_window(model, token_ids, pe_table) {
+    h tok_emb = prom_embedding_batch(dict_get(model, "emb"), token_ids);
+
+    # One positional row per token position, taken from the top of pe_table.
+    h n_tok = arr_len(token_ids);
+    h pos_rows = [];
+    h p = 0;
+    while p < n_tok {
+        arr_push(pos_rows, arr_get(pe_table, p));
+        p = p + 1;
+    }
+    h x = tape_add(tok_emb, tape_const(pos_rows));
+
+    # Attention sub-block with residual connection.
+    h attended = attn_forward(dict_get(model, "arm"), dict_get(model, "attn"), x);
+    h resid1 = tape_add(x, attended);
+
+    # Feed-forward sub-block; its residual adds to the pre-norm stream.
+    h normed1 = prom_layernorm_forward(dict_get(model, "ln1"), resid1);
+    h hidden = prom_relu(prom_linear_forward(dict_get(model, "ff_up"), normed1));
+    h ff_out = prom_linear_forward(dict_get(model, "ff_down"), hidden);
+    h resid2 = tape_add(resid1, ff_out);
+
+    h normed2 = prom_layernorm_forward(dict_get(model, "ln2"), resid2);
+    return prom_linear_forward(dict_get(model, "head"), normed2);
+}
+
+# Flatten the model's parameters into one array: the attention parameters
+# first, followed by whatever prom_collect_params_v2 returns for emb, ln1,
+# ff_up, ff_down, ln2 and head (in that order).
+fn collect_all(model) {
+    h from_attn = attn_params(dict_get(model, "arm"), dict_get(model, "attn"));
+
+    h layers = [];
+    arr_push(layers, dict_get(model, "emb"));
+    arr_push(layers, dict_get(model, "ln1"));
+    arr_push(layers, dict_get(model, "ff_up"));
+    arr_push(layers, dict_get(model, "ff_down"));
+    arr_push(layers, dict_get(model, "ln2"));
+    arr_push(layers, dict_get(model, "head"));
+    h from_layers = prom_collect_params_v2(layers);
+
+    h merged = [];
+    h n_attn = arr_len(from_attn);
+    h a = 0;
+    while a < n_attn {
+        arr_push(merged, arr_get(from_attn, a));
+        a = a + 1;
+    }
+    h n_layers = arr_len(from_layers);
+    h b = 0;
+    while b < n_layers {
+        arr_push(merged, arr_get(from_layers, b));
+        b = b + 1;
+    }
+    return merged;
+}
+
+# Train one freshly built model for `steps` optimizer steps and return the
+# mean loss over the final 30 steps (over all steps when steps < 30).
+#
+# Each step takes a window of `seq_len` ids from `ids` as input and the same
+# window shifted by one position as targets. The window start cycles through
+# 0 .. n_windows-1, where n_windows = arr_len(ids) - seq_len - 1.
+# There is no guard for steps == 0 or n_windows <= 0.
+fn train(arm, vocab_size, ids, seq_len, d_model, ff_dim, n_heads, lr, steps, seed) {
+    tape_reset();
+    h model = build_model(arm, vocab_size, d_model, ff_dim, seq_len, n_heads, seed);
+    # NOTE(review): the four trailing constants look like beta1, beta2, eps
+    # and weight decay — confirm against prom_adamw_new in the library.
+    h opt = prom_adamw_new(collect_all(model), lr, 0.9, 0.999, 1e-8, 0.0);
+    h pe_table = prom_crt_pe_matrix(seq_len, d_model);
+    h n_windows = arr_len(ids) - seq_len - 1;
+
+    # Running sum/count of the losses recorded during the tail of training.
+    h tail_from = steps - 30;
+    h tail_sum = 0.0;
+    h tail_count = 0;
+
+    h step = 0;
+    while step < steps {
+        # Wrap-around window start; presumably relies on `/` truncating for
+        # integers so that this equals step modulo n_windows — confirm.
+        h offset = step - (step / n_windows) * n_windows;
+        h inputs = [];
+        h wanted = [];
+        h k = 0;
+        while k < seq_len {
+            h src = offset + k;
+            arr_push(inputs, arr_get(ids, src));
+            arr_push(wanted, arr_get(ids, src + 1));
+            k = k + 1;
+        }
+        h logits = forward_window(model, inputs, pe_table);
+        h loss = prom_cross_entropy_batch(logits, wanted, vocab_size);
+        tape_backward(loss);
+        prom_adamw_step(opt);
+        if step >= tail_from {
+            tail_sum = tail_sum + tape_value(loss);
+            tail_count = tail_count + 1;
+        }
+        step = step + 1;
+    }
+    return tail_sum / tail_count;
+}
+
+# Arithmetic mean of the numbers in `xs`: the elements are summed in index
+# order starting from 0.0 and divided by arr_len(xs). An empty array is not
+# guarded against.
+fn mean_arr(xs) {
+    h count = arr_len(xs);
+    h total = 0.0;
+    h idx = 0;
+    while idx < count {
+        total = total + arr_get(xs, idx);
+        idx = idx + 1;
+    }
+    return total / count;
+}
+
+# Print one comparison line: `tag`, then the absolute change
+# new_mu - base_mu, then that change relative to base_mu in percent.
+# `suffix` is the text emitted after the percentage (e.g. "%)").
+fn report_delta(tag, base_mu, new_mu, suffix) {
+    h delta = new_mu - base_mu;
+    print(concat_many(tag, to_string(delta),
+        " (", to_string(delta / base_mu * 100.0), suffix));
+}
+
+# Run the four arms (SH, SHQ6, MH, MHQ6) over every seed on a fixed toy
+# corpus, print the per-arm mean of the tail losses returned by train, then
+# print the pairwise deltas between arm means.
+fn main() {
+    print("=== MH+Q6 compound test (#3) ===");
+    h text = "the rain in spain falls mainly on the plain and the sun rises in the east while the moon hides behind the mountain peaks of distant lands where ancient creatures sleep in caves of silver beneath the stars";
+    h vocab = build_vocab(text);
+    h vocab_size = arr_len(dict_get(vocab, "chars"));
+    h ids = encode(text, vocab);
+    h seq_len = 16;
+    h d_model = 32;
+    h ff_dim = 64;
+    h n_heads = 4;
+    h lr = 0.005;
+    h steps = 250;
+    h seeds = [42, 7, 123];
+
+    print(concat_many("d_model=", to_string(d_model),
+        "  n_heads=", to_string(n_heads),
+        "  steps=", to_string(steps),
+        "  seeds=", to_string(arr_len(seeds))));
+    print("");
+
+    h arms = ["SH", "SHQ6", "MH", "MHQ6"];
+    # Row labels. The head count is derived from n_heads so the report stays
+    # truthful when the config above is edited, and all labels share a width
+    # of 12 (for a single-digit n_heads) so the "mean=" column lines up.
+    h labels = dict_new();
+    dict_set(labels, "SH",   "SH          ");
+    dict_set(labels, "SHQ6", "SH + Q6     ");
+    dict_set(labels, "MH",   concat_many("MH (", to_string(n_heads), "h)     "));
+    dict_set(labels, "MHQ6", concat_many("MH (", to_string(n_heads), "h) + Q6"));
+
+    # Mean tail loss per arm, computed once and reused by the analysis below.
+    h means = dict_new();
+    h ai = 0;
+    while ai < arr_len(arms) {
+        h arm = arr_get(arms, ai);
+        h losses = [];
+        h si = 0;
+        while si < arr_len(seeds) {
+            h seed = arr_get(seeds, si);
+            h L = train(arm, vocab_size, ids, seq_len, d_model, ff_dim, n_heads, lr, steps, seed);
+            arr_push(losses, L);
+            si = si + 1;
+        }
+        h mu = mean_arr(losses);
+        dict_set(means, arm, mu);
+        print(concat_many(dict_get(labels, arm), "  mean=", to_string(mu)));
+        ai = ai + 1;
+    }
+
+    print("");
+    print("=== compound analysis ===");
+    h sh_mu = dict_get(means, "SH");
+    h shq6_mu = dict_get(means, "SHQ6");
+    h mh_mu = dict_get(means, "MH");
+    h mhq6_mu = dict_get(means, "MHQ6");
+    # Negative deltas mean the second arm reached a lower loss.
+    report_delta("SH→SHQ6  Δ=", sh_mu, shq6_mu, "%)");
+    report_delta("MH→MHQ6  Δ=", mh_mu, mhq6_mu, "%)");
+    report_delta("SH→MH    Δ=", sh_mu, mh_mu, "%)");
+    report_delta("SH→MHQ6  Δ=", sh_mu, mhq6_mu, "%) ← compound");
+}
+
+# Script entry point: run the experiment when this file is executed.
+main();