|
| 1 | +# Post-training Q6 sparsity test (v0.8.7 #8 reformulation). |
| 2 | +# |
| 3 | +# v0.8.7 first-look measured attention concentration at random init: |
| 4 | +# 8.36% of softmax mass in 6.84% of substrate-close cells — essentially |
| 5 | +# uniform, hypothesis "sparse via substrate distance" FALSIFIED at init. |
| 6 | +# |
| 7 | +# Reformulation: train a Q6-fused model for N steps, then measure the |
| 8 | +# SAME concentration ratio on the trained q's attention scores. The Q6 |
| 9 | +# modulation explicitly pushes q toward substrate magnitudes; does it |
| 10 | +# also push q toward substrate-aligned POSITIONS? |
| 11 | +# |
| 12 | +# If post-training mass ratio > pre-training (8.36%), substrate sparsity |
| 13 | +# is viable after training. If equal/lower, falsification holds at scale. |
| 14 | + |
| 15 | +import "examples/lib/prometheus.omc"; |
| 16 | + |
# Build a character-level vocabulary from `text`.
#
# Walks the string one `str_slice(text, p, p + 1)` unit at a time and
# assigns each previously unseen unit the next free index.
#
# Returns a dict with two entries:
#   "chars"  - array of the distinct units, in first-seen order
#   "lookup" - dict mapping each unit to its index in "chars"
fn build_vocab(text) {
    h index_of = dict_new();
    h alphabet = [];
    h n = str_len(text);
    h pos = 0;
    while pos < n {
        h sym = str_slice(text, pos, pos + 1);
        if !dict_has(index_of, sym) {
            # The id of a new symbol is the alphabet size before it is appended.
            dict_set(index_of, sym, arr_len(alphabet));
            arr_push(alphabet, sym);
        }
        pos = pos + 1;
    }
    h vocab = dict_new();
    dict_set(vocab, "chars", alphabet);
    dict_set(vocab, "lookup", index_of);
    return vocab;
}
| 31 | + |
# Translate `text` into an array of token ids, one per
# `str_slice(text, p, p + 1)` unit, using the "lookup" dict stored in `vocab`.
#
# Units missing from the lookup are not special-cased: whatever `dict_get`
# yields for an absent key is pushed as-is.
fn encode(text, vocab) {
    h table = dict_get(vocab, "lookup");
    h total = str_len(text);
    h encoded = [];
    h pos = 0;
    while pos < total {
        h tok = dict_get(table, str_slice(text, pos, pos + 1));
        arr_push(encoded, tok);
        pos = pos + 1;
    }
    return encoded;
}
| 43 | + |
# Distance between positions `i` and `j`: the sum, over the moduli
# 5, 8, 13 and 21, of the absolute difference of the two residues.
#
# Residues are computed as n - (n / m) * m.
# NOTE(review): this relies on `/` truncating for integer operands — confirm.
fn substrate_dist(i, j) {
    h moduli = [5, 8, 13, 21];
    h count = arr_len(moduli);
    h total = 0;
    h idx = 0;
    while idx < count {
        h m = arr_get(moduli, idx);
        h ri = i - (i / m) * m;
        h rj = j - (j / m) * m;
        # Add |ri - rj| without needing an abs helper.
        if ri < rj {
            total = total + (rj - ri);
        } else {
            total = total + (ri - rj);
        }
        idx = idx + 1;
    }
    return total;
}
| 59 | + |
# Measure how much of an attention matrix's mass lies on "substrate-close"
# cells, i.e. cells (i, j) with substrate_dist(i, j) <= threshold.
#
# Parameters:
#   attn_val  - array of rows; only the leading seq_len x seq_len block is read
#   seq_len   - number of rows and columns to scan
#   threshold - inclusive substrate-distance cutoff for a "near" cell
#
# Returns a dict with:
#   "mass_frac" - near-cell mass divided by total mass
#   "cell_frac" - number of near cells divided by seq_len * seq_len
#
# Both fractions are reported as 0.0 instead of dividing by zero when the
# scanned block is empty (seq_len == 0) or carries no positive total mass.
fn measure_concentration(attn_val, seq_len, threshold) {
    h mass_near = 0.0;
    h mass_total = 0.0;
    h n_near = 0;
    h i = 0;
    while i < seq_len {
        h row = arr_get(attn_val, i);
        h j = 0;
        while j < seq_len {
            h p = arr_get(row, j);
            mass_total = mass_total + p;
            if substrate_dist(i, j) <= threshold {
                mass_near = mass_near + p;
                n_near = n_near + 1;
            }
            j = j + 1;
        }
        i = i + 1;
    }

    # Guard both divisions: an empty or all-zero matrix has no meaningful ratio.
    h mass_frac = 0.0;
    if mass_total > 0.0 { mass_frac = mass_near / mass_total; }
    h n_cells = seq_len * seq_len;
    h cell_frac = 0.0;
    # `* 1.0` promotes the count so the quotient is not an integer division.
    if n_cells > 0 { cell_frac = n_near * 1.0 / n_cells; }

    h result = dict_new();
    dict_set(result, "mass_frac", mass_frac);
    dict_set(result, "cell_frac", cell_frac);
    return result;
}
| 84 | + |
# Assemble the model: embedding, substrate-K attention, layernorm,
# a two-layer feed-forward (up / down), a second layernorm and an output head.
#
# Each layer is seeded from the "rng_state" left by an earlier layer
# (with fixed offsets 11, 13 and 17 at three points), so the whole model is
# determined by `seed`. When `q6_on` is true the attention layer's "q6_mode"
# is set to "fused"; otherwise it keeps whatever the constructor assigned.
#
# Returns a dict keyed "emb", "attn", "ln1", "ff_up", "ff_down", "ln2", "head".
fn build_model(q6_on, vocab_size, d_model, ff_dim, seq_len, seed) {
    h model = dict_new();

    h embedding = prom_embedding_new(vocab_size, d_model, seed);
    dict_set(model, "emb", embedding);

    h attention = prom_attention_substrate_k_new(
        d_model, seq_len, dict_get(embedding, "rng_state") + 11);
    if q6_on {
        dict_set(attention, "q6_mode", "fused");
    }
    # Store the layer only after it is fully configured.
    dict_set(model, "attn", attention);

    # ln1 and ff_up are both seeded from the attention layer's rng state.
    h after_attn = dict_get(attention, "rng_state");
    h norm_a = prom_layernorm_new(d_model, after_attn);
    dict_set(model, "ln1", norm_a);

    h up_proj = prom_linear_new(d_model, ff_dim, after_attn + 13);
    dict_set(model, "ff_up", up_proj);

    h down_proj = prom_linear_new(ff_dim, d_model, dict_get(up_proj, "rng_state"));
    dict_set(model, "ff_down", down_proj);

    # ln2 and the head are both seeded from ff_down's rng state.
    h after_down = dict_get(down_proj, "rng_state");
    h norm_b = prom_layernorm_new(d_model, after_down);
    dict_set(model, "ln2", norm_b);

    h out_head = prom_linear_new(d_model, vocab_size, after_down + 17);
    dict_set(model, "head", out_head);

    return model;
}
| 108 | + |
# Run one window of token ids through the model and return the head's output.
#
# Pipeline: embedding + the first arr_len(token_ids) rows of `pe_table`
# -> attention with a residual add -> layernorm -> feed-forward (up, ReLU,
# down) with a residual add onto the pre-norm activations -> layernorm -> head.
fn forward_window(model, token_ids, pe_table) {
    h embedded = prom_embedding_batch(dict_get(model, "emb"), token_ids);

    # One positional row per token in the window.
    h n_tokens = arr_len(token_ids);
    h pos_rows = [];
    h p = 0;
    while p < n_tokens {
        arr_push(pos_rows, arr_get(pe_table, p));
        p = p + 1;
    }
    h x = tape_add(embedded, tape_const(pos_rows));

    # Attention sub-block; the residual is taken around the raw input.
    h attended = prom_attention_substrate_k_forward(dict_get(model, "attn"), x);
    h resid_a = tape_add(x, attended);

    # Feed-forward sub-block; the residual skips the layernorm.
    h normed_a = prom_layernorm_forward(dict_get(model, "ln1"), resid_a);
    h hidden = prom_linear_forward(dict_get(model, "ff_up"), normed_a);
    h activated = prom_relu(hidden);
    h projected = prom_linear_forward(dict_get(model, "ff_down"), activated);
    h resid_b = tape_add(resid_a, projected);

    h normed_b = prom_layernorm_forward(dict_get(model, "ln2"), resid_b);
    return prom_linear_forward(dict_get(model, "head"), normed_b);
}
| 124 | + |
# Gather every parameter of the model into one flat array: first the
# attention layer's parameters, then those of the remaining layers as
# returned by prom_collect_params_v2.
fn collect_all(model) {
    h attn_params = prom_attention_substrate_k_params(dict_get(model, "attn"));
    h layers = [
        dict_get(model, "emb"),
        dict_get(model, "ln1"),
        dict_get(model, "ff_up"),
        dict_get(model, "ff_down"),
        dict_get(model, "ln2"),
        dict_get(model, "head"),
    ];
    h layer_params = prom_collect_params_v2(layers);

    h combined = [];
    h n_attn = arr_len(attn_params);
    h a = 0;
    while a < n_attn {
        arr_push(combined, arr_get(attn_params, a));
        a = a + 1;
    }
    h n_rest = arr_len(layer_params);
    h b = 0;
    while b < n_rest {
        arr_push(combined, arr_get(layer_params, b));
        b = b + 1;
    }
    return combined;
}
| 142 | + |
# Compute the attention-weight matrix for the first `seq_len` tokens of `ids`
# and return measure_concentration(...) on it for the given `threshold`.
#
# The attention weights are rebuilt by hand from the attention layer's fields
# ("Q", "K_const", "q6_scale", "q6_gamma", "q6_mode", "smod_alpha") so that
# the weight matrix itself is available rather than the weighted-V output.
# NOTE(review): this is intended to mirror prom_attention_substrate_k_forward;
# confirm it stays in sync with the library implementation.
fn attn_concentration_for(model, ids, seq_len, pe_table, threshold) {
    # Leading window of tokens together with its positional rows.
    h tokens = [];
    h pos_rows = [];
    h t = 0;
    while t < seq_len {
        arr_push(tokens, arr_get(ids, t));
        arr_push(pos_rows, arr_get(pe_table, t));
        t = t + 1;
    }
    h embedded = prom_embedding_batch(dict_get(model, "emb"), tokens);
    h x = tape_add(embedded, tape_const(pos_rows));

    h layer = dict_get(model, "attn");
    h q_weight = dict_get(layer, "Q");
    h k_fixed = dict_get(layer, "K_const");

    # Queries, with the Q6 modulation applied according to the layer's mode.
    h q_raw = tape_matmul(x, q_weight);
    h q_used = prom_q6_modulate(
        q_raw,
        dict_get(layer, "q6_scale"),
        dict_get(layer, "q6_gamma"),
        dict_get(layer, "q6_mode"));

    # Scores against the constant keys, then the substrate softmax.
    h keys_t = tape_transpose(tape_const(k_fixed));
    h logits = tape_matmul(q_used, keys_t);
    h weights = prom_substrate_softmax(logits, dict_get(layer, "smod_alpha"));

    return measure_concentration(tape_value(weights), seq_len, threshold);
}
| 172 | + |
# Build a fresh model, train it for `steps` next-token prediction steps on
# sliding windows of `ids`, then return the attention concentration measured
# on the first window (substrate-distance threshold 5).
#
# Parameters:
#   q6_on      - when true the attention layer runs with "q6_mode" = "fused"
#   vocab_size - number of distinct token ids
#   ids        - encoded training text
#   seq_len    - window length
#   d_model    - model width
#   ff_dim     - feed-forward hidden width
#   lr         - learning rate handed to prom_adamw_new
#   steps      - number of optimizer steps; 0 measures the untrained model
#   seed       - seed for model construction
#
# Precondition: arr_len(ids) >= seq_len, and arr_len(ids) > seq_len whenever
# steps > 0 (each training window also needs one extra token for its targets).
#
# Returns the dict produced by measure_concentration
# ("mass_frac", "cell_frac").
fn train_and_measure(q6_on, vocab_size, ids, seq_len, d_model, ff_dim, lr, steps, seed) {
    tape_reset();
    h model = build_model(q6_on, vocab_size, d_model, ff_dim, seq_len, seed);
    h params = collect_all(model);
    # NOTE(review): trailing arguments are presumably beta1, beta2, epsilon
    # and weight decay — confirm against prom_adamw_new.
    h opt = prom_adamw_new(params, lr, 0.9, 0.999, 1e-8, 0.0);
    h pe_table = prom_crt_pe_matrix(seq_len, d_model);

    # A window starting at `start` reads ids[start] .. ids[start + seq_len]
    # (inputs plus targets shifted by one), so the valid starts are
    # 0 .. arr_len(ids) - seq_len - 1: exactly arr_len(ids) - seq_len windows.
    # The previous count was one short, which skipped the final window and
    # divided by zero when exactly one window was available.
    h n_windows = arr_len(ids) - seq_len;
    h step = 0;
    while step < steps {
        # step mod n_windows: cycle through the windows in order.
        h start = step - (step / n_windows) * n_windows;
        h window = [];
        h targets = [];
        h k = 0;
        while k < seq_len {
            arr_push(window, arr_get(ids, start + k));
            arr_push(targets, arr_get(ids, start + k + 1));
            k = k + 1;
        }
        h logits = forward_window(model, window, pe_table);
        h loss = prom_cross_entropy_batch(logits, targets, vocab_size);
        tape_backward(loss);
        prom_adamw_step(opt);
        step = step + 1;
    }
    return attn_concentration_for(model, ids, seq_len, pe_table, 5);
}
| 199 | + |
# Entry point: measure attention concentration on substrate-close cells for
# an untrained Q6 model, a trained Q6 model and a trained baseline, print the
# three results, and print a verdict for the Q6 model.
#
# The verdict follows the criterion stated in the file header: Q6 training
# only counts as evidence for substrate sparsity if its concentration ratio
# exceeds that of the untrained model measured under identical settings.
fn main() {
    print("=== post-training Q6 sparsity test (#8 reformulation) ===");
    h text = "the rain in spain falls mainly on the plain and the sun rises in the east while the moon hides behind the mountain peaks of distant lands";
    h vocab = build_vocab(text);
    h vocab_size = arr_len(dict_get(vocab, "chars"));
    h ids = encode(text, vocab);
    h seq_len = 32;
    h d_model = 32;
    h ff_dim = 64;
    h lr = 0.005;
    h steps = 1000;

    # Three measurements:
    #   pre-training (0 steps), Q6-trained, baseline-trained
    print("Random q (untrained) reference: ~8.36% mass in 6.84% cells (v0.8.7)");
    print(concat_many("Training ", to_string(steps), " steps each, seq_len=",
                      to_string(seq_len), " (", to_string(seq_len * seq_len), " cells)"));
    print("");

    # steps = 0 skips the training loop, giving the untrained reference with
    # the same seed, window and threshold as the trained Q6 model.
    h pre_result = train_and_measure(true, vocab_size, ids, seq_len, d_model, ff_dim, lr, 0, 42);
    h q6_result = train_and_measure(true, vocab_size, ids, seq_len, d_model, ff_dim, lr, steps, 42);
    h base_result = train_and_measure(false, vocab_size, ids, seq_len, d_model, ff_dim, lr, steps, 42);

    h pre_mass = dict_get(pre_result, "mass_frac");
    h pre_cells = dict_get(pre_result, "cell_frac");
    h q6_mass = dict_get(q6_result, "mass_frac");
    h q6_cells = dict_get(q6_result, "cell_frac");
    h base_mass = dict_get(base_result, "mass_frac");
    h base_cells = dict_get(base_result, "cell_frac");

    # Concentration ratio: 1.0 means mass is spread uniformly over cells.
    h pre_ratio = pre_mass / pre_cells;
    h q6_ratio = q6_mass / q6_cells;
    h base_ratio = base_mass / base_cells;

    print(concat_many("untrained (Q6 fused, 0 steps): ",
                      to_string(pre_mass * 100.0), "% mass in ",
                      to_string(pre_cells * 100.0), "% cells (ratio ",
                      to_string(pre_ratio), ")"));
    print(concat_many("baseline (no Q6) post-train: ",
                      to_string(base_mass * 100.0), "% mass in ",
                      to_string(base_cells * 100.0), "% cells (ratio ",
                      to_string(base_ratio), ")"));
    print(concat_many("Q6 fused post-train: ",
                      to_string(q6_mass * 100.0), "% mass in ",
                      to_string(q6_cells * 100.0), "% cells (ratio ",
                      to_string(q6_ratio), ")"));

    # A ratio that training did not raise above the untrained model cannot
    # be credited to Q6, whatever its absolute value.
    if q6_ratio <= pre_ratio {
        print("→ Q6 training did not raise concentration above the untrained model; reformulation falsified");
    } else if q6_ratio > 1.5 {
        print("→ Q6 IS pushing attention toward substrate positions; sparse kernel viable after training");
    } else if q6_ratio > 1.1 {
        print("→ mild substrate alignment after Q6 training; sparse kernel marginal");
    } else {
        print("→ no substrate position alignment even after Q6 training; reformulation falsified");
    }
}

main();
0 commit comments