|
| 1 | +# Substrate-aware backward gradients A/B (task #284, v0.8.10 research item). |
| 2 | +# |
| 3 | +# Forward is identity; backward amplifies gradient components pulling params |
| 4 | +# toward Fibonacci attractors and dampens components pushing away. The |
| 5 | +# substrate as gradient-flow regularizer. |
| 6 | +# |
| 7 | +# Three arms at d_model=32, single-head, 250 steps, 3 seeds: |
| 8 | +# A. baseline plain L1+SMOD+V |
| 9 | +# B. grad_mod wrap Q and V projections with tape_substrate_grad_mod |
| 10 | +# before tape_matmul. Forward UNCHANGED; backward biased. |
| 11 | +# C. + Q6 grad_mod + Q6 fused, to see if substrate-shaped backward |
| 12 | +# compounds with substrate-shaped forward modulation. |
| 13 | +# |
| 14 | +# Hypothesis: gradient bias toward attractors might regularize Q/V like |
| 15 | +# substrate-init was supposed to — but at TRAINING time instead of init, |
| 16 | +# which lets parameters drift if the loss landscape pulls hard. |
| 17 | + |
| 18 | +import "examples/lib/prometheus.omc"; |
| 19 | + |
# Build a vocabulary from `text`.
#
# The text is scanned left to right in one-element slices (str_slice with
# [pos, pos + 1)). Every slice that has not been seen before is given the
# next free id, so ids follow order of first appearance.
#
# Returns a dict with two entries:
#   "chars"  -> array of the distinct slices, positioned by id
#   "lookup" -> dict mapping each distinct slice to its id
fn build_vocab(text) {
    h char_to_id = dict_new();
    h id_to_char = [];
    h n_text = str_len(text);
    h pos = 0;
    while pos < n_text {
        h c = str_slice(text, pos, pos + 1);
        if !dict_has(char_to_id, c) {
            # The new id is the array length *before* the push below.
            dict_set(char_to_id, c, arr_len(id_to_char));
            arr_push(id_to_char, c);
        }
        pos = pos + 1;
    }

    h vocab = dict_new();
    dict_set(vocab, "chars", id_to_char);
    dict_set(vocab, "lookup", char_to_id);
    return vocab;
}
| 34 | + |
# Translate `text` into an array of ids using the "lookup" dict of `vocab`.
#
# The text is scanned in one-element slices; for each slice the value stored
# under it in vocab["lookup"] is appended to the result. Whatever dict_get
# yields for a slice that is not in the lookup is appended as-is (no check
# is made here).
#
# Returns the array of looked-up ids, one per slice of `text`.
fn encode(text, vocab) {
    h char_to_id = dict_get(vocab, "lookup");
    h n_text = str_len(text);
    h encoded = [];
    h pos = 0;
    while pos < n_text {
        h c = str_slice(text, pos, pos + 1);
        h tok = dict_get(char_to_id, c);
        arr_push(encoded, tok);
        pos = pos + 1;
    }
    return encoded;
}
| 46 | + |
# Attention forward pass with substrate gradient modulation on Q and V.
#
# The Q and V parameter tape nodes are each passed through
# tape_substrate_grad_mod before the projection matmuls. Per the header of
# this file, that wrapper is intended to leave the forward value unchanged
# and only bias the gradient that flows back into the parameter.
#
# Arguments:
#   layer     dict with entries "Q", "V", "K_const", "smod_alpha" and
#             "v_resample_scale" (a null resample scale is treated as 0.0)
#   x_id      tape node holding the layer input
#   gm_scale  forwarded unchanged to tape_substrate_grad_mod
#   gm_alpha  forwarded unchanged to tape_substrate_grad_mod
#   use_q6    when true, the query projection is additionally passed through
#             prom_q6_modulate(q, 10.0, 0.5, "fused")
#
# Returns the tape node produced by the final attn-times-v matmul.
fn attn_forward_with_grad_mod(layer, x_id, gm_scale, gm_alpha, use_q6) {
    # Plain (non-tape) settings read from the layer.
    h resample_scale = dict_get(layer, "v_resample_scale");
    if resample_scale == null { resample_scale = 0.0; }
    h softmax_alpha = dict_get(layer, "smod_alpha");
    h key_table = dict_get(layer, "K_const");

    # Gradient-modulated views of the Q and V parameters.
    h q_param = tape_substrate_grad_mod(dict_get(layer, "Q"), gm_scale, gm_alpha);
    h v_param = tape_substrate_grad_mod(dict_get(layer, "V"), gm_scale, gm_alpha);

    # Query projection, optionally Q6-modulated.
    h queries = tape_matmul(x_id, q_param);
    if use_q6 {
        queries = prom_q6_modulate(queries, 10.0, 0.5, "fused");
    }

    # Value projection followed by substrate resampling.
    h values_raw = tape_matmul(x_id, v_param);
    h values = prom_substrate_resample(values_raw, resample_scale);

    # Keys come from a constant table (not a trained projection here).
    h keys = tape_const(key_table);
    h keys_t = tape_transpose(keys);

    h logits = tape_matmul(queries, keys_t);
    h weights = prom_substrate_softmax(logits, softmax_alpha);
    return tape_matmul(weights, values);
}
| 76 | + |
# Construct all layers of the model and return them in a dict.
#
# Seeds are threaded through the constructors: each stage is seeded from the
# "rng_state" entry of an earlier layer, with fixed offsets (+11, +13, +17)
# at three of the stages. The layernorm layers are seeded but their own
# rng_state is never read back.
#
# Arguments:
#   arm         arm name, stored under "arm" and consulted by forward_window
#   vocab_size  embedding rows and output width of the head
#   d_model     model width
#   ff_dim      hidden width of the feed-forward pair
#   seq_len     passed to prom_attention_substrate_k_new
#   seed        seed for the embedding; all other seeds derive from it
#
# Returns a dict with entries "arm", "emb", "attn", "ln1", "ff_up",
# "ff_down", "ln2" and "head".
fn build_model(arm, vocab_size, d_model, ff_dim, seq_len, seed) {
    h embedding = prom_embedding_new(vocab_size, d_model, seed);
    h rng_after_emb = dict_get(embedding, "rng_state");

    h attention = prom_attention_substrate_k_new(d_model, seq_len, rng_after_emb + 11);
    h rng_after_attn = dict_get(attention, "rng_state");

    h norm_a = prom_layernorm_new(d_model, rng_after_attn);
    h up_proj = prom_linear_new(d_model, ff_dim, rng_after_attn + 13);
    h rng_after_up = dict_get(up_proj, "rng_state");

    h down_proj = prom_linear_new(ff_dim, d_model, rng_after_up);
    h rng_after_down = dict_get(down_proj, "rng_state");

    h norm_b = prom_layernorm_new(d_model, rng_after_down);
    h out_head = prom_linear_new(d_model, vocab_size, rng_after_down + 17);

    h model = dict_new();
    dict_set(model, "arm", arm);
    dict_set(model, "emb", embedding);
    dict_set(model, "attn", attention);
    dict_set(model, "ln1", norm_a);
    dict_set(model, "ff_up", up_proj);
    dict_set(model, "ff_down", down_proj);
    dict_set(model, "ln2", norm_b);
    dict_set(model, "head", out_head);
    return model;
}
| 100 | + |
# Run one window of token ids through the model and return the head output.
#
# Pipeline: embedding + positional rows -> attention (arm-dependent) ->
# residual add -> layernorm -> feed-forward (up, relu, down) -> residual add
# -> layernorm -> output head.
#
# Arm selection:
#   "baseline"  uses prom_attention_substrate_k_forward
#   "gradmod"   uses attn_forward_with_grad_mod with use_q6 = false
#   otherwise   uses attn_forward_with_grad_mod with use_q6 = true
#               (this is the "gradmod_q6" arm; any unrecognised arm name
#               also lands here)
#
# Arguments:
#   model      dict produced by build_model
#   token_ids  array of token ids for this window
#   pe_table   array of positional rows; row i is added at position i
#
# Returns the tape node of the head's output.
fn forward_window(model, token_ids, pe_table) {
    h arm = dict_get(model, "arm");
    h x = prom_embedding_batch(dict_get(model, "emb"), token_ids);

    # Take the first arr_len(token_ids) rows of the positional table.
    h n_tokens = arr_len(token_ids);
    h pe_rows = [];
    h row = 0;
    while row < n_tokens {
        arr_push(pe_rows, arr_get(pe_table, row));
        row = row + 1;
    }
    h pe_node = tape_const(pe_rows);
    x = tape_add(x, pe_node);

    # Both grad-mod arms share scale 64.0 and alpha 0.5; they differ only in
    # whether the Q6 modulation is switched on.
    h use_q6 = true;
    if arm == "gradmod" { use_q6 = false; }

    h attn_layer = dict_get(model, "attn");
    h attn_out = null;
    if arm == "baseline" {
        attn_out = prom_attention_substrate_k_forward(attn_layer, x);
    } else {
        attn_out = attn_forward_with_grad_mod(attn_layer, x, 64.0, 0.5, use_q6);
    }

    # First residual + norm.
    h resid_a = tape_add(x, attn_out);
    h normed_a = prom_layernorm_forward(dict_get(model, "ln1"), resid_a);

    # Feed-forward pair with relu in between.
    h hidden = prom_linear_forward(dict_get(model, "ff_up"), normed_a);
    h activated = prom_relu(hidden);
    h projected = prom_linear_forward(dict_get(model, "ff_down"), activated);

    # Second residual + norm, then the output head.
    h resid_b = tape_add(resid_a, projected);
    h normed_b = prom_layernorm_forward(dict_get(model, "ln2"), resid_b);
    return prom_linear_forward(dict_get(model, "head"), normed_b);
}
| 125 | + |
# Gather every parameter of the model into one flat array.
#
# Attention parameters (from prom_attention_substrate_k_params) come first,
# followed by whatever prom_collect_params_v2 returns for the remaining
# layers, listed in the order emb, ln1, ff_up, ff_down, ln2, head.
#
# Returns a new array; the two source arrays are only read.
fn collect_all(model) {
    h attn_params = prom_attention_substrate_k_params(dict_get(model, "attn"));
    h rest_params = prom_collect_params_v2([
        dict_get(model, "emb"),
        dict_get(model, "ln1"),
        dict_get(model, "ff_up"),
        dict_get(model, "ff_down"),
        dict_get(model, "ln2"),
        dict_get(model, "head")
    ]);

    h merged = [];

    h n_attn = arr_len(attn_params);
    h idx = 0;
    while idx < n_attn {
        arr_push(merged, arr_get(attn_params, idx));
        idx = idx + 1;
    }

    h n_rest = arr_len(rest_params);
    idx = 0;
    while idx < n_rest {
        arr_push(merged, arr_get(rest_params, idx));
        idx = idx + 1;
    }

    return merged;
}
| 143 | + |
# Train one model for `steps` steps and return the mean loss over the tail
# of the run.
#
# The tape is reset once, before the model is built. Each step then takes a
# window of `seq_len` ids starting at `start`, uses the ids shifted by one as
# targets, runs forward_window, computes prom_cross_entropy_batch, calls
# tape_backward and takes one prom_adamw_step.
#
# Arguments:
#   arm         arm name handed to build_model
#   vocab_size  number of distinct tokens
#   ids         encoded corpus (array of token ids)
#   seq_len     window length
#   d_model     model width
#   ff_dim      feed-forward hidden width
#   lr          learning rate handed to prom_adamw_new
#   steps       number of optimisation steps
#   seed        seed handed to build_model
#
# Returns the arithmetic mean of the loss values recorded on steps with
# step >= steps - 30.
fn train(arm, vocab_size, ids, seq_len, d_model, ff_dim, lr, steps, seed) {
    tape_reset();
    h model = build_model(arm, vocab_size, d_model, ff_dim, seq_len, seed);
    h all_params = collect_all(model);
    # NOTE(review): trailing arguments look like beta1, beta2, epsilon and
    # weight decay - confirm against prom_adamw_new.
    h optimizer = prom_adamw_new(all_params, lr, 0.9, 0.999, 1e-8, 0.0);
    h pe_table = prom_crt_pe_matrix(seq_len, d_model);

    h n_windows = arr_len(ids) - seq_len - 1;
    h tail_from = steps - 30;
    h tail_losses = [];

    h step = 0;
    while step < steps {
        # Wrap the start offset around n_windows.
        # NOTE(review): this equals step mod n_windows only if `/` on these
        # operands is integer division - confirm.
        h start = step - (step / n_windows) * n_windows;

        # Inputs are ids[start .. start+seq_len); targets are shifted by one.
        h inputs = [];
        h targets = [];
        h offset = 0;
        while offset < seq_len {
            h at = start + offset;
            arr_push(inputs, arr_get(ids, at));
            arr_push(targets, arr_get(ids, at + 1));
            offset = offset + 1;
        }

        h logits = forward_window(model, inputs, pe_table);
        h loss = prom_cross_entropy_batch(logits, targets, vocab_size);
        tape_backward(loss);
        prom_adamw_step(optimizer);

        if step >= tail_from { arr_push(tail_losses, tape_value(loss)); }
        step = step + 1;
    }

    # Mean of the recorded tail losses.
    h n_tail = arr_len(tail_losses);
    h acc = 0.0;
    h idx = 0;
    while idx < n_tail {
        acc = acc + arr_get(tail_losses, idx);
        idx = idx + 1;
    }
    return acc / n_tail;
}
| 174 | + |
# Arithmetic mean of the numbers in `xs`.
#
# Elements are summed front to back starting from 0.0, then the sum is
# divided by arr_len(xs). No special handling is done for an empty array.
fn mean_arr(xs) {
    h count = arr_len(xs);
    h acc = 0.0;
    h idx = 0;
    while idx < count {
        acc = acc + arr_get(xs, idx);
        idx = idx + 1;
    }
    return acc / count;
}
| 180 | + |
# Entry point of the A/B experiment.
#
# Builds a vocabulary from a fixed sentence, trains every arm once per seed
# via train(), prints the mean tail loss of each arm as soon as it finishes,
# and then prints a headline table: for every arm, its mean, the absolute and
# percentage difference to the baseline mean, and on how many seeds its loss
# was strictly below the baseline's loss for the same seed. The baseline arm
# is itself included in the headline table.
fn main() {
    print("=== substrate-aware backward gradients A/B (task #284) ===");

    # Corpus and its encoding.
    h text = "the rain in spain falls mainly on the plain and the sun rises in the east while the moon hides behind the mountain peaks of distant lands";
    h vocab = build_vocab(text);
    h ids = encode(text, vocab);
    h vocab_size = arr_len(dict_get(vocab, "chars"));

    # Hyperparameters shared by all arms.
    h seq_len = 16;
    h d_model = 32;
    h ff_dim = 64;
    h lr = 0.005;
    h steps = 250;
    h seeds = [42, 7, 123];
    h n_seeds = arr_len(seeds);

    print(concat_many("d_model=", to_string(d_model),
                      " steps=", to_string(steps),
                      " seeds=", to_string(n_seeds)));
    print("");

    h arms = ["baseline", "gradmod", "gradmod_q6"];
    h n_arms = arr_len(arms);

    # Display label for each arm.
    h labels = dict_new();
    dict_set(labels, "baseline", "baseline (no gm) ");
    dict_set(labels, "gradmod", "+ substrate gm ");
    dict_set(labels, "gradmod_q6", "+ substrate gm + Q6");

    # Phase 1: train every (arm, seed) pair; results[arm] is the array of
    # per-seed tail losses, in the order of `seeds`.
    h results = dict_new();
    h ai = 0;
    while ai < n_arms {
        h arm_name = arr_get(arms, ai);
        h arm_losses = [];
        h si = 0;
        while si < n_seeds {
            h tail_loss = train(arm_name, vocab_size, ids, seq_len, d_model, ff_dim, lr, steps, arr_get(seeds, si));
            arr_push(arm_losses, tail_loss);
            si = si + 1;
        }
        dict_set(results, arm_name, arm_losses);
        print(concat_many(dict_get(labels, arm_name), " mean=", to_string(mean_arr(arm_losses))));
        ai = ai + 1;
    }

    print("");
    print("=== headline ===");

    # Phase 2: compare every arm against the baseline.
    h base_losses = dict_get(results, "baseline");
    h base_mu = mean_arr(base_losses);
    h row = 0;
    while row < n_arms {
        h name = arr_get(arms, row);
        h row_losses = dict_get(results, name);
        h row_mu = mean_arr(row_losses);
        h delta = row_mu - base_mu;
        h pct = (delta / base_mu) * 100.0;

        # Count seeds on which this arm is strictly better than baseline.
        h wins = 0;
        h seed_i = 0;
        while seed_i < n_seeds {
            if arr_get(row_losses, seed_i) < arr_get(base_losses, seed_i) {
                wins = wins + 1;
            }
            seed_i = seed_i + 1;
        }

        print(concat_many(dict_get(labels, name),
                          " mean=", to_string(row_mu),
                          " Δ=", to_string(delta),
                          " (", to_string(pct), "%)",
                          " wins ", to_string(wins), "/", to_string(n_seeds)));
        row = row + 1;
    }
}

# Run the experiment when the file is executed.
main();
0 commit comments