|
| 1 | +# Q6-at-scale test in OMC. Single-head, d_model=16 / 6-seed run showed |
| 2 | +# L1+SMOD+V wins -2.47% but adding Q6 fused LOSES some ground vs the |
| 3 | +# base stack. PyTorch saw Q6 win -12.15% at L1-MH on TinyShakespeare. |
| 4 | +# Question: does Q6 need more capacity to fire? |
| 5 | +# |
| 6 | +# Test: increase d_model and steps, hold the rest fixed. Just B vs C |
| 7 | +# (substrate stack +/- Q6 fused) — 3 seeds each. |
| 8 | + |
| 9 | +import "examples/lib/prometheus.omc"; |
| 10 | + |
# Build a character-level vocabulary from `text`.
#
# Walks the text one single-character slice at a time and assigns each
# previously unseen character the next free id (ids follow first-appearance
# order). Returns a dict with two entries:
#   "chars"  - array of the distinct characters, indexed by id
#   "lookup" - dict mapping each character to its id
fn build_vocab(text) {
    h char_to_id = dict_new();
    h id_to_char = [];
    h n = str_len(text);
    h pos = 0;
    while pos < n {
        h c = str_slice(text, pos, pos + 1);
        if !dict_has(char_to_id, c) {
            # The next free id is the number of characters recorded so far.
            h next_id = arr_len(id_to_char);
            dict_set(char_to_id, c, next_id);
            arr_push(id_to_char, c);
        }
        pos = pos + 1;
    }
    h vocab = dict_new();
    dict_set(vocab, "chars", id_to_char);
    dict_set(vocab, "lookup", char_to_id);
    return vocab;
}
| 28 | + |
# Convert `text` into an array of token ids, one per single-character slice,
# using the "lookup" dict stored in `vocab`.
# NOTE(review): a character missing from the lookup is pushed as whatever
# dict_get yields for an absent key - callers are expected to encode only
# text the vocabulary was built from.
fn encode(text, vocab) {
    h table = dict_get(vocab, "lookup");
    h n = str_len(text);
    h out = [];
    h pos = 0;
    while pos < n {
        h c = str_slice(text, pos, pos + 1);
        h id = dict_get(table, c);
        arr_push(out, id);
        pos = pos + 1;
    }
    return out;
}
| 40 | + |
# Assemble the single-block model used by this experiment and return it as a
# dict with keys "emb", "attn", "ln1", "ff_up", "ff_down", "ln2" and "head".
#
# Each layer is constructed in a fixed order; the "rng_state" entry read back
# from the previously built layer (sometimes plus a small constant offset) is
# handed to the next constructor as its last argument, so one `seed` fixes
# the whole initialisation chain.
#
# When `q6_on` is true the attention layer's "q6_mode" entry is set to
# "fused"; otherwise the attention layer is left as constructed.
fn build_model(q6_on, vocab_size, d_model, ff_dim, seq_len, seed) {
    # Token embedding starts the seed chain.
    h embedding = prom_embedding_new(vocab_size, d_model, seed);
    h state_after_emb = dict_get(embedding, "rng_state");

    # Substrate-K attention, optionally switched to fused Q6.
    h attention = prom_attention_substrate_k_new(d_model, seq_len, state_after_emb + 11);
    if q6_on {
        dict_set(attention, "q6_mode", "fused");
    }
    h state_after_attn = dict_get(attention, "rng_state");

    # Post-attention norm and the two feed-forward projections.
    h norm_attn = prom_layernorm_new(d_model, state_after_attn);
    h up_proj = prom_linear_new(d_model, ff_dim, state_after_attn + 13);
    h state_after_up = dict_get(up_proj, "rng_state");
    h down_proj = prom_linear_new(ff_dim, d_model, state_after_up);
    h state_after_down = dict_get(down_proj, "rng_state");

    # Final norm and the projection back to vocabulary size.
    h norm_ff = prom_layernorm_new(d_model, state_after_down);
    h out_head = prom_linear_new(d_model, vocab_size, state_after_down + 17);

    h model = dict_new();
    dict_set(model, "emb", embedding);
    dict_set(model, "attn", attention);
    dict_set(model, "ln1", norm_attn);
    dict_set(model, "ff_up", up_proj);
    dict_set(model, "ff_down", down_proj);
    dict_set(model, "ln2", norm_ff);
    dict_set(model, "head", out_head);
    return model;
}
| 64 | + |
# Run one forward pass of `model` over a window of token ids and return the
# output of the "head" linear layer.
#
# Pipeline: embedding + positional rows -> attention with residual add ->
# layernorm -> feed-forward (up, ReLU, down) with residual add -> layernorm
# -> head.
#
# `pe_table` is indexed by position; its first arr_len(token_ids) rows are
# used, so it must have at least that many rows.
fn forward_window(model, token_ids, pe_table) {
    # Gather the positional rows for positions 0 .. window length - 1.
    h n_tokens = arr_len(token_ids);
    h pos_rows = [];
    h p = 0;
    while p < n_tokens {
        arr_push(pos_rows, arr_get(pe_table, p));
        p = p + 1;
    }

    # Token embeddings plus the (constant, non-trainable) positional rows.
    h tok = prom_embedding_batch(dict_get(model, "emb"), token_ids);
    h pos = tape_const(pos_rows);
    h x_in = tape_add(tok, pos);

    # Attention sub-layer with a residual connection.
    h attended = prom_attention_substrate_k_forward(dict_get(model, "attn"), x_in);
    h resid_attn = tape_add(x_in, attended);

    # Feed-forward sub-layer on the normalised residual stream.
    h normed_attn = prom_layernorm_forward(dict_get(model, "ln1"), resid_attn);
    h hidden = prom_linear_forward(dict_get(model, "ff_up"), normed_attn);
    h activated = prom_relu(hidden);
    h projected = prom_linear_forward(dict_get(model, "ff_down"), activated);
    h resid_ff = tape_add(resid_attn, projected);

    # Final normalisation and projection through the head.
    h normed_ff = prom_layernorm_forward(dict_get(model, "ln2"), resid_ff);
    return prom_linear_forward(dict_get(model, "head"), normed_ff);
}
| 83 | + |
# Gather every parameter of `model` into one flat array: the attention
# layer's parameters first, followed by those of the embedding, both
# layernorms, both feed-forward linears and the head (in that order, as
# returned by prom_collect_params_v2).
fn collect_all(model) {
    h attn_params = prom_attention_substrate_k_params(dict_get(model, "attn"));
    h layers = [
        dict_get(model, "emb"),
        dict_get(model, "ln1"),
        dict_get(model, "ff_up"),
        dict_get(model, "ff_down"),
        dict_get(model, "ln2"),
        dict_get(model, "head"),
    ];
    h rest_params = prom_collect_params_v2(layers);

    h all_params = [];
    h a = 0;
    while a < arr_len(attn_params) {
        arr_push(all_params, arr_get(attn_params, a));
        a = a + 1;
    }
    h r = 0;
    while r < arr_len(rest_params) {
        arr_push(all_params, arr_get(rest_params, r));
        r = r + 1;
    }
    return all_params;
}
| 101 | + |
# Train a freshly built model for `steps` optimiser steps and return the mean
# training loss over the final 30 steps (or over all steps if fewer than 30).
#
# Each step takes a window of `seq_len` consecutive ids starting at
# `step mod n_windows`, with the next-token ids as targets, runs a forward
# pass, back-propagates the cross-entropy loss and applies one AdamW update.
#
# NOTE(review): requires arr_len(ids) > seq_len + 1 and steps >= 1; otherwise
# the window count or the tail length is zero and the divisions below have a
# zero divisor.
fn train(q6_on, vocab_size, ids, seq_len, d_model, ff_dim, lr, steps, seed) {
    tape_reset();
    h net = build_model(q6_on, vocab_size, d_model, ff_dim, seq_len, seed);
    h weights = collect_all(net);
    h optimizer = prom_adamw_new(weights, lr, 0.9, 0.999, 1e-8, 0.0);
    h positional = prom_crt_pe_matrix(seq_len, d_model);

    # Number of window start offsets the step counter cycles through.
    h n_windows = arr_len(ids) - seq_len - 1;
    # Losses are recorded from this step onwards.
    h tail_from = steps - 30;
    h recent_losses = [];

    h step = 0;
    while step < steps {
        # step modulo n_windows, written with the division/multiplication form.
        h wraps = step / n_windows;
        h offset = step - wraps * n_windows;

        # Inputs are ids[offset .. offset+seq_len); targets are shifted by one.
        h inputs = [];
        h expected = [];
        h j = 0;
        while j < seq_len {
            h at = offset + j;
            arr_push(inputs, arr_get(ids, at));
            arr_push(expected, arr_get(ids, at + 1));
            j = j + 1;
        }

        h logits = forward_window(net, inputs, positional);
        h loss = prom_cross_entropy_batch(logits, expected, vocab_size);
        tape_backward(loss);
        prom_adamw_step(optimizer);

        if step >= tail_from {
            arr_push(recent_losses, tape_value(loss));
        }
        step = step + 1;
    }

    # Average the recorded tail losses.
    h total = 0.0;
    h t = 0;
    while t < arr_len(recent_losses) {
        total = total + arr_get(recent_losses, t);
        t = t + 1;
    }
    return total / arr_len(recent_losses);
}
| 133 | + |
# Arithmetic mean of the numbers in `xs`.
# The sum is divided by arr_len(xs), so `xs` is expected to be non-empty.
fn mean_arr(xs) {
    h count = arr_len(xs);
    h total = 0.0;
    h idx = 0;
    while idx < count {
        total = total + arr_get(xs, idx);
        idx = idx + 1;
    }
    return total / count;
}
| 140 | + |
# Experiment driver: for each seed, train the base stack and the same stack
# with Q6 fused on an identical tiny character corpus, print the per-seed
# tail losses, then print the mean losses, the absolute and relative
# difference, and how many seeds Q6 won (strictly lower loss).
fn main() {
    print("=== Q6 scale test: does d_model help? ===");

    # Corpus and character vocabulary.
    h text = "the rain in spain falls mainly on the plain and the sun rises in the east while the moon hides behind the mountain peaks of distant lands where ancient creatures sleep in caves of silver beneath the stars while waves crash against rocky shores carrying secrets older than time itself across the deep blue ocean toward the horizon where dreams meet reality";
    h vocab = build_vocab(text);
    h ids = encode(text, vocab);
    h vocab_size = arr_len(dict_get(vocab, "chars"));

    # Hyper-parameters for this run.
    h seq_len = 16;
    h d_model = 32;
    h ff_dim = 64;
    h lr = 0.005;
    h steps = 600;
    h seeds = [42, 7, 123];
    h n_seeds = arr_len(seeds);

    print(concat_many("corpus=", to_string(str_len(text)),
        " vocab=", to_string(vocab_size),
        " seq_len=", to_string(seq_len),
        " d_model=", to_string(d_model),
        " steps=", to_string(steps)));
    print("");

    # Per-seed comparison: base is trained first, then the Q6 variant.
    h base_losses = [];
    h q6_losses = [];
    h wins = 0;
    h run = 0;
    while run < n_seeds {
        h seed = arr_get(seeds, run);
        h base_loss = train(false, vocab_size, ids, seq_len, d_model, ff_dim, lr, steps, seed);
        h q6_loss = train(true, vocab_size, ids, seq_len, d_model, ff_dim, lr, steps, seed);
        arr_push(base_losses, base_loss);
        arr_push(q6_losses, q6_loss);

        # Q6 only counts as a win on a strictly lower loss.
        h verdict = "base wins";
        if q6_loss < base_loss {
            verdict = "Q6 wins";
            wins = wins + 1;
        }
        print(concat_many("seed=", to_string(seed),
            " base=", to_string(base_loss),
            " +Q6=", to_string(q6_loss),
            " Δ=", to_string(q6_loss - base_loss), " ", verdict));
        run = run + 1;
    }
    print("");

    # Aggregate summary across seeds.
    h base_mean = mean_arr(base_losses);
    h q6_mean = mean_arr(q6_losses);
    h diff = q6_mean - base_mean;
    h diff_pct = (diff / base_mean) * 100.0;
    print(concat_many("base (L1+SMOD+V) mean=", to_string(base_mean)));
    print(concat_many("+ Q6 fused mean=", to_string(q6_mean),
        " Δ=", to_string(diff), " (", to_string(diff_pct), "%)",
        " wins ", to_string(wins), "/", to_string(n_seeds)));
}

main();
0 commit comments