Skip to content

Commit cf505bb

Browse files
Fix K-frozen attention bug + substrate-attention variants (L0-L3)
Two parallel wins from the K-frozen bug discovery:

1. The tape_transpose Rust builtin fixes the original bug — K's gradient now flows through the score path.
2. Substrate-K, substrate-K+Q, and fully parameter-free attention variants ship as Prometheus layers, testing the deeper hypothesis: can the substrate REPLACE attention?

Rust additions (omnimcode-core/src/interpreter.rs):
  tape_transpose(a_id) -> [cols, rows]
    Forward: swap dimensions. Backward: transpose upstream gradient. ~25 lines total.
  TapeOp::Transpose variant added.

Prometheus additions (examples/lib/prometheus.omc):
  L0 — prom_attention_forward (existing, FIXED)
    Switched from the tape_value/arr_transpose/tape_const hack to tape_transpose. Q·K^T now backpropagates to K properly.
  L1 — prom_attention_substrate_k_*
    K replaced by CRT-PE position memory. Q, V still learned. 2 params per layer instead of 3.
  L2 — prom_attention_substrate_kq_*
    K AND Q both CRT-PE. Only V is learned. 1 param per layer.
  L3 — prom_attention_substrate_full_*
    All three substrate-derived. Q = K = CRT-PE; V = identity (pass-through). ZERO learnable params in the attention block.

Regression tests (examples/tests/test_prometheus.omc):
  test_attention_backward_flows_to_QKV
    Locks the K-fix: ALL of Q, K, V get non-zero gradient after one backward pass. Would have caught the original bug.
  test_tape_transpose_forward
  test_tape_transpose_backward
  test_substrate_k_has_no_K_params
  test_substrate_full_has_zero_params
  All 15 Prometheus tests pass.

A/B experiment (examples/prometheus_attention_4way.omc):
  Trains all four variants (L0, L1, L2, L3) on the same task with the same seeds. Three seeds × 250 steps each. Tests the deepest question OMC can pose: "is attention's expressivity actually needed at this scale, or can the substrate provide enough inductive prior?" Result will land in a separate commit when the run finishes.
Strategic significance: The user's reframe ("why not invent a new K primitive?") shifted the work from "patch the bug" to "ship 4 architecturally distinct attention layers." Either way the bug is gone, but the substrate variants test the harder hypothesis that OMC's substrate doesn't just augment transformer primitives — it can REPLACE them. Whichever variant wins (or loses) the A/B is a real empirical result on the substrate's expressive power at the attention layer.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 8fcf7ee commit cf505bb

4 files changed

Lines changed: 541 additions & 27 deletions

File tree

examples/lib/prometheus.omc

Lines changed: 146 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -640,52 +640,171 @@ fn prom_attention_new(d_model, seq_len, rng_state) {
640640
return layer;
641641
}
642642

643-
# Forward: given x as a tape node of shape [seq_len, d_model],
644-
# returns attention output [seq_len, d_model].
645-
# This is a hand-rolled attention since we don't have a fused
646-
# tape op for it — uses tape_matmul, tape_softmax, and the
647-
# geodesic bias as a const subtracted before softmax.
643+
# Forward (L0 — standard QKV attention with optional geodesic bias):
644+
# given x as a tape node of shape [seq_len, d_model], returns
645+
# attention output [seq_len, d_model].
646+
# Uses tape_transpose so K's gradient flows properly through the
647+
# score path (fixes the earlier K-frozen bug).
648648
fn prom_attention_forward(layer, x_id) {
649649
h Q_w = dict_get(layer, "Q");
650650
h K_w = dict_get(layer, "K");
651651
h V_w = dict_get(layer, "V");
652-
h d_model = dict_get(layer, "d_model");
653652
h seq_len = dict_get(layer, "seq_len");
654653
h alpha = dict_get(layer, "alpha");
655654

656655
h q = tape_matmul(x_id, Q_w);
657656
h k = tape_matmul(x_id, K_w);
658657
h v = tape_matmul(x_id, V_w);
659658

660-
# scores = q @ k^T (we have no tape_transpose so build k^T as a
661-
# const slice of k.value — fine since it just shapes the data).
662-
h k_val = tape_value(k);
663-
h kt_val = arr_transpose(k_val);
664-
h kt = tape_const(kt_val);
659+
# scores = q @ k^T — proper differentiable transpose now.
660+
h kt = tape_transpose(k);
665661
h scores = tape_matmul(q, kt);
666662

667-
# Subtract alpha * geodesic_bias element-wise (as a const matrix).
668-
h bias_matrix = prom_geodesic_bias_matrix(seq_len);
669-
h neg_bias_scaled = [];
670-
h i = 0;
671-
while i < seq_len {
672-
h row = arr_get(bias_matrix, i);
673-
h new_row = [];
674-
h j = 0;
675-
while j < seq_len {
676-
arr_push(new_row, 0.0 - alpha * arr_get(row, j));
677-
j = j + 1;
663+
# Optional geodesic bias subtracted pre-softmax.
664+
if alpha != 0.0 {
665+
h bias_matrix = prom_geodesic_bias_matrix(seq_len);
666+
h neg_bias_scaled = [];
667+
h i = 0;
668+
while i < seq_len {
669+
h row = arr_get(bias_matrix, i);
670+
h new_row = [];
671+
h j = 0;
672+
while j < seq_len {
673+
arr_push(new_row, 0.0 - alpha * arr_get(row, j));
674+
j = j + 1;
675+
}
676+
arr_push(neg_bias_scaled, new_row);
677+
i = i + 1;
678678
}
679-
arr_push(neg_bias_scaled, new_row);
680-
i = i + 1;
679+
h bias_node = tape_const(neg_bias_scaled);
680+
scores = tape_add(scores, bias_node);
681681
}
682-
h bias_node = tape_const(neg_bias_scaled);
683-
h biased = tape_add(scores, bias_node);
684682

685-
h attn = tape_softmax(biased);
683+
h attn = tape_softmax(scores);
686684
return tape_matmul(attn, v);
687685
}
688686

687+
# ---------------------------------------------------------------------------
688+
# Substrate-attention variants (L1, L2, L3).
689+
#
690+
# L0: K, Q, V all learned matrices (prom_attention_forward above).
691+
# L1: K replaced by CRT-PE position memory (no learnable K params).
692+
# L2: K + Q both replaced by substrate-derived projections (only V learned).
693+
# L3: K + Q + V all substrate-derived (parameter-free attention block).
694+
#
695+
# Each tests a stronger version of the hypothesis that OMC's substrate
696+
# can REPLACE learned attention rather than just augment it.
697+
# ---------------------------------------------------------------------------
698+
699+
# L1: substrate-K attention. K is fixed at CRT-PE; Q, V trained.
700+
fn prom_attention_substrate_k_new(d_model, seq_len, rng_state) {
701+
h Q = _prom_random_matrix(d_model, d_model, 0.3, rng_state);
702+
h V = _prom_random_matrix(d_model, d_model, 0.3, dict_get(Q, "state"));
703+
h layer = dict_new();
704+
dict_set(layer, "kind", "attention"); # reuse for param-collect
705+
dict_set(layer, "variant", "substrate_k");
706+
dict_set(layer, "d_model", d_model);
707+
dict_set(layer, "seq_len", seq_len);
708+
dict_set(layer, "Q", dict_get(Q, "node"));
709+
dict_set(layer, "V", dict_get(V, "node"));
710+
# K is precomputed as a [seq_len, d_model] CRT-PE table — no tape_var.
711+
dict_set(layer, "K_const", prom_crt_pe_matrix(seq_len, d_model));
712+
dict_set(layer, "alpha", 0.0);
713+
dict_set(layer, "rng_state", dict_get(V, "state"));
714+
return layer;
715+
}
716+
717+
fn prom_attention_substrate_k_forward(layer, x_id) {
718+
h Q_w = dict_get(layer, "Q");
719+
h V_w = dict_get(layer, "V");
720+
h K_const = dict_get(layer, "K_const");
721+
722+
h q = tape_matmul(x_id, Q_w);
723+
h v = tape_matmul(x_id, V_w);
724+
725+
# K is the substrate (CRT-PE table). No learnable params on K side.
726+
h k = tape_const(K_const);
727+
h kt = tape_transpose(k);
728+
h scores = tape_matmul(q, kt);
729+
730+
h attn = tape_softmax(scores);
731+
return tape_matmul(attn, v);
732+
}
733+
734+
# L2: substrate K + Q. Only V is learned.
735+
# Q is not computed from x here: it is a fixed, non-learned table (the CRT-PE matrix itself).
736+
# In the simplest form: Q = CRT-PE (same as K) so each position queries
737+
# its own substrate address. The attention reduces to "soft positional
738+
# self-similarity" — diagonal-biased by construction.
739+
fn prom_attention_substrate_kq_new(d_model, seq_len, rng_state) {
740+
h V = _prom_random_matrix(d_model, d_model, 0.3, rng_state);
741+
h layer = dict_new();
742+
dict_set(layer, "kind", "attention");
743+
dict_set(layer, "variant", "substrate_kq");
744+
dict_set(layer, "d_model", d_model);
745+
dict_set(layer, "seq_len", seq_len);
746+
dict_set(layer, "V", dict_get(V, "node"));
747+
h pe = prom_crt_pe_matrix(seq_len, d_model);
748+
dict_set(layer, "K_const", pe);
749+
dict_set(layer, "Q_const", pe);
750+
dict_set(layer, "alpha", 0.0);
751+
dict_set(layer, "rng_state", dict_get(V, "state"));
752+
return layer;
753+
}
754+
755+
fn prom_attention_substrate_kq_forward(layer, x_id) {
756+
h V_w = dict_get(layer, "V");
757+
h K_const = dict_get(layer, "K_const");
758+
h Q_const = dict_get(layer, "Q_const");
759+
h v = tape_matmul(x_id, V_w);
760+
h q = tape_const(Q_const);
761+
h k = tape_const(K_const);
762+
h kt = tape_transpose(k);
763+
h scores = tape_matmul(q, kt);
764+
h attn = tape_softmax(scores);
765+
return tape_matmul(attn, v);
766+
}
767+
768+
# L3: fully substrate attention. Zero learnable params in the layer.
769+
# Q = K = CRT-PE; V = identity transform on x (the input passes
770+
# through unchanged, weighted by substrate-determined attention).
771+
fn prom_attention_substrate_full_new(d_model, seq_len) {
772+
h layer = dict_new();
773+
dict_set(layer, "kind", "attention_substrate_full");
774+
dict_set(layer, "variant", "substrate_full");
775+
dict_set(layer, "d_model", d_model);
776+
dict_set(layer, "seq_len", seq_len);
777+
h pe = prom_crt_pe_matrix(seq_len, d_model);
778+
dict_set(layer, "K_const", pe);
779+
dict_set(layer, "Q_const", pe);
780+
dict_set(layer, "alpha", 0.0);
781+
return layer;
782+
}
783+
784+
fn prom_attention_substrate_full_forward(layer, x_id) {
785+
h K_const = dict_get(layer, "K_const");
786+
h Q_const = dict_get(layer, "Q_const");
787+
h q = tape_const(Q_const);
788+
h k = tape_const(K_const);
789+
h kt = tape_transpose(k);
790+
h scores = tape_matmul(q, kt);
791+
h attn = tape_softmax(scores);
792+
# V is x itself — no transformation. The block weights x by
793+
# substrate-determined attention.
794+
return tape_matmul(attn, x_id);
795+
}
796+
797+
# Param collectors per variant.
798+
fn prom_attention_substrate_k_params(layer) {
799+
return [dict_get(layer, "Q"), dict_get(layer, "V")];
800+
}
801+
fn prom_attention_substrate_kq_params(layer) {
802+
return [dict_get(layer, "V")];
803+
}
804+
fn prom_attention_substrate_full_params(layer) {
805+
return [];
806+
}
807+
689808
fn prom_attention_params(layer) {
690809
h out = [];
691810
arr_push(out, dict_get(layer, "Q"));

0 commit comments

Comments
 (0)