Skip to content

Commit 285dfb4

Browse files
Reverse-mode autograd + substrate-preserving matmul
The real ML training engine. Reverse-mode costs O(forward) in total, independent of the number of parameters — a single tape_backward() walks the recorded graph in reverse and populates gradients for every leaf variable. Forward-mode (dual numbers) needs one full forward pass PER parameter, so it can't scale past toy models. Reverse-mode is what PyTorch is built on. The OMC-only property: forward values stay substrate-typed all the way through. arr_matmul now keeps the inner loop in i64 when every input cell is an HInt, so each output cell carries its own φ-resonance / HIM score (HInt::new computes resonance from the integer value). The float fallback only kicks in when a float shows up anywhere. tape_value(id) returns substrate-annotated HInt cells for integer nodes; tape_grad(id) returns HFloat gradients. Tape ops: tape_reset / tape_var / tape_const / tape_value / tape_grad / tape_add / tape_sub / tape_mul / tape_div / tape_neg / tape_pow_int / tape_exp / tape_sin / tape_cos / tape_relu / tape_sigmoid / tape_tanh / tape_matmul (with dA = dy @ B^T, dB = A^T @ dy backward) / tape_sum / tape_mean / tape_backward / tape_update (in-place SGD step). Each tape node holds a TapeMat (flat f64 buffer + rows/cols) so a single node can be a scalar, a row vector, or a 2D matrix. Same op constructors work across all three shapes — Mul broadcasts scalar into matrix, MatMul does the real linear-algebra version, the backward dispatcher reads shapes from the live values rather than needing them passed at construction time. Tests: 12 cases — analytic gradients for arithmetic, transcendentals, ReLU branches, chain rule, multi-parameter (single backward fills every leaf's grad), 2D matmul backward with shape verification, end-to-end SGD training that converges to w=2 on y=2x, and two substrate-preservation tests proving HInt resonance survives both the tape lift and a full matmul. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 162c67e commit 285dfb4

3 files changed

Lines changed: 911 additions & 4 deletions

File tree

Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,231 @@
1+
# Reverse-mode autograd — the real ML training engine.
2+
#
3+
# Reverse-mode costs O(forward) in total, not per parameter — you can compute the
4+
# gradient of one scalar loss w.r.t. thousands of weights in a single
5+
# backward walk. Forward-mode (dual numbers) needs one full forward
6+
# pass PER parameter, so it can't scale past toy models. Reverse-mode
7+
# is what PyTorch/TensorFlow/JAX are built on.
8+
#
9+
# What's uniquely OMC here: forward values are substrate-preserved
10+
# (HInt cells carry resonance/HIM through matmul; tape_value(id)
11+
# returns substrate-annotated cells). Python's autograd returns
12+
# plain numpy floats — substrate metadata is not a thing it has.
13+
14+
# Test helper: when actual != expected, reports a failure through
# test_record_failure. The reported text is msg followed by
# ": expected <expected> got <actual>" (both values passed through to_string).
# Does nothing when the two values compare equal.
fn assert_eq(actual, expected, msg) {
15+
if actual != expected {
16+
test_record_failure(msg + ": expected " + to_string(expected) + " got " + to_string(actual));
17+
}
18+
}
19+
20+
# Test helper: reports msg through test_record_failure when cond is falsy
# (the check is `!cond`); does nothing otherwise.
fn assert_true(cond, msg) {
21+
if !cond { test_record_failure(msg); }
22+
}
23+
24+
# Returns whether a and b differ by at most tol, i.e. |a - b| <= tol.
fn approx_eq(a, b, tol) {
25+
h d = a - b;
26+
# Take the absolute value by hand: negate d when it is below zero.
if d < 0.0 { d = 0.0 - d; }
27+
return d <= tol;
28+
}
29+
30+
# ---- Scalar: f(x) = x^2 ; f'(x) = 2x ; at x=3, grad = 6 ----
31+
32+
# Builds y = x * x on a fresh tape with x = 3.0, runs tape_backward(y), and
# checks that tape_grad(x) is within 0.001 of the analytic derivative 2x = 6.
fn test_scalar_square_grad() {
33+
tape_reset();
34+
h x = tape_var(3.0);
35+
# x feeds both operands of the multiply.
h y = tape_mul(x, x);
36+
tape_backward(y);
37+
h g = tape_grad(x);
38+
assert_true(approx_eq(g, 6.0, 0.001), "d(x^2)/dx at 3 = 6");
39+
}
40+
41+
# ---- Multi-parameter: f(a, b) = a*b + a^2 ; df/da = b + 2a ; df/db = a ----
42+
# At a=2, b=5: df/da = 5 + 4 = 9 ; df/db = 2.
43+
44+
# Builds y = a*b + a*a with a = 2.0 and b = 5.0, runs one tape_backward(y),
# and checks both leaf gradients against the analytic values
# (df/da = b + 2a = 9, df/db = a = 2) with a tolerance of 0.001.
fn test_multi_parameter_grads() {
45+
tape_reset();
46+
h a = tape_var(2.0);
47+
h b = tape_var(5.0);
48+
h ab = tape_mul(a, b);
49+
h aa = tape_mul(a, a);
50+
h y = tape_add(ab, aa);
51+
# Single backward call; both tape_grad reads below come from it.
tape_backward(y);
52+
h ga = tape_grad(a);
53+
h gb = tape_grad(b);
54+
assert_true(approx_eq(ga, 9.0, 0.001), "df/da = b+2a = 9");
55+
assert_true(approx_eq(gb, 2.0, 0.001), "df/db = a = 2");
56+
}
57+
58+
# ---- Demonstrate the reverse-mode WIN: many params, one backward ---
59+
# f(a,b,c,d) = a + b + c + d ; df/dx = 1 for all x.
60+
# A single tape_backward(loss) gives gradients for every leaf.
61+
62+
# Builds y = (a + b) + (c + d) from four tape_var leaves, calls
# tape_backward(y) once, and checks that tape_grad of every leaf is 1
# (tolerance 0.001).
fn test_one_backward_many_grads() {
63+
tape_reset();
64+
h a = tape_var(1.0);
65+
h b = tape_var(2.0);
66+
h c = tape_var(3.0);
67+
h d = tape_var(4.0);
68+
h ab = tape_add(a, b);
69+
h cd = tape_add(c, d);
70+
h y = tape_add(ab, cd);
71+
# The only backward call in this test; all four gradients are read after it.
tape_backward(y);
72+
assert_true(approx_eq(tape_grad(a), 1.0, 0.001), "da = 1");
73+
assert_true(approx_eq(tape_grad(b), 1.0, 0.001), "db = 1");
74+
assert_true(approx_eq(tape_grad(c), 1.0, 0.001), "dc = 1");
75+
assert_true(approx_eq(tape_grad(d), 1.0, 0.001), "dd = 1");
76+
}
77+
78+
# ---- Nonlinearities ----
79+
80+
# Builds y = tape_sigmoid(x) at x = 0.0, runs tape_backward(y), and checks
# that tape_grad(x) is within 0.001 of 0.25.
fn test_sigmoid_grad() {
81+
tape_reset();
82+
h x = tape_var(0.0);
83+
h y = tape_sigmoid(x);
84+
tape_backward(y);
85+
# sigmoid'(z) = sigmoid(z) * (1 - sigmoid(z)); at z = 0 that is 0.5 * 0.5 = 0.25
86+
assert_true(approx_eq(tape_grad(x), 0.25, 0.001), "sigmoid'(0) = 0.25");
87+
}
88+
89+
# Builds y = tape_tanh(x) at x = 0.0, runs tape_backward(y), and checks
# that tape_grad(x) is within 0.001 of 1.
fn test_tanh_grad() {
90+
tape_reset();
91+
h x = tape_var(0.0);
92+
h y = tape_tanh(x);
93+
tape_backward(y);
94+
# tanh'(z) = 1 - tanh(z)^2; at z = 0 that is 1
95+
assert_true(approx_eq(tape_grad(x), 1.0, 0.001), "tanh'(0) = 1");
96+
}
97+
98+
# Exercises both ReLU branches in one graph: y = relu(xp) + relu(xn) with
# xp = 2.0 and xn = -3.0. After tape_backward(y) the gradient is expected to
# be 1 for the positive input and 0 for the negative one (tolerance 0.001).
fn test_relu_grad_branches() {
99+
tape_reset();
100+
h xp = tape_var(2.0);
101+
# -3.0 is spelled as a subtraction from 0.
h xn = tape_var(0 - 3.0);
102+
h yp = tape_relu(xp);
103+
h yn = tape_relu(xn);
104+
# Sum the two outputs so a single backward call covers both inputs.
h y = tape_add(yp, yn);
105+
tape_backward(y);
106+
assert_true(approx_eq(tape_grad(xp), 1.0, 0.001), "relu' on positive = 1");
107+
assert_true(approx_eq(tape_grad(xn), 0.0, 0.001), "relu' on negative = 0");
108+
}
109+
110+
# Builds y = tape_exp(x) at x = 0.0, runs tape_backward(y), and checks that
# tape_grad(x) is within 0.001 of exp(0) = 1.
fn test_exp_grad() {
111+
tape_reset();
112+
h x = tape_var(0.0);
113+
h y = tape_exp(x);
114+
tape_backward(y);
115+
assert_true(approx_eq(tape_grad(x), 1.0, 0.001), "exp'(0) = 1");
116+
}
117+
118+
# ---- Composition: chain rule through sigmoid(2x + 1) ----
119+
# At x=0: u=1, sigmoid(1) ≈ 0.7310586,
120+
# sigmoid'(1) ≈ 0.196612, dy/dx = 0.196612 * 2 ≈ 0.393224
121+
122+
# Builds y = sigmoid(2*x + 1) at x = 0.0, with 2 and 1 entered through
# tape_const, runs tape_backward(y), and checks that tape_grad(x) is within
# 0.001 of 0.393224 (= 2 * sigmoid'(1)).
fn test_chain_rule() {
123+
tape_reset();
124+
h x = tape_var(0.0);
125+
h two = tape_const(2.0);
126+
h one = tape_const(1.0);
127+
# u = 2*x + 1
h u = tape_add(tape_mul(two, x), one);
128+
h y = tape_sigmoid(u);
129+
tape_backward(y);
130+
assert_true(approx_eq(tape_grad(x), 0.393224, 0.001), "chain rule");
131+
}
132+
133+
# ---- Matrix autograd: forward through a 2D matmul ----
134+
# Y = X @ W ; loss = sum(Y) ; dL/dX = ones(Y) @ W^T ; dL/dW = X^T @ ones(Y)
135+
136+
# Builds L = tape_sum(tape_matmul(X, W)) for a 1x3 X and a 3x2 W, runs
# tape_backward(L), and checks entries of both gradients (tolerance 0.001):
# all three entries of dL/dX, plus rows 0 and 2 of dL/dW. Row 1 of dL/dW is
# not checked.
fn test_matmul_backward_shapes() {
137+
tape_reset();
138+
# X is 1x3, W is 3x2 → Y is 1x2 → loss is scalar.
139+
# 1-row gradients come back as 1D arrays (convention: drop the
140+
# outer row when there's only one).
141+
h X = tape_var([[1.0, 2.0, 3.0]]);
142+
h W = tape_var([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]]);
143+
h Y = tape_matmul(X, W);
144+
h L = tape_sum(Y);
145+
tape_backward(L);
146+
h gX = tape_grad(X); # 1D: [0.3, 0.7, 1.1]
147+
h gW = tape_grad(W); # 2D: 3x2
148+
# dL/dX[k] is the sum of row k of W: 0.1+0.2, 0.3+0.4, 0.5+0.6.
assert_true(approx_eq(arr_get(gX, 0), 0.3, 0.001), "dL/dX[0] = 0.3");
149+
assert_true(approx_eq(arr_get(gX, 1), 0.7, 0.001), "dL/dX[1] = 0.7");
150+
assert_true(approx_eq(arr_get(gX, 2), 1.1, 0.001), "dL/dX[2] = 1.1");
151+
# Row k of dL/dW holds X[k] in both columns: row 0 is [1, 1], row 2 is [3, 3].
h gW0 = arr_get(gW, 0);
152+
assert_true(approx_eq(arr_get(gW0, 0), 1.0, 0.001), "dL/dW[0,0] = 1.0");
153+
assert_true(approx_eq(arr_get(gW0, 1), 1.0, 0.001), "dL/dW[0,1] = 1.0");
154+
h gW2 = arr_get(gW, 2);
155+
assert_true(approx_eq(arr_get(gW2, 0), 3.0, 0.001), "dL/dW[2,0] = 3.0");
156+
assert_true(approx_eq(arr_get(gW2, 1), 3.0, 0.001), "dL/dW[2,1] = 3.0");
157+
}
158+
159+
# ---- End-to-end gradient descent training ----
160+
# Train a single scalar w to minimize L(w) = sum_i (w*x_i - y_i)^2
161+
# where y_i = 2 * x_i. Converges fast.
162+
163+
# Fits a single scalar weight w to the data y = 2x by plain gradient descent.
# Each step rebuilds the tape, accumulates the squared-error loss over the
# four samples, runs tape_backward, and applies w_value -= lr * grad by hand.
# The loop stops early once the loss drops below 0.001. The test then asserts
# that the early stop happened and that w_value is within 0.05 of 2.0.
fn test_sgd_training_converges() {
164+
h xs = [1.0, 2.0, 3.0, 4.0];
165+
h ys = [2.0, 4.0, 6.0, 8.0];
166+
h n = arr_len(xs);
167+
h step = 0;
168+
h max_steps = 100;
169+
h lr = 0.01;
170+
# The tape (and with it the tape_var for w) is rebuilt on every step;
171+
# the weight itself persists between steps in the plain variable w_value.
172+
h w_value = 0.1;
173+
h converged = 0;
174+
175+
while step < max_steps {
176+
tape_reset();
177+
h w = tape_var(w_value);
178+
# Accumulate loss across the dataset.
179+
h L = tape_const(0.0);
180+
h i = 0;
181+
while i < n {
182+
# Samples enter as tape_const nodes; only w is created with tape_var.
h xi = tape_const(arr_get(xs, i));
183+
h yi = tape_const(arr_get(ys, i));
184+
h pred = tape_mul(w, xi);
185+
h err = tape_sub(pred, yi);
186+
# Squared error for this sample: err * err.
h sq = tape_mul(err, err);
187+
L = tape_add(L, sq);
188+
i = i + 1;
189+
}
190+
tape_backward(L);
191+
h gw = tape_grad(w);
192+
h Lv = tape_value(L);
193+
if Lv < 0.001 {
194+
converged = 1;
195+
# Setting step to max_steps ends the outer loop.
step = max_steps;
196+
} else {
197+
# Manual gradient-descent update of the persistent weight.
w_value = w_value - lr * gw;
198+
step = step + 1;
199+
}
200+
}
201+
assert_eq(converged, 1, "SGD converged on y=2x");
202+
assert_true(approx_eq(w_value, 2.0, 0.05), "w near 2.0");
203+
}
204+
205+
# ---- Substrate metadata preserved on forward values ----
206+
# After running a forward pass, tape_value(id) for an integer-valued
207+
# node should come back as HInt with substrate metadata, NOT a plain
208+
# float. This is the OMC-only property — Python autograd loses this.
209+
210+
# Builds s = a + b on a fresh tape from the integer leaves 3 and 5, reads the
# forward value back with tape_value(s), and asserts that is_attractor of
# that value is 1. No tape_backward is run; only the forward value is checked.
# Uses assert_eq (like the matmul substrate test) so a failure reports the
# value is_attractor actually returned.
fn test_substrate_preserved_through_tape() {
211+
tape_reset();
212+
h a = tape_var(3);
213+
h b = tape_var(5);
214+
h s = tape_add(a, b); # 8 — a Fibonacci attractor
215+
h v = tape_value(s);
216+
# v should round-trip as an integer-valued result; the cell's
217+
# resonance should be high because 8 IS a Fibonacci attractor.
218+
assert_eq(is_attractor(v), 1, "8 is a Fibonacci attractor");
219+
}
220+
221+
# Builds Y = tape_matmul(X, W) from all-integer inputs (X = [[1, 2]],
# W = [[1, 1], [2, 2]]), reads the forward value with tape_value(Y), and
# asserts that is_attractor of its first element is 1. No tape_backward is
# run; only the forward value is checked.
fn test_substrate_preserved_through_matmul() {
222+
# Substrate-typed matmul (integer in, integer out, resonance on every cell).
223+
tape_reset();
224+
h X = tape_var([[1, 2]]);
225+
h W = tape_var([[1, 1], [2, 2]]);
226+
h Y = tape_matmul(X, W); # = [[5, 5]] ; 1-row → 1D [5, 5]
227+
h v = tape_value(Y);
228+
# First cell of the result: 1*1 + 2*2 = 5.
h c0 = arr_get(v, 0);
229+
# 5 is a Fibonacci attractor — substrate resonance preserved through matmul.
230+
assert_eq(is_attractor(c0), 1, "5 is on-attractor (substrate preserved)");
231+
}

omnimcode-core/src/compiler.rs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,14 @@ impl Compiler {
192192
| "mod_pow" | "bit_count" | "bit_length"
193193
| "digit_sum" | "digit_count"
194194
| "arr_unique_count" | "arr_gcd" | "fnv1a_hash"
195-
| "is_instance" => Some("int"),
195+
| "is_instance"
196+
// tape_* op constructors return node IDs (int)
197+
| "tape_var" | "tape_const"
198+
| "tape_add" | "tape_sub" | "tape_mul" | "tape_div"
199+
| "tape_neg" | "tape_pow_int"
200+
| "tape_exp" | "tape_sin" | "tape_cos"
201+
| "tape_relu" | "tape_sigmoid" | "tape_tanh"
202+
| "tape_matmul" | "tape_sum" | "tape_mean" => Some("int"),
196203
"pow" | "sqrt" | "log" | "log2" | "log10"
197204
| "exp" | "sin" | "cos" | "tan" | "asin" | "acos"
198205
| "atan" | "atan2" | "hypot" | "lerp"

0 commit comments

Comments
 (0)