|
| 1 | +# Reverse-mode autograd — the real ML training engine. |
| 2 | +# |
| 3 | +# Reverse-mode costs O(forward) for the WHOLE gradient — you can compute the |
| 4 | +# gradient of one scalar loss w.r.t. thousands of weights in a single |
| 5 | +# backward walk. Forward-mode (dual numbers) needs one full forward |
| 6 | +# pass PER parameter, so it can't scale past toy models. Reverse-mode |
| 7 | +# is what PyTorch/TensorFlow/JAX are built on. |
| 8 | +# |
| 9 | +# What's uniquely OMC here: forward values are substrate-preserved |
| 10 | +# (HInt cells carry resonance/HIM through matmul; tape_value(id) |
| 11 | +# returns substrate-annotated cells). Python's autograd returns |
| 12 | +# plain numpy floats — substrate metadata is not a thing it has. |
| 13 | + |
| 14 | +fn assert_eq(actual, expected, msg) { |
| 15 | +    # Record a failure that names both values when they differ; do nothing when they match. |
| 16 | +    h differs = actual != expected; |
| 17 | +    if differs { test_record_failure(msg + ": expected " + to_string(expected) + " got " + to_string(actual)); } |
| 18 | +} |
| 19 | + |
| 20 | +fn assert_true(cond, msg) { # Reports msg via test_record_failure when cond is falsy; otherwise does nothing. |
| 21 | + if !cond { test_record_failure(msg); } |
| 22 | +} |
| 23 | + |
| 24 | +fn approx_eq(a, b, tol) { # True when a and b differ by at most tol, in either order. |
| 25 | +    h gap = a - b; |
| 26 | +    if gap < 0.0 { return (0.0 - gap) <= tol; } |
| 27 | +    return gap <= tol; |
| 28 | +} |
| 29 | + |
| 30 | +# ---- Scalar: f(x) = x^2 ; f'(x) = 2x ; at x=3, grad = 6 ---- |
| 31 | + |
| 32 | +fn test_scalar_square_grad() { |
| 33 | +    # Build x*x after a tape reset, backpropagate from the product, then read x's gradient. |
| 34 | +    tape_reset(); |
| 35 | +    h leaf = tape_var(3.0); |
| 36 | +    h squared = tape_mul(leaf, leaf); |
| 37 | +    tape_backward(squared); |
| 38 | +    assert_true(approx_eq(tape_grad(leaf), 6.0, 0.001), "d(x^2)/dx at 3 = 6"); |
| 39 | +} |
| 40 | + |
| 41 | +# ---- Multi-parameter: f(a, b) = a*b + a^2 ; df/da = b + 2a ; df/db = a ---- |
| 42 | +# At a=2, b=5: df/da = 5 + 4 = 9 ; df/db = 2. |
| 43 | + |
| 44 | +fn test_multi_parameter_grads() { |
| 45 | +    # f = a*b + a*a recorded after a tape reset; one backward pass yields both partials. |
| 46 | +    tape_reset(); |
| 47 | +    h lhs = tape_var(2.0); |
| 48 | +    h rhs = tape_var(5.0); |
| 49 | +    h cross = tape_mul(lhs, rhs); |
| 50 | +    h square = tape_mul(lhs, lhs); |
| 51 | +    h total = tape_add(cross, square); |
| 52 | +    tape_backward(total); |
| 53 | +    # Expected values follow from df/da = b + 2a and df/db = a at a=2, b=5. |
| 54 | +    assert_true(approx_eq(tape_grad(lhs), 9.0, 0.001), "df/da = b+2a = 9"); |
| 55 | +    assert_true(approx_eq(tape_grad(rhs), 2.0, 0.001), "df/db = a = 2"); |
| 56 | +} |
| 57 | + |
| 58 | +# ---- Demonstrate the reverse-mode WIN: many params, one backward ---- |
| 59 | +# f(a,b,c,d) = a + b + c + d ; df/dx = 1 for all x. |
| 60 | +# A single tape_backward(loss) gives gradients for every leaf. |
| 61 | + |
| 62 | +fn test_one_backward_many_grads() { |
| 63 | +    # Sum four leaves pairwise, then check that a single backward call yields a gradient for each leaf. |
| 64 | +    tape_reset(); |
| 65 | +    h first = tape_var(1.0); |
| 66 | +    h second = tape_var(2.0); |
| 67 | +    h third = tape_var(3.0); |
| 68 | +    h fourth = tape_var(4.0); |
| 69 | +    h total = tape_add(tape_add(first, second), tape_add(third, fourth)); |
| 70 | +    tape_backward(total); |
| 71 | +    # Each leaf enters the sum exactly once, so every partial derivative is 1. |
| 72 | +    assert_true(approx_eq(tape_grad(first), 1.0, 0.001), "da = 1"); |
| 73 | +    assert_true(approx_eq(tape_grad(second), 1.0, 0.001), "db = 1"); |
| 74 | +    assert_true(approx_eq(tape_grad(third), 1.0, 0.001), "dc = 1"); |
| 75 | +    assert_true(approx_eq(tape_grad(fourth), 1.0, 0.001), "dd = 1"); |
| 76 | +} |
| 77 | + |
| 78 | +# ---- Nonlinearities ---- |
| 79 | + |
| 80 | +fn test_sigmoid_grad() { |
| 81 | +    # sigmoid(0) = 0.5, so the derivative s*(1-s) evaluates to 0.5 * 0.5 = 0.25 there. |
| 82 | +    tape_reset(); |
| 83 | +    h input = tape_var(0.0); |
| 84 | +    tape_backward(tape_sigmoid(input)); |
| 85 | +    h slope = tape_grad(input); |
| 86 | +    assert_true(approx_eq(slope, 0.25, 0.001), "sigmoid'(0) = 0.25"); |
| 87 | +} |
| 88 | + |
| 89 | +fn test_tanh_grad() { |
| 90 | +    # tanh has unit slope at the origin: 1 - tanh(0)^2 = 1. |
| 91 | +    tape_reset(); |
| 92 | +    h origin = tape_var(0.0); |
| 93 | +    h squashed = tape_tanh(origin); |
| 94 | +    tape_backward(squashed); |
| 95 | +    assert_true(approx_eq(tape_grad(origin), 1.0, 0.001), "tanh'(0) = 1"); |
| 96 | +} |
| 97 | + |
| 98 | +fn test_relu_grad_branches() { |
| 99 | +    # One leaf sits on each side of zero; summing both relu outputs lets a |
| 100 | +    # single backward pass exercise the positive and the negative branch together. |
| 101 | +    tape_reset(); |
| 102 | +    h pos = tape_var(2.0); |
| 103 | +    h neg = tape_var(0 - 3.0); |
| 104 | +    h combined = tape_add(tape_relu(pos), tape_relu(neg)); |
| 105 | +    tape_backward(combined); |
| 106 | +    assert_true(approx_eq(tape_grad(pos), 1.0, 0.001), "relu' on positive = 1"); |
| 107 | +    assert_true(approx_eq(tape_grad(neg), 0.0, 0.001), "relu' on negative = 0"); |
| 108 | +} |
| 109 | + |
| 110 | +fn test_exp_grad() { |
| 111 | +    # exp is its own derivative, so the slope at 0 equals exp(0) = 1. |
| 112 | +    tape_reset(); |
| 113 | +    h input = tape_var(0.0); |
| 114 | +    tape_backward(tape_exp(input)); |
| 115 | +    assert_true(approx_eq(tape_grad(input), 1.0, 0.001), "exp'(0) = 1"); |
| 116 | +} |
| 117 | + |
| 118 | +# ---- Composition: chain rule through sigmoid(2x + 1) ---- |
| 119 | +# At x=0: u=1, sigmoid(1) ≈ 0.7310586, |
| 120 | +# sigmoid'(1) ≈ 0.196612, dy/dx = 0.196612 * 2 ≈ 0.393224 |
| 121 | + |
| 122 | +fn test_chain_rule() { |
| 123 | +    # y = sigmoid(2x + 1); 2 and 1 enter as tape_const nodes and only x's gradient is queried. |
| 124 | +    tape_reset(); |
| 125 | +    h input = tape_var(0.0); |
| 126 | +    h scale = tape_const(2.0); |
| 127 | +    h offset = tape_const(1.0); |
| 128 | +    h scaled = tape_mul(scale, input); |
| 129 | +    tape_backward(tape_sigmoid(tape_add(scaled, offset))); |
| 130 | +    assert_true(approx_eq(tape_grad(input), 0.393224, 0.001), "chain rule"); |
| 131 | +} |
| 132 | + |
| 133 | +# ---- Matrix autograd: backward through a 2D matmul ---- |
| 134 | +# Y = X @ W ; loss = sum(Y) ; dL/dX = ones(Y) @ W^T ; dL/dW = X^T @ ones(Y) |
| 135 | + |
| 136 | +fn test_matmul_backward_shapes() { |
| 137 | +    # X is 1x3 and W is 3x2, so Y = X @ W is 1x2 and L = sum(Y) is a scalar. |
| 138 | +    # 1-row gradients come back as 1D arrays (the outer row is dropped), so gX is indexed once. |
| 139 | +    tape_reset(); |
| 140 | +    h X = tape_var([[1.0, 2.0, 3.0]]); |
| 141 | +    h W = tape_var([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]]); |
| 142 | +    h Y = tape_matmul(X, W); |
| 143 | +    h L = tape_sum(Y); |
| 144 | +    tape_backward(L); |
| 145 | +    h gX = tape_grad(X); # 1D: [0.3, 0.7, 1.1] (row sums of W) |
| 146 | +    h gW = tape_grad(W); # 2D: 3x2 |
| 147 | +    assert_true(approx_eq(arr_get(gX, 0), 0.3, 0.001), "dL/dX[0] = 0.3"); |
| 148 | +    assert_true(approx_eq(arr_get(gX, 1), 0.7, 0.001), "dL/dX[1] = 0.7"); |
| 149 | +    assert_true(approx_eq(arr_get(gX, 2), 1.1, 0.001), "dL/dX[2] = 1.1"); |
| 150 | +    # dL/dW = X^T @ ones(1x2): row i repeats X[i] in both columns. All three rows are checked, including the middle one. |
| 151 | +    assert_true(approx_eq(arr_get(arr_get(gW, 0), 0), 1.0, 0.001), "dL/dW[0,0] = 1.0"); |
| 152 | +    assert_true(approx_eq(arr_get(arr_get(gW, 0), 1), 1.0, 0.001), "dL/dW[0,1] = 1.0"); |
| 153 | +    assert_true(approx_eq(arr_get(arr_get(gW, 1), 0), 2.0, 0.001), "dL/dW[1,0] = 2.0"); |
| 154 | +    assert_true(approx_eq(arr_get(arr_get(gW, 1), 1), 2.0, 0.001), "dL/dW[1,1] = 2.0"); |
| 155 | +    assert_true(approx_eq(arr_get(arr_get(gW, 2), 0), 3.0, 0.001), "dL/dW[2,0] = 3.0"); |
| 156 | +    assert_true(approx_eq(arr_get(arr_get(gW, 2), 1), 3.0, 0.001), "dL/dW[2,1] = 3.0"); |
| 157 | +} |
| 158 | + |
| 159 | +# ---- End-to-end gradient descent training ---- |
| 160 | +# Train a single scalar w to minimize L(w) = sum_i (w*x_i - y_i)^2 |
| 161 | +# where y_i = 2 * x_i. Converges fast. |
| 162 | + |
| 163 | +fn test_sgd_training_converges() { |
| 164 | +    # Fit one weight to y = 2x by gradient descent. The tape is reset and rebuilt on every |
| 165 | +    # step, so the weight itself lives in an ordinary variable between steps. |
| 166 | +    h inputs = [1.0, 2.0, 3.0, 4.0]; |
| 167 | +    h targets = [2.0, 4.0, 6.0, 8.0]; |
| 168 | +    h sample_count = arr_len(inputs); |
| 169 | +    h learn_rate = 0.01; |
| 170 | +    h epoch_cap = 100; |
| 171 | +    h weight = 0.1; |
| 172 | +    h reached = 0; |
| 173 | +    h epoch = 0; |
| 174 | + |
| 175 | +    while epoch < epoch_cap { |
| 176 | +        tape_reset(); |
| 177 | +        h w_node = tape_var(weight); |
| 178 | +        # Sum of squared residuals (w*x - y)^2 over every sample. |
| 179 | +        h loss = tape_const(0.0); |
| 180 | +        h idx = 0; |
| 181 | +        while idx < sample_count { |
| 182 | +            h x_node = tape_const(arr_get(inputs, idx)); |
| 183 | +            h y_node = tape_const(arr_get(targets, idx)); |
| 184 | +            h residual = tape_sub(tape_mul(w_node, x_node), y_node); |
| 185 | +            loss = tape_add(loss, tape_mul(residual, residual)); |
| 186 | +            idx = idx + 1; |
| 187 | +        } |
| 188 | +        tape_backward(loss); |
| 189 | +        h slope = tape_grad(w_node); |
| 190 | +        h loss_now = tape_value(loss); |
| 191 | +        if loss_now < 0.001 { |
| 192 | +            # Stop early: setting the counter to the cap makes the loop condition false. |
| 193 | +            reached = 1; |
| 194 | +            epoch = epoch_cap; |
| 195 | +        } else { |
| 196 | +            weight = weight - learn_rate * slope; |
| 197 | +            epoch = epoch + 1; |
| 198 | +        } |
| 199 | +    } |
| 200 | +    # reached stays 0 if the loss never dropped below 0.001 within epoch_cap steps. |
| 201 | +    assert_eq(reached, 1, "SGD converged on y=2x"); |
| 202 | +    assert_true(approx_eq(weight, 2.0, 0.05), "w near 2.0"); |
| 203 | +} |
| 204 | + |
| 205 | +# ---- Substrate metadata preserved on forward values ---- |
| 206 | +# After running a forward pass, tape_value(id) for an integer-valued |
| 207 | +# node should come back as HInt with substrate metadata, NOT a plain |
| 208 | +# float. This is the OMC-only property — Python autograd loses this. |
| 209 | + |
| 210 | +fn test_substrate_preserved_through_tape() { |
| 211 | +    # Forward-only check: tape_backward is never called; only the value of the sum node is inspected. |
| 212 | +    tape_reset(); |
| 213 | +    h a = tape_var(3); |
| 214 | +    h b = tape_var(5); |
| 215 | +    h s = tape_add(a, b); # 3 + 5 = 8 — a Fibonacci attractor |
| 216 | +    h v = tape_value(s); |
| 217 | +    # Use assert_eq so a failure reports what is_attractor(v) actually returned. |
| 218 | +    assert_eq(is_attractor(v), 1, "8 is a Fibonacci attractor"); |
| 219 | +} |
| 220 | + |
| 221 | +fn test_substrate_preserved_through_matmul() { |
| 222 | +    # Integer operands in, integer cells out: [[1, 2]] @ [[1, 1], [2, 2]] = [[5, 5]]. |
| 223 | +    tape_reset(); |
| 224 | +    h row = tape_var([[1, 2]]); |
| 225 | +    h weights = tape_var([[1, 1], [2, 2]]); |
| 226 | +    h product = tape_matmul(row, weights); |
| 227 | +    # A 1-row result is read back as a 1D array here, hence the single index. |
| 228 | +    h first_cell = arr_get(tape_value(product), 0); |
| 229 | +    # 5 is a Fibonacci attractor, so the cell is expected to report as on-attractor. |
| 230 | +    assert_eq(is_attractor(first_cell), 1, "5 is on-attractor (substrate preserved)"); |
| 231 | +} |
0 commit comments