gHashTag
diff --git a/‎src/hslm/ema.zig‎
Lines changed: 121 additions & 0 deletions b/‎src/hslm/ema.zig‎
Lines changed: 121 additions & 0 deletions
diff --git a/‎src/hslm/mask.zig‎
Lines changed: 154 additions & 0 deletions b/‎src/hslm/mask.zig‎
Lines changed: 154 additions & 0 deletions
@@ -0,0 +1,121 @@
+// T-JEPA — EMA (Exponential Moving Average) Weight Synchronization
+// Target encoder = EMA of online encoder shadow floats
+// After EMA update, target requantizes ternary weights from updated shadows
+//
+// φ² + 1/φ² = 3 = TRINITY
+
+const std = @import("std");
+const constants = @import("constants.zig");
+const model_mod = @import("model.zig");
+
+const EMBED_DIM = constants.EMBED_DIM;
+const HIDDEN_DIM = constants.HIDDEN_DIM;
+const VOCAB_SIZE = constants.VOCAB_SIZE;
+const NUM_BLOCKS = constants.NUM_BLOCKS;
+
+// ═══════════════════════════════════════════════════════════════════════════════
+// EMA SYNC
+// ═══════════════════════════════════════════════════════════════════════════════
+
+/// EMA schedule endpoints plus the routines that blend online weights into a target model.
+pub const EmaSync = struct {
+    /// Decay at step 0 (e.g. 0.996); a lower decay lets more of the online weights through.
+    decay_start: f32,
+    /// Decay reached once `step >= total_steps` (e.g. 1.0, where the target stops moving).
+    decay_end: f32,
+
+    /// Blend `online_shadow` into `target_shadow` in place:
+    /// `target[i] = decay * target[i] + (1 - decay) * online[i]`.
+    /// Both slices must have the same length (asserted).
+    pub fn updateShadows(target_shadow: []f32, online_shadow: []const f32, decay: f32) void {
+        std.debug.assert(target_shadow.len == online_shadow.len);
+        const online_weight = 1.0 - decay;
+        for (online_shadow, target_shadow) |src, *dst| {
+            dst.* = decay * dst.* + online_weight * src;
+        }
+    }
+
+    /// Apply one EMA step to every float buffer of `target`, pulling from `online`.
+    /// The decay is taken from the linear schedule at `step` / `total_steps`.
+    /// Buffers touched: output_shadow, per-block TNN shadows + biases,
+    /// sacred attention shadows + rms_gamma, and the embedding float table.
+    pub fn syncModels(self: *const EmaSync, target: *model_mod.HSLM, online: *const model_mod.HSLM, step: u32, total_steps: u32) void {
+        const decay = self.currentDecay(step, total_steps);
+
+        // Output projection
+        updateShadows(target.output_shadow, online.output_shadow, decay);
+
+        // Walk target and online blocks in lockstep, by pointer.
+        for (&target.blocks, &online.blocks) |*dst, *src| {
+            // TNN dense layers: weights, then biases
+            updateShadows(dst.tnn.shadow_up, src.tnn.shadow_up, decay);
+            updateShadows(dst.tnn.shadow_down, src.tnn.shadow_down, decay);
+            updateShadows(dst.tnn.bias_up, src.tnn.bias_up, decay);
+            updateShadows(dst.tnn.bias_down, src.tnn.bias_down, decay);
+
+            // Sacred attention projections (q, k, v, o)
+            updateShadows(dst.sacred_attn.shadow_q, src.sacred_attn.shadow_q, decay);
+            updateShadows(dst.sacred_attn.shadow_k, src.sacred_attn.shadow_k, decay);
+            updateShadows(dst.sacred_attn.shadow_v, src.sacred_attn.shadow_v, decay);
+            updateShadows(dst.sacred_attn.shadow_o, src.sacred_attn.shadow_o, decay);
+
+            // RMS norm gain
+            updateShadows(dst.sacred_attn.rms_gamma, src.sacred_attn.rms_gamma, decay);
+        }
+
+        // Embedding float table
+        updateShadows(target.emb.float_table, online.emb.float_table, decay);
+    }
+
+    /// Decay the schedule yields at `step` out of `total_steps`.
+    pub fn currentDecay(self: *const EmaSync, step: u32, total_steps: u32) f32 {
+        return scheduledDecay(step, total_steps, self.decay_start, self.decay_end);
+    }
+};
+
+/// Linearly interpolate from `start` (at step 0) to `end` (at `step >= total_steps`).
+/// Returns `end` directly when `total_steps` is 0.
+pub fn scheduledDecay(step: u32, total_steps: u32, start: f32, end: f32) f32 {
+    if (total_steps == 0) return end;
+
+    // Fraction of the schedule completed; pinned at 1.0 once step reaches total_steps.
+    const progress: f32 = if (step >= total_steps)
+        1.0
+    else
+        @as(f32, @floatFromInt(step)) / @as(f32, @floatFromInt(total_steps));
+
+    return start + (end - start) * progress;
+}
+
+// ═══════════════════════════════════════════════════════════════════════════════
+// TESTS
+// ═══════════════════════════════════════════════════════════════════════════════
+
+test "ema decay formula" {
+    // Blending toward an all-zero online vector just scales each target entry by the decay:
+    // target[i] = 0.996 * target[i] + 0.004 * 0.0
+    const decay: f32 = 0.996;
+    var target = [_]f32{ 1.0, 2.0, 3.0 };
+    const online = [_]f32{ 0.0, 0.0, 0.0 };
+    EmaSync.updateShadows(&target, &online, decay);
+
+    const expected = [_]f32{ 0.996, 1.992, 2.988 };
+    for (expected, target) |want, got| {
+        try std.testing.expectApproxEqAbs(want, got, 1e-5);
+    }
+}
+
+test "ema schedule ramp" {
+    // Ramp 0.996 → 1.0 over 100 steps, sampled at the start, midpoint, end, and past the end.
+    const Case = struct { step: u32, want: f32 };
+    const cases = [_]Case{
+        .{ .step = 0, .want = 0.996 }, // start
+        .{ .step = 50, .want = 0.998 }, // midpoint
+        .{ .step = 100, .want = 1.0 }, // end
+        .{ .step = 200, .want = 1.0 }, // beyond total → clamped to end
+    };
+    for (cases) |case| {
+        try std.testing.expectApproxEqAbs(case.want, scheduledDecay(case.step, 100, 0.996, 1.0), 1e-6);
+    }
+}
+
+test "ema sync models" {
+    const allocator = std.testing.allocator;
+
+    var online = try model_mod.HSLM.init(allocator);
+    defer online.deinit();
+    var target = try model_mod.HSLM.init(allocator);
+    defer target.deinit();
+
+    // Overwrite some online weights with a known pattern so the test does not pass
+    // merely because two freshly initialised models already hold identical values.
+    for (online.output_shadow, 0..) |*w, i| {
+        w.* = @as(f32, @floatFromInt(i % 5)) + 0.5;
+    }
+    for (online.blocks[0].tnn.shadow_up, 0..) |*w, i| {
+        w.* = @as(f32, @floatFromInt(i % 3)) - 1.25;
+    }
+
+    const ema = EmaSync{ .decay_start = 0.0, .decay_end = 0.0 };
+    // decay=0 means target = 0 * target + 1 * online, i.e. a full copy
+    ema.syncModels(&target, &online, 0, 100);
+
+    // After the decay=0 sync, target shadows should equal online shadows.
+    // expectApproxEqAbs takes (expected, actual, tolerance): online is the expected value.
+    for (target.output_shadow, online.output_shadow) |t, o| {
+        try std.testing.expectApproxEqAbs(o, t, 1e-6);
+    }
+    // Spot-check the first block's TNN up-projection
+    for (target.blocks[0].tnn.shadow_up, online.blocks[0].tnn.shadow_up) |t, o| {
+        try std.testing.expectApproxEqAbs(o, t, 1e-6);
+    }
+}
@@ -0,0 +1,154 @@
+// T-JEPA — Block Masking for Sequences
+// Contiguous span masking: harder prediction → better representations
+// Spans aligned to ternary powers (3^1=3, 3^2=9)
+//
+// φ² + 1/φ² = 3 = TRINITY
+
+const std = @import("std");
+const constants = @import("constants.zig");
+
+const CONTEXT_LEN = constants.CONTEXT_LEN;
+
+// ═══════════════════════════════════════════════════════════════════════════════
+// MASK CONFIG
+// ═══════════════════════════════════════════════════════════════════════════════
+
+/// Tunable parameters read by `generateMask`.
+pub const MaskConfig = struct {
+    mask_ratio: f32 = 0.3, // upper bound on the masked fraction of the sequence (30% by default)
+    min_span: usize = 3, // shortest sampled span length, 3^1
+    max_span: usize = 9, // longest sampled span length, 3^2 (ctx=27 can't fit 3 spans of 27)
+    num_spans: usize = 2, // number of spans attempted per mask; 2 spans fit in ctx=27..81
+};
+
+/// A mask over up to CONTEXT_LEN positions: per-position flags plus packed index lists.
+pub const MaskResult = struct {
+    /// `true` where the position is visible, `false` where it is masked.
+    visible: [CONTEXT_LEN]bool,
+    /// Count of visible positions.
+    num_visible: usize,
+    /// Count of masked positions.
+    num_masked: usize,
+    /// Packed list of masked indices.
+    masked_positions: [CONTEXT_LEN]usize,
+    /// Packed list of visible indices.
+    visible_positions: [CONTEXT_LEN]usize,
+
+    /// Blank result: every slot flagged visible, both counters zero, both index lists zeroed.
+    pub fn init() MaskResult {
+        var fresh: MaskResult = undefined;
+        @memset(&fresh.visible, true);
+        @memset(&fresh.masked_positions, 0);
+        @memset(&fresh.visible_positions, 0);
+        fresh.num_visible = 0;
+        fresh.num_masked = 0;
+        return fresh;
+    }
+};
+
+// ═══════════════════════════════════════════════════════════════════════════════
+// MASK GENERATION
+// ═══════════════════════════════════════════════════════════════════════════════
+
+/// Generate a contiguous-span mask for a sequence.
+///
+/// 1. Up to `config.num_spans` spans are drawn, each with a length uniform in
+///    [min_span, max_span] and a uniformly random start that keeps the span in range.
+/// 2. Overlapping spans merge: a position already masked is not counted twice.
+/// 3. The total number of masked positions never exceeds
+///    floor(effective_len * mask_ratio).
+///
+/// `seq_len` is truncated to CONTEXT_LEN; a `seq_len` of 0 returns the blank result.
+/// Positions at or beyond the effective length stay flagged visible but are not
+/// counted in `num_visible` or listed in `visible_positions`.
+///
+/// Out-of-range configs are tolerated instead of trapping: `mask_ratio` is clamped
+/// to [0, 1] (NaN is treated as 0), and a `max_span` below `min_span` is treated as
+/// equal to `min_span`. For in-range configs the sequence of RNG draws is unchanged,
+/// so results for a given seed are unchanged.
+pub fn generateMask(seq_len: usize, config: MaskConfig, rng: std.Random) MaskResult {
+    // MaskResult.init() already flags every slot visible, so no extra reset is needed.
+    var result = MaskResult.init();
+    if (seq_len == 0) return result;
+
+    const effective_len: usize = @min(seq_len, CONTEXT_LEN);
+
+    // @intFromFloat is illegal for NaN, negative, or out-of-range values; sanitize first.
+    const ratio: f32 = if (std.math.isNan(config.mask_ratio))
+        0.0
+    else
+        std.math.clamp(config.mask_ratio, 0.0, 1.0);
+    const max_masked: usize = @intFromFloat(@as(f32, @floatFromInt(effective_len)) * ratio);
+
+    // Guard the span range so an inverted config cannot underflow the subtraction.
+    const shortest = config.min_span;
+    const longest = @max(config.max_span, shortest);
+    const span_choices = (longest - shortest) +| 1;
+
+    // Generate spans
+    var total_masked: usize = 0;
+    for (0..config.num_spans) |_| {
+        if (total_masked >= max_masked) break;
+
+        // Sample span length, then trim it to the remaining mask budget
+        const span_len = shortest + rng.uintLessThan(usize, span_choices);
+        const actual_span = @min(span_len, max_masked - total_masked);
+
+        if (actual_span == 0) break;
+        // A span covering the whole sequence is never placed.
+        if (effective_len <= actual_span) break;
+
+        // Random start position such that the span ends inside the sequence
+        const max_start = effective_len - actual_span;
+        const start = rng.uintLessThan(usize, max_start + 1);
+
+        // Mark span as masked; positions masked by an earlier span are skipped
+        for (start..start + actual_span) |pos| {
+            if (!result.visible[pos]) continue;
+            result.visible[pos] = false;
+            total_masked += 1;
+            if (total_masked >= max_masked) break;
+        }
+    }
+
+    // Build packed position arrays, in increasing index order
+    var vi: usize = 0;
+    var mi: usize = 0;
+    for (0..effective_len) |i| {
+        if (result.visible[i]) {
+            result.visible_positions[vi] = i;
+            vi += 1;
+        } else {
+            result.masked_positions[mi] = i;
+            mi += 1;
+        }
+    }
+    result.num_visible = vi;
+    result.num_masked = mi;
+
+    return result;
+}
+
+// ═══════════════════════════════════════════════════════════════════════════════
+// TESTS
+// ═══════════════════════════════════════════════════════════════════════════════
+
+test "mask valid split" {
+    const seq_len: usize = 27;
+    var prng = std.Random.DefaultPrng.init(42);
+    const mask = generateMask(seq_len, .{}, prng.random());
+    // Visible and masked counts together must account for the whole sequence.
+    try std.testing.expectEqual(seq_len, mask.num_visible + mask.num_masked);
+}
+
+test "mask ratio approximate" {
+    const trials = 100;
+    const seq_len: usize = 81;
+    var prng = std.Random.DefaultPrng.init(123);
+
+    // Accumulate masked counts over many draws and look at the mean masked fraction.
+    var masked_sum: usize = 0;
+    var trial: usize = 0;
+    while (trial < trials) : (trial += 1) {
+        masked_sum += generateMask(seq_len, .{}, prng.random()).num_masked;
+    }
+
+    const denom: f32 = @floatFromInt(trials * seq_len);
+    const avg_ratio = @as(f32, @floatFromInt(masked_sum)) / denom;
+    // Loose sanity band around the 0.3 default: strictly between 0.1 and 0.5.
+    try std.testing.expect(avg_ratio > 0.1 and avg_ratio < 0.5);
+}
+
+test "mask spans contiguous" {
+    var prng = std.Random.DefaultPrng.init(777);
+    const cfg: MaskConfig = .{ .num_spans = 1, .min_span = 5, .max_span = 9 };
+    const mask = generateMask(81, cfg, prng.random());
+
+    // A single span must yield consecutive masked indices: each is its predecessor + 1.
+    var i: usize = 1;
+    while (i < mask.num_masked) : (i += 1) {
+        const gap = mask.masked_positions[i] - mask.masked_positions[i - 1];
+        try std.testing.expectEqual(@as(usize, 1), gap);
+    }
+}
+
+test "mask deterministic seed" {
+    // Two generators seeded identically must produce identical masks.
+    const seed: u64 = 42;
+    var prng_a = std.Random.DefaultPrng.init(seed);
+    var prng_b = std.Random.DefaultPrng.init(seed);
+    const first = generateMask(27, .{}, prng_a.random());
+    const second = generateMask(27, .{}, prng_b.random());
+
+    try std.testing.expectEqual(first.num_masked, second.num_masked);
+    try std.testing.expectEqualSlices(
+        usize,
+        first.masked_positions[0..first.num_masked],
+        second.masked_positions[0..first.num_masked],
+    );
+}