perf(matmul): optimize batch ternary matmul (OPT-T07)

gHashTag · ona-agent · gHashTag · commit 3c8e3d968842 · 2026-02-02T09:54:06.000Z
- Add batchTiledTernaryMatVec with 8-row batch processing
- Update parallel ternaryWorker with 4-row batch optimization
- Use batchTernaryMatVec for small matrices (faster than SIMD-16)
- Benchmark: 2.28x speedup (3.36 → 7.65 GFLOPS)

Co-authored-by: Ona <no-reply@ona.com>
diff --git a/docs/DISCOVERIES.md b/docs/DISCOVERIES.md
@@ -78,6 +78,7 @@ Where:
 | OPT-T04 | Ternary Attention | 16x | 1.5x | ✅ Implemented |
 | OPT-T05 | Ternary Embeddings | 12.8x | 1x | ✅ Implemented |
 | OPT-T06 | Ternary Normalization | 16x | 0.2x | ✅ Implemented |
+| OPT-T07 | Batch Ternary MatMul | N/A | 2.28x | ✅ Implemented |
 
 ### Business Value
 
@@ -350,6 +351,33 @@ var model = try TriModel.load(allocator, "model.tri");
 try model.enableTernaryNorm(); // 16x memory reduction for norm weights
 ```
 
+### Batch Ternary MatMul (OPT-T07)
+
+**Status**: ✅ Implemented
+
+| Component | File | Description |
+|-----------|------|-------------|
+| batchTernaryMatVec | `ternary_weights.zig` | 4-row batch SIMD matmul |
+| batchTiledTernaryMatVec | `ternary_weights.zig` | 8-row optimized version |
+| ternaryWorker | `parallel_inference.zig` | Parallel batch worker |
+
+**Benchmark Results (2048x2048 matrix):**
+```
+╔══════════════════════════════════════════════════════════════╗
+║           TERNARY MATMUL BENCHMARK (2048x2048)              ║
+╠══════════════════════════════════════════════════════════════╣
+║  SIMD-16 (baseline):  2499.7 us  ( 3.36 GFLOPS)             ║
+║  BatchTiled (new):    1096.0 us  ( 7.65 GFLOPS)             ║
+║  Speedup:             2.28x                                  ║
+╚══════════════════════════════════════════════════════════════╝
+```
+
+**Optimization Techniques:**
+1. Process 4 or 8 rows per batch simultaneously (better register utilization)
+2. LUT-based sign conversion (faster than arithmetic)
+3. 8-wide SIMD vectors (AVX2 compatible)
+4. Parallel worker with batch processing
+
 ### Batch Processing (INF-004)
 
 **Status**: ✅ Implemented
diff --git a/specs/tri/optimized_ternary_matmul.vibee b/specs/tri/optimized_ternary_matmul.vibee
@@ -0,0 +1,71 @@
+# optimized_ternary_matmul.vibee
+# Cache-optimized ternary matrix-vector multiplication
+# Target: 2x speedup over current batch implementation
+
+name: optimized_ternary_matmul
+version: "1.0.0"
+language: zig
+module: optimized_ternary_matmul
+
+types:
+  TileConfig:
+    description: "Tiling configuration for cache optimization"
+    fields:
+      tile_rows: Int      # Rows per tile (fit in L1 cache)
+      tile_cols: Int      # Cols per tile (fit in L2 cache)
+      prefetch_distance: Int  # Prefetch ahead distance
+
+  TernaryTile:
+    description: "Pre-unpacked ternary tile for SIMD processing"
+    fields:
+      signs: List<Float>  # Pre-converted signs (-1, 0, +1)
+      rows: Int
+      cols: Int
+
+behaviors:
+  - name: tiled_ternary_matmul
+    given: output buffer, packed ternary weights, input vector, dimensions
+    when: performing matrix-vector multiplication with tiling
+    then: computes output with improved cache locality
+
+  - name: preunpack_tile
+    given: packed ternary bytes, tile dimensions
+    when: preparing tile for SIMD processing
+    then: returns pre-unpacked signs as f32 array
+
+  - name: simd_tile_dot
+    given: pre-unpacked signs, input vector slice
+    when: computing dot product for tile
+    then: returns partial sum using pure SIMD (no LUT)
+
+  - name: parallel_tiled_matmul
+    given: output, weights, input, dimensions, num_threads
+    when: distributing tiles across threads
+    then: computes output with parallel tile processing
+
+# Optimization Strategy:
+#
+# 1. TILING: Process matrix in L1/L2 cache-sized tiles
+#    - L1 cache: 32KB → tile_rows = 64, tile_cols = 512
+#    - L2 cache: 256KB → larger tiles for weight reuse
+#
+# 2. PRE-UNPACKING: Convert ternary to f32 signs once per tile
+#    - Eliminates LUT lookups in inner loop
+#    - Enables pure SIMD multiply-add
+#
+# 3. PREFETCHING: Software prefetch for next tile
+#    - Hide memory latency
+#
+# 4. PARALLEL TILES: Distribute tiles across threads
+#    - Better load balancing than row-based parallelism
+
+# Memory Layout:
+# - Weights: row-major packed ternary (4 values per byte)
+# - Input: contiguous f32 vector
+# - Output: contiguous f32 vector
+# - Tile buffer: pre-unpacked f32 signs (reused per tile)
+
+# Expected Performance:
+# - Current: 6.11 GFLOPS (batch-4)
+# - Target: 12+ GFLOPS (2x speedup)
+# - Theoretical max: ~50 GFLOPS (memory bandwidth limited)
diff --git a/src/vibeec/parallel_inference.zig b/src/vibeec/parallel_inference.zig
@@ -164,49 +164,123 @@ fn ternaryWorker(ctx: *const ParallelTernaryContext, chunk: WorkChunk) void {
     const cols_packed = (cols + 3) / 4;
     const sign_lut = [4]f32{ 0.0, 1.0, -1.0, 0.0 };
 
-    for (chunk.start_row..chunk.end_row) |row| {
+    const num_rows = chunk.end_row - chunk.start_row;
+    var row = chunk.start_row;
+
+    // Process 4 rows at a time (batch optimization)
+    while (row + 4 <= chunk.end_row) {
+        var sum0: Vec8f = @splat(0.0);
+        var sum1: Vec8f = @splat(0.0);
+        var sum2: Vec8f = @splat(0.0);
+        var sum3: Vec8f = @splat(0.0);
+
+        var col: usize = 0;
+        while (col + 8 <= cols) {
+            const in_vec: Vec8f = ctx.input[col..][0..8].*;
+            const col_byte = col / 4;
+
+            // Row 0
+            const r0_start = row * cols_packed;
+            if (r0_start + col_byte + 1 < ctx.weights.len) {
+                const b0 = ctx.weights[r0_start + col_byte];
+                const b1 = ctx.weights[r0_start + col_byte + 1];
+                const s0: Vec8f = .{
+                    sign_lut[(b0 >> 0) & 0x3], sign_lut[(b0 >> 2) & 0x3],
+                    sign_lut[(b0 >> 4) & 0x3], sign_lut[(b0 >> 6) & 0x3],
+                    sign_lut[(b1 >> 0) & 0x3], sign_lut[(b1 >> 2) & 0x3],
+                    sign_lut[(b1 >> 4) & 0x3], sign_lut[(b1 >> 6) & 0x3],
+                };
+                sum0 += in_vec * s0;
+            }
+
+            // Row 1
+            const r1_start = (row + 1) * cols_packed;
+            if (r1_start + col_byte + 1 < ctx.weights.len) {
+                const b0 = ctx.weights[r1_start + col_byte];
+                const b1 = ctx.weights[r1_start + col_byte + 1];
+                const s1: Vec8f = .{
+                    sign_lut[(b0 >> 0) & 0x3], sign_lut[(b0 >> 2) & 0x3],
+                    sign_lut[(b0 >> 4) & 0x3], sign_lut[(b0 >> 6) & 0x3],
+                    sign_lut[(b1 >> 0) & 0x3], sign_lut[(b1 >> 2) & 0x3],
+                    sign_lut[(b1 >> 4) & 0x3], sign_lut[(b1 >> 6) & 0x3],
+                };
+                sum1 += in_vec * s1;
+            }
+
+            // Row 2
+            const r2_start = (row + 2) * cols_packed;
+            if (r2_start + col_byte + 1 < ctx.weights.len) {
+                const b0 = ctx.weights[r2_start + col_byte];
+                const b1 = ctx.weights[r2_start + col_byte + 1];
+                const s2: Vec8f = .{
+                    sign_lut[(b0 >> 0) & 0x3], sign_lut[(b0 >> 2) & 0x3],
+                    sign_lut[(b0 >> 4) & 0x3], sign_lut[(b0 >> 6) & 0x3],
+                    sign_lut[(b1 >> 0) & 0x3], sign_lut[(b1 >> 2) & 0x3],
+                    sign_lut[(b1 >> 4) & 0x3], sign_lut[(b1 >> 6) & 0x3],
+                };
+                sum2 += in_vec * s2;
+            }
+
+            // Row 3
+            const r3_start = (row + 3) * cols_packed;
+            if (r3_start + col_byte + 1 < ctx.weights.len) {
+                const b0 = ctx.weights[r3_start + col_byte];
+                const b1 = ctx.weights[r3_start + col_byte + 1];
+                const s3: Vec8f = .{
+                    sign_lut[(b0 >> 0) & 0x3], sign_lut[(b0 >> 2) & 0x3],
+                    sign_lut[(b0 >> 4) & 0x3], sign_lut[(b0 >> 6) & 0x3],
+                    sign_lut[(b1 >> 0) & 0x3], sign_lut[(b1 >> 2) & 0x3],
+                    sign_lut[(b1 >> 4) & 0x3], sign_lut[(b1 >> 6) & 0x3],
+                };
+                sum3 += in_vec * s3;
+            }
+
+            col += 8;
+        }
+
+        ctx.output[row + 0] = @reduce(.Add, sum0) * ctx.scale;
+        ctx.output[row + 1] = @reduce(.Add, sum1) * ctx.scale;
+        ctx.output[row + 2] = @reduce(.Add, sum2) * ctx.scale;
+        ctx.output[row + 3] = @reduce(.Add, sum3) * ctx.scale;
+
+        row += 4;
+    }
+
+    // Handle remaining rows
+    while (row < chunk.end_row) : (row += 1) {
         var sum_vec: Vec8f = @splat(0.0);
         var sum_scalar: f32 = 0.0;
         const row_start = row * cols_packed;
 
         var col: usize = 0;
-
-        // SIMD loop: 8 floats at a time
         while (col + 8 <= cols and row_start + col / 4 + 1 < ctx.weights.len) {
             const in_vec: Vec8f = ctx.input[col..][0..8].*;
-
             const byte0 = ctx.weights[row_start + col / 4];
             const byte1 = ctx.weights[row_start + col / 4 + 1];
-
             const signs: Vec8f = .{
-                sign_lut[(byte0 >> 0) & 0x3],
-                sign_lut[(byte0 >> 2) & 0x3],
-                sign_lut[(byte0 >> 4) & 0x3],
-                sign_lut[(byte0 >> 6) & 0x3],
-                sign_lut[(byte1 >> 0) & 0x3],
-                sign_lut[(byte1 >> 2) & 0x3],
-                sign_lut[(byte1 >> 4) & 0x3],
-                sign_lut[(byte1 >> 6) & 0x3],
+                sign_lut[(byte0 >> 0) & 0x3], sign_lut[(byte0 >> 2) & 0x3],
+                sign_lut[(byte0 >> 4) & 0x3], sign_lut[(byte0 >> 6) & 0x3],
+                sign_lut[(byte1 >> 0) & 0x3], sign_lut[(byte1 >> 2) & 0x3],
+                sign_lut[(byte1 >> 4) & 0x3], sign_lut[(byte1 >> 6) & 0x3],
             };
-
             sum_vec += in_vec * signs;
             col += 8;
         }
 
         sum_scalar = @reduce(.Add, sum_vec);
 
-        // Scalar tail
         while (col < cols) : (col += 1) {
             const byte_idx = row_start + col / 4;
             if (byte_idx >= ctx.weights.len) break;
-
             const shift: u3 = @intCast((col % 4) * 2);
             const trit = (ctx.weights[byte_idx] >> shift) & 0x3;
             sum_scalar += ctx.input[col] * sign_lut[trit];
         }
 
         ctx.output[row] = sum_scalar * ctx.scale;
     }
+
+    _ = num_rows;
 }
 
 /// Minimum rows to justify parallelization overhead
@@ -221,9 +295,9 @@ pub fn parallelTernaryMatmul(
     cols: usize,
     scale: f32,
 ) void {
-    // For small matrices, use single-threaded SIMD (faster due to no thread overhead)
+    // For small matrices, use single-threaded batch SIMD (fastest)
     if (rows < MIN_PARALLEL_ROWS) {
-        ternary.simd16TernaryMatVec(output, weights, input, rows, cols);
+        ternary.batchTernaryMatVec(output, weights, input, rows, cols);
         for (output) |*o| o.* *= scale;
         return;
     }
diff --git a/src/vibeec/ternary_weights.zig b/src/vibeec/ternary_weights.zig