Skip to content

Commit ee22828

Browse files
michalharakal and claude
committed
perf(kernel): cache-block PanamaVectorMatmulKernel (8x8x128 tiles)
Ports the (m, n, k)-tile blocking pattern from JvmVectorKernels.matmulFloatBlocked into the SPI kernel: 8x8 output tiles, 128-wide K-stripes. Output is zeroed once up front and the K-tile loop accumulates via `+=`, which keeps the contract "fully overwrite the m x n block" intact and avoids the gnarly "init only on first tile" gating in the original blocked kernel. Closes the perf gap that #558 flagged between the SPI kernel and the existing production blocked path. After this change the SPI kernel matches or beats the production path within JMH noise — routing DefaultCpuOpsJvm.matmul through KernelRegistry won't show a regression any more. KernelMatmulBench (JDK 21.0.10, M-series macOS): size scalar panama speedup prior panama (simple) 256 9.77ms 1.13ms 8.61x 1.36ms (-16%) 512 81.55ms 9.47ms 8.62x 13.62ms (-30%) 1024 865.54ms 79.88ms 10.83x 118.24ms (-32%) vs production MatmulBench (vector=true, blas=false) same run: size SPI tiled production blocked delta 256 1.13ms 1.24ms SPI 8.5% faster 512 9.47ms 10.38ms SPI 8.8% faster 1024 79.88ms 78.32ms SPI 2% slower (within noise) Existing parity tests (PanamaVectorMatmulKernelTest, including the 31x17x23 randomized case that exercises partial tiles in all three dims) pass unchanged within the 1e-5*k tolerance. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent b3b0c05 commit ee22828

1 file changed

Lines changed: 59 additions & 33 deletions

File tree

skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorMatmulKernel.kt

Lines changed: 59 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,20 @@ import sk.ainet.backend.api.kernel.Fp32MatmulKernel
1212
*
1313
* Strategy:
1414
* - Pack `B` into a transposed buffer `bt` of shape `(n, k)` so the
15-
* inner reduction streams contiguously over `k` for both operands —
16-
* `a[i, kk]` walks one row of `A` and `bt[j, kk]` walks one row of
17-
* the packed transpose.
18-
* - Inner loop is a vector-width FMA accumulator (`v.fma(w, acc)`),
19-
* reduced once per `(i, j)` pair via `reduceLanes(ADD)`.
20-
* - Tail elements that don't fill a vector lane are handled in scalar.
15+
* inner reduction streams contiguously over `k` for both operands.
16+
* - Cache-block the `(m, n, k)` iteration space with tiles
17+
* ([TILE_M], [TILE_N], [TILE_K]). Default 8×8×128 keeps a working
18+
* set well under L1 — eight A rows × 128 floats + eight Bᵀ rows ×
19+
* 128 floats ≈ 8 KB, within typical 32 KB L1.
20+
* - Inner reduction is a vector-width FMA accumulator
21+
* (`v.fma(w, acc)`), reduced via `reduceLanes(ADD)` once per
22+
* `(i, j)` cell per K-tile. Tail elements that don't fill a vector
23+
* lane are handled in scalar.
24+
* - Output is zeroed once up front; per-tile work accumulates via `+=`
25+
* so the K-loop can split across multiple tiles cleanly.
2126
*
22-
* The B-pack is `O(n * k)` floats per call; that's cheap relative to
23-
* the `O(m * n * k)` FLOPs but still allocates each invocation. A
27+
* The B-pack is `O(n * k)` floats per call; cheap relative to the
28+
* `O(m * n * k)` FLOPs but still allocates each invocation. A
2429
* scratch-pool integration is out of scope for this kernel and lives
2530
* one layer up (see `ScratchPool` SPI in `skainet-lang-core`).
2631
*
@@ -31,6 +36,10 @@ import sk.ainet.backend.api.kernel.Fp32MatmulKernel
3136
public object PanamaVectorMatmulKernel : Fp32MatmulKernel {
3237
private val species: VectorSpecies<Float> = FloatVector.SPECIES_PREFERRED
3338

39+
private const val TILE_M = 8
40+
private const val TILE_N = 8
41+
private const val TILE_K = 128
42+
3443
override fun matmul(
3544
a: FloatArray, aOffset: Int, aStride: Int,
3645
b: FloatArray, bOffset: Int, bStride: Int,
@@ -41,13 +50,14 @@ public object PanamaVectorMatmulKernel : Fp32MatmulKernel {
4150
"PanamaVectorMatmulKernel: m, n, k must be non-negative; got m=$m n=$n k=$k"
4251
}
4352
if (m == 0 || n == 0) return
44-
if (k == 0) {
45-
for (i in 0 until m) {
46-
val rowOff = outOffset + i * outStride
47-
for (j in 0 until n) out[rowOff + j] = 0f
48-
}
49-
return
53+
// Zero the m×n output block once. The K-tile loop accumulates
54+
// via `+=`, so the contract "fully overwrite the output block"
55+
// is preserved even when k == 0 (no tile loop runs).
56+
for (i in 0 until m) {
57+
val rowOff = outOffset + i * outStride
58+
for (j in 0 until n) out[rowOff + j] = 0f
5059
}
60+
if (k == 0) return
5161

5262
// Pack B^T: bt[j, kk] = b[kk, j].
5363
val bt = FloatArray(n * k)
@@ -59,28 +69,44 @@ public object PanamaVectorMatmulKernel : Fp32MatmulKernel {
5969
}
6070

6171
val step = species.length()
62-
val loopBound = species.loopBound(k)
6372

64-
for (i in 0 until m) {
65-
val aRow = aOffset + i * aStride
66-
val outRow = outOffset + i * outStride
67-
for (j in 0 until n) {
68-
val btRow = j * k
69-
var acc = FloatVector.zero(species)
70-
var idx = 0
71-
while (idx < loopBound) {
72-
val va = FloatVector.fromArray(species, a, aRow + idx)
73-
val vb = FloatVector.fromArray(species, bt, btRow + idx)
74-
acc = va.fma(vb, acc)
75-
idx += step
76-
}
77-
var sum = acc.reduceLanes(VectorOperators.ADD)
78-
while (idx < k) {
79-
sum += a[aRow + idx] * bt[btRow + idx]
80-
idx++
73+
var mTile = 0
74+
while (mTile < m) {
75+
val mEnd = minOf(mTile + TILE_M, m)
76+
var nTile = 0
77+
while (nTile < n) {
78+
val nEnd = minOf(nTile + TILE_N, n)
79+
var kTile = 0
80+
while (kTile < k) {
81+
val kEnd = minOf(kTile + TILE_K, k)
82+
val kLen = kEnd - kTile
83+
val loopBound = species.loopBound(kLen)
84+
for (i in mTile until mEnd) {
85+
val aRowBase = aOffset + i * aStride + kTile
86+
val outRowBase = outOffset + i * outStride
87+
for (j in nTile until nEnd) {
88+
val btRowBase = j * k + kTile
89+
var acc = FloatVector.zero(species)
90+
var idx = 0
91+
while (idx < loopBound) {
92+
val va = FloatVector.fromArray(species, a, aRowBase + idx)
93+
val vb = FloatVector.fromArray(species, bt, btRowBase + idx)
94+
acc = va.fma(vb, acc)
95+
idx += step
96+
}
97+
var sum = acc.reduceLanes(VectorOperators.ADD)
98+
while (idx < kLen) {
99+
sum += a[aRowBase + idx] * bt[btRowBase + idx]
100+
idx++
101+
}
102+
out[outRowBase + j] += sum
103+
}
104+
}
105+
kTile = kEnd
81106
}
82-
out[outRow + j] = sum
107+
nTile = nEnd
83108
}
109+
mTile = mEnd
84110
}
85111
}
86112
}

0 commit comments

Comments
 (0)