Skip to content

Commit 9d05fc4

Browse files
Merge pull request #575 from SKaiNET-developers/feature/native-fp32-matmul
feat(native-cpu): native FFM FP32 SGEMM kernel (PR 5 of 5)
2 parents 87a5730 + 2af135d commit 9d05fc4

8 files changed

Lines changed: 410 additions & 6 deletions

File tree

skainet-backends/skainet-backend-native-cpu/native/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ endif()
1212
add_library(skainet_kernels SHARED
1313
src/skainet_smoke.c
1414
src/q4k_matmul.c
15+
src/fp32_matmul.c
1516
)
1617

1718
target_include_directories(skainet_kernels PUBLIC

skainet-backends/skainet-backend-native-cpu/native/include/skainet_kernels.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,22 @@ SKAINET_API void skainet_q4k_matmul(
6060
int32_t output_offset
6161
);
6262

63+
/*
64+
* Row-major FP32 SGEMM: C(m, n) = A(m, k) * B(k, n).
65+
*
66+
* Strides are in floats (not bytes). For a contiguous parent matrix
67+
* `a_stride == k`, `b_stride == n`, `c_stride == n`. The kernel zeros
68+
* the m×n output block before accumulating, so callers always get
69+
* `C = A·B` (not `C += A·B`). `k == 0` zeros the block; `m == 0`
70+
* or `n == 0` is a no-op.
71+
*/
72+
SKAINET_API void skainet_fp32_matmul(
73+
const float* a, int32_t a_offset, int32_t a_stride,
74+
const float* b, int32_t b_offset, int32_t b_stride,
75+
float* c, int32_t c_offset, int32_t c_stride,
76+
int32_t m, int32_t n, int32_t k
77+
);
78+
6379
#ifdef __cplusplus
6480
}
6581
#endif
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
#include "skainet_kernels.h"

#include <stddef.h>
#include <stdint.h>

/*
 * Native row-major SGEMM backing the
 * sk.ainet.backend.api.kernel.Fp32MatmulKernel SPI:
 *
 *     C(m, n) = A(m, k) * B(k, n)
 *
 * All strides are expressed in float elements, never bytes. A fully
 * contiguous parent matrix passes `a_stride == k`, `b_stride == n`,
 * `c_stride == n`; sub-block callers pass the parent's larger strides
 * plus matching offsets.
 *
 * Per output row the kernel first zeroes the row, then accumulates
 * rank-1 updates (one broadcast A scalar times a contiguous B row into
 * a contiguous C row). The innermost `c_row[col] += scale * b_row[col]`
 * loop streams two contiguous float runs of length n, which compilers
 * auto-vectorize into FMA under -O3 -ffast-math (vfmadd231ps on
 * x86_64, fmla on AArch64).
 *
 * Caller contract:
 *   - The m×n block of C is fully overwritten (never accumulated into).
 *   - k == 0 zeroes the m×n block.
 *   - m == 0 || n == 0 is a no-op.
 *   - Negative m / n / k are caller errors; the Kotlin wrapper rejects
 *     them, and the loop guards here degrade them to a no-op (or
 *     zero-only pass for negative k) defensively.
 */
SKAINET_API void skainet_fp32_matmul(
    const float* SKAINET_RESTRICT a, int32_t a_offset, int32_t a_stride,
    const float* SKAINET_RESTRICT b, int32_t b_offset, int32_t b_stride,
    float* SKAINET_RESTRICT c, int32_t c_offset, int32_t c_stride,
    int32_t m, int32_t n, int32_t k
) {
    if (m <= 0 || n <= 0) {
        return;
    }

    for (int32_t row = 0; row < m; ++row) {
        float* SKAINET_RESTRICT c_row = c + c_offset + (size_t) row * c_stride;

        /* Overwrite semantics: every output row starts from zero, which
         * also satisfies the k == 0 "zero the block" contract. */
        for (int32_t col = 0; col < n; ++col) {
            c_row[col] = 0.0f;
        }

        /* Skip A/B entirely (including their pointer arithmetic) when
         * there is nothing to accumulate. */
        if (k <= 0) {
            continue;
        }

        const float* SKAINET_RESTRICT a_row = a + a_offset + (size_t) row * a_stride;
        for (int32_t depth = 0; depth < k; ++depth) {
            const float scale = a_row[depth];
            const float* SKAINET_RESTRICT b_row = b + b_offset + (size_t) depth * b_stride;
            /* Contiguous dual-stream FMA loop; `scale` broadcasts. */
            for (int32_t col = 0; col < n; ++col) {
                c_row[col] += scale * b_row[col];
            }
        }
    }
}
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
package sk.ainet.exec.kernel
2+
3+
import java.lang.foreign.Arena
4+
import java.lang.foreign.FunctionDescriptor
5+
import java.lang.foreign.Linker
6+
import java.lang.foreign.MemorySegment
7+
import java.lang.foreign.ValueLayout
8+
import java.lang.invoke.MethodHandle
9+
import sk.ainet.backend.api.kernel.Fp32MatmulKernel
10+
11+
/**
12+
* Native (FFM) implementation of [Fp32MatmulKernel].
13+
*
14+
* Wraps the bundled C symbol
15+
*
16+
* void skainet_fp32_matmul(
17+
* const float* a, int32_t a_offset, int32_t a_stride,
18+
* const float* b, int32_t b_offset, int32_t b_stride,
19+
* float* c, int32_t c_offset, int32_t c_stride,
20+
* int32_t m, int32_t n, int32_t k);
21+
*
22+
* The C kernel is a tight i-p-j outer-product accumulator over rows
23+
* of C; the inner `c[j] += a*b[j]` loop streams two contiguous arrays
24+
* and auto-vectorizes into FMA under -O3 -ffast-math (vfmadd231ps on
25+
* x86_64, fmla on AArch64).
26+
*
27+
* Numerical parity vs [PanamaVectorMatmulKernel] is asserted by
28+
* [NativeFp32MatmulKernelParityTest] within FMA + reordered-reduction
29+
* tolerance (the same `1e-5 * k` bar Panama uses against the scalar
30+
* reference).
31+
*
32+
* PR 5 of the staged native-FFM rollout — wraps the rollout per the
33+
* `native-ffm-plan` asciidoc. Single-threaded, no cache blocking;
34+
* future work could add parallelChunks-style row blocking and B-tile
35+
* packing, but the scalar C path already lands well within the SPI
36+
* contract on host-arch CPUs.
37+
*/
38+
internal object NativeFp32MatmulKernel : Fp32MatmulKernel {

    /** True iff the bundled native library loaded and `skainet_fp32_matmul` resolved. */
    fun isAvailable(): Boolean = handle != null

    /**
     * Row-major FP32 SGEMM via the native kernel: `out = a * b` over the
     * m×n block addressed by `outOffset`/`outStride`.
     *
     * Strides are in floats. The m×n block is fully overwritten; all other
     * floats of [out] (offset prefix, inter-row gaps of a strided
     * sub-block) are preserved.
     *
     * @throws IllegalArgumentException if m, n, or k is negative.
     * @throws IllegalStateException if invoked while the native library is
     *   unavailable — callers must gate on [isAvailable].
     */
    override fun matmul(
        a: FloatArray, aOffset: Int, aStride: Int,
        b: FloatArray, bOffset: Int, bStride: Int,
        out: FloatArray, outOffset: Int, outStride: Int,
        m: Int, n: Int, k: Int,
    ) {
        require(m >= 0 && n >= 0 && k >= 0) {
            "NativeFp32MatmulKernel: m, n, k must be non-negative; got m=$m n=$n k=$k"
        }
        if (m == 0 || n == 0) return

        val mh = handle
            ?: error("NativeFp32MatmulKernel.matmul invoked while native library unavailable")

        // Float reach of each off-heap copy: offset + last-row start + row
        // length. For non-contiguous strides this exceeds the matrix's
        // element count because the strides skip past floats belonging to
        // the parent matrix; allocating to the full reach keeps the native
        // pointer arithmetic simple and matches Kotlin-side bounds.
        // (m >= 1 and n >= 1 are guaranteed by the early return above.)
        val aReachFloats = if (k == 0) 0 else aOffset + (m - 1) * aStride + k
        val bReachFloats = if (k == 0) 0 else bOffset + (k - 1) * bStride + n
        val cReachFloats = outOffset + (m - 1) * outStride + n

        Arena.ofConfined().use { arena ->
            val aBytes = aReachFloats.toLong() * java.lang.Float.BYTES
            val bBytes = bReachFloats.toLong() * java.lang.Float.BYTES
            val cBytes = cReachFloats.toLong() * java.lang.Float.BYTES
            val align = ValueLayout.JAVA_FLOAT.byteAlignment()

            val aSeg: MemorySegment = if (aBytes > 0) arena.allocate(aBytes, align) else MemorySegment.NULL
            val bSeg: MemorySegment = if (bBytes > 0) arena.allocate(bBytes, align) else MemorySegment.NULL
            val cSeg: MemorySegment = arena.allocate(cBytes, align)

            if (aReachFloats > 0) {
                MemorySegment.copy(a, 0, aSeg, ValueLayout.JAVA_FLOAT, 0L, aReachFloats)
            }
            if (bReachFloats > 0) {
                MemorySegment.copy(b, 0, bSeg, ValueLayout.JAVA_FLOAT, 0L, bReachFloats)
            }
            // Bug fix: seed cSeg with the current contents of `out`. The
            // kernel writes only the m×n block, but the copy-back below
            // spans the full reach; without this pre-copy the offset prefix
            // and inter-row gap floats of a strided sub-block would be
            // clobbered with the arena's zero-fill, corrupting the caller's
            // parent matrix.
            MemorySegment.copy(out, 0, cSeg, ValueLayout.JAVA_FLOAT, 0L, cReachFloats)

            mh.invoke(
                aSeg, aOffset, aStride,
                bSeg, bOffset, bStride,
                cSeg, outOffset, outStride,
                m, n, k,
            )

            MemorySegment.copy(cSeg, ValueLayout.JAVA_FLOAT, 0L, out, 0, cReachFloats)
        }
    }

    // Lazily-resolved FFM downcall handle for `skainet_fp32_matmul`;
    // null when the bundled library or the symbol is unavailable on this
    // host (isAvailable() then reports false).
    private val handle: MethodHandle? by lazy {
        val lookup = NativeLibraryLoader.lookup() ?: return@lazy null
        val symbol = lookup.find("skainet_fp32_matmul").orElse(null) ?: return@lazy null
        val descriptor = FunctionDescriptor.ofVoid(
            ValueLayout.ADDRESS,  // a
            ValueLayout.JAVA_INT, // a_offset
            ValueLayout.JAVA_INT, // a_stride
            ValueLayout.ADDRESS,  // b
            ValueLayout.JAVA_INT, // b_offset
            ValueLayout.JAVA_INT, // b_stride
            ValueLayout.ADDRESS,  // c
            ValueLayout.JAVA_INT, // c_offset
            ValueLayout.JAVA_INT, // c_stride
            ValueLayout.JAVA_INT, // m
            ValueLayout.JAVA_INT, // n
            ValueLayout.JAVA_INT, // k
        )
        runCatching { Linker.nativeLinker().downcallHandle(symbol, descriptor) }.getOrNull()
    }
}

skainet-backends/skainet-backend-native-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/NativeKernelProvider.kt

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,16 +26,18 @@ import sk.ainet.backend.api.kernel.Q4KMemSegMatmulKernel
2626
*
2727
* Staged rollout cursor (see `native-ffm-plan` asciidoc):
2828
* - PR 2: real Q4_K matmul wired into the heap SPI.
29-
* - PR 3 (this commit): MemSeg-input zero-copy sibling.
30-
* - Later: native `matmulFp32`, `matmulQ6K`, `matmulQ8_0`.
29+
* - PR 3: MemSeg-input zero-copy sibling.
30+
* - PR 5 (this commit): native FP32 matmul wired into [matmulFp32].
31+
* - Later: native `matmulQ6K`, `matmulQ8_0` (need new SPI accessors).
3132
*/
3233
public object NativeKernelProvider : KernelProvider, MemSegKernelProvider {
3334
override val name: String = "native-ffm"
3435
override val priority: Int = 100
3536

3637
override fun isAvailable(): Boolean = NativeQ4KMatmulKernel.isAvailable()
3738

38-
override fun matmulFp32(): Fp32MatmulKernel? = null
39+
override fun matmulFp32(): Fp32MatmulKernel? =
40+
if (NativeFp32MatmulKernel.isAvailable()) NativeFp32MatmulKernel else null
3941

4042
override fun matmulQ4K(): Q4KMatmulKernel? =
4143
if (NativeQ4KMatmulKernel.isAvailable()) NativeQ4KMatmulKernel else null

skainet-backends/skainet-backend-native-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/NativeFfmPipelineTest.kt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,16 +32,16 @@ class NativeFfmPipelineTest {
3232
}
3333

3434
@Test
35-
fun `provider exposes Q4_K kernel when the native lib loads`() {
35+
fun `provider exposes Q4_K and FP32 kernels when the native lib loads`() {
3636
assertEquals("native-ffm", NativeKernelProvider.name)
3737
assertEquals(100, NativeKernelProvider.priority)
3838
assertTrue(
3939
NativeKernelProvider.isAvailable(),
4040
"Native kernel provider reports unavailable on this host — " +
4141
"bundled libskainet_kernels missing or skainet_q4k_matmul unresolved",
4242
)
43-
// FP32 matmul ships in a later PR; Q4_K is wired through PR 2.
44-
assertEquals(null, NativeKernelProvider.matmulFp32())
43+
// PR 5 wires both Q4_K (PR 2) and FP32 (this PR) through the SPI.
44+
assertNotNull(NativeKernelProvider.matmulFp32())
4545
assertNotNull(NativeKernelProvider.matmulQ4K())
4646
}
4747

0 commit comments

Comments
 (0)