Skip to content

Commit 9cc73aa

Browse files
Merge pull request #562 from SKaiNET-developers/feature/jvm-q4k-simd-spi
feat(kernel): SIMD-fused Q4_K matmul kernel + Q4KMatmulKernel SPI
2 parents db00c95 + 8df65b8 commit 9cc73aa

7 files changed

Lines changed: 497 additions & 8 deletions

File tree

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
package sk.ainet.bench
2+
3+
import java.util.concurrent.TimeUnit
4+
import kotlin.random.Random
5+
import org.openjdk.jmh.annotations.Benchmark
6+
import org.openjdk.jmh.annotations.BenchmarkMode
7+
import org.openjdk.jmh.annotations.Level
8+
import org.openjdk.jmh.annotations.Mode
9+
import org.openjdk.jmh.annotations.OutputTimeUnit
10+
import org.openjdk.jmh.annotations.Param
11+
import org.openjdk.jmh.annotations.Scope
12+
import org.openjdk.jmh.annotations.Setup
13+
import org.openjdk.jmh.annotations.State
14+
import sk.ainet.exec.kernel.PanamaVectorQ4KMatmulKernel
15+
16+
/**
 * F32-input × Q4_K-weight matmul benchmark for the SIMD-fused Panama
 * kernel ([PanamaVectorQ4KMatmulKernel]) at typical LLM matmul shapes
 * for Gemma 4 E2B Q4_K_M:
 *  - 1024 x 1024 — small attention projection
 *  - 4096 x 4096 — hidden→hidden / FFN gate
 *  - 4096 x 1024 — hidden→KV slice
 *
 * Each `inputDim` must be a multiple of 256 (the Q4_K block size).
 * The packed layout is input-block-major (`(blockIdx * outputDim + o) * 144`).
 *
 * Parity against the prior `JvmQuantizedVectorKernels.matmulQ4_KVec`
 * partial-vec implementation is covered by `PanamaVectorQ4KMatmulKernelTest`,
 * which exercises both code paths; that legacy kernel is `internal`,
 * which keeps it out of this cross-module bench harness.
 */
@State(Scope.Benchmark)
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
open class QuantizedMatmulBench {

    @Param("1024-1024", "4096-1024", "4096-4096")
    var shape: String = "4096-4096"

    private var inputDim: Int = 0
    private var outputDim: Int = 0
    private lateinit var input: FloatArray
    private lateinit var packedWeights: ByteArray
    private lateinit var output: FloatArray

    @Setup(Level.Trial)
    fun setup() {
        // Shape params are encoded as "inputDim-outputDim".
        val (rows, cols) = shape.split("-").map(String::toInt)
        inputDim = rows
        outputDim = cols
        require(inputDim % 256 == 0) { "inputDim must be multiple of 256, got $inputDim" }

        val totalBlocks = (inputDim / 256) * outputDim
        packedWeights = ByteArray(totalBlocks * 144)
        Random(42).nextBytes(packedWeights)
        // Pin each block's d / dMin halves to 1.0f16 (LE bytes 0x00 0x3C) so
        // dequantized magnitudes stay finite across steady-state iterations.
        repeat(totalBlocks) { block ->
            val base = block * 144
            packedWeights[base] = 0x00.toByte()
            packedWeights[base + 1] = 0x3C.toByte()
            packedWeights[base + 2] = 0x00.toByte()
            packedWeights[base + 3] = 0x3C.toByte()
        }
        input = FloatArray(inputDim) { ((it % 251) - 125).toFloat() / 127f }
        output = FloatArray(outputDim)
    }

    @Benchmark
    fun matmul_q4k_panama(): FloatArray {
        PanamaVectorQ4KMatmulKernel.matmul(
            input = input, inputOffset = 0,
            weight = packedWeights, weightByteOffset = 0,
            inputDim = inputDim, outputDim = outputDim,
            output = output, outputOffset = 0,
        )
        return output
    }
}

skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/KernelProvider.kt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,4 +42,14 @@ public interface KernelProvider {
4242
* provider does not specialize matmul.
4343
*/
4444
public fun matmulFp32(): Fp32MatmulKernel?
45+
46+
/**
 * F32 × Q4_K matmul kernel exposed by this provider, or `null` if
 * this provider does not specialize Q4_K.
 *
 * The default implementation returns `null` so providers that
 * pre-date this accessor (e.g. older custom providers and the
 * scalar reference) keep compiling without change — callers then
 * cascade to a lower-priority provider that does carry the kernel.
 */
public fun matmulQ4K(): Q4KMatmulKernel? = null
4555
}
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
package sk.ainet.backend.api.kernel
2+
3+
/**
 * F32 input × Q4_K-packed weights matrix-vector multiply, in canonical
 * ggml super-block layout.
 *
 *   output[outputOffset + o] = Σ_j input[inputOffset + j] · dequant(weight[o, j])
 *   for j ∈ [0, inputDim), o ∈ [0, outputDim)
 *
 * Block layout (256-element super-block, 144 bytes/block; see
 * [sk.ainet.lang.tensor.data.Q4_KTensorData] kdoc for the byte map):
 *  - bytes 0..1   : `d`    (super-block scale, FP16 LE)
 *  - bytes 2..3   : `dMin` (super-block min-scale, FP16 LE)
 *  - bytes 4..15  : 12 bytes of packed (6-bit scaleIdx, 6-bit minIdx) for
 *                   8 sub-blocks via ggml's `get_scale_min_k4` mixing
 *  - bytes 16..143: 128 bytes of 4-bit codes, *strided* in 4 groups of
 *                   32 bytes — each byte's lo nibble belongs to one
 *                   sub-block and the hi nibble of the same byte belongs
 *                   to the *next* sub-block at the same intra-group index.
 *
 * Per sub-block s ∈ 0..7:
 *   `scale[s]  = d * scaleIdx[s]`
 *   `offset[s] = dMin * minIdx[s]`
 *   per element: `dequant = code * scale[s] - offset[s]`
 *
 * The lazy-`dmin` accumulation trick (used by every well-tuned Q4_K
 * kernel including ggml's reference) avoids subtracting `offset` per
 * element by tracking `Σ(input · code)` and `Σ(input)` per sub-block
 * and combining as `scale * codeSum − offset * inputSum` once.
 *
 * Contract: implementations MUST NOT mutate `input` or `weight`. They
 * MAY assume the arrays do not alias each other or `output`. They MUST
 * fully write the `outputDim` floats starting at `output[outputOffset]`.
 *
 * Packed-weight row-major contract: `weight` holds blocks laid out
 * `(blockIdx * outputDim + o) * 144` for output row `o` and input
 * block index `blockIdx`. This matches `Q4_KBlockTensorData.packedData`
 * and `JvmQuantizedVectorKernels.matmulQ4_KVec`.
 *
 * `inputDim` MUST be a multiple of 256 (the Q4_K block size).
 */
public interface Q4KMatmulKernel {
    /**
     * Computes one matrix-vector product as described on the interface.
     *
     * @param input FP32 input vector (single row).
     * @param inputOffset element offset into [input] where the row starts.
     * @param weight packed Q4_K bytes for the full `outputDim × inputDim` weight tensor.
     * @param weightByteOffset byte offset into [weight] where block (0, 0) starts.
     * @param inputDim contraction dimension (must be a multiple of 256).
     * @param outputDim number of output cells.
     * @param output FP32 output vector.
     * @param outputOffset element offset into [output] where the row starts.
     */
    public fun matmul(
        input: FloatArray, inputOffset: Int,
        weight: ByteArray, weightByteOffset: Int,
        inputDim: Int, outputDim: Int,
        output: FloatArray, outputOffset: Int,
    )
}

skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorKernelProvider.kt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package sk.ainet.exec.kernel
22

33
import sk.ainet.backend.api.kernel.Fp32MatmulKernel
44
import sk.ainet.backend.api.kernel.KernelProvider
5+
import sk.ainet.backend.api.kernel.Q4KMatmulKernel
56
import sk.ainet.exec.tensor.ops.JvmCpuBackendConfig
67

78
/**
@@ -37,6 +38,9 @@ public object PanamaVectorKernelProvider : KernelProvider {
3738
// FP32 matmul kernel — present only when the Vector API is usable on this JVM.
override fun matmulFp32(): Fp32MatmulKernel? =
    PanamaVectorMatmulKernel.takeIf { isAvailable() }
3940

41+
// Q4_K matmul kernel — present only when the Vector API is usable on this JVM.
override fun matmulQ4K(): Q4KMatmulKernel? =
    PanamaVectorQ4KMatmulKernel.takeIf { isAvailable() }
43+
4044
private fun isVectorApiClassLoaded(): Boolean = runCatching {
4145
Class.forName("jdk.incubator.vector.FloatVector")
4246
Class.forName("jdk.incubator.vector.VectorSpecies")
Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
package sk.ainet.exec.kernel
2+
3+
import jdk.incubator.vector.ByteVector
4+
import jdk.incubator.vector.FloatVector
5+
import jdk.incubator.vector.VectorOperators
6+
import jdk.incubator.vector.VectorSpecies
7+
import sk.ainet.backend.api.kernel.Q4KMatmulKernel
8+
import sk.ainet.exec.tensor.ops.parallelChunks
9+
10+
/**
 * SIMD-vectorized Q4_K matmul on the JDK Vector API.
 *
 * Pipeline per 32-byte qs slab (which carries two adjacent sub-blocks
 * — sub-block `2j` in lo nibbles, sub-block `2j+1` in hi nibbles):
 *  1. `ByteVector.fromArray(byteSpeciesForFloat, weight, qsRegion + idx)` — single load.
 *  2. `loNibVec = byteVec.and(0x0F.toByte())`,
 *     `hiNibVec = byteVec.lanewise(LSHR, 4)` — extract both nibbles.
 *  3. `castShape(floatSpecies, 0)` — widen + I2F.
 *  4. `inputVec.fma(codeFloatVec, codeAcc)` — accumulate `Σ(input·code)`
 *     per sub-block; track `inputAcc = Σ(input)` separately for the
 *     lazy-`dmin` correction.
 *  5. After all super-blocks for a given output cell, sum across
 *     sub-blocks: `acc += scale[s] · codeSum[s] − offset[s] · inputSum[s]`
 *     with `scale[s] = d · scaleIdx[s]` and `offset[s] = dMin · minIdx[s]`.
 *
 * Compared to [sk.ainet.exec.tensor.ops.JvmQuantizedVectorKernels.matmulQ4_KVec]:
 *  - Replaces the scalar 32-iteration nibble unpack into a scratch
 *    `FloatArray` with a single `ByteVector` load + `castShape` per
 *    `floatSpecies.length()` elements.
 *  - Folds lo + hi nibble passes into a single byte load (the existing
 *    helper called the byte-load helper twice — once per nibble).
 *
 * Numerical equivalence with the existing partial-vec kernel is
 * within FMA + reordered-reduction tolerance; verified via parity
 * tests at `1e-5 · inputDim`.
 */
public object PanamaVectorQ4KMatmulKernel : Q4KMatmulKernel {

    // Q4_K geometry: 256 codes per super-block, split into 8 sub-blocks
    // of 32, packed into 144 bytes (2×FP16 + 12 scale bytes + 128 qs bytes).
    private const val BLOCK_SIZE = 256
    private const val SUB_BLOCK_SIZE = 32
    private const val SUB_BLOCKS_PER_BLOCK = 8
    private const val BYTES_PER_BLOCK = 144

    private val floatSpecies: VectorSpecies<Float> = FloatVector.SPECIES_PREFERRED

    /**
     * Byte species sized so `castShape(floatSpecies, 0)` consumes
     * exactly `floatSpecies.length()` bytes — same convention as
     * [sk.ainet.exec.tensor.ops.JvmQuantizedVectorKernels.byteSpeciesForFloat].
     */
    private val byteSpeciesForFloat: VectorSpecies<Byte> = when (floatSpecies.length()) {
        16 -> ByteVector.SPECIES_128
        else -> ByteVector.SPECIES_64 // covers 4-wide (NEON) and 8-wide (AVX2)
    }

    override fun matmul(
        input: FloatArray, inputOffset: Int,
        weight: ByteArray, weightByteOffset: Int,
        inputDim: Int, outputDim: Int,
        output: FloatArray, outputOffset: Int,
    ) {
        require(inputDim % BLOCK_SIZE == 0) {
            "PanamaVectorQ4KMatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim"
        }
        if (outputDim == 0 || inputDim == 0) return
        val blocksPerInputDim = inputDim / BLOCK_SIZE

        // Output cells are independent, so the output range is chunked
        // across worker tasks.
        parallelChunks(outputDim) { startO, endO ->
            // Per-task scratch — must not be shared across worker threads.
            val scaleIdx = IntArray(SUB_BLOCKS_PER_BLOCK)
            val minIdx = IntArray(SUB_BLOCKS_PER_BLOCK)
            for (o in startO until endO) {
                var acc = 0f
                for (blockIdx in 0 until blocksPerInputDim) {
                    // Input-block-major packed layout: block (blockIdx, o).
                    val blockBase = weightByteOffset + (blockIdx * outputDim + o) * BYTES_PER_BLOCK

                    // d, dMin (FP16 LE).
                    val dBits = (weight[blockBase + 1].toInt() and 0xFF shl 8) or
                        (weight[blockBase].toInt() and 0xFF)
                    val dMinBits = (weight[blockBase + 3].toInt() and 0xFF shl 8) or
                        (weight[blockBase + 2].toInt() and 0xFF)
                    val d = halfToFloat(dBits)
                    val dMin = halfToFloat(dMinBits)

                    // Sub-scale decode via ggml `get_scale_min_k4`:
                    // sub-blocks 0..3 use plain 6-bit fields; 4..7 mix a low
                    // 4-bit field with the top 2 bits of the earlier bytes.
                    val scalesOffset = blockBase + 4
                    for (sb in 0 until 4) {
                        scaleIdx[sb] = weight[scalesOffset + sb].toInt() and 0x3F
                        minIdx[sb] = weight[scalesOffset + sb + 4].toInt() and 0x3F
                    }
                    for (sb in 4 until 8) {
                        val low4S = weight[scalesOffset + sb + 4].toInt() and 0x0F
                        val high2S = (weight[scalesOffset + sb - 4].toInt() and 0xFF) ushr 6
                        scaleIdx[sb] = low4S or (high2S shl 4)
                        val low4M = (weight[scalesOffset + sb + 4].toInt() and 0xFF) ushr 4
                        val high2M = (weight[scalesOffset + sb].toInt() and 0xFF) ushr 6
                        minIdx[sb] = low4M or (high2M shl 4)
                    }

                    // 4 strided qs groups; each carries sbLo (lo nibbles) and sbHi (hi nibbles).
                    val codesOffset = blockBase + 16
                    val inputBlockBase = inputOffset + blockIdx * BLOCK_SIZE
                    for (groupJ in 0 until 4) {
                        val qsRegion = codesOffset + groupJ * 32
                        val sbLo = 2 * groupJ
                        val sbHi = sbLo + 1
                        val inputStartLo = inputBlockBase + sbLo * SUB_BLOCK_SIZE
                        val inputStartHi = inputStartLo + SUB_BLOCK_SIZE

                        var codeAccLo = FloatVector.zero(floatSpecies)
                        var inputAccLo = FloatVector.zero(floatSpecies)
                        var codeAccHi = FloatVector.zero(floatSpecies)
                        var inputAccHi = FloatVector.zero(floatSpecies)

                        val floatStep = floatSpecies.length()
                        val byteLoadLen = byteSpeciesForFloat.length()
                        var idx = 0

                        // SIMD body — single byte load feeds both nibble vectors.
                        // The second condition guards the full byte-species load
                        // against running past the end of `weight`; any remainder
                        // falls through to the scalar tail below.
                        while (idx + floatStep <= SUB_BLOCK_SIZE &&
                            qsRegion + idx + byteLoadLen <= weight.size
                        ) {
                            val inVecLo = FloatVector.fromArray(floatSpecies, input, inputStartLo + idx)
                            val inVecHi = FloatVector.fromArray(floatSpecies, input, inputStartHi + idx)
                            val byteVec = ByteVector.fromArray(byteSpeciesForFloat, weight, qsRegion + idx)
                            val loBytes = byteVec.and(0x0F.toByte())
                            val hiBytes = byteVec.lanewise(VectorOperators.LSHR, 4.toByte())
                            val codeVecLo = loBytes.castShape(floatSpecies, 0) as FloatVector
                            val codeVecHi = hiBytes.castShape(floatSpecies, 0) as FloatVector
                            codeAccLo = inVecLo.fma(codeVecLo, codeAccLo)
                            inputAccLo = inVecLo.add(inputAccLo)
                            codeAccHi = inVecHi.fma(codeVecHi, codeAccHi)
                            inputAccHi = inVecHi.add(inputAccHi)
                            idx += floatStep
                        }

                        var codeSumLo = codeAccLo.reduceLanes(VectorOperators.ADD)
                        var inputSumLo = inputAccLo.reduceLanes(VectorOperators.ADD)
                        var codeSumHi = codeAccHi.reduceLanes(VectorOperators.ADD)
                        var inputSumHi = inputAccHi.reduceLanes(VectorOperators.ADD)

                        // Scalar tail — only fires if floatSpecies.length() doesn't divide 32 (rare).
                        while (idx < SUB_BLOCK_SIZE) {
                            val byte = weight[qsRegion + idx].toInt() and 0xFF
                            val codeLo = (byte and 0x0F).toFloat()
                            val codeHi = (byte ushr 4).toFloat()
                            val vLo = input[inputStartLo + idx]
                            val vHi = input[inputStartHi + idx]
                            codeSumLo += vLo * codeLo
                            inputSumLo += vLo
                            codeSumHi += vHi * codeHi
                            inputSumHi += vHi
                            idx++
                        }

                        // Lazy-dmin combine: scale·Σ(input·code) − offset·Σ(input).
                        val scaleLo = d * scaleIdx[sbLo]
                        val offsetLo = dMin * minIdx[sbLo]
                        val scaleHi = d * scaleIdx[sbHi]
                        val offsetHi = dMin * minIdx[sbHi]
                        acc += codeSumLo * scaleLo - inputSumLo * offsetLo
                        acc += codeSumHi * scaleHi - inputSumHi * offsetHi
                    }
                }
                output[outputOffset + o] = acc
            }
        }
    }

    /**
     * IEEE 754 binary16 → binary32 conversion. Mirrors the helper used
     * inside `JvmQuantizedVectorKernels` and `Q4_KTensorData` — kept
     * private to this file rather than depending on either, since both
     * are `internal` in their respective modules.
     */
    private fun halfToFloat(hbits: Int): Float {
        val sign = (hbits ushr 15) and 0x1
        val exp = (hbits ushr 10) and 0x1F
        val frac = hbits and 0x3FF
        return when {
            exp == 0 -> {
                // Zero or subnormal: value = frac · 2⁻²⁴ (= frac/1024 · 2⁻¹⁴).
                if (frac == 0) {
                    if (sign == 0) 0.0f else -0.0f
                } else {
                    val f = frac / 1024.0f * (1.0f / 16384.0f)
                    if (sign == 0) f else -f
                }
            }
            exp == 0x1F -> {
                // Infinity (frac == 0) or NaN.
                if (frac == 0) {
                    if (sign == 0) Float.POSITIVE_INFINITY else Float.NEGATIVE_INFINITY
                } else {
                    Float.NaN
                }
            }
            else -> {
                // Normal: re-bias exponent (15 → 127) and left-align the mantissa.
                val bits = (sign shl 31) or ((exp - 15 + 127) shl 23) or (frac shl 13)
                Float.fromBits(bits)
            }
        }
    }
}

0 commit comments

Comments
 (0)