SKaiNET-developers
diff --git a/‎skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/KernelProvider.kt‎
Lines changed: 6 additions & 0 deletions b/‎skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/KernelProvider.kt‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/Q8_0MatmulKernel.kt‎
Lines changed: 48 additions & 0 deletions b/‎skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/Q8_0MatmulKernel.kt‎
Lines changed: 48 additions & 0 deletions
diff --git a/‎skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarKernelProvider.kt‎
Lines changed: 2 additions & 0 deletions b/‎skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarKernelProvider.kt‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarQ8_0MatmulKernel.kt‎
Lines changed: 95 additions & 0 deletions b/‎skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarQ8_0MatmulKernel.kt‎
Lines changed: 95 additions & 0 deletions
diff --git a/‎skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorKernelProvider.kt‎
Lines changed: 4 additions & 0 deletions b/‎skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorKernelProvider.kt‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorQ8_0MatmulKernel.kt‎
Lines changed: 110 additions & 0 deletions b/‎skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorQ8_0MatmulKernel.kt‎
Lines changed: 110 additions & 0 deletions
@@ -60,4 +60,10 @@ public interface KernelProvider {
      * to the next provider when this one returns `null`.
      */
     public fun matmulBf16(): Bf16MatmulKernel? = null
+
+    /**
+     * F32 × Q8_0 matmul kernel exposed by this provider, or `null` if
+     * this provider does not specialize Q8_0.
+     *
+     * The default implementation returns `null`, so a provider opts in
+     * by overriding this method. As with [matmulBf16], a `null` result
+     * means the lookup falls through to the next provider.
+     */
+    public fun matmulQ8_0(): Q8_0MatmulKernel? = null
 }
@@ -0,0 +1,48 @@
+package sk.ainet.backend.api.kernel
+
+/**
+ * F32 input × Q8_0-packed weights matrix-vector multiply, in canonical
+ * ggml block layout.
+ *
+ *   output[outputOffset + o] = Σ_j input[inputOffset + j] · dequant(weight[o, j])
+ *     for j ∈ [0, inputDim), o ∈ [0, outputDim)
+ *
+ * Block layout (32-element block, 34 bytes/block; see
+ * [sk.ainet.lang.tensor.data.Q8_0BlockTensorData] kdoc):
+ * - bytes 0..1  : `d` (block scale, FP16 little-endian)
+ * - bytes 2..33 : 32 bytes of int8 codes (signed)
+ *
+ * Per element: `dequant = code * d`.
+ *
+ * Q8_0 has no per-block min / offset, which makes it simpler than Q4_K:
+ * every one of the 32 signed int8 codes in a block is scaled by the
+ * same `d`, so a kernel may either scale each product or scale the
+ * per-block sum once.
+ *
+ * Implementations MUST NOT mutate `input` or `weight`. They MAY assume
+ * the arrays do not alias each other or `output`. They MUST fully
+ * write the `outputDim` floats starting at `output[outputOffset]`.
+ *
+ * Packed-weight layout contract: the block for output row `o` and
+ * input block index `blockIdx` starts at byte
+ * `weightByteOffset + (blockIdx * outputDim + o) * 34` of `weight`.
+ * In other words, for a fixed `blockIdx` the blocks of consecutive
+ * output rows are adjacent, and the blocks of one output row are
+ * `outputDim * 34` bytes apart. This matches
+ * `Q8_0BlockTensorData.packedData`.
+ *
+ * `inputDim` MUST be a multiple of 32 (the Q8_0 block size).
+ */
+public interface Q8_0MatmulKernel {
+    /**
+     * Computes `outputDim` dot products of one FP32 input row against
+     * Q8_0-packed weights and stores them into [output].
+     *
+     * @param input FP32 input vector (single row).
+     * @param inputOffset element offset into [input] where the row starts.
+     * @param weight packed Q8_0 bytes for the full `outputDim × inputDim` weight tensor.
+     * @param weightByteOffset byte offset into [weight] where block (0, 0) starts.
+     * @param inputDim contraction dimension (must be a multiple of 32).
+     * @param outputDim number of output cells.
+     * @param output FP32 output vector; cells `outputOffset until outputOffset + outputDim` are overwritten.
+     * @param outputOffset element offset into [output] where the row starts.
+     */
+    public fun matmul(
+        input: FloatArray, inputOffset: Int,
+        weight: ByteArray, weightByteOffset: Int,
+        inputDim: Int, outputDim: Int,
+        output: FloatArray, outputOffset: Int,
+    )
+}
@@ -3,6 +3,7 @@ package sk.ainet.exec.kernel
 import sk.ainet.backend.api.kernel.Bf16MatmulKernel
 import sk.ainet.backend.api.kernel.Fp32MatmulKernel
 import sk.ainet.backend.api.kernel.KernelProvider
+import sk.ainet.backend.api.kernel.Q8_0MatmulKernel
 
 /**
  * Scalar (non-SIMD) [KernelProvider] — always available, lowest
@@ -23,4 +24,5 @@ public object ScalarKernelProvider : KernelProvider {
     override fun isAvailable(): Boolean = true
     override fun matmulFp32(): Fp32MatmulKernel = ScalarMatmulKernel
     override fun matmulBf16(): Bf16MatmulKernel = ScalarBf16MatmulKernel
+    /** Non-null return: this provider always supplies the scalar Q8_0 kernel. */
+    override fun matmulQ8_0(): Q8_0MatmulKernel = ScalarQ8_0MatmulKernel
 }
@@ -0,0 +1,95 @@
+package sk.ainet.exec.kernel
+
+import sk.ainet.backend.api.kernel.Q8_0MatmulKernel
+
+/**
+ * Portable, SIMD-free [Q8_0MatmulKernel]. Every block is dequantised
+ * element by element and accumulated in a single FP32 sum per output
+ * cell, so it runs unchanged on every KMP target.
+ *
+ * Roles:
+ * - Reference result that accelerated kernels (Panama Vector, native
+ *   FFM) are compared against, within FP ordering tolerance.
+ * - Fallback when no accelerated provider is registered.
+ *
+ * Block layout (34 bytes covering 32 elements):
+ *   - bytes 0..1 : scale `d`, FP16 little-endian
+ *   - bytes 2..33: 32 signed int8 codes
+ *
+ * Each element dequantises to `code * d`; Q8_0 carries no min / offset.
+ *
+ * Speed is deliberately not a goal here; production paths should pick
+ * the Panama Vector or native variant via the kernel registry.
+ */
+public object ScalarQ8_0MatmulKernel : Q8_0MatmulKernel {
+
+    private const val BLOCK_SIZE = 32
+    private const val SCALE_BYTES = 2
+    private const val BYTES_PER_BLOCK = SCALE_BYTES + BLOCK_SIZE
+
+    /**
+     * Computes one output cell per weight row: the sum over all blocks
+     * of `input[j] * code[j] * d`.
+     *
+     * When `inputDim` is 0 there are no blocks, so every output cell is
+     * written as `0f`; when `outputDim` is 0 nothing is written.
+     *
+     * @throws IllegalArgumentException if `inputDim` is not a multiple of 32.
+     */
+    override fun matmul(
+        input: FloatArray, inputOffset: Int,
+        weight: ByteArray, weightByteOffset: Int,
+        inputDim: Int, outputDim: Int,
+        output: FloatArray, outputOffset: Int,
+    ) {
+        require(inputDim % BLOCK_SIZE == 0) {
+            "ScalarQ8_0MatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim"
+        }
+        val blockCount = inputDim / BLOCK_SIZE
+
+        for (row in 0 until outputDim) {
+            var sum = 0f
+            for (block in 0 until blockCount) {
+                // Blocks are ordered by input block first, then by output row.
+                val scaleAt = weightByteOffset + (block * outputDim + row) * BYTES_PER_BLOCK
+                val scale = halfToFloat(readUInt16LE(weight, scaleAt))
+
+                val firstCode = scaleAt + SCALE_BYTES
+                val firstInput = inputOffset + block * BLOCK_SIZE
+                for (lane in 0 until BLOCK_SIZE) {
+                    // Byte.toInt() sign-extends, which is what the int8 codes need.
+                    val code = weight[firstCode + lane].toInt()
+                    sum += input[firstInput + lane] * code * scale
+                }
+            }
+            output[outputOffset + row] = sum
+        }
+    }
+
+    /** Reads the two bytes at [at] and [at]+1 as an unsigned little-endian 16-bit value. */
+    private fun readUInt16LE(bytes: ByteArray, at: Int): Int {
+        val low = bytes[at].toInt() and 0xFF
+        val high = bytes[at + 1].toInt() and 0xFF
+        return low or (high shl 8)
+    }
+
+    /**
+     * Expands the IEEE-754 half-precision value held in the low 16 bits
+     * of [hbits] to FP32. This is a local copy because the equivalent
+     * helper, `sk.ainet.lang.tensor.data.Q4_KBlockTensorData.halfToFloat`,
+     * is internal to skainet-lang-core and cannot be imported from this
+     * module.
+     */
+    private fun halfToFloat(hbits: Int): Float {
+        val signBit = (hbits and 0x8000) shl 16
+        val exponent = (hbits shr 10) and 0x1F
+        val fraction = hbits and 0x03FF
+
+        val bits = when {
+            // Inf / NaN: all-ones exponent, fraction carried over.
+            exponent == 31 -> signBit or (0xFF shl 23) or (fraction shl 13)
+            // Normal number: re-bias the exponent from 15 to 127.
+            exponent != 0 -> signBit or ((exponent + 112) shl 23) or (fraction shl 13)
+            // Signed zero.
+            fraction == 0 -> signBit
+            // Subnormal half: shift the fraction up until its leading 1
+            // reaches bit 10, lowering the exponent once per shift.
+            else -> {
+                val shift = fraction.countLeadingZeroBits() - 21
+                val normalized = (fraction shl shift) and 0x03FF
+                signBit or ((113 - shift) shl 23) or (normalized shl 13)
+            }
+        }
+        return Float.fromBits(bits)
+    }
+}
@@ -4,6 +4,7 @@ import sk.ainet.backend.api.kernel.Bf16MatmulKernel
 import sk.ainet.backend.api.kernel.Fp32MatmulKernel
 import sk.ainet.backend.api.kernel.KernelProvider
 import sk.ainet.backend.api.kernel.Q4KMatmulKernel
+import sk.ainet.backend.api.kernel.Q8_0MatmulKernel
 import sk.ainet.exec.tensor.ops.JvmCpuBackendConfig
 
 /**
@@ -45,6 +46,9 @@ public object PanamaVectorKernelProvider : KernelProvider {
     override fun matmulBf16(): Bf16MatmulKernel? =
         if (isAvailable()) PanamaVectorBf16MatmulKernel else null
 
+    /**
+     * Panama Vector Q8_0 kernel, or `null` when [isAvailable] is false.
+     *
+     * NOTE(review): the kernel object is referenced only on the available
+     * branch. Presumably this keeps its `jdk.incubator.vector` dependencies
+     * from being loaded on runtimes without the Vector API, so keep the
+     * availability check ahead of the object reference.
+     */
+    override fun matmulQ8_0(): Q8_0MatmulKernel? =
+        if (isAvailable()) PanamaVectorQ8_0MatmulKernel else null
+
     private fun isVectorApiClassLoaded(): Boolean = runCatching {
         Class.forName("jdk.incubator.vector.FloatVector")
         Class.forName("jdk.incubator.vector.VectorSpecies")
 
@@ -0,0 +1,110 @@
+package sk.ainet.exec.kernel
+
+import jdk.incubator.vector.ByteVector
+import jdk.incubator.vector.FloatVector
+import jdk.incubator.vector.VectorOperators
+import jdk.incubator.vector.VectorSpecies
+import sk.ainet.backend.api.kernel.Q8_0MatmulKernel
+
+/**
+ * SIMD-vectorized FP32 × Q8_0 matmul on the JDK Vector API.
+ *
+ * Pipeline per 32-element block:
+ *  1. Decode the 2-byte FP16 scale `d` once.
+ *  2. Walk the 32 signed int8 codes in `floatSpecies.length()`-sized
+ *     chunks. Each chunk: one ByteVector load, one `castShape` to
+ *     FloatVector (signed widening — int8 codes become small floats
+ *     in [-128, 127]), one `FloatVector.fma(input, codes, blockAcc)`
+ *     into a lane-wise block accumulator.
+ *  3. Reduce the block accumulator across lanes (`reduceLanes(ADD)`)
+ *     and fold `* d` exactly once before adding to the running output
+ *     cell. Folding scale per-block (rather than per-element) avoids
+ *     32 extra multiplies per block.
+ *
+ * Lane-width handling:
+ *  - The float species is capped at 16 lanes. A wider preferred species
+ *    would step past the 32-element block and would not line up with
+ *    either byte species used here.
+ *  - With fewer than 8 float lanes the 8-byte code load is wider than
+ *    one chunk. Only the first `floatSpecies.length()` bytes are
+ *    consumed by `castShape(…, 0)`, but the load itself would touch
+ *    bytes past the end of the last block. When that over-read would
+ *    leave the `weight` array, a lane-masked load is used instead.
+ *
+ * Numerical equivalence with [ScalarQ8_0MatmulKernel] is within FMA +
+ * reordered-reduction tolerance — the same bar Q4_K Panama uses.
+ */
+public object PanamaVectorQ8_0MatmulKernel : Q8_0MatmulKernel {
+
+    private const val BLOCK_SIZE = 32
+    private const val BYTES_PER_BLOCK = 34
+
+    /** Widest float lane count the chunk loop and byte species support. */
+    private const val MAX_FLOAT_LANES = 16
+
+    /** Preferred float species, capped at [MAX_FLOAT_LANES] lanes (512 bits). */
+    private val floatSpecies: VectorSpecies<Float> =
+        FloatVector.SPECIES_PREFERRED.let { preferred ->
+            if (preferred.length() > MAX_FLOAT_LANES) FloatVector.SPECIES_512 else preferred
+        }
+
+    /**
+     * Byte species whose first `floatSpecies.length()` lanes feed
+     * `castShape(floatSpecies, 0)`. It has exactly as many lanes as the
+     * float species for 8- and 16-wide floats, and more lanes than
+     * needed for narrower float species (e.g. 4-wide NEON / SSE).
+     */
+    private val byteSpeciesForFloat: VectorSpecies<Byte> = when (floatSpecies.length()) {
+        16 -> ByteVector.SPECIES_128
+        else -> ByteVector.SPECIES_64
+    }
+
+    /** Number of bytes touched by one unmasked code load. */
+    private val byteLanes: Int = byteSpeciesForFloat.length()
+
+    /** Selects only the byte lanes that one float chunk actually consumes. */
+    private val codeLaneMask = byteSpeciesForFloat.indexInRange(0, floatSpecies.length())
+
+    /**
+     * Computes one output cell per weight row, vectorising the 32 code ×
+     * input products of each block.
+     *
+     * When `inputDim` is 0 every output cell is written as `0f`; when
+     * `outputDim` is 0 nothing is written.
+     *
+     * @throws IllegalArgumentException if `inputDim` is not a multiple of 32.
+     */
+    override fun matmul(
+        input: FloatArray, inputOffset: Int,
+        weight: ByteArray, weightByteOffset: Int,
+        inputDim: Int, outputDim: Int,
+        output: FloatArray, outputOffset: Int,
+    ) {
+        require(inputDim % BLOCK_SIZE == 0) {
+            "PanamaVectorQ8_0MatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim"
+        }
+        if (outputDim == 0) return
+        if (inputDim == 0) {
+            for (o in 0 until outputDim) output[outputOffset + o] = 0f
+            return
+        }
+        val blocksPerInputDim = inputDim / BLOCK_SIZE
+        val laneCount = floatSpecies.length()
+
+        for (o in 0 until outputDim) {
+            var acc = 0f
+            for (blockIdx in 0 until blocksPerInputDim) {
+                val blockBase = weightByteOffset + (blockIdx * outputDim + o) * BYTES_PER_BLOCK
+                // FP16 scale — two LE bytes.
+                val dBits = (weight[blockBase].toInt() and 0xFF) or
+                    ((weight[blockBase + 1].toInt() and 0xFF) shl 8)
+                val d = halfToFloat(dBits)
+
+                val codesBase = blockBase + 2
+                val inputBase = inputOffset + blockIdx * BLOCK_SIZE
+
+                var blockAccVec = FloatVector.zero(floatSpecies)
+                var k = 0
+                while (k < BLOCK_SIZE) {
+                    val codesOffset = codesBase + k
+                    // A full-width byte load may be wider than one float chunk.
+                    // If it would run past the end of `weight` (last chunk of the
+                    // final block), load only the lanes this chunk consumes; the
+                    // masked-off lanes are never read from the array.
+                    val byteVec = if (codesOffset + byteLanes <= weight.size) {
+                        ByteVector.fromArray(byteSpeciesForFloat, weight, codesOffset)
+                    } else {
+                        ByteVector.fromArray(byteSpeciesForFloat, weight, codesOffset, codeLaneMask)
+                    }
+                    // Part 0 widens the first `laneCount` bytes to floats.
+                    @Suppress("UNCHECKED_CAST")
+                    val codesVec = byteVec.castShape(floatSpecies, 0) as FloatVector
+                    val inputVec = FloatVector.fromArray(floatSpecies, input, inputBase + k)
+                    blockAccVec = inputVec.fma(codesVec, blockAccVec)
+                    k += laneCount
+                }
+                // Apply the block scale once to the reduced sum.
+                acc += blockAccVec.reduceLanes(VectorOperators.ADD) * d
+            }
+            output[outputOffset + o] = acc
+        }
+    }
+
+    /** Same FP16 → FP32 conversion as [ScalarQ8_0MatmulKernel.halfToFloat]. */
+    private fun halfToFloat(hbits: Int): Float {
+        val sign = (hbits and 0x8000) shl 16
+        val exp = (hbits and 0x7C00) shr 10
+        val mant = hbits and 0x03FF
+        return when (exp) {
+            0 -> {
+                if (mant == 0) Float.fromBits(sign)
+                else {
+                    // Subnormal half: normalise the mantissa so its leading 1
+                    // lands on bit 10, lowering the exponent once per shift.
+                    var m = mant
+                    var e = -14
+                    while ((m and 0x400) == 0) {
+                        m = m shl 1
+                        e--
+                    }
+                    m = m and 0x3FF
+                    Float.fromBits(sign or ((e + 127) shl 23) or (m shl 13))
+                }
+            }
+            // Inf / NaN: all-ones exponent, mantissa carried over.
+            31 -> Float.fromBits(sign or (0xFF shl 23) or (mant shl 13))
+            // Normal number: re-bias the exponent from 15 to 127.
+            else -> Float.fromBits(sign or ((exp - 15 + 127) shl 23) or (mant shl 13))
+        }
+    }
+}