perf(native q6k): fused Q8 int8 dot path (dotprod)

michalharakal · claude · michalharakal · commit 4525c1227190 · 2026-07-02T22:27:14.000+02:00
Mirror the q4k fused-int8 kernel: pre-quantize the input row to symmetric
int8 (Q8) once per 256-block (reused across all output rows), unpack the
6-bit weight to centered int8 codes, and run each scale-group as an int8
dot (vdotq_s32 on dotprod targets, scalar fallback otherwise). Drops the
256-float scratch dequant + per-element float multiply.

acc = d · d_in · Σ_g sc[g]·Σ_{i∈g} q8[i]·codes[i].

This is deliberately lossy (ggml-style activation quant, ~1-3% on
worst-case uniform-random fixtures) so it is no longer bit-exact vs the
float/scalar reference. Both parity tests (jvmTest Panama, nativeTest
cinterop on linuxX64 + linuxArm64) switch from per-row relative error —
unbounded on near-zero rows of zero-mean fixtures — to the aggregate
error-energy gate RMS(error)/RMS(signal) < 0.03.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/skainet-backends/skainet-backend-native-cpu/native/src/q6k_matmul.c b/skainet-backends/skainet-backend-native-cpu/native/src/q6k_matmul.c
@@ -3,6 +3,8 @@
 
 #include <stddef.h>
 #include <stdint.h>
+#include <stdlib.h>
+#include <math.h>
 
 #define Q6K_BLOCK_SIZE       256
 #define Q6K_BYTES_PER_BLOCK  210
@@ -40,62 +42,109 @@ static inline float skainet_q6k_half_to_float(uint16_t hbits) {
 }
 
 /*
- * Dequantize one 256-element Q6_K super-block into scratch[256].
- * Direct transcription of ScalarQ6_KMatmulKernel.dequantBlock /
- * ggml dequantize_row_q6_K: two 128-element halves, each split into two
- * 16-element scale groups carrying four strided sub-codes (q1..q4).
- *
- * The 6-bit code is `lowNibble(ql) | (twoHighBits(qh) << 4)`, biased by
- * -32, and `scales` are SIGNED int8. Per-element value = d * scale * code.
+ * Quantize one 256-float input block to symmetric int8 (Q8), d_in = maxabs/127,
+ * q8[i] = round(in[i]/d_in). Mirrors q4k_matmul.c's activation quant (ggml
+ * block_q8_K style) — the source of the small (~1-3%) error vs the exact float
+ * kernel and what unlocks the int8 dot path. Returns d_in (0 + zeroed q8 if the
+ * block is all-zero).
+ */
+static inline float skainet_q6k_q8_quantize_block(const float* SKAINET_RESTRICT in,
+                                                  int8_t* SKAINET_RESTRICT q8) {
+    float maxabs = 0.0f;  /* largest |in[i]| found in the block */
+    for (int i = 0; i < Q6K_BLOCK_SIZE; ++i) {
+        const float a = in[i] < 0.0f ? -in[i] : in[i];
+        if (a > maxabs) maxabs = a;
+    }
+    if (maxabs == 0.0f) {  /* avoids 127/0 below; every code is 0 */
+        for (int i = 0; i < Q6K_BLOCK_SIZE; ++i) q8[i] = 0;
+        return 0.0f;
+    }
+    const float d_in = maxabs / 127.0f;  /* dequant step: in[i] ~= q8[i] * d_in */
+    const float inv = 127.0f / maxabs;   /* reciprocal, so the loop multiplies instead of dividing */
+    for (int i = 0; i < Q6K_BLOCK_SIZE; ++i) {
+        int v = (int) lrintf(in[i] * inv);  /* lrintf rounds per the current FP rounding mode */
+        if (v > 127) v = 127; else if (v < -127) v = -127;  /* symmetric clamp: -128 is never stored */
+        q8[i] = (int8_t) v;
+    }
+    return d_in;
+}
+
+/*
+ * Unpack one 256-element Q6_K super-block into CENTERED int8 codes[256] (the
+ * 6-bit code biased by -32, range [-32, 31]) in natural element order — i.e.
+ * codes[i] pairs with input[i]. Same bit layout as the float dequant
+ * (ScalarQ6_KMatmulKernel / ggml dequantize_row_q6_K) but without folding in
+ * `d`/`scale`: those are applied per scale-group in the int dot, so the inner
+ * product stays integer. Per 128-element half, byte index l in [0, 32) yields
+ * four codes at l+0/+32/+64/+96: a ql nibble (low 4 bits) | a 2-bit qh field << 4.
  */
-static inline void skainet_q6k_dequant_block(const uint8_t* SKAINET_RESTRICT block,
-                                             float* SKAINET_RESTRICT scratch) {
+static inline void skainet_q6k_unpack_codes(const uint8_t* SKAINET_RESTRICT block,
+                                            int8_t* SKAINET_RESTRICT codes) {
     const uint8_t* ql0 = block + Q6K_QL_OFFSET;
     const uint8_t* qh0 = block + Q6K_QH_OFFSET;
-    const int8_t*  sc0 = (const int8_t*)(block + Q6K_SCALES_OFFSET);
-    const uint16_t d_bits = (uint16_t) block[Q6K_D_OFFSET]
-        | ((uint16_t) block[Q6K_D_OFFSET + 1] << 8);
-    const float d = skainet_q6k_half_to_float(d_bits);
 
     for (int half = 0; half < 2; ++half) {
         const uint8_t* ql = ql0 + half * 64;
         const uint8_t* qh = qh0 + half * 32;
-        const int8_t*  sc = sc0 + half * 8;
-        float* out = scratch + half * 128;
+        int8_t* out = codes + half * 128;  /* this half's 128 output codes */
         for (int is = 0; is < 2; ++is) {
-            const float sc1 = d * (float) sc[is + 0];
-            const float sc2 = d * (float) sc[is + 2];
-            const float sc3 = d * (float) sc[is + 4];
-            const float sc4 = d * (float) sc[is + 6];
             const int l_start = is * 16;
             for (int l = l_start; l < l_start + 16; ++l) {
                 const int q_l0  = ql[l];
                 const int q_l32 = ql[l + 32];
                 const int q_h   = qh[l];
-                const int q1 = ((q_l0  & 0x0F) | ((q_h        & 0x03) << 4)) - 32;
-                const int q2 = ((q_l32 & 0x0F) | (((q_h >> 2) & 0x03) << 4)) - 32;
-                const int q3 = ((q_l0  >> 4)   | (((q_h >> 4) & 0x03) << 4)) - 32;
-                const int q4 = ((q_l32 >> 4)   | (((q_h >> 6) & 0x03) << 4)) - 32;
-                out[l +  0] = sc1 * (float) q1;
-                out[l + 32] = sc2 * (float) q2;
-                out[l + 64] = sc3 * (float) q3;
-                out[l + 96] = sc4 * (float) q4;
+                out[l +  0] = (int8_t)(((q_l0  & 0x0F) | ((q_h        & 0x03) << 4)) - 32);  /* ql[l] low nibble, qh bits 0-1 */
+                out[l + 32] = (int8_t)(((q_l32 & 0x0F) | (((q_h >> 2) & 0x03) << 4)) - 32);  /* ql[l+32] low nibble, qh bits 2-3 */
+                out[l + 64] = (int8_t)(((q_l0  >> 4)   | (((q_h >> 4) & 0x03) << 4)) - 32);  /* ql[l] high nibble, qh bits 4-5 */
+                out[l + 96] = (int8_t)(((q_l32 >> 4)   | (((q_h >> 6) & 0x03) << 4)) - 32);  /* ql[l+32] high nibble, qh bits 6-7 */
             }
         }
     }
 }
 
+/*
+ * Weighted integer dot of one Q6_K block: Σ_g sc[g] · Σ_{i∈g} q8[i]·codes[i],
+ * over the 16 scale-groups (each a 16-element contiguous run in natural order).
+ * The run for (half,k,is) starts at half*128 + 32*k + is*16 and uses signed
+ * scale sc[half*8 + is + 2*k]. With SKAINET_HAVE_DOTPROD each 16-element dot is
+ * one vdotq_s32 + vaddvq_s32; otherwise a scalar loop. Returns the int64 sum.
+ */
+static inline int64_t skainet_q6k_weighted_dot(const int8_t* SKAINET_RESTRICT q8,
+                                               const int8_t* SKAINET_RESTRICT codes,
+                                               const int8_t* SKAINET_RESTRICT sc) {
+    int64_t sum = 0;
+    for (int half = 0; half < 2; ++half) {
+        for (int k = 0; k < 4; ++k) {
+            for (int is = 0; is < 2; ++is) {
+                const int start = half * 128 + 32 * k + is * 16;  /* first element of this 16-wide run */
+                const int gs = half * 8 + is + 2 * k;             /* index of this run's scale in sc[] */
+                int32_t dot;
+#ifdef SKAINET_HAVE_DOTPROD
+                const int32x4_t acc = vdotq_s32(vdupq_n_s32(0),
+                    vld1q_s8(codes + start), vld1q_s8(q8 + start));
+                dot = vaddvq_s32(acc);  /* sum the four int32 lanes */
+#else
+                dot = 0;
+                for (int j = 0; j < 16; ++j) dot += (int) q8[start + j] * (int) codes[start + j];
+#endif
+                sum += (int64_t) sc[gs] * dot;  /* widen to 64-bit before applying the scale */
+            }
+        }
+    }
+    return sum;
+}
+
 /*
  * Native Q6_K matrix-vector multiply matching the
  * sk.ainet.backend.api.kernel.Q6KMatmulKernel SPI contract. A single
  * input row times an `outputDim x inputDim` Q6_K-packed weight tensor
  * laid out (blockIdx * outputDim + o) * 210 bytes.
  *
- * The 6-bit bit-assembly is kept scalar (cheap byte shuffling that the
- * compiler auto-vectorizes under -O3) and materialized into a 256-float
- * scratch block; the hot dot product against the input window is the
- * NEON path (vfmaq_f32 + horizontal add) behind __ARM_NEON. On non-ARM
- * targets the dot is a straight-line loop that auto-vectorizes too.
+ * Fused int8 dot path (ggml-style, mirrors q4k_matmul.c): the input row is
+ * quantized to Q8 ONCE per 256-block (reused across all output rows), the 6-bit
+ * weight is unpacked to centered int8 codes, and each scale-group is an int8
+ * dot (vdotq_s32 on dotprod targets) — no 256-float scratch, no per-element
+ * float multiply. acc = d · d_in · Σ_g sc[g]·Σ_{i∈g} q8[i]·codes[i].
  */
 SKAINET_API void skainet_q6k_matmul(
     const float* SKAINET_RESTRICT input,
@@ -113,43 +162,46 @@ SKAINET_API void skainet_q6k_matmul(
     const float* in_base = input + input_offset;
     float* out_base = output + output_offset;
 
-    float scratch[Q6K_BLOCK_SIZE];
+    /* Pre-quantize the whole input row to Q8 once (reused across all o). */
+    int8_t* q8 = (int8_t*) malloc((size_t) input_dim * sizeof(int8_t));
+    float* d_in = (float*) malloc((size_t) blocks_per_input_dim * sizeof(float));
+    if (q8 == NULL || d_in == NULL) { free(q8); free(d_in); return; }
+    for (int32_t b = 0; b < blocks_per_input_dim; ++b) {
+        d_in[b] = skainet_q6k_q8_quantize_block(in_base + (size_t) b * Q6K_BLOCK_SIZE,
+                                                q8 + (size_t) b * Q6K_BLOCK_SIZE);
+    }
+
+    int8_t codes[Q6K_BLOCK_SIZE];
 
     /*
      * Loop order: block OUTER, output row INNER — see q4k_matmul.c for the
      * rationale. The weight is block-major (blockIdx*output_dim + o)*210, so for
      * a fixed block consecutive `o` are 210 bytes apart: the weight bytes are
      * read sequentially (cache/prefetch friendly) instead of striding
-     * output_dim*210 per step, which on the in-order A55 makes every read a cold
-     * miss. The big Q6_K `output` projection (hidden→vocab, hit every token) is
-     * the main beneficiary. out_base[o] accumulates across blocks; the order
-     * over blocks is unchanged ⇒ numerically identical to the o-outer form.
+     * output_dim*210 per step. out_base[o] accumulates across blocks; the order
+     * over blocks is unchanged.
      */
     for (int32_t o = 0; o < output_dim; ++o) out_base[o] = 0.0f;
 
     for (int32_t block_idx = 0; block_idx < blocks_per_input_dim; ++block_idx) {
-        const float* in_block = in_base + (size_t) block_idx * Q6K_BLOCK_SIZE;
+        const int8_t* q8_block = q8 + (size_t) block_idx * Q6K_BLOCK_SIZE;
+        const float di = d_in[block_idx];
         const uint8_t* block = weight + weight_byte_offset
             + (size_t)(block_idx * output_dim) * Q6K_BYTES_PER_BLOCK;
 
         for (int32_t o = 0; o < output_dim; ++o, block += Q6K_BYTES_PER_BLOCK) {
-            skainet_q6k_dequant_block(block, scratch);
-
-            float acc = 0.0f;
-#ifdef SKAINET_HAVE_NEON
-            float32x4_t vacc = vdupq_n_f32(0.0f);
-            for (int i = 0; i < Q6K_BLOCK_SIZE; i += 4) {
-                const float32x4_t vi = vld1q_f32(in_block + i);
-                const float32x4_t vw = vld1q_f32(scratch + i);
-                vacc = vfmaq_f32(vacc, vi, vw);
-            }
-            acc = skainet_neon_hadd_f32(vacc);
-#else
-            for (int i = 0; i < Q6K_BLOCK_SIZE; ++i) {
-                acc += in_block[i] * scratch[i];
-            }
-#endif
-            out_base[o] += acc;
+            const uint16_t d_bits = (uint16_t) block[Q6K_D_OFFSET]
+                | ((uint16_t) block[Q6K_D_OFFSET + 1] << 8);
+            const float d = skainet_q6k_half_to_float(d_bits);
+            const int8_t* sc = (const int8_t*)(block + Q6K_SCALES_OFFSET);
+
+            skainet_q6k_unpack_codes(block, codes);
+            const int64_t wdot = skainet_q6k_weighted_dot(q8_block, codes, sc);
+
+            out_base[o] += d * di * (float) wdot;
         }
     }
+
+    free(q8);
+    free(d_in);
 }
diff --git a/skainet-backends/skainet-backend-native-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/NativeQ6KMatmulKernelParityTest.kt b/skainet-backends/skainet-backend-native-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/NativeQ6KMatmulKernelParityTest.kt
@@ -16,9 +16,16 @@ import kotlin.test.assertTrue
  * Fixture mirrors [NativeQ5KMatmulKernelParityTest]: random Q6_K bytes with
  * `d` clamped to `1.0f16` (bytes 208-209), packed input-block-major
  * `(blockIdx * outputDim + o) * 210`. Random `ql`/`qh`/`scales` exercise the
- * 6-bit bit-assembly and the signed int8 scales. Q6_K magnitudes are larger
- * than Q5_K (codes [-32, 31] × int8 scales), so absolute tolerances are a
- * touch looser; the `rel < 1e-4` relative check is the real gate.
+ * 6-bit bit-assembly and the signed int8 scales.
+ *
+ * Like [NativeQ4KMatmulKernelParityTest], the native kernel quantizes the
+ * activation to int8 (Q8) for the dotprod fast path — deliberately lossy
+ * (ggml-style), so it is NOT bit-exact vs the float Panama reference. Per-row
+ * relative error is the wrong gate (a near-zero true row shows unbounded
+ * relative error from a tiny absolute one on zero-mean random fixtures); the
+ * meaningful metric is the aggregate error energy RMS(error)/RMS(signal). Real
+ * (smoother) LLM activations are far tighter than these worst-case fixtures;
+ * the end-to-end gate is the on-board generation output.
  */
 class NativeQ6KMatmulKernelParityTest {
 
@@ -58,14 +65,25 @@ class NativeQ6KMatmulKernelParityTest {
         val nativeOut = FloatArray(outputDim)
         NativeQ6KMatmulKernel.matmul(input, 0, packed, 0, inputDim, outputDim, nativeOut, 0)
 
+        var sqErr = 0.0
+        var sqSig = 0.0
         for (o in 0 until outputDim) {
-            val diff = abs(refOut[o] - nativeOut[o])
-            val rel = diff / (abs(refOut[o]) + 1e-9f)
-            assertTrue(
-                diff <= tol || rel < 1e-4f,
-                "row $o diverged: panama=${refOut[o]} native=${nativeOut[o]} diff=$diff rel=$rel tol=$tol",
-            )
+            val d = (refOut[o] - nativeOut[o]).toDouble()
+            sqErr += d * d
+            sqSig += refOut[o].toDouble() * refOut[o].toDouble()
         }
+        val rmsErr = kotlin.math.sqrt(sqErr / outputDim)
+        val rmsSig = kotlin.math.sqrt(sqSig / outputDim)
+        val relRms = rmsErr / (rmsSig + 1e-9)
+        assertTrue(
+            relRms < AGG_REL_TOL || rmsErr < tol,
+            "Q8 parity exceeded: relRms=$relRms (rmsErr=$rmsErr rmsSig=$rmsSig) over $outputDim rows, tol=$AGG_REL_TOL",
+        )
+    }
+
+    private companion object {
+        // Upper bound on RMS(error)/RMS(signal) for the lossy Q8-activation path (uniform-random worst case).
+        const val AGG_REL_TOL = 0.03
     }
 
     @Test
diff --git a/skainet-backends/skainet-backend-native-cpu/src/nativeTest/kotlin/sk/ainet/exec/kernel/NativeKnQ6KMatmulKernelParityTest.kt b/skainet-backends/skainet-backend-native-cpu/src/nativeTest/kotlin/sk/ainet/exec/kernel/NativeKnQ6KMatmulKernelParityTest.kt
@@ -12,10 +12,13 @@ import kotlin.test.assertTrue
  * `-ffast-math` reassociation tolerance.
  *
  * Runs on linuxX64 (host archive: scalar/auto-vectorized) AND linuxArm64
- * (cross-built archive: NEON), so the aarch64 run bit-checks the
- * `SKAINET_HAVE_NEON` path in q6k_matmul.c. Q6_K magnitudes (codes
- * [-32, 31] × signed int8 scales) are larger than Q5_K, so absolute tolerances
- * are a touch looser; the `rel < 1e-4` relative check is the real gate.
+ * (cross-built archive: NEON), so the aarch64 run exercises the
+ * `SKAINET_HAVE_NEON` / `SKAINET_HAVE_DOTPROD` path in q6k_matmul.c.
+ *
+ * The C kernel quantizes the activation to int8 (Q8) for the dotprod fast path
+ * — deliberately lossy (ggml-style), so it is NOT bit-exact vs the scalar
+ * reference. The gate is the aggregate error energy RMS(error)/RMS(signal), not
+ * per-row relative error (unbounded on near-zero rows of zero-mean fixtures).
  */
 class NativeKnQ6KMatmulKernelParityTest {
 
@@ -46,14 +49,24 @@ class NativeKnQ6KMatmulKernelParityTest {
         val knOut = FloatArray(outputDim)
         NativeKnQ6KMatmulKernel.matmul(input, 0, packed, 0, inputDim, outputDim, knOut, 0)
 
+        var sqErr = 0.0
+        var sqSig = 0.0
         for (o in 0 until outputDim) {
-            val diff = abs(refOut[o] - knOut[o])
-            val rel = diff / (abs(refOut[o]) + 1e-9f)
-            assertTrue(
-                diff <= tol || rel < 1e-4f,
-                "row $o diverged: scalar=${refOut[o]} cinterop=${knOut[o]} diff=$diff rel=$rel tol=$tol",
-            )
+            val d = (refOut[o] - knOut[o]).toDouble()
+            sqErr += d * d
+            sqSig += refOut[o].toDouble() * refOut[o].toDouble()
         }
+        val rmsErr = kotlin.math.sqrt(sqErr / outputDim)
+        val rmsSig = kotlin.math.sqrt(sqSig / outputDim)
+        val relRms = rmsErr / (rmsSig + 1e-9)
+        assertTrue(
+            relRms < AGG_REL_TOL || rmsErr < tol,
+            "Q8 parity exceeded: relRms=$relRms (rmsErr=$rmsErr rmsSig=$rmsSig) over $outputDim rows, tol=$AGG_REL_TOL",
+        )
+    }
+
+    private companion object {
+        const val AGG_REL_TOL = 0.03 // upper bound on RMS(error)/RMS(signal) for the lossy Q8-activation path
     }
 
     @Test