perf(native q5k,q8_0): block-outer loop order for sequential weight reads

michalharakal · claude · michalharakal · commit 86365b2b67a7 · 2026-07-02T22:19:49.000+02:00
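Apply the same cache-locality fix as q4k_matmul (d998feb) to the Q5_K and Q8_0 kernels: iterate block-OUTER / output-row-INNER so the block-major weight at byte offset (blockIdx*output_dim + o)*bytes_per_block is read sequentially (stride = one block) instead of striding output_dim*bytes_per_block per step — the strided pattern makes every weight read a cold miss on the in-order A55. out_base[o] accumulates across blocks, and the order in which blocks are visited for a given output row is unchanged. For Q8_0 the per-row sequence of additions is exactly the same as before, so results are numerically identical. For Q5_K each block's terms are now summed into a fresh per-block accumulator before being added to out_base[o] (previously every term was added directly to the running row sum), so results agree with the previous form up to floating-point rounding rather than bit-for-bit. Both validated on host against the Panama reference (NativeQ5KMatmulKernelParityTest, NativeQ8_0MatmulKernelParityTest green). Not exercised by TinyLlama Q4_K_M (Q4_K + Q6_K + F32 only), so no board delta for that model — this keeps the K-quant kernels consistent and benefits any model that uses Q5_K/Q8_0 weights. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>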
diff --git a/skainet-backends/skainet-backend-native-cpu/native/src/q5k_matmul.c b/skainet-backends/skainet-backend-native-cpu/native/src/q5k_matmul.c
@@ -103,12 +103,25 @@ SKAINET_API void skainet_q5k_matmul(
     int scale_idx[Q5K_SUB_BLOCKS];
     int min_idx[Q5K_SUB_BLOCKS];
 
-    for (int32_t o = 0; o < output_dim; ++o) {
-        float acc = 0.0f;
-
-        for (int32_t block_idx = 0; block_idx < blocks_per_input_dim; ++block_idx) {
-            const uint8_t* block = weight + weight_byte_offset
-                + (size_t)(block_idx * output_dim + o) * Q5K_BYTES_PER_BLOCK;
+    /*
+     * Loop order: block OUTER, output row INNER — see q4k_matmul.c for the
+     * rationale. The weight is block-major (blockIdx*output_dim + o)*176, so for
+     * a fixed block consecutive `o` are 176 bytes apart: weight bytes are read
+     * sequentially (cache/prefetch friendly) instead of striding output_dim*176
+     * per step, which on the in-order A55 makes every read a cold miss.
+     * out_base[o] accumulates across blocks in the same block order, but `acc`
+     * now holds a per-block partial sum (restarted at 0.0f) that is added once,
+     * so results match the o-outer form up to float rounding, not bit-for-bit.
+     */
+    for (int32_t o = 0; o < output_dim; ++o) out_base[o] = 0.0f;
+
+    for (int32_t block_idx = 0; block_idx < blocks_per_input_dim; ++block_idx) {
+        const float* in_block = in_base + (size_t) block_idx * Q5K_BLOCK_SIZE;
+        const uint8_t* block = weight + weight_byte_offset
+            + (size_t)(block_idx * output_dim) * Q5K_BYTES_PER_BLOCK;
+
+        for (int32_t o = 0; o < output_dim; ++o, block += Q5K_BYTES_PER_BLOCK) {
+            float acc = 0.0f;
 
             /* d, dMin (FP16 LE -> FP32). */
             const uint16_t d_bits     = (uint16_t) block[0] | ((uint16_t) block[1] << 8);
@@ -121,7 +134,6 @@ SKAINET_API void skainet_q5k_matmul(
 
             const uint8_t* qh = block + Q5K_QH_OFFSET;
             const uint8_t* qs = block + Q5K_QS_OFFSET;
-            const float* in_block = in_base + (size_t) block_idx * Q5K_BLOCK_SIZE;
 
             /* 4 strided qs groups; group j carries sub-blocks 2j (lo) and 2j+1 (hi). */
             for (int group_j = 0; group_j < 4; ++group_j) {
@@ -195,8 +207,8 @@ SKAINET_API void skainet_q5k_matmul(
                 acc += code_sum_lo * scale_lo - input_sum_lo * offset_lo;
                 acc += code_sum_hi * scale_hi - input_sum_hi * offset_hi;
             }
-        }
 
-        out_base[o] = acc;
+            out_base[o] += acc;
+        }
     }
 }
diff --git a/skainet-backends/skainet-backend-native-cpu/native/src/q8_0_matmul.c b/skainet-backends/skainet-backend-native-cpu/native/src/q8_0_matmul.c
@@ -71,18 +71,29 @@ SKAINET_API void skainet_q8_0_matmul(
     const int32_t BLOCK_SIZE = 32;
     const int32_t BYTES_PER_BLOCK = 34;
     const int32_t blocks_per_input_dim = input_dim / BLOCK_SIZE;
+    float* SKAINET_RESTRICT out_base = output + output_offset;
 
-    for (int32_t o = 0; o < output_dim; ++o) {
-        float acc = 0.0f;
-        for (int32_t block_idx = 0; block_idx < blocks_per_input_dim; ++block_idx) {
-            const uint8_t* SKAINET_RESTRICT block =
-                weight + weight_byte_offset +
-                (size_t)(block_idx * output_dim + o) * BYTES_PER_BLOCK;
+    /*
+     * Loop order: block OUTER, output row INNER — see q4k_matmul.c for the
+     * rationale. The weight is block-major (blockIdx*output_dim + o)*34, so for
+     * a fixed block consecutive `o` are 34 bytes apart: weight bytes are read
+     * sequentially instead of striding output_dim*34 per step, which on the
+     * in-order A55 makes every read a cold cache miss. out_base[o] accumulates
+     * across blocks; accumulation order is unchanged ⇒ numerically identical.
+     */
+    for (int32_t o = 0; o < output_dim; ++o) out_base[o] = 0.0f;
+
+    for (int32_t block_idx = 0; block_idx < blocks_per_input_dim; ++block_idx) {
+        const float* SKAINET_RESTRICT input_block =
+            input + input_offset + (size_t) block_idx * BLOCK_SIZE;
+        const uint8_t* SKAINET_RESTRICT block =
+            weight + weight_byte_offset +
+            (size_t)(block_idx * output_dim) * BYTES_PER_BLOCK;
+
+        for (int32_t o = 0; o < output_dim; ++o, block += BYTES_PER_BLOCK) {
             uint16_t d_bits = (uint16_t) block[0] | ((uint16_t) block[1] << 8);
             float d = skainet_fp16_to_fp32(d_bits);
             const int8_t* SKAINET_RESTRICT codes = (const int8_t*) (block + 2);
-            const float* SKAINET_RESTRICT input_block =
-                input + input_offset + (size_t) block_idx * BLOCK_SIZE;
             float block_sum = 0.0f;
 #ifdef SKAINET_HAVE_NEON
             /* Activations are FP32, so widen int8 codes to float and FMA
@@ -107,8 +118,7 @@ SKAINET_API void skainet_q8_0_matmul(
                 block_sum += input_block[k] * (float) codes[k];
             }
 #endif
-            acc += block_sum * d;
+            out_base[o] += block_sum * d;
         }
-        output[output_offset + o] = acc;
     }
 }