Skip to content

Commit 86365b2

Browse files
michalharakal and claude
committed
perf(native q5k,q8_0): block-outer loop order for sequential weight reads
Apply the same cache-locality fix as q4k_matmul (d998feb) to the Q5_K and Q8_0 kernels: iterate block-OUTER / output-row-INNER so the block-major weight (blockIdx*output_dim + o)*bytes is read sequentially (stride = one block) instead of striding output_dim*bytes per step — the strided pattern makes every weight read a cold miss on the in-order A55. out_base[o] accumulates across blocks in the same block order. For Q8_0 the sequence of float additions is unchanged, so results are numerically identical; for Q5_K each block's partial sum is now formed in a local `acc` before being added to out_base[o], which re-associates the sum, so results can differ from the previous form by float rounding. Both validated on host against the Panama reference (NativeQ5KMatmulKernelParityTest, NativeQ8_0MatmulKernelParityTest green). Not exercised by TinyLlama Q4_K_M (Q4_K + Q6_K + F32 only), so no board delta for that model — this keeps the K-quant kernels consistent and benefits any model that uses Q5_K/Q8_0 weights. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 453ff40 commit 86365b2

2 files changed

Lines changed: 41 additions & 19 deletions

File tree

skainet-backends/skainet-backend-native-cpu/native/src/q5k_matmul.c

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -103,12 +103,25 @@ SKAINET_API void skainet_q5k_matmul(
103103
int scale_idx[Q5K_SUB_BLOCKS];
104104
int min_idx[Q5K_SUB_BLOCKS];
105105

106-
for (int32_t o = 0; o < output_dim; ++o) {
107-
float acc = 0.0f;
108-
109-
for (int32_t block_idx = 0; block_idx < blocks_per_input_dim; ++block_idx) {
110-
const uint8_t* block = weight + weight_byte_offset
111-
+ (size_t)(block_idx * output_dim + o) * Q5K_BYTES_PER_BLOCK;
106+
/*
107+
* Loop order: block OUTER, output row INNER — see q4k_matmul.c for the
108+
* rationale. The weight is block-major (blockIdx*output_dim + o)*176, so for
109+
* a fixed block consecutive `o` are 176 bytes apart: weight bytes are read
110+
* sequentially (cache/prefetch friendly) instead of striding output_dim*176
111+
* per step, which on the in-order A55 makes every read a cold miss.
112+
* out_base[o] accumulates across blocks (a per-o register `acc` holds the
113+
* inner sum); accumulation order over blocks is unchanged ⇒ numerically
114+
* identical to the o-outer form.
115+
*/
116+
for (int32_t o = 0; o < output_dim; ++o) out_base[o] = 0.0f;
117+
118+
for (int32_t block_idx = 0; block_idx < blocks_per_input_dim; ++block_idx) {
119+
const float* in_block = in_base + (size_t) block_idx * Q5K_BLOCK_SIZE;
120+
const uint8_t* block = weight + weight_byte_offset
121+
+ (size_t)(block_idx * output_dim) * Q5K_BYTES_PER_BLOCK;
122+
123+
for (int32_t o = 0; o < output_dim; ++o, block += Q5K_BYTES_PER_BLOCK) {
124+
float acc = 0.0f;
112125

113126
/* d, dMin (FP16 LE -> FP32). */
114127
const uint16_t d_bits = (uint16_t) block[0] | ((uint16_t) block[1] << 8);
@@ -121,7 +134,6 @@ SKAINET_API void skainet_q5k_matmul(
121134

122135
const uint8_t* qh = block + Q5K_QH_OFFSET;
123136
const uint8_t* qs = block + Q5K_QS_OFFSET;
124-
const float* in_block = in_base + (size_t) block_idx * Q5K_BLOCK_SIZE;
125137

126138
/* 4 strided qs groups; group j carries sub-blocks 2j (lo) and 2j+1 (hi). */
127139
for (int group_j = 0; group_j < 4; ++group_j) {
@@ -195,8 +207,8 @@ SKAINET_API void skainet_q5k_matmul(
195207
acc += code_sum_lo * scale_lo - input_sum_lo * offset_lo;
196208
acc += code_sum_hi * scale_hi - input_sum_hi * offset_hi;
197209
}
198-
}
199210

200-
out_base[o] = acc;
211+
out_base[o] += acc;
212+
}
201213
}
202214
}

skainet-backends/skainet-backend-native-cpu/native/src/q8_0_matmul.c

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -71,18 +71,29 @@ SKAINET_API void skainet_q8_0_matmul(
7171
const int32_t BLOCK_SIZE = 32;
7272
const int32_t BYTES_PER_BLOCK = 34;
7373
const int32_t blocks_per_input_dim = input_dim / BLOCK_SIZE;
74+
float* SKAINET_RESTRICT out_base = output + output_offset;
7475

75-
for (int32_t o = 0; o < output_dim; ++o) {
76-
float acc = 0.0f;
77-
for (int32_t block_idx = 0; block_idx < blocks_per_input_dim; ++block_idx) {
78-
const uint8_t* SKAINET_RESTRICT block =
79-
weight + weight_byte_offset +
80-
(size_t)(block_idx * output_dim + o) * BYTES_PER_BLOCK;
76+
/*
77+
* Loop order: block OUTER, output row INNER — see q4k_matmul.c for the
78+
* rationale. The weight is block-major (blockIdx*output_dim + o)*34, so for
79+
* a fixed block consecutive `o` are 34 bytes apart: weight bytes are read
80+
* sequentially instead of striding output_dim*34 per step, which on the
81+
* in-order A55 makes every read a cold cache miss. out_base[o] accumulates
82+
* across blocks; accumulation order is unchanged ⇒ numerically identical.
83+
*/
84+
for (int32_t o = 0; o < output_dim; ++o) out_base[o] = 0.0f;
85+
86+
for (int32_t block_idx = 0; block_idx < blocks_per_input_dim; ++block_idx) {
87+
const float* SKAINET_RESTRICT input_block =
88+
input + input_offset + (size_t) block_idx * BLOCK_SIZE;
89+
const uint8_t* SKAINET_RESTRICT block =
90+
weight + weight_byte_offset +
91+
(size_t)(block_idx * output_dim) * BYTES_PER_BLOCK;
92+
93+
for (int32_t o = 0; o < output_dim; ++o, block += BYTES_PER_BLOCK) {
8194
uint16_t d_bits = (uint16_t) block[0] | ((uint16_t) block[1] << 8);
8295
float d = skainet_fp16_to_fp32(d_bits);
8396
const int8_t* SKAINET_RESTRICT codes = (const int8_t*) (block + 2);
84-
const float* SKAINET_RESTRICT input_block =
85-
input + input_offset + (size_t) block_idx * BLOCK_SIZE;
8697
float block_sum = 0.0f;
8798
#ifdef SKAINET_HAVE_NEON
8899
/* Activations are FP32, so widen int8 codes to float and FMA
@@ -107,8 +118,7 @@ SKAINET_API void skainet_q8_0_matmul(
107118
block_sum += input_block[k] * (float) codes[k];
108119
}
109120
#endif
110-
acc += block_sum * d;
121+
out_base[o] += block_sum * d;
111122
}
112-
output[output_offset + o] = acc;
113123
}
114124
}

0 commit comments

Comments
 (0)