fix: turbo4 SET_ROWS corruption, tail-block truncation, constant coupling (Issue PrismML-Eng#29)

seanrasch · claude · seanrasch · commit 65ed3729535d · 2026-03-26T22:13:32.000-04:00
Three bugs from the block-size-32 refactor:

1. kernel_set_rows_turbo hardcoded turbo3 packing for turbo4 — split into separate kernel_set_rows_turbo3 and kernel_set_rows_turbo4 kernels. turbo4 now correctly does 3-bit PolarQuant + QJL residual correction.
2. Integer division in n_groups = nk0 / blocks_per_group silently dropped tail blocks for non-128-aligned head dims (e.g. dk=192). Added ceiling division with tail-group bounds checking in turbo3, and GGML_ASSERT in WHT dispatch to catch non-128-aligned tensors.
3. TURBO_D constant was semantically coupled to QK_TURBO4 — replaced with TURBO_ROT_DIM (= QK_TURBO3_GROUP) and added static_assert that QK_TURBO4 == QK_TURBO3_GROUP to guard against future drift.

Closes PrismML-Eng#29

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -1659,6 +1659,7 @@ int ggml_metal_op_turbo_wht(ggml_metal_op_t ctx, int idx) {
     memcpy(&direction, op->op_params, sizeof(int));
 
     const int64_t n_elements = ggml_nelements(op->src[0]);
+    GGML_ASSERT(n_elements % 128 == 0 && "TURBO_WHT requires head_dim to be a multiple of 128");
     const int64_t n_groups = n_elements / 128;
 
     auto pipeline = ggml_metal_library_get_pipeline_turbo_wht(lib);
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
@@ -452,7 +452,7 @@ constant float turbo_mid_2bit[3] = { -0.086728f, 0.0f, 0.086728f };
 constant float turbo_mid_3bit[7] = { -0.154259f, -0.091775f, -0.043589f, 0.0f, 0.043589f, 0.091775f, 0.154259f };
 
 // Quantize 32 elements into one block_turbo3_0 (NO rotation — rotation happens
-// at the 128-element group level in kernel_set_rows_turbo)
+// at the 128-element group level in kernel_set_rows_turbo3)
 void quantize_turbo3_0(device const float * src, device block_turbo3_0 & dst) {
 #pragma METAL fp math_mode(safe)
     // Compute norm for this 32-element sub-block
@@ -9489,12 +9489,11 @@ kernel void kernel_set_rows_q32(
     }
 }
 
-// TurboQuant set_rows kernel — block size 128 (QK_TURBO3/QK_TURBO4)
-// TurboQuant SET_ROWS kernel — processes QK_TURBO3_GROUP (128) elements per iteration,
+// TurboQuant3 SET_ROWS kernel — processes QK_TURBO3_GROUP (128) elements per iteration,
 // writes QK_TURBO3_GROUP/QK_TURBO3 (4) blocks per iteration.
 // The rotation operates on 128 elements, then results are split into 32-element blocks.
-template<typename TI, typename block_q, int QK, void (*quantize_func)(device const float *, device block_q &)>
-kernel void kernel_set_rows_turbo(
+template<typename TI>
+kernel void kernel_set_rows_turbo3(
         constant ggml_metal_kargs_set_rows & args,
         device const  void * src0,
         device const  void * src1,
@@ -9512,44 +9511,48 @@ kernel void kernel_set_rows_turbo(
     const int32_t i10 = i01;
     const TI      i1  = ((const device TI *) ((const device char *) src1 + i10*args.nb10 + i11*args.nb11 + i12*args.nb12))[0];
 
-          device block_q * dst_row = (      device block_q *) ((      device char *) dst  +  i1*args.nb1  + i02*args.nb2  + i03*args.nb3);
-    const device float   * src_row = (const device float   *) ((const device char *) src0 + i01*args.nb01 + i02*args.nb02 + i03*args.nb03);
+          device block_turbo3_0 * dst_row = (      device block_turbo3_0 *) ((      device char *) dst  +  i1*args.nb1  + i02*args.nb2  + i03*args.nb3);
+    const device float           * src_row = (const device float         *) ((const device char *) src0 + i01*args.nb01 + i02*args.nb02 + i03*args.nb03);
 
-    // Process in groups of 4 blocks (128 elements) for rotation
-    const int blocks_per_group = QK_TURBO3_GROUP / QK;  // 128/32 = 4
-    const int n_groups = args.nk0 / blocks_per_group;
+    // Process in groups of 4 blocks (128 elements) for rotation.
+    // Use ceiling division so tail blocks for non-128-aligned head dims are not dropped.
+    const int blocks_per_group = QK_TURBO3_GROUP / QK_TURBO3;  // 128/32 = 4
+    const int n_groups = (args.nk0 + blocks_per_group - 1) / blocks_per_group;
 
     for (int grp = tiitg%tptg.x; grp < n_groups; grp += tptg.x) {
         const device float * grp_src = src_row + QK_TURBO3_GROUP * grp;
 
-        // Normalize and rotate the full 128-element group
+        // How many blocks are valid in this group (may be < 4 for tail group)
+        const int grp_start_block = grp * blocks_per_group;
+        const int grp_blocks = min(blocks_per_group, (int)args.nk0 - grp_start_block);
+        const int grp_elems = grp_blocks * QK_TURBO3;
+
+        // Normalize the valid elements, zero-pad the rest for WHT
         float norm_sq = 0.0f;
-        for (int j = 0; j < QK_TURBO3_GROUP; j++) norm_sq += grp_src[j] * grp_src[j];
+        for (int j = 0; j < grp_elems; j++) norm_sq += grp_src[j] * grp_src[j];
         float grp_norm = sqrt(norm_sq);
         float inv_norm = grp_norm > 1e-10f ? 1.0f / grp_norm : 0.0f;
 
         float x[128];
-        for (int j = 0; j < 128; j++) x[j] = grp_src[j] * inv_norm;
+        for (int j = 0; j < grp_elems; j++) x[j] = grp_src[j] * inv_norm;
+        for (int j = grp_elems; j < 128; j++) x[j] = 0.0f;  // zero-pad tail
         turbo_rotate_forward(x, turbo_wht_signs1, turbo_wht_signs2);
 
-        // Split into 4 blocks of 32 elements each
-        // All blocks store the SAME group norm — centroids are in normalized space
+        // Split into blocks (may be fewer than 4 for tail group)
         // Norm correction (ported from @spiritbuun's CUDA implementation):
-        // Accumulate ||centroid_vector||^2 across all 128 elements, then store
-        // grp_norm / ||centroid_vector|| instead of raw grp_norm. This makes
-        // dequantized vectors have the exact original L2 norm at zero decode cost.
+        // Store grp_norm / ||centroid_vector|| so dequant has exact original L2 norm.
         float recon_norm_sq = 0.0f;
 
-        for (int b = 0; b < blocks_per_group; b++) {
-            device block_q & blk = dst_row[grp * blocks_per_group + b];
-            const int off = b * QK;
+        for (int b = 0; b < grp_blocks; b++) {
+            device block_turbo3_0 & blk = dst_row[grp_start_block + b];
+            const int off = b * QK_TURBO3;
 
-            for (int j = 0; j < QK / 4; j++) blk.qs[j] = 0;
-            for (int j = 0; j < QK / 8; j++) blk.signs[j] = 0;
+            for (int j = 0; j < QK_TURBO3 / 4; j++) blk.qs[j] = 0;
+            for (int j = 0; j < QK_TURBO3 / 8; j++) blk.signs[j] = 0;
 
-            // Quantize rotated values to 3-bit centroids
-            for (int j = 0; j < QK; j++) {
-                float rv = x[off + j];  // rotated, normalized value
+            // Quantize rotated values to 3-bit centroids (split: 2-bit low in qs, 1-bit high in signs)
+            for (int j = 0; j < QK_TURBO3; j++) {
+                float rv = x[off + j];
                 uint8_t idx;
                 if      (rv < turbo_mid_3bit[0]) idx = 0;
                 else if (rv < turbo_mid_3bit[1]) idx = 1;
@@ -9563,18 +9566,110 @@ kernel void kernel_set_rows_turbo(
                 blk.qs[j / 4] |= (idx & 0x3) << ((j % 4) * 2);
                 if (idx & 0x4) blk.signs[j / 8] |= (1 << (j % 8));
 
-                // Accumulate centroid reconstruction norm for norm correction
                 float c = turbo_centroids_3bit[idx];
                 recon_norm_sq += c * c;
             }
         }
 
         // Norm correction: store corrected norm so dequant(x) has exact original L2 norm.
-        // Zero decode cost — dequant already multiplies by stored norm.
         float recon_norm = sqrt(recon_norm_sq);
         float corrected_norm = (recon_norm > 1e-10f) ? grp_norm / recon_norm : grp_norm;
-        for (int b = 0; b < blocks_per_group; b++) {
-            dst_row[grp * blocks_per_group + b].norm = half(corrected_norm);
+        for (int b = 0; b < grp_blocks; b++) {
+            dst_row[grp_start_block + b].norm = half(corrected_norm);
+        }
+    }
+}
+
+// TurboQuant4 SET_ROWS kernel — quantizes each QK_TURBO4-element (128) float chunk of a source row into one block_turbo4_0.
+// Turbo4 = 3-bit PolarQuant + 1-bit QJL residual correction. src1 supplies, per source row, the destination row index (type TI).
+// Unlike turbo3 which splits 128-element groups into 4x32-element blocks,
+// turbo4 uses a single 128-element block with packed 3-bit indices + QJL signs. Scratch arrays below are hardcoded to 128 floats.
+template<typename TI>
+kernel void kernel_set_rows_turbo4(
+        constant ggml_metal_kargs_set_rows & args,
+        device const  void * src0, // source rows, read as float
+        device const  void * src1, // destination row indices, read as TI
+        device       float * dst, // destination buffer, written as block_turbo4_0 rows
+        uint3                tgpig[[threadgroup_position_in_grid]],
+        uint                 tiitg[[thread_index_in_threadgroup]],
+        uint3                tptg [[threads_per_threadgroup]]) {
+    const int32_t i03 = tgpig.z;
+    const int32_t i02 = tgpig.y;
+    const int32_t i12 = i03%args.ne12;
+    const int32_t i11 = i02%args.ne11;
+    const int32_t i01 = tgpig.x*tptg.y + tiitg/tptg.x; // source row handled by this thread
+    if (i01 >= args.ne01) return; // nothing to do once the row index reaches args.ne01
+
+    const int32_t i10 = i01;
+    const TI      i1  = ((const device TI *) ((const device char *) src1 + i10*args.nb10 + i11*args.nb11 + i12*args.nb12))[0]; // destination row index
+
+          device block_turbo4_0 * dst_row = (      device block_turbo4_0 *) ((      device char *) dst  +  i1*args.nb1  + i02*args.nb2  + i03*args.nb3);
+    const device float           * src_row = (const device float         *) ((const device char *) src0 + i01*args.nb01 + i02*args.nb02 + i03*args.nb03);
+
+    // Each block is one 128-element group, so no grouping of several blocks is needed here
+    const int n_blocks = args.nk0;  // presumably nk0 = ne0 / QK_TURBO4 (block units), set by the host dispatch — confirm
+
+    for (int blk_idx = tiitg%tptg.x; blk_idx < n_blocks; blk_idx += tptg.x) { // lane (tiitg % tptg.x) takes blocks lane, lane + tptg.x, ...
+        const device float * blk_src = src_row + QK_TURBO4 * blk_idx;
+        device block_turbo4_0 & blk = dst_row[blk_idx];
+
+        // Step 1: Compute the L2 norm of the block and normalize to unit length (all-zero output if the norm is <= 1e-10)
+        float norm_sq = 0.0f;
+        for (int j = 0; j < QK_TURBO4; j++) norm_sq += blk_src[j] * blk_src[j];
+        float grp_norm = sqrt(norm_sq);
+        float inv_norm = grp_norm > 1e-10f ? 1.0f / grp_norm : 0.0f;
+        blk.norm = half(grp_norm); // raw L2 norm; kernel_set_rows_turbo3 stores a centroid-corrected norm instead
+
+        float x[128];
+        for (int j = 0; j < 128; j++) x[j] = blk_src[j] * inv_norm;
+        float normalized[128]; // copy of the normalized input taken before rotation, used for the residual in Step 4
+        for (int j = 0; j < 128; j++) normalized[j] = x[j];
+
+        // Step 2: WHT rotate in-place (x[] holds the rotated values afterwards)
+        turbo_rotate_forward(x, turbo_wht_signs1, turbo_wht_signs2);
+
+        // Step 3: 3-bit PolarQuant quantization — packed 3-bit indices; qs/signs are cleared first because packing uses |=
+        for (int j = 0; j < QK_TURBO4 * 3 / 8; j++) blk.qs[j] = 0;
+        for (int j = 0; j < QK_TURBO4 / 8; j++) blk.signs[j] = 0;
+
+        float recon[128]; // centroid chosen for each rotated value
+        for (int j = 0; j < 128; j++) {
+            float val = x[j];
+            uint8_t idx; // index of the first turbo_mid_3bit threshold that val is below, or 7 if none
+            if      (val < turbo_mid_3bit[0]) idx = 0;
+            else if (val < turbo_mid_3bit[1]) idx = 1;
+            else if (val < turbo_mid_3bit[2]) idx = 2;
+            else if (val < turbo_mid_3bit[3]) idx = 3;
+            else if (val < turbo_mid_3bit[4]) idx = 4;
+            else if (val < turbo_mid_3bit[5]) idx = 5;
+            else if (val < turbo_mid_3bit[6]) idx = 6;
+            else                              idx = 7;
+            recon[j] = turbo_centroids_3bit[idx];
+
+            // Pack 3-bit index at bit j*3 of qs (may span byte boundary)
+            int bit_offset = j * 3;
+            int byte_idx = bit_offset / 8;
+            int bit_pos = bit_offset % 8;
+            blk.qs[byte_idx] |= (uint8_t)((idx & 0x7) << bit_pos); // the uint8_t cast drops bits shifted past bit 7
+            if (bit_pos > 5 && byte_idx + 1 < QK_TURBO4 * 3 / 8) { // bit_pos 6 or 7: the high bits spill into the next byte
+                blk.qs[byte_idx + 1] |= (uint8_t)((idx & 0x7) >> (8 - bit_pos));
+            }
+        }
+
+        // Step 4: Compute residual and its norm. NOTE(review): normalized[] is pre-rotation while recon[] holds centroids of the rotated x[]; the C reference quantize_row_turbo4_0_ref applies turbo_rotation_t to the centroids before subtracting — confirm this mix of spaces is intended
+        float rnorm_sq = 0.0f;
+        for (int j = 0; j < 128; j++) {
+            x[j] = normalized[j] - recon[j];  // residual in x buffer (the rotated values are no longer needed)
+            rnorm_sq += x[j] * x[j];
+        }
+        blk.rnorm = half(sqrt(rnorm_sq));
+
+        // Step 5: QJL — WHT rotate residual with the QJL sign tables, store one sign bit per element (bit set when the value is >= 0)
+        turbo_rotate_forward(x, turbo_qjl_wht_signs1, turbo_qjl_wht_signs2);
+        for (int i = 0; i < 128; i++) {
+            if (x[i] >= 0.0f) {
+                blk.signs[i / 8] |= (1 << (i % 8));
+            }
         }
     }
 }
@@ -10381,13 +10476,14 @@ template [[host_name("kernel_set_rows_q5_1_i32")]]   kernel set_rows_q32_t kerne
 template [[host_name("kernel_set_rows_iq4_nl_i64")]] kernel set_rows_q32_t kernel_set_rows_q32<int64_t, block_iq4_nl, quantize_iq4_nl>;
 template [[host_name("kernel_set_rows_iq4_nl_i32")]] kernel set_rows_q32_t kernel_set_rows_q32<int32_t, block_iq4_nl, quantize_iq4_nl>;
 
-// TurboQuant set_rows instantiations (block size 128)
-typedef decltype(kernel_set_rows_turbo<int64_t, block_turbo3_0, QK_TURBO3, quantize_turbo3_0>) set_rows_turbo_t;
+// TurboQuant set_rows instantiations — separate turbo3 and turbo4 kernels
+typedef decltype(kernel_set_rows_turbo3<int64_t>) set_rows_turbo3_t;
+typedef decltype(kernel_set_rows_turbo4<int64_t>) set_rows_turbo4_t;
 
-template [[host_name("kernel_set_rows_turbo3_i64")]] kernel set_rows_turbo_t kernel_set_rows_turbo<int64_t, block_turbo3_0, QK_TURBO3, quantize_turbo3_0>;
-template [[host_name("kernel_set_rows_turbo3_i32")]] kernel set_rows_turbo_t kernel_set_rows_turbo<int32_t, block_turbo3_0, QK_TURBO3, quantize_turbo3_0>;
-template [[host_name("kernel_set_rows_turbo4_i64")]] kernel set_rows_turbo_t kernel_set_rows_turbo<int64_t, block_turbo4_0, QK_TURBO4, quantize_turbo4_0>;
-template [[host_name("kernel_set_rows_turbo4_i32")]] kernel set_rows_turbo_t kernel_set_rows_turbo<int32_t, block_turbo4_0, QK_TURBO4, quantize_turbo4_0>;
+template [[host_name("kernel_set_rows_turbo3_i64")]] kernel set_rows_turbo3_t kernel_set_rows_turbo3<int64_t>;
+template [[host_name("kernel_set_rows_turbo3_i32")]] kernel set_rows_turbo3_t kernel_set_rows_turbo3<int32_t>;
+template [[host_name("kernel_set_rows_turbo4_i64")]] kernel set_rows_turbo4_t kernel_set_rows_turbo4<int64_t>;
+template [[host_name("kernel_set_rows_turbo4_i32")]] kernel set_rows_turbo4_t kernel_set_rows_turbo4<int32_t>;
 
 //
 // matrix-matrix multiplication
diff --git a/ggml/src/ggml-turbo-quant.c b/ggml/src/ggml-turbo-quant.c
@@ -19,9 +19,16 @@
 
 #define TURBO_SEED_ROTATION 42
 #define TURBO_SEED_QJL      1042
-#define TURBO_D             128  /* rotation group size = head_dim (independent of block size) */
 #define TURBO_QJL_CONST     1.2533141373155003f  /* sqrt(pi/2) */
 
+/* Rotation group size = QK_TURBO3_GROUP (from ggml-common.h), NOT a separate constant.
+ * turbo4 block size (QK_TURBO4) happens to equal the rotation group size today,
+ * but they are semantically different. The assert below pins them together so turbo4 code can
+ * size scratch arrays with TURBO_ROT_DIM while looping over a block (loop bound presumably QK_TURBO4 — confirm). */
+static_assert(QK_TURBO4 == QK_TURBO3_GROUP,
+    "turbo4 block size must equal rotation group size (both 128)");
+#define TURBO_ROT_DIM QK_TURBO3_GROUP
+
 /* Optimal centroids from paper (scaled by 1/sqrt(d)) */
 /* 1-bit: ±sqrt(2/(pi*d)) */
 static const float CENTROIDS_1BIT[2] = { -0.070711f, 0.070711f };  /* for d=128 */
@@ -37,8 +44,8 @@ static const float CENTROIDS_3BIT[8] = {
 
 /* ---------- rotation matrix (lazy init) ---------- */
 
-static float turbo_rotation[TURBO_D * TURBO_D];
-static float turbo_rotation_t[TURBO_D * TURBO_D]; /* transpose */
+static float turbo_rotation[TURBO_ROT_DIM * TURBO_ROT_DIM];
+static float turbo_rotation_t[TURBO_ROT_DIM * TURBO_ROT_DIM]; /* transpose */
 static int   turbo_rotation_initialized = 0;
 
 /* Simple LCG PRNG for deterministic rotation generation */
@@ -61,11 +68,11 @@ static double turbo_prng_normal(void) {
 static void turbo_init_rotation(void) {
     if (turbo_rotation_initialized) return;
 
-    const int d = TURBO_D;
+    const int d = TURBO_ROT_DIM;
 
     /* Generate random Gaussian matrix */
     turbo_prng_seed(TURBO_SEED_ROTATION);
-    float G[TURBO_D * TURBO_D];
+    float G[TURBO_ROT_DIM * TURBO_ROT_DIM];
     for (int i = 0; i < d * d; i++) {
         G[i] = (float)turbo_prng_normal();
     }
@@ -111,14 +118,14 @@ static void turbo_init_rotation(void) {
 
 /* ---------- QJL projection matrix (lazy init, seed-based) ---------- */
 
-static float turbo_qjl_matrix[TURBO_D * TURBO_D];
-static float turbo_qjl_matrix_t[TURBO_D * TURBO_D];
+static float turbo_qjl_matrix[TURBO_ROT_DIM * TURBO_ROT_DIM];
+static float turbo_qjl_matrix_t[TURBO_ROT_DIM * TURBO_ROT_DIM];
 static int   turbo_qjl_initialized = 0;
 
 static void turbo_init_qjl(void) {
     if (turbo_qjl_initialized) return;
 
-    const int d = TURBO_D;
+    const int d = TURBO_ROT_DIM;
     turbo_prng_seed(TURBO_SEED_QJL);
 
     for (int i = 0; i < d * d; i++) {
@@ -235,7 +242,7 @@ void quantize_row_turbo4_0_ref(const float * GGML_RESTRICT x, block_turbo4_0 * G
         float norm = sqrtf(norm_sq);
 
         /* Normalize */
-        float normalized[TURBO_D];
+        float normalized[TURBO_ROT_DIM];
         if (norm > 1e-10f) {
             const float inv = 1.0f / norm;
             for (int i = 0; i < d; i++) normalized[i] = src[i] * inv;
@@ -244,31 +251,31 @@ void quantize_row_turbo4_0_ref(const float * GGML_RESTRICT x, block_turbo4_0 * G
         }
 
         /* Step 2: Rotate */
-        float rotated[TURBO_D];
+        float rotated[TURBO_ROT_DIM];
         matvec(turbo_rotation, normalized, rotated, d);
 
         /* Step 3: 3-bit quantization */
-        uint8_t indices[TURBO_D];
+        uint8_t indices[TURBO_ROT_DIM];
         for (int i = 0; i < d; i++) {
             indices[i] = (uint8_t)nearest_centroid_3bit(rotated[i]);
         }
 
         /* Step 4: Residual */
-        float reconstructed[TURBO_D];
+        float reconstructed[TURBO_ROT_DIM];
         for (int i = 0; i < d; i++) {
             reconstructed[i] = CENTROIDS_3BIT[indices[i]];
         }
-        float mse_recon[TURBO_D];
+        float mse_recon[TURBO_ROT_DIM];
         matvec(turbo_rotation_t, reconstructed, mse_recon, d);
 
-        float residual[TURBO_D];
+        float residual[TURBO_ROT_DIM];
         for (int i = 0; i < d; i++) {
             residual[i] = normalized[i] - mse_recon[i];
         }
 
 
         /* Step 5: QJL */
-        float projected[TURBO_D];
+        float projected[TURBO_ROT_DIM];
         matvec(turbo_qjl_matrix, residual, projected, d);
 
         /* Pack */
@@ -310,7 +317,7 @@ void dequantize_row_turbo4_0(const block_turbo4_0 * GGML_RESTRICT x, float * GGM
         float norm  = GGML_FP16_TO_FP32(x[block].norm);
 
         /* Unpack 3-bit indices */
-        uint8_t indices[TURBO_D];
+        uint8_t indices[TURBO_ROT_DIM];
         for (int i = 0; i < d; i++) {
             int bit_offset = i * 3;
             int byte_idx   = bit_offset / 8;
@@ -323,7 +330,7 @@ void dequantize_row_turbo4_0(const block_turbo4_0 * GGML_RESTRICT x, float * GGM
         }
 
         /* Unpack signs */
-        float signs[TURBO_D];
+        float signs[TURBO_ROT_DIM];
         for (int i = 0; i < d; i++) {
             signs[i] = (x[block].signs[i / 8] & (1 << (i % 8))) ? 1.0f : -1.0f;
         }
@@ -332,15 +339,15 @@ void dequantize_row_turbo4_0(const block_turbo4_0 * GGML_RESTRICT x, float * GGM
         const float qjl_scale = TURBO_QJL_CONST / (float)d * rnorm;
 
         /* PolarQuant dequant */
-        float rotated_recon[TURBO_D];
+        float rotated_recon[TURBO_ROT_DIM];
         for (int i = 0; i < d; i++) {
             rotated_recon[i] = CENTROIDS_3BIT[indices[i]];
         }
-        float mse_recon[TURBO_D];
+        float mse_recon[TURBO_ROT_DIM];
         matvec(turbo_rotation_t, rotated_recon, mse_recon, d);
 
         /* QJL dequant */
-        float qjl_recon[TURBO_D];
+        float qjl_recon[TURBO_ROT_DIM];
         matvec(turbo_qjl_matrix_t, signs, qjl_recon, d);
         for (int i = 0; i < d; i++) {
             qjl_recon[i] *= qjl_scale;