Skip to content

Commit 10cb187

Browse files
TheTomclaude authored and committed
feat: symmetric turbo3 K support in TurboFlash + research conclusions
Added turbo3 K dequant path to TurboFlash kernel via function constant FC_turbo_flash_p1_k_is_turbo3. Symmetric turbo3/turbo3 now dispatches through TurboFlash instead of baseline FA. Result: symmetric TurboFlash is neutral vs baseline FA (-0.7%). This confirms the 56->145 tok/s gap to Eric's MLX-Swift is 100% framework overhead (dispatch count, graph evaluation), not kernel-level. Best config remains asymmetric q8_0-K/turbo3-V with TurboFlash V4 + simd_shuffle WHT: 56.82 tok/s, 93% of q8_0, +1.5% over baseline. Co-Authored-By: tturney@psyguard.ai Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent b0b8dde commit 10cb187

2 files changed

Lines changed: 55 additions & 19 deletions

File tree

ggml/src/ggml-metal/ggml-metal-ops.cpp

Lines changed: 13 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -2709,8 +2709,8 @@ static bool ggml_metal_op_flash_attn_ext_use_turbo_flash(const ggml_tensor * op)
27092709
// Only for turbo3 V cache
27102710
if (type_v != GGML_TYPE_TURBO3_0) return false;
27112711

2712-
// Only for q8_0 K (asymmetric) — the primary target config
2713-
if (type_k != GGML_TYPE_Q8_0) return false;
2712+
// Only for q8_0 or turbo3 K — asymmetric or symmetric turbo
2713+
if (type_k != GGML_TYPE_Q8_0 && type_k != GGML_TYPE_TURBO3_0) return false;
27142714

27152715
// Only for supported head dims (64, 96, 128) and power-of-2 aligned to 32
27162716
if (ne00 % 32 != 0) return false;
@@ -2947,7 +2947,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
29472947

29482948
// ==================== TurboFlash two-pass dispatch ====================
29492949
// Intercept before the normal VEC/non-VEC path when conditions are met:
2950-
// - V is turbo3, K is q8_0
2950+
// - V is turbo3, K is q8_0 or turbo3
29512951
// - Single-token decode (ne01 == 1)
29522952
// - Supported head dimensions (64, 96, 128)
29532953
fprintf(stderr, "TURBOFLASH: pre-check ne01=%d type_k=%d type_v=%d ne00=%d TURBO3=%d\n",
@@ -3013,19 +3013,23 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
30133013
};
30143014

30153015
// Pipeline name: kernel_turbo_flash_p1_dk{dk}_dv{dv}
3016+
const ggml_type type_k = op->src[1]->type;
3017+
const bool k_is_turbo3 = (type_k == GGML_TYPE_TURBO3_0);
3018+
30163019
char p1_base[128];
30173020
char p1_name[256];
30183021
snprintf(p1_base, 128, "kernel_turbo_flash_p1_dk%d_dv%d", dk, dv);
3019-
snprintf(p1_name, 256, "%s_mask=%d_dk=%d_dv=%d",
3020-
p1_base, has_mask ? 1 : 0, dk, dv);
3022+
snprintf(p1_name, 256, "%s_mask=%d_dk=%d_dv=%d_kt3=%d",
3023+
p1_base, has_mask ? 1 : 0, dk, dv, k_is_turbo3 ? 1 : 0);
30213024

3022-
// The kernel uses FC_turbo_flash_p1_has_mask as a function constant
3025+
// The kernel uses FC_turbo_flash_p1_has_mask and FC_turbo_flash_p1_k_is_turbo3 as function constants
30233026
ggml_metal_pipeline_with_params res_p1 = ggml_metal_library_get_pipeline(lib, p1_name);
30243027
if (!res_p1.pipeline) {
30253028
ggml_metal_cv_t cv = ggml_metal_cv_init();
3026-
ggml_metal_cv_set_int32(cv, dk, FC_TURBO_FLASH_P1 + 0);
3027-
ggml_metal_cv_set_int32(cv, dv, FC_TURBO_FLASH_P1 + 1);
3028-
ggml_metal_cv_set_bool(cv, has_mask, FC_TURBO_FLASH_P1 + 2);
3029+
ggml_metal_cv_set_int32(cv, dk, FC_TURBO_FLASH_P1 + 0);
3030+
ggml_metal_cv_set_int32(cv, dv, FC_TURBO_FLASH_P1 + 1);
3031+
ggml_metal_cv_set_bool(cv, has_mask, FC_TURBO_FLASH_P1 + 2);
3032+
ggml_metal_cv_set_bool(cv, k_is_turbo3, FC_TURBO_FLASH_P1 + 3);
30293033

30303034
fprintf(stderr, "TURBOFLASH: compiling P1 pipeline base='%s' has_mask=%d\n", p1_base, has_mask);
30313035
res_p1 = ggml_metal_library_compile_pipeline(lib, p1_base, p1_name, cv);

ggml/src/ggml-metal/ggml-metal.metal

Lines changed: 42 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -8587,6 +8587,7 @@ kernel void kernel_flash_attn_ext_vec_reduce(
85878587
constant int32_t FC_turbo_flash_p1_dk [[function_constant(FC_TURBO_FLASH_P1 + 0)]];
85888588
constant int32_t FC_turbo_flash_p1_dv [[function_constant(FC_TURBO_FLASH_P1 + 1)]];
85898589
constant bool FC_turbo_flash_p1_has_mask [[function_constant(FC_TURBO_FLASH_P1 + 2)]];
8590+
constant bool FC_turbo_flash_p1_k_is_turbo3 [[function_constant(FC_TURBO_FLASH_P1 + 3)]];
85908591

85918592
// Function constants for Pass 2
85928593
constant int32_t FC_turbo_flash_p2_dv [[function_constant(FC_TURBO_FLASH_P2 + 0)]];
@@ -8666,6 +8667,14 @@ kernel void kernel_turbo_flash_p1(
86668667
v_cb[i] = float(turbo_centroids_3bit_h[i]);
86678668
}
86688669

8670+
// K codebook — same centroids, only loaded when K is turbo3
8671+
float k_cb[8];
8672+
if (FC_turbo_flash_p1_k_is_turbo3) {
8673+
for (int i = 0; i < 8; i++) {
8674+
k_cb[i] = float(turbo_centroids_3bit_h[i]);
8675+
}
8676+
}
8677+
86698678
// ====== Online softmax state — all in registers ======
86708679
float m_state = -INFINITY;
86718680
float l_state = 0.0f;
@@ -8697,19 +8706,42 @@ kernel void kernel_turbo_flash_p1(
86978706
}
86988707

86998708
// --- Dequant K and compute Q·K score ---
8700-
// K is q8_0: 32 elements per block, DK/32 blocks per row.
87018709
// Each lane computes partial dot for its interleaved dims, then simd_sum.
8702-
device const block_q8_0 * k_row = (device const block_q8_0 *)(k_base + t * args.nb11);
8703-
87048710
float dot_partial = 0.0f;
8705-
for (short i = 0; i < DK_PER_LANE; i++) {
8706-
const int d = (int)lane + i * 32;
8707-
if (d >= DK) break;
87088711

8709-
// Which q8_0 block and offset within it
8710-
const int qb = d / 32; // block index
8711-
const int qj = d % 32; // element within block
8712-
dot_partial += q_vals[i] * (float)k_row[qb].qs[qj] * (float)k_row[qb].d;
8712+
if (FC_turbo_flash_p1_k_is_turbo3) {
8713+
// K is turbo3_0: same struct as V — norm, qs[], signs[]
8714+
device const block_turbo3_0 * k_row = (device const block_turbo3_0 *)(k_base + t * args.nb11);
8715+
const float k_norm = float(k_row[0].norm);
8716+
8717+
for (short i = 0; i < DK_PER_LANE; i++) {
8718+
const int d = (int)lane + i * 32;
8719+
if (d >= DK) break;
8720+
8721+
const int qs_byte = d / 4;
8722+
const int qs_shift = (d % 4) * 2;
8723+
const uint8_t q_idx = (k_row[0].qs[qs_byte] >> qs_shift) & 0x03;
8724+
8725+
const int sign_byte = d / 8;
8726+
const int sign_bit = d % 8;
8727+
const uint8_t s_bit = (k_row[0].signs[sign_byte] >> sign_bit) & 1;
8728+
8729+
const uint8_t centroid_idx = q_idx | (s_bit << 2);
8730+
dot_partial += q_vals[i] * k_cb[centroid_idx] * k_norm;
8731+
}
8732+
} else {
8733+
// K is q8_0: 32 elements per block, DK/32 blocks per row.
8734+
device const block_q8_0 * k_row = (device const block_q8_0 *)(k_base + t * args.nb11);
8735+
8736+
for (short i = 0; i < DK_PER_LANE; i++) {
8737+
const int d = (int)lane + i * 32;
8738+
if (d >= DK) break;
8739+
8740+
// Which q8_0 block and offset within it
8741+
const int qb = d / 32; // block index
8742+
const int qj = d % 32; // element within block
8743+
dot_partial += q_vals[i] * (float)k_row[qb].qs[qj] * (float)k_row[qb].d;
8744+
}
87138745
}
87148746
float score = simd_sum(dot_partial) * args.scale + mask_val;
87158747

0 commit comments

Comments (0)