From 6112eb423ae0acb471d33adedf3b68b07aa49bfb Mon Sep 17 00:00:00 2001
From: TheTom
Date: Mon, 20 Apr 2026 08:45:12 -0500
Subject: [PATCH 1/2] fix: gate turbo V unpad on V type, not K type

---
 src/llama-graph.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 04b9617236c..1997a377fe3 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -2189,7 +2189,8 @@ ggml_tensor * llm_graph_context::build_attn(
 
     // TurboQuant: if V was padded, the output has padded dimensions.
     // Extract original V head_dim after inverse WHT (applied inside build_attn_mha).
-    if (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0) {
+    // NOTE: gate on v->type (not k->type) for asymmetric configs where K=q8_0 but V=turbo
+    if (v->type == GGML_TYPE_TURBO3_0 || v->type == GGML_TYPE_TURBO4_0 || v->type == GGML_TYPE_TURBO2_0) {
         const int64_t orig_v_head = hparams.n_embd_head_v(il);
         // cur is 2D: (n_embd_head * n_head, n_tokens) after build_attn_mha
         const int64_t padded_v_head = v->ne[0];
@@ -2415,7 +2416,8 @@ ggml_tensor * llm_graph_context::build_attn(
 
     cb(cur, "kqv_out", il);
     // TurboQuant: if V was padded, extract original V head_dim after inverse WHT
-    if (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0) {
+    // NOTE: gate on v->type (not k->type) for asymmetric configs where K=q8_0 but V=turbo
+    if (v->type == GGML_TYPE_TURBO3_0 || v->type == GGML_TYPE_TURBO4_0 || v->type == GGML_TYPE_TURBO2_0) {
         const int64_t orig_v_head = hparams.n_embd_head_v(il);
         const int64_t padded_v_head = v->ne[0];
         if (padded_v_head != orig_v_head) {

From a1bcb34a18d4a3396453c850d19d27996e4dc391 Mon Sep 17 00:00:00 2001
From: TheTom
Date: Mon, 20 Apr 2026 09:10:46 -0500
Subject: [PATCH 2/2] =?UTF-8?q?fix(metal):=20disable=20TurboFlash=20by=20d?=
 =?UTF-8?q?efault=20=E2=80=94=20corrupt=20output=20on=20Apple10?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The TurboFlash two-pass fused attention kernel produces garbage output
on M5 Max (Apple10/Metal4) for all turbo3 V configs. Disabling it by
default routes turbo3 through the standard FA path, which works
correctly. Users can opt in with TURBO_FLASH=1 for testing/debugging.

No perf regression — the standard FA path matches TurboFlash speed
within noise (~55-57 t/s tg128 for q8_0/turbo3 on M5 Max).

Co-authored-by: tturney@psyguard.ai
Co-authored-by: Claude Opus 4.6 (1M context)
---
 ggml/src/ggml-metal/ggml-metal-ops.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
index 24bd4fed095..fd6335323c8 100644
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -2731,8 +2731,10 @@ static bool ggml_metal_op_flash_attn_ext_use_turbo_flash(const ggml_tensor * op)
 
     // Check environment variable to force enable (bypasses other checks)
    if (turbo_flash_env && turbo_flash_env[0] == '1') return true;
-    // Default: enabled for all qualifying configurations
-    return true;
+    // Default: disabled — TurboFlash two-pass kernel produces corrupt output
+    // on Apple10 (M5 Max) and possibly other Metal4 GPUs. Use TURBO_FLASH=1
+    // to opt in for testing. See PR #91.
+    return false;
 }
 
 size_t ggml_metal_op_flash_attn_ext_extra_pad(const ggml_tensor * op) {
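
Reviewer note, appended after the series (not part of either patch): the body of
the gated TurboQuant block in PATCH 1/2 lies mostly outside the hunk context, so
the sketch below only illustrates the kind of unpad step that block is expected
to perform. It is an assumption, not the actual implementation: it assumes the
attention output cur is contiguous and laid out 2D as
(padded_v_head * n_head, n_tokens), that only the first orig_v_head values of
each head are meaningful after the inverse WHT, and the helper name
unpad_v_heads is made up for illustration. The ggml calls used (ggml_reshape_3d,
ggml_view_3d, ggml_cont, ggml_reshape_2d) are standard ggml API.

#include "ggml.h"

// Hypothetical helper, not taken from the patch: trim a padded attention output
// back to the original V head dim. Assumes cur is contiguous with shape
// (padded_v_head * n_head, n_tokens).
static ggml_tensor * unpad_v_heads(ggml_context * ctx, ggml_tensor * cur,
                                   int64_t padded_v_head, int64_t orig_v_head,
                                   int64_t n_head) {
    const int64_t n_tokens = cur->ne[1];

    // view the flat output per head: (padded_v_head, n_head, n_tokens)
    ggml_tensor * cur_3d = ggml_reshape_3d(ctx, cur, padded_v_head, n_head, n_tokens);

    // keep only the first orig_v_head values of each head; strides stay padded
    ggml_tensor * trimmed = ggml_view_3d(ctx, cur_3d,
            orig_v_head, n_head, n_tokens,
            cur_3d->nb[1], cur_3d->nb[2], 0);

    // compact the strided view and flatten back to (orig_v_head * n_head, n_tokens)
    return ggml_reshape_2d(ctx, ggml_cont(ctx, trimmed), orig_v_head * n_head, n_tokens);
}

In the patched code, a trim of this kind would run only when v->type is one of
the turbo types (the point of PATCH 1/2) and padded_v_head != orig_v_head.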