From 6112eb423ae0acb471d33adedf3b68b07aa49bfb Mon Sep 17 00:00:00 2001
From: TheTom
Date: Mon, 20 Apr 2026 08:45:12 -0500
Subject: [PATCH 1/2] fix: gate turbo V unpad on V type, not K type

---
 src/llama-graph.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 04b9617236c..1997a377fe3 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -2189,7 +2189,8 @@ ggml_tensor * llm_graph_context::build_attn(
 
     // TurboQuant: if V was padded, the output has padded dimensions.
     // Extract original V head_dim after inverse WHT (applied inside build_attn_mha).
-    if (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0) {
+    // NOTE: gate on v->type (not k->type) for asymmetric configs where K=q8_0 but V=turbo
+    if (v->type == GGML_TYPE_TURBO3_0 || v->type == GGML_TYPE_TURBO4_0 || v->type == GGML_TYPE_TURBO2_0) {
         const int64_t orig_v_head = hparams.n_embd_head_v(il);
         // cur is 2D: (n_embd_head * n_head, n_tokens) after build_attn_mha
         const int64_t padded_v_head = v->ne[0];
@@ -2415,7 +2416,8 @@ ggml_tensor * llm_graph_context::build_attn(
 
     cb(cur, "kqv_out", il);
     // TurboQuant: if V was padded, extract original V head_dim after inverse WHT
-    if (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0) {
+    // NOTE: gate on v->type (not k->type) for asymmetric configs where K=q8_0 but V=turbo
+    if (v->type == GGML_TYPE_TURBO3_0 || v->type == GGML_TYPE_TURBO4_0 || v->type == GGML_TYPE_TURBO2_0) {
         const int64_t orig_v_head = hparams.n_embd_head_v(il);
         const int64_t padded_v_head = v->ne[0];
         if (padded_v_head != orig_v_head) {

From a1bcb34a18d4a3396453c850d19d27996e4dc391 Mon Sep 17 00:00:00 2001
From: TheTom
Date: Mon, 20 Apr 2026 09:10:46 -0500
Subject: [PATCH 2/2] =?UTF-8?q?fix(metal):=20disable=20TurboFlash=20by=20d?=
 =?UTF-8?q?efault=20=E2=80=94=20corrupt=20output=20on=20Apple10?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The TurboFlash two-pass fused attention kernel produces garbage output
on M5 Max (Apple10/Metal4) for all turbo3 V configs. Disabling it by
default routes turbo3 through the standard FA path, which works
correctly. Users can opt in with TURBO_FLASH=1 for testing/debugging.

No perf regression — the standard FA path matches TurboFlash speed
within noise (~55-57 t/s tg128 for q8_0/turbo3 on M5 Max).

Co-authored-by: tturney@psyguard.ai
Co-authored-by: Claude Opus 4.6 (1M context)
---
 ggml/src/ggml-metal/ggml-metal-ops.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
index 24bd4fed095..fd6335323c8 100644
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -2731,8 +2731,10 @@ static bool ggml_metal_op_flash_attn_ext_use_turbo_flash(const ggml_tensor * op)
 
     // Check environment variable to force enable (bypasses other checks)
    if (turbo_flash_env && turbo_flash_env[0] == '1') return true;
-    // Default: enabled for all qualifying configurations
-    return true;
+    // Default: disabled — TurboFlash two-pass kernel produces corrupt output
+    // on Apple10 (M5 Max) and possibly other Metal4 GPUs. Use TURBO_FLASH=1
+    // to opt in for testing. See PR #91.
+    return false;
 }
 
 size_t ggml_metal_op_flash_attn_ext_extra_pad(const ggml_tensor * op) {
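
Reviewer note, appended after the series (not part of either patch): the body of
the gated TurboQuant block in PATCH 1/2 lies mostly outside the hunk context, so
the sketch below only illustrates the kind of unpad step that block is expected
to perform. It is an assumption, not the actual implementation: it assumes the
attention output cur is contiguous and laid out 2D as
(padded_v_head * n_head, n_tokens), that only the first orig_v_head values of
each head are meaningful after the inverse WHT, and the helper name
unpad_v_heads is made up for illustration. The ggml calls used (ggml_reshape_3d,
ggml_view_3d, ggml_cont, ggml_reshape_2d) are standard ggml API.

#include "ggml.h"

// Hypothetical helper, not taken from the patch: trim a padded attention output
// back to the original V head dim. Assumes cur is contiguous with shape
// (padded_v_head * n_head, n_tokens).
static ggml_tensor * unpad_v_heads(ggml_context * ctx, ggml_tensor * cur,
                                   int64_t padded_v_head, int64_t orig_v_head,
                                   int64_t n_head) {
    const int64_t n_tokens = cur->ne[1];

    // view the flat output per head: (padded_v_head, n_head, n_tokens)
    ggml_tensor * cur_3d = ggml_reshape_3d(ctx, cur, padded_v_head, n_head, n_tokens);

    // keep only the first orig_v_head values of each head; strides stay padded
    ggml_tensor * trimmed = ggml_view_3d(ctx, cur_3d,
            orig_v_head, n_head, n_tokens,
            cur_3d->nb[1], cur_3d->nb[2], 0);

    // compact the strided view and flatten back to (orig_v_head * n_head, n_tokens)
    return ggml_reshape_2d(ctx, ggml_cont(ctx, trimmed), orig_v_head * n_head, n_tokens);
}

In the patched code, a trim of this kind would run only when v->type is one of
the turbo types (the point of PATCH 1/2) and padded_v_head != orig_v_head.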