diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index 24bd4fed095..fd6335323c8 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -2731,8 +2731,10 @@ static bool ggml_metal_op_flash_attn_ext_use_turbo_flash(const ggml_tensor * op) // Check environment variable to force enable (bypasses other checks) if (turbo_flash_env && turbo_flash_env[0] == '1') return true; - // Default: enabled for all qualifying configurations - return true; + // Default: disabled — TurboFlash two-pass kernel produces corrupt output + // on Apple10 (M5 Max) and possibly other Metal4 GPUs. Use TURBO_FLASH=1 + // to opt in for testing. See PR #91. + return false; } size_t ggml_metal_op_flash_attn_ext_extra_pad(const ggml_tensor * op) { diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 04b9617236c..1997a377fe3 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -2189,7 +2189,8 @@ ggml_tensor * llm_graph_context::build_attn( // TurboQuant: if V was padded, the output has padded dimensions. // Extract original V head_dim after inverse WHT (applied inside build_attn_mha). 
- if (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0) { + // NOTE: gate on v->type (not k->type) for asymmetric configs where K=q8_0 but V=turbo + if (v->type == GGML_TYPE_TURBO3_0 || v->type == GGML_TYPE_TURBO4_0 || v->type == GGML_TYPE_TURBO2_0) { const int64_t orig_v_head = hparams.n_embd_head_v(il); // cur is 2D: (n_embd_head * n_head, n_tokens) after build_attn_mha const int64_t padded_v_head = v->ne[0]; @@ -2415,7 +2416,8 @@ ggml_tensor * llm_graph_context::build_attn( cb(cur, "kqv_out", il); // TurboQuant: if V was padded, extract original V head_dim after inverse WHT - if (k->type == GGML_TYPE_TURBO3_0 || k->type == GGML_TYPE_TURBO4_0 || k->type == GGML_TYPE_TURBO2_0) { + // NOTE: gate on v->type (not k->type) for asymmetric configs where K=q8_0 but V=turbo + if (v->type == GGML_TYPE_TURBO3_0 || v->type == GGML_TYPE_TURBO4_0 || v->type == GGML_TYPE_TURBO2_0) { const int64_t orig_v_head = hparams.n_embd_head_v(il); const int64_t padded_v_head = v->ne[0]; if (padded_v_head != orig_v_head) {