Fix q8 Turbo V CUDA FA decode routing

Anbeeld · Anbeeld · commit 6fa65bd79e95 · 2026-06-14T16:46:00.000+02:00
Route Turbo-V decode cases with q8_0 K at D>=256 away from the unsafe vector FlashAttention path, matching the existing classic-K guard without broadening classic non-q8 semantics.

Add route-policy coverage so q8_0 is included only in the Turbo-V unsafe-K policy.
diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
@@ -1471,14 +1471,19 @@ static inline bool ggml_cuda_fattn_prefill_mma_can_materialize_turbo_k_classic_v
            ggml_cuda_fattn_is_classic_non_q8_type(V->type);
 }
 
+static inline bool ggml_cuda_fattn_is_turbo_v_decode_unsafe_k_type(const ggml_type type) {
+    return type == GGML_TYPE_Q8_0 || // q8_0 is added here so the classic non-q8 helper itself stays unchanged
+           ggml_cuda_fattn_is_classic_non_q8_type(type); // plus every K type the classic non-q8 helper accepts
+}
+
 // Shape guard for the effective K/V pair after Turbo V decode-dequant.
-// Gemma-like D>=256 with classic_K/f16 (non-q8) is unsafe on the vec path.
+// D>=256 with classic-or-q8 K/f16 V is unsafe on the vec path.
 // Only applied when V was actually decoded from Turbo — explicit q5_0/f16
 // at D>=256 is unaffected. D=128 is safe on vec and not gated.
 static inline bool ggml_cuda_fattn_effective_vec_shape_unsafe(
         const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V) {
     return Q->ne[0] >= 256 &&
-           ggml_cuda_fattn_is_classic_non_q8_type(K->type) &&
+           ggml_cuda_fattn_is_turbo_v_decode_unsafe_k_type(K->type) &&
            V->type == GGML_TYPE_F16;
 }
 
@@ -2502,8 +2507,8 @@ static ggml_cuda_fattn_route_plan ggml_cuda_fattn_make_route_plan(const int devi
     }
 
     // If V was decoded from Turbo to f16 and the effective pair is
-    // classic_K/f16 at D>=256, the vec path is unsafe. Only gate vec for
-    // Turbo-originated f16 V — explicit q5_0/f16 at D>=256 is unaffected.
+    // classic-or-q8 K/f16 at D>=256, the vec path is unsafe. Only gate vec
+    // for Turbo-originated f16 V — explicit q5_0/f16 at D>=256 is unaffected.
     // Disable vec so the existing kernel selector picks MMA_F16 or tile with
     // generic f16 K conversion. D=128 is fine on vec and is not affected.
     plan.unsafe_vec_after_turbo_v_decode =
diff --git a/tests/test-cuda-fattn-route-policy.cpp b/tests/test-cuda-fattn-route-policy.cpp
@@ -57,6 +57,15 @@ int main(int argc, char ** argv) {
     const std::string prefill_policy = slice_between(fattn,
             "static inline bool ggml_cuda_fattn_prefill_mma_can_materialize_turbo_k_classic_v",
             "// Shape guard for the effective K/V pair after Turbo V decode-dequant.");
+    const std::string classic_non_q8 = slice_between(fattn,
+            "static inline bool ggml_cuda_fattn_is_classic_non_q8_type",
+            "static void ggml_cuda_fattn_materialize_to_f16");
+    const std::string unsafe_k_helper = slice_between(fattn,
+            "static inline bool ggml_cuda_fattn_is_turbo_v_decode_unsafe_k_type",
+            "static inline bool ggml_cuda_fattn_effective_vec_shape_unsafe");
+    const std::string unsafe_shape = slice_between(fattn,
+            "static inline bool ggml_cuda_fattn_effective_vec_shape_unsafe",
+            "static void ggml_cuda_flash_attn_ext_vec");
     const std::string exec = slice_between(fattn,
             "void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst)",
             "bool ggml_cuda_flash_attn_ext_support");
@@ -93,5 +102,17 @@ int main(int argc, char ** argv) {
                  prefill_policy.find("ggml_cuda_fattn_is_classic_non_q8_type(V->type)") != std::string::npos,
         "Turbo K + classic V prefill eligibility must not broaden classic-K/Turbo-V routing");
 
+    ok &= expect(!classic_non_q8.empty() &&
+                 classic_non_q8.find("GGML_TYPE_Q8_0") == std::string::npos,
+        "classic non-q8 helper must not be broadened to include q8_0");
+    ok &= expect(!unsafe_k_helper.empty() &&
+                 unsafe_k_helper.find("GGML_TYPE_Q8_0") != std::string::npos &&
+                 unsafe_k_helper.find("ggml_cuda_fattn_is_classic_non_q8_type(type)") != std::string::npos,
+        "Turbo V decode unsafe-K policy must cover q8_0 plus classic non-q8 K types");
+    ok &= expect(!unsafe_shape.empty() &&
+                 unsafe_shape.find("ggml_cuda_fattn_is_turbo_v_decode_unsafe_k_type(K->type)") != std::string::npos &&
+                 unsafe_shape.find("V->type == GGML_TYPE_F16") != std::string::npos,
+        "Turbo V decode shape guard must use the unsafe-K policy for effective f16 V");
+
     return ok ? 0 : 1;
 }