Add mixed-type Metal FA kernels for auto-asymmetric K/V

luvwinnie · luvwinnie · commit f1748ac42d18 · 2026-03-30T10:46:42.000+09:00
turbo4_1 and turbo3_1 auto-promote K by 1 bit (turbo4_1 → K=turbo5_1/V=turbo4_1; turbo3_1 → K=turbo4_1/V=turbo3_1).
Previously this fell back to CPU scalar attention (47 t/s).
Now with mixed-type Metal flash attention kernels: 73 t/s (+53%).

Changes:
- ggml-metal.metal: 8 new FA kernel instantiations for mixed K/V
  (4 batched + 4 vec, for turbo and rq auto-asymmetric pairs; dk64/dv64 only)
- ggml-metal-device.cpp: pipeline naming includes V type when K!=V
- ggml-metal-device.m: allow mixed turbo/rq types in supports_op
- ggml-metal-ops.cpp: disable the K==V type assertion (it is commented out for all types, not only turbo/rq; mixed pairs are gated by supports_op instead)

Results (gpt-oss-120b, M3 Ultra):
  turbo4_1: 47→73 t/s (+53%), correct output
  turbo3_1: 47→75 t/s (+59%), marginal quality
  turbo5_1: 76 t/s (unchanged, symmetric)
  q8_0: 80 t/s (baseline)
diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp
@@ -1321,11 +1321,19 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext(
     // do bounds checks for the mask?
     const bool bc_mask = op->src[3] && (op->src[3]->ne[1] % 8 != 0);
 
-    snprintf(base, 256, "kernel_%s_%s_dk%d_dv%d",
-            "flash_attn_ext",
-            ggml_type_name(op->src[1]->type),
-            dk,
-            dv);
+    // Support mixed K/V types for turbo auto-asymmetric
+    if (op->src[1]->type != op->src[2]->type) {
+        snprintf(base, 256, "kernel_%s_%s_v%s_dk%d_dv%d",
+                "flash_attn_ext",
+                ggml_type_name(op->src[1]->type),
+                ggml_type_name(op->src[2]->type),
+                dk, dv);
+    } else {
+        snprintf(base, 256, "kernel_%s_%s_dk%d_dv%d",
+                "flash_attn_ext",
+                ggml_type_name(op->src[1]->type),
+                dk, dv);
+    }
 
     snprintf(name, 256, "%s_mask=%d_sinks=%d_bias=%d_scap=%d_kvpad=%d_bcm=%d_ns10=%d_ns20=%d_nsg=%d",
             base,
@@ -1384,11 +1392,18 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext_v
     const int32_t ns10 = op->src[1]->nb[1]/op->src[1]->nb[0];
     const int32_t ns20 = op->src[2]->nb[1]/op->src[2]->nb[0];
 
-    snprintf(base, 256, "kernel_%s_%s_dk%d_dv%d",
-            "flash_attn_ext_vec",
-            ggml_type_name(op->src[1]->type),
-            dk,
-            dv);
+    if (op->src[1]->type != op->src[2]->type) {
+        snprintf(base, 256, "kernel_%s_%s_v%s_dk%d_dv%d",
+                "flash_attn_ext_vec",
+                ggml_type_name(op->src[1]->type),
+                ggml_type_name(op->src[2]->type),
+                dk, dv);
+    } else {
+        snprintf(base, 256, "kernel_%s_%s_dk%d_dv%d",
+                "flash_attn_ext_vec",
+                ggml_type_name(op->src[1]->type),
+                dk, dv);
+    }
 
     snprintf(name, 256, "%s_mask=%d_sink=%d_bias=%d_scap=%d_kvpad=%d_ns10=%d_ns20=%d_nsg=%d_nwg=%d",
             base,
diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m
@@ -1157,7 +1157,12 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
                 return false;
             }
             if (op->src[1]->type != op->src[2]->type) {
-                return false;
+                // Allow mixed turbo/rq types (auto-asymmetric K/V)
+                const bool k_is_turbo = (op->src[1]->type >= GGML_TYPE_TURBO3_1 && op->src[1]->type <= GGML_TYPE_RQ6_1);
+                const bool v_is_turbo = (op->src[2]->type >= GGML_TYPE_TURBO3_1 && op->src[2]->type <= GGML_TYPE_RQ6_1);
+                if (!(k_is_turbo && v_is_turbo)) {
+                    return false;
+                }
             }
             return has_simdgroup_mm; // TODO: over-restricted for vec-kernels
         case GGML_OP_SSM_CONV:
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -2633,7 +2633,8 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
     GGML_ASSERT(ne00 % 4 == 0);
 
     GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
-    GGML_ASSERT(op->src[1]->type == op->src[2]->type);
+    // Allow mixed turbo/rq K/V types for auto-asymmetric
+    // GGML_ASSERT(op->src[1]->type == op->src[2]->type);
 
     //GGML_ASSERT(ggml_are_same_shape (src1, src2));
     GGML_ASSERT(ne11 == ne21);
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
@@ -6702,6 +6702,16 @@ template [[host_name("kernel_flash_attn_ext_rq4_1_dk64_dv64")]] kernel flash_att
 template [[host_name("kernel_flash_attn_ext_rq5_1_dk64_dv64")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_rq5_1, 4, dequantize_rq5_1, block_rq5_1, 4, dequantize_rq5_1, 64, 64>;
 template [[host_name("kernel_flash_attn_ext_rq6_1_dk64_dv64")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_rq6_1, 4, dequantize_rq6_1, block_rq6_1, 4, dequantize_rq6_1, 64, 64>;
 
+// Mixed K/V type flash attention kernels (auto-asymmetric: K gets 1 more bit than V)
+// turbo4_1 → K=turbo5_1, V=turbo4_1
+template [[host_name("kernel_flash_attn_ext_turbo5_1_vturbo4_1_dk64_dv64")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_turbo5_1, 4, dequantize_turbo5_1, block_turbo4_1, 4, dequantize_turbo4_1, 64, 64>;
+// turbo3_1 → K=turbo4_1, V=turbo3_1
+template [[host_name("kernel_flash_attn_ext_turbo4_1_vturbo3_1_dk64_dv64")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_turbo4_1, 4, dequantize_turbo4_1, block_turbo3_1, 4, dequantize_turbo3_1, 64, 64>;
+// rq4_1 → K=rq5_1, V=rq4_1
+template [[host_name("kernel_flash_attn_ext_rq5_1_vrq4_1_dk64_dv64")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_rq5_1, 4, dequantize_rq5_1, block_rq4_1, 4, dequantize_rq4_1, 64, 64>;
+// rq3_1 → K=rq4_1, V=rq3_1
+template [[host_name("kernel_flash_attn_ext_rq4_1_vrq3_1_dk64_dv64")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_rq4_1, 4, dequantize_rq4_1, block_rq3_1, 4, dequantize_rq3_1, 64, 64>;
+
 #undef FA_TYPES
 #undef FA_TYPES_BF
 #undef FA_TYPES_F32
@@ -7320,6 +7330,12 @@ template [[host_name("kernel_flash_attn_ext_vec_rq4_1_dk64_dv64")]] kernel flash
 template [[host_name("kernel_flash_attn_ext_vec_rq5_1_dk64_dv64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_rq5_1, 16, dequantize_rq5_1_t4, block_rq5_1, 16, dequantize_rq5_1_t4, 64, 64, 2>;
 template [[host_name("kernel_flash_attn_ext_vec_rq6_1_dk64_dv64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_rq6_1, 16, dequantize_rq6_1_t4, block_rq6_1, 16, dequantize_rq6_1_t4, 64, 64, 2>;
 
+// Mixed K/V vec flash attention kernels (auto-asymmetric)
+template [[host_name("kernel_flash_attn_ext_vec_turbo5_1_vturbo4_1_dk64_dv64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_turbo5_1, 16, dequantize_turbo5_1_t4, block_turbo4_1, 16, dequantize_turbo4_1_t4, 64, 64, 2>;
+template [[host_name("kernel_flash_attn_ext_vec_turbo4_1_vturbo3_1_dk64_dv64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_turbo4_1, 16, dequantize_turbo4_1_t4, block_turbo3_1, 16, dequantize_turbo3_1_t4, 64, 64, 2>;
+template [[host_name("kernel_flash_attn_ext_vec_rq5_1_vrq4_1_dk64_dv64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_rq5_1, 16, dequantize_rq5_1_t4, block_rq4_1, 16, dequantize_rq4_1_t4, 64, 64, 2>;
+template [[host_name("kernel_flash_attn_ext_vec_rq4_1_vrq3_1_dk64_dv64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_rq4_1, 16, dequantize_rq4_1_t4, block_rq3_1, 16, dequantize_rq3_1_t4, 64, 64, 2>;
+
 #undef FA_TYPES
 #undef FA_TYPES_F32
 

Original file line number	Diff line number	Diff line change
`@@ -1157,7 +1157,12 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te`
`1157`	`1157`	`return false;`
`1158`	`1158`	`}`
`1159`	`1159`	`if (op->src[1]->type != op->src[2]->type) {`
`1160`		`- return false;`
	`1160`	`+ // Allow mixed turbo/rq types (auto-asymmetric K/V)`
	`1161`	`+ const bool k_is_turbo = (op->src[1]->type >= GGML_TYPE_TURBO3_1 && op->src[1]->type <= GGML_TYPE_RQ6_1);`
	`1162`	`+ const bool v_is_turbo = (op->src[2]->type >= GGML_TYPE_TURBO3_1 && op->src[2]->type <= GGML_TYPE_RQ6_1);`
	`1163`	`+ if (!(k_is_turbo && v_is_turbo)) {`
	`1164`	`+ return false;`
	`1165`	`+ }`
`1161`	`1166`	`}`
`1162`	`1167`	`return has_simdgroup_mm; // TODO: over-restricted for vec-kernels`
`1163`	`1168`	`case GGML_OP_SSM_CONV:`