Metal backend: Add SDPA head_dim=256 support (#18875)

manuelcandales · web-flow · commit 4898af2da93b · 2026-04-20T15:30:25.000-04:00
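Qwen 3.5 MoE uses head_dim=256 for its full attention layers, but the
existing SDPA Metal kernel was only instantiated for head_dim 64, 96,
and 128. This change adds a 256 instantiation, accepts 256 in the
runtime head_dim check, and registers an sdpa_head_dim_256 test module.
At D=256 each thread handles 8 QK elements (8 x 32 threads = 256 dims);
register pressure and threadgroup memory are well within Apple GPU
limits.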
diff --git a/backends/apple/metal/runtime/ops/op_sdpa.mm b/backends/apple/metal/runtime/ops/op_sdpa.mm
@@ -226,7 +226,8 @@
 #define INSTANTIATE_SDPA_VECTOR_HEADS(DTYPE)        \
   INSTANTIATE_SDPA_VECTOR(DTYPE, 64, 64);           \
   INSTANTIATE_SDPA_VECTOR(DTYPE, 96, 96);           \
-  INSTANTIATE_SDPA_VECTOR(DTYPE, 128, 128);
+  INSTANTIATE_SDPA_VECTOR(DTYPE, 128, 128);         \
+  INSTANTIATE_SDPA_VECTOR(DTYPE, 256, 256);
 
 INSTANTIATE_SDPA_VECTOR_HEADS(float);
 INSTANTIATE_SDPA_VECTOR_HEADS(bfloat);
@@ -430,11 +431,11 @@ AOTITorchError aoti_torch_mps__scaled_dot_product_attention_math_for_mps(
         throw std::runtime_error("Unsupported dtype for Metal SDPA kernel");
       }
 
-      // Select head_dim - must match exactly one of the supported sizes (64, 96, 128)
+      // Select head_dim - must match exactly one of the supported sizes (64, 96, 128, 256)
       int64_t head_dim = headSize;
-      if (head_dim != 64 && head_dim != 96 && head_dim != 128) {
-        ET_LOG(Error, "aoti_torch_mps__scaled_dot_product_attention_math_for_mps: Unsupported head_dim %lld (must be 64, 96, or 128)", head_dim);
-        throw std::runtime_error("Unsupported head_dim for Metal SDPA kernel - must be exactly 64, 96, or 128");
+      if (head_dim != 64 && head_dim != 96 && head_dim != 128 && head_dim != 256) {
+        ET_LOG(Error, "aoti_torch_mps__scaled_dot_product_attention_math_for_mps: Unsupported head_dim %lld (must be 64, 96, 128, or 256)", head_dim);
+        throw std::runtime_error("Unsupported head_dim for Metal SDPA kernel - must be exactly 64, 96, 128, or 256");
       }
 
       std::string kernel_name = "sdpa_vector_" + type_name + "_" + std::to_string(head_dim) + "_" + std::to_string(head_dim);
diff --git a/backends/apple/metal/tests/test_modules.py b/backends/apple/metal/tests/test_modules.py
@@ -639,6 +639,31 @@ def __init__(self):
 }
 
 
+# -------------------------------------------------------------------------
+# SDPA with head_dim=256 (Qwen 3.5 MoE)
+# -------------------------------------------------------------------------
+
+
+class SDPAHeadDim256(nn.Module):
+    """SDPA with head_dim=256, required by Qwen 3.5 MoE full attention layers."""
+
+    def forward(
+        self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
+    ) -> torch.Tensor:
+        return torch.nn.functional.scaled_dot_product_attention(
+            query, key, value, dropout_p=0.0, is_causal=False
+        )
+
+
+MODULE_REGISTRY["sdpa_head_dim_256"] = {
+    "model_class": SDPAHeadDim256,
+    "input_shapes": [(1, 4, 8, 256), (1, 4, 8, 256), (1, 4, 8, 256)],
+    "description": "SDPA with head_dim=256 (Qwen 3.5 MoE)",
+    "atol_float32": 1e-4,
+    "atol_bfloat16": 5e-2,
+}
+
+
 # =============================================================================
 # Helper Functions
 # =============================================================================