Re-enable warp-agnostic ROCm SDPA kernel

NripeshN · NripeshN · commit af26ee92bd0f · 2026-02-07T18:22:39.000Z
Re-enable the optimized SDPA kernel using the warp-size-agnostic
implementation. The kernel uses 32-thread tiles so that it behaves
consistently across RDNA and CDNA architectures.

The memory fault still occurs with the optimized SDPA path disabled,
so the issue appears to be elsewhere in the inference pipeline, not
in SDPA.
diff --git a/mlx/backend/rocm/scaled_dot_product_attention.hip b/mlx/backend/rocm/scaled_dot_product_attention.hip
@@ -216,9 +216,26 @@ bool supports_sdpa_vector(
     bool has_arr_mask,
     bool do_causal,
     bool output_logsumexp) {
-  // Temporarily disable optimized SDPA to debug memory fault
-  // The memory fault occurs even with SDPA disabled, so the issue is elsewhere
-  return false;
+  if (output_logsumexp) {
+    return false;
+  }
+
+  // Check for supported dtypes
+  if (q.dtype() != float32 && q.dtype() != float16 && q.dtype() != bfloat16) {
+    return false;
+  }
+
+  const int value_head_dim = v.shape(-1);
+  const int query_head_dim = q.shape(-1);
+  const int query_sequence_length = q.shape(2);
+
+  const bool sdpa_supported_head_dim = query_head_dim == value_head_dim &&
+      (query_head_dim == 64 || query_head_dim == 96 || query_head_dim == 128);
+
+  const bool supported_vector_config =
+      sdpa_supported_head_dim && query_sequence_length < 4;
+
+  return supported_vector_config && !has_arr_mask;
 }
 
 void sdpa_vector(