fix flash_attn_supported override for cross-attention causal mask

sudhakarsingh27 · sudhakarsingh27 · commit 6c94b36cf393 · 2026-04-08T22:12:19.000-07:00
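Factor out the cross_attn_causal check so that it guards both the FA3
and the FA2 branches of the flash_attn_supported override. Previously
the check sat only inside the FA2 branch (`and` binds tighter than
`or`), so with pad_between_seqs on SM90 and FA3 installed the test
still set flash_attn_supported = True. This caused no-backend errors,
because flash attention doesn't support a non-bottom-right causal mask
with different Q/KV sequence lengths.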
diff --git a/tests/pytorch/attention/test_attention.py b/tests/pytorch/attention/test_attention.py
@@ -197,21 +197,20 @@ def test_dot_product_attention(
         )
         flash_attn_supported, fused_attn_supported, unfused_attn_supported = available_backends
 
-    # FA3 natively supports pad_between_seqs via seqused_q/seqused_k.
-    # FA2 does not support pad_between_seqs
-    # Flash Attention is not supported on SM > 90
-    if (
-        pad_between_seqs
-        and FlashAttentionUtils.v3_is_installed
-        and get_device_compute_capability() == (9, 0)
+    # Flash Attention requires bottom-right-diagonal causal mask for cross-attention
+    cross_attn_causal = (
+        config.max_seqlen_q != config.max_seqlen_kv
+        and config.attn_mask_type in ["causal", "padding_causal"]
+    )
+    sm = get_device_compute_capability()
+    # FA3 natively supports pad_between_seqs via seqused_q/seqused_k (SM90 only).
+    # FA2 does not support pad_between_seqs and is not available on SM >= 100.
+    if not cross_attn_causal and (
+        pad_between_seqs and FlashAttentionUtils.v3_is_installed and sm == (9, 0)
         or not pad_between_seqs
         and FlashAttentionUtils.is_installed
-        and not (
-            config.max_seqlen_q != config.max_seqlen_kv
-            and config.attn_mask_type in ["causal", "padding_causal"]
-        )
         and (config.window_size[0] == -1 or FlashAttentionUtils.v2_3_plus)
-        and get_device_compute_capability() < (10, 0)
+        and sm < (10, 0)
     ):
         flash_attn_supported = True