fix test skips for FA3 pad_between_seqs and deterministic CP tests

sudhakarsingh27 · sudhakarsingh27 · commit 439caa9186eb · 2026-04-08T21:44:10.000-07:00
- test_attention.py: For thd + pad_between_seqs, override flash_attn_supported
  only when FA3 is installed and the device is SM90. The FA2 path is retained
  for cases without pad_between_seqs.
- test_attention_with_cp.py: In deterministic mode
  (NVTE_ALLOW_NONDETERMINISTIC_ALGO=0), skip fused attention CP tests for
  non-vanilla softmax configs and for post_scale_bias in training (bias
  requires_grad), since no deterministic cuDNN backend is available for them.
diff --git a/tests/pytorch/attention/test_attention.py b/tests/pytorch/attention/test_attention.py
@@ -198,11 +198,13 @@ def test_dot_product_attention(
         flash_attn_supported, fused_attn_supported, unfused_attn_supported = available_backends
 
     # FA3 natively supports pad_between_seqs via seqused_q/seqused_k.
-    # FA2 does not support pad_between_seqs, but _run_dot_product_attention
-    # manually pads and unpads the input and output of FlashAttention for testing purposes.
-    # Flash Attention is not supported on SM100+
+    # FA2 does not support pad_between_seqs
+    # Flash Attention is not supported on SM > 90
     if (
         pad_between_seqs
+        and FlashAttentionUtils.v3_is_installed
+        and get_device_compute_capability() == (9, 0)
+        or not pad_between_seqs
         and FlashAttentionUtils.is_installed
         and not (
             config.max_seqlen_q != config.max_seqlen_kv
diff --git a/tests/pytorch/attention/test_attention_with_cp.py b/tests/pytorch/attention/test_attention_with_cp.py
@@ -374,9 +374,22 @@ def test_cp_with_fused_attention(
         is_training=is_training,
     )
     _, fused_attn_supported, _ = available_backends
+
+    # Skip any tests if not supported by the configs
     if not fused_attn_supported:
         pytest.skip("No attention backend available.")
 
+    deterministic = not bool(int(os.getenv("NVTE_ALLOW_NONDETERMINISTIC_ALGO", "1")))
+    if deterministic:
+        if config.softmax_type != "vanilla":
+            pytest.skip(
+                "Deterministic mode does not support non-vanilla softmax with FusedAttention"
+            )
+        if config.attn_bias_type == "post_scale_bias" and is_training:
+            pytest.skip(
+                "Deterministic mode does not support post_scale_bias with requires_grad"
+            )
+
     run_distributed(
         get_bash_arguments(
             num_gpus_per_node=num_gpus,