
Commit c5b2a1c

test(attention): cover merged causal- and attn_mask-bias flash branches
Address CodeRabbit review on PR #8842:

- Narrow the use_flash_attention docstring in SABlock and CrossAttentionBlock so it reflects the actual implementation: pure causal masking keeps the fast path via is_causal=True; only an additive bias (rel_pos_embedding, or causal/attn_mask merged with another bias) forces SDPA to fall back to the memory-efficient or cuDNN backend.
- Extend the numerical-equivalence tests to cover the new merged-bias paths: causal=True + rel_pos_embedding for both blocks, and attn_mask + rel_pos_embedding for SABlock. All cases check assert_allclose(out_flash, out_ref, atol=1e-4) on 2D and 3D inputs.

Signed-off-by: Soumya Snigdha Kundu <soumya_snigdha.kundu@kcl.ac.uk>
1 parent: b7d4786

4 files changed: 86 additions & 6 deletions


monai/networks/blocks/crossattention.py

Lines changed: 5 additions & 3 deletions
@@ -63,9 +63,11 @@ def __init__(
             attention_dtype: cast attention operations to this dtype.
             use_flash_attention: if True, dispatch attention through
                 ``torch.nn.functional.scaled_dot_product_attention``. PyTorch selects the backend;
-                the true flash kernel is used only when no attention bias is present. When combined
-                with ``rel_pos_embedding`` or ``causal``, PyTorch will fall back to the
-                memory-efficient or cuDNN SDPA backend.
+                the true flash kernel is used when no custom additive attention bias is passed.
+                Pure ``causal`` masking (with no ``rel_pos_embedding``) keeps the fast path via
+                ``is_causal=True``. When an additive bias is required (for example,
+                ``rel_pos_embedding``, or ``causal`` merged with another bias), PyTorch falls
+                back to the memory-efficient or cuDNN SDPA backend.
         """
 
         super().__init__()

monai/networks/blocks/selfattention.py

Lines changed: 5 additions & 3 deletions
@@ -65,9 +65,11 @@ def __init__(
             use_combined_linear: whether to use a single linear layer for qkv projection, default to True.
             use_flash_attention: if True, dispatch attention through
                 ``torch.nn.functional.scaled_dot_product_attention``. PyTorch selects the backend;
-                the true flash kernel is used only when no attention bias is present. When combined
-                with ``rel_pos_embedding``, ``causal``, or ``attn_mask``, PyTorch will fall back to
-                the memory-efficient or cuDNN SDPA backend.
+                the true flash kernel is used when no custom additive attention bias is passed.
+                Pure ``causal`` masking (with no ``rel_pos_embedding`` or ``attn_mask``) keeps the
+                fast path via ``is_causal=True``. When an additive bias is required (for example,
+                ``rel_pos_embedding``, or ``causal``/``attn_mask`` merged with another bias),
+                PyTorch falls back to the memory-efficient or cuDNN SDPA backend.
 
         """
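For readers skimming the diff, here is a minimal, illustrative sketch (not MONAI source) of the dispatch rule both docstrings describe: with no additive bias, pure causal masking is expressed as ``is_causal=True`` and the true flash kernel stays eligible; once a bias is needed, the causal mask is merged into it and passed via SDPA's ``attn_mask``, which pushes PyTorch to the memory-efficient or cuDNN backend. The ``dispatch_sdpa`` helper and its argument names are hypothetical, not part of the MONAI API.

import torch
import torch.nn.functional as F

def dispatch_sdpa(q, k, v, causal=False, bias=None):
    # q, k, v: (batch, heads, seq_len, head_dim); bias: optional additive float
    # mask broadcastable to (batch, heads, seq_len, seq_len).
    if bias is None:
        # No custom bias: pure causal masking goes through is_causal=True,
        # keeping the flash kernel eligible.
        return F.scaled_dot_product_attention(q, k, v, is_causal=causal)
    if causal:
        # Causal masking has to be merged into the additive bias, so is_causal
        # is dropped and SDPA selects the memory-efficient or cuDNN backend.
        seq_len = q.shape[-2]
        upper = torch.ones(seq_len, seq_len, dtype=torch.bool, device=q.device).triu(diagonal=1)
        bias = bias.masked_fill(upper, float("-inf"))
    return F.scaled_dot_product_attention(q, k, v, attn_mask=bias)

# Example: same q/k/v, with and without a (hypothetical) relative positional bias.
q = k = v = torch.randn(2, 4, 16, 32)
rel_bias = torch.randn(1, 4, 16, 16)
out_causal_only = dispatch_sdpa(q, k, v, causal=True)                  # flash-eligible path
out_merged_bias = dispatch_sdpa(q, k, v, causal=True, bias=rel_bias)   # merged-bias fallback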

tests/networks/blocks/test_crossattention.py

Lines changed: 25 additions & 0 deletions
@@ -92,6 +92,31 @@ def test_rel_pos_embedding_with_flash_attention(self):
                 out_ref = block_ref(test_data)
             assert_allclose(out_flash, out_ref, atol=1e-4)
 
+    @skipUnless(has_einops, "Requires einops")
+    def test_causal_rel_pos_with_flash_attention(self):
+        # Exercise the merged causal-bias branch: causal=True together with
+        # rel_pos_embedding builds an additive bias and disables is_causal.
+        for input_size in [(16, 32), (8, 8, 8)]:
+            seq_len = int(np.prod(input_size))
+            input_param = {
+                "hidden_size": 128,
+                "num_heads": 4,
+                "dropout_rate": 0.0,
+                "rel_pos_embedding": RelPosEmbedding.DECOMPOSED,
+                "input_size": input_size,
+                "causal": True,
+                "sequence_length": seq_len,
+            }
+            device = "cuda:0" if torch.cuda.is_available() else "cpu"
+            block_flash = CrossAttentionBlock(**input_param, use_flash_attention=True).to(device)
+            block_ref = CrossAttentionBlock(**input_param, use_flash_attention=False).to(device)
+            block_ref.load_state_dict(block_flash.state_dict())
+            test_data = torch.randn(2, seq_len, 128).to(device)
+            with eval_mode(block_flash), eval_mode(block_ref):
+                out_flash = block_flash(test_data)
+                out_ref = block_ref(test_data)
+            assert_allclose(out_flash, out_ref, atol=1e-4)
+
     @skipUnless(has_einops, "Requires einops")
     def test_attention_dim_not_multiple_of_heads(self):
         with self.assertRaises(ValueError):

tests/networks/blocks/test_selfattention.py

Lines changed: 51 additions & 0 deletions
@@ -90,6 +90,57 @@ def test_rel_pos_embedding_with_flash_attention(self):
                 out_ref = block_ref(test_data)
             assert_allclose(out_flash, out_ref, atol=1e-4)
 
+    @skipUnless(has_einops, "Requires einops")
+    def test_causal_rel_pos_with_flash_attention(self):
+        # Exercise the merged causal-bias branch: causal=True together with
+        # rel_pos_embedding builds an additive bias and disables is_causal,
+        # so flash and reference paths must still match numerically.
+        for input_size in [(16, 32), (8, 8, 8)]:
+            seq_len = int(np.prod(input_size))
+            input_param = {
+                "hidden_size": 128,
+                "num_heads": 4,
+                "dropout_rate": 0.0,
+                "rel_pos_embedding": RelPosEmbedding.DECOMPOSED,
+                "input_size": input_size,
+                "causal": True,
+                "sequence_length": seq_len,
+            }
+            device = "cuda:0" if torch.cuda.is_available() else "cpu"
+            block_flash = SABlock(**input_param, use_flash_attention=True).to(device)
+            block_ref = SABlock(**input_param, use_flash_attention=False).to(device)
+            block_ref.load_state_dict(block_flash.state_dict())
+            test_data = torch.randn(2, seq_len, 128).to(device)
+            with eval_mode(block_flash), eval_mode(block_ref):
+                out_flash = block_flash(test_data)
+                out_ref = block_ref(test_data)
+            assert_allclose(out_flash, out_ref, atol=1e-4)
+
+    @skipUnless(has_einops, "Requires einops")
+    def test_attn_mask_rel_pos_with_flash_attention(self):
+        # Exercise the user-attn-mask + rel_pos branch: the user mask is
+        # merged into the additive bias passed via SDPA's attn_mask argument.
+        for input_size in [(16, 32), (8, 8, 8)]:
+            seq_len = int(np.prod(input_size))
+            input_param = {
+                "hidden_size": 128,
+                "num_heads": 4,
+                "dropout_rate": 0.0,
+                "rel_pos_embedding": RelPosEmbedding.DECOMPOSED,
+                "input_size": input_size,
+            }
+            device = "cuda:0" if torch.cuda.is_available() else "cpu"
+            block_flash = SABlock(**input_param, use_flash_attention=True).to(device)
+            block_ref = SABlock(**input_param, use_flash_attention=False).to(device)
+            block_ref.load_state_dict(block_flash.state_dict())
+            test_data = torch.randn(2, seq_len, 128).to(device)
+            attn_mask = torch.ones(2, seq_len, dtype=torch.bool, device=device)
+            attn_mask[:, seq_len // 2 :] = False  # mask out the second half
+            with eval_mode(block_flash), eval_mode(block_ref):
+                out_flash = block_flash(test_data, attn_mask=attn_mask)
+                out_ref = block_ref(test_data, attn_mask=attn_mask)
+            assert_allclose(out_flash, out_ref, atol=1e-4)
+
     def test_save_attn_with_flash_attention(self):
         with self.assertRaises(ValueError):
             SABlock(hidden_size=128, num_heads=3, dropout_rate=0.1, use_flash_attention=True, save_attn=True)
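Assuming a standard MONAI development checkout with pytest available, the new branches can be exercised directly (the selection expression below is illustrative):

python -m pytest tests/networks/blocks/test_selfattention.py tests/networks/blocks/test_crossattention.py -k "flash_attention"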
