Avoid full mask allocation in unfused padding causal attention

Rahul Mangalampalli · Rahul Mangalampalli · commit 82f0e0e688fe · 2026-06-08T11:18:54.000-07:00
diff --git a/tests/pytorch/attention/test_attention.py b/tests/pytorch/attention/test_attention.py
@@ -26,6 +26,8 @@
 from transformer_engine.pytorch.attention.dot_product_attention import (
     _attention_backends,
 )
+from transformer_engine.pytorch.attention.dot_product_attention import backends as dpa_backends
+import transformer_engine.pytorch.attention.dot_product_attention.utils as dpa_utils
 from transformer_engine.pytorch.attention.dot_product_attention.utils import (
     FlashAttentionUtils,
     check_set_window_size,
@@ -647,6 +649,75 @@ def test_dpa_mask(dtype, model_configs, model):
     test_dot_product_attention(dtype, model_configs, model, False, True, None, False, False)
 
 
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required.")
+def test_unfused_thd_padding_causal_uses_sdpa_without_full_mask(monkeypatch):
+    """Unfused THD padding_causal matches per-sequence SDPA without calling get_full_mask."""
+    reset_rng_states()
+    batch_size = 2
+    num_heads = 2
+    head_dim = 16
+    seqlens = torch.tensor([3, 5], dtype=torch.int32, device="cuda")  # unequal lengths
+    cu_seqlens = torch.zeros(batch_size + 1, dtype=torch.int32, device="cuda")
+    cu_seqlens[1:] = torch.cumsum(seqlens, dim=0)  # cumulative offsets, leading 0
+    total_seqlen = int(cu_seqlens[-1].item())
+    max_seqlen = int(seqlens.max().item())
+
+    query = torch.randn(
+        total_seqlen, num_heads, head_dim, dtype=torch.float16, device="cuda", requires_grad=True
+    )
+    key = torch.randn_like(query, requires_grad=True)
+    value = torch.randn_like(query, requires_grad=True)
+    softmax_scale = head_dim**-0.5
+
+    expected = []
+    with torch.no_grad():  # reference values only; gradients are checked on the module output
+        for batch_id in range(batch_size):
+            start = int(cu_seqlens[batch_id].item())
+            end = int(cu_seqlens[batch_id + 1].item())
+            q = query[start:end].permute(1, 0, 2).unsqueeze(0)  # [s, h, d] -> [1, h, s, d]
+            k = key[start:end].permute(1, 0, 2).unsqueeze(0)
+            v = value[start:end].permute(1, 0, 2).unsqueeze(0)
+            expected.append(
+                torch.nn.functional.scaled_dot_product_attention(
+                    q, k, v, dropout_p=0.0, is_causal=True, scale=softmax_scale
+                )
+                .squeeze(0)
+                .permute(1, 0, 2)
+                .reshape(end - start, -1)  # back to [s, h * d]
+            )
+    expected = torch.cat(expected, dim=0)
+
+    def fail_get_full_mask(*args, **kwargs):  # stand-in that trips if the mask path runs
+        raise AssertionError("get_full_mask should not be called for this path")
+
+    monkeypatch.setattr(dpa_utils, "get_full_mask", fail_get_full_mask)
+
+    attention = dpa_backends.UnfusedDotProductAttention(
+        softmax_scale=softmax_scale,
+        attention_type="self",
+        attention_dropout=0.0,
+    ).eval()
+    output = attention(
+        {},  # _alibi_cache
+        query,
+        key,
+        value,
+        qkv_layout="thd_thd_thd",
+        cu_seqlens_q=cu_seqlens,
+        cu_seqlens_kv=cu_seqlens,
+        max_seqlen_q=max_seqlen,
+        max_seqlen_kv=max_seqlen,
+        attn_mask_type="padding_causal",
+        window_size=(-1, 0),
+    )
+
+    torch.testing.assert_close(output, expected, rtol=1e-3, atol=1e-3)
+    output.float().sum().backward()  # the exercised path must remain differentiable
+    assert query.grad is not None
+    assert key.grad is not None
+    assert value.grad is not None
+
+
 model_configs_bias = {
     # test: ModelConfig(b, sq, hq, dqk)
     "bias_1_0": ModelConfig(4, 128, 16, 64, attn_bias_type="post_scale_bias"),
diff --git a/transformer_engine/pytorch/attention/dot_product_attention/backends.py b/transformer_engine/pytorch/attention/dot_product_attention/backends.py
@@ -365,6 +365,111 @@ def fast_setattr(self, name: str, value: Any) -> None:
         """Fast attribute set for non-parameter fields."""
         self.__dict__[name] = value
 
+    def _use_varlen_sdpa(
+        self,
+        attn_mask_type: str,
+        attention_mask: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]],
+        window_size: Optional[Tuple[int, int]],
+        core_attention_bias_type: str,
+        alibi_slopes: Optional[torch.Tensor],
+        fp8: bool,
+    ) -> bool:
+        """Whether PyTorch SDPA can replace unfused attention without materializing masks."""
+        if self.attention_type != "self":
+            return False
+        if attn_mask_type != "padding_causal":
+            return False
+        if window_size not in [None, (-1, 0), (-1, -1)]:
+            return False
+        if attn_mask_type == "padding_causal" and attention_mask is None:
+            return False
+        if isinstance(attention_mask, tuple):
+            return False
+        return (
+            core_attention_bias_type == "no_bias"
+            and self.attention_dropout.p == 0.0
+            and alibi_slopes is None
+            and self.softmax_type == "vanilla"
+            and not self.return_max_logit
+            and not fp8
+        )
+
+    def _format_context(
+        self,
+        context_layer: torch.Tensor,
+        q_format: str,
+        max_seqlen_q: int,
+        batch_size: int,
+        cu_seqlens_q: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        """Convert context from [b, h, sq, d] to the requested output layout."""
+        if q_format == "sbhd":
+            context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
+            return context_layer.view(max_seqlen_q, batch_size, -1)
+        if q_format == "bshd":
+            context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+            return context_layer.view(batch_size, max_seqlen_q, -1)
+        if q_format == "thd":
+            context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+            context_layer = ConvertBSHDtoTHD.apply(context_layer, cu_seqlens_q)
+            return context_layer.view(context_layer.shape[0], -1)
+        raise ValueError(f"Unsupported q_format = {q_format}!")
+
+    def _forward_varlen_sdpa(
+        self,
+        query_layer: torch.Tensor,
+        key_layer: torch.Tensor,
+        value_layer: torch.Tensor,
+        q_format: str,
+        batch_size: int,
+        max_seqlen_q: int,
+        cu_seqlens_q: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        scale: float,
+    ) -> torch.Tensor:
+        """Run causal self-attention without expanding padding masks to [b, 1, sq, sk]."""
+        context_layer = torch.zeros(
+            batch_size,
+            query_layer.size(2),
+            max_seqlen_q,
+            value_layer.size(3),
+            dtype=query_layer.dtype,
+            device=query_layer.device,
+        )
+
+        if attention_mask is not None:
+            seqlens_q = attention_mask.logical_not()[:, 0, 0, :].sum(dim=1)
+        else:
+            seqlens_q = torch.full(
+                (batch_size,), max_seqlen_q, dtype=torch.int64, device=query_layer.device
+            )
+
+        dropout_p = self.attention_dropout.p if self.training else 0.0
+        with self.attention_dropout_ctx():
+            for batch_id in range(batch_size):
+                seqlen_q = int(seqlens_q[batch_id].item())
+                if seqlen_q == 0:
+                    continue
+                query = query_layer[:seqlen_q, batch_id].permute(1, 0, 2).unsqueeze(0)
+                key = key_layer[:seqlen_q, batch_id].permute(1, 0, 2).unsqueeze(0)
+                value = value_layer[:seqlen_q, batch_id].permute(1, 0, 2).unsqueeze(0)
+                context_layer[batch_id, :, :seqlen_q, :] = F.scaled_dot_product_attention(
+                    query,
+                    key,
+                    value,
+                    dropout_p=dropout_p,
+                    is_causal=True,
+                    scale=scale,
+                ).squeeze(0)
+
+        return self._format_context(
+            context_layer,
+            q_format,
+            max_seqlen_q,
+            batch_size,
+            cu_seqlens_q,
+        )
+
     def forward(
         self,
         _alibi_cache: Dict[str, Any],
@@ -457,22 +562,6 @@ def forward(
                 max_seqlen_kv,
                 self.attention_type,
             )
-        attn_mask_type, attention_mask, actual_seqlens_q, actual_seqlens_kv = (
-            dpa_utils.get_full_mask(
-                max_seqlen_q,
-                max_seqlen_kv,
-                attn_mask_type=attn_mask_type,
-                attention_mask=attention_mask,
-                window_size=window_size,
-                attention_type=self.attention_type,
-                bottom_right_alignment=(
-                    attn_mask_type not in ["causal", "padding_causal"]
-                    if bottom_right_diagonal is None
-                    else bottom_right_diagonal
-                ),
-            )
-        )
-
         apply_qk_layer_scaling = self.apply_qk_layer_scaling and key_layer.dtype == torch.float16
 
         # [b, h, sq, sk]
@@ -494,6 +583,46 @@ def forward(
                 int(query_layer.shape[2] / value_layer.shape[2]), dim=2
             )
 
+        scale = self.softmax_scale
+        if apply_qk_layer_scaling:
+            scale /= self.layer_number
+
+        if self._use_varlen_sdpa(
+            attn_mask_type,
+            attention_mask,
+            window_size,
+            core_attention_bias_type,
+            alibi_slopes,
+            fp8,
+        ):
+            return self._forward_varlen_sdpa(
+                query_layer,
+                key_layer,
+                value_layer,
+                q_format,
+                batch_size,
+                max_seqlen_q,
+                cu_seqlens_q,
+                attention_mask,
+                self.softmax_scale,
+            )
+
+        attn_mask_type, attention_mask, actual_seqlens_q, actual_seqlens_kv = (
+            dpa_utils.get_full_mask(
+                max_seqlen_q,
+                max_seqlen_kv,
+                attn_mask_type=attn_mask_type,
+                attention_mask=attention_mask,
+                window_size=window_size,
+                attention_type=self.attention_type,
+                bottom_right_alignment=(
+                    attn_mask_type not in ["causal", "padding_causal"]
+                    if bottom_right_diagonal is None
+                    else bottom_right_diagonal
+                ),
+            )
+        )
+
         # preallocting result tensor: [b * h, sq, sk]
         matmul_result = torch.empty(
             output_size[0] * output_size[1],
@@ -503,10 +632,6 @@ def forward(
             device=torch.cuda.current_device(),
         )
 
-        scale = self.softmax_scale
-        if apply_qk_layer_scaling:
-            scale /= self.layer_number
-
         if fp8:
             # get fp8 recipe for DPA
             fp8_recipe = FP8GlobalStateManager.get_fp8_recipe()