NVIDIA
diff --git a/tensorrt_llm/_torch/visual_gen/attention_backend/flash_attn4.py b/tensorrt_llm/_torch/visual_gen/attention_backend/flash_attn4.py
Lines changed: 29 additions & 2 deletions
diff --git a/tensorrt_llm/_torch/visual_gen/attention_backend/vanilla.py b/tensorrt_llm/_torch/visual_gen/attention_backend/vanilla.py
Lines changed: 21 additions & 6 deletions
diff --git a/tensorrt_llm/_torch/visual_gen/models/ltx2/ltx2_core/transformer_args.py b/tensorrt_llm/_torch/visual_gen/models/ltx2/ltx2_core/transformer_args.py
Lines changed: 5 additions & 0 deletions
@@ -71,12 +71,14 @@ def _fwd(
         k: torch.Tensor,
         v: torch.Tensor,
         causal: bool,
+        seqused_k: Optional[torch.Tensor] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """Calls _flash_attn_fwd with torch.compile disabled. Returns (output, lse)."""
         output, lse = _flash_attn_fwd(
             q,
             k,
             v,
+            seqused_k=seqused_k,
             softmax_scale=self.scale,
             causal=causal,
             window_size_left=None,
@@ -120,6 +122,7 @@ def forward(
         v: torch.Tensor,
         *,
         attention_mask: PredefinedAttentionMask = PredefinedAttentionMask.FULL,
+        key_padding_mask: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> torch.Tensor:
         """
@@ -132,11 +135,21 @@ def forward(
             k: Key tensor [batch_size, seq_len_kv, num_kv_heads, head_dim]
             v: Value tensor [batch_size, seq_len_kv, num_kv_heads, head_dim]
             attention_mask: Attention mask type (CAUSAL or FULL)
+            key_padding_mask: Optional ``[B, S_kv]`` bool tensor; True = valid,
+                False = pad. Translated to FA4's ``seqused_k = mask.sum(dim=1)``
+                (assumes True-prefix layout). Non-causal only.
 
         Returns:
             Output tensor [batch_size, seq_len, num_heads, head_dim]
         """
-        output, _ = self.forward_with_lse(q, k, v, attention_mask=attention_mask, **kwargs)
+        output, _ = self.forward_with_lse(
+            q,
+            k,
+            v,
+            attention_mask=attention_mask,
+            key_padding_mask=key_padding_mask,
+            **kwargs,
+        )
         return output
 
     def forward_with_lse(
@@ -145,6 +158,7 @@ def forward_with_lse(
         k: torch.Tensor,
         v: torch.Tensor,
         attention_mask: PredefinedAttentionMask = PredefinedAttentionMask.FULL,
+        key_padding_mask: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """
@@ -157,7 +171,20 @@ def forward_with_lse(
                     partial attention results in Attention2D parallelism.
         """
         q, k, v, is_causal, origin_dtype = self._prepare_inputs(q, k, v, attention_mask)
-        output, lse = self._fwd(q, k, v, is_causal)
+        seqused_k = None
+        if key_padding_mask is not None:
+            assert not is_causal, "key_padding_mask is not supported with causal attention"
+            assert key_padding_mask.dim() == 2 and key_padding_mask.shape == (
+                q.shape[0],
+                k.shape[1],
+            ), (
+                f"Invalid key_padding_mask shape: expected [B={q.shape[0]}, "
+                f"S_kv={k.shape[1]}], got {tuple(key_padding_mask.shape)}"
+            )
+            # FA4 seqused_k assumes a True-prefix layout: positions [0, valid)
+            # are kept, [valid, S_kv) are masked. mask.sum gives the prefix length.
+            seqused_k = key_padding_mask.sum(dim=1).to(torch.int32)
+        output, lse = self._fwd(q, k, v, is_causal, seqused_k=seqused_k)
         if output.dtype != origin_dtype:
             output = output.to(origin_dtype)
         return output, lse
 
@@ -71,6 +71,7 @@ def forward(
         v: torch.Tensor,
         *,
         attention_mask: PredefinedAttentionMask = PredefinedAttentionMask.FULL,
+        key_padding_mask: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> torch.Tensor:
         """
@@ -83,6 +84,9 @@ def forward(
             k: Key tensor [batch_size, num_kv_heads, seq_len_kv, head_dim]
             v: Value tensor [batch_size, num_kv_heads, seq_len_kv, head_dim]
             attention_mask: Attention mask type (CAUSAL or FULL)
+            key_padding_mask: Optional ``[B, S_kv]`` bool tensor; True = valid,
+                False = pad. Expanded internally to ``[B, 1, 1, S_kv]`` and
+                passed as ``attn_mask`` to SDPA. Non-causal only.
 
         Returns:
             Output tensor [batch_size, num_heads, seq_len, head_dim]
@@ -99,13 +103,24 @@ def forward(
             f"Invalid v shape: expected [B={q.shape[0]}, H_kv, S_kv, D={self.head_dim}], got {v.shape}"
         )
 
+        enable_gqa = self.num_heads != self.num_kv_heads
+        if key_padding_mask is not None:
+            assert not is_causal, "key_padding_mask is not supported with causal attention"
+            assert key_padding_mask.dim() == 2 and key_padding_mask.shape == (
+                q.shape[0],
+                k.shape[2],
+            ), (
+                f"Invalid key_padding_mask shape: expected [B={q.shape[0]}, "
+                f"S_kv={k.shape[2]}], got {tuple(key_padding_mask.shape)}"
+            )
+            # [B, S_kv] -> [B, 1, 1, S_kv] so SDPA broadcasts over H and S_q.
+            attn_mask = key_padding_mask[:, None, None, :]
+            return F.scaled_dot_product_attention(
+                q, k, v, attn_mask=attn_mask, scale=self.scale, enable_gqa=enable_gqa
+            )
+
         return F.scaled_dot_product_attention(
-            q,
-            k,
-            v,
-            is_causal=is_causal,
-            scale=self.scale,
-            enable_gqa=self.num_heads != self.num_kv_heads,
+            q, k, v, is_causal=is_causal, scale=self.scale, enable_gqa=enable_gqa
         )
 
     @property
 
@@ -32,6 +32,11 @@ class TransformerArgs:
     cross_scale_shift_timestep: torch.Tensor | None
     cross_gate_timestep: torch.Tensor | None
     enabled: bool
+    # Optional [B, S_full_padded] bool mask (True=valid, False=pad) for the
+    # audio modality when Ulysses padding is engaged (T_a padded to be
+    # divisible by ulysses_size). Identical across Ulysses ranks (full-seq).
+    # None when no padding is applied.
+    audio_padding_mask: torch.Tensor | None = None
 
 
 class TransformerArgsPreprocessor: