[None][feat] enable GQA and cross-attention for attn2d (#14961)

NVShreyas · web-flow · commit 54dec4f707c2 · 2026-06-11T10:09:53.000-05:00
Signed-off-by: Shreyas Misra <shreyasm@nvidia.com>
diff --git a/tensorrt_llm/_torch/visual_gen/attention_backend/parallel.py b/tensorrt_llm/_torch/visual_gen/attention_backend/parallel.py
@@ -387,7 +387,10 @@ class Attention2DAttention(AttentionBackend):
     -----------
     Ranks are arranged in a 2-D logical mesh of shape ``[row_size, col_size]``
     (total parallelism degree = ``P = row_size * col_size``).  Each rank holds a
-    ``[B, S/P, H, D]`` shard of Q, K, and V.
+    ``[B, S_q/P, H_q, D]`` shard of Q and ``[B, S_kv/P, H_kv, D]`` shards of K and V.
+    For self-attention ``S_q = S_kv`` and ``H_q = H_kv``; for GQA ``H_kv < H_q``; for
+    cross-attention ``S_kv`` may differ from ``S_q``.  K/V must be sequence-sharded
+    across the same mesh as Q (not replicated on every rank).
 
     Example for ``row_size=2, col_size=3`` (6 ranks total)::
 
@@ -401,19 +404,22 @@ class Attention2DAttention(AttentionBackend):
     Ranks in the same **column** share a ``col_process_group`` and all-gather K/V.
 
     Architecture:
-        Input:   [B, S/P, H, D]  (sequence sharded across P = row_size × col_size ranks)
-        Step 1:  Q all-gather within row group:        [B, S/P, H, D] → [B, S/col_size, H, D]
-        Step 2:  K/V fused all-gather within col group [B, S/P, H, D] → [B, S/row_size, H, D]
-                   (K and V packed into [2, B, S/P, H, D] before the gather,
+        Input:   Q [B, S_q/P, H_q, D], K/V [B, S_kv/P, H_kv, D]
+                 (sequence sharded across P = row_size × col_size ranks)
+        Step 1:  Q all-gather within row group:
+                   [B, S_q/P, H_q, D] → [B, S_q/row_size, H_q, D]
+        Step 2:  K/V fused all-gather within col group:
+                   [B, S_kv/P, H_kv, D] → [B, S_kv/col_size, H_kv, D]
+                   (K and V packed into [2, B, S_kv/P, H_kv, D] before the gather,
                     halving NCCL launch overhead vs. two separate collectives)
         Step 3:  Local attention with inner backend:
-                   Q [B, S/col_size, H, D] × K,V [B, S/row_size, H, D]
-                   → output [B, S/col_size, H, D] + LSE [B, H, S/col_size]
+                   Q [B, S_q/row_size, H_q, D] × K,V [B, S_kv/col_size, H_kv, D]
+                   → output [B, S_q/row_size, H_q, D] + LSE [B, H_q, S_q/row_size]
         Step 4:  Reduce-scatter output within row group, split into:
                    all_to_all_single to exchange partial outputs and LSEs, then
                    LSE-weighted combine via flash_attn_combine
-                   → [B, S/P, H, D]  (fully reduced, matching input layout)
-        Output:  [B, S/P, H, D]
+                   → [B, S_q/P, H_q, D]  (fully reduced, matching input Q layout)
+        Output:  [B, S_q/P, H_q, D]
 
     Supported inner backends
     ------------------------
@@ -432,6 +438,10 @@ class Attention2DAttention(AttentionBackend):
     Constraints
     -----------
     * Only ``PredefinedAttentionMask.FULL`` (or ``None``) is supported.
+    * Global ``S_q`` and ``S_kv`` must each be divisible by ``P = row_size × col_size``
+      so every rank holds an equal local shard.
+    * Cross-attention requires K/V to be sequence-sharded across the mesh (same as Q),
+      not replicated on every rank.
     * ``flash_attn_combine`` (JIT CUDA kernel) must be importable at
       construction time; the constructor raises ``ImportError`` otherwise.
     * The ``_combine`` step is wrapped in ``@torch.compiler.disable`` because
@@ -478,6 +488,7 @@ def __init__(
                 )
         self.head_dim = inner_backend.head_dim
         self.num_heads = inner_backend.num_heads
+        self.num_kv_heads = getattr(inner_backend, "num_kv_heads", self.num_heads)
         self._inner_layout = inner_backend.preferred_layout
         if self._inner_layout not in (AttentionTensorLayout.NHD, AttentionTensorLayout.HND):
             raise NotImplementedError(
@@ -494,44 +505,66 @@ def forward(
         """
         Forward pass with Attention2D sequence parallelism.
 
-        q/k/v: [B, S/P, H, D] each.
+        q: [B, S_q/P, H_q, D].  k/v: [B, S_kv/P, H_kv, D].
         """
-        B, shard_seq, H, D = q.shape
+        B, shard_seq_q, H_q, D = q.shape
+        _, shard_seq_kv, H_kv, D_kv = k.shape
         attention_mask = kwargs.get("attention_mask", None)
 
+        if D_kv != D:
+            raise ValueError(
+                f"Attention2DAttention: q head_dim ({D}) must match k head_dim ({D_kv})."
+            )
+        if v.shape != k.shape:
+            raise ValueError(
+                f"Attention2DAttention: k and v shapes must match, got k={k.shape}, v={v.shape}."
+            )
+        if H_q != self.num_heads:
+            raise ValueError(
+                f"Attention2DAttention: q num_heads ({H_q}) must match "
+                f"inner backend num_heads ({self.num_heads})."
+            )
+        if H_kv != self.num_kv_heads:
+            raise ValueError(
+                f"Attention2DAttention: k num_kv_heads ({H_kv}) must match "
+                f"inner backend num_kv_heads ({self.num_kv_heads})."
+            )
+
         if attention_mask is not None and attention_mask != PredefinedAttentionMask.FULL:
             raise ValueError(
                 f"Attention2DAttention only supports FULL attention mask, got {attention_mask}."
             )
 
         if self.row_group_size > 1:
             # All-gather q within row_process_group using a single flat buffer.
-            # [B, S/P, H, D] → [row_group_size, B, S/P, H, D] → [B, S/col_group_size, H, D]
-            q_recv = q.new_empty(self.row_group_size, B, shard_seq, H, D)
+            # [B, S_q/P, H_q, D] → [row_group_size, B, S_q/P, H_q, D]
+            # → [B, S_q/row_size, H_q, D]
+            q_recv = q.new_empty(self.row_group_size, B, shard_seq_q, H_q, D)
             torch.distributed.all_gather_into_tensor(
                 q_recv.view(-1), q.contiguous().view(-1), group=self.row_process_group
             )
-            q = q_recv.permute(1, 0, 2, 3, 4).reshape(B, self.row_group_size * shard_seq, H, D)
+            q = q_recv.permute(1, 0, 2, 3, 4).reshape(B, self.row_group_size * shard_seq_q, H_q, D)
 
         if self.col_group_size > 1:
             # Fuse K and V into a single all-gather to reduce NCCL launch overhead.
-            # [2, B, S/P, H, D] → [col_group_size, 2, B, S/P, H, D] → split back to K, V
-            kv_send = k.new_empty(2, B, shard_seq, H, D)
+            # [2, B, S_kv/P, H_kv, D] → [col_group_size, 2, B, S_kv/P, H_kv, D]
+            # → [B, S_kv/col_size, H_kv, D]
+            kv_send = k.new_empty(2, B, shard_seq_kv, H_kv, D)
             kv_send[0].copy_(k)
             kv_send[1].copy_(v)
-            kv_recv = k.new_empty(self.col_group_size, 2, B, shard_seq, H, D)
+            kv_recv = k.new_empty(self.col_group_size, 2, B, shard_seq_kv, H_kv, D)
             torch.distributed.all_gather_into_tensor(
                 kv_recv.view(-1), kv_send.view(-1), group=self.col_process_group
             )
             k = (
                 kv_recv[:, 0]
                 .permute(1, 0, 2, 3, 4)
-                .reshape(B, self.col_group_size * shard_seq, H, D)
+                .reshape(B, self.col_group_size * shard_seq_kv, H_kv, D)
             )
             v = (
                 kv_recv[:, 1]
                 .permute(1, 0, 2, 3, 4)
-                .reshape(B, self.col_group_size * shard_seq, H, D)
+                .reshape(B, self.col_group_size * shard_seq_kv, H_kv, D)
             )
 
         seq_len = q.shape[1]
diff --git a/tensorrt_llm/_torch/visual_gen/models/cosmos3/transformer_cosmos3.py b/tensorrt_llm/_torch/visual_gen/models/cosmos3/transformer_cosmos3.py
@@ -679,7 +679,7 @@ def __init__(self, model_config: DiffusionModelConfig):
         )
         tp_size = vgm.tp_size if vgm else 1
         ulysses_size = vgm.ulysses_size if vgm else 1
-        cp_size = vgm.cp_size if vgm else 1
+        ring_size = vgm.ring_size if vgm else 1
         head_divisibility_factor = tp_size * ulysses_size
 
         if (ulysses_size > 1 or tp_size > 1) and (
@@ -692,10 +692,11 @@ def __init__(self, model_config: DiffusionModelConfig):
                 f"TP * Ulysses size ({tp_size} * {ulysses_size})"
             )
 
-        if cp_size > 1:
-            # Context parallelism is not compatible with Cosmos3 cross-attention: its forward()
-            # TODO: Re-enable once Ring/Attn2D PRs with cross-attention support have landed.
-            raise NotImplementedError("Context parallelism is not supported for Cosmos3. ")
+        if ring_size > 1:
+            # Ring parallelism is not compatible with Cosmos3 cross-attention.
+            raise NotImplementedError(
+                "Ring parallelism is not supported for Cosmos3 cross-attention."
+            )
 
         self.language_model = Cosmos3LanguageModel(model_config)
 
diff --git a/tensorrt_llm/_torch/visual_gen/modules/attention.py b/tensorrt_llm/_torch/visual_gen/modules/attention.py
@@ -230,12 +230,10 @@ def __init__(
 
         if enable_sequence_parallel and self.qkv_mode == QKVMode.SEPARATE_QKV and vgm is not None:
             ring_size = vgm.ring_size
-            attn2d_size = vgm.attn2d_row_size * vgm.attn2d_col_size
-            if ring_size > 1 or attn2d_size > 1:
+            if ring_size > 1:
                 raise ValueError(
-                    "SEPARATE_QKV cross-attention does not support Ring or Attention2D "
-                    "sequence parallelism; use enable_sequence_parallel=False or Ulysses-only "
-                    f"(ring_size={ring_size}, attn2d_size={attn2d_size})."
+                    "SEPARATE_QKV cross-attention does not support Ring sequence "
+                    "parallelism; use enable_sequence_parallel=False or Ulysses/Attention2D."
                 )
 
         self.attn = wrap_parallel_attention(
diff --git a/tests/unittest/_torch/visual_gen/multi_gpu/test_attn2d_attention.py b/tests/unittest/_torch/visual_gen/multi_gpu/test_attn2d_attention.py
@@ -70,13 +70,24 @@ class _LSEVanillaAttention(nn.Module):
     values are available, as required by Attention2DAttention.
     """
 
-    def __init__(self, num_heads: int, head_dim: int):
+    def __init__(self, num_heads: int, head_dim: int, num_kv_heads: int | None = None):
         super().__init__()
         self.num_heads = num_heads
+        self.num_kv_heads = num_kv_heads or num_heads
         self.head_dim = head_dim
         self.scale = 1.0 / math.sqrt(head_dim)
         self._preferred_layout = AttentionTensorLayout.NHD
 
+    def _expand_kv_heads(
+        self, k_t: torch.Tensor, v_t: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if self.num_heads == self.num_kv_heads:
+            return k_t, v_t
+        repeat_factor = self.num_heads // self.num_kv_heads
+        k_t = k_t.repeat_interleave(repeat_factor, dim=1)
+        v_t = v_t.repeat_interleave(repeat_factor, dim=1)
+        return k_t, v_t
+
     @property
     def preferred_layout(self) -> AttentionTensorLayout:
         return self._preferred_layout
@@ -93,14 +104,18 @@ def forward(self, q, k, v, batch_size=None, seq_len=None, **kwargs):
         q_t = q.transpose(1, 2).float()
         k_t = k.transpose(1, 2).float()
         v_t = v.transpose(1, 2).float()
-        out = F.scaled_dot_product_attention(q_t, k_t, v_t, scale=self.scale)
+        k_t, v_t = self._expand_kv_heads(k_t, v_t)
+        out = F.scaled_dot_product_attention(
+            q_t, k_t, v_t, scale=self.scale, enable_gqa=self.num_heads != self.num_kv_heads
+        )
         return out.to(q.dtype).transpose(1, 2).contiguous()
 
     def forward_with_lse(self, q, k, v, batch_size=None, seq_len=None, **kwargs):
         """Return (output [B, S, H, D], lse [B, H, S])."""
-        q_t = q.transpose(1, 2).float()  # [B, H, S_q, D]
-        k_t = k.transpose(1, 2).float()  # [B, H, S_k, D]
-        v_t = v.transpose(1, 2).float()  # [B, H, S_k, D]
+        q_t = q.transpose(1, 2).float()  # [B, H_q, S_q, D]
+        k_t = k.transpose(1, 2).float()  # [B, H_kv, S_k, D]
+        v_t = v.transpose(1, 2).float()  # [B, H_kv, S_k, D]
+        k_t, v_t = self._expand_kv_heads(k_t, v_t)
         scores = torch.matmul(q_t, k_t.transpose(-2, -1)) * self.scale  # [B, H, S_q, S_k]
         lse = torch.logsumexp(scores, dim=-1)  # [B, H, S_q]
         attn = torch.softmax(scores, dim=-1)
@@ -385,6 +400,97 @@ def _logic_attn2d_asymmetric_mesh_4x1(rank, world_size):
     )
 
 
+def _logic_attn2d_gqa(rank, world_size):
+    """GQA (H_kv < H_q) with equal Q/KV sequence lengths on a 2x2 mesh."""
+    row_size, col_size = 2, 2
+    batch, num_heads, num_kv_heads, head_dim = 2, 8, 2, 64
+    seq_per_rank = 8
+    seq_full = seq_per_rank * world_size
+
+    device = torch.device(f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
+    row_pg, col_pg = _make_process_groups(rank, world_size, row_size, col_size)
+
+    inner = _LSEVanillaAttention(num_heads=num_heads, head_dim=head_dim, num_kv_heads=num_kv_heads)
+    try:
+        attn = Attention2DAttention(inner, row_pg, col_pg)
+    except ImportError:
+        pytest.skip("flash_attn_combine JIT kernels not available")
+
+    torch.manual_seed(42)
+    q_full = torch.randn(batch, seq_full, num_heads, head_dim, device=device)
+    k_full = torch.randn(batch, seq_full, num_kv_heads, head_dim, device=device)
+    v_full = torch.randn(batch, seq_full, num_kv_heads, head_dim, device=device)
+
+    q_shard = q_full[:, rank * seq_per_rank : (rank + 1) * seq_per_rank].contiguous()
+    k_shard = k_full[:, rank * seq_per_rank : (rank + 1) * seq_per_rank].contiguous()
+    v_shard = v_full[:, rank * seq_per_rank : (rank + 1) * seq_per_rank].contiguous()
+
+    attn2d_output = attn(q_shard, k_shard, v_shard, batch_size=batch)
+
+    scale = 1.0 / math.sqrt(head_dim)
+    q_std = q_full.transpose(1, 2).float()
+    k_std = k_full.transpose(1, 2).float()
+    v_std = v_full.transpose(1, 2).float()
+    std_output = F.scaled_dot_product_attention(q_std, k_std, v_std, scale=scale, enable_gqa=True)
+    std_output = std_output.transpose(1, 2).to(attn2d_output.dtype)
+
+    expected_shard = std_output[:, rank * seq_per_rank : (rank + 1) * seq_per_rank]
+    torch.testing.assert_close(
+        attn2d_output,
+        expected_shard,
+        rtol=1e-3,
+        atol=1e-3,
+        msg=f"Rank {rank}: Attention2D GQA output differs from standard attention",
+    )
+
+
+def _logic_attn2d_cross_attention(rank, world_size):
+    """Cross-attention with different Q/KV lengths and GQA on a 2x2 mesh."""
+    row_size, col_size = 2, 2
+    batch, num_heads, num_kv_heads, head_dim = 2, 8, 2, 64
+    seq_per_rank_q = 8
+    seq_per_rank_kv = 4
+    seq_full_q = seq_per_rank_q * world_size
+    seq_full_kv = seq_per_rank_kv * world_size
+
+    device = torch.device(f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
+    row_pg, col_pg = _make_process_groups(rank, world_size, row_size, col_size)
+
+    inner = _LSEVanillaAttention(num_heads=num_heads, head_dim=head_dim, num_kv_heads=num_kv_heads)
+    try:
+        attn = Attention2DAttention(inner, row_pg, col_pg)
+    except ImportError:
+        pytest.skip("flash_attn_combine JIT kernels not available")
+
+    torch.manual_seed(42)
+    q_full = torch.randn(batch, seq_full_q, num_heads, head_dim, device=device)
+    k_full = torch.randn(batch, seq_full_kv, num_kv_heads, head_dim, device=device)
+    v_full = torch.randn(batch, seq_full_kv, num_kv_heads, head_dim, device=device)
+
+    q_shard = q_full[:, rank * seq_per_rank_q : (rank + 1) * seq_per_rank_q].contiguous()
+    k_shard = k_full[:, rank * seq_per_rank_kv : (rank + 1) * seq_per_rank_kv].contiguous()
+    v_shard = v_full[:, rank * seq_per_rank_kv : (rank + 1) * seq_per_rank_kv].contiguous()
+
+    attn2d_output = attn(q_shard, k_shard, v_shard, batch_size=batch)
+    assert attn2d_output.shape == q_shard.shape
+
+    scale = 1.0 / math.sqrt(head_dim)
+    q_std = q_full.transpose(1, 2).float()
+    k_std = k_full.transpose(1, 2).float()
+    v_std = v_full.transpose(1, 2).float()
+    std_output = F.scaled_dot_product_attention(q_std, k_std, v_std, scale=scale, enable_gqa=True)
+    std_output = std_output.transpose(1, 2).to(attn2d_output.dtype)
+
+    expected_shard = std_output[:, rank * seq_per_rank_q : (rank + 1) * seq_per_rank_q]
+    torch.testing.assert_close(
+        attn2d_output,
+        expected_shard,
+        rtol=1e-3,
+        atol=1e-3,
+        msg=f"Rank {rank}: Attention2D cross-attention output differs from standard attention",
+    )
+
+
 # =============================================================================
 # Test classes
 # =============================================================================
@@ -422,6 +528,18 @@ def test_attn2d_4x1_mesh(self):
         )
 
 
+class TestAttn2DAttentionGQAAndCrossAttention:
+    """Attention2DAttention with GQA and cross-attention."""
+
+    def test_attn2d_gqa(self):
+        """GQA with H_kv < H_q on a 2x2 mesh."""
+        run_test_in_distributed(world_size=4, test_fn=_logic_attn2d_gqa, use_cuda=True)
+
+    def test_attn2d_cross_attention(self):
+        """Cross-attention with different Q/KV lengths and GQA on a 2x2 mesh."""
+        run_test_in_distributed(world_size=4, test_fn=_logic_attn2d_cross_attention, use_cuda=True)
+
+
 def _logic_attn2d_fa4_vs_standard(rank, world_size):
     """Attention2DAttention with FlashAttn4 inner backend matches standard SDPA (2x2 mesh).
 
diff --git a/tests/unittest/_torch/visual_gen/multi_gpu/test_cosmos3_transformer_parallel.py b/tests/unittest/_torch/visual_gen/multi_gpu/test_cosmos3_transformer_parallel.py
diff --git a/tests/unittest/_torch/visual_gen/test_attention_integration.py b/tests/unittest/_torch/visual_gen/test_attention_integration.py