NVIDIA
diff --git a/tensorrt_llm/_torch/modules/gated_mlp.py b/tensorrt_llm/_torch/modules/gated_mlp.py
Lines changed: 14 additions & 1 deletion
diff --git a/tensorrt_llm/_torch/modules/linear.py b/tensorrt_llm/_torch/modules/linear.py
Lines changed: 492 additions & 299 deletions
diff --git a/tensorrt_llm/_torch/visual_gen/models/flux/attention.py b/tensorrt_llm/_torch/visual_gen/models/flux/attention.py
Lines changed: 12 additions & 0 deletions
diff --git a/tensorrt_llm/_torch/visual_gen/models/flux/joint_proj.py b/tensorrt_llm/_torch/visual_gen/models/flux/joint_proj.py
Lines changed: 32 additions & 7 deletions
diff --git a/tensorrt_llm/_torch/visual_gen/models/flux/transformer_flux.py b/tensorrt_llm/_torch/visual_gen/models/flux/transformer_flux.py
Lines changed: 14 additions & 14 deletions
diff --git a/tensorrt_llm/_torch/visual_gen/models/wan/transformer_wan.py b/tensorrt_llm/_torch/visual_gen/models/wan/transformer_wan.py
Lines changed: 3 additions & 0 deletions
diff --git a/tensorrt_llm/_torch/visual_gen/modules/attention.py b/tensorrt_llm/_torch/visual_gen/modules/attention.py
Lines changed: 59 additions & 9 deletions
@@ -62,13 +62,25 @@ def __init__(
 
         # Calculate local intermediate size after tensor parallel sharding
         tp_size = mapping.tp_size
-        local_intermediate_size = self.intermediate_size // tp_size
+
+        local_intermediate_start = Linear._calc_shard(self.intermediate_size,
+                                                      mapping.tp_size,
+                                                      mapping.tp_rank)
+        local_intermediate_end = Linear._calc_shard(self.intermediate_size,
+                                                    mapping.tp_size,
+                                                    mapping.tp_rank + 1)
+        local_intermediate_size = local_intermediate_end - local_intermediate_start
 
         gateup_shard_indices_mapping = {
             'gate': (0, local_intermediate_size),
             'up': (local_intermediate_size, local_intermediate_size),
         }
 
+        override_tp_sharding = {
+            'gate': (local_intermediate_start, local_intermediate_end),
+            'up': (local_intermediate_start, local_intermediate_end),
+        }
+
         self.gate_up_proj = Linear(
             self.hidden_size,
             self.intermediate_size * 2,
@@ -87,6 +99,7 @@ def __init__(
             disable_deep_gemm=disable_deep_gemm,
             fused_weight_shard_indices_mapping=gateup_shard_indices_mapping,
             use_custom_cublas_mm=use_custom_cublas_mm,
+            override_tp_sharding=override_tp_sharding,
         )
 
         if is_shared_expert:
 
@@ -99,6 +99,11 @@ def __init__(
                 mapping=config.mapping,
                 tensor_parallel_mode=TensorParallelMode.COLUMN,
                 reduce_output=False,
+                override_tp_sharding={
+                    "q": (self.local_q_dim_start, self.local_q_dim_end),
+                    "k": (self.local_kv_dim_start, self.local_kv_dim_end),
+                    "v": (self.local_kv_dim_start, self.local_kv_dim_end),
+                },
             )
 
             # Need not pass any mapping info since this is intra-head normalization
@@ -128,6 +133,7 @@ def __init__(
                 allreduce_strategy=config.allreduce_strategy,
                 tensor_parallel_mode=TensorParallelMode.ROW,
                 reduce_output=True,
+                override_tp_sharding=(self.local_kv_dim_start, self.local_kv_dim_end),
             )
 
     def apply_qk_norm(self, q: torch.Tensor, k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
@@ -345,6 +351,7 @@ def __init__(
             skip_create_weights_in_init=self.skip_create_weights_in_init,
             force_dynamic_quantization=self.force_dynamic_quantization,
             config=config,
+            attn_shard=(self.local_q_dim_start, self.local_q_dim_end),
         )
 
     def _init_qkv_proj(self):
@@ -361,6 +368,11 @@ def _init_qkv_proj(self):
             skip_create_weights_in_init=self.skip_create_weights_in_init,
             force_dynamic_quantization=self.force_dynamic_quantization,
             mapping=self.mapping,
+            override_qkv_sharding={
+                "q": (self.local_q_dim_start, self.local_q_dim_end),
+                "k": (self.local_kv_dim_start, self.local_kv_dim_end),
+                "v": (self.local_kv_dim_start, self.local_kv_dim_end),
+            },
         )
 
     def _apply_norm_rope_unfused(
 
@@ -53,13 +53,19 @@ def __init__(
         skip_create_weights_in_init: bool = False,
         force_dynamic_quantization: bool = False,
         config: Optional[DiffusionModelConfig] = None,
+        attn_shard: Optional[tuple[int, int]] = None,
     ):
         super().__init__()
         mapping = config.mapping if config else None
         self.tp_size = getattr(mapping, "tp_size", 1)
         self.tp_rank = getattr(mapping, "tp_rank", 0)
         self.attn_dim = attn_dim
         self.has_bias = bias
+        self.attn_shard = attn_shard
+
+        assert attn_dim % self.tp_size == 0 or self.attn_shard, (
+            "Explicit attention sharding required for uneven TP"
+        )
 
         if self.tp_size == 1:
             self.proj = Linear(
@@ -84,6 +90,7 @@ def __init__(
                 mapping=config.mapping,
                 tensor_parallel_mode=TensorParallelMode.ROW,
                 reduce_output=False,
+                override_tp_sharding=self.attn_shard,
             )
             self.mlp_proj = Linear(
                 mlp_dim,
@@ -162,10 +169,12 @@ def __init__(
         skip_create_weights_in_init: bool = False,
         force_dynamic_quantization: bool = False,
         mapping: Optional[Mapping] = None,
+        override_qkv_sharding=None,
     ):
         super().__init__()
 
         self.tp_size = mapping.tp_size if mapping else 1
+        self.tp_rank = mapping.tp_rank if mapping else 0
 
         # Store full (pre-TP) dims for weight loading (splitting checkpoint weight)
         self.full_q_dim = q_dim
@@ -188,9 +197,12 @@ def __init__(
             self.local_qkv_dim = q_dim + 2 * kv_dim
             self.local_mlp_dim = mlp_dim
         else:
-            local_q_dim = q_dim // self.tp_size
-            local_kv_dim = kv_dim // self.tp_size
-            shard_mlp_hidden_dim = self.mlp_hidden_dim // self.tp_size
+
+            def range_size(r):  # length of a range pair: second element minus first
+                return r[1] - r[0]
+
+            local_q_dim = range_size(override_qkv_sharding["q"])
+            local_kv_dim = range_size(override_qkv_sharding["k"])
             # QKV: column-parallel with fused Q/K/V sharding
             self.qkv_proj = Linear(
                 in_dim,
@@ -211,8 +223,17 @@ def __init__(
                 mapping=mapping,
                 tensor_parallel_mode=TensorParallelMode.COLUMN,
                 reduce_output=False,
+                override_tp_sharding=override_qkv_sharding,
+            )
+
+            local_mlp_hidden_start = Linear._calc_shard(
+                self.mlp_hidden_dim, self.tp_size, self.tp_rank
+            )
+            local_mlp_hidden_end = Linear._calc_shard(
+                self.mlp_hidden_dim, self.tp_size, self.tp_rank + 1
             )
-            # MLP gate+up: column-parallel with fused gate/up sharding
+            local_mlp_hidden_size = local_mlp_hidden_end - local_mlp_hidden_start
+
             self.mlp_proj = Linear(
                 in_dim,
                 mlp_dim,
@@ -225,15 +246,19 @@ def __init__(
                     weight_mode=WeightMode.FUSED_GATE_UP_LINEAR,
                 ),
                 fused_weight_shard_indices_mapping={
-                    "gate": (0, shard_mlp_hidden_dim),
-                    "up": (shard_mlp_hidden_dim, shard_mlp_hidden_dim),
+                    "gate": (0, local_mlp_hidden_size),
+                    "up": (local_mlp_hidden_size, local_mlp_hidden_size),
                 },
                 mapping=mapping,
                 tensor_parallel_mode=TensorParallelMode.COLUMN,
                 reduce_output=False,
+                override_tp_sharding={
+                    "gate": (local_mlp_hidden_start, local_mlp_hidden_end),
+                    "up": (local_mlp_hidden_start, local_mlp_hidden_end),
+                },
             )
             self.local_qkv_dim = (q_dim + 2 * kv_dim) // self.tp_size
-            self.local_mlp_dim = mlp_dim // self.tp_size
+            self.local_mlp_dim = local_mlp_hidden_size
 
     def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         """Returns (qkv, mlp_gate_up) with local (post-TP) sizes."""
 
@@ -465,11 +465,21 @@ def __init__(
         )
         self.act_mlp = _gelu_tanh_eager
 
-        kv_dim = num_attention_heads * attention_head_dim
+        # Attention (no added_kv_proj_dim since tokens are already concatenated)
+        self.attn = FluxJointAttention(
+            hidden_size=dim,
+            num_attention_heads=num_attention_heads,
+            head_dim=attention_head_dim,
+            bias=True,
+            eps=1e-6,
+            pre_only=True,  # No output projection in attention
+            config=config,
+            layer_idx=layer_idx,
+        )
 
         # MLP + Attn Output projection, requires special handling for TP
         self.proj_out = FluxJointAttnMLPProj(
-            attn_dim=kv_dim,
+            attn_dim=self.attn.q_dim,
             mlp_dim=self.mlp_hidden_dim,
             out_dim=dim,
             bias=True,
@@ -478,18 +488,8 @@ def __init__(
             skip_create_weights_in_init=skip_create_weights,
             force_dynamic_quantization=force_dynamic_quant,
             config=config,
-        )
-
-        # Attention (no added_kv_proj_dim since tokens are already concatenated)
-        self.attn = FluxJointAttention(
-            hidden_size=dim,
-            num_attention_heads=num_attention_heads,
-            head_dim=attention_head_dim,
-            bias=True,
-            eps=1e-6,
-            pre_only=True,  # No output projection in attention
-            config=config,
-            layer_idx=layer_idx,
+            # need explicit shard because we are aligned on head boundaries
+            attn_shard=(self.attn.local_q_dim_start, self.attn.local_q_dim_end),
         )
 
     def forward(
 
@@ -356,6 +356,7 @@ def __init__(
                 force_dynamic_quantization=force_dynamic_quant,
                 tensor_parallel_mode=tp_mode,
                 reduce_output=False,
+                override_tp_sharding=(self.attn2.local_kv_dim_start, self.attn2.local_kv_dim_end),
             )
             self.add_v_proj = Linear(
                 added_kv_proj_dim,
@@ -367,6 +368,7 @@ def __init__(
                 force_dynamic_quantization=force_dynamic_quant,
                 tensor_parallel_mode=tp_mode,
                 reduce_output=False,
+                override_tp_sharding=(self.attn2.local_kv_dim_start, self.attn2.local_kv_dim_end),
             )
             self.norm_added_k = RMSNormTPAware(
                 hidden_size=hidden_size,
@@ -375,6 +377,7 @@ def __init__(
                 has_weights=True,
                 enable_tp=(tp_size > 1),
                 mapping=model_config.mapping,
+                override_tp_sharding=(self.attn2.local_kv_dim_start, self.attn2.local_kv_dim_end),
             )
 
         # Use torch.empty().normal_(std=...) instead of torch.randn()/scale for MetaInitMode compatibility
 
@@ -71,10 +71,7 @@ def __init__(
         self.bias = bias
 
         self.tp_size = self.mapping.tp_size if self.mapping else 1
-        assert (
-            self.num_attention_heads % self.tp_size == 0
-            and self.num_key_value_heads % self.tp_size == 0
-        ), "TP size must divide the number of Query and KV Heads"
+        self.tp_rank = self.mapping.tp_rank if self.mapping else 0
 
         # Fused QK Norm + RoPE: each model class opts in via fuse_qk_norm_rope.
         # Backed by torch.ops.trtllm.fused_dit_qk_norm_rope which auto-dispatches:
@@ -108,11 +105,7 @@ def __init__(
         self.q_dim = self.num_attention_heads * self.head_dim
         self.kv_dim = self.num_key_value_heads * self.head_dim
 
-        self.local_num_attention_heads = self.num_attention_heads // self.tp_size
-        self.local_num_key_value_heads = self.num_key_value_heads // self.tp_size
-        self.local_q_dim = self.local_num_attention_heads * self.head_dim
-        self.local_kv_dim = self.local_num_key_value_heads * self.head_dim
-
+        self._calculate_tp_parameters(ulysses_size if enable_ulysses else None)
         self._init_qkv_proj()
 
         attention_metadata_state = getattr(config, "attention_metadata_state", None)
@@ -124,13 +117,20 @@ def __init__(
             q_norm_dim = self.head_dim if qk_norm_mode == "per_head" else self.q_dim
             k_norm_dim = self.head_dim if qk_norm_mode == "per_head" else self.kv_dim
             enable_tp_rms = self.tp_size > 1 and qk_norm_mode == "full"
+
+            q_start = self.local_q_dim_start
+            q_end = self.local_q_dim_end
+            k_start = self.local_kv_dim_start
+            k_end = self.local_kv_dim_end
+
             self.norm_q = RMSNormTPAware(
                 hidden_size=q_norm_dim,
                 eps=self.eps,
                 dtype=self.dtype,
                 has_weights=True,
                 enable_tp=enable_tp_rms,
                 mapping=self.mapping,
+                override_tp_sharding=(q_start, q_end) if qk_norm_mode == "full" else None,
             )
             self.norm_k = RMSNormTPAware(
                 hidden_size=k_norm_dim,
@@ -139,6 +139,7 @@ def __init__(
                 has_weights=True,
                 enable_tp=enable_tp_rms,
                 mapping=self.mapping,
+                override_tp_sharding=(k_start, k_end) if qk_norm_mode == "full" else None,
             )
 
         # TODO: Use weight mapper to create just a Linear module
@@ -156,6 +157,7 @@ def __init__(
                     tensor_parallel_mode=TensorParallelMode.ROW if self.tp_size > 1 else None,
                     reduce_output=(self.tp_size > 1),
                     allreduce_strategy=self.allreduce_strategy,
+                    override_tp_sharding=(self.local_q_dim_start, self.local_q_dim_end),
                 )
             ]
         )
@@ -231,6 +233,46 @@ def __init__(
 
                 self.attn = UlyssesAttention(self.attn, process_group=vgm.ulysses_group)
 
+    def _calculate_tp_parameters(self, ulysses_size: Optional[int]):  # set this rank's local head/dim [start, end) ranges
+        assert self.num_attention_heads % self.num_key_value_heads == 0  # Q heads must be a whole multiple of KV heads
+        gqa_ratio = self.num_attention_heads // self.num_key_value_heads  # query heads per KV head
+
+        if not ulysses_size:  # None (or 0) is treated as a Ulysses size of 1
+            ulysses_size = 1
+
+        assert self.num_key_value_heads % ulysses_size == 0  # KV heads must split into whole groups of ulysses_size
+        # Note: this is intentionally stronger than `num_kv_head >= ulysses_size * tp_size`
+        assert self.num_key_value_heads // ulysses_size >= self.tp_size  # at least one group of ulysses_size KV heads per TP rank
+
+        def _calc_shard(full, size, rank):  # start offset of shard `rank` when `full` items are split across `size` ranks
+            full //= ulysses_size  # work in groups of ulysses_size items
+            shard = (full // size) * rank + min(full % size, rank)  # the first (full % size) ranks each take one extra group
+            return shard * ulysses_size  # convert groups back to items
+
+        self.local_key_value_head_start = _calc_shard(
+            self.num_key_value_heads, self.tp_size, self.tp_rank
+        )
+        self.local_key_value_head_end = _calc_shard(  # exclusive end == start offset of the next rank
+            self.num_key_value_heads, self.tp_size, self.tp_rank + 1
+        )
+        self.local_num_key_value_heads = (
+            self.local_key_value_head_end - self.local_key_value_head_start
+        )
+
+        self.local_attention_head_start = gqa_ratio * self.local_key_value_head_start  # Q head range is the KV head range scaled by gqa_ratio
+        self.local_attention_head_end = gqa_ratio * self.local_key_value_head_end
+        self.local_num_attention_heads = (
+            self.local_attention_head_end - self.local_attention_head_start
+        )
+
+        self.local_q_dim_start = self.local_attention_head_start * self.head_dim  # head indices -> offsets in units of head_dim
+        self.local_q_dim_end = self.local_attention_head_end * self.head_dim
+        self.local_q_dim = self.local_q_dim_end - self.local_q_dim_start
+
+        self.local_kv_dim_start = self.local_key_value_head_start * self.head_dim  # same conversion for the KV head range
+        self.local_kv_dim_end = self.local_key_value_head_end * self.head_dim
+        self.local_kv_dim = self.local_kv_dim_end - self.local_kv_dim_start
+
     def _init_qkv_proj(self) -> None:
         tp_mode = TensorParallelMode.COLUMN if self.tp_size > 1 else None
 
@@ -258,6 +300,11 @@ def _init_qkv_proj(self) -> None:
                 },
                 tensor_parallel_mode=tp_mode,
                 reduce_output=False,
+                override_tp_sharding={
+                    "q": (self.local_q_dim_start, self.local_q_dim_end),
+                    "k": (self.local_kv_dim_start, self.local_kv_dim_end),
+                    "v": (self.local_kv_dim_start, self.local_kv_dim_end),
+                },
             )
         else:
             self.to_q = Linear(
@@ -271,6 +318,7 @@ def _init_qkv_proj(self) -> None:
                 force_dynamic_quantization=self.force_dynamic_quantization,
                 tensor_parallel_mode=tp_mode,
                 reduce_output=False,
+                override_tp_sharding=(self.local_q_dim_start, self.local_q_dim_end),
             )
             self.to_k = Linear(
                 self.hidden_size,
@@ -283,6 +331,7 @@ def _init_qkv_proj(self) -> None:
                 force_dynamic_quantization=self.force_dynamic_quantization,
                 tensor_parallel_mode=tp_mode,
                 reduce_output=False,
+                override_tp_sharding=(self.local_kv_dim_start, self.local_kv_dim_end),
             )
             self.to_v = Linear(
                 self.hidden_size,
@@ -295,6 +344,7 @@ def _init_qkv_proj(self) -> None:
                 force_dynamic_quantization=self.force_dynamic_quantization,
                 tensor_parallel_mode=tp_mode,
                 reduce_output=False,
+                override_tp_sharding=(self.local_kv_dim_start, self.local_kv_dim_end),
             )
 
     def get_qkv(