[None][feat] Support tensor parallelism of trtllm moe backend for nemotron-h model (NVIDIA#11470)

Wanli-Jiang · web-flow · commit e56397dd5a0d · 2026-02-26T19:16:18.000+08:00
Signed-off-by: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/modules/fused_moe/quantization.py b/tensorrt_llm/_torch/modules/fused_moe/quantization.py
@@ -176,6 +176,19 @@ def maybe_pad_for_mxfp4(weight: torch.Tensor,
     return weight
 
 
+def _pad_tensor_to_shape(tensor: torch.Tensor, shape: tuple) -> torch.Tensor:
+    """Pad tensor to match target shape. Used for post-shard alignment."""
+    if tensor.numel() == 0:
+        return tensor
+    if tensor.shape == shape:
+        return tensor
+    if len(tensor.shape) == 1:
+        return F.pad(tensor, (0, shape[0] - tensor.shape[0])).contiguous()
+    row_pad = shape[0] - tensor.shape[0]
+    col_pad = shape[1] - tensor.shape[1]
+    return F.pad(tensor, (0, col_pad, 0, row_pad)).contiguous()
+
+
 def interleave_linear_and_gate(x: torch.Tensor,
                                group_size: int = 64,
                                dim: int = -1) -> torch.Tensor:
@@ -2915,6 +2928,9 @@ def round_up(x, alignment):
         return (w3_w1_weight_shape, w2_weight_shape, w3_w1_bias_shape,
                 w2_bias_shape, w3_w1_weight_scale_shape, w2_weight_scale_shape)
 
+    def _round_up(self, x, alignment):
+        return (x + alignment - 1) // alignment * alignment
+
     def create_weights(self, module: torch.nn.Module):
         # Here we only enable padding for hidden_size > 1024 since there are small unit tests that expect no padding.
         if module.hidden_size > 1024 and module.hidden_size % 256 != 0:
@@ -2923,6 +2939,15 @@ def create_weights(self, module: torch.nn.Module):
             # See the comment in MXFP4WeightTRTLLMGenFusedMoEMethod for more details.
             self.input_hidden_alignment = 256
 
+        else:
+            # Weight scales require M % 128 == 0 in
+            # get_shuffle_matrix_sf_a_row_indices. If the padded intermediate
+            # size does not satisfy this, set weight_alignment to 128.
+            intermediate_size_padded = self._round_up(
+                module.intermediate_size_per_partition, self.weight_alignment)
+            if intermediate_size_padded % 128 != 0:
+                self.weight_alignment = 128
+
         super().create_weights(module, bias_dtype=torch.float32)
 
     def setup_quant_scales(self, module: torch.nn.Module):
@@ -2981,6 +3006,8 @@ def load_expert_w3_w1_weight(self, module: torch.nn.Module,
             dst_w3_weight.copy_(w3_weight_shard.view(dst_w3_weight.dtype))
             dst_w1_weight.copy_(w1_weight_shard.view(dst_w1_weight.dtype))
         else:
+            w1_weight_shard = _pad_tensor_to_shape(w1_weight_shard,
+                                                   dst_w3_w1_weight_gpu.shape)
             dst_w3_w1_weight_gpu.copy_(
                 w1_weight_shard.view(dst_w3_w1_weight_gpu.dtype))
 
@@ -3038,6 +3065,8 @@ def load_expert_w2_weight(self, module: torch.nn.Module,
         epilogue_tile_m = 128
 
         # Keep weights in device buffer
+        w2_weight_shard = _pad_tensor_to_shape(w2_weight_shard,
+                                               dst_w2_weight_gpu.shape)
         dst_w2_weight_gpu.copy_(w2_weight_shard.view(dst_w2_weight_gpu.dtype),
                                 non_blocking=dst_on_gpu)
         # Get permuted indices
@@ -3071,7 +3100,7 @@ def load_expert_w3_w1_weight_scale_nvfp4(
         alignment = _get_weight_alignment(self.weight_alignment,
                                           module.scaling_vector_size,
                                           module.tp_size,
-                                          w3_weight_scale.shape[0])
+                                          w1_weight_scale.shape[0])
         w1_weight_scale = maybe_pad_for_mxfp4(
             w1_weight_scale,
             self.input_hidden_alignment // module.scaling_vector_size,
@@ -3113,6 +3142,8 @@ def load_expert_w3_w1_weight_scale_nvfp4(
                     w1_weight_scale.view(dst_w1_weight_scale.dtype))
         else:
             # Non-gated activation (e.g., ReLU2): buffer only contains w1 scale
+            w1_weight_scale = _pad_tensor_to_shape(
+                w1_weight_scale, dst_w3_w1_weight_scale_gpu.shape)
             dst_w3_w1_weight_scale_gpu.copy_(
                 w1_weight_scale.view(dst_w3_w1_weight_scale_gpu.dtype))
 
@@ -3170,6 +3201,8 @@ def load_expert_w2_weight_scale_nvfp4(self,
                                             TensorParallelMode.ROW,
                                             device=device)
         # Keep weights in device buffer
+        w2_weight_scale = _pad_tensor_to_shape(w2_weight_scale,
+                                               dst_w2_weight_scale_gpu.shape)
         dst_w2_weight_scale_gpu.copy_(
             w2_weight_scale.view(dst_w2_weight_scale_gpu.dtype))
 
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -5798,6 +5798,46 @@ def test_nvfp4_8gpus(self, attention_dp, moe_backend):
             task.evaluate(llm,
                           extra_evaluator_kwargs=self.EXTRA_EVALUATOR_KWARGS)
 
+    @skip_pre_blackwell
+    @pytest.mark.skip_less_mpi_world_size(8)
+    @pytest.mark.parametrize(
+        "tp_size, ep_size, pp_size, attention_dp",
+        [
+            (4, 1, 2, False),
+            (4, 4, 2, False),
+            (8, 1, 1, False),
+            (8, 8, 1, False),
+            (8, 1, 1, True),
+        ],
+        ids=["TP4_PP2", "TEP4_PP2", "TP8_PP1", "TEP8_PP1", "TP8_PP1_ADP"],
+    )
+    def test_nvfp4_parallelism(self, tp_size, ep_size, pp_size, attention_dp):
+        with LLM(
+                f"{llm_models_root()}/Nemotron-SuperV3-phase1-mtp-nvfp4-fp8kv",
+                kv_cache_config=KvCacheConfig(
+                    enable_block_reuse=False,
+                    mamba_ssm_cache_dtype="float16",
+                    free_gpu_memory_fraction=0.8,
+                ),
+                max_batch_size=512,
+                tensor_parallel_size=tp_size,
+                moe_expert_parallel_size=ep_size,
+                pipeline_parallel_size=pp_size,
+                enable_attention_dp=attention_dp,
+                cuda_graph_config=CudaGraphConfig(max_batch_size=512,
+                                                  enable_padding=True),
+                disable_overlap_scheduler=False,
+                moe_config=MoeConfig(backend="TRTLLM"),
+        ) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm,
+                          extra_evaluator_kwargs=self.EXTRA_EVALUATOR_KWARGS)
+            # TODO: GSM8K fails due to a mamba cache issue when pp_size > 1.
+            if pp_size == 1:
+                task = GSM8K(self.MODEL_NAME)
+                task.evaluate(
+                    llm, extra_evaluator_kwargs=self.EXTRA_EVALUATOR_KWARGS)
+
     @skip_pre_blackwell
     @pytest.mark.skip_less_mpi_world_size(8)
     def test_nvfp4_8gpus_mtp(self):
diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt
@@ -283,6 +283,11 @@ accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_fp8_4gpus[attention_
 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_fp8_4gpus[attention_dp_on-python_mamba_cache]
 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_fp8_4gpus[attention_dp_on-cpp_mamba_cache]
 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_8gpus[attention_dp_on-trtllm]
+accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_parallelism[TP4_PP2]
+accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_parallelism[TEP4_PP2]
+accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_parallelism[TP8_PP1]
+accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_parallelism[TEP8_PP1]
+accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_parallelism[TP8_PP1_ADP]
 
 # multimodal accuracy tests
 accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_VL_7B::test_auto_dtype
diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
@@ -117,6 +117,7 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_8gpus_mtp TIMEOUT (60)
   - accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_8gpus[attention_dp_on-trtllm] TIMEOUT (60)
   - accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_8gpus[attention_dp_on-cutlass] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_parallelism[TP4_PP2] TIMEOUT (60)
 - condition:
     ranges:
       system_gpu_count:
@@ -146,6 +147,7 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_pp4_mtp1] TIMEOUT (60)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[baseline_fp8kv] TIMEOUT (60)
   - accuracy/test_llm_api_pytorch.py::TestMistralLarge3_675B::test_fp8[latency_moe_deepgemm] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_parallelism[TP8_PP1] TIMEOUT (60)
   - test_e2e.py::test_deepseek_r1_mtp_bench TIMEOUT(60) # Cover https://nvbugs/5670108
 - condition:
     ranges: