bench-moe enhancement: prune TEP/TTP forced-comm candidates at generation time; add DENSEGEMM+EP validation; remove spurious .cpu() in MXFP4 quantize_utils

guqiqi · guqiqi · commit f39a42a4ab91 · 2026-06-23T03:29:34.000-07:00
Signed-off-by: guqiqi <29116997+guqiqi@users.noreply.github.com>
diff --git a/tests/microbenchmarks/bench_moe/search.py b/tests/microbenchmarks/bench_moe/search.py
@@ -27,7 +27,7 @@
 from tensorrt_llm.models.modeling_utils import QuantAlgo
 
 from .backend import MoeBackendType, get_backend_class
-from .mapping import _resolve_mapping_layout
+from .mapping import _PARALLEL_MODE_LAYOUTS, _resolve_mapping_layout
 from .specs import _ALL_BACKENDS, _FORCED_COMM_ENV_VALUES, ConfigSpec, ModelSpec, SearchSpec
 
 _FUSED_COMM_BACKENDS = frozenset({"MEGAMOE_DEEPGEMM"})
@@ -65,6 +65,23 @@ def _comm_axis_for_backend(backend: Any, comm_methods: Tuple[Any, ...]) -> Tuple
     return comm_methods
 
 
+def _comm_axis_for_parallel_mode(pmode: str, comm_methods: Tuple[Any, ...]) -> Tuple[Any, ...]:
+    """Collapse comm axis to AUTO for parallel modes without attention DP.
+
+    Non-AUTO forced comm methods require enable_attention_dp=True (see
+    is_candidate_valid). TEP and TTP have enable_dp=False, so only AUTO
+    is ever valid for them. Generating forced-comm candidates for these
+    modes only produces prune rows — handle it at generation time instead.
+    CUSTOM mode is passed through unchanged (validated separately).
+    """
+    layout = _PARALLEL_MODE_LAYOUTS.get(str(pmode).upper())
+    if layout is None:
+        return comm_methods  # CUSTOM: unknown layout, keep as-is
+    if not layout["enable_attention_dp"]:
+        return ("AUTO",)
+    return comm_methods
+
+
 def expand_search(
     base_config: ConfigSpec,
     search: SearchSpec,
@@ -88,7 +105,13 @@ def expand_search(
     for backend, pmode, cgraph, combine in itertools.product(
         backends, parallel_modes, cuda_graph_options, combine_options
     ):
-        for comm in _comm_axis_for_backend(backend, comm_methods):
+        effective_comm = _comm_axis_for_backend(backend, comm_methods)
+        # For non-fused backends apply parallel-mode comm constraint at
+        # generation time so TEP/TTP always get comm=AUTO instead of
+        # generating forced-comm candidates that are immediately pruned.
+        if effective_comm != ("NONE",):
+            effective_comm = _comm_axis_for_parallel_mode(pmode, effective_comm)
+        for comm in effective_comm:
             candidate = replace(
                 base_config,
                 backend=str(backend).upper(),
@@ -121,6 +144,13 @@ def is_candidate_valid(
     except ValueError as exc:
         return False, str(exc)
 
+    # DenseGEMM only supports TP; any EP configuration (TEP, DEP, custom ep>1) is unsupported.
+    if config.backend.upper() == "DENSEGEMM" and moe_ep > 1:
+        return False, (
+            f"DENSEGEMM does not support EP (ep_size={moe_ep}); "
+            "use TEP/DEP only with other backends"
+        )
+
     # Forced communication on non-DP / MoE-TP paths.
     forced = config.comm_method.upper()
     if forced not in ("AUTO", "NONE"):
diff --git a/tests/unittest/_torch/modules/moe/quantize_utils.py b/tests/unittest/_torch/modules/moe/quantize_utils.py
@@ -1695,21 +1695,21 @@ def create_weights(self, **quant_kwargs) -> Dict[str, torch.Tensor]:
                 w1_weight, None, scaling_vector_size, True
             )
             w1_sf_block_unswizzled = torch.ops.trtllm.block_scale_interleave_reverse(
-                w1_sf_block.cpu().view(intermediate_size, -1)
+                w1_sf_block.view(intermediate_size, -1)
             )
 
             w2_weight_mxfp4, w2_sf_block = torch.ops.trtllm.fp4_quantize(
                 w2_weight, None, scaling_vector_size, True
             )
             w2_sf_block_unswizzled = torch.ops.trtllm.block_scale_interleave_reverse(
-                w2_sf_block.cpu().view(hidden_size_out, -1)
+                w2_sf_block.view(hidden_size_out, -1)
             )
 
             w3_weight_mxfp4, w3_sf_block = torch.ops.trtllm.fp4_quantize(
                 w3_weight, None, scaling_vector_size, True
             )
             w3_sf_block_unswizzled = torch.ops.trtllm.block_scale_interleave_reverse(
-                w3_sf_block.cpu().view(intermediate_size, -1)
+                w3_sf_block.view(intermediate_size, -1)
             )
 
             weights[f"{expert_id}.w1.weight"] = w1_weight_mxfp4

Original file line number	Diff line number	Diff line change
`@@ -1695,21 +1695,21 @@ def create_weights(self, **quant_kwargs) -> Dict[str, torch.Tensor]:`
`1695`	`1695`	`w1_weight, None, scaling_vector_size, True`
`1696`	`1696`	`)`
`1697`	`1697`	`w1_sf_block_unswizzled = torch.ops.trtllm.block_scale_interleave_reverse(`
`1698`		`- w1_sf_block.cpu().view(intermediate_size, -1)`
	`1698`	`+ w1_sf_block.view(intermediate_size, -1)`
`1699`	`1699`	`)`
`1700`	`1700`
`1701`	`1701`	`w2_weight_mxfp4, w2_sf_block = torch.ops.trtllm.fp4_quantize(`
`1702`	`1702`	`w2_weight, None, scaling_vector_size, True`
`1703`	`1703`	`)`
`1704`	`1704`	`w2_sf_block_unswizzled = torch.ops.trtllm.block_scale_interleave_reverse(`
`1705`		`- w2_sf_block.cpu().view(hidden_size_out, -1)`
	`1705`	`+ w2_sf_block.view(hidden_size_out, -1)`
`1706`	`1706`	`)`
`1707`	`1707`
`1708`	`1708`	`w3_weight_mxfp4, w3_sf_block = torch.ops.trtllm.fp4_quantize(`
`1709`	`1709`	`w3_weight, None, scaling_vector_size, True`
`1710`	`1710`	`)`
`1711`	`1711`	`w3_sf_block_unswizzled = torch.ops.trtllm.block_scale_interleave_reverse(`
`1712`		`- w3_sf_block.cpu().view(intermediate_size, -1)`
	`1712`	`+ w3_sf_block.view(intermediate_size, -1)`
`1713`	`1713`	`)`
`1714`	`1714`
`1715`	`1715`	`weights[f"{expert_id}.w1.weight"] = w1_weight_mxfp4`