fix: bench_moe 0-token rank deadlock, feasibility assertion

guqiqi · guqiqi · commit 2e9ada11a3e3 · 2026-07-05T09:39:51.000+08:00
Signed-off-by: guqiqi <29116997+guqiqi@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/modules/fused_moe/moe_scheduler.py b/tensorrt_llm/_torch/modules/fused_moe/moe_scheduler.py
@@ -150,6 +150,31 @@ def forward(
         else:
             all_rank_num_tokens_padded = all_rank_num_tokens
 
+        # ========== 0-token rank deadlock fix ==========
+        # When some ranks have 0 tokens in single-chunk forward with collective comm,
+        # those ranks hang in CUDA kernels (e.g. NVFP4 quantize_input with 0-row tensor)
+        # before reaching moe.comm.dispatch(), causing NCCL AllGather deadlock on
+        # non-zero ranks. Fix: activate DP padding uniformly across all ranks so every
+        # rank uses sizes=None (uniform allgather) and pads x/router_logits to max_tokens.
+        # Mirrors the empty-chunk substitution in _forward_multiple_chunks (line ~597-620).
+        # Existing truncation at line ~202 discards dummy-token outputs automatically.
+        if (
+            moe.comm is not None
+            and moe.use_dp
+            and all_rank_max_num_tokens > 0
+            and not use_dp_padding
+            and any(t == 0 for t in all_rank_num_tokens_padded)
+        ):
+            use_dp_padding = True
+            all_rank_num_tokens_padded = [all_rank_max_num_tokens] * len(all_rank_num_tokens)
+            local_n = x.shape[0]
+            if local_n < all_rank_max_num_tokens:
+                pad = all_rank_max_num_tokens - local_n
+                x = torch.cat([x, x.new_zeros((pad, x.shape[1]))], dim=0)
+                router_logits = torch.cat(
+                    [router_logits, router_logits.new_zeros((pad, router_logits.shape[1]))], dim=0
+                )
+
         # ========== Step 2: Determine communication method ==========
         num_chunks = moe.calculate_num_chunks(all_rank_num_tokens_padded)
 
diff --git a/tests/microbenchmarks/bench_moe/case_runner.py b/tests/microbenchmarks/bench_moe/case_runner.py
@@ -690,7 +690,11 @@ def _run_one_candidate(
                 mapping=mapping,
                 moe_backend=config.backend,
                 use_cuda_graph=bool(config.cuda_graph),
-                max_num_tokens=max(int(local_num_tokens), 1),
+                # Symmetric-memory comm backends (e.g. NVLINK_ONE_SIDED) size their
+                # workspace from max_num_tokens and require every rank to allocate the
+                # same size, so use the global per-rank maximum rather than this rank's
+                # local token count (which differs under uneven attention-DP shards).
+                max_num_tokens=max(int(max(per_rank)) if per_rank else 0, 1),
                 use_low_precision_moe_combine=bool(config.use_low_precision_moe_combine),
                 enable_perfect_router=enable_perfect_router,
                 dtype=act_dtype,
diff --git a/tests/microbenchmarks/bench_moe/cli.py b/tests/microbenchmarks/bench_moe/cli.py
@@ -222,8 +222,9 @@ def parse_args() -> argparse.Namespace:
         nargs="+",
         required=False,
         help=(
-            "Global token counts to sweep. Each value is balanced across ranks "
-            "with any remainder on rank 0. Example: --balanced_total_num_tokens 64 256 1024."
+            "Global token counts to sweep. Each value is balanced across ranks, "
+            "spreading any remainder one token per leading rank (e.g. world_size=4, "
+            "tokens=2 -> [1, 1, 0, 0]). Example: --balanced_total_num_tokens 64 256 1024."
         ),
     )
 
diff --git a/tests/microbenchmarks/bench_moe/mapping.py b/tests/microbenchmarks/bench_moe/mapping.py
@@ -90,12 +90,17 @@ def _resolve_mapping_layout(config: ConfigSpec, world_size: int) -> Tuple[int, i
 def _build_mapping_from_config(config: ConfigSpec, world_size: int) -> Mapping:
     """Build ``Mapping`` from a ``ConfigSpec`` + world size; sets ``rank=mpi_rank()``."""
     moe_ep, moe_tp, enable_dp = _resolve_mapping_layout(config, world_size)
+    # Use the GPU count visible to this process so that mapping.local_rank
+    # (= rank % gpus_per_node) is the correct device index; the Mapping default (8) is wrong
+    # on nodes with fewer GPUs. NOTE(review): assumes all nodes expose the same count -- confirm.
+    gpus_per_node = torch.cuda.device_count()
     mapping = Mapping(
         world_size=world_size,
         tp_size=world_size,
         moe_ep_size=moe_ep,
         moe_tp_size=moe_tp,
         enable_attention_dp=enable_dp,
+        gpus_per_node=gpus_per_node,
     )
     mapping.rank = mpi_rank()
     return mapping
diff --git a/tests/microbenchmarks/bench_moe/search.py b/tests/microbenchmarks/bench_moe/search.py
@@ -24,6 +24,7 @@
 
 import torch
 
+from tensorrt_llm._utils import local_mpi_size
 from tensorrt_llm.models.modeling_utils import QuantAlgo
 
 from .backend import MoeBackendType, get_backend_class
@@ -33,6 +34,24 @@
 _FUSED_COMM_BACKENDS = frozenset({"MEGAMOE_DEEPGEMM"})
 
 
+def _is_deepep_feasible(num_ranks: int) -> bool:
+    """Return True if DeepEP supports the given EP rank count on this node topology.
+
+    Intranode: num_ranks in {2, 4, 8} and num_ranks == local_mpi_size().
+    Internode: exactly 8 ranks per node and a whole number (2/4/8/16) of RDMA nodes.
+    Follows fused_moe_wide_ep.py::select_alltoall_method_type; partial nodes are rejected here.
+    """
+    _INTRANODE_RANKS = {2, 4, 8}
+    _REQUIRED_LOCAL_SIZE = 8
+    _INTERNODE_RDMA_NODES = {2, 4, 8, 16}
+    mpi_size = local_mpi_size()
+    if num_ranks == mpi_size and num_ranks in _INTRANODE_RANKS:
+        return True
+    if mpi_size != _REQUIRED_LOCAL_SIZE:
+        return False
+    return num_ranks % mpi_size == 0 and (num_ranks // mpi_size) in _INTERNODE_RDMA_NODES
+
+
 def _check_backend_can_implement(
     backend_str: str,
     quant_algo: Optional[QuantAlgo],
@@ -151,6 +170,61 @@ def is_candidate_valid(
             "use TEP/DEP only with other backends"
         )
 
+    # MegaMoEDeepGemm is EP-only (asserts moe_tp_size == 1 in __init__); DTP/TTP are invalid.
+    if config.backend.upper() == "MEGAMOE_DEEPGEMM" and moe_tp > 1:
+        return False, (
+            f"MEGAMOE_DEEPGEMM does not support MoE-TP (moe_tp_size={moe_tp}); "
+            "use DEP/TEP modes only"
+        )
+
+    # DENSEGEMM DTP: FC2 kernel requires (intermediate_size / moe_tp_size) % 256 == 0.
+    # DENSEGEMM __init__ only checks the full intermediate_size, so a model like
+    # DeepSeek V3 (intermediate_size=2048, 2048%256=0) passes __init__ but fails
+    # at runtime with moe_tp_size=16 (2048/16=128, 128%256!=0).
+    if config.backend.upper() == "DENSEGEMM" and moe_ep == 1 and moe_tp > 1:
+        if model.intermediate_size % moe_tp != 0:
+            return False, (
+                f"DENSEGEMM DTP: intermediate_size={model.intermediate_size} "
+                f"not divisible by moe_tp_size={moe_tp}"
+            )
+        per_tp_k = model.intermediate_size // moe_tp
+        _DENSEGEMM_MMA_TILE_K = 256
+        if per_tp_k % _DENSEGEMM_MMA_TILE_K != 0:
+            return False, (
+                f"DENSEGEMM DTP moe_tp_size={moe_tp}: intermediate_size/tp={per_tp_k} "
+                f"not aligned to FC2 MMA tile-K={_DENSEGEMM_MMA_TILE_K}"
+            )
+
+    # NVFP4 on CuteDSL / TRTLLM-Gen requires the per-partition intermediate size
+    # (intermediate_size / moe_tp_size) to be a multiple of the NVFP4 weight
+    # alignment (128). Unlike CUTLASS (which pads intermediate_size_per_partition
+    # up to 128), these backends use the unpadded logical size when laying out the
+    # block-scale tensor and fail during weight load: CUTEDSL raises a reshape
+    # RuntimeError (e.g. "shape '[-1, 192, 448]' is invalid for input of size
+    # 114688" — 192 padded to 256) and TRTLLM-Gen hits `assert intermediate_size %
+    # weight_alignment == 0`. Prune the unsupported combo with a clear reason
+    # instead of letting it crash mid-sweep. Example: DeepSeek-V4-Pro
+    # (intermediate_size=3072) at moe_tp_size=32 -> 3072/32=96, 96%128!=0.
+    if (
+        config.backend.upper() in ("CUTEDSL", "TRTLLM")
+        and model.quant_algo_enum == QuantAlgo.NVFP4
+        and moe_tp > 1
+    ):
+        _NVFP4_WEIGHT_ALIGNMENT = 128
+        if model.intermediate_size % moe_tp != 0:
+            return False, (
+                f"{config.backend.upper()} NVFP4: intermediate_size="
+                f"{model.intermediate_size} not divisible by moe_tp_size={moe_tp}"
+            )
+        per_tp_k = model.intermediate_size // moe_tp
+        if per_tp_k % _NVFP4_WEIGHT_ALIGNMENT != 0:
+            return False, (
+                f"{config.backend.upper()} NVFP4 moe_tp_size={moe_tp}: "
+                f"intermediate_size/tp={per_tp_k} not aligned to NVFP4 weight "
+                f"alignment={_NVFP4_WEIGHT_ALIGNMENT} (CUTLASS pads to 128, "
+                f"CUTEDSL/TRTLLM do not)"
+            )
+
     # Forced communication on non-DP / MoE-TP paths.
     forced = config.comm_method.upper()
     if forced not in ("AUTO", "NONE"):
@@ -160,6 +234,12 @@ def is_candidate_valid(
             return False, f"comm_method={forced} requires moe_tp_size=1 (got {moe_tp})"
         if world_size == 1:
             return False, f"comm_method={forced} has no effect at world_size=1"
+        if forced == "DEEPEP" and not _is_deepep_feasible(moe_ep):
+            return False, (
+                f"comm_method={forced}: moe_ep_size={moe_ep} not supported by DeepEP topology "
+                f"(local_mpi_size={local_mpi_size()}; supported: intranode {{2,4,8}}, "
+                f"internode 8-ranks/node x {{2,4,8,16}} nodes)"
+            )
 
     return True, None
 
diff --git a/tests/microbenchmarks/bench_moe/utils.py b/tests/microbenchmarks/bench_moe/utils.py
@@ -106,15 +106,18 @@ def _compute_stats(values: List[float]) -> Dict[str, float]:
 
 
 def _distribute_tokens(total: int, world_size: int) -> List[int]:
-    """Distribute ``total`` global tokens evenly across ``world_size`` ranks."""
+    """Distribute ``total`` global tokens evenly across ``world_size`` ranks.
+
+    Each rank gets ``total // world_size`` tokens and the first ``total % world_size``
+    ranks get one extra (instead of piling the whole remainder on rank 0), e.g.
+    (total=2, world_size=4) -> [1, 1, 0, 0]. Per-rank counts therefore differ by at
+    most 1, which the downstream symmetric-memory workspace sizing relies on.
+    Raises ``ValueError`` if ``world_size <= 0`` or ``total < 0``.
+    """
     if world_size <= 0 or total < 0:
         raise ValueError(f"invalid args: total={total}, world_size={world_size}")
-    if world_size == 1:
-        return [total]
-    base = total // world_size
-    out = [base] * world_size
-    out[0] += total - base * world_size
-    return out
+    base, rem = divmod(total, world_size)
+    return [base + (1 if i < rem else 0) for i in range(world_size)]
 
 
 def _validate_per_rank_token_list(