Skip to content

Commit 2e9ada1

Browse files
committed
fix: bench_moe 0-token rank deadlock, feasibility assertion
Signed-off-by: guqiqi <29116997+guqiqi@users.noreply.github.com>
1 parent 89e0ad3 commit 2e9ada1

6 files changed

Lines changed: 128 additions & 10 deletions

File tree

tensorrt_llm/_torch/modules/fused_moe/moe_scheduler.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,31 @@ def forward(
150150
else:
151151
all_rank_num_tokens_padded = all_rank_num_tokens
152152

153+
# ========== 0-token rank deadlock fix ==========
154+
# When some ranks have 0 tokens in single-chunk forward with collective comm,
155+
# those ranks hang in CUDA kernels (e.g. NVFP4 quantize_input with 0-row tensor)
156+
# before reaching moe.comm.dispatch(), causing NCCL AllGather deadlock on
157+
# non-zero ranks. Fix: activate DP padding uniformly across all ranks so every
158+
# rank uses sizes=None (uniform allgather) and pads x/router_logits to max_tokens.
159+
# Mirrors the empty-chunk substitution in _forward_multiple_chunks (line ~597-620).
160+
# Existing truncation at line ~202 discards dummy-token outputs automatically.
161+
if (
162+
moe.comm is not None
163+
and moe.use_dp
164+
and all_rank_max_num_tokens > 0
165+
and not use_dp_padding
166+
and any(t == 0 for t in all_rank_num_tokens_padded)
167+
):
168+
use_dp_padding = True
169+
all_rank_num_tokens_padded = [all_rank_max_num_tokens] * len(all_rank_num_tokens)
170+
local_n = x.shape[0]
171+
if local_n < all_rank_max_num_tokens:
172+
pad = all_rank_max_num_tokens - local_n
173+
x = torch.cat([x, x.new_zeros((pad, x.shape[1]))], dim=0)
174+
router_logits = torch.cat(
175+
[router_logits, router_logits.new_zeros((pad, router_logits.shape[1]))], dim=0
176+
)
177+
153178
# ========== Step 2: Determine communication method ==========
154179
num_chunks = moe.calculate_num_chunks(all_rank_num_tokens_padded)
155180

tests/microbenchmarks/bench_moe/case_runner.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -690,7 +690,11 @@ def _run_one_candidate(
690690
mapping=mapping,
691691
moe_backend=config.backend,
692692
use_cuda_graph=bool(config.cuda_graph),
693-
max_num_tokens=max(int(local_num_tokens), 1),
693+
# Symmetric-memory comm backends (e.g. NVLINK_ONE_SIDED) size their
694+
# workspace from max_num_tokens and require every rank to allocate the
695+
# same size, so use the global per-rank maximum rather than this rank's
696+
# local token count (which differs under uneven attention-DP shards).
697+
max_num_tokens=max(int(max(per_rank)) if per_rank else 0, 1),
694698
use_low_precision_moe_combine=bool(config.use_low_precision_moe_combine),
695699
enable_perfect_router=enable_perfect_router,
696700
dtype=act_dtype,

tests/microbenchmarks/bench_moe/cli.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -222,8 +222,9 @@ def parse_args() -> argparse.Namespace:
222222
nargs="+",
223223
required=False,
224224
help=(
225-
"Global token counts to sweep. Each value is balanced across ranks "
226-
"with any remainder on rank 0. Example: --balanced_total_num_tokens 64 256 1024."
225+
"Global token counts to sweep. Each value is balanced across ranks, "
226+
"spreading any remainder one token per leading rank (e.g. world_size=4, "
227+
"tokens=2 -> [1, 1, 0, 0]). Example: --balanced_total_num_tokens 64 256 1024."
227228
),
228229
)
229230

tests/microbenchmarks/bench_moe/mapping.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,12 +90,17 @@ def _resolve_mapping_layout(config: ConfigSpec, world_size: int) -> Tuple[int, i
9090
def _build_mapping_from_config(config: ConfigSpec, world_size: int) -> Mapping:
9191
"""Build ``Mapping`` from a ``ConfigSpec`` + world size; sets ``rank=mpi_rank()``."""
9292
moe_ep, moe_tp, enable_dp = _resolve_mapping_layout(config, world_size)
93+
# gpus_per_node must match actual visible GPUs per node so that
94+
# mapping.local_rank (= rank % gpus_per_node) gives the correct device index.
95+
# The Mapping default (8) is wrong for multi-node runs with fewer GPUs per node.
96+
gpus_per_node = torch.cuda.device_count()
9397
mapping = Mapping(
9498
world_size=world_size,
9599
tp_size=world_size,
96100
moe_ep_size=moe_ep,
97101
moe_tp_size=moe_tp,
98102
enable_attention_dp=enable_dp,
103+
gpus_per_node=gpus_per_node,
99104
)
100105
mapping.rank = mpi_rank()
101106
return mapping

tests/microbenchmarks/bench_moe/search.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424

2525
import torch
2626

27+
from tensorrt_llm._utils import local_mpi_size
2728
from tensorrt_llm.models.modeling_utils import QuantAlgo
2829

2930
from .backend import MoeBackendType, get_backend_class
@@ -33,6 +34,24 @@
3334
_FUSED_COMM_BACKENDS = frozenset({"MEGAMOE_DEEPGEMM"})
3435

3536

37+
def _is_deepep_feasible(num_ranks: int) -> bool:
38+
"""Return True if DeepEP supports the given EP rank count on this node topology.
39+
40+
Intranode: num_ranks in {2, 4, 8} and num_ranks == local_mpi_size().
41+
Internode: exactly 8 ranks per node, with 2/4/8/16 RDMA nodes.
42+
Mirrors the feasibility check in fused_moe_wide_ep.py::select_alltoall_method_type.
43+
"""
44+
_INTRANODE_RANKS = {2, 4, 8}
45+
_REQUIRED_LOCAL_SIZE = 8
46+
_INTERNODE_RDMA_NODES = {2, 4, 8, 16}
47+
mpi_size = local_mpi_size()
48+
if num_ranks == mpi_size and num_ranks in _INTRANODE_RANKS:
49+
return True
50+
if mpi_size != _REQUIRED_LOCAL_SIZE:
51+
return False
52+
return (num_ranks // mpi_size) in _INTERNODE_RDMA_NODES
53+
54+
3655
def _check_backend_can_implement(
3756
backend_str: str,
3857
quant_algo: Optional[QuantAlgo],
@@ -151,6 +170,61 @@ def is_candidate_valid(
151170
"use TEP/DEP only with other backends"
152171
)
153172

173+
# MegaMoEDeepGemm is EP-only (asserts moe_tp_size == 1 in __init__); DTP/TTP are invalid.
174+
if config.backend.upper() == "MEGAMOE_DEEPGEMM" and moe_tp > 1:
175+
return False, (
176+
f"MEGAMOE_DEEPGEMM does not support MoE-TP (moe_tp_size={moe_tp}); "
177+
"use DEP/TEP modes only"
178+
)
179+
180+
# DENSEGEMM DTP: FC2 kernel requires (intermediate_size / moe_tp_size) % 256 == 0.
181+
# DENSEGEMM __init__ only checks the full intermediate_size, so a model like
182+
# DeepSeek V3 (intermediate_size=2048, 2048%256=0) passes __init__ but fails
183+
# at runtime with moe_tp_size=16 (2048/16=128, 128%256!=0).
184+
if config.backend.upper() == "DENSEGEMM" and moe_ep == 1 and moe_tp > 1:
185+
if model.intermediate_size % moe_tp != 0:
186+
return False, (
187+
f"DENSEGEMM DTP: intermediate_size={model.intermediate_size} "
188+
f"not divisible by moe_tp_size={moe_tp}"
189+
)
190+
per_tp_k = model.intermediate_size // moe_tp
191+
_DENSEGEMM_MMA_TILE_K = 256
192+
if per_tp_k % _DENSEGEMM_MMA_TILE_K != 0:
193+
return False, (
194+
f"DENSEGEMM DTP moe_tp_size={moe_tp}: intermediate_size/tp={per_tp_k} "
195+
f"not aligned to FC2 MMA tile-K={_DENSEGEMM_MMA_TILE_K}"
196+
)
197+
198+
# NVFP4 on CuteDSL / TRTLLM-Gen requires the per-partition intermediate size
199+
# (intermediate_size / moe_tp_size) to be a multiple of the NVFP4 weight
200+
# alignment (128). Unlike CUTLASS (which pads intermediate_size_per_partition
201+
# up to 128), these backends use the unpadded logical size when laying out the
202+
# block-scale tensor and fail during weight load: CUTEDSL raises a reshape
203+
# RuntimeError (e.g. "shape '[-1, 192, 448]' is invalid for input of size
204+
# 114688" — 192 padded to 256) and TRTLLM-Gen hits `assert intermediate_size %
205+
# weight_alignment == 0`. Prune the unsupported combo with a clear reason
206+
# instead of letting it crash mid-sweep. Example: DeepSeek-V4-Pro
207+
# (intermediate_size=3072) at moe_tp_size=32 -> 3072/32=96, 96%128!=0.
208+
if (
209+
config.backend.upper() in ("CUTEDSL", "TRTLLM")
210+
and model.quant_algo_enum == QuantAlgo.NVFP4
211+
and moe_tp > 1
212+
):
213+
_NVFP4_WEIGHT_ALIGNMENT = 128
214+
if model.intermediate_size % moe_tp != 0:
215+
return False, (
216+
f"{config.backend.upper()} NVFP4: intermediate_size="
217+
f"{model.intermediate_size} not divisible by moe_tp_size={moe_tp}"
218+
)
219+
per_tp_k = model.intermediate_size // moe_tp
220+
if per_tp_k % _NVFP4_WEIGHT_ALIGNMENT != 0:
221+
return False, (
222+
f"{config.backend.upper()} NVFP4 moe_tp_size={moe_tp}: "
223+
f"intermediate_size/tp={per_tp_k} not aligned to NVFP4 weight "
224+
f"alignment={_NVFP4_WEIGHT_ALIGNMENT} (CUTLASS pads to 128, "
225+
f"CUTEDSL/TRTLLM do not)"
226+
)
227+
154228
# Forced communication on non-DP / MoE-TP paths.
155229
forced = config.comm_method.upper()
156230
if forced not in ("AUTO", "NONE"):
@@ -160,6 +234,12 @@ def is_candidate_valid(
160234
return False, f"comm_method={forced} requires moe_tp_size=1 (got {moe_tp})"
161235
if world_size == 1:
162236
return False, f"comm_method={forced} has no effect at world_size=1"
237+
if forced == "DEEPEP" and not _is_deepep_feasible(moe_ep):
238+
return False, (
239+
f"comm_method={forced}: moe_ep_size={moe_ep} not supported by DeepEP topology "
240+
f"(local_mpi_size={local_mpi_size()}; supported: intranode {{2,4,8}}, "
241+
f"internode 8-ranks/node x {{2,4,8,16}} nodes)"
242+
)
163243

164244
return True, None
165245

tests/microbenchmarks/bench_moe/utils.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -106,15 +106,18 @@ def _compute_stats(values: List[float]) -> Dict[str, float]:
106106

107107

108108
def _distribute_tokens(total: int, world_size: int) -> List[int]:
109-
"""Distribute ``total`` global tokens evenly across ``world_size`` ranks."""
109+
"""Distribute ``total`` global tokens evenly across ranks.
110+
111+
Remainder tokens are spread one-per-rank over the leading ranks (instead of
112+
piling the entire remainder on rank 0), so e.g. (total=2, world_size=4) ->
113+
[1, 1, 0, 0]. An even, non-degenerate split keeps every rank's per-rank token
114+
count within 1 of each other, which the downstream symmetric-memory workspace
115+
sizing relies on.
116+
"""
110117
if world_size <= 0 or total < 0:
111118
raise ValueError(f"invalid args: total={total}, world_size={world_size}")
112-
if world_size == 1:
113-
return [total]
114-
base = total // world_size
115-
out = [base] * world_size
116-
out[0] += total - base * world_size
117-
return out
119+
base, rem = divmod(total, world_size)
120+
return [base + (1 if i < rem else 0) for i in range(world_size)]
118121

119122

120123
def _validate_per_rank_token_list(

0 commit comments

Comments
 (0)