fix: bench-moe watchdog-timeout and kernel-crash handling, plus per-rank workload computation and MoE chunk counting for the non-attention-DP path

guqiqi · guqiqi · commit 2e959a00cd4b · 2026-06-25T10:01:29.000+08:00
Signed-off-by: guqiqi <29116997+guqiqi@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py b/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py
@@ -429,7 +429,12 @@ def calculate_num_chunks(self, all_rank_num_tokens: List[int]) -> int:
         if self.use_dp and self.comm is not None:
             num_rows = self._dp_padded_num_rows(all_rank_num_tokens)
         else:
-            num_rows = sum(all_rank_num_tokens)
+            # non-DP: no cross-rank dispatch. The scheduler fills all_rank_num_tokens
+            # from [x.shape[0]] before calling here, so it must be a single-element list.
+            assert len(all_rank_num_tokens) == 1, (
+                f"non-DP path expects a single-element list, got {len(all_rank_num_tokens)}"
+            )
+            num_rows = all_rank_num_tokens[0]
         return (num_rows + self.moe_max_num_tokens - 1) // self.moe_max_num_tokens
 
     def split_chunk(self, split_token_num: int, split_num_chunks: int) -> List[int]:
diff --git a/tests/microbenchmarks/bench_moe/case_runner.py b/tests/microbenchmarks/bench_moe/case_runner.py
@@ -561,14 +561,15 @@ def _resolve_layout_and_plan(
                 top_k=int(model.top_k),
                 num_experts=int(model.num_experts),
                 moe_ep_size=int(moe_ep_size),
+                enable_dp=bool(_enable_dp),
             )
         except Exception as exc:
             reason = f"routing plan error: {type(exc).__name__}: {exc}"
             _maybe_print_rank0(f"[bench_moe] {reason}")
             return _short_circuit(result, "skipped", reason)
         per_rank = list(routing_plan.per_rank_num_tokens)
     else:
-        per_rank = _per_rank_tokens(workload, world_size)
+        per_rank = _per_rank_tokens(workload, world_size, enable_dp=bool(_enable_dp))
 
     return int(moe_ep_size), per_rank, routing_plan
 
@@ -663,6 +664,11 @@ def _run_one_candidate(
     result.moe_tp_size = int(mapping.moe_tp_size)
     result.enable_attention_dp = bool(mapping.enable_attention_dp)
 
+    # TEP/TTP (no attention DP): no cross-rank dispatch; the scheduler fills
+    # all_rank_num_tokens from x.shape[0]. Pass None to follow that path.
+    if not mapping.enable_attention_dp:
+        all_rank_num_tokens = None
+
     AutoTuner.get().setup_distributed_state(mapping)
     AutoTuner.get().clear_cache()
 
diff --git a/tests/microbenchmarks/bench_moe/cli.py b/tests/microbenchmarks/bench_moe/cli.py
@@ -29,6 +29,7 @@
 from tensorrt_llm.models.modeling_utils import QuantAlgo
 
 from .backend import MoeBackendType
+from .mapping import _resolve_mapping_layout
 from .routing import _per_rank_tokens
 from .search import (
     _coerce_str_tuple,
@@ -91,7 +92,14 @@ def _build_worker_header(ctx: _BenchmarkContext, launcher: str, world_size: int)
         "world_size": world_size,
         "analysis": list(ctx.analysis) or ["summary"],
         "workloads": [
-            w.to_dict(per_rank_num_tokens=_per_rank_tokens(w, world_size)) for w in ctx.workloads
+            w.to_dict(
+                per_rank_num_tokens=_per_rank_tokens(
+                    w,
+                    world_size,
+                    enable_dp=bool(_resolve_mapping_layout(ctx.base_config, world_size)[2]),
+                )
+            )
+            for w in ctx.workloads
         ],
         "base_config": ctx.base_config.to_dict(),
     }
diff --git a/tests/microbenchmarks/bench_moe/results.py b/tests/microbenchmarks/bench_moe/results.py
@@ -21,6 +21,7 @@
 
 from tensorrt_llm._utils import mpi_allgather
 
+from .mapping import _resolve_mapping_layout
 from .routing import _per_rank_tokens
 from .specs import ConfigSpec, ModelSpec, RunResult, WorkloadSpec
 from .utils import _compute_stats
@@ -407,7 +408,8 @@ def _make_skipped_run_result(
     r = RunResult(model=model, workload=workload, config=config)
     r.status = "skipped"
     r.skip_reason = reason
-    r.per_rank_num_tokens = _per_rank_tokens(workload, world_size)
+    _, _, _enable_dp = _resolve_mapping_layout(config, world_size)
+    r.per_rank_num_tokens = _per_rank_tokens(workload, world_size, enable_dp=bool(_enable_dp))
     r.status_per_rank = {f"rank{i}": "skipped" for i in range(world_size)}
     r.instrumentation = {
         "level": ",".join(sorted(analysis)) if analysis else "summary",
diff --git a/tests/microbenchmarks/bench_moe/routing/builders.py b/tests/microbenchmarks/bench_moe/routing/builders.py
@@ -115,23 +115,35 @@ def _build_per_rank_num_tokens(
     spec: RoutingControlSpec,
     num_tokens: int,
     world_size: int,
+    enable_dp: bool,
 ) -> List[int]:
     """Resolve ``per_rank_num_tokens`` for a workload.
 
-    Explicit ``spec.per_rank_num_tokens`` wins; otherwise tokens are split
-    evenly across ranks with any remainder on rank 0.
+    Explicit ``spec.per_rank_num_tokens`` wins; otherwise the token count per
+    rank depends on the attention-DP setting:
+
+    * ``enable_dp=True``  (DEP / DTP): tokens are DP-sharded across ranks, so
+      each rank holds about ``num_tokens / world_size`` (remainder on rank 0).
+    * ``enable_dp=False`` (TEP / TTP): attention is tensor-parallel, so every
+      rank sees the complete batch and holds ``num_tokens``.
+
+    When an explicit list is provided its sum is validated against the expected
+    total (``num_tokens`` for DP modes, ``num_tokens * world_size`` for non-DP).
     """
     if spec.per_rank_num_tokens is None:
+        if not enable_dp:
+            return [int(num_tokens)] * world_size
         return _distribute_tokens(int(num_tokens), world_size)
+    expected_total = int(num_tokens) * (1 if enable_dp else world_size)
     return _validate_per_rank_token_list(
-        spec.per_rank_num_tokens, world_size=world_size, expected_total=int(num_tokens)
+        spec.per_rank_num_tokens, world_size=world_size, expected_total=expected_total
     )
 
 
-def _per_rank_tokens(workload: WorkloadSpec, world_size: int) -> List[int]:
+def _per_rank_tokens(workload: WorkloadSpec, world_size: int, enable_dp: bool) -> List[int]:
     """Materialize the ``per_rank_num_tokens`` list for a workload + world size."""
     return _build_per_rank_num_tokens(
-        workload.routing_control, int(workload.num_tokens), world_size
+        workload.routing_control, int(workload.num_tokens), world_size, enable_dp
     )
 
 
@@ -309,6 +321,7 @@ def _build_routing_plan(
     top_k: int,
     num_experts: int,
     moe_ep_size: int,
+    enable_dp: bool,
 ) -> RoutingPlan:
     """Translate a ``RoutingControlSpec`` into a canonical normalised plan."""
     if moe_ep_size <= 0 or num_experts % moe_ep_size != 0:
@@ -318,7 +331,7 @@ def _build_routing_plan(
     experts_per_rank = num_experts // moe_ep_size
     if top_k > num_experts:
         raise ValueError(f"top_k ({top_k}) must be <= num_experts ({num_experts})")
-    per_rank = _build_per_rank_num_tokens(spec, num_tokens, world_size)
+    per_rank = _build_per_rank_num_tokens(spec, num_tokens, world_size, enable_dp)
     # The dispatch matrix is indexed by EP rank on both axes. The current
     # worker only calls routing-control planning when ``moe_ep_size`` equals
     # ``world_size`` so that this EP-axis matrix also matches the user-visible
diff --git a/tests/microbenchmarks/bench_moe/worker.py b/tests/microbenchmarks/bench_moe/worker.py
@@ -28,7 +28,7 @@
 import time
 import traceback
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple
 
 import torch
 from mpi4py import MPI
@@ -70,6 +70,18 @@ def _try_import(module_path: str, attr: Optional[str] = None, default: Any = Non
 POISON_HERE_PREFIX = "cuda_context_poisoned_after_success"
 POISON_UPSTREAM_PREFIX = "cuda_context_poisoned_upstream"
 WATCHDOG_UPSTREAM_PREFIX = "watchdog_timeout_upstream"
+# Terminal (status="failed") marker for a candidate the watchdog killed for
+# exceeding its wall-clock budget. NOT suffixed "_upstream": is_completed_for_resume
+# treats status="failed" as terminal, so --resume_from SKIPS it (does not re-attempt
+# and re-hang) while still surfacing the hang as a result row with a clear reason.
+WATCHDOG_TIMEOUT_PREFIX = "watchdog_timeout"
+# Terminal (status="failed") placeholder pre-written for the in-flight candidate
+# BEFORE it runs. If the process dies mid-candidate in a way nothing else can
+# record -- a CUDA device-side assert that aborts the MPI step, OOM-kill,
+# SIGSEGV, node loss -- this persisted row makes the candidate terminal so
+# --resume_from skips it and advances. Replaced with the real result on normal
+# completion. Like WATCHDOG_TIMEOUT_PREFIX, NOT suffixed "_upstream".
+INCOMPLETE_PREFIX = "incomplete"
 BENCH_MOE_POISON_EXIT_CODE = 75
 
 
@@ -182,11 +194,29 @@ def allreduce_poison_reason(local_reason: Optional[str]) -> Optional[str]:
 
 
 class CandidateWatchdog:
-    """Hard wall-clock guard around one candidate; SIGKILLs the process on timeout."""
+    """Hard wall-clock guard around one candidate; SIGKILLs the process on timeout.
+
+    On timeout the guard first invokes ``on_timeout`` (used to record the hung
+    candidate as a terminal ``failed`` result + checkpoint so it is not silently
+    lost and is skipped on ``--resume_from`` rather than re-attempted), then
+    SIGKILLs to break the wedged CUDA/NCCL state. A genuine hang cannot be
+    recovered in-process, so the kill is unavoidable; ``on_timeout`` makes it a
+    recorded outcome instead of a vanished one.
+    """
 
-    def __init__(self, budget_s: float, label: str):
+    def __init__(
+        self,
+        budget_s: float,
+        label: str,
+        on_timeout: Optional[Callable[[], None]] = None,
+        rank0_persist_grace_s: float = 8.0,
+    ):
         self._budget_s = float(budget_s)
         self._label = label
+        self._on_timeout = on_timeout
+        # Non-rank-0 ranks wait this long before SIGKILL so rank 0 can persist the
+        # checkpoint before the first task exit tears down the whole srun step.
+        self._rank0_persist_grace_s = float(rank0_persist_grace_s)
         self._cancelled = threading.Event()
         self._thread: Optional[threading.Thread] = None
 
@@ -211,16 +241,35 @@ def __exit__(self, exc_type, exc, tb) -> bool:
     def _guard(self) -> None:
         if self._cancelled.wait(self._budget_s):
             return
+        rank = mpi_rank()
         try:
             sys.stderr.write(
                 f"[bench_moe watchdog] candidate '{self._label}' exceeded "
                 f"{self._budget_s:.1f}s budget on pid={os.getpid()} "
-                f"rank={mpi_rank()}; sending SIGKILL to break suspected "
-                f"NCCL deadlock or CUDA hang.\n"
+                f"rank={rank}; recording it as a failed (timeout) result, then "
+                f"sending SIGKILL to break suspected NCCL deadlock or CUDA hang.\n"
             )
             sys.stderr.flush()
         except Exception:
             pass
+        # Record the hung candidate as a terminal failed row + checkpoint. Rank 0
+        # writes; other ranks no-op (see _emit_checkpoint_report). The main thread
+        # is blocked in a GIL-releasing CUDA call, so this watchdog thread can run.
+        if self._on_timeout is not None:
+            try:
+                self._on_timeout()
+            except Exception as exc:  # never let bookkeeping block the kill
+                try:
+                    sys.stderr.write(
+                        f"[bench_moe watchdog] on_timeout callback failed "
+                        f"({type(exc).__name__}: {exc}); killing anyway.\n"
+                    )
+                    sys.stderr.flush()
+                except Exception:
+                    pass
+        # Let rank 0 flush its checkpoint before the first SIGKILL aborts the step.
+        if rank != 0 and self._rank0_persist_grace_s > 0:
+            time.sleep(self._rank0_persist_grace_s)
         os.kill(os.getpid(), signal.SIGKILL)
 
 
@@ -518,8 +567,52 @@ def _run_benchmark_worker_under_current_mpi(args: argparse.Namespace, launcher:
         )
         _maybe_print_rank0(f"[bench_moe] running {case_label}")
 
+        # Pre-write a terminal "failed" placeholder for THIS candidate and
+        # checkpoint it BEFORE running. If the process then dies mid-candidate in
+        # a way nothing else can catch -- a CUDA device-side assert that aborts
+        # the MPI step, OOM-kill, SIGSEGV, node loss -- this persisted row keeps
+        # the candidate terminal, so --resume_from skips it and advances to the
+        # next one instead of re-attempting (and re-crashing on) the same
+        # candidate forever. On normal completion it is replaced with the real
+        # result below. Only the in-flight candidate gets a placeholder;
+        # not-yet-run candidates have no row and are still attempted on resume.
+        placeholder = _make_skipped_run_result(
+            model=ctx.model,
+            workload=workload,
+            config=cand,
+            world_size=world_size,
+            analysis=ctx.analysis,
+            reason=(
+                f"{INCOMPLETE_PREFIX}: process died before this candidate "
+                f"finished (crash/abort/OOM/kill) ({case_label})"
+            ),
+        )
+        placeholder.status = "failed"
+        placeholder.status_per_rank = {f"rank{i}": "incomplete" for i in range(world_size)}
+        accumulated_rows.append(_runresult_to_row(placeholder))
+        _emit_checkpoint_report(args=args, ctx=ctx, rows=accumulated_rows, world_size=world_size)
+
+        # If the watchdog fires (suspected hang), overwrite the placeholder's
+        # reason with the precise timeout text and re-checkpoint before SIGKILL,
+        # so the hang is surfaced as a clear result (the row is already terminal).
+        def _record_watchdog_timeout(
+            _label: str = case_label,
+            _budget_s: float = watchdog_budget_s,
+        ) -> None:
+            if accumulated_rows:
+                accumulated_rows[-1]["skip_reason"] = (
+                    f"{WATCHDOG_TIMEOUT_PREFIX}: exceeded {_budget_s:.0f}s; "
+                    f"suspected NCCL/CUDA hang ({_label})"
+                )
+                accumulated_rows[-1]["status_per_rank"] = {
+                    f"rank{i}": "timeout" for i in range(world_size)
+                }
+            _emit_checkpoint_report(
+                args=args, ctx=ctx, rows=accumulated_rows, world_size=world_size
+            )
+
         # Hard wall-clock guard around the actual candidate execution.
-        with CandidateWatchdog(watchdog_budget_s, case_label):
+        with CandidateWatchdog(watchdog_budget_s, case_label, on_timeout=_record_watchdog_timeout):
             with torch.device(device):
                 r = _run_one_candidate(
                     model=ctx.model,
@@ -539,8 +632,10 @@ def _run_benchmark_worker_under_current_mpi(args: argparse.Namespace, launcher:
                     input_cache=input_cache,
                     enable_perfect_router_requested=bool(args.enable_perfect_router),
                 )
+        # Candidate finished normally: replace the pre-written placeholder (the
+        # last row) with the real result.
         row = _runresult_to_row(r)
-        accumulated_rows.append(row)
+        accumulated_rows[-1] = row
         if rank == 0:
             print(json.dumps(row, indent=2), flush=True)