bench_moe: add routing-control support for DTP/TTP layouts (moe_ep_size != world_size)

guqiqi · guqiqi · commit 89e0ad370456 · 2026-07-05T09:39:51.000+08:00
Signed-off-by: guqiqi <29116997+guqiqi@users.noreply.github.com>
diff --git a/tests/microbenchmarks/bench_moe/case_runner.py b/tests/microbenchmarks/bench_moe/case_runner.py
@@ -344,6 +344,7 @@ def _select_routing_inputs(
     routing_plan: RoutingPlan,
     rank: int,
     moe_ep_size: int,
+    enable_attention_dp: bool,
     base_router_logits: torch.Tensor,
     device: torch.device,
     act_dtype: torch.dtype,
@@ -411,6 +412,21 @@ def _select_routing_inputs(
     except Exception as exc:
         return None, _RoutingSkip(f"native logits projection error: {type(exc).__name__}: {exc}")
 
+    # In attention-DP + MoE-TP layouts (DTP / CUSTOM-DP), _project_router_logits
+    # returns logits shaped [agg_tokens, E] covering all DP shards aggregated
+    # onto ep_axis_rank.  The MoE internally allgathers each rank's local
+    # router_logits before routing, so each rank must supply only its local
+    # slice [offset_r : offset_r + n_r] of the full projected tensor.
+    world_size_inferred = len(routing_plan.per_rank_num_tokens)
+    if enable_attention_dp and int(moe_ep_size) < world_size_inferred:
+        offset = sum(
+            routing_plan.per_rank_num_tokens[s]
+            for s in range(world_size_inferred)
+            if s % int(moe_ep_size) == ep_axis_rank and s < rank
+        )
+        local_n = routing_plan.per_rank_num_tokens[rank]
+        new_logits = new_logits[offset : offset + local_n]
+
     if projection_status != "exact" and rc_spec.projection_policy == "reject":
         return None, _RoutingSkip(
             skip_reason=(
@@ -536,21 +552,6 @@ def _resolve_layout_and_plan(
     except ValueError as exc:
         return _short_circuit(result, "skipped", str(exc))
 
-    # Routing-control's dispatch_matrix axis is ``moe_ep_size`` while
-    # ``per_rank_num_tokens`` follows the world (DP source) axis. When the two
-    # disagree (DTP/TTP/CUSTOM with ``moe_ep_size != world_size``) the plan
-    # either crashes inside ``_build_routing_plan`` or silently drops the
-    # tokens of world ranks beyond ``moe_ep_size``. Skip cleanly.
-    if rc_active and int(moe_ep_size) != int(world_size):
-        return _short_circuit(
-            result,
-            "skipped",
-            f"routing-control requires moe_ep_size == world_size "
-            f"(got moe_ep_size={moe_ep_size}, world_size={world_size}); "
-            "the dispatch_matrix axis would not align with the per-rank token "
-            "distribution. Use parallel_mode in {DEP, TEP} or drop routing-control.",
-        )
-
     routing_plan: Optional[RoutingPlan] = None
     if rc_active:
         try:
@@ -741,6 +742,7 @@ def _run_one_candidate(
                 routing_plan=routing_plan,
                 rank=rank,
                 moe_ep_size=int(moe_ep_size),
+                enable_attention_dp=bool(result.enable_attention_dp),
                 base_router_logits=router_logits,
                 device=device,
                 act_dtype=act_dtype,
diff --git a/tests/microbenchmarks/bench_moe/routing/builders.py b/tests/microbenchmarks/bench_moe/routing/builders.py
@@ -147,23 +147,56 @@ def _per_rank_tokens(workload: WorkloadSpec, world_size: int, enable_dp: bool) -
     )
 
 
+def _aggregate_dispatch_source_tokens(
+    per_rank_num_tokens: List[int],
+    ep_size: int,
+    enable_dp: bool,
+) -> List[int]:
+    """Collapse world-rank token counts into one count per EP-source row.
+
+    World rank ``r`` feeds EP row ``r % ep_size``. With attention-DP each
+    world rank holds its own tokens, so counts landing on one row are summed.
+    Without it, only the leading ``ep_size`` world ranks are counted and rows
+    with no matching world rank stay at zero. Returns ``[]`` when ``ep_size``
+    is not positive.
+    """
+    if ep_size <= 0:
+        return []
+    if len(per_rank_num_tokens) == ep_size:
+        return list(map(int, per_rank_num_tokens))
+
+    if enable_dp:
+        # Distinct shards: fold every world rank onto its EP row.
+        totals = [0] * ep_size
+        for world_rank, count in enumerate(per_rank_num_tokens):
+            totals[world_rank % ep_size] += int(count)
+        return totals
+
+    # Shared tokens: keep the leading ranks and zero-fill any missing rows.
+    head = [int(count) for count in per_rank_num_tokens[:ep_size]]
+    return head + [0] * (ep_size - len(head))
+
+
 def _build_dispatch_matrix(
     comm_pattern: str,
     per_rank_num_tokens: List[int],
     top_k: int,
     ep_size: int,
+    enable_dp: bool,
     seed: int = 0,
 ) -> List[List[int]]:
     """Build the canonical slot ``dispatch_matrix`` for ``comm_pattern``.
 
-    Row sums always equal ``per_rank_num_tokens[src] * top_k``. The matrix is
-    a pure planning artefact: it does not enforce per-token uniqueness yet.
-    That constraint is checked at materialisation time.
+    Row sums equal the EP-source token counts projected from
+    ``per_rank_num_tokens`` times ``top_k``. When world ranks outnumber EP
+    ranks (DTP / TTP / CUSTOM MoE-TP layouts), multiple world-rank rows are
+    aggregated onto the same EP-source row.
     """
     name, kwargs = _parse_comm_pattern(comm_pattern)
+    source_tokens = _aggregate_dispatch_source_tokens(per_rank_num_tokens, ep_size, enable_dp)
     matrix: List[List[int]] = [[0] * ep_size for _ in range(ep_size)]
     for src in range(ep_size):
-        row_total = int(per_rank_num_tokens[src]) * int(top_k)
+        row_total = int(source_tokens[src]) * int(top_k)
         if row_total == 0:
             continue
         if name == "file":
@@ -332,10 +365,9 @@ def _build_routing_plan(
     if top_k > num_experts:
         raise ValueError(f"top_k ({top_k}) must be <= num_experts ({num_experts})")
     per_rank = _build_per_rank_num_tokens(spec, num_tokens, world_size, enable_dp)
-    # The dispatch matrix is indexed by EP rank on both axes. The current
-    # worker only calls routing-control planning when ``moe_ep_size`` equals
-    # ``world_size`` so that this EP-axis matrix also matches the user-visible
-    # per-rank token list.
+    # The dispatch matrix stays on EP axes. When MoE-TP makes multiple world
+    # ranks share one EP rank, the world-rank token counts are aggregated onto
+    # the corresponding EP-source row before building the matrix.
     if spec.routing_pattern_file:
         default_patterns = {("balanced_alltoall", "balanced"), ("random", "random")}
         if (spec.comm_pattern, spec.expert_pattern) not in default_patterns:
@@ -348,26 +380,32 @@ def _build_routing_plan(
         )
     else:
         dispatch_matrix = _build_dispatch_matrix(
-            spec.comm_pattern, per_rank, top_k, moe_ep_size, seed=spec.seed
+            spec.comm_pattern,
+            per_rank,
+            top_k,
+            moe_ep_size,
+            enable_dp=enable_dp,
+            seed=spec.seed,
         )
         expert_histogram = _build_expert_histogram(
             spec.expert_pattern, dispatch_matrix, experts_per_rank, moe_ep_size, seed=spec.seed
         )
 
     # Per-row sums are an invariant; emit a clearer error than the materialiser would.
+    source_tokens = _aggregate_dispatch_source_tokens(per_rank, moe_ep_size, enable_dp)
     for src in range(moe_ep_size):
-        expected = int(per_rank[src]) * int(top_k) if src < len(per_rank) else 0
+        expected = int(source_tokens[src]) * int(top_k) if src < len(source_tokens) else 0
         actual = sum(dispatch_matrix[src])
         if actual != expected:
             raise ValueError(
-                f"dispatch_matrix row {src} sums to {actual}, expected per_rank_num_tokens[{src}] * top_k = {expected}"
+                f"dispatch_matrix row {src} sums to {actual}, expected aggregate source tokens * top_k = {expected}"
             )
     # Global expert histogram total must match total slots.
-    total_slots = sum(int(t) for t in per_rank) * int(top_k)
+    total_slots = sum(int(t) for t in source_tokens) * int(top_k)
     hist_total = sum(sum(row) for row in expert_histogram)
     if hist_total != total_slots:
         raise ValueError(
-            f"expert_histogram sum={hist_total} must equal sum(per_rank_num_tokens) * top_k = {total_slots}"
+            f"expert_histogram sum={hist_total} must equal aggregate source tokens * top_k = {total_slots}"
         )
 
     return RoutingPlan(
diff --git a/tests/microbenchmarks/bench_moe/routing/materialize.py b/tests/microbenchmarks/bench_moe/routing/materialize.py
@@ -55,14 +55,21 @@ def _flatten_plan_slots_for_rank(
     experts_per_rank: int,
     moe_ep_size: int,
 ) -> List[int]:
-    """Flatten one plan row into expert ids while preserving slot counts."""
-    local_num_tokens = int(plan.per_rank_num_tokens[src_rank])
+    """Flatten one plan row into expert ids while preserving slot counts.
+
+    ``local_num_tokens`` is derived from the dispatch-matrix row sum rather
+    than from ``per_rank_num_tokens[src_rank]``.  In MoE-TP + attention-DP
+    layouts (DTP / CUSTOM-DP) the dispatch matrix is EP-axis indexed while
+    ``per_rank_num_tokens`` is world-rank indexed; the row sum is always the
+    correct EP-axis aggregate (``source_tokens[src_rank] * top_k``).
+    """
     row = list(plan.dispatch_matrix[src_rank])
-    if sum(row) != local_num_tokens * top_k:
+    row_sum = sum(row)
+    if top_k > 0 and row_sum % top_k != 0:
         raise ValueError(
-            f"dispatch_matrix row sum ({sum(row)}) must equal local_num_tokens*top_k "
-            f"({local_num_tokens * top_k}) for rank {src_rank}"
+            f"dispatch_matrix row {src_rank} sum ({row_sum}) is not divisible by top_k ({top_k})"
         )
+    local_num_tokens = row_sum // top_k if top_k > 0 else 0
 
     flat: List[int] = []
     for dst in range(moe_ep_size):
@@ -174,7 +181,13 @@ def _materialize_selected_experts_for_rank(
       4. Run a small repair pass that swaps duplicated expert ids between
          rows until each token has ``top_k`` distinct experts.
     """
-    local_num_tokens = int(plan.per_rank_num_tokens[src_rank])
+    # Derive the effective token count from the dispatch-matrix row sum so that
+    # MoE-TP + attention-DP layouts (DTP / CUSTOM-DP) are handled correctly.
+    # In those layouts the row sum equals the aggregated source tokens for the
+    # EP rank, while per_rank_num_tokens[src_rank] would only reflect one DP
+    # shard's contribution.
+    row_sum = sum(plan.dispatch_matrix[src_rank])
+    local_num_tokens = row_sum // max(top_k, 1)
     if local_num_tokens == 0:
         ids = torch.zeros((0, top_k), dtype=torch.int32, device=device)
         scales = torch.zeros((0, top_k), dtype=scale_dtype, device=device)
diff --git a/tests/microbenchmarks/bench_moe/routing/native_logits.py b/tests/microbenchmarks/bench_moe/routing/native_logits.py
@@ -123,7 +123,13 @@ def _project_router_logits_for_plan(
     Returns ``(router_logits, status, reason)`` where ``status`` is one of
     ``"exact"``, ``"projected"``, or ``"rejected"``.
     """
-    local_num_tokens = int(plan.per_rank_num_tokens[src_rank])
+    # Derive the effective token count from the dispatch-matrix row sum.
+    # In MoE-TP + attention-DP layouts (DTP / CUSTOM-DP) the row sum equals
+    # the aggregated source tokens for the EP rank (which is what the router
+    # sees after the in-MoE allgather), while per_rank_num_tokens[src_rank]
+    # would only cover one DP shard.
+    row_sum = sum(plan.dispatch_matrix[src_rank])
+    local_num_tokens = row_sum // max(top_k, 1) if row_sum > 0 else 0
     if local_num_tokens == 0:
         return (
             torch.empty((0, num_experts), dtype=dtype, device=device),