[#9164][feat] AutoDeploy: noaux_tc MoE routing pattern matcher

guan404ming · guan404ming · commit 7dda05efb0e6 · 2026-05-29T17:09:17.000+08:00
Signed-off-by: Guan-Ming (Wesley) Chiu <105915352+guan404ming@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/auto_deploy/config/default.yaml b/tensorrt_llm/_torch/auto_deploy/config/default.yaml
@@ -72,6 +72,8 @@ transforms:
     stage: pattern_matcher
   match_moe_routing_pattern:
     stage: pattern_matcher
+  match_noaux_tc_pattern:
+    stage: pattern_matcher
   ############################################################################################
   # RUN TRANSFORMATIONS ON STANDARDIZED GRAPH REPRESENTATION
   ############################################################################################
diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/moe_routing.py b/tensorrt_llm/_torch/auto_deploy/transform/library/moe_routing.py
@@ -28,6 +28,9 @@
 
 The fused kernel avoids computing softmax over all experts (e.g. 256), instead
 finding top-k from raw logits and computing softmax only over the k selected values.
+
+Also detects the noaux_tc routing pattern used by DeepSeek-V3 / NemotronH /
+GLM4-MoE / Kimi-K2, replacing it with ``torch.ops.trtllm.noaux_tc_op``.
 """
 
 import operator
@@ -223,3 +226,298 @@ def _apply(
             has_valid_shapes=num_matches == 0,
         )
         return gm, info
+
+
+# ---------------------------------------------------------------------------
+# noaux_tc routing pattern helpers
+# ---------------------------------------------------------------------------
+
+# Op overloads accepted as a "top-k" step when walking the routing chain.
+_TOPK_OPS = (torch.ops.aten.topk.default,)
+# Op overloads accepted as a reshape step; ``view`` and ``reshape`` are treated alike.
+_VIEW_OPS = (torch.ops.aten.view.default, torch.ops.aten.reshape.default)
+# Op overloads accepted as the ``scores + bias`` addition.
+_ADD_OPS = (torch.ops.aten.add.Tensor,)
+
+
+def _scalar_int(node_or_value) -> Optional[int]:
+    """Return *node_or_value* as a Python int if it is a literal int, else None.
+
+    Graph arguments may be literals or ``Node`` objects; only a plain integer
+    literal is accepted. ``bool`` is rejected explicitly: it is a subclass of
+    ``int``, so a bare ``isinstance(..., int)`` check would let ``True`` /
+    ``False`` through as ``1`` / ``0``.
+    """
+    if isinstance(node_or_value, bool):
+        return None
+    if isinstance(node_or_value, int):
+        return node_or_value
+    return None
+
+
+def _find_bias_add_after_sigmoid(sigmoid_node: Node) -> Optional[Tuple[Node, Node]]:
+    """Locate the ``scores + bias`` addition that consumes *sigmoid_node*.
+
+    The sigmoid output may sit on either side of the addition; the other
+    operand must itself be a graph ``Node`` to count as the bias.
+
+    Returns:
+        ``(add_node, bias_node)`` for the first matching user, or ``None``.
+    """
+    for add_node in sigmoid_node.users:
+        if is_op(add_node, _ADD_OPS):
+            lhs, rhs = add_node.args[0], add_node.args[1]
+            # Try the sigmoid as the left operand first, then as the right one.
+            for scores, other in ((lhs, rhs), (rhs, lhs)):
+                if scores is sigmoid_node and isinstance(other, Node):
+                    return add_node, other
+    return None
+
+
+def _find_group_topk(scores_with_bias: Node) -> Optional[Tuple[Node, int]]:
+    """Locate ``topk(view(scores_with_bias, ...), k=2)`` and read ``n_group`` from the view.
+
+    ``n_group`` is taken from the second-to-last entry of the view's target
+    shape and must be a literal int.
+
+    Returns:
+        ``(topk_node, n_group)`` for the first match, or ``None``.
+    """
+    for reshaped in scores_with_bias.users:
+        if not is_op(reshaped, _VIEW_OPS) or len(reshaped.args) < 2:
+            continue
+        # The target shape is the second positional arg (a list of ints/Nodes).
+        target_shape = reshaped.args[1]
+        if not (isinstance(target_shape, (list, tuple)) and len(target_shape) >= 2):
+            continue
+        n_group = _scalar_int(target_shape[-2])
+        if n_group is None:
+            continue
+        for consumer in reshaped.users:
+            if not is_op(consumer, _TOPK_OPS):
+                continue
+            if _scalar_int(consumer.args[1]) == 2:
+                return consumer, n_group
+    return None
+
+
+def _find_outer_topk(masked_node: Node) -> Optional[Tuple[Node, int]]:
+    """Locate the ``topk(masked, k=top_k)`` that consumes *masked_node*.
+
+    The top-k may consume *masked_node* directly or through a single
+    intervening view/reshape. ``k`` must be a literal int.
+
+    Returns:
+        ``(topk_node, top_k)`` for the first match, or ``None``.
+    """
+    for consumer in masked_node.users:
+        if is_op(consumer, _TOPK_OPS):
+            candidates = (consumer,)
+        elif is_op(consumer, _VIEW_OPS):
+            # Allow one view in between: look at the view's own users.
+            candidates = tuple(consumer.users)
+        else:
+            continue
+        for topk_node in candidates:
+            if not is_op(topk_node, _TOPK_OPS):
+                continue
+            k = _scalar_int(topk_node.args[1])
+            if k is not None:
+                return topk_node, k
+    return None
+
+
+def _descends_from(node: Node, target: Node, max_depth: int = 10) -> bool:
+    """Return True if *target* is reachable from *node*'s input ancestry within max_depth hops.
+
+    The search is breadth-first, one hop per level, so each node is first
+    reached along its shortest input path. A depth-first walk that shares a
+    single visited set can reach an intermediate node near the depth limit
+    first, mark it visited, and then refuse to expand it again when a shorter
+    path arrives, missing a target that is in fact within range.
+
+    Args:
+        node: Node whose inputs (transitively) are searched; *node* itself counts
+            as zero hops away.
+        target: Node to look for.
+        max_depth: Maximum number of input edges to follow from *node*.
+
+    Returns:
+        True if *target* is *node* or one of its ancestors at most *max_depth*
+        hops away; False otherwise, including when either argument is not a Node.
+    """
+    if not isinstance(node, Node) or not isinstance(target, Node):
+        return False
+    if node is target:
+        return True
+    seen = {node}
+    level = [node]
+    for _ in range(max_depth):
+        next_level = []
+        for current in level:
+            for inp in current.all_input_nodes:
+                if inp is target:
+                    return True
+                if inp not in seen:
+                    seen.add(inp)
+                    next_level.append(inp)
+        if not next_level:
+            # Ran out of ancestors before hitting the depth limit.
+            return False
+        level = next_level
+    return False
+
+
+def _find_gather_from_indices(indices_node: Node, scores_node: Node) -> Optional[Node]:
+    """Locate the ``aten.gather.default(scores_node, dim, indices_node)`` user of *indices_node*.
+
+    Returns:
+        The first gather whose source is *scores_node* and whose index
+        argument is *indices_node*, or ``None`` if there is none.
+    """
+
+    def _matches(consumer: Node) -> bool:
+        return (
+            is_op(consumer, torch.ops.aten.gather.default)
+            and len(consumer.args) >= 3
+            and consumer.args[0] is scores_node
+            and consumer.args[2] is indices_node
+        )
+
+    return next((consumer for consumer in indices_node.users if _matches(consumer)), None)
+
+
+def _is_sum_of(node, cur: Node) -> bool:
+    """Return True if *node* is an ``aten.sum.dim_IntList`` call whose first argument is *cur*."""
+    if not isinstance(node, Node):
+        return False
+    if not is_op(node, torch.ops.aten.sum.dim_IntList):
+        return False
+    return node.args[0] is cur
+
+
+def _is_normalize_divisor(divisor, cur: Node) -> bool:
+    """Return True if *divisor* is ``sum(cur, ...)`` or ``sum(cur, ...) + eps``.
+
+    In the epsilon-stabilized form the sum may be either operand of the
+    ``add.Tensor``; the other operand must be a literal int or float.
+    """
+    if _is_sum_of(divisor, cur):
+        return True
+    if not isinstance(divisor, Node) or not is_op(divisor, torch.ops.aten.add.Tensor):
+        return False
+    lhs, rhs = divisor.args[0], divisor.args[1]
+    return any(
+        isinstance(eps, (int, float)) and _is_sum_of(total, cur)
+        for total, eps in ((lhs, rhs), (rhs, lhs))
+    )
+
+
+def _walk_div_then_mul(start: Node) -> Tuple[Node, float]:
+    """Follow an optional ``div.Tensor(self, sum)`` and then an optional ``mul.Tensor(self, scalar)``.
+
+    Returns:
+        ``(final_node, routed_scaling_factor)``. When a scalar multiply is
+        found, *final_node* is that multiply and the factor is its scalar as a
+        float. Otherwise the factor is ``1.0`` and *final_node* is the last
+        node reached (the normalizing div if present, else *start*).
+    """
+
+    def _is_norm_div(consumer: Node) -> bool:
+        # div(start, sum(start, ...) [+ eps])
+        return (
+            is_op(consumer, torch.ops.aten.div.Tensor)
+            and consumer.args[0] is start
+            and _is_normalize_divisor(consumer.args[1], start)
+        )
+
+    # Optional normalization step; stay on *start* when there is none.
+    tail = next((consumer for consumer in start.users if _is_norm_div(consumer)), start)
+
+    # Optional multiply by a literal scalar.
+    for consumer in tail.users:
+        if not is_op(consumer, torch.ops.aten.mul.Tensor) or consumer.args[0] is not tail:
+            continue
+        factor = consumer.args[1]
+        if isinstance(factor, (int, float)):
+            return consumer, float(factor)
+    return tail, 1.0
+
+
+@TransformRegistry.register("match_noaux_tc_pattern")
+class MatchNoAuxTCPattern(BaseTransform):
+    """Match the noaux_tc MoE routing chain and replace with a fused trtllm op.
+
+    This transform detects the DeepSeek-V3 style routing pattern::
+
+        sigmoid → +bias → group top-k → mask → top-k → gather → [norm] → scale
+
+    and replaces it with::
+
+        topk_weights, topk_idx = trtllm.noaux_tc_op(
+            router_logits, bias, n_group, topk_group, top_k, routed_scaling_factor
+        )
+
+    The fused kernel performs sigmoid, bias correction, group-based top-k
+    selection, gather, normalization and scaling in a single CUDA kernel.
+
+    Matching is anchored on each ``aten.sigmoid`` node; a candidate that fails
+    any step of the chain is skipped and the graph is left untouched for it.
+    """
+
+    config: TransformConfig
+
+    @classmethod
+    def get_config_class(cls) -> Type[TransformConfig]:
+        """Return the config class for this transform (the base ``TransformConfig``)."""
+        return TransformConfig
+
+    def _apply(
+        self,
+        gm: GraphModule,
+        cm: CachedSequenceInterface,
+        factory: ModelFactory,
+        shared_config: SharedConfig,
+    ) -> Tuple[GraphModule, TransformInfo]:
+        """Rewrite every matched noaux_tc routing chain in *gm* to the fused op.
+
+        Args:
+            gm: Graph module to scan and rewrite in place.
+            cm: Not used by this transform.
+            factory: Not used by this transform.
+            shared_config: Not used by this transform.
+
+        Returns:
+            ``(gm, info)`` where ``info.num_matches`` counts the fused chains;
+            ``is_clean`` and ``has_valid_shapes`` are True only when nothing
+            was rewritten.
+        """
+        graph = gm.graph
+        num_matches = 0
+
+        # Iterate over a snapshot: new nodes are inserted while matching.
+        for node in list(graph.nodes):
+            # ---- anchor: aten.sigmoid -> add(bias) ------------------------
+            if not is_op(node, torch.ops.aten.sigmoid.default):
+                continue
+            sigmoid_node = node
+            router_logits = sigmoid_node.args[0]
+            if not isinstance(router_logits, Node):
+                continue
+
+            bias_add = _find_bias_add_after_sigmoid(sigmoid_node)
+            if bias_add is None:
+                continue
+            scores_with_bias_node, bias_node = bias_add
+
+            # ---- group top-k(k=2) ----------------------------------------
+            inner = _find_group_topk(scores_with_bias_node)
+            if inner is None:
+                continue
+            inner_topk, n_group = inner
+
+            inner_values = _get_single_getitem_user(inner_topk, 0)
+            if inner_values is None:
+                continue
+
+            # ---- sum -> outer topk(k=topk_group) -------------------------
+            sum_node = None
+            for u in inner_values.users:
+                if is_op(u, torch.ops.aten.sum.dim_IntList):
+                    sum_node = u
+                    break
+            if sum_node is None:
+                continue
+
+            outer_grp_topk = None
+            for u in sum_node.users:
+                if is_op(u, _TOPK_OPS):
+                    outer_grp_topk = u
+                    break
+            if outer_grp_topk is None:
+                continue
+            topk_group = _scalar_int(outer_grp_topk.args[1])
+            if topk_group is None:
+                continue
+
+            # ---- final masked top-k(k=top_k) -----------------------------
+            # Only accept a masked_node whose mask input descends from outer_grp_topk;
+            # otherwise an unrelated branch consuming scores_with_bias could be picked.
+            masked_node = None
+            for u in scores_with_bias_node.users:
+                if not is_op(
+                    u,
+                    (
+                        torch.ops.aten.where.self,
+                        torch.ops.aten.masked_fill.Scalar,
+                        torch.ops.aten.mul.Tensor,
+                    ),
+                ):
+                    continue
+                if not _descends_from(u, outer_grp_topk):
+                    continue
+                masked_node = u
+                break
+            if masked_node is None:
+                continue
+
+            outer_topk = _find_outer_topk(masked_node)
+            if outer_topk is None:
+                continue
+            final_topk_node, top_k = outer_topk
+
+            final_indices = _get_single_getitem_user(final_topk_node, 1)
+            if final_indices is None:
+                continue
+
+            # ---- weights branch: gather(scores, -1, topk_idx) [/ sum] * scale --
+            gather_node = _find_gather_from_indices(final_indices, sigmoid_node)
+            if gather_node is None:
+                continue
+            # NOTE(review): the normalization step is optional in the matcher, while
+            # the class docstring says the fused kernel normalizes. Confirm the
+            # fused output is equivalent when the matched graph has no div-by-sum.
+            weights_tail, routed_scaling_factor = _walk_div_then_mul(gather_node)
+
+            # ---- emit fused noaux_tc_op ---------------------------------
+            ad_logger.info(
+                "Matched noaux_tc routing pattern: "
+                f"n_group={n_group}, topk_group={topk_group}, top_k={top_k}, "
+                f"scale={routed_scaling_factor}"
+            )
+
+            # Insert at the bias add rather than at the sigmoid. The add consumes
+            # both the sigmoid (hence router_logits) and bias_node, so both are
+            # defined before it. bias_node may have been emitted after the sigmoid,
+            # in which case inserting before the sigmoid would use it before its
+            # definition. Every consumer being rewired sits downstream of the add.
+            with graph.inserting_before(scores_with_bias_node):
+                fused = graph.call_function(
+                    torch.ops.trtllm.noaux_tc_op,
+                    args=(
+                        router_logits,
+                        bias_node,
+                        n_group,
+                        topk_group,
+                        top_k,
+                        routed_scaling_factor,
+                    ),
+                )
+                fused_weights = graph.call_function(operator.getitem, args=(fused, 0))
+                fused_indices = graph.call_function(operator.getitem, args=(fused, 1))
+
+            # Redirect consumers of the original outputs; the old chain becomes
+            # dead and is removed by eliminate_dead_code below.
+            final_indices.replace_all_uses_with(fused_indices)
+            weights_tail.replace_all_uses_with(fused_weights)
+
+            num_matches += 1
+
+        if num_matches > 0:
+            eliminate_dead_code(gm)
+            gm.recompile()
+            ad_logger.info(f"Fused {num_matches} noaux_tc routing pattern(s).")
+
+        info = TransformInfo(
+            skipped=False,
+            num_matches=num_matches,
+            is_clean=num_matches == 0,
+            has_valid_shapes=num_matches == 0,
+        )
+        return gm, info
diff --git a/tests/unittest/auto_deploy/singlegpu/transformations/library/test_match_noaux_tc.py b/tests/unittest/auto_deploy/singlegpu/transformations/library/test_match_noaux_tc.py