NVIDIA
diff --git a/tensorrt_llm/_torch/custom_ops/cute_dsl_megamoe_custom_op.py b/tensorrt_llm/_torch/custom_ops/cute_dsl_megamoe_custom_op.py
Lines changed: 71 additions & 8 deletions
diff --git a/tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/__init__.py b/tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/__init__.py
Lines changed: 18 additions & 0 deletions
diff --git a/tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/custom_ext.py b/tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/custom_ext.py
Lines changed: 3 additions & 3 deletions
@@ -642,13 +642,18 @@ def query_megamoe_shared_workspace_bytes(
         expand_intermediate_size_per_partition: int,
         max_tokens_per_rank: int,
         tactic: Optional[Tuple] = None,
+        apply_topk_in_fc1: bool = True,
+        gate_up_clamp: Optional[float] = None,
     ) -> int:
         """Probe ``Sm100MegaMoEKernel.get_workspace_sizes()`` for the
         shared workspace byte count. The shared workspace size is
-        invariant across all candidate tactics (its regions depend only
-        on world_size / num_experts_per_rank / num_topk /
-        max_tokens_per_rank -- see _build_shared_region_specs in
-        megamoe_kernel.py), so we use the default tactic for the probe.
+        invariant across all candidate tactics and across the codegen-time
+        graph/clamp modes (its regions depend only on world_size /
+        num_experts_per_rank / num_topk / max_tokens_per_rank -- see
+        _build_shared_region_specs in megamoe_kernel.py), so we use the
+        default tactic for the probe. ``apply_topk_in_fc1`` / ``gate_up_clamp``
+        are still threaded so the probe kernel ctor signature is satisfied
+        and matches the real build.
         """
         from ..cute_dsl_kernels.mega_moe_nvfp4 import import_kernel
 
@@ -681,7 +686,10 @@ def query_megamoe_shared_workspace_bytes(
             num_topk=int(num_topk),
             max_tokens_per_rank=int(max_tokens_per_rank),
             hidden=int(hidden_size),
-            fc2_in_kernel_topk_reduce=bool(tactic[5]),
+            fc2_output_dtype=cutlass.BFloat16,
+            in_kernel_fc2_reduce=bool(tactic[5]),
+            apply_topk_in_fc1=bool(apply_topk_in_fc1),
+            gate_up_clamp=(None if gate_up_clamp is None else float(gate_up_clamp)),
             **_LOCKED_KERNEL_KWARGS,
         )
         _, shared_bytes = probe.get_workspace_sizes()
@@ -717,6 +725,8 @@ def __init__(
             expand_intermediate_size_per_partition: int,
             max_tokens_per_rank: int,
             output_dtype: torch.dtype,
+            apply_topk_in_fc1: bool = True,
+            gate_up_clamp: Optional[float] = None,
         ) -> None:
             super().__init__()
             if (sm_version := get_sm_version()) not in (100, 103):
@@ -745,6 +755,11 @@ def __init__(
             )
             self.max_tokens_per_rank = int(max_tokens_per_rank)
             self.output_dtype = output_dtype
+            # Codegen-time graph/clamp modes. They change the generated
+            # kernel, so they are part of ``unique_id`` (and therefore the
+            # compile-cache key) -- never per-call runtime kwargs.
+            self.apply_topk_in_fc1 = bool(apply_topk_in_fc1)
+            self.gate_up_clamp = None if gate_up_clamp is None else float(gate_up_clamp)
 
         def unique_id(self):
             return (
@@ -757,6 +772,8 @@ def unique_id(self):
                 self.expand_intermediate_size_per_partition,
                 self.max_tokens_per_rank,
                 str(self.output_dtype),
+                self.apply_topk_in_fc1,
+                self.gate_up_clamp,
             )
 
         def get_valid_tactics(
@@ -810,6 +827,17 @@ def _autotuner_inputs_pre_hook(self, inputs: List[torch.Tensor]) -> List[torch.T
             if isinstance(topk_weights, torch.Tensor):
                 topk_weights.zero_()
 
+            # New per-expert scale inputs fc1_alpha(8) / fc2_alpha(9) /
+            # fc1_norm_const(10) are inserted after fc2_weight_sf(7) and
+            # before combine_output(11). Fill them with 1.0 (NOT zero):
+            # the FC1/FC2 epilogues divide/scale by these and a zero
+            # fc1_norm_const would make the fc1-out NVFP4 quant divide by
+            # zero during fake autotune runs.
+            for alpha_idx in (8, 9, 10):
+                tensor = inputs[alpha_idx]
+                if isinstance(tensor, torch.Tensor):
+                    tensor.fill_(1.0)
+
             return inputs
 
         def get_tuning_config(self) -> TuningConfig:
@@ -838,7 +866,9 @@ def _num_tokens(shapes: List[torch.Size]) -> int:
                     ConstraintSpec(1, 0, _num_tokens),  # activation_sf
                     ConstraintSpec(2, 0, _num_tokens),  # topk_idx
                     ConstraintSpec(3, 0, _num_tokens),  # topk_weights
-                    ConstraintSpec(8, 0, _num_tokens),  # combine_output
+                    # combine_output moved from idx 8 -> 11 after inserting
+                    # fc1_alpha(8) / fc2_alpha(9) / fc1_norm_const(10).
+                    ConstraintSpec(11, 0, _num_tokens),  # combine_output
                 ),
                 inputs_pre_hook=self._autotuner_inputs_pre_hook,
                 use_cold_l2_cache=True,
@@ -887,11 +917,17 @@ def _build_kernel(self, tactic: Tuple):
                 num_topk=self.num_topk,
                 max_tokens_per_rank=self.max_tokens_per_rank,
                 hidden=self.hidden_size,
-                fc2_in_kernel_topk_reduce=bool(use_bf16_redg),
+                fc2_output_dtype=cutlass.BFloat16,
+                in_kernel_fc2_reduce=bool(use_bf16_redg),
+                apply_topk_in_fc1=self.apply_topk_in_fc1,
+                gate_up_clamp=self.gate_up_clamp,
                 **_LOCKED_KERNEL_KWARGS,
             )
 
         def _compile_or_get(self, tactic: Tuple, kernel, runtime_kwargs):
+            # ``unique_id()`` already carries apply_topk_in_fc1 / gate_up_clamp,
+            # so the codegen-time graph/clamp modes are part of the cache key
+            # without listing them again here.
             cache_key = (
                 self.unique_id(),
                 tuple(tactic[0]),
@@ -978,8 +1014,11 @@ def forward(
                 fc1_weight_sf,
                 fc2_weight,
                 fc2_weight_sf,
+                fc1_alpha,
+                fc2_alpha,
+                fc1_norm_const,
                 combine_output,
-            ) = inputs[:9]
+            ) = inputs[:12]
             assert peer_offsets is not None, (
                 "Sm100MegaMoENvfp4Runner.forward requires peer_offsets kwarg "
                 "(length = world_size); single-rank degenerate mode passes "
@@ -1037,6 +1076,12 @@ def forward(
             fc1_weight_sf_cute = _to_cute(fc1_weight_sf)
             fc2_weight_cute = _to_cute(fc2_weight)
             fc2_weight_sf_cute = _to_cute(fc2_weight_sf)
+            # Per-expert fp32 scale tensors are 1-D ``(num_local_slots,)``;
+            # 4-byte alignment matches the fp32 element size (the kernel
+            # reads them as a plain fp32 vector, no 16-byte TMA tile).
+            fc1_alpha_cute = _to_cute(fc1_alpha, assumed_align=4)
+            fc2_alpha_cute = _to_cute(fc2_alpha, assumed_align=4)
+            fc1_norm_const_cute = _to_cute(fc1_norm_const, assumed_align=4)
             combine_output_cute = _to_cute(combine_output)
             local_workspace_cute = _to_cute(local_workspace)
             shared_workspace_cute = _to_cute(shared_workspace)
@@ -1066,6 +1111,9 @@ def forward(
                 fc1_weight_sf=fc1_weight_sf_cute,
                 fc2_weight=fc2_weight_cute,
                 fc2_weight_sf=fc2_weight_sf_cute,
+                fc1_alpha=fc1_alpha_cute,
+                fc2_alpha=fc2_alpha_cute,
+                fc1_norm_const=fc1_norm_const_cute,
                 combine_output=combine_output_cute,
                 local_workspace=local_workspace_cute,
                 shared_workspace=shared_workspace_cute,
@@ -1110,6 +1158,9 @@ def cute_dsl_megamoe_nvfp4_blackwell(
         fc1_weight_sf: torch.Tensor,
         fc2_weight: torch.Tensor,
         fc2_weight_sf: torch.Tensor,
+        fc1_alpha: torch.Tensor,
+        fc2_alpha: torch.Tensor,
+        fc1_norm_const: torch.Tensor,
         combine_output: torch.Tensor,
         shared_workspace: torch.Tensor,
         world_size: int,
@@ -1121,6 +1172,8 @@ def cute_dsl_megamoe_nvfp4_blackwell(
         expand_intermediate_size_per_partition: int,
         max_tokens_per_rank: int,
         peer_offsets: List[int],
+        apply_topk_in_fc1: bool = True,
+        gate_up_clamp: Optional[float] = None,
     ) -> None:
         """Run the fused MegaMoE CuteDSL NVFP4 kernel.
 
@@ -1155,6 +1208,8 @@ def cute_dsl_megamoe_nvfp4_blackwell(
             expand_intermediate_size_per_partition=expand_intermediate_size_per_partition,
             max_tokens_per_rank=max_tokens_per_rank,
             output_dtype=combine_output.dtype,
+            apply_topk_in_fc1=apply_topk_in_fc1,
+            gate_up_clamp=gate_up_clamp,
         )
         inputs = [
             activation,
@@ -1165,6 +1220,9 @@ def cute_dsl_megamoe_nvfp4_blackwell(
             fc1_weight_sf,
             fc2_weight,
             fc2_weight_sf,
+            fc1_alpha,
+            fc2_alpha,
+            fc1_norm_const,
             combine_output,
         ]
         tuner = AutoTuner.get()
@@ -1193,6 +1251,9 @@ def _(
         fc1_weight_sf: torch.Tensor,
         fc2_weight: torch.Tensor,
         fc2_weight_sf: torch.Tensor,
+        fc1_alpha: torch.Tensor,
+        fc2_alpha: torch.Tensor,
+        fc1_norm_const: torch.Tensor,
         combine_output: torch.Tensor,
         shared_workspace: torch.Tensor,
         world_size: int,
@@ -1204,5 +1265,7 @@ def _(
         expand_intermediate_size_per_partition: int,
         max_tokens_per_rank: int,
         peer_offsets: List[int],
+        apply_topk_in_fc1: bool = True,
+        gate_up_clamp: Optional[float] = None,
     ) -> None:
         return None
@@ -49,6 +49,7 @@
     "from_blocked",
     "import_kernel",
     "import_sym_buffer_host",
+    "import_topk_reduce",
     "stack_byte_reinterpretable_tensors",
     "to_blocked",
 ]
@@ -81,3 +82,20 @@ def import_sym_buffer_host():
     # SymBufferHost lives at module scope as a factory; the upstream API
     # constructs the per-world-size variant inside sym_buffer.py.
     return sym_buffer
+
+
+def import_topk_reduce():
+    """Lazily import the standalone CuteDSL top-k reduce kernel API.
+
+    The import is deferred to call time, mirroring :func:`import_kernel`,
+    so non-SM100 / no-cutlass-dsl environments can import the backend for
+    capability probing without pulling the heavyweight CuteDSL symbols.
+
+    The reduce kernel is only needed by the opt-in transformers graph
+    (``apply_topk_in_fc1=False``); the deepgemm-default route reduces on
+    the host via ``combine_output.sum(dim=1)`` and never imports it.
+
+    Returns the pair ``(compile_topk_reduce, launch_compiled_topk_reduce)``
+    exported by :mod:`.topk_reduce`.
+    """
+    # Function-scope import: ``.topk_reduce`` is only resolved when a caller
+    # actually asks for the reduce kernel.
+    from .topk_reduce import compile_topk_reduce, launch_compiled_topk_reduce
+
+    return compile_topk_reduce, launch_compiled_topk_reduce
@@ -238,8 +238,7 @@ def enrich_work_tile_info(
         - fc1 tiles peek the dispatch->fc1 ``fc1_ready_counter`` at the
           same slot index but with ``valid_tokens_in_tile`` as threshold
           (per-tile dynamic).  This branch only emits when
-          ``self.fc1_ready_counter_ptr is not None`` (MegaMoE mode).  See
-          fc12_integrate_comm.md §4.
+          ``self.fc1_ready_counter_ptr is not None`` (MegaMoE mode).
         """
         # Invalid tiles keep (None_ | 0); do not index an arbitrary counter slot.
         is_valid = base_work.is_valid_tile
@@ -250,7 +249,8 @@ def enrich_work_tile_info(
             # pull) and fc2 release-add (fc1 epi) target the per-task-tile
             # counter slot indexed by ``cumulative_token_block_count +
             # tile_n_idx``.
-            counter_slot = base_work.cumulative_token_block_count + base_work.tile_n_idx
+            counter_slot = (base_work.cumulative_token_block_count +
+                            base_work.tile_n_idx)
             is_fc1 = base_work.phase == Int32(int(BlockPhase.Linear1))
             is_fc2 = base_work.phase == Int32(int(BlockPhase.Linear2))