NVIDIA
diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu
Lines changed: 0 additions & 16 deletions
diff --git a/cpp/tensorrt_llm/thop/moeOp.cpp b/cpp/tensorrt_llm/thop/moeOp.cpp
Lines changed: 15 additions & 43 deletions
diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
Lines changed: 3 additions & 28 deletions
diff --git a/tensorrt_llm/_torch/models/modeling_mixtral.py b/tensorrt_llm/_torch/models/modeling_mixtral.py
Lines changed: 0 additions & 16 deletions
diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_moe.py b/tensorrt_llm/_torch/models/modeling_qwen3_moe.py
Lines changed: 0 additions & 21 deletions
diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py
Lines changed: 0 additions & 14 deletions
@@ -4124,22 +4124,6 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType, Enab
     TLLM_CHECK_WITH_INFO(gemm1_config_, "MOE GEMM1 Config is not set");
     TLLM_CHECK_WITH_INFO(gemm2_config_, "MOE GEMM2 Config is not set");
 
-    // DEBUG(moe-lora): dump the compile-time precision flags of the runner that
-    // actually ran, so we can see whether NVFP4 reached this runMoe with FP4
-    // activations (act_fp4) or via some bf16/fp8 path. Capped to avoid flooding.
-    {
-        static int _krn_diag = 0;
-        if (use_lora && _krn_diag++ < 64)
-        {
-            printf("[moe-lora-diag][kernel] use_lora=%d act_fp4=%d weight_fp4=%d use_fp4=%d use_fp8=%d "
-                   "use_w4afp8=%d use_wfp4afp8=%d use_wfp4a16=%d use_block_scaling=%d "
-                   "use_deepseek_fp8_block_scale=%d\n",
-                (int) use_lora, (int) act_fp4, (int) weight_fp4, (int) use_fp4, (int) use_fp8, (int) use_w4afp8,
-                (int) use_wfp4afp8, (int) use_wfp4a16, (int) use_block_scaling, (int) use_deepseek_fp8_block_scale);
-            fflush(stdout);
-        }
-    }
-
     TLLM_CHECK_WITH_INFO(!use_lora || !act_fp4, "MOE does not support LoRA with FP4 model");
 
     if (int_scales_required)
 
@@ -611,40 +611,21 @@ class FusedMoeRunner : public torch::CustomClassHolder
             TORCH_CHECK(!enable_alltoall,
                 "MoE LoRA is not supported with alltoall: the per-token adapter pointer arrays do not survive "
                 "cross-rank token reshuffling.");
-            // These checks constrain the dtype of the base MoE inputs (the
-            // activations and the routed-expert weights), not the LoRA adapters,
-            // which are always fp16/bf16. FP8 base weights are supported in both
-            // the per-tensor (qdq) and block-scale forms: the LoRA GEMM runs on
-            // fp16/bf16 activations (loraFC1/loraFC2 in moe_kernels.cu), either
-            // dequantizing the per-tensor FP8 activations or reading the
-            // block-scale path's bf16 activations directly. NVFP4 base weights
-            // are presented as packed FP4 (c10::Long) with FP4 activations, so
-            // the dtype checks below are skipped for NVFP4 and the runtime
-            // FP4-activation handling is enforced in the kernel (moe_kernels.cu).
-            // Integer quant has no LoRA path.
-            bool const is_nvfp4 = isNvfp4Quant();
-            // DEBUG(moe-lora): report the dtypes/quant flags the op actually
-            // received so we can see what precision NVFP4 surfaces as here.
-            {
-                static int _thop_diag = 0;
-                if (_thop_diag++ < 64)
-                {
-                    printf("[moe-lora-diag][thop] act_dtype=%s weight_dtype=%s output_dtype=%s isNvfp4=%d isFp8=%d "
-                           "deepseekBlockScale=%d useW4Group=%d useMxfp8Act=%d\n",
-                        torch::toString(mActivationDtype), torch::toString(mWeightDtype),
-                        torch::toString(mOutputDtype), (int) is_nvfp4, (int) isFp8Quant(),
-                        (int) mUseDeepSeekFP8BlockScaling, (int) mUseW4GroupScaling, (int) mUseMxfp8ActScaling);
-                    fflush(stdout);
-                }
-            }
-            TORCH_CHECK(is_nvfp4 || mActivationDtype == c10::ScalarType::Half
-                    || mActivationDtype == c10::ScalarType::BFloat16
-                    || mActivationDtype == c10::ScalarType::Float8_e4m3fn,
-                "MoE LoRA only supports fp16, bf16, per-tensor FP8, or NVFP4 activation dtypes.");
-            TORCH_CHECK(is_nvfp4 || mWeightDtype == c10::ScalarType::Half
-                    || mWeightDtype == c10::ScalarType::BFloat16
-                    || mWeightDtype == c10::ScalarType::Float8_e4m3fn,
-                "MoE LoRA supports unquantized fp16/bf16, FP8, or NVFP4 base expert weights only "
+            // Constrain the dtype of the base MoE inputs (activations and
+            // routed-expert weights), not the LoRA adapters, which are always
+            // fp16/bf16. Per-tensor FP8 (qdq) is supported because the LoRA GEMM
+            // runs on fp16/bf16 activations after the kernel dequantizes them.
+            // isFp8Quant() matches only per-tensor FP8, so FP8 block-scale,
+            // NVFP4, MXFP8, and integer quant (none of which have a LoRA path)
+            // stay rejected.
+            bool const is_per_tensor_fp8 = isFp8Quant();
+            TORCH_CHECK(mActivationDtype == c10::ScalarType::Half || mActivationDtype == c10::ScalarType::BFloat16
+                    || is_per_tensor_fp8,
+                "MoE LoRA only supports fp16, bf16, or per-tensor FP8 (qdq) base weights. FP8 block-scale, NVFP4, "
+                "MXFP8, and integer quant are not supported.");
+            TORCH_CHECK(
+                mWeightDtype == c10::ScalarType::Half || mWeightDtype == c10::ScalarType::BFloat16 || is_per_tensor_fp8,
+                "MoE LoRA supports unquantized fp16/bf16 or per-tensor FP8 (qdq) base expert weights only "
                 "(LoRA adapters are always fp16/bf16).");
             // CUDA-graph capture is only safe on the device LoRA path. The
             // legacy host path performs a host-side cudaEventSynchronize and
@@ -1322,15 +1303,6 @@ class FusedMoeRunner : public torch::CustomClassHolder
             TORCH_CHECK(mOutputDtype != c10::ScalarType::Float8_e4m3fn,
                 "MoE LoRA with FP8 base activations requires an fp16/bf16 output (LoRA compute) dtype.");
             return loraTypeFromActDtype(mOutputDtype);
-#endif
-#ifdef ENABLE_FP4
-        // NVFP4 base activations are presented as packed FP4 (uint8/Byte) with
-        // separate per-block scales; the LoRA GEMM runs in the fp16/bf16 output
-        // (backbone) dtype after the kernel dequantizes the FP4 activations.
-        case c10::ScalarType::Byte:
-            TORCH_CHECK(mOutputDtype != c10::ScalarType::Float8_e4m3fn,
-                "MoE LoRA with NVFP4 base activations requires an fp16/bf16 output (LoRA compute) dtype.");
-            return loraTypeFromActDtype(mOutputDtype);
 #endif
         default: C10_THROW_ERROR_FORMATTED(Error, "MoE LoRA only supports fp16/bf16/fp32 activation dtype.");
         }
 
@@ -1102,8 +1102,7 @@ def _get_shared_experts_quant_config(model_config,
         return quant_config
 
     def compute_routed_output(self, hidden_states, hidden_states_fp4,
-                              all_rank_num_tokens, do_finalize,
-                              lora_params=None):
+                              all_rank_num_tokens, do_finalize):
         # max-throughput
         use_dp_padding = False
         # Add DP padding on SM120 for context comm performance
@@ -1124,7 +1123,6 @@ def compute_routed_output(self, hidden_states, hidden_states_fp4,
             output_dtype=hidden_states.dtype,
             all_rank_num_tokens=all_rank_num_tokens,
             use_dp_padding=use_dp_padding,
-            lora_params=lora_params,
             **({
                 "alltoall_result_do_sum": False
             } if isinstance(self.experts, WideEPMoE) else {}),
@@ -1139,24 +1137,10 @@ def forward(
         all_rank_num_tokens: Optional[list[int]] = None,
         final_all_reduce_params: Optional[AllReduceParams] = None,
         do_finalize: Optional[bool] = True,
-        lora_params: Optional[dict] = None,
     ) -> torch.Tensor:
         if not do_finalize:
             assert not self.use_dp
 
-        # DEBUG(moe-lora): confirm routed-expert LoRA params reach the DeepSeek
-        # MoE module. Report the per-layer module ids present in lora_params
-        # (routed-expert ids: moe_h_to_4h=13, moe_4h_to_h=14, moe_gate=15).
-        _layer_idx = getattr(self.experts, "layer_idx", None)
-        _layer_module_ids = (sorted(lora_params.get(_layer_idx, {}).keys()) if
-                             (lora_params and _layer_idx is not None) else None)
-        print(
-            f"[deepseek-moe] layer={_layer_idx} "
-            f"experts_type={type(self.experts).__name__} "
-            f"lora_params_present={lora_params is not None} "
-            f"layer_module_ids={_layer_module_ids}",
-            flush=True)
-
         def _compute_shared_output():
             shared_input = (hidden_states_fp4 if
                             (hidden_states_fp4 is not None
@@ -1171,8 +1155,7 @@ def _compute_routed_output():
             routed_output = self.compute_routed_output(hidden_states,
                                                        hidden_states_fp4,
                                                        all_rank_num_tokens,
-                                                       do_finalize,
-                                                       lora_params=lora_params)
+                                                       do_finalize)
             return routed_output
 
         # NOTE: define compiled helpers at module scope to avoid defining decorators inside compiled frames
@@ -1424,15 +1407,12 @@ def forward(
         attn_metadata: AttentionMetadata,
         residual: torch.Tensor,
         spec_metadata: Optional[SpecMetadata] = None,
-        lora_params: Optional[dict] = None,
         **kwargs,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
-        # Self Attention. DeepSeek attention is MLA, whose forward does not
-        # accept lora_params, so it is intentionally not threaded here; routed-
-        # expert MoE LoRA is applied in the MoE path below.
+        # Self Attention
         hidden_states = self.self_attn(
             position_ids=position_ids,
             hidden_states=hidden_states,
@@ -1453,7 +1433,6 @@ def forward(
                 attn_metadata=attn_metadata,
                 residual=residual,
                 spec_metadata=spec_metadata,
-                lora_params=lora_params,
             )
         else:
             if spec_metadata is not None and spec_metadata.is_layer_capture(
@@ -1472,7 +1451,6 @@ def forward_MoE(
         attn_metadata: AttentionMetadata,
         residual: torch.Tensor,
         spec_metadata: Optional[SpecMetadata] = None,
-        lora_params: Optional[dict] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
 
         def _run_MoE(hidden_states, hidden_states_fp4, do_finalize):
@@ -1484,7 +1462,6 @@ def _run_MoE(hidden_states, hidden_states_fp4, do_finalize):
                     enable_allreduce=not (self.fusion_config.POST_MOE_FUSION
                                           or self.mapping.tp_size == 1)),
                 do_finalize=do_finalize,
-                lora_params=lora_params,
             )
 
         if self.fusion_config.PRE_MOE_FUSION:
@@ -1803,7 +1780,6 @@ def forward(
         position_ids: Optional[torch.IntTensor] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         spec_metadata: Optional[SpecMetadata] = None,
-        lora_params: Optional[dict] = None,
         **kwargs,
     ) -> torch.Tensor:
         if (input_ids is None) ^ (inputs_embeds is not None):
@@ -1825,7 +1801,6 @@ def forward(
                 attn_metadata=attn_metadata,
                 residual=residual,
                 spec_metadata=spec_metadata,
-                lora_params=lora_params,
             )
 
         hidden_states = maybe_allgather_for_helix_cp(hidden_states,
 
@@ -63,22 +63,6 @@ def forward(
         attn_metadata: AttentionMetadata,
         lora_params: Optional[dict] = None,
     ) -> torch.Tensor:
-        # DEBUG(moe-lora): confirm routed-expert LoRA params reach the Mixtral
-        # MoE module. Without this plumbing the experts run without LoRA and the
-        # adapter output is identical to the base model. `self.experts` may be a
-        # ConfigurableMoE wrapper whose CUTLASS impl (with _moe_lora_active)
-        # lives on `.backend`, so resolve through it before giving up.
-        _impl = self.experts
-        if not hasattr(_impl, "_moe_lora_active"):
-            _impl = getattr(self.experts, "backend", None)
-        _active = _impl._moe_lora_active(lora_params) if hasattr(
-            _impl, "_moe_lora_active") else None
-        print(
-            f"[mixtral-moe] layer={getattr(self.experts, 'layer_idx', None)} "
-            f"experts_type={type(self.experts).__name__} "
-            f"lora_params_present={lora_params is not None} "
-            f"moe_lora_active={_active}",
-            flush=True)
         all_rank_num_tokens = attn_metadata.all_rank_num_tokens
         router_logits = self.gate(hidden_states)
         final_hidden_states = self.experts(
 
@@ -134,7 +134,6 @@ def forward(
         attn_metadata: AttentionMetadata,
         all_reduce_params: Optional[AllReduceParams] = None,
         do_finalize: Optional[bool] = True,
-        lora_params: Optional[dict] = None,
     ) -> torch.Tensor:
         assert hidden_states.shape[-1] == self.hidden_dim
         orig_shape = hidden_states.shape
@@ -145,30 +144,13 @@ def forward(
         if not do_finalize:
             assert not self.enable_attention_dp
 
-        # DEBUG(moe-lora): confirm routed-expert LoRA params reach the Qwen3 MoE
-        # module. Report the actual per-layer module ids present in lora_params
-        # (ground truth) rather than _moe_lora_active, which keys on the backend
-        # instance's layer_idx and can be misleading through the ConfigurableMoE
-        # wrapper. The routed-expert ids are moe_h_to_4h=13, moe_4h_to_h=14,
-        # moe_gate=15.
-        _layer_idx = getattr(self.experts, "layer_idx", None)
-        _layer_module_ids = (sorted(lora_params.get(_layer_idx, {}).keys()) if
-                             (lora_params and _layer_idx is not None) else None)
-        print(
-            f"[qwen3-moe] layer={_layer_idx} "
-            f"experts_type={type(self.experts).__name__} "
-            f"lora_params_present={lora_params is not None} "
-            f"layer_module_ids={_layer_module_ids}",
-            flush=True)
-
         router_logits = self.gate(hidden_states)
         final_hidden_states = self.experts(
             hidden_states,
             router_logits,
             all_rank_num_tokens=all_rank_num_tokens,
             use_dp_padding=use_dp_padding,
             do_finalize=do_finalize,
-            lora_params=lora_params,
         )
 
         if not do_finalize:
@@ -243,7 +225,6 @@ def forward(
         spec_metadata: Optional[SpecMetadata] = None,
         mrope_config: Optional[Dict[str, torch.Tensor]] = None,
         deepstack_embeds: Optional[List[torch.Tensor]] = None,
-        lora_params: Optional[dict] = None,
         **kwargs,
     ) -> torch.Tensor:
         if residual is None:
@@ -260,7 +241,6 @@ def forward(
             all_reduce_params=AllReduceParams(
                 enable_allreduce=not self.disable_attn_allreduce),
             mrope_config=mrope_config,
-            lora_params=lora_params,
             **kwargs,
         )
 
@@ -292,7 +272,6 @@ def forward(
                 enable_allreduce=not (self.fusion_config.POST_MOE_FUSION
                                       or self.mapping.tp_size == 1)),
             do_finalize=do_finalize,
-            lora_params=lora_params,
         )
 
         if deepstack_embeds is not None and self.layer_idx in range(
 
@@ -1024,20 +1024,6 @@ def run_moe(
         if lora_kwargs is None:
             lora_kwargs = {}
 
-        # DEBUG(moe-lora): report what the CUTLASS MoE op actually receives for
-        # this layer -- which quant method, the input tensor type/dtype (this is
-        # the activation precision reaching the op), and whether a routed-expert
-        # LoRA delta is being fused. Lets us see how NVFP4 surfaces at runtime.
-        if self._moe_lora_active(lora_params):
-            _x_dtype = getattr(x, "dtype", None)
-            print(
-                f"[moe-lora-diag][py] layer={self.layer_idx} "
-                f"quant_method={type(self.quant_method).__name__} "
-                f"x_type={type(x).__name__} x_dtype={_x_dtype} "
-                f"x_sf_present={x_sf is not None} has_nvfp4={self.has_nvfp4} "
-                f"weight_dtype={weight_dtype} lora_active={bool(lora_kwargs)}",
-                flush=True)
-
         result = torch.ops.trtllm.fused_moe(
             x,
             token_selected_experts,