Resolve some new comments

WeiHaocheng · WeiHaocheng · commit a46a1f0b9d22 · 2026-06-21T04:38:17.000-07:00
Signed-off-by: Fred Wei <20514172+WeiHaocheng@users.noreply.github.com>
diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/mxfp8_mxfp4_gemm_template_sm100.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/mxfp8_mxfp4_gemm_template_sm100.h
@@ -84,6 +84,19 @@ struct MXSMTypeAdapter<__2SM>
     using MainloopSchedule = cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100;
 };
 
+namespace detail
+{ // has_bias_ptr<T>::value is true iff `t.bias_ptr` is a well-formed expression for an lvalue `t` of type T
+template <typename T, typename = void> // primary template: the default answer is "no `bias_ptr` member"
+struct has_bias_ptr : std::false_type
+{
+};
+
+template <typename T> // partial specialization, viable only when the decltype inside std::void_t is well-formed
+struct has_bias_ptr<T, std::void_t<decltype(std::declval<T&>().bias_ptr)>> : std::true_type
+{
+};
+} // namespace detail
+
 #ifdef PLACEHOLDER_KERNELS
 
 template <typename T, typename CTA_M, typename CTA_N, typename CTA_K, typename CGA_M, typename CGA_N, typename CGA_K,
@@ -187,7 +200,10 @@ typename Gemm::Arguments prepareGemmArgsSm100(void* D, void const* A, void const
     operator_args.mode = cutlass::gemm::GemmUniversalMode::kGemm;
     auto& fusion_args = operator_args.epilogue.thread;
     fusion_args.alpha_ptr = static_cast<ElementCompute const*>(global_sf);
-    fusion_args.bias_ptr = static_cast<ElementD const*>(bias);
+    if constexpr (detail::has_bias_ptr<std::decay_t<decltype(fusion_args)>>::value)
+    {
+        fusion_args.bias_ptr = static_cast<ElementD const*>(bias);
+    }
 
     operator_args.problem_shape = cute::make_shape(m, n, k, batch_count);
 
diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h
@@ -335,6 +335,7 @@ class MoeGemmRunner
     int sm_{};
     int multi_processor_count_{};
     mutable int num_experts_ = 0;
+    mutable bool use_mxfp8_weight_scaling_ = false;
     mutable size_t gemm_workspace_size_ = 0;
     size_t calcMaxWorkspaceSize(int num_experts, bool use_mxfp8_weight_scaling) const;
 };
diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h
@@ -770,56 +770,38 @@ void MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::dispatchToArch(
                 bool const use_mxfp8 = is_wfp8afp8
                     && hopper_inputs.fpX_block_scaling_type
                         == TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType::MXFPX;
+                // Returns the address of the dispatchMoeGemmSelectTileShapeTmaWarpSpecialized instantiation for the
+                // FUSION carried by fusion_tag's type; is_wfp4afp8 / is_wfp8afp8 / use_mxfp8 choose IsMXFPX.
+                auto select_mxfpx_mode = [&](auto fusion_tag)
+                {
+                    constexpr auto FUSION = decltype(fusion_tag)::value; // integral_constant tag, C++17-compatible
+                    if constexpr (is_wfp4afp8) // decided at compile time: always IsMXFPX=true
+                    {
+                        return &cutlass_kernels_oss::dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T, WeightType,
+                            OutputType, EpilogueTag, FUSION, true>;
+                    }
+                    else if constexpr (is_wfp8afp8) // both instantiations are named; use_mxfp8 picks at run time
+                    {
+                        return use_mxfp8 ? &cutlass_kernels_oss::dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T,
+                                   WeightType, OutputType, EpilogueTag, FUSION, true>
+                                         : &cutlass_kernels_oss::dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T,
+                                             WeightType, OutputType, EpilogueTag, FUSION, false>;
+                    }
+                    else // neither flag set: always IsMXFPX=false
+                    {
+                        return &cutlass_kernels_oss::dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T, WeightType,
+                            OutputType, EpilogueTag, FUSION, false>;
+                    }
+                };
                 auto select_function = [&]()
                 {
+                    using Fusion = TmaWarpSpecializedGroupedGemmInput::EpilogueFusion;
                     switch (hopper_inputs.fusion)
                     {
-                    case TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::FINALIZE:
-                        if constexpr (is_wfp4afp8)
-                        {
-                            return &cutlass_kernels_oss::dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T, WeightType,
-                                OutputType, EpilogueTag, TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::FINALIZE,
-                                true>;
-                        }
-                        else if constexpr (is_wfp8afp8)
-                        {
-                            return use_mxfp8 ? &cutlass_kernels_oss::dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T,
-                                       WeightType, OutputType, EpilogueTag,
-                                       TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::FINALIZE, true>
-                                             : &cutlass_kernels_oss::dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T,
-                                                 WeightType, OutputType, EpilogueTag,
-                                                 TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::FINALIZE, false>;
-                        }
-                        else
-                        {
-                            return &cutlass_kernels_oss::dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T, WeightType,
-                                OutputType, EpilogueTag, TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::FINALIZE,
-                                false>;
-                        }
-                    case TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::NONE:
-                        if constexpr (is_wfp4afp8)
-                        {
-                            return &cutlass_kernels_oss::dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T, WeightType,
-                                OutputType, EpilogueTag, TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::NONE,
-                                true>;
-                        }
-                        else if constexpr (is_wfp8afp8)
-                        {
-                            return use_mxfp8 ? &cutlass_kernels_oss::dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T,
-                                       WeightType, OutputType, EpilogueTag,
-                                       TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::NONE, true>
-                                             : &cutlass_kernels_oss::dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T,
-                                                 WeightType, OutputType, EpilogueTag,
-                                                 TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::NONE, false>;
-                        }
-                        else
-                        {
-                            return &cutlass_kernels_oss::dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T, WeightType,
-                                OutputType, EpilogueTag, TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::NONE,
-                                false>;
-                        }
-                    case TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::ACTIVATION:
-                    case TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::GATED_ACTIVATION:
+                    case Fusion::FINALIZE: return select_mxfpx_mode(std::integral_constant<Fusion, Fusion::FINALIZE>{});
+                    case Fusion::NONE: return select_mxfpx_mode(std::integral_constant<Fusion, Fusion::NONE>{});
+                    case Fusion::ACTIVATION:
+                    case Fusion::GATED_ACTIVATION:
                     default: TLLM_THROW("Unimplemented fusion %d requested", (int) hopper_inputs.fusion);
                     };
                 };
@@ -923,10 +905,13 @@ template <typename T, typename WeightType, typename OutputType, typename ScaleBi
 size_t MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::getMaxWorkspaceSize(
     int num_experts, bool use_mxfp8_weight_scaling) const
 {
-    if (num_experts != num_experts_)
+    if (num_experts != num_experts_ || use_mxfp8_weight_scaling != use_mxfp8_weight_scaling_)
     {
-        TLLM_LOG_TRACE("Calling getMaxWorkspaceSize() with a new expert count %d vs %d", num_experts, num_experts_);
+        TLLM_LOG_TRACE(
+            "Calling getMaxWorkspaceSize() with a new (expert count, use_mxfp8_weight_scaling) (%d, %d) vs (%d, %d)",
+            num_experts, (int) use_mxfp8_weight_scaling, num_experts_, (int) use_mxfp8_weight_scaling_);
         num_experts_ = num_experts;
+        use_mxfp8_weight_scaling_ = use_mxfp8_weight_scaling;
         gemm_workspace_size_ = calcMaxWorkspaceSize(num_experts, use_mxfp8_weight_scaling);
     }
     return gemm_workspace_size_;
diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws.h
@@ -463,7 +463,13 @@ void dispatchMoeGemmSelectTileShapeTmaWarpSpecialized(TmaWarpSpecializedGroupedG
 
     if (gemm_config.sm_version == 90)
     {
-        if constexpr (kernels::cutlass_kernels::isValidHopperMOESpecialisation<T, WeightType, EpilogueTag, FUSION>())
+        // Block-scaled MXFP8xMXFP8 (IsMXFPX=true) is Blackwell-only; the SM90 launcher
+        // has no `is_mx_fpx=True` explicit instantiation in generate_kernels.py. Gate
+        // the SM90 dispatch on `!IsMXFPX` so the IsMXFPX=true template is never
+        // instantiated for Sm90 (otherwise the link of libth_common.so fails with
+        // undefined references when SM90 is included in CMAKE_CUDA_ARCHITECTURES).
+        if constexpr (!IsMXFPX
+            && kernels::cutlass_kernels::isValidHopperMOESpecialisation<T, WeightType, EpilogueTag, FUSION>())
         {
             switch (gemm_config.tile_config_sm90)
             {
@@ -558,34 +564,30 @@ size_t calcMaxWorkspaceSizeTmaWarpSpecialized(int num_experts, cutlass_extension
     // <e4m3, e4m3> needs the IsMXFPX template to match what the runtime dispatch will pick.
     constexpr bool is_wfp4afp8 = std::is_same_v<T, __nv_fp8_e4m3> && std::is_same_v<WeightType, __nv_fp4_e2m1>;
     constexpr bool is_wfp8afp8 = std::is_same_v<T, __nv_fp8_e4m3> && std::is_same_v<WeightType, __nv_fp8_e4m3>;
-    if constexpr (is_wfp4afp8)
+    auto pick_kernel = [&]()
     {
-        dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T, WeightType, OutputType,
-            cutlass_extensions::EpilogueOpDefault, FUSION, true>(
-            input, num_experts, gemm_config, multi_processor_count, cudaStream_t{0}, nullptr, &count);
-    }
-    else if constexpr (is_wfp8afp8)
-    {
-        bool const use_mxfp8 = fpX_block_scaling_type == TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType::MXFPX;
-        if (use_mxfp8)
+        if constexpr (is_wfp4afp8)
+        {
+            return &dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T, WeightType, OutputType,
+                cutlass_extensions::EpilogueOpDefault, FUSION, true>;
+        }
+        else if constexpr (is_wfp8afp8)
         {
-            dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T, WeightType, OutputType,
-                cutlass_extensions::EpilogueOpDefault, FUSION, true>(
-                input, num_experts, gemm_config, multi_processor_count, cudaStream_t{0}, nullptr, &count);
+            bool const use_mxfp8
+                = fpX_block_scaling_type == TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType::MXFPX;
+            return use_mxfp8 ? &dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T, WeightType, OutputType,
+                       cutlass_extensions::EpilogueOpDefault, FUSION, true>
+                             : &dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T, WeightType, OutputType,
+                                 cutlass_extensions::EpilogueOpDefault, FUSION, false>;
         }
         else
         {
-            dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T, WeightType, OutputType,
-                cutlass_extensions::EpilogueOpDefault, FUSION, false>(
-                input, num_experts, gemm_config, multi_processor_count, cudaStream_t{0}, nullptr, &count);
+            return &dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T, WeightType, OutputType,
+                cutlass_extensions::EpilogueOpDefault, FUSION, false>;
         }
-    }
-    else
-    {
-        dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T, WeightType, OutputType,
-            cutlass_extensions::EpilogueOpDefault, FUSION, false>(
-            input, num_experts, gemm_config, multi_processor_count, cudaStream_t{0}, nullptr, &count);
-    }
+    };
+    auto selected_kernel = pick_kernel();
+    selected_kernel(input, num_experts, gemm_config, multi_processor_count, cudaStream_t{0}, nullptr, &count);
     return count;
 }