Resolve some new comments

WeiHaocheng · WeiHaocheng · commit 13ac76d69abb · 2026-06-23T00:37:22.000-07:00
Signed-off-by: Fred Wei <20514172+WeiHaocheng@users.noreply.github.com>
diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm_configs.h b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm_configs.h
@@ -361,7 +361,12 @@ struct CutlassGemmConfig
         GROUPED_GEMM = 1u << 5,
         FP8_ONLY = 1u << 6,
         FP4_ONLY = 1u << 7,
-        FP8FP4_MIXED = 1u << 8
+        FP8FP4_MIXED = 1u << 8,
+        // MXFP8xMXFP8 block-scaled MoE on SM100/103. Restricts the candidate
+        // tile shapes to the subset valid for the Mxf8f6f4 tensor-op (TileM=128,
+        // TileN in {64,128,256}); otherwise autotuning would enumerate FP8 tile
+        // shapes that the runtime dispatcher rejects.
+        MXFP8_MXFP8 = 1u << 9
     };
 
     CutlassTileConfig tile_config_sm80 = CutlassTileConfig::ChooseWithHeuristic;
diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp
@@ -408,32 +408,65 @@ std::vector<CutlassGemmConfig> get_candidate_configs_sm100_dynamic_cluster_shape
         return candidate_configs;
     }
 
-    std::vector<std::pair<CutlassTileConfigSM100, ClusterShape>> tile_configs{
-        {CutlassTileConfigSM100::CtaShape64x32x128B, cluster1sm},
-        {CutlassTileConfigSM100::CtaShape64x64x128B, cluster1sm},
-        {CutlassTileConfigSM100::CtaShape64x128x128B, cluster1sm},
-        {CutlassTileConfigSM100::CtaShape64x256x128B, cluster1sm},
-        {CutlassTileConfigSM100::CtaShape128x32x128B, cluster1sm},
-        {CutlassTileConfigSM100::CtaShape128x64x128B, cluster1sm},
-        {CutlassTileConfigSM100::CtaShape128x128x128B, cluster1sm},
-        {CutlassTileConfigSM100::CtaShape128x256x128B, cluster1sm},
-    };
-
-    if (supports_2sm)
-    {
-        tile_configs.push_back({CutlassTileConfigSM100::CtaShape64x128x128B, cluster2sm});
-        tile_configs.push_back({CutlassTileConfigSM100::CtaShape64x256x128B, cluster2sm});
-        tile_configs.push_back({CutlassTileConfigSM100::CtaShape64x64x128B, cluster2sm});
-        tile_configs.push_back({CutlassTileConfigSM100::CtaShape128x64x128B, cluster2sm});
-        tile_configs.push_back({CutlassTileConfigSM100::CtaShape128x128x128B, cluster2sm});
-        tile_configs.push_back({CutlassTileConfigSM100::CtaShape128x256x128B, cluster2sm});
+    std::vector<std::pair<CutlassTileConfigSM100, ClusterShape>> tile_configs;
+    if ((config & CutlassGemmConfig::MXFP8_MXFP8) != 0)
+    {
+        // MXFP8xMXFP8 always instantiates the Mxf8f6f4 block-scaled tensor-op
+        // with cutlass::arch::Sm100, even on SM103 (the SM103 dispatch case in
+        // dispatchMoeGemmSelectTileShapeTmaWarpSpecialized only handles FP4xFP4;
+        // MXFP8 falls through to the sm_version>=100 && <120 branch which
+        // instantiates Arch=Sm100). Therefore the TMA-only constraint enforced
+        // by getDispatchFunctionForSM100 (Arch::kMinComputeCapability==103 is
+        // false for Sm100) applies on both SM100 and SM103, so we filter out
+        // non-TMA epilogue candidates unconditionally here.
+        if (schedule != EpilogueScheduleType::TMA)
+            return {};
+        // MXFP8xMXFP8 uses the Mxf8f6f4 block-scaled tensor-op; only TileM=128
+        // and TileN in {64,128,256} are valid (kept in sync with the IsMXFPX
+        // branch in are_tile_shapes_supported_sm100). Returning the broader FP8
+        // tile list would crash autotuning with "Unsupported tile shape" since
+        // the runtime dispatcher rejects the unsupported combinations.
+        tile_configs = {
+            {CutlassTileConfigSM100::CtaShape128x64x128B, cluster1sm},
+            {CutlassTileConfigSM100::CtaShape128x128x128B, cluster1sm},
+            {CutlassTileConfigSM100::CtaShape128x256x128B, cluster1sm},
+        };
+        if (supports_2sm)
+        {
+            tile_configs.push_back({CutlassTileConfigSM100::CtaShape128x64x128B, cluster2sm});
+            tile_configs.push_back({CutlassTileConfigSM100::CtaShape128x128x128B, cluster2sm});
+            tile_configs.push_back({CutlassTileConfigSM100::CtaShape128x256x128B, cluster2sm});
+        }
     }
-
-    if (config & CutlassGemmConfig::FP8_ONLY)
+    else
     {
-        tile_configs.push_back({CutlassTileConfigSM100::CtaShape128x16x128B, cluster1sm});
-        // TODO: re-enable when handled by the MoE GEMM dispatch
-        // tile_configs.push_back({ CutlassTileConfigSM100::CtaShape128x8x256B, ClusterShape::ClusterShape_1x1x1 });
+        tile_configs = {
+            {CutlassTileConfigSM100::CtaShape64x32x128B, cluster1sm},
+            {CutlassTileConfigSM100::CtaShape64x64x128B, cluster1sm},
+            {CutlassTileConfigSM100::CtaShape64x128x128B, cluster1sm},
+            {CutlassTileConfigSM100::CtaShape64x256x128B, cluster1sm},
+            {CutlassTileConfigSM100::CtaShape128x32x128B, cluster1sm},
+            {CutlassTileConfigSM100::CtaShape128x64x128B, cluster1sm},
+            {CutlassTileConfigSM100::CtaShape128x128x128B, cluster1sm},
+            {CutlassTileConfigSM100::CtaShape128x256x128B, cluster1sm},
+        };
+
+        if (supports_2sm)
+        {
+            tile_configs.push_back({CutlassTileConfigSM100::CtaShape64x128x128B, cluster2sm});
+            tile_configs.push_back({CutlassTileConfigSM100::CtaShape64x256x128B, cluster2sm});
+            tile_configs.push_back({CutlassTileConfigSM100::CtaShape64x64x128B, cluster2sm});
+            tile_configs.push_back({CutlassTileConfigSM100::CtaShape128x64x128B, cluster2sm});
+            tile_configs.push_back({CutlassTileConfigSM100::CtaShape128x128x128B, cluster2sm});
+            tile_configs.push_back({CutlassTileConfigSM100::CtaShape128x256x128B, cluster2sm});
+        }
+
+        if (config & CutlassGemmConfig::FP8_ONLY)
+        {
+            tile_configs.push_back({CutlassTileConfigSM100::CtaShape128x16x128B, cluster1sm});
+            // TODO: re-enable when handled by the MoE GEMM dispatch
+            // tile_configs.push_back({ CutlassTileConfigSM100::CtaShape128x8x256B, ClusterShape::ClusterShape_1x1x1 });
+        }
     }
 
     for (auto [tile, cluster] : tile_configs)
diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/mxfp8_mxfp4_gemm_template_sm100.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/mxfp8_mxfp4_gemm_template_sm100.h
@@ -84,6 +84,19 @@ struct MXSMTypeAdapter<__2SM>
     using MainloopSchedule = cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100;
 };
 
+namespace detail
+{
+template <typename T, typename = void>
+struct has_bias_ptr : std::false_type
+{
+};
+
+template <typename T>
+struct has_bias_ptr<T, std::void_t<decltype(std::declval<T&>().bias_ptr)>> : std::true_type
+{
+};
+} // namespace detail
+
 #ifdef PLACEHOLDER_KERNELS
 
 template <typename T, typename CTA_M, typename CTA_N, typename CTA_K, typename CGA_M, typename CGA_N, typename CGA_K,
@@ -187,7 +200,10 @@ typename Gemm::Arguments prepareGemmArgsSm100(void* D, void const* A, void const
     operator_args.mode = cutlass::gemm::GemmUniversalMode::kGemm;
     auto& fusion_args = operator_args.epilogue.thread;
     fusion_args.alpha_ptr = static_cast<ElementCompute const*>(global_sf);
-    fusion_args.bias_ptr = static_cast<ElementD const*>(bias);
+    if constexpr (detail::has_bias_ptr<std::decay_t<decltype(fusion_args)>>::value)
+    {
+        fusion_args.bias_ptr = static_cast<ElementD const*>(bias);
+    }
 
     operator_args.problem_shape = cute::make_shape(m, n, k, batch_count);
 
diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h
@@ -300,10 +300,12 @@ class MoeGemmRunner
     void moeGemm(GroupedGemmInput<T, WeightType, ScaleBiasType, OutputType> inputs,
         TmaWarpSpecializedGroupedGemmInput hopper_inputs);
 
-    std::vector<cutlass_extensions::CutlassGemmConfig> getConfigs(bool supports_finalize_fusion) const;
-    static std::vector<cutlass_extensions::CutlassGemmConfig> getConfigs(int sm, bool supports_finalize_fusion);
+    std::vector<cutlass_extensions::CutlassGemmConfig> getConfigs(
+        bool supports_finalize_fusion, bool use_mxfp8 = false) const;
+    static std::vector<cutlass_extensions::CutlassGemmConfig> getConfigs(
+        int sm, bool supports_finalize_fusion, bool use_mxfp8 = false);
     static std::vector<cutlass_extensions::CutlassGemmConfig> getTmaWarpSpecializedConfigs(
-        int sm, bool supports_finalize_fusion);
+        int sm, bool supports_finalize_fusion, bool use_mxfp8 = false);
     static std::vector<cutlass_extensions::CutlassGemmConfig> getAmpereConfigs(int sm);
 
     [[nodiscard]] bool isTmaWarpSpecialized(cutlass_extensions::CutlassGemmConfig gemm_config) const;
@@ -335,6 +337,7 @@ class MoeGemmRunner
     int sm_{};
     int multi_processor_count_{};
     mutable int num_experts_ = 0;
+    mutable bool use_mxfp8_weight_scaling_ = false;
     mutable size_t gemm_workspace_size_ = 0;
     size_t calcMaxWorkspaceSize(int num_experts, bool use_mxfp8_weight_scaling) const;
 };
diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h
@@ -685,7 +685,11 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface
 
     std::vector<cutlass_extensions::CutlassGemmConfig> getTactics(MoeGemmId gemm_id) override
     {
-        return moe_gemm_runner_.getConfigs(gemm_id == MoeGemmId::GEMM_2 && mayHaveFinalizeFused());
+        // Pass `use_mxfp8_weight_scaling_` so MXFP8xMXFP8 enumerates only the
+        // Mxf8f6f4-valid tile shapes; otherwise autotuning would invoke FP8
+        // tile shapes that the runtime dispatcher rejects with TLLM_THROW.
+        return moe_gemm_runner_.getConfigs(
+            gemm_id == MoeGemmId::GEMM_2 && mayHaveFinalizeFused(), use_mxfp8_weight_scaling_);
     }
 
     static std::vector<cutlass_extensions::CutlassGemmConfig> getTactics(int sm, MoeGemmId gemm_id)
diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h
@@ -483,17 +483,17 @@ namespace kernels::cutlass_kernels
 
 template <typename T, typename WeightType, typename OutputType, typename ScaleBiasType>
 std::vector<cutlass_extensions::CutlassGemmConfig> MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::getConfigs(
-    bool supports_finalize_fusion) const
+    bool supports_finalize_fusion, bool use_mxfp8) const
 {
-    return getConfigs(sm_, supports_finalize_fusion);
+    return getConfigs(sm_, supports_finalize_fusion, use_mxfp8);
 }
 
 template <typename T, typename WeightType, typename OutputType, typename ScaleBiasType>
 std::vector<cutlass_extensions::CutlassGemmConfig> MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::getConfigs(
-    int sm, bool supports_finalize_fusion)
+    int sm, bool supports_finalize_fusion, bool use_mxfp8)
 {
     std::vector<cutlass_extensions::CutlassGemmConfig> candidate_configs
-        = getTmaWarpSpecializedConfigs(sm, supports_finalize_fusion);
+        = getTmaWarpSpecializedConfigs(sm, supports_finalize_fusion, use_mxfp8);
     std::vector<cutlass_extensions::CutlassGemmConfig> ampere_configs = getAmpereConfigs(sm);
     std::copy(ampere_configs.begin(), ampere_configs.end(), std::back_inserter(candidate_configs));
     return candidate_configs;
@@ -530,7 +530,7 @@ MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::getAmpereConfigs(int sm
 template <typename T, typename WeightType, typename OutputType, typename ScaleBiasType>
 std::vector<cutlass_extensions::CutlassGemmConfig>
 MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::getTmaWarpSpecializedConfigs(
-    int sm, bool supports_finalize_fusion)
+    int sm, bool supports_finalize_fusion, bool use_mxfp8)
 {
     using tensorrt_llm::cutlass_extensions::CutlassGemmConfig;
     static constexpr auto weight_only_flag
@@ -545,8 +545,16 @@ MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::getTmaWarpSpecializedCo
     static constexpr auto fp4_only_flag
         = (use_fp4 || use_wfp4afp8) ? CutlassGemmConfig::FP4_ONLY : CutlassGemmConfig::NONE;
     static constexpr auto fp8fp4_mixed_flag = use_wfp4afp8 ? CutlassGemmConfig::FP8FP4_MIXED : CutlassGemmConfig::NONE;
-    auto config_type_param = static_cast<CutlassGemmConfig::CandidateConfigTypeParam>(weight_only_flag | simt_only_flag
-        | grouped_gemm_flag | enable_blackwell | enable_hopper | fp8_only_flag | fp4_only_flag | fp8fp4_mixed_flag);
+    // MXFP8xMXFP8 only applies to <e4m3, e4m3>; for other type pairs the flag is ignored.
+#if defined(ENABLE_FP8)
+    static constexpr bool is_wfp8afp8 = std::is_same_v<T, __nv_fp8_e4m3> && std::is_same_v<WeightType, __nv_fp8_e4m3>;
+#else
+    static constexpr bool is_wfp8afp8 = false;
+#endif
+    int const mxfp8_flag = (use_mxfp8 && is_wfp8afp8) ? CutlassGemmConfig::MXFP8_MXFP8 : CutlassGemmConfig::NONE;
+    auto config_type_param
+        = static_cast<CutlassGemmConfig::CandidateConfigTypeParam>(weight_only_flag | simt_only_flag | grouped_gemm_flag
+            | enable_blackwell | enable_hopper | fp8_only_flag | fp4_only_flag | fp8fp4_mixed_flag | mxfp8_flag);
     TLLM_CHECK_WITH_INFO(!(enable_blackwell && enable_hopper), "Blackwell and hopper flags are mutually exclusive");
 
     sm = use_wfp4afp8 && sm == 103 ? 100 : sm;
@@ -770,56 +778,38 @@ void MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::dispatchToArch(
                 bool const use_mxfp8 = is_wfp8afp8
                     && hopper_inputs.fpX_block_scaling_type
                         == TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType::MXFPX;
+                // Pick the IsMXFPX template parameter for a given FUSION, factoring out the duplicated
+                // is_wfp4afp8 / is_wfp8afp8 / else chain. C++17-compatible via an integral_constant tag.
+                auto select_mxfpx_mode = [&](auto fusion_tag)
+                {
+                    constexpr auto FUSION = decltype(fusion_tag)::value;
+                    if constexpr (is_wfp4afp8)
+                    {
+                        return &cutlass_kernels_oss::dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T, WeightType,
+                            OutputType, EpilogueTag, FUSION, true>;
+                    }
+                    else if constexpr (is_wfp8afp8)
+                    {
+                        return use_mxfp8 ? &cutlass_kernels_oss::dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T,
+                                   WeightType, OutputType, EpilogueTag, FUSION, true>
+                                         : &cutlass_kernels_oss::dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T,
+                                             WeightType, OutputType, EpilogueTag, FUSION, false>;
+                    }
+                    else
+                    {
+                        return &cutlass_kernels_oss::dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T, WeightType,
+                            OutputType, EpilogueTag, FUSION, false>;
+                    }
+                };
                 auto select_function = [&]()
                 {
+                    using Fusion = TmaWarpSpecializedGroupedGemmInput::EpilogueFusion;
                     switch (hopper_inputs.fusion)
                     {
-                    case TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::FINALIZE:
-                        if constexpr (is_wfp4afp8)
-                        {
-                            return &cutlass_kernels_oss::dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T, WeightType,
-                                OutputType, EpilogueTag, TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::FINALIZE,
-                                true>;
-                        }
-                        else if constexpr (is_wfp8afp8)
-                        {
-                            return use_mxfp8 ? &cutlass_kernels_oss::dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T,
-                                       WeightType, OutputType, EpilogueTag,
-                                       TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::FINALIZE, true>
-                                             : &cutlass_kernels_oss::dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T,
-                                                 WeightType, OutputType, EpilogueTag,
-                                                 TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::FINALIZE, false>;
-                        }
-                        else
-                        {
-                            return &cutlass_kernels_oss::dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T, WeightType,
-                                OutputType, EpilogueTag, TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::FINALIZE,
-                                false>;
-                        }
-                    case TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::NONE:
-                        if constexpr (is_wfp4afp8)
-                        {
-                            return &cutlass_kernels_oss::dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T, WeightType,
-                                OutputType, EpilogueTag, TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::NONE,
-                                true>;
-                        }
-                        else if constexpr (is_wfp8afp8)
-                        {
-                            return use_mxfp8 ? &cutlass_kernels_oss::dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T,
-                                       WeightType, OutputType, EpilogueTag,
-                                       TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::NONE, true>
-                                             : &cutlass_kernels_oss::dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T,
-                                                 WeightType, OutputType, EpilogueTag,
-                                                 TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::NONE, false>;
-                        }
-                        else
-                        {
-                            return &cutlass_kernels_oss::dispatchMoeGemmSelectTileShapeTmaWarpSpecialized<T, WeightType,
-                                OutputType, EpilogueTag, TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::NONE,
-                                false>;
-                        }
-                    case TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::ACTIVATION:
-                    case TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::GATED_ACTIVATION:
+                    case Fusion::FINALIZE: return select_mxfpx_mode(std::integral_constant<Fusion, Fusion::FINALIZE>{});
+                    case Fusion::NONE: return select_mxfpx_mode(std::integral_constant<Fusion, Fusion::NONE>{});
+                    case Fusion::ACTIVATION:
+                    case Fusion::GATED_ACTIVATION:
                     default: TLLM_THROW("Unimplemented fusion %d requested", (int) hopper_inputs.fusion);
                     };
                 };
@@ -923,10 +913,13 @@ template <typename T, typename WeightType, typename OutputType, typename ScaleBi
 size_t MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::getMaxWorkspaceSize(
     int num_experts, bool use_mxfp8_weight_scaling) const
 {
-    if (num_experts != num_experts_)
+    if (num_experts != num_experts_ || use_mxfp8_weight_scaling != use_mxfp8_weight_scaling_)
     {
-        TLLM_LOG_TRACE("Calling getMaxWorkspaceSize() with a new expert count %d vs %d", num_experts, num_experts_);
+        TLLM_LOG_TRACE(
+            "Calling getMaxWorkspaceSize() with a new (expert count, use_mxfp8_weight_scaling) (%d, %d) vs (%d, %d)",
+            num_experts, (int) use_mxfp8_weight_scaling, num_experts_, (int) use_mxfp8_weight_scaling_);
         num_experts_ = num_experts;
+        use_mxfp8_weight_scaling_ = use_mxfp8_weight_scaling;
         gemm_workspace_size_ = calcMaxWorkspaceSize(num_experts, use_mxfp8_weight_scaling);
     }
     return gemm_workspace_size_;
@@ -949,8 +942,11 @@ size_t MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::calcMaxWorkspace
         && !use_w4afp8 && !use_wfp4a16)
     {
         // Finalize fusion may not actually be supported by the kernel,
-        // if they are not we will catch the error and skip them
-        auto configs = getTmaWarpSpecializedConfigs(sm_, true);
+        // if they are not we will catch the error and skip them. Restrict the
+        // candidate set to MXFP8-valid tiles when the caller is sizing for the
+        // MXFP8xMXFP8 variant; otherwise the FP8 list would include tiles the
+        // dispatcher rejects.
+        auto configs = getTmaWarpSpecializedConfigs(sm_, true, use_mxfp8_weight_scaling);
         // For <e4m3, e4m3> the same template compiles both per-tensor FP8
         // (NONE) and MXFP8 block-scaled (MXFPX) variants; the caller passes
         // `use_mxfp8_weight_scaling` so we size workspace for exactly the
diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws.h

Original file line number	Diff line number	Diff line change
`@@ -685,7 +685,11 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface`
`685`	`685`
`686`	`686`	`std::vector<cutlass_extensions::CutlassGemmConfig> getTactics(MoeGemmId gemm_id) override`
`687`	`687`	`{`
`688`		`- return moe_gemm_runner_.getConfigs(gemm_id == MoeGemmId::GEMM_2 && mayHaveFinalizeFused());`
	`688`	``+ // Pass `use_mxfp8_weight_scaling_` so MXFP8xMXFP8 enumerates only the``
	`689`	`+ // Mxf8f6f4-valid tile shapes; otherwise autotuning would invoke FP8`
	`690`	`+ // tile shapes that the runtime dispatcher rejects with TLLM_THROW.`
	`691`	`+ return moe_gemm_runner_.getConfigs(`
	`692`	`+ gemm_id == MoeGemmId::GEMM_2 && mayHaveFinalizeFused(), use_mxfp8_weight_scaling_);`
`689`	`693`	`}`
`690`	`694`
`691`	`695`	`static std::vector<cutlass_extensions::CutlassGemmConfig> getTactics(int sm, MoeGemmId gemm_id)`