Ban DeepSeekV3 routing with BF16 TRTLLMGenFusedMoE due to a bug inside FlashInfer

rosenrodt · rosenrodt · commit 5b0a3fb439fb · 2026-03-28T15:54:14.000+08:00
Signed-off-by: Anthony Chang <27950904+rosenrodt@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py b/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py
@@ -28,6 +28,7 @@
 4. Unified EPLB integration for backends that support it
 """
 
+import copy
 from typing import Dict, List, Optional, Tuple, Union
 
 import torch
@@ -162,21 +163,34 @@ def __init__(
         self.apply_router_weight_on_input = apply_router_weight_on_input
 
         # ========== Create MoE Backend (Default: Cutlass) ==========
-        from tensorrt_llm._torch.modules.fused_moe.create_moe import create_moe_backend, get_moe_cls
+        from tensorrt_llm._torch.modules.fused_moe.create_moe import (
+            create_moe_backend,
+            resolve_moe_cls,
+        )
+
+        # Get MoE backend class based on override_quant_config, routing_method, and model_config
+        moe_cls = resolve_moe_cls(
+            model_config,
+            routing_method,
+            self.dtype,
+            override_quant_config=override_quant_config,
+        )
 
-        # Get MoE backend class based on override_quant_config or model_config
-        moe_cls = get_moe_cls(model_config, override_quant_config=override_quant_config)
+        backend_model_config = model_config
+        if override_quant_config is not None:
+            backend_model_config = copy.deepcopy(model_config)
+            backend_model_config.quant_config = override_quant_config
 
         # Call create_moe_backend with all necessary parameters
         # init_load_balancer=False: Prevents backend from registering itself with load balancer
         # without_comm=True: Prevents backend from initializing communication (ConfigurableMoE handles it)
         # skip_create_weights_in_init=True: Prevents backend from creating weights in __init__
         #   because backend uses layer_idx=None and may have different expert assignments
         #   We will create weights after syncing attributes from ConfigurableMoE
-        tmp_skip_create_weights_in_init = model_config.skip_create_weights_in_init
-        model_config._frozen = False
-        model_config.skip_create_weights_in_init = True
-        model_config._frozen = True
+        tmp_skip_create_weights_in_init = backend_model_config.skip_create_weights_in_init
+        backend_model_config._frozen = False
+        backend_model_config.skip_create_weights_in_init = True
+        backend_model_config._frozen = True
 
         backend = create_moe_backend(
             moe_cls=moe_cls,
@@ -186,7 +200,7 @@ def __init__(
             intermediate_size=self.intermediate_size,
             dtype=self.dtype,
             reduce_results=self.reduce_results,
-            model_config=model_config,
+            model_config=backend_model_config,
             aux_stream_dict=self.aux_stream_dict,
             weight_loading_mode=self.weight_loading_mode,
             bias=kwargs.get("bias", False),
@@ -221,10 +235,10 @@ def __init__(
             self.backend.expert_size_per_partition = self.expert_size_per_partition
 
         # Create weights here, because the backend needs the layer_load_balancer info to create weights
-        model_config._frozen = False
-        model_config.skip_create_weights_in_init = tmp_skip_create_weights_in_init
-        model_config._frozen = True
-        if not model_config.skip_create_weights_in_init:
+        backend_model_config._frozen = False
+        backend_model_config.skip_create_weights_in_init = tmp_skip_create_weights_in_init
+        backend_model_config._frozen = True
+        if not backend_model_config.skip_create_weights_in_init:
             self.backend.create_weights()
 
         # ========== Create Communication Strategy ==========
diff --git a/tensorrt_llm/_torch/modules/fused_moe/create_moe.py b/tensorrt_llm/_torch/modules/fused_moe/create_moe.py
@@ -77,6 +77,25 @@ def get_moe_cls(
         raise ValueError(f"Unsupported moe backend: {moe_backend}")
 
 
+def resolve_moe_cls(
+        model_config: ModelConfig,
+        routing_method: BaseMoeRoutingMethod,
+        dtype: Optional[torch.dtype],
+        override_quant_config: Optional[QuantConfig] = None) -> Type[MoE]:
+    moe_cls = get_moe_cls(model_config, override_quant_config)
+
+    effective_quant_config = override_quant_config or model_config.quant_config
+    has_quant = (effective_quant_config is not None
+                 and effective_quant_config.layer_quant_mode.has_any_quant(
+                     exclude_kv_cache=True))
+    if (moe_cls == TRTLLMGenFusedMoE and not has_quant
+            and not TRTLLMGenFusedMoE._supports_flashinfer_bf16_routing_method(
+                routing_method)):
+        return CutlassFusedMoE
+
+    return moe_cls
+
+
 def create_moe_backend(
     moe_cls: Type[MoE],
     routing_method: BaseMoeRoutingMethod,
@@ -353,7 +372,8 @@ def create_moe(
             pretrained_config, 'torch_dtype'):
         dtype = pretrained_config.torch_dtype
 
-    moe_cls = get_moe_cls(model_config, override_quant_config)
+    moe_cls = resolve_moe_cls(model_config, routing_method, dtype,
+                              override_quant_config)
 
     enable_configurable_moe = os.environ.get("ENABLE_CONFIGURABLE_MOE",
                                              "1") == "1"
diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py
@@ -318,6 +318,12 @@ def _is_unquantized_path(self) -> bool:
         return self.quant_config is None or not self.quant_config.layer_quant_mode.has_any_quant(
             exclude_kv_cache=True)
 
+    @staticmethod
+    def _supports_flashinfer_bf16_routing_method(
+        routing_method: BaseMoeRoutingMethod, ) -> bool:
+        # FIXME: ban DeepSeekV3 routing with FlashInfer trtllm_bf16_routed_moe() as it appears to have a bug
+        return not isinstance(routing_method, DeepSeekV3MoeRoutingMethod)
+
     def _requires_separated_routing(self) -> bool:
         """Whether this backend instance expects precomputed top-k routing."""
         # FIXME: ban FlashInfer BF16 MoE direct routing as it appears to have accuracy bug
@@ -331,6 +337,9 @@ def _check_flashinfer_backend_support(self) -> bool:
                 return False
             if self.activation_type != ActivationType.Swiglu:
                 return False
+            if not self._supports_flashinfer_bf16_routing_method(
+                    self.routing_method):
+                return False
             return True
 
         use_flashinfer = os.environ.get("TRTLLM_GEN_FUSED_MOE_USE_FLASHINFER",