PaddlePaddle · zccjjj · Mar 23, 2026 · Mar 25, 2026 · fastdeploy-bot · Apr 9, 2026
diff --git a/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_cutlass_metax_backend.py b/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_cutlass_metax_backend.py
@@ -240,7 +240,7 @@ class MetaxCutlassWeightOnlyMoEMethod(MetaxCutlassMoEMethod):
     def __init__(self, quant_config):
         super().__init__(quant_config)
         if quant_config is None:
-            self.quant_config = WeightOnlyConfig(algo="weight_only_int8", is_checkpoint_bf16=True)
+            self.quant_config = WeightOnlyConfig(algo="weight_only_int8")
         else:
             self.quant_config = quant_config
         self.moe_quant_type = self.quant_config.algo
@@ -480,21 +480,18 @@ def _process_quantize(weight_idx):
             getattr(layer, weight_name).copy_(weight.transpose([0, 2, 1]), False)
             getattr(layer, scale_name).copy_(scale, False)
 
-        if self.quant_config.is_checkpoint_bf16:
-            weight_id_map = {"gate_up": 0, "down": 1}
-            if weight_fully_copied(layer.up_gate_proj_weight):
-                weight_type = "gate_up"
-            else:
-                weight_type = "down"
-
-            if self.model_format == "torch":
-                unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace(
-                    "quant_weight", "weight"
-                )
-                process_weight_transpose(layer, unquantized_weight_name)
-            _process_quantize(weight_id_map[weight_type])
+        weight_id_map = {"gate_up": 0, "down": 1}
+        if weight_fully_copied(layer.up_gate_proj_weight):
+            weight_type = "gate_up"
         else:
-            return
+            weight_type = "down"
+
+        if self.model_format == "torch":
+            unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace(
+                "quant_weight", "weight"
+            )
+            process_weight_transpose(layer, unquantized_weight_name)
+        _process_quantize(weight_id_map[weight_type])
 
     def process_loaded_weights(self, layer: nn.Layer, state_dict):
         """

diff --git a/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_triton_metax_backend.py b/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_triton_metax_backend.py
@@ -69,8 +69,7 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
             layer.hidden_size,
         ]
         # TODO(bukejiyu): remove v1 loader check when v0 loader is removed
-        is_checkpoint_bf16 = self.quant_config.is_checkpoint_bf16 if self.quant_config is not None else True
-        if is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1":
+        if layer.fd_config.load_config.load_choices == "default_v1":
             layer.up_gate_proj_weight = layer.create_parameter(
                 shape=self.up_gate_proj_weight_shape,
                 dtype=layer.weight_dtype,
@@ -184,10 +183,6 @@ def process_loaded_weights(self, layer: nn.Layer, state_dict):
     @paddle.no_grad()
     def process_weights_after_loading(self, layer):
         """ """
-        is_checkpoint_bf16 = self.quant_config.is_checkpoint_bf16 if self.quant_config is not None else True
-        if not is_checkpoint_bf16:
-            return
-
         if self.quant_config is not None:
             algo = layer.quant_method.quant_config.name()
             assert algo == "wint8"

diff --git a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py
@@ -616,8 +616,6 @@ def process_loaded_weights(self, layer: nn.Layer, state_dict):
 
     def process_weights_after_loading(self, layer):
         """ """
-        if not self.quant_config.is_checkpoint_bf16:
-            return
         weight_id_map = {"gate_up": 0, "down": 1}
         if (
             hasattr(layer.up_gate_proj_weight, "tensor_track")

diff --git a/fastdeploy/model_executor/layers/backends/xpu/quantization/weight_only.py b/fastdeploy/model_executor/layers/backends/xpu/quantization/weight_only.py
@@ -118,8 +118,6 @@ def process_loaded_weights(self, layer: nn.Layer, weight: paddle.Tensor) -> None
         layer.weight_scale.set_value(weight_scale)
 
     def process_weights_after_loading(self, layer) -> None:
-        if not self.quant_config.is_checkpoint_bf16:
-            return
 
         quanted_weight, weight_scale = self._quantize_weight_in_blocks(layer.weight)
         free_tensor(layer.weight)

diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py
@@ -1468,7 +1468,7 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
         self.down_proj_scale_shape = [layer.num_local_experts, layer.hidden_size]
         self.model_format = extra_weight_attrs.get("model_format")
         # TODO(bukejiyu): remove v1 loader check when v0 loader is removed
-        if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1":
+        if layer.fd_config.load_config.load_choices == "default_v1":
             if self.model_format != "torch":
                 up_gate_proj_weight_shape = [
                     layer.num_local_experts,
@@ -1649,21 +1649,18 @@ def _process_quantize(weight_idx):
             getattr(layer, weight_name).copy_(weight, False)
             getattr(layer, scale_name).copy_(scale, False)
 
-        if self.quant_config.is_checkpoint_bf16:
-            weight_id_map = {"gate_up": 0, "down": 1}
-            if weight_fully_copied(layer.up_gate_proj_weight):
-                weight_type = "gate_up"
-            else:
-                weight_type = "down"
-
-            if self.model_format == "torch":
-                unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace(
-                    "quant_weight", "weight"
-                )
-                process_weight_transpose(layer, unquantized_weight_name)
-            _process_quantize(weight_id_map[weight_type])
+        weight_id_map = {"gate_up": 0, "down": 1}
+        if weight_fully_copied(layer.up_gate_proj_weight):
+            weight_type = "gate_up"
         else:
-            return
+            weight_type = "down"
+
+        if self.model_format == "torch":
+            unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace(
+                "quant_weight", "weight"
+            )
+            process_weight_transpose(layer, unquantized_weight_name)
+        _process_quantize(weight_id_map[weight_type])
 
     def process_loaded_weights(self, layer: nn.Layer, state_dict):
         """

diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py
@@ -84,7 +84,7 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
         ]
         self.model_format = extra_weight_attrs.get("model_format")
         # TODO(bukejiyu): remove v1 loader check when v0 loader is removed
-        if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1":
+        if layer.fd_config.load_config.load_choices == "default_v1":
             if self.model_format != "torch":
                 up_gate_proj_weight_shape = [
                     layer.num_local_experts,
@@ -268,21 +268,17 @@ def _process_quantize(weight_idx):
             getattr(layer, weight_name).copy_(quanted_weight, False)
             getattr(layer, scale_name).copy_(quanted_weight_scale, False)
 
-        if self.quant_config.is_checkpoint_bf16:
-            weight_id_map = {"gate_up": 0, "down": 1}
-            if weight_fully_copied(layer.up_gate_proj_weight):
-                weight_type = "gate_up"
-            else:
-                weight_type = "down"
-            if self.model_format == "torch":
-                unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace(
-                    "quant_weight", "weight"
-                )
-                process_weight_transpose(layer, unquantized_weight_name)
-            _process_quantize(weight_id_map[weight_type])
-
+        weight_id_map = {"gate_up": 0, "down": 1}
+        if weight_fully_copied(layer.up_gate_proj_weight):
+            weight_type = "gate_up"
         else:
-            return
+            weight_type = "down"
+        if self.model_format == "torch":
+            unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace(
+                "quant_weight", "weight"
+            )
+            process_weight_transpose(layer, unquantized_weight_name)
+        _process_quantize(weight_id_map[weight_type])
 
     def apply(
         self,

diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py
@@ -30,7 +30,7 @@
 from fastdeploy.model_executor.layers.moe.routing_indices_cache import (
     save_routing_to_buffer,
 )
-from fastdeploy.model_executor.layers.utils import get_tensor
+from fastdeploy.model_executor.layers.utils import get_tensor, modules_to_convert
 from fastdeploy.model_executor.utils import h2d_copy, slice_fn
 from fastdeploy.platforms import current_platform
 from fastdeploy.worker.experts_manager import RedundantExpertManger
@@ -152,6 +152,7 @@ def __init__(
         with_bias: bool = False,
         activation="swiglu",
         model_format: Optional[str] = None,
+        prefix: str = "",
     ):
         """
         Initialize the Moe layer with given parameters.
@@ -175,7 +176,7 @@ def __init__(
         if self.ep_size > 1:
             self.tp_size = 1
             self.tp_rank = 0
-
+        self.prefix = prefix
         self.attn_tp_size = fd_config.parallel_config.tensor_parallel_size
         self.attn_tp_rank = fd_config.parallel_config.tensor_parallel_rank
 
@@ -226,7 +227,7 @@ def __init__(
         moe_quant_config = fd_config.quant_config
         self.moe_quant_config = moe_quant_config
         self.moe_quant_type = None
-        if moe_quant_config and moe_quant_config.get_quant_method(self):
+        if moe_quant_config and moe_quant_config.get_quant_method(self) and modules_to_convert(prefix, self.fd_config):
             self.quant_method = moe_quant_config.get_quant_method(self)
             self.moe_quant_type = moe_quant_config.name()
         else:

diff --git a/fastdeploy/model_executor/layers/quantization/mix_quant.py b/fastdeploy/model_executor/layers/quantization/mix_quant.py
@@ -18,6 +18,7 @@
 
 from fastdeploy.model_executor.layers.attention.attention import Attention
 from fastdeploy.model_executor.layers.moe.moe import FusedMoE
+from fastdeploy.model_executor.utils import get_special_quant_config
 
 from . import get_quantization_config
 from .quant_base import QuantConfigBase, QuantMethodBase
@@ -41,6 +42,7 @@ def __init__(
         hadamard_block_size: int = 128,
         moe_dynamic_quant: bool = False,
         is_moe_quantized: bool = False,
+        modules_to_quant: dict = {},
     ) -> None:
         super().__init__()
         self.dense_quant_type = dense_quant_type
@@ -61,6 +63,7 @@ def __init__(
         self.hadamard_block_size = hadamard_block_size
         self.moe_dynamic_quant = moe_dynamic_quant
         self.is_moe_quantized = is_moe_quantized
+        self.modules_to_quant = modules_to_quant
 
     def name(self) -> str:
         return "mix_quant"
@@ -79,14 +82,15 @@ def from_config(cls, config: dict) -> "MixQuantConfig":
             config.get("hadamard_block_size", 128),
             config.get("moe_dynamic_quant", False),
             config.get("is_moe_quantized", False),
+            config.get("modules_to_quant", {}),
         )
 
     def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
         if isinstance(layer, FusedMoE):
             if layer.moe_tag == "Image":
                 if self.image_moe_quant_type is not None:
                     return (
-                        get_quantization_config(self.image_moe_quant_type)
+                        get_special_quant_config(layer, self.modules_to_quant, self.image_moe_quant_type)
                         .from_config(
                             {
                                 "is_permuted": self.is_permuted,
@@ -101,7 +105,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
             else:
                 if self.moe_quant_type is not None:
                     return (
-                        get_quantization_config(self.moe_quant_type)
+                        get_special_quant_config(layer, self.modules_to_quant, self.moe_quant_type)
                         .from_config(
                             {
                                 "is_permuted": self.is_permuted,

diff --git a/fastdeploy/model_executor/layers/quantization/weight_only.py b/fastdeploy/model_executor/layers/quantization/weight_only.py
@@ -70,7 +70,6 @@ class WeightOnlyConfig(QuantConfigBase):
     def __init__(
         self,
         algo: str,
-        is_checkpoint_bf16: bool = False,
     ) -> None:
         super().__init__()
         self.algo = algo
@@ -82,7 +81,7 @@ def __init__(
         self.quant_max_bound = 0
         self.quant_min_bound = 0
         self.quant_round_type = 0
-        self.is_checkpoint_bf16 = is_checkpoint_bf16
+        self.is_checkpoint_bf16 = True  # weight-only linear supports dynamic quantization only
         self.group_size = -1
 
     def name(self) -> str:
@@ -91,11 +90,12 @@ def name(self) -> str:
     @classmethod
     def from_config(cls, config: dict) -> "WeightOnlyConfig":
         algo = config["algo"]
-        is_checkpoint_bf16 = not config.get("is_quantized", False)
-        return cls(algo, is_checkpoint_bf16)
+        return cls(algo)
 
     def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
+        # Select the quantization method based on the platform type and the layer type
         if current_platform.is_xpu():
+            # XPU platform: distinguish MoE layers from regular Linear layers
             if isinstance(layer, FusedMoE):
                 from fastdeploy.model_executor.layers.backends import (
                     XPUWeightOnlyMoEMethod,
@@ -109,6 +109,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
 
                 return XPUWeightOnlyLinearMethod(self)
         elif current_platform.is_gcu():
+            # GCU platform: distinguish MoE layers from regular Linear layers
             from fastdeploy.model_executor.layers.backends import (
                 GCUWeightOnlyLinearMethod,
                 GCUWeightOnlyMoEMethod,
@@ -119,6 +120,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
             else:
                 return GCUWeightOnlyLinearMethod(self)
         elif current_platform.is_dcu():
+            # DCU platform: distinguish MoE layers from regular Linear layers
             if isinstance(layer, FusedMoE):
                 from fastdeploy.model_executor.layers.backends import (
                     DCUTritonWeightOnlyMoEMethod,
@@ -132,6 +134,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
 
                 return DCUWeightOnlyLinearMethod(self)
         elif current_platform.is_maca():
+            # MACA platform: MoE layers support two backends, cutlass and triton
             if isinstance(layer, FusedMoE):
                 from fastdeploy.model_executor.layers.backends import (
                     MetaxCutlassWeightOnlyMoEMethod,
@@ -166,6 +169,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
 
                 return IluvatarWeightOnlyLinearMethod(self)
         else:
+            # Default GPU platform: MoE layers support three backends, cutlass/triton/marlin
             if isinstance(layer, FusedMoE):
                 if layer.use_method == "cutlass":
                     from fastdeploy.model_executor.layers.moe.fused_moe_cutlass_backend import (
@@ -188,6 +192,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
                 else:
                     raise ValueError(f"Unsupported MOE backend {layer.use_method}")
             else:
+                # Regular Linear layers: use the Machete-optimized kernel when the conditions are met, otherwise the default GPU method
                 if (
                     _ENABLE_MACHETE
                     and envs.FD_USE_MACHETE == "1"
@@ -206,13 +211,12 @@ class WINT8Config(WeightOnlyConfig):
     weight only int8 config
     """
 
-    def __init__(self, is_checkpoint_bf16: bool = False) -> None:
-        super().__init__("weight_only_int8", is_checkpoint_bf16)
+    def __init__(self) -> None:
+        super().__init__("weight_only_int8")
 
     @classmethod
     def from_config(cls, config: dict) -> "WINT8Config":
-        is_checkpoint_bf16 = not config.get("is_quantized", False)
-        return cls(is_checkpoint_bf16)
+        return cls()
 
     def name(self) -> str:
         return "wint8"
@@ -225,14 +229,12 @@ class WINT4Config(WeightOnlyConfig):
 
     def __init__(
         self,
-        is_checkpoint_bf16: bool = False,
     ) -> None:
-        super().__init__("weight_only_int4", is_checkpoint_bf16)
+        super().__init__("weight_only_int4")
 
     @classmethod
     def from_config(cls, config: dict) -> "WINT4Config":
-        is_checkpoint_bf16 = not config.get("is_quantized", False)
-        return cls(is_checkpoint_bf16)
+        return cls()
 
     def name(self) -> str:
         return "wint4"
@@ -253,7 +255,7 @@ def __init__(
     def create_weights(self, layer, **extra_weight_attrs):
         # TODO(bukejiyu): remove v1 loader check when v0 loader is removed
         self.model_format = extra_weight_attrs.get("model_format")
-        if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1":
+        if layer.fd_config.load_config.load_choices == "default_v1":
             weight_shape = layer.weight_shape[::-1] if self.model_format == "torch" else layer.weight_shape
             layer.weight = layer.create_parameter(
                 shape=weight_shape,
@@ -363,12 +365,9 @@ def _process_quantize():
             layer.weight.copy_(quanted_weight_tensor, False)
             layer.weight_scale.copy_(weight_scale_tensor, False)
 
-        if self.quant_config.is_checkpoint_bf16:
-            if self.model_format == "torch":
-                process_weight_transpose(layer, "weight")
-            _process_quantize()
-        else:
-            return
+        if self.model_format == "torch":
+            process_weight_transpose(layer, "weight")
+        _process_quantize()
 
     @abstractmethod
     def process_loaded_weights(self, layer, weights) -> None:

diff --git a/fastdeploy/model_executor/models/deepseek_v3.py b/fastdeploy/model_executor/models/deepseek_v3.py
@@ -178,6 +178,7 @@ def __init__(self, fd_config: FDConfig, layer_id: int, prefix: str) -> None:
             layer_idx=layer_id,
             gate_correction_bias=self.gate.e_score_correction_bias,
             weight_key_map=weight_key_map,
+            prefix=f"{prefix}.experts",
         )
 
         self.num_shared_experts = fd_config.model_config.n_shared_experts

diff --git a/fastdeploy/model_executor/models/ernie4_5_moe.py b/fastdeploy/model_executor/models/ernie4_5_moe.py
@@ -217,6 +217,7 @@ def __init__(
             gate_correction_bias=None,
             redundant_table_manger=redundant_table_manger,
             weight_key_map=weight_key_map,
+            prefix=f"{prefix}.experts",
         )
 
         if fd_config.model_config.moe_use_aux_free: