Skip to content

Commit e150a41

Browse files
support moe offline quant (PaddlePaddle#5142)
1 parent 5ff93d4 commit e150a41

5 files changed

Lines changed: 12 additions & 3 deletions

File tree

fastdeploy/config.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -180,6 +180,7 @@ def __init__(
180180
):
181181
self.model = ""
182182
self.is_quantized = False
183+
self.is_moe_quantized = False
183184
self.max_model_len = 0
184185
self.dtype = "bfloat16"
185186
self.enable_logprob = False

fastdeploy/model_executor/layers/moe/moe.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -584,7 +584,7 @@ def load_state_dict(self, state_dict, is_rearrange: bool = False):
584584
"""
585585
load_state_dict function.
586586
"""
587-
if self.is_quantized:
587+
if self.is_quantized or self.fd_config.model_config.is_moe_quantized:
588588
if getattr(self.fd_config.quant_config, "is_permuted", True):
589589
self.quant_method.process_prequanted_weights(self, state_dict, is_rearrange)
590590
else:

fastdeploy/model_executor/layers/quantization/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -51,8 +51,14 @@ def parse_quant_config(args, model_config, is_ernie, is_v1_loader):
5151
if quantization_config is not None:
5252
if "is_quantized" in quantization_config:
5353
model_config.is_quantized = quantization_config["is_quantized"]
54+
elif "is_moe_quantized" in quantization_config:
55+
model_config.is_moe_quantized = quantization_config["is_moe_quantized"]
5456
elif "kv_cache_quant_type" not in quantization_config:
5557
model_config.is_quantized = True
58+
if "is_moe_quantized" not in quantization_config:
59+
model_config.is_quantized = True
60+
else:
61+
model_config.is_moe_quantized = True
5662
if quantization_config is not None and quantization_config.get("quantization", None) is None:
5763
raise ValueError(
5864
"quantization_config should have a key named 'quantization' for specify quant config."

fastdeploy/model_executor/models/ernie4_5_moe.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -138,7 +138,8 @@ def __init__(
138138
"down_proj_expert_code_zp_key": f"{prefix}.experts.{{}}.down_proj.code_zp",
139139
}
140140
elif moe_quant_type == "tensor_wise_fp8" or (
141-
moe_quant_type == "block_wise_fp8" and fd_config.model_config.is_quantized
141+
moe_quant_type == "block_wise_fp8"
142+
and (fd_config.model_config.is_quantized or fd_config.model_config.is_moe_quantized)
142143
):
143144
weight_key_map = {
144145
"gate_weight_key": f"{prefix}.gate.weight",

fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -105,7 +105,8 @@ def __init__(
105105
moe_quant_type = fd_config.quant_config.moe_quant_type
106106

107107
if moe_quant_type == "tensor_wise_fp8" or (
108-
moe_quant_type == "block_wise_fp8" and fd_config.model_config.is_quantized
108+
moe_quant_type == "block_wise_fp8"
109+
and (fd_config.model_config.is_quantized or fd_config.model_config.is_moe_quantized)
109110
):
110111
weight_key_map = {
111112
"gate_correction_bias_key": f"{prefix}.moe_statics.e_score_correction_bias",

0 commit comments

Comments
 (0)