Skip to content

Commit 96cd7e9

Browse files
bukejiyuzccjjj
authored and committed
add skip_layer_mixed_quantization
1 parent 1090f8b commit 96cd7e9

File tree

21 files changed

+361
-85
lines changed

21 files changed

+361
-85
lines changed

fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_cutlass_metax_backend.py

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@ class MetaxCutlassWeightOnlyMoEMethod(MetaxCutlassMoEMethod):
240240
def __init__(self, quant_config):
241241
super().__init__(quant_config)
242242
if quant_config is None:
243-
self.quant_config = WeightOnlyConfig(algo="weight_only_int8", is_checkpoint_bf16=True)
243+
self.quant_config = WeightOnlyConfig(algo="weight_only_int8")
244244
else:
245245
self.quant_config = quant_config
246246
self.moe_quant_type = self.quant_config.algo
@@ -480,21 +480,18 @@ def _process_quantize(weight_idx):
480480
getattr(layer, weight_name).copy_(weight.transpose([0, 2, 1]), False)
481481
getattr(layer, scale_name).copy_(scale, False)
482482

483-
if self.quant_config.is_checkpoint_bf16:
484-
weight_id_map = {"gate_up": 0, "down": 1}
485-
if weight_fully_copied(layer.up_gate_proj_weight):
486-
weight_type = "gate_up"
487-
else:
488-
weight_type = "down"
489-
490-
if self.model_format == "torch":
491-
unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace(
492-
"quant_weight", "weight"
493-
)
494-
process_weight_transpose(layer, unquantized_weight_name)
495-
_process_quantize(weight_id_map[weight_type])
483+
weight_id_map = {"gate_up": 0, "down": 1}
484+
if weight_fully_copied(layer.up_gate_proj_weight):
485+
weight_type = "gate_up"
496486
else:
497-
return
487+
weight_type = "down"
488+
489+
if self.model_format == "torch":
490+
unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace(
491+
"quant_weight", "weight"
492+
)
493+
process_weight_transpose(layer, unquantized_weight_name)
494+
_process_quantize(weight_id_map[weight_type])
498495

499496
def process_loaded_weights(self, layer: nn.Layer, state_dict):
500497
"""

fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_triton_metax_backend.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,7 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
6969
layer.hidden_size,
7070
]
7171
# TODO(bukejiyu): remove v1 loader check when v0 loader is removed
72-
is_checkpoint_bf16 = self.quant_config.is_checkpoint_bf16 if self.quant_config is not None else True
73-
if is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1":
72+
if layer.fd_config.load_config.load_choices == "default_v1":
7473
layer.up_gate_proj_weight = layer.create_parameter(
7574
shape=self.up_gate_proj_weight_shape,
7675
dtype=layer.weight_dtype,
@@ -184,10 +183,6 @@ def process_loaded_weights(self, layer: nn.Layer, state_dict):
184183
@paddle.no_grad()
185184
def process_weights_after_loading(self, layer):
186185
""" """
187-
is_checkpoint_bf16 = self.quant_config.is_checkpoint_bf16 if self.quant_config is not None else True
188-
if not is_checkpoint_bf16:
189-
return
190-
191186
if self.quant_config is not None:
192187
algo = layer.quant_method.quant_config.name()
193188
assert algo == "wint8"

fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -616,8 +616,6 @@ def process_loaded_weights(self, layer: nn.Layer, state_dict):
616616

617617
def process_weights_after_loading(self, layer):
618618
""" """
619-
if not self.quant_config.is_checkpoint_bf16:
620-
return
621619
weight_id_map = {"gate_up": 0, "down": 1}
622620
if (
623621
hasattr(layer.up_gate_proj_weight, "tensor_track")

fastdeploy/model_executor/layers/backends/xpu/quantization/weight_only.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,8 +118,6 @@ def process_loaded_weights(self, layer: nn.Layer, weight: paddle.Tensor) -> None
118118
layer.weight_scale.set_value(weight_scale)
119119

120120
def process_weights_after_loading(self, layer) -> None:
121-
if not self.quant_config.is_checkpoint_bf16:
122-
return
123121

124122
quanted_weight, weight_scale = self._quantize_weight_in_blocks(layer.weight)
125123
free_tensor(layer.weight)

fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1468,7 +1468,7 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
14681468
self.down_proj_scale_shape = [layer.num_local_experts, layer.hidden_size]
14691469
self.model_format = extra_weight_attrs.get("model_format")
14701470
# TODO(bukejiyu): remove v1 loader check when v0 loader is removed
1471-
if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1":
1471+
if layer.fd_config.load_config.load_choices == "default_v1":
14721472
if self.model_format != "torch":
14731473
up_gate_proj_weight_shape = [
14741474
layer.num_local_experts,
@@ -1649,21 +1649,18 @@ def _process_quantize(weight_idx):
16491649
getattr(layer, weight_name).copy_(weight, False)
16501650
getattr(layer, scale_name).copy_(scale, False)
16511651

1652-
if self.quant_config.is_checkpoint_bf16:
1653-
weight_id_map = {"gate_up": 0, "down": 1}
1654-
if weight_fully_copied(layer.up_gate_proj_weight):
1655-
weight_type = "gate_up"
1656-
else:
1657-
weight_type = "down"
1658-
1659-
if self.model_format == "torch":
1660-
unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace(
1661-
"quant_weight", "weight"
1662-
)
1663-
process_weight_transpose(layer, unquantized_weight_name)
1664-
_process_quantize(weight_id_map[weight_type])
1652+
weight_id_map = {"gate_up": 0, "down": 1}
1653+
if weight_fully_copied(layer.up_gate_proj_weight):
1654+
weight_type = "gate_up"
16651655
else:
1666-
return
1656+
weight_type = "down"
1657+
1658+
if self.model_format == "torch":
1659+
unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace(
1660+
"quant_weight", "weight"
1661+
)
1662+
process_weight_transpose(layer, unquantized_weight_name)
1663+
_process_quantize(weight_id_map[weight_type])
16671664

16681665
def process_loaded_weights(self, layer: nn.Layer, state_dict):
16691666
"""

fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py

Lines changed: 11 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
8484
]
8585
self.model_format = extra_weight_attrs.get("model_format")
8686
# TODO(bukejiyu): remove v1 loader check when v0 loader is removed
87-
if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1":
87+
if layer.fd_config.load_config.load_choices == "default_v1":
8888
if self.model_format != "torch":
8989
up_gate_proj_weight_shape = [
9090
layer.num_local_experts,
@@ -268,21 +268,17 @@ def _process_quantize(weight_idx):
268268
getattr(layer, weight_name).copy_(quanted_weight, False)
269269
getattr(layer, scale_name).copy_(quanted_weight_scale, False)
270270

271-
if self.quant_config.is_checkpoint_bf16:
272-
weight_id_map = {"gate_up": 0, "down": 1}
273-
if weight_fully_copied(layer.up_gate_proj_weight):
274-
weight_type = "gate_up"
275-
else:
276-
weight_type = "down"
277-
if self.model_format == "torch":
278-
unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace(
279-
"quant_weight", "weight"
280-
)
281-
process_weight_transpose(layer, unquantized_weight_name)
282-
_process_quantize(weight_id_map[weight_type])
283-
271+
weight_id_map = {"gate_up": 0, "down": 1}
272+
if weight_fully_copied(layer.up_gate_proj_weight):
273+
weight_type = "gate_up"
284274
else:
285-
return
275+
weight_type = "down"
276+
if self.model_format == "torch":
277+
unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace(
278+
"quant_weight", "weight"
279+
)
280+
process_weight_transpose(layer, unquantized_weight_name)
281+
_process_quantize(weight_id_map[weight_type])
286282

287283
def apply(
288284
self,

fastdeploy/model_executor/layers/moe/moe.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
from fastdeploy.model_executor.layers.moe.routing_indices_cache import (
3131
save_routing_to_buffer,
3232
)
33-
from fastdeploy.model_executor.layers.utils import get_tensor
33+
from fastdeploy.model_executor.layers.utils import get_tensor, modules_to_convert
3434
from fastdeploy.model_executor.utils import h2d_copy, slice_fn
3535
from fastdeploy.platforms import current_platform
3636
from fastdeploy.worker.experts_manager import RedundantExpertManger
@@ -152,6 +152,7 @@ def __init__(
152152
with_bias: bool = False,
153153
activation="swiglu",
154154
model_format: Optional[str] = None,
155+
prefix: str = "",
155156
):
156157
"""
157158
Initialize the Moe layer with given parameters.
@@ -175,7 +176,7 @@ def __init__(
175176
if self.ep_size > 1:
176177
self.tp_size = 1
177178
self.tp_rank = 0
178-
179+
self.prefix = prefix
179180
self.attn_tp_size = fd_config.parallel_config.tensor_parallel_size
180181
self.attn_tp_rank = fd_config.parallel_config.tensor_parallel_rank
181182

@@ -226,7 +227,7 @@ def __init__(
226227
moe_quant_config = fd_config.quant_config
227228
self.moe_quant_config = moe_quant_config
228229
self.moe_quant_type = None
229-
if moe_quant_config and moe_quant_config.get_quant_method(self):
230+
if moe_quant_config and moe_quant_config.get_quant_method(self) and modules_to_convert(prefix, self.fd_config):
230231
self.quant_method = moe_quant_config.get_quant_method(self)
231232
self.moe_quant_type = moe_quant_config.name()
232233
else:

fastdeploy/model_executor/layers/quantization/mix_quant.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
from fastdeploy.model_executor.layers.attention.attention import Attention
2020
from fastdeploy.model_executor.layers.moe.moe import FusedMoE
21+
from fastdeploy.model_executor.utils import get_special_quant_config
2122

2223
from . import get_quantization_config
2324
from .quant_base import QuantConfigBase, QuantMethodBase
@@ -41,6 +42,7 @@ def __init__(
4142
hadamard_block_size: int = 128,
4243
moe_dynamic_quant: bool = False,
4344
is_moe_quantized: bool = False,
45+
modules_to_quant: dict = {},
4446
) -> None:
4547
super().__init__()
4648
self.dense_quant_type = dense_quant_type
@@ -61,6 +63,7 @@ def __init__(
6163
self.hadamard_block_size = hadamard_block_size
6264
self.moe_dynamic_quant = moe_dynamic_quant
6365
self.is_moe_quantized = is_moe_quantized
66+
self.modules_to_quant = modules_to_quant
6467

6568
def name(self) -> str:
6669
return "mix_quant"
@@ -79,14 +82,15 @@ def from_config(cls, config: dict) -> "MixQuantConfig":
7982
config.get("hadamard_block_size", 128),
8083
config.get("moe_dynamic_quant", False),
8184
config.get("is_moe_quantized", False),
85+
config.get("modules_to_quant", {}),
8286
)
8387

8488
def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
8589
if isinstance(layer, FusedMoE):
8690
if layer.moe_tag == "Image":
8791
if self.image_moe_quant_type is not None:
8892
return (
89-
get_quantization_config(self.image_moe_quant_type)
93+
get_special_quant_config(layer, self.modules_to_quant, self.image_moe_quant_type)
9094
.from_config(
9195
{
9296
"is_permuted": self.is_permuted,
@@ -101,7 +105,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
101105
else:
102106
if self.moe_quant_type is not None:
103107
return (
104-
get_quantization_config(self.moe_quant_type)
108+
get_special_quant_config(layer, self.modules_to_quant, self.moe_quant_type)
105109
.from_config(
106110
{
107111
"is_permuted": self.is_permuted,

fastdeploy/model_executor/layers/quantization/weight_only.py

Lines changed: 18 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,6 @@ class WeightOnlyConfig(QuantConfigBase):
7070
def __init__(
7171
self,
7272
algo: str,
73-
is_checkpoint_bf16: bool = False,
7473
) -> None:
7574
super().__init__()
7675
self.algo = algo
@@ -82,7 +81,7 @@ def __init__(
8281
self.quant_max_bound = 0
8382
self.quant_min_bound = 0
8483
self.quant_round_type = 0
85-
self.is_checkpoint_bf16 = is_checkpoint_bf16
84+
self.is_checkpoint_bf16 = True # weight only linear support dynamic quantization only
8685
self.group_size = -1
8786

8887
def name(self) -> str:
@@ -91,11 +90,12 @@ def name(self) -> str:
9190
@classmethod
9291
def from_config(cls, config: dict) -> "WeightOnlyConfig":
9392
algo = config["algo"]
94-
is_checkpoint_bf16 = not config.get("is_quantized", False)
95-
return cls(algo, is_checkpoint_bf16)
93+
return cls(algo)
9694

9795
def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
96+
# 根据平台类型和层类型选择对应的量化方法
9897
if current_platform.is_xpu():
98+
# XPU平台:区分MoE层和普通Linear层
9999
if isinstance(layer, FusedMoE):
100100
from fastdeploy.model_executor.layers.backends import (
101101
XPUWeightOnlyMoEMethod,
@@ -109,6 +109,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
109109

110110
return XPUWeightOnlyLinearMethod(self)
111111
elif current_platform.is_gcu():
112+
# GCU平台:区分MoE层和普通Linear层
112113
from fastdeploy.model_executor.layers.backends import (
113114
GCUWeightOnlyLinearMethod,
114115
GCUWeightOnlyMoEMethod,
@@ -119,6 +120,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
119120
else:
120121
return GCUWeightOnlyLinearMethod(self)
121122
elif current_platform.is_dcu():
123+
# DCU平台:区分MoE层和普通Linear层
122124
if isinstance(layer, FusedMoE):
123125
from fastdeploy.model_executor.layers.backends import (
124126
DCUTritonWeightOnlyMoEMethod,
@@ -132,6 +134,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
132134

133135
return DCUWeightOnlyLinearMethod(self)
134136
elif current_platform.is_maca():
137+
# MACA平台:MoE层支持cutlass和triton两种后端
135138
if isinstance(layer, FusedMoE):
136139
from fastdeploy.model_executor.layers.backends import (
137140
MetaxCutlassWeightOnlyMoEMethod,
@@ -166,6 +169,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
166169

167170
return IluvatarWeightOnlyLinearMethod(self)
168171
else:
172+
# GPU默认平台:MoE层支持cutlass/triton/marlin三种后端
169173
if isinstance(layer, FusedMoE):
170174
if layer.use_method == "cutlass":
171175
from fastdeploy.model_executor.layers.moe.fused_moe_cutlass_backend import (
@@ -188,6 +192,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
188192
else:
189193
raise ValueError(f"Unsupported MOE backend {layer.use_method}")
190194
else:
195+
# 普通Linear层:满足条件时使用Machete优化内核,否则使用默认GPU方法
191196
if (
192197
_ENABLE_MACHETE
193198
and envs.FD_USE_MACHETE == "1"
@@ -206,13 +211,12 @@ class WINT8Config(WeightOnlyConfig):
206211
weight only int8 config
207212
"""
208213

209-
def __init__(self, is_checkpoint_bf16: bool = False) -> None:
210-
super().__init__("weight_only_int8", is_checkpoint_bf16)
214+
def __init__(self) -> None:
215+
super().__init__("weight_only_int8")
211216

212217
@classmethod
213218
def from_config(cls, config: dict) -> "WINT8Config":
214-
is_checkpoint_bf16 = not config.get("is_quantized", False)
215-
return cls(is_checkpoint_bf16)
219+
return cls()
216220

217221
def name(self) -> str:
218222
return "wint8"
@@ -225,14 +229,12 @@ class WINT4Config(WeightOnlyConfig):
225229

226230
def __init__(
227231
self,
228-
is_checkpoint_bf16: bool = False,
229232
) -> None:
230-
super().__init__("weight_only_int4", is_checkpoint_bf16)
233+
super().__init__("weight_only_int4")
231234

232235
@classmethod
233236
def from_config(cls, config: dict) -> "WINT4Config":
234-
is_checkpoint_bf16 = not config.get("is_quantized", False)
235-
return cls(is_checkpoint_bf16)
237+
return cls()
236238

237239
def name(self) -> str:
238240
return "wint4"
@@ -253,7 +255,7 @@ def __init__(
253255
def create_weights(self, layer, **extra_weight_attrs):
254256
# TODO(bukejiyu): remove v1 loader check when v0 loader is removed
255257
self.model_format = extra_weight_attrs.get("model_format")
256-
if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1":
258+
if layer.fd_config.load_config.load_choices == "default_v1":
257259
weight_shape = layer.weight_shape[::-1] if self.model_format == "torch" else layer.weight_shape
258260
layer.weight = layer.create_parameter(
259261
shape=weight_shape,
@@ -363,12 +365,9 @@ def _process_quantize():
363365
layer.weight.copy_(quanted_weight_tensor, False)
364366
layer.weight_scale.copy_(weight_scale_tensor, False)
365367

366-
if self.quant_config.is_checkpoint_bf16:
367-
if self.model_format == "torch":
368-
process_weight_transpose(layer, "weight")
369-
_process_quantize()
370-
else:
371-
return
368+
if self.model_format == "torch":
369+
process_weight_transpose(layer, "weight")
370+
_process_quantize()
372371

373372
@abstractmethod
374373
def process_loaded_weights(self, layer, weights) -> None:

fastdeploy/model_executor/models/deepseek_v3.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,7 @@ def __init__(self, fd_config: FDConfig, layer_id: int, prefix: str) -> None:
178178
layer_idx=layer_id,
179179
gate_correction_bias=self.gate.e_score_correction_bias,
180180
weight_key_map=weight_key_map,
181+
prefix=f"{prefix}.experts",
181182
)
182183

183184
self.num_shared_experts = fd_config.model_config.n_shared_experts

0 commit comments

Comments (0)