Skip to content
Closed

Ref pr #7193

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ class MetaxCutlassWeightOnlyMoEMethod(MetaxCutlassMoEMethod):
def __init__(self, quant_config):
super().__init__(quant_config)
if quant_config is None:
self.quant_config = WeightOnlyConfig(algo="weight_only_int8", is_checkpoint_bf16=True)
self.quant_config = WeightOnlyConfig(algo="weight_only_int8")
else:
self.quant_config = quant_config
self.moe_quant_type = self.quant_config.algo
Expand Down Expand Up @@ -480,21 +480,18 @@ def _process_quantize(weight_idx):
getattr(layer, weight_name).copy_(weight.transpose([0, 2, 1]), False)
getattr(layer, scale_name).copy_(scale, False)

if self.quant_config.is_checkpoint_bf16:
weight_id_map = {"gate_up": 0, "down": 1}
if weight_fully_copied(layer.up_gate_proj_weight):
weight_type = "gate_up"
else:
weight_type = "down"

if self.model_format == "torch":
unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace(
"quant_weight", "weight"
)
process_weight_transpose(layer, unquantized_weight_name)
_process_quantize(weight_id_map[weight_type])
weight_id_map = {"gate_up": 0, "down": 1}
if weight_fully_copied(layer.up_gate_proj_weight):
weight_type = "gate_up"
else:
return
weight_type = "down"

if self.model_format == "torch":
unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace(
"quant_weight", "weight"
)
process_weight_transpose(layer, unquantized_weight_name)
_process_quantize(weight_id_map[weight_type])

def process_loaded_weights(self, layer: nn.Layer, state_dict):
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,7 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
layer.hidden_size,
]
# TODO(bukejiyu): remove v1 loader check when v0 loader is removed
is_checkpoint_bf16 = self.quant_config.is_checkpoint_bf16 if self.quant_config is not None else True
if is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1":
if layer.fd_config.load_config.load_choices == "default_v1":
layer.up_gate_proj_weight = layer.create_parameter(
shape=self.up_gate_proj_weight_shape,
dtype=layer.weight_dtype,
Expand Down Expand Up @@ -184,10 +183,6 @@ def process_loaded_weights(self, layer: nn.Layer, state_dict):
@paddle.no_grad()
def process_weights_after_loading(self, layer):
""" """
is_checkpoint_bf16 = self.quant_config.is_checkpoint_bf16 if self.quant_config is not None else True
if not is_checkpoint_bf16:
return

if self.quant_config is not None:
algo = layer.quant_method.quant_config.name()
assert algo == "wint8"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -616,8 +616,6 @@ def process_loaded_weights(self, layer: nn.Layer, state_dict):

def process_weights_after_loading(self, layer):
""" """
if not self.quant_config.is_checkpoint_bf16:
return
weight_id_map = {"gate_up": 0, "down": 1}
if (
hasattr(layer.up_gate_proj_weight, "tensor_track")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,8 +118,6 @@ def process_loaded_weights(self, layer: nn.Layer, weight: paddle.Tensor) -> None
layer.weight_scale.set_value(weight_scale)

def process_weights_after_loading(self, layer) -> None:
if not self.quant_config.is_checkpoint_bf16:
return

quanted_weight, weight_scale = self._quantize_weight_in_blocks(layer.weight)
free_tensor(layer.weight)
Expand Down
27 changes: 12 additions & 15 deletions fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -1468,7 +1468,7 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
self.down_proj_scale_shape = [layer.num_local_experts, layer.hidden_size]
self.model_format = extra_weight_attrs.get("model_format")
# TODO(bukejiyu): remove v1 loader check when v0 loader is removed
if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1":
if layer.fd_config.load_config.load_choices == "default_v1":
if self.model_format != "torch":
up_gate_proj_weight_shape = [
layer.num_local_experts,
Expand Down Expand Up @@ -1649,21 +1649,18 @@ def _process_quantize(weight_idx):
getattr(layer, weight_name).copy_(weight, False)
getattr(layer, scale_name).copy_(scale, False)

if self.quant_config.is_checkpoint_bf16:
weight_id_map = {"gate_up": 0, "down": 1}
if weight_fully_copied(layer.up_gate_proj_weight):
weight_type = "gate_up"
else:
weight_type = "down"

if self.model_format == "torch":
unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace(
"quant_weight", "weight"
)
process_weight_transpose(layer, unquantized_weight_name)
_process_quantize(weight_id_map[weight_type])
weight_id_map = {"gate_up": 0, "down": 1}
if weight_fully_copied(layer.up_gate_proj_weight):
weight_type = "gate_up"
else:
return
weight_type = "down"

if self.model_format == "torch":
unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace(
"quant_weight", "weight"
)
process_weight_transpose(layer, unquantized_weight_name)
_process_quantize(weight_id_map[weight_type])

def process_loaded_weights(self, layer: nn.Layer, state_dict):
"""
Expand Down
26 changes: 11 additions & 15 deletions fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
]
self.model_format = extra_weight_attrs.get("model_format")
# TODO(bukejiyu): remove v1 loader check when v0 loader is removed
if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1":
if layer.fd_config.load_config.load_choices == "default_v1":
if self.model_format != "torch":
up_gate_proj_weight_shape = [
layer.num_local_experts,
Expand Down Expand Up @@ -268,21 +268,17 @@ def _process_quantize(weight_idx):
getattr(layer, weight_name).copy_(quanted_weight, False)
getattr(layer, scale_name).copy_(quanted_weight_scale, False)

if self.quant_config.is_checkpoint_bf16:
weight_id_map = {"gate_up": 0, "down": 1}
if weight_fully_copied(layer.up_gate_proj_weight):
weight_type = "gate_up"
else:
weight_type = "down"
if self.model_format == "torch":
unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace(
"quant_weight", "weight"
)
process_weight_transpose(layer, unquantized_weight_name)
_process_quantize(weight_id_map[weight_type])

weight_id_map = {"gate_up": 0, "down": 1}
if weight_fully_copied(layer.up_gate_proj_weight):
weight_type = "gate_up"
else:
return
weight_type = "down"
if self.model_format == "torch":
unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace(
"quant_weight", "weight"
)
process_weight_transpose(layer, unquantized_weight_name)
_process_quantize(weight_id_map[weight_type])

def apply(
self,
Expand Down
7 changes: 4 additions & 3 deletions fastdeploy/model_executor/layers/moe/moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
from fastdeploy.model_executor.layers.moe.routing_indices_cache import (
save_routing_to_buffer,
)
from fastdeploy.model_executor.layers.utils import get_tensor
from fastdeploy.model_executor.layers.utils import get_tensor, modules_to_convert
from fastdeploy.model_executor.utils import h2d_copy, slice_fn
from fastdeploy.platforms import current_platform
from fastdeploy.worker.experts_manager import RedundantExpertManger
Expand Down Expand Up @@ -152,6 +152,7 @@ def __init__(
with_bias: bool = False,
activation="swiglu",
model_format: Optional[str] = None,
prefix: str = "",
):
"""
Initialize the Moe layer with given parameters.
Expand All @@ -175,7 +176,7 @@ def __init__(
if self.ep_size > 1:
self.tp_size = 1
self.tp_rank = 0

self.prefix = prefix
self.attn_tp_size = fd_config.parallel_config.tensor_parallel_size
self.attn_tp_rank = fd_config.parallel_config.tensor_parallel_rank

Expand Down Expand Up @@ -226,7 +227,7 @@ def __init__(
moe_quant_config = fd_config.quant_config
self.moe_quant_config = moe_quant_config
self.moe_quant_type = None
if moe_quant_config and moe_quant_config.get_quant_method(self):
if moe_quant_config and moe_quant_config.get_quant_method(self) and modules_to_convert(prefix, self.fd_config):
self.quant_method = moe_quant_config.get_quant_method(self)
self.moe_quant_type = moe_quant_config.name()
else:
Expand Down
8 changes: 6 additions & 2 deletions fastdeploy/model_executor/layers/quantization/mix_quant.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

from fastdeploy.model_executor.layers.attention.attention import Attention
from fastdeploy.model_executor.layers.moe.moe import FusedMoE
from fastdeploy.model_executor.utils import get_special_quant_config

from . import get_quantization_config
from .quant_base import QuantConfigBase, QuantMethodBase
Expand All @@ -41,6 +42,7 @@ def __init__(
hadamard_block_size: int = 128,
moe_dynamic_quant: bool = False,
is_moe_quantized: bool = False,
modules_to_quant: dict = {},
) -> None:
super().__init__()
self.dense_quant_type = dense_quant_type
Expand All @@ -61,6 +63,7 @@ def __init__(
self.hadamard_block_size = hadamard_block_size
self.moe_dynamic_quant = moe_dynamic_quant
self.is_moe_quantized = is_moe_quantized
self.modules_to_quant = modules_to_quant

def name(self) -> str:
return "mix_quant"
Expand All @@ -79,14 +82,15 @@ def from_config(cls, config: dict) -> "MixQuantConfig":
config.get("hadamard_block_size", 128),
config.get("moe_dynamic_quant", False),
config.get("is_moe_quantized", False),
config.get("modules_to_quant", {}),
)

def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
if isinstance(layer, FusedMoE):
if layer.moe_tag == "Image":
if self.image_moe_quant_type is not None:
return (
get_quantization_config(self.image_moe_quant_type)
get_special_quant_config(layer, self.modules_to_quant, self.image_moe_quant_type)
.from_config(
{
"is_permuted": self.is_permuted,
Expand All @@ -101,7 +105,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
else:
if self.moe_quant_type is not None:
return (
get_quantization_config(self.moe_quant_type)
get_special_quant_config(layer, self.modules_to_quant, self.moe_quant_type)
.from_config(
{
"is_permuted": self.is_permuted,
Expand Down
37 changes: 18 additions & 19 deletions fastdeploy/model_executor/layers/quantization/weight_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@ class WeightOnlyConfig(QuantConfigBase):
def __init__(
self,
algo: str,
is_checkpoint_bf16: bool = False,
) -> None:
super().__init__()
self.algo = algo
Expand All @@ -82,7 +81,7 @@ def __init__(
self.quant_max_bound = 0
self.quant_min_bound = 0
self.quant_round_type = 0
self.is_checkpoint_bf16 = is_checkpoint_bf16
self.is_checkpoint_bf16 = True # weight-only linear supports dynamic quantization only
self.group_size = -1

def name(self) -> str:
Expand All @@ -91,11 +90,12 @@ def name(self) -> str:
@classmethod
def from_config(cls, config: dict) -> "WeightOnlyConfig":
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 建议 from_config 删除了 is_quantized 配置的处理,可能影响已有配置的兼容性。

建议保留对 is_quantized 配置的兼容处理(即使不使用),或添加弃用警告:

@classmethod
def from_config(cls, config: dict) -> "WeightOnlyConfig":
    algo = config["algo"]
    # 保留兼容性,但不使用 is_quantized 配置
    if "is_quantized" in config:
        logger.warning(
            "'is_quantized' config in WeightOnlyConfig is deprecated and ignored. "
            "WeightOnlyConfig always uses dynamic quantization."
        )
    return cls(algo)

algo = config["algo"]
is_checkpoint_bf16 = not config.get("is_quantized", False)
return cls(algo, is_checkpoint_bf16)
return cls(algo)

def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
# Select the quantization method based on the platform type and the layer type
if current_platform.is_xpu():
# XPU platform: distinguish MoE layers from regular Linear layers
if isinstance(layer, FusedMoE):
from fastdeploy.model_executor.layers.backends import (
XPUWeightOnlyMoEMethod,
Expand All @@ -109,6 +109,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]:

return XPUWeightOnlyLinearMethod(self)
elif current_platform.is_gcu():
# GCU platform: distinguish MoE layers from regular Linear layers
from fastdeploy.model_executor.layers.backends import (
GCUWeightOnlyLinearMethod,
GCUWeightOnlyMoEMethod,
Expand All @@ -119,6 +120,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
else:
return GCUWeightOnlyLinearMethod(self)
elif current_platform.is_dcu():
# DCU platform: distinguish MoE layers from regular Linear layers
if isinstance(layer, FusedMoE):
from fastdeploy.model_executor.layers.backends import (
DCUTritonWeightOnlyMoEMethod,
Expand All @@ -132,6 +134,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]:

return DCUWeightOnlyLinearMethod(self)
elif current_platform.is_maca():
# MACA platform: MoE layers support two backends, cutlass and triton
if isinstance(layer, FusedMoE):
from fastdeploy.model_executor.layers.backends import (
MetaxCutlassWeightOnlyMoEMethod,
Expand Down Expand Up @@ -166,6 +169,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]:

return IluvatarWeightOnlyLinearMethod(self)
else:
# Default GPU platform: MoE layers support three backends: cutlass, triton, and marlin
if isinstance(layer, FusedMoE):
if layer.use_method == "cutlass":
from fastdeploy.model_executor.layers.moe.fused_moe_cutlass_backend import (
Expand All @@ -188,6 +192,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
else:
raise ValueError(f"Unsupported MOE backend {layer.use_method}")
else:
# Regular Linear layers: use the Machete optimized kernel when the conditions are met, otherwise the default GPU method
if (
_ENABLE_MACHETE
and envs.FD_USE_MACHETE == "1"
Expand All @@ -206,13 +211,12 @@ class WINT8Config(WeightOnlyConfig):
weight only int8 config
"""

def __init__(self, is_checkpoint_bf16: bool = False) -> None:
super().__init__("weight_only_int8", is_checkpoint_bf16)
def __init__(self) -> None:
super().__init__("weight_only_int8")

@classmethod
def from_config(cls, config: dict) -> "WINT8Config":
is_checkpoint_bf16 = not config.get("is_quantized", False)
return cls(is_checkpoint_bf16)
return cls()

def name(self) -> str:
return "wint8"
Expand All @@ -225,14 +229,12 @@ class WINT4Config(WeightOnlyConfig):

def __init__(
self,
is_checkpoint_bf16: bool = False,
) -> None:
super().__init__("weight_only_int4", is_checkpoint_bf16)
super().__init__("weight_only_int4")

@classmethod
def from_config(cls, config: dict) -> "WINT4Config":
is_checkpoint_bf16 = not config.get("is_quantized", False)
return cls(is_checkpoint_bf16)
return cls()

def name(self) -> str:
return "wint4"
Expand All @@ -253,7 +255,7 @@ def __init__(
def create_weights(self, layer, **extra_weight_attrs):
# TODO(bukejiyu): remove v1 loader check when v0 loader is removed
self.model_format = extra_weight_attrs.get("model_format")
if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1":
if layer.fd_config.load_config.load_choices == "default_v1":
weight_shape = layer.weight_shape[::-1] if self.model_format == "torch" else layer.weight_shape
layer.weight = layer.create_parameter(
shape=weight_shape,
Expand Down Expand Up @@ -363,12 +365,9 @@ def _process_quantize():
layer.weight.copy_(quanted_weight_tensor, False)
layer.weight_scale.copy_(weight_scale_tensor, False)

if self.quant_config.is_checkpoint_bf16:
if self.model_format == "torch":
process_weight_transpose(layer, "weight")
_process_quantize()
else:
return
if self.model_format == "torch":
process_weight_transpose(layer, "weight")
_process_quantize()

@abstractmethod
def process_loaded_weights(self, layer, weights) -> None:
Expand Down
1 change: 1 addition & 0 deletions fastdeploy/model_executor/models/deepseek_v3.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ def __init__(self, fd_config: FDConfig, layer_id: int, prefix: str) -> None:
layer_idx=layer_id,
gate_correction_bias=self.gate.e_score_correction_bias,
weight_key_map=weight_key_map,
prefix=f"{prefix}.experts",
)

self.num_shared_experts = fd_config.model_config.n_shared_experts
Expand Down
1 change: 1 addition & 0 deletions fastdeploy/model_executor/models/ernie4_5_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,7 @@ def __init__(
gate_correction_bias=None,
redundant_table_manger=redundant_table_manger,
weight_key_map=weight_key_map,
prefix=f"{prefix}.experts",
)

if fd_config.model_config.moe_use_aux_free:
Expand Down
Loading
Loading