Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions angelslim/compressor/quant/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ def __init__(self, config, global_config=None):
self.quant_helpers = quantization_args.quant_helpers
act_quant_method = quantization_args.quant_method.get("activation", None)
weight_quant_method = quantization_args.quant_method["weight"]
self.cpu_convert = quantization_args.cpu_convert
self.save_name = quantization_args.save_name

if global_config:
self.max_seq_length = global_config.max_seq_length
self.hidden_size = global_config.hidden_size
Expand Down
53 changes: 31 additions & 22 deletions angelslim/compressor/quant/core/save.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,10 +112,13 @@ def __init__(self, quant_model):
super().__init__(quant_model=quant_model)

def save(self, save_path):
deploy_backend = self.quant_model.deploy_backend
ignore_field = "ignored_layers" if deploy_backend == "vllm" else "ignore"
save_name = self.quant_model.quant_config.save_name
ignore_field = (
"ignore" if save_name == "compressed-tensors" else "ignored_layers"
)
w_quant_algo = self.quant_model.quant_config.quant_algo_info["w"]
a_quant_algo = self.quant_model.quant_config.quant_algo_info["a"]
is_dynamic = "dynamic" in a_quant_algo
ignored_layers = self.quant_model.skip_layer_names()
trtllm_config = {
"quantization": {
Expand All @@ -130,7 +133,7 @@ def save(self, save_path):
act_config = {
"num_bits": 8,
"strategy": re.search(r"per-([a-zA-Z]+)", a_quant_algo).group(1),
"dynamic": "dynamic" in a_quant_algo,
"dynamic": is_dynamic,
"type": "float",
}
weight_config = {
Expand All @@ -145,7 +148,7 @@ def save(self, save_path):
act_config = {
"num_bits": 8,
"strategy": re.search(r"per-([a-zA-Z]+)", a_quant_algo).group(1),
"dynamic": "dynamic" in a_quant_algo,
"dynamic": is_dynamic,
"type": "int",
}
weight_config = {
Expand All @@ -162,7 +165,7 @@ def save(self, save_path):
act_config = {
"num_bits": 4,
"group_size": group_size,
"dynamic": "dynamic" in a_quant_algo,
"dynamic": is_dynamic,
"type": "float",
}
weight_config = {
Expand All @@ -176,23 +179,29 @@ def save(self, save_path):
f"{self.quant_model.quant_config.quant_algo} not supported"
)

quant_dict = {
"quantization_config": {
"config_groups": {
"group_0": {
"weights": weight_config,
"input_activations": act_config,
"output_activations": None,
"targets": ["Linear"],
}
},
"kv_cache_scheme": None,
"format": quant_format,
ignore_field: ignored_layers,
"quantization_status": "compressed",
"quant_method": "compressed-tensors",
}
}
quantization_config = {"quant_method": save_name, ignore_field: ignored_layers}
if save_name == "compressed-tensors":
quantization_config.update(
{
"config_groups": {
"group_0": {
"weights": weight_config,
"input_activations": act_config,
"output_activations": None,
"targets": ["Linear"],
}
},
"kv_cache_scheme": None,
"format": quant_format,
"quantization_status": "compressed",
}
)
else:
quantization_config["activation_scheme"] = (
"dynamic" if is_dynamic else "static"
)

quant_dict = {"quantization_config": quantization_config}
self.quant_model.get_model().config.update(quant_dict)
print_info("Save quantization_config: {}".format(quant_dict))

Expand Down
1 change: 1 addition & 0 deletions angelslim/compressor/quant/modules/helper_layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -575,6 +575,7 @@ def __init__(
):
super().__init__()
self.quant_algo = quant_algo
weight_scale = weight_scale.to(weight.device)
if "fp8" in quant_algo:
if "w4a8" in self.quant_algo:
max_value_group_wise = weight_scale.clone()
Expand Down
37 changes: 37 additions & 0 deletions angelslim/compressor/quant/ptq.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os

import torch
from safetensors.torch import load_file

from ...utils import find_parent_layer_and_sub_name, print_info
from ..compressor_factory import CompressorFactory
Expand All @@ -35,6 +39,7 @@ def __init__(self, model, slim_config=None):
self.quant_model = model
# init ptq config of model
self.quant_model.init_ptq(slim_config)
self.model_path = slim_config.get("model_path")
self.quant_algo = self.quant_model.quant_config.quant_algo
self.quant_helpers = self.quant_model.quant_config.quant_helpers
if (
Expand Down Expand Up @@ -206,6 +211,35 @@ def _convert(self):
)
is not None
):
if sub_layer.weight.device.type == "meta":
with open(
os.path.join(self.model_path, "model.safetensors.index.json"),
"r",
) as f:
model_index = json.load(f)
orign_w_file = os.path.join(
self.model_path, model_index["weight_map"][name + ".weight"]
)
orign_w = load_file(orign_w_file, device="cpu")
print_info(f"Load meta weight {name} from file {orign_w_file}")
sub_layer.to_empty(device="cpu")
sub_layer.weight.data = orign_w[name + ".weight"]

if hasattr(sub_layer, "bias"):
if (name + ".bias") in model_index["weight_map"]:
orign_b_file = os.path.join(
self.model_path,
model_index["weight_map"][name + ".bias"],
)
orign_b = load_file(orign_b_file, device="cpu")
print_info(
f"Load meta bias {name} from file {orign_b_file}"
)
sub_layer.bias.data = orign_b[name + ".bias"]
else:
print_info(f"{name + '.bias'} not found. Set bias to None.")
sub_layer.bias = None

weight_scales = self.quant_model.get_weight_scales(
sub_layer, self.ptq_hook.observer_dict[sub_layer].weight_observer
)
Expand All @@ -225,6 +259,9 @@ def _convert(self):
quant_convert_module, name
)

if self.quant_model.quant_config.cpu_convert:
sub_layer = sub_layer.to("cpu")
print_info(f"Convert layer {name} on cpu")
if "nvfp4" in self.quant_algo:
self.nvfp4.post_process(sub_layer, name)
qdq_module = self.quant_model.get_nvfp4_qdq_module(sub_layer, name)
Expand Down
6 changes: 5 additions & 1 deletion angelslim/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ def prepare_compressor(
slim_config = {
"global_config": global_config,
"compress_config": compress_config,
"model_path": self.model_path,
}
self.compress_type = compress_names
self.only_inference = (
Expand Down Expand Up @@ -271,7 +272,10 @@ def save(
}
config_dict["model_config"]["model_path"] = "Base Model Path"
config_dict["global_config"]["save_path"] = "Save Model Path"
config_dict["dataset_config"]["data_path"] = "Data Path"
if "dataset_config" in config_dict and isinstance(
config_dict["dataset_config"], dict
):
config_dict["dataset_config"]["data_path"] = "Data Path"
with open(os.path.join(save_path, "angelslim_config.json"), "w") as f:
json.dump(config_dict, f, indent=4)

Expand Down
1 change: 1 addition & 0 deletions angelslim/models/llm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

from .deepseek import DeepSeek # noqa: F401
from .glm import GLM # noqa: F401
from .hunyuan_dense import HunyuanDense # noqa: F401
from .hunyuan_moe import HunyuanMoE # noqa: F401
from .kimi_k2 import KimiK2 # noqa: F401
Expand Down
136 changes: 136 additions & 0 deletions angelslim/models/llm/glm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
# Copyright 2025 Tencent Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re

import torch.nn as nn

from ...compressor.quant.core import PTQSaveVllmHF
from ...utils.utils import find_layers
from ..base_model import BaseLLMModel
from ..model_factory import SlimModelFactory


@SlimModelFactory.register
class GLM(BaseLLMModel):
    """Model adapter for GLM-style LLMs, registered with ``SlimModelFactory``.

    Decoder blocks are expected under the ``model.layers`` prefix; only the
    attention and MLP projection ``nn.Linear`` layers inside those blocks are
    observed for quantization.
    """

    def __init__(
        self,
        model=None,
        deploy_backend="vllm",
    ):
        """Forward ``model`` and ``deploy_backend`` to the base class.

        Args:
            model: Passed through unchanged to ``BaseLLMModel.__init__``.
            deploy_backend: Target backend name; ``get_save_func`` accepts
                ``"vllm"`` and ``"huggingface"``.
        """
        super().__init__(
            model=model,
            deploy_backend=deploy_backend,
        )
        # Name prefix a layer must have to be considered for observation.
        self.block_name = "model.layers"

    def get_observer_layers(self):
        """Collect the ``nn.Linear`` layers to observe and record the rest as ignored.

        A linear layer is observed when its name starts with
        ``self.block_name`` and its last dotted component is one of the known
        projection names. Every other linear layer is appended to the ignore
        list, which is then de-duplicated, sorted and stored in
        ``self.quant_config.quant_algo_info["ignore_layers"]``.

        If ``quant_config.custom_observe_layers_names`` is not ``"default"``,
        each entry is applied as a substring filter that drops the observed
        layers whose name does not contain it.

        Returns:
            dict: Layer name -> module for the layers to observe.
        """
        names = [
            "k_proj",
            "v_proj",
            "q_proj",
            "o_proj",
            "up_proj",
            "gate_proj",
            "down_proj",
        ]
        obs_layers = [nn.Linear]
        observer_layers_dict = {}
        layers_dict = find_layers(self.model, layers=obs_layers)

        ignore_layers = self.skip_layer_names()
        for name, module in layers_dict.items():
            if name.startswith(self.block_name) and name.split(".")[-1] in names:
                observer_layers_dict[name] = module
            else:
                ignore_layers.append(name)
        ignore_layers = sorted(list(set(ignore_layers)))
        self.quant_config.quant_algo_info["ignore_layers"] = ignore_layers

        custom_names = self.quant_config.custom_observe_layers_names
        if custom_names != "default":
            # NOTE(review): filters are applied one after another, so a layer
            # survives only if it contains every custom name - confirm this is
            # the intended semantics when several names are configured.
            for custom_observe_name in custom_names:
                # Iterate over a snapshot of the keys: popping from a dict
                # while iterating its live key view raises RuntimeError.
                for default_name in list(observer_layers_dict):
                    if custom_observe_name not in default_name:
                        observer_layers_dict.pop(default_name)
        return observer_layers_dict

    def get_smooth_mapping_layers(self, smooth_config, mappings=None):
        """Return the smooth-mapping layers, using GLM defaults when none are given.

        Args:
            smooth_config: Must have ``smooth_first_linears`` or
                ``smooth_last_linears`` set to a truthy value.
            mappings: Optional list of ``(linear_names, norm_name)`` pairs;
                exactly two entries are required. Defaults to the q/k/v
                projections paired with ``input_layernorm`` and the gate/up
                projections paired with ``post_attention_layernorm``.

        Returns:
            Whatever ``BaseLLMModel.get_smooth_mapping_layers`` returns.

        Raises:
            AssertionError: If ``mappings`` does not have exactly two entries
                or neither smoothing flag is enabled.
        """
        if mappings is None:
            mappings = [
                (["q_proj", "k_proj", "v_proj"], "input_layernorm"),
                (["gate_proj", "up_proj"], "post_attention_layernorm"),
            ]
        print(f"smooth mappings={mappings}")
        assert len(mappings) == 2
        assert smooth_config.smooth_first_linears or smooth_config.smooth_last_linears
        return super().get_smooth_mapping_layers(smooth_config, mappings)

    def get_parent_dict(self, observer_layers_dict):
        """Map layer names to a parent name with the expert index collapsed.

        Each ``experts.<digits>`` segment in a layer name is rewritten to
        ``experts``. Only names that actually change are included.

        Args:
            observer_layers_dict: Mapping whose keys are layer names.

        Returns:
            dict: Layer name -> rewritten parent name.
        """
        parent_mapping = {r"experts\.\d+": "experts"}
        parent_dict = {}
        for layer_name in observer_layers_dict:
            parent_name = layer_name
            for pattern, replacement in parent_mapping.items():
                # Chain on parent_name so that additional patterns compose
                # instead of discarding the previous substitution.
                parent_name = re.sub(pattern, replacement, parent_name)
            if parent_name != layer_name:
                parent_dict[layer_name] = parent_name
        return parent_dict

    def get_save_func(self):
        """Return the saver class for the configured deploy backend.

        Returns:
            ``PTQSaveVllmHF`` for the ``"vllm"`` and ``"huggingface"`` backends.

        Raises:
            NotImplementedError: For any other ``deploy_backend``.
        """
        if self.deploy_backend in ["vllm", "huggingface"]:
            return PTQSaveVllmHF
        else:
            raise NotImplementedError(
                f"deploy_backend {self.deploy_backend} is not supported for saving."
            )

    @staticmethod
    def _max_amax(amax_dict, keys):
        """Return the maximum of ``amax_dict[key]`` over ``keys``."""
        # Build the full list first so a missing key raises KeyError before
        # any values are compared.
        return max([amax_dict[key] for key in keys])

    def fuse_observer_amax(self, sub_layer, name):
        """Return the (weight, input) amax for ``name``, shared across fused groups.

        When ``name`` contains ``q_proj``, ``k_proj`` or ``v_proj``, the
        maximum over the three sibling q/k/v projections is returned. When it
        contains ``gate_proj`` or ``up_proj``, the maximum over those two
        siblings is returned. Any other layer gets its own recorded values.

        Values are read from ``self.weight_observer_amax_dict`` and
        ``self.input_observer_amax_dict``, which are not populated in this
        class (presumably filled in elsewhere before this is called).

        Args:
            sub_layer: Unused here; kept for interface compatibility.
            name: Fully qualified layer name.

        Returns:
            tuple: ``(weight_observer_amax, input_observer_amax)``.
        """
        if "q_proj" in name or "k_proj" in name or "v_proj" in name:
            siblings = ("q_proj", "k_proj", "v_proj")
        elif "gate_proj" in name or "up_proj" in name:
            siblings = ("gate_proj", "up_proj")
        else:
            weight_observer_amax = self.weight_observer_amax_dict[name]
            input_observer_amax = self.input_observer_amax_dict[name]
            return weight_observer_amax, input_observer_amax

        # Sibling projections share the same parent module prefix.
        prefix = name.rsplit(".", 1)[0]
        keys = [f"{prefix}.{sibling}" for sibling in siblings]
        weight_observer_amax = self._max_amax(self.weight_observer_amax_dict, keys)
        input_observer_amax = self._max_amax(self.input_observer_amax_dict, keys)

        return weight_observer_amax, input_observer_amax
4 changes: 3 additions & 1 deletion angelslim/utils/config_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ class QuantizationConfig:
"""

name: str = field(default="fp8_dynamic")
save_name: str = field(default="compressed-tensors")
bits: int = field(default=8)
quant_method: Dict[str, Any] = field(
default_factory=lambda: {
Expand All @@ -171,6 +172,7 @@ class QuantizationConfig:
quant_helpers: List[str] = field(default_factory=list)
smooth_alpha: float = field(default=0.5)
low_memory: bool = field(default=False)
cpu_convert: bool = field(default=False)
modules_to_quantize: List[str] = field(default_factory=list)
zero_point: bool = field(default=True)
mse_range: bool = field(default=False)
Expand Down Expand Up @@ -493,7 +495,7 @@ def get_default_config() -> FullConfig:
quantization=QuantizationConfig(
name="fp8_dynamic",
bits=8,
ignore_layers=["lm_head", "model.embed_tokens"],
ignore_layers=["lm_head"],
),
),
dataset_config=None,
Expand Down
Loading