Skip to content

Commit 15ddbd4

Browse files
authored
support glm-4.6 (#89)
1 parent 32307ff commit 15ddbd4

156 files changed

Lines changed: 290 additions & 183 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

angelslim/compressor/quant/core/config.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,9 @@ def __init__(self, config, global_config=None):
6060
self.quant_helpers = quantization_args.quant_helpers
6161
act_quant_method = quantization_args.quant_method.get("activation", None)
6262
weight_quant_method = quantization_args.quant_method["weight"]
63+
self.cpu_convert = quantization_args.cpu_convert
64+
self.save_name = quantization_args.save_name
65+
6366
if global_config:
6467
self.max_seq_length = global_config.max_seq_length
6568
self.hidden_size = global_config.hidden_size

angelslim/compressor/quant/core/save.py

Lines changed: 31 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -112,10 +112,13 @@ def __init__(self, quant_model):
112112
super().__init__(quant_model=quant_model)
113113

114114
def save(self, save_path):
115-
deploy_backend = self.quant_model.deploy_backend
116-
ignore_field = "ignored_layers" if deploy_backend == "vllm" else "ignore"
115+
save_name = self.quant_model.quant_config.save_name
116+
ignore_field = (
117+
"ignore" if save_name == "compressed-tensors" else "ignored_layers"
118+
)
117119
w_quant_algo = self.quant_model.quant_config.quant_algo_info["w"]
118120
a_quant_algo = self.quant_model.quant_config.quant_algo_info["a"]
121+
is_dynamic = "dynamic" in a_quant_algo
119122
ignored_layers = self.quant_model.skip_layer_names()
120123
trtllm_config = {
121124
"quantization": {
@@ -130,7 +133,7 @@ def save(self, save_path):
130133
act_config = {
131134
"num_bits": 8,
132135
"strategy": re.search(r"per-([a-zA-Z]+)", a_quant_algo).group(1),
133-
"dynamic": "dynamic" in a_quant_algo,
136+
"dynamic": is_dynamic,
134137
"type": "float",
135138
}
136139
weight_config = {
@@ -145,7 +148,7 @@ def save(self, save_path):
145148
act_config = {
146149
"num_bits": 8,
147150
"strategy": re.search(r"per-([a-zA-Z]+)", a_quant_algo).group(1),
148-
"dynamic": "dynamic" in a_quant_algo,
151+
"dynamic": is_dynamic,
149152
"type": "int",
150153
}
151154
weight_config = {
@@ -162,7 +165,7 @@ def save(self, save_path):
162165
act_config = {
163166
"num_bits": 4,
164167
"group_size": group_size,
165-
"dynamic": "dynamic" in a_quant_algo,
168+
"dynamic": is_dynamic,
166169
"type": "float",
167170
}
168171
weight_config = {
@@ -176,23 +179,29 @@ def save(self, save_path):
176179
f"{self.quant_model.quant_config.quant_algo} not supported"
177180
)
178181

179-
quant_dict = {
180-
"quantization_config": {
181-
"config_groups": {
182-
"group_0": {
183-
"weights": weight_config,
184-
"input_activations": act_config,
185-
"output_activations": None,
186-
"targets": ["Linear"],
187-
}
188-
},
189-
"kv_cache_scheme": None,
190-
"format": quant_format,
191-
ignore_field: ignored_layers,
192-
"quantization_status": "compressed",
193-
"quant_method": "compressed-tensors",
194-
}
195-
}
182+
quantization_config = {"quant_method": save_name, ignore_field: ignored_layers}
183+
if save_name == "compressed-tensors":
184+
quantization_config.update(
185+
{
186+
"config_groups": {
187+
"group_0": {
188+
"weights": weight_config,
189+
"input_activations": act_config,
190+
"output_activations": None,
191+
"targets": ["Linear"],
192+
}
193+
},
194+
"kv_cache_scheme": None,
195+
"format": quant_format,
196+
"quantization_status": "compressed",
197+
}
198+
)
199+
else:
200+
quantization_config["activation_scheme"] = (
201+
"dynamic" if is_dynamic else "static"
202+
)
203+
204+
quant_dict = {"quantization_config": quantization_config}
196205
self.quant_model.get_model().config.update(quant_dict)
197206
print_info("Save quantization_config: {}".format(quant_dict))
198207

angelslim/compressor/quant/modules/helper_layer.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -575,6 +575,7 @@ def __init__(
575575
):
576576
super().__init__()
577577
self.quant_algo = quant_algo
578+
weight_scale = weight_scale.to(weight.device)
578579
if "fp8" in quant_algo:
579580
if "w4a8" in self.quant_algo:
580581
max_value_group_wise = weight_scale.clone()

angelslim/compressor/quant/ptq.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,11 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
import json
16+
import os
17+
1518
import torch
19+
from safetensors.torch import load_file
1620

1721
from ...utils import find_parent_layer_and_sub_name, print_info
1822
from ..compressor_factory import CompressorFactory
@@ -35,6 +39,7 @@ def __init__(self, model, slim_config=None):
3539
self.quant_model = model
3640
# init ptq config of model
3741
self.quant_model.init_ptq(slim_config)
42+
self.model_path = slim_config.get("model_path")
3843
self.quant_algo = self.quant_model.quant_config.quant_algo
3944
self.quant_helpers = self.quant_model.quant_config.quant_helpers
4045
if (
@@ -206,6 +211,35 @@ def _convert(self):
206211
)
207212
is not None
208213
):
214+
if sub_layer.weight.device.type == "meta":
215+
with open(
216+
os.path.join(self.model_path, "model.safetensors.index.json"),
217+
"r",
218+
) as f:
219+
model_index = json.load(f)
220+
orign_w_file = os.path.join(
221+
self.model_path, model_index["weight_map"][name + ".weight"]
222+
)
223+
orign_w = load_file(orign_w_file, device="cpu")
224+
print_info(f"Load meta weight {name} from file {orign_w_file}")
225+
sub_layer.to_empty(device="cpu")
226+
sub_layer.weight.data = orign_w[name + ".weight"]
227+
228+
if hasattr(sub_layer, "bias"):
229+
if (name + ".bias") in model_index["weight_map"]:
230+
orign_b_file = os.path.join(
231+
self.model_path,
232+
model_index["weight_map"][name + ".bias"],
233+
)
234+
orign_b = load_file(orign_b_file, device="cpu")
235+
print_info(
236+
f"Load meta bias {name} from file {orign_b_file}"
237+
)
238+
sub_layer.bias.data = orign_b[name + ".bias"]
239+
else:
240+
print_info(f"{name + '.bias'} not found. Set bias to None.")
241+
sub_layer.bias = None
242+
209243
weight_scales = self.quant_model.get_weight_scales(
210244
sub_layer, self.ptq_hook.observer_dict[sub_layer].weight_observer
211245
)
@@ -225,6 +259,9 @@ def _convert(self):
225259
quant_convert_module, name
226260
)
227261

262+
if self.quant_model.quant_config.cpu_convert:
263+
sub_layer = sub_layer.to("cpu")
264+
print_info(f"Convert layer {name} on cpu")
228265
if "nvfp4" in self.quant_algo:
229266
self.nvfp4.post_process(sub_layer, name)
230267
qdq_module = self.quant_model.get_nvfp4_qdq_module(sub_layer, name)

angelslim/engine.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,7 @@ def prepare_compressor(
204204
slim_config = {
205205
"global_config": global_config,
206206
"compress_config": compress_config,
207+
"model_path": self.model_path,
207208
}
208209
self.compress_type = compress_names
209210
self.only_inference = (
@@ -271,7 +272,10 @@ def save(
271272
}
272273
config_dict["model_config"]["model_path"] = "Base Model Path"
273274
config_dict["global_config"]["save_path"] = "Save Model Path"
274-
config_dict["dataset_config"]["data_path"] = "Data Path"
275+
if "dataset_config" in config_dict and isinstance(
276+
config_dict["dataset_config"], dict
277+
):
278+
config_dict["dataset_config"]["data_path"] = "Data Path"
275279
with open(os.path.join(save_path, "angelslim_config.json"), "w") as f:
276280
json.dump(config_dict, f, indent=4)
277281

angelslim/models/llm/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414

1515
from .deepseek import DeepSeek # noqa: F401
16+
from .glm import GLM # noqa: F401
1617
from .hunyuan_dense import HunyuanDense # noqa: F401
1718
from .hunyuan_moe import HunyuanMoE # noqa: F401
1819
from .kimi_k2 import KimiK2 # noqa: F401

angelslim/models/llm/glm.py

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
# Copyright 2025 Tencent Inc. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import re
16+
17+
import torch.nn as nn
18+
19+
from ...compressor.quant.core import PTQSaveVllmHF
20+
from ...utils.utils import find_layers
21+
from ..base_model import BaseLLMModel
22+
from ..model_factory import SlimModelFactory
23+
24+
25+
@SlimModelFactory.register
class GLM(BaseLLMModel):
    """GLM model wrapper registered with ``SlimModelFactory``.

    Chooses which ``nn.Linear`` layers are observed for quantization,
    provides the smoothing mappings, maps per-expert layer names to a
    shared parent name, and fuses observer amax values across sibling
    projections.
    """

    def __init__(
        self,
        model=None,
        deploy_backend="vllm",
    ):
        """Forward ``model`` and ``deploy_backend`` to the base class.

        Args:
            model: Passed through unchanged to ``BaseLLMModel``.
            deploy_backend: Passed through unchanged to ``BaseLLMModel``.
        """
        super().__init__(
            model=model,
            deploy_backend=deploy_backend,
        )
        # Only layers whose name starts with this prefix are observed.
        self.block_name = "model.layers"

    def get_observer_layers(self):
        """Return the ``{layer_name: module}`` dict of layers to observe.

        A linear layer is observed when its name starts with
        ``self.block_name`` and its last dotted component is one of the
        projection names listed below. Every other linear layer is added
        to the ignore list, which is de-duplicated, sorted and stored in
        ``quant_config.quant_algo_info["ignore_layers"]``.

        When ``quant_config.custom_observe_layers_names`` is not
        ``"default"``, the result is narrowed to layers whose name
        contains the configured custom names.
        """
        names = [
            "k_proj",
            "v_proj",
            "q_proj",
            "o_proj",
            "up_proj",
            "gate_proj",
            "down_proj",
        ]
        obs_layers = [nn.Linear]
        observer_layers_dict = {}
        layers_dict = find_layers(self.model, layers=obs_layers)

        ignore_layers = self.skip_layer_names()
        for name, module in layers_dict.items():
            if name.startswith(self.block_name) and name.split(".")[-1] in names:
                observer_layers_dict[name] = module
            else:
                ignore_layers.append(name)
        ignore_layers = sorted(set(ignore_layers))
        self.quant_config.quant_algo_info["ignore_layers"] = ignore_layers

        custom_names = self.quant_config.custom_observe_layers_names
        if custom_names != "default":
            # Build a new dict instead of popping while iterating: removing
            # entries from a dict during iteration raises RuntimeError.
            custom_names = list(custom_names)
            # NOTE(review): this keeps the sequential-filter semantics of the
            # previous loop, i.e. a layer survives only if it contains EVERY
            # custom name. Confirm whether "contains ANY" was intended.
            observer_layers_dict = {
                layer_name: module
                for layer_name, module in observer_layers_dict.items()
                if all(custom_name in layer_name for custom_name in custom_names)
            }
        return observer_layers_dict

    def get_smooth_mapping_layers(self, smooth_config, mappings=None):
        """Return the smooth mapping layers computed by the base class.

        Args:
            smooth_config: Must have ``smooth_first_linears`` or
                ``smooth_last_linears`` set to a truthy value.
            mappings: Optional list of two ``(linear_names, norm_name)``
                pairs. Defaults to the attention and MLP mappings below.

        Raises:
            AssertionError: If ``mappings`` does not hold exactly two
                entries, or neither smooth flag is set.
        """
        if mappings is None:
            mappings = [
                (["q_proj", "k_proj", "v_proj"], "input_layernorm"),
                (["gate_proj", "up_proj"], "post_attention_layernorm"),
            ]
        print(f"smooth mappings={mappings}")
        assert len(mappings) == 2
        assert smooth_config.smooth_first_linears or smooth_config.smooth_last_linears
        return super().get_smooth_mapping_layers(smooth_config, mappings)

    def get_parent_dict(self, observer_layers_dict):
        """Map layer names to a parent name with the expert index removed.

        Args:
            observer_layers_dict: Mapping whose keys are layer names.

        Returns:
            A dict ``{layer_name: parent_name}`` containing only the
            layers whose name matched ``experts.<digits>`` (replaced by
            ``experts``); unmatched layers are omitted.
        """
        parent_mapping = {r"experts\.\d+": "experts"}
        parent_dict = {}
        for layer_name in observer_layers_dict:
            parent_name = layer_name
            for pattern, replacement in parent_mapping.items():
                parent_name = re.sub(pattern, replacement, layer_name)
            if parent_name != layer_name:
                parent_dict[layer_name] = parent_name
        return parent_dict

    def get_save_func(self):
        """Return the save class for the configured deploy backend.

        Raises:
            NotImplementedError: If ``self.deploy_backend`` is neither
                ``"vllm"`` nor ``"huggingface"``.
        """
        if self.deploy_backend in ["vllm", "huggingface"]:
            return PTQSaveVllmHF
        else:
            raise NotImplementedError(
                f"deploy_backend {self.deploy_backend} is not supported for saving."
            )

    def fuse_observer_amax(self, sub_layer, name):
        """Return ``(weight_amax, input_amax)`` for ``name``.

        For q/k/v projections, and likewise for gate/up projections, the
        result is the built-in ``max`` over the sibling layers that share
        the same parent prefix. Any other layer gets its own entries.

        Args:
            sub_layer: Unused here; kept for interface compatibility.
            name: Dotted layer name used as the key into
                ``self.weight_observer_amax_dict`` and
                ``self.input_observer_amax_dict``.

        Raises:
            KeyError: If a required layer name is missing from either
                amax dict.
        """
        if any(tag in name for tag in ("q_proj", "k_proj", "v_proj")):
            siblings = ("q_proj", "k_proj", "v_proj")
        elif "gate_proj" in name or "up_proj" in name:
            siblings = ("gate_proj", "up_proj")
        else:
            weight_observer_amax = self.weight_observer_amax_dict[name]
            input_observer_amax = self.input_observer_amax_dict[name]
            return weight_observer_amax, input_observer_amax

        # Siblings live under the same parent module, so only the last
        # dotted component of the name differs.
        prefix = name.rsplit(".", 1)[0]
        keys = [f"{prefix}.{sibling}" for sibling in siblings]

        # NOTE(review): max() relies on the stored values being orderable;
        # presumably they are scalar amax tensors -- confirm.
        weight_observer_amax = max(self.weight_observer_amax_dict[key] for key in keys)
        input_observer_amax = max(self.input_observer_amax_dict[key] for key in keys)

        return weight_observer_amax, input_observer_amax

angelslim/utils/config_parser.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ class QuantizationConfig:
160160
"""
161161

162162
name: str = field(default="fp8_dynamic")
163+
save_name: str = field(default="compressed-tensors")
163164
bits: int = field(default=8)
164165
quant_method: Dict[str, Any] = field(
165166
default_factory=lambda: {
@@ -171,6 +172,7 @@ class QuantizationConfig:
171172
quant_helpers: List[str] = field(default_factory=list)
172173
smooth_alpha: float = field(default=0.5)
173174
low_memory: bool = field(default=False)
175+
cpu_convert: bool = field(default=False)
174176
modules_to_quantize: List[str] = field(default_factory=list)
175177
zero_point: bool = field(default=True)
176178
mse_range: bool = field(default=False)
@@ -493,7 +495,7 @@ def get_default_config() -> FullConfig:
493495
quantization=QuantizationConfig(
494496
name="fp8_dynamic",
495497
bits=8,
496-
ignore_layers=["lm_head", "model.embed_tokens"],
498+
ignore_layers=["lm_head"],
497499
),
498500
),
499501
dataset_config=None,

0 commit comments

Comments
 (0)