Tencent · yghstill · Oct 22, 2025 · Oct 13, 2025 · Oct 14, 2025 · Oct 14, 2025
diff --git a/angelslim/compressor/quant/core/config.py b/angelslim/compressor/quant/core/config.py
@@ -60,6 +60,9 @@ def __init__(self, config, global_config=None):
         self.quant_helpers = quantization_args.quant_helpers
         act_quant_method = quantization_args.quant_method.get("activation", None)
         weight_quant_method = quantization_args.quant_method["weight"]
+        self.cpu_convert = quantization_args.cpu_convert
+        self.save_name = quantization_args.save_name
+
         if global_config:
             self.max_seq_length = global_config.max_seq_length
             self.hidden_size = global_config.hidden_size

diff --git a/angelslim/compressor/quant/core/save.py b/angelslim/compressor/quant/core/save.py
@@ -112,10 +112,13 @@ def __init__(self, quant_model):
         super().__init__(quant_model=quant_model)
 
     def save(self, save_path):
-        deploy_backend = self.quant_model.deploy_backend
-        ignore_field = "ignored_layers" if deploy_backend == "vllm" else "ignore"
+        save_name = self.quant_model.quant_config.save_name
+        ignore_field = (
+            "ignore" if save_name == "compressed-tensors" else "ignored_layers"
+        )
         w_quant_algo = self.quant_model.quant_config.quant_algo_info["w"]
         a_quant_algo = self.quant_model.quant_config.quant_algo_info["a"]
+        is_dynamic = "dynamic" in a_quant_algo
         ignored_layers = self.quant_model.skip_layer_names()
         trtllm_config = {
             "quantization": {
@@ -130,7 +133,7 @@ def save(self, save_path):
             act_config = {
                 "num_bits": 8,
                 "strategy": re.search(r"per-([a-zA-Z]+)", a_quant_algo).group(1),
-                "dynamic": "dynamic" in a_quant_algo,
+                "dynamic": is_dynamic,
                 "type": "float",
             }
             weight_config = {
@@ -145,7 +148,7 @@ def save(self, save_path):
             act_config = {
                 "num_bits": 8,
                 "strategy": re.search(r"per-([a-zA-Z]+)", a_quant_algo).group(1),
-                "dynamic": "dynamic" in a_quant_algo,
+                "dynamic": is_dynamic,
                 "type": "int",
             }
             weight_config = {
@@ -162,7 +165,7 @@ def save(self, save_path):
             act_config = {
                 "num_bits": 4,
                 "group_size": group_size,
-                "dynamic": "dynamic" in a_quant_algo,
+                "dynamic": is_dynamic,
                 "type": "float",
             }
             weight_config = {
@@ -176,23 +179,29 @@ def save(self, save_path):
                 f"{self.quant_model.quant_config.quant_algo} not supported"
             )
 
-        quant_dict = {
-            "quantization_config": {
-                "config_groups": {
-                    "group_0": {
-                        "weights": weight_config,
-                        "input_activations": act_config,
-                        "output_activations": None,
-                        "targets": ["Linear"],
-                    }
-                },
-                "kv_cache_scheme": None,
-                "format": quant_format,
-                ignore_field: ignored_layers,
-                "quantization_status": "compressed",
-                "quant_method": "compressed-tensors",
-            }
-        }
+        quantization_config = {"quant_method": save_name, ignore_field: ignored_layers}
+        if save_name == "compressed-tensors":
+            quantization_config.update(
+                {
+                    "config_groups": {
+                        "group_0": {
+                            "weights": weight_config,
+                            "input_activations": act_config,
+                            "output_activations": None,
+                            "targets": ["Linear"],
+                        }
+                    },
+                    "kv_cache_scheme": None,
+                    "format": quant_format,
+                    "quantization_status": "compressed",
+                }
+            )
+        else:
+            quantization_config["activation_scheme"] = (
+                "dynamic" if is_dynamic else "static"
+            )
+
+        quant_dict = {"quantization_config": quantization_config}
         self.quant_model.get_model().config.update(quant_dict)
         print_info("Save quantization_config: {}".format(quant_dict))
 

diff --git a/angelslim/compressor/quant/modules/helper_layer.py b/angelslim/compressor/quant/modules/helper_layer.py
@@ -575,6 +575,7 @@ def __init__(
     ):
         super().__init__()
         self.quant_algo = quant_algo
+        weight_scale = weight_scale.to(weight.device)
         if "fp8" in quant_algo:
             if "w4a8" in self.quant_algo:
                 max_value_group_wise = weight_scale.clone()

diff --git a/angelslim/compressor/quant/ptq.py b/angelslim/compressor/quant/ptq.py
@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import json
+import os
+
 import torch
+from safetensors.torch import load_file
 
 from ...utils import find_parent_layer_and_sub_name, print_info
 from ..compressor_factory import CompressorFactory
@@ -35,6 +39,7 @@ def __init__(self, model, slim_config=None):
         self.quant_model = model
         # init ptq config of model
         self.quant_model.init_ptq(slim_config)
+        self.model_path = slim_config.get("model_path")
         self.quant_algo = self.quant_model.quant_config.quant_algo
         self.quant_helpers = self.quant_model.quant_config.quant_helpers
         if (
@@ -206,6 +211,35 @@ def _convert(self):
                 )
                 is not None
             ):
+                if sub_layer.weight.device.type == "meta":
+                    with open(
+                        os.path.join(self.model_path, "model.safetensors.index.json"),
+                        "r",
+                    ) as f:
+                        model_index = json.load(f)
+                    orign_w_file = os.path.join(
+                        self.model_path, model_index["weight_map"][name + ".weight"]
+                    )
+                    orign_w = load_file(orign_w_file, device="cpu")
+                    print_info(f"Load meta weight {name} from file {orign_w_file}")
+                    sub_layer.to_empty(device="cpu")
+                    sub_layer.weight.data = orign_w[name + ".weight"]
+
+                    if hasattr(sub_layer, "bias"):
+                        if (name + ".bias") in model_index["weight_map"]:
+                            orign_b_file = os.path.join(
+                                self.model_path,
+                                model_index["weight_map"][name + ".bias"],
+                            )
+                            orign_b = load_file(orign_b_file, device="cpu")
+                            print_info(
+                                f"Load meta bias {name} from file {orign_b_file}"
+                            )
+                            sub_layer.bias.data = orign_b[name + ".bias"]
+                        else:
+                            print_info(f"{name + '.bias'} not found. Set bias to None.")
+                            sub_layer.bias = None
+
                 weight_scales = self.quant_model.get_weight_scales(
                     sub_layer, self.ptq_hook.observer_dict[sub_layer].weight_observer
                 )
@@ -225,6 +259,9 @@ def _convert(self):
                 quant_convert_module, name
             )
 
+            if self.quant_model.quant_config.cpu_convert:
+                sub_layer = sub_layer.to("cpu")
+                print_info(f"Convert layer {name} on cpu")
             if "nvfp4" in self.quant_algo:
                 self.nvfp4.post_process(sub_layer, name)
                 qdq_module = self.quant_model.get_nvfp4_qdq_module(sub_layer, name)

diff --git a/angelslim/engine.py b/angelslim/engine.py
@@ -204,6 +204,7 @@ def prepare_compressor(
             slim_config = {
                 "global_config": global_config,
                 "compress_config": compress_config,
+                "model_path": self.model_path,
             }
         self.compress_type = compress_names
         self.only_inference = (
@@ -271,7 +272,10 @@ def save(
             }
             config_dict["model_config"]["model_path"] = "Base Model Path"
             config_dict["global_config"]["save_path"] = "Save Model Path"
-            config_dict["dataset_config"]["data_path"] = "Data Path"
+            if "dataset_config" in config_dict and isinstance(
+                config_dict["dataset_config"], dict
+            ):
+                config_dict["dataset_config"]["data_path"] = "Data Path"
             with open(os.path.join(save_path, "angelslim_config.json"), "w") as f:
                 json.dump(config_dict, f, indent=4)
 

diff --git a/angelslim/models/llm/__init__.py b/angelslim/models/llm/__init__.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from .deepseek import DeepSeek  # noqa: F401
+from .glm import GLM  # noqa: F401
 from .hunyuan_dense import HunyuanDense  # noqa: F401
 from .hunyuan_moe import HunyuanMoE  # noqa: F401
 from .kimi_k2 import KimiK2  # noqa: F401

diff --git a/angelslim/models/llm/glm.py b/angelslim/models/llm/glm.py
@@ -0,0 +1,198 @@
+# Copyright 2025 Tencent Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+
+import torch.nn as nn
+
+from ...compressor.quant.core import PTQSaveVllmHF
+from ...utils.utils import find_layers
+from ..base_model import BaseLLMModel
+from ..model_factory import SlimModelFactory
+
+
+@SlimModelFactory.register
+class GLM(BaseLLMModel):
+    """GLM model wrapper registered with ``SlimModelFactory``.
+
+    Selects the Linear layers to observe, provides default smooth mappings,
+    and fuses observer amax values across sibling projections.
+    """
+
+    def __init__(
+        self,
+        model=None,
+        deploy_backend="vllm",
+    ):
+        """Initialize the base wrapper and set ``block_name``.
+
+        Args:
+            model: Forwarded unchanged to ``BaseLLMModel.__init__``.
+            deploy_backend: Forwarded to the base class; ``get_save_func``
+                accepts ``"vllm"`` and ``"huggingface"``.
+        """
+        super().__init__(
+            model=model,
+            deploy_backend=deploy_backend,
+        )
+        # Only Linear layers whose name starts with this prefix are observed.
+        self.block_name = "model.layers"
+
+    def get_observer_layers(self):
+        """Return the ``nn.Linear`` layers that should be observed.
+
+        A layer is observed when its name starts with ``self.block_name`` and
+        its last name component is one of the projections listed in ``names``.
+        Every other Linear layer is appended to the ignore list, which is then
+        stored sorted and de-duplicated in
+        ``quant_config.quant_algo_info["ignore_layers"]``.
+
+        When ``quant_config.custom_observe_layers_names`` is not ``"default"``,
+        only layers whose name contains every custom name are kept.
+
+        Returns:
+            dict: Mapping from layer name to the observed module.
+        """
+        names = [
+            "k_proj",
+            "v_proj",
+            "q_proj",
+            "o_proj",
+            "up_proj",
+            "gate_proj",
+            "down_proj",
+        ]
+        obs_layers = [nn.Linear]
+        observer_layers_dict = {}
+        layers_dict = find_layers(self.model, layers=obs_layers)
+
+        ignore_layers = self.skip_layer_names()
+        for name, module in layers_dict.items():
+            if name.startswith(self.block_name) and name.split(".")[-1] in names:
+                observer_layers_dict[name] = module
+            else:
+                ignore_layers.append(name)
+        ignore_layers = sorted(set(ignore_layers))
+        self.quant_config.quant_algo_info["ignore_layers"] = ignore_layers
+
+        custom_names = self.quant_config.custom_observe_layers_names
+        if custom_names != "default":
+            # Build a filtered dict rather than popping entries while
+            # iterating, which raises "dictionary changed size during
+            # iteration" as soon as one layer is dropped.
+            observer_layers_dict = {
+                layer_name: module
+                for layer_name, module in observer_layers_dict.items()
+                if all(custom in layer_name for custom in custom_names)
+            }
+        return observer_layers_dict
+
+    def get_smooth_mapping_layers(self, smooth_config, mappings=None):
+        """Resolve smooth mappings, defaulting to the GLM projection groups.
+
+        Args:
+            smooth_config: Must have a truthy ``smooth_first_linears`` or
+                ``smooth_last_linears`` attribute.
+            mappings: Optional list of exactly two entries shaped like the
+                defaults below; ``None`` selects the defaults.
+
+        Returns:
+            The result of the base-class ``get_smooth_mapping_layers``.
+        """
+        if mappings is None:
+            mappings = [
+                (["q_proj", "k_proj", "v_proj"], "input_layernorm"),
+                (["gate_proj", "up_proj"], "post_attention_layernorm"),
+            ]
+        print(f"smooth mappings={mappings}")
+        assert len(mappings) == 2
+        assert smooth_config.smooth_first_linears or smooth_config.smooth_last_linears
+        return super().get_smooth_mapping_layers(smooth_config, mappings)
+
+    def get_parent_dict(self, observer_layers_dict):
+        """Map expert layer names to a name with the expert index removed.
+
+        ``experts.<digits>`` in a layer name is rewritten to ``experts``.
+        Layers whose name is unchanged by the rewrite are omitted.
+
+        Args:
+            observer_layers_dict: Mapping keyed by layer name.
+
+        Returns:
+            dict: Original layer name -> rewritten parent name.
+        """
+        parent_mapping = {r"experts\.\d+": "experts"}
+        parent_dict = {}
+        for layer_name in observer_layers_dict.keys():
+            parent_name = layer_name
+            for pattern, replacement in parent_mapping.items():
+                # Substitute into the running result so that every mapping
+                # is applied, not only the last one.
+                parent_name = re.sub(pattern, replacement, parent_name)
+            if parent_name != layer_name:
+                parent_dict[layer_name] = parent_name
+        return parent_dict
+
+    def get_save_func(self):
+        """Return the saver class for the configured deploy backend.
+
+        Raises:
+            NotImplementedError: If ``deploy_backend`` is neither ``"vllm"``
+                nor ``"huggingface"``.
+        """
+        if self.deploy_backend in ["vllm", "huggingface"]:
+            return PTQSaveVllmHF
+        else:
+            raise NotImplementedError(
+                f"deploy_backend {self.deploy_backend} is not supported for saving."
+            )
+
+    def _fused_group_amax(self, layer_names):
+        """Return the max weight amax and max input amax over ``layer_names``."""
+        # NOTE(review): relies on the stored amax values being comparable with
+        # the builtin ``max`` -- confirm against the observers that fill them.
+        weight_observer_amax = max(
+            self.weight_observer_amax_dict[key] for key in layer_names
+        )
+        input_observer_amax = max(
+            self.input_observer_amax_dict[key] for key in layer_names
+        )
+        return weight_observer_amax, input_observer_amax
+
+    def fuse_observer_amax(self, sub_layer, name):
+        """Return the (weight, input) amax to use for layer ``name``.
+
+        q/k/v projections share the maximum over their three sibling layers,
+        and gate/up projections share the maximum over their two siblings.
+        Any other layer uses its own recorded values.
+
+        Args:
+            sub_layer: Unused here; kept for interface compatibility.
+            name: Full layer name used as the key into the amax dicts.
+
+        Returns:
+            tuple: ``(weight_observer_amax, input_observer_amax)``.
+        """
+        if "q_proj" in name or "k_proj" in name or "v_proj" in name:
+            siblings = ("q_proj", "k_proj", "v_proj")
+        elif "gate_proj" in name or "up_proj" in name:
+            siblings = ("gate_proj", "up_proj")
+        else:
+            return (
+                self.weight_observer_amax_dict[name],
+                self.input_observer_amax_dict[name],
+            )
+        # Sibling projections live under the same parent module.
+        prefix = name.rsplit(".", 1)[0]
+        return self._fused_group_amax([f"{prefix}.{s}" for s in siblings])
diff --git a/angelslim/utils/config_parser.py b/angelslim/utils/config_parser.py
@@ -160,6 +160,7 @@ class QuantizationConfig:
     """
 
     name: str = field(default="fp8_dynamic")
+    save_name: str = field(default="compressed-tensors")
     bits: int = field(default=8)
     quant_method: Dict[str, Any] = field(
         default_factory=lambda: {
@@ -171,6 +172,7 @@ class QuantizationConfig:
     quant_helpers: List[str] = field(default_factory=list)
     smooth_alpha: float = field(default=0.5)
     low_memory: bool = field(default=False)
+    cpu_convert: bool = field(default=False)
     modules_to_quantize: List[str] = field(default_factory=list)
     zero_point: bool = field(default=True)
     mse_range: bool = field(default=False)
@@ -493,7 +495,7 @@ def get_default_config() -> FullConfig:
                 quantization=QuantizationConfig(
                     name="fp8_dynamic",
                     bits=8,
-                    ignore_layers=["lm_head", "model.embed_tokens"],
+                    ignore_layers=["lm_head"],
                 ),
             ),
             dataset_config=None,