From b650b8b3ed26360a06aa3d611cd936cc5172a506 Mon Sep 17 00:00:00 2001 From: RuBing-Yang Date: Mon, 13 Oct 2025 19:39:43 +0800 Subject: [PATCH 1/5] support glm-4.6 --- angelslim/compressor/quant/core/config.py | 2 + .../compressor/quant/modules/helper_layer.py | 1 + angelslim/compressor/quant/ptq.py | 21 +++ angelslim/engine.py | 1 + angelslim/models/llm/__init__.py | 1 + angelslim/models/llm/glm.py | 135 ++++++++++++++++++ angelslim/utils/config_parser.py | 1 + .../glm4/fp8_dynamic/glm4_6-fp8_dynamic.yaml | 27 ++++ .../glm4/fp8_static/glm4_6-fp8_static.yaml | 35 +++++ 9 files changed, 224 insertions(+) create mode 100644 angelslim/models/llm/glm.py create mode 100644 configs/glm4/fp8_dynamic/glm4_6-fp8_dynamic.yaml create mode 100644 configs/glm4/fp8_static/glm4_6-fp8_static.yaml diff --git a/angelslim/compressor/quant/core/config.py b/angelslim/compressor/quant/core/config.py index 54b3b624..ab51314e 100644 --- a/angelslim/compressor/quant/core/config.py +++ b/angelslim/compressor/quant/core/config.py @@ -60,6 +60,8 @@ def __init__(self, config, global_config=None): self.quant_helpers = quantization_args.quant_helpers act_quant_method = quantization_args.quant_method.get("activation", None) weight_quant_method = quantization_args.quant_method["weight"] + self.cpu_convert = quantization_args.cpu_convert + if global_config: self.max_seq_length = global_config.max_seq_length self.hidden_size = global_config.hidden_size diff --git a/angelslim/compressor/quant/modules/helper_layer.py b/angelslim/compressor/quant/modules/helper_layer.py index a67a84e8..4ba0ff24 100644 --- a/angelslim/compressor/quant/modules/helper_layer.py +++ b/angelslim/compressor/quant/modules/helper_layer.py @@ -575,6 +575,7 @@ def __init__( ): super().__init__() self.quant_algo = quant_algo + weight_scale = weight_scale.to(weight.device) if "fp8" in quant_algo: if "w4a8" in self.quant_algo: max_value_group_wise = weight_scale.clone() diff --git a/angelslim/compressor/quant/ptq.py b/angelslim/compressor/quant/ptq.py index 1b917b3b..cb36d65f 100644 --- a/angelslim/compressor/quant/ptq.py +++ b/angelslim/compressor/quant/ptq.py @@ -12,7 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json +import os + import torch +from safetensors.torch import load_file from ...utils import find_parent_layer_and_sub_name, print_info from ..compressor_factory import CompressorFactory @@ -35,6 +39,7 @@ def __init__(self, model, slim_config=None): self.quant_model = model # init ptq config of model self.quant_model.init_ptq(slim_config) + self.model_path = slim_config.get("model_path") self.quant_algo = self.quant_model.quant_config.quant_algo self.quant_helpers = self.quant_model.quant_config.quant_helpers if ( @@ -206,6 +211,19 @@ def _convert(self): ) is not None ): + if sub_layer.weight.device.type == "meta": + with open( + os.path.join(self.model_path, "model.safetensors.index.json"), + "r", + ) as f: + model_index = json.load(f) + orign_w_file = os.path.join( + self.model_path, model_index["weight_map"][name + ".weight"] + ) + orign_w = load_file(orign_w_file, device="cpu") + print_info(f"Load meta weight {name} from file {orign_w_file}") + sub_layer.to_empty(device="cpu") + sub_layer.weight.data = orign_w[name + ".weight"] weight_scales = self.quant_model.get_weight_scales( sub_layer, self.ptq_hook.observer_dict[sub_layer].weight_observer ) @@ -225,6 +243,9 @@ def _convert(self): quant_convert_module, name ) + if self.quant_model.quant_config.cpu_convert: + sub_layer = sub_layer.to("cpu") + print_info(f"Convert layer {name} on cpu") if "nvfp4" in self.quant_algo: self.nvfp4.post_process(sub_layer, name) qdq_module = self.quant_model.get_nvfp4_qdq_module(sub_layer, name) diff --git a/angelslim/engine.py b/angelslim/engine.py index 5d1c45fd..44296126 100644 --- a/angelslim/engine.py +++ b/angelslim/engine.py @@ -204,6 +204,7 @@ def prepare_compressor( slim_config = { "global_config": global_config, "compress_config": compress_config, + "model_path": self.model_path, } self.compress_type = compress_names self.only_inference = ( diff --git a/angelslim/models/llm/__init__.py b/angelslim/models/llm/__init__.py index cb3b45ff..e8735382 100644 --- a/angelslim/models/llm/__init__.py +++ b/angelslim/models/llm/__init__.py @@ -13,6 +13,7 @@ # limitations under the License. from .deepseek import DeepSeek # noqa: F401 +from .glm import GLM # noqa: F401 from .hunyuan_dense import HunyuanDense # noqa: F401 from .hunyuan_moe import HunyuanMoE # noqa: F401 from .kimi_k2 import KimiK2 # noqa: F401 diff --git a/angelslim/models/llm/glm.py b/angelslim/models/llm/glm.py new file mode 100644 index 00000000..3c7990b6 --- /dev/null +++ b/angelslim/models/llm/glm.py @@ -0,0 +1,135 @@ +# Copyright 2025 Tencent Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re + +import torch.nn as nn + +from ...compressor.quant.core import PTQSaveVllmHF +from ...utils.utils import find_layers +from ..base_model import BaseLLMModel +from ..model_factory import SlimModelFactory + + +@SlimModelFactory.register +class GLM(BaseLLMModel): + def __init__( + self, + model=None, + deploy_backend="vllm", + ): + super().__init__( + model=model, + deploy_backend=deploy_backend, + ) + self.block_name = "model.layers" + + def get_observer_layers(self): + names = [ + "k_proj", + "v_proj", + "q_proj", + "o_proj", + "up_proj", + "gate_proj", + "down_proj", + ] + obs_layers = [nn.Linear] + observer_layers_dict = {} + layers_dict = find_layers(self.model, layers=obs_layers) + + ignore_layers = self.skip_layer_names() + for name, module in layers_dict.items(): + if name.startswith(self.block_name) and name.split(".")[-1] in names: + observer_layers_dict[name] = module + else: + ignore_layers.append(name) + self.quant_config.quant_algo_info["ignore_layers"] = ignore_layers + + if self.quant_config.custom_observe_layers_names != "default": + for custom_observe_name in self.quant_config.custom_observe_layers_names: + for default_name in observer_layers_dict.keys(): + if custom_observe_name not in default_name: + observer_layers_dict.pop(default_name) + return observer_layers_dict + + def get_smooth_mapping_layers(self, smooth_config, mappings=None): + if mappings is None: + mappings = [ + (["q_proj", "k_proj", "v_proj"], "input_layernorm"), + (["gate_proj", "up_proj"], "post_attention_layernorm"), + ] + print(f"smooth mappings={mappings}") + assert len(mappings) == 2 + assert smooth_config.smooth_first_linears or smooth_config.smooth_last_linears + return super().get_smooth_mapping_layers(smooth_config, mappings) + + def get_parent_dict(self, observer_layers_dict): + parent_mapping = {r"experts\.\d+": "experts"} + parent_dict = {} + for layer_name in observer_layers_dict.keys(): + parent_name = layer_name + for k, v in parent_mapping.items(): + parent_name = re.sub(k, v, layer_name) + if parent_name != layer_name: + parent_dict[layer_name] = parent_name + return parent_dict + + def get_save_func(self): + if self.deploy_backend in ["vllm", "huggingface"]: + return PTQSaveVllmHF + else: + raise NotImplementedError( + f"deploy_backend {self.deploy_backend} is not supported for saving." + ) + + def fuse_observer_amax(self, sub_layer, name): + if "q_proj" in name or "k_proj" in name or "v_proj" in name: + prefix = name.rsplit(".", 1)[0] + q_name = f"{prefix}.q_proj" + k_name = f"{prefix}.k_proj" + v_name = f"{prefix}.v_proj" + + weight_scales = [] + for key in [q_name, k_name, v_name]: + tensor = self.weight_observer_amax_dict[key] + weight_scales.append(tensor) + weight_observer_amax = max(weight_scales) + + act_scales = [] + for key in [q_name, k_name, v_name]: + tensor = self.input_observer_amax_dict[key] + act_scales.append(tensor) + input_observer_amax = max(act_scales) + elif "gate_proj" in name or "up_proj" in name: + prefix = name.rsplit(".", 1)[0] + gate_name = f"{prefix}.gate_proj" + up_name = f"{prefix}.up_proj" + + weight_scales = [] + for key in [gate_name, up_name]: + tensor = self.weight_observer_amax_dict[key] + weight_scales.append(tensor) + weight_observer_amax = max(weight_scales) + + act_scales = [] + for key in [gate_name, up_name]: + tensor = self.input_observer_amax_dict[key] + act_scales.append(tensor) + input_observer_amax = max(act_scales) + else: + weight_observer_amax = self.weight_observer_amax_dict[name] + input_observer_amax = self.input_observer_amax_dict[name] + + return weight_observer_amax, input_observer_amax diff --git a/angelslim/utils/config_parser.py b/angelslim/utils/config_parser.py index 240449f4..cddb445f 100644 --- a/angelslim/utils/config_parser.py +++ b/angelslim/utils/config_parser.py @@ -171,6 +171,7 @@ class QuantizationConfig: quant_helpers: List[str] = field(default_factory=list) smooth_alpha: float = field(default=0.5) low_memory: bool = field(default=False) + cpu_convert: bool = field(default=False) modules_to_quantize: List[str] = field(default_factory=list) zero_point: bool = field(default=True) mse_range: bool = field(default=False) diff --git a/configs/glm4/fp8_dynamic/glm4_6-fp8_dynamic.yaml b/configs/glm4/fp8_dynamic/glm4_6-fp8_dynamic.yaml new file mode 100644 index 00000000..d18384ba --- /dev/null +++ b/configs/glm4/fp8_dynamic/glm4_6-fp8_dynamic.yaml @@ -0,0 +1,27 @@ +# Global configuration of pipeline +global: + save_path: ./output + +# Simplified Configuration for LLM compression +model: + name: GLM + model_path: zai-org/GLM-4.6 + trust_remote_code: true + low_cpu_mem_usage: true + use_cache: false + torch_dtype: auto + device_map: auto + +# Compression configuration +compression: + name: PTQ + quantization: + name: fp8_dynamic + bits: 8 + quant_method: + weight: "per-tensor" + activation: "per-tensor" + ignore_layers: # Skip quantization for these layers + - "lm_head" + - "model.embed_tokens" + cpu_convert: true diff --git a/configs/glm4/fp8_static/glm4_6-fp8_static.yaml b/configs/glm4/fp8_static/glm4_6-fp8_static.yaml new file mode 100644 index 00000000..31b58216 --- /dev/null +++ b/configs/glm4/fp8_static/glm4_6-fp8_static.yaml @@ -0,0 +1,35 @@ +# Global configuration of pipeline +global: + save_path: ./output + +# Simplified Configuration for LLM compression +model: + name: GLM + model_path: zai-org/GLM-4.6 + trust_remote_code: true + low_cpu_mem_usage: true + use_cache: false + torch_dtype: auto + device_map: auto + +# Compression configuration +compression: + name: PTQ + quantization: + name: fp8_static + bits: 8 + quant_method: + weight: "per-tensor" + activation: "per-tensor" + ignore_layers: # Skip quantization for these layers + - "lm_head" + - "model.embed_tokens" + cpu_convert: true + +# Dataset for calibration +dataset: + name: TextDataset + data_path: dataset/sharegpt_gpt4_qwen/sharegpt_gpt4-qwen3_a22B_output.jsonl + max_seq_length: 4096 + num_samples: 256 + batch_size: 1 From d91655d2c256de163f81892c582ab38a243deefd Mon Sep 17 00:00:00 2001 From: RuBing-Yang Date: Tue, 14 Oct 2025 16:08:09 +0800 Subject: [PATCH 2/5] ignore glm shared_experts --- angelslim/models/llm/glm.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/angelslim/models/llm/glm.py b/angelslim/models/llm/glm.py index 3c7990b6..6939e4f2 100644 --- a/angelslim/models/llm/glm.py +++ b/angelslim/models/llm/glm.py @@ -45,17 +45,24 @@ def get_observer_layers(self): "gate_proj", "down_proj", ] + ignore_name = "shared_experts" obs_layers = [nn.Linear] observer_layers_dict = {} layers_dict = find_layers(self.model, layers=obs_layers) ignore_layers = self.skip_layer_names() for name, module in layers_dict.items(): - if name.startswith(self.block_name) and name.split(".")[-1] in names: + if ( + name.startswith(self.block_name) + and name.split(".")[-1] in names + and ignore_name not in name + ): observer_layers_dict[name] = module else: ignore_layers.append(name) - self.quant_config.quant_algo_info["ignore_layers"] = ignore_layers + self.quant_config.quant_algo_info["ignore_layers"] = sorted( + list(set(ignore_layers)) + ) if self.quant_config.custom_observe_layers_names != "default": for custom_observe_name in self.quant_config.custom_observe_layers_names: From fd24960550e01ceaf8baadb8ce2d9c3a5e82ee67 Mon Sep 17 00:00:00 2001 From: RuBing-Yang Date: Tue, 14 Oct 2025 19:31:01 +0800 Subject: [PATCH 3/5] fix dataset path problem --- angelslim/engine.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/angelslim/engine.py b/angelslim/engine.py index 44296126..f3cd979d 100644 --- a/angelslim/engine.py +++ b/angelslim/engine.py @@ -272,7 +272,10 @@ def save( } config_dict["model_config"]["model_path"] = "Base Model Path" config_dict["global_config"]["save_path"] = "Save Model Path" - config_dict["dataset_config"]["data_path"] = "Data Path" + if "dataset_config" in config_dict and isinstance( + config_dict["dataset_config"], dict + ): + config_dict["dataset_config"]["data_path"] = "Data Path" with open(os.path.join(save_path, "angelslim_config.json"), "w") as f: json.dump(config_dict, f, indent=4) From 261c1b55bf5ed10018001d2863876b56f5fe1d09 Mon Sep 17 00:00:00 2001 From: RuBing-Yang Date: Mon, 20 Oct 2025 19:29:42 +0800 Subject: [PATCH 4/5] fix offloaded bias nan bug & add save_name for saving quantization_config --- angelslim/compressor/quant/core/config.py | 1 + angelslim/compressor/quant/core/save.py | 53 +++++++++++-------- angelslim/compressor/quant/ptq.py | 16 ++++++ angelslim/models/llm/glm.py | 12 ++--- angelslim/utils/config_parser.py | 1 + .../glm4/fp8_dynamic/glm4_6-fp8_dynamic.yaml | 2 +- .../glm4/fp8_static/glm4_6-fp8_static.yaml | 2 +- 7 files changed, 54 insertions(+), 33 deletions(-) diff --git a/angelslim/compressor/quant/core/config.py b/angelslim/compressor/quant/core/config.py index ab51314e..709e6462 100644 --- a/angelslim/compressor/quant/core/config.py +++ b/angelslim/compressor/quant/core/config.py @@ -61,6 +61,7 @@ def __init__(self, config, global_config=None): act_quant_method = quantization_args.quant_method.get("activation", None) weight_quant_method = quantization_args.quant_method["weight"] self.cpu_convert = quantization_args.cpu_convert + self.save_name = quantization_args.save_name if global_config: self.max_seq_length = global_config.max_seq_length diff --git a/angelslim/compressor/quant/core/save.py b/angelslim/compressor/quant/core/save.py index 21d2695c..be01d32a 100644 --- a/angelslim/compressor/quant/core/save.py +++ b/angelslim/compressor/quant/core/save.py @@ -112,10 +112,13 @@ def __init__(self, quant_model): super().__init__(quant_model=quant_model) def save(self, save_path): - deploy_backend = self.quant_model.deploy_backend - ignore_field = "ignored_layers" if deploy_backend == "vllm" else "ignore" + save_name = self.quant_model.quant_config.save_name + ignore_field = ( + "ignore" if save_name == "compressed-tensors" else "ignored_layers" + ) w_quant_algo = self.quant_model.quant_config.quant_algo_info["w"] a_quant_algo = self.quant_model.quant_config.quant_algo_info["a"] + is_dynamic = "dynamic" in a_quant_algo ignored_layers = self.quant_model.skip_layer_names() trtllm_config = { "quantization": { @@ -130,7 +133,7 @@ def save(self, save_path): act_config = { "num_bits": 8, "strategy": re.search(r"per-([a-zA-Z]+)", a_quant_algo).group(1), - "dynamic": "dynamic" in a_quant_algo, + "dynamic": is_dynamic, "type": "float", } weight_config = { @@ -145,7 +148,7 @@ def save(self, save_path): act_config = { "num_bits": 8, "strategy": re.search(r"per-([a-zA-Z]+)", a_quant_algo).group(1), - "dynamic": "dynamic" in a_quant_algo, + "dynamic": is_dynamic, "type": "int", } weight_config = { @@ -162,7 +165,7 @@ def save(self, save_path): act_config = { "num_bits": 4, "group_size": group_size, - "dynamic": "dynamic" in a_quant_algo, + "dynamic": is_dynamic, "type": "float", } weight_config = { @@ -176,23 +179,29 @@ def save(self, save_path): f"{self.quant_model.quant_config.quant_algo} not supported" ) - quant_dict = { - "quantization_config": { - "config_groups": { - "group_0": { - "weights": weight_config, - "input_activations": act_config, - "output_activations": None, - "targets": ["Linear"], - } - }, - "kv_cache_scheme": None, - "format": quant_format, - ignore_field: ignored_layers, - "quantization_status": "compressed", - "quant_method": "compressed-tensors", - } - } + quantization_config = {"quant_method": save_name, ignore_field: ignored_layers} + if save_name == "compressed-tensors": + quantization_config.update( + { + "config_groups": { + "group_0": { + "weights": weight_config, + "input_activations": act_config, + "output_activations": None, + "targets": ["Linear"], + } + }, + "kv_cache_scheme": None, + "format": quant_format, + "quantization_status": "compressed", + } + ) + else: + quantization_config["activation_scheme"] = ( + "dynamic" if is_dynamic else "static" + ) + + quant_dict = {"quantization_config": quantization_config} self.quant_model.get_model().config.update(quant_dict) print_info("Save quantization_config: {}".format(quant_dict)) diff --git a/angelslim/compressor/quant/ptq.py b/angelslim/compressor/quant/ptq.py index cb36d65f..e69120b1 100644 --- a/angelslim/compressor/quant/ptq.py +++ b/angelslim/compressor/quant/ptq.py @@ -224,6 +224,22 @@ def _convert(self): print_info(f"Load meta weight {name} from file {orign_w_file}") sub_layer.to_empty(device="cpu") sub_layer.weight.data = orign_w[name + ".weight"] + + if hasattr(sub_layer, "bias"): + if (name + ".bias") in model_index["weight_map"]: + orign_b_file = os.path.join( + self.model_path, + model_index["weight_map"][name + ".bias"], + ) + orign_b = load_file(orign_b_file, device="cpu") + print_info( + f"Load meta bias {name} from file {orign_b_file}" + ) + sub_layer.bias.data = orign_b[name + ".bias"] + else: + print_info(f"{name + '.bias'} not found. Set bias to None.") + sub_layer.bias = None + weight_scales = self.quant_model.get_weight_scales( sub_layer, self.ptq_hook.observer_dict[sub_layer].weight_observer ) diff --git a/angelslim/models/llm/glm.py b/angelslim/models/llm/glm.py index 6939e4f2..d11a0b78 100644 --- a/angelslim/models/llm/glm.py +++ b/angelslim/models/llm/glm.py @@ -45,24 +45,18 @@ def get_observer_layers(self): "gate_proj", "down_proj", ] - ignore_name = "shared_experts" obs_layers = [nn.Linear] observer_layers_dict = {} layers_dict = find_layers(self.model, layers=obs_layers) ignore_layers = self.skip_layer_names() for name, module in layers_dict.items(): - if ( - name.startswith(self.block_name) - and name.split(".")[-1] in names - and ignore_name not in name - ): + if name.startswith(self.block_name) and name.split(".")[-1] in names: observer_layers_dict[name] = module else: ignore_layers.append(name) - self.quant_config.quant_algo_info["ignore_layers"] = sorted( - list(set(ignore_layers)) - ) + ignore_layers = sorted(list(set(ignore_layers))) + self.quant_config.quant_algo_info["ignore_layers"] = ignore_layers if self.quant_config.custom_observe_layers_names != "default": for custom_observe_name in self.quant_config.custom_observe_layers_names: diff --git a/angelslim/utils/config_parser.py b/angelslim/utils/config_parser.py index cddb445f..8498ede4 100644 --- a/angelslim/utils/config_parser.py +++ b/angelslim/utils/config_parser.py @@ -160,6 +160,7 @@ class QuantizationConfig: """ name: str = field(default="fp8_dynamic") + save_name: str = field(default="compressed-tensors") bits: int = field(default=8) quant_method: Dict[str, Any] = field( default_factory=lambda: { diff --git a/configs/glm4/fp8_dynamic/glm4_6-fp8_dynamic.yaml b/configs/glm4/fp8_dynamic/glm4_6-fp8_dynamic.yaml index d18384ba..085053ff 100644 --- a/configs/glm4/fp8_dynamic/glm4_6-fp8_dynamic.yaml +++ b/configs/glm4/fp8_dynamic/glm4_6-fp8_dynamic.yaml @@ -17,11 +17,11 @@ compression: name: PTQ quantization: name: fp8_dynamic + save_name: fp8 bits: 8 quant_method: weight: "per-tensor" activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" cpu_convert: true diff --git a/configs/glm4/fp8_static/glm4_6-fp8_static.yaml b/configs/glm4/fp8_static/glm4_6-fp8_static.yaml index 31b58216..1d8f7290 100644 --- a/configs/glm4/fp8_static/glm4_6-fp8_static.yaml +++ b/configs/glm4/fp8_static/glm4_6-fp8_static.yaml @@ -17,13 +17,13 @@ compression: name: PTQ quantization: name: fp8_static + save_name: fp8 bits: 8 quant_method: weight: "per-tensor" activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" cpu_convert: true # Dataset for calibration From d56b9c0efa8db8e85cdd91ec5260839deef92163 Mon Sep 17 00:00:00 2001 From: RuBing-Yang Date: Tue, 21 Oct 2025 18:37:23 +0800 Subject: [PATCH 5/5] fix vllm cannot ignore embedding bug --- angelslim/utils/config_parser.py | 2 +- angelslim/utils/default_compress_config.py | 14 +++++++------- .../fp8_static/deepseek_r1_fp8_static.yaml | 1 - .../deepseek_r1_fp8_static_low_memmory.yaml | 1 - .../deepseek_r1/int4_awq/deepseek_r1_int4_awq.yaml | 1 - .../deepseek_r1/w4a8_fp8/deepseek_r1_w4a8_fp8.yaml | 1 - .../w4a8_fp8/deepseek_r1_w4a8_fp8_low_memmory.yaml | 1 - .../deepseek_r1_distill_qwen-14b_fp8_dynamic.yaml | 1 - .../deepseek_r1_distill_qwen-1_5b_fp8_dynamic.yaml | 1 - .../deepseek_r1_distill_qwen-32b_fp8_dynamic.yaml | 1 - .../deepseek_r1_distill_qwen-7b_fp8_dynamic.yaml | 1 - .../deepseek_r1_distill_qwen-14b_fp8_static.yaml | 1 - .../deepseek_r1_distill_qwen-1_5b_fp8_static.yaml | 1 - .../deepseek_r1_distill_qwen-32b_fp8_static.yaml | 1 - .../deepseek_r1_distill_qwen-7b_fp8_static.yaml | 1 - .../deepseek_r1_distill_qwen-14b_int4_awq.yaml | 1 - .../deepseek_r1_distill_qwen-1_5b_int4_awq.yaml | 1 - .../deepseek_r1_distill_qwen-32b_int4_awq.yaml | 1 - .../deepseek_r1_distill_qwen-7b_int4_awq.yaml | 1 - .../deepseek_r1_distill_qwen-32b_int4_gptaq.yaml | 1 - .../deepseek_r1_distill_qwen-14b_int4_gptq.yaml | 1 - .../deepseek_r1_distill_qwen-1_5b_int4_gptq.yaml | 1 - .../deepseek_r1_distill_qwen-32b_int4_gptq.yaml | 1 - .../deepseek_r1_distill_qwen-7b_int4_gptq.yaml | 1 - .../hunyuan_0_5b_dense_fp8_dynamic.yaml | 1 - .../hunyuan_1_8b_dense_fp8_dynamic.yaml | 1 - .../fp8_dynamic/hunyuan_4b_dense_fp8_dynamic.yaml | 1 - .../fp8_dynamic/hunyuan_7b_dense_fp8_dynamic.yaml | 1 - .../fp8_dynamic/hunyuan_a13b_fp8_dynamic.yaml | 1 - .../fp8_static/hunyuan_0_5b_dense_fp8_static.yaml | 1 - .../fp8_static/hunyuan_1_8b_dense_fp8_static.yaml | 1 - .../hunyuan_2b_dense_lepto_fp8_static.yaml | 1 - .../fp8_static/hunyuan_4b_dense_fp8_static.yaml | 1 - .../hunyuan_4b_dense_lepto_fp8_static.yaml | 1 - .../fp8_static/hunyuan_7b_dense_fp8_static.yaml | 1 - .../fp8_static/hunyuan_a13b_fp8_static.yaml | 1 - .../hunyuan_a13b_fp8_static_low_memory.yaml | 1 - .../hunyuan/int4_awq/hunyuan-a13b_int4_awq.yaml | 1 - .../int4_awq/hunyuan_0_5b_dense_int4_awq.yaml | 1 - .../int4_awq/hunyuan_1_8b_dense_int4_awq.yaml | 1 - .../int4_awq/hunyuan_4b_dense_int4_awq.yaml | 1 - .../int4_awq/hunyuan_7b_dense_int4_awq.yaml | 1 - .../int4_gptaq/hunyuan_7b_dense_int4_gptaq.yaml | 1 - .../int4_gptq/hunyuan_0_5b_dense_int4_gptq.yaml | 1 - .../int4_gptq/hunyuan_1_8b_dense_int4_gptq.yaml | 1 - .../int4_gptq/hunyuan_4b_dense_int4_gptq.yaml | 1 - .../int4_gptq/hunyuan_7b_dense_int4_gptq.yaml | 1 - .../hunyuan/int4_gptq/hunyuan_a13b_int4_gptq.yaml | 1 - configs/kimi_k2/fp8_static/kimi_k2_fp8_static.yaml | 1 - .../fp8_static/kimi_k2_fp8_static_low_memmory.yaml | 1 - .../qwen2_5-0_5b_instruct_fp8_dynamic.yaml | 1 - .../qwen2_5-14b_instruct_fp8_dynamic.yaml | 1 - .../qwen2_5-1_5b_instruct_fp8_dynamic.yaml | 1 - .../qwen2_5-32b_instruct_fp8_dynamic.yaml | 1 - .../qwen2_5-3b_instruct_fp8_dynamic.yaml | 1 - .../qwen2_5-72b_instruct_fp8_dynamic.yaml | 1 - .../qwen2_5-7b_instruct_fp8_dynamic.yaml | 1 - .../qwen2_5-0_5b_instruct_fp8_static.yaml | 1 - .../qwen2_5-14b_instruct_fp8_static.yaml | 1 - .../qwen2_5-1_5b_instruct_ados_fp8_static.yaml | 1 - .../qwen2_5-1_5b_instruct_fp8_static.yaml | 1 - .../qwen2_5-32b_instruct_fp8_static.yaml | 1 - .../fp8_static/qwen2_5-3b_instruct_fp8_static.yaml | 1 - .../qwen2_5-72b_instruct_fp8_static.yaml | 1 - .../qwen2_5-7b_fp8_static_low_memory.yaml | 1 - .../fp8_static/qwen2_5-7b_instruct_fp8_static.yaml | 1 - .../qwen2_5/int4_awq/qwen2_5-1_5b_int4_awq.yaml | 1 - configs/qwen2_5/int4_awq/qwen2_5-32b_int4_awq.yaml | 1 - configs/qwen2_5/int4_awq/qwen2_5-7b_int4_awq.yaml | 1 - .../qwen2_5/int4_gptaq/qwen2_5-32b_int4_gptaq.yaml | 1 - .../qwen2_5/int4_gptq/qwen2_5-1_5b_int4_gptq.yaml | 1 - .../qwen2_5/int4_gptq/qwen2_5-32b_int4_gptq.yaml | 1 - .../qwen2_5/int4_gptq/qwen2_5-7b_int4_gptq.yaml | 1 - .../qwen3/fp8_dynamic/qwen3-0_6b_fp8_dynamic.yaml | 1 - .../qwen3/fp8_dynamic/qwen3-14b_fp8_dynamic.yaml | 1 - .../qwen3/fp8_dynamic/qwen3-1_7b_fp8_dynamic.yaml | 1 - .../qwen3/fp8_dynamic/qwen3-32b_fp8_dynamic.yaml | 1 - .../qwen3/fp8_dynamic/qwen3-4b_fp8_dynamic.yaml | 1 - .../qwen3/fp8_dynamic/qwen3-8b_fp8_dynamic.yaml | 1 - .../qwen3/fp8_dynamic/qwen3-a22b_fp8_dynamic.yaml | 1 - .../qwen3/fp8_dynamic/qwen3-a3b_fp8_dynamic.yaml | 2 +- .../fp8_dynamic/qwen3_coder-a35b_fp8_dynamic.yaml | 1 - .../qwen3_coder-a35b_fp8_dynamic_low_memory.yaml | 1 - .../qwen3/fp8_static/qwen3-0_6b_fp8_static.yaml | 1 - .../fp8_static/qwen3-0_6b_fp8_static_analyse.yaml | 1 - .../fp8_static/qwen3-0_6b_lepto_fp8_static.yaml | 1 - configs/qwen3/fp8_static/qwen3-14b_fp8_static.yaml | 1 - .../qwen3/fp8_static/qwen3-1_7b_fp8_static.yaml | 1 - configs/qwen3/fp8_static/qwen3-32b_fp8_static.yaml | 1 - configs/qwen3/fp8_static/qwen3-4b_fp8_static.yaml | 1 - .../fp8_static/qwen3-4b_lepto_fp8_static.yaml | 1 - configs/qwen3/fp8_static/qwen3-8b_fp8_static.yaml | 1 - .../fp8_static/qwen3-8b_lepto_fp8_static.yaml | 1 - .../qwen3/fp8_static/qwen3-a22b_fp8_static.yaml | 1 - .../qwen3-a22b_fp8_static_low_memroy.yaml | 1 - configs/qwen3/fp8_static/qwen3-a3b_fp8_static.yaml | 1 - .../fp8_static/qwen3_coder-a35b_fp8_static.yaml | 1 - .../qwen3_coder-a35b_fp8_static_low_memory.yaml | 1 - configs/qwen3/int4_awq/qwen3-0_6b_int4_awq.yaml | 1 - configs/qwen3/int4_awq/qwen3-14b_int4_awq.yaml | 1 - configs/qwen3/int4_awq/qwen3-1_7b_int4_awq.yaml | 1 - configs/qwen3/int4_awq/qwen3-32b_int4_awq.yaml | 1 - configs/qwen3/int4_awq/qwen3-4b_int4_awq.yaml | 1 - configs/qwen3/int4_awq/qwen3-8b_int4_awq.yaml | 1 - configs/qwen3/int4_awq/qwen3-a22b_int4_awq.yaml | 1 - configs/qwen3/int4_awq/qwen3-a3b_int4_awq.yaml | 1 - configs/qwen3/int4_gptaq/qwen3-4b_int4_gptaq.yaml | 1 - configs/qwen3/int4_gptq/qwen3-0_6b_int4_gptq.yaml | 1 - configs/qwen3/int4_gptq/qwen3-14b_int4_gptq.yaml | 1 - configs/qwen3/int4_gptq/qwen3-1_7b_int4_gptq.yaml | 1 - configs/qwen3/int4_gptq/qwen3-32b_int4_gptq.yaml | 1 - configs/qwen3/int4_gptq/qwen3-4b_int4_gptq.yaml | 1 - configs/qwen3/int4_gptq/qwen3-8b_int4_gptq.yaml | 1 - configs/qwen3/int4_gptq/qwen3-a22b_int4_gptq.yaml | 1 - configs/qwen3/int4_gptq/qwen3-a3b_int4_gptq.yaml | 1 - .../int8_dynamic/qwen3-0_6b_int8_dynamic.yaml | 1 - .../qwen3/int8_dynamic/qwen3-14b_int8_dynamic.yaml | 1 - .../int8_dynamic/qwen3-1_7b_int8_dynamic.yaml | 1 - .../qwen3/int8_dynamic/qwen3-32b_int8_dynamic.yaml | 1 - .../qwen3/int8_dynamic/qwen3-4b_int8_dynamic.yaml | 1 - .../qwen3/int8_dynamic/qwen3-8b_int8_dynamic.yaml | 1 - .../int8_dynamic/qwen3-a22b_int8_dynamic.yaml | 1 - .../qwen3/int8_dynamic/qwen3-a3b_int8_dynamic.yaml | 1 - configs/qwen3/nvfp4/qwen3-0_6b_nvfp4.yaml | 1 - configs/qwen3/nvfp4/qwen3-14b_nvfp4.yaml | 1 - configs/qwen3/nvfp4/qwen3-1_7b_nvfp4.yaml | 1 - configs/qwen3/nvfp4/qwen3-32b_nvfp4.yaml | 1 - configs/qwen3/nvfp4/qwen3-4b_nvfp4.yaml | 1 - configs/qwen3/nvfp4/qwen3-8b_nvfp4.yaml | 1 - configs/qwen3/nvfp4/qwen3-a22b_nvfp4.yaml | 1 - .../qwen3-0_6b_int8_dynamic_smooth.yaml | 1 - .../smooth_int8/qwen3-14b_int8_dynamic_smooth.yaml | 1 - .../qwen3-1_7b_int8_dynamic_smooth.yaml | 1 - .../smooth_int8/qwen3-32b_int8_dynamic_smooth.yaml | 1 - .../smooth_int8/qwen3-4b_int8_dynamic_smooth.yaml | 1 - .../smooth_int8/qwen3-8b_int8_dynamic_smooth.yaml | 1 - configs/qwq/fp8_dynamic/qwq-32b_fp8_dynamic.yaml | 1 - configs/qwq/fp8_static/qwq-32b_fp8_static.yaml | 1 - configs/qwq/int4_awq/qwq-32b_int4_awq.yaml | 1 - configs/qwq/int4_gptq/qwq-32b_int4_gptq.yaml | 1 - .../seed_oss-36b_instruct_fp8_dynamic.yaml | 1 - .../seed_oss-36b_instruct_fp8_static.yaml | 1 - docs/source/deployment/deploy.md | 3 +-- docs/source/design/prepare_config.md | 1 - docs/source/features/quantization/fp8.md | 6 +----- docs/source/features/quantization/fp8_lepto.md | 1 - docs/source/features/quantization/int8.md | 4 +--- 147 files changed, 12 insertions(+), 160 deletions(-) diff --git a/angelslim/utils/config_parser.py b/angelslim/utils/config_parser.py index 8498ede4..d8375145 100644 --- a/angelslim/utils/config_parser.py +++ b/angelslim/utils/config_parser.py @@ -495,7 +495,7 @@ def get_default_config() -> FullConfig: quantization=QuantizationConfig( name="fp8_dynamic", bits=8, - ignore_layers=["lm_head", "model.embed_tokens"], + ignore_layers=["lm_head"], ), ), dataset_config=None, diff --git a/angelslim/utils/default_compress_config.py b/angelslim/utils/default_compress_config.py index 7ccd58f4..9d229e33 100644 --- a/angelslim/utils/default_compress_config.py +++ b/angelslim/utils/default_compress_config.py @@ -38,7 +38,7 @@ def default_fp8_dynamic_config() -> dict: name="fp8_dynamic", bits=8, quant_method={"weight": "per-tensor", "activation": "per-tensor"}, - ignore_layers=["lm_head", "model.embed_tokens"], + ignore_layers=["lm_head"], ), ), } @@ -58,7 +58,7 @@ def default_fp8_static_config() -> dict: name="fp8_static", bits=8, quant_method={"weight": "per-tensor", "activation": "per-tensor"}, - ignore_layers=["lm_head", "model.embed_tokens"], + ignore_layers=["lm_head"], ), ), } @@ -78,7 +78,7 @@ def default_int8_dynamic_config() -> dict: name="int8_dynamic", bits=8, quant_method={"weight": "per-channel", "activation": "per-token"}, - ignore_layers=["lm_head", "model.embed_tokens"], + ignore_layers=["lm_head"], ), ), } @@ -98,7 +98,7 @@ def default_int4_gptq_config() -> dict: name="int4_gptq", bits=4, quant_method={"weight": "per-group", "group_size": 128}, - ignore_layers=["lm_head", "model.embed_tokens"], + ignore_layers=["lm_head"], ), ), } @@ -123,7 +123,7 @@ def default_int4_awq_config() -> dict: "zero_point": True, "mse_range": False, }, - ignore_layers=["lm_head", "model.embed_tokens"], + ignore_layers=["lm_head"], ), ), } @@ -147,7 +147,7 @@ def default_w4a8_fp8_static_config() -> dict: "group_size": 128, "activation": "per-tensor", }, - ignore_layers=["lm_head", "model.embed_tokens"], + ignore_layers=["lm_head"], ), ), } @@ -167,7 +167,7 @@ def default_int4_gptaq_config() -> dict: name="int4_gptaq", bits=4, quant_method={"weight": "per-group", "group_size": 128}, - ignore_layers=["lm_head", "model.embed_tokens"], + ignore_layers=["lm_head"], ), ), } diff --git a/configs/deepseek_r1/fp8_static/deepseek_r1_fp8_static.yaml b/configs/deepseek_r1/fp8_static/deepseek_r1_fp8_static.yaml index 77f54890..78daf9d2 100644 --- a/configs/deepseek_r1/fp8_static/deepseek_r1_fp8_static.yaml +++ b/configs/deepseek_r1/fp8_static/deepseek_r1_fp8_static.yaml @@ -24,7 +24,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/deepseek_r1/fp8_static/deepseek_r1_fp8_static_low_memmory.yaml b/configs/deepseek_r1/fp8_static/deepseek_r1_fp8_static_low_memmory.yaml index 559d9c42..26acbb6b 100644 --- a/configs/deepseek_r1/fp8_static/deepseek_r1_fp8_static_low_memmory.yaml +++ b/configs/deepseek_r1/fp8_static/deepseek_r1_fp8_static_low_memmory.yaml @@ -24,7 +24,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/deepseek_r1/int4_awq/deepseek_r1_int4_awq.yaml b/configs/deepseek_r1/int4_awq/deepseek_r1_int4_awq.yaml index 09aed313..8ada0ac7 100644 --- a/configs/deepseek_r1/int4_awq/deepseek_r1_int4_awq.yaml +++ b/configs/deepseek_r1/int4_awq/deepseek_r1_int4_awq.yaml @@ -25,7 +25,6 @@ compression: mse_range: false ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" - "model.layers.61." # Dataset for calibration diff --git a/configs/deepseek_r1/w4a8_fp8/deepseek_r1_w4a8_fp8.yaml b/configs/deepseek_r1/w4a8_fp8/deepseek_r1_w4a8_fp8.yaml index 9370111e..d8c2fa8e 100644 --- a/configs/deepseek_r1/w4a8_fp8/deepseek_r1_w4a8_fp8.yaml +++ b/configs/deepseek_r1/w4a8_fp8/deepseek_r1_w4a8_fp8.yaml @@ -35,7 +35,6 @@ compression: - "mlp.down_proj" - "shared_expert" - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/deepseek_r1/w4a8_fp8/deepseek_r1_w4a8_fp8_low_memmory.yaml b/configs/deepseek_r1/w4a8_fp8/deepseek_r1_w4a8_fp8_low_memmory.yaml index 841ec3a4..9490e0d6 100644 --- a/configs/deepseek_r1/w4a8_fp8/deepseek_r1_w4a8_fp8_low_memmory.yaml +++ b/configs/deepseek_r1/w4a8_fp8/deepseek_r1_w4a8_fp8_low_memmory.yaml @@ -35,7 +35,6 @@ compression: - "mlp.down_proj" - "shared_expert" - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/deepseek_r1_distill_qwen/fp8_dynamic/deepseek_r1_distill_qwen-14b_fp8_dynamic.yaml b/configs/deepseek_r1_distill_qwen/fp8_dynamic/deepseek_r1_distill_qwen-14b_fp8_dynamic.yaml index ea23c648..db6bc576 100644 --- a/configs/deepseek_r1_distill_qwen/fp8_dynamic/deepseek_r1_distill_qwen-14b_fp8_dynamic.yaml +++ b/configs/deepseek_r1_distill_qwen/fp8_dynamic/deepseek_r1_distill_qwen-14b_fp8_dynamic.yaml @@ -23,4 +23,3 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" diff --git a/configs/deepseek_r1_distill_qwen/fp8_dynamic/deepseek_r1_distill_qwen-1_5b_fp8_dynamic.yaml b/configs/deepseek_r1_distill_qwen/fp8_dynamic/deepseek_r1_distill_qwen-1_5b_fp8_dynamic.yaml index 595ef0ec..593548bd 100644 --- a/configs/deepseek_r1_distill_qwen/fp8_dynamic/deepseek_r1_distill_qwen-1_5b_fp8_dynamic.yaml +++ b/configs/deepseek_r1_distill_qwen/fp8_dynamic/deepseek_r1_distill_qwen-1_5b_fp8_dynamic.yaml @@ -23,5 +23,4 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" diff --git a/configs/deepseek_r1_distill_qwen/fp8_dynamic/deepseek_r1_distill_qwen-32b_fp8_dynamic.yaml b/configs/deepseek_r1_distill_qwen/fp8_dynamic/deepseek_r1_distill_qwen-32b_fp8_dynamic.yaml index c48d7c51..3adb6948 100644 --- a/configs/deepseek_r1_distill_qwen/fp8_dynamic/deepseek_r1_distill_qwen-32b_fp8_dynamic.yaml +++ b/configs/deepseek_r1_distill_qwen/fp8_dynamic/deepseek_r1_distill_qwen-32b_fp8_dynamic.yaml @@ -23,4 +23,3 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" diff --git a/configs/deepseek_r1_distill_qwen/fp8_dynamic/deepseek_r1_distill_qwen-7b_fp8_dynamic.yaml b/configs/deepseek_r1_distill_qwen/fp8_dynamic/deepseek_r1_distill_qwen-7b_fp8_dynamic.yaml index 90b285cc..4c920e21 100644 --- a/configs/deepseek_r1_distill_qwen/fp8_dynamic/deepseek_r1_distill_qwen-7b_fp8_dynamic.yaml +++ b/configs/deepseek_r1_distill_qwen/fp8_dynamic/deepseek_r1_distill_qwen-7b_fp8_dynamic.yaml @@ -23,4 +23,3 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" diff --git a/configs/deepseek_r1_distill_qwen/fp8_static/deepseek_r1_distill_qwen-14b_fp8_static.yaml b/configs/deepseek_r1_distill_qwen/fp8_static/deepseek_r1_distill_qwen-14b_fp8_static.yaml index 1c2f229a..93c24aab 100644 --- a/configs/deepseek_r1_distill_qwen/fp8_static/deepseek_r1_distill_qwen-14b_fp8_static.yaml +++ b/configs/deepseek_r1_distill_qwen/fp8_static/deepseek_r1_distill_qwen-14b_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/deepseek_r1_distill_qwen/fp8_static/deepseek_r1_distill_qwen-1_5b_fp8_static.yaml b/configs/deepseek_r1_distill_qwen/fp8_static/deepseek_r1_distill_qwen-1_5b_fp8_static.yaml index 81cdece5..8bfb234b 100644 --- a/configs/deepseek_r1_distill_qwen/fp8_static/deepseek_r1_distill_qwen-1_5b_fp8_static.yaml +++ b/configs/deepseek_r1_distill_qwen/fp8_static/deepseek_r1_distill_qwen-1_5b_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/deepseek_r1_distill_qwen/fp8_static/deepseek_r1_distill_qwen-32b_fp8_static.yaml b/configs/deepseek_r1_distill_qwen/fp8_static/deepseek_r1_distill_qwen-32b_fp8_static.yaml index bb260230..fa4cf754 100644 --- a/configs/deepseek_r1_distill_qwen/fp8_static/deepseek_r1_distill_qwen-32b_fp8_static.yaml +++ b/configs/deepseek_r1_distill_qwen/fp8_static/deepseek_r1_distill_qwen-32b_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/deepseek_r1_distill_qwen/fp8_static/deepseek_r1_distill_qwen-7b_fp8_static.yaml b/configs/deepseek_r1_distill_qwen/fp8_static/deepseek_r1_distill_qwen-7b_fp8_static.yaml index d665e9c0..ff021393 100644 --- a/configs/deepseek_r1_distill_qwen/fp8_static/deepseek_r1_distill_qwen-7b_fp8_static.yaml +++ b/configs/deepseek_r1_distill_qwen/fp8_static/deepseek_r1_distill_qwen-7b_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/deepseek_r1_distill_qwen/int4_awq/deepseek_r1_distill_qwen-14b_int4_awq.yaml b/configs/deepseek_r1_distill_qwen/int4_awq/deepseek_r1_distill_qwen-14b_int4_awq.yaml index e023418b..85af4970 100644 --- a/configs/deepseek_r1_distill_qwen/int4_awq/deepseek_r1_distill_qwen-14b_int4_awq.yaml +++ b/configs/deepseek_r1_distill_qwen/int4_awq/deepseek_r1_distill_qwen-14b_int4_awq.yaml @@ -25,7 +25,6 @@ compression: mse_range: false ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/deepseek_r1_distill_qwen/int4_awq/deepseek_r1_distill_qwen-1_5b_int4_awq.yaml b/configs/deepseek_r1_distill_qwen/int4_awq/deepseek_r1_distill_qwen-1_5b_int4_awq.yaml index dd06815e..a6a797b8 100644 --- a/configs/deepseek_r1_distill_qwen/int4_awq/deepseek_r1_distill_qwen-1_5b_int4_awq.yaml +++ b/configs/deepseek_r1_distill_qwen/int4_awq/deepseek_r1_distill_qwen-1_5b_int4_awq.yaml @@ -25,7 +25,6 @@ compression: mse_range: false ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/deepseek_r1_distill_qwen/int4_awq/deepseek_r1_distill_qwen-32b_int4_awq.yaml b/configs/deepseek_r1_distill_qwen/int4_awq/deepseek_r1_distill_qwen-32b_int4_awq.yaml index 81f71b84..e314d380 100644 --- a/configs/deepseek_r1_distill_qwen/int4_awq/deepseek_r1_distill_qwen-32b_int4_awq.yaml +++ b/configs/deepseek_r1_distill_qwen/int4_awq/deepseek_r1_distill_qwen-32b_int4_awq.yaml @@ -25,7 +25,6 @@ compression: mse_range: false ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/deepseek_r1_distill_qwen/int4_awq/deepseek_r1_distill_qwen-7b_int4_awq.yaml b/configs/deepseek_r1_distill_qwen/int4_awq/deepseek_r1_distill_qwen-7b_int4_awq.yaml index 2c5ffde4..1eeef3ba 100644 --- a/configs/deepseek_r1_distill_qwen/int4_awq/deepseek_r1_distill_qwen-7b_int4_awq.yaml +++ b/configs/deepseek_r1_distill_qwen/int4_awq/deepseek_r1_distill_qwen-7b_int4_awq.yaml @@ -25,7 +25,6 @@ compression: mse_range: false ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/deepseek_r1_distill_qwen/int4_gptaq/deepseek_r1_distill_qwen-32b_int4_gptaq.yaml b/configs/deepseek_r1_distill_qwen/int4_gptaq/deepseek_r1_distill_qwen-32b_int4_gptaq.yaml index ffee696d..4c71092e 100644 --- a/configs/deepseek_r1_distill_qwen/int4_gptaq/deepseek_r1_distill_qwen-32b_int4_gptaq.yaml +++ b/configs/deepseek_r1_distill_qwen/int4_gptaq/deepseek_r1_distill_qwen-32b_int4_gptaq.yaml @@ -24,7 +24,6 @@ compression: group_size: 128 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/deepseek_r1_distill_qwen/int4_gptq/deepseek_r1_distill_qwen-14b_int4_gptq.yaml b/configs/deepseek_r1_distill_qwen/int4_gptq/deepseek_r1_distill_qwen-14b_int4_gptq.yaml index 7b37a8b2..cfc1df77 100644 --- a/configs/deepseek_r1_distill_qwen/int4_gptq/deepseek_r1_distill_qwen-14b_int4_gptq.yaml +++ b/configs/deepseek_r1_distill_qwen/int4_gptq/deepseek_r1_distill_qwen-14b_int4_gptq.yaml @@ -24,7 +24,6 @@ compression: group_size: 128 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/deepseek_r1_distill_qwen/int4_gptq/deepseek_r1_distill_qwen-1_5b_int4_gptq.yaml b/configs/deepseek_r1_distill_qwen/int4_gptq/deepseek_r1_distill_qwen-1_5b_int4_gptq.yaml index 9069cc8a..09671d08 100644 --- a/configs/deepseek_r1_distill_qwen/int4_gptq/deepseek_r1_distill_qwen-1_5b_int4_gptq.yaml +++ b/configs/deepseek_r1_distill_qwen/int4_gptq/deepseek_r1_distill_qwen-1_5b_int4_gptq.yaml @@ -23,7 +23,6 @@ compression: group_size: 128 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/deepseek_r1_distill_qwen/int4_gptq/deepseek_r1_distill_qwen-32b_int4_gptq.yaml b/configs/deepseek_r1_distill_qwen/int4_gptq/deepseek_r1_distill_qwen-32b_int4_gptq.yaml index 81d0de18..813baa06 100644 --- a/configs/deepseek_r1_distill_qwen/int4_gptq/deepseek_r1_distill_qwen-32b_int4_gptq.yaml +++ b/configs/deepseek_r1_distill_qwen/int4_gptq/deepseek_r1_distill_qwen-32b_int4_gptq.yaml @@ -24,7 +24,6 @@ compression: group_size: 128 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/deepseek_r1_distill_qwen/int4_gptq/deepseek_r1_distill_qwen-7b_int4_gptq.yaml b/configs/deepseek_r1_distill_qwen/int4_gptq/deepseek_r1_distill_qwen-7b_int4_gptq.yaml index 137b7704..030b53dd 100644 --- a/configs/deepseek_r1_distill_qwen/int4_gptq/deepseek_r1_distill_qwen-7b_int4_gptq.yaml +++ b/configs/deepseek_r1_distill_qwen/int4_gptq/deepseek_r1_distill_qwen-7b_int4_gptq.yaml @@ -23,7 +23,6 @@ compression: group_size: 128 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/hunyuan/fp8_dynamic/hunyuan_0_5b_dense_fp8_dynamic.yaml b/configs/hunyuan/fp8_dynamic/hunyuan_0_5b_dense_fp8_dynamic.yaml index f750f206..5608df68 100644 --- a/configs/hunyuan/fp8_dynamic/hunyuan_0_5b_dense_fp8_dynamic.yaml +++ b/configs/hunyuan/fp8_dynamic/hunyuan_0_5b_dense_fp8_dynamic.yaml @@ -23,4 +23,3 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" diff --git a/configs/hunyuan/fp8_dynamic/hunyuan_1_8b_dense_fp8_dynamic.yaml b/configs/hunyuan/fp8_dynamic/hunyuan_1_8b_dense_fp8_dynamic.yaml index 46303700..afd9c9de 100644 --- a/configs/hunyuan/fp8_dynamic/hunyuan_1_8b_dense_fp8_dynamic.yaml +++ b/configs/hunyuan/fp8_dynamic/hunyuan_1_8b_dense_fp8_dynamic.yaml @@ -23,4 +23,3 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" diff --git a/configs/hunyuan/fp8_dynamic/hunyuan_4b_dense_fp8_dynamic.yaml b/configs/hunyuan/fp8_dynamic/hunyuan_4b_dense_fp8_dynamic.yaml index c278ffcc..06157b9c 100644 --- a/configs/hunyuan/fp8_dynamic/hunyuan_4b_dense_fp8_dynamic.yaml +++ b/configs/hunyuan/fp8_dynamic/hunyuan_4b_dense_fp8_dynamic.yaml @@ -23,4 +23,3 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" diff --git a/configs/hunyuan/fp8_dynamic/hunyuan_7b_dense_fp8_dynamic.yaml b/configs/hunyuan/fp8_dynamic/hunyuan_7b_dense_fp8_dynamic.yaml index 0c7a69ee..13b42205 100644 --- a/configs/hunyuan/fp8_dynamic/hunyuan_7b_dense_fp8_dynamic.yaml +++ b/configs/hunyuan/fp8_dynamic/hunyuan_7b_dense_fp8_dynamic.yaml @@ -23,4 +23,3 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" diff --git a/configs/hunyuan/fp8_dynamic/hunyuan_a13b_fp8_dynamic.yaml b/configs/hunyuan/fp8_dynamic/hunyuan_a13b_fp8_dynamic.yaml index cb0f33ed..2c083dde 100644 --- a/configs/hunyuan/fp8_dynamic/hunyuan_a13b_fp8_dynamic.yaml +++ b/configs/hunyuan/fp8_dynamic/hunyuan_a13b_fp8_dynamic.yaml @@ -23,4 +23,3 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" diff --git a/configs/hunyuan/fp8_static/hunyuan_0_5b_dense_fp8_static.yaml b/configs/hunyuan/fp8_static/hunyuan_0_5b_dense_fp8_static.yaml index bb451bb1..b71dd47a 100644 --- a/configs/hunyuan/fp8_static/hunyuan_0_5b_dense_fp8_static.yaml +++ b/configs/hunyuan/fp8_static/hunyuan_0_5b_dense_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/hunyuan/fp8_static/hunyuan_1_8b_dense_fp8_static.yaml b/configs/hunyuan/fp8_static/hunyuan_1_8b_dense_fp8_static.yaml index bf15fb86..eb76c7fe 100644 --- a/configs/hunyuan/fp8_static/hunyuan_1_8b_dense_fp8_static.yaml +++ b/configs/hunyuan/fp8_static/hunyuan_1_8b_dense_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/hunyuan/fp8_static/hunyuan_2b_dense_lepto_fp8_static.yaml b/configs/hunyuan/fp8_static/hunyuan_2b_dense_lepto_fp8_static.yaml index e3cd159a..1647c90d 100644 --- a/configs/hunyuan/fp8_static/hunyuan_2b_dense_lepto_fp8_static.yaml +++ b/configs/hunyuan/fp8_static/hunyuan_2b_dense_lepto_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/hunyuan/fp8_static/hunyuan_4b_dense_fp8_static.yaml b/configs/hunyuan/fp8_static/hunyuan_4b_dense_fp8_static.yaml index 286786b9..fdaadea9 100644 --- a/configs/hunyuan/fp8_static/hunyuan_4b_dense_fp8_static.yaml +++ b/configs/hunyuan/fp8_static/hunyuan_4b_dense_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/hunyuan/fp8_static/hunyuan_4b_dense_lepto_fp8_static.yaml b/configs/hunyuan/fp8_static/hunyuan_4b_dense_lepto_fp8_static.yaml index b538b588..d3bd1ec1 100644 --- a/configs/hunyuan/fp8_static/hunyuan_4b_dense_lepto_fp8_static.yaml +++ b/configs/hunyuan/fp8_static/hunyuan_4b_dense_lepto_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/hunyuan/fp8_static/hunyuan_7b_dense_fp8_static.yaml b/configs/hunyuan/fp8_static/hunyuan_7b_dense_fp8_static.yaml index e2b1feec..703ce6f1 100644 --- a/configs/hunyuan/fp8_static/hunyuan_7b_dense_fp8_static.yaml +++ b/configs/hunyuan/fp8_static/hunyuan_7b_dense_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/hunyuan/fp8_static/hunyuan_a13b_fp8_static.yaml b/configs/hunyuan/fp8_static/hunyuan_a13b_fp8_static.yaml index 920a959a..b238e7e8 100644 --- a/configs/hunyuan/fp8_static/hunyuan_a13b_fp8_static.yaml +++ b/configs/hunyuan/fp8_static/hunyuan_a13b_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/hunyuan/fp8_static/hunyuan_a13b_fp8_static_low_memory.yaml b/configs/hunyuan/fp8_static/hunyuan_a13b_fp8_static_low_memory.yaml index 60686673..e3f0f582 100644 --- a/configs/hunyuan/fp8_static/hunyuan_a13b_fp8_static_low_memory.yaml +++ b/configs/hunyuan/fp8_static/hunyuan_a13b_fp8_static_low_memory.yaml @@ -24,7 +24,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/hunyuan/int4_awq/hunyuan-a13b_int4_awq.yaml b/configs/hunyuan/int4_awq/hunyuan-a13b_int4_awq.yaml index 38ac32b6..1c92f5df 100644 --- a/configs/hunyuan/int4_awq/hunyuan-a13b_int4_awq.yaml +++ b/configs/hunyuan/int4_awq/hunyuan-a13b_int4_awq.yaml @@ -25,7 +25,6 @@ compression: mse_range: false ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/hunyuan/int4_awq/hunyuan_0_5b_dense_int4_awq.yaml b/configs/hunyuan/int4_awq/hunyuan_0_5b_dense_int4_awq.yaml index eecd1245..31b3fa5e 100644 --- a/configs/hunyuan/int4_awq/hunyuan_0_5b_dense_int4_awq.yaml +++ b/configs/hunyuan/int4_awq/hunyuan_0_5b_dense_int4_awq.yaml @@ -25,7 +25,6 @@ compression: mse_range: false ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/hunyuan/int4_awq/hunyuan_1_8b_dense_int4_awq.yaml b/configs/hunyuan/int4_awq/hunyuan_1_8b_dense_int4_awq.yaml index 585112e8..1c8341f3 100644 --- a/configs/hunyuan/int4_awq/hunyuan_1_8b_dense_int4_awq.yaml +++ b/configs/hunyuan/int4_awq/hunyuan_1_8b_dense_int4_awq.yaml @@ -25,7 +25,6 @@ compression: mse_range: false ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/hunyuan/int4_awq/hunyuan_4b_dense_int4_awq.yaml b/configs/hunyuan/int4_awq/hunyuan_4b_dense_int4_awq.yaml index 150c891f..3d57150a 100644 --- a/configs/hunyuan/int4_awq/hunyuan_4b_dense_int4_awq.yaml +++ b/configs/hunyuan/int4_awq/hunyuan_4b_dense_int4_awq.yaml @@ -25,7 +25,6 @@ compression: mse_range: false ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/hunyuan/int4_awq/hunyuan_7b_dense_int4_awq.yaml b/configs/hunyuan/int4_awq/hunyuan_7b_dense_int4_awq.yaml index eecd1245..31b3fa5e 100644 --- a/configs/hunyuan/int4_awq/hunyuan_7b_dense_int4_awq.yaml +++ b/configs/hunyuan/int4_awq/hunyuan_7b_dense_int4_awq.yaml @@ -25,7 +25,6 @@ compression: mse_range: false ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/hunyuan/int4_gptaq/hunyuan_7b_dense_int4_gptaq.yaml b/configs/hunyuan/int4_gptaq/hunyuan_7b_dense_int4_gptaq.yaml index 30d6a40a..7eadc080 100644 --- a/configs/hunyuan/int4_gptaq/hunyuan_7b_dense_int4_gptaq.yaml +++ b/configs/hunyuan/int4_gptaq/hunyuan_7b_dense_int4_gptaq.yaml @@ -23,7 +23,6 @@ compression: group_size: 128 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" - "mlp.gate.wg" # Dataset for calibration diff --git a/configs/hunyuan/int4_gptq/hunyuan_0_5b_dense_int4_gptq.yaml b/configs/hunyuan/int4_gptq/hunyuan_0_5b_dense_int4_gptq.yaml index c36e3f9d..3011a3f4 100644 --- a/configs/hunyuan/int4_gptq/hunyuan_0_5b_dense_int4_gptq.yaml +++ b/configs/hunyuan/int4_gptq/hunyuan_0_5b_dense_int4_gptq.yaml @@ -23,7 +23,6 @@ compression: group_size: 128 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" - "mlp.gate.wg" # Dataset for calibration diff --git a/configs/hunyuan/int4_gptq/hunyuan_1_8b_dense_int4_gptq.yaml b/configs/hunyuan/int4_gptq/hunyuan_1_8b_dense_int4_gptq.yaml index dc94b315..a6c01405 100644 --- a/configs/hunyuan/int4_gptq/hunyuan_1_8b_dense_int4_gptq.yaml +++ b/configs/hunyuan/int4_gptq/hunyuan_1_8b_dense_int4_gptq.yaml @@ -23,7 +23,6 @@ compression: group_size: 128 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" - "mlp.gate.wg" # Dataset for calibration diff --git a/configs/hunyuan/int4_gptq/hunyuan_4b_dense_int4_gptq.yaml b/configs/hunyuan/int4_gptq/hunyuan_4b_dense_int4_gptq.yaml index 18627b08..8252cad3 100644 --- a/configs/hunyuan/int4_gptq/hunyuan_4b_dense_int4_gptq.yaml +++ b/configs/hunyuan/int4_gptq/hunyuan_4b_dense_int4_gptq.yaml @@ -23,7 +23,6 @@ compression: group_size: 128 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" - "mlp.gate.wg" # Dataset for calibration diff --git a/configs/hunyuan/int4_gptq/hunyuan_7b_dense_int4_gptq.yaml b/configs/hunyuan/int4_gptq/hunyuan_7b_dense_int4_gptq.yaml index b5ac009a..42be423d 100644 --- a/configs/hunyuan/int4_gptq/hunyuan_7b_dense_int4_gptq.yaml +++ b/configs/hunyuan/int4_gptq/hunyuan_7b_dense_int4_gptq.yaml @@ -23,7 +23,6 @@ compression: group_size: 128 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" - "mlp.gate.wg" # Dataset for calibration diff --git a/configs/hunyuan/int4_gptq/hunyuan_a13b_int4_gptq.yaml b/configs/hunyuan/int4_gptq/hunyuan_a13b_int4_gptq.yaml index c0f2cba9..785b83a2 100644 --- a/configs/hunyuan/int4_gptq/hunyuan_a13b_int4_gptq.yaml +++ b/configs/hunyuan/int4_gptq/hunyuan_a13b_int4_gptq.yaml @@ -23,7 +23,6 @@ compression: group_size: 128 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" - "mlp.gate.wg" # Dataset for calibration diff --git a/configs/kimi_k2/fp8_static/kimi_k2_fp8_static.yaml b/configs/kimi_k2/fp8_static/kimi_k2_fp8_static.yaml index 27550197..75f2a7ce 100644 --- a/configs/kimi_k2/fp8_static/kimi_k2_fp8_static.yaml +++ b/configs/kimi_k2/fp8_static/kimi_k2_fp8_static.yaml @@ -24,7 +24,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/kimi_k2/fp8_static/kimi_k2_fp8_static_low_memmory.yaml b/configs/kimi_k2/fp8_static/kimi_k2_fp8_static_low_memmory.yaml index 669ec8ce..d980abfd 100644 --- a/configs/kimi_k2/fp8_static/kimi_k2_fp8_static_low_memmory.yaml +++ b/configs/kimi_k2/fp8_static/kimi_k2_fp8_static_low_memmory.yaml @@ -24,7 +24,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen2_5/fp8_dynamic/qwen2_5-0_5b_instruct_fp8_dynamic.yaml b/configs/qwen2_5/fp8_dynamic/qwen2_5-0_5b_instruct_fp8_dynamic.yaml index e65710af..dc006c68 100644 --- a/configs/qwen2_5/fp8_dynamic/qwen2_5-0_5b_instruct_fp8_dynamic.yaml +++ b/configs/qwen2_5/fp8_dynamic/qwen2_5-0_5b_instruct_fp8_dynamic.yaml @@ -23,5 +23,4 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" diff --git a/configs/qwen2_5/fp8_dynamic/qwen2_5-14b_instruct_fp8_dynamic.yaml b/configs/qwen2_5/fp8_dynamic/qwen2_5-14b_instruct_fp8_dynamic.yaml index 376b803b..7c09f6d6 100644 --- a/configs/qwen2_5/fp8_dynamic/qwen2_5-14b_instruct_fp8_dynamic.yaml +++ b/configs/qwen2_5/fp8_dynamic/qwen2_5-14b_instruct_fp8_dynamic.yaml @@ -23,4 +23,3 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" diff --git a/configs/qwen2_5/fp8_dynamic/qwen2_5-1_5b_instruct_fp8_dynamic.yaml b/configs/qwen2_5/fp8_dynamic/qwen2_5-1_5b_instruct_fp8_dynamic.yaml index e120f9e7..1fd138ca 100644 --- a/configs/qwen2_5/fp8_dynamic/qwen2_5-1_5b_instruct_fp8_dynamic.yaml +++ b/configs/qwen2_5/fp8_dynamic/qwen2_5-1_5b_instruct_fp8_dynamic.yaml @@ -23,5 +23,4 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" diff --git a/configs/qwen2_5/fp8_dynamic/qwen2_5-32b_instruct_fp8_dynamic.yaml b/configs/qwen2_5/fp8_dynamic/qwen2_5-32b_instruct_fp8_dynamic.yaml index 3febd2b4..6bc718d5 100644 --- a/configs/qwen2_5/fp8_dynamic/qwen2_5-32b_instruct_fp8_dynamic.yaml +++ b/configs/qwen2_5/fp8_dynamic/qwen2_5-32b_instruct_fp8_dynamic.yaml @@ -23,4 +23,3 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" diff --git a/configs/qwen2_5/fp8_dynamic/qwen2_5-3b_instruct_fp8_dynamic.yaml b/configs/qwen2_5/fp8_dynamic/qwen2_5-3b_instruct_fp8_dynamic.yaml index 703cbf75..02eaafc8 100644 --- a/configs/qwen2_5/fp8_dynamic/qwen2_5-3b_instruct_fp8_dynamic.yaml +++ b/configs/qwen2_5/fp8_dynamic/qwen2_5-3b_instruct_fp8_dynamic.yaml @@ -23,4 +23,3 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" diff --git a/configs/qwen2_5/fp8_dynamic/qwen2_5-72b_instruct_fp8_dynamic.yaml b/configs/qwen2_5/fp8_dynamic/qwen2_5-72b_instruct_fp8_dynamic.yaml index 23198680..8a110d4f 100644 --- a/configs/qwen2_5/fp8_dynamic/qwen2_5-72b_instruct_fp8_dynamic.yaml +++ b/configs/qwen2_5/fp8_dynamic/qwen2_5-72b_instruct_fp8_dynamic.yaml @@ -23,4 +23,3 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" diff --git a/configs/qwen2_5/fp8_dynamic/qwen2_5-7b_instruct_fp8_dynamic.yaml b/configs/qwen2_5/fp8_dynamic/qwen2_5-7b_instruct_fp8_dynamic.yaml index 6965f756..380780c3 100644 --- a/configs/qwen2_5/fp8_dynamic/qwen2_5-7b_instruct_fp8_dynamic.yaml +++ b/configs/qwen2_5/fp8_dynamic/qwen2_5-7b_instruct_fp8_dynamic.yaml @@ -23,4 +23,3 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" diff --git a/configs/qwen2_5/fp8_static/qwen2_5-0_5b_instruct_fp8_static.yaml b/configs/qwen2_5/fp8_static/qwen2_5-0_5b_instruct_fp8_static.yaml index 8dcca5f5..df2d58a2 100644 --- a/configs/qwen2_5/fp8_static/qwen2_5-0_5b_instruct_fp8_static.yaml +++ b/configs/qwen2_5/fp8_static/qwen2_5-0_5b_instruct_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen2_5/fp8_static/qwen2_5-14b_instruct_fp8_static.yaml b/configs/qwen2_5/fp8_static/qwen2_5-14b_instruct_fp8_static.yaml index 7353bc93..3986d36d 100644 --- a/configs/qwen2_5/fp8_static/qwen2_5-14b_instruct_fp8_static.yaml +++ b/configs/qwen2_5/fp8_static/qwen2_5-14b_instruct_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen2_5/fp8_static/qwen2_5-1_5b_instruct_ados_fp8_static.yaml b/configs/qwen2_5/fp8_static/qwen2_5-1_5b_instruct_ados_fp8_static.yaml index 1a89a022..cc2f3e3a 100644 --- a/configs/qwen2_5/fp8_static/qwen2_5-1_5b_instruct_ados_fp8_static.yaml +++ b/configs/qwen2_5/fp8_static/qwen2_5-1_5b_instruct_ados_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen2_5/fp8_static/qwen2_5-1_5b_instruct_fp8_static.yaml b/configs/qwen2_5/fp8_static/qwen2_5-1_5b_instruct_fp8_static.yaml index 2428ad77..79330b91 100644 --- a/configs/qwen2_5/fp8_static/qwen2_5-1_5b_instruct_fp8_static.yaml +++ b/configs/qwen2_5/fp8_static/qwen2_5-1_5b_instruct_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen2_5/fp8_static/qwen2_5-32b_instruct_fp8_static.yaml b/configs/qwen2_5/fp8_static/qwen2_5-32b_instruct_fp8_static.yaml index 9dce5793..8788cfdd 100644 --- a/configs/qwen2_5/fp8_static/qwen2_5-32b_instruct_fp8_static.yaml +++ b/configs/qwen2_5/fp8_static/qwen2_5-32b_instruct_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen2_5/fp8_static/qwen2_5-3b_instruct_fp8_static.yaml b/configs/qwen2_5/fp8_static/qwen2_5-3b_instruct_fp8_static.yaml index 2cfe37c7..4d85a5a7 100644 --- a/configs/qwen2_5/fp8_static/qwen2_5-3b_instruct_fp8_static.yaml +++ b/configs/qwen2_5/fp8_static/qwen2_5-3b_instruct_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen2_5/fp8_static/qwen2_5-72b_instruct_fp8_static.yaml b/configs/qwen2_5/fp8_static/qwen2_5-72b_instruct_fp8_static.yaml index 801f11fa..532000dc 100644 --- a/configs/qwen2_5/fp8_static/qwen2_5-72b_instruct_fp8_static.yaml +++ b/configs/qwen2_5/fp8_static/qwen2_5-72b_instruct_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen2_5/fp8_static/qwen2_5-7b_fp8_static_low_memory.yaml b/configs/qwen2_5/fp8_static/qwen2_5-7b_fp8_static_low_memory.yaml index e002ba2d..99976656 100644 --- a/configs/qwen2_5/fp8_static/qwen2_5-7b_fp8_static_low_memory.yaml +++ b/configs/qwen2_5/fp8_static/qwen2_5-7b_fp8_static_low_memory.yaml @@ -24,7 +24,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen2_5/fp8_static/qwen2_5-7b_instruct_fp8_static.yaml b/configs/qwen2_5/fp8_static/qwen2_5-7b_instruct_fp8_static.yaml index 6ec52ba6..4bb51bc6 100644 --- a/configs/qwen2_5/fp8_static/qwen2_5-7b_instruct_fp8_static.yaml +++ b/configs/qwen2_5/fp8_static/qwen2_5-7b_instruct_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen2_5/int4_awq/qwen2_5-1_5b_int4_awq.yaml b/configs/qwen2_5/int4_awq/qwen2_5-1_5b_int4_awq.yaml index 73766a2d..f6caa104 100644 --- a/configs/qwen2_5/int4_awq/qwen2_5-1_5b_int4_awq.yaml +++ b/configs/qwen2_5/int4_awq/qwen2_5-1_5b_int4_awq.yaml @@ -25,7 +25,6 @@ compression: mse_range: false ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen2_5/int4_awq/qwen2_5-32b_int4_awq.yaml b/configs/qwen2_5/int4_awq/qwen2_5-32b_int4_awq.yaml index 461ffabf..014734b1 100644 --- a/configs/qwen2_5/int4_awq/qwen2_5-32b_int4_awq.yaml +++ b/configs/qwen2_5/int4_awq/qwen2_5-32b_int4_awq.yaml @@ -25,7 +25,6 @@ compression: mse_range: false ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen2_5/int4_awq/qwen2_5-7b_int4_awq.yaml b/configs/qwen2_5/int4_awq/qwen2_5-7b_int4_awq.yaml index e699958f..d0d9e43a 100644 --- a/configs/qwen2_5/int4_awq/qwen2_5-7b_int4_awq.yaml +++ b/configs/qwen2_5/int4_awq/qwen2_5-7b_int4_awq.yaml @@ -25,7 +25,6 @@ compression: mse_range: false ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen2_5/int4_gptaq/qwen2_5-32b_int4_gptaq.yaml b/configs/qwen2_5/int4_gptaq/qwen2_5-32b_int4_gptaq.yaml index 1486e6a7..0b217119 100644 --- a/configs/qwen2_5/int4_gptaq/qwen2_5-32b_int4_gptaq.yaml +++ b/configs/qwen2_5/int4_gptaq/qwen2_5-32b_int4_gptaq.yaml @@ -23,7 +23,6 @@ compression: group_size: 128 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen2_5/int4_gptq/qwen2_5-1_5b_int4_gptq.yaml b/configs/qwen2_5/int4_gptq/qwen2_5-1_5b_int4_gptq.yaml index 7448d001..0a5a0bc8 100644 --- a/configs/qwen2_5/int4_gptq/qwen2_5-1_5b_int4_gptq.yaml +++ b/configs/qwen2_5/int4_gptq/qwen2_5-1_5b_int4_gptq.yaml @@ -23,7 +23,6 @@ compression: group_size: 128 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen2_5/int4_gptq/qwen2_5-32b_int4_gptq.yaml b/configs/qwen2_5/int4_gptq/qwen2_5-32b_int4_gptq.yaml index 9cf2616c..04069f08 100644 --- a/configs/qwen2_5/int4_gptq/qwen2_5-32b_int4_gptq.yaml +++ b/configs/qwen2_5/int4_gptq/qwen2_5-32b_int4_gptq.yaml @@ -23,7 +23,6 @@ compression: group_size: 128 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen2_5/int4_gptq/qwen2_5-7b_int4_gptq.yaml b/configs/qwen2_5/int4_gptq/qwen2_5-7b_int4_gptq.yaml index 3d44b8fa..ef7d6d36 100644 --- a/configs/qwen2_5/int4_gptq/qwen2_5-7b_int4_gptq.yaml +++ b/configs/qwen2_5/int4_gptq/qwen2_5-7b_int4_gptq.yaml @@ -23,7 +23,6 @@ compression: group_size: 128 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/fp8_dynamic/qwen3-0_6b_fp8_dynamic.yaml b/configs/qwen3/fp8_dynamic/qwen3-0_6b_fp8_dynamic.yaml index a6307062..be31a3d7 100644 --- a/configs/qwen3/fp8_dynamic/qwen3-0_6b_fp8_dynamic.yaml +++ b/configs/qwen3/fp8_dynamic/qwen3-0_6b_fp8_dynamic.yaml @@ -23,4 +23,3 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" diff --git a/configs/qwen3/fp8_dynamic/qwen3-14b_fp8_dynamic.yaml b/configs/qwen3/fp8_dynamic/qwen3-14b_fp8_dynamic.yaml index f1650449..b9b32cca 100644 --- a/configs/qwen3/fp8_dynamic/qwen3-14b_fp8_dynamic.yaml +++ b/configs/qwen3/fp8_dynamic/qwen3-14b_fp8_dynamic.yaml @@ -23,5 +23,4 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" diff --git a/configs/qwen3/fp8_dynamic/qwen3-1_7b_fp8_dynamic.yaml b/configs/qwen3/fp8_dynamic/qwen3-1_7b_fp8_dynamic.yaml index 740ca098..198b224f 100644 --- a/configs/qwen3/fp8_dynamic/qwen3-1_7b_fp8_dynamic.yaml +++ b/configs/qwen3/fp8_dynamic/qwen3-1_7b_fp8_dynamic.yaml @@ -23,4 +23,3 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" diff --git a/configs/qwen3/fp8_dynamic/qwen3-32b_fp8_dynamic.yaml b/configs/qwen3/fp8_dynamic/qwen3-32b_fp8_dynamic.yaml index c977c8c9..b1b94ec6 100644 --- a/configs/qwen3/fp8_dynamic/qwen3-32b_fp8_dynamic.yaml +++ b/configs/qwen3/fp8_dynamic/qwen3-32b_fp8_dynamic.yaml @@ -23,4 +23,3 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" diff --git a/configs/qwen3/fp8_dynamic/qwen3-4b_fp8_dynamic.yaml b/configs/qwen3/fp8_dynamic/qwen3-4b_fp8_dynamic.yaml index 9538f9d5..bd142899 100644 --- a/configs/qwen3/fp8_dynamic/qwen3-4b_fp8_dynamic.yaml +++ b/configs/qwen3/fp8_dynamic/qwen3-4b_fp8_dynamic.yaml @@ -23,4 +23,3 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" diff --git a/configs/qwen3/fp8_dynamic/qwen3-8b_fp8_dynamic.yaml b/configs/qwen3/fp8_dynamic/qwen3-8b_fp8_dynamic.yaml index 3ebaf4a4..41bbb32e 100644 --- a/configs/qwen3/fp8_dynamic/qwen3-8b_fp8_dynamic.yaml +++ b/configs/qwen3/fp8_dynamic/qwen3-8b_fp8_dynamic.yaml @@ -23,4 +23,3 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" diff --git a/configs/qwen3/fp8_dynamic/qwen3-a22b_fp8_dynamic.yaml b/configs/qwen3/fp8_dynamic/qwen3-a22b_fp8_dynamic.yaml index 6002906c..a6a34793 100644 --- a/configs/qwen3/fp8_dynamic/qwen3-a22b_fp8_dynamic.yaml +++ b/configs/qwen3/fp8_dynamic/qwen3-a22b_fp8_dynamic.yaml @@ -23,4 +23,3 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" diff --git a/configs/qwen3/fp8_dynamic/qwen3-a3b_fp8_dynamic.yaml b/configs/qwen3/fp8_dynamic/qwen3-a3b_fp8_dynamic.yaml index 21dae972..8bf23913 100644 --- a/configs/qwen3/fp8_dynamic/qwen3-a3b_fp8_dynamic.yaml +++ b/configs/qwen3/fp8_dynamic/qwen3-a3b_fp8_dynamic.yaml @@ -17,11 +17,11 @@ compression: name: PTQ quantization: name: fp8_dynamic + save_name: fp8 bits: 8 quant_method: weight: "per-tensor" activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" diff --git a/configs/qwen3/fp8_dynamic/qwen3_coder-a35b_fp8_dynamic.yaml b/configs/qwen3/fp8_dynamic/qwen3_coder-a35b_fp8_dynamic.yaml index e201492c..51792cf7 100644 --- a/configs/qwen3/fp8_dynamic/qwen3_coder-a35b_fp8_dynamic.yaml +++ b/configs/qwen3/fp8_dynamic/qwen3_coder-a35b_fp8_dynamic.yaml @@ -23,4 +23,3 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" diff --git a/configs/qwen3/fp8_dynamic/qwen3_coder-a35b_fp8_dynamic_low_memory.yaml b/configs/qwen3/fp8_dynamic/qwen3_coder-a35b_fp8_dynamic_low_memory.yaml index 483b6c3c..d8b18fbf 100644 --- a/configs/qwen3/fp8_dynamic/qwen3_coder-a35b_fp8_dynamic_low_memory.yaml +++ b/configs/qwen3/fp8_dynamic/qwen3_coder-a35b_fp8_dynamic_low_memory.yaml @@ -24,4 +24,3 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" diff --git a/configs/qwen3/fp8_static/qwen3-0_6b_fp8_static.yaml b/configs/qwen3/fp8_static/qwen3-0_6b_fp8_static.yaml index df01f2e4..1c65248f 100644 --- a/configs/qwen3/fp8_static/qwen3-0_6b_fp8_static.yaml +++ b/configs/qwen3/fp8_static/qwen3-0_6b_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/fp8_static/qwen3-0_6b_fp8_static_analyse.yaml b/configs/qwen3/fp8_static/qwen3-0_6b_fp8_static_analyse.yaml index 4af82489..5d04c2c6 100644 --- a/configs/qwen3/fp8_static/qwen3-0_6b_fp8_static_analyse.yaml +++ b/configs/qwen3/fp8_static/qwen3-0_6b_fp8_static_analyse.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" quant_analyse: true # Dataset for calibration diff --git a/configs/qwen3/fp8_static/qwen3-0_6b_lepto_fp8_static.yaml b/configs/qwen3/fp8_static/qwen3-0_6b_lepto_fp8_static.yaml index fe876fab..f633e5e2 100644 --- a/configs/qwen3/fp8_static/qwen3-0_6b_lepto_fp8_static.yaml +++ b/configs/qwen3/fp8_static/qwen3-0_6b_lepto_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/fp8_static/qwen3-14b_fp8_static.yaml b/configs/qwen3/fp8_static/qwen3-14b_fp8_static.yaml index c7b7423d..eccc1002 100644 --- a/configs/qwen3/fp8_static/qwen3-14b_fp8_static.yaml +++ b/configs/qwen3/fp8_static/qwen3-14b_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/fp8_static/qwen3-1_7b_fp8_static.yaml b/configs/qwen3/fp8_static/qwen3-1_7b_fp8_static.yaml index 2f2424de..9456e67d 100644 --- a/configs/qwen3/fp8_static/qwen3-1_7b_fp8_static.yaml +++ b/configs/qwen3/fp8_static/qwen3-1_7b_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/fp8_static/qwen3-32b_fp8_static.yaml b/configs/qwen3/fp8_static/qwen3-32b_fp8_static.yaml index 01b881c7..278e9fd4 100644 --- a/configs/qwen3/fp8_static/qwen3-32b_fp8_static.yaml +++ b/configs/qwen3/fp8_static/qwen3-32b_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/fp8_static/qwen3-4b_fp8_static.yaml b/configs/qwen3/fp8_static/qwen3-4b_fp8_static.yaml index e5d3e617..0371c38e 100644 --- a/configs/qwen3/fp8_static/qwen3-4b_fp8_static.yaml +++ b/configs/qwen3/fp8_static/qwen3-4b_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/fp8_static/qwen3-4b_lepto_fp8_static.yaml b/configs/qwen3/fp8_static/qwen3-4b_lepto_fp8_static.yaml index 814c1462..11537ab3 100644 --- a/configs/qwen3/fp8_static/qwen3-4b_lepto_fp8_static.yaml +++ b/configs/qwen3/fp8_static/qwen3-4b_lepto_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/fp8_static/qwen3-8b_fp8_static.yaml b/configs/qwen3/fp8_static/qwen3-8b_fp8_static.yaml index 84067db1..cc013918 100644 --- a/configs/qwen3/fp8_static/qwen3-8b_fp8_static.yaml +++ b/configs/qwen3/fp8_static/qwen3-8b_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/fp8_static/qwen3-8b_lepto_fp8_static.yaml b/configs/qwen3/fp8_static/qwen3-8b_lepto_fp8_static.yaml index 108576de..b490a724 100644 --- a/configs/qwen3/fp8_static/qwen3-8b_lepto_fp8_static.yaml +++ b/configs/qwen3/fp8_static/qwen3-8b_lepto_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/fp8_static/qwen3-a22b_fp8_static.yaml b/configs/qwen3/fp8_static/qwen3-a22b_fp8_static.yaml index f0e86ddd..a2400d5f 100644 --- a/configs/qwen3/fp8_static/qwen3-a22b_fp8_static.yaml +++ b/configs/qwen3/fp8_static/qwen3-a22b_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/fp8_static/qwen3-a22b_fp8_static_low_memroy.yaml b/configs/qwen3/fp8_static/qwen3-a22b_fp8_static_low_memroy.yaml index c3e668c5..6bb891e4 100644 --- a/configs/qwen3/fp8_static/qwen3-a22b_fp8_static_low_memroy.yaml +++ b/configs/qwen3/fp8_static/qwen3-a22b_fp8_static_low_memroy.yaml @@ -24,7 +24,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/fp8_static/qwen3-a3b_fp8_static.yaml b/configs/qwen3/fp8_static/qwen3-a3b_fp8_static.yaml index 1154cba8..b5d28d4a 100644 --- a/configs/qwen3/fp8_static/qwen3-a3b_fp8_static.yaml +++ b/configs/qwen3/fp8_static/qwen3-a3b_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/fp8_static/qwen3_coder-a35b_fp8_static.yaml b/configs/qwen3/fp8_static/qwen3_coder-a35b_fp8_static.yaml index d50180cf..0e83a97f 100644 --- a/configs/qwen3/fp8_static/qwen3_coder-a35b_fp8_static.yaml +++ b/configs/qwen3/fp8_static/qwen3_coder-a35b_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/fp8_static/qwen3_coder-a35b_fp8_static_low_memory.yaml b/configs/qwen3/fp8_static/qwen3_coder-a35b_fp8_static_low_memory.yaml index e4ee611a..98bd390d 100644 --- a/configs/qwen3/fp8_static/qwen3_coder-a35b_fp8_static_low_memory.yaml +++ b/configs/qwen3/fp8_static/qwen3_coder-a35b_fp8_static_low_memory.yaml @@ -24,7 +24,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/int4_awq/qwen3-0_6b_int4_awq.yaml b/configs/qwen3/int4_awq/qwen3-0_6b_int4_awq.yaml index 9ff1dda0..48446042 100644 --- a/configs/qwen3/int4_awq/qwen3-0_6b_int4_awq.yaml +++ b/configs/qwen3/int4_awq/qwen3-0_6b_int4_awq.yaml @@ -25,7 +25,6 @@ compression: mse_range: false ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/int4_awq/qwen3-14b_int4_awq.yaml b/configs/qwen3/int4_awq/qwen3-14b_int4_awq.yaml index 32c35621..59b20a54 100644 --- a/configs/qwen3/int4_awq/qwen3-14b_int4_awq.yaml +++ b/configs/qwen3/int4_awq/qwen3-14b_int4_awq.yaml @@ -25,7 +25,6 @@ compression: mse_range: false ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/int4_awq/qwen3-1_7b_int4_awq.yaml b/configs/qwen3/int4_awq/qwen3-1_7b_int4_awq.yaml index 30904283..4912bcf2 100644 --- a/configs/qwen3/int4_awq/qwen3-1_7b_int4_awq.yaml +++ b/configs/qwen3/int4_awq/qwen3-1_7b_int4_awq.yaml @@ -25,7 +25,6 @@ compression: mse_range: false ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/int4_awq/qwen3-32b_int4_awq.yaml b/configs/qwen3/int4_awq/qwen3-32b_int4_awq.yaml index 7ad5abc9..cdbbe518 100644 --- a/configs/qwen3/int4_awq/qwen3-32b_int4_awq.yaml +++ b/configs/qwen3/int4_awq/qwen3-32b_int4_awq.yaml @@ -25,7 +25,6 @@ compression: mse_range: false ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration diff --git a/configs/qwen3/int4_awq/qwen3-4b_int4_awq.yaml b/configs/qwen3/int4_awq/qwen3-4b_int4_awq.yaml index 5301234a..9f01092d 100644 --- a/configs/qwen3/int4_awq/qwen3-4b_int4_awq.yaml +++ b/configs/qwen3/int4_awq/qwen3-4b_int4_awq.yaml @@ -25,7 +25,6 @@ compression: mse_range: false ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/int4_awq/qwen3-8b_int4_awq.yaml b/configs/qwen3/int4_awq/qwen3-8b_int4_awq.yaml index 91c5911d..c0fbc54c 100644 --- a/configs/qwen3/int4_awq/qwen3-8b_int4_awq.yaml +++ b/configs/qwen3/int4_awq/qwen3-8b_int4_awq.yaml @@ -25,7 +25,6 @@ compression: mse_range: false ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/int4_awq/qwen3-a22b_int4_awq.yaml b/configs/qwen3/int4_awq/qwen3-a22b_int4_awq.yaml index 7d93e4ef..40b5414f 100644 --- a/configs/qwen3/int4_awq/qwen3-a22b_int4_awq.yaml +++ b/configs/qwen3/int4_awq/qwen3-a22b_int4_awq.yaml @@ -25,7 +25,6 @@ compression: mse_range: false ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/int4_awq/qwen3-a3b_int4_awq.yaml b/configs/qwen3/int4_awq/qwen3-a3b_int4_awq.yaml index 197728f2..911036c4 100644 --- a/configs/qwen3/int4_awq/qwen3-a3b_int4_awq.yaml +++ b/configs/qwen3/int4_awq/qwen3-a3b_int4_awq.yaml @@ -25,7 +25,6 @@ compression: mse_range: false ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/int4_gptaq/qwen3-4b_int4_gptaq.yaml b/configs/qwen3/int4_gptaq/qwen3-4b_int4_gptaq.yaml index 24891219..57be001e 100644 --- a/configs/qwen3/int4_gptaq/qwen3-4b_int4_gptaq.yaml +++ b/configs/qwen3/int4_gptaq/qwen3-4b_int4_gptaq.yaml @@ -23,7 +23,6 @@ compression: group_size: 128 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/int4_gptq/qwen3-0_6b_int4_gptq.yaml b/configs/qwen3/int4_gptq/qwen3-0_6b_int4_gptq.yaml index 553a8481..1160321c 100644 --- a/configs/qwen3/int4_gptq/qwen3-0_6b_int4_gptq.yaml +++ b/configs/qwen3/int4_gptq/qwen3-0_6b_int4_gptq.yaml @@ -23,7 +23,6 @@ compression: group_size: 128 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/int4_gptq/qwen3-14b_int4_gptq.yaml b/configs/qwen3/int4_gptq/qwen3-14b_int4_gptq.yaml index e9671abd..1dc73998 100644 --- a/configs/qwen3/int4_gptq/qwen3-14b_int4_gptq.yaml +++ b/configs/qwen3/int4_gptq/qwen3-14b_int4_gptq.yaml @@ -23,7 +23,6 @@ compression: group_size: 128 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/int4_gptq/qwen3-1_7b_int4_gptq.yaml b/configs/qwen3/int4_gptq/qwen3-1_7b_int4_gptq.yaml index 39978220..19b602e2 100644 --- a/configs/qwen3/int4_gptq/qwen3-1_7b_int4_gptq.yaml +++ b/configs/qwen3/int4_gptq/qwen3-1_7b_int4_gptq.yaml @@ -23,7 +23,6 @@ compression: group_size: 128 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/int4_gptq/qwen3-32b_int4_gptq.yaml b/configs/qwen3/int4_gptq/qwen3-32b_int4_gptq.yaml index 979d1511..26037fe2 100644 --- a/configs/qwen3/int4_gptq/qwen3-32b_int4_gptq.yaml +++ b/configs/qwen3/int4_gptq/qwen3-32b_int4_gptq.yaml @@ -23,7 +23,6 @@ compression: group_size: 128 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/int4_gptq/qwen3-4b_int4_gptq.yaml b/configs/qwen3/int4_gptq/qwen3-4b_int4_gptq.yaml index 939a7568..4537bad7 100644 --- a/configs/qwen3/int4_gptq/qwen3-4b_int4_gptq.yaml +++ b/configs/qwen3/int4_gptq/qwen3-4b_int4_gptq.yaml @@ -23,7 +23,6 @@ compression: group_size: 128 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/int4_gptq/qwen3-8b_int4_gptq.yaml b/configs/qwen3/int4_gptq/qwen3-8b_int4_gptq.yaml index 6110fa2d..11d0ab31 100644 --- a/configs/qwen3/int4_gptq/qwen3-8b_int4_gptq.yaml +++ b/configs/qwen3/int4_gptq/qwen3-8b_int4_gptq.yaml @@ -23,7 +23,6 @@ compression: group_size: 128 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/int4_gptq/qwen3-a22b_int4_gptq.yaml b/configs/qwen3/int4_gptq/qwen3-a22b_int4_gptq.yaml index 03f54b4d..86bc2a5e 100644 --- a/configs/qwen3/int4_gptq/qwen3-a22b_int4_gptq.yaml +++ b/configs/qwen3/int4_gptq/qwen3-a22b_int4_gptq.yaml @@ -23,7 +23,6 @@ compression: group_size: 128 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/int4_gptq/qwen3-a3b_int4_gptq.yaml b/configs/qwen3/int4_gptq/qwen3-a3b_int4_gptq.yaml index 2e5ba5e2..13f99f60 100644 --- a/configs/qwen3/int4_gptq/qwen3-a3b_int4_gptq.yaml +++ b/configs/qwen3/int4_gptq/qwen3-a3b_int4_gptq.yaml @@ -23,7 +23,6 @@ compression: group_size: 128 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/int8_dynamic/qwen3-0_6b_int8_dynamic.yaml b/configs/qwen3/int8_dynamic/qwen3-0_6b_int8_dynamic.yaml index 75a70fe8..aa7fcdd2 100644 --- a/configs/qwen3/int8_dynamic/qwen3-0_6b_int8_dynamic.yaml +++ b/configs/qwen3/int8_dynamic/qwen3-0_6b_int8_dynamic.yaml @@ -23,4 +23,3 @@ compression: activation: "per-token" ignore_layers: - "lm_head" - - "model.embed_tokens" diff --git a/configs/qwen3/int8_dynamic/qwen3-14b_int8_dynamic.yaml b/configs/qwen3/int8_dynamic/qwen3-14b_int8_dynamic.yaml index a4692ff4..c47d075c 100644 --- a/configs/qwen3/int8_dynamic/qwen3-14b_int8_dynamic.yaml +++ b/configs/qwen3/int8_dynamic/qwen3-14b_int8_dynamic.yaml @@ -23,4 +23,3 @@ compression: activation: "per-token" ignore_layers: - "lm_head" - - "model.embed_tokens" diff --git a/configs/qwen3/int8_dynamic/qwen3-1_7b_int8_dynamic.yaml b/configs/qwen3/int8_dynamic/qwen3-1_7b_int8_dynamic.yaml index a8ee384b..0ec66eb6 100644 --- a/configs/qwen3/int8_dynamic/qwen3-1_7b_int8_dynamic.yaml +++ b/configs/qwen3/int8_dynamic/qwen3-1_7b_int8_dynamic.yaml @@ -23,4 +23,3 @@ compression: activation: "per-token" ignore_layers: - "lm_head" - - "model.embed_tokens" diff --git a/configs/qwen3/int8_dynamic/qwen3-32b_int8_dynamic.yaml b/configs/qwen3/int8_dynamic/qwen3-32b_int8_dynamic.yaml index 2a3090a2..0b0ad2e0 100644 --- a/configs/qwen3/int8_dynamic/qwen3-32b_int8_dynamic.yaml +++ b/configs/qwen3/int8_dynamic/qwen3-32b_int8_dynamic.yaml @@ -23,4 +23,3 @@ compression: activation: "per-token" ignore_layers: - "lm_head" - - "model.embed_tokens" diff --git a/configs/qwen3/int8_dynamic/qwen3-4b_int8_dynamic.yaml b/configs/qwen3/int8_dynamic/qwen3-4b_int8_dynamic.yaml index 4bd917b9..0e771c9d 100644 --- a/configs/qwen3/int8_dynamic/qwen3-4b_int8_dynamic.yaml +++ b/configs/qwen3/int8_dynamic/qwen3-4b_int8_dynamic.yaml @@ -23,4 +23,3 @@ compression: activation: "per-token" ignore_layers: - "lm_head" - - "model.embed_tokens" diff --git a/configs/qwen3/int8_dynamic/qwen3-8b_int8_dynamic.yaml b/configs/qwen3/int8_dynamic/qwen3-8b_int8_dynamic.yaml index b44ab5e7..14340fe2 100644 --- a/configs/qwen3/int8_dynamic/qwen3-8b_int8_dynamic.yaml +++ b/configs/qwen3/int8_dynamic/qwen3-8b_int8_dynamic.yaml @@ -23,4 +23,3 @@ compression: activation: "per-token" ignore_layers: - "lm_head" - - "model.embed_tokens" diff --git a/configs/qwen3/int8_dynamic/qwen3-a22b_int8_dynamic.yaml b/configs/qwen3/int8_dynamic/qwen3-a22b_int8_dynamic.yaml index b8ef3b77..987b51d0 100644 --- a/configs/qwen3/int8_dynamic/qwen3-a22b_int8_dynamic.yaml +++ b/configs/qwen3/int8_dynamic/qwen3-a22b_int8_dynamic.yaml @@ -23,4 +23,3 @@ compression: activation: "per-token" ignore_layers: - "lm_head" - - "model.embed_tokens" diff --git a/configs/qwen3/int8_dynamic/qwen3-a3b_int8_dynamic.yaml b/configs/qwen3/int8_dynamic/qwen3-a3b_int8_dynamic.yaml index bcd13955..6ccc3fce 100644 --- a/configs/qwen3/int8_dynamic/qwen3-a3b_int8_dynamic.yaml +++ b/configs/qwen3/int8_dynamic/qwen3-a3b_int8_dynamic.yaml @@ -23,4 +23,3 @@ compression: activation: "per-token" ignore_layers: - "lm_head" - - "model.embed_tokens" diff --git a/configs/qwen3/nvfp4/qwen3-0_6b_nvfp4.yaml b/configs/qwen3/nvfp4/qwen3-0_6b_nvfp4.yaml index f6ebcbba..cdd0b98e 100644 --- a/configs/qwen3/nvfp4/qwen3-0_6b_nvfp4.yaml +++ b/configs/qwen3/nvfp4/qwen3-0_6b_nvfp4.yaml @@ -24,7 +24,6 @@ compression: group_size: 16 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/nvfp4/qwen3-14b_nvfp4.yaml b/configs/qwen3/nvfp4/qwen3-14b_nvfp4.yaml index d5b2b91e..d9854fd4 100644 --- a/configs/qwen3/nvfp4/qwen3-14b_nvfp4.yaml +++ b/configs/qwen3/nvfp4/qwen3-14b_nvfp4.yaml @@ -24,7 +24,6 @@ compression: group_size: 16 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/nvfp4/qwen3-1_7b_nvfp4.yaml b/configs/qwen3/nvfp4/qwen3-1_7b_nvfp4.yaml index 55cee8cc..94f82e06 100644 --- a/configs/qwen3/nvfp4/qwen3-1_7b_nvfp4.yaml +++ b/configs/qwen3/nvfp4/qwen3-1_7b_nvfp4.yaml @@ -24,7 +24,6 @@ compression: group_size: 16 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/nvfp4/qwen3-32b_nvfp4.yaml b/configs/qwen3/nvfp4/qwen3-32b_nvfp4.yaml index 321a1bd7..e8ab625b 100644 --- a/configs/qwen3/nvfp4/qwen3-32b_nvfp4.yaml +++ b/configs/qwen3/nvfp4/qwen3-32b_nvfp4.yaml @@ -24,7 +24,6 @@ compression: group_size: 16 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/nvfp4/qwen3-4b_nvfp4.yaml b/configs/qwen3/nvfp4/qwen3-4b_nvfp4.yaml index addd696b..20ab8f30 100644 --- a/configs/qwen3/nvfp4/qwen3-4b_nvfp4.yaml +++ b/configs/qwen3/nvfp4/qwen3-4b_nvfp4.yaml @@ -24,7 +24,6 @@ compression: group_size: 16 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/nvfp4/qwen3-8b_nvfp4.yaml b/configs/qwen3/nvfp4/qwen3-8b_nvfp4.yaml index 0c80b721..13011b88 100644 --- a/configs/qwen3/nvfp4/qwen3-8b_nvfp4.yaml +++ b/configs/qwen3/nvfp4/qwen3-8b_nvfp4.yaml @@ -24,7 +24,6 @@ compression: group_size: 16 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/nvfp4/qwen3-a22b_nvfp4.yaml b/configs/qwen3/nvfp4/qwen3-a22b_nvfp4.yaml index dddac200..b1e9ace8 100644 --- a/configs/qwen3/nvfp4/qwen3-a22b_nvfp4.yaml +++ b/configs/qwen3/nvfp4/qwen3-a22b_nvfp4.yaml @@ -24,7 +24,6 @@ compression: group_size: 16 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/smooth_int8/qwen3-0_6b_int8_dynamic_smooth.yaml b/configs/qwen3/smooth_int8/qwen3-0_6b_int8_dynamic_smooth.yaml index 6f5bd79a..362fef85 100644 --- a/configs/qwen3/smooth_int8/qwen3-0_6b_int8_dynamic_smooth.yaml +++ b/configs/qwen3/smooth_int8/qwen3-0_6b_int8_dynamic_smooth.yaml @@ -25,7 +25,6 @@ compression: - "smooth" ignore_layers: - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/smooth_int8/qwen3-14b_int8_dynamic_smooth.yaml b/configs/qwen3/smooth_int8/qwen3-14b_int8_dynamic_smooth.yaml index 222780b6..e98f7889 100644 --- a/configs/qwen3/smooth_int8/qwen3-14b_int8_dynamic_smooth.yaml +++ b/configs/qwen3/smooth_int8/qwen3-14b_int8_dynamic_smooth.yaml @@ -25,7 +25,6 @@ compression: - "smooth" ignore_layers: - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/smooth_int8/qwen3-1_7b_int8_dynamic_smooth.yaml b/configs/qwen3/smooth_int8/qwen3-1_7b_int8_dynamic_smooth.yaml index ff697809..f487879c 100644 --- a/configs/qwen3/smooth_int8/qwen3-1_7b_int8_dynamic_smooth.yaml +++ b/configs/qwen3/smooth_int8/qwen3-1_7b_int8_dynamic_smooth.yaml @@ -25,7 +25,6 @@ compression: - "smooth" ignore_layers: - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/smooth_int8/qwen3-32b_int8_dynamic_smooth.yaml b/configs/qwen3/smooth_int8/qwen3-32b_int8_dynamic_smooth.yaml index 0981830e..b31d5375 100644 --- a/configs/qwen3/smooth_int8/qwen3-32b_int8_dynamic_smooth.yaml +++ b/configs/qwen3/smooth_int8/qwen3-32b_int8_dynamic_smooth.yaml @@ -25,7 +25,6 @@ compression: - "smooth" ignore_layers: - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/smooth_int8/qwen3-4b_int8_dynamic_smooth.yaml b/configs/qwen3/smooth_int8/qwen3-4b_int8_dynamic_smooth.yaml index 3954b2bd..c756f5a5 100644 --- a/configs/qwen3/smooth_int8/qwen3-4b_int8_dynamic_smooth.yaml +++ b/configs/qwen3/smooth_int8/qwen3-4b_int8_dynamic_smooth.yaml @@ -25,7 +25,6 @@ compression: - "smooth" ignore_layers: - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwen3/smooth_int8/qwen3-8b_int8_dynamic_smooth.yaml b/configs/qwen3/smooth_int8/qwen3-8b_int8_dynamic_smooth.yaml index 213a6c39..052c3395 100644 --- a/configs/qwen3/smooth_int8/qwen3-8b_int8_dynamic_smooth.yaml +++ b/configs/qwen3/smooth_int8/qwen3-8b_int8_dynamic_smooth.yaml @@ -25,7 +25,6 @@ compression: - "smooth" ignore_layers: - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwq/fp8_dynamic/qwq-32b_fp8_dynamic.yaml b/configs/qwq/fp8_dynamic/qwq-32b_fp8_dynamic.yaml index be79c662..5bb4278e 100644 --- a/configs/qwq/fp8_dynamic/qwq-32b_fp8_dynamic.yaml +++ b/configs/qwq/fp8_dynamic/qwq-32b_fp8_dynamic.yaml @@ -20,4 +20,3 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" diff --git a/configs/qwq/fp8_static/qwq-32b_fp8_static.yaml b/configs/qwq/fp8_static/qwq-32b_fp8_static.yaml index 9de17574..7a2a99e9 100644 --- a/configs/qwq/fp8_static/qwq-32b_fp8_static.yaml +++ b/configs/qwq/fp8_static/qwq-32b_fp8_static.yaml @@ -20,7 +20,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwq/int4_awq/qwq-32b_int4_awq.yaml b/configs/qwq/int4_awq/qwq-32b_int4_awq.yaml index 3e417225..d8cec5b2 100644 --- a/configs/qwq/int4_awq/qwq-32b_int4_awq.yaml +++ b/configs/qwq/int4_awq/qwq-32b_int4_awq.yaml @@ -25,7 +25,6 @@ compression: mse_range: false ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/qwq/int4_gptq/qwq-32b_int4_gptq.yaml b/configs/qwq/int4_gptq/qwq-32b_int4_gptq.yaml index 81003ab3..3450f241 100644 --- a/configs/qwq/int4_gptq/qwq-32b_int4_gptq.yaml +++ b/configs/qwq/int4_gptq/qwq-32b_int4_gptq.yaml @@ -23,7 +23,6 @@ compression: group_size: 128 ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/seed_oss/fp8_dynamic/seed_oss-36b_instruct_fp8_dynamic.yaml b/configs/seed_oss/fp8_dynamic/seed_oss-36b_instruct_fp8_dynamic.yaml index 59593380..d3a8ee90 100644 --- a/configs/seed_oss/fp8_dynamic/seed_oss-36b_instruct_fp8_dynamic.yaml +++ b/configs/seed_oss/fp8_dynamic/seed_oss-36b_instruct_fp8_dynamic.yaml @@ -23,5 +23,4 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" diff --git a/configs/seed_oss/fp8_static/seed_oss-36b_instruct_fp8_static.yaml b/configs/seed_oss/fp8_static/seed_oss-36b_instruct_fp8_static.yaml index da22b7f8..18a2b77e 100644 --- a/configs/seed_oss/fp8_static/seed_oss-36b_instruct_fp8_static.yaml +++ b/configs/seed_oss/fp8_static/seed_oss-36b_instruct_fp8_static.yaml @@ -23,7 +23,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/docs/source/deployment/deploy.md b/docs/source/deployment/deploy.md index 8b0fe910..c1983165 100644 --- a/docs/source/deployment/deploy.md +++ b/docs/source/deployment/deploy.md @@ -29,8 +29,7 @@ }, "format": "naive-quantized", "ignored_layers": [ - "lm_head", - "model.embed_tokens" + "lm_head" ], "quant_method": "compressed-tensors", "quantization_status": "compressed" diff --git a/docs/source/design/prepare_config.md b/docs/source/design/prepare_config.md index b5438ab8..fa748a13 100644 --- a/docs/source/design/prepare_config.md +++ b/docs/source/design/prepare_config.md @@ -27,7 +27,6 @@ compression: activation: "per-tensor" ignore_layers: - "lm_head" - - "model.embed_tokens" # 数据集相关配置 dataset: diff --git a/docs/source/features/quantization/fp8.md b/docs/source/features/quantization/fp8.md index b424989b..20f7717d 100644 --- a/docs/source/features/quantization/fp8.md +++ b/docs/source/features/quantization/fp8.md @@ -30,7 +30,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" ``` @@ -92,8 +91,7 @@ dataset: }, "format": "naive-quantized", "ignore": [ - "lm_head", - "model.embed_tokens" + "lm_head" ], "quant_method": "compressed-tensors", "quantization_status": "compressed" @@ -135,7 +133,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" ``` ## 开启FP8量化分析 @@ -159,7 +156,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" quant_analyse: true ``` diff --git a/docs/source/features/quantization/fp8_lepto.md b/docs/source/features/quantization/fp8_lepto.md index 4ee5d54c..d2ffa168 100644 --- a/docs/source/features/quantization/fp8_lepto.md +++ b/docs/source/features/quantization/fp8_lepto.md @@ -34,7 +34,6 @@ compression: activation: "per-tensor" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" ``` 激活静态量化需要指定校准数据集,例如使用`sharegpt`数据集: diff --git a/docs/source/features/quantization/int8.md b/docs/source/features/quantization/int8.md index dd9f14df..a2bd9909 100644 --- a/docs/source/features/quantization/int8.md +++ b/docs/source/features/quantization/int8.md @@ -30,7 +30,6 @@ compression: activation: "per-token" ignore_layers: # Skip quantization for these layers - "lm_head" - - "model.embed_tokens" ``` ## 产出模型 @@ -64,8 +63,7 @@ compression: }, "format": "int-quantized", "ignore": [ - "lm_head", - "model.embed_tokens" + "lm_head" ], "kv_cache_scheme": null, "quant_method": "compressed-tensors",