From 2fced721ae1318aa5b87f687ed0b84a13f26db66 Mon Sep 17 00:00:00 2001 From: StromNoNo Date: Thu, 21 Aug 2025 16:09:27 +0800 Subject: [PATCH 1/2] add int4_gptaq --- angelslim/compressor/quant/core/config.py | 2 +- .../compressor/quant/modules/__init__.py | 1 + .../quant/modules/gptq/gptaq_module.py | 236 ++++++++++++++++++ .../compressor/quant/modules/gptq/gptq.py | 52 +++- angelslim/compressor/quant/ptq.py | 8 +- angelslim/engine.py | 1 + angelslim/models/base_model.py | 1 + angelslim/models/vlm/qwen_vl.py | 1 + angelslim/utils/config_parser.py | 1 + angelslim/utils/default_compress_config.py | 21 ++ ...epseek_r1_distill_qwen-32b_int4_gptaq.yaml | 35 +++ .../hunyuan_7b_dense_int4_gptaq.yaml | 35 +++ .../int4_gptaq/qwen2_5-32b_int4_gptaq.yaml | 34 +++ .../qwen3/int4_gptaq/qwen3-4b_int4_gptaq.yaml | 34 +++ 14 files changed, 455 insertions(+), 7 deletions(-) create mode 100644 angelslim/compressor/quant/modules/gptq/gptaq_module.py create mode 100644 configs/deepseek_r1_distill_qwen/int4_gptaq/deepseek_r1_distill_qwen-32b_int4_gptaq.yaml create mode 100644 configs/hunyuan/int4_gptaq/hunyuan_7b_dense_int4_gptaq.yaml create mode 100644 configs/qwen2_5/int4_gptaq/qwen2_5-32b_int4_gptaq.yaml create mode 100644 configs/qwen3/int4_gptaq/qwen3-4b_int4_gptaq.yaml diff --git a/angelslim/compressor/quant/core/config.py b/angelslim/compressor/quant/core/config.py index 42e7fc29..a562859b 100644 --- a/angelslim/compressor/quant/core/config.py +++ b/angelslim/compressor/quant/core/config.py @@ -138,7 +138,7 @@ def __init__(self, config, global_config=None): self.hidden_size = global_config.hidden_size self.model_arch_type = global_config.model_arch_type self.low_memory = config.quantization.low_memory - elif "int4_gptq" in self.quant_algo: + elif "int4_gptq" in self.quant_algo or "int4_gptaq" in self.quant_algo: self.act_observer = None self.weight_observer = None self.kv_cache_observer = None diff --git a/angelslim/compressor/quant/modules/__init__.py 
class GPTAQModule:
    """Per-layer calibration state and solver for GPTAQ weight quantization.

    The object keeps two ``(columns, columns)`` running statistics for one
    layer: ``h``, accumulated from ``inp @ inp.T``, and ``dXXT``, accumulated
    from ``(native_inp - inp) @ inp.T``.  ``fasterquant`` consumes both to
    quantize the layer weight in place, column block by column block.
    """

    def __init__(self, layer, quant_bits=4):
        """
        GPTAQ quantization wrapper for neural network layers.

        Args:
            layer: Full-precision torch.nn.Module to quantize (Linear)
            quant_bits: Quantization bitwidth (2-8 bits, default=4)
        """
        super().__init__()
        self.layer = layer
        self.dev = self.layer.weight.device
        # Private copy of the weight; ``layer.weight`` itself is only
        # overwritten at the end of ``fasterquant``.
        self.w = layer.weight.data.clone()
        self.rows = self.w.shape[0]
        self.columns = self.w.shape[1]
        self.h = torch.zeros((self.columns, self.columns), device=self.dev)
        self.dXXT = torch.zeros((self.columns, self.columns), device=self.dev)
        self.nsamples = 0
        self.quant_bits = quant_bits

    def add_batch(self, inp, out, native_inp):
        """Fold one calibration batch into the running ``h`` / ``dXXT`` sums.

        Args:
            inp: Layer input observed during the current forward pass.
            out: Layer output; accepted for hook compatibility, not used.
            native_inp: Input to the same layer recorded by the caller in a
                separate pass; must have the same shape as ``inp``.
                NOTE(review): presumably the activation seen before earlier
                layers were quantized - confirm against the caller.
        """
        if len(inp.shape) == 4:
            inp = inp[0, 0, :, :]
            native_inp = native_inp[0, 0, :, :]
        inp = inp.squeeze()
        native_inp = native_inp.squeeze()
        if inp.dim() < 2:
            # ``squeeze`` collapses a single-token input such as (1, 1, C) to
            # 1-D (or 0-D); restore a (tokens, features) layout so the
            # statistics below stay (columns, columns).
            inp = inp.reshape(1, -1)
            native_inp = native_inp.reshape(1, -1)
        if len(inp.shape) == 2:
            inp = inp.unsqueeze(0)
            native_inp = native_inp.unsqueeze(0)
        tmp = inp.shape[0]
        if len(inp.shape) == 3:
            inp = inp.reshape((-1, inp.shape[-1]))
            native_inp = native_inp.reshape((-1, native_inp.shape[-1]))
        inp = inp.t()
        native_inp = native_inp.t()
        # Running-average update: shrink the old sums, then add the new batch.
        self.h *= self.nsamples / (self.nsamples + tmp)
        self.dXXT *= self.nsamples / (self.nsamples + tmp)
        self.nsamples += tmp
        batch_scale = math.sqrt(2 / self.nsamples)
        inp = batch_scale * inp.float()
        # Scale in float32 exactly like ``inp``: scaling in fp16/bf16 first
        # would add rounding error to the (native_inp - inp) difference.
        native_inp = batch_scale * native_inp.float()
        self.h += inp.matmul(inp.t())
        self.dXXT += (native_inp - inp).matmul(inp.t())

    def fasterquant(
        self,
        blocksize=128,
        percdamp=0.01,
        group_size=-1,
        actorder=True,
        sym=True,
    ):
        """Quantize the wrapped layer's weight in place.

        Args:
            blocksize: Number of columns processed per block.
            percdamp: Initial diagonal damping as a fraction of the mean
                Hessian diagonal; raised by 0.01 after each failed Cholesky
                attempt and must stay inside (0, 1).
            group_size: Columns per quantization group; ``-1`` means a single
                group spanning every column.
            actorder: Process columns in order of decreasing Hessian diagonal.
            sym: Forwarded to ``compute_scales_with_zero``.

        Returns:
            ``(scale, zero, g_idx)``: per-group scales and zero points
            concatenated along dim 1, and an int32 tensor mapping each
            original column to its group index.

        Raises:
            RuntimeError: If the Hessian contains NaN, or no damping value in
                (0, 1) makes it Cholesky-factorizable.
            ValueError: If ``group_size`` is neither ``-1`` nor positive.
        """
        w_weight = self.w.float()

        tick = time.time()

        hessian = self.h
        if torch.isnan(hessian).any():
            print_info("[error] Hessian contains nan!")
            # Raise instead of exit(): library code must not kill the process.
            raise RuntimeError("GPTAQ Hessian contains NaN; cannot quantize layer")
        del self.h
        # Columns that never received signal: pin the diagonal and zero the
        # matching weights / correction terms.
        dead = torch.diag(hessian) == 0
        hessian[dead, dead] = 1
        w_weight[:, dead] = 0
        self.dXXT[:, dead] = 0

        if group_size == -1:
            # One group over all columns (previously this path crashed).
            group_size = self.columns
        if group_size <= 0:
            raise ValueError(f"group_size must be -1 or positive, got {group_size}")

        # Static groups: scales/zeros come from the original column order,
        # before any activation-order permutation.
        # NOTE(review): each scale/zero is presumably (rows, 1) - it is
        # broadcast against a (rows, 1) column and concatenated on dim 1.
        scale = []
        zero = []
        for start in range(0, self.columns, group_size):
            weight_scale, weight_zero = compute_scales_with_zero(
                w_weight[:, start : start + group_size],
                bits=self.quant_bits,
                sym=sym,
            )
            scale.append(weight_scale)
            zero.append(weight_zero)

        if actorder:
            perm = torch.argsort(torch.diag(hessian), descending=True)
            w_weight = w_weight[:, perm]
            hessian = hessian[perm][:, perm]
            self.dXXT = self.dXXT[perm][:, perm]
            invperm = torch.argsort(perm)
        else:
            perm = torch.arange(self.columns, device=self.dev)

        # Group index of every column in processing order, computed once so
        # the inner loop does not index a device tensor per column.
        group_of_col = torch.div(perm, group_size, rounding_mode="floor")
        group_lookup = group_of_col.tolist()

        losses = torch.zeros_like(w_weight)
        q_weight = torch.zeros_like(w_weight)

        diag = torch.arange(self.columns, device=self.dev)
        while 0 < percdamp < 1:
            damp = percdamp * torch.mean(torch.diag(hessian))
            hessian[diag, diag] += damp
            try:
                # Keep ``hessian`` untouched until every step has succeeded so
                # a retry never starts from a half-factorized matrix.
                chol = torch.linalg.cholesky(hessian)
                hinv = torch.linalg.cholesky(torch.cholesky_inverse(chol), upper=True)
                break
            except torch.linalg.LinAlgError as e:
                print_info(e)
                print_info(f"Cholesky failed with percdamp={percdamp:.5f}")
                percdamp += 0.01
        else:
            raise RuntimeError(
                "Hessian could not be Cholesky-factorized with percdamp in (0, 1)"
            )
        del hessian, chol

        P = ((self.dXXT @ hinv.T).triu(diagonal=1)) @ hinv
        del self.dXXT

        maxq = 2**self.quant_bits - 1

        for i1 in range(0, self.columns, blocksize):
            i2 = min(i1 + blocksize, self.columns)
            count = i2 - i1

            w1 = w_weight[:, i1:i2].clone()
            q1 = torch.zeros_like(w1)
            err1 = torch.zeros_like(w1)
            losses1 = torch.zeros_like(w1)
            hinv1 = hinv[i1:i2, i1:i2]
            P1 = P[i1:i2, i1:i2]

            for i in range(count):
                w = w1[:, i]
                d = hinv1[i, i]

                group = group_lookup[i1 + i]
                weight_scale = scale[group]
                weight_zero = zero[group]

                # Quantize then dequantize the current column.
                q = torch.clamp(
                    torch.round(w.unsqueeze(1) / weight_scale) + weight_zero, 0, maxq
                )
                q = weight_scale * (q - weight_zero)
                q = q.flatten()
                q1[:, i] = q
                losses1[:, i] = (w - q) ** 2 / d**2

                # Push the error onto the remaining columns of this block,
                # including the P (dXXT-derived) correction term.
                err = (w - q) / d
                w1[:, i:] -= err.unsqueeze(1).matmul(
                    hinv1[i, i:].unsqueeze(0)
                ) - w.unsqueeze(1).matmul(P1[i, i:].unsqueeze(0))
                err1[:, i] = err

            q_weight[:, i1:i2] = q1
            losses[:, i1:i2] = losses1 / 2

            # Propagate the block's accumulated error to all later columns.
            w_weight[:, i2:] -= err1.matmul(hinv[i1:i2, i2:]) - w1.matmul(P[i1:i2, i2:])

        if torch.cuda.is_available():
            torch.cuda.synchronize()
        print_info(f" duration: {(time.time() - tick)}")
        print_info(f" avg loss: {torch.sum(losses).item() / max(self.nsamples, 1)}")

        g_idx = group_of_col.to(dtype=torch.int32, device=q_weight.device)
        if actorder:
            # Undo the activation-order permutation.
            q_weight = q_weight[:, invperm]
            g_idx = g_idx[invperm]

        norm_loss = torch.norm(
            q_weight.reshape(self.layer.weight.shape).type_as(self.layer.weight.data)
            - self.layer.weight.data
        )
        all_norm_loss = [norm_loss]

        print_info(" self.layer.weight: {}, {}".format(q_weight.shape, q_weight.sum()))
        print_info(f" norm loss: {list(map(get_tensor_item, all_norm_loss))}")

        self.layer.weight.data.copy_(
            q_weight.reshape(self.layer.weight.shape).type_as(self.layer.weight.data)
        )

        scale = torch.cat(scale, dim=1)
        zero = torch.cat(zero, dim=1)
        # Drop large temporaries directly; copying them to CPU first only
        # costs a device-to-host transfer.
        del losses, q_weight, w_weight, hinv, P
        del self.w
        torch.cuda.empty_cache()
        return scale, zero, g_idx

    def free(self):
        """Release every cached tensor held by this wrapper."""
        self.h = None
        self.w = None
        self.dXXT = None
        self.losses = None
        torch.cuda.empty_cache()
b/angelslim/compressor/quant/modules/gptq/gptq.py @@ -23,6 +23,7 @@ from .....utils import print_info from ...modules.catcher import Catcher from ...modules.helper_layer import GPTQQuantLinear +from .gptaq_module import GPTAQModule from .gptq_module import GPTQModule __all__ = ["GPTQ"] @@ -51,6 +52,8 @@ def __init__( self.dtype = next(iter(self.layers.parameters())).dtype self.quantizers = {} self.gptq = {} + self.quant_algo = self.model.quant_config.quant_algo + self.native_inp_caches = {} @torch.no_grad() def run(self, dataloader): @@ -86,6 +89,8 @@ def run(self, dataloader): torch.cuda.empty_cache() outs = torch.zeros_like(inps) + if "gptaq" in self.quant_algo: + native_inps = inps.clone().detach() # begin the gptq process print_info("Ready.") @@ -96,18 +101,61 @@ def run(self, dataloader): subset = self._find_layers(layer) print_info("subset:{}".format(subset)) self.gptq = {} + if "gptaq" in self.quant_algo: + self.native_inp_caches = {} print_info("GPTQMoe start layer {}".format(i)) for name in subset: if name in self.ignore_layers: continue - self.gptq[name] = GPTQModule(subset[name], quant_bits=self.quant_bits) + if "gptaq" in self.quant_algo: + self.native_inp_caches[name] = [] + self.gptq[name] = GPTAQModule( + subset[name], quant_bits=self.quant_bits + ) + else: + self.gptq[name] = GPTQModule( + subset[name], quant_bits=self.quant_bits + ) + + def pre_process_fwd_hook(layer_name): + def tmp(_, inp, out): + self.native_inp_caches[layer_name] += [inp[0].data] + del inp, out + + return tmp def add_batch(layer_name): def tmp(_, inp, out): - self.gptq[layer_name].add_batch(inp[0].data, out.data) + if "gptaq" in self.quant_algo: + native_inp = self.native_inp_caches[layer_name].pop(0) + self.gptq[layer_name].add_batch( + inp[0].data, out.data, native_inp + ) + else: + self.gptq[layer_name].add_batch(inp[0].data, out.data) return tmp + if "gptaq" in self.quant_algo: + native_handles = [] + for name in self.native_inp_caches: + native_handles.append( + 
subset[name].register_forward_hook(pre_process_fwd_hook(name)) + ) + + # being native hook + for j in range(nsamples): + with torch.no_grad(): + outs[j, :, :] = layer( + hidden_states=native_inps[j, :, :].unsqueeze(0), + **layer_kwargs, + )[0].squeeze(1) + native_inps = outs + + print_info("Native HOOK Step{}".format(j)) + for h in native_handles: + h.remove() + handles = [] for name in self.gptq: handles.append(subset[name].register_forward_hook(add_batch(name))) diff --git a/angelslim/compressor/quant/ptq.py b/angelslim/compressor/quant/ptq.py index 3f860b59..b3d0ee2b 100644 --- a/angelslim/compressor/quant/ptq.py +++ b/angelslim/compressor/quant/ptq.py @@ -45,7 +45,7 @@ def __init__(self, model, slim_config=None): self.ptq_hook = PTQHook(self.quant_model) self.ptq_hook.apply_hook() - if "gptq" in self.quant_algo: + if "gptq" in self.quant_algo or "gptaq" in self.quant_algo: max_seq_length = self.quant_model.quant_config.max_seq_length hidden_size = self.quant_model.quant_config.hidden_size self.gptq = GPTQ( @@ -105,7 +105,7 @@ def __init__(self, model, slim_config=None): ) def calibrate(self, dataloader): - if "gptq" in self.quant_algo: + if "gptq" in self.quant_algo or "gptaq" in self.quant_algo: self.gptq.run(dataloader) elif "awq" in self.quant_algo: self.awq.run(dataloader) @@ -123,7 +123,7 @@ def convert(self): Saves scales and inserts QDQ modules. 
""" print_info("Start convert model...") - if "gptq" in self.quant_algo: + if "gptq" in self.quant_algo or "gptaq" in self.quant_algo: self.gptq.convert() elif "awq" in self.quant_algo: self.awq.convert() @@ -166,7 +166,7 @@ def save(self, save_path: str): ) print_info("Start save PTQ ckpt to: {}".format(save_path)) - if "gptq" in self.quant_algo: + if "gptq" in self.quant_algo or "gptaq" in self.quant_algo: self.gptq.save(save_path) elif "awq" in self.quant_algo: self.awq.save(save_path) diff --git a/angelslim/engine.py b/angelslim/engine.py index 17ed47bd..adb06825 100644 --- a/angelslim/engine.py +++ b/angelslim/engine.py @@ -32,6 +32,7 @@ "int4_awq": default_compress_config.default_int4_awq_config(), "int4_gptq": default_compress_config.default_int4_gptq_config(), "w4a8_fp8": default_compress_config.default_w4a8_fp8_static_config(), + "int4_gptaq": default_compress_config.default_int4_gptaq_config(), } diff --git a/angelslim/models/base_model.py b/angelslim/models/base_model.py index b4642fe0..36d6524f 100644 --- a/angelslim/models/base_model.py +++ b/angelslim/models/base_model.py @@ -256,6 +256,7 @@ def model_forward(self, dataloader, **kwargs): if ( "gptq" in self.quant_config.quant_algo or "awq" in self.quant_config.quant_algo + or "gptaq" in self.quant_config.quant_algo ): device = "cuda:0" else: diff --git a/angelslim/models/vlm/qwen_vl.py b/angelslim/models/vlm/qwen_vl.py index 34c08abd..f72558a8 100644 --- a/angelslim/models/vlm/qwen_vl.py +++ b/angelslim/models/vlm/qwen_vl.py @@ -124,6 +124,7 @@ def model_forward(self, dataloader, **kwargs): if ( "gptq" in self.quant_config.quant_algo or "awq" in self.quant_config.quant_algo + or "gptaq" in self.quant_config.quant_algo ): device = "cuda:0" else: diff --git a/angelslim/utils/config_parser.py b/angelslim/utils/config_parser.py index 91b13406..fe1bec67 100644 --- a/angelslim/utils/config_parser.py +++ b/angelslim/utils/config_parser.py @@ -261,6 +261,7 @@ def __init__(self): "int4_gptq", "int8_dynamic", 
def default_int4_gptaq_config() -> dict:
    """
    Build the default configuration dictionary for INT4 GPTAQ compression.

    The result has two entries: ``"global_config"`` holding a fresh
    ``GlobalConfig`` and ``"compress_config"`` holding a ``"PTQ"``
    ``CompressionConfig`` whose quantization section is named
    ``"int4_gptaq"`` (4 bits, per-group weights with group size 128,
    ``lm_head`` and ``model.embed_tokens`` ignored).
    """
    global_settings = GlobalConfig()
    quantization_settings = QuantizationConfig(
        name="int4_gptaq",
        bits=4,
        quant_method={"weight": "per-group", "group_size": 128},
        ignore_layers=["lm_head", "model.embed_tokens"],
    )
    compression_settings = CompressionConfig(
        name="PTQ",
        quantization=quantization_settings,
    )
    return {
        "global_config": global_settings,
        "compress_config": compression_settings,
    }
"per-group" + group_size: 128 + ignore_layers: # Skip quantization for these layers + - "lm_head" + - "model.embed_tokens" + +# Dataset for calibration +dataset: + name: TextDataset + data_path: ./dataset/sharegpt_gpt4_qwen/sharegpt_gpt4-qwen3_a22B_output.jsonl + max_seq_length: 4096 + num_samples: 128 + batch_size: 1 diff --git a/configs/hunyuan/int4_gptaq/hunyuan_7b_dense_int4_gptaq.yaml b/configs/hunyuan/int4_gptaq/hunyuan_7b_dense_int4_gptaq.yaml new file mode 100644 index 00000000..30d6a40a --- /dev/null +++ b/configs/hunyuan/int4_gptaq/hunyuan_7b_dense_int4_gptaq.yaml @@ -0,0 +1,35 @@ +# Global configuration of pipeline +global: + save_path: ./output + +# Simplified Configuration for LLM compression +model: + name: HunyuanDense + model_path: tencent/Hunyuan-7B-Instruct + trust_remote_code: true + low_cpu_mem_usage: true + use_cache: false + torch_dtype: auto + device_map: cpu + +# Compression configuration +compression: + name: PTQ + quantization: + name: int4_gptaq # Supported: fp8_static, fp8_dynamic, int4_awq, int4_gptq + bits: 4 # Quantization bits (4/8) + quant_method: + weight: "per-group" + group_size: 128 + ignore_layers: # Skip quantization for these layers + - "lm_head" + - "model.embed_tokens" + - "mlp.gate.wg" + +# Dataset for calibration +dataset: + name: TextDataset + data_path: ./dataset/sharegpt_gpt4/sharegpt_gpt4_256.jsonl + max_seq_length: 8192 + num_samples: 128 + batch_size: 1 diff --git a/configs/qwen2_5/int4_gptaq/qwen2_5-32b_int4_gptaq.yaml b/configs/qwen2_5/int4_gptaq/qwen2_5-32b_int4_gptaq.yaml new file mode 100644 index 00000000..c4195be0 --- /dev/null +++ b/configs/qwen2_5/int4_gptaq/qwen2_5-32b_int4_gptaq.yaml @@ -0,0 +1,34 @@ +# Global configuration of pipeline +global: + save_path: ./output + +# Simplified Configuration for LLM compression +model: + name: Qwen + model_path: Qwen/Qwen2.5-32B-Instruct + trust_remote_code: true + low_cpu_mem_usage: true + use_cache: false + torch_dtype: auto + device_map: cpu + +# Compression 
configuration +compression: + name: PTQ + quantization: + name: int4_gptaq # Supported: fp8_static, fp8_dynamic, int4_awq, int4_gptq + bits: 4 # Quantization bits (4/8) + quant_method: + weight: "per-group" + group_size: 128 + ignore_layers: # Skip quantization for these layers + - "lm_head" + - "model.embed_tokens" + +# Dataset for calibration +dataset: + name: TextDataset + data_path: ./dataset/sharegpt_gpt4_qwen/sharegpt_gpt4-qwen3_a22B_output.jsonl + max_seq_length: 4096 + num_samples: 128 + batch_size: 1 diff --git a/configs/qwen3/int4_gptaq/qwen3-4b_int4_gptaq.yaml b/configs/qwen3/int4_gptaq/qwen3-4b_int4_gptaq.yaml new file mode 100644 index 00000000..24891219 --- /dev/null +++ b/configs/qwen3/int4_gptaq/qwen3-4b_int4_gptaq.yaml @@ -0,0 +1,34 @@ +# Global configuration of pipeline +global: + save_path: ./output + +# Simplified Configuration for LLM compression +model: + name: Qwen + model_path: Qwen/Qwen3-4B + trust_remote_code: true + low_cpu_mem_usage: true + use_cache: false + torch_dtype: auto + device_map: cpu + +# Compression configuration +compression: + name: PTQ + quantization: + name: int4_gptaq + bits: 4 + quant_method: + weight: "per-group" + group_size: 128 + ignore_layers: # Skip quantization for these layers + - "lm_head" + - "model.embed_tokens" + +# Dataset for calibration +dataset: + name: TextDataset + data_path: ./dataset/sharegpt_gpt4_qwen/sharegpt_gpt4-qwen3_a22B_output.jsonl + max_seq_length: 4096 + num_samples: 128 + batch_size: 1 From 1e838b2880f34d28055628b1f272e42f6a48b593 Mon Sep 17 00:00:00 2001 From: StromNoNo Date: Thu, 21 Aug 2025 17:32:42 +0800 Subject: [PATCH 2/2] change qwen2_5-32b-int4_gptaq config --- configs/qwen2_5/int4_gptaq/qwen2_5-32b_int4_gptaq.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/qwen2_5/int4_gptaq/qwen2_5-32b_int4_gptaq.yaml b/configs/qwen2_5/int4_gptaq/qwen2_5-32b_int4_gptaq.yaml index c4195be0..1486e6a7 100644 --- a/configs/qwen2_5/int4_gptaq/qwen2_5-32b_int4_gptaq.yaml 
+++ b/configs/qwen2_5/int4_gptaq/qwen2_5-32b_int4_gptaq.yaml @@ -16,7 +16,7 @@ model: compression: name: PTQ quantization: - name: int4_gptaq # Supported: fp8_static, fp8_dynamic, int4_awq, int4_gptq + name: int4_gptaq # Supported: fp8_static, fp8_dynamic, int4_awq, int4_gptq, int4_gptaq bits: 4 # Quantization bits (4/8) quant_method: weight: "per-group"