diff --git a/angelslim/compressor/quant/core/fp8_analyse_tools.py b/angelslim/compressor/quant/core/fp8_analyse_tools.py index 49eb9cb8..4c2fac86 100644 --- a/angelslim/compressor/quant/core/fp8_analyse_tools.py +++ b/angelslim/compressor/quant/core/fp8_analyse_tools.py @@ -158,7 +158,7 @@ def get_weight_dict(model_path): def draw_hist(uniform_data, ax, name): uniform_data.sort() s = pd.Series(uniform_data) - ax.hist(s, bins=50, rwidth=1) + ax.hist(s, bins=1000, rwidth=1) ax.set_title(name + "_histgram") ax.grid(True) diff --git a/angelslim/compressor/quant/modules/__init__.py b/angelslim/compressor/quant/modules/__init__.py index 6bfd736a..d655c9e7 100644 --- a/angelslim/compressor/quant/modules/__init__.py +++ b/angelslim/compressor/quant/modules/__init__.py @@ -14,6 +14,7 @@ from .awq.awq import AWQ # noqa: F401 from .fp8.fp8 import FP8 # noqa: F401 +from .fp8.lepto_fp8 import LeptoFP8 # noqa: F401 from .gptq.gptq import GPTQ # noqa: F401 from .gptq.gptq_module import GPTQModule # noqa: F401 from .helper_layer import GPTQQuantLinear # noqa: F401 diff --git a/angelslim/compressor/quant/modules/fp8/lepto_fp8.py b/angelslim/compressor/quant/modules/fp8/lepto_fp8.py new file mode 100644 index 00000000..fee006ba --- /dev/null +++ b/angelslim/compressor/quant/modules/fp8/lepto_fp8.py @@ -0,0 +1,261 @@ +# Copyright 2025 Tencent Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class LeptoFP8:
    """Calibrate per-layer FP8 activation scales with the Lepto search.

    ``run`` replays the captured calibration inputs through the decoder layers
    one at a time, records the inputs seen by every observed sub-layer and asks
    ``AutoLayerScale`` for an activation scale per layer group.  ``convert``
    then publishes the weight/activation scales on the wrapped model and swaps
    the quantized sub-layers for QDQ modules.
    """

    def __init__(
        self,
        ptq_hook,
        model,
        seq_length=2048,
        hidden_size=2560,
        model_arch_type=None,
        low_memory=False,
    ):
        """
        Args:
            ptq_hook(object, required): PTQ hook manager. ``convert`` reads its
                ``quant_layers_dict`` and ``observer_dict`` and calls
                ``remove_hook()``; it is also forwarded to the scale search.
            model(object, required): The model wrapper to be quantized. It must
                expose ``modal_type``, ``model.model.layers`` and
                ``quant_config.quant_bit``.
            seq_length(int, optional): The length of the sequence. Default: 2048.
            hidden_size(int, optional): The size of the hidden layer. Default: 2560.
            model_arch_type(str, optional): model arch type, forwarded to
                ``AutoLayerScale`` as ``model_type``. Default: None.
            low_memory(bool, optional): using low memory; stored on the instance
                and not read elsewhere in this class. Default: False.
        """
        super().__init__()
        self.ptq_hook = ptq_hook
        self.quant_model = model
        self.modal_type = self.quant_model.modal_type
        self.layers = self.quant_model.model.model.layers
        self.quant_bits = self.quant_model.quant_config.quant_bit
        self.seq_length = seq_length
        self.hidden_size = hidden_size
        self.model_arch_type = model_arch_type
        self.low_memory = low_memory
        self.dtype = torch.bfloat16
        # Maps "model.layers.<i>.<sub_layer>" -> searched activation scale.
        self.scales_dict = {}
        self.inps = None
        self.observer_layer_classes = [nn.Linear]
        self.scale_function = AutoLayerScale(
            model_type=self.model_arch_type,
            observer_layer_classes=self.observer_layer_classes,
        )

    def move_embed(self, model, device: str):
        """Move the token embedding and rotary embedding of *model* to *device*."""
        print_info(model)
        model.model.model.embed_tokens = model.model.model.embed_tokens.to(device)
        model.model.model.rotary_emb = model.model.model.rotary_emb.to(device)

    @staticmethod
    def _cache_input_hook(module, args, output, name, feat_dict):
        """Forward hook: store the first positional input of *module* on CPU."""
        feat_dict[name].append(args[0].detach().cpu())

    @staticmethod
    def _deduplicate_tensors(tensor_list):
        """Collapse adjacent duplicate captures into one tensor each.

        The list is expected to consist of consecutive pairs of equal tensors
        (a hook that fired twice per forward). The first tensor of every pair
        is returned, in order.

        Raises:
            ValueError: if the list has odd length or a pair is not equal.
        """
        if len(tensor_list) % 2 != 0:
            raise ValueError(
                f"expected an even number of captured tensors, got {len(tensor_list)}"
            )
        unique_tensors = []
        for idx in range(0, len(tensor_list), 2):
            if not torch.equal(tensor_list[idx], tensor_list[idx + 1]):
                raise ValueError(
                    f"captured tensors {idx} and {idx + 1} differ; "
                    "cannot deduplicate repeated hook calls"
                )
            unique_tensors.append(tensor_list[idx])
        # Once every pair has matched, each tensor in the list already equals
        # an entry of unique_tensors, so no further scan is needed.
        return unique_tensors

    @torch.no_grad()
    def run(self, dataloader):
        """Search an activation scale for every decoder layer.

        Captures the inputs of the first decoder layer with ``Catcher`` while
        ``model_forward`` consumes *dataloader*, then processes the decoder
        layers sequentially: each layer is moved to the compute device, the
        inputs of its observed sub-layers are recorded, and
        ``AutoLayerScale.auto_scale`` fills ``self.scales_dict``.
        """
        for decoder_layer in self.layers:
            decoder_layer.eval()
        layers = self.layers
        dev = get_best_device()
        nsamples = len(dataloader)
        self.inps = torch.zeros(
            (int(nsamples), self.seq_length, self.hidden_size),
            device=dev,
            dtype=self.dtype,
        )
        cache = {"i": 0}
        layers[0] = layers[0].to(dev)
        self.quant_model.model.model.embed_tokens = (
            self.quant_model.model.model.embed_tokens.to(dev)
        )
        # Temporarily wrap the first layer so the forward pass fills self.inps.
        layers[0] = Catcher(layers[0], self.inps, cache)
        self.quant_model.model_forward(dataloader)
        layer_kwargs = layers[0].layer_kwargs
        for key, value in list(layer_kwargs.items()):
            # position embeddings arrive as a tuple of tensors
            if isinstance(value, tuple):
                layer_kwargs[key] = tuple(
                    (
                        item.to(dev)
                        if isinstance(item, (torch.Tensor, nn.Module))
                        else item
                    )
                    for item in value
                )

        print_info("cache['i']:{}".format(cache["i"]))
        print_info(len(layers))
        layers[0] = layers[0].module
        print_info(self.inps.shape)
        outs = torch.zeros_like(self.inps)
        # begin the lepto process
        print_info("Ready.")
        layers = layers.cpu()
        torch.cuda.empty_cache()

        outs = outs.to("cpu")
        self.inps = self.inps.to("cpu")
        print_info(layer_kwargs)

        for i in range(len(layers)):
            if torch.cuda.is_available():
                print_info(
                    f"GPU Memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB"
                )

            layer = layers[i].to(dev)
            outs = outs.to(dev)
            self.inps = self.inps.to(dev)
            subset = self._find_layers(layer)

            if self.model_arch_type == "qwen3_moe":
                subset = {
                    **subset,
                    "mlp": layer.mlp,
                }

            # firstly, get input features of all observed layers
            input_feat = defaultdict(list)
            handles = [
                sub_layer.register_forward_hook(
                    functools.partial(
                        self._cache_input_hook,
                        name=name,
                        feat_dict=input_feat,
                    )
                )
                for name, sub_layer in subset.items()
            ]
            try:
                for j in range(min(self.inps.shape[0], nsamples)):
                    outs[j, :, :] = layer(
                        hidden_states=self.inps[j, :, :].unsqueeze(0), **layer_kwargs
                    )[0].squeeze(1)
                    print_info("HOOK Step{}".format(j))
            finally:
                # Never leave capture hooks attached, even if a forward fails.
                for handle in handles:
                    handle.remove()

            # remove duplicate captures (a hook that fired twice per sample)
            for key, feats in input_feat.items():
                if len(feats) > nsamples:
                    print_info(f"Warning: repetition hook {key}")
                    input_feat[key] = self._deduplicate_tensors(feats)

            # now solve for scaling and clipping
            input_feat = {k: torch.cat(v, dim=0) for k, v in input_feat.items()}

            # Clear GPU memory
            torch.cuda.empty_cache()

            scales_list = self.scale_function.auto_scale(
                self.ptq_hook, layer, input_feat, layer_kwargs
            )

            # Each entry is (sub-layer names, scale); the scale is shared by
            # every sub-layer named in the entry.
            for layer_names, scale in scales_list:
                for kn in layer_names:
                    name = "model.layers.{}.{}".format(i, kn)
                    self.scales_dict[name] = scale

            layers[i] = layers[i].cpu()
            layer = layer.cpu()
            torch.cuda.empty_cache()
            # The outputs of this layer are the inputs of the next one.
            self.inps, outs = outs, self.inps
            print_info("LEPTO FP8 end layer {}\n".format(i))

        print_info(self.scales_dict)

    def _find_layers(self, module, layers=None, name=""):
        """Return ``{dotted_name: sub_module}`` for all observed sub-modules.

        A module matches when its exact type (subclasses do not match) is in
        *layers*; *layers* defaults to ``self.observer_layer_classes``.
        """
        if not layers:
            layers = self.observer_layer_classes
        if type(module) in layers:
            return {name: module}
        res = {}
        for child_name, child in module.named_children():
            res.update(
                self._find_layers(
                    child,
                    layers=layers,
                    name=name + "." + child_name if name != "" else child_name,
                )
            )
        return res

    def convert(self):
        """Publish the scales on the model and insert the QDQ modules.

        Raises:
            KeyError: if a quantized layer has no entry in ``self.scales_dict``
                (``run`` is the only place that fills it).
        """
        # 1. get act and weight scale
        old_list = []
        new_list = []
        for name, sub_layer in self.ptq_hook.quant_layers_dict.items():
            if name not in self.scales_dict:
                raise KeyError(
                    f"no Lepto activation scale for layer '{name}'; "
                    "run() must populate scales_dict before convert()"
                )
            observer = self.ptq_hook.observer_dict[sub_layer]
            weight_scales = self.quant_model.get_weight_scales(
                sub_layer, observer.weight_observer
            )
            fp8_max = get_fp_maxval(bits=8).type(weight_scales.dtype)

            self.quant_model.weight_scales_dict[name] = weight_scales / fp8_max
            old_scale = observer.act_observer.scales()
            lepto_scale = torch.clamp(
                self.scales_dict.pop(name).squeeze().detach().to(old_scale.device),
                min=0,
                max=99999,
            )

            self.quant_model.act_scales_dict[name] = lepto_scale
            # Log the observer-based scale next to the searched one.
            print_info(
                f"{name} , {old_scale}, "
                f"{old_scale / fp8_max.item()} "
                f"{lepto_scale.item()}"
            )
            old_list.append(old_scale / fp8_max)
            new_list.append(lepto_scale)
        print_info(sum(old_list))
        print_info(sum(new_list))
        self.ptq_hook.remove_hook()
        torch.cuda.empty_cache()

        # 2. insert qdq module
        layers = self.quant_model.get_model()
        for name, sub_layer in self.ptq_hook.quant_layers_dict.items():
            parent_layer, sub_name = find_parent_layer_and_sub_name(layers, name)

            qdq_module = self.quant_model.get_qdq_module(sub_layer, name)
            setattr(parent_layer, sub_name, qdq_module)
        self.quant_model.quantized = True
class AutoLayerScale:
    """Search a per-tensor FP8 activation scale for the linear groups of a layer.

    For each group (attention qkv, attention output, MLP gate/up, MLP down) a
    set of candidate scales is derived from high quantiles of the recorded
    activations. The candidate whose quantize-dequantized input changes the
    output of the inspected block the least, according to ``loss_function``,
    is kept.
    """

    def __init__(
        self,
        loss_function=mse_loss,
        merge_samples=True,
        model_type="dense",
        observer_layer_classes=None,
    ):
        """
        Args:
            loss_function(callable, optional): called as
                ``loss_function(reference_out, candidate_out)`` to rank the
                candidate scales. Default: mse_loss.
            merge_samples(bool, optional): stored on the instance; the
                activation statistic is computed the same way for both values.
                Default: True.
            model_type(str, optional): stored on the instance. Default: "dense".
            observer_layer_classes(list, optional): stored on the instance.
                Default: None.
        """
        self.loss_function = loss_function
        self.merge_samples = merge_samples
        self.model_type = model_type
        self.layer_count = 0
        self.observer_layer_classes = observer_layer_classes
        self.n_exponent = 5
        # Number of steps between the cut point and the FP8 maximum (v2 search).
        self.search_step = 10

    def _search_group(self, module, layer_name, layers, inp, module2inspect, cache):
        """Search one scale for *layers* and return ``(op_names, scale)``."""
        if module2inspect is None:
            if len(layers) != 1:
                raise ValueError(
                    "module2inspect is required when several layers share a scale"
                )
            module2inspect = layers[0]

        inp = inp.to(layers[0].weight.device)
        # Despite its name this is the per-channel mean of |x|; it is only
        # logged here and passed through to search_by_block.
        act_abs_max = inp.abs().reshape(-1, inp.shape[-1]).mean(0).reshape(1, -1)
        print_info(f"[auto scale] {layer_name} act_abs_max: {act_abs_max}")

        scales = self.search_by_block(
            layer_name,
            inp,
            act_abs_max,
            layers,
            module2inspect,
            cache,
        )
        scales = scales.detach().cpu()
        print_info(f"[auto scale] {layer_name} scales: {scales}")
        # Drop the device copy before asking the allocator to release memory.
        del inp
        torch.cuda.empty_cache()

        return tuple(get_op_name(module, m) for m in layers), scales

    def auto_scale(self, ptq_hook, module, input_feat, cache):
        """Search the activation scales of one decoder layer.

        Args:
            ptq_hook: accepted for interface compatibility; not used here.
            module: decoder layer exposing ``self_attn.{q,k,v,o}_proj`` and
                ``mlp.{gate,up,down}_proj``.
            input_feat(dict): recorded inputs keyed by sub-layer name; the keys
                ``"self_attn.q_proj"`` and ``"mlp.gate_proj"`` are read.
            cache(dict): keyword arguments forwarded to the attention block.

        Returns:
            list: one ``(op_names, scale)`` tuple per group, in the order
            qkv, o, gate/up, down.
        """
        print_info("[auto scale] start")
        scales_list = []
        print_info(input_feat.keys())

        attn = module.self_attn
        # NOTE(review): the attention-output and down-projection scales are
        # searched with the *block* input (q_proj / gate_proj features), not
        # with the input of o_proj / down_proj itself - confirm this is intended.
        attention_groups = (
            ("attn.qkv", [attn.q_proj, attn.k_proj, attn.v_proj]),
            ("attn.o", [attn.o_proj]),
        )
        for layer_name, layers in attention_groups:
            scales_list.append(
                self._search_group(
                    module,
                    layer_name,
                    layers,
                    input_feat["self_attn.q_proj"],
                    attn,
                    cache,
                )
            )

        print_info("auto scale -> Denselepto")
        mlp = module.mlp
        mlp_groups = (
            ("mlp.gate_proj", [mlp.gate_proj, mlp.up_proj]),  # fc1
            ("mlp.down_proj", [mlp.down_proj]),  # fc2
        )
        for layer_name, layers in mlp_groups:
            scales_list.append(
                self._search_group(
                    module,
                    layer_name,
                    layers,
                    input_feat["mlp.gate_proj"],
                    mlp,
                    cache,
                )
            )

        self.layer_count += 1
        print_info("[auto scale] end")
        return scales_list

    def _get_out(self, layer_name, act, block, cache):
        """Run *block* on *act*; attention blocks also receive ``**cache``."""
        if "att" in layer_name:
            return block(act, **cache)[0].squeeze(1)
        return block(act)[0].squeeze(1)

    def lepto_qdq_fp8_tensor(self, tensor, ratio):
        """Derive a candidate scale from a high quantile of ``|tensor|``.

        The element at quantile ``0.999 + 0.00005 * ratio`` of ``|tensor|`` is
        looked up in the abs-max FP8 quantization of *tensor*; the returned
        scale maps the tensor maximum onto that quantized magnitude.

        Args:
            tensor(torch.Tensor): 1-D activation tensor.
            ratio(int): quantile step; larger values cut fewer outliers.
        """
        assert len(tensor.shape) == 1, f"tensor.device:{tensor.device}"
        abs_tensor = tensor.abs()  # computed once; the tensor can be very large
        abs_max = abs_tensor.max()
        w_scale = abs_max / get_fp_maxval(bits=8)

        orig_fp8w, _ = quantize_weight_per_tensor_fp8(tensor, w_scale)

        outlier_point = 0.999 + 0.00005 * ratio
        n = min(round(len(tensor) * outlier_point), len(tensor) - 1)
        cut_index = torch.argsort(abs_tensor)[n]

        cut_np_fp8w1 = orig_fp8w[cut_index].float().abs()

        adapt_scale = abs_max / cut_np_fp8w1.type(tensor.dtype)
        print_info(
            f"w_scale:{w_scale.item()}, adapt_scale:{adapt_scale.item()},"
            f" cut_np_fp8w1: {cut_np_fp8w1.item()}"
        )
        return adapt_scale.to(tensor.dtype)

    def lepto_qdq_fp8_tensor_v2(self, tensor, ratio):
        """Alternative candidate scale: step from the 0.999 cut to the FP8 max.

        The cut point is the quantized magnitude at quantile 0.999 (but at
        least one seventh of the FP8 maximum); the break point moves from
        there towards the FP8 maximum in ``self.search_step`` steps.
        """
        assert len(tensor.shape) == 1, f"tensor.device:{tensor.device}"
        abs_tensor = tensor.abs()
        abs_max = abs_tensor.max()
        w_scale = abs_max / get_fp_maxval(bits=8)

        orig_fp8w, _ = quantize_weight_per_tensor_fp8(tensor, w_scale)

        outlier_point = 0.999
        n = min(round(len(tensor) * outlier_point), len(tensor) - 1)
        cut_index = torch.argsort(abs_tensor)[n]

        cut_np_fp8w1 = max(
            orig_fp8w[cut_index].float().abs(), get_fp_maxval(bits=8) / 7
        )

        step = (get_fp_maxval(bits=8) - cut_np_fp8w1) / self.search_step
        break_point = min(cut_np_fp8w1 + (step * (ratio + 1)), get_fp_maxval(bits=8))

        adapt_scale = abs_max / break_point.type(tensor.dtype)
        print_info(
            f"{w_scale.item()}, {adapt_scale.item()}, "
            f"{abs_max.item()}, "
            f"{break_point.type(tensor.dtype).item()}"
        )
        return adapt_scale.to(tensor.dtype)

    def lepto_input_hook(self, module, input, scale):
        """Forward pre-hook: quantize-dequantize the first positional input.

        Any further positional inputs are passed through unchanged. Returns
        the new positional-argument tuple for the hooked module.
        """
        modified_input = tensor_quant_dequant_fp8(input[0], scale)
        return (modified_input, *input[1:])

    def search_by_block(
        self,
        layer_name,
        act_input,
        act_abs_max,
        layers,
        block,
        cache,
    ):
        """Pick the candidate scale that perturbs the output of *block* least.

        Args:
            layer_name(str): group name; names containing ``"att"`` make
                *block* receive ``**cache``.
            act_input(torch.Tensor): recorded inputs, indexed as
                ``act_input[sample, :, :]``.
            act_abs_max(torch.Tensor): only its shape is logged.
            layers(list): layers whose input is fake-quantized via a pre-hook.
            block(nn.Module): module whose output is compared.
            cache(dict): forwarded to attention blocks; when ``None`` the
                output width is taken from ``layers[0].weight``.

        Returns:
            torch.Tensor: the best scale on CPU (ones if no candidate produced
            a finite, comparable loss).
        """
        print_info(f"inp.shape:{act_input.shape}")
        print_info(f"block:{block}")
        print_info(f"act_abs_max.shape:{act_abs_max.shape}")
        act = act_input
        print_info("[lepto search] search input of %s" % layer_name)
        best_error = float("inf")
        best_ratio = -1
        best_scales = None
        num_samples = act.shape[0]

        with torch.no_grad():
            if cache is not None:
                out_shape = act.shape
            else:
                out_shape = (act.shape[0], act.shape[1], layers[0].weight.shape[0])
            origin_out = torch.ones(out_shape, dtype=act.dtype, device=act.device)
            new_out = torch.ones(out_shape, dtype=act.dtype, device=act.device)

            # Reference output with untouched inputs.
            for j in range(num_samples):
                origin_out[j, :, :] = self._get_out(
                    layer_name, act[j, :, :].unsqueeze(0), block, cache
                )
            print_info(f"origin_out.shape:{origin_out.shape}")

            # reshape (not view) so a non-contiguous activation tensor works too
            flat_act = act.reshape(-1)

            # ratio 8..20 corresponds to quantiles 0.9994..1.0 of |act|.
            # Only the inputs are fake-quantized through pre-hooks; the layer
            # weights are never modified, so nothing has to be restored.
            for ratio in range(8, 21):
                adapt_scale = self.lepto_qdq_fp8_tensor(flat_act, ratio).unsqueeze(0)
                handles = [
                    layer.register_forward_pre_hook(
                        functools.partial(self.lepto_input_hook, scale=adapt_scale)
                    )
                    for layer in layers
                ]
                try:
                    for j in range(num_samples):
                        new_out[j, :, :] = self._get_out(
                            layer_name, act[j, :, :].unsqueeze(0), block, cache
                        )
                finally:
                    # Hooks must not survive this candidate, even on failure.
                    for handle in handles:
                        handle.remove()

                loss = self.loss_function(origin_out, new_out).to(torch.float32)
                print_info(
                    "ratio: {}, adscale: {}, loss: {}".format(ratio, adapt_scale, loss)
                )
                if loss < best_error:
                    best_error = loss
                    best_ratio = ratio
                    best_scales = adapt_scale

            del origin_out
            del new_out

        if best_scales is None:
            best_scales = torch.ones(adapt_scale.shape, dtype=act.dtype)
            print_info("Cannot find better ratio.")
        else:
            print_info(
                "Best ratio :{}, minimal loss : {}, best_scales:{}.".format(
                    best_ratio, best_error, best_scales
                )
            )
        return best_scales.detach().cpu()
SmoothQuant +from .modules import AWQ, FP8, GPTQ, INT8, LeptoFP8, SmoothQuant __all__ = ["PTQ"] @@ -69,13 +69,23 @@ def __init__(self, model, slim_config=None): max_seq_length = self.quant_model.quant_config.max_seq_length hidden_size = self.quant_model.quant_config.hidden_size model_arch_type = self.quant_model.quant_config.model_arch_type - self.fp8 = FP8( - self.quant_model, - seq_length=max_seq_length, - hidden_size=hidden_size, - model_arch_type=model_arch_type, - low_memory=self.quant_model.quant_config.low_memory, - ) + if "lepto" in self.quant_algo: + self.fp8 = LeptoFP8( + self.ptq_hook, + self.quant_model, + seq_length=max_seq_length, + hidden_size=hidden_size, + model_arch_type=model_arch_type, + low_memory=self.quant_model.quant_config.low_memory, + ) + else: + self.fp8 = FP8( + self.quant_model, + seq_length=max_seq_length, + hidden_size=hidden_size, + model_arch_type=model_arch_type, + low_memory=self.quant_model.quant_config.low_memory, + ) if "int8" in self.quant_algo: max_seq_length = self.quant_model.quant_config.max_seq_length hidden_size = self.quant_model.quant_config.hidden_size @@ -117,6 +127,8 @@ def convert(self): self.gptq.convert() elif "awq" in self.quant_algo: self.awq.convert() + elif "lepto" in self.quant_algo: + self.fp8.convert() else: if self.modal_type in ["LLM", "VLM"]: if "smooth" in self.quant_helpers: @@ -139,7 +151,7 @@ def save(self, save_path: str): for k in self.quant_model.act_scales_dict.keys(): act_scales_data = self.quant_model.act_scales_dict[k].data if act_scales_data > 1.5: - print( + print_info( f"[AngelSlim Warning] Act_scales {k}: " f"The weight is too high:{act_scales_data}. 
" f"It is recommended to clip it to 1.5 " @@ -147,7 +159,7 @@ def save(self, save_path: str): for k in self.quant_model.weight_scales_dict.keys(): weight_scales_data = self.quant_model.weight_scales_dict[k].data if weight_scales_data > 1.5: - print( + print_info( f"[AngelSlim Warning] Weight_scales {k}: " f"The weight is too high:{weight_scales_data}. " f"It is recommended to clip it to 1.5 " diff --git a/angelslim/utils/config_parser.py b/angelslim/utils/config_parser.py index 55457e8d..91b13406 100644 --- a/angelslim/utils/config_parser.py +++ b/angelslim/utils/config_parser.py @@ -256,6 +256,7 @@ def __init__(self): self.supported_quant_methods = [ "fp8_static", "fp8_dynamic", + "fp8_lepto", "int4_awq", "int4_gptq", "int8_dynamic", @@ -395,6 +396,7 @@ def get_default_config() -> FullConfig: ), dataset_config=None, global_config=global_config, + infer_config=None, ) diff --git a/configs/hunyuan/fp8_static/hunyuan_2b_dense_lepto_fp8_static.yaml b/configs/hunyuan/fp8_static/hunyuan_2b_dense_lepto_fp8_static.yaml new file mode 100644 index 00000000..e3cd159a --- /dev/null +++ b/configs/hunyuan/fp8_static/hunyuan_2b_dense_lepto_fp8_static.yaml @@ -0,0 +1,34 @@ +# Global configuration of pipeline +global: + save_path: ./output + +# Simplified Configuration for LLM compression +model: + name: HunyuanDense + model_path: tencent/Hunyuan-1.8B-Instruct + trust_remote_code: true + low_cpu_mem_usage: true + use_cache: false + torch_dtype: auto + device_map: auto + +# Compression configuration +compression: + name: PTQ + quantization: + name: fp8_lepto # Supported: fp8_static, fp8_dynamic, int4_awq, int4_gptq + bits: 8 # Quantization bits (4/8) + quant_method: + weight: "per-tensor" + activation: "per-tensor" + ignore_layers: # Skip quantization for these layers + - "lm_head" + - "model.embed_tokens" + +# Dataset for calibration +dataset: + name: TextDataset + data_path: ./dataset/sharegpt_gpt4/sharegpt_gpt4_256.jsonl + max_seq_length: 4096 + num_samples: 32 + batch_size: 1 
diff --git a/configs/hunyuan/fp8_static/hunyuan_4b_dense_lepto_fp8_static.yaml b/configs/hunyuan/fp8_static/hunyuan_4b_dense_lepto_fp8_static.yaml new file mode 100644 index 00000000..b538b588 --- /dev/null +++ b/configs/hunyuan/fp8_static/hunyuan_4b_dense_lepto_fp8_static.yaml @@ -0,0 +1,34 @@ +# Global configuration of pipeline +global: + save_path: ./output + +# Simplified Configuration for LLM compression +model: + name: HunyuanDense + model_path: tencent/Hunyuan-4B-Instruct + trust_remote_code: true + low_cpu_mem_usage: true + use_cache: false + torch_dtype: auto + device_map: auto + +# Compression configuration +compression: + name: PTQ + quantization: + name: fp8_lepto # Supported: fp8_static, fp8_dynamic, int4_awq, int4_gptq + bits: 8 # Quantization bits (4/8) + quant_method: + weight: "per-tensor" + activation: "per-tensor" + ignore_layers: # Skip quantization for these layers + - "lm_head" + - "model.embed_tokens" + +# Dataset for calibration +dataset: + name: TextDataset + data_path: ./dataset/sharegpt_gpt4/sharegpt_gpt4_256.jsonl + max_seq_length: 4096 + num_samples: 32 + batch_size: 1 diff --git a/configs/qwen2_5/fp8_static/qwen2_5-1_5b_instruct_ados_fp8_static.yaml b/configs/qwen2_5/fp8_static/qwen2_5-1_5b_instruct_ados_fp8_static.yaml new file mode 100644 index 00000000..1a89a022 --- /dev/null +++ b/configs/qwen2_5/fp8_static/qwen2_5-1_5b_instruct_ados_fp8_static.yaml @@ -0,0 +1,34 @@ +# Global configuration of pipeline +global: + save_path: ./output + +# Simplified Configuration for LLM compression +model: + name: Qwen + model_path: Qwen/Qwen2.5-1.5B-Instruct + trust_remote_code: true + low_cpu_mem_usage: true + use_cache: false + torch_dtype: auto + device_map: auto + +# Compression configuration +compression: + name: PTQ + quantization: + name: fp8_lepto # Supported: fp8_static, fp8_dynamic, int4_awq, int4_gptq + bits: 8 # Quantization bits (4/8) + quant_method: + weight: "per-tensor" + activation: "per-tensor" + ignore_layers: # Skip quantization 
for these layers + - "lm_head" + - "model.embed_tokens" + +# Dataset for calibration +dataset: + name: TextDataset + data_path: ./dataset/sharegpt_gpt4/sharegpt_gpt4_256.jsonl + max_seq_length: 4096 + num_samples: 32 + batch_size: 1 diff --git a/configs/qwen3/fp8_static/qwen3-0_6b_lepto_fp8_static.yaml b/configs/qwen3/fp8_static/qwen3-0_6b_lepto_fp8_static.yaml new file mode 100644 index 00000000..fe876fab --- /dev/null +++ b/configs/qwen3/fp8_static/qwen3-0_6b_lepto_fp8_static.yaml @@ -0,0 +1,34 @@ +# Global configuration of pipeline +global: + save_path: ./output + +# Simplified Configuration for LLM compression +model: + name: Qwen + model_path: Qwen/Qwen3-0.6B + trust_remote_code: true + low_cpu_mem_usage: true + use_cache: false + torch_dtype: auto + device_map: auto + +# Compression configuration +compression: + name: PTQ + quantization: + name: fp8_lepto + bits: 8 + quant_method: + weight: "per-tensor" + activation: "per-tensor" + ignore_layers: # Skip quantization for these layers + - "lm_head" + - "model.embed_tokens" + +# Dataset for calibration +dataset: + name: TextDataset + data_path: ./dataset/sharegpt_gpt4/sharegpt_gpt4_256.jsonl + max_seq_length: 4096 + num_samples: 32 + batch_size: 1 diff --git a/configs/qwen3/fp8_static/qwen3-4b_lepto_fp8_static.yaml b/configs/qwen3/fp8_static/qwen3-4b_lepto_fp8_static.yaml new file mode 100644 index 00000000..814c1462 --- /dev/null +++ b/configs/qwen3/fp8_static/qwen3-4b_lepto_fp8_static.yaml @@ -0,0 +1,34 @@ +# Global configuration of pipeline +global: + save_path: ./output + +# Simplified Configuration for LLM compression +model: + name: Qwen + model_path: Qwen/Qwen3-4B + trust_remote_code: true + low_cpu_mem_usage: true + use_cache: false + torch_dtype: auto + device_map: auto + +# Compression configuration +compression: + name: PTQ + quantization: + name: fp8_lepto + bits: 8 + quant_method: + weight: "per-tensor" + activation: "per-tensor" + ignore_layers: # Skip quantization for these layers + - "lm_head" + 
- "model.embed_tokens" + +# Dataset for calibration +dataset: + name: TextDataset + data_path: ./dataset/sharegpt_gpt4/sharegpt_gpt4_256.jsonl + max_seq_length: 4096 + num_samples: 32 + batch_size: 1 diff --git a/configs/qwen3/fp8_static/qwen3-8b_lepto_fp8_static.yaml b/configs/qwen3/fp8_static/qwen3-8b_lepto_fp8_static.yaml new file mode 100644 index 00000000..108576de --- /dev/null +++ b/configs/qwen3/fp8_static/qwen3-8b_lepto_fp8_static.yaml @@ -0,0 +1,34 @@ +# Global configuration of pipeline +global: + save_path: ./output + +# Simplified Configuration for LLM compression +model: + name: Qwen + model_path: Qwen/Qwen3-8B + trust_remote_code: true + low_cpu_mem_usage: true + use_cache: false + torch_dtype: auto + device_map: auto + +# Compression configuration +compression: + name: PTQ + quantization: + name: fp8_lepto + bits: 8 + quant_method: + weight: "per-tensor" + activation: "per-tensor" + ignore_layers: # Skip quantization for these layers + - "lm_head" + - "model.embed_tokens" + +# Dataset for calibration +dataset: + name: TextDataset + data_path: ./dataset/sharegpt_gpt4/sharegpt_gpt4_256.jsonl + max_seq_length: 4096 + num_samples: 32 + batch_size: 1 diff --git a/docs/source/features/quantization/fp8_lepto.md b/docs/source/features/quantization/fp8_lepto.md new file mode 100644 index 00000000..4ee5d54c --- /dev/null +++ b/docs/source/features/quantization/fp8_lepto.md @@ -0,0 +1,63 @@ +(fp8_lepto)= + +# FP8_LEPTO量化 + +通常情况下PTQ统计Activation和Weight的abs Max值作为量化缩放系数。通过观察FP8 PTQ量化后有损的模型的数值分布发现,相较于量化无损模型会出现激活值分布方差过大的情况,这种数值分布会使得量化数值落在FP8难以表达的量化范围,导致模型在一些数学难题或文本格式要求严格的任务上损失过大。 + + +通过观察原始精度数值分布发现,该权重整体数值集中分布为尖峰分布,存在明显的outlier且大部分数据集中在0点附近,数值之间的相对距离较小导致计算过程中对于数值的精度要求更高。FP8 QDQ量化后的权重分布如右图所示,可以发现量化后的分布对比原始精度较为平滑。由于FP8-E4M3的数值表达在越靠近零点可表示的数值越多,趋近于正态分布的下的原始精度权重Worigin,通过传统FP8量化会导致原本数值较密集的数被平滑到了表达能力较差的FP8精度范围,导致精度表达能力下降带来效果损失。 + +针对上述FP8量化问题,我们推出了Leptokurtic 
Quant(LeptoQuant),一种通过隔离outlier将FP8权重映射范围集中至高精度区域的搜索策略。通常情况下对于权重和激活的量化难易度上激活值难度更高,因此我们着重优化激活的FP8。LEPTO将原始FP8的outlier值作为FP8精度表达上限,从而计算出新的Scale将数值分布压缩至高精度分布范围,使得量化后的激活值具有更好的精度表达。 + + +运行示例如下: + +```shell +python3 tools/run.py -c configs/qwen3/fp8_static/qwen3-0_6b_lepto_fp8_static.yaml +``` + +该配置文件中,量化相关参数如下: +- `name`:压缩策略。 +- `quantization.name`:压缩算法填`fp8_lepto`。 +- `quantization.bits`:fp8量化对应填写8bit。 +- `quantization.quant_method`:主要指定权重和激活的量化粒度为`per-tensor`。 +- `quantization.ignore_layers`:需要忽略不进行量化的线性层。 + +```yaml +compression: + name: PTQ + quantization: + name: fp8_lepto + bits: 8 + quant_method: + weight: "per-tensor" + activation: "per-tensor" + ignore_layers: # Skip quantization for these layers + - "lm_head" + - "model.embed_tokens" +``` + +激活静态量化需要指定校准数据集,例如使用`sharegpt`数据集: + +```yaml +dataset: + name: TextDataset + data_path: ./dataset/sharegpt_gpt4/sharegpt_gpt4_256.jsonl + max_seq_length: 4096 + num_samples: 256 + batch_size: 1 +``` + +数据集相关参数如下: +- `name`:校准数据集类型,文生文任务选择`TextDataset`。 +- `data_path`:校准数据JSONL文件位置。 +- `max_seq_length`:校准截断最大上下文长度。 +- `num_samples`:校准最大样本个数。 +- `batch_size`:校准批量大小。 + +支持数据格式详见[数据准备文档](../design/prepare_dataset.md)。 + + +## 产出模型&部署 + +同FP8 diff --git a/docs/source/features/quantization/index.md b/docs/source/features/quantization/index.md index d5dd153d..49636faa 100644 --- a/docs/source/features/quantization/index.md +++ b/docs/source/features/quantization/index.md @@ -11,4 +11,5 @@ fp8 int8 awq gptq +fp8_lepto ::: diff --git a/requirements.txt b/requirements.txt index c6e65511..e8c8abf2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,12 +2,12 @@ torch>=2.6.0 torchvision>=0.21.0 transformers>=4.52.0 safetensors>=0.5.3 -diffusers +diffusers==0.34.0 numpy tqdm pyarrow threadpoolctl -qwen_vl_utils +qwen_vl_utils==0.0.11 tiktoken triton datasets