From f4076175e8ab39d510a4a306572fcb9669b32caf Mon Sep 17 00:00:00 2001 From: ali-12388 <1940747290@qq.com> Date: Fri, 5 Sep 2025 10:34:28 +0800 Subject: [PATCH 1/2] DeepSeek;modify scale save;fix thinking data calibration --- angelslim/compressor/quant/core/__init__.py | 4 +- angelslim/compressor/quant/core/save.py | 186 +++++++++--------- .../compressor/quant/modules/helper_layer.py | 18 +- angelslim/compressor/quant/ptq.py | 3 +- angelslim/data/text_dataset.py | 18 ++ angelslim/models/llm/deepseek.py | 15 +- angelslim/models/llm/modeling_deepseek.py | 2 +- .../w4a8_fp8/deepseek_r1_w4a8_fp8.yaml | 2 + .../deepseek_r1_w4a8_fp8_low_memmory.yaml | 2 + 9 files changed, 142 insertions(+), 108 deletions(-) diff --git a/angelslim/compressor/quant/core/__init__.py b/angelslim/compressor/quant/core/__init__.py index 5dd61769..3bbaf481 100644 --- a/angelslim/compressor/quant/core/__init__.py +++ b/angelslim/compressor/quant/core/__init__.py @@ -18,8 +18,8 @@ from .packing_utils import dequantize_gemm, pack_weight_to_int8 # noqa: F401 from .quant_func import * # noqa: F401 F403 from .sample_func import EMASampler, MultiStepSampler # noqa: F401 -from .save import DeepseekV3HfPTQSave # noqa: F401 -from .save import DeepseekV3PTQSaveTRTLLM # noqa: F401 +from .save import DeepSeekV3PTQSaveMulti # noqa: F401 +from .save import DeepSeekV3PTQSaveSingle # noqa: F401 from .save import PTQDiffusionSave # noqa: F401 from .save import PTQOnlyScaleSave # noqa: F401 from .save import PTQPTMSave # noqa: F401 diff --git a/angelslim/compressor/quant/core/save.py b/angelslim/compressor/quant/core/save.py index 1c846de8..dd6c9fa9 100644 --- a/angelslim/compressor/quant/core/save.py +++ b/angelslim/compressor/quant/core/save.py @@ -328,7 +328,7 @@ def save(self, save_path): print_info("save weight scales done.") -class DeepseekV3HfPTQSave(PTQSaveBase): +class DeepSeekV3PTQSaveMulti(PTQSaveBase): def __init__(self, quant_model, check_scales=False): super().__init__(quant_model=quant_model) self.moe_act_scales_dict = {} @@ -371,7 +371,6 @@ def __init__(self, quant_model, check_scales=False): ".mlp.down_proj", ".mlp.shared_experts.down_proj", ] - self.exclude_key = ["*head*", "*kv_b*"] def save(self, save_path): save_path = os.path.join(save_path, "scales") @@ -411,7 +410,7 @@ def save(self, save_path): _save_path = os.path.join( save_path, "{}.weight_scale.{}.{}.pt".format(k, "int4", _index) ) - scale_int4 = max_value_group_wise / 7 + scale_int4 = max_value_group_wise / 8 # save weigth-int4-pergroup scale if "experts" in k and "shared_experts" not in k: @@ -464,7 +463,7 @@ def save(self, save_path): ) os.makedirs(save_model_path, exist_ok=True) - self.convert_scales_to_safetensors(save_path, save_model_path) + self.convert_scales_to_safetensors(save_path, tmp_path) print_info("convert scales to safetensors done.") file_name = self.merge_model( @@ -604,7 +603,7 @@ def merge_model(self, input_path, save_model_path, mp=16): model_save_ind = 0 localind = 0 - scale_path = os.path.join(save_model_path, "model-scales.safetensors") + scale_path = os.path.join(input_path, "model-scales.safetensors") scales_dict = load_file(scale_path) for mpind in range(mp): @@ -684,7 +683,6 @@ def merge_model(self, input_path, save_model_path, mp=16): param_list.append(param) newparam = torch.cat(param_list, dim=0) new_save_dict[k] = newparam - print_info(f"shape of {k}: {new_save_dict[k].shape}") index_dict["weight_map"][k] = str(filename) safe_save(new_save_dict, os.path.join(save_model_path, filename)) # process others @@ -715,13 +713,8 @@ def merge_model(self, input_path, save_model_path, mp=16): index_dict, filename, ) - print_info(f"shape of {k}: {new_save_dict[k].shape}") safe_save(new_save_dict, os.path.join(save_model_path, filename)) - # update scales map - for k, _ in scales_dict.items(): - index_dict["weight_map"][k] = "model-scales.safetensors" - path = self.quant_model.model.ori_model_path for file_path in glob(os.path.join(path, "*token*")): new_file_path = os.path.join(save_model_path, os.path.basename(file_path)) @@ -754,21 +747,25 @@ def merge_model(self, input_path, save_model_path, mp=16): quant_dict = { "quantization_config": { "quant_method": "w4a8_awq", - "kv_cache_quant_method": "fp8", + "weight_group_size": 128, "activation_scheme": ( "dynamic" if "dynamic" in a_quant_algo else "static" ), - "ignored_modules": [ + "kv_cache_quant_method": "fp8", + "ignored_layers": [ "*self_attn*", "*gate_up_proj", "*down_proj", "*layers.61*", ], "ignored_quantization_config": { - "quant_method": "fp8_block_scales", + "quant_method": "fp8", + "activation_scheme": "dynamic", + "fmt": "e4m3", "kv_cache_quant_method": "fp8", + "weight_block_size": [128, 128], }, - }, + } } else: raise NotImplementedError( @@ -813,40 +810,44 @@ def _transform_keys( filename, ): if "fp8" in self.quant_model.quant_config.quant_algo: - if "w4a8" in self.quant_model.quant_config.quant_algo: - if self._is_packed(param_name, param, scales_dict): - param = self._packed_weight( - param_name, - param, - self.quant_model.quant_config.quant_algo_info["w_group_size"], - scales_dict, - ) - param_name = param_name.replace("weight", "qweight") - else: + if not any( + substring in param_name + for substring in self.quant_model.quant_config.quant_algo_info[ + "ignore_layers" + ] + ): if param_name.endswith("weight_scale_inv"): return + weight_scale = scales_dict.get(f"{param_name}_scale", None) + if weight_scale is not None: + new_save_dict[f"{param_name}_scale"] = weight_scale + new_save_dict[f"{param_name[:-7]}.input_scale"] = scales_dict[ + f"{param_name[:-7]}.input_scale" + ] + index_dict["weight_map"][f"{param_name}_scale"] = str(filename) + index_dict["weight_map"][f"{param_name[:-7]}.input_scale"] = str( + filename + ) + if "w4a8" in self.quant_model.quant_config.quant_algo: + param = self._packed_weight( + param_name, + param, + self.quant_model.quant_config.quant_algo_info[ + "w_group_size" + ], + scales_dict, + ) + new_save_dict[f"{param_name}_scale.int4"] = scales_dict[ + f"{param_name}_scale.int4" + ] + index_dict["weight_map"][f"{param_name}_scale.int4"] = str( + filename + ) + param_name = param_name.replace("weight", "qweight") new_save_dict[param_name] = param index_dict["weight_map"][param_name] = str(filename) - def _is_packed(self, weight_name, weight, scales_dict): - if weight_name.endswith("weight_scale_inv") or self._is_exclude(weight_name): - return False - elif weight.element_size() == 1: - if f"{weight_name}_scale.int4" in scales_dict.keys(): - return True - return False - else: - return False - - def _is_exclude(self, tensor_name): - for pattern in self.exclude_key: - # Convert fnmatch-style pattern to regex - regex_pattern = pattern.replace("*", ".*").replace("?", ".") - if re.fullmatch(regex_pattern, tensor_name): - return True - return False - def _packed_weight(self, weight_name, weight, block_wise, scales_dict): target_shape = (weight.shape[0] // block_wise, weight.shape[1] // block_wise) scale_inv = scales_dict[f"{weight_name}_scale"] @@ -920,65 +921,68 @@ def add_mtp_weight(self, input_path=None, save_path=None, file_name=None): json.dump(new_model_index, f, indent=2) -class DeepseekV3PTQSaveTRTLLM(DeepseekV3HfPTQSave): +class DeepSeekV3PTQSaveSingle(DeepSeekV3PTQSaveMulti): def __init__(self, quant_model): super().__init__(quant_model=quant_model) def save(self, save_path): + # setting quantization config a_quant_algo = self.quant_model.quant_config.quant_algo_info["a"] - if "w4a8" in self.quant_model.quant_config.quant_algo: - quant_dict = { - "quantization_config": { - "quant_method": "w4a8_awq", - "kv_cache_quant_method": "fp8", - "activation_scheme": ( - "dynamic" if "dynamic" in a_quant_algo else "static" - ), - "ignored_modules": [ - "*self_attn*", - "*gate_up_proj", - "*down_proj", - "*layers.61*", - ], - "ignored_quantization_config": { - "quant_method": "fp8_block_scales", - "kv_cache_quant_method": "fp8", - }, - }, - } - int4_scales = {} - for name, sub_layer in self.quant_model.model.named_modules(): - if isinstance(sub_layer, QDQModule): - max_value_group_wise = sub_layer.weight_scale.data.clone() - int4_scales[f"{name}.weight_scale.int4"] = max_value_group_wise / 8 - sub_layer.weight_scale = None - sub_layer.weight_scale = torch.nn.Parameter( - (max_value_group_wise.max() / 448.0).to( - max_value_group_wise.dtype - ), - requires_grad=False, + if "fp8" in self.quant_model.quant_config.quant_algo: + if "w4a8" in self.quant_model.quant_config.quant_algo: + if self.quant_model.deploy_backend == "trtllm": + quant_dict = { + "quantization_config": { + "quant_method": "w4a8_awq", + "weight_group_size": 128, + "activation_scheme": ( + "dynamic" if "dynamic" in a_quant_algo else "static" + ), + "kv_cache_quant_method": "fp8", + "ignored_layers": [ + "*self_attn*", + "*gate_up_proj", + "*down_proj", + "*layers.61*", + ], + "ignored_quantization_config": { + "quant_method": "fp8", + "activation_scheme": "dynamic", + "fmt": "e4m3", + "kv_cache_quant_method": "fp8", + "weight_block_size": [128, 128], + }, + } + } + else: + raise NotImplementedError( + f"deploy_backend {self.quant_model.deploy_backend} \ + is not supported for w4a8_fp8." + ) + else: + ignore_layers = self.quant_model.quant_config.quant_algo_info[ + "ignore_layers" + ] + if self.quant_model.deploy_backend == "vllm": + quant_dict = { + "quantization_config": { + "quant_method": "fp8", + "activation_scheme": ( + "dynamic" if "dynamic" in a_quant_algo else "static" + ), + "ignored_layers": ignore_layers, + } + } + else: + raise NotImplementedError( + f"deploy_backend {self.quant_model.deploy_backend} \ + is not supported for fp8_static." ) os.makedirs(save_path, exist_ok=True) - safetensor_file = os.path.join(save_path, "model-scales.safetensors") - safe_save(int4_scales, safetensor_file) - print_info(f"Save int4 scales to {safetensor_file}") - self.quant_model.get_model().config.update(quant_dict) print_info("Save quantization_config: {}".format(quant_dict)) - self.quant_model.get_model().save_pretrained(save_path) - - new_model_index_file = os.path.join( - save_path, "model.safetensors.index.json" - ) - with open(new_model_index_file, "r") as f: - new_model_index = json.load(f) - for key in int4_scales.keys(): - new_model_index["weight_map"][key] = "model-scales.safetensors" - with open(new_model_index_file, "w") as f: - json.dump(new_model_index, f, indent=2) - self.add_mtp_weight(save_path=save_path) else: raise ValueError( diff --git a/angelslim/compressor/quant/modules/helper_layer.py b/angelslim/compressor/quant/modules/helper_layer.py index e05f6dfe..044e9714 100644 --- a/angelslim/compressor/quant/modules/helper_layer.py +++ b/angelslim/compressor/quant/modules/helper_layer.py @@ -576,8 +576,8 @@ def __init__( self.quant_algo = quant_algo if "fp8" in quant_algo: if "w4a8" in self.quant_algo: - tensor_max_value = weight_scale.clone() - tensor_wise_scale = tensor_max_value.max() / 448.0 + max_value_group_wise = weight_scale.clone() + tensor_wise_scale = max_value_group_wise.max() / 448.0 quant_weight, _ = quantize_weight_per_tensor_fp8( weight, tensor_wise_scale ) @@ -587,10 +587,14 @@ def __init__( new_weight_bf16, method="groupwise", bits=4, group_size=group_size ) quant_weight, _ = quantize_weight_int( - new_weight_bf16_qdq, tensor_max_value, bits=4 + new_weight_bf16_qdq, max_value_group_wise, bits=4 ) quant_weight = pack_weight_to_int8(quant_weight) del new_weight_bf16_qdq, new_weight_bf16 + self.weight_scale_int4 = torch.nn.Parameter( + max_value_group_wise / 8, requires_grad=False + ) + weight_scale = tensor_wise_scale else: quant_weight, weight_scale = quantize_weight_per_tensor_fp8( weight, weight_scale @@ -653,6 +657,14 @@ def forward(self, x): output = qoutput.to(output.dtype) * self.output_scale return output + def state_dict(self, *args, **kwargs): + state_dict = super().state_dict(*args, **kwargs) + keys_to_rename = [k for k in state_dict.keys() if "weight_scale_int4" in k] + for old_key in keys_to_rename: + new_key = old_key.replace("weight_scale_int4", "weight_scale.int4") + state_dict[new_key] = state_dict.pop(old_key) + return state_dict + class QLinear(torch.nn.Module): def __init__( diff --git a/angelslim/compressor/quant/ptq.py b/angelslim/compressor/quant/ptq.py index d7d44c6e..5529a426 100644 --- a/angelslim/compressor/quant/ptq.py +++ b/angelslim/compressor/quant/ptq.py @@ -217,7 +217,8 @@ def _convert(self): ) qdq_module = self.quant_model.get_qdq_module(sub_layer, name) - setattr(parent_layer, sub_name, qdq_module) + if qdq_module is not sub_layer: + setattr(parent_layer, sub_name, qdq_module) self.quant_model.quantized = True def __getattr__(self, item): diff --git a/angelslim/data/text_dataset.py b/angelslim/data/text_dataset.py index 3d9a213f..5084218d 100644 --- a/angelslim/data/text_dataset.py +++ b/angelslim/data/text_dataset.py @@ -95,6 +95,24 @@ def _load_jsonl_data(self, data_path: str, num_samples: int): messages, tokenize=False, add_generation_prompt=True ) + thinking_data = False + for dic in messages: + if dic["role"] == "assistant": + if "" and "" in dic["content"]: + thinking_data = True + break + if thinking_data: + text = self.processor.bos_token + for dic in messages: + if dic["role"] == "system": + text += dic["content"] + elif dic["role"] == "user": + text = ( + text + "<|User|>" + dic["content"] + "<|Assistant|>" + ) + elif dic["role"] == "assistant": + text = text + dic["content"] + self.processor.eos_token + model_inputs = self.processor( [text], return_tensors="pt", diff --git a/angelslim/models/llm/deepseek.py b/angelslim/models/llm/deepseek.py index 9aef0ca2..1871c02f 100644 --- a/angelslim/models/llm/deepseek.py +++ b/angelslim/models/llm/deepseek.py @@ -21,9 +21,8 @@ from transformers.models.deepseek_v3 import DeepseekV3Config from ...compressor.quant.core import ( - DeepseekV3HfPTQSave, - DeepseekV3PTQSaveTRTLLM, - PTQSaveVllmHF, + DeepSeekV3PTQSaveMulti, + DeepSeekV3PTQSaveSingle, weight_dequant, ) from ...compressor.quant.modules import QDQModule @@ -207,14 +206,10 @@ def model_forward(self, dataloader, **kwargs): pass def get_save_func(self): - if self.deploy_backend in ["vllm", "huggingface"]: + if self.deploy_backend in ["vllm", "trtllm"]: if self.model.using_multi_nodes: - return DeepseekV3HfPTQSave - return PTQSaveVllmHF - elif self.deploy_backend == "trtllm": - if self.quant_config.low_memory: - return DeepseekV3PTQSaveTRTLLM - return DeepseekV3HfPTQSave + return DeepSeekV3PTQSaveMulti + return DeepSeekV3PTQSaveSingle else: raise NotImplementedError( f"deploy_backend {self.deploy_backend} is not supported for saving." diff --git a/angelslim/models/llm/modeling_deepseek.py b/angelslim/models/llm/modeling_deepseek.py index 1f7a58ca..878db915 100755 --- a/angelslim/models/llm/modeling_deepseek.py +++ b/angelslim/models/llm/modeling_deepseek.py @@ -1017,7 +1017,7 @@ def from_pretrained( if using_multi_nodes: cls.using_multi_nodes = True rank = int(os.getenv("RANK", "0")) - parent_dir = os.path.dirname(model_path) + parent_dir = os.path.dirname(model_path.rstrip("/")) tp_model_path = os.path.join(parent_dir, f"ds_ckpt_tp{cls.world_size}") os.makedirs(tp_model_path, exist_ok=True) diff --git a/configs/deepseek_r1/w4a8_fp8/deepseek_r1_w4a8_fp8.yaml b/configs/deepseek_r1/w4a8_fp8/deepseek_r1_w4a8_fp8.yaml index 8f1a2783..9370111e 100644 --- a/configs/deepseek_r1/w4a8_fp8/deepseek_r1_w4a8_fp8.yaml +++ b/configs/deepseek_r1/w4a8_fp8/deepseek_r1_w4a8_fp8.yaml @@ -34,6 +34,8 @@ compression: - "mlp.up_proj" - "mlp.down_proj" - "shared_expert" + - "lm_head" + - "model.embed_tokens" # Dataset for calibration dataset: diff --git a/configs/deepseek_r1/w4a8_fp8/deepseek_r1_w4a8_fp8_low_memmory.yaml b/configs/deepseek_r1/w4a8_fp8/deepseek_r1_w4a8_fp8_low_memmory.yaml index aadf5f13..841ec3a4 100644 --- a/configs/deepseek_r1/w4a8_fp8/deepseek_r1_w4a8_fp8_low_memmory.yaml +++ b/configs/deepseek_r1/w4a8_fp8/deepseek_r1_w4a8_fp8_low_memmory.yaml @@ -34,6 +34,8 @@ compression: - "mlp.up_proj" - "mlp.down_proj" - "shared_expert" + - "lm_head" + - "model.embed_tokens" # Dataset for calibration dataset: From b4b43c4a143e6433d9d052ec104b6204ac1c67d2 Mon Sep 17 00:00:00 2001 From: ali-12388 <1940747290@qq.com> Date: Fri, 5 Sep 2025 11:42:13 +0800 Subject: [PATCH 2/2] DeepSeek; rm tmp dir --- angelslim/compressor/quant/core/save.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/angelslim/compressor/quant/core/save.py b/angelslim/compressor/quant/core/save.py index dd6c9fa9..e146e123 100644 --- a/angelslim/compressor/quant/core/save.py +++ b/angelslim/compressor/quant/core/save.py @@ -473,6 +473,17 @@ def save(self, save_path): self.add_mtp_weight(save_path=save_model_path, file_name=file_name) + if os.path.exists(tmp_path): + shutil.rmtree(tmp_path) + parent_dir = os.path.dirname( + self.quant_model.model.ori_model_path.rstrip("/") + ) + tp_model_path = os.path.join( + parent_dir, f"ds_ckpt_tp{self.quant_model.model.world_size}" + ) + if os.path.exists(tp_model_path): + shutil.rmtree(tp_model_path) + def _save_ckpt(self, scale, save_path, all_reduce=True): if all_reduce: if self.rank == 0: