[fix]: Fix reference for weights to quant. (#276)

C-KRSW · krizaltang · web-flow · commit b0af8065ccde · 2026-03-26T20:40:14.000+08:00
Co-authored-by: krizaltang <krizaltang@tencent.com>
diff --git a/angelslim/compressor/quant/modules/daq/daq.py b/angelslim/compressor/quant/modules/daq/daq.py
@@ -18,13 +18,13 @@
 import json
 import multiprocessing as mp
 import os
+import shutil
 from collections import OrderedDict, defaultdict
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
 from glob import glob
 
 import torch
-from huggingface_hub import snapshot_download
 from safetensors.torch import load_file, save_file
 from tqdm import tqdm
 
@@ -40,6 +40,27 @@
     prefetch_base_shard,
 )
 
+# Suffixes that identify weight tensors to be quantized.
+# Copied from fp8_quant_blockwise (not imported) for consistency; any weight
+# whose name ends with one of these suffixes will be quantized by DAQ.
+SUFFIX_TO_QUANT = [
+    ".gate_and_up_proj.weight",
+    ".gate_proj.weight",
+    ".up_proj.weight",
+    ".down_proj.weight",
+    ".q_a_proj.weight",
+    ".q_b_proj.weight",
+    ".kv_a_proj_with_mqa.weight",
+    ".kv_b_proj.weight",
+    ".qkv_proj.weight",
+    ".q_proj.weight",
+    ".k_proj.weight",
+    ".v_proj.weight",
+    ".o_proj.weight",
+    ".experts.gate_up_proj",
+    ".experts.down_proj",
+]
+
 __all__ = ["DAQ"]
 
 
@@ -127,7 +148,6 @@ def __init__(self, quant_config, sft_model_path: str):
         self.quantization_method = quant_config.quantization_method
         self.num_workers = quant_config.num_workers
         self.ignore_layers = getattr(quant_config, "ignore_layers", []) or []
-        self.base_model_repo = quant_config.base_model_repo
 
         gpus_str = quant_config.gpus
         if gpus_str:
@@ -228,7 +248,6 @@ def run(self, save_path: str):
         model_index_file = os.path.join(save_path, "model.safetensors.index.json")
         with open(model_index_file, "r") as f:
             model_index = json.load(f)
-        weight_map = model_index["weight_map"]
 
         base_weight_map = get_weight_map(self.base_model_path)
         if not base_weight_map:
@@ -253,7 +272,6 @@ def run(self, save_path: str):
                 safetensor_files,
                 self.base_model_path,
                 save_path,
-                weight_map,
                 base_weight_map,
                 dynamic_cache_size,
             )
@@ -262,7 +280,6 @@ def run(self, save_path: str):
                 safetensor_files,
                 self.base_model_path,
                 save_path,
-                weight_map,
                 base_weight_map,
                 dynamic_cache_size,
             )
@@ -284,32 +301,12 @@ def run(self, save_path: str):
         print_info("DAQ quantization complete!")
 
     def _prepare_output_dir(self, save_path: str):
-        # TODO: Currently we only support quantizing BF16 DeepSeek V3/R1 models to FP8.
-        # To support all model architectures, the logic for determining which weights
-        # to quantize should be changed from referencing the target model's
-        # model.safetensors.index.json to using regex-based include/exclude lists
-        # (e.g. regex patterns for weights to quantize and weights to ignore).
-        model_index_file = os.path.join(save_path, "model.safetensors.index.json")
-        config_file = os.path.join(save_path, "config.json")
-
-        # Check if files need to be downloaded
-        if not os.path.exists(model_index_file) or not os.path.exists(config_file):
-            print(f"Model index or config file not found in {save_path}")
-            print(f"Downloading config files from HuggingFace: {self.base_model_repo}")
-            try:
-                snapshot_download(
-                    repo_id=self.base_model_repo,
-                    ignore_patterns=["*.safetensors"],
-                    local_dir=save_path,
-                    local_dir_use_symlinks=False,
-                )
-            except Exception as e:
-                raise RuntimeError(
-                    f"Failed to download config files from HuggingFace repo "
-                    f"'{self.base_model_repo}'. Please check your network connection "
-                    f"and ensure the repo_id is correct. Original error: {e}"
-                ) from e
-            print(f"✓ Model index file and config file downloaded to {save_path}")
+        for item in os.listdir(self.sft_model_path):
+            src = os.path.join(self.sft_model_path, item)
+            dst = os.path.join(save_path, item)
+            if os.path.isfile(src) and not item.endswith(".safetensors"):
+                if not os.path.exists(dst):
+                    shutil.copy2(src, dst)
 
     def _update_config_json(self, save_path: str):
         config_file = os.path.join(save_path, "config.json")
@@ -346,7 +343,6 @@ def _run_single_process(
         safetensor_files,
         base_path,
         save_path,
-        weight_map,
         base_weight_map,
         dynamic_cache_size,
     ):
@@ -357,7 +353,6 @@ def _run_single_process(
                 safetensor_file,
                 base_path,
                 save_path,
-                weight_map,
                 base_weight_map,
                 self.scale_search_kwargs,
                 True,
@@ -377,7 +372,6 @@ def _run_multiprocess(
         safetensor_files,
         base_path,
         save_path,
-        weight_map,
         base_weight_map,
         dynamic_cache_size,
     ):
@@ -403,7 +397,6 @@ def _run_multiprocess(
                     worker_file_groups[wid],
                     base_path,
                     save_path,
-                    weight_map,
                     base_weight_map,
                     self.scale_search_kwargs,
                     worker_devices[wid],
@@ -487,7 +480,6 @@ def _worker_process_files(args):
         file_list,
         base_path,
         save_path,
-        weight_map,
         base_weight_map,
         scale_search_kwargs,
         device,
@@ -512,7 +504,6 @@ def _worker_process_files(args):
             safetensor_file,
             base_path,
             save_path,
-            weight_map,
             base_weight_map,
             scale_search_kwargs,
             False,
@@ -532,7 +523,6 @@ def _process_single_file(
     safetensor_file,
     base_path,
     fp8_path,
-    weight_map,
     base_weight_map,
     scale_search_kwargs,
     verbose,
@@ -622,8 +612,9 @@ def _process_single_file(
         scale_inv_name = f"{weight_name}_scale_inv"
 
         should_ignore = any(ignore_pattern in weight_name for ignore_pattern in ignore_layers)
+        should_quant = any(weight_name.endswith(suffix) for suffix in SUFFIX_TO_QUANT)
 
-        if scale_inv_name in weight_map and not should_ignore:
+        if should_quant and not should_ignore:
             assert weight.element_size() == 2, f"Expected BF16, got {weight.dtype}"
 
             base_weight = load_base_weight(
diff --git a/angelslim/utils/config_parser.py b/angelslim/utils/config_parser.py
@@ -229,7 +229,6 @@ class QuantizationConfig:
     scale_search: Optional[Dict[str, Any]] = field(default=None)
     num_workers: int = field(default=8)
     gpus: Optional[str] = field(default=None)
-    base_model_repo: Optional[str] = field(default=None)
 
 
 @dataclass
diff --git a/configs/deepseek_r1/fp8_daq/deepseek_r1_daq_fp8_w8a8_block.yaml b/configs/deepseek_r1/fp8_daq/deepseek_r1_daq_fp8_w8a8_block.yaml
@@ -20,7 +20,6 @@ compression:
     bits: 8
     # DAQ-specific: path to the base (pretrained) model
     base_model_path: deepseek-ai/DeepSeek-R1-Base
-    base_model_repo: deepseek-ai/DeepSeek-R1
     # Set to true if the base model is FP8 format
     base_is_fp8: true
     # Optimization metric: "sign" (sign preservation rate),
diff --git a/configs/deepseek_r1/fp8_daq/deepseek_r1_daq_fp8_w8a8_channel.yaml b/configs/deepseek_r1/fp8_daq/deepseek_r1_daq_fp8_w8a8_channel.yaml
@@ -20,7 +20,6 @@ compression:
     bits: 8
     # DAQ-specific: path to the base (pretrained) model
     base_model_path: deepseek-ai/DeepSeek-R1-Base
-    base_model_repo: deepseek-ai/DeepSeek-R1
     # Set to true if the base model is FP8 format
     base_is_fp8: true
     # Optimization metric: "sign" (sign preservation rate),
diff --git a/docs/source/features/quantization/daq.md b/docs/source/features/quantization/daq.md
@@ -43,7 +43,6 @@ python3 tools/run.py -c configs/deepseek_r1/fp8_daq/deepseek_r1_daq_fp8_w8a8_blo
 - `quantization.name`：压缩算法选填`daq`。
 - `quantization.bits`：目标量化比特数，如fp8量化对应填写8bit。
 - `quantization.base_model_path`：基座模型路径。
-- `quantization.base_model_repo`：基座模型在huggingface的路径。
 - `quantization.base_is_fp8`：基座模型是否是FP8格式。
 - `quantization.metric`：优化指标，选填`sign`、`cosine`、`mse`。详细说明可参见[指标说明](#指标说明)或[技术报告](https://arxiv.org/abs/2603.22324)
 - `quantization.quantization_method`：量化方式，选填`blockwise`、`per_channel`。详细说明可参见[量化方式](#量化方式)
@@ -63,7 +62,6 @@ compression:
     name: daq
     bits: 8
     base_model_path: deepseek-ai/DeepSeek-R1-Base # DAQ-specific: path to the base model
-    base_model_repo: deepseek-ai/DeepSeek-R1
     base_is_fp8: true # Set to true if the base model is FP8 format
     metric: cosine    # Optimization metric: "sign"，"cosine"，or "mse"
     quantization_method: blockwise # Quantization method: "blockwise" or "per_channel"