Skip to content

Commit e3e1243

Browse files
authored
Merge branch 'main' into feat/wan2.2-t2v
2 parents 366478b + 50e9a4b commit e3e1243

23 files changed

Lines changed: 336 additions & 64 deletions

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,3 +25,5 @@ save*
2525
model/
2626
output_*
2727
datasets/
28+
.venv/
29+
*.sh

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-lates
3434

3535
**Docs**: [English](https://llmc-en.readthedocs.io/en/latest/), [Chinese](https://llmc-zhcn.readthedocs.io/en/latest/).
3636

37+
> **Recommended Python Version**: We recommend using **Python 3.11** for local development and installation. This matches the project's Docker images and CI configuration, and is generally more stable than Python 3.12 for the current dependency set.
38+
3739
## :fire: Latest News
3840

3941
- **Nov 9, 2025:** 🍺🍺🍺 Our work [**LLMC+: Benchmarking Vision-Language Model Compression with a Plug-and-play Toolkit**](https://arxiv.org/abs/2508.09981) has been accepted by AAAI 2026.

README_zh.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-lates
3131

3232
**文档**:[English](https://llmc-en.readthedocs.io/en/latest/)、[中文](https://llmc-zhcn.readthedocs.io/en/latest/)。
3333

34+
> **推荐 Python 版本**:建议本地开发和安装使用 **Python 3.11**。这与项目的 Docker 镜像和 CI 配置保持一致,并且对当前依赖集合而言通常比 Python 3.12 更稳定。
35+
3436
## :fire: 最新动态
3537

3638
- **2025年8月13日:** 🚀 我们已开源针对 **视觉语言模型(VLMs)** 的压缩方案,支持共计超过 **20 种算法**,涵盖 **token reduction** 和 **quantization**。此次发布为多模态任务提供了灵活、即插即用的压缩策略。具体请参阅[文档](https://llmc-zhcn.readthedocs.io/en/latest/advanced/token_reduction.html)。
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
base:
2+
seed: &seed 42
3+
model:
4+
type: model_type
5+
path: model path
6+
torch_dtype: auto
7+
calib:
8+
name: pileval
9+
download: False
10+
path: calib data path
11+
n_samples: 128
12+
bs: 1
13+
seq_len: 2048
14+
preproc: txt_general_preproc
15+
seed: *seed
16+
eval:
17+
eval_pos: [transformed, fake_quant, fake_quant_wo_kv] #long_ppl eval not support pretrain eval pos
18+
name: wikitext2
19+
type: decode_ppl
20+
download: False
21+
path: eval_data_path
22+
bs: 1
23+
inference_per_block: False
24+
num_samples: 10
25+
# num_eval_tokens: 3
26+
quant:
27+
method: RTN
28+
weight:
29+
bit: 8
30+
symmetric: True
31+
granularity: per_channel
32+
group_size: -1
33+
act:
34+
bit: 8
35+
symmetric: True
36+
granularity: per_tensor
37+
static: True
38+
kvcache:
39+
method: Naive
40+
bit: 8
41+
symmetric: True
42+
granularity: per_head
43+
head_num: kv head num
44+
save:
45+
save_lightllm_kv_calib: True
46+
lightllm_kv_cache_name: kv_cache_calib.json
47+
save_fake: False
48+
save_path: /path/to/save/

configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,5 +41,7 @@ quant:
4141
symmetric: True
4242
granularity: per_tensor
4343
save:
44+
save_lightllm_kv_calib: True
45+
lightllm_kv_cache_name: kv_cache_calib.json
4446
save_fake: False
45-
save_path: /path/to/save/
47+
save_path: /path/to/save/

llmc/__main__.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from llmc.models import *
2121
from llmc.utils import (check_config, deploy_all_modality, get_modality,
2222
mkdirs, print_important_package_version, seed_all,
23+
collect_lightllm_kv_calib_json,
2324
update_autoawq_quant_config,
2425
update_lightx2v_quant_config, update_vllm_quant_config)
2526
from llmc.utils.registry_factory import ALGO_REGISTRY, MODEL_REGISTRY
@@ -72,6 +73,21 @@ def main(config):
7273

7374
eval_model(model, blockwise_opts, eval_list, eval_pos='transformed')
7475
if int(os.environ['RANK']) == 0:
76+
if 'save' in config and config.save.get('save_lightllm_kv_cache_calib', False):
77+
calib_json_list = [
78+
collect_lightllm_kv_calib_json(blockwise_opt)
79+
for blockwise_opt in blockwise_opts
80+
if hasattr(blockwise_opt, 'quant_kvcache')
81+
]
82+
calib_json_payload = (
83+
calib_json_list[0] if len(calib_json_list) == 1 else calib_json_list
84+
)
85+
with open(save_lightllm_kv_cache_calib_path, 'w') as file:
86+
json.dump(calib_json_payload, file, ensure_ascii=False, indent=4)
87+
logger.info(
88+
f'save lightllm kv cache calib done -- {save_lightllm_kv_cache_calib_path}'
89+
)
90+
7591
if 'save' in config and config.save.get('save_trans', False):
7692
blockwise_opt.save_model(save_trans_path)
7793

@@ -209,6 +225,14 @@ def main(config):
209225
# Ensure only the main process creates directories
210226
if int(os.environ['RANK']) == 0:
211227
if 'save' in config:
228+
if config.save.get('save_lightllm_kv_cache_calib', False):
229+
mkdirs(config.save.save_path)
230+
save_lightllm_kv_cache_calib_path = os.path.join(
231+
config.save.save_path,
232+
config.save.get(
233+
'lightllm_kv_cache_calib_name', 'kv_cache_calib.json'
234+
),
235+
)
212236
if config.save.get('save_trans', False):
213237
save_trans_path = os.path.join(
214238
config.save.save_path, 'transformed_model'
@@ -266,3 +290,4 @@ def main(config):
266290
llmc_duration_time = llmc_end_time - llmc_start_time
267291
logger.info(f'llmc_duration_time: {llmc_duration_time} s')
268292
logger.info('--- llmc finished ---')
293+

llmc/compression/quantization/base_blockwise_quantization.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import copy
22
import functools
33
import gc
4-
import json
54
import os
65
import re
76
import shutil
@@ -180,18 +179,18 @@ def set_quant_config(self):
180179
self.act_quant_module = IntegerQuantizer
181180
elif quant_type == 'float-quant':
182181
self.act_quant_module = FloatQuantizer
183-
else:
184-
raise ValueError(
185-
f"Unsupported act quant_type: {quant_type}. "
186-
"Supported: int-quant, float-quant, hif4."
187-
)
188-
self.quant_config['act']['tp'] = self.tp
189-
self.aquantizer = self.act_quant_module(**self.quant_config['act'])
190182
self.act_static = self.quant_config['act'].get('static', False)
191183
if self.act_static:
192184
assert (
193185
self.quant_config['act']['granularity'] == 'per_tensor'
194186
), 'Only support per_tensor static quant'
187+
# Static activation quantization uses the batched calibration
188+
# path, so normalize the default minmax setting to
189+
# static_minmax to match the downstream calibration logic.
190+
if self.quant_config['act'].get('calib_algo', 'minmax') == 'minmax':
191+
self.quant_config['act']['calib_algo'] = 'static_minmax'
192+
self.quant_config['act']['tp'] = self.tp
193+
self.aquantizer = self.act_quant_module(**self.quant_config['act'])
195194
self.quant_attn = self.quant_config['act'].get('quant_attn', False)
196195
if self.quant_attn:
197196
assert self.config['model']['type'] in ['Vit', 'DeepseekV2']
@@ -213,8 +212,10 @@ def set_quant_config(self):
213212
kv_special_cfg = self.quant_config['kvcache'].get('special', {})
214213
act_static_cfg = {}
215214
if self.act_static:
216-
act_static_cfg.update(self.config.calib.n_sample)
217-
act_static_cfg.update(self.config.calib.bs)
215+
# The KV cache constructor expects num_samples / bsz, so map
216+
# the calibration config fields to the parameter names it uses.
217+
act_static_cfg['num_samples'] = self.config.calib.n_samples
218+
act_static_cfg['bsz'] = self.config.calib.bs
218219
kv_quant_type = self.quant_config['kvcache'].get('quant_type', 'int-quant')
219220
self.kv_module = KV_REGISTRY[self.quant_config['kvcache']['method']](
220221
kv_quant_type, self.quant_config['kvcache'],

llmc/compression/quantization/kvquant.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import copy
12
import torch
23
from loguru import logger
34
from transformers import DynamicCache
@@ -12,12 +13,20 @@ class NaiveQuantKVCache(DynamicCache):
1213
def __init__(self, quant_type, kvquant_cfg, num_hidden_layers, num_samples=128, bsz=1):
1314
super().__init__()
1415

15-
assert kvquant_cfg.granularity in ['per_token', 'per_tensor', 'per_group']
16+
# Copy the config to avoid mutating the original quantization config in static KV calibration.
17+
kvquant_cfg = copy.deepcopy(kvquant_cfg)
18+
assert kvquant_cfg.granularity in ['per_token', 'per_tensor', 'per_group', 'per_head']
1619
self.num_hidden_layers, self.num_samples, self.bsz = (
1720
num_hidden_layers,
1821
num_samples,
1922
bsz,
2023
)
24+
if kvquant_cfg.get('static', False) and kvquant_cfg.get(
25+
'calib_algo', 'minmax'
26+
) == 'minmax':
27+
# Static KV calibration uses the batched tensor statistics path, so convert the default
28+
# minmax setting to static_minmax here to avoid a later calibration algo name mismatch.
29+
kvquant_cfg['calib_algo'] = 'static_minmax'
2130
if quant_type == 'int-quant':
2231
self.kvquantizer = IntegerQuantizer(**kvquant_cfg)
2332
elif quant_type == 'float-quant':

llmc/compression/quantization/quant.py

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -226,27 +226,24 @@ def get_minmax_stats(self, act_tensors):
226226
for tensor in tensors:
227227
tensor = self.reshape_tensor(tensor)
228228
tensor_range = self.get_minmax_range(tensor)
229-
min_val, max_val = tensor_range[0], tensor_range[1]
229+
min_val = tensor_range[0].detach().cpu().to(torch.float32)
230+
max_val = tensor_range[1].detach().cpu().to(torch.float32)
230231

231232
if input_idx not in stats_min_max:
232233
stats_min_max[input_idx] = {}
233-
stats_min_max[input_idx]['min'] = torch.tensor(
234-
[min_val], dtype=torch.float32
235-
)
236-
stats_min_max[input_idx]['max'] = torch.tensor(
237-
[max_val], dtype=torch.float32
238-
)
234+
stats_min_max[input_idx]['min'] = min_val.unsqueeze(0)
235+
stats_min_max[input_idx]['max'] = max_val.unsqueeze(0)
239236
else:
240237
stats_min_max[input_idx]['min'] = torch.cat(
241238
[
242239
stats_min_max[input_idx]['min'],
243-
torch.tensor([min_val], dtype=torch.float32),
240+
min_val.unsqueeze(0),
244241
]
245242
)
246243
stats_min_max[input_idx]['max'] = torch.cat(
247244
[
248245
stats_min_max[input_idx]['max'],
249-
torch.tensor([max_val], dtype=torch.float32),
246+
max_val.unsqueeze(0),
250247
]
251248
)
252249

@@ -257,8 +254,8 @@ def get_static_minmax_range(self, act_tensors):
257254
stats_min_max = self.get_minmax_stats(act_tensors)
258255
min_vals, max_vals = [], []
259256
for input_idx, tensor_range in stats_min_max.items():
260-
min_val = tensor_range['min'].mean()
261-
max_val = tensor_range['max'].mean()
257+
min_val = tensor_range['min'].mean(dim=0)
258+
max_val = tensor_range['max'].mean(dim=0)
262259
min_vals.append(min_val)
263260
max_vals.append(max_val)
264261

llmc/compression/quantization/quarot.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ def get_orthogonal_matrix(self):
9696
raise ValueError(f'Unsupported mode {self.mode}')
9797

9898
def block_transform(self, block):
99-
logger.info(f'Start transform the {self.block_idx+1}-th block')
99+
logger.info(f'Start transform the {self.block_idx + 1}-th block')
100100

101101
if self.online_rotate:
102102
self.replace_rotate_linears(block)
@@ -108,7 +108,7 @@ def block_transform(self, block):
108108
gc.collect()
109109

110110
logger.info(f'block:{block}')
111-
logger.info(f'End transform the {self.block_idx+1}-th block')
111+
logger.info(f'End transform the {self.block_idx + 1}-th block')
112112

113113
@torch.no_grad()
114114
def subset_transform(self, block, subset):

0 commit comments

Comments
 (0)