Skip to content

Commit e3e1243

Browse files
authored
Merge branch 'main' into feat/wan2.2-t2v
2 parents 366478b + 50e9a4b commit e3e1243

23 files changed

Lines changed: 336 additions & 64 deletions

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,3 +25,5 @@ save*
2525
model/
2626
output_*
2727
datasets/
28+
.venv/
29+
*.sh

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-lates
3434

3535
**Docs**: [English](https://llmc-en.readthedocs.io/en/latest/), [Chinese](https://llmc-zhcn.readthedocs.io/en/latest/).
3636

37+
> **Recommended Python Version**: We recommend using **Python 3.11** for local development and installation. This matches the project's Docker images and CI configuration, and is generally more stable than Python 3.12 for the current dependency set.
38+
3739
## :fire: Latest News
3840

3941
- **Nov 9, 2025:** 🍺🍺🍺 Our work [**LLMC+: Benchmarking Vision-Language Model Compression with a Plug-and-play Toolkit**](https://arxiv.org/abs/2508.09981) has been accepted by AAAI 2026.

README_zh.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-lates
3131

3232
**文档**:[English](https://llmc-en.readthedocs.io/en/latest/)、[中文](https://llmc-zhcn.readthedocs.io/en/latest/)。
3333

34+
> **推荐 Python 版本**:建议本地开发和安装使用 **Python 3.11**。这与项目的 Docker 镜像和 CI 配置保持一致,并且对当前依赖集合而言通常比 Python 3.12 更稳定。
35+
3436
## :fire: 最新动态
3537

3638
- **2025年8月13日:** 🚀 我们已开源针对 **视觉语言模型(VLMs)** 的压缩方案,支持共计超过 **20 种算法**,涵盖 **token reduction** 和 **quantization**。此次发布为多模态任务提供了灵活、即插即用的压缩策略。具体请参阅[文档](https://llmc-zhcn.readthedocs.io/en/latest/advanced/token_reduction.html)。
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
base:
2+
seed: &seed 42
3+
model:
4+
type: model_type
5+
path: model path
6+
torch_dtype: auto
7+
calib:
8+
name: pileval
9+
download: False
10+
path: calib data path
11+
n_samples: 128
12+
bs: 1
13+
seq_len: 2048
14+
preproc: txt_general_preproc
15+
seed: *seed
16+
eval:
17+
eval_pos: [transformed, fake_quant, fake_quant_wo_kv] #long_ppl eval not support pretrain eval pos
18+
name: wikitext2
19+
type: decode_ppl
20+
download: False
21+
path: eval_data_path
22+
bs: 1
23+
inference_per_block: False
24+
num_samples: 10
25+
# num_eval_tokens: 3
26+
quant:
27+
method: RTN
28+
weight:
29+
bit: 8
30+
symmetric: True
31+
granularity: per_channel
32+
group_size: -1
33+
act:
34+
bit: 8
35+
symmetric: True
36+
granularity: per_tensor
37+
static: True
38+
kvcache:
39+
method: Naive
40+
bit: 8
41+
symmetric: True
42+
granularity: per_head
43+
head_num: kv head num
44+
save:
45+
save_lightllm_kv_calib: True
46+
lightllm_kv_cache_name: kv_cache_calib.json
47+
save_fake: False
48+
save_path: /path/to/save/

configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,5 +41,7 @@ quant:
4141
symmetric: True
4242
granularity: per_tensor
4343
save:
44+
save_lightllm_kv_calib: True
45+
lightllm_kv_cache_name: kv_cache_calib.json
4446
save_fake: False
45-
save_path: /path/to/save/
47+
save_path: /path/to/save/

llmc/__main__.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from llmc.models import *
2121
from llmc.utils import (check_config, deploy_all_modality, get_modality,
2222
mkdirs, print_important_package_version, seed_all,
23+
collect_lightllm_kv_calib_json,
2324
update_autoawq_quant_config,
2425
update_lightx2v_quant_config, update_vllm_quant_config)
2526
from llmc.utils.registry_factory import ALGO_REGISTRY, MODEL_REGISTRY
@@ -72,6 +73,21 @@ def main(config):
7273

7374
eval_model(model, blockwise_opts, eval_list, eval_pos='transformed')
7475
if int(os.environ['RANK']) == 0:
76+
if 'save' in config and config.save.get('save_lightllm_kv_cache_calib', False):
77+
calib_json_list = [
78+
collect_lightllm_kv_calib_json(blockwise_opt)
79+
for blockwise_opt in blockwise_opts
80+
if hasattr(blockwise_opt, 'quant_kvcache')
81+
]
82+
calib_json_payload = (
83+
calib_json_list[0] if len(calib_json_list) == 1 else calib_json_list
84+
)
85+
with open(save_lightllm_kv_cache_calib_path, 'w') as file:
86+
json.dump(calib_json_payload, file, ensure_ascii=False, indent=4)
87+
logger.info(
88+
f'save lightllm kv cache calib done -- {save_lightllm_kv_cache_calib_path}'
89+
)
90+
7591
if 'save' in config and config.save.get('save_trans', False):
7692
blockwise_opt.save_model(save_trans_path)
7793

@@ -209,6 +225,14 @@ def main(config):
209225
# Ensure only the main process creates directories
210226
if int(os.environ['RANK']) == 0:
211227
if 'save' in config:
228+
if config.save.get('save_lightllm_kv_cache_calib', False):
229+
mkdirs(config.save.save_path)
230+
save_lightllm_kv_cache_calib_path = os.path.join(
231+
config.save.save_path,
232+
config.save.get(
233+
'lightllm_kv_cache_calib_name', 'kv_cache_calib.json'
234+
),
235+
)
212236
if config.save.get('save_trans', False):
213237
save_trans_path = os.path.join(
214238
config.save.save_path, 'transformed_model'
@@ -266,3 +290,4 @@ def main(config):
266290
llmc_duration_time = llmc_end_time - llmc_start_time
267291
logger.info(f'llmc_duration_time: {llmc_duration_time} s')
268292
logger.info('--- llmc finished ---')
293+

llmc/compression/quantization/base_blockwise_quantization.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import copy
22
import functools
33
import gc
4-
import json
54
import os
65
import re
76
import shutil
@@ -180,18 +179,18 @@ def set_quant_config(self):
180179
self.act_quant_module = IntegerQuantizer
181180
elif quant_type == 'float-quant':
182181
self.act_quant_module = FloatQuantizer
183-
else:
184-
raise ValueError(
185-
f"Unsupported act quant_type: {quant_type}. "
186-
"Supported: int-quant, float-quant, hif4."
187-
)
188-
self.quant_config['act']['tp'] = self.tp
189-
self.aquantizer = self.act_quant_module(**self.quant_config['act'])
190182
self.act_static = self.quant_config['act'].get('static', False)
191183
if self.act_static:
192184
assert (
193185
self.quant_config['act']['granularity'] == 'per_tensor'
194186
), 'Only support per_tensor static quant'
187+
# Static activation quantization uses the batched calibration
188+
# path, so normalize the default minmax setting to
189+
# static_minmax to match the downstream calibration logic.
190+
if self.quant_config['act'].get('calib_algo', 'minmax') == 'minmax':
191+
self.quant_config['act']['calib_algo'] = 'static_minmax'
192+
self.quant_config['act']['tp'] = self.tp
193+
self.aquantizer = self.act_quant_module(**self.quant_config['act'])
195194
self.quant_attn = self.quant_config['act'].get('quant_attn', False)
196195
if self.quant_attn:
197196
assert self.config['model']['type'] in ['Vit', 'DeepseekV2']
@@ -213,8 +212,10 @@ def set_quant_config(self):
213212
kv_special_cfg = self.quant_config['kvcache'].get('special', {})
214213
act_static_cfg = {}
215214
if self.act_static:
216-
act_static_cfg.update(self.config.calib.n_sample)
217-
act_static_cfg.update(self.config.calib.bs)
215+
# The KV cache constructor expects num_samples / bsz, so map
216+
# the calibration config fields to the parameter names it uses.
217+
act_static_cfg['num_samples'] = self.config.calib.n_samples
218+
act_static_cfg['bsz'] = self.config.calib.bs
218219
kv_quant_type = self.quant_config['kvcache'].get('quant_type', 'int-quant')
219220
self.kv_module = KV_REGISTRY[self.quant_config['kvcache']['method']](
220221
kv_quant_type, self.quant_config['kvcache'],

llmc/compression/quantization/kvquant.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import copy
12
import torch
23
from loguru import logger
34
from transformers import DynamicCache
@@ -12,12 +13,20 @@ class NaiveQuantKVCache(DynamicCache):
1213
def __init__(self, quant_type, kvquant_cfg, num_hidden_layers, num_samples=128, bsz=1):
1314
super().__init__()
1415

15-
assert kvquant_cfg.granularity in ['per_token', 'per_tensor', 'per_group']
16+
# Copy the config to avoid mutating the original quantization config in static KV calibration.
17+
kvquant_cfg = copy.deepcopy(kvquant_cfg)
18+
assert kvquant_cfg.granularity in ['per_token', 'per_tensor', 'per_group', 'per_head']
1619
self.num_hidden_layers, self.num_samples, self.bsz = (
1720
num_hidden_layers,
1821
num_samples,
1922
bsz,
2023
)
24+
if kvquant_cfg.get('static', False) and kvquant_cfg.get(
25+
'calib_algo', 'minmax'
26+
) == 'minmax':
27+
# Static KV calibration uses the batched tensor statistics path, so convert the default
28+
# minmax setting to static_minmax here to avoid a later calibration algo name mismatch.
29+
kvquant_cfg['calib_algo'] = 'static_minmax'
2130
if quant_type == 'int-quant':
2231
self.kvquantizer = IntegerQuantizer(**kvquant_cfg)
2332
elif quant_type == 'float-quant':

llmc/compression/quantization/quant.py

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -226,27 +226,24 @@ def get_minmax_stats(self, act_tensors):
226226
for tensor in tensors:
227227
tensor = self.reshape_tensor(tensor)
228228
tensor_range = self.get_minmax_range(tensor)
229-
min_val, max_val = tensor_range[0], tensor_range[1]
229+
min_val = tensor_range[0].detach().cpu().to(torch.float32)
230+
max_val = tensor_range[1].detach().cpu().to(torch.float32)
230231

231232
if input_idx not in stats_min_max:
232233
stats_min_max[input_idx] = {}
233-
stats_min_max[input_idx]['min'] = torch.tensor(
234-
[min_val], dtype=torch.float32
235-
)
236-
stats_min_max[input_idx]['max'] = torch.tensor(
237-
[max_val], dtype=torch.float32
238-
)
234+
stats_min_max[input_idx]['min'] = min_val.unsqueeze(0)
235+
stats_min_max[input_idx]['max'] = max_val.unsqueeze(0)
239236
else:
240237
stats_min_max[input_idx]['min'] = torch.cat(
241238
[
242239
stats_min_max[input_idx]['min'],
243-
torch.tensor([min_val], dtype=torch.float32),
240+
min_val.unsqueeze(0),
244241
]
245242
)
246243
stats_min_max[input_idx]['max'] = torch.cat(
247244
[
248245
stats_min_max[input_idx]['max'],
249-
torch.tensor([max_val], dtype=torch.float32),
246+
max_val.unsqueeze(0),
250247
]
251248
)
252249

@@ -257,8 +254,8 @@ def get_static_minmax_range(self, act_tensors):
257254
stats_min_max = self.get_minmax_stats(act_tensors)
258255
min_vals, max_vals = [], []
259256
for input_idx, tensor_range in stats_min_max.items():
260-
min_val = tensor_range['min'].mean()
261-
max_val = tensor_range['max'].mean()
257+
min_val = tensor_range['min'].mean(dim=0)
258+
max_val = tensor_range['max'].mean(dim=0)
262259
min_vals.append(min_val)
263260
max_vals.append(max_val)
264261

llmc/compression/quantization/quarot.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ def get_orthogonal_matrix(self):
9696
raise ValueError(f'Unsupported mode {self.mode}')
9797

9898
def block_transform(self, block):
99-
logger.info(f'Start transform the {self.block_idx+1}-th block')
99+
logger.info(f'Start transform the {self.block_idx + 1}-th block')
100100

101101
if self.online_rotate:
102102
self.replace_rotate_linears(block)
@@ -108,7 +108,7 @@ def block_transform(self, block):
108108
gc.collect()
109109

110110
logger.info(f'block:{block}')
111-
logger.info(f'End transform the {self.block_idx+1}-th block')
111+
logger.info(f'End transform the {self.block_idx + 1}-th block')
112112

113113
@torch.no_grad()
114114
def subset_transform(self, block, subset):

0 commit comments

Comments
 (0)