Skip to content

Commit 36026bf

Browse files
author
gushiqiao
committed
Reconstruct mix precision.
1 parent e762736 commit 36026bf

10 files changed

Lines changed: 205 additions & 322 deletions

File tree

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
base:
2+
seed: &seed 42
3+
model:
4+
type: DeepseekV2
5+
path: /path/to/DeepseekV2
6+
torch_dtype: auto
7+
calib:
8+
name: pileval
9+
download: False
10+
path: /path/to/pileval
11+
n_samples: 128
12+
bs: -1
13+
seq_len: 512
14+
preproc: pileval_awq
15+
seed: *seed
16+
eval:
17+
eval_pos: [fake_quant]
18+
name: wikitext2
19+
download: False
20+
path: /path/to/wikitext2
21+
seq_len: 2048
22+
bs: 1
23+
inference_per_block: False
24+
quant:
25+
method: Awq
26+
weight:
27+
bit: 8
28+
symmetric: True
29+
granularity: per_channel
30+
group_size: -1
31+
act:
32+
bit: 8
33+
symmetric: True
34+
granularity: per_token
35+
special:
36+
trans: True
37+
trans_version: v2
38+
weight_clip: False
39+
clip_sym: True
40+
ignored_layers:
41+
# block_ids and layer_names together determine which layers use high precision (such as bf16 or fp16) for computation.
42+
# For example, '4' and 'self_attn.q_proj' mean that the model.layers.4.self_attn.q_proj layer uses high precision,
43+
# while '15-23' and 'self_attn.kv_b_proj' mean that self_attn.kv_b_proj in blocks 15 to 23 is not quantized.
44+
block_ids: [4, 5, 6, 15-23]
45+
layer_names: ["self_attn.q_proj", "self_attn.kv_a_proj_with_mqa", "self_attn.kv_b_proj", "self_attn.o_proj"]
46+
# You can also specify certain layers for high precision computation using speical_names,
47+
# but you must provide the full name of the layer
48+
speical_names: ["model.layers.0.mlp.down_proj"]
49+
save:
50+
save_vllm: False
51+
save_fake: False
52+
save_path: /path/to/save/
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
base:
2+
seed: &seed 42
3+
model:
4+
type: DeepseekV2
5+
path: /path/to/DeepseekV2
6+
torch_dtype: auto
7+
calib:
8+
name: pileval
9+
download: False
10+
path: /path/to/pileval
11+
n_samples: 128
12+
bs: -1
13+
seq_len: 512
14+
preproc: pileval_awq
15+
seed: *seed
16+
eval:
17+
eval_pos: [fake_quant]
18+
name: wikitext2
19+
download: False
20+
path: /path/to/wikitext2
21+
seq_len: 2048
22+
bs: 1
23+
inference_per_block: False
24+
quant:
25+
method: Awq
26+
weight:
27+
bit: 8
28+
symmetric: True
29+
granularity: per_channel
30+
group_size: -1
31+
act:
32+
bit: 8
33+
symmetric: True
34+
granularity: per_tensor
35+
static: True
36+
calib_algo: static_hist
37+
special:
38+
trans: True
39+
trans_version: v2
40+
weight_clip: False
41+
clip_sym: True
42+
ignored_layers:
43+
block_ids: [0-26]
44+
layer_names: ["self_attn.q_proj", "self_attn.kv_a_proj_with_mqa", "self_attn.kv_b_proj", "self_attn.o_proj"]
45+
speical_names: []
46+
save:
47+
save_vllm: False
48+
save_fake: False
49+
save_path: /path/to/save/
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
base:
2+
seed: &seed 42
3+
model:
4+
type: DeepseekV2
5+
path: /path/to/DeepseekV2
6+
torch_dtype: auto
7+
eval:
8+
eval_pos: [pretrain, fake_quant]
9+
name: wikitext2
10+
download: False
11+
path: /path/to/wikitext2
12+
seq_len: 2048
13+
bs: 1
14+
inference_per_block: False
15+
quant:
16+
method: RTN
17+
weight:
18+
bit: 8
19+
symmetric: True
20+
granularity: per_channel
21+
group_size: -1
22+
act:
23+
bit: 8
24+
symmetric: True
25+
granularity: per_token
26+
ignored_layers:
27+
# block_ids and layer_names together determine which layers use high precision (such as bf16 or fp16) for computation.
28+
# For example, '4' and 'self_attn.q_proj' mean that the model.layers.4.self_attn.q_proj layer uses high precision,
29+
# while '15-23' and 'self_attn.kv_b_proj' mean that self_attn.kv_b_proj in blocks 15 to 23 is not quantized.
30+
block_ids: [4, 5, 6, 15-23]
31+
layer_names: ["self_attn.q_proj", "self_attn.kv_a_proj_with_mqa", "self_attn.kv_b_proj", "self_attn.o_proj"]
32+
# You can also specify certain layers for high precision computation using speical_names,
33+
# but you must provide the full name of the layer
34+
speical_names: ["model.layers.0.mlp.down_proj"]
35+
save:
36+
save_vllm: False
37+
save_fake: False
38+
save_path: /path/to/save/
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
base:
2+
seed: &seed 42
3+
model:
4+
type: DeepseekV2
5+
path: /path/to/DeepseekV2
6+
torch_dtype: auto
7+
calib:
8+
name: pileval
9+
download: False
10+
path: /path/to/pileval
11+
n_samples: 128
12+
bs: 1
13+
seq_len: 2048
14+
preproc: txt_general_preproc
15+
seed: *seed
16+
eval:
17+
eval_pos: [fake_quant]
18+
name: wikitext2
19+
download: False
20+
path: /path/to/wikitext2
21+
seq_len: 2048
22+
bs: 1
23+
inference_per_block: False
24+
quant:
25+
method: RTN
26+
weight:
27+
bit: 8
28+
symmetric: True
29+
granularity: per_channel
30+
group_size: -1
31+
act:
32+
bit: 8
33+
symmetric: True
34+
granularity: per_tensor
35+
static: True
36+
calib_algo: static_hist
37+
ignored_layers:
38+
block_ids: [0-26]
39+
layer_names: ["self_attn.q_proj", "self_attn.kv_a_proj_with_mqa", "self_attn.kv_b_proj", "self_attn.o_proj"]
40+
speical_names: []
41+
save:
42+
save_vllm: False
43+
save_fake: False
44+
save_path: /path/to/save/

llmc/compression/quantization/auto_clip.py

Lines changed: 8 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,7 @@
66
from loguru import logger
77

88
from .module_utils import _LLMC_LINEAR_TYPES_, _TRANSFORMERS_LINEAR_TYPES_
9-
from .utils import (check_do_quant, check_w_only, get_aquantizer,
10-
get_wquantizer, is_fp8_supported_gpu)
9+
from .utils import is_fp8_supported_gpu
1110

1211
if is_fp8_supported_gpu():
1312
from .kernel import weight_cast_to_bf16, weight_cast_to_fp8
@@ -21,17 +20,13 @@ class AutoClipper:
2120
def __init__(
2221
self,
2322
w_only,
24-
mix_bits_map,
25-
quantizer_mix_bits,
2623
wquantizer,
2724
aquantizer,
2825
clip_version,
2926
clip_sym,
3027
save_clip,
3128
padding_mask,
3229
):
33-
self.mix_bits_map = mix_bits_map
34-
self.quantizer_mix_bits = quantizer_mix_bits
3530
self.wquantizer = wquantizer
3631
self.aquantizer = aquantizer
3732
self.clip_version = clip_version
@@ -45,14 +40,6 @@ def __init__(
4540
@torch.no_grad()
4641
def run(self, block, block_idx, input_feat, n_sample_token):
4742
for n, m in block.named_modules():
48-
if not check_do_quant(
49-
block_idx, n, self.mix_bits_map, self.quantizer_mix_bits
50-
):
51-
logger.info(
52-
f'This layer {n} in {block_idx}-th block is set to float.'
53-
f'No need to clip this layer.'
54-
)
55-
continue
5643
if isinstance(m, tuple(_LLMC_LINEAR_TYPES_ + _TRANSFORMERS_LINEAR_TYPES_)):
5744
if m.weight.data.dtype == torch.float8_e4m3fn:
5845
is_fp8_weight = True
@@ -105,15 +92,8 @@ def auto_clip_layer(
10592

10693
assert w.dim() == 2
10794

108-
wquantizer = get_wquantizer(
109-
block_idx,
110-
layer_name,
111-
self.mix_bits_map,
112-
self.quantizer_mix_bits,
113-
self.wquantizer,
114-
)
115-
if wquantizer.granularity == 'per_group':
116-
group_size = wquantizer.group_size
95+
if self.wquantizer.granularity == 'per_group':
96+
group_size = self.wquantizer.group_size
11797
else:
11898
group_size = w.shape[1]
11999

@@ -143,13 +123,7 @@ def auto_clip_layer(
143123
org_out_dict = {}
144124
for i_s in range(int(max_shrink * n_grid)):
145125
if i_s == 0:
146-
if self.clip_version == 'v2' and not check_w_only(
147-
block_idx,
148-
layer_name,
149-
self.mix_bits_map,
150-
self.quantizer_mix_bits,
151-
self.w_only,
152-
):
126+
if self.clip_version == 'v2' and not self.w_only:
153127
i_s += eps
154128
err_mean = 0
155129
for i in range(len(inputs)):
@@ -254,15 +228,8 @@ def apply_clip(self, block_idx, layer, min_val, max_val, layer_name):
254228
raise Exception('Not support other clip version')
255229

256230
def get_clip_factor(self, block_idx, layer, min_val, max_val, layer_name):
257-
wquantizer = get_wquantizer(
258-
block_idx,
259-
layer_name,
260-
self.mix_bits_map,
261-
self.quantizer_mix_bits,
262-
self.wquantizer,
263-
)
264-
org_min_val, org_max_val = wquantizer.get_minmax_range(
265-
wquantizer.reshape_tensor(layer.weight.data)
231+
org_min_val, org_max_val = self.wquantizer.get_minmax_range(
232+
self.wquantizer.reshape_tensor(layer.weight.data)
266233
)
267234
org_val_shape = org_max_val.shape
268235

@@ -304,20 +271,8 @@ def fake_quantize_weight(self, w, min_val, max_val, org_min_val, org_max_val):
304271
return q_w
305272

306273
def fake_quantize_input(self, block_idx, x, layer_name):
307-
if not check_w_only(
308-
block_idx,
309-
layer_name,
310-
self.mix_bits_map,
311-
self.quantizer_mix_bits,
312-
self.w_only,
313-
):
314-
q_x = get_aquantizer(
315-
block_idx,
316-
layer_name,
317-
self.mix_bits_map,
318-
self.quantizer_mix_bits,
319-
self.aquantizer,
320-
).fake_quant_act_dynamic(x)
274+
if not self.w_only:
275+
q_x = self.aquantizer.fake_quant_act_dynamic(x)
321276
else:
322277
q_x = x
323278
return q_x

0 commit comments

Comments
 (0)