-
Notifications
You must be signed in to change notification settings - Fork 80
Expand file tree
/
Copy pathexport_vllm.py
More file actions
executable file
·126 lines (119 loc) · 4.67 KB
/
export_vllm.py
File metadata and controls
executable file
·126 lines (119 loc) · 4.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import json
def _store_quant_config(save_quant_path, key, quant_config, drop_key=None):
    """Store ``quant_config`` under ``key`` in ``<save_quant_path>/config.json``.

    The file is read, patched in memory and written back with a 4-space
    indent. When ``drop_key`` is given, that top-level entry is removed (if
    present) before ``key`` is written.
    """
    config_file = save_quant_path + '/config.json'
    with open(config_file, 'r', encoding='utf-8') as file:
        config_vllm = json.load(file)
    if drop_key is not None:
        config_vllm.pop(drop_key, None)
    config_vllm[key] = quant_config
    with open(config_file, 'w', encoding='utf-8') as file:
        json.dump(config_vllm, file, indent=4)


def update_vllm_quant_config(
    model,
    config,
    save_quant_path,
    vllm_quant_method='compressed-tensors',
):
    """Write the quantization settings into the exported ``config.json``.

    Depending on ``config.quant`` one of three layouts is written:

    * float-quant activations with ``static`` set: an fp8 ``static``
      ``quantization_config``.
    * float-quant activations with ``per_block`` weight granularity: an fp8
      ``dynamic`` ``quantization_config`` carrying ``weight_block_size``.
    * everything else: a ``compression_config`` with a single ``group_0``
      targeting ``Linear`` layers.

    Args:
        model: Object providing ``skip_layer_name()``; its result is written
            as the ignore list.
        config: Configuration whose ``quant.weight`` (and optional
            ``quant.act``) sections support attribute access, ``.get`` and
            ``in``.
        save_quant_path: Directory that already contains ``config.json``.
        vllm_quant_method: Value stored as ``quant_method`` in the
            ``compression_config`` layout.

    Raises:
        AssertionError: If the activation and weight ``quant_type`` differ.
    """
    weight_cfg = config.quant.weight
    has_act = 'act' in config.quant

    need_pack = weight_cfg.get('need_pack', False)
    weight_quant_type = weight_cfg.get('quant_type', 'int-quant')
    if has_act:
        act_quant_type = config.quant.act.get('quant_type', 'int-quant')
        assert act_quant_type == weight_quant_type, (
            'act and weight quant_type must match'
        )
    else:
        act_quant_type = None

    a_num_bits = None
    if act_quant_type == 'float-quant':
        if config.quant.act.get('static', False):
            # NOTE(review): skip_layer_name() is wrapped in a list here but
            # passed bare as 'ignore' in the compression_config layout below;
            # confirm which shape the consumer expects.
            _store_quant_config(
                save_quant_path,
                'quantization_config',
                {
                    'activation_scheme': 'static',
                    'ignored_layers': [
                        model.skip_layer_name()
                    ],
                    'quant_method': 'fp8',
                },
            )
            return
        if weight_cfg.get('granularity') == 'per_block':
            _store_quant_config(
                save_quant_path,
                'quantization_config',
                {
                    'activation_scheme': 'dynamic',
                    'fmt': 'e4m3',
                    'quant_method': 'fp8',
                    'weight_block_size': [
                        weight_cfg.block_size,
                        weight_cfg.block_size,
                    ],
                },
            )
            return
        vllm_quant_format = 'float-quantized'
        quant_type = 'float'
        w_num_bits = 8
        a_num_bits = 8
    elif need_pack:
        vllm_quant_format = 'pack-quantized'
        quant_type = 'int'
        w_num_bits = weight_cfg.bit
    elif weight_quant_type == 'float-quant':
        vllm_quant_format = 'float-quantized'
        quant_type = 'float'
        w_num_bits = 8
    else:
        vllm_quant_format = 'int-quantized'
        quant_type = 'int'
        w_num_bits = weight_cfg.bit

    # Every integer path needs the activation width when activations are
    # quantized; previously the packed path left it unbound (NameError).
    if quant_type == 'int' and has_act:
        a_num_bits = config.quant.act.bit

    per_group = weight_cfg.granularity == 'per_group'
    group_size = weight_cfg.group_size if per_group else None

    if has_act and 'static' in config.quant.act:
        dynamic = not config.quant.act.static
    else:
        dynamic = True

    if has_act:
        act_cfg = config.quant.act
        input_activations = {
            'dynamic': dynamic,
            'group_size': None,  # Don't support activations per-group quant.
            'num_bits': a_num_bits,
            'observer': 'minmax',
            'observer_kwargs': {},
            'strategy': (
                'token' if act_cfg.granularity == 'per_token' else 'tensor'
            ),
            'symmetric': act_cfg.symmetric,
            'type': quant_type,
        }
    else:
        input_activations = None

    quant_config = {
        'config_groups': {
            'group_0': {
                'targets': ['Linear'],  # Now only support "Linear".
                'input_activations': input_activations,
                'weights': {
                    'dynamic': False,
                    'group_size': group_size,
                    'num_bits': w_num_bits,
                    'observer': 'minmax',  # Now only support "minmax".
                    'observer_kwargs': {},
                    'strategy': 'group' if per_group else 'channel',
                    'symmetric': weight_cfg.symmetric,
                    'type': quant_type,
                },
            }
        },
        'format': vllm_quant_format,
        'ignore': model.skip_layer_name(),
        'quant_method': vllm_quant_method,
    }

    # For int-quant exports any existing 'quantization_config' entry is
    # removed before 'compression_config' is written.
    _store_quant_config(
        save_quant_path,
        'compression_config',
        quant_config,
        drop_key=(
            'quantization_config' if weight_quant_type == 'int-quant' else None
        ),
    )