LightCompress/llmc/utils/export_vllm.py at 63e1825fb2689f2fe043a6e48e48adc1a2a5df69 · ModelTC/LightCompress · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import json


def _rewrite_model_config(save_quant_path, key, quant_config, drop_key=None):
    """Store ``quant_config`` under ``key`` in ``<save_quant_path>/config.json``.

    The file must already exist: it is loaded, updated in memory and written
    back with a 4-space indent. When ``drop_key`` is given, that top-level
    entry is removed first (a missing entry is not an error).
    """
    config_file = save_quant_path + '/config.json'
    with open(config_file, 'r') as file:
        config_vllm = json.load(file)
    if drop_key is not None:
        config_vllm.pop(drop_key, None)
    config_vllm[key] = quant_config
    with open(config_file, 'w') as file:
        json.dump(config_vllm, file, indent=4)


def update_vllm_quant_config(
    model,
    config,
    save_quant_path,
    vllm_quant_method='compressed-tensors',
):
    """Write the quantization settings into the exported model's config.json.

    Depending on ``config.quant`` one of two layouts is written:

    * activations are ``'float-quant'`` and either static or the weights use
      ``'per_block'`` granularity: a ``quantization_config`` entry with
      ``quant_method: 'fp8'``;
    * every other case: a ``compression_config`` entry whose ``format`` is
      ``'float-quantized'``, ``'pack-quantized'`` or ``'int-quantized'`` and
      whose ``quant_method`` is ``vllm_quant_method``.

    Args:
        model: Object exposing ``skip_layer_name()``; its return value is
            written as the list of layers to leave unquantized.
        config: Nested mapping with attribute access. ``config.quant.weight``
            is required, ``config.quant.act`` is optional.
        save_quant_path: Directory that already contains ``config.json``.
        vllm_quant_method: Value stored as ``quant_method`` in the
            ``compression_config`` layout.

    Raises:
        AssertionError: If activation and weight ``quant_type`` differ.
    """
    weight_cfg = config.quant.weight
    has_act = 'act' in config.quant

    need_pack = weight_cfg.get('need_pack', False)
    weight_quant_type = weight_cfg.get('quant_type', 'int-quant')
    if has_act:
        act_quant_type = config.quant.act.get('quant_type', 'int-quant')
        assert act_quant_type == weight_quant_type, (
            'activation and weight quant_type must match, got '
            f'{act_quant_type!r} and {weight_quant_type!r}'
        )
    else:
        act_quant_type = None

    # Always bound, so that every branch below can safely build the
    # ``input_activations`` section (previously the packed-int branch left
    # it undefined and crashed when activations were quantized as well).
    a_num_bits = None

    if act_quant_type == 'float-quant':
        if config.quant.act.get('static', False):
            # NOTE(review): the return value of skip_layer_name() is wrapped
            # in a list here but passed as-is for 'ignore' further down;
            # confirm which shape the consumer of 'ignored_layers' expects.
            _rewrite_model_config(
                save_quant_path,
                'quantization_config',
                {
                    'activation_scheme': 'static',
                    'ignored_layers': [
                        model.skip_layer_name()
                    ],
                    'quant_method': 'fp8'
                },
            )
            return
        if weight_cfg.get('granularity') == 'per_block':
            _rewrite_model_config(
                save_quant_path,
                'quantization_config',
                {
                    'activation_scheme': 'dynamic',
                    'fmt': 'e4m3',
                    'quant_method': 'fp8',
                    'weight_block_size': [
                        weight_cfg.block_size,
                        weight_cfg.block_size
                    ]
                },
            )
            return
        vllm_quant_format = 'float-quantized'
        quant_type = 'float'
        w_num_bits = 8
        a_num_bits = 8
    elif need_pack:
        vllm_quant_format = 'pack-quantized'
        quant_type = 'int'
        w_num_bits = weight_cfg.bit
        if has_act:
            a_num_bits = config.quant.act.bit
    elif weight_quant_type == 'float-quant':
        # Only reachable without an 'act' section (otherwise the quant types
        # would match and the first branch would have been taken).
        vllm_quant_format = 'float-quantized'
        quant_type = 'float'
        w_num_bits = 8
    else:
        vllm_quant_format = 'int-quantized'
        quant_type = 'int'
        w_num_bits = weight_cfg.bit
        if has_act:
            a_num_bits = config.quant.act.bit

    per_group = weight_cfg.granularity == 'per_group'
    group_size = weight_cfg.group_size if per_group else None

    # Activations are dynamic unless the config explicitly marks them static.
    if has_act and 'static' in config.quant.act:
        dynamic = not config.quant.act.static
    else:
        dynamic = True

    if has_act:
        act_cfg = config.quant.act
        input_activations = {
            'dynamic': dynamic,
            'group_size': None,   # Don't support activations per-group quant.
            'num_bits': a_num_bits,
            'observer': 'minmax',
            'observer_kwargs': {},
            'strategy': 'token' if act_cfg.granularity == 'per_token' else 'tensor',
            'symmetric': act_cfg.symmetric,
            'type': quant_type
        }
    else:
        input_activations = None

    quant_config = {
        'config_groups': {
            'group_0': {
                'targets': ['Linear'],  # Now only support "Linear".
                'input_activations': input_activations,
                'weights': {
                    'dynamic': False,
                    'group_size': group_size,
                    'num_bits': w_num_bits,
                    'observer': 'minmax',  # Now only support "minmax".
                    'observer_kwargs': {},
                    'strategy': 'group' if per_group else 'channel',
                    'symmetric': weight_cfg.symmetric,
                    'type': quant_type,
                },
            }
        },
        'format': vllm_quant_format,
        'ignore': model.skip_layer_name(),
        'quant_method': vllm_quant_method,
    }

    # For integer quantization a pre-existing 'quantization_config' entry is
    # dropped before 'compression_config' is written.
    _rewrite_model_config(
        save_quant_path,
        'compression_config',
        quant_config,
        drop_key='quantization_config' if weight_quant_type == 'int-quant' else None,
    )