Skip to content

Commit 05bb58a

Browse files
authored
Support QDQ format for weight-only quantization (#35)
## Type of Change feature ## Description Support QDQ format for weight-only quantization It requires: - onnxruntime >= 1.19.0 - opset_version of model >=21 - quantized bits in [4, 8] ## Expected Behavior & Potential Risk the expected behavior that triggered by this PR ## How has this PR been tested? how to reproduce the test (including hardware information) ## Dependency Change? any library dependency introduced or removed --------- Signed-off-by: Mengni Wang <mengni.wang@intel.com> Signed-off-by: Wang, Mengni <mengni.wang@intel.com>
1 parent 71c2484 commit 05bb58a

27 files changed

Lines changed: 933 additions & 697 deletions

File tree

examples/.config/model_params_onnxrt.json

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,15 @@
1818
"batch_size": 1,
1919
"algorithm": "RTN"
2020
},
21+
"llama-2-7b-rtn-with-past-qdq": {
22+
"model_name": "meta-llama/Llama-2-7b-hf",
23+
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
24+
"dataset_location": "",
25+
"input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf-with-past-opset-21",
26+
"main_script": "main.py",
27+
"batch_size": 1,
28+
"algorithm": "RTN"
29+
},
2130
"llama-2-7b-awq": {
2231
"model_name": "meta-llama/Llama-2-7b-hf",
2332
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
@@ -36,6 +45,15 @@
3645
"batch_size": 1,
3746
"algorithm": "AWQ"
3847
},
48+
"llama-2-7b-awq-with-past-qdq": {
49+
"model_name": "meta-llama/Llama-2-7b-hf",
50+
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
51+
"dataset_location": "",
52+
"input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf-with-past-opset-21",
53+
"main_script": "main.py",
54+
"batch_size": 1,
55+
"algorithm": "AWQ"
56+
},
3957
"llama-2-7b-gptq": {
4058
"model_name": "meta-llama/Llama-2-7b-hf",
4159
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
@@ -54,6 +72,15 @@
5472
"batch_size": 1,
5573
"algorithm": "GPTQ"
5674
},
75+
"llama-2-7b-gptq-with-past-qdq": {
76+
"model_name": "meta-llama/Llama-2-7b-hf",
77+
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",
78+
"dataset_location": "",
79+
"input_model": "/tf_dataset2/models/onnx/Llama-2-7b-hf-with-past-opset-21",
80+
"main_script": "main.py",
81+
"batch_size": 1,
82+
"algorithm": "GPTQ"
83+
},
5784
"llama-2-7b-woq_tune": {
5885
"model_name": "meta-llama/Llama-2-7b-hf",
5986
"model_src_dir": "nlp/huggingface_model/text_generation/quantization/weight_only",

examples/nlp/huggingface_model/text_generation/quantization/weight_only/README.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,13 +41,21 @@ python prepare_model.py --input_model="meta-llama/Llama-2-7b-hf" \
4141

4242
Set `algorithm=WOQ_TUNE` to tune the weight-only quantization algorithm automatically, or set `algorithm` to `RTN`, `GPTQ`, or `AWQ` to use a specific one.
4343

44+
`quant_format=QDQ` works only when:
45+
- onnxruntime >= 1.19.0
46+
- opset version of the model >= 21
47+
- the number of quantization bits is in [4, 8]
48+
49+
Otherwise, quantization automatically falls back to the QOperator format.
50+
4451
```bash
4552
bash run_quant.sh --input_model=/path/to/model \ # folder path of onnx model
4653
--output_model=/path/to/model_tune \ # folder path to save onnx model
4754
--batch_size=batch_size # optional \
4855
--dataset=NeelNanda/pile-10k \
4956
--tokenizer=meta-llama/Llama-2-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer
50-
--algorithm=WOQ_TUNE # support WOQ_TUNE, RTN, AWQ, GPTQ
57+
--algorithm=WOQ_TUNE # support WOQ_TUNE, RTN, AWQ, GPTQ \
58+
--quant_format=QDQ # support QOperator and QDQ
5159
```
5260

5361
## 2. Benchmark

examples/nlp/huggingface_model/text_generation/quantization/weight_only/main.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
from torch.utils import data
3535

3636
from onnx_neural_compressor import data_reader
37-
from onnx_neural_compressor.quantization import config, matmul_nbits_quantizer, tuning
37+
from onnx_neural_compressor.quantization import QuantFormat, config, matmul_nbits_quantizer, tuning
3838

3939
logging.basicConfig(
4040
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.WARN
@@ -74,7 +74,8 @@
7474
parser.add_argument(
7575
"--tasks",
7676
nargs="+",
77-
default=[
77+
default=["lambada_openai"],
78+
choices=[
7879
"winogrande",
7980
"copa",
8081
"piqa",
@@ -105,6 +106,7 @@
105106
default=[],
106107
help="nodes that will not be quantized. Doesn't take effect when 'algorithm' is 'WOQ_TUNE'",
107108
)
109+
parser.add_argument("--quant_format", type=str, default="QDQ", choices=["QOperator", "QDQ"])
108110
args = parser.parse_args()
109111

110112
if args.tune and not os.path.exists(args.output_model):
@@ -347,8 +349,11 @@ def rewind(self):
347349

348350
nodes_to_exclude = ["/lm_head/MatMul"] if not args.quantize_lm_head else []
349351
nodes_to_exclude = list(set(args.nodes_to_exclude + nodes_to_exclude))
352+
quant_format = QuantFormat.QOperator if args.quant_format == "QOperator" else QuantFormat.QDQ
350353
if args.algorithm.upper() == "RTN":
351-
algo_config = matmul_nbits_quantizer.RTNWeightOnlyQuantConfig(layer_wise_quant=args.layer_wise)
354+
algo_config = matmul_nbits_quantizer.RTNWeightOnlyQuantConfig(
355+
layer_wise_quant=args.layer_wise, quant_format=quant_format
356+
)
352357
quant = matmul_nbits_quantizer.MatMulNBitsQuantizer(
353358
model_path,
354359
n_bits=4,
@@ -363,7 +368,9 @@ def rewind(self):
363368
elif args.algorithm.upper() == "AWQ":
364369
calibration_data_reader = AWQDataloader(model_path, pad_max=args.pad_max, batch_size=1)
365370
algo_config = matmul_nbits_quantizer.AWQWeightOnlyQuantConfig(
366-
calibration_data_reader=calibration_data_reader, enable_mse_search=False
371+
calibration_data_reader=calibration_data_reader,
372+
enable_mse_search=False,
373+
quant_format=quant_format,
367374
)
368375
quant = matmul_nbits_quantizer.MatMulNBitsQuantizer(
369376
model_path,
@@ -379,7 +386,9 @@ def rewind(self):
379386
elif args.algorithm.upper() == "GPTQ":
380387
calibration_data_reader = GPTQDataloader(model_path, seqlen=args.seqlen, batch_size=1)
381388
algo_config = matmul_nbits_quantizer.GPTQWeightOnlyQuantConfig(
382-
calibration_data_reader=calibration_data_reader, layer_wise_quant=args.layer_wise
389+
calibration_data_reader=calibration_data_reader,
390+
layer_wise_quant=args.layer_wise,
391+
quant_format=quant_format,
383392
)
384393
quant = matmul_nbits_quantizer.MatMulNBitsQuantizer(
385394
model_path,
@@ -395,7 +404,9 @@ def rewind(self):
395404
elif args.algorithm.upper() == "WOQ_TUNE":
396405
calibration_data_reader = GPTQDataloader(model_path, seqlen=args.seqlen, batch_size=1)
397406
# set tolerable_loss to 0.5% for test, default is 1%
398-
custom_tune_config = tuning.TuningConfig(config_set=config.get_woq_tuning_config(), tolerable_loss=0.005)
407+
custom_tune_config = tuning.TuningConfig(
408+
config_set=config.get_woq_tuning_config(quant_format=quant_format), tolerable_loss=0.005
409+
)
399410
best_model = tuning.autotune(
400411
model_input=model_path,
401412
tune_config=custom_tune_config,

examples/nlp/huggingface_model/text_generation/quantization/weight_only/run_benchmark.sh

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -14,19 +14,19 @@ function init_params {
1414
do
1515
case $var in
1616
--input_model=*)
17-
input_model=$(echo $var |cut -f2 -d=)
17+
input_model=$(echo "$var" |cut -f2 -d=)
1818
;;
1919
--batch_size=*)
20-
batch_size=$(echo $var |cut -f2 -d=)
20+
batch_size=$(echo "$var" |cut -f2 -d=)
2121
;;
2222
--tokenizer=*)
23-
tokenizer=$(echo $var |cut -f2 -d=)
23+
tokenizer=$(echo "$var" |cut -f2 -d=)
2424
;;
2525
--mode=*)
26-
mode=$(echo $var |cut -f2 -d=)
26+
mode=$(echo "$var" |cut -f2 -d=)
2727
;;
2828
--intra_op_num_threads=*)
29-
intra_op_num_threads=$(echo $var |cut -f2 -d=)
29+
intra_op_num_threads=$(echo "$var" |cut -f2 -d=)
3030
;;
3131
esac
3232
done
@@ -42,19 +42,27 @@ function run_benchmark {
4242
input_model=$(dirname "$input_model")
4343
fi
4444

45+
extra_cmd=""
46+
4547
if [[ "${tokenizer}" =~ "Phi-3-mini" ]]; then
46-
extra_cmd="--trust_remote_code True"
48+
extra_cmd=$extra_cmd"--trust_remote_code True "
49+
fi
50+
51+
if [ "${batch_size}" ]; then
52+
extra_cmd=$extra_cmd"--batch_size ${batch_size} "
53+
fi
54+
if [ "${tokenizer}" ]; then
55+
extra_cmd=$extra_cmd"--tokenizer ${tokenizer} "
56+
fi
57+
if [ "${tasks}" ]; then
58+
extra_cmd=$extra_cmd"--tasks ${tasks} "
59+
fi
60+
if [ "${intra_op_num_threads}" ]; then
61+
extra_cmd=$extra_cmd"--intra_op_num_threads ${intra_op_num_threads} "
4762
fi
4863

49-
eval "python main.py \
50-
--model_path ${input_model} \
51-
--batch_size=${batch_size-1} \
52-
--tokenizer=${tokenizer-meta-llama/Llama-2-7b-hf} \
53-
--tasks=${tasks-lambada_openai} \
54-
--mode=${mode} \
55-
--intra_op_num_threads=${intra_op_num_threads-24} \
56-
--benchmark \
57-
${extra_cmd}"
64+
extra_cmd=$extra_cmd"--benchmark"
65+
eval "python main.py --model_path ${input_model} --mode ${mode} ${extra_cmd}"
5866

5967
}
6068

examples/nlp/huggingface_model/text_generation/quantization/weight_only/run_quant.sh

Lines changed: 35 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -12,22 +12,25 @@ function init_params {
1212
do
1313
case $var in
1414
--input_model=*)
15-
input_model=$(echo $var |cut -f2 -d=)
15+
input_model=$(echo "$var" |cut -f2 -d=)
1616
;;
1717
--output_model=*)
18-
output_model=$(echo $var |cut -f2 -d=)
18+
output_model=$(echo "$var" |cut -f2 -d=)
1919
;;
2020
--batch_size=*)
21-
batch_size=$(echo $var |cut -f2 -d=)
21+
batch_size=$(echo "$var" |cut -f2 -d=)
2222
;;
2323
--dataset=*)
24-
dataset=$(echo $var |cut -f2 -d=)
24+
dataset=$(echo "$var" |cut -f2 -d=)
2525
;;
2626
--tokenizer=*)
27-
tokenizer=$(echo $var |cut -f2 -d=)
27+
tokenizer=$(echo "$var" |cut -f2 -d=)
2828
;;
2929
--algorithm=*)
30-
algorithm=$(echo $var |cut -f2 -d=)
30+
algorithm=$(echo "$var" |cut -f2 -d=)
31+
;;
32+
--quant_format=*)
33+
quant_format=$(echo "$var" |cut -f2 -d=)
3134
;;
3235
esac
3336
done
@@ -56,30 +59,42 @@ function run_tuning {
5659
echo "Created directory $output_model"
5760
fi
5861

62+
extra_cmd=""
63+
5964
if [[ "${tokenizer}" =~ "Phi-3-mini" ]]; then
6065
nodes_to_exclude="/model/layers.*/self_attn/qkv_proj/MatMul /model/layers.*/mlp/down_proj/MatMul"
61-
extra_cmd="--nodes_to_exclude ${nodes_to_exclude} --trust_remote_code True"
66+
extra_cmd=$extra_cmd"--nodes_to_exclude ${nodes_to_exclude} --trust_remote_code True "
6267
fi
6368
if [[ "${tokenizer}" =~ "Llama-3-8B" ]]; then
6469
nodes_to_exclude="/model/layers.*/mlp/down_proj/MatMul"
65-
extra_cmd="--nodes_to_exclude ${nodes_to_exclude}"
70+
extra_cmd=$extra_cmd"--nodes_to_exclude ${nodes_to_exclude} "
6671
fi
6772
if [[ "${tokenizer}" =~ "Qwen2-7B" ]]; then
6873
nodes_to_exclude="/model/layers.*/mlp/down_proj/MatMul /model/layers.*/mlp/up_proj/MatMul"
69-
extra_cmd="--nodes_to_exclude ${nodes_to_exclude}"
74+
extra_cmd=$extra_cmd"--nodes_to_exclude ${nodes_to_exclude} "
75+
fi
76+
77+
if [ "${tokenizer}" ]; then
78+
extra_cmd=$extra_cmd"--tokenizer ${tokenizer} "
79+
fi
80+
if [ "${batch_size}" ]; then
81+
extra_cmd=$extra_cmd"--batch_size ${batch_size} "
82+
fi
83+
if [ "${dataset}" ]; then
84+
extra_cmd=$extra_cmd"--dataset ${dataset} "
85+
fi
86+
if [ "${algorithm}" ]; then
87+
extra_cmd=$extra_cmd"--algorithm ${algorithm} "
88+
fi
89+
if [ "${tasks}" ]; then
90+
extra_cmd=$extra_cmd"--tasks ${tasks} "
91+
fi
92+
if [ "${quant_format}" ]; then
93+
extra_cmd=$extra_cmd"--quant_format ${quant_format} "
7094
fi
7195

72-
eval "python main.py \
73-
--model_path ${input_model} \
74-
--tokenizer ${tokenizer-meta-llama/Llama-2-7b-hf} \
75-
--output_model ${output_model} \
76-
--batch_size ${batch_size-1} \
77-
--dataset ${dataset-NeelNanda/pile-10k} \
78-
--algorithm ${algorithm-WOQ_TUNE} \
79-
--tasks ${tasks-lambada_openai} \
80-
--layer_wise \
81-
--tune \
82-
${extra_cmd}"
96+
extra_cmd=$extra_cmd"--layer_wise --tune"
97+
eval "python main.py --model_path ${input_model} --output_model ${output_model} ${extra_cmd}"
8398
}
8499

85100
main "$@"

onnx_neural_compressor/algorithms/layer_wise/core.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import copy
1919
import os
2020
import pathlib
21+
import tempfile
2122

2223
import onnx
2324
import onnxruntime as ort
@@ -60,7 +61,7 @@ def layer_wise_quant(
6061
model = onnx_model.ONNXModel(model, ignore_warning=True, load_external_data=False)
6162

6263
origin_model = copy.deepcopy(model)
63-
64+
tmp_file = tempfile.TemporaryDirectory()
6465
providers = kwargs.get("providers", ["CPUExecutionProvider"])
6566

6667
# get and check split nodes
@@ -97,7 +98,7 @@ def layer_wise_quant(
9798

9899
# split model with given split node
99100
split_model_part_1, split_model_part_2 = split_model.split_model_with_node(
100-
split_node.name, model.model_path, save_both_split_models
101+
split_node.name, model.model_path, save_both_split_models, save_path=tmp_file.name
101102
)
102103

103104
if not save_both_split_models:
@@ -201,6 +202,8 @@ def layer_wise_quant(
201202
onnx.external_data_helper.load_external_data_for_model(
202203
quantized_model_merged.model, os.path.dirname(quantized_model_merged.model_path)
203204
)
205+
206+
tmp_file.cleanup()
204207
return quantized_model_merged
205208

206209

0 commit comments

Comments
 (0)