Commit ee351cb

Author: Ronak Mahawar (committed)

Recipes for Llama-3.1-8B & Phi-4-14b-reasoning models
1 parent 23714b4 commit ee351cb

6 files changed

Lines changed: 200 additions & 16 deletions

meta-llama-Llama-3.1-8B-Instruct/QNN/README.md

Lines changed: 5 additions & 6 deletions
@@ -16,23 +16,20 @@ export BUILD_CUDA_EXT=0
 # Windows
 # set BUILD_CUDA_EXT=0

-# Install AutoGPTQ from source
-pip install --no-build-isolation git+https://github.com/PanQiWei/AutoGPTQ.git
-
 # Install GptqModel from source
-pip install --no-build-isolation git+https://github.com/ModelCloud/GPTQModel.git@5d2911a4b2a709afb0941d53c3882d0cd80b9649
+pip install --no-build-isolation git+https://github.com/CodeLinaro/GPTQModel.git@rel_4.2.5
 ```

 ### AOT Compilation Python Environment Setup
 Model compilation using QNN Execution Provider requires a Python environment with onnxruntime-qnn installed. In a separate Python environment, install the required packages:

 ```bash
 # Install Olive
-pip install olive-ai==0.9.3
+pip install olive-ai==0.11.1

 # Install ONNX Runtime QNN
 pip install -r https://raw.githubusercontent.com/microsoft/onnxruntime/refs/heads/main/requirements.txt
-pip install --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple "onnxruntime-qnn==1.22.2" --no-deps
+pip install --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple "onnxruntime-qnn==1.23.2" --no-deps
 ```

 Replace `/path/to/qnn/env/bin` in [config.json](config.json) with the path to the directory containing your QNN environment's Python executable. This path can be found by running the following command in the environment:
@@ -49,6 +46,8 @@ This command will return the path to the Python executable.
 ### Run the Quantization + Compilation Config
 Activate the **Quantization Python Environment** and run the workflow:

+### Change the `soc_model` param in config.json to match the target platform
+
 ```bash
 olive run --config config.json
 ```
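The `/path/to/qnn/env/bin` substitution above can also be scripted. A minimal sketch, assuming the `systems.qnn_system.python_environment_path` key used by the config in this commit, to be run with the QNN environment's own Python from the directory containing config.json:

```python
# Sketch: patch config.json so qnn_system points at the interpreter
# running this script. Run with the QNN environment's Python.
import json
import sys
from pathlib import Path

config_path = Path("config.json")
config = json.loads(config_path.read_text())

# Parent directory of the current executable, i.e. the value that
# `command -v python` (Linux) or `where python` (Windows) would
# report with the trailing file name removed.
env_bin = str(Path(sys.executable).parent)
config["systems"]["qnn_system"]["python_environment_path"] = env_bin

config_path.write_text(json.dumps(config, indent=2))
print(f"python_environment_path set to {env_bin}")
```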

meta-llama-Llama-3.1-8B-Instruct/QNN/config.json

Lines changed: 18 additions & 6 deletions
@@ -33,39 +33,50 @@
   ],
   "passes": {
     "q": { "type": "QuaRot" },
+    "cs": { "type": "CaptureSplitInfo", "num_splits": 4, "unique_embeds_lm_head_splits": true },
     "g": {
       "type": "GptqModel",
       "bits": 4,
       "sym": true,
       "group_size": -1,
-      "lm_head": false,
+      "lm_head": true,
       "device": "cuda",
-      "data_config": "wikitext2_train_joined"
+      "data_config": "wikitext2_train_joined",
+      "dynamic": {
+        "+:.*v_proj*": { "bits": 8, "sym": true, "group_size": -1, "desc_act": true },
+        "+:.*lm_head*": { "bits": 4, "sym": true, "group_size": 32, "desc_act": false }
+      }
     },
-    "cs": { "type": "CaptureSplitInfo", "num_splits": 4, "unique_embeds_lm_head_splits": true },
     "mb": {
       "type": "ModelBuilder",
       "precision": "int4",
       "int4_block_size": 32,
       "int4_accuracy_level": 4,
-      "int4_op_types_to_quantize": [ "MatMul", "Gather" ]
+      "int4_op_types_to_quantize": [ "Gather" ]
     },
     "mq": {
       "type": "MatMulNBitsToQDQ",
       "use_int4": true,
       "add_zero_point": true,
-      "nodes_to_exclude": [ "/lm_head/MatMul_Q4" ],
+      "nodes_to_exclude": [ "/lm_head/MatMulNBits" ],
       "save_as_external_data": true
     },
     "gs": {
       "type": "GraphSurgeries",
       "surgeries": [
         { "surgeon": "RemoveRopeMultiCache" },
         { "surgeon": "AttentionMaskToSequenceLengths" },
+        { "surgeon": "RemoveGidxFromMatMulNBits" },
         { "surgeon": "SimplifiedLayerNormToL2Norm" }
       ],
       "save_as_external_data": true
     },
+    "f16": {
+      "type": "OnnxFloatToFloat16",
+      "op_include_list": [ "GroupQueryAttention" ],
+      "keep_io_types": [ "logits" ],
+      "save_as_external_data": true
+    },
     "sq": {
       "type": "OnnxStaticQuantization",
       "data_config": "wikitext2_train_act",
@@ -74,7 +85,8 @@
       "calibration_providers": [ "CUDAExecutionProvider" ],
       "quant_preprocess": true,
       "op_types_to_exclude": [ "GatherBlockQuantized", "GroupQueryAttention", "MatMulNBits" ],
-      "save_as_external_data": true
+      "save_as_external_data": true,
+      "extra_options": { "CalibStridedMinMax": 1 }
     },
     "sp": { "type": "SplitModel" },
     "st": { "type": "StaticLLM", "batch_size": 1, "context_length": 64 },
Lines changed: 4 additions & 4 deletions
@@ -1,8 +1,8 @@
 datasets
-olive-ai==0.9.3
+olive-ai==0.11.1
 # these are the versions the recipes were last validated with
-onnxruntime-genai-cuda==0.7.1
-onnxruntime-gpu==1.21.1
+onnxruntime-genai-cuda==0.11.2
+onnxruntime-gpu==1.23.2
 optimum
 # newer transformers might have incompatibility with gptq passes
-transformers==4.53.2
+transformers==4.57.3
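Because the recipes are validated only against these exact pins, it can be worth confirming the active environment matches them before starting a long quantization run. A small standard-library sketch covering just the pinned packages above:

```python
# Sketch: verify the active environment matches the pinned versions.
from importlib.metadata import PackageNotFoundError, version

pins = {
    "olive-ai": "0.11.1",
    "onnxruntime-genai-cuda": "0.11.2",
    "onnxruntime-gpu": "1.23.2",
    "transformers": "4.57.3",
}
for pkg, expected in pins.items():
    try:
        installed = version(pkg)
    except PackageNotFoundError:
        print(f"{pkg}: NOT INSTALLED (expected {expected})")
        continue
    status = "ok" if installed == expected else f"MISMATCH (expected {expected})"
    print(f"{pkg}: {installed} {status}")
```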
Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+# Phi-4-reasoning Model Optimization
+
+This repository demonstrates the optimization of the [Phi-4-reasoning](https://huggingface.co/microsoft/Phi-4-reasoning) model using **post-training quantization (PTQ)** techniques.
+
+
+### Quantization Python Environment Setup
+Quantization is resource-intensive and requires GPU acceleration. In an x64 Python environment, install the required packages:
+
+```bash
+pip install -r requirements.txt
+
+# AutoGPTQ: Install from source (stable package may be slow for weight packing)
+# Disable CUDA extension build (not required)
+# Linux
+export BUILD_CUDA_EXT=0
+# Windows
+# set BUILD_CUDA_EXT=0
+
+# Install GptqModel from source
+pip install --no-build-isolation git+https://github.com/CodeLinaro/GPTQModel.git@rel_4.2.5
+```
+
+### AOT Compilation Python Environment Setup
+Model compilation using QNN Execution Provider requires a Python environment with onnxruntime-qnn installed. In a separate Python environment, install the required packages:
+
+```bash
+# Install Olive
+pip install olive-ai==0.11.1
+
+# Install ONNX Runtime QNN
+pip install -r https://raw.githubusercontent.com/microsoft/onnxruntime/refs/heads/main/requirements.txt
+pip install --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple "onnxruntime-qnn==1.23.2" --no-deps
+```
+
+Replace `/path/to/qnn/env/bin` in [config.json](config.json) with the path to the directory containing your QNN environment's Python executable. This path can be found by running the following command in the environment:
+
+```bash
+# Linux
+command -v python
+# Windows
+# where python
+```
+
+This command will return the path to the Python executable. Set the parent directory of the executable as the `/path/to/qnn/env/bin` in the config file.
+
+### Run the Quantization + Compilation Config
+Activate the **Quantization Python Environment** and run the workflow:
+
+### Change the `soc_model` param in config.json to match the target platform
+
+```bash
+olive run --config config.json
+```
+
+Olive will run the AOT compilation step in the **AOT Compilation Python Environment** specified in the config file using a subprocess. All other steps will run in the **Quantization Python Environment** natively.
+
+✅ Optimized model saved in: `models/phi4_reasoning`
+
+> ⚠️ If optimization fails during context binary generation, rerun the command. The process will resume from the last completed step.
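Alongside the `olive run` CLI shown above, the workflow can also be started from Python; a sketch, assuming olive-ai 0.11.x keeps the documented `olive.workflows.run` entry point:

```python
# Sketch: programmatic equivalent of `olive run --config config.json`.
# Assumes Olive's documented olive.workflows.run entry point.
from olive.workflows import run as olive_run

# Accepts a path to the workflow JSON config (or an equivalent dict).
olive_run("config.json")
```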
Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
+{
+  "input_model": { "type": "HfModel", "model_path": "microsoft/Phi-4-reasoning" },
+  "systems": {
+    "qnn_system": {
+      "type": "PythonEnvironment",
+      "python_environment_path": "/path/to/qnn/env/bin",
+      "accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ]
+    }
+  },
+  "data_configs": [
+    {
+      "name": "wikitext2_train_joined",
+      "type": "HuggingfaceContainer",
+      "load_dataset_config": { "data_name": "wikitext", "subset": "wikitext-2-raw-v1", "split": "train" },
+      "pre_process_data_config": {
+        "strategy": "join",
+        "add_special_tokens": false,
+        "max_seq_len": 4096,
+        "max_samples": 128
+      }
+    },
+    {
+      "name": "wikitext2_train_act",
+      "type": "HuggingfaceContainer",
+      "load_dataset_config": { "data_name": "wikitext", "subset": "wikitext-2-raw-v1", "split": "train" },
+      "pre_process_data_config": {
+        "strategy": "line-by-line",
+        "add_special_tokens": true,
+        "max_samples": 256,
+        "max_seq_len": 4096
+      }
+    }
+  ],
+  "passes": {
+    "q": { "type": "QuaRot" },
+    "cs": { "type": "CaptureSplitInfo", "num_splits": 4, "unique_embeds_lm_head_splits": true },
+    "g": {
+      "type": "GptqModel",
+      "bits": 4,
+      "sym": true,
+      "group_size": -1,
+      "lm_head": true,
+      "device": "cuda",
+      "data_config": "wikitext2_train_joined",
+      "dynamic": { "+:.*lm_head*": { "bits": 4, "sym": true, "group_size": 128, "desc_act": false } }
+    },
+    "mb": {
+      "type": "ModelBuilder",
+      "precision": "int4",
+      "int4_block_size": 128,
+      "int4_accuracy_level": 4,
+      "int4_op_types_to_quantize": [ "Gather" ]
+    },
+    "mq": {
+      "type": "MatMulNBitsToQDQ",
+      "use_int4": true,
+      "add_zero_point": true,
+      "nodes_to_exclude": [ "/lm_head/MatMulNBits" ],
+      "save_as_external_data": true
+    },
+    "gs": {
+      "type": "GraphSurgeries",
+      "surgeries": [
+        { "surgeon": "RemoveRopeMultiCache" },
+        { "surgeon": "AttentionMaskToSequenceLengths" },
+        { "surgeon": "RemoveGidxFromMatMulNBits" },
+        { "surgeon": "SimplifiedLayerNormToL2Norm" }
+      ],
+      "save_as_external_data": true
+    },
+    "f16": {
+      "type": "OnnxFloatToFloat16",
+      "op_include_list": [ "GroupQueryAttention" ],
+      "keep_io_types": [ "logits" ],
+      "save_as_external_data": true
+    },
+    "sq": {
+      "type": "OnnxStaticQuantization",
+      "data_config": "wikitext2_train_act",
+      "activation_type": "uint16",
+      "precision": "uint8",
+      "calibration_providers": [ "CUDAExecutionProvider" ],
+      "quant_preprocess": true,
+      "op_types_to_exclude": [ "GatherBlockQuantized", "GroupQueryAttention", "MatMulNBits" ],
+      "save_as_external_data": true,
+      "extra_options": { "CalibStridedMinMax": 1 }
+    },
+    "sp": { "type": "SplitModel" },
+    "st": { "type": "StaticLLM", "batch_size": 1, "context_length": 64 },
+    "cb": {
+      "type": "EPContextBinaryGenerator",
+      "provider_options": {
+        "htp_performance_mode": "burst",
+        "htp_graph_finalization_optimization_mode": "3",
+        "soc_model": "60"
+      },
+      "weight_sharing": true
+    },
+    "cp": { "type": "ComposeOnnxModels" }
+  },
+  "target": "qnn_system",
+  "log_severity_level": 1,
+  "output_dir": "models/phi4_reasoning",
+  "cache_dir": "cache",
+  "no_artifacts": true
+}
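The README's instruction to set `soc_model` for the target platform corresponds to `passes.cb.provider_options.soc_model` in this config (a numeric QNN HTP SoC ID passed as a string; the recipe ships "60"). A small hedged helper sketch for switching targets:

```python
# Sketch: point the EPContextBinaryGenerator pass at a different target SoC.
# soc_model is QNN's numeric HTP SoC ID, passed through as a string;
# "60" is the value shipped in this recipe. Replace with your target's ID.
import json
from pathlib import Path

def set_soc_model(config_file: str, soc_model: str) -> None:
    path = Path(config_file)
    config = json.loads(path.read_text())
    config["passes"]["cb"]["provider_options"]["soc_model"] = soc_model
    path.write_text(json.dumps(config, indent=2))

set_soc_model("config.json", "60")
```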
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+datasets
+olive-ai==0.11.1
+# these are the versions the recipes were last validated with
+onnxruntime-genai-cuda==0.11.2
+onnxruntime-gpu==1.23.2
+optimum
+# newer transformers might have incompatibility with gptq passes
+transformers==4.57.3
