Skip to content

Commit b7542ee

Browse files
author
Trishansh Bhardwaj
committed
Adds QNN recipe for Phi-4-mini-reasoning
This recipe quantizes the model weights to 8-bit.
1 parent b61d799 commit b7542ee

2 files changed

Lines changed: 208 additions & 0 deletions

File tree

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
{
2+
"input_model": { "type": "HfModel", "model_path": "microsoft/Phi-4-mini-reasoning" },
3+
"systems": {
4+
"qnn_system": {
5+
"type": "PythonEnvironment",
6+
"python_environment_path": "/prj/qct/lv/computeai-scratch/trishansh/MY_HOME/.pyenv/versions/3.11.13/envs/qnn/bin/",
7+
"accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ]
8+
}
9+
},
10+
"data_configs": [
11+
{
12+
"name": "wikitext2_train_joined",
13+
"type": "HuggingfaceContainer",
14+
"load_dataset_config": { "data_name": "wikitext", "subset": "wikitext-2-raw-v1", "split": "train" },
15+
"pre_process_data_config": {
16+
"strategy": "join",
17+
"add_special_tokens": false,
18+
"max_seq_len": 4096,
19+
"max_samples": 128
20+
}
21+
},
22+
{
23+
"name": "wikitext2_train_act",
24+
"type": "HuggingfaceContainer",
25+
"load_dataset_config": { "data_name": "wikitext", "subset": "wikitext-2-raw-v1", "split": "train" },
26+
"pre_process_data_config": {
27+
"strategy": "line-by-line",
28+
"add_special_tokens": true,
29+
"max_samples": 256,
30+
"max_seq_len": 2048
31+
}
32+
}
33+
],
34+
"passes": {
35+
"q": { "type": "QuaRot" },
36+
"g": {
37+
"type": "GptqModel",
38+
"bits": 8,
39+
"sym": true,
40+
"group_size": -1,
41+
"lm_head": true,
42+
"device": "cuda",
43+
"data_config": "wikitext2_train_joined",
44+
"dynamic": {
45+
"+:.*lm_head*": {"bits": 8, "sym": true, "group_size": 32, "desc_act": false}
46+
}
47+
},
48+
"cs": { "type": "CaptureSplitInfo", "num_splits": 2, "unique_embeds_lm_head_splits": true },
49+
"mb": {
50+
"type": "ModelBuilder",
51+
"precision": "int4",
52+
"int4_block_size": 32,
53+
"int4_accuracy_level": 4,
54+
"int4_op_types_to_quantize": [ "Gather" ]
55+
},
56+
"mq": {
57+
"type": "MatMulNBitsToQDQ",
58+
"use_int4": true,
59+
"add_zero_point": true,
60+
"nodes_to_exclude": [ "/lm_head/MatMul_Q4", "/lm_head/MatMulNBits" ],
61+
"save_as_external_data": true
62+
},
63+
"gs": {
64+
"type": "GraphSurgeries",
65+
"surgeries": [
66+
{ "surgeon": "RemoveRopeMultiCache" },
67+
{ "surgeon": "AttentionMaskToSequenceLengths" },
68+
{ "surgeon": "RemoveGidxFromMatMulNBits" },
69+
{ "surgeon": "SimplifiedLayerNormToL2Norm" }
70+
],
71+
"save_as_external_data": true
72+
},
73+
"sq": {
74+
"type": "OnnxStaticQuantization",
75+
"data_config": "wikitext2_train_act",
76+
"activation_type": "uint16",
77+
"precision": "uint8",
78+
"calibration_providers": [ "CUDAExecutionProvider" ],
79+
"quant_preprocess": true,
80+
"op_types_to_exclude": [ "GatherBlockQuantized", "GroupQueryAttention", "MatMulNBits" ],
81+
"save_as_external_data": true
82+
},
83+
"sp": { "type": "SplitModel" },
84+
"st": { "type": "StaticLLM", "batch_size": 1, "context_length": 64 },
85+
"cb": {
86+
"type": "EPContextBinaryGenerator",
87+
"provider_options": {
88+
"htp_performance_mode": "burst",
89+
"htp_graph_finalization_optimization_mode": "3",
90+
"soc_model": "60"
91+
},
92+
"session_options": { "intra_op_num_threads": 2, "inter_op_num_threads": 1 },
93+
"weight_sharing": true
94+
},
95+
"cp": { "type": "ComposeOnnxModels" }
96+
},
97+
"target": "qnn_system",
98+
"log_severity_level": 0,
99+
"output_dir": "phi-4-mini-reasoning_1",
100+
"cache_dir": "cache",
101+
"no_artifacts": true
102+
}
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
accelerate==1.13.0
2+
aiohappyeyeballs==2.6.1
3+
aiohttp==3.13.3
4+
aiosignal==1.4.0
5+
alembic==1.18.4
6+
annotated-types==0.7.0
7+
anyio==4.12.1
8+
attrs==26.1.0
9+
autopep8==2.3.2
10+
certifi==2026.2.25
11+
charset-normalizer==3.4.6
12+
colorlog==6.10.1
13+
cuda-bindings==12.9.4
14+
cuda-pathfinder==1.4.3
15+
datasets==4.2.0
16+
device-smi==0.4.1
17+
dill==0.4.0
18+
filelock==3.25.2
19+
flatbuffers==25.12.19
20+
frozenlist==1.8.0
21+
fsspec==2025.9.0
22+
gptqmodel @ git+https://github.com/CodeLinaro/GPTQModel.git@64231a266cc70c5597fe97f26e7ec5ccda660c37
23+
greenlet==3.3.2
24+
h11==0.16.0
25+
hf-xet==1.4.2
26+
hf_transfer==0.1.9
27+
httpcore==1.0.9
28+
httpx==0.28.1
29+
huggingface_hub==0.36.2
30+
idna==3.11
31+
iniconfig==2.3.0
32+
Jinja2==3.1.6
33+
lightning-utilities==0.15.3
34+
logbar==0.0.4
35+
Mako==1.3.10
36+
MarkupSafe==3.0.3
37+
maturin==1.12.6
38+
ml_dtypes==0.5.4
39+
mpmath==1.3.0
40+
multidict==6.7.1
41+
multiprocess==0.70.16
42+
networkx==3.6.1
43+
numpy==2.4.3
44+
nvidia-cublas-cu12==12.8.4.1
45+
nvidia-cuda-cupti-cu12==12.8.90
46+
nvidia-cuda-nvrtc-cu12==12.8.93
47+
nvidia-cuda-runtime-cu12==12.8.90
48+
nvidia-cudnn-cu12==9.10.2.21
49+
nvidia-cufft-cu12==11.3.3.83
50+
nvidia-cufile-cu12==1.13.1.3
51+
nvidia-curand-cu12==10.3.9.90
52+
nvidia-cusolver-cu12==11.7.3.90
53+
nvidia-cusparse-cu12==12.5.8.93
54+
nvidia-cusparselt-cu12==0.7.1
55+
nvidia-nccl-cu12==2.27.5
56+
nvidia-nvjitlink-cu12==12.8.93
57+
nvidia-nvshmem-cu12==3.4.5
58+
nvidia-nvtx-cu12==12.8.90
59+
olive-ai==0.11.0
60+
onnx==1.19.1
61+
onnx-ir==0.1.13
62+
onnxruntime-genai-cuda==0.11.2
63+
onnxruntime-gpu==1.24.1
64+
onnxscript==0.5.7
65+
optimum==2.0.0
66+
optuna==4.8.0
67+
packaging==26.0
68+
pandas==3.0.1
69+
pillow==12.1.1
70+
pluggy==1.6.0
71+
propcache==0.4.1
72+
protobuf==6.32.1
73+
psutil==7.2.2
74+
pyarrow==23.0.1
75+
pycodestyle==2.14.0
76+
pydantic==2.12.5
77+
pydantic_core==2.41.5
78+
Pygments==2.19.2
79+
pytest==8.4.2
80+
python-dateutil==2.9.0.post0
81+
PyYAML==6.0.3
82+
random_word==1.0.13
83+
regex==2026.2.28
84+
requests==2.32.5
85+
safetensors==0.6.2
86+
sentencepiece==0.2.1
87+
setuptools==82.0.1
88+
six==1.17.0
89+
SQLAlchemy==2.0.48
90+
sympy==1.14.0
91+
tabulate==0.10.0
92+
threadpoolctl==3.6.0
93+
tiktoken==0.12.0
94+
tokenicer==0.0.5
95+
tokenizers==0.22.2
96+
torch==2.10.0
97+
torchmetrics==1.9.0
98+
tqdm==4.67.3
99+
transformers==4.57.3
100+
triton==3.6.0
101+
typing-inspection==0.4.2
102+
typing_extensions==4.15.0
103+
urllib3==2.6.3
104+
wheel==0.46.3
105+
xxhash==3.6.0
106+
yarl==1.23.0

0 commit comments

Comments
 (0)