Adds QNN recipe for Phi-4-mini-reasoning

Trishansh Bhardwaj · Trishansh Bhardwaj · commit b7542eeaa40a · 2026-04-17T16:15:55.000+05:30
This recipe quantizes weights to 8-bit.
diff --git a/microsoft-Phi-4-mini-reasoning/QNN/config.json b/microsoft-Phi-4-mini-reasoning/QNN/config.json
@@ -0,0 +1,102 @@
+{
+    "input_model": { "type": "HfModel", "model_path": "microsoft/Phi-4-mini-reasoning" },
+    "systems": {
+        "qnn_system": {
+            "type": "PythonEnvironment",
+            "python_environment_path": "/prj/qct/lv/computeai-scratch/trishansh/MY_HOME/.pyenv/versions/3.11.13/envs/qnn/bin/",
+            "accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ]
+        }
+    },
+    "data_configs": [
+        {
+            "name": "wikitext2_train_joined",
+            "type": "HuggingfaceContainer",
+            "load_dataset_config": { "data_name": "wikitext", "subset": "wikitext-2-raw-v1", "split": "train" },
+            "pre_process_data_config": {
+                "strategy": "join",
+                "add_special_tokens": false,
+                "max_seq_len": 4096,
+                "max_samples": 128
+            }
+        },
+        {
+            "name": "wikitext2_train_act",
+            "type": "HuggingfaceContainer",
+            "load_dataset_config": { "data_name": "wikitext", "subset": "wikitext-2-raw-v1", "split": "train" },
+            "pre_process_data_config": {
+                "strategy": "line-by-line",
+                "add_special_tokens": true,
+                "max_samples": 256,
+                "max_seq_len": 2048
+            }
+        }
+    ],
+    "passes": {
+        "q": { "type": "QuaRot" },
+        "g": {
+            "type": "GptqModel",
+            "bits": 8,
+            "sym": true,
+            "group_size": -1,
+            "lm_head": true,
+            "device": "cuda",
+            "data_config": "wikitext2_train_joined",
+            "dynamic": {
+                    "+:.*lm_head*": {"bits": 8, "sym": true, "group_size": 32, "desc_act": false}
+            }
+        },
+        "cs": { "type": "CaptureSplitInfo", "num_splits": 2, "unique_embeds_lm_head_splits": true },
+        "mb": {
+            "type": "ModelBuilder",
+            "precision": "int4",
+            "int4_block_size": 32,
+            "int4_accuracy_level": 4,
+            "int4_op_types_to_quantize": [ "Gather" ]
+        },
+        "mq": {
+            "type": "MatMulNBitsToQDQ",
+            "use_int4": true,
+            "add_zero_point": true,
+            "nodes_to_exclude": [ "/lm_head/MatMul_Q4", "/lm_head/MatMulNBits" ],
+            "save_as_external_data": true
+        },
+        "gs": {
+            "type": "GraphSurgeries",
+            "surgeries": [
+                { "surgeon": "RemoveRopeMultiCache" },
+                { "surgeon": "AttentionMaskToSequenceLengths" },
+                { "surgeon": "RemoveGidxFromMatMulNBits" },
+                { "surgeon": "SimplifiedLayerNormToL2Norm" }
+            ],
+            "save_as_external_data": true
+        },
+        "sq": {
+            "type": "OnnxStaticQuantization",
+            "data_config": "wikitext2_train_act",
+            "activation_type": "uint16",
+            "precision": "uint8",
+            "calibration_providers": [ "CUDAExecutionProvider" ],
+            "quant_preprocess": true,
+            "op_types_to_exclude": [ "GatherBlockQuantized", "GroupQueryAttention", "MatMulNBits" ],
+            "save_as_external_data": true
+        },
+        "sp": { "type": "SplitModel" },
+        "st": { "type": "StaticLLM", "batch_size": 1, "context_length": 64 },
+        "cb": {
+            "type": "EPContextBinaryGenerator",
+            "provider_options": {
+                "htp_performance_mode": "burst",
+                "htp_graph_finalization_optimization_mode": "3",
+                "soc_model": "60"
+            },
+            "session_options": { "intra_op_num_threads": 2, "inter_op_num_threads": 1 },
+            "weight_sharing": true
+        },
+        "cp": { "type": "ComposeOnnxModels" }
+    },
+    "target": "qnn_system",
+    "log_severity_level": 0,
+    "output_dir": "phi-4-mini-reasoning_1",
+    "cache_dir": "cache",
+    "no_artifacts": true
+}
diff --git a/microsoft-Phi-4-mini-reasoning/QNN/requirement.txt b/microsoft-Phi-4-mini-reasoning/QNN/requirement.txt
@@ -0,0 +1,106 @@
+accelerate==1.13.0
+aiohappyeyeballs==2.6.1
+aiohttp==3.13.3
+aiosignal==1.4.0
+alembic==1.18.4
+annotated-types==0.7.0
+anyio==4.12.1
+attrs==26.1.0
+autopep8==2.3.2
+certifi==2026.2.25
+charset-normalizer==3.4.6
+colorlog==6.10.1
+cuda-bindings==12.9.4
+cuda-pathfinder==1.4.3
+datasets==4.2.0
+device-smi==0.4.1
+dill==0.4.0
+filelock==3.25.2
+flatbuffers==25.12.19
+frozenlist==1.8.0
+fsspec==2025.9.0
+gptqmodel @ git+https://github.com/CodeLinaro/GPTQModel.git@64231a266cc70c5597fe97f26e7ec5ccda660c37
+greenlet==3.3.2
+h11==0.16.0
+hf-xet==1.4.2
+hf_transfer==0.1.9
+httpcore==1.0.9
+httpx==0.28.1
+huggingface_hub==0.36.2
+idna==3.11
+iniconfig==2.3.0
+Jinja2==3.1.6
+lightning-utilities==0.15.3
+logbar==0.0.4
+Mako==1.3.10
+MarkupSafe==3.0.3
+maturin==1.12.6
+ml_dtypes==0.5.4
+mpmath==1.3.0
+multidict==6.7.1
+multiprocess==0.70.16
+networkx==3.6.1
+numpy==2.4.3
+nvidia-cublas-cu12==12.8.4.1
+nvidia-cuda-cupti-cu12==12.8.90
+nvidia-cuda-nvrtc-cu12==12.8.93
+nvidia-cuda-runtime-cu12==12.8.90
+nvidia-cudnn-cu12==9.10.2.21
+nvidia-cufft-cu12==11.3.3.83
+nvidia-cufile-cu12==1.13.1.3
+nvidia-curand-cu12==10.3.9.90
+nvidia-cusolver-cu12==11.7.3.90
+nvidia-cusparse-cu12==12.5.8.93
+nvidia-cusparselt-cu12==0.7.1
+nvidia-nccl-cu12==2.27.5
+nvidia-nvjitlink-cu12==12.8.93
+nvidia-nvshmem-cu12==3.4.5
+nvidia-nvtx-cu12==12.8.90
+olive-ai==0.11.0
+onnx==1.19.1
+onnx-ir==0.1.13
+onnxruntime-genai-cuda==0.11.2
+onnxruntime-gpu==1.24.1
+onnxscript==0.5.7
+optimum==2.0.0
+optuna==4.8.0
+packaging==26.0
+pandas==3.0.1
+pillow==12.1.1
+pluggy==1.6.0
+propcache==0.4.1
+protobuf==6.32.1
+psutil==7.2.2
+pyarrow==23.0.1
+pycodestyle==2.14.0
+pydantic==2.12.5
+pydantic_core==2.41.5
+Pygments==2.19.2
+pytest==8.4.2
+python-dateutil==2.9.0.post0
+PyYAML==6.0.3
+random_word==1.0.13
+regex==2026.2.28
+requests==2.32.5
+safetensors==0.6.2
+sentencepiece==0.2.1
+setuptools==82.0.1
+six==1.17.0
+SQLAlchemy==2.0.48
+sympy==1.14.0
+tabulate==0.10.0
+threadpoolctl==3.6.0
+tiktoken==0.12.0
+tokenicer==0.0.5
+tokenizers==0.22.2
+torch==2.10.0
+torchmetrics==1.9.0
+tqdm==4.67.3
+transformers==4.57.3
+triton==3.6.0
+typing-inspection==0.4.2
+typing_extensions==4.15.0
+urllib3==2.6.3
+wheel==0.46.3
+xxhash==3.6.0
+yarl==1.23.0