Commit ee351cb

Author: Ronak Mahawar (committed)

Recipes for Llama-3.1-8B & Phi-4-14b-reasoning models
1 parent 23714b4 commit ee351cb

6 files changed

Lines changed: 200 additions & 16 deletions

meta-llama-Llama-3.1-8B-Instruct/QNN/README.md

Lines changed: 5 additions & 6 deletions
@@ -16,23 +16,20 @@ export BUILD_CUDA_EXT=0
 # Windows
 # set BUILD_CUDA_EXT=0

-# Install AutoGPTQ from source
-pip install --no-build-isolation git+https://github.com/PanQiWei/AutoGPTQ.git
-
 # Install GptqModel from source
-pip install --no-build-isolation git+https://github.com/ModelCloud/GPTQModel.git@5d2911a4b2a709afb0941d53c3882d0cd80b9649
+pip install --no-build-isolation git+https://github.com/CodeLinaro/GPTQModel.git@rel_4.2.5
 ```

 ### AOT Compilation Python Environment Setup
 Model compilation using QNN Execution Provider requires a Python environment with onnxruntime-qnn installed. In a separate Python environment, install the required packages:

 ```bash
 # Install Olive
-pip install olive-ai==0.9.3
+pip install olive-ai==0.11.1

 # Install ONNX Runtime QNN
 pip install -r https://raw.githubusercontent.com/microsoft/onnxruntime/refs/heads/main/requirements.txt
-pip install --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple "onnxruntime-qnn==1.22.2" --no-deps
+pip install --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple "onnxruntime-qnn==1.23.2" --no-deps
 ```

 Replace `/path/to/qnn/env/bin` in [config.json](config.json) with the path to the directory containing your QNN environment's Python executable. This path can be found by running the following command in the environment:
@@ -49,6 +46,8 @@ This command will return the path to the Python executable.
 ### Run the Quantization + Compilation Config
 Activate the **Quantization Python Environment** and run the workflow:

+### Change the `soc_model` param in config.json to match the target platform
+
 ```bash
 olive run --config config.json
 ```
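The `/path/to/qnn/env/bin` substitution above can also be scripted. A minimal sketch, assuming the `systems.qnn_system.python_environment_path` key used by the config in this commit, to be run with the QNN environment's own Python from the directory containing config.json:

```python
# Sketch: patch config.json so qnn_system points at the interpreter
# running this script. Run with the QNN environment's Python.
import json
import sys
from pathlib import Path

config_path = Path("config.json")
config = json.loads(config_path.read_text())

# Parent directory of the current executable, i.e. the value that
# `command -v python` (Linux) or `where python` (Windows) would
# report with the trailing file name removed.
env_bin = str(Path(sys.executable).parent)
config["systems"]["qnn_system"]["python_environment_path"] = env_bin

config_path.write_text(json.dumps(config, indent=2))
print(f"python_environment_path set to {env_bin}")
```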

meta-llama-Llama-3.1-8B-Instruct/QNN/config.json

Lines changed: 18 additions & 6 deletions
@@ -33,39 +33,50 @@
   ],
   "passes": {
     "q": { "type": "QuaRot" },
+    "cs": { "type": "CaptureSplitInfo", "num_splits": 4, "unique_embeds_lm_head_splits": true },
     "g": {
       "type": "GptqModel",
       "bits": 4,
       "sym": true,
       "group_size": -1,
-      "lm_head": false,
+      "lm_head": true,
       "device": "cuda",
-      "data_config": "wikitext2_train_joined"
+      "data_config": "wikitext2_train_joined",
+      "dynamic": {
+        "+:.*v_proj*": { "bits": 8, "sym": true, "group_size": -1, "desc_act": true },
+        "+:.*lm_head*": { "bits": 4, "sym": true, "group_size": 32, "desc_act": false }
+      }
     },
-    "cs": { "type": "CaptureSplitInfo", "num_splits": 4, "unique_embeds_lm_head_splits": true },
     "mb": {
       "type": "ModelBuilder",
       "precision": "int4",
       "int4_block_size": 32,
       "int4_accuracy_level": 4,
-      "int4_op_types_to_quantize": [ "MatMul", "Gather" ]
+      "int4_op_types_to_quantize": [ "Gather" ]
     },
     "mq": {
       "type": "MatMulNBitsToQDQ",
       "use_int4": true,
       "add_zero_point": true,
-      "nodes_to_exclude": [ "/lm_head/MatMul_Q4" ],
+      "nodes_to_exclude": [ "/lm_head/MatMulNBits" ],
       "save_as_external_data": true
     },
     "gs": {
       "type": "GraphSurgeries",
       "surgeries": [
         { "surgeon": "RemoveRopeMultiCache" },
         { "surgeon": "AttentionMaskToSequenceLengths" },
+        { "surgeon": "RemoveGidxFromMatMulNBits" },
         { "surgeon": "SimplifiedLayerNormToL2Norm" }
       ],
       "save_as_external_data": true
     },
+    "f16": {
+      "type": "OnnxFloatToFloat16",
+      "op_include_list": [ "GroupQueryAttention" ],
+      "keep_io_types": [ "logits" ],
+      "save_as_external_data": true
+    },
     "sq": {
       "type": "OnnxStaticQuantization",
       "data_config": "wikitext2_train_act",
@@ -74,7 +85,8 @@
       "calibration_providers": [ "CUDAExecutionProvider" ],
       "quant_preprocess": true,
       "op_types_to_exclude": [ "GatherBlockQuantized", "GroupQueryAttention", "MatMulNBits" ],
-      "save_as_external_data": true
+      "save_as_external_data": true,
+      "extra_options": { "CalibStridedMinMax": 1 }
     },
     "sp": { "type": "SplitModel" },
     "st": { "type": "StaticLLM", "batch_size": 1, "context_length": 64 },
Lines changed: 4 additions & 4 deletions
@@ -1,8 +1,8 @@
 datasets
-olive-ai==0.9.3
+olive-ai==0.11.1
 # these are the versions the recipes were last validated with
-onnxruntime-genai-cuda==0.7.1
-onnxruntime-gpu==1.21.1
+onnxruntime-genai-cuda==0.11.2
+onnxruntime-gpu==1.23.2
 optimum
 # newer transformers might have incompatibility with gptq passes
-transformers==4.53.2
+transformers==4.57.3
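Because the recipes are validated only against these exact pins, it can be worth confirming the active environment matches them before starting a long quantization run. A small standard-library sketch covering just the pinned packages above:

```python
# Sketch: verify the active environment matches the pinned versions.
from importlib.metadata import PackageNotFoundError, version

pins = {
    "olive-ai": "0.11.1",
    "onnxruntime-genai-cuda": "0.11.2",
    "onnxruntime-gpu": "1.23.2",
    "transformers": "4.57.3",
}
for pkg, expected in pins.items():
    try:
        installed = version(pkg)
    except PackageNotFoundError:
        print(f"{pkg}: NOT INSTALLED (expected {expected})")
        continue
    status = "ok" if installed == expected else f"MISMATCH (expected {expected})"
    print(f"{pkg}: {installed} {status}")
```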
Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+# Phi-4-reasoning Model Optimization
+
+This repository demonstrates the optimization of the [Phi-4-reasoning](https://huggingface.co/microsoft/Phi-4-reasoning) model using **post-training quantization (PTQ)** techniques.
+
+
+### Quantization Python Environment Setup
+Quantization is resource-intensive and requires GPU acceleration. In an x64 Python environment, install the required packages:
+
+```bash
+pip install -r requirements.txt
+
+# AutoGPTQ: Install from source (stable package may be slow for weight packing)
+# Disable CUDA extension build (not required)
+# Linux
+export BUILD_CUDA_EXT=0
+# Windows
+# set BUILD_CUDA_EXT=0
+
+# Install GptqModel from source
+pip install --no-build-isolation git+https://github.com/CodeLinaro/GPTQModel.git@rel_4.2.5
+```
+
+### AOT Compilation Python Environment Setup
+Model compilation using QNN Execution Provider requires a Python environment with onnxruntime-qnn installed. In a separate Python environment, install the required packages:
+
+```bash
+# Install Olive
+pip install olive-ai==0.11.1
+
+# Install ONNX Runtime QNN
+pip install -r https://raw.githubusercontent.com/microsoft/onnxruntime/refs/heads/main/requirements.txt
+pip install --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple "onnxruntime-qnn==1.23.2" --no-deps
+```
+
+Replace `/path/to/qnn/env/bin` in [config.json](config.json) with the path to the directory containing your QNN environment's Python executable. This path can be found by running the following command in the environment:
+
+```bash
+# Linux
+command -v python
+# Windows
+# where python
+```
+
+This command will return the path to the Python executable. Set the parent directory of the executable as the `/path/to/qnn/env/bin` in the config file.
+
+### Run the Quantization + Compilation Config
+Activate the **Quantization Python Environment** and run the workflow:
+
+### Change the `soc_model` param in config.json to match the target platform
+
+```bash
+olive run --config config.json
+```
+
+Olive will run the AOT compilation step in the **AOT Compilation Python Environment** specified in the config file using a subprocess. All other steps will run in the **Quantization Python Environment** natively.
+
+✅ Optimized model saved in: `models/phi4_reasoning`
+
+> ⚠️ If optimization fails during context binary generation, rerun the command. The process will resume from the last completed step.
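Alongside the `olive run` CLI shown above, the workflow can also be started from Python; a sketch, assuming olive-ai 0.11.x keeps the documented `olive.workflows.run` entry point:

```python
# Sketch: programmatic equivalent of `olive run --config config.json`.
# Assumes Olive's documented olive.workflows.run entry point.
from olive.workflows import run as olive_run

# Accepts a path to the workflow JSON config (or an equivalent dict).
olive_run("config.json")
```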
Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
+{
+  "input_model": { "type": "HfModel", "model_path": "microsoft/Phi-4-reasoning" },
+  "systems": {
+    "qnn_system": {
+      "type": "PythonEnvironment",
+      "python_environment_path": "/path/to/qnn/env/bin",
+      "accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ]
+    }
+  },
+  "data_configs": [
+    {
+      "name": "wikitext2_train_joined",
+      "type": "HuggingfaceContainer",
+      "load_dataset_config": { "data_name": "wikitext", "subset": "wikitext-2-raw-v1", "split": "train" },
+      "pre_process_data_config": {
+        "strategy": "join",
+        "add_special_tokens": false,
+        "max_seq_len": 4096,
+        "max_samples": 128
+      }
+    },
+    {
+      "name": "wikitext2_train_act",
+      "type": "HuggingfaceContainer",
+      "load_dataset_config": { "data_name": "wikitext", "subset": "wikitext-2-raw-v1", "split": "train" },
+      "pre_process_data_config": {
+        "strategy": "line-by-line",
+        "add_special_tokens": true,
+        "max_samples": 256,
+        "max_seq_len": 4096
+      }
+    }
+  ],
+  "passes": {
+    "q": { "type": "QuaRot" },
+    "cs": { "type": "CaptureSplitInfo", "num_splits": 4, "unique_embeds_lm_head_splits": true },
+    "g": {
+      "type": "GptqModel",
+      "bits": 4,
+      "sym": true,
+      "group_size": -1,
+      "lm_head": true,
+      "device": "cuda",
+      "data_config": "wikitext2_train_joined",
+      "dynamic": { "+:.*lm_head*": { "bits": 4, "sym": true, "group_size": 128, "desc_act": false } }
+    },
+    "mb": {
+      "type": "ModelBuilder",
+      "precision": "int4",
+      "int4_block_size": 128,
+      "int4_accuracy_level": 4,
+      "int4_op_types_to_quantize": [ "Gather" ]
+    },
+    "mq": {
+      "type": "MatMulNBitsToQDQ",
+      "use_int4": true,
+      "add_zero_point": true,
+      "nodes_to_exclude": [ "/lm_head/MatMulNBits" ],
+      "save_as_external_data": true
+    },
+    "gs": {
+      "type": "GraphSurgeries",
+      "surgeries": [
+        { "surgeon": "RemoveRopeMultiCache" },
+        { "surgeon": "AttentionMaskToSequenceLengths" },
+        { "surgeon": "RemoveGidxFromMatMulNBits" },
+        { "surgeon": "SimplifiedLayerNormToL2Norm" }
+      ],
+      "save_as_external_data": true
+    },
+    "f16": {
+      "type": "OnnxFloatToFloat16",
+      "op_include_list": [ "GroupQueryAttention" ],
+      "keep_io_types": [ "logits" ],
+      "save_as_external_data": true
+    },
+    "sq": {
+      "type": "OnnxStaticQuantization",
+      "data_config": "wikitext2_train_act",
+      "activation_type": "uint16",
+      "precision": "uint8",
+      "calibration_providers": [ "CUDAExecutionProvider" ],
+      "quant_preprocess": true,
+      "op_types_to_exclude": [ "GatherBlockQuantized", "GroupQueryAttention", "MatMulNBits" ],
+      "save_as_external_data": true,
+      "extra_options": { "CalibStridedMinMax": 1 }
+    },
+    "sp": { "type": "SplitModel" },
+    "st": { "type": "StaticLLM", "batch_size": 1, "context_length": 64 },
+    "cb": {
+      "type": "EPContextBinaryGenerator",
+      "provider_options": {
+        "htp_performance_mode": "burst",
+        "htp_graph_finalization_optimization_mode": "3",
+        "soc_model": "60"
+      },
+      "weight_sharing": true
+    },
+    "cp": { "type": "ComposeOnnxModels" }
+  },
+  "target": "qnn_system",
+  "log_severity_level": 1,
+  "output_dir": "models/phi4_reasoning",
+  "cache_dir": "cache",
+  "no_artifacts": true
+}
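The README's instruction to set `soc_model` for the target platform corresponds to `passes.cb.provider_options.soc_model` in this config (a numeric QNN HTP SoC ID passed as a string; the recipe ships "60"). A small hedged helper sketch for switching targets:

```python
# Sketch: point the EPContextBinaryGenerator pass at a different target SoC.
# soc_model is QNN's numeric HTP SoC ID, passed through as a string;
# "60" is the value shipped in this recipe. Replace with your target's ID.
import json
from pathlib import Path

def set_soc_model(config_file: str, soc_model: str) -> None:
    path = Path(config_file)
    config = json.loads(path.read_text())
    config["passes"]["cb"]["provider_options"]["soc_model"] = soc_model
    path.write_text(json.dumps(config, indent=2))

set_soc_model("config.json", "60")
```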
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+datasets
+olive-ai==0.11.1
+# these are the versions the recipes were last validated with
+onnxruntime-genai-cuda==0.11.2
+onnxruntime-gpu==1.23.2
+optimum
+# newer transformers might have incompatibility with gptq passes
+transformers==4.57.3
