Intel® OpenVINO add gemma-3-1b-it recipes (microsoft#365)

Anirudh-Swaminathan · xieofxie · web-flow · commit c4a7d925193b · 2026-04-15T10:27:15.000+08:00
Co-authored-by: xieofxie <xieofxie@126.com>
diff --git a/README.md b/README.md
diff --git a/google-gemma-3-1b-it/OpenVINO/README.md b/google-gemma-3-1b-it/OpenVINO/README.md
@@ -0,0 +1,76 @@
+# Gemma-3-1b-it Compression
+
+This folder contains a sample use case of Olive to optimize the [google/gemma-3-1b-it](https://huggingface.co/google/gemma-3-1b-it) model using Intel® OpenVINO tools.
+
+- Intel® NPU: [Gemma 3 1b it Dynamic Shape model optimized for NPU](#gemma-3-1b-it-npu)
+- Intel® GPU: [Gemma 3 1b it Dynamic Shape model optimized for GPU](#gemma-3-1b-it-gpu)
+
+## Quantization Workflows
+
+This workflow quantizes the model with Optimum Intel®. It runs the following optimization pipeline:
+
+- *Hugging Face model -> Quantized OpenVINO model -> Quantized encapsulated ONNX OpenVINO IR model*
+
+### Gemma 3 1b it NPU
+
+The flow in the following config file executes the above workflow, producing a dynamic-shape model.
+
+1. [gemma_3_1b_it_context_ov_npu_config.json](gemma_3_1b_it_context_ov_npu_config.json)
+
+### Gemma 3 1b it GPU
+
+The flow in the following config file executes the above workflow, producing a dynamic-shape model.
+
+1. [gemma_3_1b_it_context_ov_gpu_config.json](gemma_3_1b_it_context_ov_gpu_config.json)
+
+## How to run
+
+### Setup
+
+Install the necessary Python packages:
+
+```bash
+python -m pip install olive-ai[openvino]
+python -m pip install -r requirements.txt
+```
+
+### Run Olive config
+
+The optimization techniques to run are specified in the relevant JSON config file.
+
+Optimize the model using the following command:
+
+```bash
+olive run --config <config_file.json>
+```
+
+Example:
+
+```bash
+olive run --config gemma_3_1b_it_context_ov_npu_config.json
+```
+
+Alternatively, run the workflow directly from Python:
+
+```python
+from olive import run
+workflow_output = run("<config_file.json>")
+```
+
+After the run completes, the model candidates and the corresponding config are saved in the directory set by `output_dir` in the config file (for example, `models/gemma_3_1b_it_context_ov_npu`).
+
+### (Optional) Run Console-Based Chat Interface
+
+To run ONNX OpenVINO IR Encapsulated GenAI models, please set up the latest ONNX Runtime GenAI with ONNX Runtime OpenVINO EP support.
+
+The sample chat app is [model-chat.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/model-chat.py) in the [onnxruntime-genai](https://github.com/microsoft/onnxruntime-genai/) GitHub repository.
+
+```bash
+python model-chat.py -e follow_config -v -g -m models/<model_folder>/
+```
+
+Example:
+
+```bash
+python model-chat.py -e follow_config -v -g -m models/gemma_3_1b_it_context_ov_npu/
+```
diff --git a/google-gemma-3-1b-it/OpenVINO/gemma_3_1b_it_context_ov_gpu_config.json b/google-gemma-3-1b-it/OpenVINO/gemma_3_1b_it_context_ov_gpu_config.json
@@ -0,0 +1,57 @@
+{
+    "input_model": {
+        "type": "HfModel",
+        "model_path": "google/gemma-3-1b-it"
+    },
+    "systems": {
+        "local_system": {
+            "type": "LocalSystem",
+            "accelerators": [
+                {
+                    "device": "gpu",
+                    "execution_providers": [
+                        "OpenVINOExecutionProvider"
+                    ]
+                }
+            ]
+        }
+    },
+    "passes": {
+        "optimum_convert": {
+            "type": "OpenVINOOptimumConversion",
+            "extra_args": {
+                "device": "gpu"
+            },
+            "ov_quant_config": {
+                "weight_format": "int4",
+                "group_size": 128,
+                "ratio": 0.8,
+                "trust_remote_code": true
+            }
+        },
+        "encapsulation": {
+            "type": "OpenVINOEncapsulation",
+            "target_device": "gpu",
+            "keep_ov_dynamic_dims": true,
+            "ov_version": "2026.1",
+            "reuse_cache": true,
+            "genai_config_override": {
+                "model": {
+                    "context_length": 32768
+                },
+                "search": {
+                    "do_sample": true,
+                    "max_length": 32768,
+                    "repetition_penalty": 1.1,
+                    "temperature": 1.0,
+                    "top_k": 64,
+                    "top_p": 0.95
+                }
+            }
+        }
+    },
+    "search_strategy": false,
+    "target": "local_system",
+    "evaluate_input_model": false,
+    "output_dir": "models/gemma_3_1b_it_context_ov_gpu"
+}
diff --git a/google-gemma-3-1b-it/OpenVINO/gemma_3_1b_it_context_ov_npu_config.json b/google-gemma-3-1b-it/OpenVINO/gemma_3_1b_it_context_ov_npu_config.json
@@ -0,0 +1,73 @@
+{
+    "input_model": {
+        "type": "HfModel",
+        "model_path": "google/gemma-3-1b-it"
+    },
+    "systems": {
+        "local_system": {
+            "type": "LocalSystem",
+            "accelerators": [
+                {
+                    "device": "npu",
+                    "execution_providers": [
+                        "OpenVINOExecutionProvider"
+                    ]
+                }
+            ]
+        }
+    },
+    "passes": {
+        "optimum_convert": {
+            "type": "OpenVINOOptimumConversion",
+            "extra_args": {
+                "device": "npu"
+            },
+            "ov_quant_config": {
+                "weight_format": "int4",
+                "group_size": 128,
+                "ratio": 1,
+                "dataset": "wikitext2",
+                "sym": true,
+                "trust_remote_code": true,
+                "sensitivity_metric": "weight_quantization_error"
+            }
+        },
+        "encapsulation": {
+            "type": "OpenVINOEncapsulation",
+            "target_device": "npu",
+            "keep_ov_dynamic_dims": true,
+            "ov_version": "2026.1",
+            "reuse_cache": true,
+            "genai_config_override": {
+                "model": {
+                    "context_length": 32768,
+                    "decoder": {
+                        "session_options": {
+                            "provider_options": [
+                                {
+                                    "OpenVINO": {
+                                        "device_type": "NPU",
+                                        "enable_causallm": "True",
+                                        "load_config": "{\"NPU\":{\"MAX_PROMPT_LEN\":\"4096\",\"MIN_RESPONSE_LEN\":\"128\"}}"
+                                    }
+                                }
+                            ]
+                        }
+                    }
+                },
+                "search": {
+                    "do_sample": true,
+                    "max_length": 32768,
+                    "repetition_penalty": 1.1,
+                    "temperature": 1.0,
+                    "top_k": 64,
+                    "top_p": 0.95
+                }
+            }
+        }
+    },
+    "search_strategy": false,
+    "target": "local_system",
+    "evaluate_input_model": false,
+    "output_dir": "models/gemma_3_1b_it_context_ov_npu"
+}
diff --git a/google-gemma-3-1b-it/OpenVINO/info.yml b/google-gemma-3-1b-it/OpenVINO/info.yml
@@ -0,0 +1,13 @@
+keywords:
+    openvino
+    olive
+arch: gemma
+recipes:
+    - file: "gemma_3_1b_it_context_ov_npu_config.json"
+      devices:
+        - npu
+      ep: OpenVINOExecutionProvider
+    - file: "gemma_3_1b_it_context_ov_gpu_config.json"
+      devices:
+        - gpu
+      ep: OpenVINOExecutionProvider
diff --git a/google-gemma-3-1b-it/OpenVINO/requirements.txt b/google-gemma-3-1b-it/OpenVINO/requirements.txt
@@ -0,0 +1,2 @@
+optimum-intel
+transformers