microsoft
diff --git a/‎mistralai-Ministral-3-3B-Instruct-2512/builtin/.gitignore‎
Lines changed: 9 additions & 0 deletions b/‎mistralai-Ministral-3-3B-Instruct-2512/builtin/.gitignore‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎mistralai-Ministral-3-3B-Instruct-2512/builtin/README.md‎
Lines changed: 93 additions & 0 deletions b/‎mistralai-Ministral-3-3B-Instruct-2512/builtin/README.md‎
Lines changed: 93 additions & 0 deletions
diff --git a/‎mistralai-Ministral-3-3B-Instruct-2512/builtin/codes/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎mistralai-Ministral-3-3B-Instruct-2512/builtin/codes/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎mistralai-Ministral-3-3B-Instruct-2512/builtin/codes/modeling_ministral3.py‎
Lines changed: 211 additions & 0 deletions b/‎mistralai-Ministral-3-3B-Instruct-2512/builtin/codes/modeling_ministral3.py‎
Lines changed: 211 additions & 0 deletions
diff --git a/‎mistralai-Ministral-3-3B-Instruct-2512/builtin/cpu_and_mobile/text.json‎
Lines changed: 18 additions & 0 deletions b/‎mistralai-Ministral-3-3B-Instruct-2512/builtin/cpu_and_mobile/text.json‎
Lines changed: 18 additions & 0 deletions
diff --git a/‎mistralai-Ministral-3-3B-Instruct-2512/builtin/cuda/text.json‎
Lines changed: 30 additions & 0 deletions b/‎mistralai-Ministral-3-3B-Instruct-2512/builtin/cuda/text.json‎
Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,9 @@
+# Generated model artifacts
+models/
+
+# Python bytecode
+__pycache__/
+*.pyc
+
+# Olive cache
+.olive-cache/
@@ -0,0 +1,93 @@
+# Ministral-3-3B ONNX Runtime GenAI Example
+
+This example demonstrates how to convert the [Ministral-3-3B-Instruct-2512](https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512) vision-language model to ONNX format using Olive and run inference with ONNX Runtime GenAI.
+
+Ministral-3-3B is a multimodal (VLM) model combining a Pixtral vision encoder with a Mistral text decoder using YaRN RoPE for extended context. The pipeline exports three sub-models:
+- **Vision encoder** and **embedding** via [mobius](https://github.com/onnxruntime/mobius) (declarative ONNX graph construction)
+- **Text decoder** via Olive/ModelBuilder (GQA + INT4/FP16 quantization)
+
+## Prerequisites
+
+```bash
+pip install -r requirements.txt
+```
+
+Install ONNX Runtime GenAI:
+
+| Device | Install Command |
+|--------|-----------------|
+| CPU | `pip install onnxruntime-genai --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple` |
+| GPU (CUDA) | `pip install onnxruntime-genai-cuda --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple` |
+
+## Steps
+
+### 1. Export & Optimize Models
+
+**CPU (INT4 text decoder, FP16 vision/embedding):**
+
+```bash
+python optimize.py --config-dir cpu_and_mobile --device cpu
+```
+
+**CUDA (FP16):**
+
+```bash
+python optimize.py --config-dir cuda --device gpu
+```
+
+**With a local dequantized checkpoint (skips FP8 dequantization):**
+
+```bash
+python optimize.py --config-dir cpu_and_mobile --device cpu --model-path /path/to/Ministral-3-3B-dequantized
+```
+
+This runs:
+- **Olive/ModelBuilder** for text decoder (GQA attention, YaRN RoPE, INT4/FP16)
+- **Mobius** for vision encoder (Pixtral, dynamic H×W, 2D RoPE) and embedding (token + image fusion)
+
+Then generates `genai_config.json` and `processor_config.json` for the ORT GenAI runtime.
+
+### 2. Output Structure
+
+```
+cpu_and_mobile/models/          # or cuda/models/
+├── vision.onnx                 # Pixtral vision encoder
+├── vision.onnx.data
+├── embedding.onnx              # Embedding fusion model
+├── embedding.onnx.data
+├── text.onnx                   # Text decoder (Mistral + YaRN)
+├── text.onnx.data
+├── genai_config.json           # Runtime configuration
+├── processor_config.json       # Pixtral image preprocessing
+├── tokenizer.json
+└── tokenizer_config.json
+```
+
+### 3. Run Inference
+
+```bash
+# Text-only
+python inference.py --prompt "What is the capital of France?"
+
+# Image + text
+python inference.py --image photo.jpg --prompt "Describe this image"
+
+# Interactive mode
+python inference.py --interactive
+
+# CUDA model
+python inference.py --model_path cuda/models --prompt "Hello"
+```
+
+Alternatively, use the built-in GenAI multimodal demo:
+
+```bash
+python -m onnxruntime_genai.models.model_mm -m cpu_and_mobile/models --max_length 4096
+```
+
+## Notes
+
+- The HuggingFace checkpoint uses FP8 quantized weights. The export pipeline dequantizes these automatically (`weight * weight_scale_inv`).
+- The tokenizer uses the `TokenizersBackend` class, which ONNX Runtime GenAI doesn't support. The optimize script rewrites it to `LlamaTokenizer`.
+- Pixtral vision supports dynamic image sizes (multiples of 28, up to 1540×1540).
+- The text decoder includes `llama_4_attn_scale` for long-context attention (>16K tokens).
@@ -0,0 +1,2 @@
+# Reference-only: Ministral3Model is not used by optimize.py (see modeling_ministral3.py)
+from .modeling_ministral3 import Ministral3Model as Ministral3Model
@@ -0,0 +1,211 @@
+# Copyright 2025 HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0.
+#
+# Adapted from transformers/models/mistral3/modeling_mistral3.py
+#
+# REFERENCE ONLY: This module is NOT used by optimize.py (which uses mobius
+# for vision/embedding export). It is kept as a reference implementation
+# showing how to build an ONNX-export-friendly Ministral3 vision + embedding
+# model for potential future Olive-based export.
+
+from typing import Optional
+
+import torch
+import torch.nn as nn
+
+from transformers import AutoModel
+from transformers.models.mistral3.configuration_mistral3 import Mistral3Config
+
+
+class Mistral3PatchMerger(nn.Module):
+    """ONNX-export-friendly Mistral3PatchMerger.
+
+    Uses pure tensor operations during export instead of Python for-loops.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        hidden_size = config.vision_config.hidden_size
+        self.spatial_merge_size = config.spatial_merge_size
+        self.patch_size = config.vision_config.patch_size
+        self.merging_layer = nn.Linear(
+            hidden_size * self.spatial_merge_size**2, hidden_size, bias=False
+        )
+
+    def forward(
+        self, image_features: torch.Tensor, image_sizes: torch.Tensor
+    ) -> torch.Tensor:
+        if torch.compiler.is_exporting():
+            return self._forward_export(image_features, image_sizes)
+        return self._forward_eager(image_features, image_sizes)
+
+    def _forward_export(self, image_features, image_sizes):
+        patch_h = image_sizes[0, 0] // self.patch_size
+        patch_w = image_sizes[0, 1] // self.patch_size
+        d = image_features.shape[-1]
+
+        image_grid = (
+            image_features.view(patch_h, patch_w, d).permute(2, 0, 1).unsqueeze(0)
+        )
+
+        torch._check(image_grid.shape[2] != 0)
+        torch._check(image_grid.shape[3] != 0)
+        torch._check(image_grid.shape[2] // self.spatial_merge_size > 0)
+        torch._check(image_grid.shape[3] // self.spatial_merge_size > 0)
+
+        grid = torch.nn.functional.unfold(
+            image_grid,
+            kernel_size=self.spatial_merge_size,
+            stride=self.spatial_merge_size,
+        )
+        image_features = grid.view(d * self.spatial_merge_size**2, -1).t()
+        return self.merging_layer(image_features)
+
+    def _forward_eager(self, image_features, image_sizes):
+        image_sizes_list = [
+            (sz[0] // self.patch_size, sz[1] // self.patch_size) for sz in image_sizes
+        ]
+        tokens_per_image = [h * w for h, w in image_sizes_list]
+        d = image_features.shape[-1]
+
+        permuted = []
+        for idx, image_tokens in enumerate(image_features.split(tokens_per_image)):
+            h, w = image_sizes_list[idx]
+            image_grid = image_tokens.view(h, w, d).permute(2, 0, 1).unsqueeze(0)
+            grid = torch.nn.functional.unfold(
+                image_grid,
+                kernel_size=self.spatial_merge_size,
+                stride=self.spatial_merge_size,
+            )
+            permuted.append(grid.view(d * self.spatial_merge_size**2, -1).t())
+
+        return self.merging_layer(torch.cat(permuted, dim=0))
+
+
+def pixtral_vision_forward_export(self, pixel_values, **kwargs):
+    """ONNX-export-friendly forward for PixtralVisionModel (batch=1).
+
+    Skips generate_block_attention_mask and computes position_ids inline.
+    """
+    torch._check(pixel_values.shape[0] == 1)
+
+    target_dtype = self.patch_conv.weight.dtype
+    patch_embeds = self.patch_conv(pixel_values.to(dtype=target_dtype))
+
+    grid_h = patch_embeds.shape[2]
+    grid_w = patch_embeds.shape[3]
+
+    patch_embeds = patch_embeds[0].flatten(1).T.unsqueeze(0)
+    patch_embeds = self.ln_pre(patch_embeds)
+
+    max_width = self.config.image_size // self.config.patch_size
+    h_indices = torch.arange(grid_h, device=pixel_values.device)
+    w_indices = torch.arange(grid_w, device=pixel_values.device)
+    mesh_h, mesh_w = torch.meshgrid(h_indices, w_indices, indexing="ij")
+    position_ids = (mesh_h * max_width + mesh_w).reshape(-1)
+    kwargs["position_ids"] = position_ids.unsqueeze(0)
+
+    position_embeddings = self.patch_positional_embedding(patch_embeds, position_ids)
+
+    return self.transformer(
+        patch_embeds,
+        attention_mask=None,
+        position_embeddings=position_embeddings,
+        **kwargs,
+    )
+
+
+def _pixtral_vision_forward_dispatch(self, pixel_values, **kwargs):
+    if torch.compiler.is_exporting():
+        return pixtral_vision_forward_export(self, pixel_values, **kwargs)
+    return self._original_forward(pixel_values, **kwargs)
+
+
+def patch_model_for_onnx_export(model):
+    """Apply ONNX-export-friendly patches to a Mistral 3 model."""
+    import types
+
+    if hasattr(model, "model") and hasattr(model.model, "multi_modal_projector"):
+        patch_merger = model.model.multi_modal_projector.patch_merger
+        vision_tower = model.model.vision_tower
+    elif hasattr(model, "multi_modal_projector"):
+        patch_merger = model.multi_modal_projector.patch_merger
+        vision_tower = model.vision_tower
+    else:
+        raise ValueError("Cannot find multi_modal_projector.patch_merger on the model.")
+
+    patch_merger.__class__ = Mistral3PatchMerger
+
+    vision_tower._original_forward = vision_tower.forward
+    vision_tower.forward = types.MethodType(
+        _pixtral_vision_forward_dispatch, vision_tower
+    )
+
+    return model
+
+
+class Ministral3Model(nn.Module):
+    """Ministral3 composite model for vision + embedding ONNX export.
+
+    Wraps HF Mistral3Model and provides:
+    - get_image_features(): vision encoder export
+    - get_fused_input_embeddings(): embedding fusion export
+    """
+
+    def __init__(self, config: Mistral3Config):
+        super().__init__()
+        self.config = config
+
+        # Build the full HF model, then patch for export
+        self.hf_model = AutoModel.from_config(
+            config, attn_implementation="sdpa", trust_remote_code=True
+        )
+        patch_model_for_onnx_export(self.hf_model)
+
+        # Expose sub-components for weight loading
+        self.vision_tower = self.hf_model.vision_tower
+        self.multi_modal_projector = self.hf_model.multi_modal_projector
+        self.embed_tokens = self.hf_model.language_model.embed_tokens
+
+    def get_input_embeddings(self):
+        return self.embed_tokens
+
+    def get_image_features(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
+        """Vision encoder: pixel_values -> image_features."""
+        image_outputs = self.vision_tower(pixel_values, return_dict=True)
+        selected_image_feature = image_outputs.last_hidden_state
+
+        image_sizes = torch.tensor(
+            [[pixel_values.shape[-2], pixel_values.shape[-1]]],
+            dtype=torch.int64,
+            device=pixel_values.device,
+        )
+        image_features = self.multi_modal_projector(
+            selected_image_feature.squeeze(0), image_sizes
+        )
+        return image_features
+
+    def get_fused_input_embeddings(
+        self, input_ids: torch.LongTensor, image_features: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
+        """Embedding fusion: input_ids + image_features -> inputs_embeds."""
+        inputs_embeds = self.embed_tokens(input_ids)
+        if image_features is not None:
+            image_features = image_features.to(inputs_embeds.dtype)
+            special_image_mask = input_ids == self.config.image_token_index
+            expanded_mask = (
+                special_image_mask.unsqueeze(-1)
+                .expand_as(inputs_embeds)
+                .to(inputs_embeds.device)
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(expanded_mask, image_features)
+        return inputs_embeds
+
+    def forward(self, *args, **kwargs):
+        raise NotImplementedError(
+            "Use get_image_features() or get_fused_input_embeddings() via method swap."
+        )
+
+
+__all__ = ["Ministral3Model", "patch_model_for_onnx_export"]
@@ -0,0 +1,18 @@
+{
+    "input_model": {
+        "type": "HfModel",
+        "model_path": "mistralai/Ministral-3-3B-Instruct-2512"
+    },
+    "passes": {
+        "convert": {
+            "type": "ModelBuilder",
+            "precision": "int4",
+            "int4_accuracy_level": 4,
+            "extra_options": {
+                "filename": "text.onnx"
+            }
+        }
+    },
+    "no_artifacts": true,
+    "output_dir": "cpu_and_mobile/models/text.onnx"
+}
@@ -0,0 +1,30 @@
+{
+    "input_model": {
+        "type": "HfModel",
+        "model_path": "mistralai/Ministral-3-3B-Instruct-2512"
+    },
+    "passes": {
+        "m": {
+            "type": "ModelBuilder",
+            "precision": "fp16",
+            "extra_options": {
+                "filename": "text.onnx"
+            }
+        }
+    },
+    "engine": {
+        "target": {
+            "type": "LocalSystem",
+            "accelerators": [
+                {
+                    "device": "gpu",
+                    "execution_providers": [
+                        "CUDAExecutionProvider"
+                    ]
+                }
+            ]
+        }
+    },
+    "no_artifacts": true,
+    "output_dir": "cuda/models/text.onnx"
+}
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+# Reference-only: Ministral3Model is not used by optimize.py (see modeling_ministral3.py)`
	`2`	`+from .modeling_ministral3 import Ministral3Model as Ministral3Model`