
Commit b71f18b

Add Gemma4 multimodal support (vision + audio) (microsoft#2103)
## Summary

Adds end-to-end support for Google Gemma 4 multimodal models in ORT GenAI, covering text-only (`gemma4_text`), vision-language, and any-to-any (vision + audio + text) variants.

## Changes

### Model registration

- Register `gemma4_text` as LLM and `gemma4` as MMM (multi-modal model)
- MMM auto-detects speech support from `speech.filename` in genai_config — no separate `gemma4_any_to_any` type needed
- Register `Gemma4MultiModalProcessor` in the processor factory

### Gemma4 multimodal processor (`gemma4_multimodal_processor.cpp/h`)

- **Vision**: Preprocesses images via `Gemma4ImageTransform` (onnxruntime-extensions), trims padded patches to the actual count using `num_soft_tokens` from the preprocessor, and produces `pixel_values` + `pixel_position_ids`
- **Audio**: Extracts mel features via `Gemma4LogMel`, computes `audio_sizes` for the pipeline, generates `input_features_mask` (all-True for single-clip inference), and expands `<|audio|>` placeholder tokens in the prompt
- **Prompt handling**: Expands both `<|image|>` and `<|audio|>` tokens from the chat template into the correct number of soft tokens before encoding (a toy sketch follows this summary). Handles template-inserted tokens and auto-inserts the tokens when no template is available

### KV cache — per-layer head_dim (`kv_cache.cpp/h`)

- Auto-detects varying `head_dim` across layers from the ONNX session input shapes (Gemma4 uses 256 for sliding-window layers, 512 for global-attention layers); a rough detection sketch follows the file stats below
- Creates per-layer `empty_pasts_` with the correct head dimensions
- Handles `layer_shapes_[i][2] == 0` (unconstrained) in Update to avoid a zero-size allocation
- Updates the `layer_shapes_` sequence dimension for `past_present_share_buffer` mode

### Position inputs — int64 support (`position_inputs.cpp`)

- `WindowedPositionInputs` now supports both `int32_t` and `int64_t` for `position_ids` and `attention_mask`
- Type-dispatching lambdas cover all data access points (first window, subsequent windows, token generation)

### Multi-modal pipeline (`multi_modal.cpp/h`)

- **DecoderState**: Optional `decoder_input_ids_` for models that require `input_ids` alongside `inputs_embeds`
- **EmbeddingState**: Handles an empty `audio_features` tensor when the embedding model requires it but no speech session exists (`AllocateEmptyFeatures`)
- **SpeechState**: Manages the 3D→2D reshape of the speech output (`ReshapeFeatures`) before passing it to the embedding model
- **Pipeline**: Conditional audio-feature reshape and empty-audio fallback based on `num_audio_tokens_`

### MultiModalFeatures (`multi_modal_features.cpp/h`)

- `AllocateEmptyFeatures()` — pre-allocates an empty tensor for optional inputs
- `ReshapeFeatures()` — in-place reshape with a data copy and state-pointer update
- `batch_size <= 0` support — skips the batch dimension for 3D model outputs

### Config (`config.h/cpp`)

- Added `pixel_position_ids` to the vision inputs
- Added `audio_token_id` and `boa_token_id` to the model config
- Added a `PixelPositionIdsName` default constant

### Example script (`common.py`)

- Added `{"type": "audio"}` entries for Gemma-style structured content in `get_user_content`

## Testing

Tested with a Gemma4 E2B model exported via mobius:

- ✅ Text-only generation
- ✅ Image description (detailed landscape analysis)
- ✅ Audio transcription (Windows SAPI TTS → model correctly identifies speech content)
- ✅ Image-only with an any-to-any config (empty `audio_features` handled)
- ✅ Mixed GQA + standard Attention with `past_present_share_buffer=false`
- ✅ Per-layer head_dim KV cache (256/512)
- ✅ int64 `position_ids` with `WindowedPositionInputs`
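To make the **Prompt handling** step concrete, here is a toy Python sketch of the placeholder-expansion idea. It is illustrative only: the real logic is C++ in `gemma4_multimodal_processor.cpp`, and the function name, tag spellings, and token counts below are assumptions (per the summary, the per-image count comes from the preprocessor's `num_soft_tokens` and the per-clip count from `audio_sizes`).

```python
# Toy sketch of <|image|>/<|audio|> soft-token expansion (not the real implementation).
def expand_placeholders(prompt: str, soft_token_counts: dict[str, int]) -> str:
    """Repeat each media placeholder so the tokenizer emits one soft token
    per image patch / audio frame."""
    for tag, count in soft_token_counts.items():
        if count > 0:
            prompt = prompt.replace(tag, tag * count)
    return prompt


# Made-up counts: one image worth 256 patches, one audio clip worth 188 frames.
expanded = expand_placeholders(
    "<|image|><|audio|>Describe what you see and hear.",
    {"<|image|>": 256, "<|audio|>": 188},
)
```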
1 parent e7301d0 commit b71f18b

19 files changed: 805 additions & 96 deletions
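As a rough illustration of the per-layer `head_dim` auto-detection described in the KV-cache section above, one can read each past-KV input shape off the decoder's ONNX session. The sketch below uses the `onnxruntime` Python API; the `past_key_values.<layer>.key` naming scheme and the `[batch, num_kv_heads, seq_len, head_dim]` layout are assumptions, and the actual change lives in C++ in `kv_cache.cpp`.

```python
# Illustrative only: infer per-layer head_dim from a decoder's past-KV inputs.
import onnxruntime as ort


def per_layer_head_dims(model_path: str) -> list[int]:
    session = ort.InferenceSession(model_path)
    dims = {}
    for inp in session.get_inputs():
        # Assumed naming: "past_key_values.<layer>.key"; static dims are ints,
        # dynamic dims come back as strings.
        if inp.name.startswith("past_key_values.") and inp.name.endswith(".key"):
            layer = int(inp.name.split(".")[1])
            dims[layer] = inp.shape[-1]  # head_dim assumed to be the last axis
    return [dims[i] for i in sorted(dims)]


# For Gemma4 one would expect a mix such as [256, 256, ..., 512]: 256 on
# sliding-window layers and 512 on global-attention layers.
```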

cmake/deps.txt
Lines changed: 1 addition & 1 deletion

```diff
@@ -14,7 +14,7 @@ pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.zip;f78029
 googletest;https://github.com/google/googletest/archive/530d5c8c84abd2a46f38583ee817743c9b3a42b4.zip;5e3a61db2aa975cfd0f97ba92c818744e7fa7034
 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5
 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e
-onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;539d380ce9c2fcdfc9fd9f151ef5604425215aa9
+onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;e094cc816679d0b2b5fe2b4fd7f73e5b1844b425

 # These two dependencies are for the optional constrained decoding feature (USE_GUIDANCE)
 llguidance;https://github.com/microsoft/llguidance.git;94fa39128ef184ffeda33845f6d333f332a34b4d
```

examples/python/common.py
Lines changed: 112 additions & 27 deletions

```diff
@@ -4,10 +4,11 @@
 import argparse
 import json
 import os
+from dataclasses import asdict, dataclass
+from typing import Any
+
 import onnxruntime_genai as og

-from dataclasses import dataclass, asdict
-from typing import Any

 def set_logger(inputs: bool = True, outputs: bool = True) -> None:
     """
@@ -21,6 +22,7 @@ def set_logger(inputs: bool = True, outputs: bool = True) -> None:
     """
     og.set_log_options(enabled=True, model_input_values=inputs, model_output_values=outputs)

+
 def register_ep(ep: str, ep_path: str, use_winml: bool) -> None:
     """
     Register execution provider if path is provided or via Windows ML
@@ -42,6 +44,7 @@ def register_ep(ep: str, ep_path: str, use_winml: bool) -> None:
         # Modified from here: https://learn.microsoft.com/en-us/windows/ai/new-windows-ml/tutorial?tabs=python#acquiring-the-model-and-preprocessing
         try:
             import winml
+
             print(winml.register_execution_providers(ort=False, ort_genai=True))
         except ImportError:
             print("WinML not available, using default execution providers")
@@ -53,11 +56,14 @@ def register_ep(ep: str, ep_path: str, use_winml: bool) -> None:
         og.register_execution_provider_library("NvTensorRTRTXExecutionProvider", ep_path)
     else:
         print(f"Warning: EP registration not supported for {ep}")
-        print("Only 'cuda' and 'NvTensorRtRtx' support plug-in libraries. Use Windows ML via '--use_winml' to register EPs.")
+        print(
+            "Only 'cuda' and 'NvTensorRtRtx' support plug-in libraries. Use Windows ML via '--use_winml' to register EPs."
+        )
         return

     print(f"Registered {ep} successfully!")

+
 def get_config(path: str, ep: str, ep_options: dict[str, str] = {}, search_options: dict[str, int] = {}) -> og.Config:
     """
     Get og.Config object and set EP-specific and search-specific options inside it
@@ -98,6 +104,7 @@ def get_config(path: str, ep: str, ep_options: dict[str, str] = {}, search_optio
     config.overlay(json.dumps({"search": search_options}))
     return config

+
 def get_search_options(args: argparse.Namespace):
     """
     Get search options for a generator's params during decoding
@@ -128,7 +135,10 @@
     search_options["batch_size"] = search_options.get("batch_size", 1)
     return search_options

-def apply_chat_template(model_path: str, tokenizer: og.Tokenizer, messages: str, add_generation_prompt: bool, tools: str = "") -> str:
+
+def apply_chat_template(
+    model_path: str, tokenizer: og.Tokenizer, messages: str, add_generation_prompt: bool, tools: str = ""
+) -> str:
     """
     Apply the chat template with various fallback options

@@ -151,6 +161,7 @@ def apply_chat_template(model_path: str, tokenizer: og.Tokenizer, messages: str,
     )
     return prompt

+
 def get_user_prompt(prompt: str, non_interactive: bool) -> str:
     """
     Get prompt for 'user' role in chat template
@@ -179,6 +190,7 @@ def get_user_prompt(prompt: str, non_interactive: bool) -> str:

     return text

+
 def get_user_media_paths(media_paths: list[str], non_interactive: bool, media_type: str) -> list[str]:
     """
     Get paths to media for user
@@ -202,7 +214,9 @@ def get_user_media_paths(media_paths: list[str], non_interactive: bool, media_ty
     # If interactive mode is on
     paths = [
         path.strip()
-        for path in input(f"{media_type.capitalize()} Path (comma separated; leave empty if no {media_type}): ").split(",")
+        for path in input(
+            f"{media_type.capitalize()} Path (comma separated; leave empty if no {media_type}): "
+        ).split(",")
     ]

     paths = [path for path in paths if path]
@@ -213,6 +227,7 @@ def get_user_media_paths(media_paths: list[str], non_interactive: bool, media_ty

     return paths

+
 def get_user_images(image_paths: list[str], non_interactive: bool) -> tuple[og.Images, int]:
     """
     Get images for user
@@ -232,6 +247,7 @@ def get_user_images(image_paths: list[str], non_interactive: bool) -> tuple[og.I
     images = og.Images.open(*paths)
     return images, len(paths)

+
 def get_user_audios(audio_paths: list[str], non_interactive: bool) -> tuple[og.Audios, int]:
     """
     Get audios for user
@@ -251,6 +267,7 @@ def get_user_audios(audio_paths: list[str], non_interactive: bool) -> tuple[og.A
     audios = og.Audios.open(*paths)
     return audios, len(paths)

+
 def get_user_content(model_type: str, num_images: int, num_audios: int, prompt: str) -> str | list[dict[str, str]]:
     """
     Get content for 'user' role in chat template
@@ -284,49 +301,59 @@ def get_user_content(model_type: str, num_images: int, num_audios: int, prompt:
         image_tags = "".join(["[IMG]" for _ in range(num_images)])
         content = image_tags + prompt
     else:
-        # Gemma-3 style: structured content
+        # Gemma-3/4 style: structured content with image and audio entries
         image_tags = [{"type": "image"} for _ in range(num_images)]
-        content = image_tags + [{"type": "text", "text": prompt}]
+        audio_tags = [{"type": "audio"} for _ in range(num_audios)]
+        content = image_tags + audio_tags + [{"type": "text", "text": prompt}]
     return content

+
 @dataclass
 class ToolSchema:
     """
     A class for defining a tool in a JSON schema compatible way
     """
+
     description: str
     type: str
     properties: dict[str, Any]
     required: list[str]
     additionalProperties: bool

+
 @dataclass
 class JsonSchema:
     """
     A class for defining a JSON schema for guidance
     """
+
     x_guidance: dict[str, Any]
     type: str
     items: dict[str, list[ToolSchema]]
     minItems: int

+
 @dataclass
 class FunctionDefinition:
     """
     A class for defining a function in an OpenAI-compatible way
     """
+
     name: str
     description: str
     parameters: dict[str, Any]

+
 @dataclass
 class Tool:
     """
     A class for defining a tool in an OpenAI-compatible way
     """
+
     type: str
     function: FunctionDefinition

+
 def tools_to_schemas(tools: list[Tool]) -> list[ToolSchema]:
     """
     Convert a list of tools to a list of tool schemas
@@ -360,6 +387,7 @@ def tools_to_schemas(tools: list[Tool]) -> list[ToolSchema]:
         tool_schemas.append(tool_schema)
     return tool_schemas

+
 def get_json_schema(tools: list[Tool], tool_output: bool) -> str:
     """
     Create a JSON schema from a list of tools
@@ -376,6 +404,7 @@ def get_json_schema(tools: list[Tool], tool_output: bool) -> str:
     d = {k.replace("x_guidance", "x-guidance"): v for k, v in asdict(json_schema).items()}
     return json.dumps(d)

+
 def get_lark_grammar(
     tools: list[Tool],
     text_output: bool,
@@ -423,6 +452,7 @@ def get_lark_grammar(

     return "\n".join(rows)

+
 def to_tool(tool_defs: list[dict[str, Any]]) -> list[Tool]:
     """
     Convert a JSON-deserialized object of tools to a list of Tool objects
@@ -443,6 +473,7 @@ def to_tool(tool_defs: list[dict[str, Any]]) -> list[Tool]:
         tools.append(tool)
     return tools

+
 def get_guidance(
     response_format: str = "",
     filepath: str = "",
@@ -474,7 +505,7 @@
     if tool_output:
         if os.path.exists(filepath):
             # If tools are provided as a file
-            with open(filepath, 'r') as f:
+            with open(filepath) as f:
                 tool_defs = json.load(f)
             tools = to_tool(tool_defs)
         elif tools_str != "":
@@ -488,14 +519,18 @@
         if type(tools[0]) != Tool:
             tools = to_tool(tools)
     else:
-        raise ValueError("Please provide the list of tools through a file, JSON-serialized string, or a list of tools")
+        raise ValueError(
+            "Please provide the list of tools through a file, JSON-serialized string, or a list of tools"
+        )

     assert len(tools) > 0, "Could not obtain a list of tools in memory"

     # Create guidance based on user-provided response format
     if response_format in {"text", "lark_grammar"}:
         if response_format == "text":
-            assert text_output and not tool_output, "A response format of 'text' requires text_output = True and tool_output = False"
+            assert text_output and not tool_output, (
+                "A response format of 'text' requires text_output = True and tool_output = False"
+            )

             guidance_type = "lark_grammar"
             guidance_data = get_lark_grammar(
@@ -506,7 +541,9 @@
                 tool_call_end=tool_call_end,
             )
     elif response_format in {"json_schema", "json_object"}:
-        assert tool_output and not text_output, "A response format of 'json_schema' or 'json_object' requires text_output = False and tool_output = True"
+        assert tool_output and not text_output, (
+            "A response format of 'json_schema' or 'json_object' requires text_output = False and tool_output = True"
+        )

         guidance_type = "json_schema"
         guidance_data = get_json_schema(tools=tools, tool_output=tool_output)
@@ -515,6 +552,7 @@

     return guidance_type, guidance_data, json.dumps([asdict(tool) for tool in tools])

+
 def get_generator_params_args(parser: argparse.ArgumentParser) -> None:
     """
     Add an argument group for the generator params
@@ -525,16 +563,34 @@ def get_generator_params_args(parser: argparse.ArgumentParser) -> None:
     None
     """
     generator_params = parser.add_argument_group("Generator Params")
-    generator_params.add_argument('-c', '--chunk_size', type=int, default=0, help="Chunk size for prefill chunking during context processing (default: 0 = disabled, >0 = enabled)")
-    generator_params.add_argument('-s', '--do_sample', action='store_true', help='Do random sampling. When false, greedy or beam search are used to generate the output. Defaults to false')
-    generator_params.add_argument('-i', '--min_length', type=int, help='Min number of tokens to generate including the prompt')
-    generator_params.add_argument('-l', '--max_length', type=int, help='Max number of tokens to generate including the prompt')
-    generator_params.add_argument('-b', '--num_beams', type=int, default=1, help='Number of beams to create')
-    generator_params.add_argument('-rs', '--num_return_sequences', type=int, default=1, help='Number of return sequences to produce')
-    generator_params.add_argument('-r', '--repetition_penalty', type=float, help='Repetition penalty to sample with')
-    generator_params.add_argument('-t', '--temperature', type=float, help='Temperature to sample with')
-    generator_params.add_argument('-k', '--top_k', type=int, help='Top k tokens to sample from')
-    generator_params.add_argument('-p', '--top_p', type=float, help='Top p probability to sample with')
+    generator_params.add_argument(
+        "-c",
+        "--chunk_size",
+        type=int,
+        default=0,
+        help="Chunk size for prefill chunking during context processing (default: 0 = disabled, >0 = enabled)",
+    )
+    generator_params.add_argument(
+        "-s",
+        "--do_sample",
+        action="store_true",
+        help="Do random sampling. When false, greedy or beam search are used to generate the output. Defaults to false",
+    )
+    generator_params.add_argument(
+        "-i", "--min_length", type=int, help="Min number of tokens to generate including the prompt"
+    )
+    generator_params.add_argument(
+        "-l", "--max_length", type=int, help="Max number of tokens to generate including the prompt"
+    )
+    generator_params.add_argument("-b", "--num_beams", type=int, default=1, help="Number of beams to create")
+    generator_params.add_argument(
+        "-rs", "--num_return_sequences", type=int, default=1, help="Number of return sequences to produce"
+    )
+    generator_params.add_argument("-r", "--repetition_penalty", type=float, help="Repetition penalty to sample with")
+    generator_params.add_argument("-t", "--temperature", type=float, help="Temperature to sample with")
+    generator_params.add_argument("-k", "--top_k", type=int, help="Top k tokens to sample from")
+    generator_params.add_argument("-p", "--top_p", type=float, help="Top p probability to sample with")
+

 def get_guidance_args(parser: argparse.ArgumentParser) -> None:
     """
@@ -546,9 +602,38 @@ def get_guidance_args(parser: argparse.ArgumentParser) -> None:
     None
     """
     guidance = parser.add_argument_group("Guidance Arguments")
-    guidance.add_argument('-rf', '--response_format', type=str, default="", choices=["", "text", "json_object", "json_schema", "lark_grammar"], help='Provide response format for the model')
-    guidance.add_argument('-tf', '--tools_file', type=str, default="", help='Path to file containing list of OpenAI-compatible tool definitions. Ex: test/test_models/tool-definitions/weather.json')
-    guidance.add_argument('-text', '--text_output', action='store_true', default=False, help='Produce a text response in the output')
-    guidance.add_argument('-tool', '--tool_output', action='store_true', default=False, help='Produce a tool call in the output')
-    guidance.add_argument('-tcs', '--tool_call_start', type=str, default="", help='String representation of tool call start (ex: <|tool_call|>). Needs to be marked as special in tokenizer.json for guidance to work.')
-    guidance.add_argument('-tce', '--tool_call_end', type=str, default="", help='String representation of tool call end (ex: <|/tool_call|>). Needs to be marked as special in tokenizer.json for guidance to work.')
+    guidance.add_argument(
+        "-rf",
+        "--response_format",
+        type=str,
+        default="",
+        choices=["", "text", "json_object", "json_schema", "lark_grammar"],
+        help="Provide response format for the model",
+    )
+    guidance.add_argument(
+        "-tf",
+        "--tools_file",
+        type=str,
+        default="",
+        help="Path to file containing list of OpenAI-compatible tool definitions. Ex: test/test_models/tool-definitions/weather.json",
+    )
+    guidance.add_argument(
+        "-text", "--text_output", action="store_true", default=False, help="Produce a text response in the output"
+    )
+    guidance.add_argument(
+        "-tool", "--tool_output", action="store_true", default=False, help="Produce a tool call in the output"
+    )
+    guidance.add_argument(
+        "-tcs",
+        "--tool_call_start",
+        type=str,
+        default="",
+        help="String representation of tool call start (ex: <|tool_call|>). Needs to be marked as special in tokenizer.json for guidance to work.",
+    )
+    guidance.add_argument(
+        "-tce",
+        "--tool_call_end",
+        type=str,
+        default="",
+        help="String representation of tool call end (ex: <|/tool_call|>). Needs to be marked as special in tokenizer.json for guidance to work.",
+    )
```
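For reference, with the `get_user_content` change above, a Gemma-style request with two images and one audio clip produces structured content like the sketch below. The `messages` wrapper follows the usual chat-template convention, and the prompt text is made up.

```python
# What get_user_content builds for a Gemma-3/4 style model with
# num_images=2 and num_audios=1 (mirrors the list comprehensions in the diff).
content = [
    {"type": "image"},
    {"type": "image"},
    {"type": "audio"},
    {"type": "text", "text": "Describe the pictures and transcribe the audio."},
]
messages = [{"role": "user", "content": content}]
```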

src/config.cpp
Lines changed: 6 additions & 0 deletions

```diff
@@ -648,6 +648,8 @@ struct VisionInputs_Element : JSON::Element {
   void OnValue(std::string_view name, JSON::Value value) override {
     if (name == "pixel_values") {
       v_.pixel_values = JSON::Get<std::string_view>(value);
+    } else if (name == "pixel_position_ids") {
+      v_.pixel_position_ids = JSON::Get<std::string_view>(value);
     } else if (name == "image_sizes") {
       v_.image_sizes = JSON::Get<std::string_view>(value);
     } else if (name == "image_grid_thw") {
@@ -1096,6 +1098,10 @@ struct Model_Element : JSON::Element {
       v_.sep_token_id = static_cast<int>(JSON::Get<double>(value));
     } else if (name == "image_token_id") {
       v_.image_token_id = static_cast<int>(JSON::Get<double>(value));
+    } else if (name == "audio_token_id") {
+      v_.audio_token_id = static_cast<int>(JSON::Get<double>(value));
+    } else if (name == "boa_token_id") {
+      v_.boa_token_id = static_cast<int>(JSON::Get<double>(value));
     } else if (name == "video_token_id") {
       v_.video_token_id = static_cast<int>(JSON::Get<double>(value));
     } else if (name == "vision_start_token_id") {
```
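To show where the new keys are read, here is a hypothetical `genai_config.json` fragment assembled in Python, matching the parsers above: `pixel_position_ids` under the vision model's inputs, and `audio_token_id`/`boa_token_id` at the model level. The token-ID values, the file name, and any nesting beyond what the diff shows are placeholders.

```python
# Hypothetical genai_config fragment; IDs and file names are placeholders.
import json

fragment = {
    "model": {
        "audio_token_id": 22222,  # parsed by Model_Element above
        "boa_token_id": 33333,  # parsed by Model_Element above
        "vision": {
            "inputs": {
                "pixel_values": "pixel_values",
                "pixel_position_ids": "pixel_position_ids",  # new vision input
            }
        },
        # Per the summary, the presence of speech.filename enables audio support.
        "speech": {"filename": "speech.onnx"},
    }
}
print(json.dumps(fragment, indent=2))
```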
