pytorch
diff --git a/‎.ci/scripts/export_model_artifact.sh‎
Lines changed: 46 additions & 1 deletion b/‎.ci/scripts/export_model_artifact.sh‎
Lines changed: 46 additions & 1 deletion
diff --git a/‎.ci/scripts/test_lora.sh‎
Lines changed: 12 additions & 11 deletions b/‎.ci/scripts/test_lora.sh‎
Lines changed: 12 additions & 11 deletions
diff --git a/‎.ci/scripts/test_lora_multimethod.sh‎
Lines changed: 120 additions & 0 deletions b/‎.ci/scripts/test_lora_multimethod.sh‎
Lines changed: 120 additions & 0 deletions
diff --git a/‎.ci/scripts/test_model_e2e.sh‎
Lines changed: 20 additions & 4 deletions b/‎.ci/scripts/test_model_e2e.sh‎
Lines changed: 20 additions & 4 deletions
diff --git a/‎.ci/scripts/unittest-linux-cmake.sh‎
Lines changed: 4 additions & 0 deletions b/‎.ci/scripts/unittest-linux-cmake.sh‎
Lines changed: 4 additions & 0 deletions
@@ -19,6 +19,7 @@ Arguments:
   hf_model     HuggingFace model ID (required)
                Supported models:
                  - mistralai/Voxtral-Mini-3B-2507
+                 - mistralai/Voxtral-Mini-4B-Realtime-2602
                  - openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo})
                  - google/gemma-3-4b-it
                  - nvidia/parakeet-tdt
@@ -119,9 +120,17 @@ case "$HF_MODEL" in
     PREPROCESSOR_FEATURE_SIZE=""
     PREPROCESSOR_OUTPUT=""
     ;;
+  mistralai/Voxtral-Mini-4B-Realtime-2602)
+    MODEL_NAME="voxtral_realtime"
+    TASK=""
+    MAX_SEQ_LEN=""
+    EXTRA_PIP="mistral-common librosa"
+    PREPROCESSOR_FEATURE_SIZE=""
+    PREPROCESSOR_OUTPUT=""
+    ;;
   *)
     echo "Error: Unsupported model '$HF_MODEL'"
-    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, nvidia/parakeet-tdt"
+    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, nvidia/parakeet-tdt"
     exit 1
     ;;
 esac
@@ -201,6 +210,42 @@ if [ "$MODEL_NAME" = "parakeet" ]; then
   exit 0
 fi
 
+# Voxtral Realtime uses a custom export script (streaming mode)
+if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
+  pip install safetensors huggingface_hub
+
+  # Download model weights from HuggingFace (requires HF_TOKEN for gated model).
+  # The repo id and target directory are handed to Python through sys.argv
+  # instead of being spliced into the source text, so a quote or backslash
+  # in OUTPUT_DIR cannot break or alter the snippet.
+  LOCAL_MODEL_DIR="${OUTPUT_DIR}/model_weights"
+  python -c 'import sys; from huggingface_hub import snapshot_download; snapshot_download(sys.argv[1], local_dir=sys.argv[2])' \
+      "${HF_MODEL}" "${LOCAL_MODEL_DIR}"
+
+  # Per-component quantization flags (empty unless 8da4w quantization is requested)
+  VR_QUANT_ARGS=""
+  if [ "$QUANT_NAME" = "quantized-8da4w" ]; then
+    VR_QUANT_ARGS="--qlinear-encoder 8da4w --qlinear 8da4w --qlinear-group-size 32 --qembedding 8w"
+  fi
+
+  # VR_QUANT_ARGS is expanded unquoted on purpose: it holds zero or more
+  # space-separated flags that must be word-split into separate arguments.
+  # shellcheck disable=SC2086
+  python -m executorch.examples.models.voxtral_realtime.export_voxtral_rt \
+      --model-path "$LOCAL_MODEL_DIR" \
+      --backend xnnpack \
+      --streaming \
+      --output-dir "${OUTPUT_DIR}" \
+      ${VR_QUANT_ARGS}
+
+  # Export streaming preprocessor (no chunk padding)
+  python -m executorch.extension.audio.mel_spectrogram \
+      --feature_size 128 \
+      --streaming \
+      --output_file "${OUTPUT_DIR}/preprocessor.pte"
+
+  # Verify that both exports produced their expected artifacts.
+  test -f "${OUTPUT_DIR}/model.pte"
+  test -f "${OUTPUT_DIR}/preprocessor.pte"
+  # Copy tokenizer from downloaded model weights
+  cp "$LOCAL_MODEL_DIR/tekken.json" "${OUTPUT_DIR}/tekken.json"
+  ls -al "${OUTPUT_DIR}"
+  echo "::endgroup::"
+  exit 0
+fi
+
 MAX_SEQ_LEN_ARG=""
 if [ -n "$MAX_SEQ_LEN" ]; then
   MAX_SEQ_LEN_ARG="--max_seq_len $MAX_SEQ_LEN"
 
@@ -41,12 +41,14 @@ HF_ADAPTER_PATH=$(
     --files "adapter_config.json" "adapter_model.safetensors"
 )
 
+# Set environment variables for OmegaConf interpolation in yaml.
+export LORA_ADAPTER_CHECKPOINT="${HF_ADAPTER_PATH}/adapter_model.safetensors"
+export LORA_ADAPTER_CONFIG="${HF_ADAPTER_PATH}/adapter_config.json"
+
 ### SINGLE LORA PTE ###
 # Export LoRA PTE file.
 $PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
-    --config examples/models/qwen3/config/qwen3_xnnpack.yaml \
-    +base.adapter_checkpoint="${HF_ADAPTER_PATH}/adapter_model.safetensors" \
-    +base.adapter_config="${HF_ADAPTER_PATH}/adapter_config.json" \
+    --config examples/models/qwen3/config/qwen3_xnnpack_lora.yaml \
     +export.output_name="qwen_lora_math_full.pte"
 
 # Capture the path of the downloaded qwen artifacts
@@ -93,9 +95,7 @@ fi
 ### PROGRAM DATA SEPARATION ###
 # Export LoRA PTE, LoRA PTD, foundation PTD file.
 $PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
-    --config examples/models/qwen3/config/qwen3_xnnpack.yaml \
-    +base.adapter_checkpoint="${HF_ADAPTER_PATH}/adapter_model.safetensors" \
-    +base.adapter_config="${HF_ADAPTER_PATH}/adapter_config.json" \
+    --config examples/models/qwen3/config/qwen3_xnnpack_lora.yaml \
     +export.output_name="qwen_lora_math.pte" \
     +export.foundation_weights_file="qwen_foundation.ptd" \
     +export.lora_weights_file="qwen_lora_math.ptd"
@@ -108,7 +108,7 @@ cmake-out/examples/models/llama/llama_main --model_path=qwen_lora_math.pte --dat
 NOW=$(date +"%H:%M:%S")
 echo "Finished at ${NOW}"
 
-RESULT=$(cat result.txt)
+RESULT=$(cat result2.txt)
 if [[ "${RESULT}" == "${EXPECTED_PREFIX}"* ]]; then
   echo "Expected result prefix: ${EXPECTED_PREFIX}"
   echo "Actual result: ${RESULT}"
@@ -143,18 +143,19 @@ So, 15% of 80 is equal to (80 * 15) / 100 = 1200 / 100 = 12.
 The answer is: 12<|im_end|>"
 
 # Export Quantized PTE, PTD file, no LoRA.
+# override base.lora_config=null to avoid creating a lora model
+# and loading lora weights.
 $PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
-    --config examples/models/qwen3/config/qwen3_xnnpack.yaml \
+    --config examples/models/qwen3/config/qwen3_xnnpack_lora.yaml \
+    base.lora_config=null \
     +export.output_name="qwen_q.pte" \
     +export.foundation_weights_file="qwen_foundation_q.ptd" \
     +quantization.qmode="8da4w" \
     +quantization.group_size=32
 
 # Export Quantized LoRA PTE, LoRA PTD, foundation PTD file.
 $PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
-    --config examples/models/qwen3/config/qwen3_xnnpack.yaml \
-    +base.adapter_checkpoint="${HF_ADAPTER_PATH}/adapter_model.safetensors" \
-    +base.adapter_config="${HF_ADAPTER_PATH}/adapter_config.json" \
+    --config examples/models/qwen3/config/qwen3_xnnpack_lora.yaml \
     +export.output_name="qwen_lora_math_q.pte" \
     +export.foundation_weights_file="qwen_foundation_lora_q.ptd" \
     +export.lora_weights_file="qwen_lora_math_q.ptd" \
 
@@ -0,0 +1,120 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Strict mode: abort on the first failing command (-e), trace each command
+# as it runs (-x), and treat expansion of an unset variable as an error (-u).
+set -exu
+# Load utils.sh from the directory that contains this script.
+# shellcheck source=/dev/null
+source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
+
+# Rebuild from a clean tree: delete any existing cmake-out directory, then
+# run the `llm-release` CMake workflow. Takes no arguments.
+cmake_install_executorch_libraries() {
+    printf '%s\n' "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a"
+    rm -rf cmake-out
+    cmake --workflow llm-release
+}
+
+# Initialise the git submodule(s) under extension/llm/tokenizers, then build
+# the `llama-cpu` make target from the original working directory.
+# Takes no arguments.
+cmake_build_llama_runner() {
+    printf '%s\n' "Building llama runner"
+    # pushd/popd so the submodule update runs inside the tokenizers checkout
+    # and the make invocation runs from where the caller started.
+    pushd extension/llm/tokenizers
+    printf '%s\n' "Updating tokenizers submodule"
+    git submodule update --init
+    popd
+    make llama-cpu
+}
+
+# Remove everything this test downloaded or generated: the Qwen snapshot
+# directory, the LoRA adapter directory, and the *.pte / result*.txt files
+# in the current directory.
+# Globals: HF_QWEN_PATH, HF_ADAPTER_PATH (read).
+cleanup_files() {
+  echo "Deleting downloaded and generated files"
+  # `:?` aborts instead of letting an empty path expand to `rm -rf /`;
+  # `set -u` only catches the unset case, not the empty one.
+  rm -rf -- "${HF_QWEN_PATH:?}/"
+  rm -rf -- "${HF_ADAPTER_PATH:?}/"
+  # The ./ prefix keeps a file name starting with '-' from being read as an option.
+  rm -rf -- ./*.pte
+  rm -f -- ./result*.txt
+}
+
+# Interpreter for the export step. Fall back to `python` (the interpreter the
+# download steps below already call) when PYTHON_EXECUTABLE is unset or empty,
+# so `set -u` does not abort the script after the downloads have finished.
+PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE:-python}"
+
+# Download LoRA adapter.
+python -m pip install -q huggingface_hub
+HF_ADAPTER_REPO="lucylq/qwen3_06B_lora_math"
+HF_ADAPTER_PATH=$(
+  bash "$(dirname "${BASH_SOURCE[0]}")/download_hf_hub.sh" \
+    --model_id "${HF_ADAPTER_REPO}" \
+    --files "adapter_config.json" "adapter_model.safetensors"
+)
+
+# Download base model (for tokenizer path).
+HF_QWEN_PATH=$(python -c "from huggingface_hub import snapshot_download; print(snapshot_download('unsloth/Qwen3-0.6B'))")
+echo "Model downloaded to: $HF_QWEN_PATH"
+
+### EXPORT MULTIMETHOD PTE ###
+# Set environment variables for OmegaConf interpolation in yaml.
+export LORA_ADAPTER_CHECKPOINT="${HF_ADAPTER_PATH}/adapter_model.safetensors"
+export LORA_ADAPTER_CONFIG="${HF_ADAPTER_PATH}/adapter_config.json"
+
+# PYTHON_EXECUTABLE stays unquoted so a multi-word launcher still word-splits.
+$PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
+    --config examples/models/qwen3/config/qwen3_multimethod.yaml
+
+### BUILD LLAMA RUNNER ###
+cmake_install_executorch_libraries
+cmake_build_llama_runner
+
+# Runner constants.
+# Held in an array so every flag reaches llama_main as exactly one argument,
+# even if the tokenizer path contains spaces or glob characters.
+RUNTIME_ARGS=(
+  "--tokenizer_path=${HF_QWEN_PATH}/"
+  --temperature=0
+  --seq_len=100
+  --warmup=1
+)
+PROMPT="<|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant"
+
+# Expected outputs. Each test passes when the runner's captured stdout
+# starts with the corresponding prefix.
+EXPECTED_LORA_PREFIX="
+<|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant
+To calculate 15% of 80"
+
+EXPECTED_BASE_PREFIX="<|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant:
+<think>
+Okay, so I need to calculate 15% of 80."
+
+### TEST 1: Run lora_forward method ###
+NOW=$(date +"%H:%M:%S")
+echo "Test 1: Multimethod lora_forward. Starting at ${NOW}"
+# Capture the runner's stdout for the prefix comparison below.
+cmake-out/examples/models/llama/llama_main \
+    --model_path=multimethod_qwen.pte \
+    --method_name=lora_forward \
+    --prompt="${PROMPT}" \
+    "${RUNTIME_ARGS[@]}" > result_lora.txt
+NOW=$(date +"%H:%M:%S")
+echo "Finished at ${NOW}"
+
+RESULT=$(<result_lora.txt)
+if [[ "${RESULT}" == "${EXPECTED_LORA_PREFIX}"* ]]; then
+  echo "Test 1 (lora_forward): Success"
+else
+  echo "Test 1 (lora_forward): Failure"
+  echo "Expected result prefix: ${EXPECTED_LORA_PREFIX}"
+  echo "Actual result: ${RESULT}"
+  cleanup_files
+  exit 1
+fi
+
+### TEST 2: Run base_forward method ###
+NOW=$(date +"%H:%M:%S")
+echo "Test 2: Multimethod base_forward. Starting at ${NOW}"
+# Same .pte file, different method; stdout is captured for the comparison below.
+cmake-out/examples/models/llama/llama_main \
+    --model_path=multimethod_qwen.pte \
+    --method_name=base_forward \
+    --prompt="${PROMPT}" \
+    "${RUNTIME_ARGS[@]}" > result_base.txt
+NOW=$(date +"%H:%M:%S")
+echo "Finished at ${NOW}"
+
+RESULT=$(<result_base.txt)
+if [[ "${RESULT}" == "${EXPECTED_BASE_PREFIX}"* ]]; then
+  echo "Test 2 (base_forward): Success"
+else
+  echo "Test 2 (base_forward): Failure"
+  echo "Expected result prefix: ${EXPECTED_BASE_PREFIX}"
+  echo "Actual result: ${RESULT}"
+  cleanup_files
+  exit 1
+fi
+
+echo "Multimethod tests passed!"
+cleanup_files
@@ -22,6 +22,7 @@ Arguments:
                 - openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo})
                 - google/gemma-3-4b-it
                 - nvidia/parakeet-tdt
+                - mistralai/Voxtral-Mini-4B-Realtime-2602
 
   quant_name  Quantization type (required)
               Options:
@@ -135,9 +136,21 @@ case "$HF_MODEL" in
     AUDIO_FILE="test_audio.wav"
     IMAGE_PATH=""
     ;;
+  mistralai/Voxtral-Mini-4B-Realtime-2602)
+    MODEL_NAME="voxtral_realtime"
+    RUNNER_TARGET="voxtral_realtime_runner"
+    RUNNER_PATH="voxtral_realtime"
+    EXPECTED_OUTPUT="Quilter"
+    PREPROCESSOR="preprocessor.pte"
+    TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-4B-Realtime-2602/resolve/main" # @lint-ignore
+    TOKENIZER_FILE="tekken.json"
+    AUDIO_URL=""
+    AUDIO_FILE="test_audio.wav"
+    IMAGE_PATH=""
+    ;;
   *)
     echo "Error: Unsupported model '$HF_MODEL'"
-    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, nvidia/parakeet-tdt"
+    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, nvidia/parakeet-tdt"
     exit 1
     ;;
 esac
@@ -150,8 +163,8 @@ echo "::endgroup::"
 echo "::group::Prepare $MODEL_NAME Artifacts"
 
 
-# Download tokenizer files (skip for parakeet which exports tokenizer with model)
-if [ "$MODEL_NAME" != "parakeet" ]; then
+# Download tokenizer files (skip for parakeet and voxtral_realtime which bundle tokenizer in export)
+if [ "$MODEL_NAME" != "parakeet" ] && [ "$MODEL_NAME" != "voxtral_realtime" ]; then
   if [ "$TOKENIZER_FILE" != "" ]; then
     curl -L $TOKENIZER_URL/$TOKENIZER_FILE -o $MODEL_DIR/$TOKENIZER_FILE
   else
@@ -164,7 +177,7 @@ fi
 # Download test files
 if [ "$AUDIO_URL" != "" ]; then
   curl -L $AUDIO_URL -o ${MODEL_DIR}/$AUDIO_FILE
-elif [[ "$MODEL_NAME" == *whisper* ]]; then
+elif [[ "$MODEL_NAME" == *whisper* ]] || [ "$MODEL_NAME" = "voxtral_realtime" ]; then
   conda install -y -c conda-forge "ffmpeg<8"
   pip install datasets soundfile
   pip install torchcodec --extra-index-url https://download.pytorch.org/whl/nightly/cpu
@@ -222,6 +235,9 @@ case "$MODEL_NAME" in
       RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
     fi
     ;;
+  voxtral_realtime)
+    RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --preprocessor_path ${MODEL_DIR}/$PREPROCESSOR --audio_path ${MODEL_DIR}/$AUDIO_FILE --temperature 0 --streaming"
+    ;;
 esac
 
 OUTPUT=$($RUNNER_BIN $RUNNER_ARGS 2>&1)
 
@@ -1,6 +1,7 @@
 #!/usr/bin/env bash
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
+# Copyright 2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -18,6 +19,9 @@ if ! python -c "import tosa_serializer" >/dev/null 2>&1; then
     TOSA_SERIALIZATION_DIR="${TOSA_TOOLS_DIR}/serialization"
   fi
 
+  # Workaround to allow TOSA serializer to build for v2025.11.0
+  python -m pip install pybind11==2.10.4
+
   CMAKE_POLICY_VERSION_MINIMUM=3.5 BUILD_PYBIND=1 \
     python -m pip install --no-dependencies \
     "${TOSA_SERIALIZATION_DIR}"