Skip to content

Commit b74f088

Browse files
authored
Merge branch 'main' into Arm-backend-Use-symbolic-links-for-dev-pre-commit/push-hooks
2 parents 95d3a29 + adf975c commit b74f088

343 files changed

Lines changed: 11875 additions & 2329 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.ci/scripts/export_model_artifact.sh

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ Arguments:
1919
hf_model HuggingFace model ID (required)
2020
Supported models:
2121
- mistralai/Voxtral-Mini-3B-2507
22+
- mistralai/Voxtral-Mini-4B-Realtime-2602
2223
- openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo})
2324
- google/gemma-3-4b-it
2425
- nvidia/parakeet-tdt
@@ -119,9 +120,17 @@ case "$HF_MODEL" in
119120
PREPROCESSOR_FEATURE_SIZE=""
120121
PREPROCESSOR_OUTPUT=""
121122
;;
123+
mistralai/Voxtral-Mini-4B-Realtime-2602)
124+
MODEL_NAME="voxtral_realtime"
125+
TASK=""
126+
MAX_SEQ_LEN=""
127+
EXTRA_PIP="mistral-common librosa"
128+
PREPROCESSOR_FEATURE_SIZE=""
129+
PREPROCESSOR_OUTPUT=""
130+
;;
122131
*)
123132
echo "Error: Unsupported model '$HF_MODEL'"
124-
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, nvidia/parakeet-tdt"
133+
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, nvidia/parakeet-tdt"
125134
exit 1
126135
;;
127136
esac
@@ -201,6 +210,42 @@ if [ "$MODEL_NAME" = "parakeet" ]; then
201210
exit 0
202211
fi
203212

213+
# Voxtral Realtime uses a custom export script (streaming mode).
# This branch downloads the weights, exports the model and the streaming
# preprocessor into OUTPUT_DIR, checks both .pte files exist, copies the
# tokenizer next to them and exits the script.
if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
  pip install safetensors huggingface_hub

  # Download model weights from HuggingFace (requires HF_TOKEN for gated model).
  # The model ID and the target directory are handed to Python as argv instead
  # of being spliced into the source text, so a quote or backslash in
  # OUTPUT_DIR cannot break (or inject code into) the snippet.
  LOCAL_MODEL_DIR="${OUTPUT_DIR}/model_weights"
  python -c 'import sys; from huggingface_hub import snapshot_download; snapshot_download(sys.argv[1], local_dir=sys.argv[2])' \
    "${HF_MODEL}" "${LOCAL_MODEL_DIR}"

  # Per-component quantization flags, kept as an array so each flag stays a
  # separate word without relying on unquoted word-splitting.
  VR_QUANT_ARGS=()
  if [ "$QUANT_NAME" = "quantized-8da4w" ]; then
    VR_QUANT_ARGS=(
      --qlinear-encoder 8da4w
      --qlinear 8da4w
      --qlinear-group-size 32
      --qembedding 8w
    )
  fi

  # The ${arr[@]+...} form expands to nothing for an empty array, which keeps
  # older bash versions from reporting an unbound variable under `set -u`.
  python -m executorch.examples.models.voxtral_realtime.export_voxtral_rt \
    --model-path "$LOCAL_MODEL_DIR" \
    --backend xnnpack \
    --streaming \
    --output-dir "${OUTPUT_DIR}" \
    ${VR_QUANT_ARGS[@]+"${VR_QUANT_ARGS[@]}"}

  # Export streaming preprocessor (no chunk padding)
  python -m executorch.extension.audio.mel_spectrogram \
    --feature_size 128 \
    --streaming \
    --output_file "${OUTPUT_DIR}/preprocessor.pte"

  # Both exports must have produced their artifact before continuing.
  test -f "${OUTPUT_DIR}/model.pte"
  test -f "${OUTPUT_DIR}/preprocessor.pte"
  # Copy tokenizer from downloaded model weights
  cp "$LOCAL_MODEL_DIR/tekken.json" "${OUTPUT_DIR}/tekken.json"
  ls -al "${OUTPUT_DIR}"
  echo "::endgroup::"
  exit 0
fi
248+
204249
MAX_SEQ_LEN_ARG=""
205250
if [ -n "$MAX_SEQ_LEN" ]; then
206251
MAX_SEQ_LEN_ARG="--max_seq_len $MAX_SEQ_LEN"

.ci/scripts/test_lora.sh

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -41,12 +41,14 @@ HF_ADAPTER_PATH=$(
4141
--files "adapter_config.json" "adapter_model.safetensors"
4242
)
4343

44+
# Set environment variables for OmegaConf interpolation in yaml.
45+
export LORA_ADAPTER_CHECKPOINT="${HF_ADAPTER_PATH}/adapter_model.safetensors"
46+
export LORA_ADAPTER_CONFIG="${HF_ADAPTER_PATH}/adapter_config.json"
47+
4448
### SINGLE LORA PTE ###
4549
# Export LoRA PTE file.
4650
$PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
47-
--config examples/models/qwen3/config/qwen3_xnnpack.yaml \
48-
+base.adapter_checkpoint="${HF_ADAPTER_PATH}/adapter_model.safetensors" \
49-
+base.adapter_config="${HF_ADAPTER_PATH}/adapter_config.json" \
51+
--config examples/models/qwen3/config/qwen3_xnnpack_lora.yaml \
5052
+export.output_name="qwen_lora_math_full.pte"
5153

5254
# Capture the path of the downloaded qwen artifacts
@@ -93,9 +95,7 @@ fi
9395
### PROGRAM DATA SEPARATION ###
9496
# Export LoRA PTE, LoRA PTD, foundation PTD file.
9597
$PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
96-
--config examples/models/qwen3/config/qwen3_xnnpack.yaml \
97-
+base.adapter_checkpoint="${HF_ADAPTER_PATH}/adapter_model.safetensors" \
98-
+base.adapter_config="${HF_ADAPTER_PATH}/adapter_config.json" \
98+
--config examples/models/qwen3/config/qwen3_xnnpack_lora.yaml \
9999
+export.output_name="qwen_lora_math.pte" \
100100
+export.foundation_weights_file="qwen_foundation.ptd" \
101101
+export.lora_weights_file="qwen_lora_math.ptd"
@@ -108,7 +108,7 @@ cmake-out/examples/models/llama/llama_main --model_path=qwen_lora_math.pte --dat
108108
NOW=$(date +"%H:%M:%S")
109109
echo "Finished at ${NOW}"
110110

111-
RESULT=$(cat result.txt)
111+
RESULT=$(cat result2.txt)
112112
if [[ "${RESULT}" == "${EXPECTED_PREFIX}"* ]]; then
113113
echo "Expected result prefix: ${EXPECTED_PREFIX}"
114114
echo "Actual result: ${RESULT}"
@@ -143,18 +143,19 @@ So, 15% of 80 is equal to (80 * 15) / 100 = 1200 / 100 = 12.
143143
The answer is: 12<|im_end|>"
144144

145145
# Export Quantized PTE, PTD file, no LoRA.
146+
# override base.lora_config=null to avoid creating a lora model
147+
# and loading lora weights.
146148
$PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
147-
--config examples/models/qwen3/config/qwen3_xnnpack.yaml \
149+
--config examples/models/qwen3/config/qwen3_xnnpack_lora.yaml \
150+
base.lora_config=null \
148151
+export.output_name="qwen_q.pte" \
149152
+export.foundation_weights_file="qwen_foundation_q.ptd" \
150153
+quantization.qmode="8da4w" \
151154
+quantization.group_size=32
152155

153156
# Export Quantized LoRA PTE, LoRA PTD, foundation PTD file.
154157
$PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
155-
--config examples/models/qwen3/config/qwen3_xnnpack.yaml \
156-
+base.adapter_checkpoint="${HF_ADAPTER_PATH}/adapter_model.safetensors" \
157-
+base.adapter_config="${HF_ADAPTER_PATH}/adapter_config.json" \
158+
--config examples/models/qwen3/config/qwen3_xnnpack_lora.yaml \
158159
+export.output_name="qwen_lora_math_q.pte" \
159160
+export.foundation_weights_file="qwen_foundation_lora_q.ptd" \
160161
+export.lora_weights_file="qwen_lora_math_q.ptd" \
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
#!/bin/bash
2+
# Copyright (c) Meta Platforms, Inc. and affiliates.
3+
# All rights reserved.
4+
#
5+
# This source code is licensed under the BSD-style license found in the
6+
# LICENSE file in the root directory of this source tree.
7+
8+
set -exu
9+
# shellcheck source=/dev/null
10+
source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
11+
12+
cmake_install_executorch_libraries() {
13+
echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a"
14+
rm -rf cmake-out
15+
cmake --workflow llm-release
16+
}
17+
18+
# Builds the llama runner: refreshes the tokenizers git submodule, then runs
# the `llama-cpu` make target from the caller's working directory.
cmake_build_llama_runner() {
  printf '%s\n' "Building llama runner"

  # `git submodule update --init` is run from inside the tokenizers
  # directory; pushd/popd return to the starting directory afterwards.
  pushd extension/llm/tokenizers
  printf '%s\n' "Updating tokenizers submodule"
  git submodule update --init
  popd

  make llama-cpu
}
26+
27+
cleanup_files() {
28+
echo "Deleting downloaded and generated files"
29+
rm -rf "${HF_QWEN_PATH}/"
30+
rm -rf "${HF_ADAPTER_PATH}/"
31+
rm -rf *.pte
32+
rm -f result*.txt
33+
}
34+
35+
# Download LoRA adapter.
# download_hf_hub.sh is looked up next to this script; whatever it prints on
# stdout is captured as the adapter path.
python -m pip install -q huggingface_hub
HF_ADAPTER_REPO="lucylq/qwen3_06B_lora_math"
HF_ADAPTER_PATH=$(
  bash "$(dirname "${BASH_SOURCE[0]}")/download_hf_hub.sh" \
    --model_id "${HF_ADAPTER_REPO}" \
    --files "adapter_config.json" "adapter_model.safetensors"
)

# Download base model (for tokenizer path).
HF_QWEN_PATH=$(
  python -c "
from huggingface_hub import snapshot_download
print(snapshot_download('unsloth/Qwen3-0.6B'))
"
)
echo "Model downloaded to: $HF_QWEN_PATH"

### EXPORT MULTIMETHOD PTE ###
# Set environment variables for OmegaConf interpolation in yaml.
export LORA_ADAPTER_CHECKPOINT="${HF_ADAPTER_PATH}/adapter_model.safetensors"
export LORA_ADAPTER_CONFIG="${HF_ADAPTER_PATH}/adapter_config.json"

# PYTHON_EXECUTABLE is left unquoted, exactly as the script has always
# invoked it.
$PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
  --config examples/models/qwen3/config/qwen3_multimethod.yaml
55+
56+
### BUILD LLAMA RUNNER ###
cmake_install_executorch_libraries
cmake_build_llama_runner

# Runner constants.
# RUNTIME_ARGS is a single string that is word-split where it is used.
RUNTIME_ARGS="--tokenizer_path=${HF_QWEN_PATH}/ --temperature=0 --seq_len=100 --warmup=1"
PROMPT="<|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant"

# Expected outputs.
# Both prefixes contain the prompt text verbatim, so they are composed from
# PROMPT. The LoRA prefix is: newline, prompt, newline, start of the answer.
EXPECTED_LORA_PREFIX=$'\n'"${PROMPT}"$'\n'"To calculate 15% of 80"

# The base prefix is: prompt followed by ":", then the <think> block opening.
EXPECTED_BASE_PREFIX="${PROMPT}:"$'\n'"<think>"$'\n'"Okay, so I need to calculate 15% of 80."
72+
73+
### MULTIMETHOD RUNTIME TESTS ###

# Runs one method of multimethod_qwen.pte through llama_main and checks that
# the captured stdout starts with the expected prefix.
#   $1 - test number used in the log messages
#   $2 - method name passed to --method_name
#   $3 - file that receives the runner's stdout
#   $4 - expected output prefix
# Globals: PROMPT, RUNTIME_ARGS (read); NOW, RESULT (written).
# On mismatch it prints both strings, calls cleanup_files and exits 1.
run_multimethod_test() {
  local test_no=$1
  local method=$2
  local out_file=$3
  local expected=$4

  NOW=$(date +"%H:%M:%S")
  echo "Test ${test_no}: Multimethod ${method}. Starting at ${NOW}"
  # RUNTIME_ARGS is intentionally unquoted so it splits into separate flags.
  # shellcheck disable=SC2086
  cmake-out/examples/models/llama/llama_main \
    --model_path=multimethod_qwen.pte \
    --method_name="${method}" \
    --prompt="${PROMPT}" \
    ${RUNTIME_ARGS} > "${out_file}"
  NOW=$(date +"%H:%M:%S")
  echo "Finished at ${NOW}"

  RESULT=$(cat "${out_file}")
  if [[ "${RESULT}" != "${expected}"* ]]; then
    echo "Test ${test_no} (${method}): Failure"
    echo "Expected result prefix: ${expected}"
    echo "Actual result: ${RESULT}"
    cleanup_files
    exit 1
  fi
  echo "Test ${test_no} (${method}): Success"
}

### TEST 1: Run lora_forward method ###
run_multimethod_test 1 lora_forward result_lora.txt "${EXPECTED_LORA_PREFIX}"

### TEST 2: Run base_forward method ###
run_multimethod_test 2 base_forward result_base.txt "${EXPECTED_BASE_PREFIX}"

echo "Multimethod tests passed!"
cleanup_files

.ci/scripts/test_model_e2e.sh

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ Arguments:
2222
- openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo})
2323
- google/gemma-3-4b-it
2424
- nvidia/parakeet-tdt
25+
- mistralai/Voxtral-Mini-4B-Realtime-2602
2526
2627
quant_name Quantization type (required)
2728
Options:
@@ -135,9 +136,21 @@ case "$HF_MODEL" in
135136
AUDIO_FILE="test_audio.wav"
136137
IMAGE_PATH=""
137138
;;
139+
mistralai/Voxtral-Mini-4B-Realtime-2602)
140+
MODEL_NAME="voxtral_realtime"
141+
RUNNER_TARGET="voxtral_realtime_runner"
142+
RUNNER_PATH="voxtral_realtime"
143+
EXPECTED_OUTPUT="Quilter"
144+
PREPROCESSOR="preprocessor.pte"
145+
TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-4B-Realtime-2602/resolve/main" # @lint-ignore
146+
TOKENIZER_FILE="tekken.json"
147+
AUDIO_URL=""
148+
AUDIO_FILE="test_audio.wav"
149+
IMAGE_PATH=""
150+
;;
138151
*)
139152
echo "Error: Unsupported model '$HF_MODEL'"
140-
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, nvidia/parakeet-tdt"
153+
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, nvidia/parakeet-tdt"
141154
exit 1
142155
;;
143156
esac
@@ -150,8 +163,8 @@ echo "::endgroup::"
150163
echo "::group::Prepare $MODEL_NAME Artifacts"
151164

152165

153-
# Download tokenizer files (skip for parakeet which exports tokenizer with model)
154-
if [ "$MODEL_NAME" != "parakeet" ]; then
166+
# Download tokenizer files (skip for parakeet and voxtral_realtime which bundle tokenizer in export)
167+
if [ "$MODEL_NAME" != "parakeet" ] && [ "$MODEL_NAME" != "voxtral_realtime" ]; then
155168
if [ "$TOKENIZER_FILE" != "" ]; then
156169
curl -L $TOKENIZER_URL/$TOKENIZER_FILE -o $MODEL_DIR/$TOKENIZER_FILE
157170
else
@@ -164,7 +177,7 @@ fi
164177
# Download test files
165178
if [ "$AUDIO_URL" != "" ]; then
166179
curl -L $AUDIO_URL -o ${MODEL_DIR}/$AUDIO_FILE
167-
elif [[ "$MODEL_NAME" == *whisper* ]]; then
180+
elif [[ "$MODEL_NAME" == *whisper* ]] || [ "$MODEL_NAME" = "voxtral_realtime" ]; then
168181
conda install -y -c conda-forge "ffmpeg<8"
169182
pip install datasets soundfile
170183
pip install torchcodec --extra-index-url https://download.pytorch.org/whl/nightly/cpu
@@ -222,6 +235,9 @@ case "$MODEL_NAME" in
222235
RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
223236
fi
224237
;;
238+
voxtral_realtime)
239+
RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --preprocessor_path ${MODEL_DIR}/$PREPROCESSOR --audio_path ${MODEL_DIR}/$AUDIO_FILE --temperature 0 --streaming"
240+
;;
225241
esac
226242

227243
OUTPUT=$($RUNNER_BIN $RUNNER_ARGS 2>&1)

.ci/scripts/unittest-linux-cmake.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/usr/bin/env bash
22
# Copyright (c) Meta Platforms, Inc. and affiliates.
33
# All rights reserved.
4+
# Copyright 2026 Arm Limited and/or its affiliates.
45
#
56
# This source code is licensed under the BSD-style license found in the
67
# LICENSE file in the root directory of this source tree.
@@ -18,6 +19,9 @@ if ! python -c "import tosa_serializer" >/dev/null 2>&1; then
1819
TOSA_SERIALIZATION_DIR="${TOSA_TOOLS_DIR}/serialization"
1920
fi
2021

22+
# Workaround to allow TOSA serializer to build for v2025.11.0
23+
python -m pip install pybind11==2.10.4
24+
2125
CMAKE_POLICY_VERSION_MINIMUM=3.5 BUILD_PYBIND=1 \
2226
python -m pip install --no-dependencies \
2327
"${TOSA_SERIALIZATION_DIR}"

0 commit comments

Comments
 (0)