Skip to content

Commit 0a12cfc

Browse files
authored
Merge branch 'main' into fix/16032-tensors-same-dim-order-semantic-equivalence
2 parents 0c8b8e8 + 464a978 commit 0a12cfc

403 files changed

Lines changed: 14874 additions & 3284 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
5bf1aeb587e9b1f3572b0bd60265c5dafd007b73
1+
a9592258daacad7423fd5f39aaa59c6e36471520

.ci/scripts/build-qnn-sdk.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ build_qnn_backend() {
1818
export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)"
1919

2020
parallelism=$(( $(nproc) - 1 ))
21-
bash backends/qualcomm/scripts/build.sh --skip_linux_android --skip_linux_embedded --job_number ${parallelism} --release
21+
bash backends/qualcomm/scripts/build.sh --skip_linux_android --job_number ${parallelism} --release
2222
}
2323

2424
set_up_aot() {

.ci/scripts/export_model_artifact.sh

Lines changed: 64 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ Arguments:
2222
- mistralai/Voxtral-Mini-4B-Realtime-2602
2323
- openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo})
2424
- google/gemma-3-4b-it
25+
- nvidia/diar_streaming_sortformer_4spk-v2
2526
- nvidia/parakeet-tdt
2627
2728
quant_name Quantization type (optional, default: non-quantized)
@@ -45,6 +46,7 @@ Examples:
4546
export_model_artifact.sh metal "mistralai/Voxtral-Mini-4B-Realtime-2602" "quantized-int4-metal"
4647
export_model_artifact.sh metal "mistralai/Voxtral-Mini-4B-Realtime-2602" "non-quantized" "." "vr-streaming"
4748
export_model_artifact.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed"
49+
export_model_artifact.sh cuda-windows "nvidia/diar_streaming_sortformer_4spk-v2" "non-quantized" "./output"
4850
export_model_artifact.sh cuda "google/gemma-3-4b-it" "non-quantized" "./output"
4951
export_model_artifact.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./output"
5052
export_model_artifact.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./output"
@@ -141,6 +143,14 @@ case "$HF_MODEL" in
141143
PREPROCESSOR_FEATURE_SIZE=""
142144
PREPROCESSOR_OUTPUT=""
143145
;;
146+
Qwen/Qwen3-0.6B)
147+
MODEL_NAME="qwen3"
148+
TASK="text-generation"
149+
MAX_SEQ_LEN="64"
150+
EXTRA_PIP=""
151+
PREPROCESSOR_FEATURE_SIZE=""
152+
PREPROCESSOR_OUTPUT=""
153+
;;
144154
nvidia/parakeet-tdt)
145155
MODEL_NAME="parakeet"
146156
TASK=""
@@ -149,6 +159,14 @@ case "$HF_MODEL" in
149159
PREPROCESSOR_FEATURE_SIZE=""
150160
PREPROCESSOR_OUTPUT=""
151161
;;
162+
nvidia/diar_streaming_sortformer_4spk-v2)
163+
MODEL_NAME="sortformer"
164+
TASK=""
165+
MAX_SEQ_LEN=""
166+
EXTRA_PIP=""
167+
PREPROCESSOR_FEATURE_SIZE=""
168+
PREPROCESSOR_OUTPUT=""
169+
;;
152170
mistralai/Voxtral-Mini-4B-Realtime-2602)
153171
MODEL_NAME="voxtral_realtime"
154172
TASK=""
@@ -159,7 +177,7 @@ case "$HF_MODEL" in
159177
;;
160178
*)
161179
echo "Error: Unsupported model '$HF_MODEL'"
162-
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, nvidia/parakeet-tdt"
180+
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt"
163181
exit 1
164182
;;
165183
esac
@@ -239,6 +257,42 @@ if [ "$MODEL_NAME" = "parakeet" ]; then
239257
exit 0
240258
fi
241259

260+
# Sortformer uses a custom export script
261+
if [ "$MODEL_NAME" = "sortformer" ]; then
262+
if [ "$QUANT_NAME" != "non-quantized" ]; then
263+
echo "Error: Sortformer currently supports only non-quantized export"
264+
exit 1
265+
fi
266+
267+
pip install -r examples/models/sortformer/install_requirements.txt
268+
269+
SORTFORMER_BACKEND="$DEVICE"
270+
if [ "$DEVICE" = "cuda-windows" ]; then
271+
SORTFORMER_BACKEND="cuda-windows"
272+
elif [ "$DEVICE" = "cuda" ]; then
273+
SORTFORMER_BACKEND="cuda"
274+
elif [ "$DEVICE" = "xnnpack" ]; then
275+
SORTFORMER_BACKEND="xnnpack"
276+
else
277+
SORTFORMER_BACKEND="portable"
278+
fi
279+
280+
python -m executorch.examples.models.sortformer.export_sortformer \
281+
--hf-model "${HF_MODEL}" \
282+
--backend "${SORTFORMER_BACKEND}" \
283+
--output-dir "${OUTPUT_DIR}"
284+
285+
test -f "${OUTPUT_DIR}/sortformer.pte"
286+
mv "${OUTPUT_DIR}/sortformer.pte" "${OUTPUT_DIR}/model.pte"
287+
# CUDA saves named data to separate .ptd file, XNNPACK/portable do not.
288+
if [ "$DEVICE" = "cuda" ] || [ "$DEVICE" = "cuda-windows" ]; then
289+
test -f "${OUTPUT_DIR}/aoti_cuda_blob.ptd"
290+
fi
291+
ls -al "${OUTPUT_DIR}"
292+
echo "::endgroup::"
293+
exit 0
294+
fi
295+
242296
# Voxtral Realtime uses a custom export script
243297
if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
244298
pip install safetensors huggingface_hub
@@ -249,10 +303,14 @@ if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
249303

250304
# Per-component quantization flags
251305
VR_QUANT_ARGS=""
306+
VR_DTYPE_ARGS=""
252307
if [ "$QUANT_NAME" = "quantized-8da4w" ]; then
253308
VR_QUANT_ARGS="--qlinear-encoder 8da4w --qlinear 8da4w --qlinear-group-size 32 --qembedding 8w"
254309
elif [ "$QUANT_NAME" = "quantized-int4-metal" ]; then
255310
VR_QUANT_ARGS="--qlinear-encoder fpa4w --qlinear fpa4w"
311+
elif [ "$QUANT_NAME" = "quantized-int4-tile-packed" ]; then
312+
VR_QUANT_ARGS="--qlinear-encoder 4w --qlinear-encoder-packing-format tile_packed_to_4d --qlinear 4w --qlinear-packing-format tile_packed_to_4d --qembedding 8w"
313+
VR_DTYPE_ARGS="--dtype bf16"
256314
fi
257315

258316
# Determine streaming mode based on MODE parameter
@@ -276,13 +334,17 @@ if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
276334
--backend "$DEVICE" \
277335
${STREAMING_ARG} \
278336
--output-dir "${OUTPUT_DIR}" \
279-
${VR_QUANT_ARGS}
337+
${VR_QUANT_ARGS} \
338+
${VR_DTYPE_ARGS}
280339

281340
# Export preprocessor
282341
python -m executorch.extension.audio.mel_spectrogram ${PREPROCESSOR_ARGS}
283342

284343
test -f "${OUTPUT_DIR}/model.pte"
285344
test -f "${OUTPUT_DIR}/preprocessor.pte"
345+
if [ "$DEVICE" = "cuda" ] || [ "$DEVICE" = "cuda-windows" ]; then
346+
test -f "${OUTPUT_DIR}/aoti_cuda_blob.ptd"
347+
fi
286348
# Copy tokenizer from downloaded model weights
287349
cp "$LOCAL_MODEL_DIR/tekken.json" "${OUTPUT_DIR}/tekken.json"
288350
ls -al "${OUTPUT_DIR}"

.ci/scripts/test_huggingface_optimum_model.py

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -142,21 +142,50 @@ def test_text_generation(model_id, model_dir, recipe, *, quantize=True, run_only
142142
"--qembedding",
143143
"8w",
144144
]
145+
elif recipe == "cuda":
146+
command += [
147+
"--dtype",
148+
"bfloat16",
149+
"--device",
150+
"cuda",
151+
]
152+
if quantize:
153+
command += [
154+
"--qlinear",
155+
"4w",
156+
"--qlinear_packing_format",
157+
"tile_packed_to_4d",
158+
"--qembedding",
159+
"8w",
160+
]
145161
else:
146162
assert (
147163
not quantize
148-
), "Quantization is only supported for XnnPack and CoreML recipes at the moment."
164+
), "Quantization is only supported for XnnPack, CoreML, and CUDA recipes at the moment."
149165

150166
if not run_only:
151167
cli_export(command, model_dir)
152168

169+
if recipe == "cuda":
170+
model_path = Path(model_dir) / "model.pte"
171+
cuda_blob_path = Path(model_dir) / "aoti_cuda_blob.ptd"
172+
assert model_path.exists(), f"Main model file not found: {model_path}"
173+
assert cuda_blob_path.exists(), f"CUDA blob not found: {cuda_blob_path}"
174+
153175
tokenizer = AutoTokenizer.from_pretrained(model_id)
154176
saved_files = tokenizer.save_pretrained(model_dir)
155177
tokenizer_path = get_tokenizer_path(model_dir, saved_files)
156178

157179
from executorch.extension.llm.runner import GenerationConfig, TextLLMRunner
158180

159-
runner = TextLLMRunner(f"{model_dir}/model.pte", tokenizer_path)
181+
if recipe == "cuda":
182+
runner = TextLLMRunner(
183+
f"{model_dir}/model.pte",
184+
tokenizer_path,
185+
f"{model_dir}/aoti_cuda_blob.ptd",
186+
)
187+
else:
188+
runner = TextLLMRunner(f"{model_dir}/model.pte", tokenizer_path)
160189
tokens = []
161190
runner.generate(
162191
"Simply put, the theory of relativity states that",

.ci/scripts/test_model_e2e.sh

Lines changed: 56 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,10 @@ Arguments:
1919
hf_model HuggingFace model ID (required)
2020
Supported models:
2121
- mistralai/Voxtral-Mini-3B-2507
22+
- nvidia/diar_streaming_sortformer_4spk-v2
2223
- openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo})
2324
- google/gemma-3-4b-it
25+
- Qwen/Qwen3-0.6B
2426
- nvidia/parakeet-tdt
2527
- mistralai/Voxtral-Mini-4B-Realtime-2602
2628
@@ -43,6 +45,7 @@ Arguments:
4345
Examples:
4446
test_model_e2e.sh metal "openai/whisper-small" "non-quantized"
4547
test_model_e2e.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" "./model_output"
48+
test_model_e2e.sh cuda "nvidia/diar_streaming_sortformer_4spk-v2" "non-quantized" "./model_output"
4649
test_model_e2e.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./model_output"
4750
test_model_e2e.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./model_output"
4851
test_model_e2e.sh metal "mistralai/Voxtral-Mini-4B-Realtime-2602" "non-quantized" "." "vr-streaming"
@@ -151,6 +154,18 @@ case "$HF_MODEL" in
151154
AUDIO_FILE=""
152155
IMAGE_PATH="docs/source/_static/img/et-logo.png"
153156
;;
157+
Qwen/Qwen3-0.6B)
158+
MODEL_NAME="qwen3"
159+
RUNNER_TARGET="llama_main"
160+
RUNNER_PATH="llama"
161+
EXPECTED_OUTPUT="Paris"
162+
PREPROCESSOR=""
163+
TOKENIZER_URL="https://huggingface.co/Qwen/Qwen3-0.6B/resolve/main" # @lint-ignore
164+
TOKENIZER_FILE=""
165+
AUDIO_URL=""
166+
AUDIO_FILE=""
167+
IMAGE_PATH=""
168+
;;
154169
nvidia/parakeet-tdt)
155170
MODEL_NAME="parakeet"
156171
RUNNER_TARGET="parakeet_runner"
@@ -163,6 +178,18 @@ case "$HF_MODEL" in
163178
AUDIO_FILE="test_audio.wav"
164179
IMAGE_PATH=""
165180
;;
181+
nvidia/diar_streaming_sortformer_4spk-v2)
182+
MODEL_NAME="sortformer"
183+
RUNNER_TARGET="sortformer_runner"
184+
RUNNER_PATH="sortformer"
185+
EXPECTED_OUTPUT="Speaker 1"
186+
PREPROCESSOR=""
187+
TOKENIZER_URL=""
188+
TOKENIZER_FILE=""
189+
AUDIO_URL="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav"
190+
AUDIO_FILE="poem.wav"
191+
IMAGE_PATH=""
192+
;;
166193
mistralai/Voxtral-Mini-4B-Realtime-2602)
167194
MODEL_NAME="voxtral_realtime"
168195
RUNNER_TARGET="voxtral_realtime_runner"
@@ -177,7 +204,7 @@ case "$HF_MODEL" in
177204
;;
178205
*)
179206
echo "Error: Unsupported model '$HF_MODEL'"
180-
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, nvidia/parakeet-tdt"
207+
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt"
181208
exit 1
182209
;;
183210
esac
@@ -190,8 +217,8 @@ echo "::endgroup::"
190217
echo "::group::Prepare $MODEL_NAME Artifacts"
191218

192219

193-
# Download tokenizer files (skip for parakeet and voxtral_realtime which bundle tokenizer in export)
194-
if [ "$MODEL_NAME" != "parakeet" ] && [ "$MODEL_NAME" != "voxtral_realtime" ]; then
220+
# Download tokenizer files (skip for models that bundle tokenizer in export or do not use one)
221+
if [ "$MODEL_NAME" != "parakeet" ] && [ "$MODEL_NAME" != "voxtral_realtime" ] && [ "$MODEL_NAME" != "sortformer" ]; then
195222
if [ "$TOKENIZER_FILE" != "" ]; then
196223
curl -L $TOKENIZER_URL/$TOKENIZER_FILE -o $MODEL_DIR/$TOKENIZER_FILE
197224
else
@@ -246,9 +273,14 @@ if [ "$(uname -s)" = "Darwin" ] && [ -f "$RUNNER_BIN" ]; then
246273
install_name_tool -change /opt/llvm-openmp/lib/libomp.dylib @rpath/libomp.dylib "$RUNNER_BIN"
247274
fi
248275
fi
249-
# For CUDA, add data_path argument (Metal embeds data in .pte)
276+
# For CUDA, add named data argument (Metal embeds data in .pte).
277+
# Llama runner uses --data_paths, other runners use --data_path.
250278
if [ "$DEVICE" = "cuda" ]; then
251-
RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
279+
if [ "$RUNNER_PATH" = "llama" ]; then
280+
RUNNER_ARGS="$RUNNER_ARGS --data_paths ${MODEL_DIR}/aoti_cuda_blob.ptd"
281+
else
282+
RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
283+
fi
252284
fi
253285

254286
# Add model-specific arguments
@@ -262,15 +294,34 @@ case "$MODEL_NAME" in
262294
gemma3)
263295
RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --image_path $IMAGE_PATH"
264296
;;
297+
qwen3)
298+
PROMPT_FILE="${MODEL_DIR}/qwen3_prompt.txt"
299+
cat > "${PROMPT_FILE}" << 'EOF'
300+
<|im_start|>user
301+
What is the capital of France?<|im_end|>
302+
<|im_start|>assistant
303+
EOF
304+
RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --prompt_file ${PROMPT_FILE}"
305+
;;
265306
parakeet)
266307
RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --audio_path ${MODEL_DIR}/$AUDIO_FILE --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE"
267308
# For CUDA, add data_path argument (Metal embeds data in .pte)
268309
if [ "$DEVICE" = "cuda" ]; then
269310
RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
270311
fi
271312
;;
313+
sortformer)
314+
RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --audio_path ${MODEL_DIR}/$AUDIO_FILE"
315+
if [ "$DEVICE" = "cuda" ]; then
316+
RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
317+
fi
318+
;;
272319
voxtral_realtime)
273320
RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --preprocessor_path ${MODEL_DIR}/$PREPROCESSOR --audio_path ${MODEL_DIR}/$AUDIO_FILE --temperature 0"
321+
# Add CUDA data path if present
322+
if [ "$DEVICE" = "cuda" ] && [ -f "${MODEL_DIR}/aoti_cuda_blob.ptd" ]; then
323+
RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
324+
fi
274325
# Determine streaming mode based on MODE parameter
275326
USE_STREAMING="true"
276327
if [ "$MODE" = "vr-offline" ]; then

0 commit comments

Comments
 (0)