@@ -22,7 +22,9 @@ Arguments:
2222 - mistralai/Voxtral-Mini-4B-Realtime-2602
2323 - openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo})
2424 - google/gemma-3-4b-it
25+ - nvidia/diar_streaming_sortformer_4spk-v2
2526 - nvidia/parakeet-tdt
27+ - facebook/dinov2-small-imagenet1k-1-layer
28+ - SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4
2628
2729 quant_name Quantization type (optional, default: non-quantized)
2830 Options:
@@ -45,6 +47,7 @@ Examples:
4547 export_model_artifact.sh metal "mistralai/Voxtral-Mini-4B-Realtime-2602" "quantized-int4-metal"
4648 export_model_artifact.sh metal "mistralai/Voxtral-Mini-4B-Realtime-2602" "non-quantized" "." "vr-streaming"
4749 export_model_artifact.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed"
50+ export_model_artifact.sh cuda-windows "nvidia/diar_streaming_sortformer_4spk-v2" "non-quantized" "./output"
4851 export_model_artifact.sh cuda "google/gemma-3-4b-it" "non-quantized" "./output"
4952 export_model_artifact.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./output"
5053 export_model_artifact.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./output"
@@ -157,6 +160,22 @@ case "$HF_MODEL" in
157160 PREPROCESSOR_FEATURE_SIZE=" "
158161 PREPROCESSOR_OUTPUT=" "
159162 ;;
163+ nvidia/diar_streaming_sortformer_4spk-v2)
164+ MODEL_NAME=" sortformer"
165+ TASK=" "
166+ MAX_SEQ_LEN=" "
167+ EXTRA_PIP=" "
168+ PREPROCESSOR_FEATURE_SIZE=" "
169+ PREPROCESSOR_OUTPUT=" "
170+ ;;
171+ facebook/dinov2-small-imagenet1k-1-layer)
172+ MODEL_NAME=" dinov2"
173+ TASK=" "
174+ MAX_SEQ_LEN=" "
175+ EXTRA_PIP=" "
176+ PREPROCESSOR_FEATURE_SIZE=" "
177+ PREPROCESSOR_OUTPUT=" "
178+ ;;
160179 mistralai/Voxtral-Mini-4B-Realtime-2602)
161180 MODEL_NAME=" voxtral_realtime"
162181 TASK=" "
@@ -165,9 +184,17 @@ case "$HF_MODEL" in
165184 PREPROCESSOR_FEATURE_SIZE=" "
166185 PREPROCESSOR_OUTPUT=" "
167186 ;;
187+ SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4)
188+ MODEL_NAME=" qwen3_5_moe"
189+ TASK=" "
190+ MAX_SEQ_LEN=" "
191+ EXTRA_PIP=" "
192+ PREPROCESSOR_FEATURE_SIZE=" "
193+ PREPROCESSOR_OUTPUT=" "
194+ ;;
168195 * )
169196 echo " Error: Unsupported model '$HF_MODEL '"
170- echo " Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt"
197+ echo " Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4"
171198 exit 1
172199 ;;
173200esac
@@ -247,6 +274,59 @@ if [ "$MODEL_NAME" = "parakeet" ]; then
247274 exit 0
248275fi
249276
277+ # Sortformer uses a custom export script
278+ if [ " $MODEL_NAME " = " sortformer" ]; then
279+ if [ " $QUANT_NAME " != " non-quantized" ]; then
280+ echo " Error: Sortformer currently supports only non-quantized export"
281+ exit 1
282+ fi
283+
284+ pip install -r examples/models/sortformer/install_requirements.txt
285+
286+ SORTFORMER_BACKEND=" $DEVICE "
287+ if [ " $DEVICE " = " cuda-windows" ]; then
288+ SORTFORMER_BACKEND=" cuda-windows"
289+ elif [ " $DEVICE " = " cuda" ]; then
290+ SORTFORMER_BACKEND=" cuda"
291+ elif [ " $DEVICE " = " xnnpack" ]; then
292+ SORTFORMER_BACKEND=" xnnpack"
293+ else
294+ SORTFORMER_BACKEND=" portable"
295+ fi
296+
297+ python -m executorch.examples.models.sortformer.export_sortformer \
298+ --hf-model " ${HF_MODEL} " \
299+ --backend " ${SORTFORMER_BACKEND} " \
300+ --output-dir " ${OUTPUT_DIR} "
301+
302+ test -f " ${OUTPUT_DIR} /sortformer.pte"
303+ mv " ${OUTPUT_DIR} /sortformer.pte" " ${OUTPUT_DIR} /model.pte"
304+ # CUDA saves named data to separate .ptd file, XNNPACK/portable do not.
305+ if [ " $DEVICE " = " cuda" ] || [ " $DEVICE " = " cuda-windows" ]; then
306+ test -f " ${OUTPUT_DIR} /aoti_cuda_blob.ptd"
307+ fi
308+ ls -al " ${OUTPUT_DIR} "
309+ echo " ::endgroup::"
310+ exit 0
311+ fi
312+
313+ # DINOv2 uses a custom export script
314+ if [ " $MODEL_NAME " = " dinov2" ]; then
315+ pip install -r examples/models/dinov2/install_requirements.txt
316+
317+ python -m executorch.examples.models.dinov2.export_dinov2 \
318+ --backend " $DEVICE " \
319+ --output-dir " ${OUTPUT_DIR} "
320+
321+ test -f " ${OUTPUT_DIR} /model.pte"
322+ if [ " $DEVICE " = " cuda" ] || [ " $DEVICE " = " cuda-windows" ]; then
323+ test -f " ${OUTPUT_DIR} /aoti_cuda_blob.ptd"
324+ fi
325+ ls -al " ${OUTPUT_DIR} "
326+ echo " ::endgroup::"
327+ exit 0
328+ fi
329+
250330# Voxtral Realtime uses a custom export script
251331if [ " $MODEL_NAME " = " voxtral_realtime" ]; then
252332 pip install safetensors huggingface_hub
@@ -262,6 +342,7 @@ if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
262342 VR_QUANT_ARGS=" --qlinear-encoder 8da4w --qlinear 8da4w --qlinear-group-size 32 --qembedding 8w"
263343 elif [ " $QUANT_NAME " = " quantized-int4-metal" ]; then
264344 VR_QUANT_ARGS=" --qlinear-encoder fpa4w --qlinear fpa4w"
345+ VR_DTYPE_ARGS=" --dtype bf16"
265346 elif [ " $QUANT_NAME " = " quantized-int4-tile-packed" ]; then
266347 VR_QUANT_ARGS=" --qlinear-encoder 4w --qlinear-encoder-packing-format tile_packed_to_4d --qlinear 4w --qlinear-packing-format tile_packed_to_4d --qembedding 8w"
267348 VR_DTYPE_ARGS=" --dtype bf16"
@@ -301,11 +382,51 @@ if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
301382 fi
302383 # Copy tokenizer from downloaded model weights
303384 cp " $LOCAL_MODEL_DIR /tekken.json" " ${OUTPUT_DIR} /tekken.json"
385+ rm -rf " $LOCAL_MODEL_DIR "
304386 ls -al " ${OUTPUT_DIR} "
305387 echo " ::endgroup::"
306388 exit 0
307389fi
308390
391+ # Qwen 3.5 MoE uses a prequantized checkpoint and custom export script
392+ if [ " $MODEL_NAME " = " qwen3_5_moe" ]; then
393+ pip install safetensors huggingface_hub
394+ pip install -r examples/models/qwen3_5_moe/requirements.txt
395+
396+ # Download prequantized model outside OUTPUT_DIR to avoid uploading on failure
397+ LOCAL_MODEL_DIR=$( mktemp -d)
398+ INDUCTOR_CACHE=$( mktemp -d)
399+ trap ' rm -rf "$LOCAL_MODEL_DIR" "$INDUCTOR_CACHE"' EXIT
400+
401+ python -c " from huggingface_hub import snapshot_download; snapshot_download('${HF_MODEL} ', local_dir='${LOCAL_MODEL_DIR} ')"
402+
403+ # Sanity check: run inference on the prequantized model
404+ echo " ::group::Inference sanity check"
405+ python -m executorch.examples.models.qwen3_5_moe.inference \
406+ --prequantized " $LOCAL_MODEL_DIR " \
407+ --prompt " What is the capital of France?" \
408+ --max-new-tokens 32 \
409+ --temperature 0 \
410+ --no-compile
411+ echo " ::endgroup::"
412+
413+ # Copy tokenizer for the runner
414+ cp " $LOCAL_MODEL_DIR /tokenizer.json" " ${OUTPUT_DIR} /tokenizer.json"
415+
416+ # Export to .pte/.ptd (short cache dir avoids objcopy symbol length issues)
417+ echo " ::group::Export"
418+ TORCHINDUCTOR_CACHE_DIR=" $INDUCTOR_CACHE " \
419+ python -m executorch.examples.models.qwen3_5_moe.export \
420+ --prequantized " $LOCAL_MODEL_DIR " \
421+ --output-dir " ${OUTPUT_DIR} "
422+ echo " ::endgroup::"
423+
424+ test -f " ${OUTPUT_DIR} /model.pte"
425+ test -f " ${OUTPUT_DIR} /aoti_cuda_blob.ptd"
426+ ls -al " ${OUTPUT_DIR} "
427+ exit 0
428+ fi
429+
309430MAX_SEQ_LEN_ARG=" "
310431if [ -n " $MAX_SEQ_LEN " ]; then
311432 MAX_SEQ_LEN_ARG=" --max_seq_len $MAX_SEQ_LEN "
0 commit comments