@@ -24,6 +24,7 @@ Arguments:
   - google/gemma-3-4b-it
   - nvidia/diar_streaming_sortformer_4spk-v2
   - nvidia/parakeet-tdt
+  - facebook/dinov2-small-imagenet1k-1-layer
 
   quant_name  Quantization type (optional, default: non-quantized)
               Options:
@@ -167,6 +168,15 @@ case "$HF_MODEL" in
     PREPROCESSOR_FEATURE_SIZE=""
     PREPROCESSOR_OUTPUT=""
     ;;
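+  # Vision-only model: the text/audio fields below are intentionally left empty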
+  facebook/dinov2-small-imagenet1k-1-layer)
+    MODEL_NAME="dinov2"
+    TASK=""
+    MAX_SEQ_LEN=""
+    EXTRA_PIP=""
+    PREPROCESSOR_FEATURE_SIZE=""
+    PREPROCESSOR_OUTPUT=""
+    ;;
   mistralai/Voxtral-Mini-4B-Realtime-2602)
     MODEL_NAME="voxtral_realtime"
     TASK=""
@@ -175,9 +185,18 @@ case "$HF_MODEL" in
     PREPROCESSOR_FEATURE_SIZE=""
     PREPROCESSOR_OUTPUT=""
     ;;
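+  # Prequantized HQQ INT4 MoE checkpoint; handled by a dedicated export path below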
+  SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4)
+    MODEL_NAME="qwen3_5_moe"
+    TASK=""
+    MAX_SEQ_LEN=""
+    EXTRA_PIP=""
+    PREPROCESSOR_FEATURE_SIZE=""
+    PREPROCESSOR_OUTPUT=""
+    ;;
   *)
     echo "Error: Unsupported model '$HF_MODEL'"
-    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt"
+    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4"
     exit 1
     ;;
 esac
@@ -293,6 +312,25 @@ if [ "$MODEL_NAME" = "sortformer" ]; then
   exit 0
 fi
 
+# DINOv2 uses a custom export script
+if [ "$MODEL_NAME" = "dinov2" ]; then
+  pip install -r examples/models/dinov2/install_requirements.txt
+
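+  # Export for the requested backend (e.g. cuda); writes model.pte into OUTPUT_DIR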
+  python -m executorch.examples.models.dinov2.export_dinov2 \
+    --backend "$DEVICE" \
+    --output-dir "${OUTPUT_DIR}"
+
+  test -f "${OUTPUT_DIR}/model.pte"
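+  # CUDA backends additionally produce an AOTInductor data blob next to the .pte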
+  if [ "$DEVICE" = "cuda" ] || [ "$DEVICE" = "cuda-windows" ]; then
+    test -f "${OUTPUT_DIR}/aoti_cuda_blob.ptd"
+  fi
+  ls -al "${OUTPUT_DIR}"
+  echo "::endgroup::"
+  exit 0
+fi
+
 # Voxtral Realtime uses a custom export script
 if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
   pip install safetensors huggingface_hub
@@ -308,6 +346,8 @@ if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
     VR_QUANT_ARGS="--qlinear-encoder 8da4w --qlinear 8da4w --qlinear-group-size 32 --qembedding 8w"
   elif [ "$QUANT_NAME" = "quantized-int4-metal" ]; then
     VR_QUANT_ARGS="--qlinear-encoder fpa4w --qlinear fpa4w"
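+    # Run the Metal int4 export in bf16, matching the tile-packed path below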
+    VR_DTYPE_ARGS="--dtype bf16"
   elif [ "$QUANT_NAME" = "quantized-int4-tile-packed" ]; then
     VR_QUANT_ARGS="--qlinear-encoder 4w --qlinear-encoder-packing-format tile_packed_to_4d --qlinear 4w --qlinear-packing-format tile_packed_to_4d --qembedding 8w"
     VR_DTYPE_ARGS="--dtype bf16"
@@ -323,7 +363,8 @@ if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
   STREAMING_ARG=""
   PREPROCESSOR_ARGS="--feature_size 128 --output_file ${OUTPUT_DIR}/preprocessor.pte"
   if [ "$USE_STREAMING" = "true" ]; then
-    STREAMING_ARG="--streaming"
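+    # Cap the streaming context with a sliding window (2048 here)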
+    STREAMING_ARG="--streaming --sliding-window 2048"
     PREPROCESSOR_ARGS="$PREPROCESSOR_ARGS --streaming"
   else
     PREPROCESSOR_ARGS="$PREPROCESSOR_ARGS --stack_output --max_audio_len 300"
@@ -347,11 +388,56 @@ if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
   fi
   # Copy tokenizer from downloaded model weights
   cp "$LOCAL_MODEL_DIR/tekken.json" "${OUTPUT_DIR}/tekken.json"
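+  # The downloaded weights are no longer needed; reclaim the disk space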
+  rm -rf "$LOCAL_MODEL_DIR"
   ls -al "${OUTPUT_DIR}"
   echo "::endgroup::"
   exit 0
 fi
 
+# Qwen 3.5 MoE uses a prequantized checkpoint and custom export script
+if [ "$MODEL_NAME" = "qwen3_5_moe" ]; then
+  pip install safetensors huggingface_hub
+  pip install -r examples/models/qwen3_5_moe/requirements.txt
+
+  # Download prequantized model outside OUTPUT_DIR to avoid uploading on failure
+  LOCAL_MODEL_DIR=$(mktemp -d)
+  INDUCTOR_CACHE=$(mktemp -d)
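+  # Remove both scratch dirs on exit, whether the run succeeds or fails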
+  trap 'rm -rf "$LOCAL_MODEL_DIR" "$INDUCTOR_CACHE"' EXIT
+
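+  # Fetch the checkpoint from the Hugging Face Hub into the scratch dir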
+  python -c "from huggingface_hub import snapshot_download; snapshot_download('${HF_MODEL}', local_dir='${LOCAL_MODEL_DIR}')"
+
+  # Sanity check: run inference on the prequantized model
+  echo "::group::Inference sanity check"
+  python -m executorch.examples.models.qwen3_5_moe.inference \
+    --prequantized "$LOCAL_MODEL_DIR" \
+    --prompt "What is the capital of France?" \
+    --max-new-tokens 32 \
+    --temperature 0 \
+    --no-compile
+  echo "::endgroup::"
+
+  # Copy tokenizer for the runner
+  cp "$LOCAL_MODEL_DIR/tokenizer.json" "${OUTPUT_DIR}/tokenizer.json"
+
+  # Export to .pte/.ptd (short cache dir avoids objcopy symbol length issues)
+  echo "::group::Export"
+  TORCHINDUCTOR_CACHE_DIR="$INDUCTOR_CACHE" \
+  python -m executorch.examples.models.qwen3_5_moe.export \
+    --prequantized "$LOCAL_MODEL_DIR" \
+    --output-dir "${OUTPUT_DIR}"
+  echo "::endgroup::"
+
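+  # Export must produce both the program (.pte) and the CUDA data blob (.ptd)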
+  test -f "${OUTPUT_DIR}/model.pte"
+  test -f "${OUTPUT_DIR}/aoti_cuda_blob.ptd"
+  ls -al "${OUTPUT_DIR}"
+
+  exit 0
+fi
+
 MAX_SEQ_LEN_ARG=""
 if [ -n "$MAX_SEQ_LEN" ]; then
   MAX_SEQ_LEN_ARG="--max_seq_len $MAX_SEQ_LEN"