@@ -184,9 +184,17 @@ case "$HF_MODEL" in
     PREPROCESSOR_FEATURE_SIZE=""
     PREPROCESSOR_OUTPUT=""
     ;;
+  SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4)
+    MODEL_NAME="qwen3_5_moe"
+    TASK=""
+    MAX_SEQ_LEN=""
+    EXTRA_PIP=""
+    PREPROCESSOR_FEATURE_SIZE=""
+    PREPROCESSOR_OUTPUT=""
+    ;;
   *)
     echo "Error: Unsupported model '$HF_MODEL'"
-    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer"
+    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4"
     exit 1
     ;;
 esac
@@ -350,7 +358,7 @@ if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
   STREAMING_ARG=""
   PREPROCESSOR_ARGS="--feature_size 128 --output_file ${OUTPUT_DIR}/preprocessor.pte"
   if [ "$USE_STREAMING" = "true" ]; then
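+    # Assumption: --sliding-window caps the streaming attention window (2048 positions here)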
-    STREAMING_ARG="--streaming"
+    STREAMING_ARG="--streaming --sliding-window 2048"
     PREPROCESSOR_ARGS="$PREPROCESSOR_ARGS --streaming"
   else
     PREPROCESSOR_ARGS="$PREPROCESSOR_ARGS --stack_output --max_audio_len 300"
@@ -380,6 +388,46 @@ if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
   exit 0
 fi
 
+# Qwen 3.5 MoE uses a prequantized checkpoint and a custom export script
+if [ "$MODEL_NAME" = "qwen3_5_moe" ]; then
+  pip install safetensors huggingface_hub
+  pip install -r examples/models/qwen3_5_moe/requirements.txt
+
+  # Download the prequantized model outside OUTPUT_DIR to avoid uploading it on failure
+  LOCAL_MODEL_DIR=$(mktemp -d)
+  INDUCTOR_CACHE=$(mktemp -d)
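+  # Remove both temp dirs on any exit, success or failure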
+  trap 'rm -rf "$LOCAL_MODEL_DIR" "$INDUCTOR_CACHE"' EXIT
+
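+  # Fetch the full repo snapshot (weights, config, tokenizer.json) into LOCAL_MODEL_DIR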
+  python -c "from huggingface_hub import snapshot_download; snapshot_download('${HF_MODEL}', local_dir='${LOCAL_MODEL_DIR}')"
+
+  # Sanity check: run inference on the prequantized model
+  echo "::group::Inference sanity check"
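+  # temperature 0 forces greedy decoding, so the check's output is deterministic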
+  python -m executorch.examples.models.qwen3_5_moe.inference \
+    --prequantized "$LOCAL_MODEL_DIR" \
+    --prompt "What is the capital of France?" \
+    --max-new-tokens 32 \
+    --temperature 0 \
+    --no-compile
+  echo "::endgroup::"
+
+  # Copy tokenizer for the runner
+  cp "$LOCAL_MODEL_DIR/tokenizer.json" "${OUTPUT_DIR}/tokenizer.json"
+
+  # Export to .pte/.ptd (short cache dir avoids objcopy symbol length issues)
+  echo "::group::Export"
+  TORCHINDUCTOR_CACHE_DIR="$INDUCTOR_CACHE" \
+  python -m executorch.examples.models.qwen3_5_moe.export \
+    --prequantized "$LOCAL_MODEL_DIR" \
+    --output-dir "${OUTPUT_DIR}"
+  echo "::endgroup::"
+
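+  # Expect the ExecuTorch program (.pte) plus the AOTInductor CUDA kernel blob (.ptd)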
+  test -f "${OUTPUT_DIR}/model.pte"
+  test -f "${OUTPUT_DIR}/aoti_cuda_blob.ptd"
+  ls -al "${OUTPUT_DIR}"
+
+  exit 0
+fi
+
 MAX_SEQ_LEN_ARG=""
 if [ -n "$MAX_SEQ_LEN" ]; then
   MAX_SEQ_LEN_ARG="--max_seq_len $MAX_SEQ_LEN"
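
A minimal usage sketch for the new arm, assuming HF_MODEL and OUTPUT_DIR are read from the environment (the script name below is hypothetical):

    HF_MODEL="SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4" \
    OUTPUT_DIR="./qwen3_5_moe_out" \
    bash export_model_artifact.sh   # hypothetical script name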