@@ -67,6 +67,9 @@ if [ -z "${1:-}" ]; then
6767 exit 1
6868fi
6969
70+ # Disable HF Xet storage to avoid stalled downloads on CI runners
# Exported so that every child process started later by this script sees it.
71+ export HF_HUB_DISABLE_XET=1
72+
# -e: abort on the first failing command; -u: treat unset variables as errors;
# -x: trace each command before it is executed.
7073set -eux
7174
# DEVICE is taken from the first positional argument.
7275DEVICE=" $1 "
@@ -192,9 +195,17 @@ case "$HF_MODEL" in
192195 PREPROCESSOR_FEATURE_SIZE=" "
193196 PREPROCESSOR_OUTPUT=" "
194197 ;;
# Gemma 4 31B checkpoint: MODEL_NAME is what the dedicated gemma4_31b export
# path later in this script keys on.
198+ SocialLocalMobile/gemma-4-31B-it-HQQ-INT4)
199+ MODEL_NAME=" gemma4_31b"
200+ TASK=" "
201+ MAX_SEQ_LEN=" "
202+ EXTRA_PIP=" "
203+ PREPROCESSOR_FEATURE_SIZE=" "
204+ PREPROCESSOR_OUTPUT=" "
205+ ;;
195206 * )
196207 echo " Error: Unsupported model '$HF_MODEL '"
197- echo " Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4"
208+ echo " Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4, SocialLocalMobile/gemma-4-31B-it-HQQ-INT4 "
198209 exit 1
199210 ;;
200211esac
@@ -415,8 +426,80 @@ if [ "$MODEL_NAME" = "qwen3_5_moe" ]; then
415426
416427 # Export to .pte/.ptd (short cache dir avoids objcopy symbol length issues)
417428 echo " ::group::Export"
# The exporter's combined stdout/stderr is teed into a temp log so it can be
# inspected after the run while still being streamed to the console.
429+ EXPORT_LOG=$( mktemp)
418430 TORCHINDUCTOR_CACHE_DIR=" $INDUCTOR_CACHE " \
419431 python -m executorch.examples.models.qwen3_5_moe.export \
432+ --prequantized " $LOCAL_MODEL_DIR " \
433+ --output-dir " ${OUTPUT_DIR} " \
434+ --dense-prefill dequant \
435+ --moe-activation-dtype int8 2>&1 | tee " $EXPORT_LOG "
# PIPESTATUS[0] is the exit status of the python export (first pipeline stage),
# not of tee. It must be read immediately: any later command overwrites it.
436+ EXPORT_RC=${PIPESTATUS[0]}
437+ echo " ::endgroup::"
438+
# On failure, remove the temp log and exit with the exporter's own status.
439+ if [ " $EXPORT_RC " -ne 0 ]; then
440+ echo " ERROR: Qwen3.5 MoE export failed (exit $EXPORT_RC )"
441+ rm -f " $EXPORT_LOG "
442+ exit " $EXPORT_RC "
443+ fi
444+
# Gate peak GPU memory so we keep the export viable on consumer GPUs
# (e.g. RTX 4090 with 24 GB). The export script prints a machine-
# parseable marker line "EXPORT_GPU_PEAK_MEMORY_MB: <float>"; when the
# marker appears more than once, the last occurrence is the one checked.
# EXPORT_GPU_PEAK_MB_LIMIT can be overridden from the environment and
# defaults to 20480 (MB).
EXPORT_GPU_PEAK_MB_LIMIT="${EXPORT_GPU_PEAK_MB_LIMIT:-20480}"
PEAK_LINE=$(grep -E '^EXPORT_GPU_PEAK_MEMORY_MB:' "$EXPORT_LOG" | tail -1)
rm -f "$EXPORT_LOG"
if [ -z "$PEAK_LINE" ]; then
  echo "ERROR: export did not emit EXPORT_GPU_PEAK_MEMORY_MB marker; cannot enforce GPU memory budget"
  exit 1
fi
PEAK_MB=$(echo "$PEAK_LINE" | awk '{print $2}')

# Both operands must be plain non-negative decimals. Without this check an
# empty or malformed value makes awk fall back to string comparison, which
# can let an over-budget (or unparseable) run pass the gate silently.
BUDGET_NUMBER_RE='^[0-9]+([.][0-9]+)?([eE][+-]?[0-9]+)?$'
if ! [[ "$PEAK_MB" =~ $BUDGET_NUMBER_RE ]]; then
  echo "ERROR: could not parse GPU peak memory from marker line: ${PEAK_LINE}"
  exit 1
fi
if ! [[ "$EXPORT_GPU_PEAK_MB_LIMIT" =~ $BUDGET_NUMBER_RE ]]; then
  echo "ERROR: EXPORT_GPU_PEAK_MB_LIMIT is not a number: ${EXPORT_GPU_PEAK_MB_LIMIT}"
  exit 1
fi

echo "Export GPU peak memory: ${PEAK_MB} MB (limit ${EXPORT_GPU_PEAK_MB_LIMIT} MB)"
# "+ 0" forces a numeric comparison; awk exits 0 (true) only when peak > limit.
if awk -v p="$PEAK_MB" -v l="$EXPORT_GPU_PEAK_MB_LIMIT" 'BEGIN { exit !(p + 0 > l + 0) }'; then
  echo "ERROR: export exceeded GPU memory budget (${PEAK_MB} MB > ${EXPORT_GPU_PEAK_MB_LIMIT} MB)"
  echo "— this would prevent the model from being exported on a 24 GB consumer GPU."
  exit 1
fi

# Both export artifacts must exist; under `set -e` a failing test aborts the script.
test -f "${OUTPUT_DIR}/model.pte"
test -f "${OUTPUT_DIR}/aoti_cuda_blob.ptd"
ls -al "${OUTPUT_DIR}"

exit 0
468+ fi
469+
470+ # Gemma 4 31B uses a prequantized checkpoint and custom export script
471+ if [ " $MODEL_NAME " = " gemma4_31b" ]; then
pip install safetensors huggingface_hub gguf

# Download prequantized model outside OUTPUT_DIR to avoid uploading on failure
LOCAL_MODEL_DIR=$(mktemp -d)
INDUCTOR_CACHE=$(mktemp -d)
# Remove both scratch directories on every exit path.
trap 'rm -rf "$LOCAL_MODEL_DIR" "$INDUCTOR_CACHE"' EXIT

# The repo id and target directory are passed as arguments (sys.argv) rather
# than spliced into the Python source, so quotes or backslashes in either
# value cannot break or alter the code being executed.
python -c 'import sys; from huggingface_hub import snapshot_download; snapshot_download(sys.argv[1], local_dir=sys.argv[2])' \
  "$HF_MODEL" "$LOCAL_MODEL_DIR"
480+
# Sanity check: run inference on the prequantized model and require the
# answer to mention "Paris".
echo "::group::Inference sanity check"
# Capture the exit status instead of letting `set -e` abort inside the
# assignment; otherwise a crashing inference run exits before its output is
# ever printed and the log group is left open.
INFERENCE_RC=0
INFERENCE_OUTPUT=$(python -m executorch.examples.models.gemma4_31b.inference \
  --prequantized "$LOCAL_MODEL_DIR" \
  --prompt "What is the capital of France?" \
  --max-new-tokens 32 \
  --temperature 0 \
  --no-compile 2>&1) || INFERENCE_RC=$?
printf '%s\n' "$INFERENCE_OUTPUT"
# Close the group before reporting so error lines are not hidden inside it.
echo "::endgroup::"

if [ "$INFERENCE_RC" -ne 0 ]; then
  echo "ERROR: Inference sanity check failed — inference exited with status $INFERENCE_RC"
  exit "$INFERENCE_RC"
fi
if ! grep -q -- "Paris" <<<"$INFERENCE_OUTPUT"; then
  echo "ERROR: Inference sanity check failed — expected 'Paris' in output"
  exit 1
fi
495+
496+ # Copy tokenizer for the runner
# Under `set -e`, a checkpoint without tokenizer.json aborts the script here.
497+ cp " $LOCAL_MODEL_DIR /tokenizer.json" " ${OUTPUT_DIR} /tokenizer.json"
498+
499+ # Export to .pte/.ptd (short cache dir avoids objcopy symbol length issues)
# NOTE(review): in the lines visible here, unlike the Qwen3.5 MoE export, this
# run's output is not captured to a log and no GPU peak-memory gate is applied
# — confirm that is intended.
500+ echo " ::group::Export"
501+ TORCHINDUCTOR_CACHE_DIR=" $INDUCTOR_CACHE " \
502+ python -m executorch.examples.models.gemma4_31b.export \
420503 --prequantized " $LOCAL_MODEL_DIR " \
421504 --output-dir " ${OUTPUT_DIR} "
422505 echo " ::endgroup::"
0 commit comments