99
1010show_help () {
1111 cat << EOF
12- Usage: export_model_artifact.sh <device> <hf_model> [quant_name] [output_dir]
12+ Usage: export_model_artifact.sh <device> <hf_model> [quant_name] [output_dir] [mode]
1313
1414Export a HuggingFace model to CUDA/Metal/XNNPACK format with optional quantization.
1515
@@ -22,6 +22,7 @@ Arguments:
2222 - mistralai/Voxtral-Mini-4B-Realtime-2602
2323 - openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo})
2424 - google/gemma-3-4b-it
25+ - nvidia/diar_streaming_sortformer_4spk-v2
2526 - nvidia/parakeet-tdt
2627
2728 quant_name Quantization type (optional, default: non-quantized)
@@ -34,13 +35,23 @@ Arguments:
3435
3536 output_dir Output directory for artifacts (optional, default: current directory)
3637
38+ mode Export mode (optional, default: vr-streaming)
39+ Supported modes:
40+ - vr-streaming: Voxtral Realtime streaming mode
41+ - vr-offline: Voxtral Realtime offline mode
42+
3743Examples:
3844 export_model_artifact.sh metal "openai/whisper-small"
3945 export_model_artifact.sh metal "nvidia/parakeet-tdt" "quantized-int4-metal"
46+ export_model_artifact.sh metal "mistralai/Voxtral-Mini-4B-Realtime-2602" "quantized-int4-metal"
47+ export_model_artifact.sh metal "mistralai/Voxtral-Mini-4B-Realtime-2602" "non-quantized" "." "vr-streaming"
4048 export_model_artifact.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed"
49+ export_model_artifact.sh cuda-windows "nvidia/diar_streaming_sortformer_4spk-v2" "non-quantized" "./output"
4150 export_model_artifact.sh cuda "google/gemma-3-4b-it" "non-quantized" "./output"
4251 export_model_artifact.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./output"
4352 export_model_artifact.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./output"
53+ export_model_artifact.sh xnnpack "mistralai/Voxtral-Mini-4B-Realtime-2602" "quantized-8da4w" "./output"
54+ export_model_artifact.sh xnnpack "mistralai/Voxtral-Mini-4B-Realtime-2602" "non-quantized" "./output" "vr-offline"
4455EOF
4556}
4657
@@ -61,6 +72,26 @@ DEVICE="$1"
# Positional arguments (DEVICE="$1" is assigned just above this block).
HF_MODEL="$2"
QUANT_NAME="${3:-non-quantized}"
OUTPUT_DIR="${4:-.}"
MODE="${5:-}"

# Validate mode if specified. Only the Voxtral Realtime model understands
# the vr-streaming / vr-offline export modes; any other combination is an
# argument error, reported on stderr.
if [ -n "$MODE" ]; then
  case "$MODE" in
    vr-streaming|vr-offline)
      if [ "$HF_MODEL" != "mistralai/Voxtral-Mini-4B-Realtime-2602" ]; then
        echo "Error: Mode '$MODE' can only be used with Voxtral Realtime model" >&2
        echo "Provided model: $HF_MODEL" >&2
        exit 1
      fi
      ;;
    *)
      echo "Error: Unsupported mode '$MODE'" >&2
      echo "Supported modes: vr-streaming, vr-offline" >&2
      exit 1
      ;;
  esac
fi
6495
6596case " $DEVICE " in
6697 cuda)
@@ -112,6 +143,14 @@ case "$HF_MODEL" in
112143 PREPROCESSOR_FEATURE_SIZE=" "
113144 PREPROCESSOR_OUTPUT=" "
114145 ;;
146+ Qwen/Qwen3-0.6B)
147+ MODEL_NAME=" qwen3"
148+ TASK=" text-generation"
149+ MAX_SEQ_LEN=" 64"
150+ EXTRA_PIP=" "
151+ PREPROCESSOR_FEATURE_SIZE=" "
152+ PREPROCESSOR_OUTPUT=" "
153+ ;;
115154 nvidia/parakeet-tdt)
116155 MODEL_NAME=" parakeet"
117156 TASK=" "
@@ -120,6 +159,14 @@ case "$HF_MODEL" in
120159 PREPROCESSOR_FEATURE_SIZE=" "
121160 PREPROCESSOR_OUTPUT=" "
122161 ;;
162+ nvidia/diar_streaming_sortformer_4spk-v2)
163+ MODEL_NAME=" sortformer"
164+ TASK=" "
165+ MAX_SEQ_LEN=" "
166+ EXTRA_PIP=" "
167+ PREPROCESSOR_FEATURE_SIZE=" "
168+ PREPROCESSOR_OUTPUT=" "
169+ ;;
123170 mistralai/Voxtral-Mini-4B-Realtime-2602)
124171 MODEL_NAME=" voxtral_realtime"
125172 TASK=" "
@@ -130,7 +177,7 @@ case "$HF_MODEL" in
130177 ;;
131178 * )
132179 echo " Error: Unsupported model '$HF_MODEL '"
133- echo " Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, nvidia/parakeet-tdt"
180+ echo " Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt"
134181 exit 1
135182 ;;
136183esac
@@ -210,7 +257,43 @@ if [ "$MODEL_NAME" = "parakeet" ]; then
210257 exit 0
211258fi
212259
# Sortformer uses a custom export script
if [ "$MODEL_NAME" = "sortformer" ]; then
  # Only the non-quantized flavor is exportable today; fail fast otherwise.
  if [ "$QUANT_NAME" != "non-quantized" ]; then
    echo "Error: Sortformer currently supports only non-quantized export" >&2
    exit 1
  fi

  pip install -r examples/models/sortformer/install_requirements.txt

  # Map the requested device to an export backend: cuda, cuda-windows and
  # xnnpack pass straight through; anything else (e.g. metal) falls back to
  # the portable backend.
  case "$DEVICE" in
    cuda|cuda-windows|xnnpack) SORTFORMER_BACKEND="$DEVICE" ;;
    *)                         SORTFORMER_BACKEND="portable" ;;
  esac

  python -m executorch.examples.models.sortformer.export_sortformer \
    --hf-model "${HF_MODEL}" \
    --backend "${SORTFORMER_BACKEND}" \
    --output-dir "${OUTPUT_DIR}"

  # Normalize the artifact name to the model.pte the callers expect.
  test -f "${OUTPUT_DIR}/sortformer.pte"
  mv "${OUTPUT_DIR}/sortformer.pte" "${OUTPUT_DIR}/model.pte"
  # CUDA saves named data to a separate .ptd file; XNNPACK/portable do not.
  if [ "$DEVICE" = "cuda" ] || [ "$DEVICE" = "cuda-windows" ]; then
    test -f "${OUTPUT_DIR}/aoti_cuda_blob.ptd"
  fi
  ls -al "${OUTPUT_DIR}"
  echo "::endgroup::"
  exit 0
fi
295+
296+ # Voxtral Realtime uses a custom export script
214297if [ " $MODEL_NAME " = " voxtral_realtime" ]; then
215298 pip install safetensors huggingface_hub
216299
@@ -220,25 +303,48 @@ if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
220303
221304 # Per-component quantization flags
222305 VR_QUANT_ARGS=" "
306+ VR_DTYPE_ARGS=" "
223307 if [ " $QUANT_NAME " = " quantized-8da4w" ]; then
224308 VR_QUANT_ARGS=" --qlinear-encoder 8da4w --qlinear 8da4w --qlinear-group-size 32 --qembedding 8w"
309+ elif [ " $QUANT_NAME " = " quantized-int4-metal" ]; then
310+ VR_QUANT_ARGS=" --qlinear-encoder fpa4w --qlinear fpa4w"
311+ elif [ " $QUANT_NAME " = " quantized-int4-tile-packed" ]; then
312+ VR_QUANT_ARGS=" --qlinear-encoder 4w --qlinear-encoder-packing-format tile_packed_to_4d --qlinear 4w --qlinear-packing-format tile_packed_to_4d --qembedding 8w"
313+ VR_DTYPE_ARGS=" --dtype bf16"
314+ fi
315+
316+ # Determine streaming mode based on MODE parameter
317+ USE_STREAMING=" true"
318+ if [ " $MODE " = " vr-offline" ]; then
319+ USE_STREAMING=" false"
320+ fi
321+
322+ # Configure export and preprocessor based on streaming mode
323+ STREAMING_ARG=" "
324+ PREPROCESSOR_ARGS=" --feature_size 128 --output_file ${OUTPUT_DIR} /preprocessor.pte"
325+ if [ " $USE_STREAMING " = " true" ]; then
326+ STREAMING_ARG=" --streaming"
327+ PREPROCESSOR_ARGS=" $PREPROCESSOR_ARGS --streaming"
328+ else
329+ PREPROCESSOR_ARGS=" $PREPROCESSOR_ARGS --stack_output --max_audio_len 300"
225330 fi
226331
227332 python -m executorch.examples.models.voxtral_realtime.export_voxtral_rt \
228333 --model-path " $LOCAL_MODEL_DIR " \
229- --backend xnnpack \
230- --streaming \
334+ --backend " $DEVICE " \
335+ ${STREAMING_ARG} \
231336 --output-dir " ${OUTPUT_DIR} " \
232- ${VR_QUANT_ARGS}
337+ ${VR_QUANT_ARGS} \
338+ ${VR_DTYPE_ARGS}
233339
234- # Export streaming preprocessor (no chunk padding)
235- python -m executorch.extension.audio.mel_spectrogram \
236- --feature_size 128 \
237- --streaming \
238- --output_file " ${OUTPUT_DIR} /preprocessor.pte"
340+ # Export preprocessor
341+ python -m executorch.extension.audio.mel_spectrogram ${PREPROCESSOR_ARGS}
239342
240343 test -f " ${OUTPUT_DIR} /model.pte"
241344 test -f " ${OUTPUT_DIR} /preprocessor.pte"
345+ if [ " $DEVICE " = " cuda" ] || [ " $DEVICE " = " cuda-windows" ]; then
346+ test -f " ${OUTPUT_DIR} /aoti_cuda_blob.ptd"
347+ fi
242348 # Copy tokenizer from downloaded model weights
243349 cp " $LOCAL_MODEL_DIR /tekken.json" " ${OUTPUT_DIR} /tekken.json"
244350 ls -al " ${OUTPUT_DIR} "
0 commit comments