
Commit 1693048

up
1 parent 7a82d9b commit 1693048

32 files changed

Lines changed: 4223 additions & 51 deletions

.github/workflows/mlx.yml

Lines changed: 364 additions & 0 deletions
@@ -9,6 +9,9 @@ on:
     paths:
       - .github/workflows/mlx.yml
       - backends/mlx/**
+      - examples/models/parakeet/**
+      - examples/models/voxtral/**
+      - examples/models/voxtral_realtime/**
   workflow_dispatch:
 
 concurrency:
@@ -105,3 +108,364 @@ jobs:
           echo "::error::Too many test failures: $FAILED > $MAX_FAILURES"
           exit 1
         fi
+
+  test-mlx-parakeet:
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    with:
+      job-name: test-mlx-parakeet
+      runner: macos-14-xlarge
+      python-version: "3.12"
+      submodules: recursive
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      script: |
+        set -eux
+
+        echo "::group::Install ExecuTorch"
+        ${CONDA_RUN} python install_executorch.py > /dev/null
+        echo "::endgroup::"
+
+        echo "::group::Install Parakeet requirements"
+        ${CONDA_RUN} pip install -r examples/models/parakeet/install_requirements.txt
+        echo "::endgroup::"
+
+        ${CONDA_RUN} pip list
+
+        echo "::group::Export Parakeet"
+        ${CONDA_RUN} python -m executorch.examples.models.parakeet.export_parakeet_tdt \
+          --backend mlx \
+          --dtype bf16 \
+          --qlinear_encoder 4w \
+          --qlinear_encoder_group_size 128 \
+          --qlinear 4w \
+          --qlinear_group_size 128 \
+          --output-dir /tmp/parakeet_mlx
+        echo "::endgroup::"
+
+        echo "::group::Build Parakeet MLX runner"
+        ${CONDA_RUN} make parakeet-mlx
+        echo "::endgroup::"
+
+        echo "::group::Run Parakeet MLX runner"
+        curl -L https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav -o /tmp/test_audio.wav
+        OUTPUT=$(./cmake-out/examples/models/parakeet/parakeet_runner \
+          --model_path /tmp/parakeet_mlx/model.pte \
+          --audio_path /tmp/test_audio.wav \
+          --tokenizer_path /tmp/parakeet_mlx/tokenizer.model 2>&1)
+        echo "Runner output:"
+        echo "$OUTPUT"
+        if echo "$OUTPUT" | grep -iq "Phoebe"; then
+          echo "Success: 'Phoebe' found in output"
+        else
+          echo "Failed: Expected 'Phoebe' not found in output"
+          exit 1
+        fi
+        echo "::endgroup::"
+
+  test-mlx-voxtral:
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    secrets: inherit
+    with:
+      job-name: test-mlx-voxtral
+      runner: macos-14-xlarge
+      python-version: "3.12"
+      submodules: recursive
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      secrets-env: EXECUTORCH_HF_TOKEN
+      timeout: 90
+      script: |
+        set -eux
+
+        echo "::group::Install ExecuTorch"
+        ${CONDA_RUN} python install_executorch.py > /dev/null
+        echo "::endgroup::"
+
+        echo "::group::Install Voxtral requirements"
+        ${CONDA_RUN} pip install -U "huggingface_hub[cli]<1.0"
+        ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        ${CONDA_RUN} pip install mistral_common librosa soundfile datasets
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        ${CONDA_RUN} pip install "optimum-executorch @ git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}"
+        echo "::endgroup::"
+
+        ${CONDA_RUN} pip list
+
+        echo "::group::Export Voxtral"
+        ${CONDA_RUN} python -m executorch.backends.mlx.examples.voxtral.export_voxtral_hf \
+          --output-dir /tmp/voxtral_mlx \
+          --dtype bf16 \
+          --quantize-linear int4
+        echo "::endgroup::"
+
+        echo "::group::Build Voxtral MLX runner"
+        ${CONDA_RUN} make voxtral-mlx
+        echo "::endgroup::"
+
+        echo "::group::Run Voxtral MLX runner"
+        curl -L https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main/tekken.json -o /tmp/tekken.json
+        curl -L https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav -o /tmp/test_audio.wav
+        OUTPUT=$(./cmake-out/examples/models/voxtral/voxtral_runner \
+          --model_path /tmp/voxtral_mlx/model.pte \
+          --tokenizer_path /tmp/tekken.json \
+          --audio_path /tmp/test_audio.wav \
+          --processor_path /tmp/voxtral_mlx/preprocessor.pte \
+          --prompt "What is happening in this audio?" \
+          --temperature 0 2>&1)
+        echo "Runner output:"
+        echo "$OUTPUT"
+        if echo "$OUTPUT" | grep -iq "poem"; then
+          echo "Success: 'poem' found in output"
+        else
+          echo "Failed: Expected 'poem' not found in output"
+          exit 1
+        fi
+        echo "::endgroup::"
+
+  test-mlx-voxtral-realtime:
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    secrets: inherit
+    with:
+      job-name: test-mlx-voxtral-realtime
+      runner: macos-14-xlarge
+      python-version: "3.12"
+      submodules: recursive
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      secrets-env: EXECUTORCH_HF_TOKEN
+      timeout: 90
+      script: |
+        set -eux
+
+        echo "::group::Install ExecuTorch"
+        ${CONDA_RUN} python install_executorch.py > /dev/null
+        echo "::endgroup::"
+
+        echo "::group::Install Voxtral Realtime requirements"
+        ${CONDA_RUN} pip install -U "huggingface_hub[cli]" safetensors
+        ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        echo "::endgroup::"
+
+        ${CONDA_RUN} pip list
+
+        echo "::group::Download model"
+        ${CONDA_RUN} huggingface-cli download mistralai/Voxtral-Mini-4B-Realtime-2602
+        MODEL_PATH=$(${CONDA_RUN} python -c "from huggingface_hub import snapshot_download; print(snapshot_download('mistralai/Voxtral-Mini-4B-Realtime-2602'))")
+        echo "Model path: ${MODEL_PATH}"
+        echo "::endgroup::"
+
+        echo "::group::Export Voxtral Realtime (streaming)"
+        ${CONDA_RUN} python -m executorch.examples.models.voxtral_realtime.export_voxtral_rt \
+          --model-path "${MODEL_PATH}" \
+          --backend mlx \
+          --streaming \
+          --output-dir /tmp/voxtral_rt_mlx \
+          --qlinear-encoder 4w \
+          --qlinear 4w \
+          --qembedding 8w \
+          --qembedding-group-size 128 \
+          --export-preprocessor
+        echo "::endgroup::"
+
+        echo "::group::Build Voxtral Realtime MLX runner"
+        ${CONDA_RUN} make voxtral_realtime-mlx
+        echo "::endgroup::"
+
+        echo "::group::Run Voxtral Realtime MLX runner"
+        curl -L https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav -o /tmp/test_audio.wav
+        OUTPUT=$(./cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner \
+          --model_path /tmp/voxtral_rt_mlx/model.pte \
+          --tokenizer_path "${MODEL_PATH}/tekken.json" \
+          --preprocessor_path /tmp/voxtral_rt_mlx/preprocessor.pte \
+          --audio_path /tmp/test_audio.wav \
+          --streaming 2>&1)
+        echo "Runner output:"
+        echo "$OUTPUT"
+        if echo "$OUTPUT" | grep -iq "Phoebe"; then
+          echo "Success: 'Phoebe' found in output"
+        else
+          echo "Failed: Expected 'Phoebe' not found in output"
+          exit 1
+        fi
+        echo "::endgroup::"
+
+  test-mlx-whisper:
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    secrets: inherit
+    with:
+      job-name: test-mlx-whisper
+      runner: macos-14-xlarge
+      python-version: "3.12"
+      submodules: recursive
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      secrets-env: EXECUTORCH_HF_TOKEN
+      timeout: 90
+      script: |
+        set -eux
+
+        echo "::group::Install ExecuTorch and configure MLX build"
+        ${CONDA_RUN} python install_executorch.py > /dev/null
+        echo "::endgroup::"
+
+        echo "::group::Install Whisper requirements"
+        ${CONDA_RUN} pip install -U "huggingface_hub[cli]<1.0"
+        ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        ${CONDA_RUN} pip install transformers soundfile datasets librosa
+        echo "::endgroup::"
+
+        ${CONDA_RUN} pip list
+
+        echo "::group::Export Whisper"
+        ${CONDA_RUN} python -m executorch.backends.mlx.examples.whisper.export_whisper \
+          --model-id "openai/whisper-tiny" \
+          --output-dir /tmp/whisper_mlx \
+          --dtype bf16 \
+          --quantize-linear int4
+        echo "::endgroup::"
+
+        echo "::group::Run Whisper inference"
+        OUTPUT=$( ${CONDA_RUN} python -m executorch.backends.mlx.examples.whisper.run_whisper \
+          --model-dir /tmp/whisper_mlx \
+          --use-sample-audio 2>&1)
+        echo "$OUTPUT"
+        if echo "$OUTPUT" | grep -iq "Mr. Quilter"; then
+          echo "Success: 'Mr. Quilter' found in transcription"
+        else
+          echo "Failed: Expected 'Mr. Quilter' not found in transcription"
+          exit 1
+        fi
+        echo "::endgroup::"
+
+
+  test-mlx-stories110m:
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    with:
+      job-name: test-mlx-stories110m
+      runner: macos-14-xlarge
+      python-version: "3.12"
+      submodules: recursive
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      script: |
+        set -eux
+
+        echo "::group::Install ExecuTorch"
+        ${CONDA_RUN} python install_executorch.py > /dev/null
+        echo "::endgroup::"
+
+        echo "::group::Install Llama requirements"
+        ${CONDA_RUN} sh examples/models/llama/install_requirements.sh
+        echo "::endgroup::"
+
+        ${CONDA_RUN} pip list
+
+        echo "::group::Build ExecuTorch with MLX delegate"
+        ${CONDA_RUN} cmake --workflow --preset mlx-release
+        echo "::endgroup::"
+
+        echo "::group::Build Llama runner with MLX"
+        pushd examples/models/llama
+        ${CONDA_RUN} cmake --workflow --preset llama-release
+        popd
+        echo "::endgroup::"
+
+        echo "::group::Download stories110M artifacts"
+        curl -Ls "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt" --output stories110M.pt
+        curl -Ls "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model" --output tokenizer.model
+        echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json
+        echo "::endgroup::"
+
+        echo "::group::Create tokenizer.bin"
+        ${CONDA_RUN} python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin
+        echo "::endgroup::"
+
+        echo "::group::Export stories110M with MLX backend via export_llama_lib"
+        ${CONDA_RUN} python -m extension.llm.export.export_llm \
+          base.checkpoint=stories110M.pt \
+          base.params=params.json \
+          model.use_kv_cache=true \
+          model.dtype_override=fp32 \
+          backend.mlx.enabled=true \
+          quantization.qmode=4w \
+          quantization.group_size=32 \
+          export.output_name=/tmp/stories110m_mlx.pte
+        echo "::endgroup::"
+
+        echo "::group::Run inference with C++ llama runner"
+        ./cmake-out/examples/models/llama/llama_main \
+          --model_path=/tmp/stories110m_mlx.pte \
+          --tokenizer_path=tokenizer.bin \
+          --prompt="Once upon a time," \
+          --temperature=0 \
+          --seq_len=10
+        echo "::endgroup::"
+
+  test-mlx-llm:
+    strategy:
+      fail-fast: false
+      matrix:
+        model:
+          - id: "unsloth/Llama-3.2-1B-Instruct"
+            name: "llama-1b"
+          - id: "unsloth/Qwen3-0.6B"
+            name: "qwen3-0.6b"
+          - id: "unsloth/gemma-3-1b-it"
+            name: "gemma3-1b"
+        use-custom: [false, true]
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    secrets: inherit
+    with:
+      job-name: test-mlx-llm-${{ matrix.model.name }}${{ matrix.use-custom && '-custom' || '' }}
+      runner: macos-14-xlarge
+      python-version: "3.12"
+      submodules: recursive
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      secrets-env: EXECUTORCH_HF_TOKEN
+      timeout: 90
+      script: |
+        set -eux
+
+        MODEL_ID="${{ matrix.model.id }}"
+        MODEL_NAME="${{ matrix.model.name }}"
+        USE_CUSTOM="${{ matrix.use-custom }}"
+
+        CUSTOM_ARGS=""
+        if [ "${USE_CUSTOM}" = "true" ]; then
+          CUSTOM_ARGS="--use-custom-sdpa --use-custom-kv-cache"
+        fi
+
+        echo "::group::Install ExecuTorch and configure MLX build"
+        ${CONDA_RUN} python install_executorch.py > /dev/null
+        ${CONDA_RUN} cmake --preset mlx-release
+        echo "::endgroup::"
+
+        echo "::group::Install LLM requirements"
+        ${CONDA_RUN} pip install -U "huggingface_hub[cli]<1.0"
+        ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        ${CONDA_RUN} pip install transformers "optimum-executorch @ git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}"
+        echo "::endgroup::"
+
+        ${CONDA_RUN} pip list
+
+        echo "::group::Export ${MODEL_NAME}"
+        ${CONDA_RUN} python -m executorch.backends.mlx.examples.llm.export_llm_hf \
+          --model-id "${MODEL_ID}" \
+          --output /tmp/${MODEL_NAME}.pte \
+          --quantize-linear int4 \
+          --quantize-embeddings int4 \
+          ${CUSTOM_ARGS}
+        echo "::endgroup::"
+
+        echo "::group::Run ${MODEL_NAME} inference"
+        OUTPUT=$(${CONDA_RUN} python -m executorch.backends.mlx.examples.llm.run_llm_hf \
+          --pte /tmp/${MODEL_NAME}.pte \
+          --model-id "${MODEL_ID}" \
+          --prompt "What is the capital of France?" \
+          --max-new-tokens 50 2>&1)
+        echo "$OUTPUT"
+        if echo "$OUTPUT" | grep -iq "Paris"; then
+          echo "Success: 'Paris' found in output"
+        else
+          echo "Failed: Expected 'Paris' not found in output"
+          exit 1
+        fi
+        echo "::endgroup::"
