
Commit 1693048

up
1 parent 7a82d9b commit 1693048

32 files changed

Lines changed: 4223 additions & 51 deletions

.github/workflows/mlx.yml

Lines changed: 364 additions & 0 deletions
@@ -9,6 +9,9 @@ on:
     paths:
       - .github/workflows/mlx.yml
       - backends/mlx/**
+      - examples/models/parakeet/**
+      - examples/models/voxtral/**
+      - examples/models/voxtral_realtime/**
   workflow_dispatch:
 
 concurrency:
@@ -105,3 +108,364 @@ jobs:
           echo "::error::Too many test failures: $FAILED > $MAX_FAILURES"
           exit 1
         fi
+
+  test-mlx-parakeet:
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    with:
+      job-name: test-mlx-parakeet
+      runner: macos-14-xlarge
+      python-version: "3.12"
+      submodules: recursive
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      script: |
+        set -eux
+
+        echo "::group::Install ExecuTorch"
+        ${CONDA_RUN} python install_executorch.py > /dev/null
+        echo "::endgroup::"
+
+        echo "::group::Install Parakeet requirements"
+        ${CONDA_RUN} pip install -r examples/models/parakeet/install_requirements.txt
+        echo "::endgroup::"
+
+        ${CONDA_RUN} pip list
+
+        echo "::group::Export Parakeet"
+        ${CONDA_RUN} python -m executorch.examples.models.parakeet.export_parakeet_tdt \
+          --backend mlx \
+          --dtype bf16 \
+          --qlinear_encoder 4w \
+          --qlinear_encoder_group_size 128 \
+          --qlinear 4w \
+          --qlinear_group_size 128 \
+          --output-dir /tmp/parakeet_mlx
+        echo "::endgroup::"
+
+        echo "::group::Build Parakeet MLX runner"
+        ${CONDA_RUN} make parakeet-mlx
+        echo "::endgroup::"
+
+        echo "::group::Run Parakeet MLX runner"
+        curl -L https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav -o /tmp/test_audio.wav
+        OUTPUT=$(./cmake-out/examples/models/parakeet/parakeet_runner \
+          --model_path /tmp/parakeet_mlx/model.pte \
+          --audio_path /tmp/test_audio.wav \
+          --tokenizer_path /tmp/parakeet_mlx/tokenizer.model 2>&1)
+        echo "Runner output:"
+        echo "$OUTPUT"
+        if echo "$OUTPUT" | grep -iq "Phoebe"; then
+          echo "Success: 'Phoebe' found in output"
+        else
+          echo "Failed: Expected 'Phoebe' not found in output"
+          exit 1
+        fi
+        echo "::endgroup::"
+
+  test-mlx-voxtral:
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    secrets: inherit
+    with:
+      job-name: test-mlx-voxtral
+      runner: macos-14-xlarge
+      python-version: "3.12"
+      submodules: recursive
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      secrets-env: EXECUTORCH_HF_TOKEN
+      timeout: 90
+      script: |
+        set -eux
+
+        echo "::group::Install ExecuTorch"
+        ${CONDA_RUN} python install_executorch.py > /dev/null
+        echo "::endgroup::"
+
+        echo "::group::Install Voxtral requirements"
+        ${CONDA_RUN} pip install -U "huggingface_hub[cli]<1.0"
+        ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        ${CONDA_RUN} pip install mistral_common librosa soundfile datasets
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        ${CONDA_RUN} pip install "optimum-executorch @ git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}"
+        echo "::endgroup::"
+
+        ${CONDA_RUN} pip list
+
+        echo "::group::Export Voxtral"
+        ${CONDA_RUN} python -m executorch.backends.mlx.examples.voxtral.export_voxtral_hf \
+          --output-dir /tmp/voxtral_mlx \
+          --dtype bf16 \
+          --quantize-linear int4
+        echo "::endgroup::"
+
+        echo "::group::Build Voxtral MLX runner"
+        ${CONDA_RUN} make voxtral-mlx
+        echo "::endgroup::"
+
+        echo "::group::Run Voxtral MLX runner"
+        curl -L https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main/tekken.json -o /tmp/tekken.json
+        curl -L https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav -o /tmp/test_audio.wav
+        OUTPUT=$(./cmake-out/examples/models/voxtral/voxtral_runner \
+          --model_path /tmp/voxtral_mlx/model.pte \
+          --tokenizer_path /tmp/tekken.json \
+          --audio_path /tmp/test_audio.wav \
+          --processor_path /tmp/voxtral_mlx/preprocessor.pte \
+          --prompt "What is happening in this audio?" \
+          --temperature 0 2>&1)
+        echo "Runner output:"
+        echo "$OUTPUT"
+        if echo "$OUTPUT" | grep -iq "poem"; then
+          echo "Success: 'poem' found in output"
+        else
+          echo "Failed: Expected 'poem' not found in output"
+          exit 1
+        fi
+        echo "::endgroup::"
+
+  test-mlx-voxtral-realtime:
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    secrets: inherit
+    with:
+      job-name: test-mlx-voxtral-realtime
+      runner: macos-14-xlarge
+      python-version: "3.12"
+      submodules: recursive
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      secrets-env: EXECUTORCH_HF_TOKEN
+      timeout: 90
+      script: |
+        set -eux
+
+        echo "::group::Install ExecuTorch"
+        ${CONDA_RUN} python install_executorch.py > /dev/null
+        echo "::endgroup::"
+
+        echo "::group::Install Voxtral Realtime requirements"
+        ${CONDA_RUN} pip install -U "huggingface_hub[cli]" safetensors
+        ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        echo "::endgroup::"
+
+        ${CONDA_RUN} pip list
+
+        echo "::group::Download model"
+        ${CONDA_RUN} huggingface-cli download mistralai/Voxtral-Mini-4B-Realtime-2602
+        MODEL_PATH=$(${CONDA_RUN} python -c "from huggingface_hub import snapshot_download; print(snapshot_download('mistralai/Voxtral-Mini-4B-Realtime-2602'))")
+        echo "Model path: ${MODEL_PATH}"
+        echo "::endgroup::"
+
+        echo "::group::Export Voxtral Realtime (streaming)"
+        ${CONDA_RUN} python -m executorch.examples.models.voxtral_realtime.export_voxtral_rt \
+          --model-path "${MODEL_PATH}" \
+          --backend mlx \
+          --streaming \
+          --output-dir /tmp/voxtral_rt_mlx \
+          --qlinear-encoder 4w \
+          --qlinear 4w \
+          --qembedding 8w \
+          --qembedding-group-size 128 \
+          --export-preprocessor
+        echo "::endgroup::"
+
+        echo "::group::Build Voxtral Realtime MLX runner"
+        ${CONDA_RUN} make voxtral_realtime-mlx
+        echo "::endgroup::"
+
+        echo "::group::Run Voxtral Realtime MLX runner"
+        curl -L https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav -o /tmp/test_audio.wav
+        OUTPUT=$(./cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner \
+          --model_path /tmp/voxtral_rt_mlx/model.pte \
+          --tokenizer_path "${MODEL_PATH}/tekken.json" \
+          --preprocessor_path /tmp/voxtral_rt_mlx/preprocessor.pte \
+          --audio_path /tmp/test_audio.wav \
+          --streaming 2>&1)
+        echo "Runner output:"
+        echo "$OUTPUT"
+        if echo "$OUTPUT" | grep -iq "Phoebe"; then
+          echo "Success: 'Phoebe' found in output"
+        else
+          echo "Failed: Expected 'Phoebe' not found in output"
+          exit 1
+        fi
+        echo "::endgroup::"
+
+  test-mlx-whisper:
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    secrets: inherit
+    with:
+      job-name: test-mlx-whisper
+      runner: macos-14-xlarge
+      python-version: "3.12"
+      submodules: recursive
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      secrets-env: EXECUTORCH_HF_TOKEN
+      timeout: 90
+      script: |
+        set -eux
+
+        echo "::group::Install ExecuTorch and configure MLX build"
+        ${CONDA_RUN} python install_executorch.py > /dev/null
+        echo "::endgroup::"
+
+        echo "::group::Install Whisper requirements"
+        ${CONDA_RUN} pip install -U "huggingface_hub[cli]<1.0"
+        ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        ${CONDA_RUN} pip install transformers soundfile datasets librosa
+        echo "::endgroup::"
+
+        ${CONDA_RUN} pip list
+
+        echo "::group::Export Whisper"
+        ${CONDA_RUN} python -m executorch.backends.mlx.examples.whisper.export_whisper \
+          --model-id "openai/whisper-tiny" \
+          --output-dir /tmp/whisper_mlx \
+          --dtype bf16 \
+          --quantize-linear int4
+        echo "::endgroup::"
+
+        echo "::group::Run Whisper inference"
+        OUTPUT=$( ${CONDA_RUN} python -m executorch.backends.mlx.examples.whisper.run_whisper \
+          --model-dir /tmp/whisper_mlx \
+          --use-sample-audio 2>&1)
+        echo "$OUTPUT"
+        if echo "$OUTPUT" | grep -iq "Mr. Quilter"; then
+          echo "Success: 'Mr. Quilter' found in transcription"
+        else
+          echo "Failed: Expected 'Mr. Quilter' not found in transcription"
+          exit 1
+        fi
+        echo "::endgroup::"
+
+
+  test-mlx-stories110m:
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    with:
+      job-name: test-mlx-stories110m
+      runner: macos-14-xlarge
+      python-version: "3.12"
+      submodules: recursive
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      script: |
+        set -eux
+
+        echo "::group::Install ExecuTorch"
+        ${CONDA_RUN} python install_executorch.py > /dev/null
+        echo "::endgroup::"
+
+        echo "::group::Install Llama requirements"
+        ${CONDA_RUN} sh examples/models/llama/install_requirements.sh
+        echo "::endgroup::"
+
+        ${CONDA_RUN} pip list
+
+        echo "::group::Build ExecuTorch with MLX delegate"
+        ${CONDA_RUN} cmake --workflow --preset mlx-release
+        echo "::endgroup::"
+
+        echo "::group::Build Llama runner with MLX"
+        pushd examples/models/llama
+        ${CONDA_RUN} cmake --workflow --preset llama-release
+        popd
+        echo "::endgroup::"
+
+        echo "::group::Download stories110M artifacts"
+        curl -Ls "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt" --output stories110M.pt
+        curl -Ls "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model" --output tokenizer.model
+        echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json
+        echo "::endgroup::"
+
+        echo "::group::Create tokenizer.bin"
+        ${CONDA_RUN} python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin
+        echo "::endgroup::"
+
+        echo "::group::Export stories110M with MLX backend via export_llama_lib"
+        ${CONDA_RUN} python -m extension.llm.export.export_llm \
+          base.checkpoint=stories110M.pt \
+          base.params=params.json \
+          model.use_kv_cache=true \
+          model.dtype_override=fp32 \
+          backend.mlx.enabled=true \
+          quantization.qmode=4w \
+          quantization.group_size=32 \
+          export.output_name=/tmp/stories110m_mlx.pte
+        echo "::endgroup::"
+
+        echo "::group::Run inference with C++ llama runner"
+        ./cmake-out/examples/models/llama/llama_main \
+          --model_path=/tmp/stories110m_mlx.pte \
+          --tokenizer_path=tokenizer.bin \
+          --prompt="Once upon a time," \
+          --temperature=0 \
+          --seq_len=10
+        echo "::endgroup::"
+
+  test-mlx-llm:
+    strategy:
+      fail-fast: false
+      matrix:
+        model:
+          - id: "unsloth/Llama-3.2-1B-Instruct"
+            name: "llama-1b"
+          - id: "unsloth/Qwen3-0.6B"
+            name: "qwen3-0.6b"
+          - id: "unsloth/gemma-3-1b-it"
+            name: "gemma3-1b"
+        use-custom: [false, true]
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    secrets: inherit
+    with:
+      job-name: test-mlx-llm-${{ matrix.model.name }}${{ matrix.use-custom && '-custom' || '' }}
+      runner: macos-14-xlarge
+      python-version: "3.12"
+      submodules: recursive
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      secrets-env: EXECUTORCH_HF_TOKEN
+      timeout: 90
+      script: |
+        set -eux
+
+        MODEL_ID="${{ matrix.model.id }}"
+        MODEL_NAME="${{ matrix.model.name }}"
+        USE_CUSTOM="${{ matrix.use-custom }}"
+
+        CUSTOM_ARGS=""
+        if [ "${USE_CUSTOM}" = "true" ]; then
+          CUSTOM_ARGS="--use-custom-sdpa --use-custom-kv-cache"
+        fi
+
+        echo "::group::Install ExecuTorch and configure MLX build"
+        ${CONDA_RUN} python install_executorch.py > /dev/null
+        ${CONDA_RUN} cmake --preset mlx-release
+        echo "::endgroup::"
+
+        echo "::group::Install LLM requirements"
+        ${CONDA_RUN} pip install -U "huggingface_hub[cli]<1.0"
+        ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        ${CONDA_RUN} pip install transformers "optimum-executorch @ git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}"
+        echo "::endgroup::"
+
+        ${CONDA_RUN} pip list
+
+        echo "::group::Export ${MODEL_NAME}"
+        ${CONDA_RUN} python -m executorch.backends.mlx.examples.llm.export_llm_hf \
+          --model-id "${MODEL_ID}" \
+          --output /tmp/${MODEL_NAME}.pte \
+          --quantize-linear int4 \
+          --quantize-embeddings int4 \
+          ${CUSTOM_ARGS}
+        echo "::endgroup::"
+
+        echo "::group::Run ${MODEL_NAME} inference"
+        OUTPUT=$(${CONDA_RUN} python -m executorch.backends.mlx.examples.llm.run_llm_hf \
+          --pte /tmp/${MODEL_NAME}.pte \
+          --model-id "${MODEL_ID}" \
+          --prompt "What is the capital of France?" \
+          --max-new-tokens 50 2>&1)
+        echo "$OUTPUT"
+        if echo "$OUTPUT" | grep -iq "Paris"; then
+          echo "Success: 'Paris' found in output"
+        else
+          echo "Failed: Expected 'Paris' not found in output"
+          exit 1
+        fi
+        echo "::endgroup::"
