Skip to content

Fix Multiple constraints for allocation for two cat inputs of same un… #16

Fix Multiple constraints for allocation for two cat inputs of same un…

Fix Multiple constraints for allocation for two cat inputs of same un… #16

Workflow file for this run

# Workflow-level settings for the MLX CI: display name, triggers and token
# permissions. The job definitions follow the `jobs:` key.
name: MLX

on:
  # Every push to main or to a release branch.
  push:
    branches:
      - main
      - 'release/*'
  # Pull requests are only tested when they touch one of these paths.
  pull_request:
    paths:
      - '.github/workflows/mlx.yml'
      - 'backends/mlx/**'
      - 'extension/llm/export/**'
      - 'extension/audio/**'
      - 'examples/models/parakeet/**'
      - 'examples/models/voxtral_realtime/**'
      - 'examples/models/qwen3_5_moe/**'
  # Manual runs.
  workflow_dispatch:

# An empty mapping: no token permission scopes are granted at workflow level.
permissions: {}

jobs:
# Configures the MLX build with tests enabled, builds the C++ test runners,
# then runs the op tests, the Python unit tests, the multi-thread stress test
# and the gated_delta_rule op tests.
test-mlx:
  uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
  with:
    job-name: test-mlx
    runner: macos-14-xlarge
    python-version: "3.12"
    submodules: recursive
    # Pull requests check out the PR head commit; other events use github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    timeout: 90
    script: |
      set -eux

      echo "::group::Install ExecuTorch and configure build"
      ${CONDA_RUN} python install_executorch.py > /dev/null
      # Sanitizers are switched off: they fail on the GitHub VM runner but
      # pass on a real device.
      # TODO: figure out why
      ${CONDA_RUN} cmake --preset mlx-release \
        -DEXECUTORCH_BUILD_TESTS=ON \
        -DEXECUTORCH_MLX_ENABLE_SANITIZERS=OFF
      echo "::endgroup::"

      ${CONDA_RUN} pip list

      echo "::group::Build test runners"
      # Use all but one of the CPUs reported by sysctl for the build.
      BUILD_JOBS=$(( $(sysctl -n hw.ncpu) - 1 ))
      ${CONDA_RUN} cmake --build cmake-out --target op_test_runner multi_thread_test_runner -j${BUILD_JOBS}
      echo "::endgroup::"

      echo "::group::Run op unit tests"
      ${CONDA_RUN} python -m executorch.backends.mlx.test.run_all_tests -j4 --max-tasks-per-worker 10 --clean-after
      echo "::endgroup::"

      echo "::group::Run Python unit tests"
      ${CONDA_RUN} python -m pytest \
        backends/mlx/test/test_passes.py \
        backends/mlx/test/test_pattern_utils.py \
        backends/mlx/test/test_partitioner.py \
        -v
      echo "::endgroup::"

      echo "::group::Run multi-thread stress test"
      ${CONDA_RUN} python backends/mlx/test/export_multi_thread_test_model.py /tmp/multi_thread_test_model.pte
      # The runner is configured through environment variables set for this
      # single invocation only.
      ET_TESTING_MODEL_PATH=/tmp/multi_thread_test_model.pte \
      ET_TESTING_NUM_THREADS=50 \
      ET_PREDICTIONS_PER_THREAD=100 \
        ./cmake-out/backends/mlx/test/multi_thread_test_runner
      echo "::endgroup::"

      echo "::group::Run gated_delta_rule op tests"
      ${CONDA_RUN} python -m executorch.backends.mlx.model_ops.test_gated_delta_rule run -v
      echo "::endgroup::"
# Exports the tiny Qwen 3.5 MoE test model with the MLX backend, checks that
# the exported program contains at most 23 AsType nodes, and verifies that
# inference reproduces a fixed sequence of generated token ids.
test-mlx-qwen35-moe:
  uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
  with:
    job-name: test-mlx-qwen35-moe
    runner: macos-14-xlarge
    python-version: "3.12"
    submodules: recursive
    # Pull requests check out the PR head commit; other events use github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    timeout: 90
    script: |
      set -eux

      echo "::group::Install ExecuTorch"
      ${CONDA_RUN} python install_executorch.py > /dev/null
      echo "::endgroup::"

      ${CONDA_RUN} pip list

      echo "::group::Export Qwen 3.5 MoE (tiny model)"
      ${CONDA_RUN} python -m executorch.examples.models.qwen3_5_moe.export \
        --tiny-test \
        --backend mlx \
        --qlinear 4w \
        --qlinear-group-size 32 \
        --output-dir /tmp/qwen35_moe_mlx_tiny
      echo "::endgroup::"

      echo "::group::Check AsType node count"
      # Run the inspector as its own checked step. If it were part of the
      # counting pipeline, a crashed inspector would be indistinguishable from
      # "0 AsType nodes" and the upper-bound check below would pass.
      INSPECT_LOG=/tmp/qwen35_moe_mlx_instructions.txt
      if ! ${CONDA_RUN} python -m executorch.backends.mlx.pte_inspector \
          /tmp/qwen35_moe_mlx_tiny/model.pte --mlx-instructions > "${INSPECT_LOG}" 2>&1; then
        cat "${INSPECT_LOG}"
        echo "Failed: pte_inspector exited with an error"
        exit 1
      fi
      # grep -c prints 0 but exits non-zero when nothing matches, hence `|| true`.
      ASTYPE_COUNT=$(grep -c "AsTypeNode" "${INSPECT_LOG}" || true)
      echo "AsType nodes: ${ASTYPE_COUNT}"
      if [ "$ASTYPE_COUNT" -gt 23 ]; then
        echo "Failed: expected no more than 23 AsType nodes, got ${ASTYPE_COUNT}"
        exit 1
      fi
      echo "::endgroup::"

      echo "::group::Run Qwen 3.5 MoE inference"
      OUTPUT=$(${CONDA_RUN} python -m executorch.examples.models.qwen3_5_moe.run \
        --pte /tmp/qwen35_moe_mlx_tiny/model.pte \
        --prompt-len 4 \
        --max-new-tokens 5 2>&1)
      echo "$OUTPUT"
      # The tiny model is expected to generate exactly this token sequence.
      if echo "$OUTPUT" | grep -q "Generated token ids: \[167, 167, 81, 167, 81\]"; then
        echo "Success: Qwen 3.5 MoE MLX export + inference completed with expected output"
      else
        echo "Failed: unexpected output (expected [167, 167, 81, 167, 81])"
        exit 1
      fi
      echo "::endgroup::"
# Runs the shared backend test suites (models, operators) restricted to the
# flow_mlx marker. Test failures do not fail the job directly; the number of
# failed tests is compared against a per-suite threshold instead.
backend-tester:
  strategy:
    fail-fast: false
    matrix:
      suite: [models, operators]
  uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
  with:
    job-name: test-mlx-backend-${{ matrix.suite }}
    runner: macos-14-xlarge
    python-version: "3.12"
    submodules: recursive
    # Pull requests check out the PR head commit; other events use github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    timeout: 120
    script: |
      set -eux

      echo "::group::Install ExecuTorch"
      ${CONDA_RUN} python install_executorch.py > /dev/null
      echo "::endgroup::"

      ${CONDA_RUN} pip list

      echo "::group::Run backend test suite (${{ matrix.suite }})"
      # Failing tests are judged against a threshold below, so errexit is
      # suspended around the pipeline. pytest's own status is taken from
      # PIPESTATUS because the pipeline status would otherwise be tee's.
      # Assumes ${CONDA_RUN} passes the wrapped command's exit status through.
      set +e
      ${CONDA_RUN} pytest -c /dev/null backends/test/suite/${{ matrix.suite }}/ -m flow_mlx -n auto 2>&1 | tee pytest_output.txt
      PYTEST_STATUS=${PIPESTATUS[0]}
      set -e
      echo "::endgroup::"

      # pytest exits 0 when all tests passed and 1 when some tests failed.
      # Any other status (interrupted, internal error, usage error, no tests
      # collected) means there is no result to apply the threshold to.
      if [ "$PYTEST_STATUS" -ne 0 ] && [ "$PYTEST_STATUS" -ne 1 ]; then
        echo "::error::pytest did not complete a test run (exit status $PYTEST_STATUS)"
        exit 1
      fi

      # Parse pytest summary and check failure threshold. A missing summary
      # line is an error rather than "0 failures".
      SUMMARY=$(grep -E "^=+ .* =+$" pytest_output.txt | tail -1 || true)
      if [ -z "$SUMMARY" ]; then
        echo "::error::No pytest summary line found in pytest_output.txt"
        exit 1
      fi
      FAILED=$(echo "$SUMMARY" | grep -oE "[0-9]+ failed" | grep -oE "[0-9]+" || true)
      FAILED=${FAILED:-0}

      # The operators suite must be clean; the models suite tolerates up to
      # three failures.
      if [ "${{ matrix.suite }}" = "operators" ]; then
        MAX_FAILURES=0
      else
        MAX_FAILURES=3
      fi

      echo "Failed tests: $FAILED (max allowed: $MAX_FAILURES)"
      if [ "$FAILED" -gt "$MAX_FAILURES" ]; then
        echo "::error::Too many test failures: $FAILED > $MAX_FAILURES"
        exit 1
      fi
# Exports Parakeet with the MLX backend, builds the Parakeet MLX runner and
# checks that transcribing a sample recording produces the word "Phoebe".
test-mlx-parakeet:
  uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
  with:
    job-name: test-mlx-parakeet
    runner: macos-14-xlarge
    python-version: "3.12"
    submodules: recursive
    # Pull requests check out the PR head commit; other events use github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    timeout: 90
    script: |
      set -eux

      echo "::group::Install ExecuTorch"
      ${CONDA_RUN} python install_executorch.py > /dev/null
      echo "::endgroup::"

      echo "::group::Install Parakeet requirements"
      ${CONDA_RUN} pip install -r examples/models/parakeet/install_requirements.txt
      echo "::endgroup::"

      ${CONDA_RUN} pip list

      echo "::group::Export Parakeet"
      ${CONDA_RUN} python -m executorch.examples.models.parakeet.export_parakeet_tdt \
        --backend mlx \
        --dtype bf16 \
        --qlinear_encoder 4w \
        --qlinear_encoder_group_size 128 \
        --qlinear 4w \
        --qlinear_group_size 128 \
        --output-dir /tmp/parakeet_mlx
      echo "::endgroup::"

      echo "::group::Build Parakeet MLX runner"
      ${CONDA_RUN} make parakeet-mlx
      echo "::endgroup::"

      echo "::group::Run Parakeet MLX runner"
      # --fail makes curl exit non-zero on an HTTP error instead of saving
      # the error response as the audio file.
      curl --fail -L https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav -o /tmp/test_audio.wav
      OUTPUT=$(./cmake-out/examples/models/parakeet/parakeet_runner \
        --model_path /tmp/parakeet_mlx/model.pte \
        --audio_path /tmp/test_audio.wav \
        --tokenizer_path /tmp/parakeet_mlx/tokenizer.model 2>&1)
      echo "Runner output:"
      echo "$OUTPUT"
      # Case-insensitive check for a word expected in the transcription.
      if echo "$OUTPUT" | grep -iq "Phoebe"; then
        echo "Success: 'Phoebe' found in output"
      else
        echo "Failed: Expected 'Phoebe' not found in output"
        exit 1
      fi
      echo "::endgroup::"
# Exports Voxtral with the MLX backend, builds the Voxtral MLX runner and
# checks that its answer about a sample recording mentions "poem".
test-mlx-voxtral:
  uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
  secrets: inherit
  with:
    job-name: test-mlx-voxtral
    runner: macos-14-xlarge
    python-version: "3.12"
    submodules: recursive
    # Pull requests check out the PR head commit; other events use github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    # NOTE(review): the script below never references
    # SECRET_EXECUTORCH_HF_TOKEN; confirm whether this secret is still needed.
    secrets-env: EXECUTORCH_HF_TOKEN
    timeout: 90
    script: |
      set -eux

      echo "::group::Install ExecuTorch"
      ${CONDA_RUN} python install_executorch.py > /dev/null
      echo "::endgroup::"

      echo "::group::Install Voxtral requirements"
      ${CONDA_RUN} pip install mistral_common librosa soundfile datasets
      # optimum-executorch is installed at the commit pinned in the repository.
      OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
      ${CONDA_RUN} pip install "optimum-executorch @ git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}"
      echo "::endgroup::"

      ${CONDA_RUN} pip list

      echo "::group::Export Voxtral"
      ${CONDA_RUN} python -m executorch.backends.mlx.examples.voxtral.export_voxtral_hf \
        --output-dir /tmp/voxtral_mlx \
        --dtype bf16 \
        --qlinear 4w
      echo "::endgroup::"

      echo "::group::Build Voxtral MLX runner"
      ${CONDA_RUN} make voxtral-mlx
      echo "::endgroup::"

      echo "::group::Run Voxtral MLX runner"
      # --fail makes curl exit non-zero on an HTTP error instead of saving
      # the error response as the tokenizer or audio file.
      curl --fail -L https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main/tekken.json -o /tmp/tekken.json
      curl --fail -L https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav -o /tmp/test_audio.wav
      OUTPUT=$(./cmake-out/examples/models/voxtral/voxtral_runner \
        --model_path /tmp/voxtral_mlx/model.pte \
        --tokenizer_path /tmp/tekken.json \
        --audio_path /tmp/test_audio.wav \
        --processor_path /tmp/voxtral_mlx/preprocessor.pte \
        --prompt "What is happening in this audio?" \
        --temperature 0 2>&1)
      echo "Runner output:"
      echo "$OUTPUT"
      # Case-insensitive check for a word expected in the model's answer.
      if echo "$OUTPUT" | grep -iq "poem"; then
        echo "Success: 'poem' found in output"
      else
        echo "Failed: Expected 'poem' not found in output"
        exit 1
      fi
      echo "::endgroup::"
# Downloads the Voxtral Realtime checkpoint, exports the streaming
# preprocessor and model with the MLX backend, builds the runner and checks
# that transcribing a sample recording produces the word "Phoebe".
test-mlx-voxtral-realtime:
  uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
  secrets: inherit
  with:
    job-name: test-mlx-voxtral-realtime
    runner: macos-14-xlarge
    python-version: "3.12"
    submodules: recursive
    # Pull requests check out the PR head commit; other events use github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    # The script reads this secret as SECRET_EXECUTORCH_HF_TOKEN.
    secrets-env: EXECUTORCH_HF_TOKEN
    timeout: 90
    script: |
      set -eux

      echo "::group::Install ExecuTorch"
      ${CONDA_RUN} python install_executorch.py > /dev/null
      echo "::endgroup::"

      echo "::group::Install Voxtral Realtime requirements"
      ${CONDA_RUN} pip install safetensors
      echo "::endgroup::"

      ${CONDA_RUN} pip list

      echo "::group::Download model"
      # snapshot_download is invoked twice: once on its own and once with its
      # return value printed and captured as MODEL_PATH. Presumably the first
      # call keeps download output out of the captured value - TODO confirm.
      HF_TOKEN=$SECRET_EXECUTORCH_HF_TOKEN ${CONDA_RUN} python -c "from huggingface_hub import snapshot_download; snapshot_download('mistralai/Voxtral-Mini-4B-Realtime-2602')"
      MODEL_PATH=$(HF_TOKEN=$SECRET_EXECUTORCH_HF_TOKEN ${CONDA_RUN} python -c "from huggingface_hub import snapshot_download; print(snapshot_download('mistralai/Voxtral-Mini-4B-Realtime-2602'))")
      echo "Model path: ${MODEL_PATH}"
      echo "::endgroup::"

      echo "::group::Export preprocessor"
      ${CONDA_RUN} python -m executorch.extension.audio.mel_spectrogram \
        --feature_size 128 \
        --streaming \
        --backend mlx \
        --output_file /tmp/voxtral_rt_mlx/preprocessor.pte
      echo "::endgroup::"

      echo "::group::Export Voxtral Realtime (streaming)"
      ${CONDA_RUN} python -m executorch.examples.models.voxtral_realtime.export_voxtral_rt \
        --model-path "${MODEL_PATH}" \
        --backend mlx \
        --streaming \
        --output-dir /tmp/voxtral_rt_mlx \
        --qlinear-encoder 4w \
        --qlinear 4w \
        --qembedding 8w \
        --qembedding-group-size 128
      echo "::endgroup::"

      echo "::group::Build Voxtral Realtime MLX runner"
      ${CONDA_RUN} make voxtral_realtime-mlx
      echo "::endgroup::"

      echo "::group::Run Voxtral Realtime MLX runner"
      # --fail makes curl exit non-zero on an HTTP error instead of saving
      # the error response as the audio file.
      curl --fail -L https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav -o /tmp/test_audio.wav
      # The tokenizer is read from the downloaded model snapshot.
      OUTPUT=$(./cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner \
        --model_path /tmp/voxtral_rt_mlx/model.pte \
        --tokenizer_path "${MODEL_PATH}/tekken.json" \
        --preprocessor_path /tmp/voxtral_rt_mlx/preprocessor.pte \
        --audio_path /tmp/test_audio.wav \
        --streaming 2>&1)
      echo "Runner output:"
      echo "$OUTPUT"
      # Case-insensitive check for a word expected in the transcription.
      if echo "$OUTPUT" | grep -iq "Phoebe"; then
        echo "Success: 'Phoebe' found in output"
      else
        echo "Failed: Expected 'Phoebe' not found in output"
        exit 1
      fi
      echo "::endgroup::"
# Exports openai/whisper-tiny with the MLX backend and checks that the
# transcription of the sample audio contains "Mr. Quilter".
test-mlx-whisper:
  uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
  secrets: inherit
  with:
    job-name: test-mlx-whisper
    runner: macos-14-xlarge
    python-version: "3.12"
    submodules: recursive
    # Pull requests check out the PR head commit; other events use github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    # The script reads this secret as SECRET_EXECUTORCH_HF_TOKEN.
    secrets-env: EXECUTORCH_HF_TOKEN
    timeout: 90
    script: |
      set -eux

      echo "::group::Install ExecuTorch and configure MLX build"
      ${CONDA_RUN} python install_executorch.py > /dev/null
      echo "::endgroup::"

      echo "::group::Install Whisper requirements"
      # The Hugging Face CLI is installed first so the login can happen
      # before the remaining packages are installed.
      ${CONDA_RUN} pip install -U "huggingface_hub[cli]<1.0"
      ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
      ${CONDA_RUN} pip install transformers soundfile datasets librosa
      echo "::endgroup::"

      ${CONDA_RUN} pip list

      echo "::group::Export Whisper"
      ${CONDA_RUN} python -m executorch.backends.mlx.examples.whisper.export_whisper \
        --model-id "openai/whisper-tiny" \
        --output-dir /tmp/whisper_mlx \
        --dtype bf16 \
        --qlinear 4w
      echo "::endgroup::"

      echo "::group::Run Whisper inference"
      OUTPUT=$( ${CONDA_RUN} python -m executorch.backends.mlx.examples.whisper.run_whisper \
        --model-dir /tmp/whisper_mlx \
        --use-sample-audio 2>&1)
      echo "$OUTPUT"
      # Bail out unless the expected phrase appears (case-insensitive).
      if ! echo "$OUTPUT" | grep -iq "Mr. Quilter"; then
        echo "Failed: Expected 'Mr. Quilter' not found in transcription"
        exit 1
      fi
      echo "Success: 'Mr. Quilter' found in transcription"
      echo "::endgroup::"
# Builds ExecuTorch and the Llama runner with the MLX presets, exports the
# stories110M checkpoint with the MLX backend and runs it through the C++
# llama runner.
test-mlx-stories110m:
  uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
  with:
    job-name: test-mlx-stories110m
    runner: macos-14-xlarge
    python-version: "3.12"
    submodules: recursive
    # Pull requests check out the PR head commit; other events use github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    timeout: 90
    script: |
      set -eux

      echo "::group::Install ExecuTorch"
      ${CONDA_RUN} python install_executorch.py > /dev/null
      echo "::endgroup::"

      echo "::group::Install Llama requirements"
      ${CONDA_RUN} sh examples/models/llama/install_requirements.sh
      echo "::endgroup::"

      ${CONDA_RUN} pip list

      echo "::group::Build ExecuTorch with MLX delegate"
      ${CONDA_RUN} cmake --workflow --preset mlx-release
      echo "::endgroup::"

      echo "::group::Build Llama runner with MLX"
      pushd examples/models/llama
      ${CONDA_RUN} cmake --workflow --preset llama-release
      popd
      echo "::endgroup::"

      echo "::group::Download stories110M artifacts"
      # -f makes curl exit non-zero on an HTTP error instead of saving the
      # error response as the checkpoint or tokenizer; -S keeps error messages
      # visible even though -s silences the progress meter.
      curl -fsSL "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt" --output stories110M.pt
      curl -fsSL "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model" --output tokenizer.model
      # Model hyperparameters passed to the export step as base.params.
      echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json
      echo "::endgroup::"

      echo "::group::Create tokenizer.bin"
      ${CONDA_RUN} python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin
      echo "::endgroup::"

      echo "::group::Export stories110M with MLX backend via export_llama_lib"
      ${CONDA_RUN} python -m extension.llm.export.export_llm \
        base.checkpoint=stories110M.pt \
        base.params=params.json \
        model.use_kv_cache=true \
        model.dtype_override=fp32 \
        backend.mlx.enabled=true \
        quantization.qmode=4w \
        quantization.group_size=32 \
        export.output_name=/tmp/stories110m_mlx.pte
      echo "::endgroup::"

      echo "::group::Run inference with C++ llama runner"
      # Only the runner's exit status is checked; the generated text is not
      # inspected.
      ./cmake-out/examples/models/llama/llama_main \
        --model_path=/tmp/stories110m_mlx.pte \
        --tokenizer_path=tokenizer.bin \
        --prompt="Once upon a time," \
        --temperature=0 \
        --seq_len=10
      echo "::endgroup::"
# Exports each matrix model from Hugging Face with the MLX backend, once per
# combination of custom-op setting and quantization config, and checks that
# the answer to a fixed question contains "Paris".
test-mlx-llm:
  strategy:
    fail-fast: false
    matrix:
      model:
        - id: "unsloth/Llama-3.2-1B-Instruct"
          name: "llama-1b"
        - id: "unsloth/Qwen3-0.6B"
          name: "qwen3-0.6b"
        - id: "unsloth/gemma-3-1b-it"
          name: "gemma3-1b"
      use-custom:
        - false
        - true
      qconfig:
        - "4w"
        - "nvfp4"
  uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
  secrets: inherit
  with:
    # The name gains a "-custom" suffix when matrix.use-custom is true.
    job-name: test-mlx-llm-${{ matrix.model.name }}${{ matrix.use-custom && '-custom' || '' }}-${{ matrix.qconfig }}
    runner: macos-14-xlarge
    python-version: "3.12"
    submodules: recursive
    # Pull requests check out the PR head commit; other events use github.sha.
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    # The script reads this secret as SECRET_EXECUTORCH_HF_TOKEN.
    secrets-env: EXECUTORCH_HF_TOKEN
    timeout: 90
    script: |
      set -eux

      # Matrix values are copied into shell variables once, up front.
      MODEL_ID="${{ matrix.model.id }}"
      MODEL_NAME="${{ matrix.model.name }}"
      USE_CUSTOM="${{ matrix.use-custom }}"
      QCONFIG="${{ matrix.qconfig }}"

      # Extra export flags, added only for the use-custom matrix variant.
      CUSTOM_ARGS=""
      if [ "${USE_CUSTOM}" = "true" ]; then
        CUSTOM_ARGS="--use-custom-sdpa --use-custom-kv-cache"
      fi

      echo "::group::Install ExecuTorch and configure MLX build"
      ${CONDA_RUN} python install_executorch.py > /dev/null
      ${CONDA_RUN} cmake --preset mlx-release
      echo "::endgroup::"

      echo "::group::Install LLM requirements"
      ${CONDA_RUN} pip install -U "huggingface_hub[cli]<1.0"
      ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
      # optimum-executorch is installed at the commit pinned in the repository.
      OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
      ${CONDA_RUN} pip install transformers "optimum-executorch @ git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}"
      echo "::endgroup::"

      ${CONDA_RUN} pip list

      echo "::group::Export ${MODEL_NAME}"
      # CUSTOM_ARGS is left unquoted so that it expands to separate flags
      # (or to nothing when empty).
      ${CONDA_RUN} python -m executorch.backends.mlx.examples.llm.export_llm_hf \
        --model-id "${MODEL_ID}" \
        --output /tmp/${MODEL_NAME}.pte \
        --qlinear ${QCONFIG} \
        --qembedding ${QCONFIG} \
        ${CUSTOM_ARGS}
      echo "::endgroup::"

      echo "::group::Run ${MODEL_NAME} inference"
      OUTPUT=$(${CONDA_RUN} python -m executorch.backends.mlx.examples.llm.run_llm_hf \
        --pte /tmp/${MODEL_NAME}.pte \
        --model-id "${MODEL_ID}" \
        --prompt "What is the capital of France?" \
        --max-new-tokens 50 2>&1)
      echo "$OUTPUT"
      # Bail out unless the expected answer appears (case-insensitive).
      if ! echo "$OUTPUT" | grep -iq "Paris"; then
        echo "Failed: Expected 'Paris' not found in output"
        exit 1
      fi
      echo "Success: 'Paris' found in output"
      echo "::endgroup::"