Fix Multiple constraints for allocation for two cat inputs of same un… #16
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# CI workflow for the MLX backend: builds the delegate on macOS runners and
# exercises unit tests plus end-to-end model exports (Qwen MoE, Parakeet,
# Voxtral, Whisper, stories110M, HF LLMs).
name: MLX

on:
  push:
    branches:
      - main
      - release/*
  pull_request:
    # Only run on PRs that touch MLX code or the models this workflow tests.
    paths:
      - .github/workflows/mlx.yml
      - backends/mlx/**
      - extension/llm/export/**
      - extension/audio/**
      - examples/models/parakeet/**
      - examples/models/voxtral_realtime/**
      - examples/models/qwen3_5_moe/**
  # Allow manual runs from the Actions tab.
  workflow_dispatch:

# These jobs do not need any GITHUB_TOKEN permissions.
permissions: {}
jobs:
  # Core MLX test job: op unit tests, Python unit tests, a multi-thread
  # stress run, and the gated_delta_rule op tests.
  test-mlx:
    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
    with:
      job-name: test-mlx
      runner: macos-14-xlarge
      python-version: "3.12"
      submodules: recursive
      # Test the PR head commit on pull requests, the pushed commit otherwise.
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 90
      script: |
        set -eux
        echo "::group::Install ExecuTorch and configure build"
        ${CONDA_RUN} python install_executorch.py > /dev/null
        # The sanitizers fail on github VM runner, but pass on real device
        # TODO: figure out why
        ${CONDA_RUN} cmake --preset mlx-release -DEXECUTORCH_BUILD_TESTS=ON -DEXECUTORCH_MLX_ENABLE_SANITIZERS=OFF
        echo "::endgroup::"
        ${CONDA_RUN} pip list
        echo "::group::Build test runners"
        # Build with ncpu-1 jobs to leave one core free for the OS.
        ${CONDA_RUN} cmake --build cmake-out --target op_test_runner multi_thread_test_runner -j$(( $(sysctl -n hw.ncpu) - 1 ))
        echo "::endgroup::"
        echo "::group::Run op unit tests"
        ${CONDA_RUN} python -m executorch.backends.mlx.test.run_all_tests -j4 --max-tasks-per-worker 10 --clean-after
        echo "::endgroup::"
        echo "::group::Run Python unit tests"
        ${CONDA_RUN} python -m pytest \
          backends/mlx/test/test_passes.py \
          backends/mlx/test/test_pattern_utils.py \
          backends/mlx/test/test_partitioner.py \
          -v
        echo "::endgroup::"
        echo "::group::Run multi-thread stress test"
        ${CONDA_RUN} python backends/mlx/test/export_multi_thread_test_model.py /tmp/multi_thread_test_model.pte
        # Hammer the exported model from 50 threads (100 predictions each)
        # to shake out concurrency issues in the runtime.
        ET_TESTING_MODEL_PATH=/tmp/multi_thread_test_model.pte \
        ET_TESTING_NUM_THREADS=50 \
        ET_PREDICTIONS_PER_THREAD=100 \
        ./cmake-out/backends/mlx/test/multi_thread_test_runner
        echo "::endgroup::"
        echo "::group::Run gated_delta_rule op tests"
        ${CONDA_RUN} python -m executorch.backends.mlx.model_ops.test_gated_delta_rule run -v
        echo "::endgroup::"
  # End-to-end Qwen 3.5 MoE test: export a tiny quantized model, sanity-check
  # the number of AsType (cast) nodes in the graph, and verify deterministic
  # token output from inference.
  test-mlx-qwen35-moe:
    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
    with:
      job-name: test-mlx-qwen35-moe
      runner: macos-14-xlarge
      python-version: "3.12"
      submodules: recursive
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 90
      script: |
        set -eux
        echo "::group::Install ExecuTorch"
        ${CONDA_RUN} python install_executorch.py > /dev/null
        echo "::endgroup::"
        ${CONDA_RUN} pip list
        echo "::group::Export Qwen 3.5 MoE (tiny model)"
        ${CONDA_RUN} python -m executorch.examples.models.qwen3_5_moe.export \
          --tiny-test \
          --backend mlx \
          --qlinear 4w \
          --qlinear-group-size 32 \
          --output-dir /tmp/qwen35_moe_mlx_tiny
        echo "::endgroup::"
        echo "::group::Check AsType node count"
        # Guard against dtype-cast regressions. `|| true` keeps grep's
        # non-zero exit (when there are zero matches) from killing the job
        # under `set -e`; grep -c still prints 0 in that case.
        ASTYPE_COUNT=$(${CONDA_RUN} python -m executorch.backends.mlx.pte_inspector \
          /tmp/qwen35_moe_mlx_tiny/model.pte --mlx-instructions 2>&1 | grep -c "AsTypeNode" || true)
        echo "AsType nodes: ${ASTYPE_COUNT}"
        if [ "$ASTYPE_COUNT" -gt 23 ]; then
          echo "Failed: expected no more than 23 AsType nodes, got ${ASTYPE_COUNT}"
          exit 1
        fi
        echo "::endgroup::"
        echo "::group::Run Qwen 3.5 MoE inference"
        OUTPUT=$(${CONDA_RUN} python -m executorch.examples.models.qwen3_5_moe.run \
          --pte /tmp/qwen35_moe_mlx_tiny/model.pte \
          --prompt-len 4 \
          --max-new-tokens 5 2>&1)
        echo "$OUTPUT"
        # The tiny model is deterministic, so the exact token ids are pinned.
        if echo "$OUTPUT" | grep -q "Generated token ids: \[167, 167, 81, 167, 81\]"; then
          echo "Success: Qwen 3.5 MoE MLX export + inference completed with expected output"
        else
          echo "Failed: unexpected output (expected [167, 167, 81, 167, 81])"
          exit 1
        fi
        echo "::endgroup::"
  # Generic backend test suites (models + operators) with a per-suite
  # failure budget enforced by parsing the pytest summary line.
  backend-tester:
    strategy:
      fail-fast: false
      matrix:
        suite: [models, operators]
    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
    with:
      job-name: test-mlx-backend-${{ matrix.suite }}
      runner: macos-14-xlarge
      python-version: "3.12"
      submodules: recursive
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 120
      script: |
        set -eux
        echo "::group::Install ExecuTorch"
        ${CONDA_RUN} python install_executorch.py > /dev/null
        echo "::endgroup::"
        ${CONDA_RUN} pip list
        echo "::group::Run backend test suite (${{ matrix.suite }})"
        # `|| true`: don't fail here; the failure count is checked below
        # against MAX_FAILURES instead.
        ${CONDA_RUN} pytest -c /dev/null backends/test/suite/${{ matrix.suite }}/ -m flow_mlx -n auto 2>&1 | tee pytest_output.txt || true
        echo "::endgroup::"
        # Parse pytest summary and check failure threshold
        if grep -E "^=+ .* =+$" pytest_output.txt | tail -1 | grep -q "failed"; then
          FAILED=$(grep -E "^=+ .* =+$" pytest_output.txt | tail -1 | grep -oE "[0-9]+ failed" | grep -oE "[0-9]+")
        else
          FAILED=0
        fi
        # Operators must be clean; the models suite currently tolerates up
        # to 3 known failures.
        if [ "${{ matrix.suite }}" = "operators" ]; then
          MAX_FAILURES=0
        else
          MAX_FAILURES=3
        fi
        echo "Failed tests: $FAILED (max allowed: $MAX_FAILURES)"
        if [ "$FAILED" -gt "$MAX_FAILURES" ]; then
          echo "::error::Too many test failures: $FAILED > $MAX_FAILURES"
          exit 1
        fi
| test-mlx-parakeet: | |
| uses: pytorch/test-infra/.github/workflows/macos_job.yml@main | |
| with: | |
| job-name: test-mlx-parakeet | |
| runner: macos-14-xlarge | |
| python-version: "3.12" | |
| submodules: recursive | |
| ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} | |
| timeout: 90 | |
| script: | | |
| set -eux | |
| echo "::group::Install ExecuTorch" | |
| ${CONDA_RUN} python install_executorch.py > /dev/null | |
| echo "::endgroup::" | |
| echo "::group::Install Parakeet requirements" | |
| ${CONDA_RUN} pip install -r examples/models/parakeet/install_requirements.txt | |
| echo "::endgroup::" | |
| ${CONDA_RUN} pip list | |
| echo "::group::Export Parakeet" | |
| ${CONDA_RUN} python -m executorch.examples.models.parakeet.export_parakeet_tdt \ | |
| --backend mlx \ | |
| --dtype bf16 \ | |
| --qlinear_encoder 4w \ | |
| --qlinear_encoder_group_size 128 \ | |
| --qlinear 4w \ | |
| --qlinear_group_size 128 \ | |
| --output-dir /tmp/parakeet_mlx | |
| echo "::endgroup::" | |
| echo "::group::Build Parakeet MLX runner" | |
| ${CONDA_RUN} make parakeet-mlx | |
| echo "::endgroup::" | |
| echo "::group::Run Parakeet MLX runner" | |
| curl -L https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav -o /tmp/test_audio.wav | |
| OUTPUT=$(./cmake-out/examples/models/parakeet/parakeet_runner \ | |
| --model_path /tmp/parakeet_mlx/model.pte \ | |
| --audio_path /tmp/test_audio.wav \ | |
| --tokenizer_path /tmp/parakeet_mlx/tokenizer.model 2>&1) | |
| echo "Runner output:" | |
| echo "$OUTPUT" | |
| if echo "$OUTPUT" | grep -iq "Phoebe"; then | |
| echo "Success: 'Phoebe' found in output" | |
| else | |
| echo "Failed: Expected 'Phoebe' not found in output" | |
| exit 1 | |
| fi | |
| echo "::endgroup::" | |
| test-mlx-voxtral: | |
| uses: pytorch/test-infra/.github/workflows/macos_job.yml@main | |
| secrets: inherit | |
| with: | |
| job-name: test-mlx-voxtral | |
| runner: macos-14-xlarge | |
| python-version: "3.12" | |
| submodules: recursive | |
| ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} | |
| secrets-env: EXECUTORCH_HF_TOKEN | |
| timeout: 90 | |
| script: | | |
| set -eux | |
| echo "::group::Install ExecuTorch" | |
| ${CONDA_RUN} python install_executorch.py > /dev/null | |
| echo "::endgroup::" | |
| echo "::group::Install Voxtral requirements" | |
| ${CONDA_RUN} pip install mistral_common librosa soundfile datasets | |
| OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) | |
| ${CONDA_RUN} pip install "optimum-executorch @ git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}" | |
| echo "::endgroup::" | |
| ${CONDA_RUN} pip list | |
| echo "::group::Export Voxtral" | |
| ${CONDA_RUN} python -m executorch.backends.mlx.examples.voxtral.export_voxtral_hf \ | |
| --output-dir /tmp/voxtral_mlx \ | |
| --dtype bf16 \ | |
| --qlinear 4w | |
| echo "::endgroup::" | |
| echo "::group::Build Voxtral MLX runner" | |
| ${CONDA_RUN} make voxtral-mlx | |
| echo "::endgroup::" | |
| echo "::group::Run Voxtral MLX runner" | |
| curl -L https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main/tekken.json -o /tmp/tekken.json | |
| curl -L https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav -o /tmp/test_audio.wav | |
| OUTPUT=$(./cmake-out/examples/models/voxtral/voxtral_runner \ | |
| --model_path /tmp/voxtral_mlx/model.pte \ | |
| --tokenizer_path /tmp/tekken.json \ | |
| --audio_path /tmp/test_audio.wav \ | |
| --processor_path /tmp/voxtral_mlx/preprocessor.pte \ | |
| --prompt "What is happening in this audio?" \ | |
| --temperature 0 2>&1) | |
| echo "Runner output:" | |
| echo "$OUTPUT" | |
| if echo "$OUTPUT" | grep -iq "poem"; then | |
| echo "Success: 'poem' found in output" | |
| else | |
| echo "Failed: Expected 'poem' not found in output" | |
| exit 1 | |
| fi | |
| echo "::endgroup::" | |
| test-mlx-voxtral-realtime: | |
| uses: pytorch/test-infra/.github/workflows/macos_job.yml@main | |
| secrets: inherit | |
| with: | |
| job-name: test-mlx-voxtral-realtime | |
| runner: macos-14-xlarge | |
| python-version: "3.12" | |
| submodules: recursive | |
| ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} | |
| secrets-env: EXECUTORCH_HF_TOKEN | |
| timeout: 90 | |
| script: | | |
| set -eux | |
| echo "::group::Install ExecuTorch" | |
| ${CONDA_RUN} python install_executorch.py > /dev/null | |
| echo "::endgroup::" | |
| echo "::group::Install Voxtral Realtime requirements" | |
| ${CONDA_RUN} pip install safetensors | |
| echo "::endgroup::" | |
| ${CONDA_RUN} pip list | |
| echo "::group::Download model" | |
| HF_TOKEN=$SECRET_EXECUTORCH_HF_TOKEN ${CONDA_RUN} python -c "from huggingface_hub import snapshot_download; snapshot_download('mistralai/Voxtral-Mini-4B-Realtime-2602')" | |
| MODEL_PATH=$(HF_TOKEN=$SECRET_EXECUTORCH_HF_TOKEN ${CONDA_RUN} python -c "from huggingface_hub import snapshot_download; print(snapshot_download('mistralai/Voxtral-Mini-4B-Realtime-2602'))") | |
| echo "Model path: ${MODEL_PATH}" | |
| echo "::endgroup::" | |
| echo "::group::Export preprocessor" | |
| ${CONDA_RUN} python -m executorch.extension.audio.mel_spectrogram \ | |
| --feature_size 128 \ | |
| --streaming \ | |
| --backend mlx \ | |
| --output_file /tmp/voxtral_rt_mlx/preprocessor.pte | |
| echo "::endgroup::" | |
| echo "::group::Export Voxtral Realtime (streaming)" | |
| ${CONDA_RUN} python -m executorch.examples.models.voxtral_realtime.export_voxtral_rt \ | |
| --model-path "${MODEL_PATH}" \ | |
| --backend mlx \ | |
| --streaming \ | |
| --output-dir /tmp/voxtral_rt_mlx \ | |
| --qlinear-encoder 4w \ | |
| --qlinear 4w \ | |
| --qembedding 8w \ | |
| --qembedding-group-size 128 | |
| echo "::endgroup::" | |
| echo "::group::Build Voxtral Realtime MLX runner" | |
| ${CONDA_RUN} make voxtral_realtime-mlx | |
| echo "::endgroup::" | |
| echo "::group::Run Voxtral Realtime MLX runner" | |
| curl -L https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav -o /tmp/test_audio.wav | |
| OUTPUT=$(./cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner \ | |
| --model_path /tmp/voxtral_rt_mlx/model.pte \ | |
| --tokenizer_path "${MODEL_PATH}/tekken.json" \ | |
| --preprocessor_path /tmp/voxtral_rt_mlx/preprocessor.pte \ | |
| --audio_path /tmp/test_audio.wav \ | |
| --streaming 2>&1) | |
| echo "Runner output:" | |
| echo "$OUTPUT" | |
| if echo "$OUTPUT" | grep -iq "Phoebe"; then | |
| echo "Success: 'Phoebe' found in output" | |
| else | |
| echo "Failed: Expected 'Phoebe' not found in output" | |
| exit 1 | |
| fi | |
| echo "::endgroup::" | |
| test-mlx-whisper: | |
| uses: pytorch/test-infra/.github/workflows/macos_job.yml@main | |
| secrets: inherit | |
| with: | |
| job-name: test-mlx-whisper | |
| runner: macos-14-xlarge | |
| python-version: "3.12" | |
| submodules: recursive | |
| ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} | |
| secrets-env: EXECUTORCH_HF_TOKEN | |
| timeout: 90 | |
| script: | | |
| set -eux | |
| echo "::group::Install ExecuTorch and configure MLX build" | |
| ${CONDA_RUN} python install_executorch.py > /dev/null | |
| echo "::endgroup::" | |
| echo "::group::Install Whisper requirements" | |
| ${CONDA_RUN} pip install -U "huggingface_hub[cli]<1.0" | |
| ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN | |
| ${CONDA_RUN} pip install transformers soundfile datasets librosa | |
| echo "::endgroup::" | |
| ${CONDA_RUN} pip list | |
| echo "::group::Export Whisper" | |
| ${CONDA_RUN} python -m executorch.backends.mlx.examples.whisper.export_whisper \ | |
| --model-id "openai/whisper-tiny" \ | |
| --output-dir /tmp/whisper_mlx \ | |
| --dtype bf16 \ | |
| --qlinear 4w | |
| echo "::endgroup::" | |
| echo "::group::Run Whisper inference" | |
| OUTPUT=$( ${CONDA_RUN} python -m executorch.backends.mlx.examples.whisper.run_whisper \ | |
| --model-dir /tmp/whisper_mlx \ | |
| --use-sample-audio 2>&1) | |
| echo "$OUTPUT" | |
| if echo "$OUTPUT" | grep -iq "Mr. Quilter"; then | |
| echo "Success: 'Mr. Quilter' found in transcription" | |
| else | |
| echo "Failed: Expected 'Mr. Quilter' not found in transcription" | |
| exit 1 | |
| fi | |
| echo "::endgroup::" | |
| test-mlx-stories110m: | |
| uses: pytorch/test-infra/.github/workflows/macos_job.yml@main | |
| with: | |
| job-name: test-mlx-stories110m | |
| runner: macos-14-xlarge | |
| python-version: "3.12" | |
| submodules: recursive | |
| ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} | |
| timeout: 90 | |
| script: | | |
| set -eux | |
| echo "::group::Install ExecuTorch" | |
| ${CONDA_RUN} python install_executorch.py > /dev/null | |
| echo "::endgroup::" | |
| echo "::group::Install Llama requirements" | |
| ${CONDA_RUN} sh examples/models/llama/install_requirements.sh | |
| echo "::endgroup::" | |
| ${CONDA_RUN} pip list | |
| echo "::group::Build ExecuTorch with MLX delegate" | |
| ${CONDA_RUN} cmake --workflow --preset mlx-release | |
| echo "::endgroup::" | |
| echo "::group::Build Llama runner with MLX" | |
| pushd examples/models/llama | |
| ${CONDA_RUN} cmake --workflow --preset llama-release | |
| popd | |
| echo "::endgroup::" | |
| echo "::group::Download stories110M artifacts" | |
| curl -Ls "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt" --output stories110M.pt | |
| curl -Ls "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model" --output tokenizer.model | |
| echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json | |
| echo "::endgroup::" | |
| echo "::group::Create tokenizer.bin" | |
| ${CONDA_RUN} python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin | |
| echo "::endgroup::" | |
| echo "::group::Export stories110M with MLX backend via export_llama_lib" | |
| ${CONDA_RUN} python -m extension.llm.export.export_llm \ | |
| base.checkpoint=stories110M.pt \ | |
| base.params=params.json \ | |
| model.use_kv_cache=true \ | |
| model.dtype_override=fp32 \ | |
| backend.mlx.enabled=true \ | |
| quantization.qmode=4w \ | |
| quantization.group_size=32 \ | |
| export.output_name=/tmp/stories110m_mlx.pte | |
| echo "::endgroup::" | |
| echo "::group::Run inference with C++ llama runner" | |
| ./cmake-out/examples/models/llama/llama_main \ | |
| --model_path=/tmp/stories110m_mlx.pte \ | |
| --tokenizer_path=tokenizer.bin \ | |
| --prompt="Once upon a time," \ | |
| --temperature=0 \ | |
| --seq_len=10 | |
| echo "::endgroup::" | |
| test-mlx-llm: | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| model: | |
| - id: "unsloth/Llama-3.2-1B-Instruct" | |
| name: "llama-1b" | |
| - id: "unsloth/Qwen3-0.6B" | |
| name: "qwen3-0.6b" | |
| - id: "unsloth/gemma-3-1b-it" | |
| name: "gemma3-1b" | |
| use-custom: [false, true] | |
| qconfig: ["4w", "nvfp4"] | |
| uses: pytorch/test-infra/.github/workflows/macos_job.yml@main | |
| secrets: inherit | |
| with: | |
| job-name: test-mlx-llm-${{ matrix.model.name }}${{ matrix.use-custom && '-custom' || '' }}-${{ matrix.qconfig }} | |
| runner: macos-14-xlarge | |
| python-version: "3.12" | |
| submodules: recursive | |
| ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} | |
| secrets-env: EXECUTORCH_HF_TOKEN | |
| timeout: 90 | |
| script: | | |
| set -eux | |
| MODEL_ID="${{ matrix.model.id }}" | |
| MODEL_NAME="${{ matrix.model.name }}" | |
| USE_CUSTOM="${{ matrix.use-custom }}" | |
| QCONFIG="${{ matrix.qconfig }}" | |
| CUSTOM_ARGS="" | |
| if [ "${USE_CUSTOM}" = "true" ]; then | |
| CUSTOM_ARGS="--use-custom-sdpa --use-custom-kv-cache" | |
| fi | |
| echo "::group::Install ExecuTorch and configure MLX build" | |
| ${CONDA_RUN} python install_executorch.py > /dev/null | |
| ${CONDA_RUN} cmake --preset mlx-release | |
| echo "::endgroup::" | |
| echo "::group::Install LLM requirements" | |
| ${CONDA_RUN} pip install -U "huggingface_hub[cli]<1.0" | |
| ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN | |
| OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) | |
| ${CONDA_RUN} pip install transformers "optimum-executorch @ git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}" | |
| echo "::endgroup::" | |
| ${CONDA_RUN} pip list | |
| echo "::group::Export ${MODEL_NAME}" | |
| ${CONDA_RUN} python -m executorch.backends.mlx.examples.llm.export_llm_hf \ | |
| --model-id "${MODEL_ID}" \ | |
| --output /tmp/${MODEL_NAME}.pte \ | |
| --qlinear ${QCONFIG} \ | |
| --qembedding ${QCONFIG} \ | |
| ${CUSTOM_ARGS} | |
| echo "::endgroup::" | |
| echo "::group::Run ${MODEL_NAME} inference" | |
| OUTPUT=$(${CONDA_RUN} python -m executorch.backends.mlx.examples.llm.run_llm_hf \ | |
| --pte /tmp/${MODEL_NAME}.pte \ | |
| --model-id "${MODEL_ID}" \ | |
| --prompt "What is the capital of France?" \ | |
| --max-new-tokens 50 2>&1) | |
| echo "$OUTPUT" | |
| if echo "$OUTPUT" | grep -iq "Paris"; then | |
| echo "Success: 'Paris' found in output" | |
| else | |
| echo "Failed: Expected 'Paris' not found in output" | |
| exit 1 | |
| fi | |
| echo "::endgroup::" |