MLX

Fix Multiple constraints for allocation for two cat inputs of same un… #16

Workflow file for this run

	name: MLX

	# Triggers: pushes to main and release branches, pull requests that touch
	# MLX-related paths (or this workflow file), and manual dispatch.
	on:
	  push:
	    branches:
	      - main
	      - release/*
	  pull_request:
	    paths:
	      - .github/workflows/mlx.yml
	      - backends/mlx/**
	      - extension/llm/export/**
	      - extension/audio/**
	      - examples/models/parakeet/**
	      - examples/models/voxtral_realtime/**
	      - examples/models/qwen3_5_moe/**
	  workflow_dispatch:

	# Grant no default token permissions at the workflow level.
	permissions: {}

	jobs:
	test-mlx:
	  # Configures the mlx-release preset with tests enabled, builds the C++ test
	  # runners, then runs the op, Python, multi-thread and gated_delta_rule tests.
	  uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
	  with:
	    job-name: test-mlx
	    runner: macos-14-xlarge
	    python-version: "3.12"
	    submodules: recursive
	    # PR head commit on pull_request events, otherwise the pushed commit.
	    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
	    timeout: 90
	    script: |
	      set -eux

	      echo "::group::Install ExecuTorch and configure build"
	      ${CONDA_RUN} python install_executorch.py > /dev/null
	      # The sanitizers fail on github VM runner, but pass on real device
	      # TODO: figure out why
	      ${CONDA_RUN} cmake --preset mlx-release -DEXECUTORCH_BUILD_TESTS=ON -DEXECUTORCH_MLX_ENABLE_SANITIZERS=OFF
	      echo "::endgroup::"

	      ${CONDA_RUN} pip list

	      echo "::group::Build test runners"
	      # Use all cores but one for the build.
	      ${CONDA_RUN} cmake --build cmake-out --target op_test_runner multi_thread_test_runner -j$(( $(sysctl -n hw.ncpu) - 1 ))
	      echo "::endgroup::"

	      echo "::group::Run op unit tests"
	      ${CONDA_RUN} python -m executorch.backends.mlx.test.run_all_tests -j4 --max-tasks-per-worker 10 --clean-after
	      echo "::endgroup::"

	      echo "::group::Run Python unit tests"
	      ${CONDA_RUN} python -m pytest \
	        backends/mlx/test/test_passes.py \
	        backends/mlx/test/test_pattern_utils.py \
	        backends/mlx/test/test_partitioner.py \
	        -v
	      echo "::endgroup::"

	      echo "::group::Run multi-thread stress test"
	      ${CONDA_RUN} python backends/mlx/test/export_multi_thread_test_model.py /tmp/multi_thread_test_model.pte
	      # The runner is configured through environment variables.
	      ET_TESTING_MODEL_PATH=/tmp/multi_thread_test_model.pte \
	      ET_TESTING_NUM_THREADS=50 \
	      ET_PREDICTIONS_PER_THREAD=100 \
	        ./cmake-out/backends/mlx/test/multi_thread_test_runner
	      echo "::endgroup::"

	      echo "::group::Run gated_delta_rule op tests"
	      ${CONDA_RUN} python -m executorch.backends.mlx.model_ops.test_gated_delta_rule run -v
	      echo "::endgroup::"

	test-mlx-qwen35-moe:
	  # Exports the tiny Qwen 3.5 MoE test model for MLX, bounds the number of
	  # AsType nodes in the exported program, and checks the generated token ids.
	  uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
	  with:
	    job-name: test-mlx-qwen35-moe
	    runner: macos-14-xlarge
	    python-version: "3.12"
	    submodules: recursive
	    # PR head commit on pull_request events, otherwise the pushed commit.
	    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
	    timeout: 90
	    script: |
	      set -eux

	      echo "::group::Install ExecuTorch"
	      ${CONDA_RUN} python install_executorch.py > /dev/null
	      echo "::endgroup::"

	      ${CONDA_RUN} pip list

	      echo "::group::Export Qwen 3.5 MoE (tiny model)"
	      ${CONDA_RUN} python -m executorch.examples.models.qwen3_5_moe.export \
	        --tiny-test \
	        --backend mlx \
	        --qlinear 4w \
	        --qlinear-group-size 32 \
	        --output-dir /tmp/qwen35_moe_mlx_tiny
	      echo "::endgroup::"

	      echo "::group::Check AsType node count"
	      # Capture the inspector output on its own so that an inspector failure
	      # aborts the job (set -e) instead of being counted as zero matches.
	      INSPECT_OUTPUT=$(${CONDA_RUN} python -m executorch.backends.mlx.pte_inspector \
	        /tmp/qwen35_moe_mlx_tiny/model.pte --mlx-instructions 2>&1)
	      # grep -c exits non-zero when the count is 0, hence the "|| true".
	      ASTYPE_COUNT=$(echo "$INSPECT_OUTPUT" | grep -c "AsTypeNode" || true)
	      echo "AsType nodes: ${ASTYPE_COUNT}"
	      if [ "$ASTYPE_COUNT" -gt 23 ]; then
	        echo "Failed: expected no more than 23 AsType nodes, got ${ASTYPE_COUNT}"
	        exit 1
	      fi
	      echo "::endgroup::"

	      echo "::group::Run Qwen 3.5 MoE inference"
	      OUTPUT=$(${CONDA_RUN} python -m executorch.examples.models.qwen3_5_moe.run \
	        --pte /tmp/qwen35_moe_mlx_tiny/model.pte \
	        --prompt-len 4 \
	        --max-new-tokens 5 2>&1)
	      echo "$OUTPUT"
	      # The run must produce exactly this token sequence.
	      if echo "$OUTPUT" | grep -q "Generated token ids: \[167, 167, 81, 167, 81\]"; then
	        echo "Success: Qwen 3.5 MoE MLX export + inference completed with expected output"
	      else
	        echo "Failed: unexpected output (expected [167, 167, 81, 167, 81])"
	        exit 1
	      fi
	      echo "::endgroup::"

	backend-tester:
	  # Runs the shared backend test suite against the MLX flow, once per suite,
	  # and fails only when failures exceed a per-suite threshold.
	  strategy:
	    fail-fast: false
	    matrix:
	      suite: [models, operators]
	  uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
	  with:
	    job-name: test-mlx-backend-${{ matrix.suite }}
	    runner: macos-14-xlarge
	    python-version: "3.12"
	    submodules: recursive
	    # PR head commit on pull_request events, otherwise the pushed commit.
	    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
	    timeout: 120
	    script: |
	      set -eux

	      echo "::group::Install ExecuTorch"
	      ${CONDA_RUN} python install_executorch.py > /dev/null
	      echo "::endgroup::"

	      ${CONDA_RUN} pip list

	      echo "::group::Run backend test suite (${{ matrix.suite }})"
	      # The pytest exit status is deliberately ignored here; pass/fail is
	      # decided below from the summary line against a failure threshold.
	      ${CONDA_RUN} pytest -c /dev/null backends/test/suite/${{ matrix.suite }}/ -m flow_mlx -n auto 2>&1 | tee pytest_output.txt || true
	      echo "::endgroup::"

	      # Locate the final pytest summary line ("=== N failed, M passed in 1.2s ===").
	      # Requiring the "in <seconds>s" part keeps section banners such as
	      # "=== FAILURES ===" from being mistaken for the summary.
	      SUMMARY=$(grep -E "^=+ .* in [0-9.]+s.* =+$" pytest_output.txt | tail -1)
	      if [ -z "$SUMMARY" ]; then
	        # No summary means pytest never finished; do not report that as success.
	        echo "::error::No pytest summary line found; the test run did not complete"
	        exit 1
	      fi

	      # Each count is empty when the summary does not mention it; default to 0.
	      FAILED=$(echo "$SUMMARY" | grep -oE "[0-9]+ failed" | grep -oE "[0-9]+" || true)
	      ERRORS=$(echo "$SUMMARY" | grep -oE "[0-9]+ errors?" | grep -oE "[0-9]+" || true)
	      FAILED=${FAILED:-0}
	      ERRORS=${ERRORS:-0}
	      # Errors count toward the threshold together with failures.
	      TOTAL=$(( FAILED + ERRORS ))

	      # The operators suite must be fully green; models tolerates up to 3.
	      if [ "${{ matrix.suite }}" = "operators" ]; then
	        MAX_FAILURES=0
	      else
	        MAX_FAILURES=3
	      fi

	      echo "Failed tests: $FAILED, errors: $ERRORS (max allowed: $MAX_FAILURES)"
	      if [ "$TOTAL" -gt "$MAX_FAILURES" ]; then
	        echo "::error::Too many test failures: $TOTAL > $MAX_FAILURES"
	        exit 1
	      fi

	test-mlx-parakeet:
	  # Exports Parakeet TDT for MLX, builds the C++ runner, and checks that the
	  # output for a sample clip contains the word "Phoebe".
	  uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
	  with:
	    job-name: test-mlx-parakeet
	    runner: macos-14-xlarge
	    python-version: "3.12"
	    submodules: recursive
	    # PR head commit on pull_request events, otherwise the pushed commit.
	    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
	    timeout: 90
	    script: |
	      set -eux

	      echo "::group::Install ExecuTorch"
	      ${CONDA_RUN} python install_executorch.py > /dev/null
	      echo "::endgroup::"

	      echo "::group::Install Parakeet requirements"
	      ${CONDA_RUN} pip install -r examples/models/parakeet/install_requirements.txt
	      echo "::endgroup::"

	      ${CONDA_RUN} pip list

	      echo "::group::Export Parakeet"
	      ${CONDA_RUN} python -m executorch.examples.models.parakeet.export_parakeet_tdt \
	        --backend mlx \
	        --dtype bf16 \
	        --qlinear_encoder 4w \
	        --qlinear_encoder_group_size 128 \
	        --qlinear 4w \
	        --qlinear_group_size 128 \
	        --output-dir /tmp/parakeet_mlx
	      echo "::endgroup::"

	      echo "::group::Build Parakeet MLX runner"
	      ${CONDA_RUN} make parakeet-mlx
	      echo "::endgroup::"

	      echo "::group::Run Parakeet MLX runner"
	      # -f makes curl fail on an HTTP error instead of saving the error page
	      # as the audio file.
	      curl -fL https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav -o /tmp/test_audio.wav
	      OUTPUT=$(./cmake-out/examples/models/parakeet/parakeet_runner \
	        --model_path /tmp/parakeet_mlx/model.pte \
	        --audio_path /tmp/test_audio.wav \
	        --tokenizer_path /tmp/parakeet_mlx/tokenizer.model 2>&1)
	      echo "Runner output:"
	      echo "$OUTPUT"
	      # Case-insensitive check for the expected word in the runner output.
	      if echo "$OUTPUT" | grep -iq "Phoebe"; then
	        echo "Success: 'Phoebe' found in output"
	      else
	        echo "Failed: Expected 'Phoebe' not found in output"
	        exit 1
	      fi
	      echo "::endgroup::"

	test-mlx-voxtral:
	  # Exports Voxtral for MLX, builds the C++ runner, and checks that the
	  # response to an audio prompt mentions "poem".
	  uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
	  secrets: inherit
	  with:
	    job-name: test-mlx-voxtral
	    runner: macos-14-xlarge
	    python-version: "3.12"
	    submodules: recursive
	    # PR head commit on pull_request events, otherwise the pushed commit.
	    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
	    secrets-env: EXECUTORCH_HF_TOKEN
	    timeout: 90
	    script: |
	      set -eux

	      echo "::group::Install ExecuTorch"
	      ${CONDA_RUN} python install_executorch.py > /dev/null
	      echo "::endgroup::"

	      echo "::group::Install Voxtral requirements"
	      ${CONDA_RUN} pip install mistral_common librosa soundfile datasets
	      # Install optimum-executorch at the commit pinned for CI.
	      OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
	      ${CONDA_RUN} pip install "optimum-executorch @ git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}"
	      echo "::endgroup::"

	      ${CONDA_RUN} pip list

	      echo "::group::Export Voxtral"
	      ${CONDA_RUN} python -m executorch.backends.mlx.examples.voxtral.export_voxtral_hf \
	        --output-dir /tmp/voxtral_mlx \
	        --dtype bf16 \
	        --qlinear 4w
	      echo "::endgroup::"

	      echo "::group::Build Voxtral MLX runner"
	      ${CONDA_RUN} make voxtral-mlx
	      echo "::endgroup::"

	      echo "::group::Run Voxtral MLX runner"
	      # -f makes curl fail on an HTTP error instead of saving the error page
	      # as the tokenizer or audio file.
	      curl -fL https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main/tekken.json -o /tmp/tekken.json
	      curl -fL https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav -o /tmp/test_audio.wav
	      OUTPUT=$(./cmake-out/examples/models/voxtral/voxtral_runner \
	        --model_path /tmp/voxtral_mlx/model.pte \
	        --tokenizer_path /tmp/tekken.json \
	        --audio_path /tmp/test_audio.wav \
	        --processor_path /tmp/voxtral_mlx/preprocessor.pte \
	        --prompt "What is happening in this audio?" \
	        --temperature 0 2>&1)
	      echo "Runner output:"
	      echo "$OUTPUT"
	      # Case-insensitive check for the expected word in the runner output.
	      if echo "$OUTPUT" | grep -iq "poem"; then
	        echo "Success: 'poem' found in output"
	      else
	        echo "Failed: Expected 'poem' not found in output"
	        exit 1
	      fi
	      echo "::endgroup::"

	test-mlx-voxtral-realtime:
	  # Downloads Voxtral Realtime from Hugging Face, exports the streaming
	  # preprocessor and model for MLX, builds the C++ runner, and checks that the
	  # output for a sample clip contains the word "Phoebe".
	  uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
	  secrets: inherit
	  with:
	    job-name: test-mlx-voxtral-realtime
	    runner: macos-14-xlarge
	    python-version: "3.12"
	    submodules: recursive
	    # PR head commit on pull_request events, otherwise the pushed commit.
	    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
	    secrets-env: EXECUTORCH_HF_TOKEN
	    timeout: 90
	    script: |
	      set -eux

	      echo "::group::Install ExecuTorch"
	      ${CONDA_RUN} python install_executorch.py > /dev/null
	      echo "::endgroup::"

	      echo "::group::Install Voxtral Realtime requirements"
	      ${CONDA_RUN} pip install safetensors
	      echo "::endgroup::"

	      ${CONDA_RUN} pip list

	      echo "::group::Download model"
	      # First call downloads the snapshot; the second prints the path that
	      # snapshot_download returns so it can be captured into MODEL_PATH.
	      HF_TOKEN=$SECRET_EXECUTORCH_HF_TOKEN ${CONDA_RUN} python -c "from huggingface_hub import snapshot_download; snapshot_download('mistralai/Voxtral-Mini-4B-Realtime-2602')"
	      MODEL_PATH=$(HF_TOKEN=$SECRET_EXECUTORCH_HF_TOKEN ${CONDA_RUN} python -c "from huggingface_hub import snapshot_download; print(snapshot_download('mistralai/Voxtral-Mini-4B-Realtime-2602'))")
	      echo "Model path: ${MODEL_PATH}"
	      echo "::endgroup::"

	      echo "::group::Export preprocessor"
	      ${CONDA_RUN} python -m executorch.extension.audio.mel_spectrogram \
	        --feature_size 128 \
	        --streaming \
	        --backend mlx \
	        --output_file /tmp/voxtral_rt_mlx/preprocessor.pte
	      echo "::endgroup::"

	      echo "::group::Export Voxtral Realtime (streaming)"
	      ${CONDA_RUN} python -m executorch.examples.models.voxtral_realtime.export_voxtral_rt \
	        --model-path "${MODEL_PATH}" \
	        --backend mlx \
	        --streaming \
	        --output-dir /tmp/voxtral_rt_mlx \
	        --qlinear-encoder 4w \
	        --qlinear 4w \
	        --qembedding 8w \
	        --qembedding-group-size 128
	      echo "::endgroup::"

	      echo "::group::Build Voxtral Realtime MLX runner"
	      ${CONDA_RUN} make voxtral_realtime-mlx
	      echo "::endgroup::"

	      echo "::group::Run Voxtral Realtime MLX runner"
	      # -f makes curl fail on an HTTP error instead of saving the error page
	      # as the audio file.
	      curl -fL https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav -o /tmp/test_audio.wav
	      # The tokenizer is read from the downloaded model snapshot.
	      OUTPUT=$(./cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner \
	        --model_path /tmp/voxtral_rt_mlx/model.pte \
	        --tokenizer_path "${MODEL_PATH}/tekken.json" \
	        --preprocessor_path /tmp/voxtral_rt_mlx/preprocessor.pte \
	        --audio_path /tmp/test_audio.wav \
	        --streaming 2>&1)
	      echo "Runner output:"
	      echo "$OUTPUT"
	      # Case-insensitive check for the expected word in the runner output.
	      if echo "$OUTPUT" | grep -iq "Phoebe"; then
	        echo "Success: 'Phoebe' found in output"
	      else
	        echo "Failed: Expected 'Phoebe' not found in output"
	        exit 1
	      fi
	      echo "::endgroup::"

	test-mlx-whisper:
	  # Exports openai/whisper-tiny for MLX and checks that transcribing the
	  # sample audio yields text containing "Mr. Quilter".
	  uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
	  secrets: inherit
	  with:
	    job-name: test-mlx-whisper
	    runner: macos-14-xlarge
	    python-version: "3.12"
	    submodules: recursive
	    # PR head commit on pull_request events, otherwise the pushed commit.
	    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
	    secrets-env: EXECUTORCH_HF_TOKEN
	    timeout: 90
	    script: |
	      set -eux

	      echo "::group::Install ExecuTorch and configure MLX build"
	      ${CONDA_RUN} python install_executorch.py > /dev/null
	      echo "::endgroup::"

	      echo "::group::Install Whisper requirements"
	      ${CONDA_RUN} pip install -U "huggingface_hub[cli]<1.0"
	      # Quoted so an empty or unusual token is still passed as one argument.
	      ${CONDA_RUN} huggingface-cli login --token "$SECRET_EXECUTORCH_HF_TOKEN"
	      ${CONDA_RUN} pip install transformers soundfile datasets librosa
	      echo "::endgroup::"

	      ${CONDA_RUN} pip list

	      echo "::group::Export Whisper"
	      ${CONDA_RUN} python -m executorch.backends.mlx.examples.whisper.export_whisper \
	        --model-id "openai/whisper-tiny" \
	        --output-dir /tmp/whisper_mlx \
	        --dtype bf16 \
	        --qlinear 4w
	      echo "::endgroup::"

	      echo "::group::Run Whisper inference"
	      OUTPUT=$(${CONDA_RUN} python -m executorch.backends.mlx.examples.whisper.run_whisper \
	        --model-dir /tmp/whisper_mlx \
	        --use-sample-audio 2>&1)
	      echo "$OUTPUT"
	      # Case-insensitive check for the expected phrase in the transcription.
	      if echo "$OUTPUT" | grep -iq "Mr. Quilter"; then
	        echo "Success: 'Mr. Quilter' found in transcription"
	      else
	        echo "Failed: Expected 'Mr. Quilter' not found in transcription"
	        exit 1
	      fi
	      echo "::endgroup::"


	test-mlx-stories110m:
	  # Builds ExecuTorch and the Llama runner with MLX, exports stories110M with
	  # 4-bit weights, and runs a short generation with the C++ runner.
	  uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
	  with:
	    job-name: test-mlx-stories110m
	    runner: macos-14-xlarge
	    python-version: "3.12"
	    submodules: recursive
	    # PR head commit on pull_request events, otherwise the pushed commit.
	    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
	    timeout: 90
	    script: |
	      set -eux

	      echo "::group::Install ExecuTorch"
	      ${CONDA_RUN} python install_executorch.py > /dev/null
	      echo "::endgroup::"

	      echo "::group::Install Llama requirements"
	      ${CONDA_RUN} sh examples/models/llama/install_requirements.sh
	      echo "::endgroup::"

	      ${CONDA_RUN} pip list

	      echo "::group::Build ExecuTorch with MLX delegate"
	      ${CONDA_RUN} cmake --workflow --preset mlx-release
	      echo "::endgroup::"

	      echo "::group::Build Llama runner with MLX"
	      pushd examples/models/llama
	      ${CONDA_RUN} cmake --workflow --preset llama-release
	      popd
	      echo "::endgroup::"

	      echo "::group::Download stories110M artifacts"
	      # -f fails the step on an HTTP error; -S still prints the error although
	      # -s silences progress output.
	      curl -fLsS "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt" --output stories110M.pt
	      curl -fLsS "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model" --output tokenizer.model
	      echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json
	      echo "::endgroup::"

	      echo "::group::Create tokenizer.bin"
	      ${CONDA_RUN} python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin
	      echo "::endgroup::"

	      echo "::group::Export stories110M with MLX backend via export_llama_lib"
	      ${CONDA_RUN} python -m extension.llm.export.export_llm \
	        base.checkpoint=stories110M.pt \
	        base.params=params.json \
	        model.use_kv_cache=true \
	        model.dtype_override=fp32 \
	        backend.mlx.enabled=true \
	        quantization.qmode=4w \
	        quantization.group_size=32 \
	        export.output_name=/tmp/stories110m_mlx.pte
	      echo "::endgroup::"

	      echo "::group::Run inference with C++ llama runner"
	      # No output assertion here: the step passes if the runner exits cleanly.
	      ./cmake-out/examples/models/llama/llama_main \
	        --model_path=/tmp/stories110m_mlx.pte \
	        --tokenizer_path=tokenizer.bin \
	        --prompt="Once upon a time," \
	        --temperature=0 \
	        --seq_len=10
	      echo "::endgroup::"

	test-mlx-llm:
	  # Exports each Hugging Face LLM in the matrix for MLX (with and without the
	  # custom SDPA / KV cache, for each quantization config) and checks that the
	  # answer to a fixed prompt mentions "Paris".
	  strategy:
	    fail-fast: false
	    matrix:
	      model:
	        - id: "unsloth/Llama-3.2-1B-Instruct"
	          name: "llama-1b"
	        - id: "unsloth/Qwen3-0.6B"
	          name: "qwen3-0.6b"
	        - id: "unsloth/gemma-3-1b-it"
	          name: "gemma3-1b"
	      use-custom: [false, true]
	      qconfig: ["4w", "nvfp4"]
	  uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
	  secrets: inherit
	  with:
	    job-name: test-mlx-llm-${{ matrix.model.name }}${{ matrix.use-custom && '-custom' || '' }}-${{ matrix.qconfig }}
	    runner: macos-14-xlarge
	    python-version: "3.12"
	    submodules: recursive
	    # PR head commit on pull_request events, otherwise the pushed commit.
	    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
	    secrets-env: EXECUTORCH_HF_TOKEN
	    timeout: 90
	    script: |
	      set -eux

	      # Copy the matrix values into shell variables once.
	      MODEL_ID="${{ matrix.model.id }}"
	      MODEL_NAME="${{ matrix.model.name }}"
	      USE_CUSTOM="${{ matrix.use-custom }}"
	      QCONFIG="${{ matrix.qconfig }}"

	      CUSTOM_ARGS=""
	      if [ "${USE_CUSTOM}" = "true" ]; then
	        CUSTOM_ARGS="--use-custom-sdpa --use-custom-kv-cache"
	      fi

	      echo "::group::Install ExecuTorch and configure MLX build"
	      ${CONDA_RUN} python install_executorch.py > /dev/null
	      ${CONDA_RUN} cmake --preset mlx-release
	      echo "::endgroup::"

	      echo "::group::Install LLM requirements"
	      ${CONDA_RUN} pip install -U "huggingface_hub[cli]<1.0"
	      # Quoted so an empty or unusual token is still passed as one argument.
	      ${CONDA_RUN} huggingface-cli login --token "$SECRET_EXECUTORCH_HF_TOKEN"
	      # Install optimum-executorch at the commit pinned for CI.
	      OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
	      ${CONDA_RUN} pip install transformers "optimum-executorch @ git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}"
	      echo "::endgroup::"

	      ${CONDA_RUN} pip list

	      echo "::group::Export ${MODEL_NAME}"
	      # CUSTOM_ARGS is intentionally unquoted: it must split into separate
	      # flags, or expand to nothing when empty.
	      ${CONDA_RUN} python -m executorch.backends.mlx.examples.llm.export_llm_hf \
	        --model-id "${MODEL_ID}" \
	        --output "/tmp/${MODEL_NAME}.pte" \
	        --qlinear "${QCONFIG}" \
	        --qembedding "${QCONFIG}" \
	        ${CUSTOM_ARGS}
	      echo "::endgroup::"

	      echo "::group::Run ${MODEL_NAME} inference"
	      OUTPUT=$(${CONDA_RUN} python -m executorch.backends.mlx.examples.llm.run_llm_hf \
	        --pte "/tmp/${MODEL_NAME}.pte" \
	        --model-id "${MODEL_ID}" \
	        --prompt "What is the capital of France?" \
	        --max-new-tokens 50 2>&1)
	      echo "$OUTPUT"
	      # Case-insensitive check for the expected answer in the model output.
	      if echo "$OUTPUT" | grep -iq "Paris"; then
	        echo "Success: 'Paris' found in output"
	      else
	        echo "Failed: Expected 'Paris' not found in output"
	        exit 1
	      fi
	      echo "::endgroup::"

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Fix Multiple constraints for allocation for two cat inputs of same un… #16

Workflow file

Fix Multiple constraints for allocation for two cat inputs of same un… #16

Uh oh!

Workflow file for this run