Skip to content

Commit d8ee9d2

Browse files
committed
up
1 parent d4ba78f commit d8ee9d2

8 files changed

Lines changed: 522 additions & 44 deletions

File tree

.github/workflows/mlx.yml

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ on:
1111
- backends/mlx/**
1212
- examples/models/parakeet/**
1313
- examples/models/voxtral/**
14+
- examples/models/voxtral_realtime/**
1415
workflow_dispatch:
1516

1617
concurrency:
@@ -220,6 +221,72 @@ jobs:
220221
fi
221222
echo "::endgroup::"
222223
224+
test-mlx-voxtral-realtime:
225+
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
226+
secrets: inherit
227+
with:
228+
job-name: test-mlx-voxtral-realtime
229+
runner: macos-14-xlarge
230+
python-version: "3.12"
231+
submodules: recursive
232+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
233+
secrets-env: EXECUTORCH_HF_TOKEN
234+
timeout: 90
235+
script: |
236+
set -eux
237+
238+
echo "::group::Install ExecuTorch"
239+
${CONDA_RUN} python install_executorch.py > /dev/null
240+
echo "::endgroup::"
241+
242+
echo "::group::Install Voxtral Realtime requirements"
243+
${CONDA_RUN} pip install -U "huggingface_hub[cli]<1.0" safetensors
244+
# Quote the token so an empty or whitespace-containing secret cannot be word-split or drop the --token argument.
${CONDA_RUN} huggingface-cli login --token "${SECRET_EXECUTORCH_HF_TOKEN}"
245+
echo "::endgroup::"
246+
247+
${CONDA_RUN} pip list
248+
249+
echo "::group::Download model"
250+
${CONDA_RUN} huggingface-cli download mistralai/Voxtral-Mini-4B-Realtime-2602
251+
MODEL_PATH=$(${CONDA_RUN} python -c "from huggingface_hub import snapshot_download; print(snapshot_download('mistralai/Voxtral-Mini-4B-Realtime-2602'))")
252+
echo "Model path: ${MODEL_PATH}"
253+
echo "::endgroup::"
254+
255+
echo "::group::Export Voxtral Realtime (streaming)"
256+
${CONDA_RUN} python -m executorch.examples.models.voxtral_realtime.export_voxtral_rt \
257+
--model-path "${MODEL_PATH}" \
258+
--backend mlx \
259+
--streaming \
260+
--output-dir /tmp/voxtral_rt_mlx \
261+
--qlinear-encoder 4w \
262+
--qlinear 4w \
263+
--qembedding 8w \
264+
--qembedding-group-size 128 \
265+
--export-preprocessor
266+
echo "::endgroup::"
267+
268+
echo "::group::Build Voxtral Realtime MLX runner"
269+
${CONDA_RUN} make voxtral_realtime-mlx
270+
echo "::endgroup::"
271+
272+
echo "::group::Run Voxtral Realtime MLX runner"
273+
# --fail makes HTTP errors exit non-zero instead of saving the error page as the .wav;
# --retry rides out transient network failures on the CI runner.
curl -fL --retry 3 https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav -o /tmp/test_audio.wav
274+
OUTPUT=$(./cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner \
275+
--model_path /tmp/voxtral_rt_mlx/model.pte \
276+
--tokenizer_path "${MODEL_PATH}/tekken.json" \
277+
--preprocessor_path /tmp/voxtral_rt_mlx/preprocessor.pte \
278+
--audio_path /tmp/test_audio.wav \
279+
--streaming 2>&1) || { status=$?; echo "Runner output:"; echo "$OUTPUT"; echo "Failed: runner exited with status ${status}"; exit "${status}"; }
280+
echo "Runner output:"
281+
echo "$OUTPUT"
282+
if echo "$OUTPUT" | grep -iq "Phoebe"; then
283+
echo "Success: 'Phoebe' found in output"
284+
else
285+
echo "Failed: Expected 'Phoebe' not found in output"
286+
exit 1
287+
fi
288+
echo "::endgroup::"
289+
223290
test-mlx-whisper:
224291
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
225292
secrets: inherit

Makefile

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
# SUPPORTED MODELS:
1616
# -----------------
1717
# - voxtral: Multimodal voice + text model (CPU, CUDA, Metal, MLX)
18-
# - voxtral_realtime: Realtime speech-to-text model (CPU)
18+
# - voxtral_realtime: Realtime speech-to-text model (CPU, Metal, MLX)
1919
# - whisper: Speech recognition model (CPU, CUDA, Metal)
2020
# - parakeet: Speech recognition model (CPU, CUDA, Metal, MLX)
2121
# - sortformer: Speaker diarization model (CPU)
@@ -91,16 +91,17 @@
9191
#
9292
# ==============================================================================
9393

94-
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cpu voxtral_realtime-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help
94+
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help
9595

9696
help:
9797
@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
9898
@echo " voxtral-cuda - Build Voxtral runner with CUDA backend"
9999
@echo " voxtral-cpu - Build Voxtral runner with CPU backend"
100100
@echo " voxtral-metal - Build Voxtral runner with Metal backend (macOS only)"
101-
@echo " voxtral-mlx - Build Voxtral runner with MLX backend (macOS only)"
101+
@echo " voxtral-mlx - Build Voxtral runner with MLX backend"
102102
@echo " voxtral_realtime-cpu - Build Voxtral Realtime runner with CPU backend"
103103
@echo " voxtral_realtime-metal - Build Voxtral Realtime runner with Metal backend (macOS only)"
104+
@echo " voxtral_realtime-mlx - Build Voxtral Realtime runner with MLX backend (macOS only)"
104105
@echo " whisper-cuda - Build Whisper runner with CUDA backend"
105106
@echo " whisper-cuda-debug - Build Whisper runner with CUDA backend (debug mode)"
106107
@echo " whisper-cpu - Build Whisper runner with CPU backend"
@@ -109,7 +110,7 @@ help:
109110
@echo " parakeet-cuda-debug - Build Parakeet runner with CUDA backend (debug mode)"
110111
@echo " parakeet-cpu - Build Parakeet runner with CPU backend"
111112
@echo " parakeet-metal - Build Parakeet runner with Metal backend (macOS only)"
112-
@echo " parakeet-mlx - Build Parakeet runner with MLX backend (macOS only)"
113+
@echo " parakeet-mlx - Build Parakeet runner with MLX backend"
113114
@echo " sortformer-cpu - Build Sortformer runner with CPU backend"
114115
@echo " silero-vad-cpu - Build Silero VAD runner with CPU backend"
115116
@echo " llama-cuda - Build Llama runner with CUDA backend"
@@ -264,6 +265,15 @@ voxtral_realtime-metal:
264265
@echo "✓ Build complete!"
265266
@echo " Binary: cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner"
266267

268+
voxtral_realtime-mlx:
269+
@echo "==> Building and installing ExecuTorch with MLX..."
270+
cmake --workflow --preset mlx-release
271+
@echo "==> Building Voxtral Realtime runner with MLX..."
272+
cd examples/models/voxtral_realtime && cmake --workflow --preset voxtral-realtime-mlx
273+
@echo ""
274+
@echo "✓ Build complete!"
275+
@echo " Binary: cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner"
276+
267277
silero-vad-cpu:
268278
@echo "==> Building and installing ExecuTorch..."
269279
cmake --workflow --preset llm-release

backends/mlx/CMakeLists.txt

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -215,10 +215,7 @@ set(MLX_METAL_JIT
215215
# Auto-apply patches to MLX submodule. Each patch is applied idempotently: `git
216216
# apply --check` tests whether the patch is still applicable (i.e. not yet
217217
# applied), and only then applies it.
218-
set(_mlx_patches
219-
"${CMAKE_CURRENT_SOURCE_DIR}/patches/mlx_json.patch"
220-
"${CMAKE_CURRENT_SOURCE_DIR}/patches/mlx_metal_device_retain.patch"
221-
)
218+
set(_mlx_patches "${CMAKE_CURRENT_SOURCE_DIR}/patches/mlx_json.patch")
222219
foreach(_patch IN LISTS _mlx_patches)
223220
if(EXISTS "${_patch}" AND EXISTS "${MLX_SOURCE_DIR}")
224221
get_filename_component(_patch_name "${_patch}" NAME)

examples/models/voxtral_realtime/CMakeLists.txt

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ list(APPEND link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas)
3333
executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
3434

3535
# CPU-only builds need quantized and custom ops
36-
if(NOT EXECUTORCH_BUILD_CUDA)
36+
if(NOT EXECUTORCH_BUILD_CUDA AND NOT EXECUTORCH_BUILD_MLX)
3737
list(APPEND link_libraries quantized_ops_lib custom_ops)
3838
executorch_target_link_options_shared_lib(quantized_ops_lib)
3939
executorch_target_link_options_shared_lib(custom_ops)
@@ -87,6 +87,12 @@ if(EXECUTORCH_BUILD_METAL)
8787
executorch_target_link_options_shared_lib(metal_backend)
8888
endif()
8989

90+
# Link MLX delegate
91+
if(TARGET mlxdelegate)
92+
list(APPEND link_libraries mlxdelegate mlx)
93+
executorch_target_link_options_shared_lib(mlxdelegate)
94+
endif()
95+
9096
# Tokenizer
9197
list(APPEND link_libraries tokenizers::tokenizers)
9298

@@ -106,6 +112,11 @@ target_compile_options(
106112
voxtral_realtime_runner PUBLIC ${_common_compile_options}
107113
)
108114

115+
# Copy MLX metallib for runtime if MLX delegate is enabled
116+
if(TARGET mlxdelegate)
117+
executorch_target_copy_mlx_metallib(voxtral_realtime_runner)
118+
endif()
119+
109120
# On Windows, copy required DLLs to the executable directory
110121
if(MSVC AND EXECUTORCH_BUILD_CUDA)
111122
add_custom_command(

examples/models/voxtral_realtime/CMakePresets.json

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,19 @@
2828
"type": "equals",
2929
"rhs": "Darwin"
3030
}
31+
},
32+
{
33+
"name": "voxtral-realtime-mlx",
34+
"displayName": "Voxtral Realtime runner (MLX)",
35+
"inherits": ["voxtral-realtime-base"],
36+
"cacheVariables": {
37+
"EXECUTORCH_BUILD_MLX": "ON"
38+
},
39+
"condition": {
40+
"lhs": "${hostSystemName}",
41+
"type": "equals",
42+
"rhs": "Darwin"
43+
}
3144
}
3245
],
3346
"buildPresets": [
@@ -43,6 +56,13 @@
4356
"configurePreset": "voxtral-realtime-metal",
4457
"configuration": "Release",
4558
"targets": ["voxtral_realtime_runner"]
59+
},
60+
{
61+
"name": "voxtral-realtime-mlx",
62+
"displayName": "Build Voxtral Realtime runner (MLX)",
63+
"configurePreset": "voxtral-realtime-mlx",
64+
"configuration": "Release",
65+
"targets": ["voxtral_realtime_runner"]
4666
}
4767
],
4868
"workflowPresets": [
@@ -73,6 +93,20 @@
7393
"name": "voxtral-realtime-metal"
7494
}
7595
]
96+
},
97+
{
98+
"name": "voxtral-realtime-mlx",
99+
"displayName": "Configure and build Voxtral Realtime runner (MLX)",
100+
"steps": [
101+
{
102+
"type": "configure",
103+
"name": "voxtral-realtime-mlx"
104+
},
105+
{
106+
"type": "build",
107+
"name": "voxtral-realtime-mlx"
108+
}
109+
]
76110
}
77111
]
78112
}

examples/models/voxtral_realtime/README.md

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,8 +88,9 @@ python export_voxtral_rt.py \
8888
|---------|---------|-----------|--------------|
8989
| `xnnpack` | ✅ | ✅ | `4w`, `8w`, `8da4w`, `8da8w` |
9090
| `metal` | ✅ | ✅ | none (fp32) or `fpa4w` (Metal-specific 4-bit) |
91+
| `mlx` | ✅ | ✅ | `4w`, `8w` |
9192

92-
Metal backend provides Apple GPU acceleration.
93+
Metal and MLX backends provide Apple GPU acceleration.
9394

9495
#### Metal export examples
9596

@@ -128,12 +129,48 @@ Alternatively, you can build torchao with Metal support while installing ExecuTo
128129
EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 ./install_executorch.sh
129130
```
130131

132+
#### MLX export examples
133+
134+
The MLX backend uses the MLX delegate for Apple Silicon GPU acceleration.
135+
136+
Offline:
137+
138+
```bash
139+
python export_voxtral_rt.py \
140+
--model-path ~/models/Voxtral-Mini-4B-Realtime-2602 \
141+
--backend mlx \
142+
--output-dir ./voxtral_rt_exports \
143+
--qlinear-encoder 4w \
144+
--qlinear 4w \
145+
--qembedding 8w \
146+
--qembedding-group-size 128 \
147+
--export-preprocessor
148+
```
149+
150+
Streaming:
151+
152+
```bash
153+
python export_voxtral_rt.py \
154+
--model-path ~/models/Voxtral-Mini-4B-Realtime-2602 \
155+
--backend mlx \
156+
--streaming \
157+
--output-dir ./voxtral_rt_exports \
158+
--qlinear-encoder 4w \
159+
--qlinear 4w \
160+
--qembedding 8w \
161+
--qembedding-group-size 128 \
162+
--export-preprocessor
163+
```
164+
165+
`--export-preprocessor` bundles the mel preprocessor into the output directory
166+
using the MLX partitioner, so no separate preprocessor export step is needed.
167+
131168
### Options
132169

133170
| Flag | Default | Description |
134171
|------|---------|-------------|
135172
| `--model-path` | (required) | Directory with `params.json` + `consolidated.safetensors` |
136-
| `--backend` | `xnnpack` | `xnnpack`, `metal`, or `portable` |
173+
| `--backend` | `xnnpack` | `xnnpack`, `metal`, `mlx`, or `portable` |
137174
| `--output-dir` | `./voxtral_rt_exports` | Output directory |
138175
| `--max-seq-len` | `4096` | KV cache length |
139176
| `--delay-tokens` | `6` | Transcription delay in tokens (6 = 480ms) |
@@ -142,6 +179,8 @@ EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 ./install_ex
142179
| `--qlinear-encoder` | (none) | Encoder linear layer quantization (`4w`, `8w`, `8da4w`, `8da8w`, `fpa4w`) |
143180
| `--qlinear-encoder-group-size` | `32` | Group size for encoder linear quantization |
144181
| `--qembedding` | (none) | Embedding layer quantization (`8w`) |
182+
| `--qembedding-group-size` | `0` | Group size for embedding quantization (0 = per-channel) |
183+
| `--export-preprocessor` | off | Export `preprocessor.pte` alongside the model |
145184
| `--streaming` | off | Export streaming encoder with KV cache |
146185
| `--max-enc-len` | `750` | Encoder sliding window size (streaming only) |
147186

@@ -173,6 +212,15 @@ make voxtral_realtime-metal
173212
This builds ExecuTorch with Metal backend support. The runner binary is at
174213
the same path as above. Metal exports can only run on macOS with Apple Silicon.
175214

215+
### MLX (Apple GPU)
216+
217+
```bash
218+
make voxtral_realtime-mlx
219+
```
220+
221+
This builds ExecuTorch with MLX backend support. MLX provides GPU acceleration
222+
on Apple Silicon via the MLX delegate. The runner binary is at the same path
as above.
223+
176224
## Run
177225

178226
The runner requires:

0 commit comments

Comments
 (0)