up

metascroy · metascroy · commit d8ee9d2b565e · 2026-03-02T16:53:13.000-08:00
diff --git a/.github/workflows/mlx.yml b/.github/workflows/mlx.yml
@@ -11,6 +11,7 @@ on:
       - backends/mlx/**
       - examples/models/parakeet/**
       - examples/models/voxtral/**
+      - examples/models/voxtral_realtime/**
   workflow_dispatch:
 
 concurrency:
@@ -220,6 +221,72 @@ jobs:
         fi
         echo "::endgroup::"
 
+  # End-to-end check of the Voxtral Realtime MLX path: export the streaming
+  # model with the MLX backend, build the runner, transcribe a sample clip and
+  # require the word "Phoebe" in the runner output.
+  test-mlx-voxtral-realtime:
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    secrets: inherit
+    with:
+      job-name: test-mlx-voxtral-realtime
+      runner: macos-14-xlarge
+      python-version: "3.12"
+      submodules: recursive
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      secrets-env: EXECUTORCH_HF_TOKEN
+      timeout: 90
+      script: |
+        set -eux
+
+        echo "::group::Install ExecuTorch"
+        ${CONDA_RUN} python install_executorch.py > /dev/null
+        echo "::endgroup::"
+
+        echo "::group::Install Voxtral Realtime requirements"
+        ${CONDA_RUN} pip install -U "huggingface_hub[cli]<1.0" safetensors
+        # Quote the token so it is always passed as a single argument.
+        ${CONDA_RUN} huggingface-cli login --token "${SECRET_EXECUTORCH_HF_TOKEN}"
+        echo "::endgroup::"
+
+        ${CONDA_RUN} pip list
+
+        echo "::group::Download model"
+        ${CONDA_RUN} huggingface-cli download mistralai/Voxtral-Mini-4B-Realtime-2602
+        # snapshot_download prints the local directory of the model fetched above;
+        # the export and the runner both read files from that directory.
+        MODEL_PATH=$(${CONDA_RUN} python -c "from huggingface_hub import snapshot_download; print(snapshot_download('mistralai/Voxtral-Mini-4B-Realtime-2602'))")
+        echo "Model path: ${MODEL_PATH}"
+        echo "::endgroup::"
+
+        echo "::group::Export Voxtral Realtime (streaming)"
+        ${CONDA_RUN} python -m executorch.examples.models.voxtral_realtime.export_voxtral_rt \
+          --model-path "${MODEL_PATH}" \
+          --backend mlx \
+          --streaming \
+          --output-dir /tmp/voxtral_rt_mlx \
+          --qlinear-encoder 4w \
+          --qlinear 4w \
+          --qembedding 8w \
+          --qembedding-group-size 128 \
+          --export-preprocessor
+        echo "::endgroup::"
+
+        echo "::group::Build Voxtral Realtime MLX runner"
+        ${CONDA_RUN} make voxtral_realtime-mlx
+        echo "::endgroup::"
+
+        echo "::group::Run Voxtral Realtime MLX runner"
+        # --fail makes curl exit non-zero on an HTTP error instead of saving the
+        # error page as the test audio.
+        curl -fL https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav -o /tmp/test_audio.wav
+        # Record the exit status rather than letting `set -e` abort on the
+        # assignment, so the captured output is still printed if the runner fails.
+        RUNNER_STATUS=0
+        OUTPUT=$(./cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner \
+          --model_path /tmp/voxtral_rt_mlx/model.pte \
+          --tokenizer_path "${MODEL_PATH}/tekken.json" \
+          --preprocessor_path /tmp/voxtral_rt_mlx/preprocessor.pte \
+          --audio_path /tmp/test_audio.wav \
+          --streaming 2>&1) || RUNNER_STATUS=$?
+        echo "Runner output:"
+        echo "$OUTPUT"
+        if [ "${RUNNER_STATUS}" -ne 0 ]; then
+          echo "Failed: runner exited with status ${RUNNER_STATUS}"
+          exit 1
+        fi
+        # The sample clip is expected to transcribe to text containing "Phoebe".
+        if echo "$OUTPUT" | grep -iq "Phoebe"; then
+          echo "Success: 'Phoebe' found in output"
+        else
+          echo "Failed: Expected 'Phoebe' not found in output"
+          exit 1
+        fi
+        echo "::endgroup::"
+
   test-mlx-whisper:
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     secrets: inherit
diff --git a/Makefile b/Makefile
@@ -15,7 +15,7 @@
 # SUPPORTED MODELS:
 # -----------------
 # - voxtral:  Multimodal voice + text model (CPU, CUDA, Metal, MLX)
-# - voxtral_realtime: Realtime speech-to-text model (CPU)
+# - voxtral_realtime: Realtime speech-to-text model (CPU, Metal, MLX)
 # - whisper:  Speech recognition model (CPU, CUDA, Metal)
 # - parakeet: Speech recognition model (CPU, CUDA, Metal, MLX)
 # - sortformer: Speaker diarization model (CPU)
@@ -91,16 +91,17 @@
 #
 # ==============================================================================
 
-.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cpu voxtral_realtime-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help
+.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help
 
 help:
 	@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
 	@echo "  voxtral-cuda        - Build Voxtral runner with CUDA backend"
 	@echo "  voxtral-cpu         - Build Voxtral runner with CPU backend"
 	@echo "  voxtral-metal       - Build Voxtral runner with Metal backend (macOS only)"
-	@echo "  voxtral-mlx         - Build Voxtral runner with MLX backend (macOS only)"
+	@echo "  voxtral-mlx         - Build Voxtral runner with MLX backend"
 	@echo "  voxtral_realtime-cpu - Build Voxtral Realtime runner with CPU backend"
 	@echo "  voxtral_realtime-metal - Build Voxtral Realtime runner with Metal backend (macOS only)"
+	@echo "  voxtral_realtime-mlx - Build Voxtral Realtime runner with MLX backend"
 	@echo "  whisper-cuda        - Build Whisper runner with CUDA backend"
 	@echo "  whisper-cuda-debug  - Build Whisper runner with CUDA backend (debug mode)"
 	@echo "  whisper-cpu         - Build Whisper runner with CPU backend"
@@ -109,7 +110,7 @@ help:
 	@echo "  parakeet-cuda-debug - Build Parakeet runner with CUDA backend (debug mode)"
 	@echo "  parakeet-cpu        - Build Parakeet runner with CPU backend"
 	@echo "  parakeet-metal      - Build Parakeet runner with Metal backend (macOS only)"
-	@echo "  parakeet-mlx        - Build Parakeet runner with MLX backend (macOS only)"
+	@echo "  parakeet-mlx        - Build Parakeet runner with MLX backend"
 	@echo "  sortformer-cpu      - Build Sortformer runner with CPU backend"
 	@echo "  silero-vad-cpu      - Build Silero VAD runner with CPU backend"
 	@echo "  llama-cuda          - Build Llama runner with CUDA backend"
@@ -264,6 +265,15 @@ voxtral_realtime-metal:
 	@echo "✓ Build complete!"
 	@echo "  Binary: cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner"
 
+# Build the Voxtral Realtime runner for the MLX backend: run the mlx-release
+# workflow preset first, then the runner's own voxtral-realtime-mlx workflow
+# preset from examples/models/voxtral_realtime.
+voxtral_realtime-mlx:
+	@echo "==> Building and installing ExecuTorch with MLX..."
+	cmake --workflow --preset mlx-release
+	@echo "==> Building Voxtral Realtime runner with MLX..."
+	cd examples/models/voxtral_realtime && cmake --workflow --preset voxtral-realtime-mlx
+	@echo ""
+	@echo "✓ Build complete!"
+	@echo "  Binary: cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner"
+
 silero-vad-cpu:
 	@echo "==> Building and installing ExecuTorch..."
 	cmake --workflow --preset llm-release
diff --git a/backends/mlx/CMakeLists.txt b/backends/mlx/CMakeLists.txt
@@ -215,10 +215,7 @@ set(MLX_METAL_JIT
 # Auto-apply patches to MLX submodule. Each patch is applied idempotently: `git
 # apply --check` tests whether the patch is still applicable (i.e. not yet
 # applied), and only then applies it.
-set(_mlx_patches
-    "${CMAKE_CURRENT_SOURCE_DIR}/patches/mlx_json.patch"
-    "${CMAKE_CURRENT_SOURCE_DIR}/patches/mlx_metal_device_retain.patch"
-)
+set(_mlx_patches "${CMAKE_CURRENT_SOURCE_DIR}/patches/mlx_json.patch")
 foreach(_patch IN LISTS _mlx_patches)
   if(EXISTS "${_patch}" AND EXISTS "${MLX_SOURCE_DIR}")
     get_filename_component(_patch_name "${_patch}" NAME)
diff --git a/examples/models/voxtral_realtime/CMakeLists.txt b/examples/models/voxtral_realtime/CMakeLists.txt
@@ -33,7 +33,7 @@ list(APPEND link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas)
 executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
 
 # CPU-only builds need quantized and custom ops
-if(NOT EXECUTORCH_BUILD_CUDA)
+if(NOT EXECUTORCH_BUILD_CUDA AND NOT EXECUTORCH_BUILD_MLX)
   list(APPEND link_libraries quantized_ops_lib custom_ops)
   executorch_target_link_options_shared_lib(quantized_ops_lib)
   executorch_target_link_options_shared_lib(custom_ops)
@@ -87,6 +87,12 @@ if(EXECUTORCH_BUILD_METAL)
   executorch_target_link_options_shared_lib(metal_backend)
 endif()
 
+# Link the MLX delegate and the mlx library, but only when this build defines
+# the mlxdelegate target.
+if(TARGET mlxdelegate)
+  list(APPEND link_libraries mlxdelegate mlx)
+  executorch_target_link_options_shared_lib(mlxdelegate)
+endif()
+
 # Tokenizer
 list(APPEND link_libraries tokenizers::tokenizers)
 
@@ -106,6 +112,11 @@ target_compile_options(
   voxtral_realtime_runner PUBLIC ${_common_compile_options}
 )
 
+# Copy the MLX metallib for voxtral_realtime_runner to use at runtime; skipped
+# when this build does not define the mlxdelegate target.
+if(TARGET mlxdelegate)
+  executorch_target_copy_mlx_metallib(voxtral_realtime_runner)
+endif()
+
 # On Windows, copy required DLLs to the executable directory
 if(MSVC AND EXECUTORCH_BUILD_CUDA)
   add_custom_command(
diff --git a/examples/models/voxtral_realtime/CMakePresets.json b/examples/models/voxtral_realtime/CMakePresets.json
@@ -28,6 +28,19 @@
                 "type": "equals",
                 "rhs": "Darwin"
             }
+        },
+        {
+            "name": "voxtral-realtime-mlx",
+            "displayName": "Voxtral Realtime runner (MLX)",
+            "inherits": ["voxtral-realtime-base"],
+            "cacheVariables": {
+                "EXECUTORCH_BUILD_MLX": "ON"
+            },
+            "condition": {
+                "lhs": "${hostSystemName}",
+                "type": "equals",
+                "rhs": "Darwin"
+            }
         }
     ],
     "buildPresets": [
@@ -43,6 +56,13 @@
             "configurePreset": "voxtral-realtime-metal",
             "configuration": "Release",
             "targets": ["voxtral_realtime_runner"]
+        },
+        {
+            "name": "voxtral-realtime-mlx",
+            "displayName": "Build Voxtral Realtime runner (MLX)",
+            "configurePreset": "voxtral-realtime-mlx",
+            "configuration": "Release",
+            "targets": ["voxtral_realtime_runner"]
         }
     ],
     "workflowPresets": [
@@ -73,6 +93,20 @@
                     "name": "voxtral-realtime-metal"
                 }
             ]
+        },
+        {
+            "name": "voxtral-realtime-mlx",
+            "displayName": "Configure and build Voxtral Realtime runner (MLX)",
+            "steps": [
+                {
+                    "type": "configure",
+                    "name": "voxtral-realtime-mlx"
+                },
+                {
+                    "type": "build",
+                    "name": "voxtral-realtime-mlx"
+                }
+            ]
         }
     ]
 }
diff --git a/examples/models/voxtral_realtime/README.md b/examples/models/voxtral_realtime/README.md
@@ -88,8 +88,9 @@ python export_voxtral_rt.py \
 |---------|---------|-----------|--------------|
 | `xnnpack` | ✓ | ✓ | `4w`, `8w`, `8da4w`, `8da8w` |
 | `metal` | ✓ | ✓ | none (fp32) or `fpa4w` (Metal-specific 4-bit) |
+| `mlx` | ✓ | ✓ | `4w`, `8w` |
 
-Metal backend provides Apple GPU acceleration.
+Metal and MLX backends provide Apple GPU acceleration.
 
 #### Metal export examples
 
@@ -128,12 +129,48 @@ Alternatively, you can build torchao with Metal support while installing ExecuTo
 EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 ./install_executorch.sh
 ```
 
+#### MLX export examples
+
+The MLX backend uses the MLX delegate for Apple Silicon GPU acceleration.
+
+Offline:
+
+```bash
+python export_voxtral_rt.py \
+    --model-path ~/models/Voxtral-Mini-4B-Realtime-2602 \
+    --backend mlx \
+    --output-dir ./voxtral_rt_exports \
+    --qlinear-encoder 4w \
+    --qlinear 4w \
+    --qembedding 8w \
+    --qembedding-group-size 128 \
+    --export-preprocessor
+```
+
+Streaming:
+
+```bash
+python export_voxtral_rt.py \
+    --model-path ~/models/Voxtral-Mini-4B-Realtime-2602 \
+    --backend mlx \
+    --streaming \
+    --output-dir ./voxtral_rt_exports \
+    --qlinear-encoder 4w \
+    --qlinear 4w \
+    --qembedding 8w \
+    --qembedding-group-size 128 \
+    --export-preprocessor
+```
+
+`--export-preprocessor` also exports the mel preprocessor as `preprocessor.pte`
+in the output directory using the MLX partitioner, so no separate preprocessor
+export step is needed.
+
 ### Options
 
 | Flag | Default | Description |
 |------|---------|-------------|
 | `--model-path` | (required) | Directory with `params.json` + `consolidated.safetensors` |
-| `--backend` | `xnnpack` | `xnnpack`, `metal`, or `portable` |
+| `--backend` | `xnnpack` | `xnnpack`, `metal`, `mlx`, or `portable` |
 | `--output-dir` | `./voxtral_rt_exports` | Output directory |
 | `--max-seq-len` | `4096` | KV cache length |
 | `--delay-tokens` | `6` | Transcription delay in tokens (6 = 480ms) |
@@ -142,6 +179,8 @@ EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 ./install_ex
 | `--qlinear-encoder` | (none) | Encoder linear layer quantization (`4w`, `8w`, `8da4w`, `8da8w`, `fpa4w`) |
 | `--qlinear-encoder-group-size` | `32` | Group size for encoder linear quantization |
 | `--qembedding` | (none) | Embedding layer quantization (`8w`) |
+| `--qembedding-group-size` | `0` | Group size for embedding quantization (0 = per-channel) |
+| `--export-preprocessor` | off | Export `preprocessor.pte` alongside the model |
 | `--streaming` | off | Export streaming encoder with KV cache |
 | `--max-enc-len` | `750` | Encoder sliding window size (streaming only) |
 
@@ -173,6 +212,15 @@ make voxtral_realtime-metal
 This builds ExecuTorch with Metal backend support. The runner binary is at
 the same path as above. Metal exports can only run on macOS with Apple Silicon.
 
+### MLX (Apple GPU)
+
+```bash
+make voxtral_realtime-mlx
+```
+
+This builds ExecuTorch with MLX backend support. The runner binary is at
+the same path as above. MLX provides GPU acceleration on Apple Silicon via
+the MLX delegate.
+
 ## Run
 
 The runner requires:
diff --git a/examples/models/voxtral_realtime/export_voxtral_rt.py b/examples/models/voxtral_realtime/export_voxtral_rt.py
diff --git a/examples/models/voxtral_realtime/model.py b/examples/models/voxtral_realtime/model.py