[MLX] Add MLX support to the Qwen3.5 C++ runner (pytorch#20364)

metascroy · web-flow · commit ce9a7f53fffe · 2026-06-22T10:34:11.000-07:00
Adds MLX support to the C++ Qwen3.5 runner. See updated README.md for
instructions.
diff --git a/.github/workflows/mlx.yml b/.github/workflows/mlx.yml
@@ -161,6 +161,29 @@ jobs:
         fi
         echo "::endgroup::"
 
+        echo "::group::Verify chunked == unchunked prefill"
+        QWEN_TINY_PTE=/tmp/qwen35_moe_mlx_tiny/model.pte \
+          ${CONDA_RUN} python -m pytest \
+          examples/models/qwen3_5_moe/test_chunked_prefill.py -v
+        echo "::endgroup::"
+
+        echo "::group::Build Qwen 3.5 MoE MLX C++ runner"
+        # Validates the MLX C++ runner build wiring (compile + link + metallib).
+        # The tiny model has no compatible tokenizer (vocab 256, random weights),
+        # so we don't run C++ inference here — only confirm it builds.
+        ${CONDA_RUN} make qwen3_5_moe-mlx
+        RUNNER=cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner
+        if [ ! -x "$RUNNER" ]; then
+          echo "Failed: runner not found at $RUNNER"
+          exit 1
+        fi
+        if [ ! -f "$(dirname "$RUNNER")/mlx.metallib" ]; then
+          echo "Failed: mlx.metallib not copied next to runner"
+          exit 1
+        fi
+        echo "Success: built $RUNNER"
+        echo "::endgroup::"
+
   backend-tester:
     needs: run-decision
     if: |
diff --git a/Makefile b/Makefile
@@ -91,7 +91,7 @@
 #
 # ==============================================================================
 
-.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda gemma4_31b-mlx qwen3_5_moe-cuda qwen3_5_moe-metal clean help
+.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda gemma4_31b-mlx qwen3_5_moe-cuda qwen3_5_moe-metal qwen3_5_moe-mlx clean help
 
 help:
 	@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -131,6 +131,7 @@ help:
 	@echo "  gemma4_31b-mlx      - Build Gemma 4 31B runner with MLX backend"
 	@echo "  qwen3_5_moe-cuda    - Build Qwen3.5 MoE runner with CUDA backend"
 	@echo "  qwen3_5_moe-metal   - Build Qwen3.5 MoE runner with Metal backend"
+	@echo "  qwen3_5_moe-mlx     - Build Qwen3.5 MoE runner with MLX backend"
 	@echo "  clean               - Clean build artifacts"
 
 voxtral-cuda:
@@ -467,6 +468,15 @@ qwen3_5_moe-metal:
 	@echo "✓ Build complete!"
 	@echo "  Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
 
+qwen3_5_moe-mlx:
+	@echo "==> Building and installing ExecuTorch with MLX..."
+	cmake --workflow --preset mlx-release
+	@echo "==> Building Qwen3.5 MoE runner with MLX..."
+	cd examples/models/qwen3_5_moe && cmake --workflow --preset qwen3-5-moe-mlx
+	@echo ""
+	@echo "✓ Build complete!"
+	@echo "  Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
+
 clean:
 	rm -rf cmake-out \
 	       extension/llm/tokenizers/build \
diff --git a/examples/models/qwen3_5_moe/CMakeLists.txt b/examples/models/qwen3_5_moe/CMakeLists.txt
@@ -54,9 +54,14 @@ elseif(EXECUTORCH_BUILD_CUDA)
   list(APPEND link_libraries aoti_cuda_backend)
   executorch_target_link_options_shared_lib(aoti_cuda_backend)
   add_compile_definitions(EXECUTORCH_BUILD_CUDA)
+elseif(TARGET mlxdelegate)
+  list(APPEND link_libraries mlxdelegate mlx)
+  executorch_target_link_options_shared_lib(mlxdelegate)
+  add_compile_definitions(EXECUTORCH_BUILD_MLX)
 else()
   message(
-    FATAL_ERROR "Set EXECUTORCH_BUILD_CUDA=ON or EXECUTORCH_BUILD_METAL=ON"
+    FATAL_ERROR
+      "Set EXECUTORCH_BUILD_CUDA=ON or EXECUTORCH_BUILD_METAL=ON, or build/install ExecuTorch with the MLX backend (mlxdelegate target not found)"
   )
 endif()
 
@@ -82,6 +87,10 @@ if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
   target_link_options(qwen3_5_moe_worker PRIVATE "LINKER:-s")
 endif()
 
+if(TARGET mlxdelegate)
+  executorch_target_copy_mlx_metallib(qwen3_5_moe_runner)
+endif()
+
 if(EXECUTORCH_BUILD_CUDA)
   enable_testing()
   add_executable(
diff --git a/examples/models/qwen3_5_moe/CMakePresets.json b/examples/models/qwen3_5_moe/CMakePresets.json
@@ -36,6 +36,19 @@
                 "type": "equals",
                 "rhs": "Darwin"
             }
+        },
+        {
+            "name": "qwen3-5-moe-mlx",
+            "displayName": "Qwen3.5 MoE runner (MLX)",
+            "inherits": ["qwen3-5-moe-base"],
+            "cacheVariables": {
+                "EXECUTORCH_BUILD_MLX": "ON"
+            },
+            "condition": {
+                "type": "equals",
+                "lhs": "${hostSystemName}",
+                "rhs": "Darwin"
+            }
         }
     ],
     "buildPresets": [
@@ -54,6 +67,12 @@
             "displayName": "Build Qwen3.5 MoE runner and worker (Metal)",
             "configurePreset": "qwen3-5-moe-metal",
             "targets": ["qwen3_5_moe_runner", "qwen3_5_moe_worker"]
+        },
+        {
+            "name": "qwen3-5-moe-mlx",
+            "displayName": "Build Qwen3.5 MoE runner (MLX)",
+            "configurePreset": "qwen3-5-moe-mlx",
+            "targets": ["qwen3_5_moe_runner"]
         }
     ],
     "workflowPresets": [
@@ -84,6 +103,20 @@
                     "name": "qwen3-5-moe-metal"
                 }
             ]
+        },
+        {
+            "name": "qwen3-5-moe-mlx",
+            "displayName": "Configure and build Qwen3.5 MoE runner (MLX)",
+            "steps": [
+                {
+                    "type": "configure",
+                    "name": "qwen3-5-moe-mlx"
+                },
+                {
+                    "type": "build",
+                    "name": "qwen3-5-moe-mlx"
+                }
+            ]
         }
     ]
 }
diff --git a/examples/models/qwen3_5_moe/README.md b/examples/models/qwen3_5_moe/README.md
@@ -261,7 +261,38 @@ python export.py \
 | `--qembedding` | (none) | Embedding quantization: `8w` |
 | `--tiny-test` | off | Build tiny model with random weights for CI testing |
 
-### Run (MLX)
+### Build (MLX)
+
+Like the CUDA/Metal builds, the `make` target builds ExecuTorch core with the
+MLX backend and the runner binary. Requires macOS (Darwin) on Apple Silicon.
+
+```bash
+make qwen3_5_moe-mlx
+```
+
+This builds ExecuTorch with MLX support, then the runner binary at
+`cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner` (with `mlx.metallib`
+copied next to it). Unlike CUDA, the MLX `.pte` is self-contained — no `.ptd`
+data file is produced or needed.
+
+### Run (MLX, C++ runner)
+
+The C++ runner needs only the MLX `.pte` and a local HuggingFace
+`tokenizer.json`; no `--data_path` is required:
+
+```bash
+cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner \
+    --model_path ./qwen35_moe_mlx/model.pte \
+    --tokenizer_path ~/models/Qwen3.5-35B-A3B/tokenizer.json \
+    --prompt "What is the capital of France?" \
+    --max_new_tokens 50
+```
+
+The MLX export emits a single dynamic-seq `forward` method; the runner loads and
+calls it for both prefill and decode (sampling on host), matching the Python
+runner. See the [Run](#run) section above for the full flag list.
+
+### Run (MLX, Python)
 
 ```bash
 python -m executorch.examples.models.qwen3_5_moe.run \
diff --git a/examples/models/qwen3_5_moe/export.py b/examples/models/qwen3_5_moe/export.py
@@ -768,10 +768,16 @@ def _export_mlx(model, config, args):
     gc.collect()
 
     print("Lowering to ExecuTorch with MLX backend...")
+    # Largest prefill chunk the runner may submit in one forward call. The MLX
+    # runner chunks long prompts to cap peak memory; bound it by the compiled
+    # dynamic max (max_seq_len - 1) so a chunk can never exceed what `forward`
+    # was compiled for.
+    max_prefill_chunk = min(1024, config.max_seq_len - 1)
     metadata = {
         "get_max_seq_len": config.max_seq_len,
         "get_vocab_size": config.vocab_size,
         "get_n_layers": config.num_hidden_layers,
+        "get_max_prefill_chunk": max_prefill_chunk,
         "use_kv_cache": True,
         "use_sdpa_with_kv_cache": False,
         "enable_dynamic_shape": True,
diff --git a/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp b/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp
@@ -19,6 +19,8 @@
 #include <cmath>
 #include <cstring>
 
+#include <algorithm>
+
 #ifdef EXECUTORCH_BUILD_CUDA
 #include <cuda_runtime.h>
 #include <executorch/backends/cuda/runtime/cuda_mutable_state.h>
@@ -39,6 +41,22 @@ using SizesType = executorch::aten::SizesType;
 
 namespace {
 
+#ifdef EXECUTORCH_BUILD_MLX
+// The MLX export emits a single dynamic-seq `forward` method that handles both
+// prefill (T>=2) and decode (T=1). Mirror gemma4_31b's MLX runner, which loads
+// and calls `forward` for both phases.
+constexpr const char* kPrefillMethod = "forward";
+constexpr const char* kDecodeMethod = "forward";
+// Constant method exported by the MLX .pte giving the largest prefill chunk the
+// `forward` method was compiled for. Read into the metadata map in create().
+// MLX-only: kept inside this #ifdef so other backends see no unused constant.
+constexpr const char* kMaxPrefillChunk = "get_max_prefill_chunk";
+#else
+// CUDA/Metal exports emit two separate methods.
+constexpr const char* kPrefillMethod = "prefill";
+constexpr const char* kDecodeMethod = "decode";
+#endif
+
 Result<uint64_t> read_sampled_token(
     const executorch::aten::Tensor& output,
     float temperature) {
@@ -98,8 +116,10 @@ Result<std::unique_ptr<Module>> build_qwen_module(
   }
 #endif
 
-  ET_CHECK_OK_OR_RETURN_ERROR(module->load_method("prefill"));
-  ET_CHECK_OK_OR_RETURN_ERROR(module->load_method("decode"));
+  ET_CHECK_OK_OR_RETURN_ERROR(module->load_method(kPrefillMethod));
+  if (std::strcmp(kDecodeMethod, kPrefillMethod) != 0) { // MLX: same method
+    ET_CHECK_OK_OR_RETURN_ERROR(module->load_method(kDecodeMethod));
+  }
   return module;
 }
 
@@ -240,34 +260,63 @@ class Qwen35MoESession : public LLMSession {
     }
 
     stop_.store(false, std::memory_order_relaxed);
-    std::vector<int64_t> token_data(tokens.begin(), tokens.end());
-    std::vector<int64_t> pos_data(T);
-    for (int64_t i = 0; i < T; ++i) {
-      pos_data[i] = pos_ + i;
+
+    // On MLX, run prefill in fixed-size chunks (caps peak memory and the
+    // compiled prefill shape). Other backends prefill the whole prompt in one
+    // pass. Only the final chunk's sampled token is kept; the recurrence/KV
+    // state from earlier chunks persists via pos_ advancement.
+#ifdef EXECUTORCH_BUILD_MLX
+    // Chunk size: default to the compiled max (kMaxSeqLen - 1), overridden by
+    // the exported get_max_prefill_chunk constant when present (mirrors
+    // gemma4_31b). Falls back to T (single pass) if no metadata is available at
+    // all.
+    int64_t chunk_size = T;
+    if (auto it = metadata_.find(kMaxSeqLen);
+        it != metadata_.end() && it->second > 1) {
+      chunk_size = it->second - 1;
     }
-    auto tokens_tensor = from_blob(
-        token_data.data(),
-        {1, static_cast<SizesType>(T)},
-        executorch::aten::ScalarType::Long);
-    auto pos_tensor = from_blob(
-        pos_data.data(),
-        {static_cast<SizesType>(T)},
-        executorch::aten::ScalarType::Long);
-
-    const char* method = (T >= 2) ? "prefill" : "decode";
-    std::vector<EValue> inputs;
-    inputs.push_back(tokens_tensor);
-    inputs.push_back(pos_tensor);
+    if (auto it = metadata_.find(kMaxPrefillChunk);
+        it != metadata_.end() && it->second > 0) {
+      chunk_size = it->second;
+    }
+#else
+    const int64_t chunk_size = T;
+#endif
+
+    uint64_t sampled_token = 0;
+    for (int64_t off = 0; off < T; off += chunk_size) {
+      const int64_t len = std::min(chunk_size, T - off);
+      std::vector<int64_t> token_data(
+          tokens.begin() + off, tokens.begin() + off + len);
+      std::vector<int64_t> pos_data(len);
+      for (int64_t i = 0; i < len; ++i) {
+        pos_data[i] = pos_ + i;
+      }
+      auto tokens_tensor = from_blob(
+          token_data.data(),
+          {1, static_cast<SizesType>(len)},
+          executorch::aten::ScalarType::Long);
+      auto pos_tensor = from_blob(
+          pos_data.data(),
+          {static_cast<SizesType>(len)},
+          executorch::aten::ScalarType::Long);
+
+      const char* method = (len >= 2) ? kPrefillMethod : kDecodeMethod;
+      std::vector<EValue> inputs;
+      inputs.push_back(tokens_tensor);
+      inputs.push_back(pos_tensor);
 #ifdef EXECUTORCH_BUILD_CUDA
-    set_temp(first_token_temp);
-    inputs.push_back(EValue(temp_tensor_));
+      set_temp(first_token_temp);
+      inputs.push_back(EValue(temp_tensor_));
 #endif
-    auto sampled =
-        run_locked(method, inputs, first_token_temp, /*sync_after=*/true);
-    ET_CHECK_OK_OR_RETURN_ERROR(sampled.error());
-    pending_ = sampled.get();
+      auto sampled =
+          run_locked(method, inputs, first_token_temp, /*sync_after=*/true);
+      ET_CHECK_OK_OR_RETURN_ERROR(sampled.error());
+      sampled_token = sampled.get();
+      pos_ += len;
+    }
+    pending_ = sampled_token;
     prev_decode_token_.reset();
-    pos_ += T;
     return Error::Ok;
   }
 
@@ -334,7 +383,7 @@ class Qwen35MoESession : public LLMSession {
     inputs.push_back(EValue(temp_tensor_));
 #endif
     auto sampled =
-        run_locked("decode", inputs, temperature_, /*sync_after=*/false);
+        run_locked(kDecodeMethod, inputs, temperature_, /*sync_after=*/false);
     ET_CHECK_OK_OR_RETURN_ERROR(sampled.error());
     pending_ = sampled.get();
     prev_decode_token_ = token;
@@ -457,6 +506,14 @@ Result<std::unique_ptr<Qwen35MoEEngine>> Qwen35MoEEngine::create(
     ET_LOG(Error, "Qwen35MoEEngine: failed to read metadata");
     return metadata_result.error();
   }
+#ifdef EXECUTORCH_BUILD_MLX
+  // Surface the compiled max prefill chunk (a constant method get_llm_metadata
+  // doesn't harvest) into the metadata map so the session can chunk long
+  // prompts within the shape `forward` was compiled for.
+  if (auto mpc = meta_module->get(kMaxPrefillChunk); mpc.ok()) {
+    metadata_result.get()[kMaxPrefillChunk] = mpc->toScalar().to<int64_t>();
+  }
+#endif
   auto eos_ids = get_eos_ids(tokenizer.get(), meta_module.get());
   // This export's metadata doesn't carry the chat-turn EOS (config.json has no
   // eos_token_id and the .pte exports no get_eos_ids method), so get_eos_ids()
diff --git a/examples/models/qwen3_5_moe/test_chunked_prefill.py b/examples/models/qwen3_5_moe/test_chunked_prefill.py