pytorch
diff --git a/Makefile b/Makefile
Lines changed: 13 additions & 1 deletion
diff --git a/examples/models/qwen3_5_moe/CMakeLists.txt b/examples/models/qwen3_5_moe/CMakeLists.txt
Lines changed: 21 additions & 1 deletion
diff --git a/examples/models/qwen3_5_moe/CMakePresets.json b/examples/models/qwen3_5_moe/CMakePresets.json
Lines changed: 25 additions & 0 deletions
diff --git a/examples/models/qwen3_5_moe/README.md b/examples/models/qwen3_5_moe/README.md
Lines changed: 84 additions & 0 deletions
@@ -91,7 +91,7 @@
 #
 # ==============================================================================
 
-.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda gemma4_31b-mlx qwen3_5_moe-cuda qwen3_5_moe-metal clean help
+.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda gemma4_31b-mlx qwen3_5_moe-cuda qwen3_5_moe-cuda-serve qwen3_5_moe-metal clean help
 
 help:
 	@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -130,6 +130,7 @@ help:
 	@echo "  gemma4_31b-cuda     - Build Gemma 4 31B runner with CUDA backend"
 	@echo "  gemma4_31b-mlx      - Build Gemma 4 31B runner with MLX backend"
 	@echo "  qwen3_5_moe-cuda    - Build Qwen3.5 MoE runner with CUDA backend"
+	@echo "  qwen3_5_moe-cuda-serve - Build Qwen3.5 MoE runner + OpenAI serving worker (CUDA)"
 	@echo "  qwen3_5_moe-metal   - Build Qwen3.5 MoE runner with Metal backend"
 	@echo "  clean               - Clean build artifacts"
 
@@ -455,6 +456,17 @@ gemma4_31b-mlx:
 	@echo "✓ Build complete!"
 	@echo "  Binary: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
 
+qwen3_5_moe-cuda-serve:
+	@echo "==> Building and installing ExecuTorch with CUDA..."
+	cmake --workflow --preset llm-release-cuda
+	@echo "==> Building Qwen3.5 MoE runner + serving worker with CUDA..."
+	cd examples/models/qwen3_5_moe && cmake --workflow --preset qwen3-5-moe-cuda-serve
+	@echo ""
+	@echo "✓ Build complete!"
+	@echo "  Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
+	@echo "  Serving worker: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_worker"
+	@echo "  Launch: see examples/models/qwen3_5_moe/README.md (Serving)"
+
 qwen3_5_moe-metal:
 	@echo "==> Building and installing ExecuTorch with Metal..."
 	cmake --workflow --preset llm-release-metal
 
@@ -15,6 +15,11 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 
 set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+# Include path of the single-include nlohmann/json already vendored under
+# extension/llm/tokenizers; the worker's JSONL protocol uses it (no new dep).
+set(_json_include
+    ${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/json/single_include
+)
 
 # gflags
 set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
@@ -60,7 +65,7 @@ endif()
 # Tokenizer
 list(APPEND link_libraries tokenizers::tokenizers)
 
-add_executable(qwen3_5_moe_runner main.cpp)
+add_executable(qwen3_5_moe_runner main.cpp qwen35_moe_engine.cpp)
 target_include_directories(
   qwen3_5_moe_runner PUBLIC ${_common_include_directories}
 )
@@ -70,3 +75,18 @@ if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
   target_link_options_gc_sections(qwen3_5_moe_runner)
   target_link_options(qwen3_5_moe_runner PRIVATE "LINKER:-s")
 endif()
+
+# qwen3_5_moe_worker: process-isolated serving worker. It constructs
+# Qwen35MoEEngine directly and speaks the JSONL worker protocol that the Python
+# control plane drives via WorkerClient (no pybind, no Python model code). Used
+# by the qwen3_5_moe-cuda-serve flow; non-Debug builds are gc'd and stripped.
+add_executable(qwen3_5_moe_worker qwen35_moe_worker.cpp qwen35_moe_engine.cpp)
+target_include_directories(
+  qwen3_5_moe_worker PUBLIC ${_common_include_directories} ${_json_include}
+)
+target_link_libraries(qwen3_5_moe_worker PUBLIC ${link_libraries})
+
+if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
+  target_link_options_gc_sections(qwen3_5_moe_worker)
+  target_link_options(qwen3_5_moe_worker PRIVATE "LINKER:-s")
+endif()
@@ -24,6 +24,11 @@
                 "list": ["Linux", "Windows"]
             }
         },
+        {
+            "name": "qwen3-5-moe-cuda-serve",
+            "displayName": "Qwen3.5 MoE runner + serving worker (CUDA)",
+            "inherits": ["qwen3-5-moe-cuda"]
+        },
         {
             "name": "qwen3-5-moe-metal",
             "displayName": "Qwen3.5 MoE runner (Metal)",
@@ -45,6 +50,12 @@
             "configurePreset": "qwen3-5-moe-cuda",
             "targets": ["qwen3_5_moe_runner"]
         },
+        {
+            "name": "qwen3-5-moe-cuda-serve",
+            "displayName": "Build Qwen3.5 MoE runner + serving worker (CUDA)",
+            "configurePreset": "qwen3-5-moe-cuda-serve",
+            "targets": ["qwen3_5_moe_runner", "qwen3_5_moe_worker"]
+        },
         {
             "name": "qwen3-5-moe-metal",
             "displayName": "Build Qwen3.5 MoE runner (Metal)",
@@ -67,6 +78,20 @@
                 }
             ]
         },
+        {
+            "name": "qwen3-5-moe-cuda-serve",
+            "displayName": "Configure and build Qwen3.5 MoE runner + serving worker (CUDA)",
+            "steps": [
+                {
+                    "type": "configure",
+                    "name": "qwen3-5-moe-cuda-serve"
+                },
+                {
+                    "type": "build",
+                    "name": "qwen3-5-moe-cuda-serve"
+                }
+            ]
+        },
         {
             "name": "qwen3-5-moe-metal",
             "displayName": "Configure and build Qwen3.5 MoE runner (Metal)",
 
@@ -133,11 +133,95 @@ cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner \
 | `--data_path` | (none) | Path to `.ptd` delegate data file (required for CUDA) |
 | `--tokenizer_path` | (required) | Path to HuggingFace `tokenizer.json` |
 | `--prompt` | `"Hello"` | Input prompt text |
+| `--prompt_file` | (none) | Path to a file with the prompt (overrides `--prompt`) |
 | `--temperature` | `0.8` | Sampling temperature (0 = greedy) |
 | `--max_new_tokens` | `128` | Maximum tokens to generate |
+| `--cuda_graph` | off | Capture/replay the decode method as a CUDA graph (CUDA only). See the `--cuda_graph` caveat under Troubleshooting. |
+| `--warmup` | `0` | Warmup iterations to discard before timing (one model load; the session is reset between iterations) |
+| `--num_iters` | `1` | Timed iterations to average, after warmup |
+
+## Serving (OpenAI-compatible)
+
+Run an OpenAI-compatible HTTP server so an agent harness (pi, opencode, …) can
+use the model for local tool use. Point your client at `http://<host>:<port>/v1`.
+
+Build the runner **and** the serving worker:
+
+```bash
+make qwen3_5_moe-cuda-serve
+```
+
+Launch the server (the `LD_LIBRARY_PATH` prefix is forwarded to the worker so the
+AOTI CUDA blob can load the conda `libstdc++`; see Troubleshooting):
+
+```bash
+LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH \
+  python -m executorch.examples.models.qwen3_5_moe.serve \
+    --model-path  qwen35_moe_exports/model.pte \
+    --data-path   qwen35_moe_exports/aoti_cuda_blob.ptd \
+    --tokenizer-path ~/models/Qwen3.5-35B-A3B/tokenizer.json \
+    --hf-tokenizer   ~/models/Qwen3.5-35B-A3B \
+    --model-id qwen3.5-moe --no-think
+```
+
+### Architecture (process isolation)
+
+Two processes, one model load:
+
+```
+serve.py            (control plane: FastAPI/asyncio, OpenAI protocol, chat
+                     templating, tool parsing, validation — NO CUDA, NO pybind)
+   │  JSONL over stdin/stdout
+   ▼
+qwen3_5_moe_worker  (C++ binary: one Qwen35MoEEngine + one session, synchronous
+                     loop — the CUDA model; NO asyncio server)
+```
+
+The model runs in a **separate worker process** because executing the AOTI CUDA
+model inside a live asyncio server process segfaults in the int4 matmul. The
+crash is reproducible and was narrowed down, by elimination, to the interaction
+between the asyncio event loop and CUDA. The worker instead runs the model the
+way the CLI does — in a plain synchronous loop — which is reliable. The control
+plane only performs blocking pipe I/O (no CUDA), which is safe under asyncio.
+
+### Serve Options
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--model-path` | (required) | Path to exported `.pte` model |
+| `--data-path` | (none) | Path to `.ptd` delegate data file (required for CUDA) |
+| `--tokenizer-path` | (required) | Path to HuggingFace `tokenizer.json` |
+| `--hf-tokenizer` | (required) | HF tokenizer id/dir for the chat template + encoding |
+| `--model-id` | `qwen3.5-moe` | Model id reported on `/v1/models` |
+| `--host` / `--port` | `127.0.0.1` / `8000` | Bind address |
+| `--max-context` | (none) | Reject prompts that exceed it with 400 |
+| `--no-think` | off | Default reasoning off (`enable_thinking=False`) |
+
+### V1 limitations
+
+- **Single-slot** (`serving_capacity=1`): one worker, one session, one model
+  load. `--num-runners > 1` is rejected; concurrent requests queue on the worker.
+- **No prefix cache**: the recurrent/conv state cannot be rewound by position
+  (`seek()` is NotSupported), so turn-to-turn KV reuse is off.
+- Supports the chat-completions contract of the generic server, but only
+  `temperature` is plumbed through: `top_p != 1`, `seed`, `top_k`, `logprobs`,
+  etc. are rejected.
 
 ## Troubleshooting
 
+- **Runner exits silently right after `Loading methods...`**: the AOTI CUDA blob
+  is compiled with the conda toolchain's `libstdc++`, which is newer than the
+  system one (it needs e.g. `GLIBCXX_3.4.34`). Prepend the conda lib dir so the
+  runner loads the matching `libstdc++`:
+
+  ```bash
+  LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH \
+    cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner ...
+  ```
+- **`aoti_torch_cuda_sort_stable ... API call failed` when re-running prefill
+  with `--cuda_graph`**: capturing the decode CUDA graph and then running another
+  prefill in the same process currently fails (allocator interaction). Use
+  `--cuda_graph` for single prefill+decode runs; omit it when looping with
+  `--warmup`/`--num_iters`.
+
 - **OOM during export**: The model requires significant GPU memory even
   with int4 quantization. Try reducing `--max-seq-len` or using a GPU
   with more VRAM.