pytorch · mergennachin · Jun 25, 2026 · Jun 23, 2026 · metascroy · Jun 24, 2026
diff --git a/Makefile b/Makefile
@@ -127,8 +127,8 @@ help:
 	@echo "  llava-cpu           - Build Llava runner with CPU backend"
 	@echo "  gemma3-cuda         - Build Gemma3 runner with CUDA backend"
 	@echo "  gemma3-cpu          - Build Gemma3 runner with CPU backend"
-	@echo "  gemma4_31b-cuda     - Build Gemma 4 31B runner with CUDA backend"
-	@echo "  gemma4_31b-mlx      - Build Gemma 4 31B runner with MLX backend"
+	@echo "  gemma4_31b-cuda     - Build Gemma 4 31B runner and worker with CUDA backend"
+	@echo "  gemma4_31b-mlx      - Build Gemma 4 31B runner and worker with MLX backend"
 	@echo "  qwen3_5_moe-cuda    - Build Qwen3.5 MoE runner with CUDA backend"
 	@echo "  qwen3_5_moe-metal   - Build Qwen3.5 MoE runner with Metal backend"
 	@echo "  qwen3_5_moe-mlx     - Build Qwen3.5 MoE runner with MLX backend"
@@ -444,20 +444,23 @@ qwen3_5_moe-cuda:
 gemma4_31b-cuda:
 	@echo "==> Building and installing ExecuTorch with CUDA..."
 	cmake --workflow --preset llm-release-cuda
-	@echo "==> Building Gemma 4 31B runner with CUDA..."
+	@echo "==> Building Gemma 4 31B runner, worker, and no-bleed test with CUDA..."
 	cd examples/models/gemma4_31b && cmake --workflow --preset gemma4-31b-cuda
 	@echo ""
 	@echo "✓ Build complete!"
-	@echo "  Binary: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
+	@echo "  Runner: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
+	@echo "  Worker: cmake-out/examples/models/gemma4_31b/gemma4_31b_worker"
+	@echo "  Test:   cmake-out/examples/models/gemma4_31b/test_gemma4_31b_nobleed"
 
 gemma4_31b-mlx:
 	@echo "==> Building and installing ExecuTorch with MLX..."
 	cmake --workflow --preset mlx-release
-	@echo "==> Building Gemma 4 31B runner with MLX..."
+	@echo "==> Building Gemma 4 31B runner and worker with MLX..."
 	cd examples/models/gemma4_31b && cmake --workflow --preset gemma4-31b-mlx
 	@echo ""
 	@echo "✓ Build complete!"
-	@echo "  Binary: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
+	@echo "  Runner: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
+	@echo "  Worker: cmake-out/examples/models/gemma4_31b/gemma4_31b_worker"
 
 qwen3_5_moe-metal:
 	@echo "==> Building and installing ExecuTorch with Metal..."

diff --git a/examples/models/gemma4_31b/CMakeLists.txt b/examples/models/gemma4_31b/CMakeLists.txt
@@ -15,6 +15,9 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 
 set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+set(_json_include
+    ${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/json/single_include
+) # JSON single_include dir, added to each target's include directories below
 
 # gflags
 set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
@@ -51,26 +54,51 @@ if(EXECUTORCH_BUILD_CUDA)
 elseif(TARGET mlxdelegate)
   list(APPEND link_libraries mlxdelegate mlx)
   executorch_target_link_options_shared_lib(mlxdelegate)
+  add_compile_definitions(EXECUTORCH_BUILD_MLX)
 else()
   message(FATAL_ERROR "Set EXECUTORCH_BUILD_CUDA=ON or EXECUTORCH_BUILD_MLX=ON")
 endif()
 
 # Tokenizer (HuggingFace tokenizer.json)
 list(APPEND link_libraries tokenizers::tokenizers)
 
-add_executable(gemma4_31b_runner main.cpp)
+add_executable(gemma4_31b_runner main.cpp gemma4_31b_engine.cpp)
 target_include_directories(
-  gemma4_31b_runner PUBLIC ${_common_include_directories}
+  gemma4_31b_runner PUBLIC ${_common_include_directories} ${_json_include}
 )
 target_link_libraries(gemma4_31b_runner PUBLIC ${link_libraries})
 
+add_executable(gemma4_31b_worker gemma4_31b_worker.cpp gemma4_31b_engine.cpp)
+target_include_directories(
+  gemma4_31b_worker PUBLIC ${_common_include_directories} ${_json_include}
+) # same include directories as gemma4_31b_runner
+target_link_libraries(gemma4_31b_worker PUBLIC ${link_libraries}) # as runner
+
 if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
   target_link_options_gc_sections(gemma4_31b_runner)
   if(NOT APPLE AND NOT MSVC)
     target_link_options(gemma4_31b_runner PRIVATE "LINKER:-s")
   endif()
+  target_link_options_gc_sections(gemma4_31b_worker) # same as the runner
+  if(NOT APPLE AND NOT MSVC)
+    target_link_options(gemma4_31b_worker PRIVATE "LINKER:-s") # strip symbols
+  endif()
 endif()
 
 if(TARGET mlxdelegate)
   executorch_target_copy_mlx_metallib(gemma4_31b_runner)
+  executorch_target_copy_mlx_metallib(gemma4_31b_worker)
+endif()
+
+if(EXECUTORCH_BUILD_CUDA) # the no-bleed test is built for CUDA only
+  enable_testing() # needed for add_test() below to take effect
+  add_executable(
+    test_gemma4_31b_nobleed test_gemma4_31b_nobleed.cpp gemma4_31b_engine.cpp
+  ) # NOTE(review): README passes GEMMA_*_PATH env vars; add_test sets none
+  target_include_directories(
+    test_gemma4_31b_nobleed PUBLIC ${_common_include_directories}
+                                   ${_json_include}
+  )
+  target_link_libraries(test_gemma4_31b_nobleed PUBLIC ${link_libraries})
+  add_test(NAME gemma4_31b_nobleed COMMAND test_gemma4_31b_nobleed)
 endif()
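diff --git a/examples/models/gemma4_31b/CMakePresets.json b/examples/models/gemma4_31b/CMakePresets.json
@@ -13,7 +13,7 @@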
         },
         {
             "name": "gemma4-31b-cuda",
-            "displayName": "Gemma 4 31B runner (CUDA)",
+            "displayName": "Gemma 4 31B runner and worker (CUDA)",
             "inherits": ["gemma4-31b-base"],
             "cacheVariables": {
                 "EXECUTORCH_BUILD_CUDA": "ON"
@@ -26,7 +26,7 @@
         },
         {
             "name": "gemma4-31b-mlx",
-            "displayName": "Gemma 4 31B runner (MLX)",
+            "displayName": "Gemma 4 31B runner and worker (MLX)",
             "inherits": ["gemma4-31b-base"],
             "cacheVariables": {},
             "condition": {
@@ -39,21 +39,25 @@
     "buildPresets": [
         {
             "name": "gemma4-31b-cuda",
-            "displayName": "Build Gemma 4 31B runner (CUDA)",
+            "displayName": "Build Gemma 4 31B runner, worker, and no-bleed test (CUDA)",
             "configurePreset": "gemma4-31b-cuda",
-            "targets": ["gemma4_31b_runner"]
+            "targets": [
+                "gemma4_31b_runner",
+                "gemma4_31b_worker",
+                "test_gemma4_31b_nobleed"
+            ]
         },
         {
             "name": "gemma4-31b-mlx",
-            "displayName": "Build Gemma 4 31B runner (MLX)",
+            "displayName": "Build Gemma 4 31B runner and worker (MLX)",
             "configurePreset": "gemma4-31b-mlx",
-            "targets": ["gemma4_31b_runner"]
+            "targets": ["gemma4_31b_runner", "gemma4_31b_worker"]
         }
     ],
     "workflowPresets": [
         {
             "name": "gemma4-31b-cuda",
-            "displayName": "Configure and build Gemma 4 31B runner (CUDA)",
+            "displayName": "Configure and build Gemma 4 31B runner, worker, and no-bleed test (CUDA)",
             "steps": [
                 {
                     "type": "configure",
@@ -67,7 +71,7 @@
         },
         {
             "name": "gemma4-31b-mlx",
-            "displayName": "Configure and build Gemma 4 31B runner (MLX)",
+            "displayName": "Configure and build Gemma 4 31B runner and worker (MLX)",
             "steps": [
                 {
                     "type": "configure",

diff --git a/examples/models/gemma4_31b/README.md b/examples/models/gemma4_31b/README.md
@@ -153,14 +153,17 @@ python examples/models/gemma4_31b/inference.py \
 Useful before spending the export+lowering time to confirm the quantized
 model produces sensible text.
 
-## Build the runner
+## Build the runner and worker
 
 ```bash
 make gemma4_31b-cuda   # Linux — CUDA backend
 make gemma4_31b-mlx    # macOS — MLX backend (Apple Silicon)
 ```
 
-The binary lands at `cmake-out/examples/models/gemma4_31b/gemma4_31b_runner`.
+The binaries land at:
+
+- `cmake-out/examples/models/gemma4_31b/gemma4_31b_runner`
+- `cmake-out/examples/models/gemma4_31b/gemma4_31b_worker`
 
 ## Run the .pte
 
@@ -179,3 +182,61 @@ Pass `--raw_prompt` to skip template wrapping for pre-formatted input.
 
 For benchmarking, add `--cuda_graph` to capture the decode method in a CUDA
 graph (decode is fully static — `T=1`).
+
+## OpenAI-compatible serving harness
+
+The serving path is a test harness for local-agent workflows. Python owns HTTP,
+chat templating, request validation, and tool parsing; the C++ worker owns model
+loading, prefill/decode, and session state. Use the runner or engine/session API
+directly for production integrations.
+
+### CUDA
+
+```bash
+python -m executorch.examples.models.gemma4_31b.serve \
+    --model-path ./gemma4_31b_exports/model.pte \
+    --data-path ./gemma4_31b_exports/aoti_cuda_blob.ptd \
+    --tokenizer-path ./gemma4_31b_int4/tokenizer.json \
+    --hf-tokenizer ./gemma4_31b_int4 \
+    --model-id gemma4_31b \
+    --max-context 4096 \
+    --max-sessions 4 \
+    --host 127.0.0.1 \
+    --port 8000
+```
+
+### MLX
+
+```bash
+python -m executorch.examples.models.gemma4_31b.serve \
+    --model-path ./gemma4_31b_exports_mlx/model.pte \
+    --tokenizer-path ./gemma4_31b_int4/tokenizer.json \
+    --hf-tokenizer ./gemma4_31b_int4 \
+    --model-id gemma4_31b \
+    --max-context 4096 \
+    --max-sessions 4 \
+    --host 127.0.0.1 \
+    --port 8000
+```
+
+Named sessions use one loaded model with isolated mutable state when the backend
+supports it. Set `--max-sessions` to 2 or more and send a stable `session_id`
+(or one of the supported affinity headers) to enable separate conversations and
+warm append-only resume. One capacity slot is reserved for anonymous requests.
+
+The default tool parser handles Gemma's tool-call format. Use
+`--tool-parser hermes`, `--tool-parser qwen`, or `--tool-parser none` if a
+different prompt/template emits another format.
+
+### CUDA no-bleed test
+
+The CUDA build also produces `test_gemma4_31b_nobleed`, which validates that
+two sessions can interleave prefill/decode on one loaded model without sharing
+mutable state:
+
+```bash
+GEMMA_MODEL_PATH=gemma4_31b_exports/model.pte \
+GEMMA_DATA_PATH=gemma4_31b_exports/aoti_cuda_blob.ptd \
+GEMMA_TOKENIZER_PATH=gemma4_31b_int4/tokenizer.json \
+  cmake-out/examples/models/gemma4_31b/test_gemma4_31b_nobleed
+```
diff --git a/examples/models/gemma4_31b/export.py b/examples/models/gemma4_31b/export.py
@@ -24,6 +24,7 @@
 """
 
 import argparse
+import json
 import os
 
 import torch
@@ -135,6 +136,20 @@ def _pack_for_backend(model: nn.Module, path: str, backend: str) -> None:
 # Export + lower
 
 
+def _mutable_buffer_metadata(model: nn.Module) -> str:
+    """Return a JSON string naming the model's KV-cache buffers.
+
+    A buffer is listed when its qualified name from
+    ``model.named_buffers()`` contains ``".kv_cache."``. The names are stored
+    under ``"mutable_buffers"`` next to a ``"version"`` field set to 1.
+    """
+    kv_cache_names = []
+    for qualified_name, _buffer in model.named_buffers():
+        if ".kv_cache." in qualified_name:
+            kv_cache_names.append(qualified_name)
+    return json.dumps({"version": 1, "mutable_buffers": kv_cache_names})
+
+
 def export_and_lower(
     model: Gemma4_31B,
     config: Gemma4_31BConfig,
@@ -181,6 +187,7 @@ def _export_cuda(
     import executorch.backends.cuda.quantize_op_dispatch  # noqa: F401
 
     materialize_runtime_buffers(model, dtype=torch.bfloat16)
+    mutable_buffer_metadata = _mutable_buffer_metadata(model)
 
     if use_turboquant:
         from executorch.examples.models.gemma4_31b.cuda_source_transformations import (
@@ -255,6 +262,8 @@ def _export_cuda(
             "get_vocab_size": config.vocab_size,
             "get_n_layers": config.num_hidden_layers,
             "get_max_prefill_chunk": max_prefill,
+            "get_min_prefill_chunk": 5,
+            "get_mutable_buffer_metadata": mutable_buffer_metadata,
             "use_kv_cache": True,
             "use_sdpa_with_kv_cache": False,
             "enable_dynamic_shape": True,