Skip to content

Commit fa5d85a

Browse files
authored
gemma4_31b: add OpenAI serving entrypoint (#20473)
Add Gemma4 31B model-specific serving support on top of the shared examples/llm_server harness. This extracts the existing runner flow into a small Gemma4_31BEngine/LLMSession adapter, keeps main.cpp as a thin runner wrapper, and adds a C++ JSONL worker plus Python OpenAI-compatible launcher. The generic server remains model-agnostic; Gemma-specific behavior stays in examples/models/gemma4_31b, including chat-template options, BOS handling, channel cleanup, and Gemma tool-call parsing. Also wire the worker into the existing Gemma CUDA/MLX CMake presets and Makefile targets, document the serving harness usage, and add validation coverage: hermetic launcher tests, an opt-in on-device BOS/template regression test, and a CUDA no-bleed integration proof for interleaved multi-session execution. #20001
1 parent a0a730a commit fa5d85a

13 files changed

Lines changed: 1887 additions & 392 deletions

Makefile

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -127,8 +127,8 @@ help:
127127
@echo " llava-cpu - Build Llava runner with CPU backend"
128128
@echo " gemma3-cuda - Build Gemma3 runner with CUDA backend"
129129
@echo " gemma3-cpu - Build Gemma3 runner with CPU backend"
130-
@echo " gemma4_31b-cuda - Build Gemma 4 31B runner with CUDA backend"
131-
@echo " gemma4_31b-mlx - Build Gemma 4 31B runner with MLX backend"
130+
@echo " gemma4_31b-cuda - Build Gemma 4 31B runner and worker with CUDA backend"
131+
@echo " gemma4_31b-mlx - Build Gemma 4 31B runner and worker with MLX backend"
132132
@echo " qwen3_5_moe-cuda - Build Qwen3.5 MoE runner with CUDA backend"
133133
@echo " qwen3_5_moe-metal - Build Qwen3.5 MoE runner with Metal backend"
134134
@echo " qwen3_5_moe-mlx - Build Qwen3.5 MoE runner with MLX backend"
@@ -444,20 +444,23 @@ qwen3_5_moe-cuda:
444444
gemma4_31b-cuda:
445445
@echo "==> Building and installing ExecuTorch with CUDA..."
446446
cmake --workflow --preset llm-release-cuda
447-
@echo "==> Building Gemma 4 31B runner with CUDA..."
447+
@echo "==> Building Gemma 4 31B runner, worker, and no-bleed test with CUDA..."
448448
cd examples/models/gemma4_31b && cmake --workflow --preset gemma4-31b-cuda
449449
@echo ""
450450
@echo "✓ Build complete!"
451-
@echo " Binary: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
451+
@echo " Runner: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
452+
@echo " Worker: cmake-out/examples/models/gemma4_31b/gemma4_31b_worker"
453+
@echo " Test: cmake-out/examples/models/gemma4_31b/test_gemma4_31b_nobleed"
452454

453455
gemma4_31b-mlx:
454456
@echo "==> Building and installing ExecuTorch with MLX..."
455457
cmake --workflow --preset mlx-release
456-
@echo "==> Building Gemma 4 31B runner with MLX..."
458+
@echo "==> Building Gemma 4 31B runner and worker with MLX..."
457459
cd examples/models/gemma4_31b && cmake --workflow --preset gemma4-31b-mlx
458460
@echo ""
459461
@echo "✓ Build complete!"
460-
@echo " Binary: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
462+
@echo " Runner: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
463+
@echo " Worker: cmake-out/examples/models/gemma4_31b/gemma4_31b_worker"
461464

462465
qwen3_5_moe-metal:
463466
@echo "==> Building and installing ExecuTorch with Metal..."

examples/models/gemma4_31b/CMakeLists.txt

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
1515
include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
1616

1717
set(_common_include_directories ${EXECUTORCH_ROOT}/..)
18+
set(_json_include
19+
${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/json/single_include
20+
)
1821

1922
# gflags
2023
set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
@@ -51,26 +54,51 @@ if(EXECUTORCH_BUILD_CUDA)
5154
elseif(TARGET mlxdelegate)
5255
list(APPEND link_libraries mlxdelegate mlx)
5356
executorch_target_link_options_shared_lib(mlxdelegate)
57+
add_compile_definitions(EXECUTORCH_BUILD_MLX)
5458
else()
5559
message(FATAL_ERROR "Set EXECUTORCH_BUILD_CUDA=ON or EXECUTORCH_BUILD_MLX=ON")
5660
endif()
5761

5862
# Tokenizer (HuggingFace tokenizer.json)
5963
list(APPEND link_libraries tokenizers::tokenizers)
6064

61-
add_executable(gemma4_31b_runner main.cpp)
65+
add_executable(gemma4_31b_runner main.cpp gemma4_31b_engine.cpp)
6266
target_include_directories(
63-
gemma4_31b_runner PUBLIC ${_common_include_directories}
67+
gemma4_31b_runner PUBLIC ${_common_include_directories} ${_json_include}
6468
)
6569
target_link_libraries(gemma4_31b_runner PUBLIC ${link_libraries})
6670

71+
add_executable(gemma4_31b_worker gemma4_31b_worker.cpp gemma4_31b_engine.cpp)
72+
target_include_directories(
73+
gemma4_31b_worker PUBLIC ${_common_include_directories} ${_json_include}
74+
)
75+
target_link_libraries(gemma4_31b_worker PUBLIC ${link_libraries})
76+
6777
if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
6878
target_link_options_gc_sections(gemma4_31b_runner)
6979
if(NOT APPLE AND NOT MSVC)
7080
target_link_options(gemma4_31b_runner PRIVATE "LINKER:-s")
7181
endif()
82+
target_link_options_gc_sections(gemma4_31b_worker)
83+
if(NOT APPLE AND NOT MSVC)
84+
target_link_options(gemma4_31b_worker PRIVATE "LINKER:-s")
85+
endif()
7286
endif()
7387

7488
if(TARGET mlxdelegate)
7589
executorch_target_copy_mlx_metallib(gemma4_31b_runner)
90+
executorch_target_copy_mlx_metallib(gemma4_31b_worker)
91+
endif()
92+
93+
if(EXECUTORCH_BUILD_CUDA)
94+
enable_testing()
95+
add_executable(
96+
test_gemma4_31b_nobleed test_gemma4_31b_nobleed.cpp gemma4_31b_engine.cpp
97+
)
98+
target_include_directories(
99+
test_gemma4_31b_nobleed PUBLIC ${_common_include_directories}
100+
${_json_include}
101+
)
102+
target_link_libraries(test_gemma4_31b_nobleed PUBLIC ${link_libraries})
103+
add_test(NAME gemma4_31b_nobleed COMMAND test_gemma4_31b_nobleed)
76104
endif()

examples/models/gemma4_31b/CMakePresets.json

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
},
1414
{
1515
"name": "gemma4-31b-cuda",
16-
"displayName": "Gemma 4 31B runner (CUDA)",
16+
"displayName": "Gemma 4 31B runner and worker (CUDA)",
1717
"inherits": ["gemma4-31b-base"],
1818
"cacheVariables": {
1919
"EXECUTORCH_BUILD_CUDA": "ON"
@@ -26,7 +26,7 @@
2626
},
2727
{
2828
"name": "gemma4-31b-mlx",
29-
"displayName": "Gemma 4 31B runner (MLX)",
29+
"displayName": "Gemma 4 31B runner and worker (MLX)",
3030
"inherits": ["gemma4-31b-base"],
3131
"cacheVariables": {},
3232
"condition": {
@@ -39,21 +39,25 @@
3939
"buildPresets": [
4040
{
4141
"name": "gemma4-31b-cuda",
42-
"displayName": "Build Gemma 4 31B runner (CUDA)",
42+
"displayName": "Build Gemma 4 31B runner, worker, and no-bleed test (CUDA)",
4343
"configurePreset": "gemma4-31b-cuda",
44-
"targets": ["gemma4_31b_runner"]
44+
"targets": [
45+
"gemma4_31b_runner",
46+
"gemma4_31b_worker",
47+
"test_gemma4_31b_nobleed"
48+
]
4549
},
4650
{
4751
"name": "gemma4-31b-mlx",
48-
"displayName": "Build Gemma 4 31B runner (MLX)",
52+
"displayName": "Build Gemma 4 31B runner and worker (MLX)",
4953
"configurePreset": "gemma4-31b-mlx",
50-
"targets": ["gemma4_31b_runner"]
54+
"targets": ["gemma4_31b_runner", "gemma4_31b_worker"]
5155
}
5256
],
5357
"workflowPresets": [
5458
{
5559
"name": "gemma4-31b-cuda",
56-
"displayName": "Configure and build Gemma 4 31B runner (CUDA)",
60+
"displayName": "Configure and build Gemma 4 31B runner and worker (CUDA)",
5761
"steps": [
5862
{
5963
"type": "configure",
@@ -67,7 +71,7 @@
6771
},
6872
{
6973
"name": "gemma4-31b-mlx",
70-
"displayName": "Configure and build Gemma 4 31B runner (MLX)",
74+
"displayName": "Configure and build Gemma 4 31B runner and worker (MLX)",
7175
"steps": [
7276
{
7377
"type": "configure",

examples/models/gemma4_31b/README.md

Lines changed: 63 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -153,14 +153,17 @@ python examples/models/gemma4_31b/inference.py \
153153
Useful before spending the export+lowering time to confirm the quantized
154154
model produces sensible text.
155155

156-
## Build the runner
156+
## Build the runner and worker
157157

158158
```bash
159159
make gemma4_31b-cuda # Linux — CUDA backend
160160
make gemma4_31b-mlx # macOS — MLX backend (Apple Silicon)
161161
```
162162

163-
The binary lands at `cmake-out/examples/models/gemma4_31b/gemma4_31b_runner`.
163+
The binaries land at:
164+
165+
- `cmake-out/examples/models/gemma4_31b/gemma4_31b_runner`
166+
- `cmake-out/examples/models/gemma4_31b/gemma4_31b_worker`
164167

165168
## Run the .pte
166169

@@ -179,3 +182,61 @@ Pass `--raw_prompt` to skip template wrapping for pre-formatted input.
179182

180183
For benchmarking, add `--cuda_graph` to capture the decode method in a CUDA
181184
graph (decode is fully static — `T=1`).
185+
186+
## OpenAI-compatible serving harness
187+
188+
The serving path is a test harness for local-agent workflows. Python owns HTTP,
189+
chat templating, request validation, and tool parsing; the C++ worker owns model
190+
loading, prefill/decode, and session state. Use the runner or engine/session API
191+
directly for production integrations.
192+
193+
### CUDA
194+
195+
```bash
196+
python -m executorch.examples.models.gemma4_31b.serve \
197+
--model-path ./gemma4_31b_exports/model.pte \
198+
--data-path ./gemma4_31b_exports/aoti_cuda_blob.ptd \
199+
--tokenizer-path ./gemma4_31b_int4/tokenizer.json \
200+
--hf-tokenizer ./gemma4_31b_int4 \
201+
--model-id gemma4_31b \
202+
--max-context 4096 \
203+
--max-sessions 4 \
204+
--host 127.0.0.1 \
205+
--port 8000
206+
```
207+
208+
### MLX
209+
210+
```bash
211+
python -m executorch.examples.models.gemma4_31b.serve \
212+
--model-path ./gemma4_31b_exports_mlx/model.pte \
213+
--tokenizer-path ./gemma4_31b_int4/tokenizer.json \
214+
--hf-tokenizer ./gemma4_31b_int4 \
215+
--model-id gemma4_31b \
216+
--max-context 4096 \
217+
--max-sessions 4 \
218+
--host 127.0.0.1 \
219+
--port 8000
220+
```
221+
222+
Named sessions use one loaded model with isolated mutable state when the backend
223+
supports it. Set `--max-sessions` to 2 or higher and send a stable `session_id` (or one of
224+
the supported affinity headers) to enable separate conversations and warm
225+
append-only resume. One capacity slot is reserved for anonymous requests.
226+
227+
The default parser is Gemma's tool-call format. Use `--tool-parser hermes`,
228+
`--tool-parser qwen`, or `--tool-parser none` if a different prompt/template
229+
emits another format.
230+
231+
### CUDA no-bleed test
232+
233+
The CUDA build also produces `test_gemma4_31b_nobleed`, which validates that
234+
two sessions can interleave prefill/decode on one loaded model without sharing
235+
mutable state:
236+
237+
```bash
238+
GEMMA_MODEL_PATH=gemma4_31b_exports/model.pte \
239+
GEMMA_DATA_PATH=gemma4_31b_exports/aoti_cuda_blob.ptd \
240+
GEMMA_TOKENIZER_PATH=gemma4_31b_int4/tokenizer.json \
241+
cmake-out/examples/models/gemma4_31b/test_gemma4_31b_nobleed
242+
```

examples/models/gemma4_31b/export.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
"""
2525

2626
import argparse
27+
import json
2728
import os
2829

2930
import torch
@@ -135,6 +136,11 @@ def _pack_for_backend(model: nn.Module, path: str, backend: str) -> None:
135136
# Export + lower
136137

137138

139+
def _mutable_buffer_metadata(model: nn.Module) -> str:
140+
mutable = [name for name, _ in model.named_buffers() if ".kv_cache." in name]
141+
return json.dumps({"version": 1, "mutable_buffers": mutable})
142+
143+
138144
def export_and_lower(
139145
model: Gemma4_31B,
140146
config: Gemma4_31BConfig,
@@ -181,6 +187,7 @@ def _export_cuda(
181187
import executorch.backends.cuda.quantize_op_dispatch # noqa: F401
182188

183189
materialize_runtime_buffers(model, dtype=torch.bfloat16)
190+
mutable_buffer_metadata = _mutable_buffer_metadata(model)
184191

185192
if use_turboquant:
186193
from executorch.examples.models.gemma4_31b.cuda_source_transformations import (
@@ -255,6 +262,8 @@ def _export_cuda(
255262
"get_vocab_size": config.vocab_size,
256263
"get_n_layers": config.num_hidden_layers,
257264
"get_max_prefill_chunk": max_prefill,
265+
"get_min_prefill_chunk": 5,
266+
"get_mutable_buffer_metadata": mutable_buffer_metadata,
258267
"use_kv_cache": True,
259268
"use_sdpa_with_kv_cache": False,
260269
"enable_dynamic_shape": True,

0 commit comments

Comments
 (0)