Skip to content

Commit 7056f0f

Browse files
committed
examples/models/gemma4_31b: CUDA Engine/Session adapter + OpenAI serving
Adds the Gemma 4 31B serving path, mirroring qwen3_5_moe: a CUDA Engine/Session adapter (chunked prefill, per-session mutable rebinding, in-graph sampling) behind the model-agnostic LLMEngine/LLMSession contract, a JSONL worker, and a serve.py launcher. The generic worker loop gains an optional prompt_prefix_ids (Gemma BOS prepend), and serving_chat gains a matching prompt_token_offset so the context count stays accurate. export.py now emits get_mutable_buffer_metadata and prefill-chunk bounds for multi-session use. ghstack-source-id: c21c647 ghstack-comment-id: 4674805750 Pull-Request: #20207
1 parent 7ebc37f commit 7056f0f

23 files changed

Lines changed: 2399 additions & 48 deletions

Makefile

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ help:
127127
@echo " llava-cpu - Build Llava runner with CPU backend"
128128
@echo " gemma3-cuda - Build Gemma3 runner with CUDA backend"
129129
@echo " gemma3-cpu - Build Gemma3 runner with CPU backend"
130-
@echo " gemma4_31b-cuda - Build Gemma 4 31B runner with CUDA backend"
130+
@echo " gemma4_31b-cuda - Build Gemma 4 31B runner + OpenAI serving worker with CUDA backend"
131131
@echo " gemma4_31b-mlx - Build Gemma 4 31B runner with MLX backend"
132132
@echo " qwen3_5_moe-cuda - Build Qwen3.5 MoE runner + OpenAI serving worker (CUDA)"
133133
@echo " qwen3_5_moe-metal - Build Qwen3.5 MoE runner with Metal backend"
@@ -444,11 +444,13 @@ qwen3_5_moe-cuda:
444444
gemma4_31b-cuda:
445445
@echo "==> Building and installing ExecuTorch with CUDA..."
446446
cmake --workflow --preset llm-release-cuda
447-
@echo "==> Building Gemma 4 31B runner with CUDA..."
447+
@echo "==> Building Gemma 4 31B runner + serving worker with CUDA..."
448448
cd examples/models/gemma4_31b && cmake --workflow --preset gemma4-31b-cuda
449449
@echo ""
450450
@echo "✓ Build complete!"
451451
@echo " Binary: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
452+
@echo " Serving worker: cmake-out/examples/models/gemma4_31b/gemma4_31b_worker"
453+
@echo " Launch: see examples/models/gemma4_31b/README.md (Serving)"
452454

453455
gemma4_31b-mlx:
454456
@echo "==> Building and installing ExecuTorch with MLX..."

examples/models/gemma4_31b/CMakeLists.txt

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
1515
include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
1616

1717
set(_common_include_directories ${EXECUTORCH_ROOT}/..)
18+
set(_json_include
19+
${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/json/single_include
20+
)
1821

1922
# gflags
2023
set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
@@ -58,9 +61,13 @@ endif()
5861
# Tokenizer (HuggingFace tokenizer.json)
5962
list(APPEND link_libraries tokenizers::tokenizers)
6063

61-
add_executable(gemma4_31b_runner main.cpp)
64+
if(EXECUTORCH_BUILD_CUDA)
65+
add_executable(gemma4_31b_runner main.cpp gemma4_31b_engine.cpp)
66+
else()
67+
add_executable(gemma4_31b_runner main.cpp)
68+
endif()
6269
target_include_directories(
63-
gemma4_31b_runner PUBLIC ${_common_include_directories}
70+
gemma4_31b_runner PUBLIC ${_common_include_directories} ${_json_include}
6471
)
6572
target_link_libraries(gemma4_31b_runner PUBLIC ${link_libraries})
6673

@@ -71,6 +78,21 @@ if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
7178
endif()
7279
endif()
7380

81+
if(EXECUTORCH_BUILD_CUDA)
82+
add_executable(gemma4_31b_worker gemma4_31b_worker.cpp gemma4_31b_engine.cpp)
83+
target_include_directories(
84+
gemma4_31b_worker PUBLIC ${_common_include_directories} ${_json_include}
85+
)
86+
target_link_libraries(gemma4_31b_worker PUBLIC ${link_libraries})
87+
88+
if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
89+
target_link_options_gc_sections(gemma4_31b_worker)
90+
if(NOT APPLE AND NOT MSVC)
91+
target_link_options(gemma4_31b_worker PRIVATE "LINKER:-s")
92+
endif()
93+
endif()
94+
endif()
95+
7496
if(TARGET mlxdelegate)
7597
executorch_target_copy_mlx_metallib(gemma4_31b_runner)
7698
endif()

examples/models/gemma4_31b/CMakePresets.json

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
},
1414
{
1515
"name": "gemma4-31b-cuda",
16-
"displayName": "Gemma 4 31B runner (CUDA)",
16+
"displayName": "Gemma 4 31B runner + serving worker (CUDA)",
1717
"inherits": ["gemma4-31b-base"],
1818
"cacheVariables": {
1919
"EXECUTORCH_BUILD_CUDA": "ON"
@@ -39,9 +39,9 @@
3939
"buildPresets": [
4040
{
4141
"name": "gemma4-31b-cuda",
42-
"displayName": "Build Gemma 4 31B runner (CUDA)",
42+
"displayName": "Build Gemma 4 31B runner + serving worker (CUDA)",
4343
"configurePreset": "gemma4-31b-cuda",
44-
"targets": ["gemma4_31b_runner"]
44+
"targets": ["gemma4_31b_runner", "gemma4_31b_worker"]
4545
},
4646
{
4747
"name": "gemma4-31b-mlx",
@@ -53,7 +53,7 @@
5353
"workflowPresets": [
5454
{
5555
"name": "gemma4-31b-cuda",
56-
"displayName": "Configure and build Gemma 4 31B runner (CUDA)",
56+
"displayName": "Configure and build Gemma 4 31B runner + serving worker (CUDA)",
5757
"steps": [
5858
{
5959
"type": "configure",

examples/models/gemma4_31b/README.md

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -139,11 +139,12 @@ model produces sensible text.
139139
## Build the runner
140140

141141
```bash
142-
make gemma4_31b-cuda # Linux — CUDA backend
143-
make gemma4_31b-mlx # macOS — MLX backend (Apple Silicon)
142+
make gemma4_31b-cuda # Linux — CUDA runner + serving worker
143+
make gemma4_31b-mlx # macOS — MLX runner (serving not yet supported)
144144
```
145145

146-
The binary lands at `cmake-out/examples/models/gemma4_31b/gemma4_31b_runner`.
146+
The runner binary lands at `cmake-out/examples/models/gemma4_31b/gemma4_31b_runner`. The CUDA build also produces
147+
`cmake-out/examples/models/gemma4_31b/gemma4_31b_worker`.
147148

148149
## Run the .pte
149150

@@ -162,3 +163,29 @@ Pass `--raw_prompt` to skip template wrapping for pre-formatted input.
162163

163164
For benchmarking, add `--cuda_graph` to capture the decode method in a CUDA
164165
graph (decode is fully static — `T=1`).
166+
167+
## Serving
168+
169+
The CUDA OpenAI-compatible server is a Python control plane plus a C++ model worker.
170+
The worker owns the ExecuTorch model and speaks the shared JSONL protocol used by
171+
the generic LLM server.
172+
173+
```bash
174+
LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH \
175+
python -m executorch.examples.models.gemma4_31b.serve \
176+
--model-path ./gemma4_31b_exports/model.pte \
177+
--data-path ./gemma4_31b_exports/aoti_cuda_blob.ptd \
178+
--tokenizer-path ./gemma4_31b_int4/tokenizer.json \
179+
--hf-tokenizer ./gemma4_31b_int4 \
180+
--model-id gemma4-31b \
181+
--max-sessions 1
182+
```
183+
184+
The launcher defaults to Gemma's `<|tool_call>call:...<tool_call|>` parser. Use
185+
`--tool-parser hermes`, `--tool-parser qwen`, or `--tool-parser none` if the
186+
model/template you are testing emits a different tool-call format.
187+
188+
Named sessions and warm resume require worker capacity above one. CUDA exports
189+
with `get_mutable_buffer_metadata` can use per-session mutable rebinding and
190+
advertise `--max-sessions > 1`; older exports fail closed to a single scratch
191+
session. MLX serving is intentionally left for a later change.

examples/models/gemma4_31b/export.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
"""
2525

2626
import argparse
27+
import json
2728
import os
2829

2930
import torch
@@ -135,6 +136,11 @@ def _pack_for_backend(model: nn.Module, path: str, backend: str) -> None:
135136
# Export + lower
136137

137138

139+
def _mutable_buffer_metadata(model: nn.Module) -> str:
140+
mutable = [name for name, _ in model.named_buffers() if ".kv_cache." in name]
141+
return json.dumps({"version": 1, "mutable_buffers": mutable})
142+
143+
138144
def export_and_lower(
139145
model: Gemma4_31B,
140146
config: Gemma4_31BConfig,
@@ -181,6 +187,7 @@ def _export_cuda(model: Gemma4_31B, config: Gemma4_31BConfig, output_dir: str) -
181187
import executorch.backends.cuda.quantize_op_dispatch # noqa: F401
182188

183189
materialize_runtime_buffers(model, dtype=torch.bfloat16)
190+
mutable_buffer_metadata = _mutable_buffer_metadata(model)
184191

185192
# Int4Tensor weights are used directly — no format conversion.
186193
# F.linear dispatches to executorch_cuda::int4_plain_mm (CUDA shim).
@@ -248,6 +255,8 @@ def _export_cuda(model: Gemma4_31B, config: Gemma4_31BConfig, output_dir: str) -
248255
"get_vocab_size": config.vocab_size,
249256
"get_n_layers": config.num_hidden_layers,
250257
"get_max_prefill_chunk": max_prefill,
258+
"get_min_prefill_chunk": 5,
259+
"get_mutable_buffer_metadata": mutable_buffer_metadata,
251260
"use_kv_cache": True,
252261
"use_sdpa_with_kv_cache": False,
253262
"enable_dynamic_shape": True,

0 commit comments

Comments
 (0)