Skip to content

Commit 06d6943

Browse files
authored
Add Gemma 4 E2B/E4B multimodal model, export, and runner for ExecuTorch (#19166)
Differential Revision: D99603811 Pull Request resolved: #19166
1 parent c4ec988 commit 06d6943

30 files changed

Lines changed: 5855 additions & 0 deletions

examples/models/gemma4/BUCK

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# BUCK build definitions for the gemma4 example. The actual target
# definitions live in targets.bzl so that the fbcode and non-fbcode
# build environments register the exact same targets.
load("@fbcode_macros//build_defs:build_file_migration.bzl", "fbcode_target", "non_fbcode_target")
load(":targets.bzl", "define_common_targets")

oncall("executorch")

# Register the shared targets for both build environments.
non_fbcode_target(_kind = define_common_targets)

fbcode_target(_kind = define_common_targets)
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

#
# Simple CMake build system for the gemma4 e2e runner executable.
#
cmake_minimum_required(VERSION 3.24)
# Only C++ is compiled here; without LANGUAGES CXX, CMake would also probe
# for an (unused) C compiler.
project(gemma4 LANGUAGES CXX)

set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)

include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)

if(NOT CMAKE_CXX_STANDARD)
  set(CMAKE_CXX_STANDARD 17)
endif()
# Fail configuration rather than silently falling back to an older standard.
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Let files say "include <executorch/path/to/header.h>"
set(_common_include_directories ${EXECUTORCH_ROOT}/..)

# Need this for gflags for some reason
set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
find_package(gflags REQUIRED)

# Find `executorch` libraries, same as for gflags
list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..)
find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
executorch_target_link_options_shared_lib(executorch)

set(link_libraries executorch gflags)
set(_srcs e2e_runner.cpp)

# CPU kernel libraries. The *_shared_lib helper applies whole-archive link
# options so their static operator registrations are not stripped by the
# linker.
list(
  APPEND
  link_libraries
  optimized_native_cpu_ops_lib
  quantized_ops_lib
  custom_ops
  cpublas
  eigen_blas
)
executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
executorch_target_link_options_shared_lib(quantized_ops_lib)
executorch_target_link_options_shared_lib(custom_ops)

# XNNPACK (only when the backend was built into this tree)
if(TARGET xnnpack_backend)
  set(xnnpack_backend_libs xnnpack_backend XNNPACK xnnpack-microkernels-prod)
  if(TARGET kleidiai)
    list(APPEND xnnpack_backend_libs kleidiai)
  endif()
  list(APPEND link_libraries ${xnnpack_backend_libs})
  executorch_target_link_options_shared_lib(xnnpack_backend)
endif()

# Needed for cpuinfo where it uses android specific log lib
if(ANDROID)
  list(APPEND link_libraries log)
endif()

# Add the required ExecuTorch extensions
list(APPEND link_libraries extension_module extension_data_loader
     extension_tensor
)

# Add tokenizers
list(APPEND link_libraries tokenizers::tokenizers)

add_executable(gemma4_e2e_runner ${_srcs})
# NOTE(review): CMAKE_BUILD_TYPE is empty under multi-config generators, so
# this strip/gc-sections logic only applies to single-config builds (which is
# what the presets in this example use).
if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
  target_link_options_gc_sections(gemma4_e2e_runner)
  if(NOT APPLE)
    # Strip symbols from the release binary.
    target_link_options(gemma4_e2e_runner PRIVATE "LINKER:-s")
  endif()
endif()

target_include_directories(
  gemma4_e2e_runner PUBLIC ${_common_include_directories}
)
target_link_libraries(gemma4_e2e_runner PUBLIC ${link_libraries})
target_compile_options(gemma4_e2e_runner PUBLIC ${_common_compile_options})
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
{
2+
"version": 6,
3+
"configurePresets": [
4+
{
5+
"name": "gemma4-base",
6+
"hidden": true,
7+
"binaryDir": "${sourceDir}/../../../cmake-out/examples/models/gemma4",
8+
"cacheVariables": {
9+
"CMAKE_BUILD_TYPE": "Release",
10+
"CMAKE_FIND_ROOT_PATH": "${sourceDir}/../../../cmake-out"
11+
}
12+
},
13+
{
14+
"name": "gemma4-cpu",
15+
"displayName": "Gemma4 runner (CPU)",
16+
"inherits": ["gemma4-base"]
17+
}
18+
],
19+
"buildPresets": [
20+
{
21+
"name": "gemma4-cpu",
22+
"displayName": "Build Gemma4 runner (CPU)",
23+
"configurePreset": "gemma4-cpu",
24+
"targets": ["gemma4_e2e_runner"]
25+
}
26+
],
27+
"workflowPresets": [
28+
{
29+
"name": "gemma4-cpu",
30+
"displayName": "Configure and build Gemma4 runner (CPU)",
31+
"steps": [
32+
{
33+
"type": "configure",
34+
"name": "gemma4-cpu"
35+
},
36+
{
37+
"type": "build",
38+
"name": "gemma4-cpu"
39+
}
40+
]
41+
}
42+
]
43+
}

examples/models/gemma4/README.md

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
# Gemma 4 on ExecuTorch
2+
3+
Multimodal inference for Gemma 4 on ExecuTorch.
4+
Supports audio transcription, translation, image understanding, and text generation on mobile devices.
5+
6+
Variants: E2B (2B params) and E4B (4B params).
7+
8+
## Architecture
9+
10+
Single PTE with up to 4 methods:
11+
- `speech_transform` — Waveform to log-mel spectrogram (no learned weights)
12+
- `audio_encoder` — USM Conformer via HF's Gemma4AudioModel
13+
- `vision_encoder` — ViT with 2D RoPE via HF's Gemma4VisionModel (8-bit, int8 position embeddings)
14+
- `text_decoder` — Autoregressive decoder with YOCO, PLE, partial RoPE
15+
16+
Use `--no-audio` or `--no-vision` at export time to exclude unused encoders.
17+
18+
| | E2B | E4B |
19+
|---|---|---|
20+
| Hidden size | 1536 | 2560 |
21+
| Layers | 35 | 42 |
22+
| KV heads | 1 (MQA) | 2 |
23+
24+
## Export
25+
26+
```bash
27+
# E2B default (4-bit text, 8-bit vision, all modalities):
28+
buck2 run fbcode//executorch/examples/models/gemma4:export_gemma4 -- \
29+
--checkpoint_path /tmp/gemma4-e2b-it
30+
31+
# E2B 4-bit with tied embedding (smaller, for on-device deployment):
32+
buck2 run fbcode//executorch/examples/models/gemma4:export_gemma4 -- \
33+
--checkpoint_path /tmp/gemma4-e2b-it --tied_embedding
34+
35+
# E4B (4-bit):
36+
buck2 run fbcode//executorch/examples/models/gemma4:export_gemma4 -- \
37+
--checkpoint_path /tmp/gemma4-e4b-it --variant e4b
38+
39+
# Audio-only (no vision encoder, saves ~129 MB):
40+
buck2 run fbcode//executorch/examples/models/gemma4:export_gemma4 -- \
41+
--checkpoint_path /tmp/gemma4-e2b-it --no-vision
42+
43+
# Vision-only (no audio encoder, saves ~100 MB):
44+
buck2 run fbcode//executorch/examples/models/gemma4:export_gemma4 -- \
45+
--checkpoint_path /tmp/gemma4-e2b-it --no-audio
46+
```
47+
48+
## Model Variants
49+
50+
Default export includes all modalities (audio + vision + text). Default context length: 1024 tokens (`--max_seq_len`).
51+
52+
### Pre-exported Models
53+
54+
**E2B:**
55+
56+
| File | Size | Config | Description |
57+
|------|------|--------|-------------|
58+
| `gemma4.pte` | 4.1 GB | 4-bit, audio-only | Default — fastest |
59+
| `gemma4_vision.pte` | 4.3 GB | 4-bit, all modalities | Audio + vision + text |
60+
| `gemma4_tied_emb4.pte` | 2.5 GB | 4-bit tied + emb4, audio-only | Smallest |
61+
62+
**E4B:**
63+
64+
| File | Size | Config | Description |
65+
|------|------|--------|-------------|
66+
| `gemma4.pte` | 6.1 GB | 4-bit, audio-only | Default — fastest |
67+
| `gemma4_vision.pte` | 6.2 GB | 4-bit, all modalities | Audio + vision + text |
68+
| `gemma4_tied_emb4.pte` | 4.0 GB | 4-bit tied + emb4, audio-only | Smallest |
69+
70+
### Export Flags
71+
72+
| Variant | Size | Flag |
73+
|---------|------|------|
74+
| E2B 4-bit (default) | 4.3 GB | (none) |
75+
| E2B 4-bit audio-only | 4.1 GB | `--no-vision` |
76+
| E2B 4-bit emb4 tied | 2.5 GB | `--quantize 8da4w+emb4 --tied_embedding --no-vision` |
77+
| E4B 4-bit | 6.2 GB | `--variant e4b` |
78+
| E4B 4-bit audio-only | 6.1 GB | `--variant e4b --no-vision` |
79+
| E4B 4-bit emb4 tied | 4.0 GB | `--variant e4b --quantize 8da4w+emb4 --tied_embedding --no-vision` |
80+
81+
Vision encoder adds ~129 MB (8-bit linears + int8 position embedding table).
82+
83+
- **Untied models** (`gemma4.pte`, `gemma4_vision.pte`) work with both Python and C++ runners.
84+
- **emb4 tied** uses packed INT4 embeddings and shared embed_tokens/lm_head weights. Requires C++ runner with TorchAO shared embedding kernels.
85+
86+
## Build (CMake, host)
87+
88+
```bash
89+
cmake --preset gemma4-cpu -S examples/models/gemma4
90+
cmake --build --preset gemma4-cpu -j$(nproc)
91+
```
92+
93+
## Run
94+
95+
```bash
96+
# Audio transcription (C++ runner):
97+
./cmake-out/examples/models/gemma4/gemma4_e2e_runner \
98+
--model_path gemma4.pte \
99+
--tokenizer_path tokenizer.model \
100+
--audio_path test_audio.wav
101+
102+
# Image understanding (C++ runner):
103+
./cmake-out/examples/models/gemma4/gemma4_e2e_runner \
104+
--model_path gemma4.pte \
105+
--tokenizer_path tokenizer.model \
106+
--image_path photo.jpg \
107+
--prompt "Describe this image:"
108+
109+
# Text-only:
110+
./cmake-out/examples/models/gemma4/gemma4_e2e_runner \
111+
--model_path gemma4.pte \
112+
--tokenizer_path tokenizer.model \
113+
--prompt "What is 2+2?"
114+
115+
# Python runner (audio):
116+
buck2 run fbcode//executorch/examples/models/gemma4:run_gemma4 -- \
117+
--model_path /tmp/gemma4.pte \
118+
--tokenizer_path /tmp/tokenizer.model \
119+
--audio_path /tmp/test_audio.wav
120+
121+
# Python runner (image):
122+
buck2 run fbcode//executorch/examples/models/gemma4:run_gemma4 -- \
123+
--model_path /tmp/gemma4.pte \
124+
--tokenizer_path /tmp/tokenizer.model \
125+
--image_path /tmp/photo.jpg \
126+
--prompt "Describe this image:"
127+
```
128+
129+
## Input Requirements
130+
131+
**Audio**: WAV, 16kHz, 16-bit PCM, mono, max 30 seconds.
132+
133+
**Image**: JPEG or PNG. Resized to fit `--max_vision_tokens` soft tokens (default 140). Aspect ratio preserved, dimensions rounded to multiples of 48 pixels. Lower tokens = faster but less detail (25 ~= 240x240, 70 ~= 384x384, 140 ~= 528x528, 280 ~= 768x768).
134+
135+
## Samsung S25 Performance
136+
137+
### Audio (23-second clip)

(TTFT = time to first token; RTF = real-time factor, processing time divided by audio duration — values below 1.0 are faster than real time.)
138+
139+
| Model | Size | Load | Prefill | Gen | TTFT | RTF | Mem load | Mem peak |
140+
|-------|------|------|---------|-----|------|-----|----------|----------|
141+
| E2B gemma4.pte | 4.1 GB | 705ms | 166 tok/s | 6 tok/s | 4.50s | 0.71 | 1885 MB | 2251 MB |
142+
| E2B gemma4_vision.pte | 4.3 GB | 648ms | 163 tok/s | 6 tok/s | 4.56s | 0.72 | 1890 MB | 2257 MB |
143+
| E2B gemma4_tied_emb4.pte | 2.5 GB | 645ms | 164 tok/s | 6 tok/s | 4.52s | 0.71 | 1683 MB | 2241 MB |
144+
| E4B gemma4.pte | 6.1 GB | 1.30s | 91 tok/s | 4 tok/s | 7.50s | 1.07 | 3231 MB | 3601 MB |
145+
| E4B gemma4_vision.pte | 6.2 GB | 1.28s | 92 tok/s | 4 tok/s | 7.47s | 1.00 | 3231 MB | 3602 MB |
146+
| E4B gemma4_tied_emb4.pte | 4.0 GB | 1.17s | 85 tok/s | 4 tok/s | 8.00s | 1.07 | 2899 MB | 3590 MB |
147+
148+
### Vision (dog.jpg, "Describe this image in two sentences.", 140 tokens ~528x528)
149+
150+
| Model | Size | Load | Encode | Prefill | Gen | TTFT | Total | Mem load | Mem peak |
151+
|-------|------|------|--------|---------|-----|------|-------|----------|----------|
152+
| E2B gemma4_vision.pte | 4.3 GB | 798ms | 2.73s | 134 tok/s | 6 tok/s | 3.83s | 10.14s | 1884 MB | 2600 MB |
153+
| E4B gemma4_vision.pte | 6.2 GB | 1.36s | 2.44s | 85 tok/s | 4 tok/s | 4.17s | 14.62s | 3232 MB | 3950 MB |
154+
155+
### Text ("Write a short paragraph about the history of artificial intelligence")
156+
157+
| Model | Size | Load | Prefill | Gen | TTFT | Total | Mem load | Mem peak |
158+
|-------|------|------|---------|-----|------|-------|----------|----------|
159+
| E2B gemma4.pte | 4.1 GB | 625ms | 57 tok/s | 6 tok/s | 332ms | 26.94s | 1890 MB | 1950 MB |
160+
| E4B gemma4.pte | 6.1 GB | 1.51s | 38 tok/s | 3 tok/s | 506ms | 44.66s | 3231 MB | 3287 MB |

examples/models/gemma4/__init__.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Package entry point for the Gemma 4 example: re-exports the text-decoder
# config/model classes and the HF checkpoint conversion helper from
# `text_decoder` so callers can import them from the package root.
from executorch.examples.models.gemma4.text_decoder import (
    convert_hf_to_custom,
    Gemma4Config,
    Gemma4Model,
)

__all__ = ["Gemma4Config", "Gemma4Model", "convert_hf_to_custom"]
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
{
2+
"model_type": "gemma4",
3+
"text_config": {
4+
"hidden_size": 1536,
5+
"intermediate_size": 6144,
6+
"use_double_wide_mlp": true,
7+
"num_hidden_layers": 35,
8+
"num_attention_heads": 8,
9+
"head_dim": 256,
10+
"global_head_dim": 512,
11+
"num_key_value_heads": 1,
12+
"vocab_size": 262144,
13+
"vocab_size_per_layer_input": 262144,
14+
"max_position_embeddings": 131072,
15+
"rms_norm_eps": 1e-06,
16+
"rope_theta": 1000000.0,
17+
"rope_local_base_freq": 10000.0,
18+
"partial_rotary_factor": 0.25,
19+
"sliding_window": 512,
20+
"hidden_activation": "gelu_pytorch_tanh",
21+
"final_logit_softcapping": 30.0,
22+
"hidden_size_per_layer_input": 256,
23+
"num_kv_shared_layers": 20,
24+
"tie_word_embeddings": true,
25+
"layer_types": [
26+
"sliding_attention","sliding_attention","sliding_attention","sliding_attention","full_attention",
27+
"sliding_attention","sliding_attention","sliding_attention","sliding_attention","full_attention",
28+
"sliding_attention","sliding_attention","sliding_attention","sliding_attention","full_attention",
29+
"sliding_attention","sliding_attention","sliding_attention","sliding_attention","full_attention",
30+
"sliding_attention","sliding_attention","sliding_attention","sliding_attention","full_attention",
31+
"sliding_attention","sliding_attention","sliding_attention","sliding_attention","full_attention",
32+
"sliding_attention","sliding_attention","sliding_attention","sliding_attention","full_attention"
33+
]
34+
},
35+
"audio_config": {
36+
"model_type": "gemma4_audio",
37+
"hidden_size": 1024,
38+
"output_proj_dims": 1536,
39+
"num_attention_heads": 8,
40+
"num_hidden_layers": 12,
41+
"rms_norm_eps": 1e-06,
42+
"conv_kernel_size": 5,
43+
"residual_weight": 0.5,
44+
"attention_chunk_size": 12,
45+
"attention_context_left": 13,
46+
"attention_context_right": 0,
47+
"attention_logit_cap": 50.0
48+
},
49+
"audio_seq_length": 750,
50+
"audio_ms_per_token": 40,
51+
"audio_sample_rate": 16000,
52+
"max_audio_length_sec": 30,
53+
"special_tokens": {
54+
"audio_token_id": 258881,
55+
"boa_token_id": 256000,
56+
"eoa_token_id": 258883,
57+
"boi_token_id": 255999,
58+
"eoi_token_id": 258882,
59+
"image_token_id": 258880,
60+
"video_token_id": 258884
61+
}
62+
}

0 commit comments

Comments
 (0)