# Gemma 4 on ExecuTorch

Multimodal inference for Gemma 4 on ExecuTorch: audio transcription, translation, image understanding, and text generation on mobile devices.

Variants: E2B (2B params) and E4B (4B params).
| 7 | + |
## Architecture

Single PTE file with up to four methods:
- `speech_transform` — waveform to log-mel spectrogram (no learned weights)
- `audio_encoder` — USM Conformer via HF's Gemma4AudioModel
- `vision_encoder` — ViT with 2D RoPE via HF's Gemma4VisionModel (8-bit, int8 position embeddings)
- `text_decoder` — autoregressive decoder with YOCO, PLE, and partial RoPE

Pass `--no-audio` or `--no-vision` at export time to exclude unused encoders.
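The `speech_transform` method is a plain signal-processing front end, which is why it carries no learned weights. A minimal numpy sketch of such a log-mel front end, assuming a 512-point FFT, 10 ms hop, and 128 mel bins (the exported method's exact parameters may differ):

```python
import numpy as np

def hz_to_mel(f):
    return 2595.0 * np.log10(1.0 + f / 700.0)

def mel_filterbank(n_mels, n_fft, sr):
    # Triangular filters spaced evenly on the mel scale.
    mel_pts = np.linspace(hz_to_mel(0.0), hz_to_mel(sr / 2), n_mels + 2)
    hz_pts = 700.0 * (10.0 ** (mel_pts / 2595.0) - 1.0)
    bins = np.floor((n_fft + 1) * hz_pts / sr).astype(int)
    fb = np.zeros((n_mels, n_fft // 2 + 1))
    for m in range(1, n_mels + 1):
        l, c, r = bins[m - 1], bins[m], bins[m + 1]
        for k in range(l, c):
            if c > l:
                fb[m - 1, k] = (k - l) / (c - l)
        for k in range(c, r):
            if r > c:
                fb[m - 1, k] = (r - k) / (r - c)
    return fb

def log_mel(waveform, sr=16000, n_fft=512, hop=160, n_mels=128):
    # Frame, window, FFT, mel-project, log-compress.
    window = np.hanning(n_fft)
    n_frames = 1 + (len(waveform) - n_fft) // hop
    frames = np.stack([waveform[i * hop : i * hop + n_fft] * window
                       for i in range(n_frames)])
    power = np.abs(np.fft.rfft(frames, axis=1)) ** 2
    mel = power @ mel_filterbank(n_mels, n_fft, sr).T
    return np.log(mel + 1e-6)

# One second of a 440 Hz tone -> (frames, mel bins)
t = np.arange(16000) / 16000.0
spec = log_mel(np.sin(2 * np.pi * 440.0 * t))
print(spec.shape)  # (97, 128)
```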
| 17 | + |
| | E2B | E4B |
|---|---|---|
| Hidden size | 1536 | 2560 |
| Layers | 35 | 42 |
| KV heads | 1 (MQA) | 2 |
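The single-head MQA (E2B) and 2-head (E4B) KV configurations above keep the KV cache small at the default 1024-token context. A back-of-envelope size estimate, assuming a 256-dim head and an fp16 cache (both assumptions; the exported model may use different values):

```python
def kv_cache_mb(layers, kv_heads, head_dim, seq_len, bytes_per_elem=2):
    # Factor of 2 covers keys and values.
    return 2 * layers * kv_heads * head_dim * seq_len * bytes_per_elem / 2**20

# Assumed head_dim=256, fp16 cache, 1024-token context -- illustrative only.
e2b = kv_cache_mb(layers=35, kv_heads=1, head_dim=256, seq_len=1024)
e4b = kv_cache_mb(layers=42, kv_heads=2, head_dim=256, seq_len=1024)
print(e2b, e4b)  # 35.0 84.0
```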
| 23 | + |
## Export

```bash
# E2B default (4-bit text, 8-bit vision, all modalities):
buck2 run fbcode//executorch/examples/models/gemma4:export_gemma4 -- \
  --checkpoint_path /tmp/gemma4-e2b-it

# E2B 4-bit with tied embedding (smaller, for on-device deployment):
buck2 run fbcode//executorch/examples/models/gemma4:export_gemma4 -- \
  --checkpoint_path /tmp/gemma4-e2b-it --tied_embedding

# E4B (4-bit):
buck2 run fbcode//executorch/examples/models/gemma4:export_gemma4 -- \
  --checkpoint_path /tmp/gemma4-e4b-it --variant e4b

# Audio-only (no vision encoder, saves ~129 MB):
buck2 run fbcode//executorch/examples/models/gemma4:export_gemma4 -- \
  --checkpoint_path /tmp/gemma4-e2b-it --no-vision

# Vision-only (no audio encoder, saves ~100 MB):
buck2 run fbcode//executorch/examples/models/gemma4:export_gemma4 -- \
  --checkpoint_path /tmp/gemma4-e2b-it --no-audio
```
| 47 | + |
## Model Variants

The default export includes all modalities (audio + vision + text). Default context length: 1024 tokens (`--max_seq_len`).
| 51 | + |
### Pre-exported Models

**E2B:**

| File | Size | Config | Description |
|------|------|--------|-------------|
| `gemma4.pte` | 4.1 GB | 4-bit, audio-only | Default — fastest |
| `gemma4_vision.pte` | 4.3 GB | 4-bit, all modalities | Audio + vision + text |
| `gemma4_tied_emb4.pte` | 2.5 GB | 4-bit tied + emb4, audio-only | Smallest |

**E4B:**

| File | Size | Config | Description |
|------|------|--------|-------------|
| `gemma4.pte` | 6.1 GB | 4-bit, audio-only | Default — fastest |
| `gemma4_vision.pte` | 6.2 GB | 4-bit, all modalities | Audio + vision + text |
| `gemma4_tied_emb4.pte` | 4.0 GB | 4-bit tied + emb4, audio-only | Smallest |
| 69 | + |
### Export Flags

| Variant | Size | Flags |
|---------|------|-------|
| E2B 4-bit (default) | 4.3 GB | (none) |
| E2B 4-bit audio-only | 4.1 GB | `--no-vision` |
| E2B 4-bit emb4 tied | 2.5 GB | `--quantize 8da4w+emb4 --tied_embedding --no-vision` |
| E4B 4-bit | 6.2 GB | `--variant e4b` |
| E4B 4-bit audio-only | 6.1 GB | `--variant e4b --no-vision` |
| E4B 4-bit emb4 tied | 4.0 GB | `--variant e4b --quantize 8da4w+emb4 --tied_embedding --no-vision` |

The vision encoder adds ~129 MB (8-bit linears + int8 position embedding table).

- **Untied models** (`gemma4.pte`, `gemma4_vision.pte`) work with both the Python and C++ runners.
- **emb4 tied** uses packed INT4 embeddings and shared embed_tokens/lm_head weights. Requires the C++ runner with TorchAO shared embedding kernels.
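Tying means one matrix serves as both the input embedding table and the output projection, which is why the tied variants are markedly smaller. A toy numpy sketch of the idea (vocabulary and hidden sizes are illustrative, not the model's):

```python
import numpy as np

rng = np.random.default_rng(0)
vocab, hidden = 8, 4

# One matrix plays both roles: embed_tokens and lm_head share it.
embed = rng.standard_normal((vocab, hidden))

token_ids = np.array([3, 1])
h = embed[token_ids]      # embedding lookup on the way in
logits = h @ embed.T      # lm_head reuses the same weights on the way out

print(logits.shape)  # (2, 8)
```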
| 85 | + |
## Build (CMake, host)

```bash
cmake --preset gemma4-cpu -S examples/models/gemma4
cmake --build --preset gemma4-cpu -j$(nproc)
```
| 92 | + |
## Run

```bash
# Audio transcription (C++ runner):
./cmake-out/examples/models/gemma4/gemma4_e2e_runner \
  --model_path gemma4.pte \
  --tokenizer_path tokenizer.model \
  --audio_path test_audio.wav

# Image understanding (C++ runner):
./cmake-out/examples/models/gemma4/gemma4_e2e_runner \
  --model_path gemma4.pte \
  --tokenizer_path tokenizer.model \
  --image_path photo.jpg \
  --prompt "Describe this image:"

# Text-only:
./cmake-out/examples/models/gemma4/gemma4_e2e_runner \
  --model_path gemma4.pte \
  --tokenizer_path tokenizer.model \
  --prompt "What is 2+2?"

# Python runner (audio):
buck2 run fbcode//executorch/examples/models/gemma4:run_gemma4 -- \
  --model_path /tmp/gemma4.pte \
  --tokenizer_path /tmp/tokenizer.model \
  --audio_path /tmp/test_audio.wav

# Python runner (image):
buck2 run fbcode//executorch/examples/models/gemma4:run_gemma4 -- \
  --model_path /tmp/gemma4.pte \
  --tokenizer_path /tmp/tokenizer.model \
  --image_path /tmp/photo.jpg \
  --prompt "Describe this image:"
```
| 128 | + |
## Input Requirements

**Audio**: WAV, 16 kHz, 16-bit PCM, mono, max 30 seconds.
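A quick way to check a file against these constraints before running the model, using only Python's stdlib `wave` module (a sketch; the runners do their own validation):

```python
import io
import wave

def check_wav(data: bytes, max_seconds=30):
    # Validate the constraints above: 16 kHz, mono, 16-bit PCM, <= 30 s.
    with wave.open(io.BytesIO(data)) as w:
        assert w.getframerate() == 16000, "must be 16 kHz"
        assert w.getnchannels() == 1, "must be mono"
        assert w.getsampwidth() == 2, "must be 16-bit PCM"
        duration = w.getnframes() / w.getframerate()
        assert duration <= max_seconds, "max 30 seconds"
        return duration

# Build one second of silence in memory to demonstrate.
buf = io.BytesIO()
with wave.open(buf, "wb") as w:
    w.setnchannels(1)
    w.setsampwidth(2)
    w.setframerate(16000)
    w.writeframes(b"\x00\x00" * 16000)
print(check_wav(buf.getvalue()))  # 1.0
```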

**Image**: JPEG or PNG. Images are resized to fit within `--max_vision_tokens` soft tokens (default 140), preserving aspect ratio, with dimensions rounded to multiples of 48 pixels. Fewer tokens is faster but captures less detail (25 ≈ 240x240, 70 ≈ 384x384, 140 ≈ 528x528, 280 ≈ 768x768).
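Under the (illustrative) assumption that each 48x48 patch costs one soft token — which matches the size/token pairs above — the resize logic can be sketched as:

```python
import math

def target_size(width, height, max_vision_tokens=140, patch=48):
    # Hypothetical sketch: scale down until the patch grid fits the token
    # budget, keep aspect ratio, snap dims down to multiples of 48.
    # The real exporter's resize policy may differ.
    tokens = math.ceil(width / patch) * math.ceil(height / patch)
    scale = 1.0
    if tokens > max_vision_tokens:
        scale = math.sqrt(max_vision_tokens / tokens)
    w = max(patch, int(width * scale / patch) * patch)
    h = max(patch, int(height * scale / patch) * patch)
    return w, h

print(target_size(528, 528))     # (528, 528) -- 121 tokens, fits 140
print(target_size(1920, 1080))   # (720, 384) -- 120 tokens
```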
| 134 | + |
## Samsung S25 Performance

### Audio (23 s clip)

| Model | Size | Load | Prefill | Gen | TTFT | RTF | Mem (load) | Mem (peak) |
|-------|------|------|---------|-----|------|-----|------------|------------|
| E2B gemma4.pte | 4.1 GB | 705 ms | 166 tok/s | 6 tok/s | 4.50 s | 0.71 | 1885 MB | 2251 MB |
| E2B gemma4_vision.pte | 4.3 GB | 648 ms | 163 tok/s | 6 tok/s | 4.56 s | 0.72 | 1890 MB | 2257 MB |
| E2B gemma4_tied_emb4.pte | 2.5 GB | 645 ms | 164 tok/s | 6 tok/s | 4.52 s | 0.71 | 1683 MB | 2241 MB |
| E4B gemma4.pte | 6.1 GB | 1.30 s | 91 tok/s | 4 tok/s | 7.50 s | 1.07 | 3231 MB | 3601 MB |
| E4B gemma4_vision.pte | 6.2 GB | 1.28 s | 92 tok/s | 4 tok/s | 7.47 s | 1.00 | 3231 MB | 3602 MB |
| E4B gemma4_tied_emb4.pte | 4.0 GB | 1.17 s | 85 tok/s | 4 tok/s | 8.00 s | 1.07 | 2899 MB | 3590 MB |
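RTF (real-time factor) is processing time divided by audio duration, so values below 1.0 transcribe faster than real time. Reading off the E2B gemma4.pte row, for example:

```python
# 23 s clip at RTF 0.71 -> total processing time in seconds.
audio_seconds = 23.0
rtf = 0.71
processing_seconds = rtf * audio_seconds
print(round(processing_seconds, 1))  # 16.3
```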
| 147 | + |
### Vision (dog.jpg, "Describe this image in two sentences.", 140 tokens ≈ 528x528)

| Model | Size | Load | Encode | Prefill | Gen | TTFT | Total | Mem (load) | Mem (peak) |
|-------|------|------|--------|---------|-----|------|-------|------------|------------|
| E2B gemma4_vision.pte | 4.3 GB | 798 ms | 2.73 s | 134 tok/s | 6 tok/s | 3.83 s | 10.14 s | 1884 MB | 2600 MB |
| E4B gemma4_vision.pte | 6.2 GB | 1.36 s | 2.44 s | 85 tok/s | 4 tok/s | 4.17 s | 14.62 s | 3232 MB | 3950 MB |
| 154 | + |
### Text ("Write a short paragraph about the history of artificial intelligence")

| Model | Size | Load | Prefill | Gen | TTFT | Total | Mem (load) | Mem (peak) |
|-------|------|------|---------|-----|------|-------|------------|------------|
| E2B gemma4.pte | 4.1 GB | 625 ms | 57 tok/s | 6 tok/s | 332 ms | 26.94 s | 1890 MB | 1950 MB |
| E4B gemma4.pte | 6.1 GB | 1.51 s | 38 tok/s | 3 tok/s | 506 ms | 44.66 s | 3231 MB | 3287 MB |