@@ -105,7 +105,7 @@ Decoder norms per layer: `input_layernorm`, `post_attention_layernorm`,
 | Method | Input | Output (sampled) |
 | -----------| ------------------------------------------------------------| ------------------|
 | `decode` | tokens `(1, 1)` + input_pos `(1,)` + temperature `(1,)` | `(1, 1)` float |
-| `prefill` | tokens `(1, T)` + input_pos `(T,)` + temperature `(1,)`, T∈[2, min(max_seq_len-1, 2×sliding_window)] | `(1, 1)` float |
+| `prefill` | tokens `(1, T)` + input_pos `(T,)` + temperature `(1,)`, T∈[5, min(max_seq_len-1, 2×sliding_window)] | `(1, 1)` float |
 
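As a quick illustration of the shapes in this table, a minimal driver sketch. It assumes the module is already loaded as `model` and exposes `prefill`/`decode` with the signatures above; the vocab size, prompt length, and loading step are placeholders, not taken from the repo.

```python
import torch

vocab_size = 32_000                                                # placeholder, not the model's real vocab size
prompt = torch.randint(0, vocab_size, (1, 16), dtype=torch.long)  # (1, T), T within the prefill range
input_pos = torch.arange(prompt.shape[1])                          # (T,)
temperature = torch.tensor([0.8])                                  # (1,)

# `model` stands for the loaded eager or exported module (assumed).
tok = model.prefill(prompt, input_pos, temperature)                # (1, 1) float: sampled token id
for step in range(prompt.shape[1], prompt.shape[1] + 32):
    tok = model.decode(tok.to(torch.long),                         # (1, 1) token fed back in
                       torch.tensor([step]),                       # (1,) position
                       temperature)                                # (1,)
```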
 Both methods share the same KV-cache buffers via
 `MemoryPlanningPass(share_mutable_buffers=True)` and
@@ -145,11 +145,11 @@ quantize_and_save.py export.py / inference.py
          |                                  |
    quantize_weight()                load (torchao safetensors)
          |                                  |
-   Int4Tensor / IntxUnpacked        Int4Tensor / IntxUnpacked
+   Int4Tensor / IntxUnpacked        Int4Tensor / IntxUnpacked (used directly)
          |                                  |
-   save (torchao safetensors)       pack_model()
+   save (torchao safetensors)       int4_dispatch routes to int4_plain_mm
          |                                  |
-   model.safetensors                Int4TilePackedTo4dTensor (runtime)
+   model.safetensors                dp4a decode / dequant+cuBLAS prefill
 ```
 
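To make the right-hand column concrete, a hedged sketch of the prefill-side fallback (dequantize once, then a dense matmul that cuBLAS serves). The packing convention (two 4-bit values per byte, zero-point 8, per-group scales) and the helper names are assumptions; the decode path would instead route to the dp4a kernel named in the diagram.

```python
import torch

def dequant_int4(packed: torch.Tensor, scales: torch.Tensor, group_size: int) -> torch.Tensor:
    # packed: (out, in//2) uint8, two 4-bit values per byte (assumed layout)
    # scales: (out, in//group_size); symmetric int4 with zero-point 8 (assumed)
    lo = (packed & 0x0F).to(torch.int8) - 8
    hi = (packed >> 4).to(torch.int8) - 8
    w = torch.stack((lo, hi), dim=-1).reshape(packed.shape[0], -1).to(scales.dtype)
    return w * scales.repeat_interleave(group_size, dim=1)

def int4_linear_prefill(x: torch.Tensor, packed: torch.Tensor, scales: torch.Tensor,
                        group_size: int = 32) -> torch.Tensor:
    # Prefill: dequantize, then let cuBLAS handle the wide GEMM.
    # Decode (single-token GEMV) would call an int4 dp4a kernel instead.
    return x @ dequant_int4(packed, scales, group_size).t()
```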
 `embed_tokens` and `lm_head` start tied; they are untied before
@@ -159,18 +159,17 @@ lossless for index lookup).
 
 ## Runtime buffer materialization
 
-After weight loading (via `pack_model()` or `from_hf_checkpoint()`), the
-model's KV caches, RoPE tables, and scalar constants are still on the meta
-device. `materialize_runtime_buffers(model, dtype, device)` in `model.py`
-replaces them with real tensors:
+After weight loading (via `from_hf_checkpoint()`), the model's KV caches,
+RoPE inv_freq buffers, and scalar constants are still on the meta device.
+`materialize_runtime_buffers(model, dtype, device)` in `model.py` replaces
+them with real tensors:
 
 - KV caches → zeros in `dtype` (bf16 for inference, bf16 for export)
-- RoPE tables → computed per-layer (sliding vs full, different θ and head_dim)
+- `inv_freq` → moved to target device (cos/sin computed on the fly per forward)
 - `embed_normalizer`, `logit_softcap`, `cache_positions` → scalar constants
 
 Called by `export.py` (device="cpu" for tracing) and `inference.py`
-(device="cuda" for eager execution). Having one function avoids duplicating
-the RoPE computation and constant setup across scripts.
+(device="cuda" for eager execution).
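A minimal sketch of the zero-fill step for the KV-cache buffers described in the bullets; the helper name, per-module traversal, and `persistent` flag are illustrative, not the repo's actual `materialize_runtime_buffers`.

```python
import torch
import torch.nn as nn

def materialize_kv_buffers(module: nn.Module, dtype: torch.dtype, device: torch.device) -> None:
    # Meta tensors carry shape/dtype but no storage, so each one is re-created
    # as a real zero tensor and re-registered under the same buffer name.
    for name, buf in list(module.named_buffers(recurse=False)):
        if buf.is_meta:
            module.register_buffer(
                name, torch.zeros(buf.shape, dtype=dtype, device=device), persistent=False
            )
```

This only covers the first bullet; in the repo one function also moves `inv_freq` and sets the scalar constants, which need their own handling rather than a zero fill.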
 
 ## Customizations vs. vLLM / transformers reference
 
@@ -183,9 +182,10 @@ These exist solely to make the model exportable / efficient under ExecuTorch:
   via modulo and the attention mask reconstructs which slots are valid.
   Full-attention layers use a flat `Gemma4KVCache` sized to `max_seq_len`.
   Both use `index_copy_(dim=2, ...)` for trace-friendly updates.
-- **Per-layer RoPE tables** registered as `persistent=False` buffers (sliding
-  uses full RoPE, full uses proportional partial RoPE — head_dim and θ
-  differ, so the table is not shared).
+- **On-the-fly RoPE**: stores only `inv_freq` per layer, computes cos/sin
+  via `torch.outer(positions, inv_freq)` each forward. Saves memory vs
+  precomputed `[max_seq_len, head_dim]` tables (sliding uses full RoPE,
+  full uses proportional partial RoPE — head_dim and θ differ).
 - **On-device Gumbel-max sampling** so the exported program emits a token
   rather than a full logits tensor — keeps the runner GPU↔CPU traffic to a
   single float per step.
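The on-the-fly RoPE bullet amounts to very little code; a sketch where only the `torch.outer(positions, inv_freq)` step comes from the text, and the θ / head_dim values are placeholders rather than the model's config:

```python
import torch

def rope_cos_sin(positions: torch.Tensor, inv_freq: torch.Tensor):
    # positions: (T,) integer positions; inv_freq: (head_dim // 2,) stored per layer
    freqs = torch.outer(positions.float(), inv_freq)   # (T, head_dim // 2)
    return freqs.cos(), freqs.sin()

# Standard RoPE inverse frequencies; 128 and 10_000.0 are illustrative values.
head_dim, theta = 128, 10_000.0
inv_freq = 1.0 / (theta ** (torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim))
cos, sin = rope_cos_sin(torch.arange(8), inv_freq)     # recomputed each forward, no [max_seq_len, head_dim] table
```

And the Gumbel-max trick in its standard formulation (the in-graph version in the repo may differ in detail):

```python
def gumbel_max_sample(logits: torch.Tensor, temperature: torch.Tensor) -> torch.Tensor:
    # argmax(logits / T + Gumbel noise) is a sample from softmax(logits / T),
    # so the exported program can emit one token id instead of a logits tensor.
    u = torch.rand_like(logits).clamp_min(1e-20)
    gumbel = -torch.log(-torch.log(u))
    return torch.argmax(logits / temperature + gumbel, dim=-1)
```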
@@ -198,6 +198,6 @@ These exist solely to make the model exportable / efficient under ExecuTorch:
 The numerically-sensitive math primitives are imported from
 `examples.models.gemma4.text_decoder` and shared with the Gemma 4 E2B/E4B
 example: `RMSNorm`, `RMSNormNoWeight`, `Gemma4MLP`, `Gemma4KVCache`,
-`precompute_freqs_cis`, `apply_rotary_emb`. The 31B-specific pieces
-(attention with K=V branch, decoder layer, top-level model with softcap +
-sampling, checkpoint loader) live in `model.py`.
+`apply_rotary_emb`. The 31B-specific pieces (attention with K=V branch,
+decoder layer, top-level model with softcap + sampling, checkpoint loader)
+live in `model.py`.
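For orientation, the shared imports this implies, sketched from the names listed above (the exact import block in `model.py` may differ):

```python
from examples.models.gemma4.text_decoder import (
    Gemma4KVCache,
    Gemma4MLP,
    RMSNorm,
    RMSNormNoWeight,
    apply_rotary_emb,
)
```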