Commit 9108a5b

Ring-buffer KV cache, chunked prefill, INT8 embedding, and cleanup
- Sliding window layers use RingKVCache (2×window) instead of flat max_seq_len buffer, reducing KV cache memory for long sequences.
- Prefill is capped to ring buffer size; the C++ runner chunks longer prompts automatically via get_max_prefill_chunk metadata.
- Both recipes now quantize embed_tokens to INT8 per-axis (~1.4 GB savings vs bf16). Embedding packer uses IntxUnpackedToInt8Tensor which supports gather.
- pack_model handles top-level FQNs (no parent module).
- C++ runner aligned with Qwen patterns: #ifdef guards for non-CUDA builds, better weight_sharing error handling, cudaDeviceSynchronize between prefill and decode.
- Test suite split into test_pipeline.py (CPU) and test_cuda_pipeline.py (CUDA) with shared fixtures. New chunked prefill correctness test.
- Prequantized checkpoint available at huggingface.co/SocialLocalMobile/gemma-4-31B-it-HQQ-INT4.
- Added Gemma 4 31B tests to cuda.yml CI workflow.
- Cleaned up stale terminology, docstrings, and comments throughout.
1 parent f04e065 commit 9108a5b

11 files changed: 195 additions & 75 deletions

examples/models/gemma4_31b/README.md

Lines changed: 24 additions & 5 deletions
@@ -32,17 +32,36 @@ Two built-in recipes (see `quantize_and_save.py`):
 | `default` | INT4 min_max linears, INT8 per-axis embedding |
 | `sensitive` | INT8 for edge-layer v_proj/down_proj, INT4 hqq elsewhere, INT8 per-axis embedding |
 
-## Quantize once
+## Prequantized checkpoint
+
+A prequantized checkpoint (sensitive recipe) is available on HuggingFace:
+
+```bash
+huggingface-cli download SocialLocalMobile/gemma-4-31B-it-HQQ-INT4 --local-dir gemma-4-31B-it-HQQ-INT4
+```
+
+> **Note**: This checkpoint is intended for development and testing of the
+> ExecuTorch CUDA export pipeline. Output quality has not been formally
+> evaluated against the base model.
+
+Use it directly with `--prequantized` in the export and inference scripts
+below — no need to run `quantize_and_save.py`.
+
+## Quantize from scratch (optional)
+
+To quantize from the original bf16 checkpoint instead, pass
+`--quant-recipe` to select a recipe (`default` or `sensitive`):
 
 ```bash
 python examples/models/gemma4_31b/quantize_and_save.py \
-  --model-dir ~/local/scripts/models/gemma-4-31B-it \
+  --model-dir /path/to/gemma-4-31B-it \
   --output ./gemma4_31b_int4 \
-  --quant-recipe default
+  --quant-recipe sensitive
 ```
 
-Writes `model.safetensors`, `config.json`, and
-`tokenizer.json` into `--output`.
+See [Quantization recipes](#quantization-recipes) above for details on each
+recipe. Writes `model.safetensors`, `config.json`, and `tokenizer.json` into
+`--output`.
 
 ## Export to ExecuTorch
 

examples/models/gemma4_31b/export.py

Lines changed: 4 additions & 1 deletion
@@ -161,7 +161,9 @@ def _export_cuda(model: Gemma4_31B, config: Gemma4_31BConfig, output_dir: str) -
         strict=True,
     )
 
-    max_prefill = config.max_seq_len - 1
+    # Cap prefill length to the ring-buffer KV cache size (2×sliding_window).
+    # Longer prompts are chunked by the runner.
+    max_prefill = min(config.max_seq_len - 1, config.sliding_window * 2)
     seq_dim = Dim("seq_len", min=2, max=max_prefill)
     print(f"Exporting prefill (T in [2, {max_prefill}])...")
     with torch.no_grad():
@@ -199,6 +201,7 @@ def _export_cuda(model: Gemma4_31B, config: Gemma4_31BConfig, output_dir: str) -
         "get_max_seq_len": config.max_seq_len,
         "get_vocab_size": config.vocab_size,
         "get_n_layers": config.num_hidden_layers,
+        "get_max_prefill_chunk": max_prefill,
         "use_kv_cache": True,
         "use_sdpa_with_kv_cache": False,
         "enable_dynamic_shape": True,

examples/models/gemma4_31b/main.cpp

Lines changed: 89 additions & 52 deletions
@@ -34,20 +34,20 @@
 #include <cuda_runtime.h>
 #endif
 
-DEFINE_string(model_path, "", "Path to model.pte.");
-DEFINE_string(data_path, "", "Path to model.ptd (CUDA tensor data).");
+DEFINE_string(model_path, "", "Model .pte file path.");
+DEFINE_string(data_path, "", "Data file (.ptd) for CUDA backend.");
 DEFINE_string(tokenizer_path, "", "HuggingFace tokenizer.json path.");
 DEFINE_string(prompt, "Hello", "Prompt text.");
 DEFINE_string(
     prompt_file,
     "",
-    "Optional path to a file with the prompt text (overrides --prompt).");
+    "Path to file containing prompt text (overrides --prompt).");
 DEFINE_double(temperature, 0.8, "Sampling temperature (0 = near-greedy).");
 DEFINE_int32(max_new_tokens, 128, "Maximum tokens to generate.");
 DEFINE_bool(
     cuda_graph,
     false,
-    "Enable CUDA graph capture for the decode method.");
+    "Enable CUDA graph capture for the decode method. CUDA only.");
 
 namespace llm = ::executorch::extension::llm;
 using ::executorch::extension::from_blob;
@@ -57,8 +57,6 @@ using ::executorch::runtime::EValue;
 
 using SizesType = executorch::aten::SizesType;
 
-// The model performs sampling on-device and returns a [B, 1] float tensor
-// holding a token ID. Copy it to host and convert to uint64.
 static uint64_t read_token(const executorch::aten::Tensor& output) {
   const void* ptr = output.const_data_ptr();
   float val = 0.0f;
@@ -135,12 +133,14 @@ int main(int argc, char** argv) {
       /*temp_allocator=*/nullptr,
       /*share_memory_arenas=*/true);
 
+  // Get metadata
   auto metadata_result = llm::get_llm_metadata(tokenizer.get(), module.get());
   if (metadata_result.error() != Error::Ok) {
     ET_LOG(Error, "Failed to read model metadata");
     return 1;
   }
 
+#ifdef EXECUTORCH_BUILD_CUDA
   if (FLAGS_cuda_graph) {
     executorch::runtime::BackendOptions<2> cuda_opts;
     cuda_opts.set_option("enable_cuda_graph_for_method", "decode");
@@ -154,14 +154,30 @@
   // load_method.
   {
     executorch::runtime::BackendOptions<1> backend_options;
-    if (backend_options.set_option("weight_sharing_across_methods", true) !=
-            Error::Ok ||
-        executorch::runtime::set_option(
-            "CudaBackend", backend_options.view()) != Error::Ok) {
-      ET_LOG(Error, "Failed to enable weight_sharing_across_methods");
+    auto set_err =
+        backend_options.set_option("weight_sharing_across_methods", true);
+    if (set_err != Error::Ok) {
+      ET_LOG(
+          Error,
+          "Failed to construct weight_sharing_across_methods option: %d",
+          static_cast<int>(set_err));
+      return 1;
+    }
+    auto opt_err =
+        executorch::runtime::set_option("CudaBackend", backend_options.view());
+    if (opt_err != Error::Ok) {
+      ET_LOG(
+          Error,
+          "Failed to enable weight_sharing_across_methods: %d",
+          static_cast<int>(opt_err));
       return 1;
     }
   }
+#else
+  if (FLAGS_cuda_graph) {
+    ET_LOG(Info, "--cuda_graph ignored on non-CUDA build");
+  }
+#endif
 
   printf("Loading methods...\n");
   if (module->load_method("prefill") != Error::Ok) {
@@ -181,6 +197,7 @@
 
   auto eos_ids = llm::get_eos_ids(tokenizer.get(), module.get());
 
+  // Read prompt from file or flag
   std::string prompt_text = FLAGS_prompt;
   if (!FLAGS_prompt_file.empty()) {
     std::ifstream f(FLAGS_prompt_file);
@@ -189,10 +206,11 @@
          Error, "Failed to open prompt file: %s", FLAGS_prompt_file.c_str());
      return 1;
    }
-    prompt_text.assign(
+    prompt_text = std::string(
        (std::istreambuf_iterator<char>(f)), std::istreambuf_iterator<char>());
  }
 
+  // Encode prompt
   auto encode_result = tokenizer->encode(prompt_text);
   if (!encode_result.ok()) {
     ET_LOG(Error, "Failed to encode prompt");
@@ -207,49 +225,66 @@
 
   auto S = [](int64_t v) -> SizesType { return static_cast<SizesType>(v); };
 
-  // Temperature: clamp 0 to a tiny epsilon so the divide in the exported
-  // sampler stays well-defined. Gumbel noise then becomes negligible
-  // relative to logit gaps and we get effectively-greedy sampling.
+#ifdef EXECUTORCH_BUILD_CUDA
+  // CUDA build: model fuses the sampler. Pass temperature as a third input.
   float temp_val =
       FLAGS_temperature <= 0.0 ? 1e-6f : static_cast<float>(FLAGS_temperature);
   auto temp_tensor =
       from_blob(&temp_val, {1}, executorch::aten::ScalarType::Float);
+#endif
 
   // ---------------------------------------------------------------
-  // Prefill
+  // Prefill (chunked to respect ring-buffer KV cache limit)
   // ---------------------------------------------------------------
-  std::string run_method = "prefill";
-  if (num_prompt_tokens == 1) {
-    // prefill was exported with min seq_len=2; decode handles T==1.
-    run_method = "decode";
+  // Sliding layers use a ring buffer sized to 2×sliding_window. A single
+  // prefill call must not exceed this size, otherwise index_copy_ with
+  // wrapped indices produces non-deterministic results on CUDA.
+  int64_t max_prefill_chunk = (*metadata_result)[llm::kMaxSeqLen] - 1;
+  {
+    auto get_result = module->get("get_max_prefill_chunk");
+    if (get_result.ok()) {
+      max_prefill_chunk = get_result->toScalar().to<int64_t>();
+    }
   }
 
-  std::vector<int64_t> token_data(prompt_tokens.begin(), prompt_tokens.end());
-  std::vector<int64_t> pos_data(num_prompt_tokens);
-  for (int64_t i = 0; i < num_prompt_tokens; i++) {
-    pos_data[i] = i;
-  }
-  auto tokens_tensor = from_blob(
-      token_data.data(),
-      {1, S(num_prompt_tokens)},
-      executorch::aten::ScalarType::Long);
-  auto pos_tensor = from_blob(
-      pos_data.data(),
-      {S(num_prompt_tokens)},
-      executorch::aten::ScalarType::Long);
-
-  std::vector<EValue> prefill_inputs = {
-      EValue(tokens_tensor),
-      EValue(pos_tensor),
-      EValue(temp_tensor),
-  };
-
-  auto prefill_result = module->execute(run_method, prefill_inputs);
-  if (prefill_result.error() != Error::Ok) {
-    ET_LOG(Error, "%s failed", run_method.c_str());
-    return 1;
+  uint64_t cur_token = 0;
+  int64_t prefill_pos = 0;
+  while (prefill_pos < num_prompt_tokens) {
+    int64_t chunk_len =
+        std::min(num_prompt_tokens - prefill_pos, max_prefill_chunk);
+
+    std::string run_method = (chunk_len == 1) ? "decode" : "prefill";
+
+    std::vector<int64_t> token_data(
+        prompt_tokens.begin() + prefill_pos,
+        prompt_tokens.begin() + prefill_pos + chunk_len);
+    std::vector<int64_t> pos_data(chunk_len);
+    for (int64_t i = 0; i < chunk_len; i++) {
+      pos_data[i] = prefill_pos + i;
+    }
+    auto tokens_tensor = from_blob(
+        token_data.data(),
+        {1, S(chunk_len)},
+        executorch::aten::ScalarType::Long);
+    auto pos_tensor = from_blob(
+        pos_data.data(), {S(chunk_len)}, executorch::aten::ScalarType::Long);
+
+    std::vector<EValue> prefill_inputs;
+    prefill_inputs.push_back(EValue(tokens_tensor));
+    prefill_inputs.push_back(EValue(pos_tensor));
+#ifdef EXECUTORCH_BUILD_CUDA
+    prefill_inputs.push_back(EValue(temp_tensor));
+#endif
+
+    auto prefill_result = module->execute(run_method, prefill_inputs);
+    if (prefill_result.error() != Error::Ok) {
+      ET_LOG(
+          Error, "%s failed at pos %" PRId64, run_method.c_str(), prefill_pos);
+      return 1;
+    }
+    cur_token = read_token(prefill_result.get()[0].toTensor());
+    prefill_pos += chunk_len;
   }
-  uint64_t cur_token = read_token(prefill_result.get()[0].toTensor());
 
   stats.prompt_eval_end_ms = llm::time_in_ms();
   double prefill_ms =
@@ -261,8 +296,9 @@
       num_prompt_tokens * 1000.0 / prefill_ms);
 
 #ifdef EXECUTORCH_BUILD_CUDA
-  // Make prefill's writes to the shared KV cache visible before decode
-  // potentially runs on a different stream.
+  // Synchronize CUDA device to ensure prefill's writes to shared mutable
+  // buffers (KV cache) are visible to the decode method, which may run on
+  // a different CUDA stream.
   cudaDeviceSynchronize();
 #endif
 
@@ -282,11 +318,12 @@
     decode_token_data[0] = static_cast<int64_t>(cur_token);
     decode_pos_data[0] = pos;
 
-    std::vector<EValue> decode_inputs = {
-        EValue(decode_tokens),
-        EValue(decode_pos),
-        EValue(temp_tensor),
-    };
+    std::vector<EValue> decode_inputs;
+    decode_inputs.push_back(EValue(decode_tokens));
+    decode_inputs.push_back(EValue(decode_pos));
+#ifdef EXECUTORCH_BUILD_CUDA
+    decode_inputs.push_back(EValue(temp_tensor));
+#endif
 
     auto decode_result = module->execute("decode", decode_inputs);
     if (decode_result.error() != Error::Ok) {
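For readers skimming the C++ diff, the new chunking reduces to a short loop. A Python sketch of the same host-side control flow, where `execute` stands in for `Module::execute` and returns the sampled token (illustrative only, not this repo's API):

```python
# Illustrative sketch of the runner's chunked prefill loop.
def chunked_prefill(execute, prompt_tokens, max_prefill_chunk):
    cur_token, pos = None, 0
    while pos < len(prompt_tokens):
        chunk = prompt_tokens[pos : pos + max_prefill_chunk]
        # A 1-token chunk falls back to "decode": prefill was exported
        # with min seq_len = 2.
        method = "decode" if len(chunk) == 1 else "prefill"
        positions = list(range(pos, pos + len(chunk)))
        cur_token = execute(method, chunk, positions)
        pos += len(chunk)
    return cur_token  # token sampled after the final prompt position
```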

examples/models/gemma4_31b/model.md

Lines changed: 7 additions & 1 deletion
@@ -105,14 +105,20 @@ Decoder norms per layer: `input_layernorm`, `post_attention_layernorm`,
 | Method    | Input                                                        | Output (sampled) |
 |-----------|------------------------------------------------------------|------------------|
 | `decode`  | tokens `(1, 1)` + input_pos `(1,)` + temperature `(1,)` | `(1, 1)` float |
-| `prefill` | tokens `(1, T)` + input_pos `(T,)` + temperature `(1,)`, T∈[2, max_seq_len-1] | `(1, 1)` float |
+| `prefill` | tokens `(1, T)` + input_pos `(T,)` + temperature `(1,)`, T∈[2, min(max_seq_len-1, 2×sliding_window)] | `(1, 1)` float |
 
 Both methods share the same KV-cache buffers via
 `MemoryPlanningPass(share_mutable_buffers=True)` and
 `emit_mutable_buffer_names=True`. The exported program performs Gumbel-max
 sampling on-device and returns a single token ID per call so the C++ runner
 only has to feed tokens.
 
+Prefill length is capped to the ring-buffer KV cache size
+(`2 × sliding_window`) to avoid duplicate wrapped indices in
+`index_copy_`. The C++ runner chunks longer prompts automatically using
+the `get_max_prefill_chunk` constant method. Chunked prefill produces
+identical logits to sequential one-token-at-a-time prefill.
+
 ## Quantization
 
 Three modules in `quant/`:
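The equivalence claim added to model.md is mechanically checkable. A minimal sketch of such a test, where `make_model()` and the `(tokens, input_pos)` forward signature are assumptions rather than this repo's API:

```python
import torch

def final_logits(model, tokens, chunk_size):
    # Feed the prompt through a stateful KV-cache model in fixed-size
    # chunks and return the output for the last prompt position.
    out = None
    for start in range(0, tokens.shape[1], chunk_size):
        chunk = tokens[:, start : start + chunk_size]
        pos = torch.arange(start, start + chunk.shape[1])
        out = model(chunk, pos)
    return out

tokens = torch.randint(0, 1000, (1, 9))
# A fresh model (and KV cache) per run so both prefill paths start equal:
# torch.testing.assert_close(final_logits(make_model(), tokens, chunk_size=4),
#                            final_logits(make_model(), tokens, chunk_size=1))
```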

examples/models/gemma4_31b/model.py

Lines changed: 3 additions & 0 deletions
@@ -89,6 +89,9 @@ def update(
         k_val: torch.Tensor,
         v_val: torch.Tensor,
     ) -> tuple[torch.Tensor, torch.Tensor]:
+        # seq_len must not exceed buf_size, otherwise wrapped indices contain
+        # duplicates and index_copy_ is non-deterministic on CUDA. The C++
+        # runner must chunk prefill to respect this limit.
         wrapped = input_pos % self.buf_size
         self.k_cache.index_copy_(2, wrapped, k_val)
         self.v_cache.index_copy_(2, wrapped, v_val)
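The hazard described in the new comment is easy to reproduce in isolation; a minimal sketch with a toy buffer size:

```python
import torch

buf_size = 4
input_pos = torch.arange(6)     # seq_len = 6 > buf_size
wrapped = input_pos % buf_size  # tensor([0, 1, 2, 3, 0, 1])

# Positions 0 and 1 appear twice. index_copy_ gives no ordering guarantee
# for duplicate indices on CUDA, so which write wins is unspecified and the
# cache becomes non-deterministic. Chunking prefill to <= buf_size tokens
# per call keeps every wrapped index unique.
assert wrapped.unique().numel() < wrapped.numel()
```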

examples/models/gemma4_31b/quant/pack.py

Lines changed: 4 additions & 2 deletions
@@ -61,11 +61,13 @@ def pack_model(
 
     module_weights: dict[str, dict[str, CanonicalQuantizedWeight]] = defaultdict(dict)
     for fqn, cw in quantized.items():
-        parent_fqn, attr = fqn.rsplit(".", 1)
+        parts = fqn.rsplit(".", 1)
+        parent_fqn = parts[0] if len(parts) > 1 else ""
+        attr = parts[-1]
         module_weights[parent_fqn][attr] = cw
 
     for parent_fqn, weights in module_weights.items():
-        module = model.get_submodule(parent_fqn)
+        module = model.get_submodule(parent_fqn) if parent_fqn else model
         packer = packers.get(type(module))
         if packer is None:
             raise ValueError(
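Why the `rsplit` change is needed: the old two-element unpacking raised `ValueError` for a fully-qualified name with no dot, i.e. a parameter hanging directly off the model. The FQNs below are illustrative only:

```python
# Two-part FQN: unpacking worked before and still does.
"model.embed_tokens.weight".rsplit(".", 1)  # ['model.embed_tokens', 'weight']

# Top-level FQN: only one part, so the old `parent, attr = ...` unpacking
# raised ValueError; the new code maps it to parent_fqn = "" (the model).
"weight".rsplit(".", 1)                     # ['weight']
```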

examples/models/gemma4_31b/quant/recipe.py

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@
 class QuantConfig:
     """Per-weight quantization parameters."""
 
-    bits: int  # 4, 6, 8
+    bits: int  # 4, 8
     group_size: int  # 32, 64, 128
     symmetric: bool  # True = no zero point
     method: str  # "min_max" | "hqq"

examples/models/gemma4_31b/quant/test_recipe.py

Lines changed: 8 additions & 8 deletions
@@ -13,7 +13,7 @@
 from .recipe import QuantConfig, QuantRecipe, QuantRule
 
 _Q4 = QuantConfig(4, 32, True, "min_max")
-_Q6 = QuantConfig(6, 32, False, "min_max")
+_Q8 = QuantConfig(8, 32, True, "min_max")
 
 
 class TestQuantRecipeGetConfig(unittest.TestCase):
@@ -23,13 +23,13 @@ class TestQuantRecipeGetConfig(unittest.TestCase):
     [
         (
             "first_match_wins",
-            [QuantRule(r".*v_proj\.weight", _Q6), QuantRule(r".*\.weight", _Q4)],
+            [QuantRule(r".*v_proj\.weight", _Q8), QuantRule(r".*\.weight", _Q4)],
             "layers.0.self_attn.v_proj.weight",
-            6,
+            8,
         ),
         (
             "fallthrough_to_catchall",
-            [QuantRule(r".*v_proj\.weight", _Q6), QuantRule(r".*\.weight", _Q4)],
+            [QuantRule(r".*v_proj\.weight", _Q8), QuantRule(r".*\.weight", _Q4)],
             "layers.0.self_attn.q_proj.weight",
             4,
         ),
@@ -85,13 +85,13 @@ def test_layer_filter(self):
         recipe = QuantRecipe(
             rules=[
                 QuantRule(r".*norm\.weight", None),
-                QuantRule(r".*\.(v_proj|down_proj)\.weight", _Q6, layers=edge),
+                QuantRule(r".*\.(v_proj|down_proj)\.weight", _Q8, layers=edge),
                 QuantRule(r".*\.weight", _Q4),
             ]
         )
-        # Edge v_proj → 6-bit
-        self.assertEqual(recipe.get_config("layers.0.self_attn.v_proj.weight").bits, 6)
-        self.assertEqual(recipe.get_config("layers.58.self_attn.v_proj.weight").bits, 6)
+        # Edge v_proj → 8-bit
+        self.assertEqual(recipe.get_config("layers.0.self_attn.v_proj.weight").bits, 8)
+        self.assertEqual(recipe.get_config("layers.58.self_attn.v_proj.weight").bits, 8)
         # Middle v_proj → falls through → 4-bit
         self.assertEqual(recipe.get_config("layers.30.self_attn.v_proj.weight").bits, 4)
         # q_proj always 4-bit
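These cases pin down first-match-wins resolution plus the layer filter. A minimal sketch of that lookup, assuming `QuantRule` exposes `pattern`, `config`, and an optional `layers` set (names inferred from these tests, not verified against `recipe.py`):

```python
import re

def _layer_index(fqn):
    # Hypothetical helper: pull N out of "layers.N." if present.
    m = re.search(r"layers\.(\d+)\.", fqn)
    return int(m.group(1)) if m else None

def get_config(rules, fqn):
    # First match wins; a rule with a layer filter is skipped for weights
    # outside those layers. A matching rule with config=None means
    # "leave this weight unquantized".
    for rule in rules:
        if rule.layers is not None and _layer_index(fqn) not in rule.layers:
            continue
        if re.fullmatch(rule.pattern, fqn):
            return rule.config
    return None
```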
