
Commit 78ee61f

mnachin authored and mergennachin committed
Apply Gemma 4 IT chat template in inference.py and C++ runner
Gemma 4 31B-IT is instruction-tuned and produces degenerate output without the chat-template wrapping. Auto-wrap --prompt with the IT template (<bos><|turn>user\n{prompt}<turn|>\n<|turn>model\n<|channel>thought\n<channel|>) by default; --raw-prompt (inference.py) / --raw_prompt (C++ runner) skip wrapping for pre-formatted input.
1 parent 54f1f28 commit 78ee61f
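
For illustration, a minimal Python sketch of the wrapping this commit adds (the template string is copied from the inference.py diff below; the prompt value is a made-up example):

```python
# Chat template as added in inference.py (see diff below).
_CHAT_TEMPLATE = (
    "<bos><|turn>user\n{prompt}<turn|>\n<|turn>model\n<|channel>thought\n<channel|>"
)

# e.g. --prompt "What is 2+2?" is sent to the model as:
print(_CHAT_TEMPLATE.format(prompt="What is 2+2?"))
# <bos><|turn>user
# What is 2+2?<turn|>
# <|turn>model
# <|channel>thought
# <channel|>
```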

3 files changed: 41 additions & 1 deletion


examples/models/gemma4_31b/README.md

Lines changed: 6 additions & 0 deletions
@@ -79,6 +79,9 @@ Writes `model.pte` and `model.ptd` into `--output-dir`.
 
 ## Eager inference
 
+The prompt is automatically wrapped with the Gemma 4 IT chat template.
+Pass `--raw-prompt` to skip template wrapping for pre-formatted input.
+
 ```bash
 python examples/models/gemma4_31b/inference.py \
   --prequantized ./gemma4_31b_int4 \
@@ -109,6 +112,9 @@ The binary lands at `cmake-out/examples/models/gemma4_31b/gemma4_31b_runner`.
 
 ## Run the .pte
 
+The prompt is automatically wrapped with the Gemma 4 IT chat template.
+Pass `--raw_prompt` to skip template wrapping for pre-formatted input.
+
 ```bash
 ./gemma4_31b_runner \
   --model_path ./gemma4_31b_exports/model.pte \

examples/models/gemma4_31b/inference.py

Lines changed: 23 additions & 1 deletion
@@ -13,6 +13,11 @@
 Packs for the target backend (--backend cuda), materializes runtime buffers,
 optionally compiles with ``torch.compile``, and generates text autoregressively.
 
+Gemma 4 31B-IT is instruction-tuned and requires chat-template formatting.
+The ``--prompt`` is automatically wrapped with the Gemma 4 chat template
+(``<bos><|turn>user\\n{prompt}<turn|>\\n<|turn>model\\n<|channel>thought\\n<channel|>``).
+Pass ``--raw-prompt`` to skip template wrapping (e.g., for pre-formatted input).
+
 Usage:
     python inference.py \\
         --prequantized ./gemma4_31b_int4 \\
@@ -63,6 +68,16 @@ def _move_to_cuda(model, config) -> None:
     materialize_runtime_buffers(model, dtype=torch.bfloat16, device="cuda")
 
 
+_CHAT_TEMPLATE = (
+    "<bos><|turn>user\n{prompt}<turn|>\n<|turn>model\n<|channel>thought\n<channel|>"
+)
+
+
+def apply_chat_template(prompt: str) -> str:
+    """Wrap a user prompt in the Gemma 4 IT chat template."""
+    return _CHAT_TEMPLATE.format(prompt=prompt)
+
+
 def generate(
     model,
     tokenizer,
@@ -155,6 +170,11 @@ def main() -> None:
         default=4096,
         help="KV cache length to allocate for this run.",
     )
+    parser.add_argument(
+        "--raw-prompt",
+        action="store_true",
+        help="Skip chat-template wrapping (use if the prompt is already formatted).",
+    )
     parser.add_argument(
         "--no-compile",
         action="store_true",
@@ -204,14 +224,16 @@
     # Gemma 4 EOS tokens (from generation_config.json: ids 1, 50, 106).
     eos_token_ids = {1, 50, 106}
 
+    prompt = args.prompt if args.raw_prompt else apply_chat_template(args.prompt)
+
     print(f"\nPrompt: {args.prompt}")
     print("-" * 40)
 
     t0 = time.perf_counter()
     output = generate(
         model,
         tokenizer,
-        args.prompt,
+        prompt,
         max_new_tokens=args.max_new_tokens,
         temperature=args.temperature,
         eos_token_ids=eos_token_ids,
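
As a standalone sketch of how the new flag and helper interact (the template and helper are duplicated from the diff above so the snippet runs on its own; the prompt strings are made up):

```python
import argparse

_CHAT_TEMPLATE = (
    "<bos><|turn>user\n{prompt}<turn|>\n<|turn>model\n<|channel>thought\n<channel|>"
)


def apply_chat_template(prompt: str) -> str:
    """Wrap a user prompt in the Gemma 4 IT chat template."""
    return _CHAT_TEMPLATE.format(prompt=prompt)


parser = argparse.ArgumentParser()
parser.add_argument("--prompt", required=True)
# argparse exposes --raw-prompt as args.raw_prompt, which the diff relies on.
parser.add_argument("--raw-prompt", action="store_true")

# Default run: the prompt is wrapped before tokenization.
args = parser.parse_args(["--prompt", "Hi"])
prompt = args.prompt if args.raw_prompt else apply_chat_template(args.prompt)
assert prompt == "<bos><|turn>user\nHi<turn|>\n<|turn>model\n<|channel>thought\n<channel|>"

# --raw-prompt run: a pre-formatted prompt passes through untouched.
args = parser.parse_args(["--prompt", prompt, "--raw-prompt"])
assert args.raw_prompt
assert (args.prompt if args.raw_prompt else apply_chat_template(args.prompt)) == prompt
```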

examples/models/gemma4_31b/main.cpp

Lines changed: 12 additions & 0 deletions
@@ -65,6 +65,10 @@ DEFINE_double(temperature, 0.8, "Sampling temperature (0 = near-greedy).");
 DEFINE_int32(max_new_tokens, 128, "Maximum tokens to generate.");
 DEFINE_int32(bos_id, 2, "BOS token id to prepend (Gemma convention: 2).");
 DEFINE_int32(eos_id, 1, "EOS token id (Gemma convention: 1).");
+DEFINE_bool(
+    raw_prompt,
+    false,
+    "Skip chat-template wrapping (use if the prompt is already formatted).");
 DEFINE_bool(
     cuda_graph,
     false,
@@ -232,6 +236,14 @@ int main(int argc, char** argv) {
       (std::istreambuf_iterator<char>(f)), std::istreambuf_iterator<char>());
   }
 
+  // Wrap with Gemma 4 IT chat template unless --raw_prompt is set.
+  // BOS is prepended separately below; this adds the turn structure and the
+  // empty thought block required by the instruction-tuned model.
+  if (!FLAGS_raw_prompt) {
+    prompt_text = "<|turn>user\n" + prompt_text +
+        "<turn|>\n<|turn>model\n<|channel>thought\n<channel|>";
+  }
+
   // Encode prompt
   auto encode_result = tokenizer->encode(prompt_text);
   if (!encode_result.ok()) {
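
A quick Python sketch of why the C++ template omits `<bos>`: the runner prepends `--bos_id` at encode time, so adding it in the text as well would double it. The check below assumes token id 2 corresponds to the literal `<bos>` string, which the diff itself does not state:

```python
# The C++ runner wraps without <bos>; BOS (--bos_id, default 2) is prepended
# as a token id when the prompt is encoded.
def cpp_wrap(prompt_text: str) -> str:
    return (
        "<|turn>user\n" + prompt_text +
        "<turn|>\n<|turn>model\n<|channel>thought\n<channel|>"
    )


PY_TEMPLATE = (
    "<bos><|turn>user\n{prompt}<turn|>\n<|turn>model\n<|channel>thought\n<channel|>"
)

# Assuming "<bos>" is the text form of bos_id=2, both runners feed the model
# the same token stream for the same user prompt.
assert "<bos>" + cpp_wrap("Hi") == PY_TEMPLATE.format(prompt="Hi")
```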
