Merge pull request #4010 from AI-Hypercomputer:gemma4_sft

Google-ML-Automation · Google-ML-Automation · commit af334f19e691 · 2026-05-29T14:03:11.000-07:00
PiperOrigin-RevId: 923592408
diff --git a/src/maxtext/input_pipeline/hf_data_processing.py b/src/maxtext/input_pipeline/hf_data_processing.py
@@ -321,7 +321,6 @@ def preprocessing_pipeline(
   )
   operations = []
   if use_sft:
-    input_pipeline_utils.verify_chat_template_generation_prompt_logic(tokenizer)
     operations.append(
         input_pipeline_utils.SFTPromptMasking(
             text_column_name=data_column_names[0],
diff --git a/src/maxtext/input_pipeline/input_pipeline_utils.py b/src/maxtext/input_pipeline/input_pipeline_utils.py
@@ -19,8 +19,6 @@
 from threading import current_thread
 from typing import Any, Iterable, TYPE_CHECKING
 
-from jinja2 import TemplateError
-
 if TYPE_CHECKING:
   import datasets
   import tensorflow as tf
@@ -210,83 +208,24 @@ def extract_token_ids(tokens):
     raise ValueError(f"Can't extract token_ids from type {type(tokens)}")
 
 
-def verify_chat_template_generation_prompt_logic(tokenizer_model):
-  """Verifies the tokenizer's chat template for correct SFT loss masking.
-
-  This function ensures that the tokens added by `add_generation_prompt=True`
-  are identical to the tokens that begin an assistant's turn in a complete
-  conversation, which is critical for masking prompt tokens during SFT loss
-  calculation.
-
-  Example of a mismatch:
-    A `ValueError` is raised if the generation prompt and the actual
-    assistant prefix do not match. For example:
-
-    - `add_generation_prompt=True` on a user message produces a prompt ending in:
-      `...<|im_start|>generation\n`
-    - A full turn with an assistant message starts the reply with:
-      `...<|im_start|>assistant\n...`
-
-    This function would fail because the tokens for "generation" do not
-    match the tokens for "assistant".
-
-  Args:
-    tokenizer_model: The Hugging Face tokenizer instance to verify.
-
-  Raises:
-    ValueError: If the `add_generation_prompt` tokens do not exactly
-      match the beginning of an assistant message in the template.
-  """
-  dummy_msgs = [{"role": "system", "content": "System message"}, {"role": "user", "content": "Test message"}]
-
-  try:
-    prompt_wo_gen_tokens = tokenizer_model.apply_chat_template(dummy_msgs, add_generation_prompt=False, tokenize=True)
-  except TemplateError:
-    max_logging.info(
-        "Tokenizer failed to apply chat template with 'system' role. "
-        "Falling back to 'user' role only for chat template verification."
-    )
-    dummy_msgs.pop(0)
-    prompt_wo_gen_tokens = tokenizer_model.apply_chat_template(dummy_msgs, add_generation_prompt=False, tokenize=True)
-  prompt_wo_gen_ids = extract_token_ids(prompt_wo_gen_tokens)
-
-  prompt_w_gen_tokens = tokenizer_model.apply_chat_template(dummy_msgs, add_generation_prompt=True, tokenize=True)
-  prompt_w_gen_ids = extract_token_ids(prompt_w_gen_tokens)
-
-  if prompt_w_gen_ids[: len(prompt_wo_gen_ids)] != prompt_wo_gen_ids:
-    raise ValueError("Unable to extract generation prompt tokens.")
-  # Extract the tokenized generation prompt (the expected assistant prefix)
-  assistant_prefix = prompt_w_gen_ids[len(prompt_wo_gen_ids) :]
-  full_turn_tokens = extract_token_ids(
-      tokenizer_model.apply_chat_template(
-          dummy_msgs + [{"role": "assistant", "content": "Dummy response"}], add_generation_prompt=False, tokenize=True
-      )
-  )
-  full_turn_ids = extract_token_ids(full_turn_tokens)
-  # Extract the actual tokens that appear right after the user message in the full turn
-  actual_prefix_in_full_turn = full_turn_ids[len(prompt_wo_gen_ids) : len(prompt_wo_gen_ids) + len(assistant_prefix)]
-
-  if actual_prefix_in_full_turn != assistant_prefix:
-    expected_str = tokenizer_model.decode(assistant_prefix)
-    actual_str = tokenizer_model.decode(actual_prefix_in_full_turn)
-    raise ValueError(
-        "Chat template generation prompt mismatch!\n"
-        f"Expected assistant prefix tokens: {assistant_prefix} ('{expected_str}')\n"
-        f"Actual prefix tokens found: {actual_prefix_in_full_turn} ('{actual_str}')\n"
-        "This means the tokenizer's chat template will break the sft masking logic."
-    )
+def _get_completion_in_chat_template(tokenizer_model, round_msgs):
+  """Calculates the completion part of a conversation turn formatted with a chat template.
 
+  Uses the longest-common-prefix between the full conversation tokens and the
+  generation-prompt tokens to locate where the completion starts.
 
-def _get_completion_in_chat_template(tokenizer_model, round_msgs):
-  """
-  Calculates the completion part of a conversation turn when formatted with a chat template.
+  For most models (Llama, Qwen, …) the generation prompt is an exact prefix of the
+  full conversation, so common_len == len(prompt_ids).
 
-  This function handles both older and current Hugging Face tokenizers. Modern tokenizers
-  may return a `BatchEncoding` object instead of a simple list of token IDs.
+  For Gemma4, add_generation_prompt=True emits thinking-channel tokens
+  (<|channel>thought\\n<channel|>) that diverge from the plain conversation
+  at the model-turn boundary. The common prefix ends just before that
+  divergence, and the completion correctly captures the thinking content
+  and response tokens.
 
   Args:
     tokenizer_model: The tokenizer instance.
-    round_msgs: A list of messages for the current conversational turn, including the assistant's response.
+    round_msgs: Messages for the current conversational turn including the assistant response.
 
   Returns:
     A string representing the completion formatted by the chat template.
@@ -298,9 +237,24 @@ def _get_completion_in_chat_template(tokenizer_model, round_msgs):
   prompt_completion_ids = extract_token_ids(prompt_completion_tokens)
   prompt_ids = extract_token_ids(prompt_tokens)
 
-  completion_tokens = prompt_completion_ids[len(prompt_ids) :]
-  completion_in_chat_template = tokenizer_model.decode(completion_tokens, skip_special_tokens=False)
-  return completion_in_chat_template
+  # Walk forward until the two sequences diverge
+  common_len = 0
+  for full_id, prompt_id in zip(prompt_completion_ids, prompt_ids):
+    if full_id == prompt_id:
+      common_len += 1
+    else:
+      break
+
+  if common_len == 0:
+    raise ValueError(
+        "Chat template generation prompt mismatch: no common prefix tokens found.\n"
+        f"Full conversation tokens: {prompt_completion_ids} ('{tokenizer_model.decode(prompt_completion_ids)}')\n"
+        f"Generation prompt tokens: {prompt_ids} ('{tokenizer_model.decode(prompt_ids)}')\n"
+        "Cannot determine completion boundary."
+    )
+
+  completion_tokens = prompt_completion_ids[common_len:]
+  return tokenizer_model.decode(completion_tokens, skip_special_tokens=False)
 
 
 def apply_chat_template(example, tokenizer_model, data_column_name):
diff --git a/tests/post_training/unit/sft_data_processing_test.py b/tests/post_training/unit/sft_data_processing_test.py
@@ -22,19 +22,17 @@
 import os.path
 import numpy as np
 import jax
-import re
 from jax.sharding import Mesh
 from jax.experimental import mesh_utils
 from datasets import Dataset
 import transformers
 from parameterized import parameterized_class
-from unittest.mock import patch
 from maxtext.configs import pyconfig
 from maxtext.utils.globals import MAXTEXT_PKG_DIR, MAXTEXT_CONFIGS_DIR, MAXTEXT_ASSETS_ROOT
 from maxtext.input_pipeline import hf_data_processing
 from maxtext.input_pipeline import input_pipeline_interface
 from maxtext.input_pipeline.hf_data_processing import _get_pad_id
-from maxtext.input_pipeline.input_pipeline_utils import verify_chat_template_generation_prompt_logic
+from maxtext.input_pipeline.input_pipeline_utils import apply_chat_template, SFTPromptMasking, tokenization
 
 PROMPT_DATA = [
     [
@@ -512,26 +510,118 @@ def setUp(self):
     super().setUp()
     self.qwen3_tokenizer = transformers.AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")
     self.llama2_tokenizer = transformers.AutoTokenizer.from_pretrained(self.LLAMA_TOKENIZER_PATH)
+    self.gemma4_tokenizer = transformers.AutoTokenizer.from_pretrained("google/gemma-4-26B-A4B-it")
+
+  def _apply_chat_template(self, tokenizer):
+    """Helper function to apply the chat template to a sample input and return the result for testing."""
+    messages = [
+        {"role": "user", "content": "Q1"},
+        {"role": "assistant", "content": "A1"},
+        {"role": "user", "content": "Q2"},
+        {"role": "assistant", "content": "A2"},
+    ]
+    example = {"messages": messages}
+    return apply_chat_template(example, tokenizer, "messages")
+
+  def test_apply_chat_template_with_qwen3_tokenizer(self):
+    """Verifies that apply_chat_template correctly applies Qwen3's chat template."""
+    result = self._apply_chat_template(self.qwen3_tokenizer)
+    self.assertEqual(result["is_prompt"], [True, False, True, False])
+    self.assertEqual(len(result["messages"]), 4)
+    self.assertIn("<|im_start|>user\nQ1<|im_end|>\n<|im_start|>assistant\n", result["messages"][0])
+    self.assertIn("<think>\n\n</think>\n\nA1<|im_end|>\n", result["messages"][1])
+    self.assertIn("<|im_start|>user\nQ2<|im_end|>\n<|im_start|>assistant\n", result["messages"][2])
+    self.assertIn("<think>\n\n</think>\n\nA2<|im_end|>\n", result["messages"][3])
+
+  def test_apply_chat_template_with_llama2_tokenizer(self):
+    """Verifies that apply_chat_template correctly applies Llama2's chat template."""
+    result = self._apply_chat_template(self.llama2_tokenizer)
+    self.assertEqual(result["is_prompt"], [True, False, True, False])
+    self.assertEqual(len(result["messages"]), 4)
+    self.assertIn("<s>[INST] Q1 [/INST]", result["messages"][0])
+    self.assertIn("A1 </s>", result["messages"][1])
+    self.assertIn("<s>[INST] Q2 [/INST]", result["messages"][2])
+    self.assertIn("A2 </s>", result["messages"][3])
+
+  def test_apply_chat_template_with_gemma4_tokenizer(self):
+    """Verifies that apply_chat_template correctly applies Gemma4's chat template."""
+    result = self._apply_chat_template(self.gemma4_tokenizer)
+    self.assertEqual(result["is_prompt"], [True, False, True, False])
+    self.assertEqual(len(result["messages"]), 4)
+    self.assertIn("<|turn>user\nQ1<turn|>\n<|turn>model\n<|channel>thought\n<channel|>", result["messages"][0])
+    self.assertIn("A1<turn|>\n", result["messages"][1])
+    self.assertIn("<|turn>user\nQ2<turn|>\n<|turn>model\n<|channel>thought\n<channel|>", result["messages"][2])
+    self.assertIn("A2<turn|>\n", result["messages"][3])
 
-  def test_tokenizer_w_generation_prompt(self):
-    verify_chat_template_generation_prompt_logic(self.qwen3_tokenizer)
 
-  def test_tokenizer_wo_generation_prompt(self):
-    verify_chat_template_generation_prompt_logic(self.llama2_tokenizer)
+@pytest.mark.external_training
+class SFTPromptMaskingTest(unittest.TestCase):
 
-  def test_failure_path_with_modified_template(self):
-    """Verifies the function correctly raises a ValueError on a bad template."""
-    # Replace the role within the existing add_generation_prompt block with a deliberately faulty one.
-    fault_chat_template = re.sub(
-        r"(\{%-?\s*if add_generation_prompt\s*%\}.*?<\|im_start\|>)assistant(.*?\{%-?\s*endif\s*%\})",
-        r"\1wrong_role\2",
-        self.qwen3_tokenizer.chat_template,
-        flags=re.DOTALL,
+  def setUp(self):
+    super().setUp()
+    self.max_target_length = 50
+    self.qwen3_tokenizer = transformers.AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")
+    self.gemma4_tokenizer = transformers.AutoTokenizer.from_pretrained("google/gemma-4-26B-A4B-it")
+
+  def _apply_prompt_masking(self, tokenizer, unk_id, completion_only=True):
+    """Helper function to apply the prompt masking to a sample input and return the result for testing."""
+    messages = [
+        {"role": "user", "content": "Q1"},
+        {"role": "assistant", "content": "A1"},
+        {"role": "user", "content": "Q2"},
+        {"role": "assistant", "content": "A2"},
+    ]
+    example = {"messages": messages}
+    modified_example = apply_chat_template(example, tokenizer, "messages")
+    tokenized_example = tokenization(modified_example, tokenizer, False, self.max_target_length, ["messages"])
+    op = SFTPromptMasking(
+        text_column_name="messages",
+        completion_only=completion_only,
+        max_target_length=self.max_target_length,
+        unk_id=unk_id,
     )
-    with patch.object(self.qwen3_tokenizer, "chat_template", fault_chat_template):
-      # Verify that our function catches the mismatch and raises the expected error
-      with self.assertRaisesRegex(ValueError, "Chat template generation prompt mismatch!"):
-        verify_chat_template_generation_prompt_logic(self.qwen3_tokenizer)
+    return op.map({"messages": tokenized_example["messages"], "is_prompt": modified_example["is_prompt"]})
+
+  def _verify_prompt_masking(self, tokenizer, inputs, targets, unk_id):
+    """Helper function to verify that the prompt masking was applied correctly."""
+    # Unmasked positions must match inputs exactly
+    np.testing.assert_array_equal(inputs[targets != unk_id], targets[targets != unk_id])
+
+    # Some tokens must be masked
+    self.assertTrue(np.any(targets == unk_id))
+
+    # Decoding unmasked tokens yields completions, not prompts
+    completion = tokenizer.decode(targets[targets != unk_id], skip_special_tokens=False)
+    self.assertIn("A1", completion)
+    self.assertIn("A2", completion)
+    self.assertNotIn("Q1", completion)
+    self.assertNotIn("Q2", completion)
+
+  def test_sft_prompt_masking_with_qwen3_tokenizer(self):
+    """Verifies that SFTPromptMasking correctly applies masking for Qwen3's chat template."""
+    unk_id = _get_pad_id(self.qwen3_tokenizer)
+    result = self._apply_prompt_masking(self.qwen3_tokenizer, unk_id)
+    inputs, targets = result["inputs"], result["targets"]
+    self._verify_prompt_masking(self.qwen3_tokenizer, inputs, targets, unk_id)
+
+  def test_sft_prompt_masking_with_gemma4_tokenizer(self):
+    """Verifies that SFTPromptMasking correctly applies masking for Gemma4's chat template."""
+    unk_id = _get_pad_id(self.gemma4_tokenizer)
+    result = self._apply_prompt_masking(self.gemma4_tokenizer, unk_id)
+    inputs, targets = result["inputs"], result["targets"]
+    self._verify_prompt_masking(self.gemma4_tokenizer, inputs, targets, unk_id)
+
+  def test_sft_no_prompt_masking_with_qwen3_tokenizer(self):
+    """Verifies that prompt masking is not applied when completion_only=False with Qwen3 tokenizer."""
+    unk_id = _get_pad_id(self.qwen3_tokenizer)
+    result = self._apply_prompt_masking(self.qwen3_tokenizer, unk_id, completion_only=False)
+    np.testing.assert_array_equal(result["inputs"], result["targets"])
+
+  def test_sft_no_prompt_masking_with_gemma4_tokenizer(self):
+    """Verifies that prompt masking is not applied when completion_only=False with Gemma4 tokenizer."""
+    unk_id = _get_pad_id(self.gemma4_tokenizer)
+    result = self._apply_prompt_masking(self.gemma4_tokenizer, unk_id, completion_only=False)
+    np.testing.assert_array_equal(result["inputs"], result["targets"])
 
 
 if __name__ == "__main__":

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -321,7 +321,6 @@ def preprocessing_pipeline(` |
| `321` | `321` | `)` |
| `322` | `322` | `operations = []` |
| `323` | `323` | `if use_sft:` |
| `324` | | `- input_pipeline_utils.verify_chat_template_generation_prompt_logic(tokenizer)` |
| `325` | `324` | `operations.append(` |
| `326` | `325` | `input_pipeline_utils.SFTPromptMasking(` |
| `327` | `326` | `text_column_name=data_column_names[0],` |