Commit 891b764
Merge pull request #3288 from AI-Hypercomputer:vladk/sft-completion-fix2
PiperOrigin-RevId: 878653657
2 parents 12fe4ce + fc2dab7

3 files changed: 56 additions & 17 deletions

File: src/maxtext/input_pipeline/input_pipeline_utils.py
Lines changed: 38 additions & 8 deletions
@@ -32,6 +32,7 @@
 
 Features = dict[str, tf.Tensor]
 AUTOTUNE = tf.data.experimental.AUTOTUNE
+INPUT_TOKENS_KEY = "input_ids"
 
 ########## Functions used by TFDS pipeline
 
@@ -171,6 +172,42 @@ def is_conversational(features, data_columns):
   return False
 
 
+def _get_completion_in_chat_template(tokenizer_model, round_msgs):
+  """
+  Calculates the completion part of a conversation turn when formatted with a chat template.
+
+  This function handles both older and current Hugging Face tokenizers. Modern tokenizers
+  may return a `BatchEncoding` object instead of a simple list of token IDs.
+
+  Args:
+    tokenizer_model: The tokenizer instance.
+    round_msgs: A list of messages for the current conversational turn, including the assistant's response.
+
+  Returns:
+    A string representing the completion formatted by the chat template.
+  """
+  prompt_completion_tokens = tokenizer_model.apply_chat_template(round_msgs, add_generation_prompt=False, tokenize=True)
+  # include generation_prompt as part of the prompt tokens
+  prompt_tokens = tokenizer_model.apply_chat_template(round_msgs[:-1], add_generation_prompt=True, tokenize=True)
+
+  # attention masks in BatchEncoding are effectively ignored
+  if hasattr(prompt_completion_tokens, INPUT_TOKENS_KEY):
+    prompt_completion_ids = getattr(prompt_completion_tokens, INPUT_TOKENS_KEY)
+    prompt_ids = getattr(prompt_tokens, INPUT_TOKENS_KEY)
+  elif isinstance(prompt_completion_tokens, dict) and INPUT_TOKENS_KEY in prompt_completion_tokens:
+    prompt_completion_ids = prompt_completion_tokens[INPUT_TOKENS_KEY]
+    prompt_ids = prompt_tokens[INPUT_TOKENS_KEY]
+  elif isinstance(prompt_completion_tokens, list):
+    prompt_completion_ids = prompt_completion_tokens
+    prompt_ids = prompt_tokens
+  else:
+    raise ValueError(f"Can't handle the chat template output of type {type(prompt_completion_tokens)}")
+
+  completion_tokens = prompt_completion_ids[len(prompt_ids) :]
+  completion_in_chat_template = tokenizer_model.decode(completion_tokens, skip_special_tokens=False)
+  return completion_in_chat_template
+
+
 def apply_chat_template(example, tokenizer_model, data_column_name):
   """Formats conversational data by applying the tokenizer's chat template
   and identifying prompt/completion segments for SFT masking.
@@ -210,14 +247,7 @@ def apply_chat_template(example, tokenizer_model, data_column_name):
       is_prompt.append(True)
     elif message["role"] == "assistant":
       round_msgs.append(message)
-      prompt_completion_tokens = tokenizer_model.apply_chat_template(
-          round_msgs, add_generation_prompt=False, tokenize=True
-      )
-      # include generation_prompt as part of the prompt tokens
-      prompt_tokens = tokenizer_model.apply_chat_template(round_msgs[:-1], add_generation_prompt=True, tokenize=True)
-      completion_tokens = prompt_completion_tokens[len(prompt_tokens) :]
-      completion_in_chat_template = tokenizer_model.decode(completion_tokens, skip_special_tokens=False)
-      messages.append(completion_in_chat_template)
+      messages.append(_get_completion_in_chat_template(tokenizer_model, round_msgs))
       is_prompt.append(False)
       # Round ended, clearing the buffer.
       round_msgs.clear()
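Why the new helper branches on the return type: `apply_chat_template(..., tokenize=True)` returned a plain list of token IDs in older Hugging Face tokenizers, while newer ones may return a dict-like `BatchEncoding` keyed by `input_ids`. In both cases the completion is recovered by rendering the turn twice and slicing off the prompt prefix. A minimal sketch of that slicing logic, using a hypothetical stub tokenizer (not the MaxText code or a real Hugging Face class):

INPUT_TOKENS_KEY = "input_ids"


class StubTokenizer:
  """Hypothetical stand-in: 10 tokens per message, plus 2 for a generation prompt."""

  def __init__(self, as_batch_encoding):
    self.as_batch_encoding = as_batch_encoding

  def apply_chat_template(self, msgs, add_generation_prompt=False, tokenize=True):
    ids = list(range(10 * len(msgs) + (2 if add_generation_prompt else 0)))
    # Newer tokenizers wrap the ids dict-style (as a BatchEncoding); older ones return a list.
    return {INPUT_TOKENS_KEY: ids} if self.as_batch_encoding else ids


def completion_ids(tok, round_msgs):
  full = tok.apply_chat_template(round_msgs, add_generation_prompt=False, tokenize=True)
  # As in the patch, the generation prompt counts as part of the prompt.
  prompt = tok.apply_chat_template(round_msgs[:-1], add_generation_prompt=True, tokenize=True)
  full_ids = full[INPUT_TOKENS_KEY] if isinstance(full, dict) else full
  prompt_ids = prompt[INPUT_TOKENS_KEY] if isinstance(prompt, dict) else prompt
  # Assumes the prompt render is a strict prefix of the full render.
  return full_ids[len(prompt_ids):]


msgs = [{"role": "user", "content": "hi"}, {"role": "assistant", "content": "hello"}]
assert completion_ids(StubTokenizer(False), msgs) == completion_ids(StubTokenizer(True), msgs) == list(range(12, 20))

Both return shapes yield the same completion slice, which is exactly what the type dispatch in `_get_completion_in_chat_template` guarantees.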
Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-{"data": {"messages": [{"role": "user", "content": "Hello, what is your name?"}, {"role": "assistant", "content": "I am a chatbot. How can I help?"}]}, "tokens": [1, 518, 25580, 29962, 15043, 29892, 825, 338, 596, 1024, 29973, 518, 29914, 25580, 29962, 306, 626, 263, 13563, 7451, 29889, 1128, 508, 306, 1371, 29973, 29871, 2], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "token_log_probs": [-10.900627136230469, -11.805438995361328, -9.937392234802246, -10.478547096252441, -10.477544784545898, -10.665718078613281, -11.027463912963867, -10.303316116333008, -10.548932075500488, -10.392480850219727, -11.593963623046875, -11.837165832519531, -12.416250228881836, -10.1104736328125, -11.313142776489258, -12.341060638427734, -11.190383911132812, -9.143855094909668, -10.817261695861816, -11.793390274047852, -11.39107894897461, -11.716558456420898, -11.232498168945312, -12.146818161010742, -11.292530059814453, -10.039775848388672, -9.972617149353027]}
+{"data": {"messages": [{"role": "user", "content": "Hello, what is your name?"}, {"role": "assistant", "content": "I am a chatbot. How can I help?"}]}, "tokens": [1, 29961, 25580, 29962, 15043, 29892, 825, 338, 596, 1024, 29973, 518, 29914, 25580, 29962, 306, 626, 263, 13563, 7451, 29889, 1128, 508, 306, 1371, 29973, 29871, 2], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "token_log_probs": [-10.360702514648438, -11.012994766235352, -10.751636505126953, -9.73588752746582, -11.174783706665039, -11.906787872314453, -10.50442123413086, -11.422593116760254, -12.447595596313477, -10.885910034179688, -11.982933044433594, -10.058539390563965, -10.950790405273438, -12.060896873474121, -10.68459701538086, -11.916288375854492, -12.050270080566406, -9.983818054199219, -10.710721015930176, -9.216376304626465, -11.008810043334961, -9.728713989257812, -12.391929626464844, -11.235883712768555, -9.664995193481445, -11.548173904418945, -10.014203071594238]}

File: tests/assets/logits_generation/generate_sft_golden_data.py
Lines changed: 17 additions & 8 deletions
@@ -39,7 +39,7 @@
 from trl import SFTConfig, SFTTrainer
 
 from maxtext.configs import pyconfig
-from maxtext.utils.globals import MAXTEXT_PKG_DIR, MAXTEXT_TEST_ASSETS_ROOT
+from maxtext.utils.globals import MAXTEXT_PKG_DIR, MAXTEXT_TEST_ASSETS_ROOT, MAXTEXT_ASSETS_ROOT
 from tests.integration.sft_trainer_correctness_test import get_maxtext_logits, get_token_log_probs, prepare_maxtext_inputs
 
 
@@ -54,7 +54,7 @@
 def initialize_maxtext_config(config):
   """Initializes configuration for MaxText."""
   cfg_with_ckpt = pyconfig.initialize(
-      [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "sft.yml")],
+      [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "post_train", "sft.yml")],
       run_name="compare_maxtext_with_trl_logits",
       model_name=config.model_name,
       tokenizer_path=config.tokenizer_path,
@@ -70,7 +70,7 @@ def initialize_maxtext_config(config):
   )
 
   cfg_without_ckpt = pyconfig.initialize(
-      [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "sft.yml")],
+      [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "post_train", "sft.yml")],
       run_name="generate_sft_golden_data",
       model_name="default",
       enable_checkpointing=False,
@@ -85,10 +85,10 @@ def initialize_maxtext_config(config):
   return cfg_with_ckpt, cfg_without_ckpt
 
 
-def get_hf_model(tokenizer_path):
+def get_hf_model(model_path):
   """Load model from Hugging Face."""
   return AutoModelForCausalLM.from_pretrained(
-      tokenizer_path,
+      model_path,
       torch_dtype=torch.float32,
   )
 
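The rename makes explicit that the model weights and the tokenizer no longer have to come from the same path: after this change the tokenizer is a local asset while the weights still come from the hub. A sketch of the resulting split, mirroring the new defaults below (the local path is illustrative):

# Sketch: tokenizer from a local asset, weights from the Hugging Face hub.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("src/maxtext/assets/llama2-chat-tokenizer")  # illustrative path
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float32)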

@@ -116,7 +116,7 @@ def setup_sft_trainer(data, hf_model, tokenizer, max_target_length):
       data_collator=None,
       args=SFTConfig(
           dataset_kwargs={"skip_prepare_dataset": True},
-          max_seq_length=max_target_length,
+          max_length=max_target_length,
           **training_args.to_dict(),
       ),
   )
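The `max_seq_length` to `max_length` change tracks a rename in TRL's `SFTConfig`: recent TRL releases accept `max_length` for sequence truncation, while older ones used `max_seq_length`. A minimal sketch of the updated construction (the `output_dir` value is illustrative):

# Sketch: SFTConfig with the renamed truncation argument.
from trl import SFTConfig

args = SFTConfig(
    output_dir="/tmp/sft_golden_data",  # illustrative
    max_length=1024,                    # formerly `max_seq_length` in older TRL releases
    dataset_kwargs={"skip_prepare_dataset": True},
)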
@@ -143,7 +143,7 @@ def prepare_trl_inputs(tokenizer_path, max_target_length):
 
 def get_trl_logits(config, trl_data, max_target_length):
   """Get logits generated by TRL."""
-  hf_model = get_hf_model(config.tokenizer_path)
+  hf_model = get_hf_model(config.hf_model_path)
   tokenizer = get_tokenizer(config.tokenizer_path, max_target_length)
   trl_trainer = setup_sft_trainer(trl_data, hf_model, tokenizer, max_target_length)
   _, trl_outputs = trl_trainer.compute_loss(hf_model, trl_data, return_outputs=True)
@@ -199,7 +199,16 @@ def test_with_trl_and_save_golden_data(config):
 
 parser = argparse.ArgumentParser()
 parser.add_argument("--model-name", type=str, required=False, default="llama2-7b")
-parser.add_argument("--tokenizer-path", type=str, required=False, default="meta-llama/Llama-2-7b-chat-hf")
+
+# Reasons to use the local tokenizer:
+# 1. In transformers==5.2.0 (at least), the Llama-2 tokenizer incorrectly injects an extra space
+#    before the <s> and [INST] tokens when applying its chat template.
+# 2. Consistency with the tokenizer used by sft_trainer_correctness_test.py,
+#    which depends on the golden data generated here.
+parser.add_argument(
+    "--tokenizer-path", type=str, required=False, default=os.path.join(MAXTEXT_ASSETS_ROOT, "llama2-chat-tokenizer")
+)
+parser.add_argument("--hf-model-path", type=str, required=False, default="meta-llama/Llama-2-7b-chat-hf")
 parser.add_argument(
     "--model-ckpt-path",
     type=str,
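The injected-space bug described in the comment is easy to observe directly; a hedged sketch, assuming a local Llama-2 tokenizer copy (the path is illustrative, and the exact output depends on the installed transformers version):

# Sketch: render one user turn and inspect the prefix. A correct Llama-2 render
# begins "<s>[INST]"; the buggy behavior described above yields "<s> [INST]",
# which shifts the golden token IDs (518 vs. 29961 in the record above).
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("src/maxtext/assets/llama2-chat-tokenizer")  # illustrative path
text = tok.apply_chat_template([{"role": "user", "content": "Hello"}], tokenize=False)
print(repr(text[:12]))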
