3737import concurrent .futures
3838import hashlib
3939import os
40+ import re
4041import shutil
4142import tempfile
4243import time
@@ -209,6 +210,20 @@ def _supports_chatml_heuristic(tokenizer: PreTrainedTokenizerBase) -> bool:
209210 return False
210211
211212
# Same pattern transformers applies when deciding whether a chat template
# can produce an assistant-tokens mask via {% generation %} blocks
# (see transformers/utils/chat_template_utils.py).
_GENERATION_KEYWORD_RE = re.compile(r"\{\%-?\s*generation\s*-?\%\}")


def _chat_template_has_generation(tokenizer: PreTrainedTokenizerBase) -> bool:
    """Return True if the tokenizer's chat template contains a ``{% generation %}`` tag."""
    template = getattr(tokenizer, "chat_template", None)
    # Tokenizers may carry several named templates in a dict; only the
    # "default" one is consulted here.
    if isinstance(template, dict):
        template = template.get("default")
    if not isinstance(template, str):
        return False
    return _GENERATION_KEYWORD_RE.search(template) is not None
225+
226+
def _encode_role(tokenizer: PreTrainedTokenizerBase, role: str) -> list[int]:
    """Tokenize a role string into its raw token ids.

    Special tokens are deliberately suppressed so the result contains only
    the tokens for the role text itself.
    """
    role_token_ids = tokenizer.encode(role, add_special_tokens=False)
    return role_token_ids
@@ -295,7 +310,19 @@ def make_chat_tokenize_fn(
295310 Tested model families (ChatML format): Qwen2, Qwen2.5, Qwen3, Qwen3.5, Nemotron 3.
296311 """
297312 _check_model_family (tokenizer )
298- _heuristic_checked = {"done" : False }
313+ use_heuristic = not _chat_template_has_generation (tokenizer )
314+ if use_heuristic :
315+ if not _supports_chatml_heuristic (tokenizer ):
316+ model_name = getattr (tokenizer , "name_or_path" , "unknown" )
317+ raise ValueError (
318+ f"Chat template for '{ model_name } ' does not support "
319+ f"{{% generation %}} and does not use ChatML format. "
320+ f"Use make_pretrain_tokenize_fn instead."
321+ )
322+ warn_rank_0 (
323+ "Chat template lacks {% generation %} support. "
324+ "Using heuristic ChatML-based assistant masking."
325+ )
299326
300327 def tokenize (sample ):
301328 messages = sample .get (chat_key )
@@ -308,15 +335,25 @@ def tokenize(sample):
308335 }
309336
310337 try :
311- result = tokenizer .apply_chat_template (
312- messages ,
313- tokenize = True ,
314- return_dict = True ,
315- return_assistant_tokens_mask = True ,
316- padding = "max_length" ,
317- truncation = True ,
318- max_length = max_length ,
319- )
338+ if use_heuristic :
339+ result = tokenizer .apply_chat_template (
340+ messages ,
341+ tokenize = True ,
342+ return_dict = True ,
343+ padding = "max_length" ,
344+ truncation = True ,
345+ max_length = max_length ,
346+ )
347+ else :
348+ result = tokenizer .apply_chat_template (
349+ messages ,
350+ tokenize = True ,
351+ return_dict = True ,
352+ return_assistant_tokens_mask = True ,
353+ padding = "max_length" ,
354+ truncation = True ,
355+ max_length = max_length ,
356+ )
320357 except (ValueError , TypeError , KeyError ) as e :
321358 print_rank_0 (f"WARNING: Failed to tokenize sample: { e } . Skipping." )
322359 pad_id = tokenizer .pad_token_id or tokenizer .eos_token_id or 0
@@ -327,25 +364,10 @@ def tokenize(sample):
327364 }
328365
329366 input_ids = result ["input_ids" ]
330- assistant_masks = result ["assistant_masks" ]
331-
332- # Fallback: if native masks are all zeros, use heuristic ChatML masking
333- if any (m == "assistant" for m in (msg .get ("role" ) for msg in messages )):
334- if not any (assistant_masks ):
335- if not _heuristic_checked ["done" ]:
336- _heuristic_checked ["done" ] = True
337- if not _supports_chatml_heuristic (tokenizer ):
338- model_name = getattr (tokenizer , "name_or_path" , "unknown" )
339- raise ValueError (
340- f"Chat template for '{ model_name } ' does not support "
341- f"{{% generation %}} and does not use ChatML format. "
342- f"Use make_pretrain_tokenize_fn instead."
343- )
344- print_rank_0 (
345- "WARNING: Chat template lacks {% generation %} support. "
346- "Using heuristic ChatML-based assistant masking."
347- )
348- assistant_masks = _chatml_assistant_mask (input_ids , tokenizer )
367+ if use_heuristic :
368+ assistant_masks = _chatml_assistant_mask (input_ids , tokenizer )
369+ else :
370+ assistant_masks = result ["assistant_masks" ]
349371
350372 labels = [tid if mask else IGNORE_TOKEN_ID for tid , mask in zip (input_ids , assistant_masks )]
351373 return {
0 commit comments