fix(tokenizer): fall back to direct fast-tokenizer load when model config build fails

hallerite · claude · hallerite · commit 30655d6e7b3a · 2026-05-27T22:02:06.000Z
`AutoTokenizer.from_pretrained` eagerly constructs the *model* config to
resolve the tokenizer class — even for a plain `PreTrainedTokenizerFast`.
That construction runs HF's RoPE validator, which rejects configs carrying
nested `rope_parameters` (e.g. poolside/Laguna-XS.2: `full_attention` /
`sliding_attention` blocks with no top-level `rope_theta`) when the config
is built outside vLLM's `patch_rope_parameters`. The resulting `KeyError`
escapes (AutoTokenizer only catches `ValueError`/`OSError`) and kills the
tokenizer load — a modeling-only concern breaking something the tokenizer
never needed.

`renderers` needs the tokenizer, not the model. When `AutoTokenizer` fails
while building the config, fall back to loading the repo's self-contained
`tokenizer.json` directly via `PreTrainedTokenizerFast`, which never touches
the model config. The fallback runs under the fastokens patch, so models
like Laguna keep the Rust fast-path speedup. Custom `auto_map` tokenizers
and repos without a fast tokenizer are left to surface the original error.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/renderers/base.py b/renderers/base.py
@@ -1089,7 +1089,6 @@ def _patched_load(model_name_or_path: str, **kwargs):
     path is still discoverable in logs.
     """
     import fastokens
-    from transformers import AutoTokenizer
 
     global _FASTOKENS_ANNOUNCED
 
@@ -1102,13 +1101,72 @@ def _patched_load(model_name_or_path: str, **kwargs):
             )
             _FASTOKENS_ANNOUNCED = True
     try:
-        return AutoTokenizer.from_pretrained(model_name_or_path, **kwargs)
+        return _load_tokenizer_via_auto(model_name_or_path, **kwargs)
     finally:
         with _FASTOKENS_PATCH_LOCK:
             with contextlib.redirect_stdout(io.StringIO()):
                 fastokens.unpatch_transformers()
 
 
+def _load_fast_tokenizer_directly(
+    model_name_or_path: str, revision: str | None
+) -> Any | None:
+    """Best-effort load of a fast tokenizer that skips the model config.
+
+    ``AutoTokenizer.from_pretrained`` builds the *model* config to pick the
+    tokenizer class, which can raise on modeling-only validation (e.g. nested
+    ``rope_parameters``); ``PreTrainedTokenizerFast`` is loaded directly instead.
+
+    Returns ``None`` if the tokenizer config declares ``auto_map`` (custom
+    tokenizer, left to ``AutoTokenizer``) or if any step raises (logged at DEBUG).
+    """
+    from transformers import PreTrainedTokenizerFast
+    from transformers.models.auto.tokenization_auto import get_tokenizer_config
+
+    try:
+        if "auto_map" in get_tokenizer_config(model_name_or_path, revision=revision):
+            return None
+        return PreTrainedTokenizerFast.from_pretrained(
+            model_name_or_path, revision=revision
+        )
+    except Exception as exc:  # best-effort: the caller re-raises its own error
+        logger.debug(
+            "Direct fast-tokenizer load of %r failed (%s: %s).",
+            model_name_or_path,
+            type(exc).__name__,
+            str(exc)[:160],
+        )
+        return None
+
+
+def _load_tokenizer_via_auto(model_name_or_path: str, **kwargs) -> Any:
+    """Load via ``AutoTokenizer``, retrying without the model config on failure.
+
+    ``AutoTokenizer.from_pretrained`` receives every keyword argument. If it
+    raises any ``Exception``, ``_load_fast_tokenizer_directly`` is tried once
+    with only the ``revision`` keyword forwarded. A tokenizer obtained that
+    way is returned after a DEBUG log; if the helper yields ``None``, the
+    ``AutoTokenizer`` exception is re-raised unchanged.
+    """
+    from transformers import AutoTokenizer
+
+    try:
+        return AutoTokenizer.from_pretrained(model_name_or_path, **kwargs)
+    except Exception as auto_error:
+        rev = kwargs.get("revision")
+        fallback = _load_fast_tokenizer_directly(model_name_or_path, revision=rev)
+        if fallback is not None:
+            logger.debug(
+                "AutoTokenizer.from_pretrained(%r) failed building the model config "
+                "(%s: %s); loaded the tokenizer directly from tokenizer.json.",
+                model_name_or_path,
+                type(auto_error).__name__,
+                str(auto_error)[:160],
+            )
+            return fallback
+        raise  # no usable fallback: surface the AutoTokenizer error
+
+
 def load_tokenizer(
     model_name_or_path: str,
     *,
@@ -1138,9 +1196,14 @@ def load_tokenizer(
     fastokens raises during the patched load (e.g. an unknown
     pre-tokenizer type), we automatically retry with the vanilla
     backend and emit an INFO log.
-    """
-    from transformers import AutoTokenizer
 
+    ``AutoTokenizer.from_pretrained`` eagerly builds the model config to
+    resolve the tokenizer class. If that construction raises on a
+    modeling-only concern the tokenizer doesn't need (e.g. RoPE
+    validation for configs with nested ``rope_parameters``), we fall
+    back to loading the repo's self-contained ``tokenizer.json``
+    directly — see ``_load_tokenizer_via_auto``.
+    """
     kwargs: dict[str, Any] = {}
     revision = TRUSTED_REVISIONS.get(model_name_or_path)
     if revision is not None:
@@ -1149,7 +1212,7 @@ def load_tokenizer(
         kwargs = {"trust_remote_code": False}
 
     if not use_fastokens or model_name_or_path in FASTOKENS_INCOMPATIBLE:
-        return AutoTokenizer.from_pretrained(model_name_or_path, **kwargs)
+        return _load_tokenizer_via_auto(model_name_or_path, **kwargs)
 
     try:
         return _patched_load(model_name_or_path, **kwargs)
@@ -1162,7 +1225,7 @@ def load_tokenizer(
             type(exc).__name__,
             str(exc)[:160],
         )
-        return AutoTokenizer.from_pretrained(model_name_or_path, **kwargs)
+        return _load_tokenizer_via_auto(model_name_or_path, **kwargs)
 
 
 def _populate_registry():