Skip to content

Commit 86b808c

Browse files
committed
Merge remote-tracking branch 'origin/main' into optional-transformers
# Conflicts: # renderers/base.py
2 parents 5062105 + 89ab3f0 commit 86b808c

1 file changed

Lines changed: 70 additions & 6 deletions

File tree

renderers/base.py

Lines changed: 70 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1187,7 +1187,6 @@ def _patched_load(model_name_or_path: str, **kwargs):
11871187
path is still discoverable in logs.
11881188
"""
11891189
import fastokens
1190-
from transformers import AutoTokenizer
11911190

11921191
global _FASTOKENS_ANNOUNCED
11931192

@@ -1200,13 +1199,72 @@ def _patched_load(model_name_or_path: str, **kwargs):
12001199
)
12011200
_FASTOKENS_ANNOUNCED = True
12021201
try:
1203-
return AutoTokenizer.from_pretrained(model_name_or_path, **kwargs)
1202+
return _load_tokenizer_via_auto(model_name_or_path, **kwargs)
12041203
finally:
12051204
with _FASTOKENS_PATCH_LOCK:
12061205
with contextlib.redirect_stdout(io.StringIO()):
12071206
fastokens.unpatch_transformers()
12081207

12091208

1209+
def _load_fast_tokenizer_directly(
    model_name_or_path: str, revision: str | None
) -> Any | None:
    """Try to load a plain fast tokenizer without going through ``AutoTokenizer``.

    ``AutoTokenizer.from_pretrained`` builds the *model* config in order to
    resolve the tokenizer class, and that step can raise on modeling-only
    concerns the tokenizer never needs (e.g. RoPE parameter validation for
    configs that carry nested ``rope_parameters``). This helper skips that
    detour by instantiating ``PreTrainedTokenizerFast`` straight from the
    repo's tokenizer files.

    Args:
        model_name_or_path: Repo id or local path handed to ``transformers``.
        revision: Revision to load, or ``None``. This is the only loading
            option forwarded to ``transformers`` by this helper.

    Returns:
        The loaded ``PreTrainedTokenizerFast``, or ``None`` when the direct
        load is declined: either the tokenizer config declares an
        ``auto_map`` (a custom tokenizer that must go through
        ``AutoTokenizer``), or any step of the direct load raised. Returning
        ``None`` lets the caller surface its own original error instead.

    This function never raises for load failures; the reason for declining is
    recorded at DEBUG level so a failed fallback is still diagnosable.
    """
    from transformers import PreTrainedTokenizerFast
    from transformers.models.auto.tokenization_auto import get_tokenizer_config

    try:
        tokenizer_config = get_tokenizer_config(model_name_or_path, revision=revision)
        if "auto_map" in tokenizer_config:
            # Custom tokenizer code is declared; a generic fast tokenizer
            # would not be a faithful substitute, so decline.
            logger.debug(
                "Not loading the tokenizer for %r directly: its tokenizer "
                "config declares an auto_map.",
                model_name_or_path,
            )
            return None
        return PreTrainedTokenizerFast.from_pretrained(
            model_name_or_path, revision=revision
        )
    except Exception as exc:
        # Deliberately best-effort: the caller re-raises its original error
        # when we return None. Log why we declined rather than dropping the
        # reason on the floor.
        logger.debug(
            "Direct fast-tokenizer load for %r failed (%s: %s).",
            model_name_or_path,
            type(exc).__name__,
            str(exc)[:160],
        )
        return None
1238+
1239+
1240+
def _load_tokenizer_via_auto(model_name_or_path: str, **kwargs) -> Any:
    """Load a tokenizer with ``AutoTokenizer``, falling back to a direct load.

    The first attempt is ``AutoTokenizer.from_pretrained(model_name_or_path,
    **kwargs)``. If that raises any ``Exception``, the tokenizer is requested
    from ``_load_fast_tokenizer_directly`` instead; only the ``revision``
    entry of ``kwargs`` (``None`` when absent) is passed along to it.

    Returns:
        The tokenizer from ``AutoTokenizer`` when that succeeds, otherwise
        the directly loaded tokenizer (a DEBUG record notes the substitution
        and a truncated form of the original error).

    Raises:
        Exception: the original ``AutoTokenizer`` error, re-raised unchanged,
            when the direct load yields ``None``.
    """
    from transformers import AutoTokenizer

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, **kwargs)
    except Exception as auto_error:
        pinned_revision = kwargs.get("revision")
        fallback = _load_fast_tokenizer_directly(
            model_name_or_path, revision=pinned_revision
        )
        if fallback is not None:
            logger.debug(
                "AutoTokenizer.from_pretrained(%r) failed building the model config "
                "(%s: %s); loaded the tokenizer directly from tokenizer.json.",
                model_name_or_path,
                type(auto_error).__name__,
                str(auto_error)[:160],
            )
            return fallback
        # No usable direct tokenizer: propagate the AutoTokenizer failure.
        raise
    else:
        return tokenizer
1266+
1267+
12101268
def load_tokenizer(
12111269
model_name_or_path: str,
12121270
*,
@@ -1237,11 +1295,17 @@ def load_tokenizer(
12371295
pre-tokenizer type), we automatically retry with the vanilla
12381296
backend and emit an INFO log.
12391297
1298+
``AutoTokenizer.from_pretrained`` eagerly builds the model config to
1299+
resolve the tokenizer class. If that construction raises on a
1300+
modeling-only concern the tokenizer doesn't need (e.g. RoPE
1301+
validation for configs with nested ``rope_parameters``), we fall
1302+
back to loading the repo's self-contained ``tokenizer.json``
1303+
directly — see ``_load_tokenizer_via_auto``.
1304+
12401305
Requires the optional ``transformers`` extra; raises a clear
12411306
``ImportError`` with install instructions if it's missing.
12421307
"""
1243-
AutoTokenizer = _require_transformers().AutoTokenizer
1244-
1308+
_require_transformers()
12451309
kwargs: dict[str, Any] = {}
12461310
revision = TRUSTED_REVISIONS.get(model_name_or_path)
12471311
if revision is not None:
@@ -1250,7 +1314,7 @@ def load_tokenizer(
12501314
kwargs = {"trust_remote_code": False}
12511315

12521316
if not use_fastokens or model_name_or_path in FASTOKENS_INCOMPATIBLE:
1253-
return AutoTokenizer.from_pretrained(model_name_or_path, **kwargs)
1317+
return _load_tokenizer_via_auto(model_name_or_path, **kwargs)
12541318

12551319
try:
12561320
return _patched_load(model_name_or_path, **kwargs)
@@ -1263,7 +1327,7 @@ def load_tokenizer(
12631327
type(exc).__name__,
12641328
str(exc)[:160],
12651329
)
1266-
return AutoTokenizer.from_pretrained(model_name_or_path, **kwargs)
1330+
return _load_tokenizer_via_auto(model_name_or_path, **kwargs)
12671331

12681332

12691333
def _populate_registry():

0 commit comments

Comments
 (0)