66import json
77import numpy as np
88import os
9+ import shutil
910import sys
1011import transformers
1112import warnings
1920 GenerationConfig ,
2021 PreTrainedModel ,
2122)
23+ from transformers .models .auto .tokenization_auto import get_tokenizer_config , tokenizer_class_from_name
2224from typing import Any , Optional
2325
2426import inspect
7779_FALSEY_ENV_VALUES = {"" , "0" , "false" , "off" , "no" }
7880
7981
def _sync_local_remote_code_cache(model_id_or_path: Optional[str]) -> None:
    """Backfill missing top-level ``*.py`` files into the dynamic-module cache.

    For a *local* model directory, copy every top-level Python file that is
    absent from the transformers dynamic-module cache directory for that repo
    (and from each of its immediate sub-directories). Files already present
    in the cache are never overwritten.

    The sync is strictly best-effort: it is a no-op when the path is empty or
    not a directory, when the transformers helpers cannot be imported, or when
    no cache directory exists yet, and filesystem errors while scanning or
    copying are swallowed so that a read-only or concurrently modified cache
    cannot turn this workaround into a new failure during model loading.

    Args:
        model_id_or_path: Local model directory. Hub ids, ``None`` and empty
            strings are ignored.
    """
    if not model_id_or_path or not os.path.isdir(model_id_or_path):
        return

    try:
        from transformers.dynamic_module_utils import HF_MODULES_CACHE, _sanitize_module_name
    except Exception:
        # `_sanitize_module_name` is a private helper and may be missing on
        # some transformers versions; without it the cache path is unknown.
        return

    repo_name = os.path.basename(os.path.normpath(model_id_or_path))
    cache_root = os.path.join(
        HF_MODULES_CACHE,
        "transformers_modules",
        _sanitize_module_name(repo_name),
    )
    # `trust_remote_code=True` on a local model path still goes through the
    # transformers dynamic-module cache. On newer transformers releases we
    # occasionally observe split/incomplete cache revisions for the same local
    # repo: one revision contains `configuration_*.py`, another contains
    # `modeling_*.py`, and recursive relative-import discovery then crashes
    # with FileNotFoundError before the model class can be imported.
    #
    # The model directory already contains the authoritative source files, so
    # for local paths we can safely backfill any missing top-level Python files
    # into each cached revision directory. This keeps the fix narrowly scoped:
    # it only affects local trust-remote-code models, only fills missing files,
    # and never overwrites files that transformers has already materialized.
    if not os.path.isdir(cache_root):
        return

    try:
        source_files = [
            file_name
            for file_name in os.listdir(model_id_or_path)
            if file_name.endswith(".py") and os.path.isfile(os.path.join(model_id_or_path, file_name))
        ]
        if not source_files:
            return

        # The cache root itself plus every immediate sub-directory of it.
        target_dirs = [cache_root]
        for entry in os.listdir(cache_root):
            candidate_dir = os.path.join(cache_root, entry)
            if os.path.isdir(candidate_dir):
                target_dirs.append(candidate_dir)
    except OSError:
        # Either directory became unreadable or vanished between the isdir
        # checks and the scan; skip the backfill instead of failing the load.
        return

    for target_dir in target_dirs:
        for file_name in source_files:
            target_path = os.path.join(target_dir, file_name)
            if os.path.exists(target_path):
                continue
            try:
                # `target_dir` was just observed to exist and `file_name` is a
                # bare basename, so no parent directory needs to be created.
                shutil.copy2(os.path.join(model_id_or_path, file_name), target_path)
            except OSError:
                # Read-only cache, a directory removed by another process, or
                # a racing writer: leave this entry alone and keep going.
                continue
133+
134+
80135def get_hf_config_dtype (config : Any ) -> Optional [torch .dtype ]:
81136 dtype = getattr (config , "dtype" , None )
82137 if dtype is None :
@@ -1151,6 +1206,7 @@ def prepare_remote_model_init_compat(model_id_or_path: Optional[str], config: An
11511206 return
11521207
11531208 deci_init_compat (config )
1209+ _sync_local_remote_code_cache (model_id_or_path )
11541210
11551211 auto_map = getattr (config , "auto_map" , None ) or {}
11561212 class_ref = auto_map .get ("AutoModelForCausalLM" )
@@ -1470,6 +1526,19 @@ def load_hf_tokenizer(
14701526 trust_remote_code = trust_remote_code ,
14711527 ** kwargs ,
14721528 )
1529+ except ValueError as exc :
1530+ # Transformers 5.x can incorrectly route some legacy/local repos to the
1531+ # generic tokenizers backend, which then fails before consulting the
1532+ # declared tokenizer class from tokenizer_config.json.
1533+ #
1534+ # In that failure mode the repo may still have a complete
1535+ # `vocab.json`/`merges.txt` pair, but `AutoTokenizer` never reaches the
1536+ # model-specific tokenizer class that knows how to map those files to
1537+ # `vocab_file` / `merges_file`. The issue is therefore dispatch, not
1538+ # missing tokenizer assets.
1539+ if "Couldn't instantiate the backend tokenizer" not in str (exc ):
1540+ raise
1541+ auto_tokenizer_exc = exc
14731542 except AttributeError as exc :
14741543 # Narrow fallback for legacy trust_remote_code repositories. On
14751544 # transformers 5.x, some old repos no longer resolve to a tokenizer
@@ -1480,6 +1549,26 @@ def load_hf_tokenizer(
14801549 raise
14811550 auto_tokenizer_exc = exc
14821551
1552+ tokenizer_config = get_tokenizer_config (
1553+ tokenizer_or_path ,
1554+ trust_remote_code = trust_remote_code ,
1555+ ** kwargs ,
1556+ )
1557+ tokenizer_class_name = tokenizer_config .get ("tokenizer_class" )
1558+ if isinstance (tokenizer_class_name , str ):
1559+ # `tokenizer_config.json` is the most direct source of truth once the
1560+ # generic auto-dispatch path has proven unreliable. Resolving the class
1561+ # name explicitly lets us instantiate the tokenizer implementation that
1562+ # ships with transformers (for example `Qwen2Tokenizer`) so it can load
1563+ # its expected files from the repo in the normal way.
1564+ tokenizer_cls = tokenizer_class_from_name (tokenizer_class_name )
1565+ if tokenizer_cls is not None :
1566+ return tokenizer_cls .from_pretrained (
1567+ tokenizer_or_path ,
1568+ trust_remote_code = trust_remote_code ,
1569+ ** kwargs ,
1570+ )
1571+
14831572 auto_map = getattr (model_config , "auto_map" , None ) or {}
14841573 # Old repositories often still expose an authoritative dynamic tokenizer
14851574 # reference in `config.auto_map`, even when the higher-level
0 commit comments