Skip to content

Commit 1a50635

Browse files
Fix InternVL tokenizer compat on transformers 5 (#2892)
1 parent 6ca6036 commit 1a50635

1 file changed

Lines changed: 89 additions & 0 deletions

File tree

gptqmodel/utils/hf.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import json
77
import numpy as np
88
import os
9+
import shutil
910
import sys
1011
import transformers
1112
import warnings
@@ -19,6 +20,7 @@
1920
GenerationConfig,
2021
PreTrainedModel,
2122
)
23+
from transformers.models.auto.tokenization_auto import get_tokenizer_config, tokenizer_class_from_name
2224
from typing import Any, Optional
2325

2426
import inspect
@@ -77,6 +79,59 @@
7779
_FALSEY_ENV_VALUES = {"", "0", "false", "off", "no"}
7880

7981

82+
def _sync_local_remote_code_cache(model_id_or_path: Optional[str]) -> None:
83+
if not model_id_or_path or not os.path.isdir(model_id_or_path):
84+
return
85+
86+
try:
87+
from transformers.dynamic_module_utils import HF_MODULES_CACHE, _sanitize_module_name
88+
except Exception:
89+
return
90+
91+
repo_name = os.path.basename(os.path.normpath(model_id_or_path))
92+
cache_root = os.path.join(
93+
HF_MODULES_CACHE,
94+
"transformers_modules",
95+
_sanitize_module_name(repo_name),
96+
)
97+
# `trust_remote_code=True` on a local model path still goes through the
98+
# transformers dynamic-module cache. On newer transformers releases we
99+
# occasionally observe split/incomplete cache revisions for the same local
100+
# repo: one revision contains `configuration_*.py`, another contains
101+
# `modeling_*.py`, and recursive relative-import discovery then crashes
102+
# with FileNotFoundError before the model class can be imported.
103+
#
104+
# The model directory already contains the authoritative source files, so
105+
# for local paths we can safely backfill any missing top-level Python files
106+
# into each cached revision directory. This keeps the fix narrowly scoped:
107+
# it only affects local trust-remote-code models, only fills missing files,
108+
# and never overwrites files that transformers has already materialized.
109+
if not os.path.isdir(cache_root):
110+
return
111+
112+
source_files = [
113+
file_name
114+
for file_name in os.listdir(model_id_or_path)
115+
if file_name.endswith(".py") and os.path.isfile(os.path.join(model_id_or_path, file_name))
116+
]
117+
if not source_files:
118+
return
119+
120+
target_dirs = [cache_root]
121+
for entry in os.listdir(cache_root):
122+
target_dir = os.path.join(cache_root, entry)
123+
if os.path.isdir(target_dir):
124+
target_dirs.append(target_dir)
125+
126+
for target_dir in target_dirs:
127+
for file_name in source_files:
128+
source_path = os.path.join(model_id_or_path, file_name)
129+
target_path = os.path.join(target_dir, file_name)
130+
if not os.path.exists(target_path):
131+
os.makedirs(os.path.dirname(target_path), exist_ok=True)
132+
shutil.copy2(source_path, target_path)
133+
134+
80135
def get_hf_config_dtype(config: Any) -> Optional[torch.dtype]:
81136
dtype = getattr(config, "dtype", None)
82137
if dtype is None:
@@ -1151,6 +1206,7 @@ def prepare_remote_model_init_compat(model_id_or_path: Optional[str], config: An
11511206
return
11521207

11531208
deci_init_compat(config)
1209+
_sync_local_remote_code_cache(model_id_or_path)
11541210

11551211
auto_map = getattr(config, "auto_map", None) or {}
11561212
class_ref = auto_map.get("AutoModelForCausalLM")
@@ -1470,6 +1526,19 @@ def load_hf_tokenizer(
14701526
trust_remote_code=trust_remote_code,
14711527
**kwargs,
14721528
)
1529+
except ValueError as exc:
1530+
# Transformers 5.x can incorrectly route some legacy/local repos to the
1531+
# generic tokenizers backend, which then fails before consulting the
1532+
# declared tokenizer class from tokenizer_config.json.
1533+
#
1534+
# In that failure mode the repo may still have a complete
1535+
# `vocab.json`/`merges.txt` pair, but `AutoTokenizer` never reaches the
1536+
# model-specific tokenizer class that knows how to map those files to
1537+
# `vocab_file` / `merges_file`. The issue is therefore dispatch, not
1538+
# missing tokenizer assets.
1539+
if "Couldn't instantiate the backend tokenizer" not in str(exc):
1540+
raise
1541+
auto_tokenizer_exc = exc
14731542
except AttributeError as exc:
14741543
# Narrow fallback for legacy trust_remote_code repositories. On
14751544
# transformers 5.x, some old repos no longer resolve to a tokenizer
@@ -1480,6 +1549,26 @@ def load_hf_tokenizer(
14801549
raise
14811550
auto_tokenizer_exc = exc
14821551

1552+
tokenizer_config = get_tokenizer_config(
1553+
tokenizer_or_path,
1554+
trust_remote_code=trust_remote_code,
1555+
**kwargs,
1556+
)
1557+
tokenizer_class_name = tokenizer_config.get("tokenizer_class")
1558+
if isinstance(tokenizer_class_name, str):
1559+
# `tokenizer_config.json` is the most direct source of truth once the
1560+
# generic auto-dispatch path has proven unreliable. Resolving the class
1561+
# name explicitly lets us instantiate the tokenizer implementation that
1562+
# ships with transformers (for example `Qwen2Tokenizer`) so it can load
1563+
# its expected files from the repo in the normal way.
1564+
tokenizer_cls = tokenizer_class_from_name(tokenizer_class_name)
1565+
if tokenizer_cls is not None:
1566+
return tokenizer_cls.from_pretrained(
1567+
tokenizer_or_path,
1568+
trust_remote_code=trust_remote_code,
1569+
**kwargs,
1570+
)
1571+
14831572
auto_map = getattr(model_config, "auto_map", None) or {}
14841573
# Old repositories often still expose an authoritative dynamic tokenizer
14851574
# reference in `config.auto_map`, even when the higher-level

0 commit comments

Comments
 (0)