|
19 | 19 | from transformers import ( |
20 | 20 | AutoConfig, |
21 | 21 | AutoModelForCausalLM, |
| 22 | + AutoTokenizer, |
22 | 23 | GenerationConfig, |
23 | 24 | PreTrainedConfig, |
24 | 25 | PreTrainedModel, |
|
34 | 35 | ) |
35 | 36 | from ..utils import _MONKEY_PATCH_LOCK, internal_gguf |
36 | 37 |
|
37 | | - |
38 | 38 | # Compatibility wrapper for no_init_weights across different transformers versions |
39 | 39 | # transformers >= 5.0.0: from transformers.initialization import no_init_weights |
40 | 40 | # transformers < 5.0.0: from transformers.modeling_utils import no_init_weights |
|
59 | 59 | "normalize_model_id_or_path_for_hf_gguf", |
60 | 60 | "resolve_trust_remote_code", |
61 | 61 | "set_hf_config_dtype", |
| 62 | + "load_hf_tokenizer", |
62 | 63 | "load_tokenizer", |
63 | 64 | ] |
64 | 65 |
|
@@ -875,6 +876,53 @@ def get_expanded_tied_weights_keys(self, all_submodels: bool = False) -> dict: |
875 | 876 | # during init, but newer transformers dropped the default attribute. |
876 | 877 | PreTrainedModel.is_parallelizable = False |
877 | 878 |
|
| 879 | + if not hasattr(PreTrainedModel, "get_head_mask"): |
def get_head_mask(self, head_mask, num_hidden_layers, is_attention_chunked: bool = False):
    """Prepare an attention head mask for legacy decoder implementations.

    transformers 5.x removed this helper from ``PreTrainedModel``, but many
    older ``trust_remote_code`` decoder implementations still call
    ``self.get_head_mask(...)`` from ``forward()``. Keeping the compat shim
    at the base-class level is safer than patching each remote model
    individually because many pre-transformers-5 architectures shared the
    same contract.

    Args:
        head_mask: ``None`` or a tensor. Accepted tensor layouts:
            - 1D ``[num_heads]``: broadcast to every layer.
            - 2D ``[num_hidden_layers, num_heads]``: one mask per layer.
            - 5D: treated as already expanded and passed through unchanged
              (the pre-5.x helper only converted 1D/2D inputs and then
              required a 5D result, so legacy callers may rely on this).
        num_hidden_layers: Number of layers the mask is expanded to.
        is_attention_chunked: When true, append one trailing axis for chunk
            broadcasting.

    Returns:
        ``[None] * num_hidden_layers`` when ``head_mask`` is ``None``;
        otherwise the mask as a 5D tensor (6D when
        ``is_attention_chunked``), cast to ``self.dtype`` when that
        attribute is a ``torch.dtype``.

    Raises:
        ValueError: If ``head_mask`` has a rank other than 1, 2 or 5.
    """
    if head_mask is None:
        # `None` means no masking for any layer.
        return [None] * num_hidden_layers

    rank = head_mask.dim()
    if rank == 1:
        # [num_heads] -> [num_hidden_layers, batch, num_heads, seq, seq]
        head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
        head_mask = head_mask.expand(num_hidden_layers, -1, -1, -1, -1)
    elif rank == 2:
        # [num_hidden_layers, num_heads] -> [num_hidden_layers, batch, num_heads, seq, seq]
        head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
    elif rank != 5:
        # Rank 5 is accepted as-is; every other rank has no defined expansion.
        raise ValueError(
            f"head_mask must have dim 1, 2 or 5, but got shape {tuple(head_mask.shape)}."
        )

    target_dtype = getattr(self, "dtype", None)
    if isinstance(target_dtype, torch.dtype):
        # Match the model compute dtype to avoid dtype promotion or
        # precision mismatches inside legacy attention kernels.
        head_mask = head_mask.to(dtype=target_dtype)

    if is_attention_chunked:
        # Older chunked-attention implementations expect one extra
        # axis for chunk broadcasting.
        head_mask = head_mask.unsqueeze(-1)

    return head_mask
| 923 | + |
| 924 | + PreTrainedModel.get_head_mask = get_head_mask |
| 925 | + |
878 | 926 | if not getattr(PreTrainedModel, "_gptqmodel_missing_all_tied_weights_patch", False): |
879 | 927 | original_getattr = PreTrainedModel.__getattr__ |
880 | 928 |
|
@@ -1338,6 +1386,75 @@ def load_tokenizer(tokenizer_or_path, *, model_config: Any = None, **kwargs): |
1338 | 1386 | return Tokenicer.load(tokenizer_or_path, model_config=model_config, **kwargs) |
1339 | 1387 |
|
1340 | 1388 |
|
def load_hf_tokenizer(
    tokenizer_or_path,
    *,
    model_config: Any = None,
    trust_remote_code: bool = False,
    **kwargs,
):
    """Load a tokenizer via ``AutoTokenizer`` with a legacy remote-code fallback.

    The normal ``AutoTokenizer.from_pretrained`` path is tried first. If it
    fails with an ``AttributeError`` mentioning ``from_pretrained`` while
    ``trust_remote_code`` is enabled, the tokenizer class is resolved from
    ``model_config.auto_map["AutoTokenizer"]`` instead and loaded directly.

    Args:
        tokenizer_or_path: Identifier or path forwarded to ``from_pretrained``
            (stringified for the dynamic-module lookup).
        model_config: Optional config object; only its ``auto_map`` attribute
            is read, and only on the fallback path.
        trust_remote_code: Forwarded to ``from_pretrained``; the fallback is
            attempted only when this is true.
        **kwargs: Forwarded to ``from_pretrained`` and to
            ``get_class_from_dynamic_module``.

    Returns:
        The tokenizer instance returned by ``from_pretrained``.

    Raises:
        AttributeError: The original ``AutoTokenizer`` error, re-raised when
            the fallback does not apply or no usable string class reference
            is found in ``auto_map``.
    """
    import functools

    auto_tokenizer_exc = None
    try:
        # Preferred path: let transformers perform its normal tokenizer
        # resolution. This keeps behavior identical for native tokenizers and
        # for remote-code tokenizers that are still compatible with the
        # installed transformers release.
        return AutoTokenizer.from_pretrained(
            tokenizer_or_path,
            trust_remote_code=trust_remote_code,
            **kwargs,
        )
    except AttributeError as exc:
        # Narrow fallback for legacy trust_remote_code repositories. On
        # transformers 5.x, some old repos no longer resolve to a tokenizer
        # class inside AutoTokenizer and instead fail with
        # `None.from_pretrained(...)`. Only intercept that specific compat
        # break; all other exceptions should propagate unchanged.
        if not trust_remote_code or "from_pretrained" not in str(exc):
            raise
        auto_tokenizer_exc = exc

    auto_map = getattr(model_config, "auto_map", None) or {}
    # Old repositories often still expose an authoritative dynamic tokenizer
    # reference in `config.auto_map`, even when the higher-level
    # AutoTokenizer registry no longer reaches it.
    class_ref = auto_map.get("AutoTokenizer")
    if isinstance(class_ref, (list, tuple)):
        # HF stores tokenizer refs as [slow, fast]. Prefer the fast tokenizer
        # when present, otherwise use the slow one. An empty sequence yields
        # None so the original error is re-raised below instead of an
        # unrelated IndexError.
        fast_ref = class_ref[1] if len(class_ref) > 1 else None
        slow_ref = class_ref[0] if class_ref else None
        class_ref = fast_ref if fast_ref is not None else slow_ref

    if not isinstance(class_ref, str):
        raise auto_tokenizer_exc

    from transformers.dynamic_module_utils import get_class_from_dynamic_module

    tokenizer_cls = get_class_from_dynamic_module(class_ref, str(tokenizer_or_path), **kwargs)
    original_init = getattr(tokenizer_cls, "__init__", None)
    # Look only at the class's own namespace: a marker inherited from an
    # already-patched base class must not stop a subclass that defines its own
    # `__init__` from being wrapped.
    already_patched = vars(tokenizer_cls).get("_gptqmodel_legacy_init_compat", False)
    if callable(original_init) and not already_patched:
        # `wraps` keeps the real constructor's name, docstring and signature
        # visible to introspection.
        @functools.wraps(original_init)
        def patched_init(self, *init_args, **init_kwargs):
            # Some legacy tokenizers assign `bos/eos/pad/..._token_id` before
            # they call `PreTrainedTokenizer.__init__()`. In transformers 5.x
            # those assignments now go through base-class attribute handling,
            # which expects `_special_tokens_map` to already exist. Creating
            # the storage eagerly preserves the old constructor order without
            # modifying the upstream repository code.
            if not hasattr(self, "_special_tokens_map"):
                object.__setattr__(self, "_special_tokens_map", {})
            return original_init(self, *init_args, **init_kwargs)

        tokenizer_cls.__init__ = patched_init
        # Avoid wrapping the same dynamically imported class multiple times in
        # a long-running process.
        tokenizer_cls._gptqmodel_legacy_init_compat = True
    tokenizer_cls.register_for_auto_class()
    return tokenizer_cls.from_pretrained(
        tokenizer_or_path,
        trust_remote_code=trust_remote_code,
        **kwargs,
    )
| 1456 | + |
| 1457 | + |
1341 | 1458 |
|
# Invoke the transformers compatibility patcher, whose definition is outside this excerpt.
# NOTE(review): presumably this runs at import time and installs the compat shims — confirm against the full file.
1342 | 1459 | _patch_transformers_remote_code_compat() |
1343 | 1460 |
|
|
0 commit comments