Skip to content

Commit 18e7fd7

Browse files
itazapvasqu
authored and committed
change got reverted (#45680)
1 parent 2dff8f7 commit 18e7fd7

2 files changed

Lines changed: 46 additions & 7 deletions

File tree

src/transformers/models/auto/tokenization_auto.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -716,13 +716,16 @@ def from_pretrained(
716716
and (TOKENIZER_MAPPING_NAMES.get(config_model_type).removesuffix("Fast"))
717717
!= (tokenizer_config_class.removesuffix("Fast"))
718718
):
719-
tokenizer_class = tokenizer_class_from_name(tokenizer_config_class)
720-
if tokenizer_class is not None and tokenizer_class.__name__ not in (
721-
"TokenizersBackend",
722-
"PythonBackend",
723-
"PreTrainedTokenizerFast",
724-
):
725-
return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
719+
registered_class_name = TOKENIZER_MAPPING_NAMES.get(config_model_type).removesuffix("Fast")
720+
if registered_class_name not in ("TokenizersBackend", "PythonBackend", "PreTrainedTokenizerFast"):
721+
# The auto-mapping has a real class but the Hub specifies a different specialized class so trust the Hub's class.
722+
tokenizer_class = tokenizer_class_from_name(tokenizer_config_class)
723+
if tokenizer_class is not None and tokenizer_class.__name__ not in (
724+
"TokenizersBackend",
725+
"PythonBackend",
726+
"PreTrainedTokenizerFast",
727+
):
728+
return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
726729

727730
if TokenizersBackend is not None:
728731
return TokenizersBackend.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

tests/models/auto/test_tokenization_auto.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -765,3 +765,39 @@ def test_mismatched_model_type_uses_config_tokenizer_class_without_sentencepiece
765765
revision="f8d333a098d19b4fd9a8b18f94170487ad3f821d",
766766
)
767767
self.assertEqual(tokenizer.__class__.__name__, "NllbTokenizer")
768+
769+
@slow
770+
@require_tokenizers
771+
def test_deepseek_r1_tokenizer_preserves_spaces(self):
772+
"""Regression: deepseek_v3 Hub config has wrong tokenizer_class='LlamaTokenizerFast'; must use TokenizersBackend."""
773+
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
774+
self.assertIsInstance(tokenizer, TokenizersBackend)
775+
text = "hello world"
776+
self.assertEqual(tokenizer.decode(tokenizer.encode(text)), text)
777+
778+
@require_tokenizers
779+
@require_sentencepiece
780+
def test_specialized_hub_tokenizer_class_overrides_mismatched_auto_mapping(self):
781+
"""Hub's tokenizer_class wins when the auto-mapping has a different real class (e.g. m2m_100 → NllbTokenizer)."""
782+
from transformers import NllbTokenizer
783+
784+
fake_config = mock.MagicMock()
785+
fake_config.model_type = "m2m_100"
786+
mock_tokenizer = mock.MagicMock(spec=NllbTokenizer)
787+
788+
with (
789+
mock.patch(
790+
"transformers.models.auto.tokenization_auto.AutoConfig.from_pretrained",
791+
return_value=fake_config,
792+
),
793+
mock.patch(
794+
"transformers.models.auto.tokenization_auto.get_tokenizer_config",
795+
return_value={"tokenizer_class": "NllbTokenizer"},
796+
),
797+
mock.patch.object(NllbTokenizer, "from_pretrained", return_value=mock_tokenizer) as mock_nllb,
798+
mock.patch.object(TokenizersBackend, "from_pretrained") as mock_tb,
799+
):
800+
result = AutoTokenizer.from_pretrained("fake/nllb-model")
801+
mock_nllb.assert_called_once()
802+
mock_tb.assert_not_called()
803+
self.assertIs(result, mock_tokenizer)

0 commit comments

Comments (0)