@@ -765,3 +765,39 @@ def test_mismatched_model_type_uses_config_tokenizer_class_without_sentencepiece
765765 revision = "f8d333a098d19b4fd9a8b18f94170487ad3f821d" ,
766766 )
767767 self .assertEqual (tokenizer .__class__ .__name__ , "NllbTokenizer" )
768+
@slow
@require_tokenizers
def test_deepseek_r1_tokenizer_preserves_spaces(self):
    """Regression: deepseek_v3 Hub config has wrong tokenizer_class='LlamaTokenizerFast'; must use TokenizersBackend.

    Loads the DeepSeek-R1 tokenizer through AutoTokenizer, checks the resolved
    class, and verifies that an encode/decode round trip keeps the space
    between two words.
    """
    tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
    self.assertIsInstance(tokenizer, TokenizersBackend)

    text = "hello world"
    # Encode without special tokens: decode() keeps special tokens by default,
    # so any BOS/EOS added during encoding would break the equality for a
    # reason unrelated to the whitespace handling this test targets.
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    self.assertEqual(
        tokenizer.decode(token_ids),
        text,
        "encode/decode round trip must preserve the space between words",
    )
777+
@require_tokenizers
@require_sentencepiece
def test_specialized_hub_tokenizer_class_overrides_mismatched_auto_mapping(self):
    """Hub's tokenizer_class wins when the auto-mapping has a different real class (e.g. m2m_100 → NllbTokenizer).

    The patched config reports model_type "m2m_100" while the patched tokenizer
    config names "NllbTokenizer". Loading is expected to go through
    NllbTokenizer.from_pretrained exactly once and never through
    TokenizersBackend.from_pretrained.
    """
    from transformers import NllbTokenizer

    # Stand-ins for the model config and for the tokenizer instance that the
    # patched NllbTokenizer.from_pretrained hands back.
    stub_config = mock.MagicMock()
    stub_config.model_type = "m2m_100"
    expected_tokenizer = mock.MagicMock(spec=NllbTokenizer)

    # Build the patchers up front; they are entered in this same order below.
    config_patch = mock.patch(
        "transformers.models.auto.tokenization_auto.AutoConfig.from_pretrained",
        return_value=stub_config,
    )
    tokenizer_config_patch = mock.patch(
        "transformers.models.auto.tokenization_auto.get_tokenizer_config",
        return_value={"tokenizer_class": "NllbTokenizer"},
    )
    nllb_patch = mock.patch.object(NllbTokenizer, "from_pretrained", return_value=expected_tokenizer)
    backend_patch = mock.patch.object(TokenizersBackend, "from_pretrained")

    with (
        config_patch,
        tokenizer_config_patch,
        nllb_patch as nllb_loader,
        backend_patch as backend_loader,
    ):
        loaded = AutoTokenizer.from_pretrained("fake/nllb-model")

    # The mocks keep their call records after the patches are undone.
    nllb_loader.assert_called_once()
    backend_loader.assert_not_called()
    self.assertIs(loaded, expected_tokenizer)
0 commit comments