Skip to content
Open
44 changes: 35 additions & 9 deletions src/transformers/processing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1739,16 +1739,42 @@ def _load_tokenizer_from_pretrained(

if is_primary:
# Primary tokenizer: load from root
tokenizer = auto_processor_class.from_pretrained(
pretrained_model_name_or_path, subfolder=subfolder, **kwargs
)
else:
# Additional tokenizer: load from subfolder (e.g., "decoder_tokenizer")
tokenizer_subfolder = os.path.join(subfolder, sub_processor_type) if subfolder else sub_processor_type
tokenizer = auto_processor_class.from_pretrained(
pretrained_model_name_or_path, subfolder=tokenizer_subfolder, **kwargs
return auto_processor_class.from_pretrained(pretrained_model_name_or_path, subfolder=subfolder, **kwargs)
candidate_subfolder = os.path.join(subfolder, sub_processor_type) if subfolder else sub_processor_type

# Backward compatibility:
# older/custom preprocessors may use non-primary tokenizer attribute
# names while still storing tokenizer assets at the repo root.
# In such cases, v5 subfolder inference incorrectly looks for a
# subfolder such as repo/bpe_tokenizer, so probe whether tokenizer
# assets actually exist in the inferred subfolder before attempting
# to load from it.

probe_kwargs = {
k: kwargs[k] for k in ("cache_dir", "token", "proxies", "revision", "local_files_only") if k in kwargs
}

tokenizer_probe_files = ("tokenizer_config.json", "tokenizer.json")

sub_folder_has_tokenizer = any(
cached_file(
pretrained_model_name_or_path,
file_name,
subfolder=candidate_subfolder,
_raise_exceptions_for_missing_entries=False,
_raise_exceptions_for_connection_errors=False,
**probe_kwargs,
)
return tokenizer
is not None
for file_name in tokenizer_probe_files
)

# Fall back to the root/current subfolder when tokenizer assets are
# not present in the inferred tokenizer subfolder.
effective_subfolder = candidate_subfolder if sub_folder_has_tokenizer else subfolder

return auto_processor_class.from_pretrained(
pretrained_model_name_or_path, subfolder=effective_subfolder, **kwargs
)

@classmethod
def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, processor_dict=None, **kwargs):
Expand Down
55 changes: 55 additions & 0 deletions tests/models/auto/test_processor_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import json
import os
import shutil
import sys
import tempfile
import unittest
Expand Down Expand Up @@ -557,6 +558,60 @@ def __init__(self, tokenizer, image_processor, encoder_image_processor):
self.assertIsInstance(loaded_processor.image_processor, SiglipImageProcessor)
self.assertIsInstance(loaded_processor.encoder_image_processor, CLIPImageProcessor)

def test_non_primary_tokenizer_root_fallback(self):
    """
    Regression test for v5 tokenizer subfolder inference.

    A processor whose tokenizer attribute is not the primary one
    (here ``bpe_tokenizer``) must still load when the tokenizer files
    sit at the repo root rather than inside the inferred
    ``bpe_tokenizer`` subfolder, as older/custom repos stored them.
    """

    class CustomProcessor(ProcessorMixin):
        """Processor with a non-primary tokenizer attribute."""

        def __init__(self, bpe_tokenizer, image_processor):
            super().__init__(bpe_tokenizer, image_processor)

    reference_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
    saved_processor = CustomProcessor(
        bpe_tokenizer=reference_tokenizer,
        image_processor=SiglipImageProcessor(),
    )

    with tempfile.TemporaryDirectory() as repo_root:
        saved_processor.save_pretrained(repo_root)

        # Saving is expected to place the tokenizer in its own subfolder.
        inferred_dir = os.path.join(repo_root, "bpe_tokenizer")
        self.assertTrue(os.path.isdir(inferred_dir))

        # Recreate the legacy layout: copy every regular file up to the
        # repo root, then drop the inferred subfolder entirely.
        for entry in os.listdir(inferred_dir):
            source_path = os.path.join(inferred_dir, entry)
            if not os.path.isfile(source_path):
                continue
            copyfile(source_path, os.path.join(repo_root, entry))
        shutil.rmtree(inferred_dir)

        # Loading should fall back to the root and still succeed.
        reloaded = CustomProcessor.from_pretrained(repo_root)

        self.assertTrue(hasattr(reloaded, "bpe_tokenizer"))
        self.assertEqual(
            reloaded.bpe_tokenizer.vocab_size,
            reference_tokenizer.vocab_size,
        )


@is_staging_test
class ProcessorPushToHubTester(unittest.TestCase):
Expand Down
Loading