Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 23 additions & 4 deletions sentence_transformers/base/modules/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -659,10 +659,29 @@ def __init__(
if max_seq_length is not None and "model_max_length" not in processor_kwargs:
processor_kwargs["model_max_length"] = max_seq_length
with suggest_extra_on_exception():
self.processor = AutoProcessor.from_pretrained(
tokenizer_name_or_path if tokenizer_name_or_path is not None else model_name_or_path,
**processor_kwargs,
)
try:
self.processor = AutoProcessor.from_pretrained(
tokenizer_name_or_path if tokenizer_name_or_path is not None else model_name_or_path,
**processor_kwargs,
)
except ValueError as proc_err:
# AutoProcessor failed because the repo has no processor /
# preprocessor config (e.g. a text-only model whose tokenizer
# is registered via ``auto_map -> AutoTokenizer`` together with
# ``trust_remote_code=True``). Fall back to ``AutoTokenizer``;
# the ``tokenizer`` property below already handles the case
# where ``self.processor`` itself is a ``PreTrainedTokenizerBase``.
if "Unrecognized processing class" in str(proc_err) or "does not contain" in str(proc_err):
from transformers import AutoTokenizer

self.processor = AutoTokenizer.from_pretrained(
tokenizer_name_or_path
if tokenizer_name_or_path is not None
else model_name_or_path,
**processor_kwargs,
)
else:
raise

# Cap the tokenizer model_max_length at the model's max_position_embeddings
if self.tokenizer is not None:
Expand Down
48 changes: 48 additions & 0 deletions tests/test_transformer_autotokenizer_fallback.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""
Test for the AutoProcessor -> AutoTokenizer fallback in Transformer.

Validates that a text-only model whose tokenizer is registered via
``auto_map -> AutoTokenizer`` (with ``trust_remote_code=True``) but that
ships no ``processor_config.json`` / ``preprocessor_config.json`` loads
correctly via ``SentenceTransformer(...)``.
"""

from __future__ import annotations

import pytest
from transformers import PreTrainedTokenizerBase

from sentence_transformers import SentenceTransformer


# Public NeoAraBERT-based sentence-embedding model whose custom Arabic
# morphological tokenizer is exposed only via ``auto_map -> AutoTokenizer``.
# Before the fix, this raises:
# ValueError: Unrecognized processing class in <repo>. Can't instantiate ...
TEST_MODEL = "Omartificial-Intelligence-Space/NeoAraBERT-MSA-Synonym-Matryoshka-V1"


@pytest.mark.slow
def test_autotokenizer_fallback_loads_text_only_custom_tokenizer() -> None:
    """Load ``TEST_MODEL`` and check that it tokenizes and encodes end-to-end.

    The test builds a ``SentenceTransformer`` with ``trust_remote_code=True``,
    checks that its first module exposes a ``PreTrainedTokenizerBase``
    tokenizer, encodes three Arabic sentences, and verifies both the shape of
    the embedding matrix and that the synonym scores higher against the
    anchor than the unrelated sentence does.
    """
    model = SentenceTransformer(TEST_MODEL, trust_remote_code=True)

    # The first module has to expose a genuine tokenizer object, not None.
    tokenizer = model[0].tokenizer
    assert tokenizer is not None
    assert isinstance(tokenizer, PreTrainedTokenizerBase)

    # Full encode round trip: row 0 is the anchor, row 1 its synonym and
    # row 2 an unrelated sentence.
    anchor = "صلاة الجمعة في المسجد"
    synonym = "الصلاة في الجامع"
    irrelevant = "السباحة في البحر"
    sentences = [anchor, synonym, irrelevant]

    emb = model.encode(sentences, normalize_embeddings=True)
    expected_shape = (3, model.get_embedding_dimension())
    assert emb.shape == expected_shape

    # Embeddings were requested normalized, so the Gram matrix holds the
    # pairwise similarity scores compared below.
    sim = emb @ emb.T
    assert sim[0, 1] > sim[0, 2], (
        f"anchor-vs-synonym ({sim[0, 1]:.3f}) should exceed "
        f"anchor-vs-irrelevant ({sim[0, 2]:.3f})"
    )