huggingface · omarnj-lab · May 10, 2026 · May 10, 2026
diff --git a/sentence_transformers/base/modules/transformer.py b/sentence_transformers/base/modules/transformer.py
@@ -659,10 +659,29 @@ def __init__(
         if max_seq_length is not None and "model_max_length" not in processor_kwargs:
             processor_kwargs["model_max_length"] = max_seq_length
         with suggest_extra_on_exception():
-            self.processor = AutoProcessor.from_pretrained(
-                tokenizer_name_or_path if tokenizer_name_or_path is not None else model_name_or_path,
-                **processor_kwargs,
-            )
+            try:
+                self.processor = AutoProcessor.from_pretrained(
+                    tokenizer_name_or_path if tokenizer_name_or_path is not None else model_name_or_path,
+                    **processor_kwargs,
+                )
+            except ValueError as proc_err:
+                # AutoProcessor failed because the repo has no processor /
+                # preprocessor config (e.g. a text-only model whose tokenizer
+                # is registered via ``auto_map -> AutoTokenizer`` together with
+                # ``trust_remote_code=True``). Fall back to ``AutoTokenizer``;
+                # the ``tokenizer`` property below already handles the case
+                # where ``self.processor`` itself is a ``PreTrainedTokenizerBase``.
+                if "Unrecognized processing class" in str(proc_err) or "does not contain" in str(proc_err):
+                    from transformers import AutoTokenizer
+
+                    self.processor = AutoTokenizer.from_pretrained(
+                        tokenizer_name_or_path
+                        if tokenizer_name_or_path is not None
+                        else model_name_or_path,
+                        **processor_kwargs,
+                    )
+                else:
+                    raise
 
         # Cap the tokenizer model_max_length at the model's max_position_embeddings
         if self.tokenizer is not None:

diff --git a/tests/test_transformer_autotokenizer_fallback.py b/tests/test_transformer_autotokenizer_fallback.py
@@ -0,0 +1,48 @@
+"""
+Test for the AutoProcessor -> AutoTokenizer fallback in Transformer.
+
+Validates that a text-only model whose tokenizer is registered via
+``auto_map -> AutoTokenizer`` (with ``trust_remote_code=True``) but that
+ships no ``processor_config.json`` / ``preprocessor_config.json`` loads
+correctly via ``SentenceTransformer(...)``.
+"""
+
+from __future__ import annotations
+
+import pytest
+from transformers import PreTrainedTokenizerBase
+
+from sentence_transformers import SentenceTransformer
+
+
+# Public NeoAraBERT-based sentence-embedding model whose custom Arabic
+# morphological tokenizer is exposed only via ``auto_map -> AutoTokenizer``.
+# Before the fix, this raises:
+#   ValueError: Unrecognized processing class in <repo>. Can't instantiate ...
+TEST_MODEL = "Omartificial-Intelligence-Space/NeoAraBERT-MSA-Synonym-Matryoshka-V1"
+
+
+@pytest.mark.slow
+def test_autotokenizer_fallback_loads_text_only_custom_tokenizer() -> None:
+    model = SentenceTransformer(TEST_MODEL, trust_remote_code=True)
+
+    # After loading, the first module (model[0]) must expose a real tokenizer rather than None.
+    transformer_module = model[0]
+    assert transformer_module.tokenizer is not None
+    assert isinstance(transformer_module.tokenizer, PreTrainedTokenizerBase)
+
+    # Encoding must work end-to-end: one embedding row per input sentence.
+    sentences = [
+        "صلاة الجمعة في المسجد",  # anchor
+        "الصلاة في الجامع",  # synonym
+        "السباحة في البحر",  # irrelevant
+    ]
+    emb = model.encode(sentences, normalize_embeddings=True)
+    assert emb.shape == (3, model.get_embedding_dimension())
+
+    # Sanity: embeddings are normalized, so emb @ emb.T acts as cosine similarity; synonym must beat irrelevant.
+    sim = emb @ emb.T
+    assert sim[0, 1] > sim[0, 2], (
+        f"anchor-vs-synonym ({sim[0, 1]:.3f}) should exceed "
+        f"anchor-vs-irrelevant ({sim[0, 2]:.3f})"
+    )