feat(deps): upgrade transformers to 5.x and sentence-transformers to 5.2+ (#295)

voorhs · claude · voorhs · commit 2b5ea8f8477c · 2026-06-27T22:42:24.000+03:00
The 4.57.x mistral-regex codepath called `huggingface_hub.model_info()` on
every tokenizer load with vocab >100k (e.g. `intfloat/multilingual-e5-*`),
hammering HF's rate limit in CI and in production. transformers 5.0+ caches
that probe per-process and respects `local_files_only`/`HF_HUB_OFFLINE`.

The bump is necessarily a coordinated two-package migration: ST 5.2.0 is
the first release that lifts the `transformers<5.0.0` cap. Resolved
versions: transformers 5.12.1, sentence-transformers 5.6.0.

Adjusts the v5.x surfaces that actually broke:

- ranker.py: `cross_encoder.model.classifier` → `cross_encoder[0].auto_model.classifier`
  (ST 5 restructured CrossEncoder into an nn.Sequential of modules).
- ranker.py: CrossEncoder.predict() renamed `activation_fct` → `activation_fn`.
- ranker.py: `cross_encoder.model.cpu()` → `cross_encoder.cpu()` (the wrapper
  is itself an nn.Module now, no underlying `.model` attribute).
- embedder/sentence_transformers.py: import `losses`/`training_args` from
  `sentence_transformers.sentence_transformer` (top-level path deprecated).
- embedder/sentence_transformers.py: `warmup_ratio=` → `warmup_steps=` (v5
  TrainingArguments accepts a float <1.0 there as a ratio).
- test_sentence_transformers_backend.py: `get_sentence_embedding_dimension()`
  → `get_embedding_dimension()`.

Removes the `_disable_transformers_mistral_regex_patch` workaround from
tests/conftest.py — the underlying bug is fixed in v5.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
diff --git a/pyproject.toml b/pyproject.toml
@@ -51,8 +51,8 @@ dependencies = [
 [project.optional-dependencies]
 catboost = ["catboost (>=1.2.8,<2.0.0)"]
 peft = ["peft (>= 0.10.0, !=0.15.0, !=0.15.1, <1.0.0)"]
-transformers = ["transformers[torch] (>=4.49.0,<5.0.0)"]
-sentence-transformers = ["sentence-transformers (>=3,<4)"]
+transformers = ["transformers[torch] (>=5.0.0,<6.0.0)"]
+sentence-transformers = ["sentence-transformers (>=5.2.0,<6.0.0)"]
 dspy = [
     "dspy (>=2.6.5,<3.0.0)",
 ]
diff --git a/src/autointent/_wrappers/embedder/sentence_transformers.py b/src/autointent/_wrappers/embedder/sentence_transformers.py
@@ -299,9 +299,8 @@ def train(self, utterances: list[str], labels: ListOfLabels, config: EmbedderFin
         from sentence_transformers import (
             SentenceTransformerTrainer,
             SentenceTransformerTrainingArguments,
-            losses,
-            training_args,
         )
+        from sentence_transformers.sentence_transformer import losses, training_args
         from transformers import EarlyStoppingCallback
 
         x_train, x_val, y_train, y_val = train_test_split(
@@ -324,7 +323,9 @@ def train(self, utterances: list[str], labels: ListOfLabels, config: EmbedderFin
                 per_device_train_batch_size=config.batch_size,
                 per_device_eval_batch_size=config.batch_size,
                 learning_rate=config.learning_rate,
-                warmup_ratio=config.warmup_ratio,
+                # transformers v5 deprecated `warmup_ratio` in favor of `warmup_steps`,
+                # which now accepts a float < 1.0 as a fraction of total training steps.
+                warmup_steps=config.warmup_ratio,
                 fp16=config.fp16,
                 bf16=config.bf16,
                 seed=config.seed,
diff --git a/src/autointent/_wrappers/ranker.py b/src/autointent/_wrappers/ranker.py
@@ -136,7 +136,13 @@ def __init__(
         if classifier_head is not None or self.config.train_head:
             self._train_head = True
             self._activations_list: list[npt.NDArray[Any]] = []
-            self._hook_handler = self.cross_encoder.model.classifier.register_forward_hook(self._classifier_hook)
+            # sentence-transformers v5 restructured CrossEncoder into a nn.Sequential
+            # of modules: cross_encoder[0] is a Transformer wrapping the underlying
+            # AutoModelForSequenceClassification (exposed as .auto_model). The
+            # classifier head still lives on that HF model.
+            self._hook_handler = self.cross_encoder[0].auto_model.classifier.register_forward_hook(
+                self._classifier_hook
+            )
 
     def _classifier_hook(self, _module, input_tensor, _output_tensor) -> None:  # type: ignore[no-untyped-def] # noqa: ANN001
         """Hook to capture classifier activations.
@@ -163,7 +169,7 @@ def _get_features_or_predictions(self, pairs: list[tuple[str, str]]) -> npt.NDAr
                 self.cross_encoder.predict(
                     pairs,
                     batch_size=self.config.batch_size,
-                    activation_fct=nn.Sigmoid() if self.output_range == "sigmoid" else nn.Tanh(),
+                    activation_fn=nn.Sigmoid() if self.output_range == "sigmoid" else nn.Tanh(),
                 )
             )
 
@@ -311,7 +317,10 @@ def load(cls, path: Path, override_config: CrossEncoderConfig | None = None) ->
 
     def clear_ram(self) -> None:
         """Clear model from RAM and GPU memory."""
-        self.cross_encoder.model.cpu()
+        # sentence-transformers v5 CrossEncoder is itself a nn.Sequential, so we
+        # call .cpu() on the wrapper directly instead of the (now-absent)
+        # underlying `.model` attribute.
+        self.cross_encoder.cpu()
         del self.cross_encoder
         gc.collect()
         torch.cuda.empty_cache()
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -25,39 +25,6 @@
     from autointent.nodes import NodeOptimizer
 
 
-def _disable_transformers_mistral_regex_patch() -> None:
-    # transformers.PreTrainedTokenizerBase._patch_mistral_regex calls
-    # huggingface_hub.model_info() for every tokenizer load with vocab > 100k
-    # (e.g. XLM-RoBERTa-based models like intfloat/multilingual-e5-*). On CI
-    # that uncacheable API call hammers the HF rate limit (429s). Tests never
-    # load mistralai tokenizers, so the correction is pure overhead — replace
-    # it with a no-op for the whole test session.
-    #
-    # Upstream bug & fix (merged for transformers 5.0.0+, NOT backported to 4.x):
-    #   https://github.com/huggingface/transformers/issues/44843
-    #   https://github.com/huggingface/transformers/pull/45444
-    # Drop this workaround when we upgrade to transformers>=5.0:
-    #   https://github.com/deeppavlov/AutoIntent/issues/295
-    try:
-        from transformers import tokenization_utils_base
-    except ImportError:
-        return
-
-    base = getattr(tokenization_utils_base, "PreTrainedTokenizerBase", None)
-    if base is None or not hasattr(base, "_patch_mistral_regex"):
-        return
-
-    def _noop_patch_mistral_regex(  # type: ignore[no-untyped-def]  # reason: monkey-patched into transformers internal classmethod; transformers is in ignore_missing_imports so signature types are unavailable
-        cls, tokenizer, *args, **kwargs
-    ):
-        return tokenizer
-
-    base._patch_mistral_regex = classmethod(_noop_patch_mistral_regex)
-
-
-_disable_transformers_mistral_regex_patch()
-
-
 def get_dataset_path() -> Path:
     return cast("Path", ires.files("tests.assets.data").joinpath("clinc_subset.json"))
 
diff --git a/tests/embedder/test_sentence_transformers_backend.py b/tests/embedder/test_sentence_transformers_backend.py
@@ -43,7 +43,7 @@ def test_model_lazy_loading(self, st_backend: SentenceTransformerEmbeddingBacken
         # cannot see the mutation inside `.embed()`. The post-call assert is
         # the whole point of this test (lazy load: None -> non-None).
         assert st_backend._model is not None
-        assert embeddings.shape == (1, st_backend._model.get_sentence_embedding_dimension())  # type: ignore[unreachable]
+        assert embeddings.shape == (1, st_backend._model.get_embedding_dimension())  # type: ignore[unreachable]
 
     def test_clear_ram(self, st_backend: SentenceTransformerEmbeddingBackend) -> None:
         """Test clearing model from RAM."""