Skip to content

Commit 2b5ea8f

Browse files
voorhs and claude committed
feat(deps): upgrade transformers to 5.x and sentence-transformers to 5.2+ (#295)
The 4.57.x mistral-regex codepath called `huggingface_hub.model_info()` on every tokenizer load with vocab >100k (e.g. `intfloat/multilingual-e5-*`), hammering HF's rate limit in CI and in production. transformers 5.0+ caches that probe per-process and respects `local_files_only`/`HF_HUB_OFFLINE`.

The bump is necessarily a coordinated two-package migration: ST 5.2.0 is the first release that lifts the `transformers<5.0.0` cap. Resolved versions: transformers 5.12.1, sentence-transformers 5.6.0.

Adjusts the v5.x surfaces that actually broke:

- ranker.py: `cross_encoder.model.classifier` → `cross_encoder[0].auto_model.classifier` (ST 5 restructured CrossEncoder into a nn.Sequential of modules).
- ranker.py: CrossEncoder.predict() renamed `activation_fct` → `activation_fn`.
- ranker.py: `cross_encoder.model.cpu()` → `cross_encoder.cpu()` (the wrapper is itself an nn.Module now, no underlying `.model` attribute).
- embedder/sentence_transformers.py: import `losses`/`training_args` from `sentence_transformers.sentence_transformer` (top-level path deprecated).
- embedder/sentence_transformers.py: `warmup_ratio=` → `warmup_steps=` (v5 TrainingArguments accepts a float <1.0 there as a ratio).
- test_sentence_transformers_backend.py: `get_sentence_embedding_dimension()` → `get_embedding_dimension()`.

Removes the `_disable_transformers_mistral_regex_patch` workaround from tests/conftest.py — the underlying bug is fixed in v5.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 483ead2 commit 2b5ea8f

5 files changed

Lines changed: 19 additions & 42 deletions

File tree

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,8 @@ dependencies = [
5151
[project.optional-dependencies]
5252
catboost = ["catboost (>=1.2.8,<2.0.0)"]
5353
peft = ["peft (>= 0.10.0, !=0.15.0, !=0.15.1, <1.0.0)"]
54-
transformers = ["transformers[torch] (>=4.49.0,<5.0.0)"]
55-
sentence-transformers = ["sentence-transformers (>=3,<4)"]
54+
transformers = ["transformers[torch] (>=5.0.0,<6.0.0)"]
55+
sentence-transformers = ["sentence-transformers (>=5.2.0,<6.0.0)"]
5656
dspy = [
5757
"dspy (>=2.6.5,<3.0.0)",
5858
]

src/autointent/_wrappers/embedder/sentence_transformers.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -299,9 +299,8 @@ def train(self, utterances: list[str], labels: ListOfLabels, config: EmbedderFin
299299
from sentence_transformers import (
300300
SentenceTransformerTrainer,
301301
SentenceTransformerTrainingArguments,
302-
losses,
303-
training_args,
304302
)
303+
from sentence_transformers.sentence_transformer import losses, training_args
305304
from transformers import EarlyStoppingCallback
306305

307306
x_train, x_val, y_train, y_val = train_test_split(
@@ -324,7 +323,9 @@ def train(self, utterances: list[str], labels: ListOfLabels, config: EmbedderFin
324323
per_device_train_batch_size=config.batch_size,
325324
per_device_eval_batch_size=config.batch_size,
326325
learning_rate=config.learning_rate,
327-
warmup_ratio=config.warmup_ratio,
326+
# transformers v5 deprecated `warmup_ratio` in favor of `warmup_steps`,
327+
# which now accepts a float < 1.0 as a fraction of total training steps.
328+
warmup_steps=config.warmup_ratio,
328329
fp16=config.fp16,
329330
bf16=config.bf16,
330331
seed=config.seed,

src/autointent/_wrappers/ranker.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,13 @@ def __init__(
136136
if classifier_head is not None or self.config.train_head:
137137
self._train_head = True
138138
self._activations_list: list[npt.NDArray[Any]] = []
139-
self._hook_handler = self.cross_encoder.model.classifier.register_forward_hook(self._classifier_hook)
139+
# sentence-transformers v5 restructured CrossEncoder into a nn.Sequential
140+
# of modules: cross_encoder[0] is a Transformer wrapping the underlying
141+
# AutoModelForSequenceClassification (exposed as .auto_model). The
142+
# classifier head still lives on that HF model.
143+
self._hook_handler = self.cross_encoder[0].auto_model.classifier.register_forward_hook(
144+
self._classifier_hook
145+
)
140146

141147
def _classifier_hook(self, _module, input_tensor, _output_tensor) -> None: # type: ignore[no-untyped-def] # noqa: ANN001
142148
"""Hook to capture classifier activations.
@@ -163,7 +169,7 @@ def _get_features_or_predictions(self, pairs: list[tuple[str, str]]) -> npt.NDAr
163169
self.cross_encoder.predict(
164170
pairs,
165171
batch_size=self.config.batch_size,
166-
activation_fct=nn.Sigmoid() if self.output_range == "sigmoid" else nn.Tanh(),
172+
activation_fn=nn.Sigmoid() if self.output_range == "sigmoid" else nn.Tanh(),
167173
)
168174
)
169175

@@ -311,7 +317,10 @@ def load(cls, path: Path, override_config: CrossEncoderConfig | None = None) ->
311317

312318
def clear_ram(self) -> None:
313319
"""Clear model from RAM and GPU memory."""
314-
self.cross_encoder.model.cpu()
320+
# sentence-transformers v5 CrossEncoder is itself a nn.Sequential, so we
321+
# call .cpu() on the wrapper directly instead of the (now-absent)
322+
# underlying `.model` attribute.
323+
self.cross_encoder.cpu()
315324
del self.cross_encoder
316325
gc.collect()
317326
torch.cuda.empty_cache()

tests/conftest.py

Lines changed: 0 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -25,39 +25,6 @@
2525
from autointent.nodes import NodeOptimizer
2626

2727

28-
def _disable_transformers_mistral_regex_patch() -> None:
29-
# transformers.PreTrainedTokenizerBase._patch_mistral_regex calls
30-
# huggingface_hub.model_info() for every tokenizer load with vocab > 100k
31-
# (e.g. XLM-RoBERTa-based models like intfloat/multilingual-e5-*). On CI
32-
# that uncacheable API call hammers the HF rate limit (429s). Tests never
33-
# load mistralai tokenizers, so the correction is pure overhead — replace
34-
# it with a no-op for the whole test session.
35-
#
36-
# Upstream bug & fix (merged for transformers 5.0.0+, NOT backported to 4.x):
37-
# https://github.com/huggingface/transformers/issues/44843
38-
# https://github.com/huggingface/transformers/pull/45444
39-
# Drop this workaround when we upgrade to transformers>=5.0:
40-
# https://github.com/deeppavlov/AutoIntent/issues/295
41-
try:
42-
from transformers import tokenization_utils_base
43-
except ImportError:
44-
return
45-
46-
base = getattr(tokenization_utils_base, "PreTrainedTokenizerBase", None)
47-
if base is None or not hasattr(base, "_patch_mistral_regex"):
48-
return
49-
50-
def _noop_patch_mistral_regex( # type: ignore[no-untyped-def] # reason: monkey-patched into transformers internal classmethod; transformers is in ignore_missing_imports so signature types are unavailable
51-
cls, tokenizer, *args, **kwargs
52-
):
53-
return tokenizer
54-
55-
base._patch_mistral_regex = classmethod(_noop_patch_mistral_regex)
56-
57-
58-
_disable_transformers_mistral_regex_patch()
59-
60-
6128
def get_dataset_path() -> Path:
6229
return cast("Path", ires.files("tests.assets.data").joinpath("clinc_subset.json"))
6330

tests/embedder/test_sentence_transformers_backend.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def test_model_lazy_loading(self, st_backend: SentenceTransformerEmbeddingBacken
4343
# cannot see the mutation inside `.embed()`. The post-call assert is
4444
# the whole point of this test (lazy load: None -> non-None).
4545
assert st_backend._model is not None
46-
assert embeddings.shape == (1, st_backend._model.get_sentence_embedding_dimension()) # type: ignore[unreachable]
46+
assert embeddings.shape == (1, st_backend._model.get_embedding_dimension()) # type: ignore[unreachable]
4747

4848
def test_clear_ram(self, st_backend: SentenceTransformerEmbeddingBackend) -> None:
4949
"""Test clearing model from RAM."""

0 commit comments

Comments
 (0)