diff --git a/docs/optimizer_search_space_config.schema.json b/docs/optimizer_search_space_config.schema.json index df8560ea3..f31195dcf 100644 --- a/docs/optimizer_search_space_config.schema.json +++ b/docs/optimizer_search_space_config.schema.json @@ -1302,6 +1302,8 @@ }, "warmup_ratio": { "default": 0.1, + "exclusiveMaximum": 1, + "minimum": 0, "title": "Warmup Ratio", "type": "number" }, diff --git a/pyproject.toml b/pyproject.toml index 5fbabaf86..a94e05693 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,8 +51,8 @@ dependencies = [ [project.optional-dependencies] catboost = ["catboost (>=1.2.8,<2.0.0)"] peft = ["peft (>= 0.10.0, !=0.15.0, !=0.15.1, <1.0.0)"] -transformers = ["transformers[torch] (>=4.49.0,<5.0.0)"] -sentence-transformers = ["sentence-transformers (>=3,<4)"] +transformers = ["transformers[torch] (>=5.0.0,<6.0.0)"] +sentence-transformers = ["sentence-transformers (>=5.4.0,<6.0.0)"] dspy = [ "dspy (>=2.6.5,<3.0.0)", ] diff --git a/src/autointent/_dump_tools/unit_dumpers.py b/src/autointent/_dump_tools/unit_dumpers.py index 2b1c201bb..ca3cda488 100644 --- a/src/autointent/_dump_tools/unit_dumpers.py +++ b/src/autointent/_dump_tools/unit_dumpers.py @@ -292,7 +292,7 @@ def load(path: Path, **kwargs: Any) -> PreTrainedTokenizer | PreTrainedTokenizer require("transformers") import transformers - return transformers.AutoTokenizer.from_pretrained(path) # type: ignore[no-any-return,no-untyped-call] + return transformers.AutoTokenizer.from_pretrained(path) @classmethod def check_isinstance(cls, obj: Any) -> bool: # noqa: ANN401 diff --git a/src/autointent/_wrappers/embedder/sentence_transformers.py b/src/autointent/_wrappers/embedder/sentence_transformers.py index 772737fed..70920b364 100644 --- a/src/autointent/_wrappers/embedder/sentence_transformers.py +++ b/src/autointent/_wrappers/embedder/sentence_transformers.py @@ -299,9 +299,8 @@ def train(self, utterances: list[str], labels: ListOfLabels, config: EmbedderFin from sentence_transformers import ( SentenceTransformerTrainer, SentenceTransformerTrainingArguments, - losses, - training_args, ) + from sentence_transformers.sentence_transformer import losses, training_args from transformers import EarlyStoppingCallback x_train, x_val, y_train, y_val = train_test_split( @@ -324,7 +323,8 @@ def train(self, utterances: list[str], labels: ListOfLabels, config: EmbedderFin per_device_train_batch_size=config.batch_size, per_device_eval_batch_size=config.batch_size, learning_rate=config.learning_rate, - warmup_ratio=config.warmup_ratio, + # warmup_steps accepts a float < 1 as a fraction of total steps. + warmup_steps=config.warmup_ratio, fp16=config.fp16, bf16=config.bf16, seed=config.seed, diff --git a/src/autointent/_wrappers/ranker.py b/src/autointent/_wrappers/ranker.py index 8e73f0be5..fd0656122 100644 --- a/src/autointent/_wrappers/ranker.py +++ b/src/autointent/_wrappers/ranker.py @@ -12,7 +12,7 @@ import logging from pathlib import Path from random import shuffle -from typing import TYPE_CHECKING, Any, Literal, TypedDict +from typing import TYPE_CHECKING, Any, Literal, TypedDict, cast import joblib import numpy as np @@ -127,7 +127,7 @@ def __init__( revision=self.config.revision, trust_remote_code=self.config.trust_remote_code, device=self.config.device, - max_length=self.config.tokenizer_config.max_length, # type: ignore[arg-type] + max_length=self.config.tokenizer_config.max_length, ) self._train_head = False self._clf = classifier_head @@ -136,7 +136,10 @@ def __init__( if classifier_head is not None or self.config.train_head: self._train_head = True self._activations_list: list[npt.NDArray[Any]] = [] - self._hook_handler = self.cross_encoder.model.classifier.register_forward_hook(self._classifier_hook) + # CrossEncoder is a nn.Sequential of modules; [0] is the Transformer + # wrapping the HF model exposed as .auto_model. + transformer = cast("Any", self.cross_encoder[0]) + self._hook_handler = transformer.auto_model.classifier.register_forward_hook(self._classifier_hook) def _classifier_hook(self, _module, input_tensor, _output_tensor) -> None: # type: ignore[no-untyped-def] # noqa: ANN001 """Hook to capture classifier activations. @@ -161,13 +164,13 @@ def _get_features_or_predictions(self, pairs: list[tuple[str, str]]) -> npt.NDAr if not self._train_head: return np.array( self.cross_encoder.predict( - pairs, + pairs, # type: ignore[arg-type] batch_size=self.config.batch_size, - activation_fct=nn.Sigmoid() if self.output_range == "sigmoid" else nn.Tanh(), + activation_fn=nn.Sigmoid() if self.output_range == "sigmoid" else nn.Tanh(), ) ) - self.cross_encoder.predict(pairs, batch_size=self.config.batch_size) + self.cross_encoder.predict(pairs, batch_size=self.config.batch_size) # type: ignore[arg-type] res = np.concatenate(self._activations_list, axis=0) self._activations_list.clear() return res # type: ignore[no-any-return] @@ -311,7 +314,7 @@ def load(cls, path: Path, override_config: CrossEncoderConfig | None = None) -> def clear_ram(self) -> None: """Clear model from RAM and GPU memory.""" - self.cross_encoder.model.cpu() + self.cross_encoder.cpu() del self.cross_encoder gc.collect() torch.cuda.empty_cache() diff --git a/src/autointent/configs/_transformers.py b/src/autointent/configs/_transformers.py index 2922eb81f..b06a323b5 100644 --- a/src/autointent/configs/_transformers.py +++ b/src/autointent/configs/_transformers.py @@ -28,7 +28,10 @@ class EmbedderFineTuningConfig(BaseModel): batch_size: int margin: float = Field(default=0.5) learning_rate: float = Field(default=2e-5) - warmup_ratio: float = Field(default=0.1) + # Fed to TrainingArguments.warmup_steps in v5, which interprets float<1 as + # a fraction of total steps and float>=1 as a raw step count. Cap below 1 + # so warmup_ratio=1.0 doesn't silently become a single step. + warmup_ratio: float = Field(default=0.1, ge=0, lt=1) early_stopping_patience: int = Field(default=1) early_stopping_threshold: float = Field(default=0.0) val_fraction: float = Field(default=0.2) diff --git a/src/autointent/modules/scoring/_bert.py b/src/autointent/modules/scoring/_bert.py index 816fd863e..419113907 100644 --- a/src/autointent/modules/scoring/_bert.py +++ b/src/autointent/modules/scoring/_bert.py @@ -130,17 +130,27 @@ def get_implicit_initialization_params(self) -> dict[str, Any]: def _initialize_model(self) -> Any: # noqa: ANN401 from transformers import AutoModelForSequenceClassification - label2id = {i: i for i in range(self._n_classes)} - id2label = {i: i for i in range(self._n_classes)} - + # huggingface_hub v1 StrictDataclass requires label2id keys to be str + # (and id2label values to be str); int-keyed dicts raise + # StrictDataclassFieldValidationError on from_pretrained in v5. + label2id = {str(i): i for i in range(self._n_classes)} + id2label = {i: str(i) for i in range(self._n_classes)} + + # transformers v5 + PEFT triggers find_adapter_config_file on every + # from_pretrained; it propagates _commit_hash for the cache lookup but + # NOT the outer `revision` to the fall-through hf_hub_download + # (auto_factory.py:308 only forwards adapter_kwargs). Set revision + # explicitly via adapter_kwargs so the adapter probe stays pinned. + revision = self.classification_model_config.revision return AutoModelForSequenceClassification.from_pretrained( self.classification_model_config.model_name, trust_remote_code=self.classification_model_config.trust_remote_code, - revision=self.classification_model_config.revision, + revision=revision, num_labels=self._n_classes, label2id=label2id, id2label=id2label, problem_type="multi_label_classification" if self._multilabel else "single_label_classification", + adapter_kwargs={"revision": revision} if revision is not None else None, ) def fit( @@ -152,7 +162,7 @@ def fit( self._validate_task(labels) - self._tokenizer = AutoTokenizer.from_pretrained( # type: ignore[no-untyped-call] + self._tokenizer = AutoTokenizer.from_pretrained( self.classification_model_config.model_name, revision=self.classification_model_config.revision ) self._model = self._initialize_model() diff --git a/src/autointent/modules/scoring/_lora/lora.py b/src/autointent/modules/scoring/_lora/lora.py index f310bd3eb..5c9829a94 100644 --- a/src/autointent/modules/scoring/_lora/lora.py +++ b/src/autointent/modules/scoring/_lora/lora.py @@ -120,7 +120,15 @@ def _initialize_model(self) -> Any: # noqa: ANN401 model = super()._initialize_model() from peft import get_peft_model - return get_peft_model(model, self._lora_config) + peft_model = get_peft_model(model, self._lora_config) + # PEFT's save_pretrained vocab-check (save_and_load.py:380-384) calls + # AutoConfig.from_pretrained(base_model_name_or_path) with no revision + # during every Trainer checkpoint. On a cold cache this falls through + # to an unpinned hf_hub_download. Clearing base_model_name_or_path + # short-circuits the check; our dumper saves the base model + # separately (HFModelDumper), so the adapter doesn't need to remember it. + peft_model.peft_config["default"].base_model_name_or_path = "" + return peft_model def dump(self, path: str) -> None: from peft import LoraConfig diff --git a/src/autointent/modules/scoring/_ptuning/ptuning.py b/src/autointent/modules/scoring/_ptuning/ptuning.py index 162ee502d..c443a5066 100644 --- a/src/autointent/modules/scoring/_ptuning/ptuning.py +++ b/src/autointent/modules/scoring/_ptuning/ptuning.py @@ -154,7 +154,15 @@ def _initialize_model(self) -> Any: # noqa: ANN401 model = super()._initialize_model() from peft import get_peft_model - return get_peft_model(model, self._ptuning_config) + peft_model = get_peft_model(model, self._ptuning_config) + # PEFT's save_pretrained vocab-check (save_and_load.py:380-384) calls + # AutoConfig.from_pretrained(base_model_name_or_path) with no revision + # during every Trainer checkpoint. On a cold cache this falls through + # to an unpinned hf_hub_download. Clearing base_model_name_or_path + # short-circuits the check; our dumper saves the base model + # separately (HFModelDumper), so the adapter doesn't need to remember it. + peft_model.peft_config["default"].base_model_name_or_path = "" + return peft_model def dump(self, path: str) -> None: from peft import PromptEncoderConfig diff --git a/tests/conftest.py b/tests/conftest.py index 0845a3e40..ab6f4eba2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -25,39 +25,6 @@ from autointent.nodes import NodeOptimizer -def _disable_transformers_mistral_regex_patch() -> None: - # transformers.PreTrainedTokenizerBase._patch_mistral_regex calls - # huggingface_hub.model_info() for every tokenizer load with vocab > 100k - # (e.g. XLM-RoBERTa-based models like intfloat/multilingual-e5-*). On CI - # that uncacheable API call hammers the HF rate limit (429s). Tests never - # load mistralai tokenizers, so the correction is pure overhead — replace - # it with a no-op for the whole test session. - # - # Upstream bug & fix (merged for transformers 5.0.0+, NOT backported to 4.x): - # https://github.com/huggingface/transformers/issues/44843 - # https://github.com/huggingface/transformers/pull/45444 - # Drop this workaround when we upgrade to transformers>=5.0: - # https://github.com/deeppavlov/AutoIntent/issues/295 - try: - from transformers import tokenization_utils_base - except ImportError: - return - - base = getattr(tokenization_utils_base, "PreTrainedTokenizerBase", None) - if base is None or not hasattr(base, "_patch_mistral_regex"): - return - - def _noop_patch_mistral_regex( # type: ignore[no-untyped-def] # reason: monkey-patched into transformers internal classmethod; transformers is in ignore_missing_imports so signature types are unavailable - cls, tokenizer, *args, **kwargs - ): - return tokenizer - - base._patch_mistral_regex = classmethod(_noop_patch_mistral_regex) - - -_disable_transformers_mistral_regex_patch() - - def get_dataset_path() -> Path: return cast("Path", ires.files("tests.assets.data").joinpath("clinc_subset.json")) @@ -192,7 +159,7 @@ def tiny_sentence_transformer() -> SentenceTransformer: from autointent.configs._pinned_revisions import DEFAULT_REVISIONS - return SentenceTransformer(TINY_SENTENCE_TRANSFORMER, revision=DEFAULT_REVISIONS[TINY_SENTENCE_TRANSFORMER]) + return SentenceTransformer(TINY_SENTENCE_TRANSFORMER, revision=DEFAULT_REVISIONS[TINY_SENTENCE_TRANSFORMER]) # type: ignore[no-any-return] def apply_test_models(pipeline: Pipeline) -> None: diff --git a/tests/embedder/test_sentence_transformers_backend.py b/tests/embedder/test_sentence_transformers_backend.py index b0c72a800..348ad4165 100644 --- a/tests/embedder/test_sentence_transformers_backend.py +++ b/tests/embedder/test_sentence_transformers_backend.py @@ -43,7 +43,7 @@ def test_model_lazy_loading(self, st_backend: SentenceTransformerEmbeddingBacken # cannot see the mutation inside `.embed()`. The post-call assert is # the whole point of this test (lazy load: None -> non-None). assert st_backend._model is not None - assert embeddings.shape == (1, st_backend._model.get_sentence_embedding_dimension()) # type: ignore[unreachable] + assert embeddings.shape == (1, st_backend._model.get_embedding_dimension()) # type: ignore[unreachable] def test_clear_ram(self, st_backend: SentenceTransformerEmbeddingBackend) -> None: """Test clearing model from RAM.""" diff --git a/tests/modules/test_dumper.py b/tests/modules/test_dumper.py index 52a60e88b..188c9ac35 100644 --- a/tests/modules/test_dumper.py +++ b/tests/modules/test_dumper.py @@ -41,8 +41,7 @@ class TestTransformers: def init_attributes(self) -> None: from transformers import AutoModelForSequenceClassification, AutoTokenizer - # reason: transformers AutoTokenizer.from_pretrained is untyped in stubs - self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") # type: ignore[no-untyped-call] + self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") self._tokenizer_predictions = np.array(self.tokenizer(["hello", "world"]).input_ids) self.transformer = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased") diff --git a/tests/test_deps.py b/tests/test_deps.py index a5c74a716..862fbac15 100644 --- a/tests/test_deps.py +++ b/tests/test_deps.py @@ -104,7 +104,7 @@ def test_resolve_recurses_into_nested_extra(monkeypatch: pytest.MonkeyPatch) -> _patch_metadata( monkeypatch, { - "autointent": ["transformers[torch]>=4.49.0,<5.0.0 ; extra == 'transformers'"], + "autointent": ["transformers[torch]>=5.0.0,<6.0.0 ; extra == 'transformers'"], "transformers": [ "torch>=2.2 ; extra == 'torch'", "accelerate>=0.26.0 ; extra == 'torch'", @@ -180,13 +180,13 @@ def test_require_detects_missing_nested_accelerate(monkeypatch: pytest.MonkeyPat _patch_metadata( monkeypatch, { - "autointent": ["transformers[torch]>=4.49.0,<5.0.0 ; extra == 'transformers'"], + "autointent": ["transformers[torch]>=5.0.0,<6.0.0 ; extra == 'transformers'"], "transformers": [ "torch>=2.2 ; extra == 'torch'", "accelerate>=0.26.0 ; extra == 'torch'", ], }, - {"transformers": "4.49.0", "torch": "2.2.0"}, # accelerate absent + {"transformers": "5.0.0", "torch": "2.2.0"}, # accelerate absent ) with pytest.raises(ImportError) as exc: deps.require("transformers") @@ -200,7 +200,7 @@ def test_require_reports_extra_package_entirely_missing(monkeypatch: pytest.Monk # `transformers[torch]` requirement is flagged as missing with the install hint. _patch_metadata( monkeypatch, - {"autointent": ["transformers[torch]>=4.49.0,<5.0.0 ; extra == 'transformers'"]}, + {"autointent": ["transformers[torch]>=5.0.0,<6.0.0 ; extra == 'transformers'"]}, {}, # transformers (and everything else) absent ) with pytest.raises(ImportError) as exc: