fix(deps): satisfy huggingface_hub v1 strict validation and mypy on v5

voorhs · claude · voorhs · commit 5dfc75d6aca7 · 2026-06-28T00:04:04.000+03:00
- _bert.py: make label2id map str -> int and id2label map int -> str
  (previously both were {int: int}). huggingface_hub 1.x strict
  validation raises StrictDataclassFieldValidationError on an
  int-keyed label2id, and the v5
  AutoModelForSequenceClassification.from_pretrained pipeline now
  routes through that validator, so the previous mapping raised on
  every BertScorer.fit (and cascaded into a fallback hf_hub_download
  call that the test guard caught as 'unpinned').
- ranker.py: cast cross_encoder[0] to Any for auto_model.classifier
  access (nn.Sequential.__getitem__ is typed Tensor | Module on v5);
  add arg-type ignores on CrossEncoder.predict(list[tuple[str,str]])
  calls — the v5 stub demands the much wider Sequence type but the
  list-of-pairs form is the documented call shape.
- Drop `# type: ignore` comments that mypy now reports as unused
  (AutoTokenizer.from_pretrained gained a typed stub in transformers
  v5; max_length matches TokenizerConfig.max_length cleanly).
- conftest.py: SentenceTransformer's constructor is typed Any on v5,
  so add no-any-return ignore at the fixture boundary.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
diff --git a/src/autointent/_dump_tools/unit_dumpers.py b/src/autointent/_dump_tools/unit_dumpers.py
@@ -292,7 +292,7 @@ def load(path: Path, **kwargs: Any) -> PreTrainedTokenizer | PreTrainedTokenizer
         require("transformers")
         import transformers
 
-        return transformers.AutoTokenizer.from_pretrained(path)  # type: ignore[no-any-return,no-untyped-call]
+        return transformers.AutoTokenizer.from_pretrained(path)
 
     @classmethod
     def check_isinstance(cls, obj: Any) -> bool:  # noqa: ANN401
diff --git a/src/autointent/_wrappers/ranker.py b/src/autointent/_wrappers/ranker.py
@@ -12,7 +12,7 @@
 import logging
 from pathlib import Path
 from random import shuffle
-from typing import TYPE_CHECKING, Any, Literal, TypedDict
+from typing import TYPE_CHECKING, Any, Literal, TypedDict, cast
 
 import joblib
 import numpy as np
@@ -127,7 +127,7 @@ def __init__(
             revision=self.config.revision,
             trust_remote_code=self.config.trust_remote_code,
             device=self.config.device,
-            max_length=self.config.tokenizer_config.max_length,  # type: ignore[arg-type]
+            max_length=self.config.tokenizer_config.max_length,
         )
         self._train_head = False
         self._clf = classifier_head
@@ -138,9 +138,8 @@ def __init__(
             self._activations_list: list[npt.NDArray[Any]] = []
             # CrossEncoder is a nn.Sequential of modules; [0] is the Transformer
             # wrapping the HF model exposed as .auto_model.
-            self._hook_handler = self.cross_encoder[0].auto_model.classifier.register_forward_hook(
-                self._classifier_hook
-            )
+            transformer = cast("Any", self.cross_encoder[0])
+            self._hook_handler = transformer.auto_model.classifier.register_forward_hook(self._classifier_hook)
 
     def _classifier_hook(self, _module, input_tensor, _output_tensor) -> None:  # type: ignore[no-untyped-def] # noqa: ANN001
         """Hook to capture classifier activations.
@@ -165,13 +164,13 @@ def _get_features_or_predictions(self, pairs: list[tuple[str, str]]) -> npt.NDAr
         if not self._train_head:
             return np.array(
                 self.cross_encoder.predict(
-                    pairs,
+                    pairs,  # type: ignore[arg-type]
                     batch_size=self.config.batch_size,
                     activation_fn=nn.Sigmoid() if self.output_range == "sigmoid" else nn.Tanh(),
                 )
             )
 
-        self.cross_encoder.predict(pairs, batch_size=self.config.batch_size)
+        self.cross_encoder.predict(pairs, batch_size=self.config.batch_size)  # type: ignore[arg-type]
         res = np.concatenate(self._activations_list, axis=0)
         self._activations_list.clear()
         return res  # type: ignore[no-any-return]
diff --git a/src/autointent/modules/scoring/_bert.py b/src/autointent/modules/scoring/_bert.py
@@ -130,8 +130,8 @@ def get_implicit_initialization_params(self) -> dict[str, Any]:
     def _initialize_model(self) -> Any:  # noqa: ANN401
         from transformers import AutoModelForSequenceClassification
 
-        label2id = {i: i for i in range(self._n_classes)}
-        id2label = {i: i for i in range(self._n_classes)}
+        label2id = {str(i): i for i in range(self._n_classes)}
+        id2label = {i: str(i) for i in range(self._n_classes)}
 
         return AutoModelForSequenceClassification.from_pretrained(
             self.classification_model_config.model_name,
@@ -152,7 +152,7 @@ def fit(
 
         self._validate_task(labels)
 
-        self._tokenizer = AutoTokenizer.from_pretrained(  # type: ignore[no-untyped-call]
+        self._tokenizer = AutoTokenizer.from_pretrained(
             self.classification_model_config.model_name, revision=self.classification_model_config.revision
         )
         self._model = self._initialize_model()
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -159,7 +159,7 @@ def tiny_sentence_transformer() -> SentenceTransformer:
 
     from autointent.configs._pinned_revisions import DEFAULT_REVISIONS
 
-    return SentenceTransformer(TINY_SENTENCE_TRANSFORMER, revision=DEFAULT_REVISIONS[TINY_SENTENCE_TRANSFORMER])
+    return SentenceTransformer(TINY_SENTENCE_TRANSFORMER, revision=DEFAULT_REVISIONS[TINY_SENTENCE_TRANSFORMER])  # type: ignore[no-any-return]
 
 
 def apply_test_models(pipeline: Pipeline) -> None:
diff --git a/tests/modules/test_dumper.py b/tests/modules/test_dumper.py
@@ -41,8 +41,7 @@ class TestTransformers:
     def init_attributes(self) -> None:
         from transformers import AutoModelForSequenceClassification, AutoTokenizer
 
-        # reason: transformers AutoTokenizer.from_pretrained is untyped in stubs
-        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # type: ignore[no-untyped-call]
+        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
         self._tokenizer_predictions = np.array(self.tokenizer(["hello", "world"]).input_ids)
         self.transformer = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")