Skip to content

Commit 5dfc75d

Browse files
voorhs and claude committed
fix(deps): satisfy huggingface_hub v1 strict validation and mypy on v5
- _bert.py: coerce label names to str (label2id keys and id2label values). huggingface_hub 1.x raises StrictDataclassFieldValidationError on an int-keyed label2id; the v5 AutoModelForSequenceClassification.from_pretrained pipeline now routes through that validator, so the previous {int: int} mapping raised on every BertScorer.fit (and cascaded into a fallback hf_hub_download call that the test guard caught as 'unpinned').
- ranker.py: cast cross_encoder[0] to Any for auto_model.classifier access (nn.Sequential.__getitem__ is typed Tensor | Module on v5); add arg-type ignores on CrossEncoder.predict(list[tuple[str,str]]) calls — the v5 stub demands the much wider Sequence type but the list-of-pairs form is the documented call shape.
- Drop type: ignore comments mypy now reports as unused (AutoTokenizer.from_pretrained gained a typed stub in transformers v5; max_length matches TokenizerConfig.max_length cleanly).
- conftest.py: SentenceTransformer's constructor is typed Any on v5, so add no-any-return ignore at the fixture boundary.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent b94a27e commit 5dfc75d

5 files changed

Lines changed: 12 additions & 14 deletions

File tree

src/autointent/_dump_tools/unit_dumpers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -292,7 +292,7 @@ def load(path: Path, **kwargs: Any) -> PreTrainedTokenizer | PreTrainedTokenizer
292292
require("transformers")
293293
import transformers
294294

295-
return transformers.AutoTokenizer.from_pretrained(path) # type: ignore[no-any-return,no-untyped-call]
295+
return transformers.AutoTokenizer.from_pretrained(path)
296296

297297
@classmethod
298298
def check_isinstance(cls, obj: Any) -> bool: # noqa: ANN401

src/autointent/_wrappers/ranker.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
import logging
1313
from pathlib import Path
1414
from random import shuffle
15-
from typing import TYPE_CHECKING, Any, Literal, TypedDict
15+
from typing import TYPE_CHECKING, Any, Literal, TypedDict, cast
1616

1717
import joblib
1818
import numpy as np
@@ -127,7 +127,7 @@ def __init__(
127127
revision=self.config.revision,
128128
trust_remote_code=self.config.trust_remote_code,
129129
device=self.config.device,
130-
max_length=self.config.tokenizer_config.max_length, # type: ignore[arg-type]
130+
max_length=self.config.tokenizer_config.max_length,
131131
)
132132
self._train_head = False
133133
self._clf = classifier_head
@@ -138,9 +138,8 @@ def __init__(
138138
self._activations_list: list[npt.NDArray[Any]] = []
139139
# CrossEncoder is a nn.Sequential of modules; [0] is the Transformer
140140
# wrapping the HF model exposed as .auto_model.
141-
self._hook_handler = self.cross_encoder[0].auto_model.classifier.register_forward_hook(
142-
self._classifier_hook
143-
)
141+
transformer = cast("Any", self.cross_encoder[0])
142+
self._hook_handler = transformer.auto_model.classifier.register_forward_hook(self._classifier_hook)
144143

145144
def _classifier_hook(self, _module, input_tensor, _output_tensor) -> None: # type: ignore[no-untyped-def] # noqa: ANN001
146145
"""Hook to capture classifier activations.
@@ -165,13 +164,13 @@ def _get_features_or_predictions(self, pairs: list[tuple[str, str]]) -> npt.NDAr
165164
if not self._train_head:
166165
return np.array(
167166
self.cross_encoder.predict(
168-
pairs,
167+
pairs, # type: ignore[arg-type]
169168
batch_size=self.config.batch_size,
170169
activation_fn=nn.Sigmoid() if self.output_range == "sigmoid" else nn.Tanh(),
171170
)
172171
)
173172

174-
self.cross_encoder.predict(pairs, batch_size=self.config.batch_size)
173+
self.cross_encoder.predict(pairs, batch_size=self.config.batch_size) # type: ignore[arg-type]
175174
res = np.concatenate(self._activations_list, axis=0)
176175
self._activations_list.clear()
177176
return res # type: ignore[no-any-return]

src/autointent/modules/scoring/_bert.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -130,8 +130,8 @@ def get_implicit_initialization_params(self) -> dict[str, Any]:
130130
def _initialize_model(self) -> Any: # noqa: ANN401
131131
from transformers import AutoModelForSequenceClassification
132132

133-
label2id = {i: i for i in range(self._n_classes)}
134-
id2label = {i: i for i in range(self._n_classes)}
133+
label2id = {str(i): i for i in range(self._n_classes)}
134+
id2label = {i: str(i) for i in range(self._n_classes)}
135135

136136
return AutoModelForSequenceClassification.from_pretrained(
137137
self.classification_model_config.model_name,
@@ -152,7 +152,7 @@ def fit(
152152

153153
self._validate_task(labels)
154154

155-
self._tokenizer = AutoTokenizer.from_pretrained( # type: ignore[no-untyped-call]
155+
self._tokenizer = AutoTokenizer.from_pretrained(
156156
self.classification_model_config.model_name, revision=self.classification_model_config.revision
157157
)
158158
self._model = self._initialize_model()

tests/conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ def tiny_sentence_transformer() -> SentenceTransformer:
159159

160160
from autointent.configs._pinned_revisions import DEFAULT_REVISIONS
161161

162-
return SentenceTransformer(TINY_SENTENCE_TRANSFORMER, revision=DEFAULT_REVISIONS[TINY_SENTENCE_TRANSFORMER])
162+
return SentenceTransformer(TINY_SENTENCE_TRANSFORMER, revision=DEFAULT_REVISIONS[TINY_SENTENCE_TRANSFORMER]) # type: ignore[no-any-return]
163163

164164

165165
def apply_test_models(pipeline: Pipeline) -> None:

tests/modules/test_dumper.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,7 @@ class TestTransformers:
4141
def init_attributes(self) -> None:
4242
from transformers import AutoModelForSequenceClassification, AutoTokenizer
4343

44-
# reason: transformers AutoTokenizer.from_pretrained is untyped in stubs
45-
self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") # type: ignore[no-untyped-call]
44+
self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
4645
self._tokenizer_predictions = np.array(self.tokenizer(["hello", "world"]).input_ids)
4746
self.transformer = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
4847

0 commit comments

Comments
 (0)