Fix three Codex-review findings on variant= probe

maxwbuckley · claude · maxwbuckley · commit 8abe7fd79aa6 · 2026-04-27T23:40:42.000+02:00
[P2] Pass cache_dir through the variant probe.
``_variant_available`` and ``_resolve_variant`` previously didn't accept
``cache_dir``, so a caller using ``from_pretrained(..., cache_dir=X)``
would have ``hf_hub_download`` probe into the *default* HF cache and
then ``snapshot_download(..., cache_dir=X)`` could not reuse the probe's
download. Result: cold starts paid for the variant weights twice and
the user's requested cache location was bypassed. Both helpers now
take ``cache_dir`` and forward it to ``try_to_load_from_cache`` and
``hf_hub_download``.

[P2] Sharded variant safetensors.
``_variant_allow_patterns`` already included
``model.{variant}.safetensors.index.json`` for forward compatibility,
but the actual shard files
(``model-XXXXX-of-YYYYY.{variant}.safetensors``) were excluded by the
allow-list, so a publisher who shipped a sharded fp16 / bf16 variant
would get only the index file pulled — the load would then fail or
silently fall back to fp32. Added the ``model-*-of-*.{variant}.safetensors``
glob.

[P2] dtype-vs-variant consistency in the outer dispatcher.
``GLiNER.from_pretrained`` (the outer class-level dispatcher) ran the
variant probe before checking dtype/variant consistency. When the
variant file was missing from the Hub, ``_resolve_variant`` downgraded
to ``None`` and the inner consistency check was then skipped — silently
accepting ``variant='bf16', dtype='fp16'`` and loading fp16 instead of
raising the documented mismatch error. Hoisted the consistency check
above the probe in the outer dispatcher to mirror the inner logic.

Test coverage:
- New ``test_includes_sharded_safetensors_pattern`` asserts the shard
  glob is per-variant (no cross-variant slip-through, default shards
  still excluded).
- Updated ``test_fp16_and_bf16_differ_only_in_variant_filename`` to
  account for the new shard pattern entries in the symmetric
  difference.
- New ``test_outer_dispatcher_mismatch_raises_before_probe`` exercises
  the outer-dispatcher path with a ``model_id`` that points at an empty
  temporary directory (no model files) and a mismatched
  ``variant``/``dtype`` pair, asserting the ``ValueError`` fires before
  any I/O. Guards against the silent-fp16 regression.

90 unit tests pass (88 existing + 2 new). Ruff lint and format clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/gliner/model.py b/gliner/model.py
@@ -319,19 +319,26 @@ def _normalize_variant(cls, variant) -> Optional[str]:
     def _variant_allow_patterns(variant: str) -> list:
         """Return ``snapshot_download(allow_patterns=...)`` for a variant.
 
-        The patterns include the variant safetensors file (and its sharded
-        index, if present) plus the configs and tokenizer assets every load
-        needs. The default ``model.safetensors`` and ``pytorch_model.bin`` are
-        deliberately excluded so the caller pays I/O only for the requested
-        variant.
+        Includes the single-file variant safetensors, the sharded variant
+        index, the actual sharded variant safetensors files, and the configs
+        and tokenizer assets every load needs. The default
+        ``model.safetensors`` and ``pytorch_model.bin`` are deliberately
+        excluded so the caller pays I/O only for the requested variant.
+
+        Sharded checkpoint convention (transformers-style):
+            ``model-00001-of-NNNNN.{variant}.safetensors``
+            ``model.{variant}.safetensors.index.json``
         """
         return [
             "*.json",
             "*.txt",
             "spiece.model",
             "sentencepiece.bpe.model",
+            # Single-file variant.
             f"model.{variant}.safetensors",
+            # Sharded variant: index file + per-shard files.
             f"model.{variant}.safetensors.index.json",
+            f"model-*-of-*.{variant}.safetensors",
         ]
 
     @classmethod
@@ -340,6 +347,7 @@ def _variant_available(
         model_id: str,
         variant: str,
         revision: Optional[str] = None,
+        cache_dir: Optional[Union[str, Path]] = None,
         token: Union[str, bool, None] = None,
         local_files_only: bool = False,
     ) -> Optional[bool]:
@@ -375,8 +383,11 @@ def _variant_available(
         # try_to_load_from_cache validates the repo_id format; an
         # HFValidationError here means the input isn't a valid repo_id at
         # all (e.g. a non-existent local path), so treat as uncertain.
+        # ``cache_dir`` must match what ``snapshot_download`` will use, or the
+        # probe and the actual download diverge (and we'd download the variant
+        # twice).
         try:
-            cached = try_to_load_from_cache(repo_id=model_id, filename=target, revision=revision)
+            cached = try_to_load_from_cache(repo_id=model_id, filename=target, revision=revision, cache_dir=cache_dir)
         except Exception:
             return None
         if isinstance(cached, str):
@@ -388,8 +399,16 @@ def _variant_available(
 
         # 4. Try-and-recover via hf_hub_download. Success caches the file so
         # the subsequent snapshot_download reuses it (no double download).
+        # cache_dir must propagate so the probe and snapshot_download share
+        # the same store.
         try:
-            hf_hub_download(repo_id=model_id, filename=target, revision=revision, token=token)
+            hf_hub_download(
+                repo_id=model_id,
+                filename=target,
+                revision=revision,
+                cache_dir=cache_dir,
+                token=token,
+            )
             return True
         except EntryNotFoundError:
             return False
@@ -404,6 +423,7 @@ def _resolve_variant(
         model_id: str,
         variant: Optional[str],
         revision: Optional[str] = None,
+        cache_dir: Optional[Union[str, Path]] = None,
         token: Union[str, bool, None] = None,
         local_files_only: bool = False,
     ) -> Optional[str]:
@@ -420,7 +440,12 @@ def _resolve_variant(
         if variant is None:
             return None
         available = cls._variant_available(
-            model_id, variant, revision=revision, token=token, local_files_only=local_files_only
+            model_id,
+            variant,
+            revision=revision,
+            cache_dir=cache_dir,
+            token=token,
+            local_files_only=local_files_only,
         )
         if available is False:
             # TODO(strict-variant): once half-precision variant files have been
@@ -1046,6 +1071,7 @@ def from_pretrained(
                     model_id,
                     variant,
                     revision=revision,
+                    cache_dir=cache_dir,
                     token=token,
                     local_files_only=local_files_only,
                 )
@@ -4530,22 +4556,39 @@ def from_pretrained(
         # outer ``GLiNER`` class doesn't inherit from ``BaseGLiNER``; reuse
         # the helpers directly so behavior stays in lockstep.
         normalized_variant = BaseGLiNER._normalize_variant(variant)
+
+        # dtype-vs-variant consistency check MUST run before the probe.
+        # Otherwise, when the variant file is missing on the Hub,
+        # ``_resolve_variant`` downgrades to ``None`` and the inner
+        # ``from_pretrained``'s consistency check is skipped — silently
+        # accepting a ``variant="bf16", dtype="fp16"`` mismatch instead of
+        # raising as documented.
+        torch_dtype = BaseGLiNER._parse_dtype(dtype)
+        if normalized_variant is not None:
+            variant_dtype = BaseGLiNER._VARIANT_TO_DTYPE[normalized_variant]
+            if torch_dtype is None:
+                torch_dtype = variant_dtype
+                # Propagate the variant's dtype so the inner cast-on-read still
+                # produces the requested precision after a fallback.
+                dtype = variant_dtype
+            elif torch_dtype != variant_dtype:
+                raise ValueError(
+                    f"variant={normalized_variant!r} requires dtype={variant_dtype}; "
+                    f"got dtype={torch_dtype}. Drop dtype= to inherit from variant, "
+                    f"or unset variant= to load the default file."
+                )
+
         # Probe for availability and warn-and-fall-back to None if the variant
         # file isn't published. The inner from_pretrained will see model_dir
         # is already populated and skip its own probe — no double round-trip.
         normalized_variant = BaseGLiNER._resolve_variant(
             model_id,
             normalized_variant,
             revision=revision,
+            cache_dir=cache_dir,
             token=token,
             local_files_only=local_files_only,
         )
-        # If the probe downgraded variant -> None but the user asked for a
-        # specific variant, propagate the variant's dtype so the inner cast-on-
-        # read still produces the requested precision.
-        if variant is not None and normalized_variant is None and dtype is None:
-            original = BaseGLiNER._normalize_variant(variant)
-            dtype = BaseGLiNER._VARIANT_TO_DTYPE[original]
 
         model_dir = BaseGLiNER._download_model(
             model_id,
diff --git a/tests/test_quantize_and_dtype.py b/tests/test_quantize_and_dtype.py
@@ -13,6 +13,7 @@
 from torch import nn
 from safetensors.torch import save_file
 
+from gliner import GLiNER
 from gliner.model import BaseGLiNER
 
 
@@ -295,8 +296,26 @@ def test_fp16_and_bf16_differ_only_in_variant_filename(self):
             "model.bf16.safetensors",
             "model.fp16.safetensors.index.json",
             "model.bf16.safetensors.index.json",
+            # Sharded variant patterns must also differ between fp16 and bf16,
+            # otherwise large multi-file checkpoints can't pull only the
+            # requested precision's shards.
+            "model-*-of-*.fp16.safetensors",
+            "model-*-of-*.bf16.safetensors",
         }
 
+    def test_includes_sharded_safetensors_pattern(self):
+        """Sharded variant checkpoints place tensor data in
+        ``model-XXXXX-of-YYYYY.{variant}.safetensors`` files; without the
+        wildcard pattern the index would download but the actual shards
+        would be filtered out.
+        """
+        patterns = BaseGLiNER._variant_allow_patterns("bf16")
+        assert "model-*-of-*.bf16.safetensors" in patterns
+        # Wrong variant must not slip through the sharded match.
+        assert "model-*-of-*.fp16.safetensors" not in patterns
+        # Default-variant shards must still be excluded.
+        assert "model-*-of-*.safetensors" not in patterns
+
 
 class TestVariantDtypeConsistency:
     """``variant=`` and ``dtype=`` must agree (or only one set).
@@ -339,6 +358,25 @@ def test_int_dtype_against_variant_rejected_by_dtype_parser(self, tmp_path: Path
                 dtype=torch.int8,
             )
 
+    def test_outer_dispatcher_mismatch_raises_before_probe(self, tmp_path: Path):
+        """Codex review finding: the outer ``GLiNER.from_pretrained`` used to
+        run the variant probe before checking dtype/variant consistency, so
+        when the variant file was missing on the Hub the consistency check
+        was skipped and a ``variant='bf16', dtype='fp16'`` mismatch would
+        load fp16 silently. This test guards the regression by checking
+        the mismatch raises even when the model_id is a non-existent path
+        (which would otherwise be caught later by the download step).
+        """
+        # tmp_path has no gliner_config.json; if the consistency check runs
+        # first, we get a ValueError. If it runs after the probe (the bug),
+        # we'd get a FileNotFoundError or warning instead.
+        with pytest.raises(ValueError, match="variant='bf16' requires"):
+            GLiNER.from_pretrained(
+                model_id=str(tmp_path),
+                variant="bf16",
+                dtype="fp16",
+            )
+
 
 class TestVariantAvailable:
     """``_variant_available`` probe for variant file presence."""