Commit bcc6929

merge

2 parents e59cab5 + 13095c9

8 files changed

Lines changed: 1140 additions & 1043 deletions

.github/workflows/ci.yaml

Lines changed: 1 addition & 22 deletions
@@ -9,17 +9,8 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: ["ubuntu-latest", "windows-latest"]
+        os: ["ubuntu-latest"]
         python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
-        exclude:
-          - os: windows-latest
-            python-version: "3.9"
-          - os: windows-latest
-            python-version: "3.11"
-          - os: windows-latest
-            python-version: "3.12"
-          - os: windows-latest
-            python-version: "3.13"
       fail-fast: false

     steps:
@@ -31,19 +22,7 @@ jobs:
         python-version: ${{ matrix.python-version }}
         allow-prereleases: true

-      # Step for Windows: Create and activate a virtual environment
-      - name: Create and activate a virtual environment (Windows)
-        if: ${{ runner.os == 'Windows' }}
-        run: |
-          irm https://astral.sh/uv/install.ps1 | iex
-          $env:Path = "C:\Users\runneradmin\.local\bin;$env:Path"
-          uv venv .venv
-          "VIRTUAL_ENV=.venv" | Out-File -FilePath $env:GITHUB_ENV -Append
-          "$PWD/.venv/Scripts" | Out-File -FilePath $env:GITHUB_PATH -Append
-
-      # Step for Unix: Create and activate a virtual environment
       - name: Create and activate a virtual environment (Unix)
-        if: ${{ runner.os != 'Windows' }}
        run: |
           curl -LsSf https://astral.sh/uv/install.sh | sh
           uv venv .venv

model2vec/hf_utils.py

Lines changed: 57 additions & 30 deletions
@@ -9,6 +9,7 @@
 import numpy as np
 import safetensors
 from huggingface_hub import ModelCard, ModelCardData
+from huggingface_hub.constants import HF_HUB_CACHE
 from safetensors.numpy import save_file
 from tokenizers import Tokenizer

@@ -107,9 +108,10 @@ def _create_model_card(

 def load_pretrained(
     folder_or_repo_path: str | Path,
-    subfolder: str | None = None,
-    token: str | None = None,
-    from_sentence_transformers: bool = False,
+    subfolder: str | None,
+    token: str | None,
+    from_sentence_transformers: bool,
+    force_download: bool,
 ) -> tuple[np.ndarray, Tokenizer, dict[str, Any], dict[str, Any], np.ndarray | None, np.ndarray | None]:
     """
     Loads a pretrained model from a folder.
@@ -120,8 +122,10 @@ def load_pretrained(
     :param subfolder: The subfolder to load from.
     :param token: The huggingface token to use.
     :param from_sentence_transformers: Whether to load the model from a sentence transformers model.
+    :param force_download: Whether to force the download of the model. If False, the model is only downloaded if it is not
+        already present in the cache.
     :raises: FileNotFoundError if the folder exists, but the file does not exist locally.
-    :return: The embeddings, tokenizer, config, and metadata.
+    :return: The embeddings, tokenizer, config, metadata, weights and mapping.

     """
     if from_sentence_transformers:
@@ -133,7 +137,13 @@ def load_pretrained(
         tokenizer_file = "tokenizer.json"
         config_name = "config.json"

-    folder_or_repo_path = Path(folder_or_repo_path)
+    cached_folder = _get_latest_model_path(str(folder_or_repo_path))
+    if cached_folder and not force_download:
+        logger.info(f"Found cached model at {cached_folder}, loading from cache.")
+        folder_or_repo_path = cached_folder
+    else:
+        logger.info(f"No cached model found for {folder_or_repo_path}, loading from local or hub.")
+        folder_or_repo_path = Path(folder_or_repo_path)

     local_folder = folder_or_repo_path / subfolder if subfolder else folder_or_repo_path

@@ -150,9 +160,7 @@ def load_pretrained(
         if not tokenizer_path.exists():
             raise FileNotFoundError(f"Tokenizer file does not exist in {local_folder}")

-        # README is optional, so this is a bit finicky.
         readme_path = local_folder / "README.md"
-        metadata = _get_metadata_from_readme(readme_path)

     else:
         logger.info("Folder does not exist locally, attempting to use huggingface hub.")
@@ -161,18 +169,11 @@ def load_pretrained(
                 folder_or_repo_path.as_posix(), model_file, token=token, subfolder=subfolder
             )
         )
-
-        try:
-            readme_path = Path(
-                huggingface_hub.hf_hub_download(
-                    folder_or_repo_path.as_posix(), "README.md", token=token, subfolder=subfolder
-                )
+        readme_path = Path(
+            huggingface_hub.hf_hub_download(
+                folder_or_repo_path.as_posix(), "README.md", token=token, subfolder=subfolder
             )
-            metadata = _get_metadata_from_readme(Path(readme_path))
-        except Exception as e:
-            # NOTE: we don't want to raise an error here, since the README is optional.
-            logger.info(f"No README found in the model folder: {e} No model card loaded.")
-            metadata = {}
+        )

         config_path = Path(
             huggingface_hub.hf_hub_download(
@@ -186,21 +187,22 @@ def load_pretrained(
         )

     opened_tensor_file = cast(SafeOpenProtocol, safetensors.safe_open(embeddings_path, framework="numpy"))
-    if from_sentence_transformers:
-        embeddings = opened_tensor_file.get_tensor("embedding.weight")
+    embedding_name = "embedding.weight" if from_sentence_transformers else "embeddings"
+    embeddings = opened_tensor_file.get_tensor(embedding_name)
+    try:
+        weights = opened_tensor_file.get_tensor("weights")
+    except Exception:
+        # Bare except because safetensors does not export its own errors.
         weights = None
+    try:
+        mapping = opened_tensor_file.get_tensor("mapping")
+    except Exception:
         mapping = None
+
+    if readme_path.exists():
+        metadata = _get_metadata_from_readme(readme_path)
     else:
-        embeddings = opened_tensor_file.get_tensor("embeddings")
-        try:
-            weights = opened_tensor_file.get_tensor("weights")
-        except Exception:
-            # Bare except because safetensors does not export its own errors.
-            weights = None
-        try:
-            mapping = opened_tensor_file.get_tensor("mapping")
-        except Exception:
-            mapping = None
+        metadata = {}

     tokenizer: Tokenizer = Tokenizer.from_file(str(tokenizer_path))
     config = json.load(open(config_path))
@@ -240,3 +242,28 @@ def push_folder_to_hub(
     huggingface_hub.upload_folder(repo_id=repo_id, folder_path=folder_path, token=token, path_in_repo=subfolder)

     logger.info(f"Pushed model to {repo_id}")
+
+
+def _get_latest_model_path(model_id: str) -> Path | None:
+    """
+    Gets the latest model path for a given identifier from the hugging face hub cache.
+
+    Returns None if there is no cached model. In this case, the model will be downloaded.
+    """
+    # Make path object
+    cache_dir = Path(HF_HUB_CACHE)
+    # This is specific to how HF stores the files.
+    normalized = model_id.replace("/", "--")
+    repo_dir = cache_dir / f"models--{normalized}" / "snapshots"
+
+    if not repo_dir.exists():
+        return None
+
+    # Find all directories.
+    snapshots = [p for p in repo_dir.iterdir() if p.is_dir()]
+    if not snapshots:
+        return None
+
+    # Get the latest directory by modification time.
+    latest_snapshot = max(snapshots, key=lambda p: p.stat().st_mtime)
+    return latest_snapshot
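
With the keyword defaults removed, load_pretrained now requires callers to spell out every argument, which makes the new cache behavior explicit at each call site. A minimal sketch of a direct call under the new signature; the repo id is a hypothetical placeholder:

from model2vec.hf_utils import load_pretrained

# Hypothetical repo id; any Model2Vec repo on the hub would work here.
embeddings, tokenizer, config, metadata, weights, mapping = load_pretrained(
    "some-org/some-static-model",
    subfolder=None,
    token=None,
    from_sentence_transformers=False,
    force_download=False,  # reuse the newest cached snapshot if one exists
)
print(embeddings.shape, config)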

model2vec/model.py

Lines changed: 18 additions & 35 deletions
@@ -12,8 +12,8 @@
 from tokenizers import Encoding, Tokenizer
 from tqdm import tqdm

-from model2vec.quantization import DType
-from model2vec.utils import ProgressParallel, load_local_model
+from model2vec.quantization import DType, quantize_and_reduce_dim
+from model2vec.utils import ProgressParallel

 PathLike = Union[Path, str]

@@ -174,6 +174,7 @@ def from_pretrained(
         quantize_to: str | DType | None = None,
         dimensionality: int | None = None,
         vocabulary_quantization: int | None = None,
+        force_download: bool = True,
     ) -> StaticModel:
         """
         Load a StaticModel from a local path or huggingface hub path.
@@ -190,6 +191,8 @@ def from_pretrained(
             This is useful if you want to load a model with a lower dimensionality.
             Note that this only applies if you have trained your model using mrl or PCA.
         :param vocabulary_quantization: The number of clusters to use for vocabulary quantization.
+        :param force_download: Whether to force the download of the model. If False, the model is only downloaded if it is not
+            already present in the cache.
         :return: A StaticModel.
         """
         return _loading_helper(
@@ -202,6 +205,7 @@ def from_pretrained(
             from_sentence_transformers=False,
             normalize=normalize,
             subfolder=subfolder,
+            force_download=force_download,
         )

     @classmethod
@@ -213,6 +217,7 @@ def from_sentence_transformers(
         quantize_to: str | DType | None = None,
         dimensionality: int | None = None,
         vocabulary_quantization: int | None = None,
+        force_download: bool = True,
     ) -> StaticModel:
         """
         Load a StaticModel trained with sentence transformers from a local path or huggingface hub path.
@@ -228,6 +233,8 @@ def from_sentence_transformers(
             This is useful if you want to load a model with a lower dimensionality.
             Note that this only applies if you have trained your model using mrl or PCA.
         :param vocabulary_quantization: The number of clusters to use for vocabulary quantization.
+        :param force_download: Whether to force the download of the model. If False, the model is only downloaded if it is not
+            already present in the cache.
         :return: A StaticModel.
         """
         return _loading_helper(
@@ -240,6 +247,7 @@ def from_sentence_transformers(
             from_sentence_transformers=True,
             normalize=normalize,
             subfolder=None,
+            force_download=force_download,
         )

     @overload
@@ -467,33 +475,6 @@ def push_to_hub(
             self.save_pretrained(temp_dir, model_name=repo_id)
             push_folder_to_hub(Path(temp_dir), subfolder=subfolder, repo_id=repo_id, private=private, token=token)

-    @classmethod
-    def load_local(cls: type[StaticModel], path: PathLike) -> StaticModel:
-        """
-        Loads a model from a local path.
-
-        You should only use this code path if you are concerned with start-up time.
-        Loading via the `from_pretrained` method is safer, and auto-downloads, but
-        also means we import a whole bunch of huggingface code that we don't need.
-
-        Additionally, huggingface will check the most recent version of the model,
-        which can be slow.
-
-        :param path: The path to load the model from. The path is a directory saved by the
-            `save_pretrained` method.
-        :return: A StaticModel
-        :raises: ValueError if the path is not a directory.
-        """
-        path = Path(path)
-        if not path.is_dir():
-            raise ValueError(f"Path {path} is not a directory.")
-
-        embeddings, tokenizer, config, weights, mapping = load_local_model(path)
-
-        return StaticModel(
-            vectors=embeddings, tokenizer=tokenizer, config=config, weights=weights, token_mapping=mapping
-        )
-

 def quantize_model(
     model: StaticModel,
@@ -552,12 +533,13 @@ def _loading_helper(
     cls: type[StaticModel],
     path: PathLike,
     token: str | None,
-    vocabulary_quantization: int | None = None,
-    quantize_to: str | DType | None = None,
-    dimensionality: int | None = None,
-    from_sentence_transformers: bool = False,
-    normalize: bool | None = None,
-    subfolder: str | None = None,
+    vocabulary_quantization: int | None,
+    quantize_to: str | DType | None,
+    dimensionality: int | None,
+    from_sentence_transformers: bool,
+    normalize: bool | None,
+    subfolder: str | None,
+    force_download: bool,
 ) -> StaticModel:
     """Helper function to load a model from a directory."""
     from model2vec.hf_utils import load_pretrained
@@ -570,6 +552,7 @@ def _loading_helper(
         token=token,
         from_sentence_transformers=from_sentence_transformers,
         subfolder=subfolder,
+        force_download=force_download,
     )

     model = cls(
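
Because force_download defaults to True, existing from_pretrained calls keep their old behavior; passing False resolves the newest cached snapshot via _get_latest_model_path and skips the hub round-trip, which also covers the fast-startup use case of the removed load_local. A usage sketch with a hypothetical model id:

from model2vec import StaticModel

# Hypothetical model id; with force_download=False the newest cached
# snapshot is used when present instead of contacting the hub.
model = StaticModel.from_pretrained("some-org/some-static-model", force_download=False)
vectors = model.encode(["a quick test sentence"])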

model2vec/train/base.py

Lines changed: 4 additions & 2 deletions
@@ -81,7 +81,9 @@ def from_pretrained(
         return cls.from_static_model(model=model, out_dim=out_dim, **kwargs)

     @classmethod
-    def from_static_model(cls: type[ModelType], *, model: StaticModel, out_dim: int = 2, **kwargs: Any) -> ModelType:
+    def from_static_model(
+        cls: type[ModelType], *, model: StaticModel, out_dim: int = 2, pad_token: str = "[PAD]", **kwargs: Any
+    ) -> ModelType:
         """Load the model from a static model."""
         model.embedding = np.nan_to_num(model.embedding)
         weights = torch.from_numpy(model.weights) if model.weights is not None else None
@@ -92,7 +94,7 @@ def from_static_model(
             token_mapping = None
         return cls(
             vectors=embeddings_converted,
-            pad_id=model.tokenizer.token_to_id("[PAD]"),
+            pad_id=model.tokenizer.token_to_id(pad_token),
             out_dim=out_dim,
             tokenizer=model.tokenizer,
             token_mapping=token_mapping,
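
The new pad_token argument lets models whose tokenizer uses a padding token other than the literal "[PAD]" resolve a valid pad_id. A hedged sketch, assuming the trainable head is StaticModelForClassification from model2vec.train and a hypothetical base model id; the token string must exist in the tokenizer's vocabulary, otherwise token_to_id returns None:

from model2vec import StaticModel
from model2vec.train import StaticModelForClassification

# Hypothetical model id; "<pad>" stands in for a tokenizer whose padding
# token differs from the "[PAD]" default.
base = StaticModel.from_pretrained("some-org/some-static-model")
classifier = StaticModelForClassification.from_static_model(model=base, pad_token="<pad>")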

model2vec/utils.py

Lines changed: 0 additions & 30 deletions
@@ -102,33 +102,3 @@ def setup_logging() -> None:
         datefmt="%Y-%m-%d %H:%M:%S",
         handlers=[RichHandler(rich_tracebacks=True)],
     )
-
-
-def load_local_model(
-    folder: Path,
-) -> tuple[np.ndarray, Tokenizer, dict[str, str], np.ndarray | None, np.ndarray | None]:
-    """Load a local model."""
-    embeddings_path = folder / "model.safetensors"
-    tokenizer_path = folder / "tokenizer.json"
-    config_path = folder / "config.json"
-
-    opened_tensor_file = cast(SafeOpenProtocol, safetensors.safe_open(embeddings_path, framework="numpy"))
-    embeddings = opened_tensor_file.get_tensor("embeddings")
-    try:
-        weights = opened_tensor_file.get_tensor("weights")
-    except Exception:
-        # Bare except because safetensors does not export its own errors.
-        weights = None
-    try:
-        mapping = opened_tensor_file.get_tensor("mapping")
-    except Exception:
-        mapping = None
-
-    if config_path.exists():
-        config = json.load(open(config_path))
-    else:
-        config = {}
-
-    tokenizer: Tokenizer = Tokenizer.from_file(str(tokenizer_path))
-
-    return embeddings, tokenizer, config, weights, mapping
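
The deleted helper, like its replacement path in load_pretrained, probes the optional "weights" and "mapping" tensors with a bare except because safetensors does not export its own error types. An alternative sketch that checks the file's key set instead; this is a possible refactor under the same file layout, not what the commit does:

from pathlib import Path

import safetensors


def read_optional_tensors(folder: Path):
    """Read 'embeddings' plus the optional 'weights' and 'mapping' tensors."""
    with safetensors.safe_open(folder / "model.safetensors", framework="numpy") as f:
        available = set(f.keys())  # names of the tensors stored in the file
        embeddings = f.get_tensor("embeddings")
        weights = f.get_tensor("weights") if "weights" in available else None
        mapping = f.get_tensor("mapping") if "mapping" in available else None
    return embeddings, weights, mapping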

tests/test_model.py

Lines changed: 2 additions & 22 deletions
@@ -119,9 +119,9 @@ def test_encode_as_tokens_empty(
     encoded = model.encode_as_sequence("")
     assert np.array_equal(encoded, np.zeros(shape=(0, 2), dtype=model.embedding.dtype))

-    encoded = model.encode_as_sequence(["", ""])
+    encoded_list = model.encode_as_sequence(["", ""])
     out = [np.zeros(shape=(0, 2), dtype=model.embedding.dtype) for _ in range(2)]
-    assert [np.array_equal(x, y) for x, y in zip(encoded, out)]
+    assert [np.array_equal(x, y) for x, y in zip(encoded_list, out)]


 def test_encode_empty_sentence(
@@ -298,23 +298,3 @@ def test_dim(mock_vectors: np.ndarray, mock_tokenizer: Tokenizer, mock_config: d
     model = StaticModel(mock_vectors, mock_tokenizer, mock_config)
     assert model.dim == 2
     assert model.dim == model.embedding.shape[1]
-
-
-def test_local_load_from_model(mock_tokenizer: Tokenizer) -> None:
-    """Test local load from a model."""
-    x = np.ones((mock_tokenizer.get_vocab_size(), 2))
-    with TemporaryDirectory() as tempdir:
-        tempdir_path = Path(tempdir)
-        safetensors.numpy.save_file({"embeddings": x}, Path(tempdir) / "model.safetensors")
-        mock_tokenizer.save(str(Path(tempdir) / "tokenizer.json"))
-
-        model = StaticModel.load_local(tempdir_path)
-        assert model.embedding.shape == x.shape
-        assert model.tokenizer.to_str() == mock_tokenizer.to_str()
-        assert model.config == {"normalize": False}
-
-
-def test_local_load_from_model_no_folder() -> None:
-    """Test local load from a model with no folder."""
-    with pytest.raises(ValueError):
-        StaticModel.load_local("woahbuddy_relax_this_is_just_a_test")
