fix: add force_download option and remove skip_metadata parameter

stephantul · stephantul · commit 669f7324b047 · 2025-09-09T12:42:04.000+02:00
diff --git a/model2vec/hf_utils.py b/model2vec/hf_utils.py
@@ -100,7 +100,7 @@ def load_pretrained(
     subfolder: str | None = None,
     token: str | None = None,
     from_sentence_transformers: bool = False,
-    skip_metadata: bool = False,
+    force_download: bool = False,
 ) -> tuple[np.ndarray, Tokenizer, dict[str, Any], dict[str, Any]]:
     """
     Loads a pretrained model from a folder.
@@ -111,7 +111,8 @@ def load_pretrained(
     :param subfolder: The subfolder to load from.
     :param token: The huggingface token to use.
     :param from_sentence_transformers: Whether to load the model from a sentence transformers model.
-    :param skip_metadata: Whether to skip loading metadata. This is useful if you don't need the metadata.
+    :param force_download: Whether to force the download of the model. If False, the model is only downloaded if it is not
+        already present in the cache.
     :raises: FileNotFoundError if the folder exists, but the file does not exist locally.
     :return: The embeddings, tokenizer, config, and metadata.
 
@@ -125,7 +126,8 @@ def load_pretrained(
         tokenizer_file = "tokenizer.json"
         config_name = "config.json"
 
-    if cached_folder := _get_latest_model_path(str(folder_or_repo_path)):
+    cached_folder = _get_latest_model_path(str(folder_or_repo_path))
+    if cached_folder and not force_download:
         logger.info(f"Found cached model at {cached_folder}, loading from cache.")
         folder_or_repo_path = cached_folder
     else:
@@ -177,7 +179,7 @@ def load_pretrained(
     embedding_key = "embedding.weight" if from_sentence_transformers else "embeddings"
     embeddings = opened_tensor_file.get_tensor(embedding_key)
 
-    if not skip_metadata and readme_path.exists():
+    if readme_path.exists():
         metadata = _get_metadata_from_readme(readme_path)
     else:
         metadata = {}
diff --git a/model2vec/model.py b/model2vec/model.py
@@ -156,7 +156,7 @@ def from_pretrained(
         subfolder: str | None = None,
         quantize_to: str | DType | None = None,
         dimensionality: int | None = None,
-        skip_metadata: bool = False,
+        force_download: bool = False,
     ) -> StaticModel:
         """
         Load a StaticModel from a local path or huggingface hub path.
@@ -172,8 +172,8 @@ def from_pretrained(
         :param dimensionality: The dimensionality of the model. If this is None, use the dimensionality of the model.
             This is useful if you want to load a model with a lower dimensionality.
             Note that this only applies if you have trained your model using mrl or PCA.
-        :param skip_metadata: Whether to skip loading metadata. This is useful if you don't need the metadata.
-            Loading metadata can be slow for models with lots of results in the README.md
+        :param force_download: Whether to force the download of the model. If False, the model is only downloaded if it is not
+            already present in the cache.
         :return: A StaticModel.
         """
         from model2vec.hf_utils import load_pretrained
@@ -183,7 +183,7 @@ def from_pretrained(
             token=token,
             from_sentence_transformers=False,
             subfolder=subfolder,
-            skip_metadata=skip_metadata,
+            force_download=force_download,
         )
 
         embeddings = quantize_and_reduce_dim(
@@ -209,7 +209,7 @@ def from_sentence_transformers(
         normalize: bool | None = None,
         quantize_to: str | DType | None = None,
         dimensionality: int | None = None,
-        skip_metadata: bool = False,
+        force_download: bool = False,
     ) -> StaticModel:
         """
         Load a StaticModel trained with sentence transformers from a local path or huggingface hub path.
@@ -224,8 +224,8 @@ def from_sentence_transformers(
         :param dimensionality: The dimensionality of the model. If this is None, use the dimensionality of the model.
             This is useful if you want to load a model with a lower dimensionality.
             Note that this only applies if you have trained your model using mrl or PCA.
-        :param skip_metadata: Whether to skip loading metadata. This is useful if you don't need the metadata.
-            Loading metadata can be slow for models with lots of results in the README.md
+        :param force_download: Whether to force the download of the model. If False, the model is only downloaded if it is not
+            already present in the cache.
         :return: A StaticModel.
         """
         from model2vec.hf_utils import load_pretrained
@@ -235,7 +235,7 @@ def from_sentence_transformers(
             token=token,
             from_sentence_transformers=True,
             subfolder=None,
-            skip_metadata=skip_metadata,
+            force_download=force_download,
         )
 
         embeddings = quantize_and_reduce_dim(
diff --git a/uv.lock b/uv.lock