Fix README, rename `huggingface_token` to `token` for clarity

stephantul · stephantul · commit eb3b05b485ad · 2024-09-21T21:38:45.000+02:00
diff --git a/README.md b/README.md
@@ -112,7 +112,7 @@ vocabulary = ["word1", "word2", "word3"]
 model_name = "BAAI/bge-base-en-v1.5"
 
 # Distill the model with the custom vocabulary
-m2v_model = distill(model_name=model_name, vocabulary=vocabulary, pca_dims=None)
+m2v_model = distill(model_name=model_name, vocabulary=vocabulary, pca_dims=None, apply_zipf=True)
 
 # Save the model
 m2v_model.save_pretrained("m2v_model")
@@ -121,6 +121,8 @@ m2v_model.save_pretrained("m2v_model")
 m2v_model.push_to_hub("my_organization/my_model", token="<it's a secret to everybody>")
 ```
 
+Important note: we assume the passed vocabulary is sorted by frequency rank, i.e., we don't care about the actual word frequencies, but do assume that the most frequent word is first and the least frequent word is last. If you're not sure whether this is the case, set `apply_zipf` to `False`. This disables the weighting, but will also make performance a little bit worse.
+
 We also provide a command line interface for distillation. Note that `vocab.txt` should be a file with one word per line.
 ```bash
 python3 -m model2vec.distill --model-name BAAI/bge-base-en-v1.5 --vocabulary-path vocab.txt --device mps --save-path model2vec_model
@@ -133,7 +135,8 @@ from model2vec import StaticModel
 
 # Load a model from the HuggingFace hub, or a local one.
 model_name = "minishlab/M2V_base_output"
-model = StaticModel.from_pretrained(model_name)
+# You can optionally pass a token if you're loading a private model
+model = StaticModel.from_pretrained(model_name, token=None)
 
 # Make embeddings
 embeddings = model.encode(["It's dangerous to go alone!", "It's a secret to everybody."])
diff --git a/model2vec/model.py b/model2vec/model.py
@@ -68,7 +68,7 @@ def normalize(self, value: bool) -> None:
         """Update the config if the value of normalize changes."""
         config_normalize = self.config.get("normalize", False)
         self._normalize = value
-        if value != config_normalize:
+        if config_normalize is not None and value != config_normalize:
             logger.warning(
                 f"Set normalization to `{value}`, which does not match config value `{config_normalize}`. Updating config."
             )
@@ -123,16 +123,18 @@ def tokenize(self, sentences: list[str], max_length: int | None = None) -> tuple
     def from_pretrained(
         cls: type[StaticModel],
         path: PathLike,
-        huggingface_token: str | None = None,
+        token: str | None = None,
     ) -> StaticModel:
         """
-        Create a static embeddder by creating a word-level tokenizer.
+        Load a StaticModel from a local path or huggingface hub path.
+
+        NOTE: if you load a private model from the huggingface hub, you need to pass a token.
 
         :param path: The path to load your static model from.
-        :param huggingface_token: The huggingface token to use.
+        :param token: The huggingface token to use.
         :return: A StaticEmbedder
         """
-        embeddings, tokenizer, config = load_pretrained(path, huggingface_token=huggingface_token)
+        embeddings, tokenizer, config = load_pretrained(path, token=token)
 
         return cls(embeddings, tokenizer, config)
 
@@ -200,6 +202,8 @@ def push_to_hub(self, repo_id: str, token: str | None) -> None:
         """
         Push the model to the huggingface hub.
 
+        NOTE: you need to pass a token if you are pushing a private model.
+
         :param repo_id: The repo id to push to.
         :param token: The huggingface token to use.
         """
diff --git a/model2vec/utils.py b/model2vec/utils.py
@@ -51,15 +51,15 @@ def save_pretrained(folder_path: Path, embeddings: np.ndarray, tokenizer: Tokeni
 
 
 def load_pretrained(
-    folder_or_repo_path: str | Path, huggingface_token: str | None = None
+    folder_or_repo_path: str | Path, token: str | None = None
 ) -> tuple[np.ndarray, Tokenizer, dict[str, Any]]:
     """
     Loads a pretrained model from a folder.
 
     :param folder_or_repo_path: The folder or repo path to load from.
         - If this is a local path, we will load from the local path.
         - If the local path is not found, we will attempt to load from the huggingface hub.
-    :param huggingface_token: The huggingface token to use.
+    :param token: The huggingface token to use.
     :raises: FileNotFoundError if the folder exists, but the file does not exist locally.
     :return: The embeddings, tokenizer, and config.
 
@@ -81,12 +81,10 @@ def load_pretrained(
     else:
         logger.info("Folder does not exist locally, attempting to use huggingface hub.")
         embeddings_path = huggingface_hub.hf_hub_download(
-            str(folder_or_repo_path), "embeddings.safetensors", token=huggingface_token
-        )
-        config_path = huggingface_hub.hf_hub_download(str(folder_or_repo_path), "config.json", token=huggingface_token)
-        tokenizer_path = huggingface_hub.hf_hub_download(
-            str(folder_or_repo_path), "tokenizer.json", token=huggingface_token
+            str(folder_or_repo_path), "embeddings.safetensors", token=token
         )
+        config_path = huggingface_hub.hf_hub_download(str(folder_or_repo_path), "config.json", token=token)
+        tokenizer_path = huggingface_hub.hf_hub_download(str(folder_or_repo_path), "tokenizer.json", token=token)
 
     opened_tensor_file = cast(SafeOpenProtocol, safetensors.safe_open(embeddings_path, framework="numpy"))
     embeddings = opened_tensor_file.get_tensor("embeddings")
@@ -102,15 +100,15 @@ def load_pretrained(
     return embeddings, tokenizer, config
 
 
-def push_folder_to_hub(folder_path: Path, repo_id: str, huggingface_token: str | None) -> None:
+def push_folder_to_hub(folder_path: Path, repo_id: str, token: str | None) -> None:
     """
     Push a model folder to the huggingface hub.
 
     :param folder_path: The path to the folder.
     :param repo_id: The repo name.
-    :param huggingface_token: The huggingface token.
+    :param token: The huggingface token.
     """
-    if not huggingface_hub.repo_exists(repo_id=repo_id, token=huggingface_token):
-        huggingface_hub.create_repo(repo_id, token=huggingface_token)
-    huggingface_hub.upload_folder(repo_id=repo_id, folder_path=folder_path, token=huggingface_token)
+    if not huggingface_hub.repo_exists(repo_id=repo_id, token=token):
+        huggingface_hub.create_repo(repo_id, token=token)
+    huggingface_hub.upload_folder(repo_id=repo_id, folder_path=folder_path, token=token)
     logger.info(f"Pushed model to {repo_id}")
diff --git a/uv.lock b/uv.lock