Skip to content

Commit eb3b05b

Browse files
committed
Fix README, rename `huggingface_token` to `token` for clarity
1 parent 9a00bdc commit eb3b05b

4 files changed

Lines changed: 25 additions & 1355 deletions

File tree

README.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ vocabulary = ["word1", "word2", "word3"]
112112
model_name = "BAAI/bge-base-en-v1.5"
113113

114114
# Distill the model with the custom vocabulary
115-
m2v_model = distill(model_name=model_name, vocabulary=vocabulary, pca_dims=None)
115+
m2v_model = distill(model_name=model_name, vocabulary=vocabulary, pca_dims=None, apply_zipf=True)
116116

117117
# Save the model
118118
m2v_model.save_pretrained("m2v_model")
@@ -121,6 +121,8 @@ m2v_model.save_pretrained("m2v_model")
121121
m2v_model.push_to_hub("my_organization/my_model", token="<it's a secret to everybody>")
122122
```
123123

124+
Important note: we assume the passed vocabulary is sorted by rank frequency, i.e., we don't care about the actual word frequencies, but do assume that the most frequent word is first and the least frequent word is last. If you're not sure whether this is the case, set `apply_zipf` to `False`. This disables the weighting, but will also make performance a little bit worse.
125+
124126
We also provide a command line interface for distillation. Note that `vocab.txt` should be a file with one word per line.
125127
```bash
126128
python3 -m model2vec.distill --model-name BAAI/bge-base-en-v1.5 --vocabulary-path vocab.txt --device mps --save-path model2vec_model
@@ -133,7 +135,8 @@ from model2vec import StaticModel
133135

134136
# Load a model from the HuggingFace hub, or a local one.
135137
model_name = "minishlab/M2V_base_output"
136-
model = StaticModel.from_pretrained(model_name)
138+
# You can optionally pass a token if you're loading a private model
139+
model = StaticModel.from_pretrained(model_name, token=None)
137140

138141
# Make embeddings
139142
embeddings = model.encode(["It's dangerous to go alone!", "It's a secret to everybody."])

model2vec/model.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ def normalize(self, value: bool) -> None:
6868
"""Update the config if the value of normalize changes."""
6969
config_normalize = self.config.get("normalize", False)
7070
self._normalize = value
71-
if value != config_normalize:
71+
if config_normalize is not None and value != config_normalize:
7272
logger.warning(
7373
f"Set normalization to `{value}`, which does not match config value `{config_normalize}`. Updating config."
7474
)
@@ -123,16 +123,18 @@ def tokenize(self, sentences: list[str], max_length: int | None = None) -> tuple
123123
def from_pretrained(
124124
cls: type[StaticModel],
125125
path: PathLike,
126-
huggingface_token: str | None = None,
126+
token: str | None = None,
127127
) -> StaticModel:
128128
"""
129-
Create a static embeddder by creating a word-level tokenizer.
129+
Load a StaticModel from a local path or huggingface hub path.
130+
131+
NOTE: if you load a private model from the huggingface hub, you need to pass a token.
130132
131133
:param path: The path to load your static model from.
132-
:param huggingface_token: The huggingface token to use.
134+
:param token: The huggingface token to use.
133135
:return: A StaticEmbedder
134136
"""
135-
embeddings, tokenizer, config = load_pretrained(path, huggingface_token=huggingface_token)
137+
embeddings, tokenizer, config = load_pretrained(path, token=token)
136138

137139
return cls(embeddings, tokenizer, config)
138140

@@ -200,6 +202,8 @@ def push_to_hub(self, repo_id: str, token: str | None) -> None:
200202
"""
201203
Push the model to the huggingface hub.
202204
205+
NOTE: you need to pass a token if you are pushing a private model.
206+
203207
:param repo_id: The repo id to push to.
204208
:param token: The huggingface token to use.
205209
"""

model2vec/utils.py

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -51,15 +51,15 @@ def save_pretrained(folder_path: Path, embeddings: np.ndarray, tokenizer: Tokeni
5151

5252

5353
def load_pretrained(
54-
folder_or_repo_path: str | Path, huggingface_token: str | None = None
54+
folder_or_repo_path: str | Path, token: str | None = None
5555
) -> tuple[np.ndarray, Tokenizer, dict[str, Any]]:
5656
"""
5757
Loads a pretrained model from a folder.
5858
5959
:param folder_or_repo_path: The folder or repo path to load from.
6060
- If this is a local path, we will load from the local path.
6161
- If the local path is not found, we will attempt to load from the huggingface hub.
62-
:param huggingface_token: The huggingface token to use.
62+
:param token: The huggingface token to use.
6363
:raises: FileNotFoundError if the folder exists, but the file does not exist locally.
6464
:return: The embeddings, tokenizer, and config.
6565
@@ -81,12 +81,10 @@ def load_pretrained(
8181
else:
8282
logger.info("Folder does not exist locally, attempting to use huggingface hub.")
8383
embeddings_path = huggingface_hub.hf_hub_download(
84-
str(folder_or_repo_path), "embeddings.safetensors", token=huggingface_token
85-
)
86-
config_path = huggingface_hub.hf_hub_download(str(folder_or_repo_path), "config.json", token=huggingface_token)
87-
tokenizer_path = huggingface_hub.hf_hub_download(
88-
str(folder_or_repo_path), "tokenizer.json", token=huggingface_token
84+
str(folder_or_repo_path), "embeddings.safetensors", token=token
8985
)
86+
config_path = huggingface_hub.hf_hub_download(str(folder_or_repo_path), "config.json", token=token)
87+
tokenizer_path = huggingface_hub.hf_hub_download(str(folder_or_repo_path), "tokenizer.json", token=token)
9088

9189
opened_tensor_file = cast(SafeOpenProtocol, safetensors.safe_open(embeddings_path, framework="numpy"))
9290
embeddings = opened_tensor_file.get_tensor("embeddings")
@@ -102,15 +100,15 @@ def load_pretrained(
102100
return embeddings, tokenizer, config
103101

104102

105-
def push_folder_to_hub(folder_path: Path, repo_id: str, huggingface_token: str | None) -> None:
103+
def push_folder_to_hub(folder_path: Path, repo_id: str, token: str | None) -> None:
106104
"""
107105
Push a model folder to the huggingface hub.
108106
109107
:param folder_path: The path to the folder.
110108
:param repo_id: The repo name.
111-
:param huggingface_token: The huggingface token.
109+
:param token: The huggingface token.
112110
"""
113-
if not huggingface_hub.repo_exists(repo_id=repo_id, token=huggingface_token):
114-
huggingface_hub.create_repo(repo_id, token=huggingface_token)
115-
huggingface_hub.upload_folder(repo_id=repo_id, folder_path=folder_path, token=huggingface_token)
111+
if not huggingface_hub.repo_exists(repo_id=repo_id, token=token):
112+
huggingface_hub.create_repo(repo_id, token=token)
113+
huggingface_hub.upload_folder(repo_id=repo_id, folder_path=folder_path, token=token)
116114
logger.info(f"Pushed model to {repo_id}")

0 commit comments

Comments
 (0)