Skip to content

Commit 5f36097

Browse files
committed
merge
2 parents 27e856f + 4c69a68 commit 5f36097

9 files changed

Lines changed: 233 additions & 34 deletions

File tree

model2vec/distill/distillation.py

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from model2vec.distill.tokenizer import replace_vocabulary
1616
from model2vec.distill.utils import select_optimal_device
1717
from model2vec.model import StaticModel
18+
from model2vec.quantization import DType, quantize_embeddings
1819

1920
try:
2021
# For huggingface_hub>=0.25.0
@@ -40,6 +41,7 @@ def distill_from_model(
4041
sif_coefficient: float | None = 1e-4,
4142
use_subword: bool = True,
4243
token_remove_pattern: str | None = r"\[unused\d+\]",
44+
quantize_to: DType | str = DType.Float16,
4345
) -> StaticModel:
4446
"""
4547
Distill a staticmodel from a sentence transformer.
@@ -64,9 +66,11 @@ def distill_from_model(
6466
:param use_subword: Whether to keep subword tokens in the vocabulary. If this is False, you must pass a vocabulary, and the returned tokenizer will only detect full words.
6567
:param token_remove_pattern: If this is set to a string, we compile this into a regex. Any tokens that conform to this regex pattern will be removed from the vocabulary.
6668
If the pattern is so general that it removes all tokens, we throw an error. If the pattern can't be compiled into a valid regex, we also throw an error.
69+
:param quantize_to: The data type to quantize to. Can be any of the DType enum members or their string equivalents.
6770
:return: A StaticModel
6871
6972
"""
73+
quantize_to = DType(quantize_to)
7074
backend_tokenizer = tokenizer.backend_tokenizer
7175
sif_coefficient, token_remove_regex = _validate_parameters(
7276
vocabulary, apply_zipf, sif_coefficient, use_subword, token_remove_pattern
@@ -106,6 +110,9 @@ def distill_from_model(
106110
# Post process the embeddings by applying PCA and Zipf weighting.
107111
embeddings = _post_process_embeddings(np.asarray(embeddings), pca_dims, sif_coefficient=sif_coefficient)
108112

113+
# Quantize the embeddings.
114+
embeddings = quantize_embeddings(embeddings, quantize_to)
115+
109116
model_name = getattr(model, "name_or_path", "")
110117

111118
config = {
@@ -209,6 +216,7 @@ def distill(
209216
use_subword: bool = True,
210217
token_remove_pattern: str | None = r"\[unused\d+\]",
211218
trust_remote_code: bool = False,
219+
quantize_to: DType | str = DType.Float16,
212220
) -> StaticModel:
213221
"""
214222
Distill a staticmodel from a sentence transformer.
@@ -232,6 +240,7 @@ def distill(
232240
:param use_subword: Whether to keep subword tokens in the vocabulary. If this is False, you must pass a vocabulary, and the returned tokenizer will only detect full words.
233241
:param token_remove_pattern: If this is set to a string, we compile this into a regex. Any tokens that conform to this regex pattern will be removed from the vocabulary.
234242
:param trust_remote_code: Whether to trust the remote code. If this is False, we will only load components coming from `transformers`. If this is True, we will load all components.
243+
:param quantize_to: The data type to quantize to. Can be any of the DType enum members or their string equivalents.
235244
:return: A StaticModel
236245
237246
"""
@@ -248,6 +257,7 @@ def distill(
248257
use_subword=use_subword,
249258
token_remove_pattern=token_remove_pattern,
250259
sif_coefficient=sif_coefficient,
260+
quantize_to=quantize_to,
251261
)
252262

253263

@@ -303,17 +313,31 @@ def _clean_vocabulary(tokenizer: Tokenizer, vocabulary: list[str], added_tokens:
303313
n_empty = 0
304314
n_duplicates = 0
305315
for token in vocabulary:
306-
if tokenizer.normalizer is not None:
307-
token = tokenizer.normalizer.normalize_str(token)
316+
normalizer = tokenizer.normalizer
317+
if normalizer is not None:
318+
token = normalizer.normalize_str(token)
308319

309320
if not token:
310321
n_empty += 1
311322
continue
312-
if token in seen_tokens or token in added_tokens_set:
323+
324+
pre_tokenizer = tokenizer.pre_tokenizer
325+
if pre_tokenizer is not None:
326+
pretokenized_tokens = pre_tokenizer.pre_tokenize_str(token)
327+
new_token = " ".join(pretokenized_tokens[1])
328+
else:
329+
new_token = token
330+
331+
# We need to check whether the pretokenized token is in the vocabulary.
332+
# But we need to return the original token, because that will be tokenized
333+
# again by the tokenizer during featurization.
334+
if new_token in seen_tokens or new_token in added_tokens_set:
313335
n_duplicates += 1
314336
continue
315337

316-
seen_tokens.add(token)
338+
# Add the possibly pretokenized token to _seen_
339+
seen_tokens.add(new_token)
340+
# Add the original string to the vocabulary.
317341
cleaned_vocabulary.append(token)
318342

319343
if n_duplicates:

model2vec/hf_utils.py

Lines changed: 37 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ def save_pretrained(
2323
tokenizer: Tokenizer,
2424
config: dict[str, Any],
2525
create_model_card: bool = True,
26+
subfolder: str | None = None,
2627
**kwargs: Any,
2728
) -> None:
2829
"""
@@ -33,8 +34,10 @@ def save_pretrained(
3334
:param tokenizer: The tokenizer.
3435
:param config: A metadata config.
3536
:param create_model_card: Whether to create a model card.
37+
:param subfolder: The subfolder to save the model in.
3638
:param **kwargs: Any additional arguments.
3739
"""
40+
folder_path = folder_path / subfolder if subfolder else folder_path
3841
folder_path.mkdir(exist_ok=True, parents=True)
3942
save_file({"embeddings": embeddings}, folder_path / "model.safetensors")
4043
tokenizer.save(str(folder_path / "tokenizer.json"))
@@ -92,14 +95,18 @@ def _create_model_card(
9295

9396

9497
def load_pretrained(
95-
folder_or_repo_path: str | Path, token: str | None = None, from_sentence_transformers: bool = False
98+
folder_or_repo_path: str | Path,
99+
subfolder: str | None = None,
100+
token: str | None = None,
101+
from_sentence_transformers: bool = False,
96102
) -> tuple[np.ndarray, Tokenizer, dict[str, Any], dict[str, Any]]:
97103
"""
98104
Loads a pretrained model from a folder.
99105
100106
:param folder_or_repo_path: The folder or repo path to load from.
101107
- If this is a local path, we will load from the local path.
102108
- If the local path is not found, we will attempt to load from the huggingface hub.
109+
:param subfolder: The subfolder to load from.
103110
:param token: The huggingface token to use.
104111
:param from_sentence_transformers: Whether to load the model from a sentence transformers model.
105112
:raises: FileNotFoundError if the folder exists, but the file does not exist locally.
@@ -116,36 +123,47 @@ def load_pretrained(
116123
config_name = "config.json"
117124

118125
folder_or_repo_path = Path(folder_or_repo_path)
119-
if folder_or_repo_path.exists():
120-
embeddings_path = folder_or_repo_path / model_file
126+
127+
local_folder = folder_or_repo_path / subfolder if subfolder else folder_or_repo_path
128+
129+
if local_folder.exists():
130+
embeddings_path = local_folder / model_file
121131
if not embeddings_path.exists():
122-
raise FileNotFoundError(f"Embeddings file does not exist in {folder_or_repo_path}")
132+
raise FileNotFoundError(f"Embeddings file does not exist in {local_folder}")
123133

124-
config_path = folder_or_repo_path / config_name
134+
config_path = local_folder / config_name
125135
if not config_path.exists():
126-
raise FileNotFoundError(f"Config file does not exist in {folder_or_repo_path}")
136+
raise FileNotFoundError(f"Config file does not exist in {local_folder}")
127137

128-
tokenizer_path = folder_or_repo_path / tokenizer_file
138+
tokenizer_path = local_folder / tokenizer_file
129139
if not tokenizer_path.exists():
130-
raise FileNotFoundError(f"Tokenizer file does not exist in {folder_or_repo_path}")
140+
raise FileNotFoundError(f"Tokenizer file does not exist in {local_folder}")
131141

132142
# README is optional, so this is a bit finicky.
133-
readme_path = folder_or_repo_path / "README.md"
143+
readme_path = local_folder / "README.md"
134144
metadata = _get_metadata_from_readme(readme_path)
135145

136146
else:
137147
logger.info("Folder does not exist locally, attempting to use huggingface hub.")
138-
embeddings_path = huggingface_hub.hf_hub_download(folder_or_repo_path.as_posix(), model_file, token=token)
148+
embeddings_path = huggingface_hub.hf_hub_download(
149+
folder_or_repo_path.as_posix(), model_file, token=token, subfolder=subfolder
150+
)
139151

140152
try:
141-
readme_path = huggingface_hub.hf_hub_download(folder_or_repo_path.as_posix(), "README.md", token=token)
153+
readme_path = huggingface_hub.hf_hub_download(
154+
folder_or_repo_path.as_posix(), "README.md", token=token, subfolder=subfolder
155+
)
142156
metadata = _get_metadata_from_readme(Path(readme_path))
143157
except huggingface_hub.utils.EntryNotFoundError:
144158
logger.info("No README found in the model folder. No model card loaded.")
145159
metadata = {}
146160

147-
config_path = huggingface_hub.hf_hub_download(folder_or_repo_path.as_posix(), config_name, token=token)
148-
tokenizer_path = huggingface_hub.hf_hub_download(folder_or_repo_path.as_posix(), tokenizer_file, token=token)
161+
config_path = huggingface_hub.hf_hub_download(
162+
folder_or_repo_path.as_posix(), config_name, token=token, subfolder=subfolder
163+
)
164+
tokenizer_path = huggingface_hub.hf_hub_download(
165+
folder_or_repo_path.as_posix(), tokenizer_file, token=token, subfolder=subfolder
166+
)
149167

150168
opened_tensor_file = cast(SafeOpenProtocol, safetensors.safe_open(embeddings_path, framework="numpy"))
151169
if from_sentence_transformers:
@@ -176,11 +194,15 @@ def _get_metadata_from_readme(readme_path: Path) -> dict[str, Any]:
176194
return data
177195

178196

179-
def push_folder_to_hub(folder_path: Path, repo_id: str, private: bool, token: str | None) -> None:
197+
def push_folder_to_hub(
198+
folder_path: Path, subfolder: str | None, repo_id: str, private: bool, token: str | None
199+
) -> None:
180200
"""
181201
Push a model folder to the huggingface hub, including model card.
182202
183203
:param folder_path: The path to the folder.
204+
:param subfolder: The subfolder to push to.
205+
If None, the folder will be pushed to the root of the repo.
184206
:param repo_id: The repo name.
185207
:param private: Whether the repo is private.
186208
:param token: The huggingface token.
@@ -189,15 +211,6 @@ def push_folder_to_hub(folder_path: Path, repo_id: str, private: bool, token: st
189211
huggingface_hub.create_repo(repo_id, token=token, private=private)
190212

191213
# Push model card and all model files to the Hugging Face hub
192-
huggingface_hub.upload_folder(repo_id=repo_id, folder_path=folder_path, token=token)
193-
194-
# Check if the model card exists, and push it if available
195-
model_card_path = folder_path / "README.md"
196-
if model_card_path.exists():
197-
card = ModelCard.load(model_card_path)
198-
card.push_to_hub(repo_id=repo_id, token=token)
199-
logger.info(f"Pushed model card to {repo_id}")
200-
else:
201-
logger.warning(f"Model card README.md not found in {folder_path}. Skipping model card upload.")
214+
huggingface_hub.upload_folder(repo_id=repo_id, folder_path=folder_path, token=token, path_in_repo=subfolder)
202215

203216
logger.info(f"Pushed model to {repo_id}")

model2vec/model.py

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from tokenizers import Encoding, Tokenizer
1313
from tqdm import tqdm
1414

15+
from model2vec.quantization import DType, quantize_embeddings
1516
from model2vec.utils import ProgressParallel, load_local_model
1617

1718
PathLike = Union[Path, str]
@@ -95,12 +96,13 @@ def normalize(self, value: bool) -> None:
9596
)
9697
self.config["normalize"] = value
9798

98-
def save_pretrained(self, path: PathLike, model_name: str | None = None) -> None:
99+
def save_pretrained(self, path: PathLike, model_name: str | None = None, subfolder: str | None = None) -> None:
99100
"""
100101
Save the pretrained model.
101102
102103
:param path: The path to save to.
103104
:param model_name: The model name to use in the Model Card.
105+
:param subfolder: The subfolder to save to.
104106
"""
105107
from model2vec.hf_utils import save_pretrained
106108

@@ -112,6 +114,7 @@ def save_pretrained(self, path: PathLike, model_name: str | None = None) -> None
112114
base_model_name=self.base_model_name,
113115
language=self.language,
114116
model_name=model_name,
117+
subfolder=subfolder,
115118
)
116119

117120
def tokenize(self, sentences: list[str], max_length: int | None = None) -> list[list[int]]:
@@ -150,6 +153,9 @@ def from_pretrained(
150153
path: PathLike,
151154
token: str | None = None,
152155
normalize: bool | None = None,
156+
subfolder: str | None = None,
157+
quantize_to: str | DType | None = None,
158+
dimensionality: int | None = None,
153159
) -> StaticModel:
154160
"""
155161
Load a StaticModel from a local path or huggingface hub path.
@@ -159,11 +165,34 @@ def from_pretrained(
159165
:param path: The path to load your static model from.
160166
:param token: The huggingface token to use.
161167
:param normalize: Whether to normalize the embeddings.
168+
:param subfolder: The subfolder to load from.
169+
:param quantize_to: The dtype to quantize the model to. If None, no quantization is done.
170+
If a string is passed, it is converted to a DType.
171+
:param dimensionality: The dimensionality of the model. If this is None, use the dimensionality of the model.
172+
This is useful if you want to load a model with a lower dimensionality.
173+
Note that this only applies if you have trained your model using mrl or PCA.
162174
:return: A StaticModel
175+
:raises: ValueError if the dimensionality is greater than the model dimensionality.
163176
"""
164177
from model2vec.hf_utils import load_pretrained
165178

166-
embeddings, tokenizer, config, metadata = load_pretrained(path, token=token, from_sentence_transformers=False)
179+
embeddings, tokenizer, config, metadata = load_pretrained(
180+
path, token=token, from_sentence_transformers=False, subfolder=subfolder
181+
)
182+
183+
if quantize_to is not None:
184+
quantize_to = DType(quantize_to)
185+
embeddings = quantize_embeddings(embeddings, quantize_to)
186+
if dimensionality is not None:
187+
if dimensionality > embeddings.shape[1]:
188+
raise ValueError(
189+
f"Dimensionality {dimensionality} is greater than the model dimensionality {embeddings.shape[1]}"
190+
)
191+
embeddings = embeddings[:, :dimensionality]
192+
if config.get("apply_pca", None) is None:
193+
logger.warning(
194+
"You are reducing the dimensionality of the model, but we can't find a pca key in the model config. This might not work as expected."
195+
)
167196

168197
return cls(
169198
embeddings,
@@ -352,7 +381,9 @@ def _batch(sentences: list[str], batch_size: int) -> Iterator[list[str]]:
352381
"""Batch the sentences into equal-sized."""
353382
return (sentences[i : i + batch_size] for i in range(0, len(sentences), batch_size))
354383

355-
def push_to_hub(self, repo_id: str, private: bool = False, token: str | None = None) -> None:
384+
def push_to_hub(
385+
self, repo_id: str, private: bool = False, token: str | None = None, subfolder: str | None = None
386+
) -> None:
356387
"""
357388
Push the model to the huggingface hub.
358389
@@ -362,12 +393,13 @@ def push_to_hub(self, repo_id: str, private: bool = False, token: str | None = N
362393
:param private: Whether the repo, if created is set to private.
363394
If the repo already exists, this doesn't change the visibility.
364395
:param token: The huggingface token to use.
396+
:param subfolder: The subfolder to push to.
365397
"""
366398
from model2vec.hf_utils import push_folder_to_hub
367399

368400
with TemporaryDirectory() as temp_dir:
369401
self.save_pretrained(temp_dir, model_name=repo_id)
370-
push_folder_to_hub(Path(temp_dir), repo_id, private, token)
402+
push_folder_to_hub(Path(temp_dir), subfolder=subfolder, repo_id=repo_id, private=private, token=token)
371403

372404
@classmethod
373405
def load_local(cls: type[StaticModel], path: PathLike) -> StaticModel:

model2vec/py.typed

Whitespace-only changes.

model2vec/quantization.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
from enum import Enum

import numpy as np


class DType(str, Enum):
    """Data types supported as quantization targets for embedding matrices."""

    Float16 = "float16"
    Float32 = "float32"
    Float64 = "float64"
    Int8 = "int8"


def quantize_embeddings(embeddings: np.ndarray, quantize_to: DType) -> np.ndarray:
    """
    Quantize embeddings to a specified data type to reduce memory usage.

    :param embeddings: The embeddings to quantize, as a numpy array.
    :param quantize_to: The data type to quantize to.
    :return: The quantized embeddings.
    :raises ValueError: If the quantization type is not valid.
    """
    if quantize_to == DType.Float16:
        return embeddings.astype(np.float16)
    elif quantize_to == DType.Float32:
        return embeddings.astype(np.float32)
    elif quantize_to == DType.Float64:
        return embeddings.astype(np.float64)
    elif quantize_to == DType.Int8:
        # Normalize to the [-127, 127] range (not -128) to keep the scale
        # symmetric around zero.
        scale = np.max(np.abs(embeddings)) / 127.0
        if scale == 0.0:
            # All-zero (or empty-range) input: dividing by scale would emit a
            # divide-by-zero warning and produce NaNs; the correct quantized
            # result is simply all zeros.
            return np.zeros_like(embeddings, dtype=np.int8)
        return np.round(embeddings / scale).astype(np.int8)
    else:
        raise ValueError("Not a valid enum member of DType.")

pyproject.toml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,11 @@ packages = ["model2vec"]
4242
include-package-data = true
4343

4444
[tool.setuptools.package-data]
45-
model2vec = ["assets/modelcards/model_card_template.md", "assets/modelcards/classifier_template.md"]
45+
model2vec = [
46+
"assets/modelcards/model_card_template.md",
47+
"assets/modelcards/classifier_template.md",
48+
"py.typed"
49+
]
4650

4751
[project.optional-dependencies]
4852
dev = [

scripts/export_to_onnx.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,9 @@ def __init__(self, model: StaticModel) -> None:
2727
"""Initialize the TorchStaticModel with a StaticModel instance."""
2828
super().__init__()
2929
# Convert NumPy embeddings to a torch.nn.EmbeddingBag
30-
embeddings = torch.tensor(model.embedding, dtype=torch.float32)
30+
embeddings = torch.from_numpy(model.embedding)
31+
if embeddings.dtype in {torch.int8, torch.uint8}:
32+
embeddings = embeddings.to(torch.float16)
3133
self.embedding_bag = torch.nn.EmbeddingBag.from_pretrained(embeddings, mode="mean", freeze=True)
3234
self.normalize = model.normalize
3335
# Save tokenizer attributes

0 commit comments

Comments
 (0)