Skip to content

Commit 5f36097

Browse files
committed
merge
2 parents 27e856f + 4c69a68 commit 5f36097

9 files changed

Lines changed: 233 additions & 34 deletions

File tree

model2vec/distill/distillation.py

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from model2vec.distill.tokenizer import replace_vocabulary
1616
from model2vec.distill.utils import select_optimal_device
1717
from model2vec.model import StaticModel
18+
from model2vec.quantization import DType, quantize_embeddings
1819

1920
try:
2021
# For huggingface_hub>=0.25.0
@@ -40,6 +41,7 @@ def distill_from_model(
4041
sif_coefficient: float | None = 1e-4,
4142
use_subword: bool = True,
4243
token_remove_pattern: str | None = r"\[unused\d+\]",
44+
quantize_to: DType | str = DType.Float16,
4345
) -> StaticModel:
4446
"""
4547
Distill a staticmodel from a sentence transformer.
@@ -64,9 +66,11 @@ def distill_from_model(
6466
:param use_subword: Whether to keep subword tokens in the vocabulary. If this is False, you must pass a vocabulary, and the returned tokenizer will only detect full words.
6567
:param token_remove_pattern: If this is set to a string, we compile this into a regex. Any tokens that conform to this regex pattern will be removed from the vocabulary.
6668
If the pattern is so general that it removes all tokens, we throw an error. If the pattern can't be compiled into a valid regex, we also throw an error.
69+
:param quantize_to: The data type to quantize to. Can be any of the DType enum members or their string equivalents.
6770
:return: A StaticModel
6871
6972
"""
73+
quantize_to = DType(quantize_to)
7074
backend_tokenizer = tokenizer.backend_tokenizer
7175
sif_coefficient, token_remove_regex = _validate_parameters(
7276
vocabulary, apply_zipf, sif_coefficient, use_subword, token_remove_pattern
@@ -106,6 +110,9 @@ def distill_from_model(
106110
# Post process the embeddings by applying PCA and Zipf weighting.
107111
embeddings = _post_process_embeddings(np.asarray(embeddings), pca_dims, sif_coefficient=sif_coefficient)
108112

113+
# Quantize the embeddings.
114+
embeddings = quantize_embeddings(embeddings, quantize_to)
115+
109116
model_name = getattr(model, "name_or_path", "")
110117

111118
config = {
@@ -209,6 +216,7 @@ def distill(
209216
use_subword: bool = True,
210217
token_remove_pattern: str | None = r"\[unused\d+\]",
211218
trust_remote_code: bool = False,
219+
quantize_to: DType | str = DType.Float16,
212220
) -> StaticModel:
213221
"""
214222
Distill a staticmodel from a sentence transformer.
@@ -232,6 +240,7 @@ def distill(
232240
:param use_subword: Whether to keep subword tokens in the vocabulary. If this is False, you must pass a vocabulary, and the returned tokenizer will only detect full words.
233241
:param token_remove_pattern: If this is set to a string, we compile this into a regex. Any tokens that conform to this regex pattern will be removed from the vocabulary.
234242
:param trust_remote_code: Whether to trust the remote code. If this is False, we will only load components coming from `transformers`. If this is True, we will load all components.
243+
:param quantize_to: The data type to quantize to. Can be any of the DType enum members or their string equivalents.
235244
:return: A StaticModel
236245
237246
"""
@@ -248,6 +257,7 @@ def distill(
248257
use_subword=use_subword,
249258
token_remove_pattern=token_remove_pattern,
250259
sif_coefficient=sif_coefficient,
260+
quantize_to=quantize_to,
251261
)
252262

253263

@@ -303,17 +313,31 @@ def _clean_vocabulary(tokenizer: Tokenizer, vocabulary: list[str], added_tokens:
303313
n_empty = 0
304314
n_duplicates = 0
305315
for token in vocabulary:
306-
if tokenizer.normalizer is not None:
307-
token = tokenizer.normalizer.normalize_str(token)
316+
normalizer = tokenizer.normalizer
317+
if normalizer is not None:
318+
token = normalizer.normalize_str(token)
308319

309320
if not token:
310321
n_empty += 1
311322
continue
312-
if token in seen_tokens or token in added_tokens_set:
323+
324+
pre_tokenizer = tokenizer.pre_tokenizer
325+
if pre_tokenizer is not None:
326+
pretokenized_tokens = pre_tokenizer.pre_tokenize_str(token)
327+
new_token = " ".join(pretokenized_tokens[1])
328+
else:
329+
new_token = token
330+
331+
# We need to check whether the pretokenized token is in the vocabulary.
332+
# But we need to return the original token, because that will be tokenized
333+
# again by the tokenizer during featurization.
334+
if new_token in seen_tokens or new_token in added_tokens_set:
313335
n_duplicates += 1
314336
continue
315337

316-
seen_tokens.add(token)
338+
# Add the possibly pretokenized token to _seen_
339+
seen_tokens.add(new_token)
340+
# Add the original string to the vocabulary.
317341
cleaned_vocabulary.append(token)
318342

319343
if n_duplicates:

model2vec/hf_utils.py

Lines changed: 37 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ def save_pretrained(
2323
tokenizer: Tokenizer,
2424
config: dict[str, Any],
2525
create_model_card: bool = True,
26+
subfolder: str | None = None,
2627
**kwargs: Any,
2728
) -> None:
2829
"""
@@ -33,8 +34,10 @@ def save_pretrained(
3334
:param tokenizer: The tokenizer.
3435
:param config: A metadata config.
3536
:param create_model_card: Whether to create a model card.
37+
:param subfolder: The subfolder to save the model in.
3638
:param **kwargs: Any additional arguments.
3739
"""
40+
folder_path = folder_path / subfolder if subfolder else folder_path
3841
folder_path.mkdir(exist_ok=True, parents=True)
3942
save_file({"embeddings": embeddings}, folder_path / "model.safetensors")
4043
tokenizer.save(str(folder_path / "tokenizer.json"))
@@ -92,14 +95,18 @@ def _create_model_card(
9295

9396

9497
def load_pretrained(
95-
folder_or_repo_path: str | Path, token: str | None = None, from_sentence_transformers: bool = False
98+
folder_or_repo_path: str | Path,
99+
subfolder: str | None = None,
100+
token: str | None = None,
101+
from_sentence_transformers: bool = False,
96102
) -> tuple[np.ndarray, Tokenizer, dict[str, Any], dict[str, Any]]:
97103
"""
98104
Loads a pretrained model from a folder.
99105
100106
:param folder_or_repo_path: The folder or repo path to load from.
101107
- If this is a local path, we will load from the local path.
102108
- If the local path is not found, we will attempt to load from the huggingface hub.
109+
:param subfolder: The subfolder to load from.
103110
:param token: The huggingface token to use.
104111
:param from_sentence_transformers: Whether to load the model from a sentence transformers model.
105112
:raises: FileNotFoundError if the folder exists, but the file does not exist locally.
@@ -116,36 +123,47 @@ def load_pretrained(
116123
config_name = "config.json"
117124

118125
folder_or_repo_path = Path(folder_or_repo_path)
119-
if folder_or_repo_path.exists():
120-
embeddings_path = folder_or_repo_path / model_file
126+
127+
local_folder = folder_or_repo_path / subfolder if subfolder else folder_or_repo_path
128+
129+
if local_folder.exists():
130+
embeddings_path = local_folder / model_file
121131
if not embeddings_path.exists():
122-
raise FileNotFoundError(f"Embeddings file does not exist in {folder_or_repo_path}")
132+
raise FileNotFoundError(f"Embeddings file does not exist in {local_folder}")
123133

124-
config_path = folder_or_repo_path / config_name
134+
config_path = local_folder / config_name
125135
if not config_path.exists():
126-
raise FileNotFoundError(f"Config file does not exist in {folder_or_repo_path}")
136+
raise FileNotFoundError(f"Config file does not exist in {local_folder}")
127137

128-
tokenizer_path = folder_or_repo_path / tokenizer_file
138+
tokenizer_path = local_folder / tokenizer_file
129139
if not tokenizer_path.exists():
130-
raise FileNotFoundError(f"Tokenizer file does not exist in {folder_or_repo_path}")
140+
raise FileNotFoundError(f"Tokenizer file does not exist in {local_folder}")
131141

132142
# README is optional, so this is a bit finicky.
133-
readme_path = folder_or_repo_path / "README.md"
143+
readme_path = local_folder / "README.md"
134144
metadata = _get_metadata_from_readme(readme_path)
135145

136146
else:
137147
logger.info("Folder does not exist locally, attempting to use huggingface hub.")
138-
embeddings_path = huggingface_hub.hf_hub_download(folder_or_repo_path.as_posix(), model_file, token=token)
148+
embeddings_path = huggingface_hub.hf_hub_download(
149+
folder_or_repo_path.as_posix(), model_file, token=token, subfolder=subfolder
150+
)
139151

140152
try:
141-
readme_path = huggingface_hub.hf_hub_download(folder_or_repo_path.as_posix(), "README.md", token=token)
153+
readme_path = huggingface_hub.hf_hub_download(
154+
folder_or_repo_path.as_posix(), "README.md", token=token, subfolder=subfolder
155+
)
142156
metadata = _get_metadata_from_readme(Path(readme_path))
143157
except huggingface_hub.utils.EntryNotFoundError:
144158
logger.info("No README found in the model folder. No model card loaded.")
145159
metadata = {}
146160

147-
config_path = huggingface_hub.hf_hub_download(folder_or_repo_path.as_posix(), config_name, token=token)
148-
tokenizer_path = huggingface_hub.hf_hub_download(folder_or_repo_path.as_posix(), tokenizer_file, token=token)
161+
config_path = huggingface_hub.hf_hub_download(
162+
folder_or_repo_path.as_posix(), config_name, token=token, subfolder=subfolder
163+
)
164+
tokenizer_path = huggingface_hub.hf_hub_download(
165+
folder_or_repo_path.as_posix(), tokenizer_file, token=token, subfolder=subfolder
166+
)
149167

150168
opened_tensor_file = cast(SafeOpenProtocol, safetensors.safe_open(embeddings_path, framework="numpy"))
151169
if from_sentence_transformers:
@@ -176,11 +194,15 @@ def _get_metadata_from_readme(readme_path: Path) -> dict[str, Any]:
176194
return data
177195

178196

179-
def push_folder_to_hub(folder_path: Path, repo_id: str, private: bool, token: str | None) -> None:
197+
def push_folder_to_hub(
198+
folder_path: Path, subfolder: str | None, repo_id: str, private: bool, token: str | None
199+
) -> None:
180200
"""
181201
Push a model folder to the huggingface hub, including model card.
182202
183203
:param folder_path: The path to the folder.
204+
:param subfolder: The subfolder to push to.
205+
If None, the folder will be pushed to the root of the repo.
184206
:param repo_id: The repo name.
185207
:param private: Whether the repo is private.
186208
:param token: The huggingface token.
@@ -189,15 +211,6 @@ def push_folder_to_hub(folder_path: Path, repo_id: str, private: bool, token: st
189211
huggingface_hub.create_repo(repo_id, token=token, private=private)
190212

191213
# Push model card and all model files to the Hugging Face hub
192-
huggingface_hub.upload_folder(repo_id=repo_id, folder_path=folder_path, token=token)
193-
194-
# Check if the model card exists, and push it if available
195-
model_card_path = folder_path / "README.md"
196-
if model_card_path.exists():
197-
card = ModelCard.load(model_card_path)
198-
card.push_to_hub(repo_id=repo_id, token=token)
199-
logger.info(f"Pushed model card to {repo_id}")
200-
else:
201-
logger.warning(f"Model card README.md not found in {folder_path}. Skipping model card upload.")
214+
huggingface_hub.upload_folder(repo_id=repo_id, folder_path=folder_path, token=token, path_in_repo=subfolder)
202215

203216
logger.info(f"Pushed model to {repo_id}")

model2vec/model.py

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from tokenizers import Encoding, Tokenizer
1313
from tqdm import tqdm
1414

15+
from model2vec.quantization import DType, quantize_embeddings
1516
from model2vec.utils import ProgressParallel, load_local_model
1617

1718
PathLike = Union[Path, str]
@@ -95,12 +96,13 @@ def normalize(self, value: bool) -> None:
9596
)
9697
self.config["normalize"] = value
9798

98-
def save_pretrained(self, path: PathLike, model_name: str | None = None) -> None:
99+
def save_pretrained(self, path: PathLike, model_name: str | None = None, subfolder: str | None = None) -> None:
99100
"""
100101
Save the pretrained model.
101102
102103
:param path: The path to save to.
103104
:param model_name: The model name to use in the Model Card.
105+
:param subfolder: The subfolder to save to.
104106
"""
105107
from model2vec.hf_utils import save_pretrained
106108

@@ -112,6 +114,7 @@ def save_pretrained(self, path: PathLike, model_name: str | None = None) -> None
112114
base_model_name=self.base_model_name,
113115
language=self.language,
114116
model_name=model_name,
117+
subfolder=subfolder,
115118
)
116119

117120
def tokenize(self, sentences: list[str], max_length: int | None = None) -> list[list[int]]:
@@ -150,6 +153,9 @@ def from_pretrained(
150153
path: PathLike,
151154
token: str | None = None,
152155
normalize: bool | None = None,
156+
subfolder: str | None = None,
157+
quantize_to: str | DType | None = None,
158+
dimensionality: int | None = None,
153159
) -> StaticModel:
154160
"""
155161
Load a StaticModel from a local path or huggingface hub path.
@@ -159,11 +165,34 @@ def from_pretrained(
159165
:param path: The path to load your static model from.
160166
:param token: The huggingface token to use.
161167
:param normalize: Whether to normalize the embeddings.
168+
:param subfolder: The subfolder to load from.
169+
:param quantize_to: The dtype to quantize the model to. If None, no quantization is done.
170+
If a string is passed, it is converted to a DType.
171+
:param dimensionality: The dimensionality of the model. If this is None, use the dimensionality of the model.
172+
This is useful if you want to load a model with a lower dimensionality.
173+
Note that this only applies if you have trained your model using mrl or PCA.
162174
:return: A StaticModel
175+
:raises: ValueError if the dimensionality is greater than the model dimensionality.
163176
"""
164177
from model2vec.hf_utils import load_pretrained
165178

166-
embeddings, tokenizer, config, metadata = load_pretrained(path, token=token, from_sentence_transformers=False)
179+
embeddings, tokenizer, config, metadata = load_pretrained(
180+
path, token=token, from_sentence_transformers=False, subfolder=subfolder
181+
)
182+
183+
if quantize_to is not None:
184+
quantize_to = DType(quantize_to)
185+
embeddings = quantize_embeddings(embeddings, quantize_to)
186+
if dimensionality is not None:
187+
if dimensionality > embeddings.shape[1]:
188+
raise ValueError(
189+
f"Dimensionality {dimensionality} is greater than the model dimensionality {embeddings.shape[1]}"
190+
)
191+
embeddings = embeddings[:, :dimensionality]
192+
if config.get("apply_pca", None) is None:
193+
logger.warning(
194+
"You are reducing the dimensionality of the model, but we can't find a pca key in the model config. This might not work as expected."
195+
)
167196

168197
return cls(
169198
embeddings,
@@ -352,7 +381,9 @@ def _batch(sentences: list[str], batch_size: int) -> Iterator[list[str]]:
352381
"""Batch the sentences into equal-sized."""
353382
return (sentences[i : i + batch_size] for i in range(0, len(sentences), batch_size))
354383

355-
def push_to_hub(self, repo_id: str, private: bool = False, token: str | None = None) -> None:
384+
def push_to_hub(
385+
self, repo_id: str, private: bool = False, token: str | None = None, subfolder: str | None = None
386+
) -> None:
356387
"""
357388
Push the model to the huggingface hub.
358389
@@ -362,12 +393,13 @@ def push_to_hub(self, repo_id: str, private: bool = False, token: str | None = N
362393
:param private: Whether the repo, if created is set to private.
363394
If the repo already exists, this doesn't change the visibility.
364395
:param token: The huggingface token to use.
396+
:param subfolder: The subfolder to push to.
365397
"""
366398
from model2vec.hf_utils import push_folder_to_hub
367399

368400
with TemporaryDirectory() as temp_dir:
369401
self.save_pretrained(temp_dir, model_name=repo_id)
370-
push_folder_to_hub(Path(temp_dir), repo_id, private, token)
402+
push_folder_to_hub(Path(temp_dir), subfolder=subfolder, repo_id=repo_id, private=private, token=token)
371403

372404
@classmethod
373405
def load_local(cls: type[StaticModel], path: PathLike) -> StaticModel:

model2vec/py.typed

Whitespace-only changes.

model2vec/quantization.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
from enum import Enum

import numpy as np


class DType(str, Enum):
    """Data types supported as quantization targets for embedding matrices."""

    Float16 = "float16"
    Float32 = "float32"
    Float64 = "float64"
    Int8 = "int8"


def quantize_embeddings(embeddings: np.ndarray, quantize_to: DType) -> np.ndarray:
    """
    Quantize embeddings to a specified data type to reduce memory usage.

    :param embeddings: The embeddings to quantize, as a numpy array.
    :param quantize_to: The data type to quantize to.
    :return: The quantized embeddings.
    :raises ValueError: If the quantization type is not valid.
    """
    if quantize_to == DType.Float16:
        return embeddings.astype(np.float16)
    elif quantize_to == DType.Float32:
        return embeddings.astype(np.float32)
    elif quantize_to == DType.Float64:
        return embeddings.astype(np.float64)
    elif quantize_to == DType.Int8:
        # Normalize to the [-127, 127] range (not -128) to keep the scale
        # symmetric around zero.
        scale = np.max(np.abs(embeddings)) / 127.0
        if scale == 0.0:
            # All-zero (or empty-range) input: dividing by scale would emit a
            # divide-by-zero warning and produce NaNs; the correct quantized
            # result is simply all zeros.
            return np.zeros_like(embeddings, dtype=np.int8)
        return np.round(embeddings / scale).astype(np.int8)
    else:
        raise ValueError("Not a valid enum member of DType.")

pyproject.toml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,11 @@ packages = ["model2vec"]
4242
include-package-data = true
4343

4444
[tool.setuptools.package-data]
45-
model2vec = ["assets/modelcards/model_card_template.md", "assets/modelcards/classifier_template.md"]
45+
model2vec = [
46+
"assets/modelcards/model_card_template.md",
47+
"assets/modelcards/classifier_template.md",
48+
"py.typed"
49+
]
4650

4751
[project.optional-dependencies]
4852
dev = [

scripts/export_to_onnx.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,9 @@ def __init__(self, model: StaticModel) -> None:
2727
"""Initialize the TorchStaticModel with a StaticModel instance."""
2828
super().__init__()
2929
# Convert NumPy embeddings to a torch.nn.EmbeddingBag
30-
embeddings = torch.tensor(model.embedding, dtype=torch.float32)
30+
embeddings = torch.from_numpy(model.embedding)
31+
if embeddings.dtype in {torch.int8, torch.uint8}:
32+
embeddings = embeddings.to(torch.float16)
3133
self.embedding_bag = torch.nn.EmbeddingBag.from_pretrained(embeddings, mode="mean", freeze=True)
3234
self.normalize = model.normalize
3335
# Save tokenizer attributes

0 commit comments

Comments
 (0)