feat: Added quantization for from_sentence_transformers (#219)

Pringled · stephantul · web-flow · commit 79900e709fa8 · 2025-04-24T19:44:09.000+02:00
* Added quantization for from_sentence_transformers * Updates * feat: remove flag argument (#220) * feat: remove flag argument * fix typing * add future anns --------- Co-authored-by: Stephan Tulkens <stephantul@gmail.com>
diff --git a/model2vec/hf_utils.py b/model2vec/hf_utils.py
@@ -90,7 +90,7 @@ def _create_model_card(
         library_name="model2vec",
         **kwargs,
     )
-    model_card = ModelCard.from_template(model_card_data, template_path=full_path)
+    model_card = ModelCard.from_template(model_card_data, template_path=str(full_path))
     model_card.save(folder_path / "README.md")
 
 
@@ -145,24 +145,32 @@ def load_pretrained(
 
     else:
         logger.info("Folder does not exist locally, attempting to use huggingface hub.")
-        embeddings_path = huggingface_hub.hf_hub_download(
-            folder_or_repo_path.as_posix(), model_file, token=token, subfolder=subfolder
+        embeddings_path = Path(
+            huggingface_hub.hf_hub_download(
+                folder_or_repo_path.as_posix(), model_file, token=token, subfolder=subfolder
+            )
         )
 
         try:
-            readme_path = huggingface_hub.hf_hub_download(
-                folder_or_repo_path.as_posix(), "README.md", token=token, subfolder=subfolder
+            readme_path = Path(
+                huggingface_hub.hf_hub_download(
+                    folder_or_repo_path.as_posix(), "README.md", token=token, subfolder=subfolder
+                )
             )
             metadata = _get_metadata_from_readme(Path(readme_path))
         except huggingface_hub.utils.EntryNotFoundError:
             logger.info("No README found in the model folder. No model card loaded.")
             metadata = {}
 
-        config_path = huggingface_hub.hf_hub_download(
-            folder_or_repo_path.as_posix(), config_name, token=token, subfolder=subfolder
+        config_path = Path(
+            huggingface_hub.hf_hub_download(
+                folder_or_repo_path.as_posix(), config_name, token=token, subfolder=subfolder
+            )
         )
-        tokenizer_path = huggingface_hub.hf_hub_download(
-            folder_or_repo_path.as_posix(), tokenizer_file, token=token, subfolder=subfolder
+        tokenizer_path = Path(
+            huggingface_hub.hf_hub_download(
+                folder_or_repo_path.as_posix(), tokenizer_file, token=token, subfolder=subfolder
+            )
         )
 
     opened_tensor_file = cast(SafeOpenProtocol, safetensors.safe_open(embeddings_path, framework="numpy"))
diff --git a/model2vec/model.py b/model2vec/model.py
@@ -12,7 +12,7 @@
 from tokenizers import Encoding, Tokenizer
 from tqdm import tqdm
 
-from model2vec.quantization import DType, quantize_embeddings
+from model2vec.quantization import DType, quantize_and_reduce_dim
 from model2vec.utils import ProgressParallel, load_local_model
 
 PathLike = Union[Path, str]
@@ -171,28 +171,22 @@ def from_pretrained(
         :param dimensionality: The dimensionality of the model. If this is None, use the dimensionality of the model.
             This is useful if you want to load a model with a lower dimensionality.
             Note that this only applies if you have trained your model using mrl or PCA.
-        :return: A StaticModel
-        :raises: ValueError if the dimensionality is greater than the model dimensionality.
+        :return: A StaticModel.
         """
         from model2vec.hf_utils import load_pretrained
 
         embeddings, tokenizer, config, metadata = load_pretrained(
-            path, token=token, from_sentence_transformers=False, subfolder=subfolder
+            folder_or_repo_path=path,
+            token=token,
+            from_sentence_transformers=False,
+            subfolder=subfolder,
         )
 
-        if quantize_to is not None:
-            quantize_to = DType(quantize_to)
-            embeddings = quantize_embeddings(embeddings, quantize_to)
-        if dimensionality is not None:
-            if dimensionality > embeddings.shape[1]:
-                raise ValueError(
-                    f"Dimensionality {dimensionality} is greater than the model dimensionality {embeddings.shape[1]}"
-                )
-            embeddings = embeddings[:, :dimensionality]
-            if config.get("apply_pca", None) is None:
-                logger.warning(
-                    "You are reducing the dimensionality of the model, but we can't find a pca key in the model config. This might not work as expected."
-                )
+        embeddings = quantize_and_reduce_dim(
+            embeddings=embeddings,
+            quantize_to=quantize_to,
+            dimensionality=dimensionality,
+        )
 
         return cls(
             embeddings,
@@ -209,6 +203,8 @@ def from_sentence_transformers(
         path: PathLike,
         token: str | None = None,
         normalize: bool | None = None,
+        quantize_to: str | DType | None = None,
+        dimensionality: int | None = None,
     ) -> StaticModel:
         """
         Load a StaticModel trained with sentence transformers from a local path or huggingface hub path.
@@ -218,13 +214,36 @@ def from_sentence_transformers(
         :param path: The path to load your static model from.
         :param token: The huggingface token to use.
         :param normalize: Whether to normalize the embeddings.
-        :return: A StaticModel
+        :param quantize_to: The dtype to quantize the model to. If None, no quantization is done.
+            If a string is passed, it is converted to a DType.
+        :param dimensionality: The dimensionality of the model. If this is None, use the dimensionality of the model.
+            This is useful if you want to load a model with a lower dimensionality.
+            Note that this only applies if you have trained your model using mrl or PCA.
+        :return: A StaticModel.
         """
         from model2vec.hf_utils import load_pretrained
 
-        embeddings, tokenizer, config, _ = load_pretrained(path, token=token, from_sentence_transformers=True)
+        embeddings, tokenizer, config, metadata = load_pretrained(
+            folder_or_repo_path=path,
+            token=token,
+            from_sentence_transformers=True,
+            subfolder=None,
+        )
+
+        embeddings = quantize_and_reduce_dim(
+            embeddings=embeddings,
+            quantize_to=quantize_to,
+            dimensionality=dimensionality,
+        )
 
-        return cls(embeddings, tokenizer, config, normalize=normalize, base_model_name=None, language=None)
+        return cls(
+            embeddings,
+            tokenizer,
+            config,
+            normalize=normalize,
+            base_model_name=metadata.get("base_model"),
+            language=metadata.get("language"),
+        )
 
     def encode_as_sequence(
         self,
diff --git a/model2vec/quantization.py b/model2vec/quantization.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from enum import Enum
 
 import numpy as np
@@ -33,3 +35,29 @@ def quantize_embeddings(embeddings: np.ndarray, quantize_to: DType) -> np.ndarra
         return quantized
     else:
         raise ValueError("Not a valid enum member of DType.")
+
+
+def quantize_and_reduce_dim(
+    embeddings: np.ndarray, quantize_to: str | DType | None, dimensionality: int | None
+) -> np.ndarray:
+    """
+    Optionally quantize embeddings, then optionally keep only their leading dimensions.
+
+    :param embeddings: The embeddings to process, as a numpy array; its second axis is the one that is truncated.
+    :param quantize_to: The data type to quantize to, passed through DType(...). If None, no quantization is performed.
+    :param dimensionality: The number of leading dimensions to keep. If None, no reduction is performed.
+    :return: The embeddings after quantization (applied first) and truncation (applied second).
+    :raises ValueError: If the passed dimensionality is not None and greater than the model dimensionality.
+    """
+    if quantize_to is not None:
+        quantize_to = DType(quantize_to)  # normalize str/DType input to a DType member
+        embeddings = quantize_embeddings(embeddings, quantize_to)
+
+    if dimensionality is not None:
+        if dimensionality > embeddings.shape[1]:  # only the upper bound is checked here
+            raise ValueError(
+                f"Dimensionality {dimensionality} is greater than the model dimensionality {embeddings.shape[1]}"
+            )
+        embeddings = embeddings[:, :dimensionality]  # NOTE(review): values < 1 are not rejected before slicing
+
+    return embeddings