
Commit eb21ede

Merge commit (message: "merge")
2 parents: cb7feb2 + 3f5786a

5 files changed: 66 additions & 8 deletions

model2vec/distill/distillation.py

Lines changed: 17 additions & 6 deletions
@@ -314,24 +314,35 @@ def _clean_vocabulary(tokenizer: Tokenizer, vocabulary: list[str], added_tokens:
     n_duplicates = 0
     n_multiword = 0
     for token in vocabulary:
-        if tokenizer.normalizer is not None:
-            token = tokenizer.normalizer.normalize_str(token)
+        normalizer = tokenizer.normalizer
+        if normalizer is not None:
+            token = normalizer.normalize_str(token)
 
         if not token:
             n_empty += 1
             continue
-        if token in seen_tokens or token in added_tokens_set:
-            n_duplicates += 1
-            continue
 
         pre_tokenizer = tokenizer.pre_tokenizer
+        # We need to check whether the pretokenized token is a single word or not.
         if pre_tokenizer is not None:
             pretokenized_tokens = pre_tokenizer.pre_tokenize_str(token)
             if len(pretokenized_tokens) != 1:
                 n_multiword += 1
                 continue
+            new_token = pretokenized_tokens[-1][0]
+        else:
+            new_token = token
+
+        # We need to check whether the pretokenized token is in the vocabulary.
+        # But we need to return the original token, because that will be tokenized
+        # again by the tokenizer during featurization.
+        if new_token in seen_tokens or new_token in added_tokens_set:
+            n_duplicates += 1
+            continue
 
-        seen_tokens.add(token)
+        # Add the possibly pretokenized token to _seen_
+        seen_tokens.add(new_token)
+        # Add the original string to the vocabulary.
         cleaned_vocabulary.append(token)
 
     if n_duplicates:
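
The change above deduplicates on the normalized, pre-tokenized form of each token (new_token), while still appending the original string to the vocabulary, because the original string is what gets tokenized again during featurization. A minimal sketch of that flow, using a stand-in tokenizer (the model id below is illustrative, not part of this commit):

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_pretrained("bert-base-uncased")  # illustrative stand-in

for token in ["Hello", "hello", "new york"]:
    # Normalize first, e.g. lowercasing: "Hello" -> "hello".
    normalized = tokenizer.normalizer.normalize_str(token)
    # Pre-tokenize to detect multiword entries.
    pieces = tokenizer.pre_tokenizer.pre_tokenize_str(normalized)
    if len(pieces) != 1:
        print(f"{token!r}: multiword, skipped")  # "new york" is dropped
        continue
    new_token = pieces[-1][0]
    # "Hello" and "hello" both map to "hello", so the second one
    # now counts as a duplicate instead of slipping through.
    print(f"{token!r} deduplicates as {new_token!r}")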

model2vec/model.py

Lines changed: 15 additions & 0 deletions
@@ -152,6 +152,7 @@ def from_pretrained(
         token: str | None = None,
         normalize: bool | None = None,
         quantize_to: str | DType | None = None,
+        dimensionality: int | None = None,
     ) -> StaticModel:
         """
         Load a StaticModel from a local path or huggingface hub path.
@@ -163,7 +164,11 @@
         :param normalize: Whether to normalize the embeddings.
         :param quantize_to: The dtype to quantize the model to. If None, no quantization is done.
             If a string is passed, it is converted to a DType.
+        :param dimensionality: The dimensionality of the model. If this is None, use the dimensionality of the model.
+            This is useful if you want to load a model with a lower dimensionality.
+            Note that this only applies if you have trained your model using mrl or PCA.
         :return: A StaticModel
+        :raises: ValueError if the dimensionality is greater than the model dimensionality.
         """
         from model2vec.hf_utils import load_pretrained
 
@@ -172,6 +177,16 @@
         if quantize_to is not None:
             quantize_to = DType(quantize_to)
             embeddings = quantize_embeddings(embeddings, quantize_to)
+        if dimensionality is not None:
+            if dimensionality > embeddings.shape[1]:
+                raise ValueError(
+                    f"Dimensionality {dimensionality} is greater than the model dimensionality {embeddings.shape[1]}"
+                )
+            embeddings = embeddings[:, :dimensionality]
+            if config.get("apply_pca", None) is None:
+                logger.warning(
+                    "You are reducing the dimensionality of the model, but we can't find a pca key in the model config. This might not work as expected."
+                )
 
         return cls(
             embeddings,
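
A hedged usage sketch of the new dimensionality argument. The model id is illustrative, and truncation is only meaningful for models trained with MRL or PCA, as the docstring notes:

from model2vec import StaticModel

# Load the full model, then a truncated copy of the same weights.
full = StaticModel.from_pretrained("minishlab/potion-base-8M")  # illustrative id
small = StaticModel.from_pretrained("minishlab/potion-base-8M", dimensionality=64)

print(full.embedding.shape[1])              # e.g. 256
print(small.encode(["hello world"]).shape)  # (1, 64)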

model2vec/py.typed

Whitespace-only changes.

pyproject.toml

Lines changed: 5 additions & 1 deletion
@@ -42,7 +42,11 @@ packages = ["model2vec"]
 include-package-data = true
 
 [tool.setuptools.package-data]
-model2vec = ["assets/modelcards/model_card_template.md", "assets/modelcards/classifier_template.md"]
+model2vec = [
+    "assets/modelcards/model_card_template.md",
+    "assets/modelcards/classifier_template.md",
+    "py.typed"
+]
 
 [project.optional-dependencies]
 dev = [
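
For context, py.typed is the empty PEP 561 marker file; listing it under package-data ensures it ships in built distributions, so type checkers treat model2vec's inline annotations as authoritative. A small sketch of what that enables downstream (hypothetical consumer module):

# downstream_check.py -- hypothetical consumer module.
# With py.typed installed, mypy checks this code against model2vec's own
# annotations instead of treating the package as untyped (Any).
from model2vec import StaticModel

def load_model(path: str) -> StaticModel:
    return StaticModel.from_pretrained(path)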

tests/test_model.py

Lines changed: 29 additions & 1 deletion
@@ -207,12 +207,40 @@ def test_load_pretrained_quantized(
 
     # Load the model back from the same path
     loaded_model = StaticModel.from_pretrained(save_path, quantize_to="float32")
-
     # Assert that the loaded model has the same properties as the original one
     assert loaded_model.embedding.dtype == np.float32
     assert loaded_model.embedding.shape == mock_vectors.shape
 
 
+def test_load_pretrained_dim(
+    tmp_path: Path, mock_vectors: np.ndarray, mock_tokenizer: Tokenizer, mock_config: dict[str, str]
+) -> None:
+    """Test loading a pretrained model with dimensionality."""
+    # Save the model to a temporary path
+    model = StaticModel(vectors=mock_vectors, tokenizer=mock_tokenizer, config=mock_config)
+    save_path = tmp_path / "saved_model"
+    model.save_pretrained(save_path)
+
+    loaded_model = StaticModel.from_pretrained(save_path, dimensionality=2)
+
+    # Assert that the loaded model has the same properties as the original one
+    np.testing.assert_array_equal(loaded_model.embedding, mock_vectors[:, :2])
+    assert loaded_model.tokenizer.get_vocab() == mock_tokenizer.get_vocab()
+    assert loaded_model.config == mock_config
+
+    # Load the model back from the same path
+    loaded_model = StaticModel.from_pretrained(save_path, dimensionality=None)
+
+    # Assert that the loaded model has the same properties as the original one
+    np.testing.assert_array_equal(loaded_model.embedding, mock_vectors)
+    assert loaded_model.tokenizer.get_vocab() == mock_tokenizer.get_vocab()
+    assert loaded_model.config == mock_config
+
+    # Load the model back from the same path
+    with pytest.raises(ValueError):
+        StaticModel.from_pretrained(save_path, dimensionality=3000)
+
+
 def test_initialize_normalize(mock_vectors: np.ndarray, mock_tokenizer: Tokenizer) -> None:
     """Tests whether the normalization initialization is correct."""
     model = StaticModel(mock_vectors, mock_tokenizer, {}, normalize=None)
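
A fixture-free sketch of the same round trip, for readers without the test harness; the tokenizer id and the path are stand-ins, not fixtures from this repository:

import numpy as np
from tokenizers import Tokenizer
from model2vec import StaticModel

tok = Tokenizer.from_pretrained("bert-base-uncased")  # stand-in tokenizer
vectors = np.random.rand(tok.get_vocab_size(), 8).astype(np.float32)

model = StaticModel(vectors, tok, {})
model.save_pretrained("/tmp/dim_demo")  # stand-in path

small = StaticModel.from_pretrained("/tmp/dim_demo", dimensionality=2)
assert small.embedding.shape == (tok.get_vocab_size(), 2)

# Asking for more dimensions than the model has raises ValueError.
try:
    StaticModel.from_pretrained("/tmp/dim_demo", dimensionality=3000)
except ValueError as err:
    print(err)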
