Merge pull request #19 from MinishLab/fix_pca_bug

stephantul · web-flow · commit 8e0be629db91 · 2024-09-20T22:12:07.000+02:00
Fix PCA bug: skip PCA with a warning when pca_dims is not smaller than the vocabulary size
diff --git a/README.md b/README.md
@@ -108,7 +108,7 @@ vocabulary = ["word1", "word2", "word3"]
 model_name = "BAAI/bge-base-en-v1.5"
 
 # Distill the model with the custom vocabulary
-m2v_model = distill(model_name=model_name, vocabulary=vocabulary, pca_dims=256)
+m2v_model = distill(model_name=model_name, vocabulary=vocabulary, pca_dims=None)
 
 # Save the model
 m2v_model.save_pretrained("m2v_model")
diff --git a/model2vec/distill/__main__.py b/model2vec/distill/__main__.py
@@ -104,15 +104,19 @@ def distill(
         tokenizer = create_tokenizer_from_vocab(tokens, unk_token="[UNK]", pad_token="[PAD]")
 
     if pca_dims is not None:
-        if pca_dims < embeddings.shape[1]:
+        if pca_dims >= embeddings.shape[1]:
+            raise ValueError(
+                f"PCA dimension ({pca_dims}) is larger than the number of dimensions in the embeddings ({embeddings.shape[1]})"
+            )
+        if pca_dims >= len(tokens):
+            logger.warning(
+                f"PCA dimension ({pca_dims}) is larger than the number of tokens in the vocabulary ({len(tokens)}). Not applying PCA."
+            )
+        elif pca_dims < embeddings.shape[1]:
             logger.info(f"Applying PCA with n_components {pca_dims}")
 
             p = PCA(n_components=pca_dims, whiten=False)
             embeddings = p.fit_transform(embeddings)
-        else:
-            raise ValueError(
-                f"PCA dimension ({pca_dims}) is larger than the number of dimensions in the embeddings ({embeddings.shape[1]})"
-            )
 
     if apply_zipf:
         logger.info("Applying Zipf weighting")