Updated docstrings

Pringled · Pringled · commit 46cba79210de · 2025-10-03T11:50:53.000+02:00
diff --git a/model2vec/distill/distillation.py b/model2vec/distill/distillation.py
@@ -60,7 +60,7 @@ def distill_from_model(
     :param quantize_to: The data type to quantize to. Can be any of the DType enum members or their string equivalents.
     :param use_subword: DEPRECATED: If this is not set to None, we show a warning. It doesn't do anything.
     :param vocabulary_quantization: The number of clusters to use for vocabulary quantization. If this is None, no quantization is performed.
-    :param pooling: The pooling strategy to use for creating embeddings. Can be one of "mean", "last", or "cls".
+    :param pooling: The pooling strategy to use for creating embeddings. Can be one of "mean" (default), "last", "first", or "pooler".
     :return: A StaticModel
     :raises: ValueError if the vocabulary is empty after preprocessing.
 
@@ -259,7 +259,7 @@ def distill(
     :param quantize_to: The data type to quantize to. Can be any of the DType enum members or their string equivalents.
     :param use_subword: DEPRECATED: If this is not set to None, we show a warning. It doesn't do anything.
     :param vocabulary_quantization: The number of clusters to use for vocabulary quantization. If this is None, no quantization is performed.
-    :param pooling: The pooling strategy to use for creating embeddings. Can be one of "mean", "last", or "cls".
+    :param pooling: The pooling strategy to use for creating embeddings. Can be one of "mean" (default), "last", "first", or "pooler".
     :return: A StaticModel
 
     """
diff --git a/model2vec/distill/inference.py b/model2vec/distill/inference.py
@@ -218,22 +218,30 @@ def post_process_embeddings(
         if pca_dims > embeddings.shape[1]:
             logger.warning(
                 f"PCA dimension ({pca_dims}) is larger than the number of dimensions in the embeddings ({embeddings.shape[1]}). "
-                "Applying PCA, but not reducing dimensionality. If this is not desired, set `pca_dims` to None."
+                "Applying PCA, but not reducing dimensionality. If this is not desired, please set `pca_dims` to None. "
+                "Applying PCA will probably improve performance, so consider just leaving it."
             )
             pca_dims = embeddings.shape[1]
         if pca_dims >= embeddings.shape[0]:
             logger.warning(
                 f"PCA dimension ({pca_dims}) is larger than the number of tokens in the vocabulary ({embeddings.shape[0]}). Not applying PCA."
             )
         elif pca_dims <= embeddings.shape[1]:
+            if isinstance(pca_dims, float):
+                logger.info(f"Applying PCA with {pca_dims} explained variance.")
+            else:
+                logger.info(f"Applying PCA with n_components {pca_dims}")
+
             orig_dims = embeddings.shape[1]
             p = PCA(n_components=pca_dims, svd_solver="full")
             embeddings = p.fit_transform(embeddings)
+
             if embeddings.shape[1] < orig_dims:
-                logger.info(
-                    f"Reduced dimensionality {orig_dims} -> {embeddings.shape[1]} "
-                    f"(explained var ratio: {np.sum(p.explained_variance_ratio_):.3f})."
-                )
+                explained_variance_ratio = np.sum(p.explained_variance_ratio_)
+                explained_variance = np.sum(p.explained_variance_)
+                logger.info(f"Reduced dimensionality from {orig_dims} to {embeddings.shape[1]}.")
+                logger.info(f"Explained variance ratio: {explained_variance_ratio:.3f}.")
+                logger.info(f"Explained variance: {explained_variance:.3f}.")
 
     if sif_coefficient is not None:
         logger.info("Estimating word frequencies using Zipf's law, and then applying SIF.")