Merge branch 'main' into vocquant

stephantul · stephantul · commit e1a5ce53b9d2 · 2025-07-13T21:24:24.000+02:00
diff --git a/model2vec/tokenizer/normalizer.py b/model2vec/tokenizer/normalizer.py
@@ -18,10 +18,18 @@ def replace_normalizer(
     :param tokenizer: The tokenizer to change.
     :return: The tokenizer with a replaced normalizer.
     """
+    spaces_punctuation = tokenizer.encode("a, ,", add_special_tokens=False).tokens
+    if len(spaces_punctuation) != 3:
+        add_space = False
+    else:
+        _, first_comma, second_comma = spaces_punctuation
+        add_space = first_comma == second_comma == ","
+
     normalizer = tokenizer.normalizer
     new_normalizers = []
     for char in punctuation:
-        new_normalizers.append(Replace(char, f" {char} "))
+        replacement = f" {char} " if add_space else f"{char} "
+        new_normalizers.append(Replace(char, replacement))
 
     new_normalizers.append(Replace(Regex(r"\s+"), " "))
     new_normalizers.append(Strip(right=True))
diff --git a/model2vec/train/classifier.py b/model2vec/train/classifier.py
@@ -138,6 +138,7 @@ def fit(
         device: str = "auto",
         X_val: list[str] | None = None,
         y_val: LabelType | None = None,
+        class_weight: torch.Tensor | None = None,
     ) -> StaticModelForClassification:
         """
         Fit a model.
@@ -165,6 +166,8 @@ def fit(
         :param device: The device to train on. If this is "auto", the device is chosen automatically.
         :param X_val: The texts to be used for validation.
         :param y_val: The labels to be used for validation.
+        :param class_weight: A tensor of per-class weights passed to the loss function. If None, all classes are
+            weighted equally. Must have the same length as the number of classes.
         :return: The fitted model.
         :raises ValueError: If either X_val or y_val are provided, but not both.
         """
@@ -199,13 +202,17 @@ def fit(
             base_number = int(min(max(1, (len(train_texts) / 30) // 32), 16))
             batch_size = int(base_number * 32)
             logger.info("Batch size automatically set to %d.", batch_size)
+        
+        if class_weight is not None:
+            if len(class_weight) != len(self.classes_):
+                raise ValueError("class_weight must have the same length as the number of classes.")
 
         logger.info("Preparing train dataset.")
         train_dataset = self._prepare_dataset(train_texts, train_labels)
         logger.info("Preparing validation dataset.")
         val_dataset = self._prepare_dataset(validation_texts, validation_labels)
 
-        c = _ClassifierLightningModule(self, learning_rate=learning_rate)
+        c = _ClassifierLightningModule(self, learning_rate=learning_rate, class_weight=class_weight)
 
         n_train_batches = len(train_dataset) // batch_size
         callbacks: list[Callback] = []
@@ -243,6 +250,9 @@ def fit(
 
         state_dict = {}
         for weight_name, weight in best_model_weights["state_dict"].items():
+            if "loss_function" in weight_name:
+                # Skip the loss function's class weight, as it's not needed for predictions.
+                continue
             state_dict[weight_name.removeprefix("model.")] = weight
 
         self.load_state_dict(state_dict)
@@ -374,12 +384,12 @@ def to_pipeline(self) -> StaticModelPipeline:
 
 
 class _ClassifierLightningModule(pl.LightningModule):
-    def __init__(self, model: StaticModelForClassification, learning_rate: float) -> None:
+    def __init__(self, model: StaticModelForClassification, learning_rate: float, class_weight: torch.Tensor | None = None) -> None:
         """Initialize the LightningModule."""
         super().__init__()
         self.model = model
         self.learning_rate = learning_rate
-        self.loss_function = nn.CrossEntropyLoss() if not model.multilabel else nn.BCEWithLogitsLoss()
+        self.loss_function = nn.CrossEntropyLoss(weight=class_weight) if not model.multilabel else nn.BCEWithLogitsLoss(pos_weight=class_weight)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """Simple forward pass."""
diff --git a/model2vec/version.py b/model2vec/version.py
@@ -1,2 +1,2 @@
-__version_triple__ = (0, 5, 0)
+__version_triple__ = (0, 6, 0)
 __version__ = ".".join(map(str, __version_triple__))
diff --git a/tests/test_trainable.py b/tests/test_trainable.py
@@ -174,6 +174,23 @@ def test_y_val_none() -> None:
         model.fit(X, y, X_val=None, y_val=y_val)
     model.fit(X, y, X_val=None, y_val=None)
 
+def test_class_weight() -> None:
+    """Test the class weight function."""
+    tokenizer = AutoTokenizer.from_pretrained("tests/data/test_tokenizer").backend_tokenizer
+    torch.random.manual_seed(42)
+    vectors_torched = torch.randn(len(tokenizer.get_vocab()), 12)
+    model = StaticModelForClassification(vectors=vectors_torched, tokenizer=tokenizer, hidden_dim=12).to("cpu")
+
+    X = ["dog", "cat"]
+    y = ["0", "1"]
+
+    bad_class_weight = torch.tensor([1.0])
+    with pytest.raises(ValueError):
+        model.fit(X, y, class_weight=bad_class_weight)
+
+    class_weight = torch.tensor([1.0, 2.0])
+    model.fit(X, y, class_weight=class_weight)
+
 
 @pytest.mark.parametrize(
     "y_multi,y_val_multi,should_crash",

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1,2 +1,2 @@` |
| `1` |  | `-__version_triple__ = (0, 5, 0)` |
|  | `1` | `+__version_triple__ = (0, 6, 0)` |
| `2` | `2` | `__version__ = ".".join(map(str, __version_triple__))` |