Merge pull request #18 from MinishLab/finetuning

stephantul · web-flow · commit ecde199eeb04 · 2024-09-20T22:14:38.000+02:00
Turn model into module
diff --git a/model2vec/model.py b/model2vec/model.py
@@ -8,6 +8,7 @@
 import numpy as np
 import torch
 from tokenizers import Encoding, Tokenizer
+from torch import nn
 from torch.nn import EmbeddingBag
 from tqdm import tqdm
 
@@ -19,18 +20,21 @@
 logger = getLogger(__name__)
 
 
-class StaticModel:
-    def __init__(self, vectors: np.ndarray, tokenizer: Tokenizer, config: dict[str, Any]) -> None:
+class StaticModel(nn.Module):
+    def __init__(
+        self, vectors: np.ndarray, tokenizer: Tokenizer, config: dict[str, Any], normalize: bool | None = None
+    ) -> None:
         """
         Initialize the StaticModel.
 
         :param vectors: The vectors to use.
         :param tokenizer: The Transformers tokenizer to use.
         :param config: Any metadata config.
+        :param normalize: Whether to normalize.
         :raises: ValueError if the number of tokens does not match the number of vectors.
         """
+        super().__init__()
         tokens, _ = zip(*sorted(tokenizer.get_vocab().items(), key=lambda x: x[1]))
-        self.vectors = vectors
         self.tokens = tokens
         self.embedding = EmbeddingBag.from_pretrained(torch.from_numpy(vectors))
 
@@ -45,14 +49,75 @@ def __init__(self, vectors: np.ndarray, tokenizer: Tokenizer, config: dict[str,
             self.unk_token_id = None
 
         self.config = config
+        if normalize is not None:
+            self.normalize = normalize
+        else:
+            self.normalize = config.get("normalize", False)
+
+    @property
+    def normalize(self) -> bool:
+        """
+        Whether the output of the forward pass is normalized.
+
+        :return: The current normalize flag.
+        """
+        return self._normalize
+
+    @normalize.setter
+    def normalize(self, value: bool) -> None:
+        """
+        Store the normalize flag and mirror it into the config.
+
+        :param value: The new normalize flag.
+        """
+        # Read the config value before it is overwritten below, so a change can be reported.
+        config_normalize = self.config.get("normalize", False)
+        self._normalize = value
+        if config_normalize != value:
+            message = (
+                f"Set normalization to `{value}`, which does not match config value `{config_normalize}`. Updating config."
+            )
+            logger.warning(message)
+        # The config is written unconditionally, also when the key was missing before.
+        self.config["normalize"] = value
 
     def save_pretrained(self, path: PathLike) -> None:
         """
         Save the pretrained model.
 
         :param path: The path to save to.
         """
-        save_pretrained(Path(path), self.vectors, self.tokenizer, self.config)
+        # Detach and move to CPU first: `Tensor.numpy()` raises for tensors that require grad
+        # (e.g. after unfreezing the embeddings for fine-tuning) or that live on another device.
+        vectors = self.embedding.weight.detach().cpu().numpy()
+        save_pretrained(Path(path), vectors, self.tokenizer, self.config)
+
+    def forward(self, ids: torch.Tensor, offsets: torch.Tensor) -> torch.Tensor:
+        """
+        Pool the token embeddings of each sentence and optionally normalize the result.
+
+        :param ids: The token ids, passed to the embedding bag together with the offsets.
+        :param offsets: The offsets tensor, passed to the embedding bag.
+        :return: The embedding bag output, passed through `torch.nn.functional.normalize`
+            when `self.normalize` is set.
+        """
+        pooled = self.embedding(ids, offsets)
+        if not self.normalize:
+            return pooled
+        return torch.nn.functional.normalize(pooled)
+
+    def tokenize(self, sentences: list[str], max_length: int | None = None) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Tokenize a batch of sentences into a flat id tensor plus per-sentence offsets.
+
+        :param sentences: The sentences to tokenize.
+        :param max_length: The maximum number of tokens to keep per sentence.
+            If this is None, no truncation is done.
+        :return: A tuple of the concatenated token ids of all sentences and the start offset of each
+            sentence within that flat tensor. Both tensors are of dtype long.
+        """
+        encodings: list[Encoding] = self.tokenizer.encode_batch(sentences, add_special_tokens=False)
+        encodings_ids = [encoding.ids for encoding in encodings]
+
+        if self.unk_token_id is not None:
+            # NOTE: Remove the unknown token: necessary for word-level models.
+            encodings_ids = [
+                [token_id for token_id in token_ids if token_id != self.unk_token_id] for token_ids in encodings_ids
+            ]
+        if max_length is not None:
+            encodings_ids = [token_ids[:max_length] for token_ids in encodings_ids]
+
+        # Build the offsets as an explicit long tensor: numpy's default integer is platform-dependent
+        # (int32 on some platforms), and EmbeddingBag needs ids and offsets to share a dtype.
+        lengths = [len(token_ids) for token_ids in encodings_ids[:-1]]
+        offsets = torch.tensor(np.cumsum([0] + lengths), dtype=torch.long)
+        ids = torch.tensor([token_id for token_ids in encodings_ids for token_id in token_ids], dtype=torch.long)
+
+        return ids, offsets
 
     @classmethod
     def from_pretrained(
@@ -80,20 +145,11 @@ def dim(self) -> int:
         """
         return self.vectors.shape[1]
 
-    @staticmethod
-    def normalize(X: np.ndarray) -> np.ndarray:
-        """Normalize an array to unit length."""
-        norms = np.linalg.norm(X, axis=1, keepdims=True)
-        norms[norms == 0] = 1
-
-        return X / norms
-
     def encode(
         self,
         sentences: list[str] | str,
         show_progressbar: bool = False,
         max_length: int | None = 512,
-        norm: bool = False,
         batch_size: int = 1024,
         **kwargs: Any,
     ) -> np.ndarray:
@@ -107,7 +163,6 @@ def encode(
         :param show_progressbar: Whether to show the progress bar.
         :param max_length: The maximum length of the sentences. Any tokens beyond this length will be truncated.
             If this is None, no truncation is done.
-        :param norm: Whether to normalize the embeddings to unit length.
         :param batch_size: The batch size to use.
         :param **kwargs: Any additional arguments. These are ignored.
         :return: The encoded sentences. If a single sentence was passed, a vector is returned.
@@ -125,31 +180,16 @@ def encode(
 
         out_array = np.concatenate(out_arrays, axis=0)
 
-        if norm:
-            out_array = self.normalize(out_array)
-
         if was_single:
             return out_array[0]
 
         return out_array
 
+    @torch.no_grad()
     def _encode_batch(self, sentences: list[str], max_length: int | None) -> np.ndarray:
         """Encode a batch of sentences."""
-        encodings: list[Encoding] = self.tokenizer.encode_batch(sentences, add_special_tokens=False)
-        encodings_ids = [encoding.ids for encoding in encodings]
-
-        if self.unk_token_id is not None:
-            # NOTE: Remove the unknown token: necessary for word-level models.
-            encodings_ids = [
-                [token_id for token_id in token_ids if token_id != self.unk_token_id] for token_ids in encodings_ids
-            ]
-        if max_length is not None:
-            encodings_ids = [token_ids[:max_length] for token_ids in encodings_ids]
-
-        offsets = np.cumsum([0] + [len(token_ids) for token_ids in encodings_ids[:-1]])
-        ids = torch.tensor([token_id for token_ids in encodings_ids for token_id in token_ids], dtype=torch.long)
-
-        return self.embedding(ids, torch.tensor(offsets, dtype=torch.long)).detach().numpy()
+        # Runs under torch.no_grad() (see decorator), so the output carries no autograd graph and can be
+        # converted to numpy without an explicit detach.
+        ids, offsets = self.tokenize(sentences, max_length)
+        return self.forward(ids, offsets).numpy()
 
     @staticmethod
     def _batch(sentences: list[str], batch_size: int) -> Iterator[list[str]]:
diff --git a/tests/test_model.py b/tests/test_model.py
@@ -2,33 +2,29 @@
 
 import numpy as np
 import pytest
-from transformers import PreTrainedTokenizerFast
+from tokenizers import Tokenizer
 
 from model2vec import StaticModel
 
 
-def test_initialization(
-    mock_vectors: np.ndarray, mock_tokenizer: PreTrainedTokenizerFast, mock_config: dict[str, str]
-) -> None:
+def test_initialization(mock_vectors: np.ndarray, mock_tokenizer: Tokenizer, mock_config: dict[str, str]) -> None:
     """Test successful initialization of StaticModel."""
     model = StaticModel(vectors=mock_vectors, tokenizer=mock_tokenizer, config=mock_config)
-    assert model.vectors.shape == (5, 2)
+    # The vectors are held by the EmbeddingBag, so the shape is checked on its weight.
+    assert model.embedding.weight.shape == (5, 2)
     assert len(model.tokens) == 5
     assert model.tokenizer == mock_tokenizer
     assert model.config == mock_config
 
 
-def test_initialization_token_vector_mismatch(
-    mock_tokenizer: PreTrainedTokenizerFast, mock_config: dict[str, str]
-) -> None:
+def test_initialization_token_vector_mismatch(mock_tokenizer: Tokenizer, mock_config: dict[str, str]) -> None:
     """Test if error is raised when number of tokens and vectors don't match."""
+    # Only two vectors are supplied here, deliberately fewer than the tokenizer has tokens.
     mock_vectors = np.array([[0.1, 0.2], [0.2, 0.3]])
     with pytest.raises(ValueError):
         StaticModel(vectors=mock_vectors, tokenizer=mock_tokenizer, config=mock_config)
 
 
 def test_encode_single_sentence(
-    mock_vectors: np.ndarray, mock_tokenizer: PreTrainedTokenizerFast, mock_config: dict[str, str]
+    mock_vectors: np.ndarray, mock_tokenizer: Tokenizer, mock_config: dict[str, str]
 ) -> None:
     """Test encoding of a single sentence."""
     model = StaticModel(vectors=mock_vectors, tokenizer=mock_tokenizer, config=mock_config)
@@ -37,7 +33,7 @@ def test_encode_single_sentence(
 
 
 def test_encode_multiple_sentences(
-    mock_vectors: np.ndarray, mock_tokenizer: PreTrainedTokenizerFast, mock_config: dict[str, str]
+    mock_vectors: np.ndarray, mock_tokenizer: Tokenizer, mock_config: dict[str, str]
 ) -> None:
     """Test encoding of multiple sentences."""
     model = StaticModel(vectors=mock_vectors, tokenizer=mock_tokenizer, config=mock_config)
@@ -46,24 +42,29 @@ def test_encode_multiple_sentences(
 
 
 def test_encode_empty_sentence(
-    mock_vectors: np.ndarray, mock_tokenizer: PreTrainedTokenizerFast, mock_config: dict[str, str]
+    mock_vectors: np.ndarray, mock_tokenizer: Tokenizer, mock_config: dict[str, str]
 ) -> None:
     """Test encoding with an empty sentence."""
     model = StaticModel(vectors=mock_vectors, tokenizer=mock_tokenizer, config=mock_config)
     encoded = model.encode("")
+    # The empty input is expected to map to an all-zero vector of the embedding dimension.
     assert np.array_equal(encoded, np.zeros((2,)))
 
 
-def test_normalize() -> None:
+def test_normalize(mock_vectors: np.ndarray, mock_tokenizer: Tokenizer, mock_config: dict[str, str]) -> None:
     """Test normalization of vectors."""
-    X = np.array([[3, 4], [1, 2], [0, 0]])
-    normalized = StaticModel.normalize(X)
-    expected = np.array([[0.6, 0.8], [0.4472136, 0.89442719], [0, 0]])
+    # Encode the same sentence with normalization off and on; the normalized output must equal
+    # the raw output divided by its norm.
+    s = "word1 word2 word3"
+    model = StaticModel(vectors=mock_vectors, tokenizer=mock_tokenizer, config=mock_config, normalize=False)
+    X = model.encode(s)
+    model = StaticModel(vectors=mock_vectors, tokenizer=mock_tokenizer, config=mock_config, normalize=True)
+    normalized = model.encode(s)
+
+    expected = X / np.linalg.norm(X)
+
     np.testing.assert_almost_equal(normalized, expected)
 
 
 def test_save_pretrained(
-    tmp_path: Path, mock_vectors: np.ndarray, mock_tokenizer: PreTrainedTokenizerFast, mock_config: dict[str, str]
+    tmp_path: Path, mock_vectors: np.ndarray, mock_tokenizer: Tokenizer, mock_config: dict[str, str]
 ) -> None:
     """Test saving a pretrained model."""
     model = StaticModel(vectors=mock_vectors, tokenizer=mock_tokenizer, config=mock_config)
@@ -80,7 +81,7 @@ def test_save_pretrained(
 
 
 def test_load_pretrained(
-    tmp_path: Path, mock_vectors: np.ndarray, mock_tokenizer: PreTrainedTokenizerFast, mock_config: dict[str, str]
+    tmp_path: Path, mock_vectors: np.ndarray, mock_tokenizer: Tokenizer, mock_config: dict[str, str]
 ) -> None:
     """Test loading a pretrained model after saving it."""
     # Save the model to a temporary path
@@ -92,6 +93,33 @@ def test_load_pretrained(
     loaded_model = StaticModel.from_pretrained(save_path)
 
     # Assert that the loaded model has the same properties as the original one
-    np.testing.assert_array_equal(loaded_model.vectors, mock_vectors)
+    np.testing.assert_array_equal(loaded_model.embedding.weight.numpy(), mock_vectors)
     assert loaded_model.tokenizer.get_vocab() == mock_tokenizer.get_vocab()
     assert loaded_model.config == mock_config
+
+
+def test_initialize_normalize(mock_vectors: np.ndarray, mock_tokenizer: Tokenizer) -> None:
+    """Tests whether the normalization initialization is correct."""
+    # Each case is (config, normalize argument, expected flag). An explicit argument overrides the
+    # config; with `normalize=None` the config value is used, defaulting to False when absent.
+    cases = [
+        ({}, None, False),
+        ({}, False, False),
+        ({}, True, True),
+        ({"normalize": False}, True, True),
+        ({"normalize": True}, False, False),
+        ({"normalize": True}, None, True),
+        ({"normalize": False}, None, False),
+    ]
+    for config, normalize, expected in cases:
+        model = StaticModel(mock_vectors, mock_tokenizer, config, normalize=normalize)
+        assert bool(model.normalize) is expected
+
+
+def test_set_normalize(mock_vectors: np.ndarray, mock_tokenizer: Tokenizer) -> None:
+    """Check that assigning to `normalize` is mirrored into the model config."""
+    model = StaticModel(mock_vectors, mock_tokenizer, {}, normalize=True)
+    for flag in (False, True):
+        model.normalize = flag
+        assert model.config == {"normalize": flag}