Skip to content

Commit ab70485

Browse files
fix: update vocab_size after training
1 parent 3c0a85a commit ab70485

2 files changed

Lines changed: 3 additions & 1 deletion

File tree

torchTextClassifiers/tokenizers/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ def __init__(
113113
trained: bool = False,
114114
):
115115
super().__init__(
116-
vocab_size, output_vectorized=False, output_dim=output_dim
116+
vocab_size, output_vectorized=False, output_dim=output_dim, padding_idx=padding_idx
117117
) # it outputs token ids and not vectors
118118

119119
self.trained = trained

torchTextClassifiers/tokenizers/ngram.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,7 @@ def __init__(
282282
self.subword_cache = None
283283

284284
self.vocab_size = 3 + self.nwords + self.num_tokens
285+
print("brrrrr ", self.vocab_size)
285286
super().__init__(
286287
vocab_size=self.vocab_size, padding_idx=self.pad_token_id, output_dim=output_dim
287288
)
@@ -301,6 +302,7 @@ def train(self, training_text: List[str]):
301302
idx += 1
302303

303304
self.nwords = len(self.word_to_id)
305+
self.vocab_size = 3 + self.nwords + self.num_tokens
304306

305307
# Create reverse mapping
306308
self.id_to_word = {v: k for k, v in self.word_to_id.items()}

0 commit comments

Comments (0)