Skip to content

Commit ab70485

Browse files
fix: update vocab_size after training
1 parent 3c0a85a commit ab70485

2 files changed

Lines changed: 3 additions & 1 deletion

File tree

torchTextClassifiers/tokenizers/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ def __init__(
113113
trained: bool = False,
114114
):
115115
super().__init__(
116-
vocab_size, output_vectorized=False, output_dim=output_dim
116+
vocab_size, output_vectorized=False, output_dim=output_dim, padding_idx=padding_idx
117117
) # it outputs token ids and not vectors
118118

119119
self.trained = trained

torchTextClassifiers/tokenizers/ngram.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,7 @@ def __init__(
282282
self.subword_cache = None
283283

284284
self.vocab_size = 3 + self.nwords + self.num_tokens
285+
print("brrrrr ", self.vocab_size)
285286
super().__init__(
286287
vocab_size=self.vocab_size, padding_idx=self.pad_token_id, output_dim=output_dim
287288
)
@@ -301,6 +302,7 @@ def train(self, training_text: List[str]):
301302
idx += 1
302303

303304
self.nwords = len(self.word_to_id)
305+
self.vocab_size = 3 + self.nwords + self.num_tokens
304306

305307
# Create reverse mapping
306308
self.id_to_word = {v: k for k, v in self.word_to_id.items()}

0 commit comments

Comments (0)