Skip to content

Commit 1ee4798

Browse files
fix: resolve "Tokenizer has no len / no attribute token_to_id" error at loading
1 parent b1d4937 commit 1ee4798

1 file changed

Lines changed: 3 additions & 6 deletions

File tree

  • torchTextClassifiers/tokenizers

torchTextClassifiers/tokenizers/base.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -178,17 +178,14 @@ def load(cls, load_path: str):
178178
@classmethod
179179
def load_from_s3(cls, s3_path: str, filesystem):
180180
if filesystem.exists(s3_path) is False:
181-
raise FileNotFoundError(
182-
f"Tokenizer not found at {s3_path}. Please train it first (see src/train_tokenizers)."
183-
)
181+
raise FileNotFoundError(f"Tokenizer not found at {s3_path}.")
184182

185183
with filesystem.open(s3_path, "rb") as f:
186184
json_str = f.read().decode("utf-8")
187185

188186
tokenizer_obj = Tokenizer.from_str(json_str)
189-
tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer_obj)
190-
instance = cls(vocab_size=len(tokenizer), trained=True)
191-
instance.tokenizer = tokenizer
187+
instance = cls(vocab_size=tokenizer_obj.get_vocab_size(), trained=True)
188+
instance.tokenizer = tokenizer_obj
192189
instance._post_training()
193190
return instance
194191

0 commit comments

Comments (0)