Fix PretrainedFromHF tokenizer with T5 training

janEbert · janEbert · commit b2fc665668d9 · 2022-12-13T12:12:34.000+01:00
diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py
@@ -388,6 +388,18 @@ def eos(self):
         candidate = self.tokenizer.eos_token_id
         return self._check_token_candidate(candidate)
 
+    @property
+    def bos_token_id(self):
+        """Wrapped tokenizer's BOS token id, passed through `_check_token_candidate`."""
+        bos_id = self.tokenizer.bos_token_id
+        return self._check_token_candidate(bos_id)
+
+    @property
+    def eos_token_id(self):
+        """Wrapped tokenizer's EOS token id, passed through `_check_token_candidate`."""
+        eos_id = self.tokenizer.eos_token_id
+        return self._check_token_candidate(eos_id)
+
     @property
     def additional_special_tokens_ids(self):
         """ All the additional special tokens you may want to use (list of strings)."""