File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change 1111from torch .nn .utils .rnn import pad_sequence
1212from torch .utils .data import Dataset
1313from transformers import BertTokenizerFast
14+ from transformers .utils import logging as transformers_logging
1415from tqdm import tqdm
1516from sacremoses import MosesTokenizer
1617from transformers .tokenization_utils_base import BatchEncoding
@@ -532,12 +533,19 @@ def __getitem__(self, index: int) -> BatchEncoding:
532533 # rcontext depends on the total size allowed for the input,
533534 # which is self.quote_ctx_len
534535 quote_ctx_start , quote_ctx_end = quote .ctx_bounds (self .quote_ctx_len )
536+ # NOTE: we raise the transformers logging threshold to "error"
537+ # around the tokenizer call to avoid a sequence-length warning.
538+ # Usually, sequences should be truncated to a max length (512
539+ # for BERT). However, in our case, the sequence is later cut
540+ # into segments of configurable size, so this does not apply.
541+ transformers_logging .set_verbosity_error ()
535542 batch = self .tokenizer (
536543 document .tokens [quote_ctx_start :quote_ctx_end ],
537544 is_split_into_words = True ,
538545 truncation = False ,
539546 return_tensors = "pt" ,
540547 )
548+ transformers_logging .set_verbosity_info ()
541549
542550 for key in batch .keys ():
543551 batch [key ] = batch [key ][0 ]
You can’t perform that action at this time.
0 commit comments