Skip to content

Commit 0db15ff

Browse files
committed
disable inapplicable transformers length warning
1 parent 7603711 commit 0db15ff

1 file changed

Lines changed: 8 additions & 0 deletions

File tree

grimbert/datas.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from torch.nn.utils.rnn import pad_sequence
1212
from torch.utils.data import Dataset
1313
from transformers import BertTokenizerFast
14+
from transformers.utils import logging as transformers_logging
1415
from tqdm import tqdm
1516
from sacremoses import MosesTokenizer
1617
from transformers.tokenization_utils_base import BatchEncoding
@@ -532,12 +533,19 @@ def __getitem__(self, index: int) -> BatchEncoding:
532533
# rcontext depends on the total size allowed for the input,
533534
# which is self.quote_ctx_len
534535
quote_ctx_start, quote_ctx_end = quote.ctx_bounds(self.quote_ctx_len)
536+
# NOTE: we disable the tokenizer's warnings to avoid a length
537+
# ---- warning. Usually, sequences should be truncated to a max
538+
# length (512 for BERT). However, in our case, the sequence is
539+
# later cut into segments of configurable size, so this does
540+
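# not apply. NOTE(review): verbosity is set to INFO afterwards, not restored to its previous level - confirm this is intended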
541+
transformers_logging.set_verbosity_error()
535542
batch = self.tokenizer(
536543
document.tokens[quote_ctx_start:quote_ctx_end],
537544
is_split_into_words=True,
538545
truncation=False,
539546
return_tensors="pt",
540547
)
548+
transformers_logging.set_verbosity_info()
541549

542550
for key in batch.keys():
543551
batch[key] = batch[key][0]

0 commit comments

Comments
 (0)