-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain-roberta-tokenizer.py
More file actions
53 lines (43 loc) · 1.68 KB
/
Copy pathtrain-roberta-tokenizer.py
File metadata and controls
53 lines (43 loc) · 1.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from __future__ import absolute_import, division, print_function
import sys
import os
import argparse
import json
import pdb
import logging
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer, CharBPETokenizer, SentencePieceBPETokenizer, BertWordPieceTokenizer
# Configure the root logger so records at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO)
# Logger named after this module.
logger = logging.getLogger(__name__)
def main(argv=None):
    """Train a tokenizer on the text files found under a data directory.

    Command-line options:
        --data_dir        directory searched recursively for training files
        --file_suffix     extension (without the dot) of the files to use
        --vocab_size      forwarded to the tokenizer's ``train`` call
        --min_frequency   forwarded to the tokenizer's ``train`` call
        --tokenizer_name  which of the supported tokenizer classes to train

    Paths whose name matches ``*cached*`` are excluded from training. The
    trained model is written to the current directory, with the tokenizer
    name passed as the second argument of ``save_model``.

    Args:
        argv: Optional list of argument strings. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, as before.

    Raises:
        SystemExit: with status 2 (via ``parser.error``) when no training
            file is found or ``--tokenizer_name`` is not a supported name.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', type=str, default='wikitext-2-raw')
    parser.add_argument('--file_suffix', type=str, default='raw')
    parser.add_argument('--vocab_size', default=50265, type=int)
    parser.add_argument('--min_frequency', default=2, type=int)
    parser.add_argument('--tokenizer_name', type=str, default='BertWordPieceTokenizer')
    opt = parser.parse_args(argv)

    data_root = Path(opt.data_dir)
    candidates = {str(p) for p in data_root.glob("**/*.%s" % opt.file_suffix)}
    excluded = {str(p) for p in data_root.glob("**/*cached*")}
    # Sorted because set iteration order is not stable between runs; a
    # fixed file order keeps the training input reproducible.
    paths = sorted(candidates - excluded)
    if not paths:
        # Fail early instead of handing an empty file list to the trainer.
        parser.error(
            "no training files matching '*.%s' found under '%s'"
            % (opt.file_suffix, opt.data_dir)
        )

    # Supported tokenizer implementations, keyed by their command-line name.
    tokenizer_classes = {
        'ByteLevelBPETokenizer': ByteLevelBPETokenizer,
        'CharBPETokenizer': CharBPETokenizer,
        'SentencePieceBPETokenizer': SentencePieceBPETokenizer,
        'BertWordPieceTokenizer': BertWordPieceTokenizer,
    }
    if opt.tokenizer_name not in tokenizer_classes:
        # Report a usage error rather than an uncaught KeyError traceback.
        parser.error(
            "unknown --tokenizer_name '%s' (choose from: %s)"
            % (opt.tokenizer_name, ", ".join(sorted(tokenizer_classes)))
        )
    tokenizer = tokenizer_classes[opt.tokenizer_name]()

    # Customize training
    tokenizer.train(
        files=paths,
        vocab_size=opt.vocab_size,
        min_frequency=opt.min_frequency,
        special_tokens=[
            "<s>",
            "<pad>",
            "</s>",
            "<unk>",
            "<mask>",
        ],
    )

    # Save files to disk
    tokenizer.save_model(".", opt.tokenizer_name)
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()