# Tokenizer.py
from konlpy.tag import Mecab
#from Symbolizer import Symbolizer
from utils import load_raw_math_data
import random as rd
class Tokenizer:
    """Morpheme-level tokenizer built on top of the KoNLPy ``Mecab`` tagger.

    Text is first split on whitespace; every whitespace-delimited word is then
    split into morphemes by ``self.mecab.pos``.  The first morpheme of a word
    is kept verbatim and every following morpheme of the same word is prefixed
    with ``"##"``.
    """

    # Reserved tokens placed at the head of every vocabulary produced by
    # ``tokenize(..., forVocab=True)``; their list positions are their ids.
    SPECIAL_TOKENS = ('PAD', 'UNK', 'MASK', 'BOS', 'EOS')
    # Id written into masked positions; equals the position of 'MASK' above.
    MASK_INDEX = 2
    # Fraction of the sequence length selected for masking.
    MASK_RATIO = 0.15

    def __init__(self):
        self.mecab = Mecab()
        #self.symbolizer = Symbolizer()

    def whitespace_tokenize(self, data):
        """Split ``data`` on runs of whitespace.

        Returns an empty list for an empty or whitespace-only string.
        """
        # str.split() without arguments already ignores leading/trailing
        # whitespace and yields [] for blank input.
        return data.split()

    def tokenize(self, phrase, forVocab=False):
        """Split ``phrase`` into morpheme tokens.

        Args:
            phrase: Text to tokenize.
            forVocab: When false, return every token in order of appearance.
                When true, return a vocabulary instead: the special tokens
                followed by each distinct token in order of first appearance.

        Returns:
            A list of token strings.
        """
        if forVocab:
            output_tokens = list(self.SPECIAL_TOKENS)
            # Companion set so the duplicate check is O(1) instead of a scan
            # over the growing vocabulary list.
            seen = set(output_tokens)
        else:
            output_tokens = []
            seen = None

        for word in self.whitespace_tokenize(phrase):
            for position, (morpheme, _tag) in enumerate(self.mecab.pos(word)):
                # Only the first morpheme of a word is left unprefixed.
                piece = morpheme if position == 0 else "##" + morpheme
                if forVocab:
                    if piece in seen:
                        continue
                    seen.add(piece)
                output_tokens.append(piece)
        return output_tokens

    # END OF ERA
    def phrase2idxTokens(self, symbolized, vocab, seq_len, isBert=False):
        """Convert a phrase into vocabulary indices, optionally masked.

        The phrase is wrapped in ``BOS`` / ``EOS`` markers, tokenized, and each
        token found in ``vocab`` is replaced by the index of its first
        occurrence there.

        Args:
            symbolized: Input text.
            vocab: Sequence of token strings; a token's position is its id.
            seq_len: Unused by this method; kept for interface compatibility.
            isBert: When true, the index list is passed through
                ``masking_tokens`` before being returned.

        Returns:
            ``(idxes, bert_output)``.  ``bert_output`` is an empty list unless
            ``isBert`` is true, in which case it holds the original ids of the
            positions selected for masking.
        """
        # The space before EOS matters: without it the marker is glued to the
        # last word of the phrase and never matches the 'EOS' vocabulary entry.
        tokens = self.tokenize("BOS " + symbolized + " EOS")

        # One pass over the vocabulary instead of two linear scans per token.
        # setdefault keeps the first occurrence, matching list.index().
        first_index = {}
        for index, entry in enumerate(vocab):
            first_index.setdefault(entry, index)

        # NOTE(review): tokens missing from vocab are dropped rather than
        # mapped to 'UNK' -- confirm this is intended.
        idxes = [first_index[token] for token in tokens if token in first_index]

        bert_output = []
        if isBert:
            idxes, bert_output = self.masking_tokens(idxes)
        return idxes, bert_output

    def masking_tokens(self, tokens):
        """Randomly corrupt a share of ``tokens`` for masked-token training.

        About 15% of the positions (at least one) are sampled, never the first
        or the last position.  Each sampled position independently becomes
        the mask id with probability 0.8, a random id with probability 0.1,
        and is left unchanged otherwise.

        Args:
            tokens: Sequence of token ids; it is not modified.

        Returns:
            ``(copied_tokens, bert_output)`` where ``copied_tokens`` is the
            corrupted copy and ``bert_output`` lists the original ids of the
            sampled positions, in sampling order.  Sequences with fewer than
            three elements have no eligible position and are returned as an
            unchanged copy together with an empty list.
        """
        copied_tokens = list(tokens)
        candidates = range(1, len(tokens) - 1)
        if not candidates:
            # Nothing between the first and last element to sample from.
            return copied_tokens, []

        number_of_mask = max(1, int(len(tokens) * self.MASK_RATIO))
        number_of_mask = min(number_of_mask, len(candidates))
        samples = rd.sample(candidates, number_of_mask)

        for sample in samples:
            # Draw per position so the 80/10/10 split applies to each token
            # rather than to the whole sequence at once.
            toss = rd.random()
            if toss > 0.2:
                copied_tokens[sample] = self.MASK_INDEX
            elif toss > 0.1:
                # NOTE(review): the 10..1000 range is hard-coded and may not
                # match the actual vocabulary size -- confirm against callers.
                copied_tokens[sample] = rd.randint(10, 1000)
            # Otherwise the original id is kept on purpose.

        bert_output = [tokens[sample] for sample in samples]
        return copied_tokens, bert_output
if __name__ == "__main__":
    # Script entry point: build ./data/vocab.txt from the raw question corpus.
    #symbolizer = Symbolizer()
    tokenizer = Tokenizer()
    questions, _ = load_raw_math_data("./data/corpus.tsv")
    # Reuse the tagger the tokenizer already created instead of loading a
    # second one.
    mecab = tokenizer.mecab

    # MAKE SYMBOLIZED CORPUS
    # Morphemes are space-separated within a question, and questions are
    # space-separated from each other so the last morpheme of one question is
    # not fused with the first morpheme of the next.
    symbolized_corpus = " ".join(
        " ".join(mecab.morphs(question)) for question in questions
    )

    vocab = tokenizer.tokenize(symbolized_corpus, forVocab=True)

    # One vocabulary entry per line; the line number is the token id.
    with open('./data/vocab.txt', 'w', encoding='utf-8-sig') as f:
        f.writelines(v + '\n' for v in vocab)