@@ -109,23 +109,21 @@ def _pos(self, doc:Doc):
109109 else :
110110 _list_txt = [[j .text for j in doc ]]
111111 for i in _list_txt :
112- _w = i
113- _tag_ = pos_tag (_w , engine = self .pos_engine ,corpus = self .pos_corpus )
112+ _word = i
113+ _tag_ = pos_tag (_word , engine = self .pos_engine , corpus = self .pos_corpus )
114114 _pos_tag .extend ([tag for _ ,tag in _tag_ ])
115115 for i ,_ in enumerate (_pos_tag ):
116- #print(doc[i])
117116 doc [i ].pos_ = _pos_tag [i ]
118117 return doc
119118
120119 def _sent (self , doc :Doc ):
121120 from pythainlp .tokenize import sent_tokenize
122121 _text = sent_tokenize (str (doc .text ), engine = self .sent_engine )
123122 _doc = word_tokenize ('SPLIT' .join (_text ), engine = self .tokenize_engine )
124- #print(_doc)
125123 number_skip = 0
126124 seen_break = False
127125 _new_cut = []
128- for i ,word in enumerate (_doc ):
126+ for i , word in enumerate (_doc ):
129127 if 'SPLIT' in word :
130128 if word .startswith ("SPLIT" ):
131129 _new_cut .append ("SPLIT" )
@@ -137,9 +135,7 @@ def _sent(self, doc:Doc):
137135 _new_cut .append (word )
138136 else :
139137 _new_cut .append (word )
140- #print(_new_cut)
141- for i ,word in enumerate (_new_cut ):
142- #print(str(i),str(word))
138+ for i , word in enumerate (_new_cut ):
143139 if i - number_skip == len (doc ) - 1 :
144140 break
145141 elif i == 0 :
@@ -166,14 +162,14 @@ def _dep(self, doc:Doc):
166162 heads = []
167163 lemmas = []
168164 offset = 0
169- _dep_temp = dependency_parsing (text ,model = self .dependency_parsing_model , engine = self .dependency_parsing_engine , tag = "list" )
165+ _dep_temp = dependency_parsing (text , model = self .dependency_parsing_model , engine = self .dependency_parsing_engine , tag = "list" )
170166 for i in _dep_temp :
171- idx ,word ,_ , postag ,_ , _ , head ,dep ,_ , space = i
167+ idx , word , _ , postag , _ , _ , head , dep , _ , space = i
172168 words .append (word )
173169 pos .append (postag )
174170 heads .append (int (head ))
175171 deps .append (dep )
176- if space == '_' :
172+ if space == '_' :
177173 spaces .append (True )
178174 else :
179175 spaces .append (False )
@@ -189,13 +185,11 @@ def _ner(self, doc:Doc):
189185 _ner_ = []
190186 for i in _list_txt :
191187 _ner_ .extend (self .ner .tag (i , pos = False ))
192- #print(_ner_)
193188 _new_ner = []
194189 c = 0
195190 _t = ""
196191 for i ,(w , tag ) in enumerate (_ner_ ):
197192 len_w = len (w )
198- #print(str(i),str(w),str(tag))
199193 if i + 1 == len (_ner_ ) and _t != "" :
200194 _new_ner [- 1 ][1 ] = c + len_w
201195 elif i + 1 == len (_ner_ ) and tag .startswith ("B-" ):
@@ -213,7 +207,6 @@ def _ner(self, doc:Doc):
213207 _t = ""
214208 c += len_w
215209 _ents = []
216- #print(_new_ner)
217210 for start , end , label in _new_ner :
218211 span = doc .char_span (start , end , label = label , alignment_mode = "contract" )
219212 if span is None :
@@ -223,7 +216,7 @@ def _ner(self, doc:Doc):
223216
224217 doc .ents = _ents
225218 return doc
226-
219+
227220 def _vec (self ):
228221 from pythainlp .word_vector import WordVector
229222 _wv = WordVector (model_name = self .word_vector_model )
0 commit comments