Skip to content

Commit 2a37f94

Browse files
committed
v0.1
1 parent 2c70c86 commit 2a37f94

2 files changed

Lines changed: 9 additions & 17 deletions

File tree

setup.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -13,15 +13,14 @@
1313

1414
setup(
1515
name="spacy-pythainlp",
16-
version="0.1dev8",
16+
version="0.1",
1717
description="PyThaiNLP For spaCy",
1818
long_description=readme,
1919
long_description_content_type="text/markdown",
2020
author="Wannaphong Phatthiyaphaibun",
2121
author_email="wannaphong@yahoo.com",
2222
url="https://github.com/PyThaiNLP/spaCy-PyThaiNLP",
2323
packages=["spacy_pythainlp"],
24-
# test_suite="tests",
2524
python_requires=">=3.7",
2625
include_package_data=True,
2726
install_requires=requirements,

spacy_pythainlp/core.py

Lines changed: 8 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -109,23 +109,21 @@ def _pos(self, doc:Doc):
109109
else:
110110
_list_txt = [[j.text for j in doc]]
111111
for i in _list_txt:
112-
_w = i
113-
_tag_ = pos_tag(_w, engine=self.pos_engine,corpus=self.pos_corpus)
112+
_word = i
113+
_tag_ = pos_tag(_word, engine=self.pos_engine, corpus=self.pos_corpus)
114114
_pos_tag.extend([tag for _,tag in _tag_])
115115
for i,_ in enumerate(_pos_tag):
116-
#print(doc[i])
117116
doc[i].pos_ = _pos_tag[i]
118117
return doc
119118

120119
def _sent(self, doc:Doc):
121120
from pythainlp.tokenize import sent_tokenize
122121
_text = sent_tokenize(str(doc.text), engine=self.sent_engine)
123122
_doc = word_tokenize('SPLIT'.join(_text), engine=self.tokenize_engine)
124-
#print(_doc)
125123
number_skip = 0
126124
seen_break = False
127125
_new_cut = []
128-
for i,word in enumerate(_doc):
126+
for i, word in enumerate(_doc):
129127
if 'SPLIT' in word:
130128
if word.startswith("SPLIT"):
131129
_new_cut.append("SPLIT")
@@ -137,9 +135,7 @@ def _sent(self, doc:Doc):
137135
_new_cut.append(word)
138136
else:
139137
_new_cut.append(word)
140-
#print(_new_cut)
141-
for i,word in enumerate(_new_cut):
142-
#print(str(i),str(word))
138+
for i, word in enumerate(_new_cut):
143139
if i-number_skip == len(doc) -1:
144140
break
145141
elif i == 0:
@@ -166,14 +162,14 @@ def _dep(self, doc:Doc):
166162
heads = []
167163
lemmas = []
168164
offset = 0
169-
_dep_temp = dependency_parsing(text,model=self.dependency_parsing_model, engine=self.dependency_parsing_engine, tag="list")
165+
_dep_temp = dependency_parsing(text, model=self.dependency_parsing_model, engine=self.dependency_parsing_engine, tag="list")
170166
for i in _dep_temp:
171-
idx,word,_,postag,_,_,head,dep,_,space = i
167+
idx, word, _, postag, _, _, head, dep, _, space = i
172168
words.append(word)
173169
pos.append(postag)
174170
heads.append(int(head))
175171
deps.append(dep)
176-
if space=='_':
172+
if space == '_':
177173
spaces.append(True)
178174
else:
179175
spaces.append(False)
@@ -189,13 +185,11 @@ def _ner(self, doc:Doc):
189185
_ner_ =[]
190186
for i in _list_txt:
191187
_ner_.extend(self.ner.tag(i, pos=False))
192-
#print(_ner_)
193188
_new_ner = []
194189
c=0
195190
_t=""
196191
for i,(w, tag) in enumerate(_ner_):
197192
len_w = len(w)
198-
#print(str(i),str(w),str(tag))
199193
if i+1 == len(_ner_) and _t != "":
200194
_new_ner[-1][1] = c+len_w
201195
elif i+1 == len(_ner_) and tag.startswith("B-"):
@@ -213,7 +207,6 @@ def _ner(self, doc:Doc):
213207
_t=""
214208
c+=len_w
215209
_ents = []
216-
#print(_new_ner)
217210
for start, end, label in _new_ner:
218211
span = doc.char_span(start, end, label=label, alignment_mode="contract")
219212
if span is None:
@@ -223,7 +216,7 @@ def _ner(self, doc:Doc):
223216

224217
doc.ents = _ents
225218
return doc
226-
219+
227220
def _vec(self):
228221
from pythainlp.word_vector import WordVector
229222
_wv = WordVector(model_name=self.word_vector_model)

0 commit comments

Comments
 (0)