Skip to content

Commit e470390

Browse files
committed
Update data
1 parent df3e8de commit e470390

6 files changed

Lines changed: 329 additions & 152 deletions

File tree

README.md

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ Work in processing...
99
1010
## How to use
1111

12+
13+
**Example**
1214
```python
1315
import spacy
1416
from spacy_pythainlp.core import *
@@ -24,6 +26,34 @@ print(list(list(data.sents)))
2426
# output: [ผมเป็นคนไทย แต่มะลิอยากไปโรงเรียนส่วนผมจะไปไหน , ผมอยากไปเที่ยว]
2527
```
2628

29+
You can configure the settings in `nlp.add_pipe`:
30+
```python
31+
nlp.add_pipe(
32+
"pythainlp",
33+
config={
34+
"pos_engine": "perceptron",
35+
"pos": True,
36+
"pos_corpus": "orchid_ud",
37+
"sent_engine": "crfcut",
38+
"sent": True,
39+
"ner_engine": "thainer",
40+
"ner": True,
41+
"tokenize_engine": "newmm",
42+
"tokenize": False,
43+
}
44+
)
45+
```
46+
47+
- tokenize: Bool (True or False) to change the word tokenizer. (By default, spaCy's Thai tokenizer uses PyThaiNLP's newmm engine.)
48+
- tokenize_engine: The tokenize engine. You can read more: [Options for engine](https://pythainlp.github.io/dev-docs/api/tokenize.html#pythainlp.tokenize.word_tokenize)
49+
- sent: Bool (True or False) to turn on the sentence tokenizer.
50+
- sent_engine: The sentence tokenizer engine. You can read more: [Options for engine](https://pythainlp.github.io/dev-docs/api/tokenize.html#pythainlp.tokenize.sent_tokenize)
51+
- pos: Bool (True or False) to turn on part-of-speech tagging.
52+
- pos_engine: The part-of-speech engine. You can read more: [Options for engine](https://pythainlp.github.io/dev-docs/api/tag.html#pythainlp.tag.pos_tag)
53+
- ner: Bool (True or False) to turn on named-entity recognition (NER).
54+
- ner_engine: The NER engine. You can read more: [Options for engine](https://pythainlp.github.io/dev-docs/api/tag.html#pythainlp.tag.NER)
55+
56+
2757
## License
2858

2959
```

notebooks/example.ipynb

Lines changed: 286 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,286 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"import spacy\n",
10+
"from spacy_pythainlp.core import *"
11+
]
12+
},
13+
{
14+
"cell_type": "code",
15+
"execution_count": 2,
16+
"metadata": {},
17+
"outputs": [
18+
{
19+
"data": {
20+
"text/plain": [
21+
"<spacy_pythainlp.core.PyThaiNLP at 0x28c271310>"
22+
]
23+
},
24+
"execution_count": 2,
25+
"metadata": {},
26+
"output_type": "execute_result"
27+
}
28+
],
29+
"source": [
30+
"nlp = spacy.blank('th')\n",
31+
"nlp.add_pipe(\n",
32+
" \"pythainlp\", \n",
33+
" config={\n",
34+
" \"pos_engine\": \"perceptron\",\n",
35+
" \"pos\": True,\n",
36+
" \"pos_corpus\": \"orchid_ud\",\n",
37+
" \"sent_engine\": \"crfcut\",\n",
38+
" \"sent\": True,\n",
39+
" \"ner_engine\": \"thainer\",\n",
40+
" \"ner\": True,\n",
41+
" \"tokenize_engine\": \"newmm\",\n",
42+
" \"tokenize\": False,\n",
43+
" }\n",
44+
")"
45+
]
46+
},
47+
{
48+
"cell_type": "code",
49+
"execution_count": 3,
50+
"metadata": {},
51+
"outputs": [],
52+
"source": [
53+
"doc=nlp(\"ผมเป็นนักศึกษา ผมจบจากมหาวิทยาลัยขอนแก่น\")"
54+
]
55+
},
56+
{
57+
"cell_type": "code",
58+
"execution_count": 4,
59+
"metadata": {},
60+
"outputs": [
61+
{
62+
"data": {
63+
"text/plain": [
64+
"[ผมเป็นนักศึกษา , ผมจบจากมหาวิทยาลัยขอนแก่น]"
65+
]
66+
},
67+
"execution_count": 4,
68+
"metadata": {},
69+
"output_type": "execute_result"
70+
}
71+
],
72+
"source": [
73+
"list(doc.sents)"
74+
]
75+
},
76+
{
77+
"cell_type": "code",
78+
"execution_count": 5,
79+
"metadata": {},
80+
"outputs": [
81+
{
82+
"data": {
83+
"text/plain": [
84+
"'PRON'"
85+
]
86+
},
87+
"execution_count": 5,
88+
"metadata": {},
89+
"output_type": "execute_result"
90+
}
91+
],
92+
"source": [
93+
"doc[-4].pos_"
94+
]
95+
},
96+
{
97+
"cell_type": "code",
98+
"execution_count": 6,
99+
"metadata": {},
100+
"outputs": [
101+
{
102+
"name": "stderr",
103+
"output_type": "stream",
104+
"text": [
105+
"/var/folders/np/q29jfk_d5w92vp4xpqhq96kc0000gn/T/ipykernel_63172/463249936.py:1: DeprecationWarning: [W107] The property `Doc.is_sentenced` is deprecated. Use `Doc.has_annotation(\"SENT_START\")` instead.\n",
106+
" doc.is_sentenced\n"
107+
]
108+
},
109+
{
110+
"data": {
111+
"text/plain": [
112+
"True"
113+
]
114+
},
115+
"execution_count": 6,
116+
"metadata": {},
117+
"output_type": "execute_result"
118+
}
119+
],
120+
"source": [
121+
"doc.is_sentenced"
122+
]
123+
},
124+
{
125+
"cell_type": "code",
126+
"execution_count": 7,
127+
"metadata": {},
128+
"outputs": [
129+
{
130+
"data": {
131+
"text/plain": [
132+
"[ผมเป็นนักศึกษา , ผมจบจากมหาวิทยาลัยขอนแก่น]"
133+
]
134+
},
135+
"execution_count": 7,
136+
"metadata": {},
137+
"output_type": "execute_result"
138+
}
139+
],
140+
"source": [
141+
"list(doc.sents)"
142+
]
143+
},
144+
{
145+
"cell_type": "code",
146+
"execution_count": 8,
147+
"metadata": {},
148+
"outputs": [],
149+
"source": [
150+
"from spacy import displacy"
151+
]
152+
},
153+
{
154+
"cell_type": "code",
155+
"execution_count": 9,
156+
"metadata": {},
157+
"outputs": [
158+
{
159+
"data": {
160+
"text/html": [
161+
"<span class=\"tex2jax_ignore\"><div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">ผมเป็นนักศึกษา ผมจบจาก\n",
162+
"<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
163+
" มหาวิทยาลัยขอนแก่น\n",
164+
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">ORGANIZATION</span>\n",
165+
"</mark>\n",
166+
"</div></span>"
167+
],
168+
"text/plain": [
169+
"<IPython.core.display.HTML object>"
170+
]
171+
},
172+
"metadata": {},
173+
"output_type": "display_data"
174+
}
175+
],
176+
"source": [
177+
"displacy.render(doc, style=\"ent\",jupyter=True)"
178+
]
179+
},
180+
{
181+
"cell_type": "code",
182+
"execution_count": 10,
183+
"metadata": {},
184+
"outputs": [
185+
{
186+
"name": "stderr",
187+
"output_type": "stream",
188+
"text": [
189+
"/Users/wannaphongphatthiyaphaibun/miniforge3/lib/python3.8/site-packages/spacy/displacy/__init__.py:133: UserWarning: [W005] Doc object not parsed. This means displaCy won't be able to generate a dependency visualization for it. Make sure the Doc was processed with a model that supports dependency parsing, and not just a language class like `English()`. For more info, see the docs:\n",
190+
"https://spacy.io/usage/models\n",
191+
" warnings.warn(Warnings.W005)\n"
192+
]
193+
},
194+
{
195+
"data": {
196+
"text/html": [
197+
"<span class=\"tex2jax_ignore\"><svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" xml:lang=\"th\" id=\"300119dae5b44e3f9e7caa9b0e973de1-0\" class=\"displacy\" width=\"1450\" height=\"137.0\" direction=\"ltr\" style=\"max-width: none; height: 137.0px; color: #000000; background: #ffffff; font-family: Arial; direction: ltr\">\n",
198+
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"47.0\">\n",
199+
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"50\">ผม</tspan>\n",
200+
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"50\">PRON</tspan>\n",
201+
"</text>\n",
202+
"\n",
203+
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"47.0\">\n",
204+
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"225\">เป็น</tspan>\n",
205+
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"225\">VERB</tspan>\n",
206+
"</text>\n",
207+
"\n",
208+
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"47.0\">\n",
209+
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"400\">นักศึกษา</tspan>\n",
210+
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"400\">NOUN</tspan>\n",
211+
"</text>\n",
212+
"\n",
213+
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"47.0\">\n",
214+
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"575\"> </tspan>\n",
215+
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"575\">PUNCT</tspan>\n",
216+
"</text>\n",
217+
"\n",
218+
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"47.0\">\n",
219+
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"750\">ผม</tspan>\n",
220+
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"750\">PRON</tspan>\n",
221+
"</text>\n",
222+
"\n",
223+
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"47.0\">\n",
224+
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"925\">จบ</tspan>\n",
225+
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"925\">VERB</tspan>\n",
226+
"</text>\n",
227+
"\n",
228+
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"47.0\">\n",
229+
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1100\">จาก</tspan>\n",
230+
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1100\">ADP</tspan>\n",
231+
"</text>\n",
232+
"\n",
233+
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"47.0\">\n",
234+
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1275\">มหาวิทยาลัยขอนแก่น</tspan>\n",
235+
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1275\">PROPN</tspan>\n",
236+
"</text>\n",
237+
"</svg></span>"
238+
],
239+
"text/plain": [
240+
"<IPython.core.display.HTML object>"
241+
]
242+
},
243+
"metadata": {},
244+
"output_type": "display_data"
245+
}
246+
],
247+
"source": [
248+
"displacy.render(doc, style=\"dep\",jupyter=True)"
249+
]
250+
},
251+
{
252+
"cell_type": "code",
253+
"execution_count": null,
254+
"metadata": {},
255+
"outputs": [],
256+
"source": []
257+
}
258+
],
259+
"metadata": {
260+
"kernelspec": {
261+
"display_name": "Python 3.8.13 ('base')",
262+
"language": "python",
263+
"name": "python3"
264+
},
265+
"language_info": {
266+
"codemirror_mode": {
267+
"name": "ipython",
268+
"version": 3
269+
},
270+
"file_extension": ".py",
271+
"mimetype": "text/x-python",
272+
"name": "python",
273+
"nbconvert_exporter": "python",
274+
"pygments_lexer": "ipython3",
275+
"version": "3.8.13"
276+
},
277+
"orig_nbformat": 4,
278+
"vscode": {
279+
"interpreter": {
280+
"hash": "a1d6ff38954a1cdba4cf61ffa51e42f4658fc35985cd256cd89123cae8466a39"
281+
}
282+
}
283+
},
284+
"nbformat": 4,
285+
"nbformat_minor": 2
286+
}

0 commit comments

Comments
 (0)