PyThaiNLP
diff --git a/‎README.md‎
Lines changed: 30 additions & 0 deletions b/‎README.md‎
Lines changed: 30 additions & 0 deletions
diff --git a/‎notebooks/example.ipynb‎
Lines changed: 286 additions & 0 deletions b/‎notebooks/example.ipynb‎
Lines changed: 286 additions & 0 deletions
@@ -9,6 +9,8 @@ Work in processing...
 
 ## How to use
 
+
+**Example**
 ```python
 import spacy
 from spacy_pythainlp.core import *
@@ -24,6 +26,34 @@ print(list(list(data.sents)))
 # output: [ผมเป็นคนไทย   แต่มะลิอยากไปโรงเรียนส่วนผมจะไปไหน  , ผมอยากไปเที่ยว]
 ```
 
+You can configure the settings in `nlp.add_pipe`.
+```python
+nlp.add_pipe(
+    "pythainlp", 
+    config={
+        "pos_engine": "perceptron",
+        "pos": True,
+        "pos_corpus": "orchid_ud",
+        "sent_engine": "crfcut",
+        "sent": True,
+        "ner_engine": "thainer",
+        "ner": True,
+        "tokenize_engine": "newmm",
+        "tokenize": False,
+    }
+)
+```
+
+- tokenize: Bool (True or False) to change the word tokenizer. (The default spaCy tokenizer for Thai is newmm from PyThaiNLP.)
+- tokenize_engine: The word tokenizer engine. You can read more: [Options for engine](https://pythainlp.github.io/dev-docs/api/tokenize.html#pythainlp.tokenize.word_tokenize)
+- sent: Bool (True or False) to turn on the sentence tokenizer.
+- sent_engine: The sentence tokenizer engine. You can read more: [Options for engine](https://pythainlp.github.io/dev-docs/api/tokenize.html#pythainlp.tokenize.sent_tokenize)
+- pos: Bool (True or False) to turn on part-of-speech tagging.
+- pos_engine: The part-of-speech engine. You can read more: [Options for engine](https://pythainlp.github.io/dev-docs/api/tag.html#pythainlp.tag.pos_tag)
+- ner: Bool (True or False) to turn on the NER.
+- ner_engine: The NER engine. You can read more: [Options for engine](https://pythainlp.github.io/dev-docs/api/tag.html#pythainlp.tag.NER)
+
+
 ## License
 
 ```
 
@@ -0,0 +1,286 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import spacy\n",
+    "from spacy_pythainlp.core import *"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<spacy_pythainlp.core.PyThaiNLP at 0x28c271310>"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "nlp = spacy.blank('th')\n",
+    "nlp.add_pipe(\n",
+    "    \"pythainlp\", \n",
+    "    config={\n",
+    "        \"pos_engine\": \"perceptron\",\n",
+    "        \"pos\": True,\n",
+    "        \"pos_corpus\": \"orchid_ud\",\n",
+    "        \"sent_engine\": \"crfcut\",\n",
+    "        \"sent\": True,\n",
+    "        \"ner_engine\": \"thainer\",\n",
+    "        \"ner\": True,\n",
+    "        \"tokenize_engine\": \"newmm\",\n",
+    "        \"tokenize\": False,\n",
+    "    }\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "doc=nlp(\"ผมเป็นนักศึกษา ผมจบจากมหาวิทยาลัยขอนแก่น\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[ผมเป็นนักศึกษา , ผมจบจากมหาวิทยาลัยขอนแก่น]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "list(doc.sents)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'PRON'"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "doc[-4].pos_"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/np/q29jfk_d5w92vp4xpqhq96kc0000gn/T/ipykernel_63172/463249936.py:1: DeprecationWarning: [W107] The property `Doc.is_sentenced` is deprecated. Use `Doc.has_annotation(\"SENT_START\")` instead.\n",
+      "  doc.is_sentenced\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "doc.is_sentenced"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[ผมเป็นนักศึกษา , ผมจบจากมหาวิทยาลัยขอนแก่น]"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "list(doc.sents)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from spacy import displacy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<span class=\"tex2jax_ignore\"><div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">ผมเป็นนักศึกษา ผมจบจาก\n",
+       "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
+       "    มหาวิทยาลัยขอนแก่น\n",
+       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">ORGANIZATION</span>\n",
+       "</mark>\n",
+       "</div></span>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "displacy.render(doc, style=\"ent\",jupyter=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/wannaphongphatthiyaphaibun/miniforge3/lib/python3.8/site-packages/spacy/displacy/__init__.py:133: UserWarning: [W005] Doc object not parsed. This means displaCy won't be able to generate a dependency visualization for it. Make sure the Doc was processed with a model that supports dependency parsing, and not just a language class like `English()`. For more info, see the docs:\n",
+      "https://spacy.io/usage/models\n",
+      "  warnings.warn(Warnings.W005)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<span class=\"tex2jax_ignore\"><svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" xml:lang=\"th\" id=\"300119dae5b44e3f9e7caa9b0e973de1-0\" class=\"displacy\" width=\"1450\" height=\"137.0\" direction=\"ltr\" style=\"max-width: none; height: 137.0px; color: #000000; background: #ffffff; font-family: Arial; direction: ltr\">\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"47.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"50\">ผม</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"50\">PRON</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"47.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"225\">เป็น</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"225\">VERB</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"47.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"400\">นักศึกษา</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"400\">NOUN</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"47.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"575\"> </tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"575\">PUNCT</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"47.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"750\">ผม</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"750\">PRON</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"47.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"925\">จบ</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"925\">VERB</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"47.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1100\">จาก</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1100\">ADP</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"47.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1275\">มหาวิทยาลัยขอนแก่น</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1275\">PROPN</tspan>\n",
+       "</text>\n",
+       "</svg></span>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "displacy.render(doc, style=\"dep\",jupyter=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.8.13 ('base')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.13"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "a1d6ff38954a1cdba4cf61ffa51e42f4658fc35985cd256cd89123cae8466a39"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}