diff --git a/CHANGELOG.md b/CHANGELOG.md index 84357afbe..bc75ca09c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,18 @@ and this project adheres to - Full release notes: - Commit history: +## [Unreleased] + +### Changed + +- `pythainlp.tokenize.deepcut`: migrated from the TensorFlow-based `deepcut` + package to a built-in ONNX inference engine, removing the TensorFlow + dependency. The `deepcut.onnx` model (ported from + [LEKCut](https://github.com/PyThaiNLP/LEKCut)) is now bundled with PyThaiNLP. + The `segment()` API is unchanged; the `custom_dict` parameter is kept for + backward compatibility but is no longer applied to the model inference. + Deepcut tests moved from `tests/noauto_tensorflow/` to `tests/noauto_onnx/`. + ## [5.3.3] - 2026-03-26 Security fixes and thai2rom_onnx bug fixes. diff --git a/pyproject.toml b/pyproject.toml index 8eee56708..1aa8eeb6a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -218,7 +218,6 @@ noauto-torch = [ # TensorFlow-based dependencies - for tests.noauto_tensorflow noauto-tensorflow = [ - "deepcut>=0.7.0", "numpy>=1.26.0", ] @@ -247,8 +246,6 @@ full = [ "attaparse==1.0.0", "bpemb>=0.3.6,<0.4", "budoux==0.7.0", - "deepcut==0.7.0.0", - "emoji>=0.6.0,<1", "epitran==1.26.0", "esupar>=1.3.9,<2", 'fairseq>=0.10.0,<0.13;python_version<"3.11"', @@ -453,7 +450,6 @@ module = [ "attaparse.*", "bpemb.*", "budoux.*", - "deepcut.*", "emoji.*", "epitran.*", "esupar.*", diff --git a/pythainlp/corpus/deepcut.onnx b/pythainlp/corpus/deepcut.onnx new file mode 100644 index 000000000..108453cd6 Binary files /dev/null and b/pythainlp/corpus/deepcut.onnx differ diff --git a/pythainlp/corpus/default_db.json b/pythainlp/corpus/default_db.json index a9b86e253..6b97a4208 100644 --- a/pythainlp/corpus/default_db.json +++ b/pythainlp/corpus/default_db.json @@ -1,4 +1,29 @@ { + "deepcut_onnx": { + "name": "deepcut_onnx", + "latest_version": "1.0.0", + "description": "DeepCut ONNX model", + "long_description": "DeepCut Thai word segmentation 
model in ONNX format, ported from the original TensorFlow model", + "url": "https://github.com/PyThaiNLP/LEKCut", + "authors": [ + "Rakpong Kittinaradorn", + "Titipat Achakulvisut", + "Korakot Chaovavanich", + "Kittinan Srithaworn", + "Pattarawat Chormai", + "Chanwit Kaewkasi", + "Tulakan Ruangrong", + "Krichkorn Oparad" + ], + "license": "MIT", + "versions": { + "1.0.0": { + "filename": "deepcut.onnx", + "md5": "f4662560dd9a706bfb1d7790ad6c667f", + "pythainlp_version": ">=5.4.0" + } + } + }, "thainer": { "name": "thainer", "latest_version": "1.5.1", diff --git a/pythainlp/tokenize/deepcut.py b/pythainlp/tokenize/deepcut.py index 611361a6c..a19a76c75 100644 --- a/pythainlp/tokenize/deepcut.py +++ b/pythainlp/tokenize/deepcut.py @@ -1,38 +1,181 @@ # SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project # SPDX-FileType: SOURCE # SPDX-License-Identifier: Apache-2.0 -"""Wrapper for deepcut Thai word segmentation. deepcut is a -Thai word segmentation library using 1D Convolution Neural Network. +"""DeepCut Thai word segmentation using ONNX runtime. -User need to install deepcut (and its dependency: tensorflow) by themselves. +DeepCut is a Thai word segmentation library using 1D Convolution Neural +Network. This module provides ONNX-based inference, removing the need for +TensorFlow. + +The ONNX model is ported from the original DeepCut TensorFlow model, +available from the LEKCut project. :See Also: - * `GitHub repository `_ + * `DeepCut GitHub `_ + * `LEKCut GitHub `_ + +:References: + Rakpong Kittinaradorn, Titipat Achakulvisut, Korakot Chaovavanich, + Kittinan Srithaworn, Pattarawat Chormai, Chanwit Kaewkasi, + Tulakan Ruangrong, Krichkorn Oparad. + (2019, September 23). DeepCut: A Thai word tokenization library using + Deep Neural Network. Zenodo. 
https://doi.org/10.5281/zenodo.3457707 """ from __future__ import annotations -from typing import Union, cast +from typing import TYPE_CHECKING, Optional, Union + +import numpy as np +from onnxruntime import InferenceSession -try: - from deepcut import tokenize -except ImportError as e: - raise ImportError( - "deepcut is not installed. Install it with: pip install deepcut" - ) from e +from pythainlp.corpus import get_corpus_path from pythainlp.util import Trie +if TYPE_CHECKING: + from numpy.typing import NDArray + +_MODEL_NAME: str = "deepcut_onnx" +_N_PAD: int = 21 +_THRESHOLD: float = 0.5 + +# Character type mapping from the original DeepCut model +_CHAR_TYPE: dict[str, str] = { + "กขฃคฆงจชซญฎฏฐฑฒณดตถทธนบปพฟภมยรลวศษสฬอ": "c", + "ฅฉผฟฌหฮ": "n", + "ะาำิีืึุู": "v", + "เแโใไ": "w", + "่้๊๋": "t", + "์ๆฯ.": "s", + "0123456789๑๒๓๔๕๖๗๘๙": "d", + '"': "q", + "'": "q", + "\u2018": "q", + "\u2019": "q", + " ": "p", + "abcdefghijklmnopqrstuvwxyz": "s_e", + "ABCDEFGHIJKLMNOPQRSTUVWXYZ": "b_e", +} + +_CHAR_TYPE_FLAT: dict[str, str] = {} +for _ks, _ct in _CHAR_TYPE.items(): + for _k in _ks: + _CHAR_TYPE_FLAT[_k] = _ct + +_CHARS: list[str] = [ + "\n", " ", "!", '"', "#", "$", "%", "&", "'", "(", ")", "*", "+", + ",", "-", ".", "/", "0", "1", "2", "3", "4", "5", "6", "7", "8", + "9", ":", ";", "<", "=", ">", "?", "@", "A", "B", "C", "D", "E", + "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", + "S", "T", "U", "V", "W", "X", "Y", "Z", "[", "\\", "]", "^", "_", + "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", + "n", "o", "other", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", + "z", "}", "~", "ก", "ข", "ฃ", "ค", "ฅ", "ฆ", "ง", "จ", "ฉ", "ช", + "ซ", "ฌ", "ญ", "ฎ", "ฏ", "ฐ", "ฑ", "ฒ", "ณ", "ด", "ต", "ถ", "ท", + "ธ", "น", "บ", "ป", "ผ", "ฝ", "พ", "ฟ", "ภ", "ม", "ย", "ร", "ฤ", + "ล", "ว", "ศ", "ษ", "ส", "ห", "ฬ", "อ", "ฮ", "ฯ", "ะ", "ั", "า", + "ำ", "ิ", "ี", "ึ", "ื", "ุ", "ู", "ฺ", "เ", "แ", "โ", "ใ", "ไ", + "ๅ", "ๆ", "็", "่", "้", "๊", "๋", 
# Module-level cache for the ONNX session; filled lazily by _get_session().
_session: Optional[InferenceSession] = None


def _get_session() -> InferenceSession:
    """Return the shared ONNX inference session for the DeepCut model.

    The session is created on the first call and stored in the module-level
    ``_session`` variable; every later call returns that same object.

    :return: inference session for the DeepCut ONNX model
    :rtype: onnxruntime.InferenceSession
    :raises FileNotFoundError: if the corpus lookup returns no model path
    """
    global _session
    if _session is not None:
        return _session

    model_path = get_corpus_path(_MODEL_NAME)
    if not model_path:
        raise FileNotFoundError(
            f"corpus-not-found name={_MODEL_NAME!r}\n"
            " DeepCut ONNX model file not found in the package.\n"
            " Try reinstalling PyThaiNLP:\n"
            " pip install --force-reinstall pythainlp"
        )

    # Name the execution provider explicitly.  onnxruntime builds that ship
    # more than one provider (e.g. onnxruntime-gpu) have required an explicit
    # ``providers`` argument since ORT 1.9; the CPU provider is part of every
    # build, so this keeps session creation working on all of them.
    _session = InferenceSession(
        model_path, providers=["CPUExecutionProvider"]
    )
    return _session
def _create_feature_array(
    text: str, n_pad: int = _N_PAD
) -> tuple["NDArray[np.float32]", "NDArray[np.float32]"]:
    """Create character and type feature arrays for ONNX model input.

    Each character of ``text`` yields one row describing the window around
    it.  With ``h = (n_pad - 1) // 2`` and the text padded with ``h`` spaces
    on both sides, a row holds, in this order: the ``h`` following
    characters, the ``h`` preceding characters (nearest first), and finally
    the character itself.  Characters missing from ``_CHARS_MAP`` and
    character types missing from ``_CHAR_TYPES_MAP`` fall back to the
    "other" indices.

    :param str text: input text
    :param int n_pad: window size for padding (default: 21); must be >= 1
    :return: character-index and type-index arrays, both ``float32`` with
        shape ``(len(text), 2 * h + 1)`` -- i.e. ``(len(text), n_pad)`` for
        odd ``n_pad`` -- and ``(0, 2 * h + 1)`` for empty text
    :rtype: tuple[numpy.ndarray, numpy.ndarray]
    :raises ValueError: if ``n_pad`` is smaller than 1
    """
    if n_pad < 1:
        raise ValueError(f"n_pad must be at least 1, got {n_pad}")

    half = (n_pad - 1) // 2
    padded = [" "] * half + list(text) + [" "] * half

    # Encode every padded character exactly once; the windows below only
    # re-index these two vectors instead of repeating the dict lookups
    # for each of the (up to n_pad) windows a character appears in.
    char_ids = np.array(
        [_CHARS_MAP.get(c, _OTHER_CHAR_INDEX) for c in padded],
        dtype=np.float32,
    )
    type_ids = np.array(
        [
            _CHAR_TYPES_MAP.get(
                _CHAR_TYPE_FLAT.get(c, "o"), _OTHER_TYPE_INDEX
            )
            for c in padded
        ],
        dtype=np.float32,
    )

    # Offsets relative to the centre character, in the feature order the
    # model expects: +1..+h (following), -1..-h (preceding), then 0 (itself).
    offsets = np.concatenate(
        (np.arange(1, half + 1), np.arange(-1, -half - 1, -1), [0])
    )
    # Position of each original character inside the padded sequence.
    centres = np.arange(half, half + len(text))
    # Broadcasting gives one row of padded-sequence indices per character;
    # for empty text this is a (0, 2 * h + 1) index, so the result keeps
    # its rank-2 shape.
    window = centres[:, np.newaxis] + offsets[np.newaxis, :]
    return char_ids[window], type_ids[window]
def segment(
    text: str,
    custom_dict: "Union[Trie, list[str], str, None]" = None,
) -> list[str]:
    """Segment Thai text using the DeepCut ONNX model.

    The model returns one score per character.  A score above
    ``_THRESHOLD`` at position ``i`` (for ``i >= 1``) marks character ``i``
    as the start of a new token.  The score at position 0 is not consulted,
    and the last token always runs to the end of ``text``, so joining the
    returned tokens reproduces ``text``.

    :param str text: text to segment
    :param custom_dict: ignored; kept for API compatibility only
    :type custom_dict: Union[pythainlp.util.Trie, list[str], str, None]
    :return: list of word tokens; an empty list if ``text`` is empty or is
        not a ``str``
    :rtype: list[str]

    :Example:
    ::

        from pythainlp.tokenize import deepcut

        deepcut.segment("ทดสอบตัดคำ")
        # output: ['ทดสอบ', 'ตัด', 'คำ']
    """
    if not text or not isinstance(text, str):
        return []

    session = _get_session()
    x_char, x_type = _create_feature_array(text)
    outputs = session.run(None, {"input_1": x_char, "input_2": x_type})
    scores = outputs[0].ravel()

    n_chars = len(text)
    # Only positions 1..n_chars-1 can open a new token.  Limiting the slice
    # to the text length keeps every cut inside ``text`` even if the model
    # output were longer, and a shorter output merely leaves the remaining
    # characters in the final token instead of dropping them.
    cuts = (np.flatnonzero(scores[1:n_chars] > _THRESHOLD) + 1).tolist()
    bounds = [0, *cuts, n_chars]
    # Slice between consecutive boundaries rather than growing each word
    # one character at a time.
    return [text[start:end] for start, end in zip(bounds, bounds[1:])]
class WordTokenizeDeepcutTestCaseN(unittest.TestCase):
    """Tests for deepcut tokenizer (requires onnxruntime)"""

    def _assert_lossless(self, tokens, text):
        """Check that ``tokens`` is a list of non-empty strings that
        concatenate back to ``text``."""
        self.assertIsInstance(tokens, list)
        self.assertTrue(
            all(isinstance(tok, str) and tok for tok in tokens),
            msg=f"deepcut: {tokens}",
        )
        self.assertEqual("".join(tokens), text)

    def test_word_tokenize_deepcut(self):
        # assertIsNotNone would also accept a wrong return type; pin it.
        self.assertIsInstance(word_tokenize(TEXT_1, engine="deepcut"), list)

    def test_deepcut(self):
        assert_segment_handles_none_and_empty(self, deepcut.segment)

        text = "ทดสอบ"
        self._assert_lossless(deepcut.segment(text), text)
        # custom_dict is accepted for backward compatibility only; passing a
        # Trie or a list must neither fail nor lose characters.
        self._assert_lossless(deepcut.segment(text, word_dict_trie()), text)
        self._assert_lossless(deepcut.segment(text, ["ทด", "สอบ"]), text)

        self.assertIsInstance(word_tokenize(text, engine="deepcut"), list)
        self.assertIsInstance(
            word_tokenize(
                text, engine="deepcut", custom_dict=word_dict_trie()
            ),
            list,
        )
""" from unittest import TestLoader, TestSuite # Names of module to be tested -test_packages: list[str] = [ - "tests.noauto_tensorflow.testn_tokenize_tensorflow", -] +test_packages: list[str] = [] def load_tests( diff --git a/tests/noauto_tensorflow/testn_tokenize_tensorflow.py b/tests/noauto_tensorflow/testn_tokenize_tensorflow.py index 03d7155c8..cf47a58e5 100644 --- a/tests/noauto_tensorflow/testn_tokenize_tensorflow.py +++ b/tests/noauto_tensorflow/testn_tokenize_tensorflow.py @@ -8,59 +8,5 @@ # - Potential version conflicts with PyTorch # - Python 3.13+ compatibility issues -import unittest - -from pythainlp.tokenize import ( - deepcut, - word_dict_trie, - word_tokenize, -) - -from ..core.test_tokenize import TEXT_1 -from ..test_helpers import assert_segment_handles_none_and_empty - - -class DetokenizeDeepcutTestCaseN(unittest.TestCase): - """Tests for deepcut tokenizer numeric handling (requires tensorflow)""" - - def test_numeric_data_format_deepcut(self): - self.assertIn( - "127.0.0.1", - word_tokenize("ไอพีของคุณคือ 127.0.0.1 ครับ", engine="deepcut"), - ) - - tokens = word_tokenize( - "เวลา 12:12pm มีโปรโมชั่น 11.11", engine="deepcut" - ) - self.assertTrue( - any(value in tokens for value in ["12:12pm", "12:12"]), - msg=f"deepcut: {tokens}", - ) - self.assertIn("11.11", tokens) - - self.assertIn( - "1,234,567.89", - word_tokenize("รางวัลมูลค่า 1,234,567.89 บาท", engine="deepcut"), - ) - - tokens = word_tokenize("อัตราส่วน 2.5:1 คือ 5:2", engine="deepcut") - self.assertIn("2.5:1", tokens) - self.assertIn("5:2", tokens) - - -class WordTokenizeDeepcutTestCaseN(unittest.TestCase): - """Tests for deepcut tokenizer (requires tensorflow)""" - - def test_word_tokenize_deepcut(self): - self.assertIsNotNone(word_tokenize(TEXT_1, engine="deepcut")) - - def test_deepcut(self): - assert_segment_handles_none_and_empty(self, deepcut.segment) - self.assertIsNotNone(deepcut.segment("ทดสอบ", word_dict_trie())) - self.assertIsNotNone(deepcut.segment("ทดสอบ", ["ทด", "สอบ"])) 
- self.assertIsNotNone(word_tokenize("ทดสอบ", engine="deepcut")) - self.assertIsNotNone( - word_tokenize( - "ทดสอบ", engine="deepcut", custom_dict=word_dict_trie() - ) - ) +# NOTE: deepcut tokenizer was migrated to ONNX and moved to +# tests/noauto_onnx/testn_tokenize_onnx.py