Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,18 @@ and this project adheres to
- Full release notes: <https://github.com/PyThaiNLP/pythainlp/releases>
- Commit history: <https://github.com/PyThaiNLP/pythainlp/compare/v5.3.1...v5.3.3>

## [Unreleased]

### Changed

- `pythainlp.tokenize.deepcut`: migrated from the TensorFlow-based `deepcut`
package to a built-in ONNX inference engine, removing the TensorFlow
dependency. The `deepcut.onnx` model (ported from
[LEKCut](https://github.com/PyThaiNLP/LEKCut)) is now bundled with PyThaiNLP.
The `segment()` API is unchanged; the `custom_dict` parameter is kept for
backward compatibility but is no longer applied to the model inference.
Deepcut tests moved from `tests/noauto_tensorflow/` to `tests/noauto_onnx/`.

## [5.3.3] - 2026-03-26

Security fixes and thai2rom_onnx bug fixes.
Expand Down
4 changes: 0 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,6 @@ noauto-torch = [

# TensorFlow-based dependencies - for tests.noauto_tensorflow
noauto-tensorflow = [
"deepcut>=0.7.0",
"numpy>=1.26.0",
]

Expand Down Expand Up @@ -247,8 +246,6 @@ full = [
"attaparse==1.0.0",
"bpemb>=0.3.6,<0.4",
"budoux==0.7.0",
"deepcut==0.7.0.0",
"emoji>=0.6.0,<1",
"epitran==1.26.0",
"esupar>=1.3.9,<2",
'fairseq>=0.10.0,<0.13;python_version<"3.11"',
Expand Down Expand Up @@ -453,7 +450,6 @@ module = [
"attaparse.*",
"bpemb.*",
"budoux.*",
"deepcut.*",
"emoji.*",
"epitran.*",
"esupar.*",
Expand Down
Binary file added pythainlp/corpus/deepcut.onnx
Binary file not shown.
25 changes: 25 additions & 0 deletions pythainlp/corpus/default_db.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,29 @@
{
"deepcut_onnx": {
"name": "deepcut_onnx",
"latest_version": "1.0.0",
"description": "DeepCut ONNX model",
"long_description": "DeepCut Thai word segmentation model in ONNX format, ported from the original TensorFlow model",
"url": "https://github.com/PyThaiNLP/LEKCut",
"authors": [
"Rakpong Kittinaradorn",
"Titipat Achakulvisut",
"Korakot Chaovavanich",
"Kittinan Srithaworn",
"Pattarawat Chormai",
"Chanwit Kaewkasi",
"Tulakan Ruangrong",
"Krichkorn Oparad"
],
"license": "MIT",
"versions": {
"1.0.0": {
"filename": "deepcut.onnx",
"md5": "f4662560dd9a706bfb1d7790ad6c667f",
"pythainlp_version": ">=5.4.0"
}
}
},
"thainer": {
"name": "thainer",
"latest_version": "1.5.1",
Expand Down
179 changes: 161 additions & 18 deletions pythainlp/tokenize/deepcut.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,181 @@
# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""Wrapper for deepcut Thai word segmentation. deepcut is a
Thai word segmentation library using 1D Convolution Neural Network.
"""DeepCut Thai word segmentation using ONNX runtime.

User need to install deepcut (and its dependency: tensorflow) by themselves.
DeepCut is a Thai word segmentation library using 1D Convolution Neural
Network. This module provides ONNX-based inference, removing the need for
TensorFlow.

The ONNX model is ported from the original DeepCut TensorFlow model,
available from the LEKCut project.

:See Also:
* `GitHub repository <https://github.com/rkcosmos/deepcut>`_
* `DeepCut GitHub <https://github.com/rkcosmos/deepcut>`_
* `LEKCut GitHub <https://github.com/PyThaiNLP/LEKCut>`_

:References:
Rakpong Kittinaradorn, Titipat Achakulvisut, Korakot Chaovavanich,
Kittinan Srithaworn, Pattarawat Chormai, Chanwit Kaewkasi,
Tulakan Ruangrong, Krichkorn Oparad.
(2019, September 23). DeepCut: A Thai word tokenization library using
Deep Neural Network. Zenodo. https://doi.org/10.5281/zenodo.3457707
"""

from __future__ import annotations

from typing import Union, cast
from typing import TYPE_CHECKING, Optional, Union

import numpy as np
from onnxruntime import InferenceSession

try:
from deepcut import tokenize
except ImportError as e:
raise ImportError(
"deepcut is not installed. Install it with: pip install deepcut"
) from e
from pythainlp.corpus import get_corpus_path
from pythainlp.util import Trie

if TYPE_CHECKING:
from numpy.typing import NDArray

# Corpus name of the bundled ONNX model (see pythainlp/corpus/default_db.json).
_MODEL_NAME: str = "deepcut_onnx"
# Sliding-window width (characters of context per position) fed to the model.
_N_PAD: int = 21
# Probability cutoff for classifying a position as a word boundary.
_THRESHOLD: float = 0.5

# Character type mapping from the original DeepCut model
# NOTE(review): "ฟ" appears in both the "c" and "n" strings; the flattening
# loop below makes the later entry ("n") win — presumably mirroring the
# original DeepCut tables, TODO confirm against upstream.
_CHAR_TYPE: dict[str, str] = {
    "กขฃคฆงจชซญฎฏฐฑฒณดตถทธนบปพฟภมยรลวศษสฬอ": "c",
    "ฅฉผฟฌหฮ": "n",
    "ะาำิีืึุู": "v",
    "เแโใไ": "w",
    "่้๊๋": "t",
    "์ๆฯ.": "s",
    "0123456789๑๒๓๔๕๖๗๘๙": "d",
    '"': "q",
    "'": "q",
    "\u2018": "q",
    "\u2019": "q",
    " ": "p",
    "abcdefghijklmnopqrstuvwxyz": "s_e",
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ": "b_e",
}

# Flattened per-character lookup: each character in a key string above maps
# to its type label.
_CHAR_TYPE_FLAT: dict[str, str] = {}
for _ks, _ct in _CHAR_TYPE.items():
    for _k in _ks:
        _CHAR_TYPE_FLAT[_k] = _ct

# Model vocabulary: list index is the integer id fed to the model.
# "other" (index 80) is the fallback id for characters not listed here.
_CHARS: list[str] = [
    "\n", " ", "!", '"', "#", "$", "%", "&", "'", "(", ")", "*", "+",
    ",", "-", ".", "/", "0", "1", "2", "3", "4", "5", "6", "7", "8",
    "9", ":", ";", "<", "=", ">", "?", "@", "A", "B", "C", "D", "E",
    "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R",
    "S", "T", "U", "V", "W", "X", "Y", "Z", "[", "\\", "]", "^", "_",
    "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
    "n", "o", "other", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y",
    "z", "}", "~", "ก", "ข", "ฃ", "ค", "ฅ", "ฆ", "ง", "จ", "ฉ", "ช",
    "ซ", "ฌ", "ญ", "ฎ", "ฏ", "ฐ", "ฑ", "ฒ", "ณ", "ด", "ต", "ถ", "ท",
    "ธ", "น", "บ", "ป", "ผ", "ฝ", "พ", "ฟ", "ภ", "ม", "ย", "ร", "ฤ",
    "ล", "ว", "ศ", "ษ", "ส", "ห", "ฬ", "อ", "ฮ", "ฯ", "ะ", "ั", "า",
    "ำ", "ิ", "ี", "ึ", "ื", "ุ", "ู", "ฺ", "เ", "แ", "โ", "ใ", "ไ",
    "ๅ", "ๆ", "็", "่", "้", "๊", "๋", "์", "ํ", "๐", "๑", "๒", "๓",
    "๔", "๕", "๖", "๗", "๘", "๙", "\u2018", "\u2019", "\ufeff",
]
_CHARS_MAP: dict[str, int] = {v: k for k, v in enumerate(_CHARS)}

# Character-type vocabulary: list index is the integer type id for the model.
_CHAR_TYPES: list[str] = [
    "b_e", "c", "d", "n", "o", "p", "q", "s", "s_e", "t", "v", "w",
]
_CHAR_TYPES_MAP: dict[str, int] = {v: k for k, v in enumerate(_CHAR_TYPES)}

# Default index for unknown characters and types
_OTHER_CHAR_INDEX: int = _CHARS_MAP.get("other", 80)
_OTHER_TYPE_INDEX: int = _CHAR_TYPES_MAP.get("o", 4)

# Lazily-initialized, module-wide ONNX session cache (see _get_session()).
_session: Optional[InferenceSession] = None


def _get_session() -> InferenceSession:
    """Lazily load the DeepCut ONNX model and cache the session globally."""
    global _session
    if _session is not None:
        return _session

    path = get_corpus_path(_MODEL_NAME)
    if not path:
        # The model ships with the package, so a missing file indicates a
        # broken installation rather than a failed download.
        raise FileNotFoundError(
            f"corpus-not-found name={_MODEL_NAME!r}\n"
            " DeepCut ONNX model file not found in the package.\n"
            " Try reinstalling PyThaiNLP:\n"
            " pip install --force-reinstall pythainlp"
        )
    _session = InferenceSession(path)
    return _session


def _create_feature_array(
    text: str, n_pad: int = _N_PAD
) -> tuple["NDArray[np.float32]", "NDArray[np.float32]"]:
    """Build the two model input matrices for *text*.

    For each character position a window of ``n_pad`` characters is
    collected — the following characters, the preceding characters in
    reversed order, then the center character itself — and mapped to
    integer character ids and character-type ids.

    :param str text: input text
    :param int n_pad: window size for padding (default: 21)
    :return: character and type feature arrays of shape (n, n_pad)
    :rtype: tuple[numpy.ndarray, numpy.ndarray]
    """
    half = (n_pad - 1) // 2
    # Pad both ends with spaces so windows at the edges stay full-width.
    padded = [" "] * half + list(text) + [" "] * half

    def _window(pos: int) -> list[str]:
        # Order matters for the model: after-context, reversed
        # before-context, then the center character last.
        return (
            padded[pos + 1 : pos + half + 1]
            + padded[pos - half : pos][::-1]
            + [padded[pos]]
        )

    windows = [_window(i) for i in range(half, half + len(text))]
    char_rows = [
        [_CHARS_MAP.get(ch, _OTHER_CHAR_INDEX) for ch in win]
        for win in windows
    ]
    type_rows = [
        [
            _CHAR_TYPES_MAP.get(
                _CHAR_TYPE_FLAT.get(ch, "o"), _OTHER_TYPE_INDEX
            )
            for ch in win
        ]
        for win in windows
    ]
    return (
        np.array(char_rows, dtype=np.float32),
        np.array(type_rows, dtype=np.float32),
    )


def segment(
    text: str,
    custom_dict: Union[Trie, list[str], str, None] = None,
) -> list[str]:
    """Segment Thai text using the DeepCut ONNX model.

    :param str text: text to segment
    :param custom_dict: ignored; kept for API compatibility only
    :type custom_dict: Union[pythainlp.util.Trie, list[str], str, None]
    :return: list of word tokens
    :rtype: list[str]

    :Example:
    ::

        from pythainlp.tokenize import deepcut

        deepcut.segment("ทดสอบตัดคำ")
        # output: ['ทดสอบ', 'ตัด', 'คำ']
    """
    # Guard against None and non-string input; both yield an empty result.
    if not text or not isinstance(text, str):
        return []

    session = _get_session()
    x_char, x_type = _create_feature_array(text)
    # One boundary probability per character position.
    outputs = session.run(None, {"input_1": x_char, "input_2": x_type})
    y_predict = (outputs[0].ravel() > _THRESHOLD).astype(int)
    # Shift by one: a positive prediction at position i+1 marks a boundary
    # after character i; the final character always closes the last word.
    word_end = y_predict[1:].tolist() + [1]

    tokens: list[str] = []
    word = ""
    for char, is_end in zip(text, word_end):
        word += char
        if is_end:
            tokens.append(word)
            word = ""
    return tokens
48 changes: 48 additions & 0 deletions tests/noauto_onnx/testn_tokenize_onnx.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,45 @@
import unittest

from pythainlp.tokenize import (
deepcut,
oskut,
sefr_cut,
word_dict_trie,
word_tokenize,
)

from ..core.test_tokenize import TEXT_1
from ..test_helpers import assert_segment_handles_none_and_empty


class DetokenizeDeepcutTestCaseN(unittest.TestCase):
    """Tests for deepcut tokenizer numeric handling (requires onnxruntime)"""

    def test_numeric_data_format_deepcut(self):
        # An IP address must survive tokenization as one token.
        self.assertIn(
            "127.0.0.1",
            word_tokenize("ไอพีของคุณคือ 127.0.0.1 ครับ", engine="deepcut"),
        )

        # Clock times may or may not keep the am/pm suffix attached,
        # so either "12:12pm" or "12:12" is accepted.
        tokens = word_tokenize(
            "เวลา 12:12pm มีโปรโมชั่น 11.11", engine="deepcut"
        )
        self.assertTrue(
            any(value in tokens for value in ["12:12pm", "12:12"]),
            msg=f"deepcut: {tokens}",
        )
        self.assertIn("11.11", tokens)

        # Thousands separators must not split the number.
        self.assertIn(
            "1,234,567.89",
            word_tokenize("รางวัลมูลค่า 1,234,567.89 บาท", engine="deepcut"),
        )

        # Ratios with colons must stay intact.
        tokens = word_tokenize("อัตราส่วน 2.5:1 คือ 5:2", engine="deepcut")
        self.assertIn("2.5:1", tokens)
        self.assertIn("5:2", tokens)

class DetokenizeSEFRCutTestCaseN(unittest.TestCase):
"""Tests for sefr_cut tokenizer numeric handling (requires onnxruntime)"""

Expand Down Expand Up @@ -48,6 +78,24 @@ def test_numeric_data_format_sefr_cut(self):
self.assertIn("5:2", tokens)


class WordTokenizeDeepcutTestCaseN(unittest.TestCase):
    """Tests for deepcut tokenizer (requires onnxruntime)"""

    def test_word_tokenize_deepcut(self):
        # Smoke test: segmentation of a longer shared fixture succeeds.
        self.assertIsNotNone(word_tokenize(TEXT_1, engine="deepcut"))

    def test_deepcut(self):
        # None/empty inputs must return [] without raising.
        assert_segment_handles_none_and_empty(self, deepcut.segment)
        # custom_dict is accepted for backward compatibility (a Trie or a
        # word list) even though the ONNX engine ignores it.
        self.assertIsNotNone(deepcut.segment("ทดสอบ", word_dict_trie()))
        self.assertIsNotNone(deepcut.segment("ทดสอบ", ["ทด", "สอบ"]))
        self.assertIsNotNone(word_tokenize("ทดสอบ", engine="deepcut"))
        self.assertIsNotNone(
            word_tokenize(
                "ทดสอบ", engine="deepcut", custom_dict=word_dict_trie()
            )
        )

class WordTokenizeOSKutTestCaseN(unittest.TestCase):
"""Tests for oskut tokenizer (requires onnxruntime)"""

Expand Down
8 changes: 4 additions & 4 deletions tests/noauto_tensorflow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
Test functions that require TensorFlow and its ecosystem dependencies:
- tensorflow
- keras
- deepcut

These tests are NOT run in automated CI workflows due to:
- Very large dependencies (~1-2 GB for tensorflow)
Expand All @@ -15,14 +14,15 @@

These tests are kept for manual testing and may be run in separate CI
workflows dedicated to TensorFlow-based features.

NOTE: deepcut tokenizer was migrated to ONNX; its tests are now in
tests/noauto_onnx/.
"""

from unittest import TestLoader, TestSuite

# Names of modules to be tested
test_packages: list[str] = [
"tests.noauto_tensorflow.testn_tokenize_tensorflow",
]
test_packages: list[str] = []


def load_tests(
Expand Down
Loading
Loading