Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,18 @@ and this project adheres to
- Full release notes: <https://github.com/PyThaiNLP/pythainlp/releases>
- Commit history: <https://github.com/PyThaiNLP/pythainlp/compare/v5.3.1...v5.3.3>

## [Unreleased]

### Changed

- `pythainlp.tokenize.deepcut`: migrated from the TensorFlow-based `deepcut`
package to a built-in ONNX inference engine, removing the TensorFlow
dependency. The `deepcut.onnx` model (ported from
[LEKCut](https://github.com/PyThaiNLP/LEKCut)) is now bundled with PyThaiNLP.
The `segment()` API is unchanged; the `custom_dict` parameter is kept for
backward compatibility but is no longer applied to the model inference.
Deepcut tests moved from `tests/noauto_tensorflow/` to `tests/noauto_onnx/`.

## [5.3.3] - 2026-03-26

Security fixes and thai2rom_onnx bug fixes.
Expand Down
4 changes: 0 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,6 @@ noauto-torch = [

# TensorFlow-based dependencies - for tests.noauto_tensorflow
noauto-tensorflow = [
"deepcut>=0.7.0",
"numpy>=1.26.0",
]

Expand Down Expand Up @@ -247,8 +246,6 @@ full = [
"attaparse==1.0.0",
"bpemb>=0.3.6,<0.4",
"budoux==0.7.0",
"deepcut==0.7.0.0",
"emoji>=0.6.0,<1",
"epitran==1.26.0",
"esupar>=1.3.9,<2",
'fairseq>=0.10.0,<0.13;python_version<"3.11"',
Expand Down Expand Up @@ -453,7 +450,6 @@ module = [
"attaparse.*",
"bpemb.*",
"budoux.*",
"deepcut.*",
"emoji.*",
"epitran.*",
"esupar.*",
Expand Down
Binary file added pythainlp/corpus/deepcut.onnx
Binary file not shown.
25 changes: 25 additions & 0 deletions pythainlp/corpus/default_db.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,29 @@
{
"deepcut_onnx": {
"name": "deepcut_onnx",
"latest_version": "1.0.0",
"description": "DeepCut ONNX model",
"long_description": "DeepCut Thai word segmentation model in ONNX format, ported from the original TensorFlow model",
"url": "https://github.com/PyThaiNLP/LEKCut",
"authors": [
"Rakpong Kittinaradorn",
"Titipat Achakulvisut",
"Korakot Chaovavanich",
"Kittinan Srithaworn",
"Pattarawat Chormai",
"Chanwit Kaewkasi",
"Tulakan Ruangrong",
"Krichkorn Oparad"
],
"license": "MIT",
"versions": {
"1.0.0": {
"filename": "deepcut.onnx",
"md5": "f4662560dd9a706bfb1d7790ad6c667f",
"pythainlp_version": ">=5.4.0"
}
}
},
"thainer": {
"name": "thainer",
"latest_version": "1.5.1",
Expand Down
179 changes: 161 additions & 18 deletions pythainlp/tokenize/deepcut.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,181 @@
# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""Wrapper for deepcut Thai word segmentation. deepcut is a
Thai word segmentation library using 1D Convolution Neural Network.
"""DeepCut Thai word segmentation using ONNX runtime.

User need to install deepcut (and its dependency: tensorflow) by themselves.
DeepCut is a Thai word segmentation library using 1D Convolution Neural
Network. This module provides ONNX-based inference, removing the need for
TensorFlow.

The ONNX model is ported from the original DeepCut TensorFlow model,
available from the LEKCut project.

:See Also:
* `GitHub repository <https://github.com/rkcosmos/deepcut>`_
* `DeepCut GitHub <https://github.com/rkcosmos/deepcut>`_
* `LEKCut GitHub <https://github.com/PyThaiNLP/LEKCut>`_

:References:
Rakpong Kittinaradorn, Titipat Achakulvisut, Korakot Chaovavanich,
Kittinan Srithaworn, Pattarawat Chormai, Chanwit Kaewkasi,
Tulakan Ruangrong, Krichkorn Oparad.
(2019, September 23). DeepCut: A Thai word tokenization library using
Deep Neural Network. Zenodo. https://doi.org/10.5281/zenodo.3457707
"""

from __future__ import annotations

from typing import Union, cast
from typing import TYPE_CHECKING, Optional, Union

import numpy as np
from onnxruntime import InferenceSession

try:
from deepcut import tokenize
except ImportError as e:
raise ImportError(
"deepcut is not installed. Install it with: pip install deepcut"
) from e
from pythainlp.corpus import get_corpus_path
from pythainlp.util import Trie

if TYPE_CHECKING:
from numpy.typing import NDArray

# Corpus name of the bundled ONNX model (see pythainlp/corpus/default_db.json).
_MODEL_NAME: str = "deepcut_onnx"
# Sliding-window width (characters of context per position) fed to the model.
_N_PAD: int = 21
# Probability cutoff for classifying a position as a word boundary.
_THRESHOLD: float = 0.5

# Character type mapping from the original DeepCut model
# NOTE(review): "ฟ" appears in both the "c" and "n" strings; the flattening
# loop below makes the later entry ("n") win — presumably mirroring the
# original DeepCut tables, TODO confirm against upstream.
_CHAR_TYPE: dict[str, str] = {
    "กขฃคฆงจชซญฎฏฐฑฒณดตถทธนบปพฟภมยรลวศษสฬอ": "c",
    "ฅฉผฟฌหฮ": "n",
    "ะาำิีืึุู": "v",
    "เแโใไ": "w",
    "่้๊๋": "t",
    "์ๆฯ.": "s",
    "0123456789๑๒๓๔๕๖๗๘๙": "d",
    '"': "q",
    "'": "q",
    "\u2018": "q",
    "\u2019": "q",
    " ": "p",
    "abcdefghijklmnopqrstuvwxyz": "s_e",
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ": "b_e",
}

# Flattened per-character lookup: each character in a key string above maps
# to its type label.
_CHAR_TYPE_FLAT: dict[str, str] = {}
for _ks, _ct in _CHAR_TYPE.items():
    for _k in _ks:
        _CHAR_TYPE_FLAT[_k] = _ct

# Model vocabulary: list index is the integer id fed to the model.
# "other" (index 80) is the fallback id for characters not listed here.
_CHARS: list[str] = [
    "\n", " ", "!", '"', "#", "$", "%", "&", "'", "(", ")", "*", "+",
    ",", "-", ".", "/", "0", "1", "2", "3", "4", "5", "6", "7", "8",
    "9", ":", ";", "<", "=", ">", "?", "@", "A", "B", "C", "D", "E",
    "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R",
    "S", "T", "U", "V", "W", "X", "Y", "Z", "[", "\\", "]", "^", "_",
    "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
    "n", "o", "other", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y",
    "z", "}", "~", "ก", "ข", "ฃ", "ค", "ฅ", "ฆ", "ง", "จ", "ฉ", "ช",
    "ซ", "ฌ", "ญ", "ฎ", "ฏ", "ฐ", "ฑ", "ฒ", "ณ", "ด", "ต", "ถ", "ท",
    "ธ", "น", "บ", "ป", "ผ", "ฝ", "พ", "ฟ", "ภ", "ม", "ย", "ร", "ฤ",
    "ล", "ว", "ศ", "ษ", "ส", "ห", "ฬ", "อ", "ฮ", "ฯ", "ะ", "ั", "า",
    "ำ", "ิ", "ี", "ึ", "ื", "ุ", "ู", "ฺ", "เ", "แ", "โ", "ใ", "ไ",
    "ๅ", "ๆ", "็", "่", "้", "๊", "๋", "์", "ํ", "๐", "๑", "๒", "๓",
    "๔", "๕", "๖", "๗", "๘", "๙", "\u2018", "\u2019", "\ufeff",
]
_CHARS_MAP: dict[str, int] = {v: k for k, v in enumerate(_CHARS)}

# Character-type vocabulary: list index is the integer type id for the model.
_CHAR_TYPES: list[str] = [
    "b_e", "c", "d", "n", "o", "p", "q", "s", "s_e", "t", "v", "w",
]
_CHAR_TYPES_MAP: dict[str, int] = {v: k for k, v in enumerate(_CHAR_TYPES)}

# Default index for unknown characters and types
_OTHER_CHAR_INDEX: int = _CHARS_MAP.get("other", 80)
_OTHER_TYPE_INDEX: int = _CHAR_TYPES_MAP.get("o", 4)

# Lazily-initialized, module-wide ONNX session cache (see _get_session()).
_session: Optional[InferenceSession] = None


def _get_session() -> InferenceSession:
    """Lazily load the DeepCut ONNX model and cache the session globally."""
    global _session
    if _session is not None:
        return _session

    path = get_corpus_path(_MODEL_NAME)
    if not path:
        # The model ships with the package, so a missing file indicates a
        # broken installation rather than a failed download.
        raise FileNotFoundError(
            f"corpus-not-found name={_MODEL_NAME!r}\n"
            " DeepCut ONNX model file not found in the package.\n"
            " Try reinstalling PyThaiNLP:\n"
            " pip install --force-reinstall pythainlp"
        )
    _session = InferenceSession(path)
    return _session


def _create_feature_array(
    text: str, n_pad: int = _N_PAD
) -> tuple["NDArray[np.float32]", "NDArray[np.float32]"]:
    """Build the two model input matrices for *text*.

    For each character position a window of ``n_pad`` characters is
    collected — the following characters, the preceding characters in
    reversed order, then the center character itself — and mapped to
    integer character ids and character-type ids.

    :param str text: input text
    :param int n_pad: window size for padding (default: 21)
    :return: character and type feature arrays of shape (n, n_pad)
    :rtype: tuple[numpy.ndarray, numpy.ndarray]
    """
    half = (n_pad - 1) // 2
    # Pad both ends with spaces so windows at the edges stay full-width.
    padded = [" "] * half + list(text) + [" "] * half

    def _window(pos: int) -> list[str]:
        # Order matters for the model: after-context, reversed
        # before-context, then the center character last.
        return (
            padded[pos + 1 : pos + half + 1]
            + padded[pos - half : pos][::-1]
            + [padded[pos]]
        )

    windows = [_window(i) for i in range(half, half + len(text))]
    char_rows = [
        [_CHARS_MAP.get(ch, _OTHER_CHAR_INDEX) for ch in win]
        for win in windows
    ]
    type_rows = [
        [
            _CHAR_TYPES_MAP.get(
                _CHAR_TYPE_FLAT.get(ch, "o"), _OTHER_TYPE_INDEX
            )
            for ch in win
        ]
        for win in windows
    ]
    return (
        np.array(char_rows, dtype=np.float32),
        np.array(type_rows, dtype=np.float32),
    )


def segment(
    text: str,
    custom_dict: Union[Trie, list[str], str, None] = None,
) -> list[str]:
    """Segment Thai text using the DeepCut ONNX model.

    :param str text: text to segment
    :param custom_dict: ignored; kept for API compatibility only
    :type custom_dict: Union[pythainlp.util.Trie, list[str], str, None]
    :return: list of word tokens
    :rtype: list[str]

    :Example:
    ::

        from pythainlp.tokenize import deepcut

        deepcut.segment("ทดสอบตัดคำ")
        # output: ['ทดสอบ', 'ตัด', 'คำ']
    """
    # Guard against None and non-string input; both yield an empty result.
    if not text or not isinstance(text, str):
        return []

    session = _get_session()
    x_char, x_type = _create_feature_array(text)
    # One boundary probability per character position.
    outputs = session.run(None, {"input_1": x_char, "input_2": x_type})
    y_predict = (outputs[0].ravel() > _THRESHOLD).astype(int)
    # Shift by one: a positive prediction at position i+1 marks a boundary
    # after character i; the final character always closes the last word.
    word_end = y_predict[1:].tolist() + [1]

    tokens: list[str] = []
    word = ""
    for char, is_end in zip(text, word_end):
        word += char
        if is_end:
            tokens.append(word)
            word = ""
    return tokens
48 changes: 48 additions & 0 deletions tests/noauto_onnx/testn_tokenize_onnx.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,45 @@
import unittest

from pythainlp.tokenize import (
deepcut,
oskut,
sefr_cut,
word_dict_trie,
word_tokenize,
)

from ..core.test_tokenize import TEXT_1
from ..test_helpers import assert_segment_handles_none_and_empty


class DetokenizeDeepcutTestCaseN(unittest.TestCase):
    """Tests for deepcut tokenizer numeric handling (requires onnxruntime)"""

    def test_numeric_data_format_deepcut(self):
        # An IP address must survive tokenization as one token.
        self.assertIn(
            "127.0.0.1",
            word_tokenize("ไอพีของคุณคือ 127.0.0.1 ครับ", engine="deepcut"),
        )

        # Clock times may or may not keep the am/pm suffix attached,
        # so either "12:12pm" or "12:12" is accepted.
        tokens = word_tokenize(
            "เวลา 12:12pm มีโปรโมชั่น 11.11", engine="deepcut"
        )
        self.assertTrue(
            any(value in tokens for value in ["12:12pm", "12:12"]),
            msg=f"deepcut: {tokens}",
        )
        self.assertIn("11.11", tokens)

        # Thousands separators must not split the number.
        self.assertIn(
            "1,234,567.89",
            word_tokenize("รางวัลมูลค่า 1,234,567.89 บาท", engine="deepcut"),
        )

        # Ratios with colons must stay intact.
        tokens = word_tokenize("อัตราส่วน 2.5:1 คือ 5:2", engine="deepcut")
        self.assertIn("2.5:1", tokens)
        self.assertIn("5:2", tokens)

class DetokenizeSEFRCutTestCaseN(unittest.TestCase):
"""Tests for sefr_cut tokenizer numeric handling (requires onnxruntime)"""

Expand Down Expand Up @@ -48,6 +78,24 @@ def test_numeric_data_format_sefr_cut(self):
self.assertIn("5:2", tokens)


class WordTokenizeDeepcutTestCaseN(unittest.TestCase):
    """Tests for deepcut tokenizer (requires onnxruntime)"""

    def test_word_tokenize_deepcut(self):
        # Smoke test: segmentation of a longer shared fixture succeeds.
        self.assertIsNotNone(word_tokenize(TEXT_1, engine="deepcut"))

    def test_deepcut(self):
        # None/empty inputs must return [] without raising.
        assert_segment_handles_none_and_empty(self, deepcut.segment)
        # custom_dict is accepted for backward compatibility (a Trie or a
        # word list) even though the ONNX engine ignores it.
        self.assertIsNotNone(deepcut.segment("ทดสอบ", word_dict_trie()))
        self.assertIsNotNone(deepcut.segment("ทดสอบ", ["ทด", "สอบ"]))
        self.assertIsNotNone(word_tokenize("ทดสอบ", engine="deepcut"))
        self.assertIsNotNone(
            word_tokenize(
                "ทดสอบ", engine="deepcut", custom_dict=word_dict_trie()
            )
        )

class WordTokenizeOSKutTestCaseN(unittest.TestCase):
"""Tests for oskut tokenizer (requires onnxruntime)"""

Expand Down
8 changes: 4 additions & 4 deletions tests/noauto_tensorflow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
Test functions that require TensorFlow and its ecosystem dependencies:
- tensorflow
- keras
- deepcut

These tests are NOT run in automated CI workflows due to:
- Very large dependencies (~1-2 GB for tensorflow)
Expand All @@ -15,14 +14,15 @@

These tests are kept for manual testing and may be run in separate CI
workflows dedicated to TensorFlow-based features.

NOTE: deepcut tokenizer was migrated to ONNX; its tests are now in
tests/noauto_onnx/.
"""

from unittest import TestLoader, TestSuite

# Names of modules to be tested
test_packages: list[str] = [
"tests.noauto_tensorflow.testn_tokenize_tensorflow",
]
test_packages: list[str] = []


def load_tests(
Expand Down
Loading
Loading