Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
c3d577a
Add: Cython C extensions for Thai character and normalization
chanitnan0jr Apr 2, 2026
6610c06
Fix: resolve linting and type issues in bench_full_evidence.py
chanitnan0jr Apr 2, 2026
da6d741
Refactor: deduplicate load_tests across noauto test suites
chanitnan0jr Apr 2, 2026
38f289e
Fix: preserve TypeError behavior of is_thai_char when Cython is loaded
chanitnan0jr Apr 2, 2026
844643b
Refactor: use try/except/else for Cython override in thai.py
chanitnan0jr Apr 2, 2026
1d4e811
Fix: use typing.Optional for Python 3.9 compatibility in bench script
chanitnan0jr Apr 2, 2026
7e92cb7
Fix: correct _normalize_fast.pyx docstring to reflect explicit-import…
chanitnan0jr Apr 2, 2026
1fa0970
Refactor: remove flaky performance assertions from Cython test suite
chanitnan0jr Apr 2, 2026
8bbae81
Fix: avoid F811 redefinition warning for is_thai_char Cython wrapper
chanitnan0jr Apr 2, 2026
8863a5d
Merge branch 'PyThaiNLP:dev' into dev
chanitnan0jr Apr 2, 2026
cee533a
Fix: use collections.abc.Callable per PEP 585 in bench script
chanitnan0jr Apr 2, 2026
3a1089a
Fix: remove unnecessary noqa: F811 from Cython assignment overrides
chanitnan0jr Apr 2, 2026
eac8a0f
Fix: split Cython imports for isort and add return type to make_load_…
chanitnan0jr Apr 3, 2026
cbbf87b
[cd build] CI: Add cibuildwheel for binary wheel distribution
chanitnan0jr Apr 4, 2026
4eaba97
[cd build] CI: Pin GitHub Actions to exact commit SHAs (SonarQube S7637)
chanitnan0jr Apr 4, 2026
7ef91e7
Fix: move compile_py option into the correct TOML table to prevent ha…
chanitnan0jr Apr 4, 2026
10427ad
Fix: Pin remaining download-artifact instance to SHA
chanitnan0jr Apr 4, 2026
4193d6e
Fix: Migrate Cython type coercion directly into C-extension boundary …
chanitnan0jr Apr 4, 2026
048c58d
Test: add coverage for pure-Python fallbacks and Cython ImportError b…
chanitnan0jr Apr 5, 2026
b5b81a7
Fix: register Cython coverage test in core suite
chanitnan0jr Apr 6, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 54 additions & 12 deletions .github/workflows/pypi-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ jobs:
build: ${{ steps.check_build_trigger.outputs.build }}
steps:
- name: Checkout source code
uses: actions/checkout@v6
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
ref: ${{ github.event.pull_request.head.sha }}
- id: check_build_trigger
Expand All @@ -71,42 +71,84 @@ jobs:

steps:
- name: Checkout
uses: actions/checkout@v6
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v6
uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
with:
python-version: ${{ matrix.python-version }}

- name: Install dependencies
run: |
pip install --upgrade build pip twine

- name: Build source distribution and wheels
run: python -m build
- name: Build source distribution
run: python -m build --sdist # was: python -m build

- name: Check distributions
run: twine check dist/*

- name: Store distributions
uses: actions/upload-artifact@v7
uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1
with:
name: dist-sdist # explicit name for downstream retrieval
path: dist

build_wheels:
name: Build binary wheels (${{ matrix.os }})
needs: [check_build_trigger]
if: needs.check_build_trigger.outputs.build
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os:
- ubuntu-latest # → manylinux_2_17_x86_64
- windows-latest # → win_amd64
- macos-13 # → macosx_13_*_x86_64 (Intel)
- macos-14 # → macosx_14_*_arm64 (Apple Silicon)

steps:
- name: Checkout
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

- name: Build wheels
uses: pypa/cibuildwheel@fa04202e88ea28b84d5d4d20696ee8dfc0119436 # v2.23.0
# All config is read from [tool.cibuildwheel] in pyproject.toml:
# build/skip selectors, test command, per-platform archs

- name: Validate wheels
run: |
pip install twine
twine check ./wheelhouse/*.whl

- name: Upload wheel artifacts
uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1
with:
name: cibw-wheels-${{ matrix.os }}
path: ./wheelhouse/*.whl

publish_pypi:
name: Publish to PyPI
runs-on: ubuntu-latest
needs: [build]
needs: [build, build_wheels] # was: needs: [build]
if: github.event_name == 'release' && github.event.action == 'published'
steps:
- name: Retrieve distributions
uses: actions/download-artifact@v7
- name: Retrieve sdist
uses: actions/download-artifact@cc203385981b70ca67e1cc392babf9cc229d5806 # v4.1.9
with:
name: artifact
name: dist-sdist # matches renamed artifact
path: dist

- name: Retrieve binary wheels
uses: actions/download-artifact@cc203385981b70ca67e1cc392babf9cc229d5806 # v4.1.9
with:
pattern: cibw-wheels-* # globs all 4 matrix artifacts
path: dist
merge-multiple: true # flatten: cibw-wheels-os1/a.whl → dist/a.whl

- name: Publish to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
if: github.event_name == 'release' && github.event.action == 'published'
uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # release/v1
with:
skip-existing: true
user: __token__
Expand Down
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,10 @@ dmypy.json

# Cython debug symbols
cython_debug/

# Cython-generated C source files (anywhere in the package tree)
pythainlp/**/*.c

notebooks/iso_11940-dev.ipynb

# vscode devcontainer
Expand Down
50 changes: 49 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0

[build-system]
requires = ["hatchling"]
requires = ["hatchling", "hatch-cython>=0.5.0", "cython>=3.0"]
build-backend = "hatchling.build"

[project]
Expand Down Expand Up @@ -233,6 +233,8 @@ noauto-onnx = [
# Cython-based dependencies - for tests.noauto_cython
noauto-cython = [
"phunspell>=0.1.6",
"hatch-cython>=0.5.0",
"cython>=3.0",
]

# Network-dependent tests - for tests.noauto_network
Expand Down Expand Up @@ -311,6 +313,22 @@ include = [
"README.md",
]

[tool.hatch.build.hooks.cython]
dependencies = ["cython>=3.0"]
optional = true

[tool.hatch.build.hooks.cython.options]
# Compile only .pyx files in pythainlp/_ext — do NOT compile .py files.
# Without compile_py=false, hatch-cython would compile every .py file in
# the package into a Cython extension, which is not what we want.
compile_py = false

# hatch-cython internally invokes setuptools' build_ext. Restrict package
# discovery to pythainlp only so setuptools doesn't error on the flat layout
# (multiple top-level directories: build_tools, fuzz, notebooks, pythainlp).
[tool.setuptools.packages.find]
include = ["pythainlp*"]

[tool.bumpversion]
current_version = "5.3.4"
commit = true
Expand Down Expand Up @@ -497,6 +515,10 @@ module = [
]
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = ["pythainlp._ext.*"]
ignore_missing_imports = true

[tool.pylint.main]
disable = [
"import-error",
Expand All @@ -507,3 +529,29 @@ disable = [
"too-many-branches",
"too-many-statements",
]

# ---------------------------------------------------------------------------
# cibuildwheel — binary wheel build matrix
# Docs: https://cibuildwheel.readthedocs.io/en/stable/options/
# ---------------------------------------------------------------------------
[tool.cibuildwheel]
# CPython 3.9–3.13 (stable; matches requires-python = ">=3.9")
build = "cp39-* cp310-* cp311-* cp312-* cp313-*"
skip = "pp* *-musllinux_*" # PyPy and Alpine excluded (complex toolchain, deferred)

[tool.cibuildwheel.test]
# After wheel install, verify _thai_fast loaded as a compiled .so/.pyd
# (not a pure-Python fallback). No test deps required.
# Note: pythainlp/_ext/_thai_fast has NO .py fallback — ImportError here
# means compilation failed silently, which also fails this step explicitly.
command = "python -c \"import pythainlp._ext._thai_fast as m; assert m.__file__.endswith(('.so', '.pyd')), 'NOT compiled: ' + m.__file__; print('CIBW OK:', m.__file__)\""

[tool.cibuildwheel.linux]
manylinux-x86_64-image = "manylinux2014" # glibc >= 2.17 (RHEL 7+ / Ubuntu 18.04+)
archs = "x86_64" # linux aarch64 deferred — QEMU adds ~20 min/version on GitHub runners

[tool.cibuildwheel.macos]
archs = "auto" # macos-13 runner = Intel (auto → x86_64); macos-14 runner = ARM (auto → arm64)

[tool.cibuildwheel.windows]
archs = "AMD64"
9 changes: 9 additions & 0 deletions pythainlp/_ext/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""Optional Cython-compiled extensions for performance-critical functions.

These extensions are built at install time when a C compiler and Cython are
available. If unavailable (e.g., PyPy, no compiler), the pure Python
implementations in pythainlp.util are used as fallback.
"""
7 changes: 7 additions & 0 deletions pythainlp/_ext/_normalize_fast.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""Type stubs for pythainlp._ext._normalize_fast Cython extension."""

# Stub signatures mirror the cpdef functions in _normalize_fast.pyx;
# keep parameter names and return types in sync with that module.
def remove_tonemark(text: str) -> str: ...
def remove_dup_spaces(text: str) -> str: ...
116 changes: 116 additions & 0 deletions pythainlp/_ext/_normalize_fast.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""Cython-optimized text normalization functions.

Provides faster implementations of remove_tonemark and remove_dup_spaces
using C-level typed memory views and byte filtering.

These functions are API-compatible with their equivalents in
pythainlp.util.normalize and can be used as faster drop-in replacements
when explicitly imported.
"""
# cython: language_level=3
# cython: boundscheck=False
# cython: wraparound=False

import re as _re

from pythainlp import thai_tonemarks as _tonemarks_str

# Frozenset of tone mark characters for O(1) membership test.
# Must contain single-char strings (not ints): when Cython converts a
# Py_UCS4 value via the `in` operator it produces chr(c), not an integer.
cdef frozenset _TONE_SET = frozenset(_tonemarks_str)

# Use the same regex pattern as normalize.py to keep newline behaviour
# identical (collapses sequences of spaces+newlines into a single newline)
_RE_REMOVE_NEWLINES = _re.compile(r"[ \n]*\n[ \n]*")


cpdef str remove_tonemark(object text):
    """Remove Thai tone marks from text using UTF-8 byte-level filtering.

    Thai tone marks occupy the Unicode range U+0E48-U+0E4B, which encodes
    in UTF-8 as the three-byte sequence 0xE0 0xB9 {0x88-0x8B}. Filtering
    at the byte level using typed memory views avoids per-character Python
    object creation and outperforms repeated str.replace() calls on long texts.

    :param text: input text (str or str-like object)
    :type text: str
    :return: text with all Thai tone marks removed
    :rtype: str
    """
    # Coerce first so non-str inputs behave like the pure-Python version.
    cdef str _text = str(text)
    if not _text:
        return _text

    # Fast path: bail out early if none of the four tone marks are present
    cdef Py_UCS4 c
    cdef bint found = False
    for c in _text:
        if c in _TONE_SET:
            found = True
            break
    if not found:
        return _text

    # Encode once to UTF-8 bytes; use memoryview for C-level access.
    # IMPORTANT: the byte pattern below is hard-coded for the four Thai tone
    # marks U+0E48–U+0E4B (encoding: 0xE0 0xB9 {0x88–0x8B}). If
    # pythainlp.thai_tonemarks is ever extended beyond those four codepoints
    # this filter will silently miss any additions; update the scan range
    # in the while-loop accordingly.
    cdef bytes src_bytes = _text.encode("utf-8")
    cdef const unsigned char[:] src = src_bytes
    cdef Py_ssize_t n = len(src)

    # Pre-allocate output buffer (same size as input; result is always smaller)
    cdef bytearray dst_arr = bytearray(n)
    cdef unsigned char[:] dst = dst_arr
    cdef Py_ssize_t i = 0   # read cursor into src
    cdef Py_ssize_t j = 0   # write cursor into dst (j <= i always)
    cdef unsigned char b0

    # Single pass: copy every byte except the 3-byte tone-mark sequences.
    # Skipping whole sequences keeps the output valid UTF-8, since a
    # multi-byte codepoint is either copied entirely or dropped entirely.
    while i < n:
        b0 = src[i]
        # All Thai tone marks share first two bytes 0xE0 0xB9
        if b0 == 0xE0 and i + 2 < n and src[i + 1] == 0xB9:
            if 0x88 <= src[i + 2] <= 0x8B:
                i += 3  # skip tone-mark sequence
                continue
        dst[j] = b0
        j += 1
        i += 1

    # Only the first j bytes of dst_arr were written; slice before decoding.
    return bytes(dst_arr[:j]).decode("utf-8")


cpdef str remove_dup_spaces(object text):
    """Remove duplicate ASCII spaces and collapse newlines; strip result.

    Behaviorally identical to pythainlp.util.normalize.remove_dup_spaces:
    - Only ASCII space (0x20) runs are collapsed (not tabs or other whitespace)
    - Newline normalisation is delegated to the same compiled regex

    :param text: input text (str or str-like object)
    :type text: str
    :return: text without duplicate spaces, with newlines normalised and
        leading/trailing whitespace stripped
    :rtype: str
    """
    # Coerce first so non-str inputs behave like the pure-Python version.
    cdef str _text = str(text)
    cdef list out = []
    cdef Py_UCS4 c
    # True while the previous character was a space; used to drop repeats.
    cdef bint prev_space = False
    for c in _text:
        # Py_UCS4 compares directly against its integer codepoint.
        if c == 32:  # ASCII space 0x20
            if not prev_space:
                out.append(" ")
            prev_space = True
        else:
            out.append(chr(c))
            prev_space = False
    result = "".join(out)
    # Collapse any mix of spaces/newlines around a newline to a single "\n"
    # (same compiled pattern as the pure-Python implementation).
    result = _RE_REMOVE_NEWLINES.sub("\n", result)
    return result.strip()
11 changes: 11 additions & 0 deletions pythainlp/_ext/_thai_fast.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""Type stubs for pythainlp._ext._thai_fast Cython extension."""

# Stub signatures mirror the compiled functions in _thai_fast; keep
# parameter names, defaults, and return types in sync with that module.
def is_thai_char(ch: str) -> bool: ...
def is_thai(text: str, ignore_chars: str = ...) -> bool: ...
def count_thai(
    text: str,
    ignore_chars: str = ...,  # defaults to whitespace + digits + punctuation
) -> float: ...
Loading
Loading