Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,14 @@
- Full release notes: <https://github.com/PyThaiNLP/pythainlp/releases>
- Commit history: <https://github.com/PyThaiNLP/pythainlp/compare/v5.3.3...v5.3.4>

## [Unreleased]

### Added

- ThaiG2P v3 engine (`thaig2p_v3`) for `transliterate()` (#1398).
  Uses a char-level Transformer model in ONNX format from
  <https://github.com/wannaphong/thai-g2p-v3>.

Check failure on line 26 in CHANGELOG.md

View workflow job for this annotation

GitHub Actions / Run Markdown lint

Bare URL used

CHANGELOG.md:26:3 MD034/no-bare-urls Bare URL used [Context: "https://github.com/wannaphong/..."] https://github.com/DavidAnson/markdownlint/blob/v0.40.0/doc/md034.md

## [5.3.4] - 2026-04-02

### Fixed
Expand Down
1 change: 1 addition & 0 deletions docs/api/transliterate.rst
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ This section includes multiple transliteration engines designed to suit various
- **ipa**: Provides International Phonetic Alphabet (IPA) representation of Thai text.
- **thaig2p**: (default) Transliterates Thai text into the Grapheme-to-Phoneme (G2P) representation.
- **thaig2p_v2**: Transliterates Thai text into the Grapheme-to-Phoneme (G2P) representation. This model is from https://huggingface.co/pythainlp/thaig2p-v2.0
- **thaig2p_v3**: Transliterates Thai text into the Grapheme-to-Phoneme (G2P) representation using an ONNX model (v3). This model is from https://github.com/wannaphong/thai-g2p-v3
- **tltk**: Utilizes the TLTK transliteration system for a specific approach to transliteration.
- **iso_11940**: Focuses on the ISO 11940 transliteration standard.

Expand Down
56 changes: 55 additions & 1 deletion pythainlp/corpus/default_db.json
Original file line number Diff line number Diff line change
Expand Up @@ -72,5 +72,59 @@
"pythainlp_version": ">=3.2.0"
}
}
},
"thaig2p_v3_encoder_onnx": {
"name": "thaig2p_v3_encoder_onnx",
"latest_version": "1.0.0",
"description": "ThaiG2P v3 Encoder ONNX",
"long_description": "Thai Grapheme-to-Phoneme v3 ONNX model - Encoder",
"url": "https://github.com/wannaphong/thai-g2p-v3",
"authors": [
"Wannaphong Phatthiyaphaibun"
],
"license": "Apache-2.0",
"versions": {
"1.0.0": {
"filename": "thaig2p_v3_encoder.onnx",
"md5": "-",
"pythainlp_version": ">=5.3.4"
}
}
},
"thaig2p_v3_decoder_onnx": {
"name": "thaig2p_v3_decoder_onnx",
"latest_version": "1.0.0",
"description": "ThaiG2P v3 Decoder ONNX",
"long_description": "Thai Grapheme-to-Phoneme v3 ONNX model - Decoder",
"url": "https://github.com/wannaphong/thai-g2p-v3",
"authors": [
"Wannaphong Phatthiyaphaibun"
],
"license": "Apache-2.0",
"versions": {
"1.0.0": {
"filename": "thaig2p_v3_decoder.onnx",
"md5": "-",
"pythainlp_version": ">=5.3.4"
}
}
},
"thaig2p_v3_vocab": {
"name": "thaig2p_v3_vocab",
"latest_version": "1.0.0",
"description": "ThaiG2P v3 Vocabulary",
"long_description": "Thai Grapheme-to-Phoneme v3 vocabulary (character-to-index mapping)",
"url": "https://github.com/wannaphong/thai-g2p-v3",
"authors": [
"Wannaphong Phatthiyaphaibun"
],
"license": "Apache-2.0",
"versions": {
"1.0.0": {
"filename": "thaig2p_v3_vocab.json",
"md5": "-",
"pythainlp_version": ">=5.3.4"
}
}
}
}
}
Binary file added pythainlp/corpus/thaig2p_v3_decoder.onnx
Binary file not shown.
Binary file added pythainlp/corpus/thaig2p_v3_encoder.onnx
Binary file not shown.
1 change: 1 addition & 0 deletions pythainlp/corpus/thaig2p_v3_vocab.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"input_char2idx": {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3, "ร": 4, "ู": 5, "้": 6, "เ": 7, "ห": 8, "็": 9, "น": 10, "ไ": 11, "ซ": 12, "ต": 13, "ั": 14, "ง": 15, "ม": 16, "า": 17, "บ": 18, "ว": 19, "ส": 20, "ฤ": 21, "ด": 22, "ก": 23, "ุ": 24, "ะ": 25, "ื": 26, "อ": 27, "โ": 28, "ิ": 29, "่": 30, "ฏ": 31, "แ": 32, "ฟ": 33, "ใ": 34, "ข": 35, "ฮ": 36, "ย": 37, "ฃ": 38, "ล": 39, "์": 40, "ำ": 41, "ป": 42, "ช": 43, "ี": 44, "ค": 45, "ณ": 46, "พ": 47, "ึ": 48, "ญ": 49, "ท": 50, "ธ": 51, "ฺ": 52, "ภ": 53, "ผ": 54, "ฝ": 55, "ฉ": 56, "จ": 57, "ฐ": 58, "ฆ": 59, "ศ": 60, "๋": 61, "ถ": 62, "ํ": 63, "ฎ": 64, "๊": 65, "ๅ": 66, " ": 67, "ๆ": 68, "ษ": 69, "-": 70, "ฌ": 71, "ฬ": 72, "ฯ": 73, "ฑ": 74, "ฅ": 75, "ฒ": 76, "ฦ": 77, "๎": 78, "๏": 79}, "target_idx2char": {"0": "<PAD>", "1": "<SOS>", "2": "<EOS>", "3": "<UNK>", "4": "/", "5": "r", "6": "u", "7": "ː", "8": "˦", "9": "˥", "10": ".", "11": "h", "12": "e", "13": "n", "14": "˩", "15": "s", "16": "a", "17": "j", "18": "t", "19": "ŋ", "20": "o", "21": "p", "22": "̚", "23": "w", "24": "˨", "25": "ɯ", "26": "d", "27": "˧", "28": "k", "29": "m", "30": "i", "31": "ʰ", "32": "̯", "33": "b", "34": "ə", "35": "ʔ", "36": "[", "37": "ä", "38": "ɔ", "39": "̃", "40": "]", "41": "ɤ", "42": "ɛ", "43": "͡", "44": "ɕ", "45": "f", "46": "l", "47": "ʌ", "48": "ˈ", "49": "~", "50": "ɪ", "51": "ᵊ", "52": "ɲ", "53": " ", "54": "-", "55": "ʊ", "56": "x", "57": "ɡ", "58": "ɣ", "59": "ɨ", "60": "ũ", "61": "v", "62": "æ", "63": "̂", "64": "ú", "65": "c", "66": "ç", "67": "̌", "68": "û", "69": "̩", "70": "ɗ", "71": "(", "72": ")", "73": "õ", "74": "ˀ", "75": "̊", "76": "ɟ", "77": "̥", "78": "ǐ", "79": "̀", "80": "á", "81": "ì", "82": "â", "83": "ɒ", "84": "ĩ", "85": "́", "86": "ǎ", "87": "í", "88": "ê", "89": "à", "90": "î", "91": "ṳ", "92": "̤", "93": "ù", "94": "ɓ", "95": "ǔ", "96": "è", "97": "ā", "98": "ã", "99": "̠", "100": "ʲ", "101": "z", "102": "ő", "103": "ě", "104": "ʉ", "105": "ó", "106": "ǒ", "107": "̍", "108": ",", "109": "é", "110": 
"ẽ", "111": "œ", "112": "ʋ", "113": "ɭ", "114": "ò", "115": "ʄ", "116": "̰", "117": "ă", "118": "ṹ", "119": "ô", "120": "ɳ", "121": "ɖ", "122": "+", "123": "ʃ", "124": "ʍ", "125": "ø", "126": "y", "127": "ī", "128": "ŭ", "129": "̪", "130": "ɐ"}}
8 changes: 6 additions & 2 deletions pythainlp/transliterate/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from typing import Callable

DEFAULT_ROMANIZE_ENGINE: str = "royin"
DEFAULT_TRANSLITERATE_ENGINE: str = "thaig2p"
DEFAULT_TRANSLITERATE_ENGINE: str = "thaig2p_v3"
DEFAULT_PRONUNCIATE_ENGINE: str = "w2p"


Expand Down Expand Up @@ -105,7 +105,9 @@ def transliterate(
:rtype: str

:Options for engines:
* *thaig2p* - (default) Thai Grapheme-to-Phoneme,
* *thaig2p_v3* - (default) Thai Grapheme-to-Phoneme using ONNX model (v3),
output is IPA. https://github.com/wannaphong/thai-g2p-v3
* *thaig2p* - Thai Grapheme-to-Phoneme,
output is IPA (require PyTorch)
* *icu* - pyicu, based on International Components for Unicode (ICU)
* *ipa* - epitran, output is International Phonetic Alphabet (IPA)
Expand Down Expand Up @@ -158,6 +160,8 @@ def transliterate(
from pythainlp.transliterate.iso_11940 import transliterate # type: ignore[assignment] # noqa: I001
elif engine == "thaig2p_v2":
from pythainlp.transliterate.thaig2p_v2 import transliterate # noqa: I001
elif engine == "thaig2p_v3":
from pythainlp.transliterate.thaig2p_v3 import transliterate # noqa: I001
elif engine == "umt5_thaig2p":
from pythainlp.transliterate.umt5_thaig2p import transliterate # noqa: I001
else: # use default engine: "thaig2p"
Expand Down
147 changes: 147 additions & 0 deletions pythainlp/transliterate/thaig2p_v3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""Thai Grapheme-to-Phoneme (Thai G2P) v3

GitHub: https://github.com/wannaphong/thai-g2p-v3
"""

from __future__ import annotations

import json
from typing import TYPE_CHECKING, Dict, List, Optional

from pythainlp.corpus import get_corpus_path

if TYPE_CHECKING:
import numpy as np
from numpy.typing import NDArray
from onnxruntime import InferenceSession

_MODEL_ENCODER_NAME: str = "thaig2p_v3_encoder_onnx"
_MODEL_DECODER_NAME: str = "thaig2p_v3_decoder_onnx"
_MODEL_VOCAB_NAME: str = "thaig2p_v3_vocab"


class ThaiG2P:
    """Thai Grapheme-to-Phoneme using ONNX model (v3).

    This version uses a char-level Transformer model exported to ONNX
    (separate encoder and decoder sessions) for converting Thai text to
    International Phonetic Alphabet (IPA).

    Model files are bundled with PyThaiNLP in the corpus directory.

    For more information, see:
    https://github.com/wannaphong/thai-g2p-v3
    """

    _encoder: "InferenceSession"
    _decoder: "InferenceSession"
    _char2idx: Dict[str, int]  # input character -> encoder token index
    _idx2char: Dict[int, str]  # decoder token index -> output IPA symbol
    _sos_idx: int  # start-of-sequence token index
    _eos_idx: int  # end-of-sequence token index

    def __init__(self) -> None:
        """Load the vocabulary and the encoder/decoder ONNX sessions.

        :raises FileNotFoundError: if any of the three corpus files
            (encoder model, decoder model, vocabulary) cannot be found
        """
        # Imported lazily so that merely importing this module does not
        # require onnxruntime to be installed.
        from onnxruntime import InferenceSession

        encoder_path = get_corpus_path(_MODEL_ENCODER_NAME)
        decoder_path = get_corpus_path(_MODEL_DECODER_NAME)
        vocab_path = get_corpus_path(_MODEL_VOCAB_NAME)

        missing = [
            name
            for name, path in (
                (_MODEL_ENCODER_NAME, encoder_path),
                (_MODEL_DECODER_NAME, decoder_path),
                (_MODEL_VOCAB_NAME, vocab_path),
            )
            if not path
        ]
        if missing:
            raise FileNotFoundError(
                f"corpus-not-found names={missing!r}\n"
                f" Corpus file(s) not found: {', '.join(missing)}."
            )

        with open(str(vocab_path), encoding="utf-8") as f:
            # "input_char2idx" maps characters to int indices;
            # "target_idx2char" maps str(index) to output symbols.
            # int() below normalizes both, so values may be int or str.
            vocab: Dict[str, Dict[str, object]] = json.load(f)

        self._char2idx = {
            k: int(v) for k, v in vocab["input_char2idx"].items()
        }
        self._idx2char = {
            int(k): v for k, v in vocab["target_idx2char"].items()
        }
        self._sos_idx = self._char2idx["<SOS>"]
        self._eos_idx = self._char2idx["<EOS>"]

        # CPU-only inference; keeps the engine usable without GPU setup.
        self._encoder = InferenceSession(
            str(encoder_path),
            providers=["CPUExecutionProvider"],
        )
        self._decoder = InferenceSession(
            str(decoder_path),
            providers=["CPUExecutionProvider"],
        )

    def g2p(self, text: str, max_len: int = 50) -> str:
        """Convert Thai text to IPA using greedy decoding.

        The input is wrapped with <SOS>/<EOS>, encoded once, then the
        decoder is run autoregressively, taking the argmax token at each
        step until <EOS> is produced or ``max_len`` steps are reached.

        :param str text: Thai text to convert
        :param int max_len: maximum output length (default: 50)
        :return: IPA representation of the input text
        :rtype: str
        """
        import numpy as np

        # Unknown characters map to <UNK> (index 3 as a fallback).
        unk_idx: int = self._char2idx.get("<UNK>", 3)
        src: List[int] = (
            [self._sos_idx]
            + [self._char2idx.get(c, unk_idx) for c in text]
            + [self._eos_idx]
        )
        src_tensor: "NDArray[np.int64]" = np.array([src], dtype=np.int64)

        # Encoder runs once; its memory and padding mask are reused for
        # every decoding step.
        enc_outputs: list["NDArray[np.float32]"] = self._encoder.run(
            None, {"src": src_tensor}
        )
        memory: "NDArray[np.float32]" = enc_outputs[0]
        src_pad_mask: "NDArray[np.bool_]" = enc_outputs[1]

        trg_indexes: List[int] = [self._sos_idx]
        for _ in range(max_len):
            trg_tensor: "NDArray[np.int64]" = np.array(
                [trg_indexes], dtype=np.int64
            )
            dec_outputs: list["NDArray[np.float32]"] = self._decoder.run(
                None,
                {
                    "trg": trg_tensor,
                    "memory": memory,
                    "src_pad_mask": src_pad_mask,
                },
            )
            # Greedy: take the highest-scoring token at the last position.
            next_token_logits: "NDArray[np.float32]" = dec_outputs[0][0, -1, :]
            next_token: int = int(np.argmax(next_token_logits))
            if next_token == self._eos_idx:
                break
            trg_indexes.append(next_token)

        # Drop the leading <SOS>; <EOS> was never appended.
        return "".join(self._idx2char[idx] for idx in trg_indexes[1:])


# Module-level singleton: the model is expensive to load, so it is
# created once on first use and reused by subsequent calls.
_THAI_G2P: Optional[ThaiG2P] = None


def _get_model() -> ThaiG2P:
    """Return the cached :class:`ThaiG2P` instance, loading it on first use."""
    global _THAI_G2P
    if _THAI_G2P is None:
        _THAI_G2P = ThaiG2P()
    return _THAI_G2P


def transliterate(text: str) -> str:
    """Transliterate Thai text to IPA using ThaiG2P v3.

    :param str text: Thai text to transliterate
    :return: IPA representation of the input text
    :rtype: str
    """
    return _get_model().g2p(text)
2 changes: 2 additions & 0 deletions tests/extra/testx_transliterate.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,8 @@ def test_transliterate(self):
self.assertIsNotNone(transliterate("แมว", engine="thaig2p"))
self.assertIsNotNone(transliterate("คน", engine="thaig2p_v2"))
self.assertIsNotNone(transliterate("แมว", engine="thaig2p_v2"))
self.assertIsNotNone(transliterate("คน", engine="thaig2p_v3"))
self.assertIsNotNone(transliterate("แมว", engine="thaig2p_v3"))
self.assertIsNotNone(transliterate("คน", engine="umt5_thaig2p"))
self.assertIsNotNone(transliterate("แมว", engine="umt5_thaig2p"))
self.assertIsNotNone(transliterate("คน", engine="tltk_g2p"))
Expand Down
Loading