diff --git a/CHANGELOG.md b/CHANGELOG.md index 330d57cde..ed8063a6b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,14 @@ and this project adheres to - Full release notes: - Commit history: +## [Unreleased] + +### Added + +- ThaiG2P v3 engine (`thaig2p_v3`) for `transliterate()` (#1398). + Uses a char-level Transformer model in ONNX format from + https://github.com/wannaphong/thai-g2p-v3. + ## [5.3.4] - 2026-04-02 ### Fixed diff --git a/docs/api/transliterate.rst b/docs/api/transliterate.rst index fc5c259c8..e44761770 100644 --- a/docs/api/transliterate.rst +++ b/docs/api/transliterate.rst @@ -55,6 +55,7 @@ This section includes multiple transliteration engines designed to suit various - **ipa**: Provides International Phonetic Alphabet (IPA) representation of Thai text. - **thaig2p**: (default) Transliterates Thai text into the Grapheme-to-Phoneme (G2P) representation. - **thaig2p_v2**: Transliterates Thai text into the Grapheme-to-Phoneme (G2P) representation. This model is from https://huggingface.co/pythainlp/thaig2p-v2.0 +- **thaig2p_v3**: Transliterates Thai text into the Grapheme-to-Phoneme (G2P) representation using an ONNX model (v3). This model is from https://github.com/wannaphong/thai-g2p-v3 - **tltk**: Utilizes the TLTK transliteration system for a specific approach to transliteration. - **iso_11940**: Focuses on the ISO 11940 transliteration standard. 
diff --git a/pythainlp/corpus/default_db.json b/pythainlp/corpus/default_db.json index a9b86e253..16ce829e5 100644 --- a/pythainlp/corpus/default_db.json +++ b/pythainlp/corpus/default_db.json @@ -72,5 +72,59 @@ "pythainlp_version": ">=3.2.0" } } + }, + "thaig2p_v3_encoder_onnx": { + "name": "thaig2p_v3_encoder_onnx", + "latest_version": "1.0.0", + "description": "ThaiG2P v3 Encoder ONNX", + "long_description": "Thai Grapheme-to-Phoneme v3 ONNX model - Encoder", + "url": "https://github.com/wannaphong/thai-g2p-v3", + "authors": [ + "Wannaphong Phatthiyaphaibun" + ], + "license": "Apache-2.0", + "versions": { + "1.0.0": { + "filename": "thaig2p_v3_encoder.onnx", + "md5": "-", + "pythainlp_version": ">=5.3.4" + } + } + }, + "thaig2p_v3_decoder_onnx": { + "name": "thaig2p_v3_decoder_onnx", + "latest_version": "1.0.0", + "description": "ThaiG2P v3 Decoder ONNX", + "long_description": "Thai Grapheme-to-Phoneme v3 ONNX model - Decoder", + "url": "https://github.com/wannaphong/thai-g2p-v3", + "authors": [ + "Wannaphong Phatthiyaphaibun" + ], + "license": "Apache-2.0", + "versions": { + "1.0.0": { + "filename": "thaig2p_v3_decoder.onnx", + "md5": "-", + "pythainlp_version": ">=5.3.4" + } + } + }, + "thaig2p_v3_vocab": { + "name": "thaig2p_v3_vocab", + "latest_version": "1.0.0", + "description": "ThaiG2P v3 Vocabulary", + "long_description": "Thai Grapheme-to-Phoneme v3 vocabulary (character-to-index mapping)", + "url": "https://github.com/wannaphong/thai-g2p-v3", + "authors": [ + "Wannaphong Phatthiyaphaibun" + ], + "license": "Apache-2.0", + "versions": { + "1.0.0": { + "filename": "thaig2p_v3_vocab.json", + "md5": "-", + "pythainlp_version": ">=5.3.4" + } + } } -} \ No newline at end of file +} diff --git a/pythainlp/corpus/thaig2p_v3_decoder.onnx b/pythainlp/corpus/thaig2p_v3_decoder.onnx new file mode 100644 index 000000000..d6465a89c Binary files /dev/null and b/pythainlp/corpus/thaig2p_v3_decoder.onnx differ diff --git a/pythainlp/corpus/thaig2p_v3_encoder.onnx 
b/pythainlp/corpus/thaig2p_v3_encoder.onnx new file mode 100644 index 000000000..65c627559 Binary files /dev/null and b/pythainlp/corpus/thaig2p_v3_encoder.onnx differ diff --git a/pythainlp/corpus/thaig2p_v3_vocab.json b/pythainlp/corpus/thaig2p_v3_vocab.json new file mode 100644 index 000000000..a675d41b4 --- /dev/null +++ b/pythainlp/corpus/thaig2p_v3_vocab.json @@ -0,0 +1 @@ +{"input_char2idx": {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3, "ร": 4, "ู": 5, "้": 6, "เ": 7, "ห": 8, "็": 9, "น": 10, "ไ": 11, "ซ": 12, "ต": 13, "ั": 14, "ง": 15, "ม": 16, "า": 17, "บ": 18, "ว": 19, "ส": 20, "ฤ": 21, "ด": 22, "ก": 23, "ุ": 24, "ะ": 25, "ื": 26, "อ": 27, "โ": 28, "ิ": 29, "่": 30, "ฏ": 31, "แ": 32, "ฟ": 33, "ใ": 34, "ข": 35, "ฮ": 36, "ย": 37, "ฃ": 38, "ล": 39, "์": 40, "ำ": 41, "ป": 42, "ช": 43, "ี": 44, "ค": 45, "ณ": 46, "พ": 47, "ึ": 48, "ญ": 49, "ท": 50, "ธ": 51, "ฺ": 52, "ภ": 53, "ผ": 54, "ฝ": 55, "ฉ": 56, "จ": 57, "ฐ": 58, "ฆ": 59, "ศ": 60, "๋": 61, "ถ": 62, "ํ": 63, "ฎ": 64, "๊": 65, "ๅ": 66, " ": 67, "ๆ": 68, "ษ": 69, "-": 70, "ฌ": 71, "ฬ": 72, "ฯ": 73, "ฑ": 74, "ฅ": 75, "ฒ": 76, "ฦ": 77, "๎": 78, "๏": 79}, "target_idx2char": {"0": "<pad>", "1": "<sos>", "2": "<eos>", "3": "<unk>", "4": "/", "5": "r", "6": "u", "7": "ː", "8": "˦", "9": "˥", "10": ".", "11": "h", "12": "e", "13": "n", "14": "˩", "15": "s", "16": "a", "17": "j", "18": "t", "19": "ŋ", "20": "o", "21": "p", "22": "̚", "23": "w", "24": "˨", "25": "ɯ", "26": "d", "27": "˧", "28": "k", "29": "m", "30": "i", "31": "ʰ", "32": "̯", "33": "b", "34": "ə", "35": "ʔ", "36": "[", "37": "ä", "38": "ɔ", "39": "̃", "40": "]", "41": "ɤ", "42": "ɛ", "43": "͡", "44": "ɕ", "45": "f", "46": "l", "47": "ʌ", "48": "ˈ", "49": "~", "50": "ɪ", "51": "ᵊ", "52": "ɲ", "53": " ", "54": "-", "55": "ʊ", "56": "x", "57": "ɡ", "58": "ɣ", "59": "ɨ", "60": "ũ", "61": "v", "62": "æ", "63": "̂", "64": "ú", "65": "c", "66": "ç", "67": "̌", "68": "û", "69": "̩", "70": "ɗ", "71": "(", "72": ")", "73": "õ", "74": "ˀ", "75": "̊", "76": "ɟ", "77": "̥", "78": "ǐ", "79": "̀", 
"80": "á", "81": "ì", "82": "â", "83": "ɒ", "84": "ĩ", "85": "́", "86": "ǎ", "87": "í", "88": "ê", "89": "à", "90": "î", "91": "ṳ", "92": "̤", "93": "ù", "94": "ɓ", "95": "ǔ", "96": "è", "97": "ā", "98": "ã", "99": "̠", "100": "ʲ", "101": "z", "102": "ő", "103": "ě", "104": "ʉ", "105": "ó", "106": "ǒ", "107": "̍", "108": ",", "109": "é", "110": "ẽ", "111": "œ", "112": "ʋ", "113": "ɭ", "114": "ò", "115": "ʄ", "116": "̰", "117": "ă", "118": "ṹ", "119": "ô", "120": "ɳ", "121": "ɖ", "122": "+", "123": "ʃ", "124": "ʍ", "125": "ø", "126": "y", "127": "ī", "128": "ŭ", "129": "̪", "130": "ɐ"}} \ No newline at end of file diff --git a/pythainlp/transliterate/core.py b/pythainlp/transliterate/core.py index b18c89b88..724a12fb9 100644 --- a/pythainlp/transliterate/core.py +++ b/pythainlp/transliterate/core.py @@ -6,7 +6,7 @@ from typing import Callable DEFAULT_ROMANIZE_ENGINE: str = "royin" -DEFAULT_TRANSLITERATE_ENGINE: str = "thaig2p" +DEFAULT_TRANSLITERATE_ENGINE: str = "thaig2p_v3" DEFAULT_PRONUNCIATE_ENGINE: str = "w2p" @@ -105,7 +105,9 @@ def transliterate( :rtype: str :Options for engines: - * *thaig2p* - (default) Thai Grapheme-to-Phoneme, + * *thaig2p_v3* - (default) Thai Grapheme-to-Phoneme using ONNX model (v3), + output is IPA. 
https://github.com/wannaphong/thai-g2p-v3 + * *thaig2p* - Thai Grapheme-to-Phoneme, output is IPA (require PyTorch) * *icu* - pyicu, based on International Components for Unicode (ICU) * *ipa* - epitran, output is International Phonetic Alphabet (IPA) @@ -158,6 +160,8 @@ def transliterate( from pythainlp.transliterate.iso_11940 import transliterate # type: ignore[assignment] # noqa: I001 elif engine == "thaig2p_v2": from pythainlp.transliterate.thaig2p_v2 import transliterate # noqa: I001 + elif engine == "thaig2p_v3": + from pythainlp.transliterate.thaig2p_v3 import transliterate # noqa: I001 elif engine == "umt5_thaig2p": from pythainlp.transliterate.umt5_thaig2p import transliterate # noqa: I001 else: # use default engine: "thaig2p" diff --git a/pythainlp/transliterate/thaig2p_v3.py b/pythainlp/transliterate/thaig2p_v3.py new file mode 100644 index 000000000..f432f7797 --- /dev/null +++ b/pythainlp/transliterate/thaig2p_v3.py @@ -0,0 +1,147 @@ +# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project +# SPDX-FileType: SOURCE +# SPDX-License-Identifier: Apache-2.0 +"""Thai Grapheme-to-Phoneme (Thai G2P) v3 + +GitHub: https://github.com/wannaphong/thai-g2p-v3 +""" + +from __future__ import annotations + +import json +from typing import TYPE_CHECKING, Dict, List, Optional + +from pythainlp.corpus import get_corpus_path + +if TYPE_CHECKING: + import numpy as np + from numpy.typing import NDArray + from onnxruntime import InferenceSession + +_MODEL_ENCODER_NAME: str = "thaig2p_v3_encoder_onnx" +_MODEL_DECODER_NAME: str = "thaig2p_v3_decoder_onnx" +_MODEL_VOCAB_NAME: str = "thaig2p_v3_vocab" + + +class ThaiG2P: + """Thai Grapheme-to-Phoneme using ONNX model (v3). + + This version uses a char-level Transformer model exported to ONNX + for converting Thai text to International Phonetic Alphabet (IPA). + + Model files are bundled with PyThaiNLP in the corpus directory. 
+ + For more information, see: + https://github.com/wannaphong/thai-g2p-v3 + """ + + _encoder: "InferenceSession" + _decoder: "InferenceSession" + _char2idx: Dict[str, int] + _idx2char: Dict[int, str] + _sos_idx: int + _eos_idx: int + + def __init__(self) -> None: + from onnxruntime import InferenceSession + + encoder_path = get_corpus_path(_MODEL_ENCODER_NAME) + decoder_path = get_corpus_path(_MODEL_DECODER_NAME) + vocab_path = get_corpus_path(_MODEL_VOCAB_NAME) + + missing = [ + n + for n, v in ( + (_MODEL_ENCODER_NAME, encoder_path), + (_MODEL_DECODER_NAME, decoder_path), + (_MODEL_VOCAB_NAME, vocab_path), + ) + if not v + ] + if missing: + raise FileNotFoundError( + f"corpus-not-found names={missing!r}\n" + f" Corpus file(s) not found: {', '.join(missing)}." + ) + + with open(str(vocab_path), encoding="utf-8") as f: + vocab: Dict[str, Dict[str, str]] = json.load(f) + + self._char2idx: Dict[str, int] = { + k: int(v) for k, v in vocab["input_char2idx"].items() + } + self._idx2char: Dict[int, str] = { + int(k): v for k, v in vocab["target_idx2char"].items() + } + self._sos_idx: int = self._char2idx["<sos>"] + self._eos_idx: int = self._char2idx["<eos>"] + + self._encoder: "InferenceSession" = InferenceSession( + str(encoder_path), + providers=["CPUExecutionProvider"], + ) + self._decoder: "InferenceSession" = InferenceSession( + str(decoder_path), + providers=["CPUExecutionProvider"], + ) + + def g2p(self, text: str, max_len: int = 50) -> str: + """Convert Thai text to IPA using greedy decoding. 
+ + :param str text: Thai text to convert + :param int max_len: maximum output length (default: 50) + :return: IPA representation of the input text + :rtype: str + """ + import numpy as np + + unk_idx: int = self._char2idx.get("<unk>", 3) + src: List[int] = ( + [self._sos_idx] + + [self._char2idx.get(c, unk_idx) for c in text] + + [self._eos_idx] + ) + src_tensor: "NDArray[np.int64]" = np.array([src], dtype=np.int64) + + enc_outputs: list["NDArray[np.float32]"] = self._encoder.run( + None, {"src": src_tensor} + ) + memory: "NDArray[np.float32]" = enc_outputs[0] + src_pad_mask: "NDArray[np.bool_]" = enc_outputs[1] + + trg_indexes: List[int] = [self._sos_idx] + for _ in range(max_len): + trg_tensor: "NDArray[np.int64]" = np.array( + [trg_indexes], dtype=np.int64 + ) + dec_outputs: list["NDArray[np.float32]"] = self._decoder.run( + None, + { + "trg": trg_tensor, + "memory": memory, + "src_pad_mask": src_pad_mask, + }, + ) + next_token_logits: "NDArray[np.float32]" = dec_outputs[0][0, -1, :] + next_token: int = int(np.argmax(next_token_logits)) + if next_token == self._eos_idx: + break + trg_indexes.append(next_token) + + return "".join(self._idx2char[idx] for idx in trg_indexes[1:]) + + +_THAI_G2P: Optional[ThaiG2P] = None + + +def transliterate(text: str) -> str: + """Transliterate Thai text to IPA using ThaiG2P v3. 
+ + :param str text: Thai text to transliterate + :return: IPA representation of the input text + :rtype: str + """ + global _THAI_G2P + if _THAI_G2P is None: + _THAI_G2P = ThaiG2P() + return _THAI_G2P.g2p(text) diff --git a/tests/extra/testx_transliterate.py b/tests/extra/testx_transliterate.py index 438b0fce3..f8097e8e6 100644 --- a/tests/extra/testx_transliterate.py +++ b/tests/extra/testx_transliterate.py @@ -261,6 +261,8 @@ def test_transliterate(self): self.assertIsNotNone(transliterate("แมว", engine="thaig2p")) self.assertIsNotNone(transliterate("คน", engine="thaig2p_v2")) self.assertIsNotNone(transliterate("แมว", engine="thaig2p_v2")) + self.assertIsNotNone(transliterate("คน", engine="thaig2p_v3")) + self.assertIsNotNone(transliterate("แมว", engine="thaig2p_v3")) self.assertIsNotNone(transliterate("คน", engine="umt5_thaig2p")) self.assertIsNotNone(transliterate("แมว", engine="umt5_thaig2p")) self.assertIsNotNone(transliterate("คน", engine="tltk_g2p"))