Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,14 @@
- Full release notes: <https://github.com/PyThaiNLP/pythainlp/releases>
- Commit history: <https://github.com/PyThaiNLP/pythainlp/compare/v5.3.3...v5.3.4>

## [Unreleased]

### Added

- ThaiG2P v3 engine (`thaig2p_v3`) for `transliterate()` (#1398).
  Uses a char-level Transformer model in ONNX format from
  <https://github.com/wannaphong/thai-g2p-v3>.

Check failure on line 26 in CHANGELOG.md

View workflow job for this annotation

GitHub Actions / Run Markdown lint

Bare URL used

CHANGELOG.md:26:3 MD034/no-bare-urls Bare URL used [Context: "https://github.com/wannaphong/..."] https://github.com/DavidAnson/markdownlint/blob/v0.40.0/doc/md034.md

## [5.3.4] - 2026-04-02

### Fixed
Expand Down
1 change: 1 addition & 0 deletions docs/api/transliterate.rst
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ This section includes multiple transliteration engines designed to suit various
- **ipa**: Provides International Phonetic Alphabet (IPA) representation of Thai text.
- **thaig2p**: (default) Transliterates Thai text into the Grapheme-to-Phoneme (G2P) representation.
- **thaig2p_v2**: Transliterates Thai text into the Grapheme-to-Phoneme (G2P) representation. This model is from https://huggingface.co/pythainlp/thaig2p-v2.0
- **thaig2p_v3**: Transliterates Thai text into the Grapheme-to-Phoneme (G2P) representation using an ONNX model (v3). This model is from https://github.com/wannaphong/thai-g2p-v3
- **tltk**: Utilizes the TLTK transliteration system for a specific approach to transliteration.
- **iso_11940**: Focuses on the ISO 11940 transliteration standard.

Expand Down
56 changes: 55 additions & 1 deletion pythainlp/corpus/default_db.json
Original file line number Diff line number Diff line change
Expand Up @@ -72,5 +72,59 @@
"pythainlp_version": ">=3.2.0"
}
}
},
"thaig2p_v3_encoder_onnx": {
"name": "thaig2p_v3_encoder_onnx",
"latest_version": "1.0.0",
"description": "ThaiG2P v3 Encoder ONNX",
"long_description": "Thai Grapheme-to-Phoneme v3 ONNX model - Encoder",
"url": "https://github.com/wannaphong/thai-g2p-v3",
"authors": [
"Wannaphong Phatthiyaphaibun"
],
"license": "Apache-2.0",
"versions": {
"1.0.0": {
"filename": "thaig2p_v3_encoder.onnx",
"md5": "-",
"pythainlp_version": ">=5.3.4"
}
}
},
"thaig2p_v3_decoder_onnx": {
"name": "thaig2p_v3_decoder_onnx",
"latest_version": "1.0.0",
"description": "ThaiG2P v3 Decoder ONNX",
"long_description": "Thai Grapheme-to-Phoneme v3 ONNX model - Decoder",
"url": "https://github.com/wannaphong/thai-g2p-v3",
"authors": [
"Wannaphong Phatthiyaphaibun"
],
"license": "Apache-2.0",
"versions": {
"1.0.0": {
"filename": "thaig2p_v3_decoder.onnx",
"md5": "-",
"pythainlp_version": ">=5.3.4"
}
}
},
"thaig2p_v3_vocab": {
"name": "thaig2p_v3_vocab",
"latest_version": "1.0.0",
"description": "ThaiG2P v3 Vocabulary",
"long_description": "Thai Grapheme-to-Phoneme v3 vocabulary (character-to-index mapping)",
"url": "https://github.com/wannaphong/thai-g2p-v3",
"authors": [
"Wannaphong Phatthiyaphaibun"
],
"license": "Apache-2.0",
"versions": {
"1.0.0": {
"filename": "thaig2p_v3_vocab.json",
"md5": "-",
"pythainlp_version": ">=5.3.4"
}
}
}
}
}
Binary file added pythainlp/corpus/thaig2p_v3_decoder.onnx
Binary file not shown.
Binary file added pythainlp/corpus/thaig2p_v3_encoder.onnx
Binary file not shown.
1 change: 1 addition & 0 deletions pythainlp/corpus/thaig2p_v3_vocab.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"input_char2idx": {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3, "ร": 4, "ู": 5, "้": 6, "เ": 7, "ห": 8, "็": 9, "น": 10, "ไ": 11, "ซ": 12, "ต": 13, "ั": 14, "ง": 15, "ม": 16, "า": 17, "บ": 18, "ว": 19, "ส": 20, "ฤ": 21, "ด": 22, "ก": 23, "ุ": 24, "ะ": 25, "ื": 26, "อ": 27, "โ": 28, "ิ": 29, "่": 30, "ฏ": 31, "แ": 32, "ฟ": 33, "ใ": 34, "ข": 35, "ฮ": 36, "ย": 37, "ฃ": 38, "ล": 39, "์": 40, "ำ": 41, "ป": 42, "ช": 43, "ี": 44, "ค": 45, "ณ": 46, "พ": 47, "ึ": 48, "ญ": 49, "ท": 50, "ธ": 51, "ฺ": 52, "ภ": 53, "ผ": 54, "ฝ": 55, "ฉ": 56, "จ": 57, "ฐ": 58, "ฆ": 59, "ศ": 60, "๋": 61, "ถ": 62, "ํ": 63, "ฎ": 64, "๊": 65, "ๅ": 66, " ": 67, "ๆ": 68, "ษ": 69, "-": 70, "ฌ": 71, "ฬ": 72, "ฯ": 73, "ฑ": 74, "ฅ": 75, "ฒ": 76, "ฦ": 77, "๎": 78, "๏": 79}, "target_idx2char": {"0": "<PAD>", "1": "<SOS>", "2": "<EOS>", "3": "<UNK>", "4": "/", "5": "r", "6": "u", "7": "ː", "8": "˦", "9": "˥", "10": ".", "11": "h", "12": "e", "13": "n", "14": "˩", "15": "s", "16": "a", "17": "j", "18": "t", "19": "ŋ", "20": "o", "21": "p", "22": "̚", "23": "w", "24": "˨", "25": "ɯ", "26": "d", "27": "˧", "28": "k", "29": "m", "30": "i", "31": "ʰ", "32": "̯", "33": "b", "34": "ə", "35": "ʔ", "36": "[", "37": "ä", "38": "ɔ", "39": "̃", "40": "]", "41": "ɤ", "42": "ɛ", "43": "͡", "44": "ɕ", "45": "f", "46": "l", "47": "ʌ", "48": "ˈ", "49": "~", "50": "ɪ", "51": "ᵊ", "52": "ɲ", "53": " ", "54": "-", "55": "ʊ", "56": "x", "57": "ɡ", "58": "ɣ", "59": "ɨ", "60": "ũ", "61": "v", "62": "æ", "63": "̂", "64": "ú", "65": "c", "66": "ç", "67": "̌", "68": "û", "69": "̩", "70": "ɗ", "71": "(", "72": ")", "73": "õ", "74": "ˀ", "75": "̊", "76": "ɟ", "77": "̥", "78": "ǐ", "79": "̀", "80": "á", "81": "ì", "82": "â", "83": "ɒ", "84": "ĩ", "85": "́", "86": "ǎ", "87": "í", "88": "ê", "89": "à", "90": "î", "91": "ṳ", "92": "̤", "93": "ù", "94": "ɓ", "95": "ǔ", "96": "è", "97": "ā", "98": "ã", "99": "̠", "100": "ʲ", "101": "z", "102": "ő", "103": "ě", "104": "ʉ", "105": "ó", "106": "ǒ", "107": "̍", "108": ",", "109": "é", "110": 
"ẽ", "111": "œ", "112": "ʋ", "113": "ɭ", "114": "ò", "115": "ʄ", "116": "̰", "117": "ă", "118": "ṹ", "119": "ô", "120": "ɳ", "121": "ɖ", "122": "+", "123": "ʃ", "124": "ʍ", "125": "ø", "126": "y", "127": "ī", "128": "ŭ", "129": "̪", "130": "ɐ"}}
8 changes: 6 additions & 2 deletions pythainlp/transliterate/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from typing import Callable

DEFAULT_ROMANIZE_ENGINE: str = "royin"
DEFAULT_TRANSLITERATE_ENGINE: str = "thaig2p"
DEFAULT_TRANSLITERATE_ENGINE: str = "thaig2p_v3"
DEFAULT_PRONUNCIATE_ENGINE: str = "w2p"


Expand Down Expand Up @@ -105,7 +105,9 @@ def transliterate(
:rtype: str

:Options for engines:
* *thaig2p* - (default) Thai Grapheme-to-Phoneme,
* *thaig2p_v3* - (default) Thai Grapheme-to-Phoneme using ONNX model (v3),
output is IPA. https://github.com/wannaphong/thai-g2p-v3
* *thaig2p* - Thai Grapheme-to-Phoneme,
output is IPA (require PyTorch)
* *icu* - pyicu, based on International Components for Unicode (ICU)
* *ipa* - epitran, output is International Phonetic Alphabet (IPA)
Expand Down Expand Up @@ -158,6 +160,8 @@ def transliterate(
from pythainlp.transliterate.iso_11940 import transliterate # type: ignore[assignment] # noqa: I001
elif engine == "thaig2p_v2":
from pythainlp.transliterate.thaig2p_v2 import transliterate # noqa: I001
elif engine == "thaig2p_v3":
from pythainlp.transliterate.thaig2p_v3 import transliterate # noqa: I001
elif engine == "umt5_thaig2p":
from pythainlp.transliterate.umt5_thaig2p import transliterate # noqa: I001
else: # use default engine: "thaig2p"
Expand Down
147 changes: 147 additions & 0 deletions pythainlp/transliterate/thaig2p_v3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""Thai Grapheme-to-Phoneme (Thai G2P) v3

GitHub: https://github.com/wannaphong/thai-g2p-v3
"""

from __future__ import annotations

import json
from typing import TYPE_CHECKING, Dict, List, Optional

from pythainlp.corpus import get_corpus_path

if TYPE_CHECKING:
import numpy as np
from numpy.typing import NDArray
from onnxruntime import InferenceSession

_MODEL_ENCODER_NAME: str = "thaig2p_v3_encoder_onnx"
_MODEL_DECODER_NAME: str = "thaig2p_v3_decoder_onnx"
_MODEL_VOCAB_NAME: str = "thaig2p_v3_vocab"


class ThaiG2P:
    """Thai Grapheme-to-Phoneme using ONNX model (v3).

    This version uses a char-level Transformer model exported to ONNX
    (separate encoder and decoder sessions) for converting Thai text to
    International Phonetic Alphabet (IPA).

    Model files are bundled with PyThaiNLP in the corpus directory.

    For more information, see:
    https://github.com/wannaphong/thai-g2p-v3
    """

    _encoder: "InferenceSession"
    _decoder: "InferenceSession"
    _char2idx: Dict[str, int]  # input character -> encoder token index
    _idx2char: Dict[int, str]  # decoder token index -> output IPA symbol
    _sos_idx: int  # start-of-sequence token index
    _eos_idx: int  # end-of-sequence token index

    def __init__(self) -> None:
        """Load the vocabulary and the encoder/decoder ONNX sessions.

        :raises FileNotFoundError: if any of the three corpus files
            (encoder model, decoder model, vocabulary) cannot be found
        """
        # Imported lazily so that merely importing this module does not
        # require onnxruntime to be installed.
        from onnxruntime import InferenceSession

        encoder_path = get_corpus_path(_MODEL_ENCODER_NAME)
        decoder_path = get_corpus_path(_MODEL_DECODER_NAME)
        vocab_path = get_corpus_path(_MODEL_VOCAB_NAME)

        missing = [
            name
            for name, path in (
                (_MODEL_ENCODER_NAME, encoder_path),
                (_MODEL_DECODER_NAME, decoder_path),
                (_MODEL_VOCAB_NAME, vocab_path),
            )
            if not path
        ]
        if missing:
            raise FileNotFoundError(
                f"corpus-not-found names={missing!r}\n"
                f" Corpus file(s) not found: {', '.join(missing)}."
            )

        with open(str(vocab_path), encoding="utf-8") as f:
            # "input_char2idx" maps characters to int indices;
            # "target_idx2char" maps str(index) to output symbols.
            # int() below normalizes both, so values may be int or str.
            vocab: Dict[str, Dict[str, object]] = json.load(f)

        self._char2idx = {
            k: int(v) for k, v in vocab["input_char2idx"].items()
        }
        self._idx2char = {
            int(k): v for k, v in vocab["target_idx2char"].items()
        }
        self._sos_idx = self._char2idx["<SOS>"]
        self._eos_idx = self._char2idx["<EOS>"]

        # CPU-only inference; keeps the engine usable without GPU setup.
        self._encoder = InferenceSession(
            str(encoder_path),
            providers=["CPUExecutionProvider"],
        )
        self._decoder = InferenceSession(
            str(decoder_path),
            providers=["CPUExecutionProvider"],
        )

    def g2p(self, text: str, max_len: int = 50) -> str:
        """Convert Thai text to IPA using greedy decoding.

        The input is wrapped with <SOS>/<EOS>, encoded once, then the
        decoder is run autoregressively, taking the argmax token at each
        step until <EOS> is produced or ``max_len`` steps are reached.

        :param str text: Thai text to convert
        :param int max_len: maximum output length (default: 50)
        :return: IPA representation of the input text
        :rtype: str
        """
        import numpy as np

        # Unknown characters map to <UNK> (index 3 as a fallback).
        unk_idx: int = self._char2idx.get("<UNK>", 3)
        src: List[int] = (
            [self._sos_idx]
            + [self._char2idx.get(c, unk_idx) for c in text]
            + [self._eos_idx]
        )
        src_tensor: "NDArray[np.int64]" = np.array([src], dtype=np.int64)

        # Encoder runs once; its memory and padding mask are reused for
        # every decoding step.
        enc_outputs: list["NDArray[np.float32]"] = self._encoder.run(
            None, {"src": src_tensor}
        )
        memory: "NDArray[np.float32]" = enc_outputs[0]
        src_pad_mask: "NDArray[np.bool_]" = enc_outputs[1]

        trg_indexes: List[int] = [self._sos_idx]
        for _ in range(max_len):
            trg_tensor: "NDArray[np.int64]" = np.array(
                [trg_indexes], dtype=np.int64
            )
            dec_outputs: list["NDArray[np.float32]"] = self._decoder.run(
                None,
                {
                    "trg": trg_tensor,
                    "memory": memory,
                    "src_pad_mask": src_pad_mask,
                },
            )
            # Greedy: take the highest-scoring token at the last position.
            next_token_logits: "NDArray[np.float32]" = dec_outputs[0][0, -1, :]
            next_token: int = int(np.argmax(next_token_logits))
            if next_token == self._eos_idx:
                break
            trg_indexes.append(next_token)

        # Drop the leading <SOS>; <EOS> was never appended.
        return "".join(self._idx2char[idx] for idx in trg_indexes[1:])


# Module-level singleton: the model is expensive to load, so it is
# created once on first use and reused by subsequent calls.
_THAI_G2P: Optional[ThaiG2P] = None


def _get_model() -> ThaiG2P:
    """Return the cached :class:`ThaiG2P` instance, loading it on first use."""
    global _THAI_G2P
    if _THAI_G2P is None:
        _THAI_G2P = ThaiG2P()
    return _THAI_G2P


def transliterate(text: str) -> str:
    """Transliterate Thai text to IPA using ThaiG2P v3.

    :param str text: Thai text to transliterate
    :return: IPA representation of the input text
    :rtype: str
    """
    return _get_model().g2p(text)
2 changes: 2 additions & 0 deletions tests/extra/testx_transliterate.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,8 @@ def test_transliterate(self):
self.assertIsNotNone(transliterate("แมว", engine="thaig2p"))
self.assertIsNotNone(transliterate("คน", engine="thaig2p_v2"))
self.assertIsNotNone(transliterate("แมว", engine="thaig2p_v2"))
self.assertIsNotNone(transliterate("คน", engine="thaig2p_v3"))
self.assertIsNotNone(transliterate("แมว", engine="thaig2p_v3"))
self.assertIsNotNone(transliterate("คน", engine="umt5_thaig2p"))
self.assertIsNotNone(transliterate("แมว", engine="umt5_thaig2p"))
self.assertIsNotNone(transliterate("คน", engine="tltk_g2p"))
Expand Down
Loading