NVIDIA-NeMo · quapham · Mar 31, 2026 · Mar 31, 2026 · Mar 31, 2026 · Apr 16, 2026
diff --git a/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py b/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py
@@ -15,7 +15,7 @@
 
 # fmt: off
 
-SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR", "vi-VN", "ja-JP", "hi-IN"]
+SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR", "vi-VN", "ja-JP", "hi-IN", "pt-BR", "ko-KR"]
 
 DEFAULT_PUNCTUATION = (
     ',', '.', '!', '?', '-',
@@ -347,5 +347,28 @@ def get_ipa_punctuation_list(locale):
                 '・',
             ]
         )
-        )
+        ),
+    elif locale == "pt-BR":
+        # ref: https://en.wikipedia.org/wiki/Portuguese_orthography#Punctuation
+        punct_set.update(
+            [
+                '—',  # em dash, U+2014 (used for dialogue in Brazilian Portuguese)
+                '–',  # en dash, U+2013
+                '…',  # horizontal ellipsis, U+2026
+                '\u201C',  # left double quotation mark, U+201C
+                '\u201D',  # right double quotation mark, U+201D
+            ]
+        )
-        )
+        ),
+    elif locale == "pt-BR":
+        # ref: https://en.wikipedia.org/wiki/Portuguese_orthography#Punctuation
+        punct_set.update(
+            [
+                '—',  # em dash, U+2014 (used for dialogue in Brazilian Portuguese)
+                '–',  # en dash, U+2013
+                '…',  # horizontal ellipsis, U+2026
+                '\u201C',  # left double quotation mark, U+201C
+                '\u201D',  # right double quotation mark, U+201D
+            ]
+        )
+    elif locale == "hi-IN":
+        punct_set.update(
+            [
+                '।',
+                '॥',
+            ]
+        )
+    elif locale == "ko-KR":
+        punct_set.update(
+            [
+                '『',
+                '』',
+                '「',
+                '」',
+                '《',
+                '》',
+                '…',
+                '·',
+                '—',
+                '–',
+                '〜',
+            ]
+        )
     punct_list = sorted(list(punct_set))
     return punct_list
diff --git a/nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py b/nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py
@@ -29,6 +29,8 @@
     "any_locale_word_tokenize",
     "english_word_tokenize",
     "LATIN_CHARS_ALL",
+    "INDIC_CHARS_ALL",
+    "KOREAN_CHARS",
     "normalize_unicode_text",
     "japanese_text_preprocessing",
 ]
@@ -52,11 +54,30 @@
 LATIN_ALPHABET_BASIC = "A-Za-z"
 ACCENTED_CHARS = "À-ÖØ-öø-ÿ"
 LATIN_CHARS_ALL = f"{LATIN_ALPHABET_BASIC}{ACCENTED_CHARS}"
+
+# Indic characters based on https://www.unicode.org/charts/
+# Hindi, Marathi, Nepali, Sanskrit https://en.wikipedia.org/wiki/Devanagari_(Unicode_block)
+DEVANAGARI_CHARS = (
+    r'\u0900-\u0963\u0966-\u097F'  # excluding danda (U+0964), double danda (U+0965) so they are treated as punctuation
+)
+BENGALI_CHARS = r'\u0980-\u09FF'  # Bengali, Assamese
+TAMIL_CHARS = r'\u0B80-\u0BFF'  # Tamil
+TELUGU_CHARS = r'\u0C00-\u0C7F'  # Telugu
+KANNADA_CHARS = r'\u0C80-\u0CFF'  # Kannada
+GUJARATI_CHARS = r'\u0A80-\u0AFF'  # Gujarati
+INDIC_CHARS_ALL = f"{DEVANAGARI_CHARS}{BENGALI_CHARS}{TAMIL_CHARS}{TELUGU_CHARS}{KANNADA_CHARS}{GUJARATI_CHARS}"
+
+# Korean (Hangul): syllables, jamo, and compatibility jamo
+# ref: https://en.wikipedia.org/wiki/Hangul_Syllables   (U+AC00–U+D7A3)
+# ref: https://en.wikipedia.org/wiki/Hangul_Jamo_(Unicode_block)   (U+1100–U+11FF)
+# ref: https://en.wikipedia.org/wiki/Hangul_Compatibility_Jamo   (U+3130–U+318F)
+KOREAN_CHARS = r'\uAC00-\uD7A3\u1100-\u11FF\u3130-\u318F'
+
 _WORDS_RE_EN = re.compile(
     fr"([{LATIN_ALPHABET_BASIC}]+(?:[{LATIN_ALPHABET_BASIC}\-']*[{LATIN_ALPHABET_BASIC}]+)*)|(\|[^|]*\|)|([^{LATIN_ALPHABET_BASIC}|]+)"
 )
 _WORDS_RE_ANY_LOCALE = re.compile(
-    fr"([{LATIN_CHARS_ALL}]+(?:[{LATIN_CHARS_ALL}\-']*[{LATIN_CHARS_ALL}]+)*)|(\|[^|]*\|)|([^{LATIN_CHARS_ALL}|]+)"
+    fr"([{LATIN_CHARS_ALL}{INDIC_CHARS_ALL}{KOREAN_CHARS}]+(?:[{LATIN_CHARS_ALL}{INDIC_CHARS_ALL}{KOREAN_CHARS}\-']*[{LATIN_CHARS_ALL}{INDIC_CHARS_ALL}{KOREAN_CHARS}]+)*)|(\|[^|]*\|)|([^{LATIN_CHARS_ALL}{INDIC_CHARS_ALL}{KOREAN_CHARS}|]+)"
 )
 
 

diff --git a/nemo/collections/tts/g2p/models/i18n_ipa.py b/nemo/collections/tts/g2p/models/i18n_ipa.py
@@ -20,6 +20,8 @@
 
 from nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon import validate_locale
 from nemo.collections.common.tokenizers.text_to_speech.tokenizer_utils import (
+    INDIC_CHARS_ALL,
+    KOREAN_CHARS,
     LATIN_CHARS_ALL,
     any_locale_word_tokenize,
     english_word_tokenize,
@@ -29,18 +31,24 @@
 from nemo.collections.tts.g2p.utils import GRAPHEME_CASE_MIXED, GRAPHEME_CASE_UPPER, set_grapheme_case
 from nemo.utils import logging
 
+# Compiled regex patterns matching a leading Indic or Korean character (used in dictionary parsing)
+_INDIC_PATTERN = re.compile(f'^[{INDIC_CHARS_ALL}]')
+_KOREAN_PATTERN = re.compile(f'^[{KOREAN_CHARS}]')
+
 
 class IpaG2p(BaseG2p):
     # fmt: off
     STRESS_SYMBOLS = ["ˈ", "ˌ"]
     # Regex for roman characters, accented characters, and locale-agnostic numbers/digits
-    CHAR_REGEX = re.compile(fr"[{LATIN_CHARS_ALL}\d]")
-    PUNCT_REGEX = re.compile(fr"[^{LATIN_CHARS_ALL}\d]")
+    CHAR_REGEX = re.compile(fr"[{LATIN_CHARS_ALL}{INDIC_CHARS_ALL}{KOREAN_CHARS}\d]")
+    PUNCT_REGEX = re.compile(fr"[^{LATIN_CHARS_ALL}{INDIC_CHARS_ALL}{KOREAN_CHARS}\d]")
     # fmt: on
 
     def __init__(
         self,
-        phoneme_dict: Union[str, pathlib.Path, Dict[str, List[List[str]]]],
+        phoneme_dict: Union[
+            str, pathlib.Path, List[Union[str, pathlib.Path, Dict[str, List[List[str]]]]], Dict[str, List[List[str]]]
+        ],
         locale: str = "en-US",
-    def __init__(
-        self,
-        phoneme_dict: Union[str, pathlib.Path, Dict[str, List[List[str]]]],
-        # phoneme_dict: Union[str, pathlib.Path, Dict[str, List[List[str]]]],
-        phoneme_dict: Union[str, pathlib.Path, List[Union[str, pathlib.Path]], Dict[str, List[List[str]]]],
-        locale: str = "en-US",
+    def __init__(
+        self,
+        phoneme_dict: Union[
+            str, pathlib.Path, List[Union[str, pathlib.Path, Dict[str, List[List[str]]]]], Dict[str, List[List[str]]]
+        ],
-    def __init__(
-        self,
-        phoneme_dict: Union[str, pathlib.Path, Dict[str, List[List[str]]]],
-        # phoneme_dict: Union[str, pathlib.Path, Dict[str, List[List[str]]]],
-        phoneme_dict: Union[str, pathlib.Path, List[Union[str, pathlib.Path]], Dict[str, List[List[str]]]],
-        locale: str = "en-US",
+    def __init__(
+        self,
+        phoneme_dict: Union[
+            str, pathlib.Path, List[Union[str, pathlib.Path, Dict[str, List[List[str]]]]], Dict[str, List[List[str]]]
+        ],
         apply_to_oov_word: Optional[Callable[[str], str]] = None,
         ignore_ambiguous_words: bool = True,
@@ -59,10 +67,13 @@ def __init__(
         `apply_to_oov_word` for handling.
 
         Args:
-            phoneme_dict (str, Path, or Dict): Path to file in CMUdict format or an IPA dict object with CMUdict-like
-                entries. For example,
-                a dictionary file: scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.06.txt;
-                a dictionary object: {..., "Wire": [["ˈ", "w", "a", "ɪ", "ɚ"], ["ˈ", "w", "a", "ɪ", "ɹ"]], ...}.
+            phoneme_dict: A single phoneme dictionary source or a list of sources for multi-dictionary
+                code-switching (e.g. Hindi + English). Each source can be:
+                - a file path (str or pathlib.Path) in CMUdict format,
+                  e.g. ``scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.06.txt``
+                - a dict object with CMUdict-like entries,
+                  e.g. ``{"Wire": [["ˈ", "w", "a", "ɪ", "ɚ"], ["ˈ", "w", "a", "ɪ", "ɹ"]]}``
+                When a list is provided, all sources are parsed and merged into a single dictionary.
             locale (str): Locale used to determine a locale-specific tokenization logic. Currently, it supports "en-US",
                 "de-DE", and "es-ES". Defaults to "en-US". Specify None if implementing custom logic for a new locale.
             apply_to_oov_word (Callable): Function that deals with the out-of-vocabulary (OOV) words that do not exist
@@ -154,19 +165,36 @@ def __init__(
 
     @staticmethod
     def _parse_phoneme_dict(
-        phoneme_dict: Union[str, pathlib.Path, Dict[str, List[List[str]]]]
+        phoneme_dict: Union[
+            str,
+            pathlib.Path,
+            Dict[str, List[List[str]]],
+            List[Union[str, pathlib.Path, Dict[str, List[List[str]]]]],
+        ]
     ) -> Dict[str, List[List[str]]]:
         """
-        parse an input IPA dictionary and save it as a dict object.
+        Parse one or more IPA dictionaries and return a merged dict object.
 
         Args:
-            phoneme_dict (Union[str, pathlib.Path, dict]): Path to file in CMUdict format or an IPA dict object with
-                CMUdict-like entries. For example,
-                a dictionary file: scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.06.txt;
-                a dictionary object: {..., "Wire": [["ˈ", "w", "a", "ɪ", "ɚ"], ["ˈ", "w", "a", "ɪ", "ɹ"]], ...}.
+            phoneme_dict: A single phoneme dictionary source or a list of sources for multi-dictionary
+                code-switching (e.g. Hindi + English). Each source can be:
+                - a file path (str or pathlib.Path) in CMUdict format,
+                  e.g. ``scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.06.txt``
+                - a dict object with CMUdict-like entries,
+                  e.g. ``{"Wire": [["ˈ", "w", "a", "ɪ", "ɚ"], ["ˈ", "w", "a", "ɪ", "ɹ"]]}``
+                When a list is provided, all sources are parsed and merged into a single dictionary.
 
-        Returns: a dict object (Dict[str, List[List[str]]]).
+        Returns:
+            A merged dict object (Dict[str, List[List[str]]]).
         """
+        if isinstance(phoneme_dict, list):
+            merged = defaultdict(list)
+            for source in phoneme_dict:
+                parsed = IpaG2p._parse_phoneme_dict(source)
+                for word, prons in parsed.items():
+                    merged[word].extend(prons)
+            return merged
+
         if isinstance(phoneme_dict, str) or isinstance(phoneme_dict, pathlib.Path):
             # load the dictionary file where there may exist a digit suffix after a word, e.g. "Word(2)", which
             # represents the pronunciation variant of that word.
@@ -190,6 +218,8 @@ def _parse_phoneme_dict(
                         or 'À' <= line[0] <= 'Ö'
                         or 'Ø' <= line[0] <= 'ö'
                         or 'ø' <= line[0] <= 'ÿ'
+                        or _INDIC_PATTERN.match(line[0])
+                        or _KOREAN_PATTERN.match(line[0])
                         or line[0] == "'"
                     ):
                         parts = line.strip().split(maxsplit=1)
@@ -217,7 +247,15 @@ def _parse_phoneme_dict(
 
         return phoneme_dict_obj
 
-    def replace_dict(self, phoneme_dict: Union[str, pathlib.Path, Dict[str, List[List[str]]]]):
+    def replace_dict(
+        self,
+        phoneme_dict: Union[
+            str,
+            pathlib.Path,
+            Dict[str, List[List[str]]],
+            List[Union[str, pathlib.Path, Dict[str, List[List[str]]]]],
+        ],
+    ):
         """
         Replace model's phoneme dictionary with a custom one
         """