NVIDIA-NeMo
diff --git a/‎nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py‎
Lines changed: 46 additions & 2 deletions b/‎nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py‎
Lines changed: 46 additions & 2 deletions
diff --git a/‎nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py‎
Lines changed: 25 additions & 1 deletion b/‎nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py‎
Lines changed: 25 additions & 1 deletion
diff --git a/‎nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py‎
Lines changed: 40 additions & 31 deletions b/‎nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py‎
Lines changed: 40 additions & 31 deletions
diff --git a/‎nemo/collections/tts/data/text_to_speech_dataset_lhotse.py‎
Lines changed: 23 additions & 1 deletion b/‎nemo/collections/tts/data/text_to_speech_dataset_lhotse.py‎
Lines changed: 23 additions & 1 deletion
@@ -15,8 +15,15 @@
 
 # fmt: off
 
-SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR", "vi-VN", "ja-JP", "hi-IN"]
+# TODO: pt-BR and ko-KR are missing from GRAPHEME_CHARACTER_SETS and IPA_CHARACTER_SETS below.
+#  They work with IPATokenizer (which builds vocab from g2p.symbols), but get_grapheme_character_set()
+#  and get_ipa_character_set() will raise ValueError for these locales until entries are added.
+#  These functions are used by locale-specific tokenizers (e.g., HindiCharsTokenizer uses
+#  get_grapheme_character_set("hi-IN")). A future PortugueseCharsTokenizer or KoreanCharsTokenizer
+#  would raise the same ValueError unless entries for these locales are added first.
+SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR", "vi-VN", "ja-JP", "hi-IN", "pt-BR", "ko-KR"]
 
+# Derived from LJSpeech, with "/" added.
 DEFAULT_PUNCTUATION = (
     ',', '.', '!', '?', '-',
     ':', ';', '/', '"', '(',
@@ -240,7 +247,7 @@ def get_ipa_punctuation_list(locale):
     punct_set = set(DEFAULT_PUNCTUATION)
     # TODO @xueyang: verify potential mismatches with locale-specific punctuation sets used
     #  in nemo_text_processing.text_normalization.en.taggers.punctuation.py
-    if locale in ["de-DE", "es-ES", "it-IT", "fr-FR", "ja-JP"]:
+    if locale in ["de-DE", "es-ES", "it-IT", "fr-FR", "ja-JP", "pt-BR"]:
         # ref: https://en.wikipedia.org/wiki/Guillemet#Uses
         punct_set.update(['«', '»', '‹', '›'])
     if locale == "de-DE":
@@ -347,5 +354,42 @@ def get_ipa_punctuation_list(locale):
                 '・',
             ]
         )
+    elif locale == "hi-IN":
+        punct_set.update(
+            [
+                '।',
+                '॥',
+            ]
+        )
+    elif locale == "pt-BR":
+        # ref: https://en.wikipedia.org/wiki/Portuguese_orthography#Punctuation
+        # Guillemets (« » ‹ ›) are already added by the shared block above.
+        punct_set.update(
+            [
+                '\u201c',  # “ left double quotation mark
+                '\u201d',  # ” right double quotation mark
+                '\u2018',  # ‘ left single quotation mark
+                '\u2019',  # ’ right single quotation mark
+                '\u2013',  # – en dash
+                '\u2014',  # — em dash
+                '\u2026',  # … horizontal ellipsis
+            ]
+        )
+    elif locale == "ko-KR":
+        punct_set.update(
+            [
+                '『',
+                '』',
+                '「',
+                '」',
+                '《',
+                '》',
+                '…',
+                '·',
+                '—',
+                '–',
+                '〜',
+            ]
+        )
     punct_list = sorted(list(punct_set))
     return punct_list
@@ -29,6 +29,9 @@
     "any_locale_word_tokenize",
     "english_word_tokenize",
     "LATIN_CHARS_ALL",
+    "INDIC_CHARS_ALL",
+    "KOREAN_CHARS",
+    "WORD_CHARS_ALL",
     "normalize_unicode_text",
     "japanese_text_preprocessing",
 ]
@@ -52,11 +55,32 @@
 LATIN_ALPHABET_BASIC = "A-Za-z"
 ACCENTED_CHARS = "À-ÖØ-öø-ÿ"
 LATIN_CHARS_ALL = f"{LATIN_ALPHABET_BASIC}{ACCENTED_CHARS}"
+
+# Indic characters based on https://www.unicode.org/charts/
+# Hindi, Marathi, Nepali, Sanskrit https://en.wikipedia.org/wiki/Devanagari_(Unicode_block)
+DEVANAGARI_CHARS = (
+    r'\u0900-\u0963\u0966-\u097F'  # excluding danda (U+0964), double danda (U+0965) so they are treated as punctuation
+)
+BENGALI_CHARS = r'\u0980-\u09FF'  # Bengali, Assamese
+TAMIL_CHARS = r'\u0B80-\u0BFF'  # Tamil
+TELUGU_CHARS = r'\u0C00-\u0C7F'  # Telugu
+KANNADA_CHARS = r'\u0C80-\u0CFF'  # Kannada
+GUJARATI_CHARS = r'\u0A80-\u0AFF'  # Gujarati
+INDIC_CHARS_ALL = f"{DEVANAGARI_CHARS}{BENGALI_CHARS}{TAMIL_CHARS}{TELUGU_CHARS}{KANNADA_CHARS}{GUJARATI_CHARS}"
+
+# Korean
+# ref: https://en.wikipedia.org/wiki/Hangul_Syllables   (U+AC00–U+D7A3)
+# ref: https://en.wikipedia.org/wiki/Hangul_Jamo_(Unicode_block)   (U+1100–U+11FF)
+# ref: https://en.wikipedia.org/wiki/Hangul_Compatibility_Jamo   (U+3130–U+318F)
+KOREAN_CHARS = r'\uAC00-\uD7A3\u1100-\u11FF\u3130-\u318F'
+
+WORD_CHARS_ALL = f"{LATIN_CHARS_ALL}{INDIC_CHARS_ALL}{KOREAN_CHARS}"
+
 _WORDS_RE_EN = re.compile(
     fr"([{LATIN_ALPHABET_BASIC}]+(?:[{LATIN_ALPHABET_BASIC}\-']*[{LATIN_ALPHABET_BASIC}]+)*)|(\|[^|]*\|)|([^{LATIN_ALPHABET_BASIC}|]+)"
 )
 _WORDS_RE_ANY_LOCALE = re.compile(
-    fr"([{LATIN_CHARS_ALL}]+(?:[{LATIN_CHARS_ALL}\-']*[{LATIN_CHARS_ALL}]+)*)|(\|[^|]*\|)|([^{LATIN_CHARS_ALL}|]+)"
+    fr"([{WORD_CHARS_ALL}]+(?:[{WORD_CHARS_ALL}\-']*[{WORD_CHARS_ALL}]+)*)|(\|[^|]*\|)|([^{WORD_CHARS_ALL}|]+)"
 )
 
 
 
@@ -16,6 +16,7 @@
 import itertools
 import os
 import string
+import warnings
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from typing import List, Optional, Union
@@ -24,10 +25,12 @@
 from transformers import PreTrainedTokenizerBase
 
 from nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon import (
+    DEFAULT_PUNCTUATION,
     get_grapheme_character_set,
     get_ipa_punctuation_list,
     validate_locale,
 )
+
 from nemo.collections.common.tokenizers.text_to_speech.tokenizer_utils import (
     any_locale_text_preprocessing,
     chinese_text_preprocessing,
@@ -110,14 +113,7 @@ class BaseCharsTokenizer(BaseTokenizer):
         text_preprocessing_func: Text preprocessing function for correct execution of the tokenizer.
     """
 
-    # fmt: off
-    # TODO @xueyang: unify definition of the default PUNCT_LIST and import from ipa_lexicon.py
-    PUNCT_LIST = (  # Derived from LJSpeech and "/" additionally
-        ',', '.', '!', '?', '-',
-        ':', ';', '/', '"', '(',
-        ')', '[', ']', '{', '}',
-    )
-    # fmt: on
+    PUNCT_LIST = DEFAULT_PUNCTUATION
 
     def __init__(
         self,
@@ -392,6 +388,12 @@ class HindiCharsTokenizer(BaseCharsTokenizer):
             if None then no blank in labels.
         pad_with_space: Whether to pad text with spaces at the beginning and at the end or not.
         non_default_punct_list: List of punctuation marks which will be used instead default.
+            Overrides ``punct_version`` when explicitly provided.
+        punct_version: Punctuation set version (default 2).
+            2 — expanded set from ``get_ipa_punctuation_list("hi-IN")`` including dandas.
+            1 — legacy ``sorted(list(DEFAULT_PUNCTUATION))`` without dandas; emits
+            ``DeprecationWarning`` and will be removed in a future release.
+            Ignored when ``non_default_punct_list`` is explicitly provided.
         text_preprocessing_func: Text preprocessing function. Keeps Devanagari unchanged.
 
         Each Unicode code point becomes 1 token (not visual grapheme clusters)
@@ -404,6 +406,7 @@ class HindiCharsTokenizer(BaseCharsTokenizer):
 
     _LOCALE = "hi-IN"
     _PUNCT_LIST = get_ipa_punctuation_list(_LOCALE)
+    _PUNCT_LIST_V1 = sorted(list(DEFAULT_PUNCTUATION))
     _CHARSET_STR = get_grapheme_character_set(locale=_LOCALE, case="mixed")
     _CHARSET_STR += string.ascii_lowercase
 
@@ -414,9 +417,25 @@ def __init__(
         apostrophe=True,
         add_blank_at=None,
         pad_with_space=False,
-        non_default_punct_list=_PUNCT_LIST,
+        non_default_punct_list=None,
+        punct_version=2,
         text_preprocessing_func=any_locale_text_preprocessing,
     ):
+        if non_default_punct_list is None:
+            if punct_version == 1:
+                warnings.warn(
+                    "HindiCharsTokenizer: punct_version=1 uses DEFAULT_PUNCTUATION without dandas "
+                    "and will be removed in a future release. Migrate to punct_version=2.",
+                    DeprecationWarning,
+                    stacklevel=2,
+                )
+                non_default_punct_list = self._PUNCT_LIST_V1
+            elif punct_version == 2:
+                non_default_punct_list = self._PUNCT_LIST
+            else:
+                raise ValueError(
+                    f"HindiCharsTokenizer: unsupported punct_version={punct_version}. Use 1 (legacy) or 2."
+                )
         super().__init__(
             chars=chars,
             punct=punct,
@@ -471,14 +490,6 @@ class GermanPhonemesTokenizer(BaseCharsTokenizer):
             Currently, it only applies lower() function.
     """
 
-    # fmt: off
-    PUNCT_LIST = (  # Derived from LJSpeech and "/" additionally
-        ',', '.', '!', '?', '-',
-        ':', ';', '/', '"', '(',
-        ')', '[', ']', '{', '}',
-    )
-    # fmt: on
-
     def __init__(
         self,
         punct=True,
@@ -628,12 +639,9 @@ class EnglishPhonemesTokenizer(BaseTokenizer):
             handled by g2p).
     """
 
+    PUNCT_LIST = DEFAULT_PUNCTUATION
+
     # fmt: off
-    PUNCT_LIST = (  # Derived from LJSpeech and "/" additionally
-        ',', '.', '!', '?', '-',
-        ':', ';', '/', '"', '(',
-        ')', '[', ']', '{', '}',
-    )
     VOWELS = (
         'AA', 'AE', 'AH', 'AO', 'AW',
         'AY', 'EH', 'ER', 'EY', 'IH',
@@ -773,10 +781,14 @@ class IPATokenizer(BaseTokenizer):
     Args:
         g2p: Grapheme to phoneme module, should be IpaG2p or some subclass thereof.
         locale: Locale used to determine default text processing logic and punctuation.
-            Supports ["en-US", "de-DE", "es-ES", "fr-FR"]. Defaults to "en-US".
+            See ``SUPPORTED_LOCALES`` in ``ipa_lexicon.py`` for the full list. Defaults to "en-US".
             Specify None if implementing custom logic for a new locale.
         punct: Whether to reserve grapheme for basic punctuation or not.
         non_default_punct_list: List of punctuation marks which will be used instead default, if any.
+        locale_specific_punct: Whether to use locale-specific punctuation (via ``get_ipa_punctuation_list``)
+            or only ``DEFAULT_PUNCTUATION``. Defaults to True. Set to False to preserve the token
+            vocabulary of checkpoints trained before locale-specific punctuation was introduced.
+            Affects any locale with extra marks (not only pt-BR). Ignored when ``non_default_punct_list`` is provided.
         fixed_vocab: List of valid grapheme/phoneme tokens for the model.
             Set only if overriding the default vocab generation process (reading from G2P dict).
             If set, any dataset entries that have unincluded graphemes will be filtered out, and any words whose
@@ -799,6 +811,7 @@ def __init__(
         locale="en-US",
         punct=True,
         non_default_punct_list=None,
+        locale_specific_punct=True,
         fixed_vocab=None,
         *,
         space=' ',
@@ -851,8 +864,10 @@ def __init__(
         if punct:
             if non_default_punct_list is not None:
                 self.punct_list = non_default_punct_list
-            else:
+            elif locale_specific_punct:
                 self.punct_list = get_ipa_punctuation_list(locale)
+            else:
+                self.punct_list = sorted(list(DEFAULT_PUNCTUATION))
 
             tokens.update(self.punct_list)
 
@@ -964,14 +979,8 @@ class ChinesePhonemesTokenizer(BaseTokenizer):
             handled by g2p).
     """
 
-    # fmt: off
-    PUNCT_LIST = (  # Derived from LJSpeech and "/" additionally
-        ',', '.', '!', '?', '-',
-        ':', ';', '/', '"', '(',
-        ')', '[', ']', '{', '}',
-    )
+    PUNCT_LIST = DEFAULT_PUNCTUATION
     ZH_PUNCT_LIST = list("，。？！；：、‘’“”（）【】「」《》") + list(PUNCT_LIST)
-    # fmt: on
 
     def __init__(
         self,
 
@@ -21,7 +21,7 @@
 from hydra.utils import instantiate
 from lhotse import CutSet
 from lhotse.dataset.collation import collate_matrices, collate_vectors
-from omegaconf import DictConfig
+from omegaconf import DictConfig, open_dict
 from transformers import AutoTokenizer, T5Tokenizer
 
 from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import AggregatedTTSTokenizer, IPABPETokenizer
@@ -48,6 +48,28 @@ def setup_tokenizers(all_tokenizers_config, mode='train'):
             text_tokenizer_kwargs = {}
             if "g2p" in tokenizer_config:
                 text_tokenizer_kwargs["g2p"] = instantiate(tokenizer_config.g2p)
+            # Ensure locale_specific_punct is persisted so it survives .nemo save/restore.
+            # New training for locales with extended punctuation should use the full set (True).
+            if (
+                hasattr(tokenizer_config, '_target_')
+                and tokenizer_config._target_
+                == "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer"
+                and tokenizer_config.get('locale', None) == "pt-BR"
+                and not hasattr(tokenizer_config, 'non_default_punct_list')
+                and not hasattr(tokenizer_config, 'locale_specific_punct')
+            ):
+                with open_dict(tokenizer_config):
+                    tokenizer_config.locale_specific_punct = True
+            # Persist punct_version=2 for HindiCharsTokenizer so .nemo save/restore
+            # always uses the expanded punctuation set (with dandas).
+            if (
+                hasattr(tokenizer_config, '_target_')
+                and tokenizer_config._target_
+                == "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.HindiCharsTokenizer"
+                and not hasattr(tokenizer_config, 'punct_version')
+            ):
+                with open_dict(tokenizer_config):
+                    tokenizer_config.punct_version = 2
             tokenizer = instantiate(tokenizer_config, **text_tokenizer_kwargs)
             # TODO @xueyang: is it really necessary to set phone probability to 1.0 for test mode?
             if mode == 'test' and hasattr(tokenizer, "set_phone_prob"):