Skip to content

Commit 4762ab3

Browse files
committed
fix: add backward compatibility, case=mixed, ascii_letters.
Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
1 parent dd39423 commit 4762ab3

2 files changed

Lines changed: 123 additions & 11 deletions

File tree

nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py

Lines changed: 57 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import itertools
1717
import os
1818
import string
19+
import warnings
1920
from abc import ABC, abstractmethod
2021
from contextlib import contextmanager
2122
from typing import List, Optional, Union
@@ -386,6 +387,14 @@ def __init__(
386387
class HindiCharsTokenizer(BaseCharsTokenizer):
387388
"""Hindi grapheme tokenizer (character-based, no phonemes).
388389
Args:
390+
chars: Explicit character set string. When provided, ``charset_version`` is ignored.
391+
charset_version: Controls which default character set to use (only when ``chars`` is None).
392+
``2`` (default) — ``case="upper"`` Devanagari + ``ascii_letters``.
393+
Hindi/Devanagari has no case distinction, so ``case="upper"`` avoids duplicating
394+
every code-point. ``ascii_letters`` covers both upper- and lower-case English for
395+
mixed-language text.
396+
``1`` — legacy ``case="mixed"`` Devanagari + ``ascii_lowercase``. Use this value to
397+
restore models that were trained before the charset fix.
389398
punct: Whether to reserve grapheme for basic punctuation or not.
390399
apostrophe: Whether to use apostrophe or not.
391400
add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None),
@@ -404,24 +413,36 @@ class HindiCharsTokenizer(BaseCharsTokenizer):
404413

405414
_LOCALE = "hi-IN"
406415
_PUNCT_LIST = get_ipa_punctuation_list(_LOCALE)
407-
_CHARSET_STR = get_grapheme_character_set(locale=_LOCALE, case="mixed")
408-
_CHARSET_STR += string.ascii_lowercase
416+
_CHARSET_STR = get_grapheme_character_set(locale=_LOCALE, case="upper") + string.ascii_letters
417+
_CHARSET_STR_V1 = get_grapheme_character_set(locale=_LOCALE, case="mixed") + string.ascii_lowercase
409418

410419
def __init__(
411420
self,
412-
chars=_CHARSET_STR,
421+
chars=None,
422+
charset_version=2,
413423
punct=True,
414424
apostrophe=True,
415425
add_blank_at=None,
416426
pad_with_space=False,
417427
non_default_punct_list=_PUNCT_LIST,
418428
text_preprocessing_func=any_locale_text_preprocessing,
419429
):
420-
logging.warning(
421-
"HindiCharsTokenizer: The default character set (case='mixed' + ascii_lowercase) "
422-
"is deprecated and will change to (case='upper' + ascii_letters) in the next release. "
423-
"Please pass 'chars' explicitly to avoid unexpected behavior."
424-
)
430+
if chars is None:
431+
if charset_version == 1:
432+
warnings.warn(
433+
"HindiCharsTokenizer charset_version=1 (case='mixed' + ascii_lowercase) is deprecated "
434+
"and will be removed in a future release. "
435+
"Migrate to charset_version=2 (case='upper' + ascii_letters) and retrain.",
436+
DeprecationWarning,
437+
stacklevel=2,
438+
)
439+
chars = self._CHARSET_STR_V1
440+
elif charset_version == 2:
441+
chars = self._CHARSET_STR
442+
else:
443+
raise ValueError(
444+
f"HindiCharsTokenizer: unsupported charset_version={charset_version!r}. Use 1 (legacy) or 2."
445+
)
425446
super().__init__(
426447
chars=chars,
427448
punct=punct,
@@ -466,6 +487,14 @@ def encode(self, text):
466487
class ArabicCharsTokenizer(BaseCharsTokenizer):
467488
"""Arabic grapheme tokenizer (character-based, no phonemes).
468489
Args:
490+
chars: Explicit character set string. When provided, ``charset_version`` is ignored.
491+
charset_version: Controls which default character set to use (only when ``chars`` is None).
492+
``2`` (default) — ``case="upper"`` Arabic + ``ascii_letters``.
493+
Arabic script has no case distinction, so ``case="upper"`` avoids duplicating
494+
every code-point. ``ascii_letters`` covers both upper- and lower-case English for
495+
mixed-language text.
496+
``1`` — legacy ``case="mixed"`` Arabic + ``ascii_letters``. Use this value to
497+
restore models that were trained before the charset fix.
469498
punct: Whether to reserve grapheme for basic punctuation or not.
470499
apostrophe: Whether to use apostrophe or not.
471500
add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None),
@@ -483,19 +512,36 @@ class ArabicCharsTokenizer(BaseCharsTokenizer):
483512

484513
_LOCALE = "ar-MSA"
485514
_PUNCT_LIST = get_ipa_punctuation_list(_LOCALE)
486-
_CHARSET_STR = get_grapheme_character_set(locale=_LOCALE, case="mixed")
487-
_CHARSET_STR += string.ascii_letters
515+
_CHARSET_STR = get_grapheme_character_set(locale=_LOCALE, case="upper") + string.ascii_letters
516+
_CHARSET_STR_V1 = get_grapheme_character_set(locale=_LOCALE, case="mixed") + string.ascii_letters
488517

489518
def __init__(
490519
self,
491-
chars=_CHARSET_STR,
520+
chars=None,
521+
charset_version=2,
492522
punct=True,
493523
apostrophe=True,
494524
add_blank_at=None,
495525
pad_with_space=False,
496526
non_default_punct_list=_PUNCT_LIST,
497527
text_preprocessing_func=any_locale_text_preprocessing,
498528
):
529+
if chars is None:
530+
if charset_version == 1:
531+
warnings.warn(
532+
"ArabicCharsTokenizer charset_version=1 (case='mixed' + ascii_letters) is deprecated "
533+
"and will be removed in a future release. "
534+
"Migrate to charset_version=2 (case='upper' + ascii_letters) and retrain.",
535+
DeprecationWarning,
536+
stacklevel=2,
537+
)
538+
chars = self._CHARSET_STR_V1
539+
elif charset_version == 2:
540+
chars = self._CHARSET_STR
541+
else:
542+
raise ValueError(
543+
f"ArabicCharsTokenizer: unsupported charset_version={charset_version!r}. Use 1 (legacy) or 2."
544+
)
499545
super().__init__(
500546
chars=chars,
501547
punct=punct,

tests/collections/common/tokenizers/text_to_speech/test_tts_tokenizers.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,51 @@ def test_hindi_chars_tokenizer(self):
329329

330330
assert chars == expected_output
331331

332+
@pytest.mark.run_only_on('CPU')
333+
@pytest.mark.unit
334+
def test_hindi_chars_tokenizer_legacy(self):
335+
"""charset_version=1 reproduces the old (case='mixed' + ascii_lowercase) behaviour."""
336+
input_text = "नमस्ते दुनिया!"
337+
expected_output = "नमस्ते दुनिया!"
338+
339+
tokenizer = HindiCharsTokenizer(charset_version=1)
340+
chars, tokens = self._parse_text(tokenizer, input_text)
341+
342+
assert chars == expected_output
343+
344+
@pytest.mark.run_only_on('CPU')
345+
@pytest.mark.unit
346+
def test_hindi_chars_tokenizer_mixed_english(self):
347+
"""Default v2 charset supports both upper and lower English."""
348+
input_text = "नमस्ते Hello World"
349+
expected_output = "नमस्ते Hello World"
350+
351+
tokenizer = HindiCharsTokenizer()
352+
chars, tokens = self._parse_text(tokenizer, input_text)
353+
354+
assert chars == expected_output
355+
356+
@pytest.mark.run_only_on('CPU')
357+
@pytest.mark.unit
358+
def test_hindi_chars_tokenizer_v1_no_upper_english(self):
359+
"""Legacy v1 charset only has ascii_lowercase, so uppercase English is skipped."""
360+
input_text = "नमस्ते Hello"
361+
expected_output = "नमस्ते ello"
362+
363+
tokenizer = HindiCharsTokenizer(charset_version=1)
364+
chars, tokens = self._parse_text(tokenizer, input_text)
365+
366+
assert chars == expected_output
367+
368+
@pytest.mark.run_only_on('CPU')
369+
@pytest.mark.unit
370+
def test_hindi_chars_tokenizer_v1_v2_different_vocab(self):
371+
"""v1 and v2 must produce different token vocabularies."""
372+
tok_v1 = HindiCharsTokenizer(charset_version=1)
373+
tok_v2 = HindiCharsTokenizer(charset_version=2)
374+
375+
assert tok_v1.tokens != tok_v2.tokens
376+
332377
@pytest.mark.run_only_on('CPU')
333378
@pytest.mark.unit
334379
def test_arabic_chars_tokenizer_mixed_english(self):
@@ -379,3 +424,24 @@ def test_arabic_chars_tokenizer_unknown_token(self):
379424
chars, tokens = self._parse_text(tokenizer, input_text)
380425

381426
assert chars == expected_output
427+
428+
@pytest.mark.run_only_on('CPU')
429+
@pytest.mark.unit
430+
def test_arabic_chars_tokenizer_legacy(self):
431+
"""charset_version=1 reproduces the old (case='mixed' + ascii_letters) behaviour."""
432+
input_text = "مرحبا Hello"
433+
expected_output = "مرحبا Hello"
434+
435+
tokenizer = ArabicCharsTokenizer(charset_version=1)
436+
chars, tokens = self._parse_text(tokenizer, input_text)
437+
438+
assert chars == expected_output
439+
440+
@pytest.mark.run_only_on('CPU')
441+
@pytest.mark.unit
442+
def test_arabic_chars_tokenizer_v1_v2_different_vocab(self):
443+
"""v1 and v2 must produce different token vocabularies."""
444+
tok_v1 = ArabicCharsTokenizer(charset_version=1)
445+
tok_v2 = ArabicCharsTokenizer(charset_version=2)
446+
447+
assert tok_v1.tokens != tok_v2.tokens

0 commit comments

Comments
 (0)