Skip to content

Commit 78694d5

Browse files
authored
Add Arabic char tokenizer and Japanese-English katakana support (#15614)
Apply isort and black reformatting Fix Hindi chartokenizer, use 'case=upper' and prevent duplicate spaces in the Japanese G2P fallback paths. Apply isort and black reformatting Fix HindiCharsTokenizer backward compat and add Arabic dialect tests Apply isort and black reformatting Add Arabic tokenizer test coverage: diacritics, dialects, punctuation, unknown chars Expand Arabic tokenizer tests: parametrize diacritics, dialects Apply isort and black reformatting added comprehensive test coverage. fix: add back-compatibility, case=mixed, ascii_letters. fix: add charset_version to Hindi/Arabic tokenizers for backward compatibility Introduce a parameter in HindiCharsTokenizer and ArabicCharsTokenizer so old models (v1: case='mixed') keep working while new models train with the corrected charset (v2: case='upper'). - Define CASELESS_SCRIPT_TOKENIZER_TARGETS and DEFAULT_CHARSET_VERSION constants in tts_tokenizers.py - Persist charset_version into the OmegaConf config during training (setup_tokenizers) so .nemo archives record which version was used - Add _migrate_charset_version() helper in magpietts inference utils to pin charset_version=1 for old checkpoints that lack the field, preventing a silent vocabulary mismatch at inference time bugfix: L2_TTS_Fast_dev_runs_Magpietts_OnlineCFGDistillation.sh Signed-off-by: quapham <quapham@users.noreply.github.com> Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
1 parent 2223816 commit 78694d5

7 files changed

Lines changed: 347 additions & 21 deletions

File tree

nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
# These functions are used by locale-specific tokenizers (e.g., HindiCharsTokenizer uses
2222
# get_grapheme_character_set("hi-IN")). If someone later creates PortugueseCharsTokenizer or
2323
# KoreanCharsTokenizer, they'd hit this.
24-
SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR", "vi-VN", "ja-JP", "hi-IN", "pt-BR", "ko-KR"]
24+
SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR", "vi-VN", "ja-JP", "hi-IN", "ar-MSA", "pt-BR", "ko-KR"]
2525

2626
# Derived from LJSpeech and "/" additionally
2727
DEFAULT_PUNCTUATION = (
@@ -114,6 +114,15 @@
114114
# Danda (period)
115115
'।',
116116
),
117+
# ref: https://en.wikipedia.org/wiki/Arabic_alphabet
118+
"ar-MSA": (
119+
'ء', 'آ', 'أ', 'إ', 'ؤ', 'ئ', 'ا', 'ب', 'ة', 'ت',
120+
'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش',
121+
'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل',
122+
'م', 'ن', 'ه', 'و', 'ى', 'ي',
123+
# Diacritics
124+
'ً', 'ٌ', 'ٍ', 'َ', 'ُ', 'ِ', 'ّ', 'ٰ', 'ْ',
125+
),
117126
}
118127

119128
IPA_CHARACTER_SETS = {
@@ -354,6 +363,14 @@ def get_ipa_punctuation_list(locale):
354363
'・',
355364
]
356365
)
366+
elif locale == "ar-MSA":
367+
punct_set.update(
368+
[
369+
'،',
370+
'؛',
371+
'؟',
372+
]
373+
)
357374
elif locale == "hi-IN":
358375
punct_set.update(
359376
[

nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py

Lines changed: 133 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,15 @@
4444
from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
4545
from nemo.utils import logging
4646

47+
# Fully-qualified ``_target_`` paths of the char tokenizers for caseless scripts
# (Devanagari, Arabic script). Configs pointing at these classes carry a
# ``charset_version`` field so checkpoints trained before the charset fix
# (v1: case='mixed') keep loading after the default moved to v2.
CASELESS_SCRIPT_TOKENIZER_TARGETS = frozenset(
    f'nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.{cls_name}'
    for cls_name in ('HindiCharsTokenizer', 'ArabicCharsTokenizer')
)

# Charset version recorded for newly trained models (v2: case='upper' charset).
DEFAULT_CHARSET_VERSION = 2
55+
4756

4857
class BaseTokenizer(ABC):
4958
"""Abstract class for creating an arbitrary tokenizer to convert string to list of int tokens.
@@ -164,9 +173,8 @@ def encode(self, text):
164173
logging.warning(f"Text: [{text}] contains unknown char: [{c}]. Symbol will be skipped.")
165174

166175
# Remove trailing spaces
167-
if cs:
168-
while cs[-1] == space:
169-
cs.pop()
176+
while cs and cs[-1] == space:
177+
cs.pop()
170178

171179
if self.pad_with_space:
172180
cs = [space] + cs + [space]
@@ -382,6 +390,14 @@ def __init__(
382390
class HindiCharsTokenizer(BaseCharsTokenizer):
383391
"""Hindi grapheme tokenizer (character-based, no phonemes).
384392
Args:
393+
chars: Explicit character set string. When provided, ``charset_version`` is ignored.
394+
charset_version: Controls which default character set to use (only when ``chars`` is None).
395+
``2`` (default) — ``case="upper"`` Devanagari + ``ascii_letters``.
396+
Hindi/Devanagari has no case distinction, so ``case="upper"`` avoids duplicating
397+
every code-point. ``ascii_letters`` covers both upper- and lower-case English for
398+
mixed-language text.
399+
``1`` — legacy ``case="mixed"`` Devanagari + ``ascii_lowercase``. Use this value to
400+
restore models that were trained before the charset fix.
385401
punct: Whether to reserve grapheme for basic punctuation or not.
386402
apostrophe: Whether to use apostrophe or not.
387403
add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None),
@@ -406,13 +422,14 @@ class HindiCharsTokenizer(BaseCharsTokenizer):
406422

407423
_LOCALE = "hi-IN"
408424
_PUNCT_LIST = get_ipa_punctuation_list(_LOCALE)
425+
_CHARSET_STR = get_grapheme_character_set(locale=_LOCALE, case="upper") + string.ascii_letters
409426
_PUNCT_LIST_V1 = sorted(list(DEFAULT_PUNCTUATION))
410-
_CHARSET_STR = get_grapheme_character_set(locale=_LOCALE, case="mixed")
411-
_CHARSET_STR += string.ascii_lowercase
427+
_CHARSET_STR_V1 = get_grapheme_character_set(locale=_LOCALE, case="mixed") + string.ascii_lowercase
412428

413429
def __init__(
414430
self,
415-
chars=_CHARSET_STR,
431+
chars=None,
432+
charset_version=2,
416433
punct=True,
417434
apostrophe=True,
418435
add_blank_at=None,
@@ -421,6 +438,22 @@ def __init__(
421438
punct_version=2,
422439
text_preprocessing_func=any_locale_text_preprocessing,
423440
):
441+
if chars is None:
442+
if charset_version == 1:
443+
warnings.warn(
444+
"HindiCharsTokenizer charset_version=1 (case='mixed' + ascii_lowercase) is deprecated "
445+
"and will be removed in a future release. "
446+
"Migrate to charset_version=2 (case='upper' + ascii_letters) and retrain.",
447+
DeprecationWarning,
448+
stacklevel=2,
449+
)
450+
chars = self._CHARSET_STR_V1
451+
elif charset_version == 2:
452+
chars = self._CHARSET_STR
453+
else:
454+
raise ValueError(
455+
f"HindiCharsTokenizer: unsupported charset_version={charset_version!r}. Use 1 (legacy) or 2."
456+
)
424457
if non_default_punct_list is None:
425458
if punct_version == 1:
426459
warnings.warn(
@@ -467,9 +500,100 @@ def encode(self, text):
467500
logging.warning(f"Text: [{text}] contains unknown char: [{c}]. Symbol will be skipped.")
468501

469502
# Remove trailing spaces
470-
if cs:
471-
while cs[-1] == space:
472-
cs.pop()
503+
while cs and cs[-1] == space:
504+
cs.pop()
505+
506+
if self.pad_with_space:
507+
cs = [space] + cs + [space]
508+
509+
return [self._token2id[p] for p in cs]
510+
511+
512+
class ArabicCharsTokenizer(BaseCharsTokenizer):
513+
"""Arabic grapheme tokenizer (character-based, no phonemes).
514+
Args:
515+
chars: Explicit character set string. When provided, ``charset_version`` is ignored.
516+
charset_version: Controls which default character set to use (only when ``chars`` is None).
517+
``2`` (default) — ``case="upper"`` Arabic + ``ascii_letters``.
518+
Arabic script has no case distinction, so ``case="upper"`` avoids duplicating
519+
every code-point. ``ascii_letters`` covers both upper- and lower-case English for
520+
mixed-language text.
521+
``1`` — legacy ``case="mixed"`` Arabic + ``ascii_letters``. Use this value to
522+
restore models that were trained before the charset fix.
523+
punct: Whether to reserve grapheme for basic punctuation or not.
524+
apostrophe: Whether to use apostrophe or not.
525+
add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None),
526+
if None then no blank in labels.
527+
pad_with_space: Whether to pad text with spaces at the beginning and at the end or not.
528+
non_default_punct_list: List of punctuation marks which will be used instead default.
529+
text_preprocessing_func: Text preprocessing function. Keeps Arabic unchanged.
530+
531+
Each Unicode code point becomes 1 token (letters, diacritics, and Arabic punct from ipa_lexicon).
532+
Supports both upper and lower English letters (e.g. mixed-language text).
533+
534+
Input Text: مرحبا Hello
535+
Chars: ['م', 'ر', 'ح', 'ب', 'ا', ' ', 'H', 'e', 'l', 'l', 'o']
536+
"""
537+
538+
_LOCALE = "ar-MSA"
539+
_PUNCT_LIST = get_ipa_punctuation_list(_LOCALE)
540+
_CHARSET_STR = get_grapheme_character_set(locale=_LOCALE, case="upper") + string.ascii_letters
541+
_CHARSET_STR_V1 = get_grapheme_character_set(locale=_LOCALE, case="mixed") + string.ascii_letters
542+
543+
def __init__(
    self,
    chars=None,
    charset_version=2,
    punct=True,
    apostrophe=True,
    add_blank_at=None,
    pad_with_space=False,
    non_default_punct_list=_PUNCT_LIST,
    text_preprocessing_func=any_locale_text_preprocessing,
):
    # An explicitly supplied ``chars`` string always wins; otherwise choose the
    # default charset matching the requested ``charset_version``.
    if chars is None:
        if charset_version == 2:
            chars = self._CHARSET_STR
        elif charset_version == 1:
            # Legacy charset kept only so checkpoints trained before the
            # charset fix keep their original token-to-ID mapping.
            warnings.warn(
                "ArabicCharsTokenizer charset_version=1 (case='mixed' + ascii_letters) is deprecated "
                "and will be removed in a future release. "
                "Migrate to charset_version=2 (case='upper' + ascii_letters) and retrain.",
                DeprecationWarning,
                stacklevel=2,
            )
            chars = self._CHARSET_STR_V1
        else:
            raise ValueError(
                f"ArabicCharsTokenizer: unsupported charset_version={charset_version!r}. Use 1 (legacy) or 2."
            )
    super().__init__(
        chars=chars,
        punct=punct,
        apostrophe=apostrophe,
        add_blank_at=add_blank_at,
        pad_with_space=pad_with_space,
        non_default_punct_list=non_default_punct_list,
        text_preprocessing_func=text_preprocessing_func,
    )
579+
580+
def encode(self, text):
581+
"""Encode Arabic text, handling diacritics and English (upper/lower) correctly."""
582+
cs, space, tokens = [], self.tokens[self.space], set(self.tokens)
583+
584+
text = self.text_preprocessing_func(text)
585+
for c in text:
586+
if c == space and len(cs) > 0 and cs[-1] != space:
587+
cs.append(c)
588+
elif c in tokens and c != space:
589+
cs.append(c)
590+
elif (c in self.PUNCT_LIST) and self.punct:
591+
cs.append(c)
592+
elif c != space:
593+
logging.warning(f"Text: [{text}] contains unknown char: [{c}]. Symbol will be skipped.")
594+
595+
while cs and cs[-1] == space:
596+
cs.pop()
473597

474598
if self.pad_with_space:
475599
cs = [space] + cs + [space]

nemo/collections/tts/data/text_to_speech_dataset_lhotse.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,12 @@
2424
from omegaconf import DictConfig, open_dict
2525
from transformers import AutoTokenizer, T5Tokenizer
2626

27-
from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import AggregatedTTSTokenizer, IPABPETokenizer
27+
from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import (
28+
CASELESS_SCRIPT_TOKENIZER_TARGETS,
29+
DEFAULT_CHARSET_VERSION,
30+
AggregatedTTSTokenizer,
31+
IPABPETokenizer,
32+
)
2833
from nemo.collections.tts.parts.utils.tts_dataset_utils import (
2934
beta_binomial_prior_distribution,
3035
normalize_volume,
@@ -74,6 +79,18 @@ def setup_tokenizers(all_tokenizers_config, mode='train'):
7479
# TODO @xueyang: is it really necessary to set phone probability to 1.0 for test mode?
7580
if mode == 'test' and hasattr(tokenizer, "set_phone_prob"):
7681
tokenizer.set_phone_prob(1.0)
82+
83+
# Persist charset_version so it's saved in .nemo archives and
84+
# update_config_for_inference can distinguish old checkpoints
85+
# (missing charset_version → v1) from new ones.
86+
if (
87+
hasattr(tokenizer_config, '_target_')
88+
and tokenizer_config._target_ in CASELESS_SCRIPT_TOKENIZER_TARGETS
89+
and not hasattr(tokenizer_config, 'charset_version')
90+
):
91+
with open_dict(all_tokenizers_config):
92+
tokenizer_config.charset_version = DEFAULT_CHARSET_VERSION
93+
7794
tokenizers.append(tokenizer)
7895
tokenizer_names.append(tokenizer_name)
7996

nemo/collections/tts/g2p/models/ja_jp_ipa.py

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -298,28 +298,44 @@ def __call__(self, text: str) -> List[str]:
298298
acc = word.get('acc', 0)
299299

300300
string = unicodedata.normalize('NFKC', string)
301+
pos_group1 = word.get('pos_group1', '')
301302

302-
# Handle English letters
303+
# If string is pure ASCII letters after NFKC normalization, decide based on pos:
304+
# - pos='フィラー' means OpenJTalk didn't recognize the word (just spelled it out) → keep Latin
305+
# - any other pos (e.g. '名詞') means OpenJTalk recognized it as a loanword → use katakana pron
303306
if string and all(c in self.ascii_letter_dict for c in string):
304-
if current_chain:
305-
self._process_chain(current_chain, result)
306-
current_chain = []
307-
308-
result.extend(list(string))
309-
continue
307+
if pos == 'フィラー':
308+
if current_chain:
309+
self._process_chain(current_chain, result)
310+
current_chain = []
311+
result.extend(list(string))
312+
continue
310313

311-
# Handle punctuation
312-
if pos in ('記号', '補助記号'):
314+
# Handle punctuation (記号), but keep alphabet symbols (アルファベット) as regular words
315+
if pos in ('記号', '補助記号') and pos_group1 != 'アルファベット':
313316
if current_chain:
314317
self._process_chain(current_chain, result)
315318
current_chain = []
316319
if string.isspace():
317-
result.append(' ')
320+
if not result or result[-1] != ' ':
321+
result.append(' ')
318322
elif string in punctuation:
319323
result.append(string)
324+
else:
325+
logging.warning(
326+
f"Unknown symbol '{string}' (pos={pos}) not in punctuation list, replacing with space. original text: {text}"
327+
)
328+
if not result or result[-1] != ' ':
329+
result.append(' ')
320330
continue
321331

322332
if not pron or mora_size == 0:
333+
if string and not string.isspace():
334+
logging.warning(
335+
f"Unknown symbol '{string}' (pos={pos}) not in punctuation list, replacing with space. original text: {text}"
336+
)
337+
if not result or result[-1] != ' ':
338+
result.append(' ')
323339
continue
324340

325341
# Add word to current chain

nemo/collections/tts/modules/magpietts_inference/utils.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import torch
2929
from omegaconf import DictConfig, OmegaConf, open_dict
3030

31+
from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import CASELESS_SCRIPT_TOKENIZER_TARGETS
3132
from nemo.collections.tts.models import EasyMagpieTTSInferenceModel, MagpieTTSModel
3233
from nemo.utils import logging
3334

@@ -149,6 +150,24 @@ def validate(self) -> None:
149150
)
150151

151152

153+
def _migrate_charset_version(model_cfg: DictConfig) -> None:
    """Pin ``charset_version=1`` on Hindi/Arabic tokenizer configs that lack it.

    ``setup_tokenizers()`` persists ``charset_version`` for newly trained
    models; checkpoints created before that field existed must keep the
    legacy (v1) charset, otherwise the new default (v2) would silently
    remap token IDs and break the model at inference time.

    The caller must already hold ``open_dict(model_cfg)`` so the config
    accepts the new key.
    """
    if not hasattr(model_cfg, 'text_tokenizers'):
        return
    for name in model_cfg.text_tokenizers:
        cfg = model_cfg.text_tokenizers[name]
        # Only caseless-script tokenizers are versioned; ``getattr`` with a
        # ``None`` default is equivalent to the hasattr + membership check.
        target = getattr(cfg, '_target_', None)
        if target in CASELESS_SCRIPT_TOKENIZER_TARGETS and not hasattr(cfg, 'charset_version'):
            cfg.charset_version = 1
169+
170+
152171
def _migrate_tokenizer_punctuation(model_cfg: DictConfig) -> None:
153172
"""Backfill punctuation fields for tokenizers that predate them.
154173
@@ -203,6 +222,7 @@ def update_config_for_inference(
203222
model_cfg.codecmodel_path = codecmodel_path
204223

205224
_migrate_tokenizer_punctuation(model_cfg)
225+
_migrate_charset_version(model_cfg)
206226

207227
# Update text tokenizer paths for backward compatibility
208228
if hasattr(model_cfg, 'text_tokenizer'):

0 commit comments

Comments
 (0)