Skip to content

Commit 5662a32

Browse files
committed
fix: add charset_version to Hindi/Arabic tokenizers for backward compatibility
Introduce a `charset_version` parameter in HindiCharsTokenizer and ArabicCharsTokenizer so old models (v1: case='mixed') keep working while new models train with the corrected charset (v2: case='upper'). - Define CASELESS_SCRIPT_TOKENIZER_TARGETS and DEFAULT_CHARSET_VERSION constants in tts_tokenizers.py - Persist charset_version into the OmegaConf config during training (setup_tokenizers) so .nemo archives record which version was used - Add _migrate_charset_version() helper in magpietts inference utils to pin charset_version=1 for old checkpoints that lack the field, preventing a silent vocabulary mismatch at inference time Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
1 parent 4762ab3 commit 5662a32

3 files changed

Lines changed: 49 additions & 2 deletions

File tree

nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,15 @@
4242
from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
4343
from nemo.utils import logging
4444

45+
CASELESS_SCRIPT_TOKENIZER_TARGETS = frozenset(
46+
{
47+
'nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.HindiCharsTokenizer',
48+
'nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.ArabicCharsTokenizer',
49+
}
50+
)
51+
52+
DEFAULT_CHARSET_VERSION = 2
53+
4554

4655
class BaseTokenizer(ABC):
4756
"""Abstract class for creating an arbitrary tokenizer to convert string to list of int tokens.

nemo/collections/tts/data/text_to_speech_dataset_lhotse.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,15 @@
2121
from hydra.utils import instantiate
2222
from lhotse import CutSet
2323
from lhotse.dataset.collation import collate_matrices, collate_vectors
24-
from omegaconf import DictConfig
24+
from omegaconf import DictConfig, open_dict
2525
from transformers import AutoTokenizer, T5Tokenizer
2626

27-
from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import AggregatedTTSTokenizer, IPABPETokenizer
27+
from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import (
28+
CASELESS_SCRIPT_TOKENIZER_TARGETS,
29+
DEFAULT_CHARSET_VERSION,
30+
AggregatedTTSTokenizer,
31+
IPABPETokenizer,
32+
)
2833
from nemo.collections.tts.parts.utils.tts_dataset_utils import (
2934
beta_binomial_prior_distribution,
3035
normalize_volume,
@@ -52,6 +57,18 @@ def setup_tokenizers(all_tokenizers_config, mode='train'):
5257
# TODO @xueyang: is it really necessary to set phone probability to 1.0 for test mode?
5358
if mode == 'test' and hasattr(tokenizer, "set_phone_prob"):
5459
tokenizer.set_phone_prob(1.0)
60+
61+
# Persist charset_version so it's saved in .nemo archives and
62+
# update_config_for_inference can distinguish old checkpoints
63+
# (missing charset_version → v1) from new ones.
64+
if (
65+
hasattr(tokenizer_config, '_target_')
66+
and tokenizer_config._target_ in CASELESS_SCRIPT_TOKENIZER_TARGETS
67+
and not hasattr(tokenizer_config, 'charset_version')
68+
):
69+
with open_dict(all_tokenizers_config):
70+
tokenizer_config.charset_version = DEFAULT_CHARSET_VERSION
71+
5572
tokenizers.append(tokenizer)
5673
tokenizer_names.append(tokenizer_name)
5774

nemo/collections/tts/modules/magpietts_inference/utils.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import torch
2929
from omegaconf import DictConfig, OmegaConf, open_dict
3030

31+
from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import CASELESS_SCRIPT_TOKENIZER_TARGETS
3132
from nemo.collections.tts.models import EasyMagpieTTSInferenceModel, MagpieTTSModel
3233
from nemo.utils import logging
3334

@@ -149,6 +150,24 @@ def validate(self) -> None:
149150
)
150151

151152

153+
def _migrate_charset_version(model_cfg: DictConfig) -> None:
154+
"""Pin charset_version=1 for Hindi/Arabic tokenizers in old checkpoints.
155+
156+
New models have ``charset_version`` persisted by ``setup_tokenizers()``.
157+
Old checkpoints lack it, so without this migration the new default (v2)
158+
would silently change the token-to-ID mapping and break the model.
159+
160+
Must be called inside ``open_dict(model_cfg)``.
161+
"""
162+
if not hasattr(model_cfg, 'text_tokenizers'):
163+
return
164+
for tok_name in model_cfg.text_tokenizers:
165+
tok_cfg = model_cfg.text_tokenizers[tok_name]
166+
if hasattr(tok_cfg, '_target_') and tok_cfg._target_ in CASELESS_SCRIPT_TOKENIZER_TARGETS:
167+
if not hasattr(tok_cfg, 'charset_version'):
168+
tok_cfg.charset_version = 1
169+
170+
152171
def update_config_for_inference(
153172
model_cfg: DictConfig,
154173
codecmodel_path: Optional[str],
@@ -223,6 +242,8 @@ def update_config_for_inference(
223242
model_cfg.forced_context_audio_eos_id = num_audio_tokens - 1
224243
model_cfg.forced_context_audio_bos_id = num_audio_tokens - 2
225244

245+
_migrate_charset_version(model_cfg)
246+
226247
# Extract and remove sample_rate (now in model class)
227248
sample_rate = None
228249
if hasattr(model_cfg, 'sample_rate'):

0 commit comments

Comments
 (0)