Skip to content

Commit 5662a32

Browse files
committed
fix: add charset_version to Hindi/Arabic tokenizers for backward compatibility
Introduce a `charset_version` parameter in HindiCharsTokenizer and ArabicCharsTokenizer so old models (v1: case='mixed') keep working while new models train with the corrected charset (v2: case='upper'). - Define CASELESS_SCRIPT_TOKENIZER_TARGETS and DEFAULT_CHARSET_VERSION constants in tts_tokenizers.py - Persist charset_version into the OmegaConf config during training (setup_tokenizers) so .nemo archives record which version was used - Add _migrate_charset_version() helper in magpietts inference utils to pin charset_version=1 for old checkpoints that lack the field, preventing a silent vocabulary mismatch at inference time Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
1 parent 4762ab3 commit 5662a32

3 files changed

Lines changed: 49 additions & 2 deletions

File tree

nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,15 @@
4242
from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
4343
from nemo.utils import logging
4444

45+
CASELESS_SCRIPT_TOKENIZER_TARGETS = frozenset(
46+
{
47+
'nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.HindiCharsTokenizer',
48+
'nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.ArabicCharsTokenizer',
49+
}
50+
)
51+
52+
DEFAULT_CHARSET_VERSION = 2
53+
4554

4655
class BaseTokenizer(ABC):
4756
"""Abstract class for creating an arbitrary tokenizer to convert string to list of int tokens.

nemo/collections/tts/data/text_to_speech_dataset_lhotse.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,15 @@
2121
from hydra.utils import instantiate
2222
from lhotse import CutSet
2323
from lhotse.dataset.collation import collate_matrices, collate_vectors
24-
from omegaconf import DictConfig
24+
from omegaconf import DictConfig, open_dict
2525
from transformers import AutoTokenizer, T5Tokenizer
2626

27-
from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import AggregatedTTSTokenizer, IPABPETokenizer
27+
from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import (
28+
CASELESS_SCRIPT_TOKENIZER_TARGETS,
29+
DEFAULT_CHARSET_VERSION,
30+
AggregatedTTSTokenizer,
31+
IPABPETokenizer,
32+
)
2833
from nemo.collections.tts.parts.utils.tts_dataset_utils import (
2934
beta_binomial_prior_distribution,
3035
normalize_volume,
@@ -52,6 +57,18 @@ def setup_tokenizers(all_tokenizers_config, mode='train'):
5257
# TODO @xueyang: is it really necessary to set phone probability to 1.0 for test mode?
5358
if mode == 'test' and hasattr(tokenizer, "set_phone_prob"):
5459
tokenizer.set_phone_prob(1.0)
60+
61+
# Persist charset_version so it's saved in .nemo archives and
62+
# update_config_for_inference can distinguish old checkpoints
63+
# (missing charset_version → v1) from new ones.
64+
if (
65+
hasattr(tokenizer_config, '_target_')
66+
and tokenizer_config._target_ in CASELESS_SCRIPT_TOKENIZER_TARGETS
67+
and not hasattr(tokenizer_config, 'charset_version')
68+
):
69+
with open_dict(all_tokenizers_config):
70+
tokenizer_config.charset_version = DEFAULT_CHARSET_VERSION
71+
5572
tokenizers.append(tokenizer)
5673
tokenizer_names.append(tokenizer_name)
5774

nemo/collections/tts/modules/magpietts_inference/utils.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import torch
2929
from omegaconf import DictConfig, OmegaConf, open_dict
3030

31+
from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import CASELESS_SCRIPT_TOKENIZER_TARGETS
3132
from nemo.collections.tts.models import EasyMagpieTTSInferenceModel, MagpieTTSModel
3233
from nemo.utils import logging
3334

@@ -149,6 +150,24 @@ def validate(self) -> None:
149150
)
150151

151152

153+
def _migrate_charset_version(model_cfg: DictConfig) -> None:
154+
"""Pin charset_version=1 for Hindi/Arabic tokenizers in old checkpoints.
155+
156+
New models have ``charset_version`` persisted by ``setup_tokenizers()``.
157+
Old checkpoints lack it, so without this migration the new default (v2)
158+
would silently change the token-to-ID mapping and break the model.
159+
160+
Must be called inside ``open_dict(model_cfg)``.
161+
"""
162+
if not hasattr(model_cfg, 'text_tokenizers'):
163+
return
164+
for tok_name in model_cfg.text_tokenizers:
165+
tok_cfg = model_cfg.text_tokenizers[tok_name]
166+
if hasattr(tok_cfg, '_target_') and tok_cfg._target_ in CASELESS_SCRIPT_TOKENIZER_TARGETS:
167+
if not hasattr(tok_cfg, 'charset_version'):
168+
tok_cfg.charset_version = 1
169+
170+
152171
def update_config_for_inference(
153172
model_cfg: DictConfig,
154173
codecmodel_path: Optional[str],
@@ -223,6 +242,8 @@ def update_config_for_inference(
223242
model_cfg.forced_context_audio_eos_id = num_audio_tokens - 1
224243
model_cfg.forced_context_audio_bos_id = num_audio_tokens - 2
225244

245+
_migrate_charset_version(model_cfg)
246+
226247
# Extract and remove sample_rate (now in model class)
227248
sample_rate = None
228249
if hasattr(model_cfg, 'sample_rate'):

0 commit comments

Comments
 (0)