Merged
19 changes: 18 additions & 1 deletion nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py
@@ -21,7 +21,7 @@
# These functions are used by locale-specific tokenizers (e.g., HindiCharsTokenizer uses
# get_grapheme_character_set("hi-IN")). If someone later creates PortugueseCharsTokenizer or
# KoreanCharsTokenizer, they'd hit this.
SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR", "vi-VN", "ja-JP", "hi-IN", "pt-BR", "ko-KR"]
SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR", "vi-VN", "ja-JP", "hi-IN", "ar-MSA", "pt-BR", "ko-KR"]

# Derived from LJSpeech and "/" additionally
DEFAULT_PUNCTUATION = (
@@ -114,6 +114,15 @@
# Danda (period)
'।',
),
# ref: https://en.wikipedia.org/wiki/Arabic_alphabet
"ar-MSA": (
'ء', 'آ', 'أ', 'إ', 'ؤ', 'ئ', 'ا', 'ب', 'ة', 'ت',
'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش',
'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل',
'م', 'ن', 'ه', 'و', 'ى', 'ي',
# Diacritics
'ً', 'ٌ', 'ٍ', 'َ', 'ُ', 'ِ', 'ّ', 'ٰ', 'ْ',
),
}

IPA_CHARACTER_SETS = {
Expand Down Expand Up @@ -354,6 +363,14 @@ def get_ipa_punctuation_list(locale):
'・',
]
)
elif locale == "ar-MSA":
punct_set.update(
[
'،',
'؛',
'؟',
]
)
elif locale == "hi-IN":
punct_set.update(
[
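For reference, a minimal sketch (not part of the diff) of how the new "ar-MSA" entries are consumed through the ipa_lexicon helpers; the return types are inferred from their usage elsewhere in this PR:

from nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon import (
    get_grapheme_character_set,
    get_ipa_punctuation_list,
)

# Grapheme set later used by ArabicCharsTokenizer (case="upper" because Arabic has no case distinction).
arabic_chars = get_grapheme_character_set(locale="ar-MSA", case="upper")
# Punctuation list extended by this PR with the Arabic comma, semicolon, and question mark.
arabic_punct = get_ipa_punctuation_list("ar-MSA")

assert 'م' in arabic_chars   # base letter
assert 'ّ' in arabic_chars    # shadda diacritic
assert '؟' in arabic_punct   # Arabic question mark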
142 changes: 133 additions & 9 deletions nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py
@@ -44,6 +44,15 @@
from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
from nemo.utils import logging

CASELESS_SCRIPT_TOKENIZER_TARGETS = frozenset(
{
'nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.HindiCharsTokenizer',
'nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.ArabicCharsTokenizer',
}
)

DEFAULT_CHARSET_VERSION = 2


class BaseTokenizer(ABC):
"""Abstract class for creating an arbitrary tokenizer to convert string to list of int tokens.
Expand Down Expand Up @@ -164,9 +173,8 @@ def encode(self, text):
logging.warning(f"Text: [{text}] contains unknown char: [{c}]. Symbol will be skipped.")

# Remove trailing spaces
if cs:
while cs[-1] == space:
cs.pop()
while cs and cs[-1] == space:
cs.pop()

if self.pad_with_space:
cs = [space] + cs + [space]
@@ -382,6 +390,14 @@ def __init__(
class HindiCharsTokenizer(BaseCharsTokenizer):
"""Hindi grapheme tokenizer (character-based, no phonemes).
Args:
chars: Explicit character set string. When provided, ``charset_version`` is ignored.
charset_version: Controls which default character set to use (only when ``chars`` is None).
``2`` (default) — ``case="upper"`` Devanagari + ``ascii_letters``.
Hindi/Devanagari has no case distinction, so ``case="upper"`` avoids duplicating
every code-point. ``ascii_letters`` covers both upper- and lower-case English for
mixed-language text.
``1`` — legacy ``case="mixed"`` Devanagari + ``ascii_lowercase``. Use this value to
restore models that were trained before the charset fix.
punct: Whether to reserve grapheme for basic punctuation or not.
apostrophe: Whether to use apostrophe or not.
add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None),
@@ -406,13 +422,14 @@ class HindiCharsTokenizer(BaseCharsTokenizer):

_LOCALE = "hi-IN"
_PUNCT_LIST = get_ipa_punctuation_list(_LOCALE)
_CHARSET_STR = get_grapheme_character_set(locale=_LOCALE, case="upper") + string.ascii_letters
_PUNCT_LIST_V1 = sorted(list(DEFAULT_PUNCTUATION))
_CHARSET_STR = get_grapheme_character_set(locale=_LOCALE, case="mixed")
_CHARSET_STR += string.ascii_lowercase
_CHARSET_STR_V1 = get_grapheme_character_set(locale=_LOCALE, case="mixed") + string.ascii_lowercase

def __init__(
self,
chars=_CHARSET_STR,
chars=None,
charset_version=2,
punct=True,
apostrophe=True,
add_blank_at=None,
@@ -421,6 +438,22 @@ def __init__(
punct_version=2,
text_preprocessing_func=any_locale_text_preprocessing,
):
if chars is None:
if charset_version == 1:
warnings.warn(
"HindiCharsTokenizer charset_version=1 (case='mixed' + ascii_lowercase) is deprecated "
"and will be removed in a future release. "
"Migrate to charset_version=2 (case='upper' + ascii_letters) and retrain.",
DeprecationWarning,
stacklevel=2,
)
chars = self._CHARSET_STR_V1
elif charset_version == 2:
chars = self._CHARSET_STR
else:
raise ValueError(
f"HindiCharsTokenizer: unsupported charset_version={charset_version!r}. Use 1 (legacy) or 2."
)
if non_default_punct_list is None:
if punct_version == 1:
warnings.warn(
@@ -467,9 +500,100 @@ def encode(self, text):
logging.warning(f"Text: [{text}] contains unknown char: [{c}]. Symbol will be skipped.")

# Remove trailing spaces
if cs:
while cs[-1] == space:
cs.pop()
while cs and cs[-1] == space:
cs.pop()

if self.pad_with_space:
cs = [space] + cs + [space]

return [self._token2id[p] for p in cs]


class ArabicCharsTokenizer(BaseCharsTokenizer):
"""Arabic grapheme tokenizer (character-based, no phonemes).
Args:
chars: Explicit character set string. When provided, ``charset_version`` is ignored.
charset_version: Controls which default character set to use (only when ``chars`` is None).
``2`` (default) — ``case="upper"`` Arabic + ``ascii_letters``.
Arabic script has no case distinction, so ``case="upper"`` avoids duplicating
every code-point. ``ascii_letters`` covers both upper- and lower-case English for
mixed-language text.
``1`` — legacy ``case="mixed"`` Arabic + ``ascii_letters``. Use this value to
restore models that were trained before the charset fix.
punct: Whether to reserve grapheme for basic punctuation or not.
apostrophe: Whether to use apostrophe or not.
add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None),
if None then no blank in labels.
pad_with_space: Whether to pad text with spaces at the beginning and at the end or not.
non_default_punct_list: List of punctuation marks which will be used instead of the default.
text_preprocessing_func: Text preprocessing function. Keeps Arabic unchanged.

Each Unicode code point becomes 1 token (letters, diacritics, and Arabic punct from ipa_lexicon).
Supports both upper and lower English letters (e.g. mixed-language text).

Input Text: مرحبا Hello
Chars: ['م', 'ر', 'ح', 'ب', 'ا', ' ', 'H', 'e', 'l', 'l', 'o']
"""

_LOCALE = "ar-MSA"
_PUNCT_LIST = get_ipa_punctuation_list(_LOCALE)
_CHARSET_STR = get_grapheme_character_set(locale=_LOCALE, case="upper") + string.ascii_letters
_CHARSET_STR_V1 = get_grapheme_character_set(locale=_LOCALE, case="mixed") + string.ascii_letters

def __init__(
self,
chars=None,
charset_version=2,
punct=True,
apostrophe=True,
add_blank_at=None,
pad_with_space=False,
non_default_punct_list=_PUNCT_LIST,
text_preprocessing_func=any_locale_text_preprocessing,
):
if chars is None:
if charset_version == 1:
warnings.warn(
"ArabicCharsTokenizer charset_version=1 (case='mixed' + ascii_letters) is deprecated "
"and will be removed in a future release. "
"Migrate to charset_version=2 (case='upper' + ascii_letters) and retrain.",
DeprecationWarning,
stacklevel=2,
)
chars = self._CHARSET_STR_V1
elif charset_version == 2:
chars = self._CHARSET_STR
else:
raise ValueError(
f"ArabicCharsTokenizer: unsupported charset_version={charset_version!r}. Use 1 (legacy) or 2."
)
super().__init__(
chars=chars,
punct=punct,
apostrophe=apostrophe,
add_blank_at=add_blank_at,
pad_with_space=pad_with_space,
non_default_punct_list=non_default_punct_list,
text_preprocessing_func=text_preprocessing_func,
)

def encode(self, text):
"""Encode Arabic text, handling diacritics and English (upper/lower) correctly."""
cs, space, tokens = [], self.tokens[self.space], set(self.tokens)

text = self.text_preprocessing_func(text)
for c in text:
if c == space and len(cs) > 0 and cs[-1] != space:
cs.append(c)
elif c in tokens and c != space:
cs.append(c)
elif (c in self.PUNCT_LIST) and self.punct:
cs.append(c)
elif c != space:
logging.warning(f"Text: [{text}] contains unknown char: [{c}]. Symbol will be skipped.")

while cs and cs[-1] == space:
cs.pop()

if self.pad_with_space:
cs = [space] + cs + [space]
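A minimal usage sketch (assuming a NeMo build with this PR applied); the expected token count follows the docstring example above:

from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import (
    ArabicCharsTokenizer,
    HindiCharsTokenizer,
)

tok = ArabicCharsTokenizer()             # default charset_version=2: Arabic graphemes + ascii_letters
ids = tok.encode("مرحبا Hello")          # mixed Arabic/English, one token per code point
assert len(ids) == 11                    # ['م', 'ر', 'ح', 'ب', 'ا', ' ', 'H', 'e', 'l', 'l', 'o']

legacy = HindiCharsTokenizer(charset_version=1)  # emits a DeprecationWarning, restores the v1 charset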
19 changes: 18 additions & 1 deletion nemo/collections/tts/data/text_to_speech_dataset_lhotse.py
@@ -24,7 +24,12 @@
from omegaconf import DictConfig, open_dict
from transformers import AutoTokenizer, T5Tokenizer

from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import AggregatedTTSTokenizer, IPABPETokenizer
from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import (
CASELESS_SCRIPT_TOKENIZER_TARGETS,
DEFAULT_CHARSET_VERSION,
AggregatedTTSTokenizer,
IPABPETokenizer,
)
from nemo.collections.tts.parts.utils.tts_dataset_utils import (
beta_binomial_prior_distribution,
normalize_volume,
@@ -74,6 +79,18 @@ def setup_tokenizers(all_tokenizers_config, mode='train'):
# TODO @xueyang: is it really necessary to set phone probability to 1.0 for test mode?
if mode == 'test' and hasattr(tokenizer, "set_phone_prob"):
tokenizer.set_phone_prob(1.0)

# Persist charset_version so it's saved in .nemo archives and
# update_config_for_inference can distinguish old checkpoints
# (missing charset_version → v1) from new ones.
if (
hasattr(tokenizer_config, '_target_')
and tokenizer_config._target_ in CASELESS_SCRIPT_TOKENIZER_TARGETS
and not hasattr(tokenizer_config, 'charset_version')
):
with open_dict(all_tokenizers_config):
tokenizer_config.charset_version = DEFAULT_CHARSET_VERSION
Comment on lines +83 to +92
Collaborator:
Instead of persisting charset_version, just persist the chars argument.

  1. Use tokenizer_config._target_ to get the class
  2. Then add tokenizer_config.chars = cls._CHARSET_STR

Collaborator:
Suggested change (replace the charset_version persistence block above with):

from importlib import import_module
mod = import_module("nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers")
cls = getattr(mod, tokenizer_config._target_.split(".")[-1])
tokenizer_config.chars = cls._CHARSET_STR

Collaborator:
^ Something like that, as a high-level pseudo-code guide.

Collaborator:
Thanks for bringing it up. I did consider whether to persist chars vs charset_version, i.e. a long string vs a single integer, when implementing the backward-compatibility support, and my final decision went with charset_version. Here is why:

Persisting chars is certainly human-readable, but it would add long Unicode strings that make configs hard to read and diff. For example, the Hindi charset string is hundreds of Unicode code points, which would bloat every config/log dump, compared against a single integer charset_version. We should be fine with charset_version added to the .nemo config, plus a clear docstring.

That said, we should avoid scenarios like this that complicate the design in the long run.

Collaborator:
On second thought, charset_version is still much better than chars, so we'd better keep this implementation.

  1. Easy config/CLI override — charset_version=1 is a single integer in YAML or Hydra CLI. The Hindi v1 charset is 172 Unicode characters including zero-width combining marks (्, ़, ँ); inserting that as a chars string into a config is awkward and error-prone.
  2. Concrete example — this very PR shows the benefit. Fixing the CFG distillation test (L2_TTS_Fast_dev_runs_Magpietts_OnlineCFGDistillation.sh) was just adding one line: +model.text_tokenizers.hindi_chartokenizer.charset_version=1. With chars, that line would be a 172-char Unicode blob.
  3. chars still works as an escape hatch — the if chars is None: guard means anyone who needs a truly custom charset can still pass chars directly and charset_version is ignored.

Contributor Author:
You’re right, keeping the config cleaner is better if we can. If the change already covers this safely, I’m happy to keep the config clean and rely on that instead.

Collaborator:
I'm OK to merge this as is. But in the future, we should refactor this default version to be part of the class, not a top-level global in the file.


tokenizers.append(tokenizer)
tokenizer_names.append(tokenizer_name)

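For illustration, a minimal sketch of the two override paths discussed in the thread above (key names follow the example quoted there; the custom charset below is hypothetical):

# 1) Pin the legacy charset with a single integer, in YAML or on the Hydra CLI:
#      +model.text_tokenizers.hindi_chartokenizer.charset_version=1
# 2) Or bypass versioning entirely via the chars escape hatch; charset_version is then ignored:
from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import HindiCharsTokenizer

custom_tok = HindiCharsTokenizer(chars="अआइईउऊ" + "abcdefghijklmnopqrstuvwxyz")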
36 changes: 26 additions & 10 deletions nemo/collections/tts/g2p/models/ja_jp_ipa.py
@@ -298,28 +298,44 @@ def __call__(self, text: str) -> List[str]:
acc = word.get('acc', 0)

string = unicodedata.normalize('NFKC', string)
pos_group1 = word.get('pos_group1', '')

# Handle English letters
# If string is pure ASCII letters after NFKC normalization, decide based on pos:
# - pos='フィラー' means OpenJTalk didn't recognize the word (just spelled it out) → keep Latin
# - any other pos (e.g. '名詞') means OpenJTalk recognized it as a loanword → use katakana pron
if string and all(c in self.ascii_letter_dict for c in string):
if current_chain:
self._process_chain(current_chain, result)
current_chain = []

result.extend(list(string))
continue
if pos == 'フィラー':
if current_chain:
self._process_chain(current_chain, result)
current_chain = []
result.extend(list(string))
continue

# Handle punctuation
if pos in ('記号', '補助記号'):
# Handle punctuation (記号), but keep alphabet symbols (アルファベット) as regular words
if pos in ('記号', '補助記号') and pos_group1 != 'アルファベット':
if current_chain:
self._process_chain(current_chain, result)
current_chain = []
if string.isspace():
result.append(' ')
if not result or result[-1] != ' ':
result.append(' ')
elif string in punctuation:
result.append(string)
else:
logging.warning(
f"Unknown symbol '{string}' (pos={pos}) not in punctuation list, replacing with space. original text: {text}"
)
if not result or result[-1] != ' ':
result.append(' ')
continue

if not pron or mora_size == 0:
if string and not string.isspace():
logging.warning(
f"Unknown symbol '{string}' (pos={pos}) not in punctuation list, replacing with space. original text: {text}"
)
XuesongYang marked this conversation as resolved.
if not result or result[-1] != ' ':
result.append(' ')
continue

# Add word to current chain
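A hypothetical illustration of the branching above (the word dicts and readings are assumptions, not actual OpenJTalk output): a pure-ASCII token keeps its Latin letters only when tagged as a filler; otherwise its katakana reading is used.

# Assumed shape of per-word entries; only the fields the branch above reads are shown.
words = [
    {"string": "VR", "pos": "名詞", "pron": "ブイアール", "mora_size": 5},   # recognized loanword
    {"string": "asdf", "pos": "フィラー", "pron": "", "mora_size": 0},       # unrecognized, spelled out
]
for w in words:
    s = w["string"]
    if s and s.isascii() and s.isalpha():        # stand-in for the ascii_letter_dict check
        if w["pos"] == "フィラー":
            print(list(s))                       # ['a', 's', 'd', 'f'] -> kept as Latin letters
        else:
            print(w["pron"])                     # ブイアール -> katakana pronunciation is used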
20 changes: 20 additions & 0 deletions nemo/collections/tts/modules/magpietts_inference/utils.py
@@ -28,6 +28,7 @@
import torch
from omegaconf import DictConfig, OmegaConf, open_dict

from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import CASELESS_SCRIPT_TOKENIZER_TARGETS
from nemo.collections.tts.models import EasyMagpieTTSInferenceModel, MagpieTTSModel
from nemo.utils import logging

@@ -149,6 +150,24 @@ def validate(self) -> None:
)


def _migrate_charset_version(model_cfg: DictConfig) -> None:
"""Pin charset_version=1 for Hindi/Arabic tokenizers in old checkpoints.

New models have ``charset_version`` persisted by ``setup_tokenizers()``.
Old checkpoints lack it, so without this migration the new default (v2)
would silently change the token-to-ID mapping and break the model.

Must be called inside ``open_dict(model_cfg)``.
"""
blisc marked this conversation as resolved.
if not hasattr(model_cfg, 'text_tokenizers'):
return
for tok_name in model_cfg.text_tokenizers:
tok_cfg = model_cfg.text_tokenizers[tok_name]
if hasattr(tok_cfg, '_target_') and tok_cfg._target_ in CASELESS_SCRIPT_TOKENIZER_TARGETS:
if not hasattr(tok_cfg, 'charset_version'):
tok_cfg.charset_version = 1


def _migrate_tokenizer_punctuation(model_cfg: DictConfig) -> None:
"""Backfill punctuation fields for tokenizers that predate them.

@@ -203,6 +222,7 @@ def update_config_for_inference(
model_cfg.codecmodel_path = codecmodel_path

_migrate_tokenizer_punctuation(model_cfg)
_migrate_charset_version(model_cfg)

# Update text tokenizer paths for backward compatibility
if hasattr(model_cfg, 'text_tokenizer'):
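A minimal sketch (config shape assumed) of the migration path described above; per the docstrings, update_config_for_inference runs this inside open_dict so that an old checkpoint config without charset_version is pinned to the legacy v1 charset:

from omegaconf import OmegaConf, open_dict

old_cfg = OmegaConf.create(
    {
        "text_tokenizers": {
            "hindi_chartokenizer": {
                "_target_": "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.HindiCharsTokenizer",
            },
        },
    }
)
OmegaConf.set_struct(old_cfg, True)   # mirror a restored checkpoint config
with open_dict(old_cfg):              # required by the docstring above
    _migrate_charset_version(old_cfg)
# Per the docstring, the Hindi tokenizer is now pinned to charset_version=1,
# so the new v2 default cannot silently change its token-to-ID mapping.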