Skip to content

Commit 52ba833

Browse files
quaphamXuesongYangCopilot
authored
Add hi-IN, ko-KR and pt-BR IPA tokenizer support (#15567)
* feat(tts): extend IPA tokenizer with hi-IN/en code-switching and pt-BR * feat(tts): remove ar-MSA locale as out of scope for this PR Signed-off-by: quanpham <youngkwan199@gmail.com> * Apply isort and black reformatting Signed-off-by: quapham <quapham@users.noreply.github.com> * Add Korean IPA support Signed-off-by: quanpham <youngkwan199@gmail.com> * Fix leftover merge markers in Korean IPA support Signed-off-by: quanpham <youngkwan199@gmail.com> * Apply isort and black reformatting Signed-off-by: quapham <quapham@users.noreply.github.com> * fix: add KOREAN_CHARS import Signed-off-by: quanpham <youngkwan199@gmail.com> * Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * Apply suggestion from @XuesongYang Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * Apply suggestion from @XuesongYang Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * Update tests/collections/common/tokenizers/text_to_speech/test_tts_tokenizers.py Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * Apply suggestion from @XuesongYang Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * Apply suggestion from @XuesongYang Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * Apply suggestion from @XuesongYang Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * Apply suggestion from @XuesongYang Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * Apply suggestion from @XuesongYang Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * WIP: save local changes before rebasing * fix: update IPAG2p typing and docs Signed-off-by: quanpham <youngkwan199@gmail.com> * Apply isort and black reformatting Signed-off-by: quapham <quapham@users.noreply.github.com> * bugfix: unit test of hindi 
Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * refactor: introduce a combined constant WORD_CHARS_ALL. Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * fix: backward-compatible punctuation for pt-BR and hi-IN tokenizers Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * bugfix: L2_TTS_Fast_dev_runs_Magpietts_OnlineCFGDistillation.sh Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> --------- Signed-off-by: quanpham <youngkwan199@gmail.com> Signed-off-by: quapham <quapham@users.noreply.github.com> Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: quapham <quapham@users.noreply.github.com> Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 87ccac8 commit 52ba833

11 files changed

Lines changed: 1496984 additions & 53 deletions

File tree

nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,15 @@
1515

1616
# fmt: off
1717

18-
SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR", "vi-VN", "ja-JP", "hi-IN"]
18+
# TODO: pt-BR and ko-KR are missing from GRAPHEME_CHARACTER_SETS and IPA_CHARACTER_SETS below.
19+
# They work with IPATokenizer (which builds vocab from g2p.symbols), but get_grapheme_character_set()
20+
# and get_ipa_character_set() will raise ValueError for these locales until entries are added.
21+
# These functions are used by locale-specific tokenizers (e.g., HindiCharsTokenizer uses
22+
# get_grapheme_character_set("hi-IN")). If someone later creates PortugueseCharsTokenizer or
23+
# KoreanCharsTokenizer, they'd hit this.
24+
SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR", "vi-VN", "ja-JP", "hi-IN", "pt-BR", "ko-KR"]
1925

26+
# Derived from LJSpeech, with "/" added.
2027
DEFAULT_PUNCTUATION = (
2128
',', '.', '!', '?', '-',
2229
':', ';', '/', '"', '(',
@@ -240,7 +247,7 @@ def get_ipa_punctuation_list(locale):
240247
punct_set = set(DEFAULT_PUNCTUATION)
241248
# TODO @xueyang: verify potential mismatches with locale-specific punctuation sets used
242249
# in nemo_text_processing.text_normalization.en.taggers.punctuation.py
243-
if locale in ["de-DE", "es-ES", "it-IT", "fr-FR", "ja-JP"]:
250+
if locale in ["de-DE", "es-ES", "it-IT", "fr-FR", "ja-JP", "pt-BR"]:
244251
# ref: https://en.wikipedia.org/wiki/Guillemet#Uses
245252
punct_set.update(['«', '»', '‹', '›'])
246253
if locale == "de-DE":
@@ -347,5 +354,42 @@ def get_ipa_punctuation_list(locale):
347354
'・',
348355
]
349356
)
357+
elif locale == "hi-IN":
358+
punct_set.update(
359+
[
360+
'।',
361+
'॥',
362+
]
363+
)
364+
elif locale == "pt-BR":
365+
# ref: https://en.wikipedia.org/wiki/Portuguese_orthography#Punctuation
366+
# Guillemets (« » ‹ ›) are already added by the shared block above.
367+
punct_set.update(
368+
[
369+
'\u201c', # " left double quotation mark
370+
'\u201d', # " right double quotation mark
371+
'\u2018', # ' left single quotation mark
372+
'\u2019', # ' right single quotation mark
373+
'\u2013', # – en dash
374+
'\u2014', # — em dash
375+
'\u2026', # … horizontal ellipsis
376+
]
377+
)
378+
elif locale == "ko-KR":
379+
punct_set.update(
380+
[
381+
'『',
382+
'』',
383+
'「',
384+
'」',
385+
'《',
386+
'》',
387+
'…',
388+
'·',
389+
'—',
390+
'–',
391+
'〜',
392+
]
393+
)
350394
punct_list = sorted(list(punct_set))
351395
return punct_list

nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@
2929
"any_locale_word_tokenize",
3030
"english_word_tokenize",
3131
"LATIN_CHARS_ALL",
32+
"INDIC_CHARS_ALL",
33+
"KOREAN_CHARS",
34+
"WORD_CHARS_ALL",
3235
"normalize_unicode_text",
3336
"japanese_text_preprocessing",
3437
]
@@ -52,11 +55,32 @@
5255
LATIN_ALPHABET_BASIC = "A-Za-z"
5356
ACCENTED_CHARS = "À-ÖØ-öø-ÿ"
5457
LATIN_CHARS_ALL = f"{LATIN_ALPHABET_BASIC}{ACCENTED_CHARS}"
58+
59+
# Indic characters based on https://www.unicode.org/charts/
60+
# Hindi, Marathi, Nepali, Sanskrit https://en.wikipedia.org/wiki/Devanagari_(Unicode_block)
61+
DEVANAGARI_CHARS = (
62+
r'\u0900-\u0963\u0966-\u097F' # excluding danda (U+0964), double danda (U+0965) so they are treated as punctuation
63+
)
64+
BENGALI_CHARS = r'\u0980-\u09FF' # Bengali, Assamese
65+
TAMIL_CHARS = r'\u0B80-\u0BFF' # Tamil
66+
TELUGU_CHARS = r'\u0C00-\u0C7F' # Telugu
67+
KANNADA_CHARS = r'\u0C80-\u0CFF' # Kannada
68+
GUJARATI_CHARS = r'\u0A80-\u0AFF' # Gujarati
69+
INDIC_CHARS_ALL = f"{DEVANAGARI_CHARS}{BENGALI_CHARS}{TAMIL_CHARS}{TELUGU_CHARS}{KANNADA_CHARS}{GUJARATI_CHARS}"
70+
71+
# Korean
72+
# ref: https://en.wikipedia.org/wiki/Hangul_Syllables (U+AC00–U+D7A3)
73+
# ref: https://en.wikipedia.org/wiki/Hangul_Jamo_(Unicode_block) (U+1100–U+11FF)
74+
# ref: https://en.wikipedia.org/wiki/Hangul_Compatibility_Jamo (U+3130–U+318F)
75+
KOREAN_CHARS = r'\uAC00-\uD7A3\u1100-\u11FF\u3130-\u318F'
76+
77+
WORD_CHARS_ALL = f"{LATIN_CHARS_ALL}{INDIC_CHARS_ALL}{KOREAN_CHARS}"
78+
5579
_WORDS_RE_EN = re.compile(
5680
fr"([{LATIN_ALPHABET_BASIC}]+(?:[{LATIN_ALPHABET_BASIC}\-']*[{LATIN_ALPHABET_BASIC}]+)*)|(\|[^|]*\|)|([^{LATIN_ALPHABET_BASIC}|]+)"
5781
)
5882
_WORDS_RE_ANY_LOCALE = re.compile(
59-
fr"([{LATIN_CHARS_ALL}]+(?:[{LATIN_CHARS_ALL}\-']*[{LATIN_CHARS_ALL}]+)*)|(\|[^|]*\|)|([^{LATIN_CHARS_ALL}|]+)"
83+
fr"([{WORD_CHARS_ALL}]+(?:[{WORD_CHARS_ALL}\-']*[{WORD_CHARS_ALL}]+)*)|(\|[^|]*\|)|([^{WORD_CHARS_ALL}|]+)"
6084
)
6185

6286

nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py

Lines changed: 40 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import itertools
1717
import os
1818
import string
19+
import warnings
1920
from abc import ABC, abstractmethod
2021
from contextlib import contextmanager
2122
from typing import List, Optional, Union
@@ -24,10 +25,12 @@
2425
from transformers import PreTrainedTokenizerBase
2526

2627
from nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon import (
28+
DEFAULT_PUNCTUATION,
2729
get_grapheme_character_set,
2830
get_ipa_punctuation_list,
2931
validate_locale,
3032
)
33+
3134
from nemo.collections.common.tokenizers.text_to_speech.tokenizer_utils import (
3235
any_locale_text_preprocessing,
3336
chinese_text_preprocessing,
@@ -110,14 +113,7 @@ class BaseCharsTokenizer(BaseTokenizer):
110113
text_preprocessing_func: Text preprocessing function for correct execution of the tokenizer.
111114
"""
112115

113-
# fmt: off
114-
# TODO @xueyang: unify definition of the default PUNCT_LIST and import from ipa_lexicon.py
115-
PUNCT_LIST = ( # Derived from LJSpeech and "/" additionally
116-
',', '.', '!', '?', '-',
117-
':', ';', '/', '"', '(',
118-
')', '[', ']', '{', '}',
119-
)
120-
# fmt: on
116+
PUNCT_LIST = DEFAULT_PUNCTUATION
121117

122118
def __init__(
123119
self,
@@ -392,6 +388,12 @@ class HindiCharsTokenizer(BaseCharsTokenizer):
392388
if None then no blank in labels.
393389
pad_with_space: Whether to pad text with spaces at the beginning and at the end or not.
394390
non_default_punct_list: List of punctuation marks which will be used instead of the default.
391+
Overrides ``punct_version`` when explicitly provided.
392+
punct_version: Punctuation set version (default 2).
393+
2 — expanded set from ``get_ipa_punctuation_list("hi-IN")`` including dandas.
394+
1 — legacy ``sorted(list(DEFAULT_PUNCTUATION))`` without dandas; emits
395+
``DeprecationWarning`` and will be removed in a future release.
396+
Ignored when ``non_default_punct_list`` is explicitly provided.
395397
text_preprocessing_func: Text preprocessing function. Keeps Devanagari unchanged.
396398
397399
Each Unicode code point becomes 1 token (not visual grapheme clusters)
@@ -404,6 +406,7 @@ class HindiCharsTokenizer(BaseCharsTokenizer):
404406

405407
_LOCALE = "hi-IN"
406408
_PUNCT_LIST = get_ipa_punctuation_list(_LOCALE)
409+
_PUNCT_LIST_V1 = sorted(list(DEFAULT_PUNCTUATION))
407410
_CHARSET_STR = get_grapheme_character_set(locale=_LOCALE, case="mixed")
408411
_CHARSET_STR += string.ascii_lowercase
409412

@@ -414,9 +417,25 @@ def __init__(
414417
apostrophe=True,
415418
add_blank_at=None,
416419
pad_with_space=False,
417-
non_default_punct_list=_PUNCT_LIST,
420+
non_default_punct_list=None,
421+
punct_version=2,
418422
text_preprocessing_func=any_locale_text_preprocessing,
419423
):
424+
if non_default_punct_list is None:
425+
if punct_version == 1:
426+
warnings.warn(
427+
"HindiCharsTokenizer: punct_version=1 uses DEFAULT_PUNCTUATION without dandas "
428+
"and will be removed in a future release. Migrate to punct_version=2.",
429+
DeprecationWarning,
430+
stacklevel=2,
431+
)
432+
non_default_punct_list = self._PUNCT_LIST_V1
433+
elif punct_version == 2:
434+
non_default_punct_list = self._PUNCT_LIST
435+
else:
436+
raise ValueError(
437+
f"HindiCharsTokenizer: unsupported punct_version={punct_version}. Use 1 (legacy) or 2."
438+
)
420439
super().__init__(
421440
chars=chars,
422441
punct=punct,
@@ -471,14 +490,6 @@ class GermanPhonemesTokenizer(BaseCharsTokenizer):
471490
Currently, it only applies lower() function.
472491
"""
473492

474-
# fmt: off
475-
PUNCT_LIST = ( # Derived from LJSpeech and "/" additionally
476-
',', '.', '!', '?', '-',
477-
':', ';', '/', '"', '(',
478-
')', '[', ']', '{', '}',
479-
)
480-
# fmt: on
481-
482493
def __init__(
483494
self,
484495
punct=True,
@@ -628,12 +639,9 @@ class EnglishPhonemesTokenizer(BaseTokenizer):
628639
handled by g2p).
629640
"""
630641

642+
PUNCT_LIST = DEFAULT_PUNCTUATION
643+
631644
# fmt: off
632-
PUNCT_LIST = ( # Derived from LJSpeech and "/" additionally
633-
',', '.', '!', '?', '-',
634-
':', ';', '/', '"', '(',
635-
')', '[', ']', '{', '}',
636-
)
637645
VOWELS = (
638646
'AA', 'AE', 'AH', 'AO', 'AW',
639647
'AY', 'EH', 'ER', 'EY', 'IH',
@@ -773,10 +781,14 @@ class IPATokenizer(BaseTokenizer):
773781
Args:
774782
g2p: Grapheme to phoneme module, should be IpaG2p or some subclass thereof.
775783
locale: Locale used to determine default text processing logic and punctuation.
776-
Supports ["en-US", "de-DE", "es-ES", "fr-FR"]. Defaults to "en-US".
784+
See ``SUPPORTED_LOCALES`` in ``ipa_lexicon.py`` for the full list. Defaults to "en-US".
777785
Specify None if implementing custom logic for a new locale.
778786
punct: Whether to reserve grapheme for basic punctuation or not.
779787
non_default_punct_list: List of punctuation marks which will be used instead of the default, if any.
788+
locale_specific_punct: Whether to use locale-specific punctuation (via ``get_ipa_punctuation_list``)
789+
or only ``DEFAULT_PUNCTUATION``. Defaults to True. Set to False to preserve the token
790+
vocabulary of checkpoints trained before locale-specific punctuation was introduced.
791+
Currently only affects pt-BR. Ignored when ``non_default_punct_list`` is provided.
780792
fixed_vocab: List of valid grapheme/phoneme tokens for the model.
781793
Set only if overriding the default vocab generation process (reading from G2P dict).
782794
If set, any dataset entries that have unincluded graphemes will be filtered out, and any words whose
@@ -799,6 +811,7 @@ def __init__(
799811
locale="en-US",
800812
punct=True,
801813
non_default_punct_list=None,
814+
locale_specific_punct=True,
802815
fixed_vocab=None,
803816
*,
804817
space=' ',
@@ -851,8 +864,10 @@ def __init__(
851864
if punct:
852865
if non_default_punct_list is not None:
853866
self.punct_list = non_default_punct_list
854-
else:
867+
elif locale_specific_punct:
855868
self.punct_list = get_ipa_punctuation_list(locale)
869+
else:
870+
self.punct_list = sorted(list(DEFAULT_PUNCTUATION))
856871

857872
tokens.update(self.punct_list)
858873

@@ -964,14 +979,8 @@ class ChinesePhonemesTokenizer(BaseTokenizer):
964979
handled by g2p).
965980
"""
966981

967-
# fmt: off
968-
PUNCT_LIST = ( # Derived from LJSpeech and "/" additionally
969-
',', '.', '!', '?', '-',
970-
':', ';', '/', '"', '(',
971-
')', '[', ']', '{', '}',
972-
)
982+
PUNCT_LIST = DEFAULT_PUNCTUATION
973983
ZH_PUNCT_LIST = list(",。?!;:、‘’“”()【】「」《》") + list(PUNCT_LIST)
974-
# fmt: on
975984

976985
def __init__(
977986
self,

nemo/collections/tts/data/text_to_speech_dataset_lhotse.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
from hydra.utils import instantiate
2222
from lhotse import CutSet
2323
from lhotse.dataset.collation import collate_matrices, collate_vectors
24-
from omegaconf import DictConfig
24+
from omegaconf import DictConfig, open_dict
2525
from transformers import AutoTokenizer, T5Tokenizer
2626

2727
from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import AggregatedTTSTokenizer, IPABPETokenizer
@@ -48,6 +48,28 @@ def setup_tokenizers(all_tokenizers_config, mode='train'):
4848
text_tokenizer_kwargs = {}
4949
if "g2p" in tokenizer_config:
5050
text_tokenizer_kwargs["g2p"] = instantiate(tokenizer_config.g2p)
51+
# Ensure locale_specific_punct is persisted so it survives .nemo save/restore.
52+
# New training for locales with extended punctuation should use the full set (True).
53+
if (
54+
hasattr(tokenizer_config, '_target_')
55+
and tokenizer_config._target_
56+
== "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer"
57+
and tokenizer_config.get('locale', None) == "pt-BR"
58+
and not hasattr(tokenizer_config, 'non_default_punct_list')
59+
and not hasattr(tokenizer_config, 'locale_specific_punct')
60+
):
61+
with open_dict(tokenizer_config):
62+
tokenizer_config.locale_specific_punct = True
63+
# Persist punct_version=2 for HindiCharsTokenizer so .nemo save/restore
64+
# always uses the expanded punctuation set (with dandas).
65+
if (
66+
hasattr(tokenizer_config, '_target_')
67+
and tokenizer_config._target_
68+
== "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.HindiCharsTokenizer"
69+
and not hasattr(tokenizer_config, 'punct_version')
70+
):
71+
with open_dict(tokenizer_config):
72+
tokenizer_config.punct_version = 2
5173
tokenizer = instantiate(tokenizer_config, **text_tokenizer_kwargs)
5274
# TODO @xueyang: is it really necessary to set phone probability to 1.0 for test mode?
5375
if mode == 'test' and hasattr(tokenizer, "set_phone_prob"):

0 commit comments

Comments
 (0)