Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 24 additions & 1 deletion nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

# fmt: off

SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR", "vi-VN", "ja-JP", "hi-IN"]
SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR", "vi-VN", "ja-JP", "hi-IN", "pt-BR", "ko-KR"]

DEFAULT_PUNCTUATION = (
',', '.', '!', '?', '-',
Expand Down Expand Up @@ -347,5 +347,28 @@ def get_ipa_punctuation_list(locale):
'・',
]
)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
)
),
elif locale == "pt-BR":
# ref: https://en.wikipedia.org/wiki/Portuguese_orthography#Punctuation
punct_set.update(
[
'—', # em dash, U+2014 (used for dialogue in Brazilian Portuguese)
'–', # en dash, U+2013
'…', # horizontal ellipsis, U+2026
'\u201C', # left double quotation mark, U+201C
'\u201D', # right double quotation mark, U+201D
]
)

elif locale == "hi-IN":
punct_set.update(
[
'।',
'॥',
]
)
elif locale == "ko-KR":
punct_set.update(
[
'『',
'』',
'「',
'」',
'《',
'》',
'…',
'·',
'—',
'–',
'〜',
]
)
punct_list = sorted(list(punct_set))
return punct_list
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
"any_locale_word_tokenize",
"english_word_tokenize",
"LATIN_CHARS_ALL",
"INDIC_CHARS_ALL",
"KOREAN_CHARS",
"normalize_unicode_text",
"japanese_text_preprocessing",
]
Expand All @@ -52,11 +54,30 @@
LATIN_ALPHABET_BASIC = "A-Za-z"
ACCENTED_CHARS = "À-ÖØ-öø-ÿ"
LATIN_CHARS_ALL = f"{LATIN_ALPHABET_BASIC}{ACCENTED_CHARS}"

# Indic characters based on https://www.unicode.org/charts/
# Hindi, Marathi, Nepali, Sanskrit https://en.wikipedia.org/wiki/Devanagari_(Unicode_block)
DEVANAGARI_CHARS = (
r'\u0900-\u0963\u0966-\u097F' # excluding danda (U+0964), double danda (U+0965) so they are treated as punctuation
)
BENGALI_CHARS = r'\u0980-\u09FF' # Bengali, Assamese
TAMIL_CHARS = r'\u0B80-\u0BFF' # Tamil
TELUGU_CHARS = r'\u0C00-\u0C7F' # Telugu
KANNADA_CHARS = r'\u0C80-\u0CFF' # Kannada
GUJARATI_CHARS = r'\u0A80-\u0AFF' # Gujarati
INDIC_CHARS_ALL = f"{DEVANAGARI_CHARS}{BENGALI_CHARS}{TAMIL_CHARS}{TELUGU_CHARS}{KANNADA_CHARS}{GUJARATI_CHARS}"

# Korean
# ref: https://en.wikipedia.org/wiki/Hangul_Syllables (U+AC00–U+D7A3)
# ref: https://en.wikipedia.org/wiki/Hangul_Jamo_(Unicode_block) (U+1100–U+11FF)
# ref: https://en.wikipedia.org/wiki/Hangul_Compatibility_Jamo (U+3130–U+318F)
KOREAN_CHARS = r'\uAC00-\uD7A3\u1100-\u11FF\u3130-\u318F'

_WORDS_RE_EN = re.compile(
fr"([{LATIN_ALPHABET_BASIC}]+(?:[{LATIN_ALPHABET_BASIC}\-']*[{LATIN_ALPHABET_BASIC}]+)*)|(\|[^|]*\|)|([^{LATIN_ALPHABET_BASIC}|]+)"
)
_WORDS_RE_ANY_LOCALE = re.compile(
fr"([{LATIN_CHARS_ALL}]+(?:[{LATIN_CHARS_ALL}\-']*[{LATIN_CHARS_ALL}]+)*)|(\|[^|]*\|)|([^{LATIN_CHARS_ALL}|]+)"
fr"([{LATIN_CHARS_ALL}{INDIC_CHARS_ALL}{KOREAN_CHARS}]+(?:[{LATIN_CHARS_ALL}{INDIC_CHARS_ALL}{KOREAN_CHARS}\-']*[{LATIN_CHARS_ALL}{INDIC_CHARS_ALL}{KOREAN_CHARS}]+)*)|(\|[^|]*\|)|([^{LATIN_CHARS_ALL}{INDIC_CHARS_ALL}{KOREAN_CHARS}|]+)"
)


Expand Down
68 changes: 53 additions & 15 deletions nemo/collections/tts/g2p/models/i18n_ipa.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@

from nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon import validate_locale
from nemo.collections.common.tokenizers.text_to_speech.tokenizer_utils import (
INDIC_CHARS_ALL,
KOREAN_CHARS,
LATIN_CHARS_ALL,
any_locale_word_tokenize,
english_word_tokenize,
Expand All @@ -29,18 +31,24 @@
from nemo.collections.tts.g2p.utils import GRAPHEME_CASE_MIXED, GRAPHEME_CASE_UPPER, set_grapheme_case
from nemo.utils import logging

# Compiled regex pattern for Indic scripts (used in dictionary parsing)
_INDIC_PATTERN = re.compile(f'^[{INDIC_CHARS_ALL}]')
_KOREAN_PATTERN = re.compile(f'^[{KOREAN_CHARS}]')


class IpaG2p(BaseG2p):
# fmt: off
STRESS_SYMBOLS = ["ˈ", "ˌ"]
# Regex for roman characters, accented characters, and locale-agnostic numbers/digits
CHAR_REGEX = re.compile(fr"[{LATIN_CHARS_ALL}\d]")
PUNCT_REGEX = re.compile(fr"[^{LATIN_CHARS_ALL}\d]")
CHAR_REGEX = re.compile(fr"[{LATIN_CHARS_ALL}{INDIC_CHARS_ALL}{KOREAN_CHARS}\d]")
PUNCT_REGEX = re.compile(fr"[^{LATIN_CHARS_ALL}{INDIC_CHARS_ALL}{KOREAN_CHARS}\d]")
# fmt: on

def __init__(
self,
phoneme_dict: Union[str, pathlib.Path, Dict[str, List[List[str]]]],
phoneme_dict: Union[
str, pathlib.Path, List[Union[str, pathlib.Path, Dict[str, List[List[str]]]]], Dict[str, List[List[str]]]
],
locale: str = "en-US",
Comment on lines 47 to 52
Copy link

Copilot AI Apr 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The phoneme_dict type annotation doesn't match the supported runtime behavior: the Hindi unit test passes a list of dicts for code-switching, but the annotation only allows List[Union[str, Path]]. This will trip static type checking and makes the API contract unclear; broaden the union to allow lists containing dicts (or use a Sequence[...]) and update the parameter docstring accordingly (also remove the stale commented-out type line).

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Collaborator

@XuesongYang XuesongYang Apr 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with Copilot's comment. We need to remove the stale commented-out type line and fix the typing. The same annotation appears in three places: `__init__`, `_parse_phoneme_dict`, and `replace_dict`.

The type List[Union[str, pathlib.Path]] doesn't reflect the actual runtime behavior. The Hindi test passes [self.PHONEME_DICT_HI, self.PHONEME_DICT_EN], which is a list of dicts. The recursive call in _parse_phoneme_dict handles this correctly at runtime, but the type annotation is misleading.

Suggested change
def __init__(
self,
phoneme_dict: Union[str, pathlib.Path, Dict[str, List[List[str]]]],
# phoneme_dict: Union[str, pathlib.Path, Dict[str, List[List[str]]]],
phoneme_dict: Union[str, pathlib.Path, List[Union[str, pathlib.Path]], Dict[str, List[List[str]]]],
locale: str = "en-US",
def __init__(
self,
phoneme_dict: Union[
str, pathlib.Path, List[Union[str, pathlib.Path, Dict[str, List[List[str]]]]], Dict[str, List[List[str]]]
],

apply_to_oov_word: Optional[Callable[[str], str]] = None,
ignore_ambiguous_words: bool = True,
Expand All @@ -59,10 +67,13 @@ def __init__(
`apply_to_oov_word` for handling.

Args:
phoneme_dict (str, Path, or Dict): Path to file in CMUdict format or an IPA dict object with CMUdict-like
entries. For example,
a dictionary file: scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.06.txt;
a dictionary object: {..., "Wire": [["ˈ", "w", "a", "ɪ", "ɚ"], ["ˈ", "w", "a", "ɪ", "ɹ"]], ...}.
phoneme_dict: A single phoneme dictionary source or a list of sources for multi-dictionary
code-switching (e.g. Hindi + English). Each source can be:
- a file path (str or pathlib.Path) in CMUdict format,
e.g. ``scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.06.txt``
- a dict object with CMUdict-like entries,
e.g. ``{"Wire": [["ˈ", "w", "a", "ɪ", "ɚ"], ["ˈ", "w", "a", "ɪ", "ɹ"]]}``
When a list is provided, all sources are parsed and merged into a single dictionary.
locale (str): Locale used to determine a locale-specific tokenization logic. Currently, it supports "en-US",
"de-DE", and "es-ES". Defaults to "en-US". Specify None if implementing custom logic for a new locale.
apply_to_oov_word (Callable): Function that deals with the out-of-vocabulary (OOV) words that do not exist
Expand Down Expand Up @@ -154,19 +165,36 @@ def __init__(

@staticmethod
def _parse_phoneme_dict(
phoneme_dict: Union[str, pathlib.Path, Dict[str, List[List[str]]]]
phoneme_dict: Union[
str,
pathlib.Path,
Dict[str, List[List[str]]],
List[Union[str, pathlib.Path, Dict[str, List[List[str]]]]],
]
) -> Dict[str, List[List[str]]]:
"""
parse an input IPA dictionary and save it as a dict object.
Parse one or more IPA dictionaries and return a merged dict object.

Args:
phoneme_dict (Union[str, pathlib.Path, dict]): Path to file in CMUdict format or an IPA dict object with
CMUdict-like entries. For example,
a dictionary file: scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.06.txt;
a dictionary object: {..., "Wire": [["ˈ", "w", "a", "ɪ", "ɚ"], ["ˈ", "w", "a", "ɪ", "ɹ"]], ...}.
phoneme_dict: A single phoneme dictionary source or a list of sources for multi-dictionary
code-switching (e.g. Hindi + English). Each source can be:
- a file path (str or pathlib.Path) in CMUdict format,
e.g. ``scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.06.txt``
- a dict object with CMUdict-like entries,
e.g. ``{"Wire": [["ˈ", "w", "a", "ɪ", "ɚ"], ["ˈ", "w", "a", "ɪ", "ɹ"]]}``
When a list is provided, all sources are parsed and merged into a single dictionary.

Returns: a dict object (Dict[str, List[List[str]]]).
Returns:
A merged dict object (Dict[str, List[List[str]]]).
"""
if isinstance(phoneme_dict, list):
merged = defaultdict(list)
for source in phoneme_dict:
parsed = IpaG2p._parse_phoneme_dict(source)
for word, prons in parsed.items():
merged[word].extend(prons)
return merged
Comment thread
XuesongYang marked this conversation as resolved.

if isinstance(phoneme_dict, str) or isinstance(phoneme_dict, pathlib.Path):
# load the dictionary file where there may exist a digit suffix after a word, e.g. "Word(2)", which
# represents the pronunciation variant of that word.
Expand All @@ -190,6 +218,8 @@ def _parse_phoneme_dict(
or 'À' <= line[0] <= 'Ö'
or 'Ø' <= line[0] <= 'ö'
or 'ø' <= line[0] <= 'ÿ'
or _INDIC_PATTERN.match(line[0])
or _KOREAN_PATTERN.match(line[0])
or line[0] == "'"
):
parts = line.strip().split(maxsplit=1)
Expand Down Expand Up @@ -217,7 +247,15 @@ def _parse_phoneme_dict(

return phoneme_dict_obj

def replace_dict(self, phoneme_dict: Union[str, pathlib.Path, Dict[str, List[List[str]]]]):
def replace_dict(
self,
phoneme_dict: Union[
str,
pathlib.Path,
Dict[str, List[List[str]]],
List[Union[str, pathlib.Path, Dict[str, List[List[str]]]]],
],
):
"""
Replace model's phoneme dictionary with a custom one
"""
Expand Down
Loading
Loading