-
Notifications
You must be signed in to change notification settings - Fork 3.4k
Add hi-IN, ko-KR and pt-BR IPA tokenizer support #15567
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
37fc7d8
ea93a40
2845815
9bf8a0b
4bc23b7
683d5be
f0bbf22
2151454
fa43171
b17e9b8
7ba7767
caf97ce
58c0d5b
34a0e85
0f60dde
a776f66
f65538b
e4710cd
73724fa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -20,6 +20,8 @@ | |||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| from nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon import validate_locale | ||||||||||||||||||||||||
| from nemo.collections.common.tokenizers.text_to_speech.tokenizer_utils import ( | ||||||||||||||||||||||||
| INDIC_CHARS_ALL, | ||||||||||||||||||||||||
| KOREAN_CHARS, | ||||||||||||||||||||||||
| LATIN_CHARS_ALL, | ||||||||||||||||||||||||
| any_locale_word_tokenize, | ||||||||||||||||||||||||
| english_word_tokenize, | ||||||||||||||||||||||||
|
|
@@ -29,18 +31,24 @@ | |||||||||||||||||||||||
| from nemo.collections.tts.g2p.utils import GRAPHEME_CASE_MIXED, GRAPHEME_CASE_UPPER, set_grapheme_case | ||||||||||||||||||||||||
| from nemo.utils import logging | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| # Compiled regex patterns for Indic and Korean scripts (used in dictionary parsing) | ||||||||||||||||||||||||
| _INDIC_PATTERN = re.compile(f'^[{INDIC_CHARS_ALL}]') | ||||||||||||||||||||||||
| _KOREAN_PATTERN = re.compile(f'^[{KOREAN_CHARS}]') | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| class IpaG2p(BaseG2p): | ||||||||||||||||||||||||
| # fmt: off | ||||||||||||||||||||||||
| STRESS_SYMBOLS = ["ˈ", "ˌ"] | ||||||||||||||||||||||||
| # Regex for roman characters, accented characters, and locale-agnostic numbers/digits | ||||||||||||||||||||||||
| CHAR_REGEX = re.compile(fr"[{LATIN_CHARS_ALL}\d]") | ||||||||||||||||||||||||
| PUNCT_REGEX = re.compile(fr"[^{LATIN_CHARS_ALL}\d]") | ||||||||||||||||||||||||
| CHAR_REGEX = re.compile(fr"[{LATIN_CHARS_ALL}{INDIC_CHARS_ALL}{KOREAN_CHARS}\d]") | ||||||||||||||||||||||||
| PUNCT_REGEX = re.compile(fr"[^{LATIN_CHARS_ALL}{INDIC_CHARS_ALL}{KOREAN_CHARS}\d]") | ||||||||||||||||||||||||
| # fmt: on | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| def __init__( | ||||||||||||||||||||||||
| self, | ||||||||||||||||||||||||
| phoneme_dict: Union[str, pathlib.Path, Dict[str, List[List[str]]]], | ||||||||||||||||||||||||
| phoneme_dict: Union[ | ||||||||||||||||||||||||
| str, pathlib.Path, List[Union[str, pathlib.Path, Dict[str, List[List[str]]]]], Dict[str, List[List[str]]] | ||||||||||||||||||||||||
| ], | ||||||||||||||||||||||||
| locale: str = "en-US", | ||||||||||||||||||||||||
|
Comment on lines
47
to
52
|
||||||||||||||||||||||||
| def __init__( | |
| self, | |
| phoneme_dict: Union[str, pathlib.Path, Dict[str, List[List[str]]]], | |
| # phoneme_dict: Union[str, pathlib.Path, Dict[str, List[List[str]]]], | |
| phoneme_dict: Union[str, pathlib.Path, List[Union[str, pathlib.Path]], Dict[str, List[List[str]]]], | |
| locale: str = "en-US", | |
| def __init__( | |
| self, | |
| phoneme_dict: Union[ | |
| str, pathlib.Path, List[Union[str, pathlib.Path, Dict[str, List[List[str]]]]], Dict[str, List[List[str]]] | |
| ], |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.