From fa304a251a31c5f0efe20dee7997f521eb0b9e39 Mon Sep 17 00:00:00 2001 From: hmlee245 Date: Tue, 13 May 2025 10:54:38 -0700 Subject: [PATCH 01/14] First draft of Korean Cardinal ITN Sparrowhawk testing is not done yet. Signed-off-by: hmlee245 --- .../inverse_normalize.py | 7 +- .../inverse_text_normalization/ko/__init__.py | 17 + .../ko/clean_eval_data.py | 361 ++++++++++++++++++ .../ko/data/__init__.py | 13 + .../ko/data/numbers/__init__.py | 13 + .../ko/data/numbers/digit.tsv | 9 + .../ko/data/numbers/thousands.tsv | 11 + .../ko/data/numbers/zero.tsv | 1 + .../ko/graph_utils.py | 292 ++++++++++++++ .../ko/taggers/__init__.py | 17 + .../ko/taggers/cardinal.py | 104 +++++ .../ko/taggers/tokenize_and_classify.py | 76 ++++ .../ko/taggers/word.py | 32 ++ .../inverse_text_normalization/ko/utils.py | 23 ++ .../ko/verbalizers/__init__.py | 17 + .../ko/verbalizers/cardinal.py | 54 +++ .../ko/verbalizers/verbalize.py | 36 ++ .../ko/verbalizers/verbalize_final.py | 49 +++ .../ko/verbalizers/word.py | 34 ++ .../run_evaluate.py | 2 +- tests/nemo_text_processing/ko/__init__.py | 13 + .../test_cases_cardinal.txt | 27 ++ .../nemo_text_processing/ko/test_cardinal.py | 39 ++ ..._sparrowhawk_inverse_text_normalization.sh | 34 ++ .../pynini_export.py | 8 + 25 files changed, 1287 insertions(+), 2 deletions(-) create mode 100644 nemo_text_processing/inverse_text_normalization/ko/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/numbers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/numbers/digit.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/numbers/thousands.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv create mode 100644 
nemo_text_processing/inverse_text_normalization/ko/graph_utils.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/taggers/word.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/utils.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py create mode 100644 tests/nemo_text_processing/ko/__init__.py create mode 100644 tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_cardinal.txt create mode 100644 tests/nemo_text_processing/ko/test_cardinal.py create mode 100644 tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py index c10819908..e505a8ad0 100644 --- a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py +++ b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py @@ -131,6 +131,11 @@ def __init__( from nemo_text_processing.inverse_text_normalization.ja.verbalizers.verbalize_final import ( VerbalizeFinalFst, ) + elif lang == 'ko': # Korean + from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst + from 
nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import ( + VerbalizeFinalFst, + ) self.tagger = ClassifyFst( cache_dir=cache_dir, whitelist=whitelist, overwrite_cache=overwrite_cache, input_case=input_case @@ -175,7 +180,7 @@ def parse_args(): parser.add_argument( "--language", help="language", - choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja'], + choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja','ko'], default="en", type=str, ) diff --git a/nemo_text_processing/inverse_text_normalization/ko/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/__init__.py new file mode 100644 index 000000000..f541211af --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py b/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py new file mode 100644 index 000000000..3c1193333 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py @@ -0,0 +1,361 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from argparse import ArgumentParser +from typing import List + +import regex as re + +from nemo_text_processing.text_normalization.data_loader_utils import ( + EOS_TYPE, + Instance, + load_files, + training_data_to_sentences, +) + +""" +This file is for evaluation purposes. +filter_loaded_data() cleans data (list of instances) for inverse text normalization. Filters and cleaners can be specified for each semiotic class individually. +For example, normalized text should only include characters and whitespace characters but no punctuation. + Cardinal unnormalized instances should contain at least one integer and all other characters are removed. 
+""" + + +class Filter: + """ + Filter class + + Args: + class_type: semiotic class used in dataset + process_func: function to transform text + filter_func: function to filter text + + """ + + def __init__(self, class_type: str, process_func: object, filter_func: object): + self.class_type = class_type + self.process_func = process_func + self.filter_func = filter_func + + def filter(self, instance: Instance) -> bool: + """ + filter function + + Args: + filters given instance with filter function + + Returns: True if given instance fulfills criteria or does not belong to class type + """ + if instance.token_type != self.class_type: + return True + return self.filter_func(instance) + + def process(self, instance: Instance) -> Instance: + """ + process function + + Args: + processes given instance with process function + + Returns: processed instance if instance belongs to expected class type or original instance + """ + if instance.token_type != self.class_type: + return instance + return self.process_func(instance) + + +def filter_cardinal_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_cardinal_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + un_normalized = re.sub(r"[^0-9]", "", un_normalized) + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_ordinal_1(instance: Instance) -> bool: + ok = re.search(r"(st|nd|rd|th)\s*$", instance.un_normalized) + return ok + + +def process_ordinal_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + un_normalized = re.sub(r"[,\s]", "", un_normalized) + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def 
filter_decimal_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_decimal_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + un_normalized = re.sub(r",", "", un_normalized) + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_measure_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_measure_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + un_normalized = re.sub(r",", "", un_normalized) + un_normalized = re.sub(r"m2", "m²", un_normalized) + un_normalized = re.sub(r"(\d)([^\d.\s])", r"\1 \2", un_normalized) + normalized = re.sub(r"[^a-z\s]", "", normalized) + normalized = re.sub(r"per ([a-z\s]*)s$", r"per \1", normalized) + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_money_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_money_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + un_normalized = re.sub(r",", "", un_normalized) + un_normalized = re.sub(r"a\$", r"$", un_normalized) + un_normalized = re.sub(r"us\$", r"$", un_normalized) + un_normalized = re.sub(r"(\d)m\s*$", r"\1 million", un_normalized) + un_normalized = re.sub(r"(\d)bn?\s*$", r"\1 billion", un_normalized) + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_time_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_time_1(instance: Instance) -> Instance: + un_normalized = 
instance.un_normalized + un_normalized = re.sub(r": ", ":", un_normalized) + un_normalized = re.sub(r"(\d)\s?a\s?m\s?", r"\1 a.m.", un_normalized) + un_normalized = re.sub(r"(\d)\s?p\s?m\s?", r"\1 p.m.", un_normalized) + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_plain_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_plain_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_punct_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_punct_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_date_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_date_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + un_normalized = re.sub(r",", "", un_normalized) + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_letters_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_letters_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_verbatim_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_verbatim_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + 
normalized = instance.normalized + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_digit_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_digit_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_telephone_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_telephone_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_electronic_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_electronic_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_fraction_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_fraction_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_address_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_address_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + normalized = 
re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +filters = [] +filters.append(Filter(class_type="CARDINAL", + process_func=process_cardinal_1, filter_func=filter_cardinal_1)) +filters.append(Filter(class_type="ORDINAL", + process_func=process_ordinal_1, filter_func=filter_ordinal_1)) +filters.append(Filter(class_type="DECIMAL", + process_func=process_decimal_1, filter_func=filter_decimal_1)) +filters.append(Filter(class_type="MEASURE", + process_func=process_measure_1, filter_func=filter_measure_1)) +filters.append(Filter(class_type="MONEY", + process_func=process_money_1, filter_func=filter_money_1)) +filters.append(Filter(class_type="TIME", + process_func=process_time_1, filter_func=filter_time_1)) + +filters.append(Filter(class_type="DATE", + process_func=process_date_1, filter_func=filter_date_1)) +filters.append(Filter(class_type="PLAIN", + process_func=process_plain_1, filter_func=filter_plain_1)) +filters.append(Filter(class_type="PUNCT", + process_func=process_punct_1, filter_func=filter_punct_1)) +filters.append(Filter(class_type="LETTERS", + process_func=process_letters_1, filter_func=filter_letters_1)) +filters.append(Filter(class_type="VERBATIM", + process_func=process_verbatim_1, filter_func=filter_verbatim_1)) +filters.append(Filter(class_type="DIGIT", + process_func=process_digit_1, filter_func=filter_digit_1)) +filters.append(Filter(class_type="TELEPHONE", + process_func=process_telephone_1, filter_func=filter_telephone_1)) +filters.append(Filter(class_type="ELECTRONIC", + process_func=process_electronic_1, filter_func=filter_electronic_1)) +filters.append(Filter(class_type="FRACTION", + process_func=process_fraction_1, filter_func=filter_fraction_1)) +filters.append(Filter(class_type="ADDRESS", + process_func=process_address_1, filter_func=filter_address_1)) +filters.append(Filter(class_type=EOS_TYPE, + process_func=lambda x: x, filter_func=lambda x: True)) 
+ + +def filter_loaded_data(data: List[Instance], verbose: bool = False) -> List[Instance]: + """ + Filters list of instances + + Args: + data: list of instances + + Returns: filtered and transformed list of instances + """ + updates_instances = [] + for instance in data: + updated_instance = False + for fil in filters: + if fil.class_type == instance.token_type and fil.filter(instance): + instance = fil.process(instance) + updated_instance = True + if updated_instance: + if verbose: + print(instance) + updates_instances.append(instance) + return updates_instances + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument("--input", help="input file path", + type=str, default='./en_with_types/output-00001-of-00100') + parser.add_argument( + "--verbose", help="print filtered instances", action='store_true') + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + file_path = args.input + + print("Loading training data: " + file_path) + instance_list = load_files([file_path]) # List of instances + filtered_instance_list = filter_loaded_data(instance_list, args.verbose) + training_data_to_sentences(filtered_instance_list) diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/data/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/digit.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/digit.tsv new file mode 100644 index 000000000..9871cb9cf --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/digit.tsv @@ -0,0 +1,9 @@ +일 1 +이 2 +삼 3 +사 4 +오 5 +육 6 +칠 7 +팔 8 +구 9 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/thousands.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/thousands.tsv new file mode 100644 index 000000000..541752211 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/thousands.tsv @@ -0,0 +1,11 @@ +억 +조 +경 +해 +자 +양 +구 +간 +정 +재 +극 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv new file mode 100644 index 000000000..43baac7c1 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv @@ -0,0 +1 @@ +영 0 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/graph_utils.py b/nemo_text_processing/inverse_text_normalization/ko/graph_utils.py new file mode 100644 index 000000000..7a9fd8720 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/graph_utils.py @@ -0,0 +1,292 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import string +from pathlib import Path +from typing import Dict + +import pynini +from pynini import Far +from pynini.examples import plurals +from pynini.export import export +from pynini.lib import byte, pynutil, utf8 + +from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels + +NEMO_CHAR = utf8.VALID_UTF8_CHAR + +NEMO_NARROW_NON_BREAK_SPACE = "\u202f" +NEMO_DIGIT = byte.DIGIT +NEMO_LOWER = pynini.union(*string.ascii_lowercase).optimize() +NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize() +NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize() +NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize() +NEMO_HEX = pynini.union(*string.hexdigits).optimize() +NEMO_NON_BREAKING_SPACE = "\u00a0" +NEMO_SPACE = " " +NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00a0").optimize() +NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() +NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() + +NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize() +NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize() + +NEMO_SIGMA = pynini.closure(NEMO_CHAR) + +NEMO_NOT_ALPHA = pynini.difference(NEMO_SIGMA, NEMO_ALPHA).optimize() +NEMO_LOWER_NOT_A = pynini.union( + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n", + "o", + "p", + "q", + "r", + "s", + "t", + "u", + "v", + "w", + "x", + "y", + "z", +).optimize() + +delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE)) +delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1)) +insert_space = pynutil.insert(" ") +delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ") +delete_preserve_order = pynini.closure( + pynutil.delete(" preserve_order: true") + | (pynutil.delete(" field_order: \"") + 
NEMO_NOT_QUOTE + pynutil.delete("\"")) +) + +suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv")) +# _v = pynini.union("a", "e", "i", "o", "u") +_c = pynini.union( + "b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z" +) +_ies = NEMO_SIGMA + _c + pynini.cross("y", "ies") +_es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es") +_s = NEMO_SIGMA + pynutil.insert("s") + +graph_plural = plurals._priority_union( + suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA +).optimize() + +SINGULAR_TO_PLURAL = graph_plural +PLURAL_TO_SINGULAR = pynini.invert(graph_plural) +TO_LOWER = pynini.union(*[pynini.cross(x, y) for x, y in zip(string.ascii_uppercase, string.ascii_lowercase)]) +TO_UPPER = pynini.invert(TO_LOWER) +MIN_NEG_WEIGHT = -0.0001 +MIN_POS_WEIGHT = 0.0001 +INPUT_CASED = "cased" +INPUT_LOWER_CASED = "lower_cased" +MINUS = pynini.union("minus", "Minus").optimize() + + +def capitalized_input_graph( + graph: 'pynini.FstLike', original_graph_weight: float = None, capitalized_graph_weight: float = None +) -> 'pynini.FstLike': + """ + Allow graph input to be capitalized, e.g. for ITN) + + Args: + graph: FstGraph + original_graph_weight: weight to add to the original `graph` + capitalized_graph_weight: weight to add to the capitalized graph + """ + capitalized_graph = pynini.compose(TO_LOWER + NEMO_SIGMA, graph).optimize() + + if original_graph_weight is not None: + graph = pynutil.add_weight(graph, weight=original_graph_weight) + + if capitalized_graph_weight is not None: + capitalized_graph = pynutil.add_weight(capitalized_graph, weight=capitalized_graph_weight) + + graph |= capitalized_graph + return graph + + +def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']): + """ + Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name. 
+ + Args: + file_name: exported file name + graphs: Mapping of a rule name and Pynini WFST graph to be exported + """ + exporter = export.Exporter(file_name) + for rule, graph in graphs.items(): + exporter[rule] = graph.optimize() + exporter.close() + logging.info(f'Created {file_name}') + + +def get_plurals(fst): + """ + Given singular returns plurals + + Args: + fst: Fst + + Returns plurals to given singular forms + """ + return SINGULAR_TO_PLURAL @ fst + + +def get_singulars(fst): + """ + Given plural returns singulars + + Args: + fst: Fst + + Returns singulars to given plural forms + """ + return PLURAL_TO_SINGULAR @ fst + + +def convert_space(fst) -> 'pynini.FstLike': + """ + Converts space to nonbreaking space. + Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty" + This is making transducer significantly slower, so only use when there could be potential spaces within quotes, otherwise leave it. + + Args: + fst: input fst + + Returns output fst where breaking spaces are converted to non breaking spaces + """ + return fst @ pynini.cdrewrite(pynini.cross(NEMO_SPACE, NEMO_NON_BREAKING_SPACE), "", "", NEMO_SIGMA) + + +def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED): + labels = load_labels(input_file) + + if input_case == INPUT_CASED: + additional_labels = [] + for written, spoken, *weight in labels: + written_capitalized = written[0].upper() + written[1:] + additional_labels.extend( + [ + [written_capitalized, spoken.capitalize()], # first letter capitalized + [ + written_capitalized, + spoken.upper().replace(" AND ", " and "), + ], # # add pairs with the all letters capitalized + ] + ) + + spoken_no_space = spoken.replace(" ", "") + # add abbreviations without spaces (both lower and upper case), i.e. 
"BMW" not "B M W" + if len(spoken) == (2 * len(spoken_no_space) - 1): + logging.debug(f"This is weight {weight}") + if len(weight) == 0: + additional_labels.extend( + [[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()]] + ) + else: + additional_labels.extend( + [ + [written, spoken_no_space, weight[0]], + [written_capitalized, spoken_no_space.upper(), weight[0]], + ] + ) + labels += additional_labels + + whitelist = pynini.string_map(labels).invert().optimize() + return whitelist + + +class GraphFst: + """ + Base class for all grammar fsts. + + Args: + name: name of grammar class + kind: either 'classify' or 'verbalize' + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, name: str, kind: str, deterministic: bool = True): + self.name = name + self.kind = kind + self._fst = None + self.deterministic = deterministic + + self.far_path = Path(os.path.dirname(__file__) + '/grammars/' + kind + '/' + name + '.far') + if self.far_exist(): + self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst() + + def far_exist(self) -> bool: + """ + Returns true if FAR can be loaded + """ + return self.far_path.exists() + + @property + def fst(self) -> 'pynini.FstLike': + return self._fst + + @fst.setter + def fst(self, fst): + self._fst = fst + + def add_tokens(self, fst) -> 'pynini.FstLike': + """ + Wraps class name around to given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }") + + def delete_tokens(self, fst) -> 'pynini.FstLike': + """ + Deletes class name wrap around output of given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + res = ( + pynutil.delete(f"{self.name}") + + delete_space + + pynutil.delete("{") + + delete_space + + fst + + delete_space + + pynutil.delete("}") + ) + return res @ 
pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py new file mode 100644 index 000000000..f541211af --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py new file mode 100644 index 000000000..df5804fc0 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py @@ -0,0 +1,104 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_DIGIT, GraphFst, delete_space
from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path


class CardinalFst(GraphFst):
    """
    Finite state transducer for classifying cardinals,
    e.g. 마이너스 이십삼 -> cardinal { negative: "-" integer: "23" }

    Korean numerals group digits in blocks of four: 만 = 10^4, 억 = 10^8,
    조 = 10^12, 경 = 10^16.  Each higher unit therefore reuses the 0-9999
    sub-grammar and pads absent blocks with "0000"; a final rewrite strips
    the leading padding zeros.
    """

    def __init__(self):
        super().__init__(name="cardinal", kind="classify")

        # zero.tsv maps 영 -> 0 (previously this was loaded and then
        # immediately overwritten by an identical pynini.cross).
        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        # digit.tsv maps 일..구 -> 1..9
        graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))

        # ---- 0..99 ----------------------------------------------------
        # Tens place: "X십" -> X, bare "십" -> 1, absent -> pad "0".
        ten = pynutil.delete("십")
        ten_alt = pynini.cross("십", "1")
        graph_ten_component = pynini.union((graph_digit + ten) | ten_alt, pynutil.insert("0"))
        # Ones place: plain digit, absent -> pad "0".
        graph_ten_component += graph_digit | pynutil.insert("0")

        # ---- 0..999 ---------------------------------------------------
        hundred = pynutil.delete("백")
        hundred_alt = pynini.cross("백", "1")
        graph_hundred_component = pynini.union((graph_digit + hundred) | hundred_alt, pynutil.insert("0"))
        graph_hundred_component += graph_ten_component

        # ---- 0..9999 --------------------------------------------------
        thousand = pynutil.delete("천")
        thousand_alt = pynini.cross("천", "1")
        graph_thousand_component = pynini.union((graph_digit + thousand) | thousand_alt, pynutil.insert("0"))
        graph_thousand_component += graph_hundred_component

        def _four_digit_unit(unit: str, lower):
            """0000-9999 block followed by `unit`; bare unit -> 1; absent unit -> pad "0000"."""
            unit_del = pynutil.delete(unit)
            unit_alt = pynini.cross(unit, "1")
            component = pynini.union((graph_thousand_component + unit_del) | unit_alt, pynutil.insert("0000"))
            return component + lower

        # From the biggest unit down to the smallest, every unit is included.
        graph_tenthousand_component = _four_digit_unit("만", graph_thousand_component)
        graph_hundredmillion_component = _four_digit_unit("억", graph_tenthousand_component)
        graph_trillion_component = _four_digit_unit("조", graph_hundredmillion_component)
        graph_tenquadrillion_component = _four_digit_unit("경", graph_trillion_component)

        # Strip the padding zeros, e.g. 천 -> "1000" rather than "0...01000".
        # (Applied exactly once; the original composed it twice.)
        leading_zero = (
            pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT)
        )
        graph = (graph_tenquadrillion_component @ leading_zero) | graph_zero

        self.just_cardinals = graph

        # Optional negative sign; the separating space is inserted inside the
        # closure so an unsigned number does not get a leading space.
        optional_sign = pynini.closure(
            (pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"'))
            + delete_space
            + pynutil.insert(" "),
            0,
            1,
        )

        final_graph = optional_sign + pynutil.insert("integer: \"") + graph + pynutil.insert("\"")

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
graph_tenquadrillion_component| + graph_zero + ) + + leading_zero = ( + pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT) + ) + graph_nonzero = graph @ leading_zero + graph = pynini.union(graph_nonzero, graph_zero) + + graph = graph @ leading_zero | graph_zero + + self.just_cardinals = graph + + optional_sign = pynini.closure((pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"')) + delete_space,0, 1) + + final_graph = ( + optional_sign + pynutil.insert(" ") + pynutil.insert("integer: \"") + graph + pynutil.insert("\"") + ) | (pynutil.insert("integer: \"") + graph + pynutil.insert("\"")) + + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py new file mode 100644 index 000000000..760ce6829 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -0,0 +1,76 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import logging
import os

import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.ko.graph_utils import (
    INPUT_LOWER_CASED,
    GraphFst,
    delete_extra_space,
    delete_space,
    generator_main,
)
from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst
from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst


class ClassifyFst(GraphFst):
    """
    Final class that composes all other classification grammars. This class can process an entire sentence, that is lower cased.
    For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
    More details to deployment at NeMo/tools/text_processing_deployment.

    Args:
        input_case: accepting either "lower_cased" or "cased" input.
        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
        overwrite_cache: set to True to overwrite .far files
        whitelist: path to a file with whitelist replacements
    """

    def __init__(
        self,
        input_case: str = INPUT_LOWER_CASED,
        cache_dir: str = None,
        overwrite_cache: bool = False,
        whitelist: str = None,
    ):
        super().__init__(name="tokenize_and_classify", kind="classify")

        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            # BUG FIX: the cache was named "jp_itn_..." (copy-paste from the
            # Japanese grammar), which would collide with the Japanese FAR
            # cache; this is the Korean grammar, so use "ko_itn_...".
            far_file = os.path.join(cache_dir, f"ko_itn_{input_case}.far")
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info(f"ClassifyFst.fst was restored from {far_file}.")
        else:
            logging.info("Creating ClassifyFst grammars.")
            cardinal = CardinalFst()
            cardinal_graph = cardinal.fst
            word_graph = WordFst().fst
            # Prefer the cardinal grammar; the word grammar is the heavy-weight fallback.
            classify = pynutil.add_weight(cardinal_graph, 1.1) | pynutil.add_weight(word_graph, 100)

            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" } ")
            tagger = pynini.closure(token, 1)

            self.fst = tagger

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f"ClassifyFst grammars are saved to {far_file}.")
sleep -> tokens { name: "sleep" } + """ + + def __init__(self): + super().__init__(name="word", kind="classify") + word = pynutil.insert( + "name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"") + self.fst = word.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/utils.py b/nemo_text_processing/inverse_text_normalization/ko/utils.py new file mode 100644 index 000000000..0222cc0b8 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/utils.py @@ -0,0 +1,23 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + + + +def get_abs_path(rel_path): + + return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path + + diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py new file mode 100644 index 000000000..da950f35e --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py new file mode 100644 index 000000000..1800a6dc8 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py @@ -0,0 +1,54 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( + NEMO_NOT_QUOTE, + GraphFst, + delete_space, +) + + +class CardinalFst(GraphFst): + """ + Finite state transducer for verbalizing cardinal + e.g. 
cardinal { negative: "-" integer: "23" } -> -23 + """ + + def __init__(self): + super().__init__(name="cardinal", kind="verbalize") + negative_sign = ( + pynutil.delete("negative:") + + delete_space + + pynutil.delete("\"") + + pynini.accep("-") + + pynutil.delete("\"") + ) + + optional_sign_output = pynini.closure(negative_sign + delete_space, 0, 1) + + digits_from_tag = pynini.closure(NEMO_NOT_QUOTE, 1) + integer_cardinal = ( + pynutil.delete("integer:") + + delete_space + + pynutil.delete("\"") + + digits_from_tag + + pynutil.delete("\"") + ) + + graph = integer_cardinal + final_graph = optional_sign_output + graph + self.fst = self.delete_tokens(final_graph).optimize() \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py new file mode 100644 index 000000000..9d750d757 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -0,0 +1,36 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst + + +class VerbalizeFst(GraphFst): + """ + Composes other verbalizer grammars. + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + More details to deployment at NeMo/tools/text_processing_deployment. + """ + + def __init__(self): + super().__init__(name="verbalize", kind="verbalize") + cardinal = CardinalFst() + cardinal_graph = cardinal.fst + word_graph = WordFst().fst + + graph = (cardinal_graph|word_graph) + self.fst = graph + diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py new file mode 100644 index 000000000..8554fc161 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py @@ -0,0 +1,49 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, generator_main, delete_space + + +class VerbalizeFinalFst(GraphFst): + """ + Finite state transducer that verbalizes an entire sentence, e.g. + tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now + """ + def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False): + super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic) + far_file = None + if cache_dir is not None and cache_dir != "None": + os.makedirs(cache_dir, exist_ok=True) + far_file = os.path.join(cache_dir, f"ko_tn_{deterministic}_deterministic_verbalizer.far") + if not overwrite_cache and far_file and os.path.exists(far_file): + self.fst = pynini.Far(far_file, mode="r")["verbalize"] + else: + # token_graph = VerbalizeFst(deterministic=deterministic) + token_graph = VerbalizeFst().fst + token_verbalizer = ( + pynutil.delete("tokens {") + delete_space + token_graph + delete_space + pynutil.delete(" }") + ) + verbalizer = pynini.closure(delete_space + token_verbalizer + delete_space) + + self.fst = (verbalizer).optimize() + if far_file: + generator_main(far_file, {"verbalize": self.fst}) diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py new file mode 100644 index 000000000..d79957ca8 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py @@ -0,0 +1,34 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + + + +class WordFst(GraphFst): + ''' + tokens { name: "一" } -> 一 + ''' + + def __init__(self, deterministic: bool = True, lm: bool = False): + super().__init__(name="word", kind="verbalize", deterministic=deterministic) + + graph = pynutil.delete("name: \"") + NEMO_NOT_QUOTE + pynutil.delete("\"") + + self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/run_evaluate.py b/nemo_text_processing/inverse_text_normalization/run_evaluate.py index 0852329d6..7bfdd3399 100644 --- a/nemo_text_processing/inverse_text_normalization/run_evaluate.py +++ b/nemo_text_processing/inverse_text_normalization/run_evaluate.py @@ -35,7 +35,7 @@ def parse_args(): parser.add_argument( "--lang", help="language", - choices=["ar", "de", "en", "es", "es_en", "fr", "hi", "hy", "mr", "pt", "ru", "sv", "vi", "zh", 'ja'], + choices=["ar", "de", "en", "es", "es_en", "fr", "hi", "hy", "mr", "pt", "ru", "sv", "vi", "zh", "ja","ko"], default="en", type=str, ) diff --git a/tests/nemo_text_processing/ko/__init__.py b/tests/nemo_text_processing/ko/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/tests/nemo_text_processing/ko/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_cardinal.txt new file mode 100644 index 000000000..007273e5e --- /dev/null +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_cardinal.txt @@ -0,0 +1,27 @@ +영~0 +구~9 +십~10 +십칠~17 +오십삼~53 +백~100 +백오~105 +삼백이십~320 +구백팔십칠~987 +천~1000 +천육~1006 +천오백~1500 +오천사백삼십이~5432 +만~10000 +만천이백~11200 +삼만오천칠백~35700 +십이만~120000 +백오십만삼천~1503000 +천만~10000000 +오천이백칠십만육천백~52706100 +억~100000000 +삼억오천만~350000000 +십이억천만~1210000000 +백오십억칠천만~15070000000 +오천억~500000000000 +일조~1000000000000 +이조오천억~2500000000000 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/test_cardinal.py b/tests/nemo_text_processing/ko/test_cardinal.py new file mode 100644 index 000000000..9fd366ea6 --- /dev/null +++ b/tests/nemo_text_processing/ko/test_cardinal.py @@ -0,0 +1,39 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio + +from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file + + +class TestCardinal: + inverse_normalizer_ko = InverseNormalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('ko/data_inverse_text_normalization/test_cases_cardinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_ko.inverse_normalize(test_input, verbose=False) + assert pred == expected + + normalizer_with_audio_ko = ( + NormalizerWithAudio(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) + if RUN_AUDIO_BASED_TESTS + else None + ) \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh new file mode 100644 index 000000000..c44f4a703 --- /dev/null +++ b/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh @@ -0,0 +1,34 @@ +#! 
/bin/sh + +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +TEST_DIR=${2:-"/workspace/tests/ko"} + +runtest () { + input=$1 + echo "INPUT is $input" + cd ${GRAMMARS_DIR} + + # read test file + while read testcase; do + IFS='~' read spoken written <<< $testcase + denorm_pred=$(echo $spoken | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1) + + # trim white space + written="$(echo -e "${written}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + + # input expected actual + assertEquals "$spoken" "$written" "$denorm_pred" + done < "$input" +} + +testITNCardinal() { + input=$TEST_DIR/data_inverse_text_normalization/test_cases_cardinal.txt + runtest $input +} + +# Remove all command-line arguments +shift $# + +# Load shUnit2 +. /workspace/shunit2/shunit2 \ No newline at end of file diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 6b82dfbec..0df099774 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -106,6 +106,7 @@ def parse_args(): 'mr', 'ja', 'rw', + 'ko' ], type=str, default='en', @@ -307,6 +308,13 @@ def parse_args(): PostProcessingFst as TNPostProcessingFst, ) from nemo_text_processing.text_normalization.ja.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst + elif args.language == 'ko': + from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ( + ClassifyFst as ITNClassifyFst, + ) + from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import ( + VerbalizeFst as ITNVerbalizeFst, + ) elif args.language == 'rw': from nemo_text_processing.text_normalization.rw.taggers.tokenize_and_classify import ( ClassifyFst as TNClassifyFst, From 67357acb882eb45861bfa2f4e83842556f5315cb Mon Sep 17 00:00:00 2001 From: 
"pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 13 May 2025 22:04:00 +0000 Subject: [PATCH 02/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../inverse_normalize.py | 4 +- .../ko/clean_eval_data.py | 59 +++++++------------ .../ko/taggers/cardinal.py | 38 +++++++----- .../ko/taggers/tokenize_and_classify.py | 12 ++-- .../ko/taggers/word.py | 3 +- .../inverse_text_normalization/ko/utils.py | 3 - .../ko/verbalizers/__init__.py | 2 +- .../ko/verbalizers/cardinal.py | 18 ++---- .../ko/verbalizers/verbalize.py | 7 +-- .../ko/verbalizers/verbalize_final.py | 3 +- .../ko/verbalizers/word.py | 1 - .../run_evaluate.py | 2 +- .../nemo_text_processing/ko/test_cardinal.py | 6 +- .../pynini_export.py | 2 +- 14 files changed, 68 insertions(+), 92 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py index e505a8ad0..acda8b7f9 100644 --- a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py +++ b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py @@ -135,7 +135,7 @@ def __init__( from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import ( VerbalizeFinalFst, - ) + ) self.tagger = ClassifyFst( cache_dir=cache_dir, whitelist=whitelist, overwrite_cache=overwrite_cache, input_case=input_case @@ -180,7 +180,7 @@ def parse_args(): parser.add_argument( "--language", help="language", - choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja','ko'], + choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja', 'ko'], default="en", type=str, ) diff --git a/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py 
b/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py index 3c1193333..bc429e858 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py +++ b/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py @@ -282,41 +282,24 @@ def process_address_1(instance: Instance) -> Instance: filters = [] -filters.append(Filter(class_type="CARDINAL", - process_func=process_cardinal_1, filter_func=filter_cardinal_1)) -filters.append(Filter(class_type="ORDINAL", - process_func=process_ordinal_1, filter_func=filter_ordinal_1)) -filters.append(Filter(class_type="DECIMAL", - process_func=process_decimal_1, filter_func=filter_decimal_1)) -filters.append(Filter(class_type="MEASURE", - process_func=process_measure_1, filter_func=filter_measure_1)) -filters.append(Filter(class_type="MONEY", - process_func=process_money_1, filter_func=filter_money_1)) -filters.append(Filter(class_type="TIME", - process_func=process_time_1, filter_func=filter_time_1)) - -filters.append(Filter(class_type="DATE", - process_func=process_date_1, filter_func=filter_date_1)) -filters.append(Filter(class_type="PLAIN", - process_func=process_plain_1, filter_func=filter_plain_1)) -filters.append(Filter(class_type="PUNCT", - process_func=process_punct_1, filter_func=filter_punct_1)) -filters.append(Filter(class_type="LETTERS", - process_func=process_letters_1, filter_func=filter_letters_1)) -filters.append(Filter(class_type="VERBATIM", - process_func=process_verbatim_1, filter_func=filter_verbatim_1)) -filters.append(Filter(class_type="DIGIT", - process_func=process_digit_1, filter_func=filter_digit_1)) -filters.append(Filter(class_type="TELEPHONE", - process_func=process_telephone_1, filter_func=filter_telephone_1)) -filters.append(Filter(class_type="ELECTRONIC", - process_func=process_electronic_1, filter_func=filter_electronic_1)) -filters.append(Filter(class_type="FRACTION", - process_func=process_fraction_1, filter_func=filter_fraction_1)) 
-filters.append(Filter(class_type="ADDRESS", - process_func=process_address_1, filter_func=filter_address_1)) -filters.append(Filter(class_type=EOS_TYPE, - process_func=lambda x: x, filter_func=lambda x: True)) +filters.append(Filter(class_type="CARDINAL", process_func=process_cardinal_1, filter_func=filter_cardinal_1)) +filters.append(Filter(class_type="ORDINAL", process_func=process_ordinal_1, filter_func=filter_ordinal_1)) +filters.append(Filter(class_type="DECIMAL", process_func=process_decimal_1, filter_func=filter_decimal_1)) +filters.append(Filter(class_type="MEASURE", process_func=process_measure_1, filter_func=filter_measure_1)) +filters.append(Filter(class_type="MONEY", process_func=process_money_1, filter_func=filter_money_1)) +filters.append(Filter(class_type="TIME", process_func=process_time_1, filter_func=filter_time_1)) + +filters.append(Filter(class_type="DATE", process_func=process_date_1, filter_func=filter_date_1)) +filters.append(Filter(class_type="PLAIN", process_func=process_plain_1, filter_func=filter_plain_1)) +filters.append(Filter(class_type="PUNCT", process_func=process_punct_1, filter_func=filter_punct_1)) +filters.append(Filter(class_type="LETTERS", process_func=process_letters_1, filter_func=filter_letters_1)) +filters.append(Filter(class_type="VERBATIM", process_func=process_verbatim_1, filter_func=filter_verbatim_1)) +filters.append(Filter(class_type="DIGIT", process_func=process_digit_1, filter_func=filter_digit_1)) +filters.append(Filter(class_type="TELEPHONE", process_func=process_telephone_1, filter_func=filter_telephone_1)) +filters.append(Filter(class_type="ELECTRONIC", process_func=process_electronic_1, filter_func=filter_electronic_1)) +filters.append(Filter(class_type="FRACTION", process_func=process_fraction_1, filter_func=filter_fraction_1)) +filters.append(Filter(class_type="ADDRESS", process_func=process_address_1, filter_func=filter_address_1)) +filters.append(Filter(class_type=EOS_TYPE, process_func=lambda x: x, 
filter_func=lambda x: True)) def filter_loaded_data(data: List[Instance], verbose: bool = False) -> List[Instance]: @@ -344,10 +327,8 @@ def filter_loaded_data(data: List[Instance], verbose: bool = False) -> List[Inst def parse_args(): parser = ArgumentParser() - parser.add_argument("--input", help="input file path", - type=str, default='./en_with_types/output-00001-of-00100') - parser.add_argument( - "--verbose", help="print filtered instances", action='store_true') + parser.add_argument("--input", help="input file path", type=str, default='./en_with_types/output-00001-of-00100') + parser.add_argument("--verbose", help="print filtered instances", action='store_true') return parser.parse_args() diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py index df5804fc0..09cc03909 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py @@ -19,6 +19,7 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_DIGIT, GraphFst, delete_space from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path + class CardinalFst(GraphFst): """ Finite state transducer for classifying cardinals @@ -37,14 +38,14 @@ def __init__(self): graph_negative = pynini.cross("마이너스", "-") graph_negative += delete_space - + ten = pynutil.delete("십") ten_alt = pynini.cross("십", "1") ### Responsible for second digit of two digit number. ex) 20's 2 graph_ten_component = pynini.union((graph_digit + ten) | ten_alt, pynutil.insert("0")) ### Responsible for the first digit of number. 
ex) 1,2,3,4,5,,, graph_ten_component += graph_digit | pynutil.insert("0") - + hundred = pynutil.delete("백") hundred_alt = pynini.cross("백", "1") graph_hundred_component = pynini.union(((graph_digit + hundred) | hundred_alt), pynutil.insert("0")) @@ -59,29 +60,36 @@ def __init__(self): tenthousand_alt = pynini.cross("만", "1") ### "만" can express next four digits of numbers until the next unit "억", so insert "0000" to allocate four digit worth of space ### From "만", keep adding four digits and graph_thousand_component(0000-9999), because Korean units increase every four digits - graph_tenthousand_component = pynini.union(((graph_thousand_component + tenthousand) | tenthousand_alt), pynutil.insert("0000")) + graph_tenthousand_component = pynini.union( + ((graph_thousand_component + tenthousand) | tenthousand_alt), pynutil.insert("0000") + ) graph_tenthousand_component += graph_thousand_component hundredmillion = pynutil.delete("억") hundredmillion_alt = pynini.cross("억", "1") - graph_hundredmillion_component = pynini.union(((graph_thousand_component + hundredmillion) | hundredmillion_alt), pynutil.insert("0000")) - graph_hundredmillion_component += graph_tenthousand_component - + graph_hundredmillion_component = pynini.union( + ((graph_thousand_component + hundredmillion) | hundredmillion_alt), pynutil.insert("0000") + ) + graph_hundredmillion_component += graph_tenthousand_component + trillion = pynutil.delete("조") trillion_alt = pynini.cross("조", "1") - graph_trillion_component = pynini.union(((graph_thousand_component + trillion) | trillion_alt), pynutil.insert("0000")) + graph_trillion_component = pynini.union( + ((graph_thousand_component + trillion) | trillion_alt), pynutil.insert("0000") + ) graph_trillion_component += graph_hundredmillion_component tenquadrillion = pynutil.delete("경") tenquadrillion_alt = pynini.cross("경", "1") - graph_tenquadrillion_component = pynini.union(((graph_thousand_component + tenquadrillion) | tenquadrillion_alt), 
pynutil.insert("0000")) + graph_tenquadrillion_component = pynini.union( + ((graph_thousand_component + tenquadrillion) | tenquadrillion_alt), pynutil.insert("0000") + ) graph_tenquadrillion_component += graph_trillion_component - graph = pynini.union( ### From biggest unit to smallest, everything is included - graph_tenquadrillion_component| - graph_zero + graph_tenquadrillion_component + | graph_zero ) leading_zero = ( @@ -89,16 +97,18 @@ def __init__(self): ) graph_nonzero = graph @ leading_zero graph = pynini.union(graph_nonzero, graph_zero) - + graph = graph @ leading_zero | graph_zero self.just_cardinals = graph - optional_sign = pynini.closure((pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"')) + delete_space,0, 1) + optional_sign = pynini.closure( + (pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"')) + delete_space, 0, 1 + ) final_graph = ( optional_sign + pynutil.insert(" ") + pynutil.insert("integer: \"") + graph + pynutil.insert("\"") ) | (pynutil.insert("integer: \"") + graph + pynutil.insert("\"")) final_graph = self.add_tokens(final_graph) - self.fst = final_graph.optimize() \ No newline at end of file + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index 760ce6829..2842a4167 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -19,15 +19,15 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst -from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( +from 
nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( INPUT_LOWER_CASED, GraphFst, delete_extra_space, delete_space, generator_main, ) +from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst class ClassifyFst(GraphFst): @@ -64,8 +64,8 @@ def __init__( cardinal = CardinalFst() cardinal_graph = cardinal.fst word_graph = WordFst().fst - classify = (pynutil.add_weight(cardinal_graph, 1.1)| pynutil.add_weight(word_graph, 100)) - + classify = pynutil.add_weight(cardinal_graph, 1.1) | pynutil.add_weight(word_graph, 100) + token = pynutil.insert("tokens { ") + classify + pynutil.insert(" } ") tagger = pynini.closure(token, 1) @@ -73,4 +73,4 @@ def __init__( if far_file: generator_main(far_file, {"tokenize_and_classify": self.fst}) - logging.info(f"ClassifyFst grammars are saved to {far_file}.") \ No newline at end of file + logging.info(f"ClassifyFst grammars are saved to {far_file}.") diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py index 0d6ccd5c5..0e4dbb93c 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py @@ -27,6 +27,5 @@ class WordFst(GraphFst): def __init__(self): super().__init__(name="word", kind="classify") - word = pynutil.insert( - "name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"") + word = pynutil.insert("name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"") self.fst = word.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/utils.py b/nemo_text_processing/inverse_text_normalization/ko/utils.py index 0222cc0b8..d198c3835 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/utils.py +++ b/nemo_text_processing/inverse_text_normalization/ko/utils.py @@ -15,9 
+15,6 @@ import os - def get_abs_path(rel_path): return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path - - diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py index da950f35e..f541211af 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py @@ -14,4 +14,4 @@ from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst \ No newline at end of file +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py index 1800a6dc8..fb9a76d8e 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py @@ -15,11 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( - NEMO_NOT_QUOTE, - GraphFst, - delete_space, -) +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space class CardinalFst(GraphFst): @@ -34,21 +30,17 @@ def __init__(self): pynutil.delete("negative:") + delete_space + pynutil.delete("\"") - + pynini.accep("-") + + pynini.accep("-") + pynutil.delete("\"") ) optional_sign_output = pynini.closure(negative_sign + delete_space, 0, 1) - digits_from_tag = pynini.closure(NEMO_NOT_QUOTE, 1) + digits_from_tag = pynini.closure(NEMO_NOT_QUOTE, 
1) integer_cardinal = ( - pynutil.delete("integer:") - + delete_space - + pynutil.delete("\"") - + digits_from_tag - + pynutil.delete("\"") + pynutil.delete("integer:") + delete_space + pynutil.delete("\"") + digits_from_tag + pynutil.delete("\"") ) graph = integer_cardinal final_graph = optional_sign_output + graph - self.fst = self.delete_tokens(final_graph).optimize() \ No newline at end of file + self.fst = self.delete_tokens(final_graph).optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py index 9d750d757..d8851e206 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -13,9 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst class VerbalizeFst(GraphFst): @@ -30,7 +30,6 @@ def __init__(self): cardinal = CardinalFst() cardinal_graph = cardinal.fst word_graph = WordFst().fst - - graph = (cardinal_graph|word_graph) + + graph = cardinal_graph | word_graph self.fst = graph - diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py index 8554fc161..09b4cbc8b 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py @@ -18,9 +18,9 @@ import pynini from pynini.lib import pynutil 
+from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, delete_space, generator_main from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, generator_main, delete_space class VerbalizeFinalFst(GraphFst): @@ -28,6 +28,7 @@ class VerbalizeFinalFst(GraphFst): Finite state transducer that verbalizes an entire sentence, e.g. tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now """ + def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False): super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic) far_file = None diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py index d79957ca8..c134fe63a 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py @@ -20,7 +20,6 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space - class WordFst(GraphFst): ''' tokens { name: "一" } -> 一 diff --git a/nemo_text_processing/inverse_text_normalization/run_evaluate.py b/nemo_text_processing/inverse_text_normalization/run_evaluate.py index 7bfdd3399..133474940 100644 --- a/nemo_text_processing/inverse_text_normalization/run_evaluate.py +++ b/nemo_text_processing/inverse_text_normalization/run_evaluate.py @@ -35,7 +35,7 @@ def parse_args(): parser.add_argument( "--lang", help="language", - choices=["ar", "de", "en", "es", "es_en", "fr", "hi", "hy", "mr", "pt", "ru", "sv", "vi", "zh", "ja","ko"], + choices=["ar", "de", "en", "es", "es_en", "fr", "hi", "hy", 
"mr", "pt", "ru", "sv", "vi", "zh", "ja", "ko"], default="en", type=str, ) diff --git a/tests/nemo_text_processing/ko/test_cardinal.py b/tests/nemo_text_processing/ko/test_cardinal.py index 9fd366ea6..526747668 100644 --- a/tests/nemo_text_processing/ko/test_cardinal.py +++ b/tests/nemo_text_processing/ko/test_cardinal.py @@ -33,7 +33,5 @@ def test_denorm(self, test_input, expected): assert pred == expected normalizer_with_audio_ko = ( - NormalizerWithAudio(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) - if RUN_AUDIO_BASED_TESTS - else None - ) \ No newline at end of file + NormalizerWithAudio(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) if RUN_AUDIO_BASED_TESTS else None + ) diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 0df099774..d1ba34a37 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -106,7 +106,7 @@ def parse_args(): 'mr', 'ja', 'rw', - 'ko' + 'ko', ], type=str, default='en', From 9bca70aa68eac2b472e9a8ab00f8deae9c2a52e3 Mon Sep 17 00:00:00 2001 From: hmlee245 Date: Tue, 13 May 2025 15:04:57 -0700 Subject: [PATCH 03/14] Fixing all the feedbacks Haven't fixed the graph@leading zero part --- .../ko/data/numbers/zero.tsv | 1 - .../inverse_text_normalization/ko/graph_utils.py | 2 +- .../ko/taggers/__init__.py | 4 ---- .../ko/taggers/cardinal.py | 8 ++------ .../ko/taggers/tokenize_and_classify.py | 2 -- .../ko/verbalizers/__init__.py | 6 +----- .../ko/verbalizers/verbalize_final.py | 1 - .../ko/verbalizers/word.py | 4 +--- tests/nemo_text_processing/ko/test_cardinal.py | 12 ++---------- 9 files changed, 7 insertions(+), 33 deletions(-) delete mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv deleted file 
mode 100644 index 43baac7c1..000000000 --- a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv +++ /dev/null @@ -1 +0,0 @@ -영 0 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/graph_utils.py b/nemo_text_processing/inverse_text_normalization/ko/graph_utils.py index 7a9fd8720..50f1eb3b9 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/ko/graph_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py index f541211af..341a77c5b 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py @@ -11,7 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- -from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py index df5804fc0..684685001 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py @@ -31,13 +31,9 @@ class CardinalFst(GraphFst): def __init__(self): super().__init__(name="cardinal", kind="classify") - graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) - graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) graph_zero = pynini.cross("영", "0") + graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) - graph_negative = pynini.cross("마이너스", "-") - graph_negative += delete_space - ten = pynutil.delete("십") ten_alt = pynini.cross("십", "1") ### Responsible for second digit of two digit number. 
ex) 20's 2 @@ -47,7 +43,7 @@ def __init__(self): hundred = pynutil.delete("백") hundred_alt = pynini.cross("백", "1") - graph_hundred_component = pynini.union(((graph_digit + hundred) | hundred_alt), pynutil.insert("0")) + graph_hundred_component = pynini.union(((graph_digit + hundred | hundred_alt)), pynutil.insert("0")) graph_hundred_component += graph_ten_component thousand = pynutil.delete("천") diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index 760ce6829..bb6b35d41 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -24,8 +24,6 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( INPUT_LOWER_CASED, GraphFst, - delete_extra_space, - delete_space, generator_main, ) diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py index da950f35e..ecc3520ab 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py @@ -10,8 +10,4 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. - -from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst \ No newline at end of file +# limitations under the License. 
\ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py index 8554fc161..8d40d2804 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py @@ -19,7 +19,6 @@ from pynini.lib import pynutil from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, generator_main, delete_space diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py index d79957ca8..a423d5d0c 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py @@ -13,11 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. - -import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst diff --git a/tests/nemo_text_processing/ko/test_cardinal.py b/tests/nemo_text_processing/ko/test_cardinal.py index 9fd366ea6..96681fd8b 100644 --- a/tests/nemo_text_processing/ko/test_cardinal.py +++ b/tests/nemo_text_processing/ko/test_cardinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,10 +16,8 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer -from nemo_text_processing.text_normalization.normalize import Normalizer -from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file +from ..utils import CACHE_DIR, parse_test_case_file class TestCardinal: @@ -31,9 +29,3 @@ class TestCardinal: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer_ko.inverse_normalize(test_input, verbose=False) assert pred == expected - - normalizer_with_audio_ko = ( - NormalizerWithAudio(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) - if RUN_AUDIO_BASED_TESTS - else None - ) \ No newline at end of file From c42be15382b3badbd5a20cea995a4c7abf87816b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 13 May 2025 22:09:55 +0000 Subject: [PATCH 04/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../ko/taggers/tokenize_and_classify.py | 6 +----- .../inverse_text_normalization/ko/verbalizers/__init__.py | 2 +- .../ko/verbalizers/verbalize_final.py | 1 - 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index 30e0f5df4..75e3f6f20 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -19,11 +19,7 @@ import pynini from pynini.lib import pynutil -from 
nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( - INPUT_LOWER_CASED, - GraphFst, - generator_main, -) +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import INPUT_LOWER_CASED, GraphFst, generator_main from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py index ecc3520ab..341a77c5b 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py @@ -10,4 +10,4 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. \ No newline at end of file +# limitations under the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py index 3e1769297..17f547740 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py @@ -20,7 +20,6 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, delete_space, generator_main from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, generator_main, delete_space class VerbalizeFinalFst(GraphFst): From 090dac4c57145c4badf60d69a82bd1414786798d Mon Sep 17 00:00:00 2001 From: hmlee245 Date: Tue, 13 May 2025 15:11:44 -0700 Subject: [PATCH 05/14] Delete clean eval file --- .../ko/clean_eval_data.py | 342 ------------------ 1 file changed, 342 deletions(-) delete mode 100644 nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py diff --git a/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py b/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py deleted file mode 100644 index bc429e858..000000000 --- a/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py +++ /dev/null @@ -1,342 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from argparse import ArgumentParser -from typing import List - -import regex as re - -from nemo_text_processing.text_normalization.data_loader_utils import ( - EOS_TYPE, - Instance, - load_files, - training_data_to_sentences, -) - -""" -This file is for evaluation purposes. -filter_loaded_data() cleans data (list of instances) for inverse text normalization. Filters and cleaners can be specified for each semiotic class individually. -For example, normalized text should only include characters and whitespace characters but no punctuation. - Cardinal unnormalized instances should contain at least one integer and all other characters are removed. -""" - - -class Filter: - """ - Filter class - - Args: - class_type: semiotic class used in dataset - process_func: function to transform text - filter_func: function to filter text - - """ - - def __init__(self, class_type: str, process_func: object, filter_func: object): - self.class_type = class_type - self.process_func = process_func - self.filter_func = filter_func - - def filter(self, instance: Instance) -> bool: - """ - filter function - - Args: - filters given instance with filter function - - Returns: True if given instance fulfills criteria or does not belong to class type - """ - if instance.token_type != self.class_type: - return True - return self.filter_func(instance) - - def process(self, instance: Instance) -> Instance: - """ - process function - - Args: - processes given instance with process function - - Returns: processed instance if instance belongs to expected class type or original instance - """ - if instance.token_type != self.class_type: - return instance - return self.process_func(instance) - - -def filter_cardinal_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_cardinal_1(instance: Instance) -> Instance: - un_normalized = 
instance.un_normalized - normalized = instance.normalized - un_normalized = re.sub(r"[^0-9]", "", un_normalized) - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_ordinal_1(instance: Instance) -> bool: - ok = re.search(r"(st|nd|rd|th)\s*$", instance.un_normalized) - return ok - - -def process_ordinal_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - un_normalized = re.sub(r"[,\s]", "", un_normalized) - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_decimal_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_decimal_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - un_normalized = re.sub(r",", "", un_normalized) - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_measure_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_measure_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - un_normalized = re.sub(r",", "", un_normalized) - un_normalized = re.sub(r"m2", "m²", un_normalized) - un_normalized = re.sub(r"(\d)([^\d.\s])", r"\1 \2", un_normalized) - normalized = re.sub(r"[^a-z\s]", "", normalized) - normalized = re.sub(r"per ([a-z\s]*)s$", r"per \1", normalized) - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_money_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_money_1(instance: 
Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - un_normalized = re.sub(r",", "", un_normalized) - un_normalized = re.sub(r"a\$", r"$", un_normalized) - un_normalized = re.sub(r"us\$", r"$", un_normalized) - un_normalized = re.sub(r"(\d)m\s*$", r"\1 million", un_normalized) - un_normalized = re.sub(r"(\d)bn?\s*$", r"\1 billion", un_normalized) - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_time_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_time_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - un_normalized = re.sub(r": ", ":", un_normalized) - un_normalized = re.sub(r"(\d)\s?a\s?m\s?", r"\1 a.m.", un_normalized) - un_normalized = re.sub(r"(\d)\s?p\s?m\s?", r"\1 p.m.", un_normalized) - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_plain_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_plain_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_punct_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_punct_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_date_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_date_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - un_normalized = re.sub(r",", "", un_normalized) - 
normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_letters_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_letters_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_verbatim_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_verbatim_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_digit_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_digit_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_telephone_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_telephone_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_electronic_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_electronic_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - 
return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_fraction_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_fraction_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_address_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_address_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -filters = [] -filters.append(Filter(class_type="CARDINAL", process_func=process_cardinal_1, filter_func=filter_cardinal_1)) -filters.append(Filter(class_type="ORDINAL", process_func=process_ordinal_1, filter_func=filter_ordinal_1)) -filters.append(Filter(class_type="DECIMAL", process_func=process_decimal_1, filter_func=filter_decimal_1)) -filters.append(Filter(class_type="MEASURE", process_func=process_measure_1, filter_func=filter_measure_1)) -filters.append(Filter(class_type="MONEY", process_func=process_money_1, filter_func=filter_money_1)) -filters.append(Filter(class_type="TIME", process_func=process_time_1, filter_func=filter_time_1)) - -filters.append(Filter(class_type="DATE", process_func=process_date_1, filter_func=filter_date_1)) -filters.append(Filter(class_type="PLAIN", process_func=process_plain_1, filter_func=filter_plain_1)) -filters.append(Filter(class_type="PUNCT", process_func=process_punct_1, filter_func=filter_punct_1)) -filters.append(Filter(class_type="LETTERS", process_func=process_letters_1, filter_func=filter_letters_1)) -filters.append(Filter(class_type="VERBATIM", 
process_func=process_verbatim_1, filter_func=filter_verbatim_1)) -filters.append(Filter(class_type="DIGIT", process_func=process_digit_1, filter_func=filter_digit_1)) -filters.append(Filter(class_type="TELEPHONE", process_func=process_telephone_1, filter_func=filter_telephone_1)) -filters.append(Filter(class_type="ELECTRONIC", process_func=process_electronic_1, filter_func=filter_electronic_1)) -filters.append(Filter(class_type="FRACTION", process_func=process_fraction_1, filter_func=filter_fraction_1)) -filters.append(Filter(class_type="ADDRESS", process_func=process_address_1, filter_func=filter_address_1)) -filters.append(Filter(class_type=EOS_TYPE, process_func=lambda x: x, filter_func=lambda x: True)) - - -def filter_loaded_data(data: List[Instance], verbose: bool = False) -> List[Instance]: - """ - Filters list of instances - - Args: - data: list of instances - - Returns: filtered and transformed list of instances - """ - updates_instances = [] - for instance in data: - updated_instance = False - for fil in filters: - if fil.class_type == instance.token_type and fil.filter(instance): - instance = fil.process(instance) - updated_instance = True - if updated_instance: - if verbose: - print(instance) - updates_instances.append(instance) - return updates_instances - - -def parse_args(): - parser = ArgumentParser() - parser.add_argument("--input", help="input file path", type=str, default='./en_with_types/output-00001-of-00100') - parser.add_argument("--verbose", help="print filtered instances", action='store_true') - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - file_path = args.input - - print("Loading training data: " + file_path) - instance_list = load_files([file_path]) # List of instances - filtered_instance_list = filter_loaded_data(instance_list, args.verbose) - training_data_to_sentences(filtered_instance_list) From 9deb254a92f49b785d46750253c43a481f79d502 Mon Sep 17 00:00:00 2001 From: anand-nv 
<105917641+anand-nv@users.noreply.github.com> Date: Thu, 9 Jan 2025 15:48:17 +0530 Subject: [PATCH 06/14] Update Dockerfile (#254) Fixes issue with sparrowhawk builds as the original base image is no longer maintained and build breaks Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> Signed-off-by: hmlee245 --- tools/text_processing_deployment/Dockerfile | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tools/text_processing_deployment/Dockerfile b/tools/text_processing_deployment/Dockerfile index 22c2b8b92..be6fedcda 100644 --- a/tools/text_processing_deployment/Dockerfile +++ b/tools/text_processing_deployment/Dockerfile @@ -16,22 +16,25 @@ # Dockerfile for C++ (inverse) text normalization backend Sparrowhawk https://github.com/google/sparrowhawk # set base image (host OS) -FROM conda/miniconda3 +FROM continuumio/miniconda3 + # set the working directory in the container WORKDIR /workspace # install dependencies RUN echo "deb http://archive.debian.org/debian stretch main contrib non-free" > /etc/apt/sources.list +RUN apt-get update && apt-get upgrade -y && apt-get install -y --reinstall build-essential pkg-config git make wget RUN conda install conda-build -y -RUN apt-get update && apt-get install -y --reinstall build-essential pkg-config && apt-get upgrade -y && apt-get install -y git && apt-get install make +RUN conda install -c conda-forge thrax=1.3.4 -y RUN git clone https://github.com/google/re2 RUN cd re2 && git checkout tags/2022-02-01 && make && make install -RUN apt-get install build-essential -y && apt-get install wget -y RUN wget https://github.com/protocolbuffers/protobuf/releases/download/v2.5.0/protobuf-2.5.0.tar.gz RUN tar xzvf protobuf-2.5.0.tar.gz RUN cd protobuf-2.5.0 && ./configure && make && make install && ldconfig -RUN conda install -c conda-forge thrax=1.3.4 -y +RUN printf "# Conda lib path \n/opt/conda/lib" > /etc/ld.so.conf.d/conda.so.conf +ENV CPPFLAGS="-I/opt/conda/include" +ENV 
LDFLAGS="-L/opt/conda/lib" RUN git clone https://github.com/anand-nv/sparrowhawk.git && cd sparrowhawk && git checkout nemo_tests && apt-get install -y autoconf && bash autoreconf && ./configure && make && make install && ldconfig RUN git clone https://github.com/kward/shunit2.git -RUN echo "DONE" \ No newline at end of file +RUN echo "DONE" From 44e211b904b00c8fd0cb838792e101a502dc6525 Mon Sep 17 00:00:00 2001 From: anand-nv <105917641+anand-nv@users.noreply.github.com> Date: Thu, 3 Apr 2025 20:15:41 +0530 Subject: [PATCH 07/14] Future implementations to date.py - Hindi ITN (#265) (#266) * Future implementations to date.py - Hindi ITN (#265) * Addition of whitelist and word classes Signed-off-by: Tarushi V * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Updation of Jenkins date Signed-off-by: Tarushi V * Cleanup Signed-off-by: Tarushi V * Updation Signed-off-by: Tarushi V * Updation Signed-off-by: Tarushi V * Future implementations for date Signed-off-by: Tarushi V * pushing rough date code for ref Signed-off-by: Tarushi V * Future implementations date.py Signed-off-by: Tarushi V * Cleanup Signed-off-by: Tarushi V * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Updation of Jenkinsfile Signed-off-by: Tarushi V * Telephone.py-hindi itn Signed-off-by: Tarushi V * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Telephone.py - Hindi ITN Signed-off-by: Tarushi V * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Telephone modified tagger and verbalizer Signed-off-by: Tarushi V * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * telephone tagger with 3,4,5 digit std codes Signed-off-by: Tarushi V * Further additions - telephone.py Signed-off-by: Tarushi V * [pre-commit.ci] auto fixes from 
pre-commit.com hooks for more information, see https://pre-commit.ci * Jenkins update Signed-off-by: Tarushi V * Telephone.py Signed-off-by: Tarushi V * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Updated tagger-telephone.py Signed-off-by: Tarushi V * Telephone and Jenkinsfile cleanup Signed-off-by: Tarushi V * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update Jenkins Signed-off-by: Tarushi V --------- Signed-off-by: Tarushi V Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Anand Joseph * Add missing __init__.py file Signed-off-by: Anand Joseph --------- Signed-off-by: Tarushi V Signed-off-by: Anand Joseph Co-authored-by: tarushi2k2 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: hmlee245 --- Jenkinsfile | 2 +- .../hi/data/date/century.tsv | 3 + .../hi/data/telephone/__init__.py | 13 ++ .../hi/data/telephone/eng_to_hindi_digit.tsv | 10 ++ .../telephone/teens_and_ties_eng_to_hin.tsv | 90 ++++++++++ .../hi/taggers/date.py | 35 +++- .../hi/taggers/telephone.py | 158 ++++++++++++++++++ .../hi/taggers/tokenize_and_classify.py | 4 + .../hi/verbalizers/date.py | 38 ++++- .../hi/verbalizers/telephone.py | 55 ++++++ .../hi/verbalizers/verbalize.py | 3 + .../test_cases_date.txt | 12 +- .../test_cases_telephone.txt | 37 ++++ ..._sparrowhawk_inverse_text_normalization.sh | 5 + .../nemo_text_processing/hi/test_telephone.py | 31 ++++ 15 files changed, 490 insertions(+), 6 deletions(-) create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/telephone/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_to_hindi_digit.tsv create mode 100644 
nemo_text_processing/inverse_text_normalization/hi/data/telephone/teens_and_ties_eng_to_hin.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py create mode 100644 nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py create mode 100644 tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_telephone.txt create mode 100644 tests/nemo_text_processing/hi/test_telephone.py diff --git a/Jenkinsfile b/Jenkinsfile index 6edad14a2..82a0a4799 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-29-24-1' + HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-03-25-1' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv new file mode 100644 index 000000000..da69e23eb --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv @@ -0,0 +1,3 @@ +ई.पू. ईसा पूर्व +ई. ईस्वी +ई. ईसवी diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/telephone/__init__.py b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/__init__.py new file mode 100644 index 000000000..d9155f923 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_to_hindi_digit.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_to_hindi_digit.tsv new file mode 100644 index 000000000..53c5e36cb --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_to_hindi_digit.tsv @@ -0,0 +1,10 @@ +० zero +१ one +२ two +३ three +४ four +५ five +६ six +७ seven +८ eight +९ nine diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/telephone/teens_and_ties_eng_to_hin.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/teens_and_ties_eng_to_hin.tsv new file mode 100644 index 000000000..ac37b55f2 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/teens_and_ties_eng_to_hin.tsv @@ -0,0 +1,90 @@ +१० ten +११ eleven +१२ twelve +१३ thirteen +१४ fourteen +१५ fifteen +१६ sixteen +१७ seventeen +१८ eighteen +१९ nineteen +२० twenty +२१ twenty one +२२ twenty two +२३ twenty three +२४ twenty four +२५ twenty five +२६ twenty six +२७ twenty seven +२८ twenty eight +२९ twenty nine +३० thirty +३१ thirty one +३२ thirty two +३३ thirty three +३४ thirty four +३५ thirty five +३६ thirty six +३७ thirty seven +३८ thirty eight +३९ thirty nine +४० forty +४१ forty one +४२ forty two +४३ forty three +४४ forty four +४५ forty five +४६ forty six +४७ forty seven +४८ forty eight +४९ forty nine +५० fifty +५१ fifty one +५२ fifty two +५३ fifty three +५४ fifty four +५५ fifty five +५६ fifty six +५७ fifty seven +५८ fifty eight +५९ fifty nine +६० sixty +६१ sixty one +६२ 
sixty two +६३ sixty three +६४ sixty four +६५ sixty five +६६ sixty six +६७ sixty seven +६८ sixty eight +६९ sixty nine +७० seventy +७१ seventy one +७२ seventy two +७३ seventy three +७४ seventy four +७५ seventy five +७६ seventy six +७७ seventy seven +७८ seventy eight +७९ seventy nine +८० eighty +८१ eighty one +८२ eighty two +८३ eighty three +८४ eighty four +८५ eighty five +८६ eighty six +८७ eighty seven +८८ eighty eight +८९ eighty nine +९० ninety +९१ ninety one +९२ ninety two +९३ ninety three +९४ ninety four +९५ ninety five +९६ ninety six +९७ ninety seven +९८ ninety eight +९९ ninety nine diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py index 61183ae72..6859f0834 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py @@ -44,10 +44,22 @@ def __init__(self, cardinal: GraphFst): month_graph = pynini.string_file(get_abs_path("data/date/months.tsv")) graph_date_days = pynini.string_file(get_abs_path("data/date/date_days.tsv")).invert() + graph_century = pynini.string_file(get_abs_path("data/date/century.tsv")).invert() self.day = pynutil.insert("day: \"") + graph_date_days + pynutil.insert("\" ") self.month = pynutil.insert("month: \"") + month_graph + pynutil.insert("\" ") self.year = pynutil.insert("year: \"") + graph_year + pynutil.insert("\" ") + self.year_range = ( + pynutil.insert("year: \"") + + graph_year + + delete_space + + pynini.cross("से", "-") + + delete_space + + graph_year + + delete_space + + pynutil.insert("\" ") + ) + self.century = pynutil.insert("text: \"") + graph_century + pynutil.insert("\" ") insert_comma = pynutil.insert(", ") graph_day_month = self.day + delete_space + self.month @@ -58,9 +70,28 @@ def __init__(self, cardinal: GraphFst): graph_month_day_year += pynutil.insert(" preserve_order: true") graph_month_year = self.month + delete_space + 
self.year graph_saal = self.year + graph_AD_BC = self.year + delete_space + self.century + graph_day_month_year_century = ( + self.day + delete_space + self.month + delete_space + self.year + delete_space + self.century + ) + graph_month_year_century = self.month + delete_space + self.year + delete_space + self.century + graph_year_range = self.year_range - graph = graph_day_month | graph_month_day | graph_day_month_year | graph_month_day_year | graph_month_year - self.graph = graph.optimize() + graph_date_exceptions = self.month + delete_space + pynutil.delete("की") + delete_space + self.day + graph_date_exceptions += pynutil.insert("preserve_order: true") + graph = ( + graph_day_month + | graph_month_day + | graph_day_month_year + | graph_month_day_year + | graph_month_year + | graph_saal + | graph_AD_BC + | graph_day_month_year_century + | graph_month_year_century + | graph_year_range + | graph_date_exceptions + ) final_graph = self.add_tokens(graph) self.fst = final_graph diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py new file mode 100644 index 000000000..1d1d3c875 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py @@ -0,0 +1,158 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.hi.graph_utils import GraphFst, delete_space +from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path + + +class TelephoneFst(GraphFst): + """ + Finite state transducer for classifying telephone numbers, e.g. + e.g. प्लस इक्यानवे नौ आठ सात छह पांच चार तीन दो एक शून्य => tokens { name: "+९१ ९८७६५ ४३२१०" } + + Args: + Cardinal: CardinalFst + """ + + def __init__(self, cardinal: GraphFst): + super().__init__(name="telephone", kind="classify") + + hindi_digit_graph = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert() + hindi_digit_graph |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert() + + english_digit_graph = pynini.string_file(get_abs_path("data/telephone/eng_to_hindi_digit.tsv")).invert() + + country_code_graph_single_digits = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert() + country_code_graph_single_digits |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert() + country_code_graph_single_digits |= pynini.string_file( + get_abs_path("data/telephone/eng_to_hindi_digit.tsv") + ).invert() + + country_code_graph_double_digits = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")).invert() + country_code_graph_double_digits |= pynini.string_file( + get_abs_path("data/telephone/teens_and_ties_eng_to_hin.tsv") + ).invert() + + self.hindi_digit = ( + pynutil.insert("number_part: \"") + + pynini.closure(hindi_digit_graph + delete_space, 0, 9) + + hindi_digit_graph + + pynutil.insert("\" ") + ) + self.english_digit = ( + pynutil.insert("number_part: \"") + + pynini.closure(english_digit_graph + delete_space, 0, 9) + + english_digit_graph + + delete_space + + pynutil.insert("\" ") + ) + + self.country_code_with_single_digits = ( + pynutil.insert("country_code: \"") + + pynini.closure(country_code_graph_single_digits + delete_space, 0, 2) + + pynutil.insert("\" 
") + ) + self.country_code_with_double_digits = ( + pynutil.insert("country_code: \"") + + pynini.closure(country_code_graph_double_digits + delete_space, 0, 1) + + pynutil.insert("\" ") + ) + self.country_code = self.country_code_with_single_digits | self.country_code_with_double_digits + + # two, three, four-digit extension code with zero + self.city_code_hindi = ( + pynutil.insert("extension: \"") + + pynini.closure(hindi_digit_graph + delete_space, 2, 5) + + pynutil.insert("\" ") + ) + self.city_code_english = ( + pynutil.insert("extension: \"") + + pynini.closure(english_digit_graph + delete_space, 2, 5) + + pynutil.insert("\" ") + ) + + self.city_extension = self.city_code_hindi | self.city_code_english + + # 7-digit landline graph in hindi and english digits + self.landline_hindi = ( + pynutil.insert("number_part: \"") + + pynini.closure(hindi_digit_graph + delete_space, 7, 7) + + pynutil.insert("\" ") + ) + self.landline_english = ( + pynutil.insert("number_part: \"") + + pynini.closure(english_digit_graph + delete_space, 7, 7) + + pynutil.insert("\" ") + ) + + self.landline = self.landline_hindi | self.landline_english + + self.pincode_in_hindi = ( + pynutil.insert("number_part: \"") + + pynini.closure(hindi_digit_graph + delete_space, 0, 5) + + hindi_digit_graph + + pynutil.insert("\" ") + ) + self.pincode_in_english = ( + pynutil.insert("number_part: \"") + + pynini.closure(english_digit_graph + delete_space, 0, 5) + + english_digit_graph + + pynutil.insert("\" ") + ) + + self.credit_card_last_digits_hindi = ( + pynutil.insert("number_part: \"") + + pynini.closure(hindi_digit_graph + delete_space, 0, 3) + + hindi_digit_graph + + pynutil.insert("\" ") + ) + self.credit_card_last_digits_english = ( + pynutil.insert("number_part: \"") + + pynini.closure(english_digit_graph + delete_space, 0, 3) + + english_digit_graph + + pynutil.insert("\" ") + ) + + delete_plus = pynini.union( + pynutil.delete("प्लस") | pynutil.delete("plus") | pynutil.delete("Plus") | 
pynutil.delete("PLUS") + ) + + delete_zero = pynini.union( + pynutil.delete("शून्य") | pynutil.delete("zero") | pynutil.delete("Zero") | pynutil.delete("ZERO") + ) + + graph_number_with_hindi_digit = ( + delete_plus + delete_space + self.country_code + delete_space + self.hindi_digit + ) + graph_number_with_english_digit = delete_plus + delete_space + self.country_code + self.english_digit + + graph_landline_with_extension = delete_zero + delete_space + self.city_extension + delete_space + self.landline + + graph_pincode = self.pincode_in_hindi | self.pincode_in_english + + graph_credit_card_last_digits = self.credit_card_last_digits_hindi | self.credit_card_last_digits_english + + graph = ( + graph_number_with_hindi_digit + | graph_number_with_english_digit + | graph_landline_with_extension + | graph_pincode + | graph_credit_card_last_digits + ) + + final_graph = self.add_tokens(graph) + self.fst = final_graph diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py index a5a371d90..62554bd14 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py @@ -33,6 +33,7 @@ from nemo_text_processing.inverse_text_normalization.hi.taggers.money import MoneyFst from nemo_text_processing.inverse_text_normalization.hi.taggers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.hi.taggers.punctuation import PunctuationFst +from nemo_text_processing.inverse_text_normalization.hi.taggers.telephone import TelephoneFst from nemo_text_processing.inverse_text_normalization.hi.taggers.time import TimeFst from nemo_text_processing.inverse_text_normalization.hi.taggers.whitelist import WhiteListFst from nemo_text_processing.inverse_text_normalization.hi.taggers.word import WordFst @@ -82,6 +83,8 @@ def 
__init__( measure_graph = measure.fst money = MoneyFst(cardinal, decimal) money_graph = money.fst + telephone = TelephoneFst(cardinal) + telephone_graph = telephone.fst punct_graph = PunctuationFst().fst whitelist_graph = WhiteListFst().fst word_graph = WordFst().fst @@ -95,6 +98,7 @@ def __init__( | pynutil.add_weight(time_graph, 1.1) | pynutil.add_weight(measure_graph, 1.1) | pynutil.add_weight(money_graph, 1.1) + | pynutil.add_weight(telephone_graph, 1.1) | pynutil.add_weight(word_graph, 100) | pynutil.add_weight(whitelist_graph, 1.01) ) diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py index 5442777da..eacfb5765 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py @@ -61,22 +61,45 @@ def __init__(self): + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") ) - graph_fy = period + delete_space + year + graph_fy = year + graph_fy |= period + delete_space + year + + # century + graph_century = year + delete_extra_space + period + # month (day) year graph_mdy = month + delete_extra_space + day + pynutil.insert(",") + delete_extra_space + year # (day) month year graph_dmy = day + delete_extra_space + month + pynutil.insert(",") + delete_extra_space + year + # day month year century + graph_dmyc = ( + day + + delete_extra_space + + month + + pynutil.insert(",") + + delete_extra_space + + year + + delete_extra_space + + period + ) + # month year graph_my = month + pynini.closure(delete_extra_space + year, 0, 1) + # month year century + graph_myc = month + pynutil.insert(",") + delete_extra_space + year + delete_extra_space + period + # month day graph_md = month + pynini.closure(delete_extra_space + day, 0, 1) # day month graph_dm = day + pynini.closure(delete_extra_space + month, 0, 1) + # year range + graph_year_range = year + optional_preserve_order 
= pynini.closure( pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space | pynutil.delete("field_order:") @@ -88,7 +111,18 @@ def __init__(self): ) final_graph = ( - (graph_fy | graph_mdy | graph_dmy | graph_my | graph_md | graph_dm) + ( + graph_fy + | graph_mdy + | graph_dmy + | graph_my + | graph_md + | graph_dm + | graph_century + | graph_dmyc + | graph_myc + | graph_year_range + ) + delete_space + optional_preserve_order ) diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py new file mode 100644 index 000000000..3f4b4de1f --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py @@ -0,0 +1,55 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2025 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + + +class TelephoneFst(GraphFst): + """ + Finite state transducer for verbalizing telephone, e.g. 
+ telephone { number_part: "123-123-5678" } + -> 123-123-5678 + """ + + def __init__(self, cardinal: GraphFst): + super().__init__(name="telephone", kind="verbalize") + + number_part = pynutil.delete("number_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + optional_country_code = pynini.closure( + pynutil.delete("country_code: \"") + + pynutil.insert("+") + + delete_space + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + + pynini.accep(" "), + 0, + 1, + ) + optional_city_code = pynini.closure( + pynutil.delete("extension: \"") + + pynutil.insert("०") + + delete_space + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + + pynini.accep(" "), + 0, + 1, + ) + delete_tokens = self.delete_tokens(optional_country_code + number_part) + delete_tokens |= self.delete_tokens(optional_city_code + number_part) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py index d88bd25d9..165fe7a7e 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py @@ -21,6 +21,7 @@ from nemo_text_processing.inverse_text_normalization.hi.verbalizers.measure import MeasureFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.money import MoneyFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.ordinal import OrdinalFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.telephone import TelephoneFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.time import TimeFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.whitelist import WhiteListFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.word import WordFst @@ -45,6 +46,7 @@ def __init__(self): time_graph = TimeFst().fst 
measure_graph = MeasureFst(cardinal, decimal).fst money_graph = MoneyFst(cardinal, decimal).fst + telephone_graph = TelephoneFst(cardinal).fst word_graph = WordFst().fst whitelist_graph = WhiteListFst().fst @@ -59,5 +61,6 @@ def __init__(self): | time_graph | measure_graph | money_graph + | telephone_graph ) self.fst = graph diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_date.txt index bdc450fdd..6d570a9c5 100644 --- a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_date.txt @@ -22,4 +22,14 @@ सत्ताईस जुलाई दो हज़ार ग्यारह~२७ जुलाई, २०११ जुलाई सत्ताईस~जुलाई २७ वर्ष दो हज़ार उन्नीस~वर्ष २०१९ -सन उन्नीस सौ नब्बे~सन १९९० \ No newline at end of file +सन उन्नीस सौ नब्बे~सन १९९० +उन्नीस सौ नब्बे से उन्नीस सौ इक्यानबे~१९९०-१९९१ +दो हज़ार पाँच से दो हज़ार उन्नीस~२००५-२०१९ +दो हज़ार पाँच से उन्नीस~२००५-१९ +चौंतीस सौ ईसा पूर्व~३४०० ई.पू. +उन्नीस सौ बीस ईस्वी~१९२० ई. +पच्चीस जनवरी अठारह सौ तिरेपन ईसवी~२५ जनवरी, १८५३ ई. +इकत्तीस मई उन्नीस सौ नब्बे ईसवी~३१ मई, १९९० ई. +पच्चीस ईसा पूर्व~२५ ई.पू. 
+मार्च की दो~मार्च २ +फ़रवरी की बीस~फ़रवरी २० diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_telephone.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_telephone.txt new file mode 100644 index 000000000..0c51d8df0 --- /dev/null +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_telephone.txt @@ -0,0 +1,37 @@ +प्लस इक्यानवे nine four one one one two three four one two~+९१ ९४१११२३४१२ +प्लस इक्यानवे नौ आठ सात छह पांच चार तीन दो एक शून्य~+९१ ९८७६५४३२१० +plus nine eight nine four one one one two three four zero one~+९८ ९४१११२३४०१ +प्लस नौ एक नौ आठ सात छह पांच चार तीन दो एक शून्य~+९१ ९८७६५४३२१० +plus eleven nine four one one one two three~+११ ९४१११२३ +zero eight zero two nine four one one one two~०८० २९४१११२ +शून्य आठ शून्य दो नौ चार एक एक एक दो~०८० २९४१११२ +zero four zero two seven eight one eight three nine~०४० २७८१८३९ +शून्य चार शून्य दो सात आठ एक आठ तीन नौ~०४० २७८१८३९ +शून्य सात नौ एक नौ आठ सात छह पांच चार~०७९ १९८७६५४ +प्लस नौ एक नौ तीन आठ दो सात एक चार छह पांच शून्य~+९१ ९३८२७१४६५० +प्लस नौ एक नौ शून्य पांच एक तीन चार आठ दो सात छह~+९१ ९०५१३४८२७६ +प्लस नौ एक नौ चार तीन सात दो शून्य पांच छह एक आठ~+९१ ९४३७२०५६१८ +PLUS ninety one nine three eight two seven one four six five zero~+९१ ९३८२७१४६५० +plus nine one nine zero five one three four eight two seven six~+९१ ९०५१३४८२७६ +plus ninety one nine four three seven two zero five six one eight~+९१ ९४३७२०५६१८ +ZERO seven three चार पाँच छह सात आठ नौ शून्य~०७३ ४५६७८९० +शून्य चार शून्य पाँच चार एक दो सात तीन आठ~०४० ५४१२७३८ +ZERO seven three four five six seven eight nine zero~०७३ ४५६७८९० +zero two eight seven six five four three two seven~०२८ ७६५४३२७ +PLUS eighty one nine seven four seven two zero zero one one eight~+८१ ९७४७२००११८ +zero eight zero two two nine four one one one~०८० २२९४१११ +शून्य सात नौ एक नौ आठ सात छह पांच चार~०७९ १९८७६५४ +zero eight zero nine two two nine four one one one~०८०९ २२९४१११ +शून्य सात नौ नौ एक नौ आठ सात छह 
पांच चार~०७९९ १९८७६५४ +zero three one nine two two two nine four one one one~०३१९२ २२९४१११ +शून्य सात नौ एक एक एक नौ आठ सात छह पांच चार~०७९११ १९८७६५४ +एक एक शून्य शून्य सात शून्य दिल्ली के वसंत कुंज का पिनकोड है~११००७० दिल्ली के वसंत कुंज का पिनकोड है +बंगलौर के बैयापानहली का पिनकोड पाँच छह शून्य शून्य तीन आठ है~बंगलौर के बैयापानहली का पिनकोड ५६००३८ है +दिल्ली के वसंत कुंज का पिनकोड one one zero zero seven zero है~दिल्ली के वसंत कुंज का पिनकोड ११००७० है +five six zero zero three eight बंगलौर के बैयापानहली का पिनकोड है~५६००३८ बंगलौर के बैयापानहली का पिनकोड है +मेरे क्रेडिट कार्ड के आखिरी डिजिट शून्य शून्य तीन आठ हैं~मेरे क्रेडिट कार्ड के आखिरी डिजिट ००३८ हैं +क्रेडिट कार्ड के आखिरी डिजिट four three seven two हैं~क्रेडिट कार्ड के आखिरी डिजिट ४३७२ हैं +दिल्ली के वसंत कुंज का पिनकोड one one zero zero seven zero है~दिल्ली के वसंत कुंज का पिनकोड ११००७० है +five six zero zero three eight बंगलौर के बैयापानहली का पिनकोड है~५६००३८ बंगलौर के बैयापानहली का पिनकोड है +मेरे क्रेडिट कार्ड के आखिरी डिजिट शून्य शून्य तीन आठ हैं~मेरे क्रेडिट कार्ड के आखिरी डिजिट ००३८ हैं +क्रेडिट कार्ड के आखिरी डिजिट four three seven two हैं~क्रेडिट कार्ड के आखिरी डिजिट ४३७२ हैं diff --git a/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh index aec7299d5..a365a834d 100644 --- a/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh @@ -63,6 +63,11 @@ testITNMoney() { runtest $input } +testITNTelephone() { + input=$PROJECT_DIR/hi/data_inverse_text_normalization/test_cases_telephone.txt + runtest $input +} + testITNWord() { input=$PROJECT_DIR/hi/data_inverse_text_normalization/test_cases_word.txt runtest $input diff --git a/tests/nemo_text_processing/hi/test_telephone.py b/tests/nemo_text_processing/hi/test_telephone.py new file mode 100644 index 000000000..7e43f7e82 --- /dev/null +++ 
b/tests/nemo_text_processing/hi/test_telephone.py @@ -0,0 +1,31 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestTelephone: + inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_telephone.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred.strip() == expected.strip() From fc81e645b5c02ec4a22874b2f0333a40f20e2901 Mon Sep 17 00:00:00 2001 From: Mariana <47233618+mgrafu@users.noreply.github.com> Date: Mon, 7 Apr 2025 15:49:56 -0400 Subject: [PATCH 08/14] add base coverage for fr tn date (#267) (#269) * add base coverage for fr tn date * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Mariana Graterol Fuenmayor Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: hmlee245 --- Jenkinsfile | 2 +- .../fr/data/dates/__init__.py | 13 +++ .../text_normalization/fr/data/dates/eras.tsv | 
8 ++ .../fr/data/dates/months.tsv | 12 +++ .../text_normalization/fr/taggers/date.py | 97 +++++++++++++++++++ .../fr/taggers/tokenize_and_classify.py | 5 + .../text_normalization/fr/verbalizers/date.py | 51 ++++++++++ .../fr/verbalizers/verbalize.py | 5 +- .../test_cases_date.txt | 13 +++ tests/nemo_text_processing/fr/test_date.py | 10 ++ .../fr/test_sparrowhawk_normalization.sh | 5 + 11 files changed, 219 insertions(+), 2 deletions(-) create mode 100644 nemo_text_processing/text_normalization/fr/data/dates/__init__.py create mode 100644 nemo_text_processing/text_normalization/fr/data/dates/eras.tsv create mode 100644 nemo_text_processing/text_normalization/fr/data/dates/months.tsv create mode 100644 nemo_text_processing/text_normalization/fr/taggers/date.py create mode 100644 nemo_text_processing/text_normalization/fr/verbalizers/date.py create mode 100644 tests/nemo_text_processing/fr/data_text_normalization/test_cases_date.txt diff --git a/Jenkinsfile b/Jenkinsfile index 82a0a4799..c94c107c6 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -16,7 +16,7 @@ pipeline { EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/09-04-24-0' ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/09-25-24-0' ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-30-24-0' - FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-04-24-0' + FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-07-25-0' HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0' PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' diff --git a/nemo_text_processing/text_normalization/fr/data/dates/__init__.py b/nemo_text_processing/text_normalization/fr/data/dates/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/text_normalization/fr/data/dates/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA 
CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/fr/data/dates/eras.tsv b/nemo_text_processing/text_normalization/fr/data/dates/eras.tsv new file mode 100644 index 000000000..6127bea93 --- /dev/null +++ b/nemo_text_processing/text_normalization/fr/data/dates/eras.tsv @@ -0,0 +1,8 @@ +20s twenties +30s thirties +40s forties +50s fifties +60s sixties +70s seventies +80s eighties +90s nineties \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/fr/data/dates/months.tsv b/nemo_text_processing/text_normalization/fr/data/dates/months.tsv new file mode 100644 index 000000000..98a4e7d5d --- /dev/null +++ b/nemo_text_processing/text_normalization/fr/data/dates/months.tsv @@ -0,0 +1,12 @@ +1 janvier +2 février +3 mars +4 avril +5 mai +6 juin +7 juillet +8 août +9 septembre +10 octobre +11 novembre +12 décembre \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/fr/taggers/date.py b/nemo_text_processing/text_normalization/fr/taggers/date.py new file mode 100644 index 000000000..91e83c40c --- /dev/null +++ b/nemo_text_processing/text_normalization/fr/taggers/date.py @@ -0,0 +1,97 @@ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst +from nemo_text_processing.text_normalization.fr.utils import get_abs_path + +# TODO: add articles? 'le...' 
+ +month_numbers = pynini.string_file(get_abs_path("data/dates/months.tsv")) +eras = pynini.string_file(get_abs_path("data/dates/eras.tsv")) +delete_leading_zero = ( + pynutil.delete("0") | (NEMO_DIGIT - "0") +) + NEMO_DIGIT # reminder, NEMO_DIGIT = filter on digits + + +class DateFst(GraphFst): + ''' Finite state transducer for classyfing dates, e.g.: + '02.03.2003' -> date {day: 'deux' month: 'mai' year: 'deux mille trois' preserve order: true} + ''' + + def __init__(self, cardinal: GraphFst, deterministic: bool = True): + super().__init__(name="dates", kind="classify") + + cardinal_graph = cardinal.all_nums_no_tokens + + # 'le' -> 'le', 'les' -> 'les' + le_determiner = pynini.accep("le ") | pynini.accep("les ") + self.optional_le = pynini.closure(le_determiner, 0, 1) + + # '01' -> 'un' + optional_leading_zero = delete_leading_zero | NEMO_DIGIT + valid_day_number = pynini.union(*[str(x) for x in range(1, 32)]) + premier = pynini.string_map([("1", "premier")]) + day_number_to_word = premier | cardinal_graph + + digit_to_day = self.optional_le + optional_leading_zero @ valid_day_number @ day_number_to_word + self.day_graph = pynutil.insert("day: \"") + digit_to_day + pynutil.insert("\"") + + # '03' -> 'mars' + normalize_month_number = optional_leading_zero @ pynini.union(*[str(x) for x in range(1, 13)]) + number_to_month = month_numbers.optimize() + month_graph = normalize_month_number @ number_to_month + self.month_graph = pynutil.insert("month: \"") + month_graph + pynutil.insert("\"") + + # 2025 -> deux mille vingt cinq + accept_year_digits = (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 1, 3) + digits_to_year = accept_year_digits @ cardinal_graph + self.year_graph = pynutil.insert("year: \"") + digits_to_year + pynutil.insert("\"") + + # Putting it all together + self.fst = pynini.accep("") + + for separator in ["/", ".", "-"]: + self.fst |= ( + pynutil.insert("date { ") + + self.day_graph + + pynutil.delete(separator) + + pynutil.insert(" ") + + 
self.month_graph + + pynini.closure(pynutil.delete(separator) + pynutil.insert(" ") + self.year_graph, 0, 1) + + pynutil.insert(" preserve_order: true }") + ) + + # Accepts "janvier", "février", etc + month_name_graph = pynutil.insert("month: \"") + month_numbers.project("output") + pynutil.insert("\"") + + self.fst |= ( + pynutil.insert("date { ") + + self.day_graph + + pynini.accep(" ") + + month_name_graph + + pynini.closure(pynini.accep(" ") + self.year_graph, 0, 1) + + pynutil.insert(" preserve_order: true}") + ) + + # Accepts "70s", "80s", etc + self.fst |= pynutil.insert("date { year: \"") + eras + pynutil.insert("\" preserve_order: true }") + + # Accepts date ranges, "17-18-19 juin" -> date { day: "17" day: "18": day: "19"} + for separator in ["-", "/"]: + day_range_graph = ( + pynutil.insert("day: \"") + + pynini.closure(digit_to_day + pynutil.delete(separator) + pynutil.insert(" "), 1) + + digit_to_day + + pynutil.insert("\"") + ) + + self.fst |= ( + pynutil.insert("date { ") + + day_range_graph + + pynini.accep(" ") + + month_name_graph + + pynini.closure(pynini.accep(" ") + self.year_graph, 0, 1) + + pynutil.insert(" preserve_order: true }") + ) + + self.fst = self.fst.optimize() diff --git a/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py index de9a0b047..cacc94bcf 100644 --- a/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py @@ -26,6 +26,7 @@ ) from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst from nemo_text_processing.text_normalization.fr.taggers.cardinal import CardinalFst +from nemo_text_processing.text_normalization.fr.taggers.date import DateFst from nemo_text_processing.text_normalization.fr.taggers.decimals import DecimalFst from nemo_text_processing.text_normalization.fr.taggers.fraction 
import FractionFst from nemo_text_processing.text_normalization.fr.taggers.ordinal import OrdinalFst @@ -86,8 +87,12 @@ def __init__( whitelist_graph = self.whitelist.fst punct_graph = PunctuationFst(deterministic=deterministic).fst + self.date = DateFst(self.cardinal, deterministic=deterministic) + date_graph = self.date.fst + classify = ( pynutil.add_weight(whitelist_graph, 1.01) + | pynutil.add_weight(date_graph, 1.1) | pynutil.add_weight(cardinal_graph, 1.1) | pynutil.add_weight(fraction_graph, 1.09) | pynutil.add_weight(ordinal_graph, 1.1) diff --git a/nemo_text_processing/text_normalization/fr/verbalizers/date.py b/nemo_text_processing/text_normalization/fr/verbalizers/date.py new file mode 100644 index 000000000..8c8c1aa21 --- /dev/null +++ b/nemo_text_processing/text_normalization/fr/verbalizers/date.py @@ -0,0 +1,51 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_NOT_QUOTE, + NEMO_SPACE, + GraphFst, + delete_preserve_order, +) + + +class DateFst(GraphFst): + """ + Finite state transducer for verbalizing date, e.g. 
+ date {day: "deux" month: "mars" year: "deux mille trois" preserve_order: true} -> deux mars deux mille trois + Args: + ordinal: OrdinalFst + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="date", kind="verbalize", deterministic=deterministic) + + day = pynutil.delete("day: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + month = pynutil.delete("month: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + year = pynutil.delete("year: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + decade = pynutil.delete("year: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + + graph_dmy = day + NEMO_SPACE + month + pynini.closure(NEMO_SPACE + year, 0, 1) + delete_preserve_order + graph_my = month + NEMO_SPACE + year + delete_preserve_order + graph_decade = decade + delete_preserve_order + + self.graph = graph_dmy | graph_my | graph_decade + + delete_tokens = self.delete_tokens(self.graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/fr/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/fr/verbalizers/verbalize.py index 02510ea5f..3ea0117af 100644 --- a/nemo_text_processing/text_normalization/fr/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/fr/verbalizers/verbalize.py @@ -14,6 +14,7 @@ from nemo_text_processing.text_normalization.en.graph_utils import GraphFst from nemo_text_processing.text_normalization.en.verbalizers.whitelist import WhiteListFst from nemo_text_processing.text_normalization.fr.verbalizers.cardinal import CardinalFst +from nemo_text_processing.text_normalization.fr.verbalizers.date import DateFst from nemo_text_processing.text_normalization.fr.verbalizers.decimals import DecimalFst from 
nemo_text_processing.text_normalization.fr.verbalizers.fraction import FractionFst from nemo_text_processing.text_normalization.fr.verbalizers.ordinal import OrdinalFst @@ -40,6 +41,8 @@ def __init__(self, deterministic: bool = True): fraction = FractionFst(ordinal=ordinal, deterministic=deterministic) fraction_graph = fraction.fst whitelist_graph = WhiteListFst(deterministic=deterministic).fst + date = DateFst(deterministic=deterministic) + date_graph = date.fst - graph = cardinal_graph | decimal_graph | ordinal_graph | fraction_graph | whitelist_graph + graph = cardinal_graph | decimal_graph | ordinal_graph | fraction_graph | whitelist_graph | date_graph self.fst = graph diff --git a/tests/nemo_text_processing/fr/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/fr/data_text_normalization/test_cases_date.txt new file mode 100644 index 000000000..3b4f09154 --- /dev/null +++ b/tests/nemo_text_processing/fr/data_text_normalization/test_cases_date.txt @@ -0,0 +1,13 @@ +02.03.2003~deux mars deux mille trois +02/03/2003~deux mars deux mille trois +02-03-2003~deux mars deux mille trois +le 02.03.2003~le deux mars deux mille trois +17.06~dix-sept juin +17 janvier~dix-sept janvier +10 mars 2023~dix mars deux mille vingt-trois +le 10 mars 2023~le dix mars deux mille vingt-trois +les 80s~les eighties +les 17/18 juin~les dix-sept dix-huit juin +les 17/18/19 mars~les dix-sept dix-huit dix-neuf mars +les 17-18-19 juin~les dix-sept dix-huit dix-neuf juin +les 17-18-19 juin 2025~les dix-sept dix-huit dix-neuf juin deux mille vingt-cinq \ No newline at end of file diff --git a/tests/nemo_text_processing/fr/test_date.py b/tests/nemo_text_processing/fr/test_date.py index 614ed0e24..35e3086cd 100644 --- a/tests/nemo_text_processing/fr/test_date.py +++ b/tests/nemo_text_processing/fr/test_date.py @@ -16,6 +16,7 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from 
nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file @@ -29,3 +30,12 @@ class TestDate: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + normalizer = Normalizer(input_case='cased', lang='fr', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('fr/data_text_normalization/test_cases_date.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/fr/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/fr/test_sparrowhawk_normalization.sh index 009032118..71f3f4759 100644 --- a/tests/nemo_text_processing/fr/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/fr/test_sparrowhawk_normalization.sh @@ -27,6 +27,11 @@ testTNCardinal() { runtest $input } +testTNDate() { + input=$PROJECT_DIR/fr/data_text_normalization/test_cases_date.txt + runtest $input +} + testTNDecimal() { input=$PROJECT_DIR/fr/data_text_normalization/test_cases_decimal.txt runtest $input From 1a3bce3a8deb2510465a595e9496f9b7387fe155 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 14 Apr 2025 12:14:57 -0400 Subject: [PATCH 09/14] [pre-commit.ci] pre-commit suggestions (#270) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [pre-commit.ci] pre-commit suggestions updates: - [github.com/pre-commit/pre-commit-hooks: v4.3.0 → v5.0.0](https://github.com/pre-commit/pre-commit-hooks/compare/v4.3.0...v5.0.0) - [github.com/PyCQA/flake8: 7.1.1 → 7.2.0](https://github.com/PyCQA/flake8/compare/7.1.1...7.2.0) - [github.com/PyCQA/isort: 5.12.0 → 6.0.1](https://github.com/PyCQA/isort/compare/5.12.0...6.0.1) - 
[github.com/psf/black: 19.10b0 → 25.1.0](https://github.com/psf/black/compare/19.10b0...25.1.0) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: hmlee245 --- .pre-commit-config.yaml | 8 +-- .../fst_alignment/alignment.py | 2 +- nemo_text_processing/hybrid/utils.py | 17 +++++- .../ar/taggers/cardinal.py | 4 +- .../ar/taggers/decimal.py | 4 +- .../ar/verbalizers/word.py | 2 +- .../en/taggers/cardinal.py | 5 +- .../en/taggers/date.py | 6 +- .../en/taggers/decimal.py | 4 +- .../en/taggers/electronic.py | 8 ++- .../en/taggers/measure.py | 4 +- .../en/taggers/money.py | 2 +- .../en/taggers/telephone.py | 14 ++--- .../en/taggers/time.py | 22 +++++++- .../en/verbalizers/whitelist.py | 2 +- .../en/verbalizers/word.py | 2 +- .../es/taggers/cardinal.py | 9 +-- .../es/taggers/electronic.py | 8 ++- .../es/taggers/ordinal.py | 8 ++- .../es/taggers/telephone.py | 5 +- .../es/verbalizers/whitelist.py | 2 +- .../es/verbalizers/word.py | 2 +- .../fr/graph_utils.py | 6 +- .../fr/taggers/cardinal.py | 9 +-- .../fr/taggers/date.py | 4 +- .../fr/taggers/punctuation.py | 2 +- .../fr/verbalizers/decimal.py | 2 +- .../fr/verbalizers/ordinal.py | 4 +- .../fr/verbalizers/whitelist.py | 2 +- .../fr/verbalizers/word.py | 2 +- .../hi/graph_utils.py | 8 +-- .../hi/taggers/cardinal.py | 21 ++++--- .../hi/taggers/date.py | 4 +- .../hi/taggers/decimal.py | 16 +++--- .../hi/taggers/fraction.py | 6 +- .../hi/taggers/measure.py | 4 +- .../hi/taggers/telephone.py | 2 +- .../hi/taggers/time.py | 4 +- .../hi/taggers/tokenize_and_classify.py | 6 +- .../inverse_text_normalization/hi/utils.py | 10 ++-- .../hi/verbalizers/verbalize_final.py | 2 +- .../hi/verbalizers/whitelist.py | 2 +- .../hi/verbalizers/word.py | 2 +- .../hy/verbalizers/ordinal.py | 7 ++- .../hy/verbalizers/whitelist.py | 2 +- .../hy/verbalizers/word.py | 2 +- 
.../ja/graph_utils.py | 8 +-- .../ja/taggers/cardinal.py | 5 +- .../ja/taggers/fraction.py | 4 +- .../ja/taggers/preprocessor.py | 4 +- .../ja/taggers/time.py | 8 ++- .../inverse_text_normalization/ja/utils.py | 22 ++++---- .../ja/verbalizers/cardinal.py | 2 +- .../ja/verbalizers/postprocessor.py | 6 +- .../ja/verbalizers/time.py | 10 +++- .../ja/verbalizers/verbalize_final.py | 7 ++- .../mr/graph_utils.py | 8 +-- .../mr/taggers/cardinal.py | 6 +- .../mr/taggers/date.py | 6 +- .../mr/taggers/decimal.py | 4 +- .../mr/verbalizers/word.py | 2 +- .../pt/taggers/cardinal.py | 15 ++--- .../pt/verbalizers/whitelist.py | 2 +- .../pt/verbalizers/word.py | 2 +- .../sv/taggers/date.py | 3 +- .../sv/taggers/decimal.py | 10 +++- .../vi/graph_utils.py | 6 +- .../vi/taggers/cardinal.py | 7 ++- .../vi/taggers/date.py | 5 +- .../vi/taggers/decimal.py | 6 +- .../vi/verbalizers/time.py | 4 +- .../vi/verbalizers/whitelist.py | 2 +- .../vi/verbalizers/word.py | 2 +- .../zh/graph_utils.py | 16 ++++-- .../zh/taggers/cardinal.py | 7 ++- .../zh/taggers/date.py | 4 +- .../zh/taggers/money.py | 4 +- .../zh/taggers/tokenize_and_classify.py | 6 +- .../zh/verbalizers/cardinal.py | 2 +- .../zh/verbalizers/decimal.py | 2 +- .../zh/verbalizers/whitelist.py | 2 +- .../zh/verbalizers/word.py | 2 +- .../text_normalization/ar/graph_utils.py | 8 +-- .../text_normalization/ar/taggers/measure.py | 21 +++---- .../text_normalization/ar/taggers/money.py | 5 +- .../text_normalization/ar/verbalizers/word.py | 2 +- .../text_normalization/de/taggers/cardinal.py | 4 +- .../text_normalization/de/taggers/date.py | 2 +- .../text_normalization/de/taggers/measure.py | 4 +- .../de/taggers/telephone.py | 2 +- .../text_normalization/de/taggers/time.py | 4 +- .../de/taggers/tokenize_and_classify.py | 14 ++++- .../de/verbalizers/ordinal.py | 5 +- .../text_normalization/en/graph_utils.py | 46 ++++++++++++--- .../text_normalization/en/taggers/cardinal.py | 4 +- .../text_normalization/en/taggers/date.py | 6 +- 
.../en/taggers/electronic.py | 18 ++++-- .../text_normalization/en/taggers/measure.py | 26 ++++++--- .../text_normalization/en/taggers/money.py | 6 +- .../text_normalization/en/taggers/range.py | 13 +++-- .../text_normalization/en/taggers/serial.py | 2 +- .../en/taggers/tokenize_and_classify.py | 15 ++++- .../en/verbalizers/ordinal.py | 5 +- .../en/verbalizers/whitelist.py | 2 +- .../text_normalization/en/verbalizers/word.py | 2 +- .../text_normalization/es/graph_utils.py | 5 +- .../text_normalization/es/taggers/cardinal.py | 4 +- .../text_normalization/es/taggers/date.py | 2 +- .../text_normalization/es/taggers/fraction.py | 56 +++++++++++++++++-- .../text_normalization/es/taggers/measure.py | 4 +- .../text_normalization/es/taggers/time.py | 4 +- .../es/taggers/tokenize_and_classify.py | 20 +++++-- .../es/verbalizers/fraction.py | 6 +- .../text_normalization/fr/taggers/date.py | 4 +- .../fr/taggers/tokenize_and_classify.py | 9 ++- .../text_normalization/hi/graph_utils.py | 6 +- .../text_normalization/hi/taggers/cardinal.py | 16 +++--- .../text_normalization/hi/taggers/date.py | 2 +- .../text_normalization/hi/taggers/decimal.py | 9 +-- .../text_normalization/hi/taggers/fraction.py | 2 +- .../text_normalization/hi/taggers/measure.py | 6 +- .../text_normalization/hi/taggers/money.py | 6 +- .../text_normalization/hi/taggers/time.py | 2 +- .../hi/taggers/tokenize_and_classify.py | 2 +- .../text_normalization/hi/utils.py | 12 ++-- .../text_normalization/hi/verbalizers/date.py | 2 +- .../hi/verbalizers/decimal.py | 4 +- .../hi/verbalizers/fraction.py | 2 +- .../hi/verbalizers/measure.py | 8 ++- .../hi/verbalizers/post_processing.py | 6 +- .../hi/verbalizers/whitelist.py | 2 +- .../text_normalization/hu/taggers/cardinal.py | 10 ++-- .../text_normalization/hu/taggers/decimal.py | 2 +- .../text_normalization/hu/taggers/measure.py | 4 +- .../text_normalization/hu/taggers/time.py | 6 +- .../hu/taggers/tokenize_and_classify.py | 20 +++++-- .../hu/verbalizers/telephone.py | 10 
+++- .../hy/verbalizers/whitelist.py | 2 +- .../text_normalization/hy/verbalizers/word.py | 2 +- .../text_normalization/it/taggers/cardinal.py | 4 +- .../text_normalization/it/taggers/measure.py | 4 +- .../it/taggers/tokenize_and_classify.py | 15 ++++- .../text_normalization/ja/graph_utils.py | 8 +-- .../text_normalization/ja/taggers/cardinal.py | 34 +++++------ .../text_normalization/ja/taggers/date.py | 20 +++---- .../text_normalization/ja/taggers/decimal.py | 6 +- .../text_normalization/ja/taggers/fraction.py | 4 +- .../ja/taggers/punctuation.py | 4 +- .../text_normalization/ja/taggers/time.py | 2 +- .../ja/taggers/tokenize_and_classify.py | 4 +- .../text_normalization/ja/utils.py | 3 +- .../text_normalization/ja/verbalizers/date.py | 2 +- .../ja/verbalizers/post_processing.py | 6 +- .../ja/verbalizers/postprocessor.py | 14 +++-- .../text_normalization/ja/verbalizers/time.py | 4 +- .../ja/verbalizers/verbalize.py | 11 +++- .../ja/verbalizers/verbalize_final.py | 11 ++-- .../ja/verbalizers/whitelist.py | 2 +- .../text_normalization/ja/verbalizers/word.py | 2 +- .../text_normalization/normalize.py | 4 +- .../normalize_with_audio.py | 15 ++++- .../text_normalization/ru/taggers/date.py | 2 +- .../ru/taggers/telephone.py | 8 +-- .../text_normalization/rw/graph_utils.py | 36 ++++++++++-- .../text_normalization/sv/taggers/cardinal.py | 28 ++++------ .../text_normalization/sv/taggers/measure.py | 4 +- .../text_normalization/sv/taggers/ordinal.py | 7 ++- .../text_normalization/sv/taggers/time.py | 6 +- .../sv/verbalizers/telephone.py | 6 +- .../text_normalization/zh/graph_utils.py | 6 +- .../text_normalization/zh/taggers/cardinal.py | 34 +++++------ .../zh/taggers/preprocessor.py | 4 +- .../zh/verbalizers/postprocessor.py | 6 +- .../zh/verbalizers/verbalize_final.py | 7 ++- setup.py | 19 +++++-- tests/conftest.py | 10 +++- tests/nemo_text_processing/ar/test_money.py | 4 +- tests/nemo_text_processing/en/test_address.py | 4 +- .../nemo_text_processing/en/test_cardinal.py | 
4 +- tests/nemo_text_processing/en/test_decimal.py | 4 +- .../en/test_electronic.py | 4 +- .../nemo_text_processing/en/test_fraction.py | 4 +- tests/nemo_text_processing/en/test_math.py | 4 +- tests/nemo_text_processing/en/test_measure.py | 4 +- tests/nemo_text_processing/en/test_money.py | 4 +- tests/nemo_text_processing/en/test_ordinal.py | 4 +- .../en/test_punctuation.py | 6 +- tests/nemo_text_processing/en/test_range.py | 4 +- tests/nemo_text_processing/en/test_roman.py | 4 +- tests/nemo_text_processing/en/test_serial.py | 4 +- .../en/test_special_text.py | 4 +- tests/nemo_text_processing/es/test_ordinal.py | 4 +- 192 files changed, 927 insertions(+), 454 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e8728287d..cbc636f1a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,7 @@ ci: repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.3.0 + rev: v5.0.0 hooks: - id: check-yaml - id: check-case-conflict @@ -30,14 +30,14 @@ repos: - id: requirements-txt-fixer - repo: https://github.com/PyCQA/flake8 - rev: 7.1.1 + rev: 7.2.0 hooks: - id: flake8 args: - --select=W605 - repo: https://github.com/PyCQA/isort - rev: 5.12.0 + rev: 6.0.1 hooks: - id: isort name: Format imports @@ -45,7 +45,7 @@ repos: exclude: docs/ - repo: https://github.com/psf/black - rev: 19.10b0 + rev: 25.1.0 hooks: - id: black name: Format code diff --git a/nemo_text_processing/fst_alignment/alignment.py b/nemo_text_processing/fst_alignment/alignment.py index 5e76f66eb..c16aec079 100644 --- a/nemo_text_processing/fst_alignment/alignment.py +++ b/nemo_text_processing/fst_alignment/alignment.py @@ -96,7 +96,7 @@ def parse_args(): EPS = "" -WHITE_SPACE = "\u23B5" +WHITE_SPACE = "\u23b5" ITN_MODE = "itn" TN_MODE = "tn" tn_item_special_chars = ["$", "\\", ":", "+", "-", "="] diff --git a/nemo_text_processing/hybrid/utils.py b/nemo_text_processing/hybrid/utils.py index d634f5a09..39a3fc2c6 100644 --- 
a/nemo_text_processing/hybrid/utils.py +++ b/nemo_text_processing/hybrid/utils.py @@ -482,7 +482,7 @@ def remove_punctuation(text: str, remove_spaces=True, do_lower=True, lang="en", text = re.sub(r" +", " ", text) if remove_spaces: - text = text.replace(" ", "").replace("\u00A0", "").strip() + text = text.replace(" ", "").replace("\u00a0", "").strip() if do_lower: text = text.lower() @@ -515,7 +515,11 @@ def _relax_diff(text): return acceptable -def get_labels(targets: List[str], norm_texts_weights: List[Tuple[str, str]], lang="en",) -> List[List[str]]: +def get_labels( + targets: List[str], + norm_texts_weights: List[Tuple[str, str]], + lang="en", +) -> List[List[str]]: """ Assign labels to generated normalization options (1 - for ground truth, 0 - other options) Args: @@ -605,7 +609,14 @@ def print_df(df): prints data frame """ with pd.option_context( - "display.max_rows", None, "display.max_columns", None, "display.width", 1000, "display.max_colwidth", 400, + "display.max_rows", + None, + "display.max_columns", + None, + "display.width", + 1000, + "display.max_colwidth", + 400, ): print(df) diff --git a/nemo_text_processing/inverse_text_normalization/ar/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ar/taggers/cardinal.py index 47febc4ac..2c58df6a9 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ar/taggers/cardinal.py @@ -33,7 +33,9 @@ def __init__(self, tn_cardinal): self.graph = pynini.invert(tn_cardinal.cardinal_numbers).optimize() optional_minus_graph = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("سالب", '"-"') + NEMO_SPACE, 0, 1, + pynutil.insert("negative: ") + pynini.cross("سالب", '"-"') + NEMO_SPACE, + 0, + 1, ) final_graph = optional_minus_graph + pynutil.insert('integer: "') + self.graph + pynutil.insert('"') diff --git a/nemo_text_processing/inverse_text_normalization/ar/taggers/decimal.py 
b/nemo_text_processing/inverse_text_normalization/ar/taggers/decimal.py index f0d641d14..3b22ece05 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/ar/taggers/decimal.py @@ -36,7 +36,9 @@ def __init__(self, tn_decimal): super().__init__(name="decimal", kind="classify") optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("سالب", '"true"') + delete_extra_space, 0, 1, + pynutil.insert("negative: ") + pynini.cross("سالب", '"true"') + delete_extra_space, + 0, + 1, ) graph_fractional_part = pynini.invert(tn_decimal.graph_fractional).optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ar/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/ar/verbalizers/word.py index 434ffcc6a..76a02eefe 100644 --- a/nemo_text_processing/inverse_text_normalization/ar/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/ar/verbalizers/word.py @@ -27,6 +27,6 @@ def __init__(self): super().__init__(name="word", kind="verbalize") chars = pynini.closure(NEMO_CHAR - " ", 1) char = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + chars + pynutil.delete("\"") - graph = char @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + graph = char @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py index fa5df3367..5eea89af1 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py @@ -207,7 +207,10 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): + graph_in_thousands ) - graph = pynini.union((graph_int | graph_ind) + delete_space + graph_hundreds, graph_zero,) + graph = 
pynini.union( + (graph_int | graph_ind) + delete_space + graph_hundreds, + graph_zero, + ) graph = graph @ pynini.union( pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0" diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/date.py b/nemo_text_processing/inverse_text_normalization/en/taggers/date.py index 5be9240d7..b1ace40ce 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/date.py @@ -165,7 +165,11 @@ def __init__(self, ordinal: GraphFst, input_case: str): + pynutil.add_weight(year_graph, -YEAR_WEIGHT) + pynutil.insert("\"") ) - optional_graph_year = pynini.closure(graph_year, 0, 1,) + optional_graph_year = pynini.closure( + graph_year, + 0, + 1, + ) graph_mdy = month_graph + ( (delete_extra_space + day_graph) | graph_year | (delete_extra_space + day_graph + graph_year) ) diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/en/taggers/decimal.py index 1d730ec30..6e5de2418 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/decimal.py @@ -97,7 +97,9 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): point = pynutil.delete("point") optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space, 0, 1, + pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space, + 0, + 1, ) graph_fractional = pynutil.insert("fractional_part: \"") + graph_decimal + pynutil.insert("\"") diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/electronic.py b/nemo_text_processing/inverse_text_normalization/en/taggers/electronic.py index a2373d9d7..0a41b4702 100644 --- 
a/nemo_text_processing/inverse_text_normalization/en/taggers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/electronic.py @@ -106,7 +106,13 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): delete_extra_space + url_symbols + delete_extra_space - + (domain | pynini.closure(accepted_username + delete_extra_space,) + accepted_username) + + ( + domain + | pynini.closure( + accepted_username + delete_extra_space, + ) + + accepted_username + ) ) protocol_default = ( diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/en/taggers/measure.py index 2d9d5e02c..69eeaa56e 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/measure.py @@ -58,7 +58,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, input_case: str = INPU graph_unit_plural = pynini.compose(casing_graph, graph_unit_plural).optimize() optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space, 0, 1, + pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space, + 0, + 1, ) unit_singular = convert_space(graph_unit_singular) diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/money.py b/nemo_text_processing/inverse_text_normalization/en/taggers/money.py index 2a1e32a49..2c5d5ad78 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/money.py @@ -53,7 +53,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, input_case: str = INPU # "one fifty" -> "one hundred fifty" with_hundred = pynini.compose( pynini.closure(NEMO_NOT_SPACE) + pynini.accep(" ") + pynutil.insert("hundred ") + NEMO_SIGMA, - pynini.compose(cardinal_graph, NEMO_DIGIT ** 3), + pynini.compose(cardinal_graph, 
NEMO_DIGIT**3), ) cardinal_graph |= with_hundred graph_decimal_final = decimal.final_graph_wo_negative diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/en/taggers/telephone.py index 06d749e39..9a106ca78 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/telephone.py @@ -40,7 +40,7 @@ def get_serial_number(cardinal): """ digit = pynini.compose(cardinal.graph_no_exception, NEMO_DIGIT) - two_digit = pynutil.add_weight(pynini.compose(cardinal.graph_two_digit, NEMO_DIGIT ** 2), 0.002) + two_digit = pynutil.add_weight(pynini.compose(cardinal.graph_two_digit, NEMO_DIGIT**2), 0.002) character = digit | two_digit | NEMO_ALPHA sequence = (NEMO_LOWER_NOT_A | digit) + pynini.closure(pynutil.delete(" ") + character, 2) sequence |= character + pynini.closure(pynutil.delete(" ") + (digit | NEMO_ALPHA), 2) @@ -116,7 +116,7 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): triple_digit.invert() # to handle cases like "one twenty three" - two_digit_cardinal = pynini.compose(cardinal.graph_no_exception, NEMO_DIGIT ** 2) + two_digit_cardinal = pynini.compose(cardinal.graph_no_exception, NEMO_DIGIT**2) double_digit_to_digit = ( pynini.compose(double_digit, str_to_digit + pynutil.delete(" ") + str_to_digit) | two_digit_cardinal ) @@ -139,7 +139,7 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): number_part = pynini.compose( single_double_or_triple_digit, - NEMO_DIGIT ** 3 + pynutil.insert("-") + NEMO_DIGIT ** 3 + pynutil.insert("-") + NEMO_DIGIT ** 4, + NEMO_DIGIT**3 + pynutil.insert("-") + NEMO_DIGIT**3 + pynutil.insert("-") + NEMO_DIGIT**4, ).optimize() number_part = pynutil.insert("number_part: \"") + number_part.optimize() + pynutil.insert("\"") @@ -156,16 +156,16 @@ def __init__(self, cardinal: GraphFst, input_case: str = 
INPUT_LOWER_CASED): graph = optional_country_code + number_part # credit card number - space_four_digits = insert_space + NEMO_DIGIT ** 4 + space_four_digits = insert_space + NEMO_DIGIT**4 space_five_digits = space_four_digits + NEMO_DIGIT space_six_digits = space_five_digits + NEMO_DIGIT credit_card_graph = pynini.compose( single_double_or_triple_digit, - NEMO_DIGIT ** 4 + (space_six_digits | (space_four_digits ** 2)) + space_four_digits, + NEMO_DIGIT**4 + (space_six_digits | (space_four_digits**2)) + space_four_digits, ).optimize() credit_card_graph |= pynini.compose( - single_double_or_triple_digit, NEMO_DIGIT ** 4 + space_six_digits + space_five_digits + single_double_or_triple_digit, NEMO_DIGIT**4 + space_six_digits + space_five_digits ).optimize() graph |= pynutil.insert("number_part: \"") + credit_card_graph.optimize() + pynutil.insert("\"") @@ -173,7 +173,7 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): # SSN ssn_graph = pynini.compose( single_double_or_triple_digit, - NEMO_DIGIT ** 3 + pynutil.insert("-") + NEMO_DIGIT ** 2 + pynutil.insert("-") + NEMO_DIGIT ** 4, + NEMO_DIGIT**3 + pynutil.insert("-") + NEMO_DIGIT**2 + pynutil.insert("-") + NEMO_DIGIT**4, ).optimize() graph |= pynutil.insert("number_part: \"") + ssn_graph.optimize() + pynutil.insert("\"") diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/time.py b/nemo_text_processing/inverse_text_normalization/en/taggers/time.py index 53d3dd931..46dc71bc8 100644 --- a/nemo_text_processing/inverse_text_normalization/en/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/en/taggers/time.py @@ -71,14 +71,32 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): graph_minute_double = pynini.union(*labels_minute_double) @ cardinal graph_minute_verbose = pynini.cross("half", "30") | pynini.cross("quarter", "15") - oclock = pynini.cross(pynini.union("o' clock", "o clock", "o'clock", "oclock", "hundred hours",), "",) + oclock = 
pynini.cross( + pynini.union( + "o' clock", + "o clock", + "o'clock", + "oclock", + "hundred hours", + ), + "", + ) if input_case == INPUT_CASED: minute_to_graph = capitalized_input_graph(minute_to_graph) graph_minute_single = capitalized_input_graph(graph_minute_single) graph_minute_double = capitalized_input_graph(graph_minute_double) graph_minute_verbose |= pynini.cross("Half", "30") | pynini.cross("Quarter", "15") - oclock |= pynini.cross(pynini.union("O' clock", "O clock", "O'clock", "Oclock", "Hundred hours",), "",) + oclock |= pynini.cross( + pynini.union( + "O' clock", + "O clock", + "O'clock", + "Oclock", + "Hundred hours", + ), + "", + ) final_graph_hour = pynutil.insert("hours: \"") + graph_hour + pynutil.insert("\"") graph_minute = ( diff --git a/nemo_text_processing/inverse_text_normalization/en/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/en/verbalizers/whitelist.py index 7e1148909..9132a639e 100644 --- a/nemo_text_processing/inverse_text_normalization/en/verbalizers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/en/verbalizers/whitelist.py @@ -35,5 +35,5 @@ def __init__(self): + pynini.closure(NEMO_CHAR - " ", 1) + pynutil.delete("\"") ) - graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/en/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/en/verbalizers/word.py index 70614fd49..e75474e9b 100644 --- a/nemo_text_processing/inverse_text_normalization/en/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/en/verbalizers/word.py @@ -29,6 +29,6 @@ def __init__(self): super().__init__(name="word", kind="verbalize") chars = pynini.closure(NEMO_CHAR - " ", 1) char = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + chars + pynutil.delete("\"") - 
graph = char @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + graph = char @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py index 3e164bcc9..d3082509a 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/cardinal.py @@ -160,18 +160,13 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): self.graph_no_exception = graph.optimize() # save self.numbers_up_to_thousand for use in DecimalFst - digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT ** 2) | (NEMO_DIGIT ** 3) + digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3) numbers_up_to_thousand = pynini.compose(self.graph_no_exception, digits_up_to_thousand).optimize() self.numbers_up_to_thousand = numbers_up_to_thousand.optimize() # save self.numbers_up_to_million for use in DecimalFst digits_up_to_million = ( - NEMO_DIGIT - | (NEMO_DIGIT ** 2) - | (NEMO_DIGIT ** 3) - | (NEMO_DIGIT ** 4) - | (NEMO_DIGIT ** 5) - | (NEMO_DIGIT ** 6) + NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3) | (NEMO_DIGIT**4) | (NEMO_DIGIT**5) | (NEMO_DIGIT**6) ) numbers_up_to_million = pynini.compose(graph, digits_up_to_million).optimize() self.numbers_up_to_million = numbers_up_to_million.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py b/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py index 50a5e07f7..a7d767119 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/electronic.py @@ -136,7 +136,13 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): delete_extra_space + symbols + delete_extra_space - + (domain | 
pynini.closure(accepted_username + delete_extra_space,) + accepted_username) + + ( + domain + | pynini.closure( + accepted_username + delete_extra_space, + ) + + accepted_username + ) ) protocol_default = ( diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py index d97cc752a..7cdcfacc7 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/ordinal.py @@ -62,7 +62,13 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): full_graph_ties = graph_ties | (graph_ties + pynini.cross(" ", "y") + graph_digit) - ordinal_graph_union = pynini.union(graph_digit, graph_teens, graph_twenties, full_graph_ties, graph_hundreds,) + ordinal_graph_union = pynini.union( + graph_digit, + graph_teens, + graph_twenties, + full_graph_ties, + graph_hundreds, + ) accept_o_endings = NEMO_SIGMA + pynini.accep("o") accept_a_endings = NEMO_SIGMA + pynini.accep("a") diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py index 2086d643c..8c73ca434 100644 --- a/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/es/taggers/telephone.py @@ -110,7 +110,10 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): # Denormalized phone numbers are grouped in sets of 3 or 4 digits group_of_two = pynini.union(doubled_digit, digit_twice, double_digits) - group_of_three = pynini.union(tripled_digit, single_digits + pynutil.delete(" ") + group_of_two,) + group_of_three = pynini.union( + tripled_digit, + single_digits + pynutil.delete(" ") + group_of_two, + ) group_of_four = pynini.union( group_of_two + pynutil.delete(" ") + group_of_two, diff --git 
a/nemo_text_processing/inverse_text_normalization/es/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/whitelist.py index 606a4e569..d54b8509a 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/whitelist.py @@ -34,5 +34,5 @@ def __init__(self): + pynini.closure(NEMO_CHAR - " ", 1) + pynutil.delete("\"") ) - graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/es/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/es/verbalizers/word.py index 8c0bd08b1..197155d92 100644 --- a/nemo_text_processing/inverse_text_normalization/es/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/es/verbalizers/word.py @@ -28,6 +28,6 @@ def __init__(self): super().__init__(name="word", kind="verbalize") chars = pynini.closure(NEMO_CHAR - " ", 1) char = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + chars + pynutil.delete("\"") - graph = char @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + graph = char @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/fr/graph_utils.py b/nemo_text_processing/inverse_text_normalization/fr/graph_utils.py index 36eccd14b..676574c79 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/fr/graph_utils.py @@ -35,9 +35,9 @@ NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize() NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize() NEMO_HEX = pynini.union(*string.hexdigits).optimize() -NEMO_NON_BREAKING_SPACE = u"\u00A0" 
+NEMO_NON_BREAKING_SPACE = u"\u00a0" NEMO_SPACE = " " -NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize() +NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00a0").optimize() NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() @@ -188,4 +188,4 @@ def delete_tokens(self, fst) -> 'pynini.FstLike': + delete_space + pynutil.delete("}") ) - return res @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + return res @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/cardinal.py index d827a63e2..ea1fcf8ea 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/cardinal.py @@ -248,18 +248,13 @@ def __init__(self): self.graph_no_exception = graph.optimize() # save self.numbers_up_to_thousand for use in DecimalFst - digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT ** 2) | (NEMO_DIGIT ** 3) + digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3) numbers_up_to_thousand = pynini.compose(graph, digits_up_to_thousand).optimize() self.numbers_up_to_thousand = numbers_up_to_thousand # save self.numbers_up_to_million for use in DecimalFst digits_up_to_million = ( - NEMO_DIGIT - | (NEMO_DIGIT ** 2) - | (NEMO_DIGIT ** 3) - | (NEMO_DIGIT ** 4) - | (NEMO_DIGIT ** 5) - | (NEMO_DIGIT ** 6) + NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3) | (NEMO_DIGIT**4) | (NEMO_DIGIT**5) | (NEMO_DIGIT**6) ) numbers_up_to_million = pynini.compose(graph, digits_up_to_million).optimize() self.numbers_up_to_million = numbers_up_to_million diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/date.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/date.py index 
06807f6a3..68d35741c 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/date.py @@ -46,7 +46,9 @@ def __init__(self, cardinal: GraphFst): day_graph = self.cardinal | pynini.cross("premier", "1") # Premier is only ordinal used for dates day_graph = pynutil.insert("day: \"") + day_graph + pynutil.insert("\"") optional_graph_year = pynini.closure( - delete_extra_space + pynutil.insert("year: \"") + year_graph + pynutil.insert("\""), 0, 1, + delete_extra_space + pynutil.insert("year: \"") + year_graph + pynutil.insert("\""), + 0, + 1, ) graph_dmy = day_graph + delete_extra_space + month_graph + optional_graph_year diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/punctuation.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/punctuation.py index e6c833db3..da26ba825 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/taggers/punctuation.py +++ b/nemo_text_processing/inverse_text_normalization/fr/taggers/punctuation.py @@ -28,7 +28,7 @@ def __init__(self): super().__init__(name="punctuation", kind="classify") s = "!#$%&\'()*+,-./:;<=>?@^_`{|}~" - guillemets = "\u00AB" + "\u00BB" # quotation marks in French. + guillemets = "\u00ab" + "\u00bb" # quotation marks in French. 
s += guillemets punct = pynini.union(*s) diff --git a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/decimal.py index ce0bdf8c4..3e654b859 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/decimal.py @@ -46,7 +46,7 @@ def __init__(self): super().__init__(name="decimal", kind="verbalize") # Need parser to group digits by threes - exactly_three_digits = NEMO_DIGIT ** 3 + exactly_three_digits = NEMO_DIGIT**3 at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) space_every_three_integer = ( diff --git a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/ordinal.py index 77dd6323f..3179af643 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/ordinal.py @@ -61,12 +61,12 @@ def __init__(self): graph_roman_hundreds = pynini.string_file(get_abs_path("data/roman/hundreds_large.tsv")).invert() graph_roman_zero_digit = pynutil.delete("0") - graph_roman_hundreds = NEMO_DIGIT ** 3 @ ( + graph_roman_hundreds = NEMO_DIGIT**3 @ ( graph_roman_hundreds + pynini.union(graph_roman_ties, graph_roman_zero_digit) + pynini.union(graph_roman_digits, graph_roman_zero_digit) ) - graph_roman_ties = NEMO_DIGIT ** 2 @ ( + graph_roman_ties = NEMO_DIGIT**2 @ ( graph_roman_ties + pynini.union(graph_roman_digits, graph_roman_zero_digit) ) graph_roman_digits = NEMO_DIGIT @ graph_roman_digits diff --git a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/whitelist.py index 00327a416..38b1a962c 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/whitelist.py +++ 
b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/whitelist.py @@ -39,5 +39,5 @@ def __init__(self): + pynini.closure(NEMO_CHAR - " ", 1) + pynutil.delete("\"") ) - graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/word.py index 6c510af08..6b43ec999 100644 --- a/nemo_text_processing/inverse_text_normalization/fr/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/fr/verbalizers/word.py @@ -33,6 +33,6 @@ def __init__(self): super().__init__(name="word", kind="verbalize") chars = pynini.closure(NEMO_CHAR - " ", 1) char = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + chars + pynutil.delete("\"") - graph = char @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + graph = char @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/hi/graph_utils.py b/nemo_text_processing/inverse_text_normalization/hi/graph_utils.py index 8454fc139..96cbc58bb 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/hi/graph_utils.py @@ -34,10 +34,10 @@ NEMO_HI_DIGIT = pynini.union("०", "१", "२", "३", "४", "५", "६", "७", "८", "९").optimize() NEMO_HEX = pynini.union(*string.hexdigits).optimize() -NEMO_NON_BREAKING_SPACE = u"\u00A0" -NEMO_ZWNJ = u"\u200C" +NEMO_NON_BREAKING_SPACE = u"\u00a0" +NEMO_ZWNJ = u"\u200c" NEMO_SPACE = " " -NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize() +NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00a0").optimize() NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, 
NEMO_WHITE_SPACE).optimize() NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() @@ -196,4 +196,4 @@ def delete_tokens(self, fst) -> 'pynini.FstLike': + delete_space + pynutil.delete("}") ) - return res @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + return res @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/cardinal.py index f1e4da381..63b055bef 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/cardinal.py @@ -79,12 +79,14 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): # This only covers "standard format". # Conventional format like thousand crores/lakh crores is yet to be implemented graph_in_thousands = pynini.union( - self.graph_two_digit + delete_space + delete_thousand, pynutil.insert("००", weight=0.1), + self.graph_two_digit + delete_space + delete_thousand, + pynutil.insert("००", weight=0.1), ) self.graph_thousands = graph_in_thousands graph_in_lakhs = pynini.union( - self.graph_two_digit + delete_space + pynutil.delete("लाख"), pynutil.insert("००", weight=0.1), + self.graph_two_digit + delete_space + pynutil.delete("लाख"), + pynutil.insert("००", weight=0.1), ) graph_in_crores = pynini.union( @@ -93,23 +95,28 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED): ) graph_in_arabs = pynini.union( - self.graph_two_digit + delete_space + pynutil.delete("अरब"), pynutil.insert("००", weight=0.1), + self.graph_two_digit + delete_space + pynutil.delete("अरब"), + pynutil.insert("००", weight=0.1), ) graph_in_kharabs = pynini.union( - self.graph_two_digit + delete_space + pynutil.delete("खरब"), pynutil.insert("००", weight=0.1), + self.graph_two_digit + delete_space + pynutil.delete("खरब"), + pynutil.insert("००", weight=0.1), ) graph_in_nils = pynini.union( - 
self.graph_two_digit + delete_space + pynutil.delete("नील"), pynutil.insert("००", weight=0.1), + self.graph_two_digit + delete_space + pynutil.delete("नील"), + pynutil.insert("००", weight=0.1), ) graph_in_padmas = pynini.union( - self.graph_two_digit + delete_space + pynutil.delete("पद्म"), pynutil.insert("००", weight=0.1), + self.graph_two_digit + delete_space + pynutil.delete("पद्म"), + pynutil.insert("००", weight=0.1), ) graph_in_shankhs = pynini.union( - self.graph_two_digit + delete_space + pynutil.delete("शंख"), pynutil.insert("००", weight=0.1), + self.graph_two_digit + delete_space + pynutil.delete("शंख"), + pynutil.insert("००", weight=0.1), ) graph_ind = ( diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py index 6859f0834..817b1b86a 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py @@ -27,9 +27,9 @@ class DateFst(GraphFst): """ - Finite state transducer for classifying date, + Finite state transducer for classifying date, e.g. पांच जनवरी दो हज़ार बारह -> date { month: "जनवरी" day: "५" year: "२०१२" preserve_order: true } - e.g. दो हज़ार बारह -> date { year: "२०१२" preserve_order: true } + e.g. दो हज़ार बारह -> date { year: "२०१२" preserve_order: true } Args: cardinal: CardinalFst date: DateFst diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/decimal.py index 215c34e5c..ddbf32c9b 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/decimal.py @@ -40,8 +40,8 @@ def get_quantity( Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral, e.g. दस लाख -> integer_part: "१॰" quantity: "लाख" e.g. 
एक दशमलव पाँच लाख -> integer_part: "१" fractional_part: "५" quantity: "लाख" - - Args: + + Args: decimal: decimal FST cardinal_up_to_hundred: cardinal FST input_case: accepting either "lower_cased" or "cased" input. @@ -70,17 +70,17 @@ class DecimalFst(GraphFst): Decimal point "." is determined by "दशमलव" e.g. ऋण एक दशमलव दो छह -> decimal { negative: "true" integer_part: "१" morphosyntactic_features: "." fractional_part: "२६" } - + This decimal rule assumes that decimals can be pronounced as: (a cardinal) + ('दशमलव') plus (any sequence of cardinals <१०००, including 'शून्य') - - Also writes large numbers in shortened form, e.g. + + Also writes large numbers in shortened form, e.g. e.g. एक दशमलव दो छह लाख -> decimal { negative: "false" integer_part: "१" morphosyntactic_features: "." fractional_part: "२६" quantity: "लाख" } e.g. दो लाख -> decimal { negative: "false" integer_part: "२" quantity: "लाख" } e.g. एक अरब आठ सौ चौबीस लाख -> decimal { negative: "false" integer_part: "१८२४" quantity: "लाख" } Args: cardinal: CardinalFst - + """ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): @@ -97,7 +97,9 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED): point = pynutil.delete("दशमलव") optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("ऋण", "\"true\"") + delete_extra_space, 0, 1, + pynutil.insert("negative: ") + pynini.cross("ऋण", "\"true\"") + delete_extra_space, + 0, + 1, ) graph_fractional = pynutil.insert("fractional_part: \"") + graph_decimal + pynutil.insert("\"") diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/fraction.py index 56b2c63e3..1e44f59e8 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/fraction.py @@ -40,7 +40,7 @@ class FractionFst(GraphFst): e.g. 
ऋण एक बटा छब्बीस -> fraction { negative: "true" numerator: "१" denominator: "२६" } e.g. छह सौ साठ बटा पाँच सौ तैंतालीस -> fraction { negative: "false" numerator: "६६०" denominator: "५४३" } - + The fractional rule assumes that fractions can be pronounced as: (a cardinal) + ('बटा') plus (a cardinal, excluding 'शून्य') Args: @@ -65,7 +65,9 @@ def __init__(self, cardinal: GraphFst): self.graph = graph.optimize() self.final_graph_wo_negative = graph optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("ऋण", "\"true\"") + delete_extra_space, 0, 1, + pynutil.insert("negative: ") + pynini.cross("ऋण", "\"true\"") + delete_extra_space, + 0, + 1, ) graph = optional_graph_negative + graph final_graph = self.add_tokens(graph) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/measure.py index d7e9ba562..15d8e4eb8 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/measure.py @@ -45,7 +45,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): decimal_graph = decimal.final_graph_wo_negative optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("ऋण", "\"true\"") + delete_extra_space, 0, 1, + pynutil.insert("negative: ") + pynini.cross("ऋण", "\"true\"") + delete_extra_space, + 0, + 1, ) measurements_graph = pynini.string_file(get_abs_path("data/measure/measurements.tsv")).invert() diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py index 1d1d3c875..386f1353d 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py @@ -23,7 +23,7 @@ class TelephoneFst(GraphFst): """ Finite state transducer for classifying 
telephone numbers, e.g. e.g. प्लस इक्यानवे नौ आठ सात छह पांच चार तीन दो एक शून्य => tokens { name: "+९१ ९८७६५ ४३२१०" } - + Args: Cardinal: CardinalFst """ diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/time.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/time.py index ac539966d..6bfc51af7 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/time.py @@ -21,9 +21,9 @@ class TimeFst(GraphFst): """ - Finite state transducer for classifying time, + Finite state transducer for classifying time, e.g. एक बजके सात मिनट -> time { hours: "१" minutes: "७" } - e.g. चार बजे चवालीस मिनट -> time { hours: "४" minutes: "४४" } + e.g. चार बजे चवालीस मिनट -> time { hours: "४" minutes: "४४" } Args: cardinal: CardinalFst time: TimeFst diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py index 62554bd14..b3fcb0c2d 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py @@ -53,7 +53,11 @@ class ClassifyFst(GraphFst): """ def __init__( - self, cache_dir: str = None, overwrite_cache: bool = False, whitelist: str = None, input_case: str = None, + self, + cache_dir: str = None, + overwrite_cache: bool = False, + whitelist: str = None, + input_case: str = None, ): super().__init__(name="tokenize_and_classify", kind="classify") diff --git a/nemo_text_processing/inverse_text_normalization/hi/utils.py b/nemo_text_processing/inverse_text_normalization/hi/utils.py index 5e387b6e8..8e3f62c3c 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/utils.py +++ b/nemo_text_processing/inverse_text_normalization/hi/utils.py @@ -24,7 +24,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this 
file - + Returns absolute path """ abs_path = os.path.dirname(os.path.abspath(__file__)) + os.sep + rel_path @@ -52,10 +52,10 @@ def load_labels(abs_path): def apply_fst(text, fst): - """ Given a string input, returns the output string - produced by traversing the path with lowest weight. - If no valid path accepts input string, returns an - error. + """Given a string input, returns the output string + produced by traversing the path with lowest weight. + If no valid path accepts input string, returns an + error. """ try: print(pynini.shortestpath(text @ fst).string()) diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize_final.py index d8d61f2f8..17dfebf64 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize_final.py @@ -23,7 +23,7 @@ class VerbalizeFinalFst(GraphFst): """ - Finite state transducer that verbalizes an entire sentence, e.g. + Finite state transducer that verbalizes an entire sentence, e.g. 
tokens { name: "अब" } tokens { time { hours: "१२" minutes: "३०" } } tokens { name: "बज" } tokens { name: "गए" } tokens { name: "हैं" } -> अब १२:३० बज गए हैं """ diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/whitelist.py index 0a3e3e261..9f342eacd 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/whitelist.py @@ -40,5 +40,5 @@ def __init__(self): + pynini.closure(NEMO_CHAR - " ", 1) + pynutil.delete("\"") ) - graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/word.py index 0b1a895bb..a411b2ba8 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/word.py @@ -34,6 +34,6 @@ def __init__(self): super().__init__(name="word", kind="verbalize") chars = pynini.closure(NEMO_CHAR - " ", 1) char = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + chars + pynutil.delete("\"") - graph = char @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + graph = char @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/hy/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/hy/verbalizers/ordinal.py index e912ff60b..b0d4e52cc 100644 --- a/nemo_text_processing/inverse_text_normalization/hy/verbalizers/ordinal.py +++ b/nemo_text_processing/inverse_text_normalization/hy/verbalizers/ordinal.py @@ -37,7 +37,12 @@ def __init__(self): 
convert_one = pynini.cross("[BOS]1", "[BOS]1-ին") convert_rest = pynutil.insert("-րդ", weight=0.01) - suffix = pynini.cdrewrite(convert_rest | convert_one, "", "[EOS]", NEMO_SIGMA,) + suffix = pynini.cdrewrite( + convert_rest | convert_one, + "", + "[EOS]", + NEMO_SIGMA, + ) graph = graph @ suffix delete_tokens = self.delete_tokens(graph) self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/hy/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/hy/verbalizers/whitelist.py index bdfb84dea..1aa132328 100644 --- a/nemo_text_processing/inverse_text_normalization/hy/verbalizers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/hy/verbalizers/whitelist.py @@ -34,5 +34,5 @@ def __init__(self): + pynini.closure(NEMO_CHAR - " ", 1) + pynutil.delete("\"") ) - graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/hy/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/hy/verbalizers/word.py index b846cc4b9..a6887e528 100644 --- a/nemo_text_processing/inverse_text_normalization/hy/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/hy/verbalizers/word.py @@ -29,6 +29,6 @@ def __init__(self): super().__init__(name="word", kind="verbalize") chars = pynini.closure(NEMO_CHAR - " ", 1) char = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + chars + pynutil.delete("\"") - graph = char @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + graph = char @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ja/graph_utils.py b/nemo_text_processing/inverse_text_normalization/ja/graph_utils.py index 57474b772..abcced58a 100644 --- 
a/nemo_text_processing/inverse_text_normalization/ja/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/ja/graph_utils.py @@ -29,16 +29,16 @@ NEMO_CHAR = utf8.VALID_UTF8_CHAR -NEMO_NARROW_NON_BREAK_SPACE = "\u202F" +NEMO_NARROW_NON_BREAK_SPACE = "\u202f" NEMO_DIGIT = byte.DIGIT NEMO_LOWER = pynini.union(*string.ascii_lowercase).optimize() NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize() NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize() NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize() NEMO_HEX = pynini.union(*string.hexdigits).optimize() -NEMO_NON_BREAKING_SPACE = "\u00A0" +NEMO_NON_BREAKING_SPACE = "\u00a0" NEMO_SPACE = " " -NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize() +NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00a0").optimize() NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() @@ -289,4 +289,4 @@ def delete_tokens(self, fst) -> 'pynini.FstLike': + delete_space + pynutil.delete("}") ) - return res @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + return res @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) diff --git a/nemo_text_processing/inverse_text_normalization/ja/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ja/taggers/cardinal.py index fa6bebd87..15d17f81d 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ja/taggers/cardinal.py @@ -39,7 +39,10 @@ def __init__(self): hundred = pynutil.delete("百") | pynutil.delete("ひゃく") | pynutil.delete("びゃく") | pynutil.delete("ぴゃく") hundred_alt = ( - pynini.cross("百", "1") | pynini.cross("ひゃく", "1") | pynini.cross("びゃく", "1") | pynini.cross("ぴゃく", "1") + pynini.cross("百", "1") + | pynini.cross("ひゃく", "1") + | pynini.cross("びゃく", "1") + | pynini.cross("ぴゃく", "1") ) graph_hundred_component 
= pynini.union(((graph_digit + hundred) | hundred_alt), pynutil.insert("0")) graph_hundred_component += pynini.union( diff --git a/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction.py b/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction.py index 908b02d95..0ced0c679 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction.py +++ b/nemo_text_processing/inverse_text_normalization/ja/taggers/fraction.py @@ -36,7 +36,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): cardinal = cardinal.just_cardinals decimal = decimal.just_decimal - fraction_word = pynutil.delete("分の") | pynutil.delete(" 分 の ") | pynutil.delete("分 の ") | pynutil.delete("分 の") + fraction_word = ( + pynutil.delete("分の") | pynutil.delete(" 分 の ") | pynutil.delete("分 の ") | pynutil.delete("分 の") + ) integer_word = pynutil.delete("と") | pynutil.delete("荷") root_word = pynini.accep("√") | pynini.cross("ルート", "√") diff --git a/nemo_text_processing/inverse_text_normalization/ja/taggers/preprocessor.py b/nemo_text_processing/inverse_text_normalization/ja/taggers/preprocessor.py index 8fca40fdd..26e053334 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/taggers/preprocessor.py +++ b/nemo_text_processing/inverse_text_normalization/ja/taggers/preprocessor.py @@ -32,7 +32,9 @@ class PreProcessorFst(GraphFst): ''' def __init__( - self, remove_interjections: bool = True, fullwidth_to_halfwidth: bool = True, + self, + remove_interjections: bool = True, + fullwidth_to_halfwidth: bool = True, ): super().__init__(name="PreProcessor", kind="processor") diff --git a/nemo_text_processing/inverse_text_normalization/ja/taggers/time.py b/nemo_text_processing/inverse_text_normalization/ja/taggers/time.py index 8477dfaa5..20ff3f34a 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/ja/taggers/time.py @@ -38,14 +38,18 @@ def __init__(self): minutes_seconds = 
pynini.string_file(get_abs_path("data/time_minutes_seconds.tsv")) hour_component = ( - pynutil.insert("hours: \"") + ((hours + pynutil.delete("時")) | pynini.accep("正午")) + pynutil.insert("\"") + pynutil.insert("hours: \"") + + ((hours + pynutil.delete("時")) | pynini.accep("正午")) + + pynutil.insert("\"") ) minute_component = ( pynutil.insert("minutes: \"") + ((minutes_seconds + pynutil.delete("分")) | pynini.accep("半")) + pynutil.insert("\"") ) - second_component = pynutil.insert("seconds: \"") + minutes_seconds + pynutil.delete("秒") + pynutil.insert("\"") + second_component = ( + pynutil.insert("seconds: \"") + minutes_seconds + pynutil.delete("秒") + pynutil.insert("\"") + ) graph_regular = ( pynini.closure(hour_component + insert_space + minute_component + insert_space + second_component) diff --git a/nemo_text_processing/inverse_text_normalization/ja/utils.py b/nemo_text_processing/inverse_text_normalization/ja/utils.py index 95555308b..fd3017d28 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/utils.py +++ b/nemo_text_processing/inverse_text_normalization/ja/utils.py @@ -17,19 +17,19 @@ def get_abs_path(rel_path): """ - Get absolute path + Get absolute path - Args: - rel_path: relative path to this file -<<<<<<< HEAD -<<<<<<< HEAD + Args: + rel_path: relative path to this file + <<<<<<< HEAD + <<<<<<< HEAD -======= - ->>>>>>> 0a4a21c (Jp itn 20240221 (#141)) -======= + ======= ->>>>>>> 59f46198ab4c8880c6a5fb88f3cbee9530156498 - Returns absolute path + >>>>>>> 0a4a21c (Jp itn 20240221 (#141)) + ======= + + >>>>>>> 59f46198ab4c8880c6a5fb88f3cbee9530156498 + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/cardinal.py index 60bdff8a1..62d41cb65 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/cardinal.py +++ 
b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/cardinal.py @@ -52,7 +52,7 @@ def __init__(self): + pynutil.delete("\"") ) - exactly_three_digits = NEMO_DIGIT ** 3 + exactly_three_digits = NEMO_DIGIT**3 at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) group_by_threes = at_most_three_digits + (pynutil.insert(",") + exactly_three_digits).closure() diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/postprocessor.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/postprocessor.py index 7bbc16516..103cfb7a8 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/postprocessor.py +++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/postprocessor.py @@ -36,7 +36,11 @@ class PostProcessor(GraphFst): ''' def __init__( - self, remove_puncts: bool = False, to_upper: bool = False, to_lower: bool = False, tag_oov: bool = False, + self, + remove_puncts: bool = False, + to_upper: bool = False, + to_lower: bool = False, + tag_oov: bool = False, ): super().__init__(name="PostProcessor", kind="processor") diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/time.py index 798cd001d..8e95e14cf 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/time.py @@ -40,12 +40,18 @@ def __init__(self): hours_component |= hours_component_alt minutes_component = ( - pynutil.delete("minutes: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.insert("分") + pynutil.delete("\"") + pynutil.delete("minutes: \"") + + pynini.closure(NEMO_NOT_QUOTE) + + pynutil.insert("分") + + pynutil.delete("\"") ) minutes_component_alt = pynutil.delete("minutes: \"") + pynini.accep("半") + pynutil.delete("\"") minutes_component |= minutes_component_alt second_component = ( - pynutil.delete("seconds: \"") + 
pynini.closure(NEMO_NOT_QUOTE) + pynutil.insert("秒") + pynutil.delete("\"") + pynutil.delete("seconds: \"") + + pynini.closure(NEMO_NOT_QUOTE) + + pynutil.insert("秒") + + pynutil.delete("\"") ) suffix_component = pynutil.delete("suffix: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") diff --git a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/verbalize_final.py index 980e41816..7624d5f1b 100644 --- a/nemo_text_processing/inverse_text_normalization/ja/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/ja/verbalizers/verbalize_final.py @@ -47,7 +47,12 @@ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_ ) verbalizer = pynini.closure(delete_space + token_verbalizer + delete_space) - postprocessor = PostProcessor(remove_puncts=False, to_upper=False, to_lower=False, tag_oov=False,) + postprocessor = PostProcessor( + remove_puncts=False, + to_upper=False, + to_lower=False, + tag_oov=False, + ) self.fst = (verbalizer @ postprocessor.fst).optimize() if far_file: diff --git a/nemo_text_processing/inverse_text_normalization/mr/graph_utils.py b/nemo_text_processing/inverse_text_normalization/mr/graph_utils.py index 9e6276813..eaac1ba33 100644 --- a/nemo_text_processing/inverse_text_normalization/mr/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/mr/graph_utils.py @@ -28,13 +28,13 @@ NEMO_CHAR = utf8.VALID_UTF8_CHAR NEMO_MARATHI_DIGITS = ( - "\u0966" + "\u0967" + "\u0968" + "\u0969" + "\u096A" + "\u096B" + "\u096C" + "\u096D" + "\u096E" + "\u096F" + "\u0966" + "\u0967" + "\u0968" + "\u0969" + "\u096a" + "\u096b" + "\u096c" + "\u096d" + "\u096e" + "\u096f" ) NEMO_DIGIT = pynini.union(*NEMO_MARATHI_DIGITS).optimize() NEMO_HEX = pynini.union(*string.hexdigits).optimize() -NEMO_NON_BREAKING_SPACE = u"\u00A0" +NEMO_NON_BREAKING_SPACE = u"\u00a0" NEMO_SPACE = " " -NEMO_WHITE_SPACE = 
pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize() +NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00a0").optimize() NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() @@ -191,4 +191,4 @@ def delete_tokens(self, fst) -> 'pynini.FstLike': + delete_space + pynutil.delete("}") ) - return res @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + return res @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) diff --git a/nemo_text_processing/inverse_text_normalization/mr/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/mr/taggers/cardinal.py index 27d0a35c5..8aa218a9a 100644 --- a/nemo_text_processing/inverse_text_normalization/mr/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/mr/taggers/cardinal.py @@ -91,7 +91,11 @@ def __init__(self): graph_arabs + delete_space + graph_crores + delete_space + graph_lakhs + delete_space + graph_thousands ) - graph = pynini.union(graph_higher_powers + delete_space + graph_hundreds, graph_hundred_unique, graph_zero,) + graph = pynini.union( + graph_higher_powers + delete_space + graph_hundreds, + graph_hundred_unique, + graph_zero, + ) graph = graph @ pynini.union( pynutil.delete(pynini.closure("०")) + pynini.difference(NEMO_DIGIT, "०") + pynini.closure(NEMO_DIGIT), "०" diff --git a/nemo_text_processing/inverse_text_normalization/mr/taggers/date.py b/nemo_text_processing/inverse_text_normalization/mr/taggers/date.py index 96e8fb08d..15a75affc 100644 --- a/nemo_text_processing/inverse_text_normalization/mr/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/mr/taggers/date.py @@ -46,7 +46,11 @@ def __init__(self, cardinal: GraphFst): + pynutil.add_weight(year_graph, -YEAR_WEIGHT) + pynutil.insert("\"") ) - optional_graph_year = pynini.closure(graph_year, 0, 1,) + optional_graph_year = pynini.closure( + graph_year, + 0, + 1, + ) 
graph_ad_bc = pynutil.insert("text: \"") + prefixes + delete_space + pynutil.insert("\"") graph_mdy = month_graph + ( diff --git a/nemo_text_processing/inverse_text_normalization/mr/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/mr/taggers/decimal.py index 8882b860c..92af8c7c3 100644 --- a/nemo_text_processing/inverse_text_normalization/mr/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/mr/taggers/decimal.py @@ -65,7 +65,9 @@ def __init__(self, cardinal: GraphFst): graph_digits = pynini.string_file(get_abs_path("data/numbers/digits.tsv")).invert() decimal_word = pynini.cross("पूर्णांक", "") optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space, 0, 1, + pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space, + 0, + 1, ) graph_integer = ( pynutil.insert("integer_part: \"") diff --git a/nemo_text_processing/inverse_text_normalization/mr/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/mr/verbalizers/word.py index f2bab8fa4..f3ca79d5a 100644 --- a/nemo_text_processing/inverse_text_normalization/mr/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/mr/verbalizers/word.py @@ -34,6 +34,6 @@ def __init__(self): super().__init__(name="word", kind="verbalize") chars = pynini.closure(NEMO_CHAR - " ", 1) char = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + chars + pynutil.delete("\"") - graph = char @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + graph = char @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/cardinal.py index 8eeea3876..59b30ae9e 100644 --- a/nemo_text_processing/inverse_text_normalization/pt/taggers/cardinal.py +++ 
b/nemo_text_processing/inverse_text_normalization/pt/taggers/cardinal.py @@ -171,9 +171,9 @@ def __init__(self, use_strict_e=False): ) @ (pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT)) graph_hundred_component_prefix_e = graph_hundred_component_prefix_e.optimize() - graph_hundred_component_no_prefix = pynini.union(graph_hundreds + graph_e + graph_ties_component,) @ ( - pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT) - ) + graph_hundred_component_no_prefix = pynini.union( + graph_hundreds + graph_e + graph_ties_component, + ) @ (pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT)) graph_hundred_component_no_prefix = graph_hundred_component_no_prefix.optimize() graph_mil_prefix_e = pynini.union( @@ -350,18 +350,13 @@ def __init__(self, use_strict_e=False): self.graph_no_exception = graph # save self.numbers_up_to_thousand for use in DecimalFst - digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT ** 2) | (NEMO_DIGIT ** 3) + digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3) numbers_up_to_thousand = pynini.compose(graph, digits_up_to_thousand).optimize() self.numbers_up_to_thousand = numbers_up_to_thousand # save self.numbers_up_to_million for use in DecimalFst digits_up_to_million = ( - NEMO_DIGIT - | (NEMO_DIGIT ** 2) - | (NEMO_DIGIT ** 3) - | (NEMO_DIGIT ** 4) - | (NEMO_DIGIT ** 5) - | (NEMO_DIGIT ** 6) + NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3) | (NEMO_DIGIT**4) | (NEMO_DIGIT**5) | (NEMO_DIGIT**6) ) numbers_up_to_million = pynini.compose(graph, digits_up_to_million).optimize() self.numbers_up_to_million = numbers_up_to_million diff --git a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/whitelist.py index 75c80c383..6d9859c6e 100644 --- a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/whitelist.py +++ 
b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/whitelist.py @@ -34,5 +34,5 @@ def __init__(self): + pynini.closure(NEMO_CHAR - " ", 1) + pynutil.delete("\"") ) - graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/word.py index 16c38ee05..aba72eaf8 100644 --- a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/word.py @@ -28,6 +28,6 @@ def __init__(self): super().__init__(name="word", kind="verbalize") chars = pynini.closure(NEMO_CHAR - " ", 1) char = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + chars + pynutil.delete("\"") - graph = char @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + graph = char @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/sv/taggers/date.py b/nemo_text_processing/inverse_text_normalization/sv/taggers/date.py index 5bb6c63bc..5d9308958 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/sv/taggers/date.py @@ -31,7 +31,8 @@ class DateFst(GraphFst): """ def __init__( - self, tn_date_tagger: GraphFst, + self, + tn_date_tagger: GraphFst, ): super().__init__(name="date", kind="classify") diff --git a/nemo_text_processing/inverse_text_normalization/sv/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/sv/taggers/decimal.py index e39a9017a..97bd36582 100644 --- a/nemo_text_processing/inverse_text_normalization/sv/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/sv/taggers/decimal.py @@ 
-49,7 +49,15 @@ def __init__(self, itn_cardinal_tagger: GraphFst, tn_decimal_tagger: GraphFst): self.final_graph_wo_sign = final_graph_wo_sign self.final_graph_wo_negative = ( - final_graph_wo_sign | get_quantity(final_graph_wo_sign, None, hundreds_no_one, None, False, True,) + final_graph_wo_sign + | get_quantity( + final_graph_wo_sign, + None, + hundreds_no_one, + None, + False, + True, + ) ).optimize() optional_minus_graph = pynini.closure(pynini.cross("minus ", "negative: \"true\" "), 0, 1) diff --git a/nemo_text_processing/inverse_text_normalization/vi/graph_utils.py b/nemo_text_processing/inverse_text_normalization/vi/graph_utils.py index a04fd73ca..4e58ff475 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/vi/graph_utils.py @@ -33,9 +33,9 @@ NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize() NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize() NEMO_HEX = pynini.union(*string.hexdigits).optimize() -NEMO_NON_BREAKING_SPACE = "\u00A0" +NEMO_NON_BREAKING_SPACE = "\u00a0" NEMO_SPACE = " " -NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00A0").optimize() +NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00a0").optimize() NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() @@ -151,4 +151,4 @@ def delete_tokens(self, fst) -> "pynini.FstLike": + delete_space + pynutil.delete("}") ) - return res @ pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", NEMO_SIGMA) + return res @ pynini.cdrewrite(pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA) diff --git a/nemo_text_processing/inverse_text_normalization/vi/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/vi/taggers/cardinal.py index 016df4f1d..155513937 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/taggers/cardinal.py +++ 
b/nemo_text_processing/inverse_text_normalization/vi/taggers/cardinal.py @@ -134,7 +134,8 @@ def __init__(self): ) graph = graph @ pynini.union( - pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0", + pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), + "0", ) # don't convert cardinals from zero to nine inclusive @@ -145,7 +146,9 @@ def __init__(self): self.graph = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph optional_minus_graph = pynini.closure( - pynutil.insert("negative: ") + pynini.cross(pynini.union("âm", "trừ"), '"-"') + NEMO_SPACE, 0, 1, + pynutil.insert("negative: ") + pynini.cross(pynini.union("âm", "trừ"), '"-"') + NEMO_SPACE, + 0, + 1, ) final_graph = optional_minus_graph + pynutil.insert('integer: "') + self.graph + pynutil.insert('"') diff --git a/nemo_text_processing/inverse_text_normalization/vi/taggers/date.py b/nemo_text_processing/inverse_text_normalization/vi/taggers/date.py index b0cd8561a..21576efd5 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/vi/taggers/date.py @@ -59,7 +59,10 @@ def _get_year_graph(): def _get_digits_graph(): zero = pynini.cross((pynini.union("linh", "lẻ")), "0") four = pynini.cross("tư", "4") - graph = pynini.union(zero + delete_space + (graph_digit | four), graph_zero + delete_space + graph_digit,) + graph = pynini.union( + zero + delete_space + (graph_digit | four), + graph_zero + delete_space + graph_digit, + ) graph.optimize() return graph diff --git a/nemo_text_processing/inverse_text_normalization/vi/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/vi/taggers/decimal.py index 033f3d86e..60c550228 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/vi/taggers/decimal.py @@ -123,10 +123,12 @@ def 
__init__(self, cardinal: GraphFst): final_graph = optional_graph_negative + final_graph_wo_sign self.final_graph_wo_negative = final_graph_wo_sign | get_quantity( - final_graph_wo_sign, cardinal.graph_hundred_component_at_least_one_none_zero_digit, + final_graph_wo_sign, + cardinal.graph_hundred_component_at_least_one_none_zero_digit, ) final_graph |= optional_graph_negative + get_quantity( - final_graph_wo_sign, cardinal.graph_hundred_component_at_least_one_none_zero_digit, + final_graph_wo_sign, + cardinal.graph_hundred_component_at_least_one_none_zero_digit, ) final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/time.py index 30d262722..2ad4d5bbf 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/time.py +++ b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/time.py @@ -70,7 +70,9 @@ def __init__(self): ) optional_zone = pynini.closure(zone, 0, 1) optional_second = pynini.closure( - delete_space + pynutil.insert(":") + (second @ add_leading_zero_to_double_digit), 0, 1, + delete_space + pynutil.insert(":") + (second @ add_leading_zero_to_double_digit), + 0, + 1, ) graph_h = hour + pynutil.insert("h") diff --git a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/whitelist.py index a0f630c4e..77c225c38 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/whitelist.py @@ -40,5 +40,5 @@ def __init__(self): + pynini.closure(NEMO_CHAR - " ", 1) + pynutil.delete('"') ) - graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) self.fst = graph.optimize() diff 
--git a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/word.py index afd0c6a0c..ad9007ff3 100644 --- a/nemo_text_processing/inverse_text_normalization/vi/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/vi/verbalizers/word.py @@ -34,6 +34,6 @@ def __init__(self): super().__init__(name="word", kind="verbalize") chars = pynini.closure(NEMO_CHAR - " ", 1) char = pynutil.delete("name:") + delete_space + pynutil.delete('"') + chars + pynutil.delete('"') - graph = char @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + graph = char @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py index de1a7a28c..8d8217644 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/zh/graph_utils.py @@ -27,9 +27,9 @@ NEMO_CHAR = utf8.VALID_UTF8_CHAR NEMO_DIGIT = byte.DIGIT NEMO_HEX = pynini.union(*string.hexdigits).optimize() -NEMO_NON_BREAKING_SPACE = "\u00A0" +NEMO_NON_BREAKING_SPACE = "\u00a0" NEMO_SPACE = " " -NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00A0").optimize() +NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00a0").optimize() NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize() @@ -86,7 +86,10 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED): written_capitalized = written[0].upper() + written[1:] additional_labels.extend( [ - [written_capitalized, spoken.capitalize(),], # first letter capitalized + [ + written_capitalized, + spoken.capitalize(), + ], # first letter 
capitalized [ written_capitalized, spoken.upper().replace(" AND ", " and "), @@ -100,7 +103,10 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED): print(f"This is weight {weight}") if len(weight) == 0: additional_labels.extend( - [[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()],] + [ + [written, spoken_no_space], + [written_capitalized, spoken_no_space.upper()], + ] ) else: additional_labels.extend( @@ -181,4 +187,4 @@ def delete_tokens(self, fst) -> "pynini.FstLike": + delete_space + pynutil.delete("}") ) - return res @ pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", NEMO_SIGMA) + return res @ pynini.cdrewrite(pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py index f3b30238c..0715a3988 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py @@ -110,7 +110,12 @@ def __init__(self): + graph_hundreds_complex ) | (graph_hundreds_complex + delete_ten_thousands + pynini.cross(pynini.closure("零"), "00") + graph_all) - | (graph_hundreds_complex + delete_ten_thousands + pynini.cross(pynini.closure("零"), "000") + graph_digits) + | ( + graph_hundreds_complex + + delete_ten_thousands + + pynini.cross(pynini.closure("零"), "000") + + graph_digits + ) ) graph_millions = ( pynutil.add_weight(graph_millions_simple, -1.0) | graph_millions_complex | pynutil.insert("0000000") diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py index 331f0b7ff..108c222fd 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/date.py @@ -61,7 +61,9 @@ def __init__(self): # graph_date = graph_year | 
graph_month | graph_day # grammar for optional prefix ad or bc - graph_bc_prefix = pynini.closure("紀元前", 0, 1) | pynini.closure("公元前", 0, 1) | pynini.closure("纪元前", 0, 1) + graph_bc_prefix = ( + pynini.closure("紀元前", 0, 1) | pynini.closure("公元前", 0, 1) | pynini.closure("纪元前", 0, 1) + ) graph_bc = pynutil.delete(graph_bc_prefix) graph_ad_prefix = ( diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py index e660b6015..477a82f5d 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/money.py @@ -57,7 +57,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): # yuan major plus minor major_symbol = pynini.accep("块") | pynini.cross("塊", "块") - tencent = pynini.accep("毛") | pynini.accep("角",) + tencent = pynini.accep("毛") | pynini.accep( + "角", + ) cent = pynini.accep("分") graph_kuai = ( graph_integer_component diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py index 2877d4160..3364ed4b2 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py @@ -49,7 +49,11 @@ class ClassifyFst(GraphFst): """ def __init__( - self, input_case: str, cache_dir: str = None, whitelist: str = None, overwrite_cache: bool = False, + self, + input_case: str, + cache_dir: str = None, + whitelist: str = None, + overwrite_cache: bool = False, ): super().__init__(name="tokenize_and_classify", kind="classify") diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py index 31d5880dc..f33987173 100644 --- 
a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/cardinal.py @@ -28,7 +28,7 @@ def __init__(self): super().__init__(name="cardinal", kind="verbalize") # group numbers by three - exactly_three_digits = NEMO_DIGIT ** 3 + exactly_three_digits = NEMO_DIGIT**3 at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) suffix = pynini.union( diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py index 28e2d5ff1..b36e44dfa 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py @@ -28,7 +28,7 @@ def __init__(self): super().__init__(name="decimal", kind="verbalize") # group numbers by three - exactly_three_digits = NEMO_DIGIT ** 3 + exactly_three_digits = NEMO_DIGIT**3 at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) # insert a "," for every three numbers before decimal point diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/whitelist.py index df722ac25..571070a2e 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/whitelist.py @@ -39,5 +39,5 @@ def __init__(self): + pynini.closure(NEMO_CHAR - " ", 1) + pynutil.delete('"') ) - graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/word.py index 545de8af1..c3e76fef7 100644 --- 
a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/word.py @@ -33,6 +33,6 @@ def __init__(self): super().__init__(name="word", kind="verbalize") chars = pynini.closure(NEMO_CHAR - " ", 1) char = pynutil.delete("name:") + delete_space + pynutil.delete('"') + chars + pynutil.delete('"') - graph = char @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + graph = char @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/ar/graph_utils.py b/nemo_text_processing/text_normalization/ar/graph_utils.py index 164bd6f78..54685a093 100644 --- a/nemo_text_processing/text_normalization/ar/graph_utils.py +++ b/nemo_text_processing/text_normalization/ar/graph_utils.py @@ -36,9 +36,9 @@ NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize() NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize() NEMO_HEX = pynini.union(*string.hexdigits).optimize() - NEMO_NON_BREAKING_SPACE = u"\u00A0" + NEMO_NON_BREAKING_SPACE = u"\u00a0" NEMO_SPACE = " " - NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize() + NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00a0").optimize() NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() @@ -163,7 +163,7 @@ NEMO_ALPHA = None NEMO_ALNUM = None NEMO_HEX = None - NEMO_NON_BREAKING_SPACE = u"\u00A0" + NEMO_NON_BREAKING_SPACE = u"\u00a0" NEMO_SPACE = " " NEMO_WHITE_SPACE = None NEMO_NOT_SPACE = None @@ -317,4 +317,4 @@ def delete_tokens(self, fst) -> 'pynini.FstLike': + delete_space + pynutil.delete("}") ) - return res @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + return res @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) diff --git 
a/nemo_text_processing/text_normalization/ar/taggers/measure.py b/nemo_text_processing/text_normalization/ar/taggers/measure.py index 707b40998..ce22f3d76 100644 --- a/nemo_text_processing/text_normalization/ar/taggers/measure.py +++ b/nemo_text_processing/text_normalization/ar/taggers/measure.py @@ -55,7 +55,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, de ) optional_unit_denominator = pynini.closure( - pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, 0, 1, + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, + 0, + 1, ) unit_plural = ( @@ -76,15 +78,14 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, de ) subgraph_cardinal = ( - (optional_graph_negative + (pynini.closure(NEMO_DIGIT) - "1")) @ cardinal.fst - + insert_space - + pynini.closure(pynutil.delete(" "), 0, 1) - + unit_plural - | unit_plural - + pynini.closure(pynutil.delete(" "), 0, 1) - + insert_space - + (optional_graph_negative + (pynini.closure(NEMO_DIGIT) - "1")) @ cardinal.fst - ) + optional_graph_negative + (pynini.closure(NEMO_DIGIT) - "1") + ) @ cardinal.fst + insert_space + pynini.closure( + pynutil.delete(" "), 0, 1 + ) + unit_plural | unit_plural + pynini.closure( + pynutil.delete(" "), 0, 1 + ) + insert_space + ( + optional_graph_negative + (pynini.closure(NEMO_DIGIT) - "1") + ) @ cardinal.fst subgraph_cardinal |= ( (optional_graph_negative + pynini.accep("1")) @ cardinal.fst diff --git a/nemo_text_processing/text_normalization/ar/taggers/money.py b/nemo_text_processing/text_normalization/ar/taggers/money.py index 5098989c6..925fa348e 100644 --- a/nemo_text_processing/text_normalization/ar/taggers/money.py +++ b/nemo_text_processing/text_normalization/ar/taggers/money.py @@ -142,7 +142,10 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): ) graph_with_no_minor_curr = integer_plus_maj - graph_with_no_minor_curr |= pynutil.add_weight(integer_plus_maj, weight=0.0001,) + 
graph_with_no_minor_curr |= pynutil.add_weight( + integer_plus_maj, + weight=0.0001, + ) graph_with_no_minor_curr = pynutil.delete(curr_symbol) + graph_with_no_minor_curr + preserve_order diff --git a/nemo_text_processing/text_normalization/ar/verbalizers/word.py b/nemo_text_processing/text_normalization/ar/verbalizers/word.py index eb0e2d2c7..2987d4ad1 100644 --- a/nemo_text_processing/text_normalization/ar/verbalizers/word.py +++ b/nemo_text_processing/text_normalization/ar/verbalizers/word.py @@ -32,6 +32,6 @@ def __init__(self, deterministic: bool = True): super().__init__(name="word", kind="verbalize", deterministic=deterministic) chars = pynini.closure(NEMO_CHAR - " ", 1) char = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + chars + pynutil.delete("\"") - graph = char @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + graph = char @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/de/taggers/cardinal.py b/nemo_text_processing/text_normalization/de/taggers/cardinal.py index a8ef5af17..902a62b3f 100644 --- a/nemo_text_processing/text_normalization/de/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/de/taggers/cardinal.py @@ -166,7 +166,7 @@ def thousand(): self.graph = ( ((NEMO_DIGIT - "0" + pynini.closure(NEMO_DIGIT, 0)) - "0" - "1") @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", NEMO_SIGMA) - @ NEMO_DIGIT ** 24 + @ NEMO_DIGIT**24 @ graph @ pynini.cdrewrite(delete_space, "[BOS]", "", NEMO_SIGMA) @ pynini.cdrewrite(delete_space, "", "[EOS]", NEMO_SIGMA) @@ -181,7 +181,7 @@ def thousand(): self.graph_hundred_component_at_least_one_none_zero_digit = ( ((NEMO_DIGIT - "0" + pynini.closure(NEMO_DIGIT, 0)) - "0" - "1") @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", NEMO_SIGMA) - @ NEMO_DIGIT ** 3 + @ NEMO_DIGIT**3 @ hundred_non_zero() ) | pynini.cross("1", "eins") diff 
--git a/nemo_text_processing/text_normalization/de/taggers/date.py b/nemo_text_processing/text_normalization/de/taggers/date.py index 21b32eb2b..8c13882d2 100644 --- a/nemo_text_processing/text_normalization/de/taggers/date.py +++ b/nemo_text_processing/text_normalization/de/taggers/date.py @@ -42,7 +42,7 @@ def get_year_graph(cardinal: GraphFst) -> 'pynini.FstLike': cardinal: cardinal GraphFst """ - year_gt_2000 = (pynini.union("21", "20") + NEMO_DIGIT ** 2) @ cardinal.graph + year_gt_2000 = (pynini.union("21", "20") + NEMO_DIGIT**2) @ cardinal.graph graph_two_digit = delete_leading_zero @ cardinal.two_digit_non_zero hundred = pynutil.insert("hundert") diff --git a/nemo_text_processing/text_normalization/de/taggers/measure.py b/nemo_text_processing/text_normalization/de/taggers/measure.py index 122ff8a67..a46822a0f 100644 --- a/nemo_text_processing/text_normalization/de/taggers/measure.py +++ b/nemo_text_processing/text_normalization/de/taggers/measure.py @@ -82,7 +82,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, de ) optional_unit_denominator = pynini.closure( - pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, 0, 1, + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, + 0, + 1, ) unit_plural = ( diff --git a/nemo_text_processing/text_normalization/de/taggers/telephone.py b/nemo_text_processing/text_normalization/de/taggers/telephone.py index 90af2f07e..97482a236 100644 --- a/nemo_text_processing/text_normalization/de/taggers/telephone.py +++ b/nemo_text_processing/text_normalization/de/taggers/telephone.py @@ -45,7 +45,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): numbers_with_single_digits = pynini.closure(graph_digit + insert_space) + graph_digit - two_digit_and_zero = (NEMO_DIGIT ** 2 @ cardinal.two_digit_non_zero) | graph_zero + two_digit_and_zero = (NEMO_DIGIT**2 @ cardinal.two_digit_non_zero) | graph_zero # def add_space_after_two_digit(): # return 
pynini.closure(two_digit_and_zero + insert_space) + ( # two_digit_and_zero diff --git a/nemo_text_processing/text_normalization/de/taggers/time.py b/nemo_text_processing/text_normalization/de/taggers/time.py index 371ad16ac..2fe74f5ba 100644 --- a/nemo_text_processing/text_normalization/de/taggers/time.py +++ b/nemo_text_processing/text_normalization/de/taggers/time.py @@ -65,7 +65,9 @@ def __init__(self, deterministic: bool = True): + pynutil.insert('"') ) final_time_zone_optional = pynini.closure( - pynini.accep(" ") + pynutil.insert('zone: "') + convert_space(time_zone_graph) + pynutil.insert('"'), 0, 1, + pynini.accep(" ") + pynutil.insert('zone: "') + convert_space(time_zone_graph) + pynutil.insert('"'), + 0, + 1, ) # Accepts the following formats: 02:30 Uhr, 02.30 Uhr, 2:30 Uhr, 2.30 Uhr diff --git a/nemo_text_processing/text_normalization/de/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/de/taggers/tokenize_and_classify.py index e6590536f..646d7a6b7 100644 --- a/nemo_text_processing/text_normalization/de/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/de/taggers/tokenize_and_classify.py @@ -70,7 +70,8 @@ def __init__( os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" far_file = os.path.join( - cache_dir, f"_{input_case}_de_tn_{deterministic}_deterministic{whitelist_file}.far", + cache_dir, + f"_{input_case}_de_tn_{deterministic}_deterministic{whitelist_file}.far", ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] @@ -92,7 +93,10 @@ def __init__( self.fraction = FractionFst(cardinal=self.cardinal, deterministic=deterministic) fraction_graph = self.fraction.fst self.measure = MeasureFst( - cardinal=self.cardinal, decimal=self.decimal, fraction=self.fraction, deterministic=deterministic, + cardinal=self.cardinal, + decimal=self.decimal, + fraction=self.fraction, + 
deterministic=deterministic, ) measure_graph = self.measure.fst self.date = DateFst(cardinal=self.cardinal, deterministic=deterministic) @@ -104,7 +108,11 @@ def __init__( telephone_graph = self.telephone.fst self.electronic = ElectronicFst(deterministic=deterministic) electronic_graph = self.electronic.fst - self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic,) + self.money = MoneyFst( + cardinal=self.cardinal, + decimal=self.decimal, + deterministic=deterministic, + ) money_graph = self.money.fst self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist) whitelist_graph = self.whitelist.fst diff --git a/nemo_text_processing/text_normalization/de/verbalizers/ordinal.py b/nemo_text_processing/text_normalization/de/verbalizers/ordinal.py index f8d5f6967..d4ea8eb09 100644 --- a/nemo_text_processing/text_normalization/de/verbalizers/ordinal.py +++ b/nemo_text_processing/text_normalization/de/verbalizers/ordinal.py @@ -43,7 +43,10 @@ def __init__(self, deterministic: bool = True): self.ordinal_stem = graph_digit | graph_ties | graph_thousands suffix = pynini.cdrewrite( - pynini.closure(self.ordinal_stem, 0, 1) + convert_rest, "", "[EOS]", NEMO_SIGMA, + pynini.closure(self.ordinal_stem, 0, 1) + convert_rest, + "", + "[EOS]", + NEMO_SIGMA, ).optimize() self.graph = pynini.compose(graph, suffix) self.suffix = suffix diff --git a/nemo_text_processing/text_normalization/en/graph_utils.py b/nemo_text_processing/text_normalization/en/graph_utils.py index 161e5d97e..23ebcf6bd 100644 --- a/nemo_text_processing/text_normalization/en/graph_utils.py +++ b/nemo_text_processing/text_normalization/en/graph_utils.py @@ -35,9 +35,9 @@ NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize() NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize() NEMO_HEX = pynini.union(*string.hexdigits).optimize() -NEMO_NON_BREAKING_SPACE = "\u00A0" +NEMO_NON_BREAKING_SPACE = "\u00a0" NEMO_SPACE = " " 
-NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00A0").optimize() +NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00a0").optimize() NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() @@ -103,14 +103,36 @@ suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv")) # _v = pynini.union("a", "e", "i", "o", "u") _c = pynini.union( - "b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z", + "b", + "c", + "d", + "f", + "g", + "h", + "j", + "k", + "l", + "m", + "n", + "p", + "q", + "r", + "s", + "t", + "v", + "w", + "x", + "y", + "z", ) _ies = NEMO_SIGMA + _c + pynini.cross("y", "ies") _es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es") _s = NEMO_SIGMA + pynutil.insert("s") graph_plural = plurals._priority_union( - suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA, + suppletive, + plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), + NEMO_SIGMA, ).optimize() SINGULAR_TO_PLURAL = graph_plural @@ -125,7 +147,9 @@ def capitalized_input_graph( - graph: "pynini.FstLike", original_graph_weight: float = None, capitalized_graph_weight: float = None, + graph: "pynini.FstLike", + original_graph_weight: float = None, + capitalized_graph_weight: float = None, ) -> "pynini.FstLike": """ Allow graph input to be capitalized, e.g. 
for ITN) @@ -209,7 +233,10 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED): written_capitalized = written[0].upper() + written[1:] additional_labels.extend( [ - [written_capitalized, spoken.capitalize(),], # first letter capitalized + [ + written_capitalized, + spoken.capitalize(), + ], # first letter capitalized [ written_capitalized, spoken.upper().replace(" AND ", " and "), @@ -223,7 +250,10 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED): logger.debug(f"This is weight {weight}") if len(weight) == 0: additional_labels.extend( - [[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()],] + [ + [written, spoken_no_space], + [written_capitalized, spoken_no_space.upper()], + ] ) else: additional_labels.extend( @@ -304,4 +334,4 @@ def delete_tokens(self, fst) -> "pynini.FstLike": + delete_space + pynutil.delete("}") ) - return res @ pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", NEMO_SIGMA) + return res @ pynini.cdrewrite(pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA) diff --git a/nemo_text_processing/text_normalization/en/taggers/cardinal.py b/nemo_text_processing/text_normalization/en/taggers/cardinal.py index 6ec0ac9dd..5e2a8535c 100644 --- a/nemo_text_processing/text_normalization/en/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/en/taggers/cardinal.py @@ -83,7 +83,7 @@ def __init__(self, deterministic: bool = True, lm: bool = False): graph = ( pynini.closure(NEMO_DIGIT, 1, 3) - + (pynini.closure(pynutil.delete(",") + NEMO_DIGIT ** 3) | pynini.closure(NEMO_DIGIT ** 3)) + + (pynini.closure(pynutil.delete(",") + NEMO_DIGIT**3) | pynini.closure(NEMO_DIGIT**3)) ) @ graph self.graph = graph @@ -118,7 +118,7 @@ def __init__(self, deterministic: bool = True, lm: bool = False): ) final_graph |= pynini.compose(final_graph, one_to_a_replacement_graph.optimize() + NEMO_SIGMA).optimize() # remove commas for 4 digits numbers - four_digit_comma_graph = (NEMO_DIGIT - "0") + 
pynutil.delete(",") + NEMO_DIGIT ** 3 + four_digit_comma_graph = (NEMO_DIGIT - "0") + pynutil.delete(",") + NEMO_DIGIT**3 final_graph |= pynini.compose(four_digit_comma_graph.optimize(), final_graph).optimize() self.final_graph = final_graph diff --git a/nemo_text_processing/text_normalization/en/taggers/date.py b/nemo_text_processing/text_normalization/en/taggers/date.py index 869716ef9..52225f0ba 100644 --- a/nemo_text_processing/text_normalization/en/taggers/date.py +++ b/nemo_text_processing/text_normalization/en/taggers/date.py @@ -126,11 +126,11 @@ def _get_year_graph(cardinal_graph, deterministic: bool = True): 123 A.D., 4200 B.C """ graph = get_four_digit_year_graph(deterministic) - graph = (pynini.union("1", "2") + (NEMO_DIGIT ** 3) + pynini.closure(pynini.cross(" s", "s") | "s", 0, 1)) @ graph + graph = (pynini.union("1", "2") + (NEMO_DIGIT**3) + pynini.closure(pynini.cross(" s", "s") | "s", 0, 1)) @ graph graph |= _get_two_digit_year_with_s_graph() - three_digit_year = (NEMO_DIGIT @ cardinal_graph) + insert_space + (NEMO_DIGIT ** 2) @ cardinal_graph + three_digit_year = (NEMO_DIGIT @ cardinal_graph) + insert_space + (NEMO_DIGIT**2) @ cardinal_graph year_with_suffix = ( (get_four_digit_year_graph(deterministic=True) | three_digit_year) + delete_space + insert_space + year_suffix ) @@ -270,7 +270,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool, lm: bool = False): ) graph_dmy = day_graph + delete_extra_space + month_graph + optional_graph_year - day_ex_month = (NEMO_DIGIT ** 2 - pynini.project(month_numbers_graph, "input")) @ day_graph + day_ex_month = (NEMO_DIGIT**2 - pynini.project(month_numbers_graph, "input")) @ day_graph for x in ["-", "/", "."]: delete_sep = pynutil.delete(x) graph_dmy |= ( diff --git a/nemo_text_processing/text_normalization/en/taggers/electronic.py b/nemo_text_processing/text_normalization/en/taggers/electronic.py index 3262c7485..874d2e437 100644 --- a/nemo_text_processing/text_normalization/en/taggers/electronic.py 
+++ b/nemo_text_processing/text_normalization/en/taggers/electronic.py @@ -49,9 +49,15 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): else: numbers = pynutil.insert(" ") + cardinal.long_numbers + pynutil.insert(" ") - cc_cues = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/cc_cues.tsv")), MIN_NEG_WEIGHT,) + cc_cues = pynutil.add_weight( + pynini.string_file(get_abs_path("data/electronic/cc_cues.tsv")), + MIN_NEG_WEIGHT, + ) - cc_cues = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/cc_cues.tsv")), MIN_NEG_WEIGHT,) + cc_cues = pynutil.add_weight( + pynini.string_file(get_abs_path("data/electronic/cc_cues.tsv")), + MIN_NEG_WEIGHT, + ) accepted_symbols = pynini.project(pynini.string_file(get_abs_path("data/electronic/symbol.tsv")), "input") @@ -59,10 +65,14 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): pynini.string_file(get_abs_path("data/electronic/domain.tsv")), "input" ) - dict_words = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/words.tsv")), MIN_NEG_WEIGHT,) + dict_words = pynutil.add_weight( + pynini.string_file(get_abs_path("data/electronic/words.tsv")), + MIN_NEG_WEIGHT, + ) dict_words_without_delimiter = dict_words + pynini.closure( - pynutil.add_weight(pynutil.insert(" ") + dict_words, MIN_NEG_WEIGHT), 1, + pynutil.add_weight(pynutil.insert(" ") + dict_words, MIN_NEG_WEIGHT), + 1, ) dict_words_graph = dict_words_without_delimiter | dict_words diff --git a/nemo_text_processing/text_normalization/en/taggers/measure.py b/nemo_text_processing/text_normalization/en/taggers/measure.py index fc61620ce..e8d92e1da 100644 --- a/nemo_text_processing/text_normalization/en/taggers/measure.py +++ b/nemo_text_processing/text_normalization/en/taggers/measure.py @@ -53,7 +53,11 @@ class MeasureFst(GraphFst): """ def __init__( - self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, deterministic: bool = True, + self, + cardinal: GraphFst, + decimal: 
GraphFst, + fraction: GraphFst, + deterministic: bool = True, ): super().__init__(name="measure", kind="classify", deterministic=deterministic) cardinal_graph = cardinal.graph_with_and | self.get_range(cardinal.graph_with_and) @@ -63,7 +67,8 @@ def __init__( graph_unit |= pynini.string_file(get_abs_path("data/measure/unit_alternatives.tsv")) graph_unit |= pynini.compose( - pynini.closure(TO_LOWER, 1) + (NEMO_ALPHA | TO_LOWER) + pynini.closure(NEMO_ALPHA | TO_LOWER), graph_unit, + pynini.closure(TO_LOWER, 1) + (NEMO_ALPHA | TO_LOWER) + pynini.closure(NEMO_ALPHA | TO_LOWER), + graph_unit, ).optimize() graph_unit_plural = convert_space(graph_unit @ SINGULAR_TO_PLURAL) @@ -76,7 +81,9 @@ def __init__( ) optional_graph_unit2 = pynini.closure( - delete_zero_or_one_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit2, 0, 1, + delete_zero_or_one_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit2, + 0, + 1, ) unit_plural = ( @@ -250,11 +257,12 @@ def get_address_graph(self, cardinal): ordinal_verbalizer = OrdinalVerbalizer().graph ordinal_tagger = OrdinalTagger(cardinal=cardinal).graph ordinal_num = pynini.compose( - pynutil.insert('integer: "') + ordinal_tagger + pynutil.insert('"'), ordinal_verbalizer, + pynutil.insert('integer: "') + ordinal_tagger + pynutil.insert('"'), + ordinal_verbalizer, ) address_num = NEMO_DIGIT ** (1, 2) @ cardinal.graph_hundred_component_at_least_one_none_zero_digit - address_num += insert_space + NEMO_DIGIT ** 2 @ ( + address_num += insert_space + NEMO_DIGIT**2 @ ( pynini.closure(pynini.cross("0", "zero "), 0, 1) + cardinal.graph_hundred_component_at_least_one_none_zero_digit ) @@ -292,8 +300,12 @@ def get_address_graph(self, cardinal): state = pynini.invert(state_graph) state = pynini.closure(pynini.accep(",") + pynini.accep(NEMO_SPACE) + state, 0, 1) - zip_code = pynini.compose(NEMO_DIGIT ** 5, cardinal.single_digits_graph) - zip_code = pynini.closure(pynini.closure(pynini.accep(","), 0, 1) + pynini.accep(NEMO_SPACE) + 
zip_code, 0, 1,) + zip_code = pynini.compose(NEMO_DIGIT**5, cardinal.single_digits_graph) + zip_code = pynini.closure( + pynini.closure(pynini.accep(","), 0, 1) + pynini.accep(NEMO_SPACE) + zip_code, + 0, + 1, + ) address = address_num + direction + address_words + pynini.closure(city + state + zip_code, 0, 1) diff --git a/nemo_text_processing/text_normalization/en/taggers/money.py b/nemo_text_processing/text_normalization/en/taggers/money.py index ef38c56b5..0687b0c1a 100644 --- a/nemo_text_processing/text_normalization/en/taggers/money.py +++ b/nemo_text_processing/text_normalization/en/taggers/money.py @@ -112,7 +112,8 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = integer_plus_maj |= graph_integer_one + insert_space + pynutil.insert(curr_symbol) @ graph_maj_singular integer_plus_maj_with_comma = pynini.compose( - NEMO_DIGIT - "0" + pynini.closure(NEMO_DIGIT | pynutil.delete(",")), integer_plus_maj, + NEMO_DIGIT - "0" + pynini.closure(NEMO_DIGIT | pynutil.delete(",")), + integer_plus_maj, ) integer_plus_maj = pynini.compose(pynini.closure(NEMO_DIGIT) - "0", integer_plus_maj) integer_plus_maj |= integer_plus_maj_with_comma @@ -189,7 +190,8 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = final_graph |= integer_graph_reordered | decimal_default_reordered # to handle "$2.00" cases final_graph |= pynini.compose( - NEMO_SIGMA + pynutil.delete(".") + pynini.closure(pynutil.delete("0"), 1), integer_graph_reordered, + NEMO_SIGMA + pynutil.delete(".") + pynini.closure(pynutil.delete("0"), 1), + integer_graph_reordered, ) final_graph += graph_per_units.ques diff --git a/nemo_text_processing/text_normalization/en/taggers/range.py b/nemo_text_processing/text_normalization/en/taggers/range.py index 9d57a9fb9..c989e99f5 100644 --- a/nemo_text_processing/text_normalization/en/taggers/range.py +++ b/nemo_text_processing/text_normalization/en/taggers/range.py @@ -33,7 +33,12 @@ class RangeFst(GraphFst): """ 
def __init__( - self, time: GraphFst, date: GraphFst, cardinal: GraphFst, deterministic: bool = True, lm: bool = False, + self, + time: GraphFst, + date: GraphFst, + cardinal: GraphFst, + deterministic: bool = True, + lm: bool = False, ): super().__init__(name="range", kind="classify", deterministic=deterministic) @@ -47,14 +52,14 @@ def __init__( cardinal = cardinal.graph_with_and # YEAR - date_year_four_digit = (NEMO_DIGIT ** 4 + pynini.closure(pynini.accep("s"), 0, 1)) @ date - date_year_two_digit = (NEMO_DIGIT ** 2 + pynini.closure(pynini.accep("s"), 0, 1)) @ date + date_year_four_digit = (NEMO_DIGIT**4 + pynini.closure(pynini.accep("s"), 0, 1)) @ date + date_year_two_digit = (NEMO_DIGIT**2 + pynini.closure(pynini.accep("s"), 0, 1)) @ date year_to_year_graph = ( date_year_four_digit + delete_space + pynini.cross("-", " to ") + delete_space - + (date_year_four_digit | date_year_two_digit | (NEMO_DIGIT ** 2 @ cardinal)) + + (date_year_four_digit | date_year_two_digit | (NEMO_DIGIT**2 @ cardinal)) ) mid_year_graph = pynini.accep("mid") + pynini.cross("-", " ") + (date_year_four_digit | date_year_two_digit) diff --git a/nemo_text_processing/text_normalization/en/taggers/serial.py b/nemo_text_processing/text_normalization/en/taggers/serial.py index 913c09285..f650c8ff3 100644 --- a/nemo_text_processing/text_normalization/en/taggers/serial.py +++ b/nemo_text_processing/text_normalization/en/taggers/serial.py @@ -71,7 +71,7 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = num_graph |= pynini.compose(num_graph, NEMO_SIGMA + pynutil.delete("hundred ") + NEMO_SIGMA) # also allow double digits to be pronounced as integer in serial number num_graph |= pynutil.add_weight( - NEMO_DIGIT ** 2 @ cardinal.graph_hundred_component_at_least_one_none_zero_digit, weight=0.0001 + NEMO_DIGIT**2 @ cardinal.graph_hundred_component_at_least_one_none_zero_digit, weight=0.0001 ) # add space between letter and digit/symbol diff --git 
a/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify.py index 28614fad1..7a253cccc 100644 --- a/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify.py @@ -78,7 +78,8 @@ def __init__( os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" far_file = os.path.join( - cache_dir, f"en_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far", + cache_dir, + f"en_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far", ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] @@ -107,7 +108,12 @@ def __init__( logger.debug(f"fraction: {time.time() - start_time: .2f}s -- {fraction_graph.num_states()} nodes") start_time = time.time() - measure = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, deterministic=deterministic,) + measure = MeasureFst( + cardinal=cardinal, + decimal=decimal, + fraction=fraction, + deterministic=deterministic, + ) measure_graph = measure.fst logger.debug(f"measure: {time.time() - start_time: .2f}s -- {measure_graph.num_states()} nodes") @@ -157,7 +163,10 @@ def __init__( time_final = pynini.compose(time_graph, v_time_graph) date_final = pynini.compose(date_graph, v_date_graph) range_graph = RangeFst( - time=time_final, date=date_final, cardinal=cardinal, deterministic=deterministic, + time=time_final, + date=date_final, + cardinal=cardinal, + deterministic=deterministic, ).fst logger.debug(f"range: {time.time() - start_time: .2f}s -- {range_graph.num_states()} nodes") diff --git a/nemo_text_processing/text_normalization/en/verbalizers/ordinal.py b/nemo_text_processing/text_normalization/en/verbalizers/ordinal.py index 4ad7d1c85..dff205f8e 100644 --- 
a/nemo_text_processing/text_normalization/en/verbalizers/ordinal.py +++ b/nemo_text_processing/text_normalization/en/verbalizers/ordinal.py @@ -46,7 +46,10 @@ def __init__(self, deterministic: bool = True): convert_rest = pynutil.insert("th") suffix = pynini.cdrewrite( - graph_digit | graph_teens | pynini.cross("ty", "tieth") | convert_rest, "", "[EOS]", NEMO_SIGMA, + graph_digit | graph_teens | pynini.cross("ty", "tieth") | convert_rest, + "", + "[EOS]", + NEMO_SIGMA, ).optimize() self.graph = pynini.compose(graph, suffix) self.suffix = suffix diff --git a/nemo_text_processing/text_normalization/en/verbalizers/whitelist.py b/nemo_text_processing/text_normalization/en/verbalizers/whitelist.py index 926de207b..559605e63 100644 --- a/nemo_text_processing/text_normalization/en/verbalizers/whitelist.py +++ b/nemo_text_processing/text_normalization/en/verbalizers/whitelist.py @@ -36,5 +36,5 @@ def __init__(self, deterministic: bool = True): + pynini.closure(NEMO_CHAR - " ", 1) + pynutil.delete("\"") ) - graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/en/verbalizers/word.py b/nemo_text_processing/text_normalization/en/verbalizers/word.py index 5e5dddd21..b5978030e 100644 --- a/nemo_text_processing/text_normalization/en/verbalizers/word.py +++ b/nemo_text_processing/text_normalization/en/verbalizers/word.py @@ -31,6 +31,6 @@ def __init__(self, deterministic: bool = True): super().__init__(name="word", kind="verbalize", deterministic=deterministic) chars = pynini.closure(NEMO_CHAR - " ", 1) char = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + chars + pynutil.delete("\"") - graph = char @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + graph = char @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) self.fst = 
graph.optimize() diff --git a/nemo_text_processing/text_normalization/es/graph_utils.py b/nemo_text_processing/text_normalization/es/graph_utils.py index 101185a90..946f4234e 100644 --- a/nemo_text_processing/text_normalization/es/graph_utils.py +++ b/nemo_text_processing/text_normalization/es/graph_utils.py @@ -107,7 +107,10 @@ def shift_number_gender(fst: "pynini.FstLike") -> "pynini.FstLike": """ fem_allign = pynini.cdrewrite(fem_hundreds, "", "", NEMO_SIGMA) fem_allign @= pynini.cdrewrite( - fem_ones, "", pynini.union(NEMO_SPACE, pynini.accep("[EOS]"), pynini.accep('"')), NEMO_SIGMA, + fem_ones, + "", + pynini.union(NEMO_SPACE, pynini.accep("[EOS]"), pynini.accep('"')), + NEMO_SIGMA, ) # If before a quote or EOS, we know it's the end of a string return fst @ fem_allign diff --git a/nemo_text_processing/text_normalization/es/taggers/cardinal.py b/nemo_text_processing/text_normalization/es/taggers/cardinal.py index 1b8f0a440..85402089f 100644 --- a/nemo_text_processing/text_normalization/es/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/es/taggers/cardinal.py @@ -47,7 +47,7 @@ def filter_punctuation(fst: 'pynini.FstLike') -> 'pynini.FstLike': Returns: fst: A pynini.FstLike object """ - exactly_three_digits = NEMO_DIGIT ** 3 # for blocks of three + exactly_three_digits = NEMO_DIGIT**3 # for blocks of three up_to_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) # for start of string cardinal_string = pynini.closure( @@ -157,7 +157,7 @@ def __init__(self, deterministic: bool = True): self.graph = ( ((NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0)) @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", NEMO_SIGMA) - @ NEMO_DIGIT ** 24 + @ NEMO_DIGIT**24 @ graph @ pynini.cdrewrite(delete_space, "[BOS]", "", NEMO_SIGMA) @ pynini.cdrewrite(delete_space, "", "[EOS]", NEMO_SIGMA) diff --git a/nemo_text_processing/text_normalization/es/taggers/date.py b/nemo_text_processing/text_normalization/es/taggers/date.py index ea7f15292..dd5cd7f0e 
100644 --- a/nemo_text_processing/text_normalization/es/taggers/date.py +++ b/nemo_text_processing/text_normalization/es/taggers/date.py @@ -116,7 +116,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool): dash = "-" day_optional = pynini.closure(pynini.cross(dash, NEMO_SPACE) + day, 0, 1) - graph_ymd = NEMO_DIGIT ** 4 @ year_only + pynini.cross(dash, NEMO_SPACE) + month_number + day_optional + graph_ymd = NEMO_DIGIT**4 @ year_only + pynini.cross(dash, NEMO_SPACE) + month_number + day_optional final_graph = graph_dmy + pynutil.insert(" preserve_order: true") final_graph |= graph_ymd diff --git a/nemo_text_processing/text_normalization/es/taggers/fraction.py b/nemo_text_processing/text_normalization/es/taggers/fraction.py index 1fb5b8118..7bbe86402 100644 --- a/nemo_text_processing/text_normalization/es/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/es/taggers/fraction.py @@ -47,15 +47,50 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = ordinal_graph = ordinal.graph # 2-10 are all ordinals - three_to_ten = pynini.string_map(["2", "3", "4", "5", "6", "7", "8", "9", "10",]) + three_to_ten = pynini.string_map( + [ + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10", + ] + ) block_three_to_ten = pynutil.delete(three_to_ten) # To block cardinal productions if not deterministic: # Multiples of tens are sometimes rendered as ordinals - three_to_ten |= pynini.string_map(["20", "30", "40", "50", "60", "70", "80", "90",]) + three_to_ten |= pynini.string_map( + [ + "20", + "30", + "40", + "50", + "60", + "70", + "80", + "90", + ] + ) graph_three_to_ten = three_to_ten @ ordinal_graph graph_three_to_ten @= pynini.cdrewrite(ordinal_exceptions, "", "", NEMO_SIGMA) # Higher powers of tens (and multiples) are converted to ordinals. 
- hundreds = pynini.string_map(["100", "200", "300", "400", "500", "600", "700", "800", "900",]) + hundreds = pynini.string_map( + [ + "100", + "200", + "300", + "400", + "500", + "600", + "700", + "800", + "900", + ] + ) graph_hundreds = hundreds @ ordinal_graph multiples_of_thousand = ordinal.multiples_of_thousand # So we can have X milésimos @@ -68,7 +103,10 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = graph_higher_powers_of_ten += higher_powers_of_ten graph_higher_powers_of_ten = cardinal_graph @ graph_higher_powers_of_ten graph_higher_powers_of_ten @= pynini.cdrewrite( - pynutil.delete("un "), pynini.accep("[BOS]"), pynini.project(higher_powers_of_ten, "output"), NEMO_SIGMA, + pynutil.delete("un "), + pynini.accep("[BOS]"), + pynini.project(higher_powers_of_ten, "output"), + NEMO_SIGMA, ) # we drop 'un' from these ordinals (millionths, not one-millionths) graph_higher_powers_of_ten = multiples_of_thousand | graph_hundreds | graph_higher_powers_of_ten @@ -83,10 +121,16 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = # Blocking the digits and hundreds from Cardinal graph graph_fractions_cardinals = pynini.cdrewrite( - block_three_to_ten | block_higher_powers_of_ten, pynini.accep("[BOS]"), pynini.accep("[EOS]"), NEMO_SIGMA, + block_three_to_ten | block_higher_powers_of_ten, + pynini.accep("[BOS]"), + pynini.accep("[EOS]"), + NEMO_SIGMA, ) graph_fractions_cardinals @= NEMO_CHAR.plus @ pynini.cdrewrite( - pynutil.delete("0"), pynini.accep("[BOS]"), pynini.accep("[EOS]"), NEMO_SIGMA, + pynutil.delete("0"), + pynini.accep("[BOS]"), + pynini.accep("[EOS]"), + NEMO_SIGMA, ) # Empty characters become '0' for NEMO_CHAR fst, so need to block graph_fractions_cardinals @= cardinal_graph graph_fractions_cardinals += pynutil.insert( diff --git a/nemo_text_processing/text_normalization/es/taggers/measure.py b/nemo_text_processing/text_normalization/es/taggers/measure.py index a1933dbed..a63677c47 100644 
--- a/nemo_text_processing/text_normalization/es/taggers/measure.py +++ b/nemo_text_processing/text_normalization/es/taggers/measure.py @@ -79,7 +79,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, de ) optional_unit_denominator = pynini.closure( - pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, 0, 1, + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, + 0, + 1, ) complex_unit_singular_graph = ( diff --git a/nemo_text_processing/text_normalization/es/taggers/time.py b/nemo_text_processing/text_normalization/es/taggers/time.py index 4a947dd31..de2752657 100644 --- a/nemo_text_processing/text_normalization/es/taggers/time.py +++ b/nemo_text_processing/text_normalization/es/taggers/time.py @@ -115,7 +115,9 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): time_zone_graph = time_zones + pynini.closure(utc_or_gmt_diff, 0, 1) final_time_zone_optional = pynini.closure( - delete_space + insert_space + pynutil.insert("zone: \"") + time_zone_graph + pynutil.insert("\""), 0, 1, + delete_space + insert_space + pynutil.insert("zone: \"") + time_zone_graph + pynutil.insert("\""), + 0, + 1, ) # 02.30 h diff --git a/nemo_text_processing/text_normalization/es/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/es/taggers/tokenize_and_classify.py index 5aa66031a..165f5eeca 100644 --- a/nemo_text_processing/text_normalization/es/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/es/taggers/tokenize_and_classify.py @@ -69,7 +69,8 @@ def __init__( os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" far_file = os.path.join( - cache_dir, f"_{input_case}_es_tn_{deterministic}_deterministic{whitelist_file}.far", + cache_dir, + f"_{input_case}_es_tn_{deterministic}_deterministic{whitelist_file}.far", ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, 
mode="r")["tokenize_and_classify"] @@ -86,10 +87,17 @@ def __init__( self.decimal = DecimalFst(cardinal=self.cardinal, deterministic=deterministic) decimal_graph = self.decimal.fst - self.fraction = FractionFst(cardinal=self.cardinal, ordinal=self.ordinal, deterministic=deterministic,) + self.fraction = FractionFst( + cardinal=self.cardinal, + ordinal=self.ordinal, + deterministic=deterministic, + ) fraction_graph = self.fraction.fst self.measure = MeasureFst( - cardinal=self.cardinal, decimal=self.decimal, fraction=self.fraction, deterministic=deterministic, + cardinal=self.cardinal, + decimal=self.decimal, + fraction=self.fraction, + deterministic=deterministic, ) measure_graph = self.measure.fst self.date = DateFst(cardinal=self.cardinal, deterministic=deterministic) @@ -101,7 +109,11 @@ def __init__( telephone_graph = self.telephone.fst self.electronic = ElectronicFst(deterministic=deterministic) electronic_graph = self.electronic.fst - self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic,) + self.money = MoneyFst( + cardinal=self.cardinal, + decimal=self.decimal, + deterministic=deterministic, + ) money_graph = self.money.fst self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist) whitelist_graph = self.whitelist.fst diff --git a/nemo_text_processing/text_normalization/es/verbalizers/fraction.py b/nemo_text_processing/text_normalization/es/verbalizers/fraction.py index 3758c1bd5..5d7afc1b7 100644 --- a/nemo_text_processing/text_normalization/es/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/es/verbalizers/fraction.py @@ -141,7 +141,8 @@ def __init__(self, deterministic: bool = True): fraction_with_one_fem = numerator_one_fem + delete_space + insert_space fraction_with_one_fem += pynini.union( - denominator_singular_fem @ merge_stem, denominator_singular_fem @ merge_into_single_word, + denominator_singular_fem @ merge_stem, + 
denominator_singular_fem @ merge_into_single_word, ) # Both forms exists fraction_with_one_fem += pynutil.insert(" parte") fraction_with_one_fem @= pynini.cdrewrite( @@ -150,7 +151,8 @@ def __init__(self, deterministic: bool = True): fraction_default_fem = numerator_fem + delete_space + insert_space fraction_default_fem += pynini.union( - denominator_plural_fem @ merge_stem, denominator_plural_fem @ merge_into_single_word, + denominator_plural_fem @ merge_stem, + denominator_plural_fem @ merge_into_single_word, ) fraction_default_fem += pynutil.insert(" partes") diff --git a/nemo_text_processing/text_normalization/fr/taggers/date.py b/nemo_text_processing/text_normalization/fr/taggers/date.py index 91e83c40c..3c96f6913 100644 --- a/nemo_text_processing/text_normalization/fr/taggers/date.py +++ b/nemo_text_processing/text_normalization/fr/taggers/date.py @@ -14,8 +14,8 @@ class DateFst(GraphFst): - ''' Finite state transducer for classyfing dates, e.g.: - '02.03.2003' -> date {day: 'deux' month: 'mai' year: 'deux mille trois' preserve order: true} + '''Finite state transducer for classyfing dates, e.g.: + '02.03.2003' -> date {day: 'deux' month: 'mai' year: 'deux mille trois' preserve order: true} ''' def __init__(self, cardinal: GraphFst, deterministic: bool = True): diff --git a/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py index cacc94bcf..b3e10a4ec 100644 --- a/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py @@ -63,7 +63,8 @@ def __init__( os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" far_file = os.path.join( - cache_dir, f"_{input_case}_fr_tn_{deterministic}_deterministic{whitelist_file}.far", + cache_dir, + f"_{input_case}_fr_tn_{deterministic}_deterministic{whitelist_file}.far", ) if not 
overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] @@ -80,7 +81,11 @@ def __init__( self.decimal = DecimalFst(cardinal=self.cardinal, deterministic=deterministic) decimal_graph = self.decimal.fst - self.fraction = FractionFst(cardinal=self.cardinal, ordinal=self.ordinal, deterministic=deterministic,) + self.fraction = FractionFst( + cardinal=self.cardinal, + ordinal=self.ordinal, + deterministic=deterministic, + ) fraction_graph = self.fraction.fst word_graph = WordFst(deterministic=deterministic).fst self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist) diff --git a/nemo_text_processing/text_normalization/hi/graph_utils.py b/nemo_text_processing/text_normalization/hi/graph_utils.py index ced1b8949..6a5d3c699 100644 --- a/nemo_text_processing/text_normalization/hi/graph_utils.py +++ b/nemo_text_processing/text_normalization/hi/graph_utils.py @@ -34,9 +34,9 @@ NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize() NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize() NEMO_HEX = pynini.union(*string.hexdigits).optimize() -NEMO_NON_BREAKING_SPACE = u"\u00A0" +NEMO_NON_BREAKING_SPACE = u"\u00a0" NEMO_SPACE = " " -NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize() +NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00a0").optimize() NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() TO_LOWER = pynini.union(*[pynini.cross(x, y) for x, y in zip(string.ascii_uppercase, string.ascii_lowercase)]) @@ -179,4 +179,4 @@ def delete_tokens(self, fst) -> 'pynini.FstLike': + delete_space + pynutil.delete("}") ) - return res @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + return res @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) diff --git 
a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py index fe3ad9a1d..f6a8bdd65 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py @@ -21,12 +21,12 @@ class CardinalFst(GraphFst): """ - Finite state transducer for classifying cardinals, e.g. - -२३ -> cardinal { negative: "true" integer: "तेइस" } } - s - Args: - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) + Finite state transducer for classifying cardinals, e.g. + -२३ -> cardinal { negative: "true" integer: "तेइस" } } + s + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) """ def __init__(self, deterministic: bool = True, lm: bool = False): @@ -42,7 +42,7 @@ def create_graph_suffix(digit_graph, suffix, zeros_counts): if zeros_counts == 0: return digit_graph + suffix - return digit_graph + (zero ** zeros_counts) + suffix + return digit_graph + (zero**zeros_counts) + suffix def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph): insert_space = pynutil.insert(" ") @@ -50,7 +50,7 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph): if zeros_counts == 0: return digit_graph + suffix + insert_space + sub_graph - return digit_graph + suffix + (zero ** zeros_counts) + insert_space + sub_graph + return digit_graph + suffix + (zero**zeros_counts) + insert_space + sub_graph # Hundred graph suffix_hundreds = pynutil.insert(" सौ") diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index 19aaf3139..42135add7 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ 
b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -33,7 +33,7 @@ class DateFst(GraphFst): Finite state transducer for classifying date, e.g. "०१-०४-२०२४" -> date { day: "एक" month: "अप्रैल" year: "दो हज़ार चौबीस" } "०४-०१-२०२४" -> date { month: "अप्रैल" day: "एक" year: "दो हज़ार चौबीस" } - + Args: cardinal: cardinal GraphFst diff --git a/nemo_text_processing/text_normalization/hi/taggers/decimal.py b/nemo_text_processing/text_normalization/hi/taggers/decimal.py index d0bef9373..955e8c0d3 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/hi/taggers/decimal.py @@ -22,13 +22,12 @@ def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_hundred: 'pynini.FstLike') -> 'pynini.FstLike': - """ Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral, e.g. १ लाख -> integer_part: "एक" quantity: "लाख" e.g. १.५ लाख -> integer_part: "एक" fractional_part: "पाँच" quantity: "लाख" - Args: + Args: decimal: decimal FST cardinal_up_to_hundred: cardinal FST """ @@ -49,7 +48,7 @@ def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_hundred: 'pynini.FstL class DecimalFst(GraphFst): """ - Finite state transducer for classifying decimal, e.g. + Finite state transducer for classifying decimal, e.g. 
-१२.५००६ अरब -> decimal { negative: "true" integer_part: "बारह" fractional_part: "पाँच शून्य शून्य छह" quantity: "अरब" } १ अरब -> decimal { integer_part: "एक" quantity: "अरब" } @@ -69,7 +68,9 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): point = pynutil.delete(".") optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, 0, 1, + pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, + 0, + 1, ) self.graph_fractional = pynutil.insert("fractional_part: \"") + self.graph + pynutil.insert("\"") diff --git a/nemo_text_processing/text_normalization/hi/taggers/fraction.py b/nemo_text_processing/text_normalization/hi/taggers/fraction.py index a29a72666..8971cd3dd 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/hi/taggers/fraction.py @@ -25,7 +25,7 @@ class FractionFst(GraphFst): fraction { integer: "तेईस" numerator: "चार" denominator: "छः"} ४/६" -> fraction { numerator: "चार" denominator: "छः"} - + Args: cardinal: cardinal GraphFst diff --git a/nemo_text_processing/text_normalization/hi/taggers/measure.py b/nemo_text_processing/text_normalization/hi/taggers/measure.py index 7434fd70f..55279f4da 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/measure.py +++ b/nemo_text_processing/text_normalization/hi/taggers/measure.py @@ -21,7 +21,7 @@ class MeasureFst(GraphFst): """ - Finite state transducer for classifying measure, suppletive aware, e.g. + Finite state transducer for classifying measure, suppletive aware, e.g. 
-१२kg -> measure { negative: "true" cardinal { integer: "बारह" } units: "किलोग्राम" } -१२.२kg -> measure { decimal { negative: "true" integer_part: "बारह" fractional_part: "दो"} units: "किलोग्राम" } @@ -40,7 +40,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv")) optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, 0, 1, + pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, + 0, + 1, ) # Define the unit handling diff --git a/nemo_text_processing/text_normalization/hi/taggers/money.py b/nemo_text_processing/text_normalization/hi/taggers/money.py index c44d6d346..7446b77e5 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/money.py +++ b/nemo_text_processing/text_normalization/hi/taggers/money.py @@ -26,7 +26,7 @@ class MoneyFst(GraphFst): Finite state transducer for classifying money, suppletive aware, e.g. 
₹1 -> money { currency: "रुपए" integer_part: "एक" } ₹1.2 -> money { currency: "रुपए" integer_part: "एक" fractional_part: "दो" } - + Args: cardinal: CardinalFst decimal: DecimalFst @@ -40,7 +40,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): cardinal_graph = cardinal.final_graph optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, 0, 1, + pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, + 0, + 1, ) self.currency = pynutil.insert("currency: \"") + currency_graph + pynutil.insert("\" ") self.interger = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\" ") diff --git a/nemo_text_processing/text_normalization/hi/taggers/time.py b/nemo_text_processing/text_normalization/hi/taggers/time.py index 622d4d5cb..6c87c9aad 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/time.py +++ b/nemo_text_processing/text_normalization/hi/taggers/time.py @@ -29,7 +29,7 @@ class TimeFst(GraphFst): १२:३०:३० -> time { hours: "बारह" minutes: "तीस" seconds: "तीस" } १:४० -> time { hours: "एक" minutes: "चालीस" } १:०० -> time { hours: "एक" } - + Args: time: GraphFst deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py index 48ee97ef3..cc22a99f5 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py @@ -43,7 +43,7 @@ class ClassifyFst(GraphFst): Final class that composes all other classification grammars. This class can process an entire sentence including punctuation. For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. More details to deployment at NeMo/tools/text_processing_deployment. 
- + Args: input_case: accepting either "lower_cased" or "cased" input. deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/hi/utils.py b/nemo_text_processing/text_normalization/hi/utils.py index 102212183..5d314506e 100644 --- a/nemo_text_processing/text_normalization/hi/utils.py +++ b/nemo_text_processing/text_normalization/hi/utils.py @@ -23,7 +23,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path @@ -46,7 +46,7 @@ def load_labels(abs_path): def augment_labels_with_punct_at_end(labels): """ - augments labels: if key ends on a punctuation that value does not have, add a new label + augments labels: if key ends on a punctuation that value does not have, add a new label where the value maintains the punctuation Args: @@ -63,10 +63,10 @@ def augment_labels_with_punct_at_end(labels): def apply_fst(text, fst): - """ Given a string input, returns the output string - produced by traversing the path with lowest weight. - If no valid path accepts input string, returns an - error. + """Given a string input, returns the output string + produced by traversing the path with lowest weight. + If no valid path accepts input string, returns an + error. """ try: print(pynini.shortestpath(text @ fst).string()) diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/date.py b/nemo_text_processing/text_normalization/hi/verbalizers/date.py index 1265fcec6..f0af1a2d4 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/date.py @@ -23,7 +23,7 @@ class DateFst(GraphFst): Finite state transducer for verbalizing date, e.g. 
date { day: "एक" month: "अप्रैल" year: "दो हज़ार चौबीस" } -> "एक अप्रैल दो हज़ार चौबीस" date { month: "अप्रैल" day: "एक" year: "दो हज़ार चौबीस" } -> "अप्रैल एक दो हज़ार चौबीस" - + Args: deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/decimal.py b/nemo_text_processing/text_normalization/hi/verbalizers/decimal.py index 57ec38003..ca4636897 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/decimal.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/decimal.py @@ -21,8 +21,8 @@ class DecimalFst(GraphFst): """ - Finite state transducer for classifying decimal, e.g. - decimal { negative: "true" integer_part: "बारह" fractional_part: "पाँच शून्य शून्य छह" quantity: "अरब" } -> ऋणात्मक बारह दशमलव पाँच शून्य शून्य छह + Finite state transducer for classifying decimal, e.g. + decimal { negative: "true" integer_part: "बारह" fractional_part: "पाँच शून्य शून्य छह" quantity: "अरब" } -> ऋणात्मक बारह दशमलव पाँच शून्य शून्य छह decimal { integer_part: "बारह" quantity: "billion" } -> बारह अरब """ diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py b/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py index e4cfae302..39b16b423 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py @@ -23,7 +23,7 @@ class FractionFst(GraphFst): Finite state transducer for verbalizing fraction e.g. fraction { integer: "तेईस" numerator: "चार" denominator: "छः" }-> तेईस चार बटा छः e.g. 
fraction { numerator: "चार" denominator: "छः" } -> चार बटा छः - + Args: deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/measure.py b/nemo_text_processing/text_normalization/hi/verbalizers/measure.py index 6cc6f8879..d6d17ac37 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/measure.py @@ -23,8 +23,8 @@ class MeasureFst(GraphFst): Finite state transducer for verbalizing measure, e.g. measure { negative: "true" cardinal { integer: "बारह" } units: "किलोग्राम" } -> ऋणात्मक बारह किलोग्राम measure { decimal { integer_part: "बारह" fractional_part: "दो" } units: "किलोग्राम" } -> बारह दशमलव दो किलोग्राम - - + + Args: decimal: DecimalFst cardinal: CardinalFs @@ -36,7 +36,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): super().__init__(name="measure", kind="verbalize") optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, 0, 1, + pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, + 0, + 1, ) unit = pynutil.delete("units: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + delete_space diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/post_processing.py b/nemo_text_processing/text_normalization/hi/verbalizers/post_processing.py index 87ec8e389..d838ca6ff 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/post_processing.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/post_processing.py @@ -96,10 +96,10 @@ def set_punct_dict(self): def get_punct_postprocess_graph(self): """ - Returns graph to post process punctuation marks. + Returns graph to post process punctuation marks. - {``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept. 
- By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks. + {``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept. + By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks. """ remove_space_around_single_quote = pynini.cdrewrite( diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/whitelist.py b/nemo_text_processing/text_normalization/hi/verbalizers/whitelist.py index 3f478a2d2..58dbc9583 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/whitelist.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/whitelist.py @@ -36,5 +36,5 @@ def __init__(self, deterministic: bool = True): + pynini.closure(NEMO_CHAR - " ", 1) + pynutil.delete("\"") ) - graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/hu/taggers/cardinal.py b/nemo_text_processing/text_normalization/hu/taggers/cardinal.py index c20a3d27b..c9c5c3063 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/hu/taggers/cardinal.py @@ -62,7 +62,7 @@ def filter_punctuation(fst: 'pynini.FstLike') -> 'pynini.FstLike': fst: A pynini.FstLike object """ cardinal_separator = pynini.string_map([".", NEMO_SPACE]) - exactly_three_digits = NEMO_DIGIT ** 3 # for blocks of three + exactly_three_digits = NEMO_DIGIT**3 # for blocks of three up_to_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) # for start of string up_to_three_digits = up_to_three_digits - "000" - "00" - "0" @@ -246,7 +246,7 @@ def __init__(self, deterministic: bool = True): self.graph = ( ((NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0)) @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", 
NEMO_SIGMA) - @ NEMO_DIGIT ** 24 + @ NEMO_DIGIT**24 @ graph @ clean_output ) @@ -257,12 +257,12 @@ def __init__(self, deterministic: bool = True): zero_space + digit, ).optimize() self.three_digits_read = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit, - zero_space + ((NEMO_DIGIT ** 2) @ graph_tens), + ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit, + zero_space + ((NEMO_DIGIT**2) @ graph_tens), zero_space + zero_space + digit, ).optimize() self.four_digits_read = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 3)) @ self.graph, zero_space + self.three_digits_read + ((NEMO_DIGIT - "0") + (NEMO_DIGIT**3)) @ self.graph, zero_space + self.three_digits_read ).optimize() self.graph |= graph_zero diff --git a/nemo_text_processing/text_normalization/hu/taggers/decimal.py b/nemo_text_processing/text_normalization/hu/taggers/decimal.py index 5026caec3..10ae4a8fe 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/hu/taggers/decimal.py @@ -101,7 +101,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): ]: for modifier in ["", "tíz", "száz"]: decimal_number |= ( - (NEMO_DIGIT ** order + (NEMO_DIGIT - "0")) + (NEMO_DIGIT**order + (NEMO_DIGIT - "0")) @ pynini.cdrewrite(pynini.cross("0", ""), "[BOS]", "", NEMO_SIGMA) @ cardinal_graph + final_zero diff --git a/nemo_text_processing/text_normalization/hu/taggers/measure.py b/nemo_text_processing/text_normalization/hu/taggers/measure.py index 9e5f328fb..f2c3a2368 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/measure.py +++ b/nemo_text_processing/text_normalization/hu/taggers/measure.py @@ -61,7 +61,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, de ) optional_unit_denominator = pynini.closure( - pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, 0, 1, + 
pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, + 0, + 1, ) unit_singular_graph = ( diff --git a/nemo_text_processing/text_normalization/hu/taggers/time.py b/nemo_text_processing/text_normalization/hu/taggers/time.py index ae1592f74..43e067fef 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/time.py +++ b/nemo_text_processing/text_normalization/hu/taggers/time.py @@ -180,7 +180,11 @@ def hours_to_pairs(): final_time_zone = ( pynini.accep(" ") + pynutil.insert("zone: \"") + convert_space(time_zone_graph) + pynutil.insert("\"") ) - final_time_zone_optional = pynini.closure(final_time_zone, 0, 1,) + final_time_zone_optional = pynini.closure( + final_time_zone, + 0, + 1, + ) # This might be better as just the inflected forms hour_only_delimited = ( diff --git a/nemo_text_processing/text_normalization/hu/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hu/taggers/tokenize_and_classify.py index 60ed0ddc9..8c269bb00 100644 --- a/nemo_text_processing/text_normalization/hu/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/hu/taggers/tokenize_and_classify.py @@ -69,7 +69,8 @@ def __init__( os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" far_file = os.path.join( - cache_dir, f"_{input_case}_hu_tn_{deterministic}_deterministic{whitelist_file}.far", + cache_dir, + f"_{input_case}_hu_tn_{deterministic}_deterministic{whitelist_file}.far", ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] @@ -86,10 +87,17 @@ def __init__( self.decimal = DecimalFst(cardinal=self.cardinal, deterministic=deterministic) decimal_graph = self.decimal.fst - self.fraction = FractionFst(cardinal=self.cardinal, ordinal=self.ordinal, deterministic=deterministic,) + self.fraction = FractionFst( + cardinal=self.cardinal, + ordinal=self.ordinal, + deterministic=deterministic, + ) 
fraction_graph = self.fraction.fst self.measure = MeasureFst( - cardinal=self.cardinal, decimal=self.decimal, fraction=self.fraction, deterministic=deterministic, + cardinal=self.cardinal, + decimal=self.decimal, + fraction=self.fraction, + deterministic=deterministic, ) measure_graph = self.measure.fst self.date = DateFst(cardinal=self.cardinal, deterministic=deterministic) @@ -101,7 +109,11 @@ def __init__( telephone_graph = self.telephone.fst self.electronic = ElectronicFst(deterministic=deterministic) electronic_graph = self.electronic.fst - self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic,) + self.money = MoneyFst( + cardinal=self.cardinal, + decimal=self.decimal, + deterministic=deterministic, + ) money_graph = self.money.fst self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist) whitelist_graph = self.whitelist.fst diff --git a/nemo_text_processing/text_normalization/hu/verbalizers/telephone.py b/nemo_text_processing/text_normalization/hu/verbalizers/telephone.py index f17f7c36a..b52e6efb7 100644 --- a/nemo_text_processing/text_normalization/hu/verbalizers/telephone.py +++ b/nemo_text_processing/text_normalization/hu/verbalizers/telephone.py @@ -34,7 +34,11 @@ def __init__(self, deterministic: bool = True): country_code = pynutil.delete("country_code: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") - optional_country_code = pynini.closure(country_code + delete_space + insert_space, 0, 1,) + optional_country_code = pynini.closure( + country_code + delete_space + insert_space, + 0, + 1, + ) number_part = ( pynutil.delete("number_part: \"") @@ -53,6 +57,8 @@ def __init__(self, deterministic: bool = True): 1, ) - graph = pynini.union(optional_country_code + number_part + optional_extension,) + graph = pynini.union( + optional_country_code + number_part + optional_extension, + ) delete_tokens = self.delete_tokens(graph) self.fst = 
delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/hy/verbalizers/whitelist.py b/nemo_text_processing/text_normalization/hy/verbalizers/whitelist.py index 7d5783688..5fd8f7bdb 100644 --- a/nemo_text_processing/text_normalization/hy/verbalizers/whitelist.py +++ b/nemo_text_processing/text_normalization/hy/verbalizers/whitelist.py @@ -29,6 +29,6 @@ def __init__(self): + pynutil.delete("\"") ) graph = graph @ pynini.cdrewrite( - pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA + pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA ) # Removes possible null token self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/hy/verbalizers/word.py b/nemo_text_processing/text_normalization/hy/verbalizers/word.py index b0174d35e..9ef23b1d5 100644 --- a/nemo_text_processing/text_normalization/hy/verbalizers/word.py +++ b/nemo_text_processing/text_normalization/hy/verbalizers/word.py @@ -33,6 +33,6 @@ def __init__(self): super().__init__(name="word", kind="verbalize") chars = pynini.closure(NEMO_CHAR - " ", 1) char = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + chars + pynutil.delete("\"") - graph = char @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + graph = char @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/it/taggers/cardinal.py b/nemo_text_processing/text_normalization/it/taggers/cardinal.py index ecb003775..1e16d6e36 100644 --- a/nemo_text_processing/text_normalization/it/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/it/taggers/cardinal.py @@ -48,7 +48,7 @@ def filter_punctuation(fst: 'pynini.FstLike') -> 'pynini.FstLike': Returns: fst: A pynini.FstLike object """ - exactly_three_digits = NEMO_DIGIT ** 3 # for blocks of three + exactly_three_digits = NEMO_DIGIT**3 # for blocks of three up_to_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) # for start of string 
cardinal_string = pynini.closure( @@ -162,7 +162,7 @@ def __init__(self, deterministic: bool = True): self.graph = ( ((NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0)) @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", NEMO_SIGMA) - @ NEMO_DIGIT ** 24 + @ NEMO_DIGIT**24 @ graph @ pynini.cdrewrite(delete_space, "[BOS]", "", NEMO_SIGMA) @ pynini.cdrewrite(delete_space, "", "[EOS]", NEMO_SIGMA) diff --git a/nemo_text_processing/text_normalization/it/taggers/measure.py b/nemo_text_processing/text_normalization/it/taggers/measure.py index 40144cd61..880be0aa7 100644 --- a/nemo_text_processing/text_normalization/it/taggers/measure.py +++ b/nemo_text_processing/text_normalization/it/taggers/measure.py @@ -68,7 +68,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = ) optional_unit_denominator = pynini.closure( - pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, 0, 1, + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit_denominator, + 0, + 1, ) unit_plural = ( diff --git a/nemo_text_processing/text_normalization/it/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/it/taggers/tokenize_and_classify.py index 3aebcca91..603d520b5 100644 --- a/nemo_text_processing/text_normalization/it/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/it/taggers/tokenize_and_classify.py @@ -66,7 +66,8 @@ def __init__( os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" far_file = os.path.join( - cache_dir, f"_{input_case}_it_tn_{deterministic}_deterministic{whitelist_file}.far", + cache_dir, + f"_{input_case}_it_tn_{deterministic}_deterministic{whitelist_file}.far", ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] @@ -88,10 +89,18 @@ def __init__( self.electronic = ElectronicFst(deterministic=deterministic) electronic_graph = 
self.electronic.fst - self.measure = MeasureFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic,) + self.measure = MeasureFst( + cardinal=self.cardinal, + decimal=self.decimal, + deterministic=deterministic, + ) measure_graph = self.measure.fst - self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic,) + self.money = MoneyFst( + cardinal=self.cardinal, + decimal=self.decimal, + deterministic=deterministic, + ) money_graph = self.money.fst self.time = TimeFst(deterministic=deterministic) diff --git a/nemo_text_processing/text_normalization/ja/graph_utils.py b/nemo_text_processing/text_normalization/ja/graph_utils.py index b7a01567f..f4c30b6ee 100644 --- a/nemo_text_processing/text_normalization/ja/graph_utils.py +++ b/nemo_text_processing/text_normalization/ja/graph_utils.py @@ -35,10 +35,10 @@ NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize() NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize() NEMO_HEX = pynini.union(*string.hexdigits).optimize() -NEMO_NON_BREAKING_SPACE = u"\u00A0" -NEMO_NARROW_NON_BREAK_SPACE = u"\u202F" +NEMO_NON_BREAKING_SPACE = u"\u00a0" +NEMO_NARROW_NON_BREAK_SPACE = u"\u202f" NEMO_SPACE = " " -NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize() +NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00a0").optimize() NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() @@ -294,4 +294,4 @@ def delete_tokens(self, fst) -> 'pynini.FstLike': + delete_space + pynutil.delete("}") ) - return res @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + return res @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) diff --git a/nemo_text_processing/text_normalization/ja/taggers/cardinal.py b/nemo_text_processing/text_normalization/ja/taggers/cardinal.py index b17abbbbb..ff80f6a3b 100644 --- 
a/nemo_text_processing/text_normalization/ja/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/ja/taggers/cardinal.py @@ -23,7 +23,7 @@ class CardinalFst(GraphFst): """ Finite state transducer for classifying cardinals - e.g. 23 -> cardinal { integer: "二十三" } + e.g. 23 -> cardinal { integer: "二十三" } """ def __init__(self, deterministic: bool = True): @@ -41,13 +41,13 @@ def __init__(self, deterministic: bool = True): graph_all = (graph_ties + (graph_digit | pynutil.delete('0'))) | graph_teen | graph_digit - hundreds = NEMO_DIGIT ** 3 + hundreds = NEMO_DIGIT**3 graph_hundred_component = (pynini.cross('1', '百') | (graph_digit_alt + pynutil.insert('百'))) + pynini.union( pynini.closure(pynutil.delete('0')), (pynini.closure(pynutil.delete('0')) + graph_all) ) graph_hundred = hundreds @ graph_hundred_component - thousands = NEMO_DIGIT ** 4 + thousands = NEMO_DIGIT**4 graph_thousand_component = (pynini.cross('1', '千') | (graph_digit_alt + pynutil.insert('千'))) + pynini.union( pynini.closure(pynutil.delete('0')), graph_hundred_component, @@ -61,7 +61,7 @@ def __init__(self, deterministic: bool = True): # this grammar is for larger number in later gramamr graph_thousand = thousands @ graph_thousand_component - ten_thousands = NEMO_DIGIT ** 5 + ten_thousands = NEMO_DIGIT**5 graph_ten_thousand_component = (graph_digit + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), graph_thousand_component, @@ -71,8 +71,8 @@ def __init__(self, deterministic: bool = True): graph_ten_thousand = ten_thousands @ graph_ten_thousand_component self.man = graph_ten_thousand.optimize() - hundred_thousands = NEMO_DIGIT ** 6 - hundred_thousands_position = NEMO_DIGIT ** 2 + hundred_thousands = NEMO_DIGIT**6 + hundred_thousands_position = NEMO_DIGIT**2 hundred_thousands_position = hundred_thousands_position @ graph_all graph_hundred_thousand_component = (hundred_thousands_position + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ 
-82,8 +82,8 @@ def __init__(self, deterministic: bool = True): ) graph_hundred_thousand = hundred_thousands @ graph_hundred_thousand_component - millions = NEMO_DIGIT ** 7 - million_position = NEMO_DIGIT ** 3 + millions = NEMO_DIGIT**7 + million_position = NEMO_DIGIT**3 million_position = million_position @ graph_hundred_component graph_million_component = (million_position + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -93,8 +93,8 @@ def __init__(self, deterministic: bool = True): ) graph_million = millions @ graph_million_component - ten_millions = NEMO_DIGIT ** 8 - ten_million_position = NEMO_DIGIT ** 4 + ten_millions = NEMO_DIGIT**8 + ten_million_position = NEMO_DIGIT**4 ten_million_position = ten_million_position @ graph_thousand_component_alt graph_ten_million_component = (ten_million_position + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -104,7 +104,7 @@ def __init__(self, deterministic: bool = True): ) graph_ten_million = ten_millions @ graph_ten_million_component - hundred_millions = NEMO_DIGIT ** 9 + hundred_millions = NEMO_DIGIT**9 graph_hundred_million_component = (graph_digit + pynutil.insert('億')) + pynini.union( pynini.closure(pynutil.delete('0')), graph_ten_million_component, @@ -117,8 +117,8 @@ def __init__(self, deterministic: bool = True): ) graph_hundred_million = hundred_millions @ graph_hundred_million_component - thousand_millions = NEMO_DIGIT ** 10 - thousand_millions_position = NEMO_DIGIT ** 2 + thousand_millions = NEMO_DIGIT**10 + thousand_millions_position = NEMO_DIGIT**2 thousand_millions_position = thousand_millions_position @ graph_all graph_thousand_million_component = (thousand_millions_position + pynutil.insert('億')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -132,8 +132,8 @@ def __init__(self, deterministic: bool = True): ) graph_thousand_million = thousand_millions @ graph_thousand_million_component - ten_billions = NEMO_DIGIT ** 11 - 
ten_billions_position = NEMO_DIGIT ** 3 + ten_billions = NEMO_DIGIT**11 + ten_billions_position = NEMO_DIGIT**3 ten_billions_position = ten_billions_position @ graph_hundred_component graph_ten_billions_component = (ten_billions_position + pynutil.insert('億')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -147,8 +147,8 @@ def __init__(self, deterministic: bool = True): ) graph_ten_billions = ten_billions @ graph_ten_billions_component - hundred_billions = NEMO_DIGIT ** 12 - hundred_billions_position = NEMO_DIGIT ** 4 + hundred_billions = NEMO_DIGIT**12 + hundred_billions_position = NEMO_DIGIT**4 hundred_billions_position = hundred_billions_position @ graph_thousand_component_alt graph_hundred_billions_component = (hundred_billions_position + pynutil.insert('億')) + pynini.union( pynini.closure(pynutil.delete('0')), diff --git a/nemo_text_processing/text_normalization/ja/taggers/date.py b/nemo_text_processing/text_normalization/ja/taggers/date.py index 25dbd71de..a8a469252 100644 --- a/nemo_text_processing/text_normalization/ja/taggers/date.py +++ b/nemo_text_processing/text_normalization/ja/taggers/date.py @@ -38,24 +38,24 @@ class DateFst(GraphFst): M.5 -> date { era: "明治" "year: "五年" } 21日月曜日 -> tokens { date { day: "二十一日" weekday: "月曜日" } } 70年代 -> date { year: "七十年代" } - 西暦794年 -> tokens { date { era: "西暦" year: "七百九十四年" } } - 1月1日(月)〜3日(水) - -> tokens { date { month: "一月" day: "一日" weekday: "月曜日" } } tokens { name: "から" } tokens { date { day: "三日" weekday: "水曜日" } } + 西暦794年 -> tokens { date { era: "西暦" year: "七百九十四年" } } + 1月1日(月)〜3日(水) + -> tokens { date { month: "一月" day: "一日" weekday: "月曜日" } } tokens { name: "から" } tokens { date { day: "三日" weekday: "水曜日" } } 70〜80年代 -> tokens { cardinal { integer: "七十" } } tokens { name: "から" } tokens { date { year: "八十年代" } } 7月5〜9日(月〜金) - -> tokens { date { month: "七月" } } tokens { cardinal { integer: "五" } } tokens { name: "から" } tokens { date { day: "九日" weekday: "月曜日" } } tokens { name: "から" } tokens { date 
{ weekday: "金曜日" } } + -> tokens { date { month: "七月" } } tokens { cardinal { integer: "五" } } tokens { name: "から" } tokens { date { day: "九日" weekday: "月曜日" } } tokens { name: "から" } tokens { date { weekday: "金曜日" } } 7月初旬〜9月中旬 - -> tokens { date { month: "七月" } } tokens { name: "初" } tokens { name: "旬" } tokens { name: "から" } tokens { date { month: "九月" } } tokens { name: "中" } tokens { name: "旬" } + -> tokens { date { month: "七月" } } tokens { name: "初" } tokens { name: "旬" } tokens { name: "から" } tokens { date { month: "九月" } } tokens { name: "中" } tokens { name: "旬" } 3〜4月 -> tokens { cardinal { integer: "三" } } tokens { name: "から" } tokens { date { month: "四月" } } - 2023年3月1日(水)〜6月12日(火) - -> tokens { date { year: "二千二十三年" month: "三月" day: "一日" weekday: "水曜日" } } tokens { name: "から" } tokens { date { month: "六月" day: "十二日" weekday: "火曜日" } } + 2023年3月1日(水)〜6月12日(火) + -> tokens { date { year: "二千二十三年" month: "三月" day: "一日" weekday: "水曜日" } } tokens { name: "から" } tokens { date { month: "六月" day: "十二日" weekday: "火曜日" } } 10月中旬〜11月上旬 -> tokens { date { month: "十月" } } tokens { date { month: "中旬" } } tokens { name: "から" } tokens { date { month: "十一月" } } tokens { date { month: "上旬" } } - 1976年7月17日〜8月1日 - -> tokens { date { year: "千九百七十六年" month: "七月" day: "十七日" } } tokens { name: "から" } tokens { date { month: "八月" day: "一日" } } - + 1976年7月17日〜8月1日 + -> tokens { date { year: "千九百七十六年" month: "七月" day: "十七日" } } tokens { name: "から" } tokens { date { month: "八月" day: "一日" } } + Args: cardinal: CardinalFst """ diff --git a/nemo_text_processing/text_normalization/ja/taggers/decimal.py b/nemo_text_processing/text_normalization/ja/taggers/decimal.py index 4ccd06d57..8fdea4c87 100644 --- a/nemo_text_processing/text_normalization/ja/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/ja/taggers/decimal.py @@ -25,7 +25,7 @@ class DecimalFst(GraphFst): Finite state transducer for classifying decimal, e.g. 
0.5 -> decimal { integer_part: "零" fractional_part: "五" } -0.5万 -> decimal { negative: "マイナス" integer_part: "零" fractional_part: "五" quantity: "万"} - + Args: cardinal: CardinalFst """ @@ -46,7 +46,9 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): graph_decimal_no_sign = graph_integer + pynutil.delete('.') + pynutil.insert(" ") + graph_fraction graph_optional_sign = ( - pynutil.insert("negative: \"") + (pynini.cross("-", "マイナス") | pynini.accep("マイナス")) + pynutil.insert("\"") + pynutil.insert("negative: \"") + + (pynini.cross("-", "マイナス") | pynini.accep("マイナス")) + + pynutil.insert("\"") ) graph_decimal = graph_decimal_no_sign | (graph_optional_sign + pynutil.insert(" ") + graph_decimal_no_sign) diff --git a/nemo_text_processing/text_normalization/ja/taggers/fraction.py b/nemo_text_processing/text_normalization/ja/taggers/fraction.py index 0dd488f4f..94fb4af68 100644 --- a/nemo_text_processing/text_normalization/ja/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/ja/taggers/fraction.py @@ -110,7 +110,9 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): ) optional_sign = ( - pynutil.insert("negative: \"") + (pynini.accep("マイナス") | pynini.cross("-", "マイナス")) + pynutil.insert("\"") + pynutil.insert("negative: \"") + + (pynini.accep("マイナス") | pynini.cross("-", "マイナス")) + + pynutil.insert("\"") ) graph_fraction_slash_sigh = pynini.closure(optional_sign + pynutil.insert(NEMO_SPACE), 0, 1) + ( diff --git a/nemo_text_processing/text_normalization/ja/taggers/punctuation.py b/nemo_text_processing/text_normalization/ja/taggers/punctuation.py index 24ee2f15f..c5df8388c 100644 --- a/nemo_text_processing/text_normalization/ja/taggers/punctuation.py +++ b/nemo_text_processing/text_normalization/ja/taggers/punctuation.py @@ -62,7 +62,9 @@ def __init__(self, deterministic: bool = True): + pynini.accep(">") ) punct = plurals._priority_union(emphasis, punct, NEMO_SIGMA) - range_component = pynini.cross("〜", "から") | 
pynini.accep("から") # forcing this conversion for special tilde + range_component = pynini.cross("〜", "から") | pynini.accep( + "から" + ) # forcing this conversion for special tilde self.graph = punct | pynutil.add_weight(range_component, -1.0) self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize() diff --git a/nemo_text_processing/text_normalization/ja/taggers/time.py b/nemo_text_processing/text_normalization/ja/taggers/time.py index 6b8a308a2..7c74bc53e 100644 --- a/nemo_text_processing/text_normalization/ja/taggers/time.py +++ b/nemo_text_processing/text_normalization/ja/taggers/time.py @@ -25,7 +25,7 @@ class TimeFst(GraphFst): Finite state transducer for classifying time, e.g. 1時30分 -> time { hours: "一" minutes: "三十" } 今夜0時 -> time { suffix: "今夜" hours: "零" } - + Args: cardinal: CardinalFst """ diff --git a/nemo_text_processing/text_normalization/ja/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/ja/taggers/tokenize_and_classify.py index c28c444ed..f992e9b70 100644 --- a/nemo_text_processing/text_normalization/ja/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/ja/taggers/tokenize_and_classify.py @@ -33,9 +33,9 @@ class ClassifyFst(GraphFst): """ Final class that composes all other classification grammars. This class can process an entire sentence including punctuation. - For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. + For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. More details to deployment at NeMo/tools/text_processing_deployment. - + Args: input_case: accepting either "lower_cased" or "cased" input. 
deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/ja/utils.py b/nemo_text_processing/text_normalization/ja/utils.py index 2a5455b2b..65523afed 100644 --- a/nemo_text_processing/text_normalization/ja/utils.py +++ b/nemo_text_processing/text_normalization/ja/utils.py @@ -22,7 +22,7 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - + Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path @@ -35,6 +35,7 @@ def get_abs_path(rel_path): # Args: # abs_path: absolute path + # Returns dictionary of mappings # """ # #label_tsv = open(abs_path, encoding="utf-8") diff --git a/nemo_text_processing/text_normalization/ja/verbalizers/date.py b/nemo_text_processing/text_normalization/ja/verbalizers/date.py index 209c3c34f..8292c622a 100644 --- a/nemo_text_processing/text_normalization/ja/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/ja/verbalizers/date.py @@ -23,7 +23,7 @@ class DateFst(GraphFst): """ Finite state transducer for verbalizing date e.g. date { year: "二千二十四" month: "三" day: "四" } -> 二千二十四年三月四日 - + """ def __init__(self, deterministic: bool = True): diff --git a/nemo_text_processing/text_normalization/ja/verbalizers/post_processing.py b/nemo_text_processing/text_normalization/ja/verbalizers/post_processing.py index 4bafef0bd..8b196dcaf 100644 --- a/nemo_text_processing/text_normalization/ja/verbalizers/post_processing.py +++ b/nemo_text_processing/text_normalization/ja/verbalizers/post_processing.py @@ -96,10 +96,10 @@ def set_punct_dict(self): def get_punct_postprocess_graph(self): """ - Returns graph to post process punctuation marks. + Returns graph to post process punctuation marks. - {``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept. - By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks. 
+ {``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept. + By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks. """ remove_space_around_single_quote = pynini.cdrewrite( diff --git a/nemo_text_processing/text_normalization/ja/verbalizers/postprocessor.py b/nemo_text_processing/text_normalization/ja/verbalizers/postprocessor.py index 89b56f8dc..3ff05fa57 100644 --- a/nemo_text_processing/text_normalization/ja/verbalizers/postprocessor.py +++ b/nemo_text_processing/text_normalization/ja/verbalizers/postprocessor.py @@ -29,14 +29,18 @@ class PostProcessor(GraphFst): ''' - Postprocessing of TN, now contains: - 1. punctuation removal - 2. letter case conversion - 3. oov tagger + Postprocessing of TN, now contains: + 1. punctuation removal + 2. letter case conversion + 3. oov tagger ''' def __init__( - self, remove_puncts: bool = False, to_upper: bool = False, to_lower: bool = False, tag_oov: bool = False, + self, + remove_puncts: bool = False, + to_upper: bool = False, + to_lower: bool = False, + tag_oov: bool = False, ): super().__init__(name="PostProcessor", kind="processor") diff --git a/nemo_text_processing/text_normalization/ja/verbalizers/time.py b/nemo_text_processing/text_normalization/ja/verbalizers/time.py index 73029ae6c..7058f437b 100644 --- a/nemo_text_processing/text_normalization/ja/verbalizers/time.py +++ b/nemo_text_processing/text_normalization/ja/verbalizers/time.py @@ -22,8 +22,8 @@ class TimeFst(GraphFst): """ Finite state transducer for verbalizing time e.g. 
- - + + """ def __init__(self, deterministic: bool = True): diff --git a/nemo_text_processing/text_normalization/ja/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/ja/verbalizers/verbalize.py index dbf59c446..6a16f96d9 100644 --- a/nemo_text_processing/text_normalization/ja/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/ja/verbalizers/verbalize.py @@ -29,7 +29,7 @@ class VerbalizeFst(GraphFst): """ Composes other verbalizer grammars. - For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. + For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. More details to deployment at NeMo/tools/text_processing_deployment. Args: deterministic: if True will provide a single transduction option, @@ -52,7 +52,14 @@ def __init__(self, deterministic: bool = True): whitelist = WhiteListFst(deterministic=deterministic) graph = pynini.union( - date.fst, cardinal.fst, ordinal.fst, decimal.fst, fraction.fst, word.fst, time.fst, whitelist.fst, + date.fst, + cardinal.fst, + ordinal.fst, + decimal.fst, + fraction.fst, + word.fst, + time.fst, + whitelist.fst, ) graph = pynini.closure(delete_space) + graph + pynini.closure(delete_space) diff --git a/nemo_text_processing/text_normalization/ja/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/ja/verbalizers/verbalize_final.py index c0327a876..750598649 100644 --- a/nemo_text_processing/text_normalization/ja/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/ja/verbalizers/verbalize_final.py @@ -26,9 +26,7 @@ class VerbalizeFinalFst(GraphFst): - """ - - """ + """ """ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False): super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic) @@ -46,6 +44,11 @@ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_ ) 
verbalizer = pynini.closure(delete_space + token_verbalizer + delete_space) - postprocessor = PostProcessor(remove_puncts=False, to_upper=False, to_lower=False, tag_oov=False,) + postprocessor = PostProcessor( + remove_puncts=False, + to_upper=False, + to_lower=False, + tag_oov=False, + ) self.fst = (verbalizer @ postprocessor.fst).optimize() diff --git a/nemo_text_processing/text_normalization/ja/verbalizers/whitelist.py b/nemo_text_processing/text_normalization/ja/verbalizers/whitelist.py index 6476a8836..11b0b3ae0 100644 --- a/nemo_text_processing/text_normalization/ja/verbalizers/whitelist.py +++ b/nemo_text_processing/text_normalization/ja/verbalizers/whitelist.py @@ -34,5 +34,5 @@ def __init__(self, deterministic: bool = True): + pynini.closure(NEMO_CHAR - " ", 1) + pynutil.delete("\"") ) - graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/ja/verbalizers/word.py b/nemo_text_processing/text_normalization/ja/verbalizers/word.py index 6ee724f93..afd5d1037 100644 --- a/nemo_text_processing/text_normalization/ja/verbalizers/word.py +++ b/nemo_text_processing/text_normalization/ja/verbalizers/word.py @@ -20,7 +20,7 @@ class WordFst(GraphFst): ''' - tokens { char: "文字" } -> 文字 + tokens { char: "文字" } -> 文字 ''' def __init__(self, deterministic: bool = True): diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py index 1a38dde3d..82f8f43d2 100644 --- a/nemo_text_processing/text_normalization/normalize.py +++ b/nemo_text_processing/text_normalization/normalize.py @@ -540,8 +540,8 @@ def split_text_into_sentences(self, text: str, additional_split_symbols: str = " upper_case_unicode = "" if self.lang == "ru": - lower_case_unicode = '\u0430-\u04FF' - upper_case_unicode = '\u0410-\u042F' + lower_case_unicode = 
'\u0430-\u04ff' + upper_case_unicode = '\u0410-\u042f' # end of quoted speech - to be able to split sentences by full stop text = re.sub(r"([\.\?\!])([\"\'])", r"\g<2>\g<1> ", text) diff --git a/nemo_text_processing/text_normalization/normalize_with_audio.py b/nemo_text_processing/text_normalization/normalize_with_audio.py index 6a61efd4e..8a60516cc 100644 --- a/nemo_text_processing/text_normalization/normalize_with_audio.py +++ b/nemo_text_processing/text_normalization/normalize_with_audio.py @@ -164,11 +164,16 @@ def normalize( text_with_span_tags_list[masked_idx_list[sem_tag_idx]] = "" else: non_deter_options = self.normalize_non_deterministic( - text=cur_semiotic_span, n_tagged=n_tagged, punct_post_process=punct_post_process, verbose=verbose, + text=cur_semiotic_span, + n_tagged=n_tagged, + punct_post_process=punct_post_process, + verbose=verbose, ) try: best_option, cer, _ = self.select_best_match( - normalized_texts=non_deter_options, pred_text=cur_pred_text, verbose=verbose, + normalized_texts=non_deter_options, + pred_text=cur_pred_text, + verbose=verbose, ) if cer_threshold > 0 and cer > cer_threshold: best_option = cur_deter_norm @@ -366,7 +371,11 @@ def get_verbalized_text(tagged_text): continue def select_best_match( - self, normalized_texts: List[str], pred_text: str, verbose: bool = False, remove_punct: bool = False, + self, + normalized_texts: List[str], + pred_text: str, + verbose: bool = False, + remove_punct: bool = False, ): """ Selects the best normalization option based on the lowest CER diff --git a/nemo_text_processing/text_normalization/ru/taggers/date.py b/nemo_text_processing/text_normalization/ru/taggers/date.py index 2dc87ee06..3ad16f999 100644 --- a/nemo_text_processing/text_normalization/ru/taggers/date.py +++ b/nemo_text_processing/text_normalization/ru/taggers/date.py @@ -78,7 +78,7 @@ def __init__(self, number_names: dict, deterministic: bool): month = ( pynutil.insert("month: \"") + (month_name | pynutil.add_weight(digit_month, 
0.1)) + pynutil.insert("\"") ).optimize() - year = pynini.compose(((NEMO_DIGIT ** 4) | (NEMO_DIGIT ** 2)), numbers).optimize() + year = pynini.compose(((NEMO_DIGIT**4) | (NEMO_DIGIT**2)), numbers).optimize() year |= zero_digit # reduce year options diff --git a/nemo_text_processing/text_normalization/ru/taggers/telephone.py b/nemo_text_processing/text_normalization/ru/taggers/telephone.py index 4fbfbf06a..456bd6f1a 100644 --- a/nemo_text_processing/text_normalization/ru/taggers/telephone.py +++ b/nemo_text_processing/text_normalization/ru/taggers/telephone.py @@ -48,13 +48,13 @@ def __init__(self, number_names: dict, deterministic: bool = True): optional_country_code = pynini.closure(country_code + insert_space, 0, 1) number_part = ( - NEMO_DIGIT ** 3 @ number + NEMO_DIGIT**3 @ number + separator - + NEMO_DIGIT ** 3 @ number + + NEMO_DIGIT**3 @ number + separator - + NEMO_DIGIT ** 2 @ number + + NEMO_DIGIT**2 @ number + separator - + NEMO_DIGIT ** 2 @ (pynini.closure(pynini.cross("0", "ноль ")) + number) + + NEMO_DIGIT**2 @ (pynini.closure(pynini.cross("0", "ноль ")) + number) ) number_part = pynutil.insert("number_part: \"") + number_part + pynutil.insert("\"") tagger_graph = (optional_country_code + number_part).optimize() diff --git a/nemo_text_processing/text_normalization/rw/graph_utils.py b/nemo_text_processing/text_normalization/rw/graph_utils.py index ce75cd17e..ecb20fe84 100644 --- a/nemo_text_processing/text_normalization/rw/graph_utils.py +++ b/nemo_text_processing/text_normalization/rw/graph_utils.py @@ -38,9 +38,9 @@ NEMO_VOWELS = pynini.union(*"aeiouAEIOU").optimize() NEMO_CONSONANTS = pynini.union(*"BCDFGHJKLMNPQRSTVWXYZbcdfghjklmnpqrstvwxyz").optimize() NEMO_HEX = pynini.union(*string.hexdigits).optimize() -NEMO_NON_BREAKING_SPACE = "\u00A0" +NEMO_NON_BREAKING_SPACE = "\u00a0" NEMO_SPACE = " " -NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00A0").optimize() +NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00a0").optimize() 
NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() @@ -107,14 +107,36 @@ suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv")) # _v = pynini.union("a", "e", "i", "o", "u") _c = pynini.union( - "b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z", + "b", + "c", + "d", + "f", + "g", + "h", + "j", + "k", + "l", + "m", + "n", + "p", + "q", + "r", + "s", + "t", + "v", + "w", + "x", + "y", + "z", ) _ies = NEMO_SIGMA + _c + pynini.cross("y", "ies") _es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es") _s = NEMO_SIGMA + pynutil.insert("s") graph_plural = plurals._priority_union( - suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA, + suppletive, + plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), + NEMO_SIGMA, ).optimize() SINGULAR_TO_PLURAL = graph_plural @@ -129,7 +151,9 @@ def capitalized_input_graph( - graph: "pynini.FstLike", original_graph_weight: float = None, capitalized_graph_weight: float = None, + graph: "pynini.FstLike", + original_graph_weight: float = None, + capitalized_graph_weight: float = None, ) -> "pynini.FstLike": """ Allow graph input to be capitalized, e.g. 
for ITN) @@ -270,4 +294,4 @@ def delete_tokens(self, fst) -> "pynini.FstLike": + delete_space + pynutil.delete("}") ) - return res @ pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", NEMO_SIGMA) + return res @ pynini.cdrewrite(pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA) diff --git a/nemo_text_processing/text_normalization/sv/taggers/cardinal.py b/nemo_text_processing/text_normalization/sv/taggers/cardinal.py index 021e652bd..750ff867b 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/sv/taggers/cardinal.py @@ -69,7 +69,7 @@ def filter_punctuation(fst: 'pynini.FstLike') -> 'pynini.FstLike': Returns: fst: A pynini.FstLike object """ - exactly_three_digits = NEMO_DIGIT ** 3 # for blocks of three + exactly_three_digits = NEMO_DIGIT**3 # for blocks of three up_to_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) # for start of string cardinal_separator = NEMO_SPACE @@ -249,7 +249,7 @@ def __init__(self, deterministic: bool = True): self.graph = ( ((NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0)) @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", NEMO_SIGMA) - @ NEMO_DIGIT ** 24 + @ NEMO_DIGIT**24 @ graph @ pynini.cdrewrite(delete_space, "[BOS]", "", NEMO_SIGMA) @ pynini.cdrewrite(delete_space, "", "[EOS]", NEMO_SIGMA) @@ -276,30 +276,27 @@ def __init__(self, deterministic: bool = True): zero_space = zero + insert_space self.zero_space = zero_space self.three_digits_read = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) - @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, - zero_space + ((NEMO_DIGIT ** 2) @ graph_tens), + ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, + zero_space + ((NEMO_DIGIT**2) @ graph_tens), zero_space + zero_space + digit, ) self.three_digits_read_en = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) + ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) @ 
self.graph_hundreds_component_at_least_one_non_zero_digit_no_one_en, - zero_space + ((NEMO_DIGIT ** 2) @ graph_tens), + zero_space + ((NEMO_DIGIT**2) @ graph_tens), zero_space + zero_space + digit, ) self.three_digits_read_frac = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) - @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, + ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, zero_space + digit + insert_space + digit, ) self.three_digits_read_frac_en = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) + ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one_en, zero_space + digit + insert_space + digit, ) self.two_or_three_digits_read_frac = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) - @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, + ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, ((NEMO_DIGIT - "0") + NEMO_DIGIT) @ graph_tens, zero_space + single_digits_graph + pynini.closure(insert_space + digit, 0, 1), single_digits_graph + pynini.closure(insert_space + single_digits_graph, 3), @@ -307,7 +304,7 @@ def __init__(self, deterministic: bool = True): single_digits_graph, ) self.two_or_three_digits_read_frac_en = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) + ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one_en, ((NEMO_DIGIT - "0") + NEMO_DIGIT) @ (graph_tens @ pynini.cdrewrite(ett_to_en, "", "[EOS]", NEMO_SIGMA)), zero_space + single_digits_graph + pynini.closure(insert_space + single_digits_graph, 0, 1), @@ -316,9 +313,8 @@ def __init__(self, deterministic: bool = True): single_digits_graph, ) self.two_or_three_digits_read_frac_both = pynini.union( - ((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) - @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, - 
((NEMO_DIGIT - "0") + (NEMO_DIGIT ** 2)) + ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one, + ((NEMO_DIGIT - "0") + (NEMO_DIGIT**2)) @ self.graph_hundreds_component_at_least_one_non_zero_digit_no_one_en, ((NEMO_DIGIT - "0") + NEMO_DIGIT) @ graph_tens, ((NEMO_DIGIT - "0") + NEMO_DIGIT) @ (graph_tens @ pynini.cdrewrite(ett_to_en, "", "[EOS]", NEMO_SIGMA)), diff --git a/nemo_text_processing/text_normalization/sv/taggers/measure.py b/nemo_text_processing/text_normalization/sv/taggers/measure.py index e114e9e6d..4da3f81c2 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/measure.py +++ b/nemo_text_processing/text_normalization/sv/taggers/measure.py @@ -81,7 +81,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, de ) optional_graph_unit2 = pynini.closure( - delete_zero_or_one_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit2, 0, 1, + delete_zero_or_one_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + graph_unit2, + 0, + 1, ) unit_plural = ( diff --git a/nemo_text_processing/text_normalization/sv/taggers/ordinal.py b/nemo_text_processing/text_normalization/sv/taggers/ordinal.py index 25dfb6e9b..0877ca08f 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/sv/taggers/ordinal.py @@ -95,7 +95,10 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): hundreds |= pynini.cross("1", "ett hundra") hundreds |= digit + pynutil.insert(NEMO_SPACE) + pynutil.insert("hundra") - graph_hundreds = hundreds + pynini.union(graph_tens, (pynutil.delete("0") + graph_digit),) + graph_hundreds = hundreds + pynini.union( + graph_tens, + (pynutil.delete("0") + graph_digit), + ) if not deterministic: graph_hundreds |= hundreds + pynini.union( (graph_teens | pynutil.insert(NEMO_SPACE) + graph_teens), (pynini.cross("0", NEMO_SPACE) + graph_digit) @@ -179,7 +182,7 @@ def __init__(self, cardinal: 
GraphFst, deterministic: bool = True): self.graph = ( ((NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0)) @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", NEMO_SIGMA) - @ NEMO_DIGIT ** 24 + @ NEMO_DIGIT**24 @ graph @ pynini.cdrewrite(delete_space, "[BOS]", "", NEMO_SIGMA) @ pynini.cdrewrite(delete_space, "", "[EOS]", NEMO_SIGMA) diff --git a/nemo_text_processing/text_normalization/sv/taggers/time.py b/nemo_text_processing/text_normalization/sv/taggers/time.py index 676e78592..cb5067058 100644 --- a/nemo_text_processing/text_normalization/sv/taggers/time.py +++ b/nemo_text_processing/text_normalization/sv/taggers/time.py @@ -106,7 +106,11 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): final_suffix = pynutil.insert("suffix: \"") + convert_space(suffix_graph) + pynutil.insert("\"") final_suffix_optional = pynini.closure(ensure_space + final_suffix, 0, 1) final_time_zone = pynutil.insert("zone: \"") + convert_space(time_zone_graph) + pynutil.insert("\"") - final_time_zone_optional = pynini.closure(NEMO_SPACE + final_time_zone, 0, 1,) + final_time_zone_optional = pynini.closure( + NEMO_SPACE + final_time_zone, + 0, + 1, + ) # 2:30 pm, 02:30, 2:00 graph_hm_kl = ( diff --git a/nemo_text_processing/text_normalization/sv/verbalizers/telephone.py b/nemo_text_processing/text_normalization/sv/verbalizers/telephone.py index af17c6d48..6656e3445 100644 --- a/nemo_text_processing/text_normalization/sv/verbalizers/telephone.py +++ b/nemo_text_processing/text_normalization/sv/verbalizers/telephone.py @@ -40,7 +40,11 @@ def __init__(self, deterministic: bool = True): country_code = pynutil.delete("country_code: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") - optional_country_code = pynini.closure(country_code + delete_space + insert_space, 0, 1,) + optional_country_code = pynini.closure( + country_code + delete_space + insert_space, + 0, + 1, + ) number_part = ( pynutil.delete("number_part: \"") diff --git 
a/nemo_text_processing/text_normalization/zh/graph_utils.py b/nemo_text_processing/text_normalization/zh/graph_utils.py index f2ad527ae..69652b1f5 100644 --- a/nemo_text_processing/text_normalization/zh/graph_utils.py +++ b/nemo_text_processing/text_normalization/zh/graph_utils.py @@ -33,9 +33,9 @@ NEMO_LOWER = pynini.union(*string.ascii_lowercase).optimize() NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize() NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize() -NEMO_NON_BREAKING_SPACE = "\u00A0" +NEMO_NON_BREAKING_SPACE = "\u00a0" NEMO_SPACE = " " -NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00A0").optimize() +NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00a0").optimize() NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() @@ -136,7 +136,7 @@ def delete_tokens(self, fst) -> "pynini.FstLike": + delete_space + pynutil.delete("}") ) - return res @ pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", NEMO_SIGMA) + return res @ pynini.cdrewrite(pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA) def convert_space(fst) -> "pynini.FstLike": diff --git a/nemo_text_processing/text_normalization/zh/taggers/cardinal.py b/nemo_text_processing/text_normalization/zh/taggers/cardinal.py index 21437e82f..a0c3b587d 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/zh/taggers/cardinal.py @@ -35,7 +35,7 @@ def __init__(self, deterministic: bool = True): graph_teen = pynini.string_file(get_abs_path("data/number/teen.tsv")) graph_teen_alt = pynini.string_file(get_abs_path("data/number/teen_alt.tsv")) - alls = NEMO_DIGIT ** 2 | NEMO_DIGIT ** 1 + alls = NEMO_DIGIT**2 | NEMO_DIGIT**1 graph_all = ( (graph_ties + (graph_digit | pynutil.delete('0'))) | graph_teen_alt | graph_digit ) # graph_all when within a larger number e.g., 316-> 三百一十六 instead of 三百十六 @@ -46,7 +46,7 @@ def __init__(self, 
deterministic: bool = True): ) # graph_all when at the head of the larger numbere.g., 13万 -> 十三万 instead of 一十三万 graph_all_alt = alls @ graph_all_alt - hundreds = NEMO_DIGIT ** 3 + hundreds = NEMO_DIGIT**3 graph_hundred_component = (graph_digit + pynutil.insert('百')) + pynini.union( pynini.closure(pynutil.delete('0')), (pynini.closure(pynutil.delete('0') + pynutil.insert('零')) + graph_all), @@ -56,7 +56,7 @@ def __init__(self, deterministic: bool = True): self.digit = graph_digit.optimize() self.all = graph_all.optimize() - thousands = NEMO_DIGIT ** 4 + thousands = NEMO_DIGIT**4 graph_thousand_component = (graph_digit_alt + pynutil.insert('千')) + pynini.union( pynini.closure(pynutil.delete('0')), graph_hundred_component, @@ -64,7 +64,7 @@ def __init__(self, deterministic: bool = True): ) graph_thousand = thousands @ graph_thousand_component - ten_thousands = NEMO_DIGIT ** 5 + ten_thousands = NEMO_DIGIT**5 graph_ten_thousand_component = (graph_digit_alt + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), graph_thousand_component, @@ -73,8 +73,8 @@ def __init__(self, deterministic: bool = True): ) graph_ten_thousand = ten_thousands @ graph_ten_thousand_component - hundred_thousands = NEMO_DIGIT ** 6 - hundred_thousands_position = NEMO_DIGIT ** 2 + hundred_thousands = NEMO_DIGIT**6 + hundred_thousands_position = NEMO_DIGIT**2 hundred_thousands_position = hundred_thousands_position @ graph_all_alt graph_hundred_thousand_component = (hundred_thousands_position + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -84,8 +84,8 @@ def __init__(self, deterministic: bool = True): ) graph_hundred_thousand = hundred_thousands @ graph_hundred_thousand_component - millions = NEMO_DIGIT ** 7 - million_position = NEMO_DIGIT ** 3 + millions = NEMO_DIGIT**7 + million_position = NEMO_DIGIT**3 million_position = million_position @ graph_hundred_component graph_million_component = (million_position + pynutil.insert('万')) + pynini.union( 
pynini.closure(pynutil.delete('0')), @@ -95,8 +95,8 @@ def __init__(self, deterministic: bool = True): ) graph_million = millions @ graph_million_component - ten_millions = NEMO_DIGIT ** 8 - ten_million_position = NEMO_DIGIT ** 4 + ten_millions = NEMO_DIGIT**8 + ten_million_position = NEMO_DIGIT**4 ten_million_position = ten_million_position @ graph_thousand_component graph_ten_million_component = (ten_million_position + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -106,7 +106,7 @@ def __init__(self, deterministic: bool = True): ) graph_ten_million = ten_millions @ graph_ten_million_component - hundred_millions = NEMO_DIGIT ** 9 + hundred_millions = NEMO_DIGIT**9 graph_hundred_million_component = (graph_digit_alt + pynutil.insert('亿')) + pynini.union( pynini.closure(pynutil.delete('0')), graph_ten_million_component, @@ -119,8 +119,8 @@ def __init__(self, deterministic: bool = True): ) graph_hundred_million = hundred_millions @ graph_hundred_million_component - thousand_millions = NEMO_DIGIT ** 10 - thousand_millions_position = NEMO_DIGIT ** 2 + thousand_millions = NEMO_DIGIT**10 + thousand_millions_position = NEMO_DIGIT**2 thousand_millions_position = thousand_millions_position @ graph_all_alt graph_thousand_million_component = (thousand_millions_position + pynutil.insert('亿')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -134,8 +134,8 @@ def __init__(self, deterministic: bool = True): ) graph_thousand_million = thousand_millions @ graph_thousand_million_component - ten_billions = NEMO_DIGIT ** 11 - ten_billions_position = NEMO_DIGIT ** 3 + ten_billions = NEMO_DIGIT**11 + ten_billions_position = NEMO_DIGIT**3 ten_billions_position = ten_billions_position @ graph_hundred_component graph_ten_billions_component = (ten_billions_position + pynutil.insert('亿')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -149,8 +149,8 @@ def __init__(self, deterministic: bool = True): ) graph_ten_billions = ten_billions @ 
graph_ten_billions_component - hundred_billions = NEMO_DIGIT ** 12 - hundred_billions_position = NEMO_DIGIT ** 4 + hundred_billions = NEMO_DIGIT**12 + hundred_billions_position = NEMO_DIGIT**4 hundred_billions_position = hundred_billions_position @ graph_thousand_component graph_hundred_billions_component = (hundred_billions_position + pynutil.insert('亿')) + pynini.union( pynini.closure(pynutil.delete('0')), diff --git a/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py b/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py index 5cd95e58c..b283f3444 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py +++ b/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py @@ -32,7 +32,9 @@ class PreProcessorFst(GraphFst): ''' def __init__( - self, remove_interjections: bool = True, fullwidth_to_halfwidth: bool = True, + self, + remove_interjections: bool = True, + fullwidth_to_halfwidth: bool = True, ): super().__init__(name="PreProcessor", kind="processor") diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py b/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py index dab0cea0f..dcdd73622 100644 --- a/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/postprocessor.py @@ -36,7 +36,11 @@ class PostProcessor(GraphFst): ''' def __init__( - self, remove_puncts: bool = False, to_upper: bool = False, to_lower: bool = False, tag_oov: bool = False, + self, + remove_puncts: bool = False, + to_upper: bool = False, + to_lower: bool = False, + tag_oov: bool = False, ): super().__init__(name="PostProcessor", kind="processor") diff --git a/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py index 4592d7841..846254938 100644 --- 
a/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/zh/verbalizers/verbalize_final.py @@ -42,6 +42,11 @@ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_ ) verbalizer = pynini.closure(delete_space + token_verbalizer + delete_space) - postprocessor = PostProcessor(remove_puncts=False, to_upper=False, to_lower=False, tag_oov=False,) + postprocessor = PostProcessor( + remove_puncts=False, + to_upper=False, + to_lower=False, + tag_oov=False, + ) self.fst = (verbalizer @ postprocessor.fst).optimize() diff --git a/setup.py b/setup.py index 4667b49e8..e22afbab3 100644 --- a/setup.py +++ b/setup.py @@ -52,7 +52,9 @@ elif os.path.exists('README.rst'): # codec is used for consistent encoding long_description = codecs.open( - os.path.join(os.path.abspath(os.path.dirname(__file__)), 'README.rst'), 'r', encoding='utf-8', + os.path.join(os.path.abspath(os.path.dirname(__file__)), 'README.rst'), + 'r', + encoding='utf-8', ).read() long_description_content_type = "text/x-rst" @@ -125,7 +127,8 @@ def __call_checker(self, base_command, scope, check): command.extend(['--check', '--diff']) self.announce( - msg='Running command: %s' % str(' '.join(command)), level=distutils_log.INFO, + msg='Running command: %s' % str(' '.join(command)), + level=distutils_log.INFO, ) return_code = subprocess.call(command) @@ -133,10 +136,18 @@ def __call_checker(self, base_command, scope, check): return return_code def _isort(self, scope, check): - return self.__call_checker(base_command=self.__ISORT_BASE.split(), scope=scope, check=check,) + return self.__call_checker( + base_command=self.__ISORT_BASE.split(), + scope=scope, + check=check, + ) def _black(self, scope, check): - return self.__call_checker(base_command=self.__BLACK_BASE.split(), scope=scope, check=check,) + return self.__call_checker( + base_command=self.__BLACK_BASE.split(), + scope=scope, + check=check, + ) def _pass(self): 
self.announce(msg='\033[32mPASS\x1b[0m', level=distutils_log.INFO) diff --git a/tests/conftest.py b/tests/conftest.py index 8db3b106c..a26dab531 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -56,7 +56,9 @@ def pytest_addoption(parser): help="path to a directory with .far grammars for CPU TN/ITN tests, (DEFAULT: None, i.e. no cache)", ) parser.addoption( - '--run_audio_based', action='store_true', help="pass this argument to run audio-based TN tests", + '--run_audio_based', + action='store_true', + help="pass this argument to run audio-based TN tests", ) @@ -148,10 +150,12 @@ def pytest_configure(config): If file absent or sizes not equal, function downloads the archive from github and unpacks it. """ config.addinivalue_line( - "markers", "run_only_on(device): runs the test only on a given device [CPU | GPU]", + "markers", + "run_only_on(device): runs the test only on a given device [CPU | GPU]", ) config.addinivalue_line( - "markers", "with_downloads: runs the test using data present in tests/.data", + "markers", + "with_downloads: runs the test using data present in tests/.data", ) # Test dir and archive filepath. 
test_dir = join(dirname(__file__), __TEST_DATA_SUBDIR) diff --git a/tests/nemo_text_processing/ar/test_money.py b/tests/nemo_text_processing/ar/test_money.py index 6fe36ba35..2aa49ba9a 100644 --- a/tests/nemo_text_processing/ar/test_money.py +++ b/tests/nemo_text_processing/ar/test_money.py @@ -49,6 +49,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio: pred_non_deterministic = self.normalizer_with_audio.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_address.py b/tests/nemo_text_processing/en/test_address.py index c7a3523a0..ea8328d10 100644 --- a/tests/nemo_text_processing/en/test_address.py +++ b/tests/nemo_text_processing/en/test_address.py @@ -42,6 +42,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_cardinal.py b/tests/nemo_text_processing/en/test_cardinal.py index 1ee3a2a5b..f40e0d1f6 100644 --- a/tests/nemo_text_processing/en/test_cardinal.py +++ b/tests/nemo_text_processing/en/test_cardinal.py @@ -63,6 +63,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic, f"input: {test_input}" diff --git a/tests/nemo_text_processing/en/test_decimal.py b/tests/nemo_text_processing/en/test_decimal.py index ff021f72a..ea20f18d6 100644 --- a/tests/nemo_text_processing/en/test_decimal.py +++ b/tests/nemo_text_processing/en/test_decimal.py @@ 
-61,6 +61,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_electronic.py b/tests/nemo_text_processing/en/test_electronic.py index e8640062c..4dfec585e 100644 --- a/tests/nemo_text_processing/en/test_electronic.py +++ b/tests/nemo_text_processing/en/test_electronic.py @@ -60,6 +60,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=100, punct_post_process=False, + test_input, + n_tagged=100, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_fraction.py b/tests/nemo_text_processing/en/test_fraction.py index 764205591..a6186aabb 100644 --- a/tests/nemo_text_processing/en/test_fraction.py +++ b/tests/nemo_text_processing/en/test_fraction.py @@ -39,6 +39,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_math.py b/tests/nemo_text_processing/en/test_math.py index e2ecdebb8..22859f596 100644 --- a/tests/nemo_text_processing/en/test_math.py +++ b/tests/nemo_text_processing/en/test_math.py @@ -39,6 +39,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic 
diff --git a/tests/nemo_text_processing/en/test_measure.py b/tests/nemo_text_processing/en/test_measure.py index b03b3ff53..6ea9a0eda 100644 --- a/tests/nemo_text_processing/en/test_measure.py +++ b/tests/nemo_text_processing/en/test_measure.py @@ -61,6 +61,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_money.py b/tests/nemo_text_processing/en/test_money.py index c81945ecd..103223d5e 100644 --- a/tests/nemo_text_processing/en/test_money.py +++ b/tests/nemo_text_processing/en/test_money.py @@ -63,6 +63,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_ordinal.py b/tests/nemo_text_processing/en/test_ordinal.py index 6f87a832d..dac56bf38 100644 --- a/tests/nemo_text_processing/en/test_ordinal.py +++ b/tests/nemo_text_processing/en/test_ordinal.py @@ -61,6 +61,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_punctuation.py b/tests/nemo_text_processing/en/test_punctuation.py index 75ff2e73c..761b3c9f4 100644 --- a/tests/nemo_text_processing/en/test_punctuation.py +++ b/tests/nemo_text_processing/en/test_punctuation.py @@ -22,7 +22,11 @@ class TestPunctuation: normalizer_en = 
Normalizer( - input_case='cased', lang='en', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True, + input_case='cased', + lang='en', + cache_dir=CACHE_DIR, + overwrite_cache=False, + post_process=True, ) # address is tagged by the measure class diff --git a/tests/nemo_text_processing/en/test_range.py b/tests/nemo_text_processing/en/test_range.py index ac93613be..64b47d898 100644 --- a/tests/nemo_text_processing/en/test_range.py +++ b/tests/nemo_text_processing/en/test_range.py @@ -39,6 +39,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_roman.py b/tests/nemo_text_processing/en/test_roman.py index dc9468fb3..3ef655c65 100644 --- a/tests/nemo_text_processing/en/test_roman.py +++ b/tests/nemo_text_processing/en/test_roman.py @@ -40,6 +40,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=False, + test_input, + n_tagged=30, + punct_post_process=False, ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/en/test_serial.py b/tests/nemo_text_processing/en/test_serial.py index aab870abf..2a27b1f54 100644 --- a/tests/nemo_text_processing/en/test_serial.py +++ b/tests/nemo_text_processing/en/test_serial.py @@ -38,6 +38,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio_en: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=-1, punct_post_process=False, + test_input, + n_tagged=-1, + punct_post_process=False, ) assert expected in pred_non_deterministic, f"input: {test_input}" diff --git 
a/tests/nemo_text_processing/en/test_special_text.py b/tests/nemo_text_processing/en/test_special_text.py index a461fe703..73be5d382 100644 --- a/tests/nemo_text_processing/en/test_special_text.py +++ b/tests/nemo_text_processing/en/test_special_text.py @@ -41,6 +41,8 @@ def test_norm(self, test_input, expected): # Audio-based normalization will output only options without digits if self.normalizer_with_audio_en and sum([1 for ch in expected if ch.isdigit()]) == 0: pred_non_deterministic = self.normalizer_with_audio_en.normalize( - test_input, n_tagged=30, punct_post_process=True, + test_input, + n_tagged=30, + punct_post_process=True, ) assert expected in pred_non_deterministic, f"input: {test_input}" diff --git a/tests/nemo_text_processing/es/test_ordinal.py b/tests/nemo_text_processing/es/test_ordinal.py index e2cd7d4a2..1a48d6da8 100644 --- a/tests/nemo_text_processing/es/test_ordinal.py +++ b/tests/nemo_text_processing/es/test_ordinal.py @@ -62,6 +62,8 @@ def test_norm(self, test_input, expected): if self.normalizer_with_audio: pred_non_deterministic = self.normalizer_with_audio.normalize( - test_input, n_tagged=500, punct_post_process=False, + test_input, + n_tagged=500, + punct_post_process=False, ) assert expected in pred_non_deterministic From e8a531d9b2c5a8d0e1f19eee8bd7248ecfaa2c1d Mon Sep 17 00:00:00 2001 From: hmlee245 Date: Tue, 13 May 2025 10:54:38 -0700 Subject: [PATCH 10/14] First draft of Korean Cardinal ITN Sparrowhawk testing is not done yet. 
Signed-off-by: hmlee245 --- .../inverse_normalize.py | 7 +- .../inverse_text_normalization/ko/__init__.py | 17 + .../ko/clean_eval_data.py | 361 ++++++++++++++++++ .../ko/data/__init__.py | 13 + .../ko/data/numbers/__init__.py | 13 + .../ko/data/numbers/digit.tsv | 9 + .../ko/data/numbers/thousands.tsv | 11 + .../ko/data/numbers/zero.tsv | 1 + .../ko/graph_utils.py | 292 ++++++++++++++ .../ko/taggers/__init__.py | 17 + .../ko/taggers/cardinal.py | 104 +++++ .../ko/taggers/tokenize_and_classify.py | 76 ++++ .../ko/taggers/word.py | 32 ++ .../inverse_text_normalization/ko/utils.py | 23 ++ .../ko/verbalizers/__init__.py | 17 + .../ko/verbalizers/cardinal.py | 54 +++ .../ko/verbalizers/verbalize.py | 36 ++ .../ko/verbalizers/verbalize_final.py | 49 +++ .../ko/verbalizers/word.py | 34 ++ .../run_evaluate.py | 2 +- tests/nemo_text_processing/ko/__init__.py | 13 + .../test_cases_cardinal.txt | 27 ++ .../nemo_text_processing/ko/test_cardinal.py | 39 ++ ..._sparrowhawk_inverse_text_normalization.sh | 34 ++ .../pynini_export.py | 8 + 25 files changed, 1287 insertions(+), 2 deletions(-) create mode 100644 nemo_text_processing/inverse_text_normalization/ko/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/numbers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/numbers/digit.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/numbers/thousands.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/ko/graph_utils.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py create 
mode 100644 nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/taggers/word.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/utils.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py create mode 100644 tests/nemo_text_processing/ko/__init__.py create mode 100644 tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_cardinal.txt create mode 100644 tests/nemo_text_processing/ko/test_cardinal.py create mode 100644 tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py index c10819908..e505a8ad0 100644 --- a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py +++ b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py @@ -131,6 +131,11 @@ def __init__( from nemo_text_processing.inverse_text_normalization.ja.verbalizers.verbalize_final import ( VerbalizeFinalFst, ) + elif lang == 'ko': # Korean + from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst + from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import ( + VerbalizeFinalFst, + ) self.tagger = ClassifyFst( cache_dir=cache_dir, whitelist=whitelist, overwrite_cache=overwrite_cache, input_case=input_case @@ -175,7 +180,7 @@ def parse_args(): parser.add_argument( 
"--language", help="language", - choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja'], + choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja','ko'], default="en", type=str, ) diff --git a/nemo_text_processing/inverse_text_normalization/ko/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/__init__.py new file mode 100644 index 000000000..f541211af --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py b/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py new file mode 100644 index 000000000..3c1193333 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py @@ -0,0 +1,361 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from argparse import ArgumentParser +from typing import List + +import regex as re + +from nemo_text_processing.text_normalization.data_loader_utils import ( + EOS_TYPE, + Instance, + load_files, + training_data_to_sentences, +) + +""" +This file is for evaluation purposes. +filter_loaded_data() cleans data (list of instances) for inverse text normalization. Filters and cleaners can be specified for each semiotic class individually. +For example, normalized text should only include characters and whitespace characters but no punctuation. + Cardinal unnormalized instances should contain at least one integer and all other characters are removed. 
+""" + + +class Filter: + """ + Filter class + + Args: + class_type: semiotic class used in dataset + process_func: function to transform text + filter_func: function to filter text + + """ + + def __init__(self, class_type: str, process_func: object, filter_func: object): + self.class_type = class_type + self.process_func = process_func + self.filter_func = filter_func + + def filter(self, instance: Instance) -> bool: + """ + filter function + + Args: + filters given instance with filter function + + Returns: True if given instance fulfills criteria or does not belong to class type + """ + if instance.token_type != self.class_type: + return True + return self.filter_func(instance) + + def process(self, instance: Instance) -> Instance: + """ + process function + + Args: + processes given instance with process function + + Returns: processed instance if instance belongs to expected class type or original instance + """ + if instance.token_type != self.class_type: + return instance + return self.process_func(instance) + + +def filter_cardinal_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_cardinal_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + un_normalized = re.sub(r"[^0-9]", "", un_normalized) + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_ordinal_1(instance: Instance) -> bool: + ok = re.search(r"(st|nd|rd|th)\s*$", instance.un_normalized) + return ok + + +def process_ordinal_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + un_normalized = re.sub(r"[,\s]", "", un_normalized) + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def 
filter_decimal_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_decimal_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + un_normalized = re.sub(r",", "", un_normalized) + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_measure_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_measure_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + un_normalized = re.sub(r",", "", un_normalized) + un_normalized = re.sub(r"m2", "m²", un_normalized) + un_normalized = re.sub(r"(\d)([^\d.\s])", r"\1 \2", un_normalized) + normalized = re.sub(r"[^a-z\s]", "", normalized) + normalized = re.sub(r"per ([a-z\s]*)s$", r"per \1", normalized) + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_money_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_money_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + un_normalized = re.sub(r",", "", un_normalized) + un_normalized = re.sub(r"a\$", r"$", un_normalized) + un_normalized = re.sub(r"us\$", r"$", un_normalized) + un_normalized = re.sub(r"(\d)m\s*$", r"\1 million", un_normalized) + un_normalized = re.sub(r"(\d)bn?\s*$", r"\1 billion", un_normalized) + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_time_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_time_1(instance: Instance) -> Instance: + un_normalized = 
instance.un_normalized + un_normalized = re.sub(r": ", ":", un_normalized) + un_normalized = re.sub(r"(\d)\s?a\s?m\s?", r"\1 a.m.", un_normalized) + un_normalized = re.sub(r"(\d)\s?p\s?m\s?", r"\1 p.m.", un_normalized) + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_plain_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_plain_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_punct_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_punct_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_date_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_date_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + un_normalized = re.sub(r",", "", un_normalized) + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_letters_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_letters_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_verbatim_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_verbatim_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + 
normalized = instance.normalized + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_digit_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_digit_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_telephone_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_telephone_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_electronic_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_electronic_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_fraction_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_fraction_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_address_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_address_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + normalized = 
re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +filters = [] +filters.append(Filter(class_type="CARDINAL", + process_func=process_cardinal_1, filter_func=filter_cardinal_1)) +filters.append(Filter(class_type="ORDINAL", + process_func=process_ordinal_1, filter_func=filter_ordinal_1)) +filters.append(Filter(class_type="DECIMAL", + process_func=process_decimal_1, filter_func=filter_decimal_1)) +filters.append(Filter(class_type="MEASURE", + process_func=process_measure_1, filter_func=filter_measure_1)) +filters.append(Filter(class_type="MONEY", + process_func=process_money_1, filter_func=filter_money_1)) +filters.append(Filter(class_type="TIME", + process_func=process_time_1, filter_func=filter_time_1)) + +filters.append(Filter(class_type="DATE", + process_func=process_date_1, filter_func=filter_date_1)) +filters.append(Filter(class_type="PLAIN", + process_func=process_plain_1, filter_func=filter_plain_1)) +filters.append(Filter(class_type="PUNCT", + process_func=process_punct_1, filter_func=filter_punct_1)) +filters.append(Filter(class_type="LETTERS", + process_func=process_letters_1, filter_func=filter_letters_1)) +filters.append(Filter(class_type="VERBATIM", + process_func=process_verbatim_1, filter_func=filter_verbatim_1)) +filters.append(Filter(class_type="DIGIT", + process_func=process_digit_1, filter_func=filter_digit_1)) +filters.append(Filter(class_type="TELEPHONE", + process_func=process_telephone_1, filter_func=filter_telephone_1)) +filters.append(Filter(class_type="ELECTRONIC", + process_func=process_electronic_1, filter_func=filter_electronic_1)) +filters.append(Filter(class_type="FRACTION", + process_func=process_fraction_1, filter_func=filter_fraction_1)) +filters.append(Filter(class_type="ADDRESS", + process_func=process_address_1, filter_func=filter_address_1)) +filters.append(Filter(class_type=EOS_TYPE, + process_func=lambda x: x, filter_func=lambda x: True)) 
+ + +def filter_loaded_data(data: List[Instance], verbose: bool = False) -> List[Instance]: + """ + Filters list of instances + + Args: + data: list of instances + + Returns: filtered and transformed list of instances + """ + updates_instances = [] + for instance in data: + updated_instance = False + for fil in filters: + if fil.class_type == instance.token_type and fil.filter(instance): + instance = fil.process(instance) + updated_instance = True + if updated_instance: + if verbose: + print(instance) + updates_instances.append(instance) + return updates_instances + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument("--input", help="input file path", + type=str, default='./en_with_types/output-00001-of-00100') + parser.add_argument( + "--verbose", help="print filtered instances", action='store_true') + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + file_path = args.input + + print("Loading training data: " + file_path) + instance_list = load_files([file_path]) # List of instances + filtered_instance_list = filter_loaded_data(instance_list, args.verbose) + training_data_to_sentences(filtered_instance_list) diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/data/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/digit.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/digit.tsv new file mode 100644 index 000000000..9871cb9cf --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/digit.tsv @@ -0,0 +1,9 @@ +일 1 +이 2 +삼 3 +사 4 +오 5 +육 6 +칠 7 +팔 8 +구 9 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/thousands.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/thousands.tsv new file mode 100644 index 000000000..541752211 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/thousands.tsv @@ -0,0 +1,11 @@ +억 +조 +경 +해 +자 +양 +구 +간 +정 +재 +극 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv new file mode 100644 index 000000000..43baac7c1 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv @@ -0,0 +1 @@ +영 0 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/graph_utils.py b/nemo_text_processing/inverse_text_normalization/ko/graph_utils.py new file mode 100644 index 000000000..7a9fd8720 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/graph_utils.py @@ -0,0 +1,292 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import string +from pathlib import Path +from typing import Dict + +import pynini +from pynini import Far +from pynini.examples import plurals +from pynini.export import export +from pynini.lib import byte, pynutil, utf8 + +from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels + +NEMO_CHAR = utf8.VALID_UTF8_CHAR + +NEMO_NARROW_NON_BREAK_SPACE = "\u202f" +NEMO_DIGIT = byte.DIGIT +NEMO_LOWER = pynini.union(*string.ascii_lowercase).optimize() +NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize() +NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize() +NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize() +NEMO_HEX = pynini.union(*string.hexdigits).optimize() +NEMO_NON_BREAKING_SPACE = "\u00a0" +NEMO_SPACE = " " +NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00a0").optimize() +NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() +NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() + +NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize() +NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize() + +NEMO_SIGMA = pynini.closure(NEMO_CHAR) + +NEMO_NOT_ALPHA = pynini.difference(NEMO_SIGMA, NEMO_ALPHA).optimize() +NEMO_LOWER_NOT_A = pynini.union( + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n", + "o", + "p", + "q", + "r", + "s", + "t", + "u", + "v", + "w", + "x", + "y", + "z", +).optimize() + +delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE)) +delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1)) +insert_space = pynutil.insert(" ") +delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ") +delete_preserve_order = pynini.closure( + pynutil.delete(" preserve_order: true") + | (pynutil.delete(" field_order: \"") + 
NEMO_NOT_QUOTE + pynutil.delete("\"")) +) + +suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv")) +# _v = pynini.union("a", "e", "i", "o", "u") +_c = pynini.union( + "b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z" +) +_ies = NEMO_SIGMA + _c + pynini.cross("y", "ies") +_es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es") +_s = NEMO_SIGMA + pynutil.insert("s") + +graph_plural = plurals._priority_union( + suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA +).optimize() + +SINGULAR_TO_PLURAL = graph_plural +PLURAL_TO_SINGULAR = pynini.invert(graph_plural) +TO_LOWER = pynini.union(*[pynini.cross(x, y) for x, y in zip(string.ascii_uppercase, string.ascii_lowercase)]) +TO_UPPER = pynini.invert(TO_LOWER) +MIN_NEG_WEIGHT = -0.0001 +MIN_POS_WEIGHT = 0.0001 +INPUT_CASED = "cased" +INPUT_LOWER_CASED = "lower_cased" +MINUS = pynini.union("minus", "Minus").optimize() + + +def capitalized_input_graph( + graph: 'pynini.FstLike', original_graph_weight: float = None, capitalized_graph_weight: float = None +) -> 'pynini.FstLike': + """ + Allow graph input to be capitalized, e.g. for ITN) + + Args: + graph: FstGraph + original_graph_weight: weight to add to the original `graph` + capitalized_graph_weight: weight to add to the capitalized graph + """ + capitalized_graph = pynini.compose(TO_LOWER + NEMO_SIGMA, graph).optimize() + + if original_graph_weight is not None: + graph = pynutil.add_weight(graph, weight=original_graph_weight) + + if capitalized_graph_weight is not None: + capitalized_graph = pynutil.add_weight(capitalized_graph, weight=capitalized_graph_weight) + + graph |= capitalized_graph + return graph + + +def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']): + """ + Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name. 
+ + Args: + file_name: exported file name + graphs: Mapping of a rule name and Pynini WFST graph to be exported + """ + exporter = export.Exporter(file_name) + for rule, graph in graphs.items(): + exporter[rule] = graph.optimize() + exporter.close() + logging.info(f'Created {file_name}') + + +def get_plurals(fst): + """ + Given singular returns plurals + + Args: + fst: Fst + + Returns plurals to given singular forms + """ + return SINGULAR_TO_PLURAL @ fst + + +def get_singulars(fst): + """ + Given plural returns singulars + + Args: + fst: Fst + + Returns singulars to given plural forms + """ + return PLURAL_TO_SINGULAR @ fst + + +def convert_space(fst) -> 'pynini.FstLike': + """ + Converts space to nonbreaking space. + Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty" + This is making transducer significantly slower, so only use when there could be potential spaces within quotes, otherwise leave it. + + Args: + fst: input fst + + Returns output fst where breaking spaces are converted to non breaking spaces + """ + return fst @ pynini.cdrewrite(pynini.cross(NEMO_SPACE, NEMO_NON_BREAKING_SPACE), "", "", NEMO_SIGMA) + + +def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED): + labels = load_labels(input_file) + + if input_case == INPUT_CASED: + additional_labels = [] + for written, spoken, *weight in labels: + written_capitalized = written[0].upper() + written[1:] + additional_labels.extend( + [ + [written_capitalized, spoken.capitalize()], # first letter capitalized + [ + written_capitalized, + spoken.upper().replace(" AND ", " and "), + ], # # add pairs with the all letters capitalized + ] + ) + + spoken_no_space = spoken.replace(" ", "") + # add abbreviations without spaces (both lower and upper case), i.e. 
"BMW" not "B M W" + if len(spoken) == (2 * len(spoken_no_space) - 1): + logging.debug(f"This is weight {weight}") + if len(weight) == 0: + additional_labels.extend( + [[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()]] + ) + else: + additional_labels.extend( + [ + [written, spoken_no_space, weight[0]], + [written_capitalized, spoken_no_space.upper(), weight[0]], + ] + ) + labels += additional_labels + + whitelist = pynini.string_map(labels).invert().optimize() + return whitelist + + +class GraphFst: + """ + Base class for all grammar fsts. + + Args: + name: name of grammar class + kind: either 'classify' or 'verbalize' + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, name: str, kind: str, deterministic: bool = True): + self.name = name + self.kind = kind + self._fst = None + self.deterministic = deterministic + + self.far_path = Path(os.path.dirname(__file__) + '/grammars/' + kind + '/' + name + '.far') + if self.far_exist(): + self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst() + + def far_exist(self) -> bool: + """ + Returns true if FAR can be loaded + """ + return self.far_path.exists() + + @property + def fst(self) -> 'pynini.FstLike': + return self._fst + + @fst.setter + def fst(self, fst): + self._fst = fst + + def add_tokens(self, fst) -> 'pynini.FstLike': + """ + Wraps class name around to given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }") + + def delete_tokens(self, fst) -> 'pynini.FstLike': + """ + Deletes class name wrap around output of given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + res = ( + pynutil.delete(f"{self.name}") + + delete_space + + pynutil.delete("{") + + delete_space + + fst + + delete_space + + pynutil.delete("}") + ) + return res @ 
pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py new file mode 100644 index 000000000..f541211af --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py new file mode 100644 index 000000000..df5804fc0 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py @@ -0,0 +1,104 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_DIGIT, GraphFst, delete_space +from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path + +class CardinalFst(GraphFst): + """ + Finite state transducer for classifying cardinals + e.g. 마이너스 이십삼 -> cardinal { integer: "23" negative: "-" } } + + Args: + input_case: accepting Korean input. + """ + + def __init__(self): + super().__init__(name="cardinal", kind="classify") + + graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) + graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) + graph_zero = pynini.cross("영", "0") + + graph_negative = pynini.cross("마이너스", "-") + graph_negative += delete_space + + ten = pynutil.delete("십") + ten_alt = pynini.cross("십", "1") + ### Responsible for second digit of two digit number. ex) 20's 2 + graph_ten_component = pynini.union((graph_digit + ten) | ten_alt, pynutil.insert("0")) + ### Responsible for the first digit of number. 
ex) 1,2,3,4,5,,, + graph_ten_component += graph_digit | pynutil.insert("0") + + hundred = pynutil.delete("백") + hundred_alt = pynini.cross("백", "1") + graph_hundred_component = pynini.union(((graph_digit + hundred) | hundred_alt), pynutil.insert("0")) + graph_hundred_component += graph_ten_component + + thousand = pynutil.delete("천") + thousand_alt = pynini.cross("천", "1") + graph_thousand_component = pynini.union(((graph_digit + thousand) | thousand_alt), pynutil.insert("0")) + graph_thousand_component += graph_hundred_component + + tenthousand = pynutil.delete("만") + tenthousand_alt = pynini.cross("만", "1") + ### "만" can express next four digits of numbers until the next unit "억", so insert "0000" to allocate four digit worth of space + ### From "만", keep adding four digits and graph_thousand_component(0000-9999), because Korean units increase every four digits + graph_tenthousand_component = pynini.union(((graph_thousand_component + tenthousand) | tenthousand_alt), pynutil.insert("0000")) + graph_tenthousand_component += graph_thousand_component + + hundredmillion = pynutil.delete("억") + hundredmillion_alt = pynini.cross("억", "1") + graph_hundredmillion_component = pynini.union(((graph_thousand_component + hundredmillion) | hundredmillion_alt), pynutil.insert("0000")) + graph_hundredmillion_component += graph_tenthousand_component + + trillion = pynutil.delete("조") + trillion_alt = pynini.cross("조", "1") + graph_trillion_component = pynini.union(((graph_thousand_component + trillion) | trillion_alt), pynutil.insert("0000")) + graph_trillion_component += graph_hundredmillion_component + + tenquadrillion = pynutil.delete("경") + tenquadrillion_alt = pynini.cross("경", "1") + graph_tenquadrillion_component = pynini.union(((graph_thousand_component + tenquadrillion) | tenquadrillion_alt), pynutil.insert("0000")) + graph_tenquadrillion_component += graph_trillion_component + + + graph = pynini.union( + ### From biggest unit to smallest, everything is included + 
graph_tenquadrillion_component| + graph_zero + ) + + leading_zero = ( + pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT) + ) + graph_nonzero = graph @ leading_zero + graph = pynini.union(graph_nonzero, graph_zero) + + graph = graph @ leading_zero | graph_zero + + self.just_cardinals = graph + + optional_sign = pynini.closure((pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"')) + delete_space,0, 1) + + final_graph = ( + optional_sign + pynutil.insert(" ") + pynutil.insert("integer: \"") + graph + pynutil.insert("\"") + ) | (pynutil.insert("integer: \"") + graph + pynutil.insert("\"")) + + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py new file mode 100644 index 000000000..760ce6829 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -0,0 +1,76 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
 + +import logging +import os + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( + INPUT_LOWER_CASED, + GraphFst, + delete_extra_space, + delete_space, + generator_main, +) + + +class ClassifyFst(GraphFst): + """ + Final class that composes all other classification grammars. This class can process an entire sentence, that is lower cased. + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + More details to deployment at NeMo/tools/text_processing_deployment. + + Args: + input_case: accepting either "lower_cased" or "cased" input. + cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache. + overwrite_cache: set to True to overwrite .far files + whitelist: path to a file with whitelist replacements + """ + + def __init__( + self, + input_case: str = INPUT_LOWER_CASED, + cache_dir: str = None, + overwrite_cache: bool = False, + whitelist: str = None, + ): + super().__init__(name="tokenize_and_classify", kind="classify") + + far_file = None + if cache_dir is not None and cache_dir != "None": + os.makedirs(cache_dir, exist_ok=True) + far_file = os.path.join(cache_dir, f"ko_itn_{input_case}.far") + if not overwrite_cache and far_file and os.path.exists(far_file): + self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] + logging.info(f"ClassifyFst.fst was restored from {far_file}.") + else: + logging.info(f"Creating ClassifyFst grammars.") + cardinal = CardinalFst() + cardinal_graph = cardinal.fst + word_graph = WordFst().fst + classify = (pynutil.add_weight(cardinal_graph, 1.1)| pynutil.add_weight(word_graph, 100)) + + token = pynutil.insert("tokens { ") + classify + pynutil.insert(" } ") + tagger = pynini.closure(token, 1) + + 
self.fst = tagger + + if far_file: + generator_main(far_file, {"tokenize_and_classify": self.fst}) + logging.info(f"ClassifyFst grammars are saved to {far_file}.") \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py new file mode 100644 index 000000000..0d6ccd5c5 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py @@ -0,0 +1,32 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_SPACE, GraphFst + + +class WordFst(GraphFst): + """ + Finite state transducer for classifying plain tokens, that do not belong to any special class. This can be considered as the default class. + e.g. 
sleep -> tokens { name: "sleep" } + """ + + def __init__(self): + super().__init__(name="word", kind="classify") + word = pynutil.insert( + "name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"") + self.fst = word.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/utils.py b/nemo_text_processing/inverse_text_normalization/ko/utils.py new file mode 100644 index 000000000..0222cc0b8 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/utils.py @@ -0,0 +1,23 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + + + +def get_abs_path(rel_path): + + return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path + + diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py new file mode 100644 index 000000000..da950f35e --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py new file mode 100644 index 000000000..1800a6dc8 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py @@ -0,0 +1,54 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( + NEMO_NOT_QUOTE, + GraphFst, + delete_space, +) + + +class CardinalFst(GraphFst): + """ + Finite state transducer for verbalizing cardinal + e.g. 
cardinal { negative: "-" integer: "23" } -> -23 + """ + + def __init__(self): + super().__init__(name="cardinal", kind="verbalize") + negative_sign = ( + pynutil.delete("negative:") + + delete_space + + pynutil.delete("\"") + + pynini.accep("-") + + pynutil.delete("\"") + ) + + optional_sign_output = pynini.closure(negative_sign + delete_space, 0, 1) + + digits_from_tag = pynini.closure(NEMO_NOT_QUOTE, 1) + integer_cardinal = ( + pynutil.delete("integer:") + + delete_space + + pynutil.delete("\"") + + digits_from_tag + + pynutil.delete("\"") + ) + + graph = integer_cardinal + final_graph = optional_sign_output + graph + self.fst = self.delete_tokens(final_graph).optimize() \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py new file mode 100644 index 000000000..9d750d757 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -0,0 +1,36 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst + + +class VerbalizeFst(GraphFst): + """ + Composes other verbalizer grammars. + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + More details to deployment at NeMo/tools/text_processing_deployment. + """ + + def __init__(self): + super().__init__(name="verbalize", kind="verbalize") + cardinal = CardinalFst() + cardinal_graph = cardinal.fst + word_graph = WordFst().fst + + graph = (cardinal_graph|word_graph) + self.fst = graph + diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py new file mode 100644 index 000000000..8554fc161 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py @@ -0,0 +1,49 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, generator_main, delete_space + + +class VerbalizeFinalFst(GraphFst): + """ + Finite state transducer that verbalizes an entire sentence, e.g. + tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now + """ + def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False): + super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic) + far_file = None + if cache_dir is not None and cache_dir != "None": + os.makedirs(cache_dir, exist_ok=True) + far_file = os.path.join(cache_dir, f"ko_tn_{deterministic}_deterministic_verbalizer.far") + if not overwrite_cache and far_file and os.path.exists(far_file): + self.fst = pynini.Far(far_file, mode="r")["verbalize"] + else: + # token_graph = VerbalizeFst(deterministic=deterministic) + token_graph = VerbalizeFst().fst + token_verbalizer = ( + pynutil.delete("tokens {") + delete_space + token_graph + delete_space + pynutil.delete(" }") + ) + verbalizer = pynini.closure(delete_space + token_verbalizer + delete_space) + + self.fst = (verbalizer).optimize() + if far_file: + generator_main(far_file, {"verbalize": self.fst}) diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py new file mode 100644 index 000000000..d79957ca8 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py @@ -0,0 +1,34 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + + + +class WordFst(GraphFst): + ''' + tokens { name: "一" } -> 一 + ''' + + def __init__(self, deterministic: bool = True, lm: bool = False): + super().__init__(name="word", kind="verbalize", deterministic=deterministic) + + graph = pynutil.delete("name: \"") + NEMO_NOT_QUOTE + pynutil.delete("\"") + + self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/run_evaluate.py b/nemo_text_processing/inverse_text_normalization/run_evaluate.py index 0852329d6..7bfdd3399 100644 --- a/nemo_text_processing/inverse_text_normalization/run_evaluate.py +++ b/nemo_text_processing/inverse_text_normalization/run_evaluate.py @@ -35,7 +35,7 @@ def parse_args(): parser.add_argument( "--lang", help="language", - choices=["ar", "de", "en", "es", "es_en", "fr", "hi", "hy", "mr", "pt", "ru", "sv", "vi", "zh", 'ja'], + choices=["ar", "de", "en", "es", "es_en", "fr", "hi", "hy", "mr", "pt", "ru", "sv", "vi", "zh", "ja","ko"], default="en", type=str, ) diff --git a/tests/nemo_text_processing/ko/__init__.py b/tests/nemo_text_processing/ko/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/tests/nemo_text_processing/ko/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_cardinal.txt new file mode 100644 index 000000000..007273e5e --- /dev/null +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_cardinal.txt @@ -0,0 +1,27 @@ +영~0 +구~9 +십~10 +십칠~17 +오십삼~53 +백~100 +백오~105 +삼백이십~320 +구백팔십칠~987 +천~1000 +천육~1006 +천오백~1500 +오천사백삼십이~5432 +만~10000 +만천이백~11200 +삼만오천칠백~35700 +십이만~120000 +백오십만삼천~1503000 +천만~10000000 +오천이백칠십만육천백~52706100 +억~100000000 +삼억오천만~350000000 +십이억천만~1210000000 +백오십억칠천만~15070000000 +오천억~500000000000 +일조~1000000000000 +이조오천억~2500000000000 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/test_cardinal.py b/tests/nemo_text_processing/ko/test_cardinal.py new file mode 100644 index 000000000..9fd366ea6 --- /dev/null +++ b/tests/nemo_text_processing/ko/test_cardinal.py @@ -0,0 +1,39 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio + +from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file + + +class TestCardinal: + inverse_normalizer_ko = InverseNormalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('ko/data_inverse_text_normalization/test_cases_cardinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_ko.inverse_normalize(test_input, verbose=False) + assert pred == expected + + normalizer_with_audio_ko = ( + NormalizerWithAudio(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) + if RUN_AUDIO_BASED_TESTS + else None + ) \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh new file mode 100644 index 000000000..c44f4a703 --- /dev/null +++ b/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh @@ -0,0 +1,34 @@ +#! 
/bin/sh + +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +TEST_DIR=${2:-"/workspace/tests/ko"} + +runtest () { + input=$1 + echo "INPUT is $input" + cd ${GRAMMARS_DIR} + + # read test file + while read testcase; do + IFS='~' read spoken written <<< $testcase + denorm_pred=$(echo $spoken | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1) + + # trim white space + written="$(echo -e "${written}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + + # input expected actual + assertEquals "$spoken" "$written" "$denorm_pred" + done < "$input" +} + +testITNCardinal() { + input=$TEST_DIR/data_inverse_text_normalization/test_cases_cardinal.txt + runtest $input +} + +# Remove all command-line arguments +shift $# + +# Load shUnit2 +. /workspace/shunit2/shunit2 \ No newline at end of file diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 6b82dfbec..0df099774 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -106,6 +106,7 @@ def parse_args(): 'mr', 'ja', 'rw', + 'ko' ], type=str, default='en', @@ -307,6 +308,13 @@ def parse_args(): PostProcessingFst as TNPostProcessingFst, ) from nemo_text_processing.text_normalization.ja.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst + elif args.language == 'ko': + from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ( + ClassifyFst as ITNClassifyFst, + ) + from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import ( + VerbalizeFst as ITNVerbalizeFst, + ) elif args.language == 'rw': from nemo_text_processing.text_normalization.rw.taggers.tokenize_and_classify import ( ClassifyFst as TNClassifyFst, From bfc6d2723797421992545ba6dd7931d356195898 Mon Sep 17 00:00:00 2001 From: 
hmlee245 Date: Tue, 13 May 2025 15:04:57 -0700 Subject: [PATCH 11/14] Fixing all the feedbacks Haven't fixed the graph@leading zero part Signed-off-by: hmlee245 --- .../ko/data/numbers/zero.tsv | 1 - .../inverse_text_normalization/ko/graph_utils.py | 2 +- .../ko/taggers/__init__.py | 4 ---- .../ko/taggers/cardinal.py | 8 ++------ .../ko/taggers/tokenize_and_classify.py | 2 -- .../ko/verbalizers/__init__.py | 6 +----- .../ko/verbalizers/verbalize_final.py | 1 - .../ko/verbalizers/word.py | 4 +--- tests/nemo_text_processing/ko/test_cardinal.py | 12 ++---------- 9 files changed, 7 insertions(+), 33 deletions(-) delete mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv deleted file mode 100644 index 43baac7c1..000000000 --- a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv +++ /dev/null @@ -1 +0,0 @@ -영 0 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/graph_utils.py b/nemo_text_processing/inverse_text_normalization/ko/graph_utils.py index 7a9fd8720..50f1eb3b9 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/ko/graph_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # Copyright 2015 and onwards Google, Inc. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py index f541211af..341a77c5b 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py @@ -11,7 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py index df5804fc0..684685001 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py @@ -31,13 +31,9 @@ class CardinalFst(GraphFst): def __init__(self): super().__init__(name="cardinal", kind="classify") - graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) - graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) graph_zero = pynini.cross("영", "0") + graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) - graph_negative = pynini.cross("마이너스", "-") - graph_negative += delete_space - ten = pynutil.delete("십") ten_alt = pynini.cross("십", "1") ### Responsible for second digit of two digit number. 
ex) 20's 2 @@ -47,7 +43,7 @@ def __init__(self): hundred = pynutil.delete("백") hundred_alt = pynini.cross("백", "1") - graph_hundred_component = pynini.union(((graph_digit + hundred) | hundred_alt), pynutil.insert("0")) + graph_hundred_component = pynini.union(((graph_digit + hundred | hundred_alt)), pynutil.insert("0")) graph_hundred_component += graph_ten_component thousand = pynutil.delete("천") diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index 760ce6829..bb6b35d41 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -24,8 +24,6 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( INPUT_LOWER_CASED, GraphFst, - delete_extra_space, - delete_space, generator_main, ) diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py index da950f35e..ecc3520ab 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py @@ -10,8 +10,4 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. - -from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst \ No newline at end of file +# limitations under the License. 
\ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py index 8554fc161..8d40d2804 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py @@ -19,7 +19,6 @@ from pynini.lib import pynutil from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, generator_main, delete_space diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py index d79957ca8..a423d5d0c 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py @@ -13,11 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. - -import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst diff --git a/tests/nemo_text_processing/ko/test_cardinal.py b/tests/nemo_text_processing/ko/test_cardinal.py index 9fd366ea6..96681fd8b 100644 --- a/tests/nemo_text_processing/ko/test_cardinal.py +++ b/tests/nemo_text_processing/ko/test_cardinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,10 +16,8 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer -from nemo_text_processing.text_normalization.normalize import Normalizer -from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file +from ..utils import CACHE_DIR, parse_test_case_file class TestCardinal: @@ -31,9 +29,3 @@ class TestCardinal: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer_ko.inverse_normalize(test_input, verbose=False) assert pred == expected - - normalizer_with_audio_ko = ( - NormalizerWithAudio(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) - if RUN_AUDIO_BASED_TESTS - else None - ) \ No newline at end of file From a9e839e760911679db2c480814d21bc31d22c79a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 13 May 2025 22:04:00 +0000 Subject: [PATCH 12/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: hmlee245 --- .../inverse_normalize.py | 4 +- .../ko/clean_eval_data.py | 59 +++++++------------ .../ko/taggers/cardinal.py | 36 +++++++---- .../ko/taggers/tokenize_and_classify.py | 12 ++-- .../ko/taggers/word.py | 3 +- .../inverse_text_normalization/ko/utils.py | 3 - .../ko/verbalizers/cardinal.py | 18 ++---- .../ko/verbalizers/verbalize.py | 7 +-- .../ko/verbalizers/verbalize_final.py | 2 + .../ko/verbalizers/word.py | 1 - .../run_evaluate.py | 2 +- .../pynini_export.py | 2 +- 12 files changed, 64 insertions(+), 85 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py index 
e505a8ad0..acda8b7f9 100644 --- a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py +++ b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py @@ -135,7 +135,7 @@ def __init__( from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import ( VerbalizeFinalFst, - ) + ) self.tagger = ClassifyFst( cache_dir=cache_dir, whitelist=whitelist, overwrite_cache=overwrite_cache, input_case=input_case @@ -180,7 +180,7 @@ def parse_args(): parser.add_argument( "--language", help="language", - choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja','ko'], + choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja', 'ko'], default="en", type=str, ) diff --git a/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py b/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py index 3c1193333..bc429e858 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py +++ b/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py @@ -282,41 +282,24 @@ def process_address_1(instance: Instance) -> Instance: filters = [] -filters.append(Filter(class_type="CARDINAL", - process_func=process_cardinal_1, filter_func=filter_cardinal_1)) -filters.append(Filter(class_type="ORDINAL", - process_func=process_ordinal_1, filter_func=filter_ordinal_1)) -filters.append(Filter(class_type="DECIMAL", - process_func=process_decimal_1, filter_func=filter_decimal_1)) -filters.append(Filter(class_type="MEASURE", - process_func=process_measure_1, filter_func=filter_measure_1)) -filters.append(Filter(class_type="MONEY", - process_func=process_money_1, filter_func=filter_money_1)) -filters.append(Filter(class_type="TIME", - process_func=process_time_1, filter_func=filter_time_1)) - 
-filters.append(Filter(class_type="DATE", - process_func=process_date_1, filter_func=filter_date_1)) -filters.append(Filter(class_type="PLAIN", - process_func=process_plain_1, filter_func=filter_plain_1)) -filters.append(Filter(class_type="PUNCT", - process_func=process_punct_1, filter_func=filter_punct_1)) -filters.append(Filter(class_type="LETTERS", - process_func=process_letters_1, filter_func=filter_letters_1)) -filters.append(Filter(class_type="VERBATIM", - process_func=process_verbatim_1, filter_func=filter_verbatim_1)) -filters.append(Filter(class_type="DIGIT", - process_func=process_digit_1, filter_func=filter_digit_1)) -filters.append(Filter(class_type="TELEPHONE", - process_func=process_telephone_1, filter_func=filter_telephone_1)) -filters.append(Filter(class_type="ELECTRONIC", - process_func=process_electronic_1, filter_func=filter_electronic_1)) -filters.append(Filter(class_type="FRACTION", - process_func=process_fraction_1, filter_func=filter_fraction_1)) -filters.append(Filter(class_type="ADDRESS", - process_func=process_address_1, filter_func=filter_address_1)) -filters.append(Filter(class_type=EOS_TYPE, - process_func=lambda x: x, filter_func=lambda x: True)) +filters.append(Filter(class_type="CARDINAL", process_func=process_cardinal_1, filter_func=filter_cardinal_1)) +filters.append(Filter(class_type="ORDINAL", process_func=process_ordinal_1, filter_func=filter_ordinal_1)) +filters.append(Filter(class_type="DECIMAL", process_func=process_decimal_1, filter_func=filter_decimal_1)) +filters.append(Filter(class_type="MEASURE", process_func=process_measure_1, filter_func=filter_measure_1)) +filters.append(Filter(class_type="MONEY", process_func=process_money_1, filter_func=filter_money_1)) +filters.append(Filter(class_type="TIME", process_func=process_time_1, filter_func=filter_time_1)) + +filters.append(Filter(class_type="DATE", process_func=process_date_1, filter_func=filter_date_1)) +filters.append(Filter(class_type="PLAIN", 
process_func=process_plain_1, filter_func=filter_plain_1)) +filters.append(Filter(class_type="PUNCT", process_func=process_punct_1, filter_func=filter_punct_1)) +filters.append(Filter(class_type="LETTERS", process_func=process_letters_1, filter_func=filter_letters_1)) +filters.append(Filter(class_type="VERBATIM", process_func=process_verbatim_1, filter_func=filter_verbatim_1)) +filters.append(Filter(class_type="DIGIT", process_func=process_digit_1, filter_func=filter_digit_1)) +filters.append(Filter(class_type="TELEPHONE", process_func=process_telephone_1, filter_func=filter_telephone_1)) +filters.append(Filter(class_type="ELECTRONIC", process_func=process_electronic_1, filter_func=filter_electronic_1)) +filters.append(Filter(class_type="FRACTION", process_func=process_fraction_1, filter_func=filter_fraction_1)) +filters.append(Filter(class_type="ADDRESS", process_func=process_address_1, filter_func=filter_address_1)) +filters.append(Filter(class_type=EOS_TYPE, process_func=lambda x: x, filter_func=lambda x: True)) def filter_loaded_data(data: List[Instance], verbose: bool = False) -> List[Instance]: @@ -344,10 +327,8 @@ def filter_loaded_data(data: List[Instance], verbose: bool = False) -> List[Inst def parse_args(): parser = ArgumentParser() - parser.add_argument("--input", help="input file path", - type=str, default='./en_with_types/output-00001-of-00100') - parser.add_argument( - "--verbose", help="print filtered instances", action='store_true') + parser.add_argument("--input", help="input file path", type=str, default='./en_with_types/output-00001-of-00100') + parser.add_argument("--verbose", help="print filtered instances", action='store_true') return parser.parse_args() diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py index 684685001..9effed162 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py +++ 
b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py @@ -19,6 +19,7 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_DIGIT, GraphFst, delete_space from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path + class CardinalFst(GraphFst): """ Finite state transducer for classifying cardinals @@ -40,7 +41,7 @@ def __init__(self): graph_ten_component = pynini.union((graph_digit + ten) | ten_alt, pynutil.insert("0")) ### Responsible for the first digit of number. ex) 1,2,3,4,5,,, graph_ten_component += graph_digit | pynutil.insert("0") - + hundred = pynutil.delete("백") hundred_alt = pynini.cross("백", "1") graph_hundred_component = pynini.union(((graph_digit + hundred | hundred_alt)), pynutil.insert("0")) @@ -55,29 +56,36 @@ def __init__(self): tenthousand_alt = pynini.cross("만", "1") ### "만" can express next four digits of numbers until the next unit "억", so insert "0000" to allocate four digit worth of space ### From "만", keep adding four digits and graph_thousand_component(0000-9999), because Korean units increase every four digits - graph_tenthousand_component = pynini.union(((graph_thousand_component + tenthousand) | tenthousand_alt), pynutil.insert("0000")) + graph_tenthousand_component = pynini.union( + ((graph_thousand_component + tenthousand) | tenthousand_alt), pynutil.insert("0000") + ) graph_tenthousand_component += graph_thousand_component hundredmillion = pynutil.delete("억") hundredmillion_alt = pynini.cross("억", "1") - graph_hundredmillion_component = pynini.union(((graph_thousand_component + hundredmillion) | hundredmillion_alt), pynutil.insert("0000")) - graph_hundredmillion_component += graph_tenthousand_component - + graph_hundredmillion_component = pynini.union( + ((graph_thousand_component + hundredmillion) | hundredmillion_alt), pynutil.insert("0000") + ) + graph_hundredmillion_component += graph_tenthousand_component + trillion = pynutil.delete("조") trillion_alt = 
pynini.cross("조", "1") - graph_trillion_component = pynini.union(((graph_thousand_component + trillion) | trillion_alt), pynutil.insert("0000")) + graph_trillion_component = pynini.union( + ((graph_thousand_component + trillion) | trillion_alt), pynutil.insert("0000") + ) graph_trillion_component += graph_hundredmillion_component tenquadrillion = pynutil.delete("경") tenquadrillion_alt = pynini.cross("경", "1") - graph_tenquadrillion_component = pynini.union(((graph_thousand_component + tenquadrillion) | tenquadrillion_alt), pynutil.insert("0000")) + graph_tenquadrillion_component = pynini.union( + ((graph_thousand_component + tenquadrillion) | tenquadrillion_alt), pynutil.insert("0000") + ) graph_tenquadrillion_component += graph_trillion_component - graph = pynini.union( ### From biggest unit to smallest, everything is included - graph_tenquadrillion_component| - graph_zero + graph_tenquadrillion_component + | graph_zero ) leading_zero = ( @@ -85,16 +93,18 @@ def __init__(self): ) graph_nonzero = graph @ leading_zero graph = pynini.union(graph_nonzero, graph_zero) - + graph = graph @ leading_zero | graph_zero self.just_cardinals = graph - optional_sign = pynini.closure((pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"')) + delete_space,0, 1) + optional_sign = pynini.closure( + (pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"')) + delete_space, 0, 1 + ) final_graph = ( optional_sign + pynutil.insert(" ") + pynutil.insert("integer: \"") + graph + pynutil.insert("\"") ) | (pynutil.insert("integer: \"") + graph + pynutil.insert("\"")) final_graph = self.add_tokens(final_graph) - self.fst = final_graph.optimize() \ No newline at end of file + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index bb6b35d41..30e0f5df4 100644 --- 
a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -19,13 +19,13 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst -from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( INPUT_LOWER_CASED, GraphFst, generator_main, ) +from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst class ClassifyFst(GraphFst): @@ -62,8 +62,8 @@ def __init__( cardinal = CardinalFst() cardinal_graph = cardinal.fst word_graph = WordFst().fst - classify = (pynutil.add_weight(cardinal_graph, 1.1)| pynutil.add_weight(word_graph, 100)) - + classify = pynutil.add_weight(cardinal_graph, 1.1) | pynutil.add_weight(word_graph, 100) + token = pynutil.insert("tokens { ") + classify + pynutil.insert(" } ") tagger = pynini.closure(token, 1) @@ -71,4 +71,4 @@ def __init__( if far_file: generator_main(far_file, {"tokenize_and_classify": self.fst}) - logging.info(f"ClassifyFst grammars are saved to {far_file}.") \ No newline at end of file + logging.info(f"ClassifyFst grammars are saved to {far_file}.") diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py index 0d6ccd5c5..0e4dbb93c 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py @@ -27,6 +27,5 @@ class WordFst(GraphFst): def __init__(self): super().__init__(name="word", kind="classify") - word = pynutil.insert( - "name: \"") + 
pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"") + word = pynutil.insert("name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"") self.fst = word.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/utils.py b/nemo_text_processing/inverse_text_normalization/ko/utils.py index 0222cc0b8..d198c3835 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/utils.py +++ b/nemo_text_processing/inverse_text_normalization/ko/utils.py @@ -15,9 +15,6 @@ import os - def get_abs_path(rel_path): return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path - - diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py index 1800a6dc8..fb9a76d8e 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py @@ -15,11 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( - NEMO_NOT_QUOTE, - GraphFst, - delete_space, -) +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space class CardinalFst(GraphFst): @@ -34,21 +30,17 @@ def __init__(self): pynutil.delete("negative:") + delete_space + pynutil.delete("\"") - + pynini.accep("-") + + pynini.accep("-") + pynutil.delete("\"") ) optional_sign_output = pynini.closure(negative_sign + delete_space, 0, 1) - digits_from_tag = pynini.closure(NEMO_NOT_QUOTE, 1) + digits_from_tag = pynini.closure(NEMO_NOT_QUOTE, 1) integer_cardinal = ( - pynutil.delete("integer:") - + delete_space - + pynutil.delete("\"") - + digits_from_tag - + pynutil.delete("\"") + pynutil.delete("integer:") + delete_space + pynutil.delete("\"") + digits_from_tag + pynutil.delete("\"") ) graph = integer_cardinal final_graph = optional_sign_output + graph - self.fst = 
self.delete_tokens(final_graph).optimize() \ No newline at end of file + self.fst = self.delete_tokens(final_graph).optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py index 9d750d757..d8851e206 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -13,9 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst class VerbalizeFst(GraphFst): @@ -30,7 +30,6 @@ def __init__(self): cardinal = CardinalFst() cardinal_graph = cardinal.fst word_graph = WordFst().fst - - graph = (cardinal_graph|word_graph) + + graph = cardinal_graph | word_graph self.fst = graph - diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py index 8d40d2804..3e1769297 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py @@ -18,6 +18,7 @@ import pynini from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, delete_space, generator_main from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, generator_main, delete_space @@ 
-27,6 +28,7 @@ class VerbalizeFinalFst(GraphFst): Finite state transducer that verbalizes an entire sentence, e.g. tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now """ + def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False): super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic) far_file = None diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py index a423d5d0c..ecf62bfe3 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py @@ -18,7 +18,6 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst - class WordFst(GraphFst): ''' tokens { name: "一" } -> 一 diff --git a/nemo_text_processing/inverse_text_normalization/run_evaluate.py b/nemo_text_processing/inverse_text_normalization/run_evaluate.py index 7bfdd3399..133474940 100644 --- a/nemo_text_processing/inverse_text_normalization/run_evaluate.py +++ b/nemo_text_processing/inverse_text_normalization/run_evaluate.py @@ -35,7 +35,7 @@ def parse_args(): parser.add_argument( "--lang", help="language", - choices=["ar", "de", "en", "es", "es_en", "fr", "hi", "hy", "mr", "pt", "ru", "sv", "vi", "zh", "ja","ko"], + choices=["ar", "de", "en", "es", "es_en", "fr", "hi", "hy", "mr", "pt", "ru", "sv", "vi", "zh", "ja", "ko"], default="en", type=str, ) diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 0df099774..d1ba34a37 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -106,7 +106,7 @@ def parse_args(): 'mr', 'ja', 'rw', - 'ko' + 'ko', ], type=str, default='en', From 
c6fc9e176b82d71c3bcfde283cff9421f3f42402 Mon Sep 17 00:00:00 2001 From: hmlee245 Date: Tue, 13 May 2025 15:11:44 -0700 Subject: [PATCH 13/14] Delete clean eval file Signed-off-by: hmlee245 --- .../ko/clean_eval_data.py | 342 ------------------ 1 file changed, 342 deletions(-) delete mode 100644 nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py diff --git a/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py b/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py deleted file mode 100644 index bc429e858..000000000 --- a/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py +++ /dev/null @@ -1,342 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from argparse import ArgumentParser -from typing import List - -import regex as re - -from nemo_text_processing.text_normalization.data_loader_utils import ( - EOS_TYPE, - Instance, - load_files, - training_data_to_sentences, -) - -""" -This file is for evaluation purposes. -filter_loaded_data() cleans data (list of instances) for inverse text normalization. Filters and cleaners can be specified for each semiotic class individually. -For example, normalized text should only include characters and whitespace characters but no punctuation. - Cardinal unnormalized instances should contain at least one integer and all other characters are removed. 
-""" - - -class Filter: - """ - Filter class - - Args: - class_type: semiotic class used in dataset - process_func: function to transform text - filter_func: function to filter text - - """ - - def __init__(self, class_type: str, process_func: object, filter_func: object): - self.class_type = class_type - self.process_func = process_func - self.filter_func = filter_func - - def filter(self, instance: Instance) -> bool: - """ - filter function - - Args: - filters given instance with filter function - - Returns: True if given instance fulfills criteria or does not belong to class type - """ - if instance.token_type != self.class_type: - return True - return self.filter_func(instance) - - def process(self, instance: Instance) -> Instance: - """ - process function - - Args: - processes given instance with process function - - Returns: processed instance if instance belongs to expected class type or original instance - """ - if instance.token_type != self.class_type: - return instance - return self.process_func(instance) - - -def filter_cardinal_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_cardinal_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - un_normalized = re.sub(r"[^0-9]", "", un_normalized) - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_ordinal_1(instance: Instance) -> bool: - ok = re.search(r"(st|nd|rd|th)\s*$", instance.un_normalized) - return ok - - -def process_ordinal_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - un_normalized = re.sub(r"[,\s]", "", un_normalized) - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def 
filter_decimal_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_decimal_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - un_normalized = re.sub(r",", "", un_normalized) - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_measure_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_measure_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - un_normalized = re.sub(r",", "", un_normalized) - un_normalized = re.sub(r"m2", "m²", un_normalized) - un_normalized = re.sub(r"(\d)([^\d.\s])", r"\1 \2", un_normalized) - normalized = re.sub(r"[^a-z\s]", "", normalized) - normalized = re.sub(r"per ([a-z\s]*)s$", r"per \1", normalized) - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_money_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_money_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - un_normalized = re.sub(r",", "", un_normalized) - un_normalized = re.sub(r"a\$", r"$", un_normalized) - un_normalized = re.sub(r"us\$", r"$", un_normalized) - un_normalized = re.sub(r"(\d)m\s*$", r"\1 million", un_normalized) - un_normalized = re.sub(r"(\d)bn?\s*$", r"\1 billion", un_normalized) - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_time_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_time_1(instance: Instance) -> Instance: - un_normalized = 
instance.un_normalized - un_normalized = re.sub(r": ", ":", un_normalized) - un_normalized = re.sub(r"(\d)\s?a\s?m\s?", r"\1 a.m.", un_normalized) - un_normalized = re.sub(r"(\d)\s?p\s?m\s?", r"\1 p.m.", un_normalized) - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_plain_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_plain_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_punct_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_punct_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_date_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_date_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - un_normalized = re.sub(r",", "", un_normalized) - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_letters_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_letters_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_verbatim_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_verbatim_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - 
normalized = instance.normalized - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_digit_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_digit_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_telephone_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_telephone_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_electronic_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_electronic_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_fraction_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_fraction_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_address_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_address_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - normalized = 
re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -filters = [] -filters.append(Filter(class_type="CARDINAL", process_func=process_cardinal_1, filter_func=filter_cardinal_1)) -filters.append(Filter(class_type="ORDINAL", process_func=process_ordinal_1, filter_func=filter_ordinal_1)) -filters.append(Filter(class_type="DECIMAL", process_func=process_decimal_1, filter_func=filter_decimal_1)) -filters.append(Filter(class_type="MEASURE", process_func=process_measure_1, filter_func=filter_measure_1)) -filters.append(Filter(class_type="MONEY", process_func=process_money_1, filter_func=filter_money_1)) -filters.append(Filter(class_type="TIME", process_func=process_time_1, filter_func=filter_time_1)) - -filters.append(Filter(class_type="DATE", process_func=process_date_1, filter_func=filter_date_1)) -filters.append(Filter(class_type="PLAIN", process_func=process_plain_1, filter_func=filter_plain_1)) -filters.append(Filter(class_type="PUNCT", process_func=process_punct_1, filter_func=filter_punct_1)) -filters.append(Filter(class_type="LETTERS", process_func=process_letters_1, filter_func=filter_letters_1)) -filters.append(Filter(class_type="VERBATIM", process_func=process_verbatim_1, filter_func=filter_verbatim_1)) -filters.append(Filter(class_type="DIGIT", process_func=process_digit_1, filter_func=filter_digit_1)) -filters.append(Filter(class_type="TELEPHONE", process_func=process_telephone_1, filter_func=filter_telephone_1)) -filters.append(Filter(class_type="ELECTRONIC", process_func=process_electronic_1, filter_func=filter_electronic_1)) -filters.append(Filter(class_type="FRACTION", process_func=process_fraction_1, filter_func=filter_fraction_1)) -filters.append(Filter(class_type="ADDRESS", process_func=process_address_1, filter_func=filter_address_1)) -filters.append(Filter(class_type=EOS_TYPE, process_func=lambda x: x, filter_func=lambda x: True)) - - -def filter_loaded_data(data: 
List[Instance], verbose: bool = False) -> List[Instance]: - """ - Filters list of instances - - Args: - data: list of instances - - Returns: filtered and transformed list of instances - """ - updates_instances = [] - for instance in data: - updated_instance = False - for fil in filters: - if fil.class_type == instance.token_type and fil.filter(instance): - instance = fil.process(instance) - updated_instance = True - if updated_instance: - if verbose: - print(instance) - updates_instances.append(instance) - return updates_instances - - -def parse_args(): - parser = ArgumentParser() - parser.add_argument("--input", help="input file path", type=str, default='./en_with_types/output-00001-of-00100') - parser.add_argument("--verbose", help="print filtered instances", action='store_true') - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - file_path = args.input - - print("Loading training data: " + file_path) - instance_list = load_files([file_path]) # List of instances - filtered_instance_list = filter_loaded_data(instance_list, args.verbose) - training_data_to_sentences(filtered_instance_list) From 11a62df5b077f04177efbe06769eaa4b5827ac0a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 13 May 2025 22:09:55 +0000 Subject: [PATCH 14/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: hmlee245 --- .../ko/taggers/tokenize_and_classify.py | 6 +----- .../inverse_text_normalization/ko/verbalizers/__init__.py | 2 +- .../ko/verbalizers/verbalize_final.py | 1 - 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index 30e0f5df4..75e3f6f20 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py 
+++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -19,11 +19,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( - INPUT_LOWER_CASED, - GraphFst, - generator_main, -) +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import INPUT_LOWER_CASED, GraphFst, generator_main from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py index ecc3520ab..341a77c5b 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py @@ -10,4 +10,4 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. \ No newline at end of file +# limitations under the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py index 3e1769297..17f547740 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py @@ -20,7 +20,6 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, delete_space, generator_main from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, generator_main, delete_space class VerbalizeFinalFst(GraphFst):