diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/percentage/__init__.py b/nemo_text_processing/inverse_text_normalization/hi/data/percentage/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/percentage/percent_symbol.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/percentage/percent_symbol.tsv new file mode 100644 index 000000000..c2fb2992b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/percentage/percent_symbol.tsv @@ -0,0 +1,4 @@ +% प्रतिशत +% परसेंट +% फ़ीसदी +% फीसदी \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/percentage.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/percentage.py new file mode 100644 index 000000000..c191866b3 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/percentage.py @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.hi.graph_utils import ( + GraphFst, + delete_space, +) +from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path + + +class PercentageFst(GraphFst): + """ + Finite state transducer for classifying percentages + e.g. बीस प्रतिशत -> percentage { integer: "२०" percent: "%" } + + Args: + cardinal: CardinalFst + """ + + def __init__(self, cardinal): + super().__init__(name="percentage", kind="classify") + + graph_percent_symbol = pynini.string_file( + get_abs_path("data/percentage/percent_symbol.tsv") + ).invert() + + integer_graph = cardinal.graph_no_exception + + final_graph = ( + pynutil.insert("integer: \"") + + integer_graph + + pynutil.insert("\"") + + delete_space + + pynutil.insert(" percent: \"") + + graph_percent_symbol + + pynutil.insert("\"") + ) + + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py index b3fcb0c2d..1484c354e 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py @@ -32,6 +32,7 @@ from nemo_text_processing.inverse_text_normalization.hi.taggers.measure import MeasureFst from nemo_text_processing.inverse_text_normalization.hi.taggers.money import MoneyFst from nemo_text_processing.inverse_text_normalization.hi.taggers.ordinal import OrdinalFst +from nemo_text_processing.inverse_text_normalization.hi.taggers.percentage import PercentageFst from nemo_text_processing.inverse_text_normalization.hi.taggers.punctuation import PunctuationFst from nemo_text_processing.inverse_text_normalization.hi.taggers.telephone import TelephoneFst from nemo_text_processing.inverse_text_normalization.hi.taggers.time import TimeFst @@ -79,6 +80,8 @@ def __init__( decimal_graph = decimal.fst fraction = FractionFst(cardinal) fraction_graph = fraction.fst + percentage = PercentageFst(cardinal) + percentage_graph = percentage.fst date = DateFst(cardinal) date_graph = date.fst time = TimeFst() @@ -98,6 +101,7 @@ def __init__( | pynutil.add_weight(ordinal_graph, 1.1) | pynutil.add_weight(decimal_graph, 1.1) | pynutil.add_weight(fraction_graph, 1.1) + | pynutil.add_weight(percentage_graph, 1.1) | pynutil.add_weight(date_graph, 1.1) | pynutil.add_weight(time_graph, 1.1) | pynutil.add_weight(measure_graph, 1.1) @@ -120,4 +124,4 @@ def __init__( if far_file: generator_main(far_file, {"tokenize_and_classify": self.fst}) - logging.info(f"ClassifyFst grammars are saved to {far_file}.") + logging.info(f"ClassifyFst grammars are saved to {far_file}.") \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/percentage.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/percentage.py new file mode 100644 index 000000000..2267a8761 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/percentage.py @@ -0,0 +1,53 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.hi.graph_utils import ( + NEMO_NOT_QUOTE, + GraphFst, + delete_space, +) + + +class PercentageFst(GraphFst): + """ + Finite state transducer for verbalizing percentage + e.g. percentage { integer: "२०" percent: "%" } -> २०% + """ + + def __init__(self): + super().__init__(name="percentage", kind="verbalize") + + integer_part = ( + pynutil.delete("integer:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + + percent_part = ( + pynutil.delete("percent:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + + graph = integer_part + delete_space + percent_part + + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py index 165fe7a7e..a782b3c13 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py @@ -21,6 +21,7 @@ from nemo_text_processing.inverse_text_normalization.hi.verbalizers.measure import MeasureFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.money import MoneyFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.ordinal import OrdinalFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.percentage import PercentageFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.telephone import TelephoneFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.time import TimeFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.whitelist import WhiteListFst @@ -38,10 +39,12 @@ def __init__(self): super().__init__(name="verbalize", kind="verbalize") cardinal = CardinalFst() cardinal_graph = cardinal.fst - ordinal_graph = OrdinalFst().fst + ordinal = OrdinalFst() + ordinal_graph = ordinal.fst decimal = DecimalFst() decimal_graph = decimal.fst fraction_graph = FractionFst().fst + percentage_graph = PercentageFst().fst date_graph = DateFst().fst time_graph = TimeFst().fst measure_graph = MeasureFst(cardinal, decimal).fst @@ -57,10 +60,11 @@ def __init__(self): | ordinal_graph | decimal_graph | fraction_graph + | percentage_graph | date_graph | time_graph | measure_graph | money_graph | telephone_graph ) - self.fst = graph + self.fst = graph \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_percentage.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_percentage.txt new file mode 100644 index 000000000..622cbd791 --- /dev/null +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_percentage.txt @@ -0,0 +1,12 @@ +बीस प्रतिशत~२०% +पचास प्रतिशत~५०% +दस प्रतिशत~१०% +सौ प्रतिशत~१००% +पच्चीस प्रतिशत~२५% +पाँच प्रतिशत~५% +तीन प्रतिशत~३% +सत्तर परसेंट~७०% +एक प्रतिशत~१% +शून्य प्रतिशत~०% +पाँच सौ फ़ीसदी~५००% +तेरह प्रतिशत~१३% \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/test_percentage.py b/tests/nemo_text_processing/hi/test_percentage.py new file mode 100644 index 000000000..cec684241 --- /dev/null +++ b/tests/nemo_text_processing/hi/test_percentage.py @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestPercentage: + inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_percentage.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected \ No newline at end of file diff --git a/tests/nemo_text_processing/utils.py b/tests/nemo_text_processing/utils.py index 5326784e9..0e06d5945 100644 --- a/tests/nemo_text_processing/utils.py +++ b/tests/nemo_text_processing/utils.py @@ -39,7 +39,7 @@ def parse_test_case_file(file_name: str): Prepares tests pairs for ITN and TN tests """ test_pairs = [] - with open(os.path.dirname(os.path.abspath(__file__)) + os.path.sep + file_name, 'r') as f: + with open(os.path.dirname(os.path.abspath(__file__)) + os.path.sep + file_name, 'r', encoding='utf-8') as f: for line in f: components = line.strip("\n").split("~") spoken = components[0]