From b68f80badeb2cd8bd28d0c2047757a8b0f185bf4 Mon Sep 17 00:00:00 2001 From: unknown Date: Thu, 9 Apr 2026 17:36:53 +0530 Subject: [PATCH 1/4] Added Hindi percentage ITN class --- .../hi/data/percentage/__init__.py | 0 .../hi/data/percentage/percent_symbol.tsv | 4 ++ .../hi/taggers/percentage.py | 39 ++++++++++++++++++ .../hi/taggers/tokenize_and_classify.py | 6 ++- .../hi/verbalizers/percentage.py | 41 +++++++++++++++++++ .../hi/verbalizers/verbalize.py | 21 ++++++---- .../text_normalization/utils_audio_based.py | 5 ++- test_percentage.py | 29 +++++++++++++ .../test_cases_percentage.txt | 12 ++++++ .../hi/test_percentage.py | 29 +++++++++++++ tests/nemo_text_processing/utils.py | 2 +- 11 files changed, 176 insertions(+), 12 deletions(-) create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/percentage/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/percentage/percent_symbol.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi/taggers/percentage.py create mode 100644 nemo_text_processing/inverse_text_normalization/hi/verbalizers/percentage.py create mode 100644 test_percentage.py create mode 100644 tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_percentage.txt create mode 100644 tests/nemo_text_processing/hi/test_percentage.py diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/percentage/__init__.py b/nemo_text_processing/inverse_text_normalization/hi/data/percentage/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/percentage/percent_symbol.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/percentage/percent_symbol.tsv new file mode 100644 index 000000000..c2fb2992b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/percentage/percent_symbol.tsv @@ -0,0 +1,4 @@ +% प्रतिशत +% परसेंट +% फ़ीसदी +% फीसदी \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/percentage.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/percentage.py new file mode 100644 index 000000000..e888c12a7 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/percentage.py @@ -0,0 +1,39 @@ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.hi.graph_utils import ( + INPUT_LOWER_CASED, + GraphFst, + delete_space, +) +from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path + + +class PercentageFst(GraphFst): + def __init__(self, cardinal, input_case: str = INPUT_LOWER_CASED): + super().__init__(name="percentage", kind="classify") + + # load percent words and flip mapping: प्रतिशत → % + percent_graph = pynini.string_file( + get_abs_path("data/percentage/percent_symbol.tsv") + ).invert() + + # reuse number logic (बीस → २०, पाँच सौ → ५००) + integer_graph = cardinal.graph_no_exception + + # match: + + # and convert into structured format + final_graph = ( + pynutil.insert('integer: "') + + integer_graph + + pynutil.insert('"') + + delete_space + + pynutil.insert(' percent: "') + + percent_graph + + pynutil.insert('"') + ) + + # wrap as: percentage { ... } + final_graph = self.add_tokens(final_graph) + + self.fst = final_graph.optimize() \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py index b3fcb0c2d..1484c354e 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py @@ -32,6 +32,7 @@ from nemo_text_processing.inverse_text_normalization.hi.taggers.measure import MeasureFst from nemo_text_processing.inverse_text_normalization.hi.taggers.money import MoneyFst from nemo_text_processing.inverse_text_normalization.hi.taggers.ordinal import OrdinalFst +from nemo_text_processing.inverse_text_normalization.hi.taggers.percentage import PercentageFst from nemo_text_processing.inverse_text_normalization.hi.taggers.punctuation import PunctuationFst from nemo_text_processing.inverse_text_normalization.hi.taggers.telephone import TelephoneFst from nemo_text_processing.inverse_text_normalization.hi.taggers.time import TimeFst @@ -79,6 +80,8 @@ def __init__( decimal_graph = decimal.fst fraction = FractionFst(cardinal) fraction_graph = fraction.fst + percentage = PercentageFst(cardinal) + percentage_graph = percentage.fst date = DateFst(cardinal) date_graph = date.fst time = TimeFst() @@ -98,6 +101,7 @@ def __init__( | pynutil.add_weight(ordinal_graph, 1.1) | pynutil.add_weight(decimal_graph, 1.1) | pynutil.add_weight(fraction_graph, 1.1) + | pynutil.add_weight(percentage_graph, 1.1) | pynutil.add_weight(date_graph, 1.1) | pynutil.add_weight(time_graph, 1.1) | pynutil.add_weight(measure_graph, 1.1) @@ -120,4 +124,4 @@ def __init__( if far_file: generator_main(far_file, {"tokenize_and_classify": self.fst}) - logging.info(f"ClassifyFst grammars are saved to {far_file}.") + logging.info(f"ClassifyFst grammars are saved to {far_file}.") \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/percentage.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/percentage.py new file mode 100644 index 000000000..c302b5d64 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/percentage.py @@ -0,0 +1,41 @@ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.hi.graph_utils import ( + NEMO_NOT_QUOTE, + GraphFst, + delete_space, +) + + +class PercentageFst(GraphFst): + def __init__(self): + super().__init__(name="percentage", kind="verbalize") + + # extract number part (remove labels and quotes) + # example: integer: "२०" → २० + integer_part = ( + pynutil.delete("integer:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + + # extract percent symbol + # example: percent: "%" → % + percent_part = ( + pynutil.delete("percent:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + + # combine both → २०% + graph = integer_part + delete_space + percent_part + + # remove outer wrapper: percentage { ... } + delete_tokens = self.delete_tokens(graph) + + self.fst = delete_tokens.optimize() \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py index 165fe7a7e..6315188ff 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py @@ -21,6 +21,7 @@ from nemo_text_processing.inverse_text_normalization.hi.verbalizers.measure import MeasureFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.money import MoneyFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.ordinal import OrdinalFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.percentage import PercentageFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.telephone import TelephoneFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.time import TimeFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.whitelist import WhiteListFst @@ -38,15 +39,16 @@ def __init__(self): super().__init__(name="verbalize", kind="verbalize") cardinal = CardinalFst() cardinal_graph = cardinal.fst - ordinal_graph = OrdinalFst().fst - decimal = DecimalFst() + ordinal_graph = OrdinalFst().fst # takes nothing + decimal = DecimalFst() # takes nothing decimal_graph = decimal.fst - fraction_graph = FractionFst().fst - date_graph = DateFst().fst - time_graph = TimeFst().fst - measure_graph = MeasureFst(cardinal, decimal).fst - money_graph = MoneyFst(cardinal, decimal).fst - telephone_graph = TelephoneFst(cardinal).fst + fraction_graph = FractionFst().fst # takes nothing + percentage_graph = PercentageFst().fst # takes nothing + date_graph = DateFst().fst # takes nothing + time_graph = TimeFst().fst # takes nothing + measure_graph = MeasureFst(cardinal, decimal).fst # takes cardinal, decimal + money_graph = MoneyFst(cardinal, decimal).fst # takes cardinal, decimal + telephone_graph = TelephoneFst(cardinal).fst # takes cardinal word_graph = WordFst().fst whitelist_graph = WhiteListFst().fst @@ -57,10 +59,11 @@ def __init__(self): | ordinal_graph | decimal_graph | fraction_graph + | percentage_graph | date_graph | time_graph | measure_graph | money_graph | telephone_graph ) - self.fst = graph + self.fst = graph \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/utils_audio_based.py b/nemo_text_processing/text_normalization/utils_audio_based.py index 2e9626d9e..1e3e811af 100644 --- a/nemo_text_processing/text_normalization/utils_audio_based.py +++ b/nemo_text_processing/text_normalization/utils_audio_based.py @@ -14,7 +14,10 @@ from typing import Dict -from cdifflib import CSequenceMatcher +try: + from cdifflib import CSequenceMatcher +except ImportError: + from difflib import SequenceMatcher as CSequenceMatcher from nemo_text_processing.utils.logging import logger diff --git a/test_percentage.py b/test_percentage.py new file mode 100644 index 000000000..cec684241 --- /dev/null +++ b/test_percentage.py @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestPercentage: + inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_percentage.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_percentage.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_percentage.txt new file mode 100644 index 000000000..622cbd791 --- /dev/null +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_percentage.txt @@ -0,0 +1,12 @@ +बीस प्रतिशत~२०% +पचास प्रतिशत~५०% +दस प्रतिशत~१०% +सौ प्रतिशत~१००% +पच्चीस प्रतिशत~२५% +पाँच प्रतिशत~५% +तीन प्रतिशत~३% +सत्तर परसेंट~७०% +एक प्रतिशत~१% +शून्य प्रतिशत~०% +पाँच सौ फ़ीसदी~५००% +तेरह प्रतिशत~१३% \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/test_percentage.py b/tests/nemo_text_processing/hi/test_percentage.py new file mode 100644 index 000000000..cec684241 --- /dev/null +++ b/tests/nemo_text_processing/hi/test_percentage.py @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestPercentage: + inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_percentage.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected \ No newline at end of file diff --git a/tests/nemo_text_processing/utils.py b/tests/nemo_text_processing/utils.py index 5326784e9..0e06d5945 100644 --- a/tests/nemo_text_processing/utils.py +++ b/tests/nemo_text_processing/utils.py @@ -39,7 +39,7 @@ def parse_test_case_file(file_name: str): Prepares tests pairs for ITN and TN tests """ test_pairs = [] - with open(os.path.dirname(os.path.abspath(__file__)) + os.path.sep + file_name, 'r') as f: + with open(os.path.dirname(os.path.abspath(__file__)) + os.path.sep + file_name, 'r', encoding='utf-8') as f: for line in f: components = line.strip("\n").split("~") spoken = components[0] From 42a46d055ec50af9c739c5d800264b17008e3a84 Mon Sep 17 00:00:00 2001 From: unknown Date: Fri, 10 Apr 2026 13:26:32 +0530 Subject: [PATCH 2/4] Revert utils_audio_based.py change - unrelated to percentage class --- nemo_text_processing/text_normalization/utils_audio_based.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/nemo_text_processing/text_normalization/utils_audio_based.py b/nemo_text_processing/text_normalization/utils_audio_based.py index 1e3e811af..2e9626d9e 100644 --- a/nemo_text_processing/text_normalization/utils_audio_based.py +++ b/nemo_text_processing/text_normalization/utils_audio_based.py @@ -14,10 +14,7 @@ from typing import Dict -try: - from cdifflib import CSequenceMatcher -except ImportError: - from difflib import SequenceMatcher as CSequenceMatcher +from cdifflib import CSequenceMatcher from nemo_text_processing.utils.logging import logger From 6ee6cceba3179ac7574c2e7607ef76f0098617cc Mon Sep 17 00:00:00 2001 From: unknown Date: Fri, 10 Apr 2026 13:31:24 +0530 Subject: [PATCH 3/4] Addressed review comments --- .../hi/taggers/percentage.py | 43 +++++++++++++------ .../hi/verbalizers/percentage.py | 26 ++++++++--- .../hi/verbalizers/verbalize.py | 19 ++++---- 3 files changed, 58 insertions(+), 30 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/percentage.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/percentage.py index e888c12a7..c191866b3 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/percentage.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/percentage.py @@ -1,8 +1,21 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import pynini from pynini.lib import pynutil from nemo_text_processing.inverse_text_normalization.hi.graph_utils import ( - INPUT_LOWER_CASED, GraphFst, delete_space, ) @@ -10,30 +23,32 @@ class PercentageFst(GraphFst): - def __init__(self, cardinal, input_case: str = INPUT_LOWER_CASED): + """ + Finite state transducer for classifying percentages + e.g. बीस प्रतिशत -> percentage { integer: "२०" percent: "%" } + + Args: + cardinal: CardinalFst + """ + + def __init__(self, cardinal): super().__init__(name="percentage", kind="classify") - # load percent words and flip mapping: प्रतिशत → % - percent_graph = pynini.string_file( + graph_percent_symbol = pynini.string_file( get_abs_path("data/percentage/percent_symbol.tsv") ).invert() - # reuse number logic (बीस → २०, पाँच सौ → ५००) integer_graph = cardinal.graph_no_exception - # match: + - # and convert into structured format final_graph = ( - pynutil.insert('integer: "') + pynutil.insert("integer: \"") + integer_graph - + pynutil.insert('"') + + pynutil.insert("\"") + delete_space - + pynutil.insert(' percent: "') - + percent_graph - + pynutil.insert('"') + + pynutil.insert(" percent: \"") + + graph_percent_symbol + + pynutil.insert("\"") ) - # wrap as: percentage { ... } final_graph = self.add_tokens(final_graph) - self.fst = final_graph.optimize() \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/percentage.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/percentage.py index c302b5d64..2267a8761 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/percentage.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/percentage.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import pynini from pynini.lib import pynutil @@ -9,11 +23,14 @@ class PercentageFst(GraphFst): + """ + Finite state transducer for verbalizing percentage + e.g. percentage { integer: "२०" percent: "%" } -> २०% + """ + def __init__(self): super().__init__(name="percentage", kind="verbalize") - # extract number part (remove labels and quotes) - # example: integer: "२०" → २० integer_part = ( pynutil.delete("integer:") + delete_space @@ -22,8 +39,6 @@ def __init__(self): + pynutil.delete("\"") ) - # extract percent symbol - # example: percent: "%" → % percent_part = ( pynutil.delete("percent:") + delete_space @@ -32,10 +47,7 @@ def __init__(self): + pynutil.delete("\"") ) - # combine both → २०% graph = integer_part + delete_space + percent_part - # remove outer wrapper: percentage { ... } delete_tokens = self.delete_tokens(graph) - self.fst = delete_tokens.optimize() \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py index 6315188ff..a782b3c13 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py @@ -39,16 +39,17 @@ def __init__(self): super().__init__(name="verbalize", kind="verbalize") cardinal = CardinalFst() cardinal_graph = cardinal.fst - ordinal_graph = OrdinalFst().fst # takes nothing - decimal = DecimalFst() # takes nothing + ordinal = OrdinalFst() + ordinal_graph = ordinal.fst + decimal = DecimalFst() decimal_graph = decimal.fst - fraction_graph = FractionFst().fst # takes nothing - percentage_graph = PercentageFst().fst # takes nothing - date_graph = DateFst().fst # takes nothing - time_graph = TimeFst().fst # takes nothing - measure_graph = MeasureFst(cardinal, decimal).fst # takes cardinal, decimal - money_graph = MoneyFst(cardinal, decimal).fst # takes cardinal, decimal - telephone_graph = TelephoneFst(cardinal).fst # takes cardinal + fraction_graph = FractionFst().fst + percentage_graph = PercentageFst().fst + date_graph = DateFst().fst + time_graph = TimeFst().fst + measure_graph = MeasureFst(cardinal, decimal).fst + money_graph = MoneyFst(cardinal, decimal).fst + telephone_graph = TelephoneFst(cardinal).fst word_graph = WordFst().fst whitelist_graph = WhiteListFst().fst From f131cc6deb8e697b28fbe5cf69601fe3153f6ec9 Mon Sep 17 00:00:00 2001 From: mayuris-00 Date: Fri, 10 Apr 2026 13:45:15 +0530 Subject: [PATCH 4/4] Delete test_percentage.py Signed-off-by: mayuris-00 --- test_percentage.py | 29 ----------------------------- 1 file changed, 29 deletions(-) delete mode 100644 test_percentage.py diff --git a/test_percentage.py b/test_percentage.py deleted file mode 100644 index cec684241..000000000 --- a/test_percentage.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest -from parameterized import parameterized -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer -from ..utils import CACHE_DIR, parse_test_case_file - - -class TestPercentage: - inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) - - @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_percentage.txt')) - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_denorm(self, test_input, expected): - pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) - assert pred == expected \ No newline at end of file