RajanPutty · mayuris-00 · Apr 9, 2026 · Apr 10, 2026 · Apr 10, 2026 · Apr 10, 2026
diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/percentage/__init__.py b/nemo_text_processing/inverse_text_normalization/hi/data/percentage/__init__.py
diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/percentage/percent_symbol.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/percentage/percent_symbol.tsv
@@ -0,0 +1,4 @@
+%	प्रतिशत
+%	परसेंट
+%	फ़ीसदी
+%	फीसदी
diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/percentage.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/percentage.py
@@ -0,0 +1,62 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.inverse_text_normalization.hi.graph_utils import (
+    INPUT_LOWER_CASED,
+    GraphFst,
+    delete_space,
+)
+from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path
+
+
+class PercentageFst(GraphFst):
+    """
+    Finite state transducer for classifying percentages, e.g.
+        बीस प्रतिशत -> percentage { integer: "२०" percent: "%" }
+
+    Args:
+        cardinal: cardinal tagger whose ``graph_no_exception`` graph supplies the number part
+        input_case: present in the signature but not read by this grammar
+    """
+
+    def __init__(self, cardinal, input_case: str = INPUT_LOWER_CASED):
+        super().__init__(name="percentage", kind="classify")
+
+        # TSV rows are "%<TAB>word"; inverting the mapping gives word → % (प्रतिशत → %)
+        percent_graph = pynini.string_file(
+            get_abs_path("data/percentage/percent_symbol.tsv")
+        ).invert()
+
+        # reuse number logic (बीस → २०, पाँच सौ → ५००)
+        integer_graph = cardinal.graph_no_exception
+
+        # match: <number> + <percent word>
+        # and emit the two tagged fields: integer: "<digits>" percent: "%"
+        final_graph = (
+            pynutil.insert('integer: "')
+            + integer_graph
+            + pynutil.insert('"')
+            + delete_space
+            + pynutil.insert(' percent: "')
+            + percent_graph
+            + pynutil.insert('"')
+        )
+
+        # wrap as: percentage { ... }
+        final_graph = self.add_tokens(final_graph)
+
+        self.fst = final_graph.optimize()
diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py
@@ -32,6 +32,7 @@
 from nemo_text_processing.inverse_text_normalization.hi.taggers.measure import MeasureFst
 from nemo_text_processing.inverse_text_normalization.hi.taggers.money import MoneyFst
 from nemo_text_processing.inverse_text_normalization.hi.taggers.ordinal import OrdinalFst
+from nemo_text_processing.inverse_text_normalization.hi.taggers.percentage import PercentageFst
 from nemo_text_processing.inverse_text_normalization.hi.taggers.punctuation import PunctuationFst
 from nemo_text_processing.inverse_text_normalization.hi.taggers.telephone import TelephoneFst
 from nemo_text_processing.inverse_text_normalization.hi.taggers.time import TimeFst
@@ -79,6 +80,8 @@ def __init__(
             decimal_graph = decimal.fst
             fraction = FractionFst(cardinal)
             fraction_graph = fraction.fst
+            percentage = PercentageFst(cardinal)
+            percentage_graph = percentage.fst
             date = DateFst(cardinal)
             date_graph = date.fst
             time = TimeFst()
@@ -98,6 +101,7 @@ def __init__(
                 | pynutil.add_weight(ordinal_graph, 1.1)
                 | pynutil.add_weight(decimal_graph, 1.1)
                 | pynutil.add_weight(fraction_graph, 1.1)
+                | pynutil.add_weight(percentage_graph, 1.1)
                 | pynutil.add_weight(date_graph, 1.1)
                 | pynutil.add_weight(time_graph, 1.1)
                 | pynutil.add_weight(measure_graph, 1.1)
@@ -120,4 +124,4 @@ def __init__(
 
             if far_file:
                 generator_main(far_file, {"tokenize_and_classify": self.fst})
-                logging.info(f"ClassifyFst grammars are saved to {far_file}.")
+                logging.info(f"ClassifyFst grammars are saved to {far_file}.")
diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/percentage.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/percentage.py
@@ -0,0 +1,60 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.inverse_text_normalization.hi.graph_utils import (
+    NEMO_NOT_QUOTE,
+    GraphFst,
+    delete_space,
+)
+
+
+class PercentageFst(GraphFst):
+    """
+    Finite state transducer for verbalizing percentages, e.g.
+        percentage { integer: "२०" percent: "%" } -> २०%
+    """
+
+    def __init__(self):
+        super().__init__(name="percentage", kind="verbalize")
+
+        # extract number part (remove labels and quotes)
+        # example: integer: "२०" → २०
+        integer_part = (
+            pynutil.delete("integer:")
+            + delete_space
+            + pynutil.delete("\"")
+            + pynini.closure(NEMO_NOT_QUOTE, 1)
+            + pynutil.delete("\"")
+        )
+
+        # extract percent symbol
+        # example: percent: "%" → %
+        percent_part = (
+            pynutil.delete("percent:")
+            + delete_space
+            + pynutil.delete("\"")
+            + pynini.closure(NEMO_NOT_QUOTE, 1)
+            + pynutil.delete("\"")
+        )
+
+        # combine both → २०% (the space separating the two fields is dropped)
+        graph = integer_part + delete_space + percent_part
+
+        # remove outer wrapper: percentage { ... }
+        delete_tokens = self.delete_tokens(graph)
+
+        self.fst = delete_tokens.optimize()
diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py
@@ -21,6 +21,7 @@
 from nemo_text_processing.inverse_text_normalization.hi.verbalizers.measure import MeasureFst
 from nemo_text_processing.inverse_text_normalization.hi.verbalizers.money import MoneyFst
 from nemo_text_processing.inverse_text_normalization.hi.verbalizers.ordinal import OrdinalFst
+from nemo_text_processing.inverse_text_normalization.hi.verbalizers.percentage import PercentageFst
 from nemo_text_processing.inverse_text_normalization.hi.verbalizers.telephone import TelephoneFst
 from nemo_text_processing.inverse_text_normalization.hi.verbalizers.time import TimeFst
 from nemo_text_processing.inverse_text_normalization.hi.verbalizers.whitelist import WhiteListFst
@@ -38,15 +39,16 @@ def __init__(self):
         super().__init__(name="verbalize", kind="verbalize")
         cardinal = CardinalFst()
         cardinal_graph = cardinal.fst
-        ordinal_graph = OrdinalFst().fst
-        decimal = DecimalFst()
+        ordinal_graph = OrdinalFst().fst          # takes nothing
+        decimal = DecimalFst()                     # takes nothing
         decimal_graph = decimal.fst
-        fraction_graph = FractionFst().fst
-        date_graph = DateFst().fst
-        time_graph = TimeFst().fst
-        measure_graph = MeasureFst(cardinal, decimal).fst
-        money_graph = MoneyFst(cardinal, decimal).fst
-        telephone_graph = TelephoneFst(cardinal).fst
+        fraction_graph = FractionFst().fst         # takes nothing
+        percentage_graph = PercentageFst().fst     # takes nothing
+        date_graph = DateFst().fst                 # takes nothing
+        time_graph = TimeFst().fst                 # takes nothing
+        measure_graph = MeasureFst(cardinal, decimal).fst   # takes cardinal, decimal
+        money_graph = MoneyFst(cardinal, decimal).fst       # takes cardinal, decimal
+        telephone_graph = TelephoneFst(cardinal).fst        # takes cardinal
         word_graph = WordFst().fst
         whitelist_graph = WhiteListFst().fst
 
@@ -57,10 +59,11 @@ def __init__(self):
             | ordinal_graph
             | decimal_graph
             | fraction_graph
+            | percentage_graph
             | date_graph
             | time_graph
             | measure_graph
             | money_graph
             | telephone_graph
         )
-        self.fst = graph
+        self.fst = graph
diff --git a/nemo_text_processing/text_normalization/utils_audio_based.py b/nemo_text_processing/text_normalization/utils_audio_based.py
@@ -14,7 +14,10 @@
 
 from typing import Dict
 
-from cdifflib import CSequenceMatcher
+try:
+    from cdifflib import CSequenceMatcher
+except ImportError:
+    from difflib import SequenceMatcher as CSequenceMatcher
 
 from nemo_text_processing.utils.logging import logger
 

diff --git a/test_percentage.py b/test_percentage.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# NOTE(review): this repo-root copy duplicates tests/nemo_text_processing/hi/test_percentage.py and
+# looks accidental; prefer dropping it from the patch rather than keeping two copies.
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+
+# A top-level module has no parent package, so `from ..utils import ...` raises ImportError here.
+# The absolute path assumes the repository root is importable — TODO confirm.
+from tests.nemo_text_processing.utils import CACHE_DIR, parse_test_case_file
+
+
+class TestPercentage:
+    """Runs the Hindi percentage inverse-normalization cases listed in test_cases_percentage.txt."""
+
+    # Created once in the class body and reused by every parameterized case.
+    inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False)
+
+    @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_percentage.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm(self, test_input, expected):
+        pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
+        assert pred == expected
diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_percentage.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_percentage.txt
@@ -0,0 +1,12 @@
+बीस प्रतिशत~२०%
+पचास प्रतिशत~५०%
+दस प्रतिशत~१०%
+सौ प्रतिशत~१००%
+पच्चीस प्रतिशत~२५%
+पाँच प्रतिशत~५%
+तीन प्रतिशत~३%
+सत्तर परसेंट~७०%
+एक प्रतिशत~१%
+शून्य प्रतिशत~०%
+पाँच सौ फ़ीसदी~५००%
+तेरह प्रतिशत~१३%
diff --git a/tests/nemo_text_processing/hi/test_percentage.py b/tests/nemo_text_processing/hi/test_percentage.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+
+from ..utils import CACHE_DIR, parse_test_case_file
+
+
+class TestPercentage:
+    """Runs the Hindi percentage inverse-normalization cases listed in test_cases_percentage.txt."""
+
+    # Created once in the class body and reused by every parameterized case.
+    inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False)
+
+    @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_percentage.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm(self, test_input, expected):
+        pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
+        assert pred == expected
diff --git a/tests/nemo_text_processing/utils.py b/tests/nemo_text_processing/utils.py
@@ -39,7 +39,7 @@ def parse_test_case_file(file_name: str):
     Prepares tests pairs for ITN and TN tests
     """
     test_pairs = []
-    with open(os.path.dirname(os.path.abspath(__file__)) + os.path.sep + file_name, 'r') as f:
+    with open(os.path.dirname(os.path.abspath(__file__)) + os.path.sep + file_name, 'r', encoding='utf-8') as f:
         for line in f:
             components = line.strip("\n").split("~")
             spoken = components[0]