NVIDIA
diff --git a/‎Jenkinsfile‎
Lines changed: 1 addition & 1 deletion b/‎Jenkinsfile‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎nemo_text_processing/text_normalization/hi/data/telephone/__init__.py‎
Lines changed: 13 additions & 0 deletions b/‎nemo_text_processing/text_normalization/hi/data/telephone/__init__.py‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎nemo_text_processing/text_normalization/hi/data/telephone/credit_context.tsv‎
Lines changed: 3 additions & 0 deletions b/‎nemo_text_processing/text_normalization/hi/data/telephone/credit_context.tsv‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎nemo_text_processing/text_normalization/hi/data/telephone/landline_context.tsv‎
Lines changed: 5 additions & 0 deletions b/‎nemo_text_processing/text_normalization/hi/data/telephone/landline_context.tsv‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎nemo_text_processing/text_normalization/hi/data/telephone/mobile_context.tsv‎
Lines changed: 4 additions & 0 deletions b/‎nemo_text_processing/text_normalization/hi/data/telephone/mobile_context.tsv‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎nemo_text_processing/text_normalization/hi/data/telephone/number.tsv‎
Lines changed: 10 additions & 0 deletions b/‎nemo_text_processing/text_normalization/hi/data/telephone/number.tsv‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎nemo_text_processing/text_normalization/hi/data/telephone/pincode_context.tsv‎
Lines changed: 4 additions & 0 deletions b/‎nemo_text_processing/text_normalization/hi/data/telephone/pincode_context.tsv‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎nemo_text_processing/text_normalization/hi/taggers/telephone.py‎
Lines changed: 227 additions & 0 deletions b/‎nemo_text_processing/text_normalization/hi/taggers/telephone.py‎
Lines changed: 227 additions & 0 deletions
diff --git a/‎nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py‎
Lines changed: 13 additions & 3 deletions b/‎nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py‎
Lines changed: 13 additions & 3 deletions
@@ -27,7 +27,7 @@ pipeline {
     HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0'
     MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1'
     JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1'
-    HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-06-25-0'
+    HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-28-25-0'
     DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
   }
   stages {
 
@@ -0,0 +1,13 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
@@ -0,0 +1,3 @@
+नंबर
+कार्ड
+क्रेडिट
@@ -0,0 +1,5 @@
+नंबर
+मोबाइल
+फोन
+लैंडलाइन
+कॉल
@@ -0,0 +1,4 @@
+नंबर
+मोबाइल
+फोन
+कॉल
@@ -0,0 +1,10 @@
+0	शून्य
+1	एक
+2	दो
+3	तीन
+4	चार
+5	पाँच
+6	छह
+7	सात
+8	आठ
+9	नौ
@@ -0,0 +1,4 @@
+नंबर
+पिन
+कोड
+पिनकोड
@@ -0,0 +1,227 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.hi.graph_utils import (
+    NEMO_CHAR,
+    NEMO_DIGIT,
+    NEMO_HI_DIGIT,
+    NEMO_SPACE,
+    NEMO_WHITE_SPACE,
+    GraphFst,
+    delete_space,
+    insert_space,
+)
+from nemo_text_processing.text_normalization.hi.utils import get_abs_path
+
+# Removes a single zero written either as an ASCII or as a Devanagari digit.
+delete_zero = pynutil.delete(pynini.union("0", "०"))
+delete_zero_optional = pynini.closure(delete_zero, 0, 1)
+# Emits the word 'शून्य' followed by `insert_space`.
+insert_shunya = pynutil.insert('शून्य') + insert_space
+
+
+def _load_tsv(relative_path):
+    """Compile the TSV file found at `relative_path` (resolved through `get_abs_path`) into an FST."""
+    return pynini.string_file(get_abs_path(relative_path))
+
+
+# Digit tables used to read numbers out one digit at a time.
+# NOTE(review): `digit.tsv` and `zero.tsv` presumably hold the Devanagari digits -- confirm against the data files.
+digit_to_word = _load_tsv("data/telephone/number.tsv")
+digits = _load_tsv("data/numbers/digit.tsv")
+zero = _load_tsv("data/numbers/zero.tsv")
+
+# Context keywords, one table per supported number type.
+mobile_context = _load_tsv("data/telephone/mobile_context.tsv")
+landline_context = _load_tsv("data/telephone/landline_context.tsv")
+credit_context = _load_tsv("data/telephone/credit_context.tsv")
+pincode_context = _load_tsv("data/telephone/pincode_context.tsv")
+
+
+def generate_mobile(context_keywords):
+    """
+    Build the tagger graph for mobile numbers, written with or without a "+" country code.
+
+    Args:
+        context_keywords: FST of context words, forwarded to `get_context`
+
+    Returns:
+        FST that emits a `country_code` field (only on the "+" branch), a `number_part`
+        field and an optional `extension` field
+    """
+    context_before, context_after = get_context(context_keywords)
+
+    # A single digit rewritten by one of the digit tables, followed by `insert_space`.
+    spoken_digit = (digit_to_word | digits | zero) + insert_space
+    close_field = pynutil.insert("\" ")
+
+    # The number itself has to open with 6, 7, 8 or 9 (ASCII or Devanagari) ...
+    leading = pynini.union("६", "७", "८", "९", "6", "7", "8", "9")
+    mobile_start_digit = pynini.union(leading @ digits, leading @ digit_to_word)
+    # ... and continue with nine or more further digits.
+    number_part = mobile_start_digit + insert_space + pynini.closure(spoken_digit, 9)
+    # Both branches end the same way: digits, optional trailing context, closing quote.
+    number_tail = number_part + context_after + close_field + delete_space
+
+    # "+" is read as "प्लस" and is followed by a one- to three-digit country code.
+    country_code = (
+        pynutil.insert("country_code: \"")
+        + context_before
+        + pynini.cross("+", "प्लस")
+        + insert_space
+        + pynini.closure(spoken_digit, 1, 3)
+        + close_field
+        + pynini.closure(delete_space, 0, 1)
+    )
+    number_with_country = country_code + pynutil.insert("number_part: \"") + number_tail
+
+    # Without a country code: an optional leading zero is dropped and 'शून्य' is always emitted first.
+    number_without_country = (
+        pynutil.insert("number_part: \"") + context_before + delete_zero_optional + insert_shunya + number_tail
+    )
+
+    # Optional extension of one to three digits, possibly followed by trailing context.
+    extension_optional = pynini.closure(
+        pynutil.insert("extension: \"")
+        + pynini.closure(spoken_digit, 1, 3)
+        + context_after
+        + close_field
+        + delete_space,
+        0,
+        1,
+    )
+
+    return (number_with_country | number_without_country) + extension_optional
+
+
+def get_landline(std_length, context_keywords):
+    """
+    Build the landline graph for a single STD-code length.
+
+    Args:
+        std_length: number of STD-code digits expected after the optional leading zero
+        context_keywords: FST of context words, forwarded to `get_context`
+
+    Returns:
+        FST that emits a `number_part` field holding the STD code and the local number
+    """
+    context_before, context_after = get_context(context_keywords)
+
+    # A single digit rewritten by one of the digit tables, followed by `insert_space`.
+    spoken_digit = (digit_to_word | digits | zero) + insert_space
+
+    # STD code: optional leading zero removed, 'शून्य' always emitted, then exactly `std_length` digits.
+    std_code_graph = delete_zero_optional + insert_shunya + pynini.closure(spoken_digit, std_length, std_length)
+
+    # The same code may also be written inside parentheses, which are dropped.
+    std_code_in_brackets = (
+        delete_zero_optional
+        + delete_space
+        + pynutil.delete("(")
+        + pynini.closure(delete_space, 0, 1)
+        + std_code_graph
+        + pynini.closure(delete_space, 0, 1)
+        + pynutil.delete(")")
+    )
+    std_part = std_code_graph | std_code_in_brackets
+
+    # At most one "-" or "." between the STD code and the local number is removed.
+    separator_optional = pynini.closure(pynutil.delete(pynini.union("-", ".")), 0, 1)
+
+    # Local number: the first digit is limited to 2, 3, 4 or 6; the remaining count is chosen
+    # so that STD digits plus local digits always add up to ten.
+    first_digits = pynini.union("२", "३", "४", "६", "2", "3", "4", "6")
+    landline_start_digit = pynini.union(first_digits @ digits, first_digits @ digit_to_word)
+    remaining = 9 - std_length
+    landline_graph = landline_start_digit + insert_space + pynini.closure(spoken_digit, remaining, remaining)
+
+    return (
+        pynutil.insert("number_part: \"")
+        + context_before
+        + std_part
+        + separator_optional
+        + delete_space
+        + landline_graph
+        + context_after
+        + pynutil.insert("\" ")
+    )
+
+
+def generate_landline(context_keywords):
+    """
+    Union the landline graphs for every supported STD-code length (two to seven digits).
+
+    Args:
+        context_keywords: FST of context words, forwarded to `get_landline`
+
+    Returns:
+        FST accepting a landline number with any of the supported STD-code lengths
+    """
+    graph = get_landline(2, context_keywords)
+    for std_length in range(3, 8):
+        graph = graph | get_landline(std_length, context_keywords)
+    return graph
+
+
+def get_context(keywords: pynini.Fst):
+    """
+    Build the optional context acceptors placed before and after a number.
+
+    Args:
+        keywords: FST accepting the context keywords. Callers in this module pass graphs
+            compiled with `pynini.string_file`, not a Python list.
+
+    Returns:
+        A `(before, after)` pair of optimized FSTs; each one also accepts the empty string,
+        so the context is never required.
+    """
+    all_digits = pynini.union(NEMO_HI_DIGIT, NEMO_DIGIT)
+
+    # A word is a non-empty run of NEMO_CHAR symbols that are neither digits nor whitespace,
+    # terminated by NEMO_SPACE.
+    non_digit_char = pynini.difference(NEMO_CHAR, pynini.union(all_digits, NEMO_WHITE_SPACE))
+    word = pynini.closure(non_digit_char, 1) + pynini.accep(NEMO_SPACE)
+
+    # Up to five such words may sit between the keyword and the number.
+    window = pynini.closure(word, 0, 5)
+
+    # Leading context: keyword, NEMO_SPACE, then the window -- all passed through unchanged.
+    before = pynini.closure(keywords + pynini.accep(NEMO_SPACE) + window, 0, 1)
+
+    # Trailing context: the NEMO_SPACE that follows the number is deleted, then window and keyword.
+    after = pynini.closure(pynutil.delete(NEMO_SPACE) + window + keywords, 0, 1)
+
+    return before.optimize(), after.optimize()
+
+
+def generate_credit(context_keywords):
+    """
+    Build the graph for credit-card digit sequences read out one digit at a time.
+
+    Args:
+        context_keywords: FST of context words, forwarded to `get_context`
+
+    Returns:
+        FST that emits a `number_part` field containing the spoken digits and any context
+    """
+    context_before, context_after = get_context(context_keywords)
+
+    spoken_digit = (digit_to_word | digits | zero) + insert_space
+    # Only a lower bound is given, so runs of four or more digits are accepted.
+    digit_run = pynini.closure(spoken_digit, 4)
+
+    opening = pynutil.insert("number_part: \"")
+    closing = pynutil.insert("\" ")
+    return opening + context_before + digit_run + context_after + closing + delete_space
+
+
+def generate_pincode(context_keywords):
+    """
+    Build the graph for PIN codes read out one digit at a time.
+
+    Args:
+        context_keywords: FST of context words, forwarded to `get_context`
+
+    Returns:
+        FST that emits a `number_part` field containing the spoken digits and any context
+    """
+    context_before, context_after = get_context(context_keywords)
+
+    spoken_digit = (digit_to_word | digits | zero) + insert_space
+    # Only a lower bound is given, so runs of six or more digits are accepted.
+    digit_run = pynini.closure(spoken_digit, 6)
+
+    opening = pynutil.insert("number_part: \"")
+    closing = pynutil.insert("\" ")
+    return opening + context_before + digit_run + context_after + closing + delete_space
+
+
+class TelephoneFst(GraphFst):
+    """
+    Finite state transducer for tagging telephone numbers, e.g.
+        ९१५७११४००७ -> telephone { number_part: "शून्य नौ एक पाँच सात एक एक चार शून्य शून्य सात" }
+        +९१ ९२१०५१५६०६ -> telephone { country_code: "प्लस नौ एक", number_part: "नौ दो एक शून्य पाँच एक पाँच छह शून्य छह" }
+        १३७४-३०९९८८ -> telephone { number_part: "शून्य एक तीन सात चार तीन शून्य नौ नौ आठ आठ" }
+
+    The same class also tags credit-card and PIN-code digit sequences; all four graphs are
+    combined into one weighted union. The constructor takes no arguments.
+    """
+
+    def __init__(self):
+        super().__init__(name="telephone", kind="classify")
+
+        # Each sub-graph carries its own weight: mobile 0.7, landline 0.8, credit card 0.9, PIN code 1.
+        # NOTE(review): presumably the lower weight wins when several graphs match -- confirm.
+        weighted_graphs = [
+            pynutil.add_weight(generate_mobile(mobile_context), 0.7),
+            pynutil.add_weight(generate_landline(landline_context), 0.8),
+            pynutil.add_weight(generate_credit(credit_context), 0.9),
+            pynutil.add_weight(generate_pincode(pincode_context), 1),
+        ]
+        graph = pynini.union(*weighted_graphs)
+
+        self.final = graph.optimize()
+        self.fst = self.add_tokens(self.final)
@@ -20,6 +20,7 @@
 from pynini.lib import pynutil
 
 from nemo_text_processing.text_normalization.hi.graph_utils import (
+    NEMO_SPACE,
     NEMO_WHITE_SPACE,
     GraphFst,
     delete_extra_space,
@@ -33,6 +34,7 @@
 from nemo_text_processing.text_normalization.hi.taggers.measure import MeasureFst
 from nemo_text_processing.text_normalization.hi.taggers.money import MoneyFst
 from nemo_text_processing.text_normalization.hi.taggers.punctuation import PunctuationFst
+from nemo_text_processing.text_normalization.hi.taggers.telephone import TelephoneFst
 from nemo_text_processing.text_normalization.hi.taggers.time import TimeFst
 from nemo_text_processing.text_normalization.hi.taggers.whitelist import WhiteListFst
 from nemo_text_processing.text_normalization.hi.taggers.word import WordFst
@@ -123,6 +125,11 @@ def __init__(
             punct_graph = punctuation.fst
             logging.debug(f"punct: {time.time() - start_time: .2f}s -- {punct_graph.num_states()} nodes")
 
+            start_time = time.time()
+            telephone = TelephoneFst()
+            telephone_graph = telephone.fst
+            logging.debug(f"telephone: {time.time() - start_time: .2f}s -- {telephone_graph.num_states()} nodes")
+
             classify = (
                 pynutil.add_weight(whitelist_graph, 1.01)
                 | pynutil.add_weight(cardinal_graph, 1.1)
@@ -132,6 +139,7 @@ def __init__(
                 | pynutil.add_weight(time_graph, 1.1)
                 | pynutil.add_weight(measure_graph, 1.1)
                 | pynutil.add_weight(money_graph, 1.1)
+                | pynutil.add_weight(telephone_graph, 1.1)
             )
 
             start_time = time.time()
@@ -141,20 +149,22 @@ def __init__(
             punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }")
             punct = pynini.closure(
                 pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
-                | (pynutil.insert(" ") + punct),
+                | (pynutil.insert(NEMO_SPACE) + punct),
                 1,
             )
 
             classify |= pynutil.add_weight(word_graph, 100)
             token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
             token_plus_punct = (
-                pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct)
+                pynini.closure(punct + pynutil.insert(NEMO_SPACE))
+                + token
+                + pynini.closure(pynutil.insert(NEMO_SPACE) + punct)
             )
 
             graph = token_plus_punct + pynini.closure(
                 (
                     pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
-                    | (pynutil.insert(" ") + punct + pynutil.insert(" "))
+                    | (pynutil.insert(NEMO_SPACE) + punct + pynutil.insert(NEMO_SPACE))
                 )
                 + token_plus_punct
             )
Original file line number	Diff line number	Diff line change
`@@ -27,7 +27,7 @@ pipeline {`
`27`	`27`	`HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0'`
`28`	`28`	`MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1'`
`29`	`29`	`JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1'`
`30`		`- HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-06-25-0'`
	`30`	`+ HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-28-25-0'`
`31`	`31`	`DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'`
`32`	`32`	`}`
`33`	`33`	`stages {`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+नंबर`
	`2`	`+कार्ड`
	`3`	`+क्रेडिट`