Skip to content

Commit 68529fd

Browse files
Hindi TN 2.0 - Telephone class integration from staging branch (#320)
* telephone class integration (cherry picked from commit a7c9adf) Signed-off-by: shreeshd-tn <shreeshd@nvidia.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: shreeshd-tn <shreeshd@nvidia.com> * Updated date in Jenkins file to the PR creation date Signed-off-by: shreeshd-tn <shreeshd@nvidia.com> * Jenkins file date change Signed-off-by: shreeshd-tn <shreeshd@nvidia.com> * Trying today's date Signed-off-by: shreeshd-tn <shreeshd@nvidia.com> * improved country code coverage + some test cases Signed-off-by: shreeshd-tn <shreeshd@nvidia.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Ignore test generated files Signed-off-by: shreeshd-tn <shreeshd@nvidia.com> * Improved landline detection and added edge test cases for proper coverage Signed-off-by: shreeshd-tn <shreeshd@nvidia.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Deleted gitignore file Signed-off-by: shreeshd-tn <shreeshd@nvidia.com> --------- Signed-off-by: shreeshd-tn <shreeshd@nvidia.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 714c1cc commit 68529fd

File tree

14 files changed

+397
-8
lines changed

14 files changed

+397
-8
lines changed

Jenkinsfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ pipeline {
2727
HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0'
2828
MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1'
2929
JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1'
30-
HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-06-25-0'
30+
HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-28-25-0'
3131
DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
3232
}
3333
stages {
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
नंबर
2+
कार्ड
3+
क्रेडिट
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
नंबर
2+
मोबाइल
3+
फोन
4+
लैंडलाइन
5+
कॉल
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
नंबर
2+
मोबाइल
3+
फोन
4+
कॉल
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
0 शून्य
2+
1 एक
3+
2 दो
4+
3 तीन
5+
4 चार
6+
5 पाँच
7+
6 छह
8+
7 सात
9+
8 आठ
10+
9 नौ
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
नंबर
2+
पिन
3+
कोड
4+
पिनकोड
Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
import pynini
17+
from pynini.lib import pynutil
18+
19+
from nemo_text_processing.text_normalization.hi.graph_utils import (
20+
NEMO_CHAR,
21+
NEMO_DIGIT,
22+
NEMO_HI_DIGIT,
23+
NEMO_SPACE,
24+
NEMO_WHITE_SPACE,
25+
GraphFst,
26+
delete_space,
27+
insert_space,
28+
)
29+
from nemo_text_processing.text_normalization.hi.utils import get_abs_path
30+
31+
def _load_tsv(relative_path):
    """Compile the TSV file at ``relative_path`` (resolved through ``get_abs_path``) into an FST."""
    return pynini.string_file(get_abs_path(relative_path))


# Removes one zero, written either as ASCII "0" or Devanagari "०".
delete_zero = pynutil.delete(pynini.union("0", "०"))
# Same deletion, but the zero is allowed to be absent.
delete_zero_optional = pynini.closure(delete_zero, 0, 1)
# Emits the word "शून्य" followed by whatever `insert_space` emits.
insert_shunya = pynutil.insert('शून्य') + insert_space

# Digit mappings loaded from TSV files.
# NOTE(review): presumably number.tsv maps ASCII digits and digit.tsv / zero.tsv map
# Devanagari digits to Hindi words -- confirm against the data files.
digit_to_word = _load_tsv("data/telephone/number.tsv")
digits = _load_tsv("data/numbers/digit.tsv")
zero = _load_tsv("data/numbers/zero.tsv")

# Context keyword sets, one per number category handled by this module.
mobile_context = _load_tsv("data/telephone/mobile_context.tsv")
landline_context = _load_tsv("data/telephone/landline_context.tsv")
credit_context = _load_tsv("data/telephone/credit_context.tsv")
pincode_context = _load_tsv("data/telephone/pincode_context.tsv")
43+
44+
45+
def generate_mobile(context_keywords):
    """
    Build the tagger graph for mobile numbers.

    Two input shapes are accepted, each optionally followed by an extension of
    one to three digits:
        * "+" and one to three country-code digits, then the number;
        * the number alone, where an optional leading zero is removed and
          "शून्य" is always emitted in front of the number.
    The number itself starts with a digit from 6 to 9 (ASCII or Devanagari)
    and continues with nine or more further digits.

    Args:
        context_keywords: keyword FST forwarded to ``get_context`` to build the
            optional context accepted before and after the digits.

    Returns:
        FST that emits ``country_code``, ``number_part`` and ``extension``
        fields with every digit verbalized.
    """
    before, after = get_context(context_keywords)

    # One verbalized digit (any of the three digit mappings) plus a separator.
    spoken_digit = (digit_to_word | digits | zero) + insert_space

    # Field delimiters shared by the branches below.
    number_open = pynutil.insert("number_part: \"")
    field_close = pynutil.insert("\" ")

    # Only numbers whose first digit is 6-9 are treated as mobile numbers.
    leading = pynini.union("६", "७", "८", "९", "6", "7", "8", "9")
    first_digit = (leading @ digits) | (leading @ digit_to_word)
    subscriber = first_digit + insert_space + pynini.closure(spoken_digit, 9)

    # "+<1-3 digits>" prefix; any leading context words land inside this field.
    prefix = (
        pynutil.insert("country_code: \"")
        + before
        + pynini.cross("+", "प्लस")
        + insert_space
        + pynini.closure(spoken_digit, 1, 3)
        + field_close
        + pynini.closure(delete_space, 0, 1)
    )

    international = prefix + number_open + subscriber + after + field_close + delete_space

    national = (
        number_open
        + before
        + delete_zero_optional
        + insert_shunya
        + subscriber
        + after
        + field_close
        + delete_space
    )

    extension = pynini.closure(
        pynutil.insert("extension: \"")
        + pynini.closure(spoken_digit, 1, 3)
        + after
        + field_close
        + delete_space,
        0,
        1,
    )

    return (international | national) + extension
97+
98+
99+
def get_landline(std_length, context_keywords):
    """
    Build the tagger graph for a landline number with an STD code of fixed length.

    The accepted input is an STD code of exactly ``std_length`` digits (bare or
    wrapped in parentheses, with an optional leading zero removed and "शून्य"
    always emitted in front), an optional "-" or "." separator, and a local
    number of ``10 - std_length`` digits whose first digit is 2, 3, 4 or 6
    (ASCII or Devanagari).

    Args:
        std_length: number of digits in the STD code.
        context_keywords: keyword FST forwarded to ``get_context`` to build the
            optional context accepted before and after the digits.

    Returns:
        FST that emits a single ``number_part`` field with every digit verbalized.
    """
    before, after = get_context(context_keywords)

    # One verbalized digit (any of the three digit mappings) plus a separator.
    spoken_digit = (digit_to_word | digits | zero) + insert_space
    maybe_space = pynini.closure(delete_space, 0, 1)

    # STD code: optional zero dropped, "शून्य" emitted, then exactly std_length digits.
    std_code = delete_zero_optional + insert_shunya + pynini.closure(spoken_digit, std_length, std_length)

    # The same STD code written inside parentheses, e.g. "(...)".
    bracketed_std_code = (
        delete_zero_optional
        + delete_space
        + pynutil.delete("(")
        + maybe_space
        + std_code
        + maybe_space
        + pynutil.delete(")")
    )
    std_part = pynini.union(std_code, bracketed_std_code)

    # A single "-" or "." between the STD code and the local number is dropped.
    separator = pynini.closure(pynini.union(pynini.cross("-", ""), pynini.cross(".", "")), 0, 1)

    # Local number: restricted first digit, then enough digits to total ten with the STD code.
    leading = pynini.union("२", "३", "४", "६", "2", "3", "4", "6")
    first_digit = (leading @ digits) | (leading @ digit_to_word)
    remaining = 9 - std_length
    local_number = first_digit + insert_space + pynini.closure(spoken_digit, remaining, remaining)

    return (
        pynutil.insert("number_part: \"")
        + before
        + std_part
        + separator
        + delete_space
        + local_number
        + after
        + pynutil.insert("\" ")
    )
144+
145+
146+
def generate_landline(context_keywords):
    """
    Build the landline graph covering every supported STD code length.

    Args:
        context_keywords: keyword FST forwarded unchanged to ``get_landline``.

    Returns:
        Union of the ``get_landline`` graphs for STD code lengths 2 through 7.
    """
    # range(2, 8) yields the same six lengths, in the same order: 2, 3, 4, 5, 6, 7.
    per_length = [get_landline(std_length, context_keywords) for std_length in range(2, 8)]
    return pynini.union(*per_length)
157+
158+
159+
def get_context(keywords: pynini.Fst):
    """
    Build the optional context graphs that may surround a number.

    A "word" here is one or more characters that are neither digits (ASCII or
    Devanagari) nor whitespace, followed by ``NEMO_SPACE``. A context window is
    up to five such words.

    Args:
        keywords: FST accepting the context keywords. Callers in this module
            pass the result of ``pynini.string_file``, so this is an FST and
            not a Python list.

    Returns:
        A ``(before, after)`` pair of optimized FSTs, both optional:
            * ``before`` accepts a keyword, ``NEMO_SPACE`` and a context window;
            * ``after`` deletes one ``NEMO_SPACE``, then accepts a context
              window followed by a keyword.
    """
    all_digits = pynini.union(NEMO_HI_DIGIT, NEMO_DIGIT)

    # Context words must not contain digits, so they cannot swallow part of a number.
    non_digit_char = pynini.difference(NEMO_CHAR, pynini.union(all_digits, NEMO_WHITE_SPACE))
    word = pynini.closure(non_digit_char, 1) + pynini.accep(NEMO_SPACE)

    # At most five filler words may sit between the keyword and the number.
    window = pynini.closure(word, 0, 5)

    before = pynini.closure(keywords + pynini.accep(NEMO_SPACE) + window, 0, 1)

    after = pynini.closure(pynutil.delete(NEMO_SPACE) + window + keywords, 0, 1)

    return before.optimize(), after.optimize()
173+
174+
175+
def generate_credit(context_keywords):
    """
    Build the tagger graph for credit card numbers.

    Accepts four or more digits, optionally surrounded by the context produced
    by ``get_context``, and verbalizes them digit by digit.

    Args:
        context_keywords: keyword FST forwarded to ``get_context``.

    Returns:
        FST that emits a single ``number_part`` field.
    """
    before, after = get_context(context_keywords)

    # One verbalized digit (any of the three digit mappings) plus a separator.
    spoken_digit = (digit_to_word | digits | zero) + insert_space
    card_digits = pynini.closure(spoken_digit, 4)

    field_open = pynutil.insert("number_part: \"")
    field_close = pynutil.insert("\" ")

    return field_open + before + card_digits + after + field_close + delete_space
185+
186+
187+
def generate_pincode(context_keywords):
    """
    Build the tagger graph for pincodes.

    Accepts six or more digits, optionally surrounded by the context produced
    by ``get_context``, and verbalizes them digit by digit.

    Args:
        context_keywords: keyword FST forwarded to ``get_context``.

    Returns:
        FST that emits a single ``number_part`` field.
    """
    before, after = get_context(context_keywords)

    # One verbalized digit (any of the three digit mappings) plus a separator.
    spoken_digit = (digit_to_word | digits | zero) + insert_space
    pin_digits = pynini.closure(spoken_digit, 6)

    field_open = pynutil.insert("number_part: \"")
    field_close = pynutil.insert("\" ")

    return field_open + before + pin_digits + after + field_close + delete_space
197+
198+
199+
class TelephoneFst(GraphFst):
    """
    Finite state transducer for tagging telephone-like digit sequences (mobile
    numbers, landline numbers, credit card numbers and pincodes), e.g.
        ९१५७११४००७ -> telephone { number_part: "शून्य नौ एक पाँच सात एक एक चार शून्य शून्य सात" }
        +९१ ९२१०५१५६०६ -> telephone { country_code: "प्लस नौ एक", number_part: "नौ दो एक शून्य पाँच एक पाँच छह शून्य छह" }
        १३७४-३०९९८८ -> telephone { number_part: "शून्य एक तीन सात चार तीन शून्य नौ नौ आठ आठ" }

    The constructor takes no arguments. The combined, optimized graph is stored
    in ``self.final`` and its token-wrapped form in ``self.fst``.
    """

    def __init__(self):
        super().__init__(name="telephone", kind="classify")

        # Sub-graphs paired with their weights, built in the order listed.
        # NOTE(review): presumably the lower weight wins when several sub-graphs
        # match the same input -- confirm against the semiring in use.
        weighted_graphs = [
            (generate_mobile(mobile_context), 0.7),
            (generate_landline(landline_context), 0.8),
            (generate_credit(credit_context), 0.9),
            (generate_pincode(pincode_context), 1),
        ]

        combined = pynini.union(*[pynutil.add_weight(graph, weight) for graph, weight in weighted_graphs])

        self.final = combined.optimize()
        self.fst = self.add_tokens(self.final)

nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from pynini.lib import pynutil
2121

2222
from nemo_text_processing.text_normalization.hi.graph_utils import (
23+
NEMO_SPACE,
2324
NEMO_WHITE_SPACE,
2425
GraphFst,
2526
delete_extra_space,
@@ -33,6 +34,7 @@
3334
from nemo_text_processing.text_normalization.hi.taggers.measure import MeasureFst
3435
from nemo_text_processing.text_normalization.hi.taggers.money import MoneyFst
3536
from nemo_text_processing.text_normalization.hi.taggers.punctuation import PunctuationFst
37+
from nemo_text_processing.text_normalization.hi.taggers.telephone import TelephoneFst
3638
from nemo_text_processing.text_normalization.hi.taggers.time import TimeFst
3739
from nemo_text_processing.text_normalization.hi.taggers.whitelist import WhiteListFst
3840
from nemo_text_processing.text_normalization.hi.taggers.word import WordFst
@@ -123,6 +125,11 @@ def __init__(
123125
punct_graph = punctuation.fst
124126
logging.debug(f"punct: {time.time() - start_time: .2f}s -- {punct_graph.num_states()} nodes")
125127

128+
start_time = time.time()
129+
telephone = TelephoneFst()
130+
telephone_graph = telephone.fst
131+
logging.debug(f"telephone: {time.time() - start_time: .2f}s -- {telephone_graph.num_states()} nodes")
132+
126133
classify = (
127134
pynutil.add_weight(whitelist_graph, 1.01)
128135
| pynutil.add_weight(cardinal_graph, 1.1)
@@ -132,6 +139,7 @@ def __init__(
132139
| pynutil.add_weight(time_graph, 1.1)
133140
| pynutil.add_weight(measure_graph, 1.1)
134141
| pynutil.add_weight(money_graph, 1.1)
142+
| pynutil.add_weight(telephone_graph, 1.1)
135143
)
136144

137145
start_time = time.time()
@@ -141,20 +149,22 @@ def __init__(
141149
punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }")
142150
punct = pynini.closure(
143151
pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
144-
| (pynutil.insert(" ") + punct),
152+
| (pynutil.insert(NEMO_SPACE) + punct),
145153
1,
146154
)
147155

148156
classify |= pynutil.add_weight(word_graph, 100)
149157
token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
150158
token_plus_punct = (
151-
pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct)
159+
pynini.closure(punct + pynutil.insert(NEMO_SPACE))
160+
+ token
161+
+ pynini.closure(pynutil.insert(NEMO_SPACE) + punct)
152162
)
153163

154164
graph = token_plus_punct + pynini.closure(
155165
(
156166
pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
157-
| (pynutil.insert(" ") + punct + pynutil.insert(" "))
167+
| (pynutil.insert(NEMO_SPACE) + punct + pynutil.insert(NEMO_SPACE))
158168
)
159169
+ token_plus_punct
160170
)

0 commit comments

Comments
 (0)