Skip to content

Commit 0279602

Browse files
leading zero and formal/informal year fixes (#378)
* leading zero and formal/informal year fixes
  Signed-off-by: shreeshd-tn <shreeshd@nvidia.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* Jenkins date update
  Signed-off-by: shreeshd-tn <shreeshd@nvidia.com>
---------
Signed-off-by: shreeshd-tn <shreeshd@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 120db28 commit 0279602

7 files changed

Lines changed: 104 additions & 13 deletions

File tree

Jenkinsfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ pipeline {
2626
HY_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-0'
2727
MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1'
2828
JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1'
29-
HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/01-12-26-0'
29+
HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/01-16-26-0'
3030
DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0'
3131
}
3232
stages {
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
yr वर्ष

nemo_text_processing/text_normalization/hi/taggers/cardinal.py

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import pynini
1616
from pynini.lib import pynutil
1717

18-
from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst, insert_space
18+
from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_HI_DIGIT, GraphFst, insert_space
1919
from nemo_text_processing.text_normalization.hi.utils import get_abs_path
2020

2121

@@ -41,6 +41,11 @@ def __init__(self, deterministic: bool = True, lm: bool = False):
4141
self.zero = zero
4242
self.teens_and_ties = teens_and_ties
4343

44+
# Single digit graph for digit-by-digit reading
45+
# e.g., "०७३" -> "शून्य सात तीन"
46+
single_digit_graph = digit | zero
47+
self.single_digits_graph = single_digit_graph + pynini.closure(insert_space + single_digit_graph)
48+
4449
def create_graph_suffix(digit_graph, suffix, zeros_counts):
4550
zero = pynutil.add_weight(pynutil.delete("०"), -0.1)
4651
if zeros_counts == 0:
@@ -298,13 +303,8 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph):
298303
graph_ten_shankhs |= create_larger_number_graph(teens_and_ties, suffix_shankhs, 0, graph_ten_padmas)
299304
graph_ten_shankhs.optimize()
300305

301-
# Only match exactly 2 digits to avoid interfering with telephone numbers, decimals, etc.
302-
# e.g., "०५" -> "शून्य पाँच"
303-
single_digit = digit | zero
304-
graph_leading_zero = zero + insert_space + single_digit
305-
graph_leading_zero = pynutil.add_weight(graph_leading_zero, 0.5)
306-
307-
final_graph = (
306+
# Graph without leading zeros - used by other taggers like ordinal, decimal and measure
307+
graph_without_leading_zeros = (
308308
digit
309309
| zero
310310
| teens_and_ties
@@ -325,8 +325,18 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph):
325325
| graph_ten_padmas
326326
| graph_shankhs
327327
| graph_ten_shankhs
328-
| graph_leading_zero
329328
)
329+
self.graph_without_leading_zeros = graph_without_leading_zeros.optimize()
330+
331+
# Handle numbers with leading zeros by reading digit-by-digit
332+
# e.g., "०७३" -> "शून्य सात तीन", "००५" -> "शून्य शून्य पाँच"
333+
cardinal_with_leading_zeros = pynini.compose(
334+
pynini.accep("०") + pynini.closure(NEMO_HI_DIGIT), self.single_digits_graph
335+
)
336+
cardinal_with_leading_zeros = pynutil.add_weight(cardinal_with_leading_zeros, 0.5)
337+
338+
# Full graph including leading zeros - for standalone cardinal matching
339+
final_graph = graph_without_leading_zeros | cardinal_with_leading_zeros
330340

331341
optional_minus_graph = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)
332342

nemo_text_processing/text_normalization/hi/taggers/decimal.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
5959
super().__init__(name="decimal", kind="classify", deterministic=deterministic)
6060

6161
graph_digit = cardinal.digit | cardinal.zero
62-
cardinal_graph = cardinal.final_graph
62+
cardinal_graph = cardinal.graph_without_leading_zeros
6363

6464
self.graph = graph_digit + pynini.closure(insert_space + graph_digit).optimize()
6565

nemo_text_processing/text_normalization/hi/taggers/measure.py

Lines changed: 71 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,14 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, ordinal: GraphFst, inp
218218
decimal_graph = decimal_integers + point + insert_space + decimal.graph_fractional
219219
unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv"))
220220

221+
# Year unit variants for formal/informal handling
222+
year_informal = pynini.string_map([("yr", "साल")])
223+
year_formal = pynini.string_file(get_abs_path("data/measure/unit_year_formal.tsv"))
224+
225+
# All units EXCEPT year
226+
unit_inputs_except_yr = pynini.difference(pynini.project(unit_graph, "input"), pynini.accep("yr"))
227+
unit_graph_no_year = pynini.compose(unit_inputs_except_yr, unit_graph)
228+
221229
# Load quarterly units from separate files: map (FST) and list (FSA)
222230
quarterly_units_map = pynini.string_file(get_abs_path("data/measure/quarterly_units_map.tsv"))
223231
quarterly_units_list = pynini.string_file(get_abs_path("data/measure/quarterly_units_list.tsv"))
@@ -243,7 +251,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, ordinal: GraphFst, inp
243251
unit = (
244252
pynutil.insert(NEMO_SPACE)
245253
+ pynutil.insert("units: \"")
246-
+ unit_graph
254+
+ unit_graph_no_year
247255
+ pynutil.insert("\"")
248256
+ pynutil.insert(NEMO_SPACE)
249257
)
@@ -255,6 +263,29 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, ordinal: GraphFst, inp
255263
+ pynutil.insert(NEMO_SPACE)
256264
)
257265

266+
# Year-specific unit wrappers
267+
unit_year_informal = (
268+
pynutil.insert(NEMO_SPACE)
269+
+ pynutil.insert("units: \"")
270+
+ year_informal
271+
+ pynutil.insert("\"")
272+
+ pynutil.insert(NEMO_SPACE)
273+
)
274+
unit_year_formal = (
275+
pynutil.insert(NEMO_SPACE)
276+
+ pynutil.insert("units: \"")
277+
+ year_formal
278+
+ pynutil.insert("\"")
279+
+ pynutil.insert(NEMO_SPACE)
280+
)
281+
282+
# Cardinal >= 1000 -> formal year (वर्ष)
283+
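# NOTE: graph_without_leading_zeros accepts every cardinal (single digits through shankhs), not only values >= 1000; numbers < 1000 are routed to the informal branch by the lower weight given to graph_cardinal_year_informal in the final union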
284+
cardinal_large = cardinal.graph_without_leading_zeros
285+
286+
# Cardinal < 1000 -> informal year (साल)
287+
cardinal_small = cardinal.zero | cardinal.digit | cardinal.teens_and_ties | cardinal.graph_hundreds
288+
258289
symbol_graph = pynini.string_map(
259290
[
260291
(LOWERCASE_X, HI_BY),
@@ -354,6 +385,42 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, ordinal: GraphFst, inp
354385
+ unit
355386
)
356387

388+
# Large numbers (>=1000) + yr -> formal (वर्ष)
389+
graph_cardinal_year_formal = (
390+
pynutil.insert("cardinal { ")
391+
+ optional_graph_negative
392+
+ pynutil.insert("integer: \"")
393+
+ cardinal_large
394+
+ pynutil.insert("\"")
395+
+ pynutil.insert(NEMO_SPACE)
396+
+ pynutil.insert("}")
397+
+ delete_space
398+
+ unit_year_formal
399+
)
400+
401+
# Small numbers (<1000) + yr -> informal (साल)
402+
graph_cardinal_year_informal = (
403+
pynutil.insert("cardinal { ")
404+
+ optional_graph_negative
405+
+ pynutil.insert("integer: \"")
406+
+ cardinal_small
407+
+ pynutil.insert("\"")
408+
+ pynutil.insert(NEMO_SPACE)
409+
+ pynutil.insert("}")
410+
+ delete_space
411+
+ unit_year_informal
412+
)
413+
414+
# Regular decimals (e.g., 16.07) + yr -> formal (वर्ष)
415+
graph_decimal_year_formal = (
416+
pynutil.insert("decimal { ")
417+
+ optional_graph_negative
418+
+ decimal_graph
419+
+ pynutil.insert(" }")
420+
+ delete_space
421+
+ unit_year_formal
422+
)
423+
357424
# Handling cardinal clubbed with symbol as single token
358425
graph_exceptions = (
359426
pynutil.insert("cardinal { ")
@@ -381,7 +448,10 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, ordinal: GraphFst, inp
381448

382449
graph = (
383450
pynutil.add_weight(graph_decimal, 0.1)
451+
| pynutil.add_weight(graph_decimal_year_formal, 0.1)
384452
| pynutil.add_weight(graph_cardinal, 0.1)
453+
| pynutil.add_weight(graph_cardinal_year_formal, 0.1)
454+
| pynutil.add_weight(graph_cardinal_year_informal, -0.1) # Higher priority for small numbers
385455
| pynutil.add_weight(graph_exceptions, 0.1)
386456
| pynutil.add_weight(graph_dedh_dhai, -0.2)
387457
| pynutil.add_weight(graph_savva, -0.1)

tests/nemo_text_processing/hi/data_text_normalization/test_cases_cardinal.txt

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,4 +144,7 @@
144144
५१०२२३४५५६७~इक्यावन अरब दो करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ
145145
२ पॉइंट्स १२ गोल~दो पॉइंट्स बारह गोल
146146
०५~शून्य पाँच
147-
०१~शून्य एक
147+
०१~शून्य एक
148+
०७३~शून्य सात तीन
149+
०००१~शून्य शून्य शून्य एक
150+
०००~शून्य शून्य शून्य

tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,3 +64,10 @@
6464
५x५ का सोफ़ा~पाँच बाई पाँच का सोफ़ा
6565
२x२ रुबिक्स क्यूब~दो बाई दो रुबिक्स क्यूब
6666
१३x१३ का घर~तेरह बाई तेरह का घर
67+
१००० yr~एक हज़ार वर्ष
68+
९९९९ yr~नौ हज़ार नौ सौ निन्यानबे वर्ष
69+
१६.०७ yr~सोलह दशमलव शून्य सात वर्ष
70+
५ yr~पाँच साल
71+
१.५ yr~डेढ़ साल
72+
२.५ yr~ढाई साल
73+
३.५ yr~साढ़े तीन साल

0 commit comments

Comments
 (0)