Skip to content

Commit c51d04a

Browse files
committed
Future Implementations for classes - Measure, Money, and Date
Signed-off-by: Namrata Gachchi <ngachchi@nvidia.com>
1 parent 48ca992 commit c51d04a

16 files changed

Lines changed: 200 additions & 65 deletions

File tree

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
ई. पू. ईसा पूर्व
2+
ई. ईसवी
3+
तक तक

nemo_text_processing/text_normalization/hi/data/measure/unit.tsv

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,14 +141,16 @@ month महीना
141141
months महीने
142142
ct कैरेट
143143
pH पीएच
144+
km/h किलोमीटर प्रति घंटा
144145
km/hr किलोमीटर प्रति घंटा
145146
km/min किलोमीटर प्रति मिनट
147+
m/h मीटर प्रति घंटा
146148
m/hr मीटर प्रति घंटा
147149
mi/s मील प्रति सेकंड
150+
mi/h मील प्रति घंटा
148151
mi/hr मील प्रति घंटा
149152
mi/min मील प्रति मिनट
150153
₹/ac रुपए प्रति एकड़
151154
x बाई
152155
X बाई
153156
* बाई
154-
- से
Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
11
रुपए
2-
P पैसे
32
£ पाउंड
43
वॉन
54
$ डॉलर
65
लीरा
76
टका
87
¥ येन
98
नाइरा
10-
यूरो
9+
यूरो
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
रुपए पैसे
2+
पाउंड पेंस
3+
वॉन जिओन
4+
डॉलर सेंट
5+
लीरा कुरस
6+
टका पैसे
7+
येन सेन
8+
नाइरा कोबो
9+
यूरो सेंट

nemo_text_processing/text_normalization/hi/data/time/hours.tsv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
शून्य
12
एक
23
दो
34
तीन

nemo_text_processing/text_normalization/hi/taggers/date.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626

2727
days = pynini.string_file(get_abs_path("data/date/days.tsv"))
2828
months = pynini.string_file(get_abs_path("data/date/months.tsv"))
29+
year_suffix = pynini.string_file(get_abs_path("data/date/year_suffix.tsv"))
2930

3031

3132
class DateFst(GraphFst):
@@ -62,12 +63,17 @@ def __init__(self, cardinal: GraphFst):
6263

6364
years_graph = pynutil.insert("year: \"") + graph_year + pynutil.insert("\"") + insert_space
6465

65-
graph_dd_mm = days_graph + delete_dash + months_graph
66+
graph_dd_mm = days_graph + (delete_dash | pynini.accep("")) + months_graph
6667

67-
graph_mm_dd = months_graph + delete_dash + days_graph
68+
graph_mm_dd = months_graph + (delete_dash | pynini.accep("")) + days_graph
6869

6970
graph_mm_dd += pynutil.insert(" preserve_order: true ")
7071

72+
# Graph for era
73+
era_graph = pynutil.insert("era: \"") + year_suffix + pynutil.insert("\"") + insert_space
74+
75+
range_graph = pynini.cross("-", "से")
76+
7177
graph_dd_mm_yyyy = (
7278
days_graph + (delete_dash | delete_slash) + months_graph + (delete_dash | delete_slash) + years_graph
7379
)
@@ -78,7 +84,22 @@ def __init__(self, cardinal: GraphFst):
7884

7985
graph_mm_dd_yyyy += pynutil.insert(" preserve_order: true ")
8086

81-
graph_mm_yyyy = months_graph + delete_dash + years_graph
87+
graph_mm_yyyy = (
88+
months_graph + (delete_dash | pynini.accep("")) + years_graph + pynutil.insert(" preserve_order: true ")
89+
)
90+
91+
graph_year_suffix = era_graph
92+
93+
graph_range = (
94+
pynutil.insert("text: \"")
95+
+ (cardinal.final_graph | graph_year)
96+
+ insert_space
97+
+ range_graph
98+
+ insert_space
99+
+ (cardinal.final_graph | graph_year)
100+
+ pynutil.insert("\"")
101+
+ pynutil.insert(" preserve_order: true ")
102+
)
82103

83104
# default assume dd_mm_yyyy
84105

@@ -88,6 +109,8 @@ def __init__(self, cardinal: GraphFst):
88109
| pynutil.add_weight(graph_dd_mm_yyyy, -0.001)
89110
| graph_mm_dd_yyyy
90111
| graph_mm_yyyy
112+
| pynutil.add_weight(graph_year_suffix, -0.001)
113+
| pynutil.add_weight(graph_range, -0.005)
91114
)
92115

93116
self.final_graph = final_graph.optimize()

nemo_text_processing/text_normalization/hi/taggers/measure.py

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,16 +44,20 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
4444
)
4545

4646
# Define the unit handling
47-
self.unit = pynutil.insert("units: \"") + unit_graph + pynutil.insert("\" ")
47+
unit = pynutil.insert("units: \"") + unit_graph + pynutil.insert("\" ")
48+
49+
# Handling symbols like x, X, *
50+
symbol_graph = pynini.string_map([("x", "बाई"), ("X", "बाई"), ("*", "बाई"),])
4851

4952
graph_measurements = (
5053
pynutil.insert("decimal { ")
5154
+ optional_graph_negative
5255
+ decimal_graph
5356
+ pynutil.insert(" }")
5457
+ delete_space
55-
+ self.unit
58+
+ unit
5659
)
60+
5761
graph_measurements |= (
5862
pynutil.insert("cardinal { ")
5963
+ optional_graph_negative
@@ -62,7 +66,27 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
6266
+ pynutil.insert("\"")
6367
+ pynutil.insert(" }")
6468
+ delete_space
65-
+ self.unit
69+
+ unit
70+
)
71+
72+
# Handling cardinal clubbed with symbol as single token
73+
graph_measurements |= (
74+
pynutil.insert("cardinal { ")
75+
+ optional_graph_negative
76+
+ pynutil.insert("integer: \"")
77+
+ cardinal_graph
78+
+ pynutil.insert("\"")
79+
+ pynutil.insert(" }")
80+
+ pynutil.insert(" units: \"")
81+
+ symbol_graph
82+
+ pynutil.insert("\" ")
83+
+ pynutil.insert("} }")
84+
+ insert_space
85+
+ pynutil.insert("tokens { cardinal { ")
86+
+ optional_graph_negative
87+
+ pynutil.insert("integer: \"")
88+
+ cardinal_graph
89+
+ pynutil.insert("\"")
6690
)
6791

6892
graph = graph_measurements

nemo_text_processing/text_normalization/hi/taggers/money.py

Lines changed: 18 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -24,39 +24,35 @@
2424
class MoneyFst(GraphFst):
2525
"""
2626
Finite state transducer for classifying money, suppletive aware, e.g.
27-
₹1 -> money { currency: "रुपए" integer_part: "एक" }
28-
₹1.2 -> money { currency: "रुपए" integer_part: "एक" fractional_part: "दो" }
29-
27+
₹५० -> money { currency_maj: "रुपए" integer_part: "पचास" }
28+
₹५०.५० -> money { currency_maj: "रुपए" integer_part: "पचास" fractional_part: "पचास" currency_min: "centiles" }
29+
₹०.५० -> money { currency_maj: "रुपए" integer_part: "शून्य" fractional_part: "पचास" currency_min: "centiles" }
30+
Note that the 'centiles' string is a placeholder that the verbalizer handles by applying the corresponding minor currency denomination.
31+
3032
Args:
3133
cardinal: CardinalFst
3234
decimal: DecimalFst
3335
deterministic: if True will provide a single transduction option,
3436
for False multiple transductions are generated (used for audio-based normalization)
3537
"""
3638

37-
def __init__(self, cardinal: GraphFst, decimal: GraphFst):
39+
def __init__(self, cardinal: GraphFst):
3840
super().__init__(name="money", kind="classify")
3941

4042
cardinal_graph = cardinal.final_graph
4143

42-
optional_graph_negative = pynini.closure(
43-
pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, 0, 1,
44-
)
45-
self.currency = pynutil.insert("currency: \"") + currency_graph + pynutil.insert("\" ")
46-
self.interger = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\" ")
47-
self.fraction = pynutil.insert("fractional_part: \"") + cardinal_graph + pynutil.insert("\" ")
48-
49-
graph_currencies = optional_graph_negative + self.currency + insert_space + self.interger
50-
graph_currencies |= (
51-
optional_graph_negative
52-
+ self.currency
53-
+ insert_space
54-
+ self.interger
55-
+ pynutil.delete(".")
56-
+ insert_space
57-
+ self.fraction
44+
currency_major = pynutil.insert('currency_maj: "') + currency_graph + pynutil.insert('"')
45+
integer = pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"')
46+
fraction = pynutil.insert('fractional_part: "') + cardinal_graph + pynutil.insert('"')
47+
currency_minor = pynutil.insert('currency_min: "') + pynutil.insert("centiles") + pynutil.insert('"')
48+
49+
graph_major_only = currency_major + insert_space + integer
50+
graph_major_and_minor = (
51+
currency_major + insert_space + integer + pynini.cross(".", " ") + fraction + insert_space + currency_minor
5852
)
59-
graph = graph_currencies
60-
self.graph = graph.optimize()
53+
54+
graph_currencies = graph_major_only | graph_major_and_minor
55+
56+
graph = graph_currencies.optimize()
6157
final_graph = self.add_tokens(graph)
6258
self.fst = final_graph

nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ class ClassifyFst(GraphFst):
4343
Final class that composes all other classification grammars. This class can process an entire sentence including punctuation.
4444
For deployment, this grammar will be compiled and exported to an OpenFst Finite State Archive (FAR) file.
4545
More details to deployment at NeMo/tools/text_processing_deployment.
46-
46+
4747
Args:
4848
input_case: accepting either "lower_cased" or "cased" input.
4949
deterministic: if True will provide a single transduction option,
@@ -68,11 +68,11 @@ def __init__(
6868
os.makedirs(cache_dir, exist_ok=True)
6969
whitelist_file = os.path.basename(whitelist) if whitelist else ""
7070
far_file = os.path.join(
71-
cache_dir, f"hi_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far"
71+
cache_dir, f"hi_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far",
7272
)
7373
if not overwrite_cache and far_file and os.path.exists(far_file):
7474
self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
75-
logging.info(f'ClassifyFst.fst was restored from {far_file}.')
75+
logging.info(f"ClassifyFst.fst was restored from {far_file}.")
7676
else:
7777
logging.info(f"Creating ClassifyFst grammars.")
7878

@@ -107,7 +107,7 @@ def __init__(
107107
logging.debug(f"measure: {time.time() - start_time: .2f}s -- {measure_graph.num_states()} nodes")
108108

109109
start_time = time.time()
110-
money = MoneyFst(cardinal=cardinal, decimal=decimal)
110+
money = MoneyFst(cardinal=cardinal)
111111
money_graph = money.fst
112112
logging.debug(f"money: {time.time() - start_time: .2f}s -- {money_graph.num_states()} nodes")
113113

nemo_text_processing/text_normalization/hi/verbalizers/date.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@ def __init__(self):
3939

4040
year = pynutil.delete("year: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
4141

42+
era = pynutil.delete("era: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
43+
44+
range = pynutil.delete("text: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
45+
4246
graph_dd_mm = day + NEMO_SPACE + month
4347

4448
graph_mm_dd = month + NEMO_SPACE + day
@@ -49,6 +53,10 @@ def __init__(self):
4953

5054
graph_mm_yyyy = month + NEMO_SPACE + year
5155

56+
graph_era = era
57+
58+
graph_range = range
59+
5260
optional_preserve_order = pynini.closure(
5361
pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space
5462
| pynutil.delete("field_order:")
@@ -60,7 +68,7 @@ def __init__(self):
6068
)
6169

6270
self.graph = (
63-
(graph_dd_mm | graph_mm_dd | graph_dd_mm_yyyy | graph_mm_dd_yyyy | graph_mm_yyyy)
71+
(graph_dd_mm | graph_mm_dd | graph_dd_mm_yyyy | graph_mm_dd_yyyy | graph_mm_yyyy | graph_era | graph_range)
6472
+ delete_space
6573
+ optional_preserve_order
6674
)

0 commit comments

Comments
 (0)