Skip to content

Commit 90a61e6

Browse files
Merge EN riva release 22.10 (#26)
* Merge EN riva release 22.10 Signed-off-by: Anand Joseph <anajoseph@nvidia.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Code cleanup Signed-off-by: Anand Joseph <anajoseph@nvidia.com> --------- Signed-off-by: Anand Joseph <anajoseph@nvidia.com> Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 0748a29 commit 90a61e6

11 files changed

Lines changed: 305 additions & 46 deletions

File tree

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,4 +174,4 @@ examples/neural_graphs/*.yml
174174

175175
.hydra/
176176
nemo_experiments/
177-
177+
*.swp
Lines changed: 159 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,168 @@
1-
$ dollar
2-
$ us dollar
3-
$ united states dollar
1+
؋ afghan afghani
2+
l albanian lek
3+
دج algerian dinar
4+
kz angolan kwanza
5+
aed arab emirates dirham
6+
$ argentine peso
7+
֏ armenian dram
8+
ƒ aruban florin
9+
a$ australian dollar
10+
azerbaijani manat
11+
b$ bahamian dollar
12+
.د.ب bahraini dinar
13+
bangladeshi taka
14+
$ barbadian dollar
15+
br belarusian ruble
16+
bz$ belize dollar
17+
$ bermudian dollar
18+
nu bhutanese ngultrum
19+
bitcoin
20+
bs bolivian boliviano
21+
km bosnia and herzegovina convertible mark
22+
p botswana pula
23+
r$ brazilian real
424
£ british pound
25+
b$ brunei dollar
26+
лв bulgarian lev
27+
fbu burundian franc
28+
cambodian riel
29+
ca$ canadian dollar
30+
cve cape verde escudo
31+
cve cape verdean escudo
32+
ci$ cayman islands dollar
33+
cfa cfa franc
34+
$ chilean peso
35+
chinese yuan
36+
$ colombian peso
37+
cf comorian franc
38+
fc congolese franc
39+
costa rican colón
40+
kn croatian kuna
41+
cuc$ cuban peso
42+
cyp cypriot pound
43+
czech koruna
44+
d dalasi
45+
kr danish krone
46+
fdj djiboutian franc
47+
$ dollar
48+
rd$ dominican peso
49+
$ east caribbean dollar
50+
egyptian pound
51+
aed emirati dirham
52+
nkf eritrean nakfa
53+
Ξ ethereum
54+
br ethiopian birr
555
euro
6-
won
7-
nzd new zealand dollar
8-
rs rupee
9-
chf swiss franc
10-
dkk danish kroner
56+
fk£ falkland islands pound
1157
fim finnish markka
12-
aed arab emirates dirham
13-
¥ yen
14-
czk czech koruna
58+
georgian lari
59+
dm german mark
60+
gh₵ ghanaian cedi
61+
q guatemalan quetzal
62+
fg guinean franc
63+
g$ guyanese dollar
64+
g haitian gourde
65+
l honduran lempira
66+
hk$ hong kong dollar
67+
ft hungarian forint
68+
inr
69+
íkr icelandic króna
70+
indian rupee
71+
rp indonesian rupiah
72+
iranian rial
73+
ع.د iraqi dinar
74+
israeli new shekel
75+
israeli shekel
76+
j$ jamaican dollar
77+
¥ japanese yen
78+
ينار jordanian dinar
79+
kazakhstani tenge
80+
ksh kenyan shilling
81+
ك kuwaiti dinar
82+
k kyat
83+
som kyrgyzstani som
84+
lao kip
85+
ლარი lari
86+
ل.ل lebanese pound
87+
l lesotho loti
88+
ld$ liberian dollar
89+
ld libyan dinar
90+
l lilangeni
91+
ł litecoin
92+
mop$ macanese pataca
93+
ar malagasy ariary
94+
k malawian kwacha
95+
rm malaysian ringgit
96+
rf maldivian rufiyaa
1597
mro mauritanian ouguiya
98+
mauritian rupee
99+
$ mexican peso
100+
l moldovan leu
101+
ɱ monero
102+
mongolian tögrög
103+
dh moroccan dirham
104+
mt mozambican metical
105+
n$ namibian dollar
106+
rs nepalese rupee
107+
ƒ netherlands antillean guilder
108+
nt$ new taiwan dollar
109+
nz$ new zealand dollar
110+
c$ nicaraguan córdoba
111+
nigerian naira
112+
north korean won
113+
kr norwegian krone
114+
ر.ع omani rial
115+
um ouguiya
16116
pkr pakistani rupee
17-
crc costa rican colon
18-
hk$ hong kong dollar
19-
npr nepalese rupee
20-
awg aruban florin
21-
nok norwegian kroner
22-
tzs tanzanian shilling
23-
sek swedish kronor
24-
cyp cypriot pound
117+
b/. panamanian balboa
118+
paraguayan guaraní
119+
s/. peruvian sol
120+
philippine peso
121+
polish zloty
122+
£ pounds sterling
123+
ر.ق qatari riyal
124+
£ quid
25125
r real
126+
xrp ripples
127+
lei romanian leu
128+
rupee
129+
russian ruble
130+
r₣ rwandan franc
131+
shp saint helena pounds
132+
db são tomé and príncipe dobra
26133
sar saudi riyal
27-
cve cape verde escudo
134+
ден second macedonian denar
28135
rsd serbian dinar
29-
dm german mark
30-
shp saint helena pounds
31-
php philippine peso
32-
cad canadian dollar
33-
ssp south sudanese pound
34136
scr seychelles rupee
35-
mvr maldivian rufiyaa
137+
le sierra leonean leone
138+
s$ singapore dollar
139+
sh.so. somali shilling
140+
tjs somoni
141+
r south african rand
142+
south korean won
143+
ss£ south sudanese pound
144+
රු sri lankan rupee
145+
sdg sudanese pound
146+
sr$ surinamese dollar
147+
kr swedish krona
148+
chf swiss franc
149+
£s syrian pound
150+
taka
151+
tzs tanzanian shilling
152+
tether
153+
฿ thai baht
154+
tt$ trinidad and tobago dollar
155+
د.ت tunisian dinar
156+
turkish lira
157+
m turkmen new manat
158+
ush ugandan shilling
159+
ukrainian hryvna
160+
$ united states dollar
161+
$u uruguayan peso
162+
$ us dollar
163+
som uzbekistan som
164+
bs. venezuelan bolívar
165+
vietnamese đồng
166+
won
167+
yemeni rial
168+
¥ yen
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
. dot
2+
- dash
3+
- hyphen
4+
_ underscore
5+
/ slash

nemo_text_processing/inverse_text_normalization/en/taggers/cardinal.py

Lines changed: 89 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16-
1716
import pynini
1817
from nemo_text_processing.inverse_text_normalization.en.utils import get_abs_path, num_to_word
1918
from nemo_text_processing.text_normalization.en.graph_utils import (
@@ -31,7 +30,7 @@ class CardinalFst(GraphFst):
3130
"""
3231
Finite state transducer for classifying cardinals
3332
e.g. minus twenty three -> cardinal { integer: "23" negative: "-" }
34-
Numbers below thirteen are not converted.
33+
Numbers below thirteen are not converted.
3534
"""
3635

3736
def __init__(self):
@@ -67,6 +66,17 @@ def __init__(self):
6766

6867
graph_hundreds = graph_hundred_component | graph_hundred_as_thousand
6968

69+
graph_ties_component = pynini.union(
70+
graph_teen | pynutil.insert("00"),
71+
(graph_ties | pynutil.insert("0")) + delete_space + (graph_digit | pynutil.insert("0")),
72+
)
73+
74+
graph_ties_component_at_least_one_none_zero_digit = graph_ties_component @ (
75+
pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT)
76+
)
77+
self.graph_ties_component_at_least_one_none_zero_digit = graph_ties_component_at_least_one_none_zero_digit
78+
79+
# %%% International numeric format
7080
graph_thousands = pynini.union(
7181
graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("thousand"),
7282
pynutil.insert("000", weight=0.1),
@@ -96,8 +106,9 @@ def __init__(self):
96106
graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("sextillion"),
97107
pynutil.insert("000", weight=0.1),
98108
)
109+
# %%%
99110

100-
graph = pynini.union(
111+
graph_int = (
101112
graph_sextillion
102113
+ delete_space
103114
+ graph_quintillion
@@ -111,11 +122,84 @@ def __init__(self):
111122
+ graph_million
112123
+ delete_space
113124
+ graph_thousands
125+
)
126+
127+
# %% Indian numeric format simple https://en.wikipedia.org/wiki/Indian_numbering_system
128+
# This only covers "standard format".
129+
# Conventional formats such as "thousand crores" or "lakh crores" are not yet implemented.
130+
graph_in_thousands = pynini.union(
131+
graph_ties_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("thousand"),
132+
pynutil.insert("00", weight=0.1),
133+
)
134+
graph_in_lakhs = pynini.union(
135+
graph_ties_component_at_least_one_none_zero_digit
136+
+ delete_space
137+
+ (pynutil.delete("lakh") | pynutil.delete("lakhs")),
138+
pynutil.insert("00", weight=0.1),
139+
)
140+
141+
graph_in_crores = pynini.union(
142+
graph_ties_component_at_least_one_none_zero_digit
143+
+ delete_space
144+
+ (pynutil.delete("crore") | pynutil.delete("crores")),
145+
pynutil.insert("00", weight=0.1),
146+
)
147+
148+
graph_in_arabs = pynini.union(
149+
graph_ties_component_at_least_one_none_zero_digit
150+
+ delete_space
151+
+ (pynutil.delete("arab") | pynutil.delete("arabs")),
152+
pynutil.insert("00", weight=0.1),
153+
)
154+
155+
graph_in_kharabs = pynini.union(
156+
graph_ties_component_at_least_one_none_zero_digit
114157
+ delete_space
115-
+ graph_hundreds,
116-
graph_zero,
158+
+ (pynutil.delete("kharab") | pynutil.delete("kharabs")),
159+
pynutil.insert("00", weight=0.1),
117160
)
118161

162+
graph_in_nils = pynini.union(
163+
graph_ties_component_at_least_one_none_zero_digit
164+
+ delete_space
165+
+ (pynutil.delete("nil") | pynutil.delete("nils")),
166+
pynutil.insert("00", weight=0.1),
167+
)
168+
169+
graph_in_padmas = pynini.union(
170+
graph_ties_component_at_least_one_none_zero_digit
171+
+ delete_space
172+
+ (pynutil.delete("padma") | pynutil.delete("padmas")),
173+
pynutil.insert("00", weight=0.1),
174+
)
175+
176+
graph_in_shankhs = pynini.union(
177+
graph_ties_component_at_least_one_none_zero_digit
178+
+ delete_space
179+
+ (pynutil.delete("shankh") | pynutil.delete("shankhs")),
180+
pynutil.insert("00", weight=0.1),
181+
)
182+
183+
graph_ind = (
184+
graph_in_shankhs
185+
+ delete_space
186+
+ graph_in_padmas
187+
+ delete_space
188+
+ graph_in_nils
189+
+ delete_space
190+
+ graph_in_kharabs
191+
+ delete_space
192+
+ graph_in_arabs
193+
+ delete_space
194+
+ graph_in_crores
195+
+ delete_space
196+
+ graph_in_lakhs
197+
+ delete_space
198+
+ graph_in_thousands
199+
)
200+
201+
graph = pynini.union((graph_int | graph_ind) + delete_space + graph_hundreds, graph_zero,)
202+
119203
graph = graph @ pynini.union(
120204
pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0"
121205
)

nemo_text_processing/inverse_text_normalization/en/taggers/electronic.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,8 @@ def __init__(self):
3535
| pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
3636
)
3737

38-
symbols = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).invert()
39-
40-
accepted_username = alpha_num | symbols
38+
url_symbols = pynini.string_file(get_abs_path("data/electronic/url_symbols.tsv")).invert()
39+
accepted_username = alpha_num | url_symbols
4140
process_dot = pynini.cross("dot", ".")
4241
username = (alpha_num + pynini.closure(delete_extra_space + accepted_username)) | pynutil.add_weight(
4342
pynini.closure(NEMO_ALPHA, 1), weight=0.0001
@@ -65,7 +64,7 @@ def __init__(self):
6564
# .com,
6665
ending = (
6766
delete_extra_space
68-
+ symbols
67+
+ url_symbols
6968
+ delete_extra_space
7069
+ (domain | pynini.closure(accepted_username + delete_extra_space,) + accepted_username)
7170
)

nemo_text_processing/inverse_text_normalization/en/taggers/money.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,13 +52,14 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
5252
)
5353
cardinal_graph |= with_hundred
5454
graph_decimal_final = decimal.final_graph_wo_negative
55-
5655
unit = pynini.string_file(get_abs_path("data/currency.tsv"))
5756
unit_singular = pynini.invert(unit)
5857
unit_plural = get_singulars(unit_singular)
5958

6059
graph_unit_singular = pynutil.insert("currency: \"") + convert_space(unit_singular) + pynutil.insert("\"")
61-
graph_unit_plural = pynutil.insert("currency: \"") + convert_space(unit_plural) + pynutil.insert("\"")
60+
graph_unit_plural = (
61+
pynutil.insert("currency: \"") + convert_space(unit_plural | unit_singular) + pynutil.insert("\"")
62+
)
6263

6364
add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT)
6465
# twelve dollars (and) fifty cents, zero cents

0 commit comments

Comments
 (0)