Skip to content

Commit 568cbb5

Browse files
Eng tn itn (#31)
* Add additional units and plurals

Signed-off-by: Anand Joseph <anajoseph@nvidia.com>

* Add support for financial periods (1H22, 2Q19)

Signed-off-by: Anand Joseph <anajoseph@nvidia.com>

* Add missing plural for "gigabit per second"

Signed-off-by: Anand Joseph <anajoseph@nvidia.com>

* Fix for measures

Signed-off-by: Anand Joseph <anajoseph@nvidia.com>

* Use environment variables to set path of fst cache

Signed-off-by: Anand Joseph <anajoseph@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Fix environment variable

Signed-off-by: Anand Joseph <anajoseph@nvidia.com>

---------

Signed-off-by: Anand Joseph <anajoseph@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 4cc07ee commit 568cbb5

7 files changed

Lines changed: 106 additions & 22 deletions

File tree

Jenkinsfile

Lines changed: 9 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,9 @@ pipeline {
99
timeout(time: 2, unit: 'HOURS')
1010
disableConcurrentBuilds(abortPrevious: true)
1111
}
12-
12+
environment {
13+
EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-03-23-1'
14+
}
1315
stages {
1416

1517
stage('Add git safe directory'){
@@ -53,17 +55,17 @@ pipeline {
5355
parallel {
5456
stage('L0: En TN grammars') {
5557
steps {
56-
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --text="1" --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-3'
58+
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --text="1" --cache_dir ${EN_TN_CACHE}'
5759
}
5860
}
5961
stage('L0: En TN non-deterministic grammars') {
6062
steps {
61-
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text="1" --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-3'
63+
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text="1" --cache_dir ${EN_TN_CACHE}'
6264
}
6365
}
6466
stage('L0: En ITN grammars') {
6567
steps {
66-
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en --text="twenty" --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-3'
68+
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en --text="twenty" --cache_dir ${EN_TN_CACHE}'
6769
}
6870
}
6971

@@ -81,7 +83,7 @@ pipeline {
8183
parallel {
8284
stage('L1: Test En non-deterministic TN & Run all En TN/ITN tests (restore grammars from cache)') {
8385
steps {
84-
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-3'
86+
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir ${EN_TN_CACHE}'
8587
}
8688
}
8789
}
@@ -99,7 +101,7 @@ pipeline {
99101
stage('L2: Eng TN') {
100102
steps {
101103
sh 'TIME=`date +"%Y-%m-%d-%T"` && NORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_norm/output_${TIME} && \
102-
cd tools/text_processing_deployment && python pynini_export.py --output=$NORM_OUTPUT_DIR --grammars=tn_grammars --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-3 --language=en && ls -R $NORM_OUTPUT_DIR && echo ".far files created "|| exit 1'
104+
cd tools/text_processing_deployment && python pynini_export.py --output=$NORM_OUTPUT_DIR --grammars=tn_grammars --cache_dir ${EN_TN_CACHE} --language=en && ls -R $NORM_OUTPUT_DIR && echo ".far files created "|| exit 1'
103105
sh 'TIME=`date +"%Y-%m-%d-%T"` && NORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_norm/output_${TIME} && mkdir $NORM_OUTPUT_DIR && \
104106
cd nemo_text_processing/text_normalization/ && python normalize.py --input_file=/home/jenkinsci/TestData/text_norm/ci/test.txt --input_case="lower_cased" --language=en --output_file=$NORM_OUTPUT_DIR/test.pynini.txt --verbose && \
105107
cat $NORM_OUTPUT_DIR/test.pynini.txt && \
@@ -111,7 +113,7 @@ pipeline {
111113
stage('L2: Eng ITN export') {
112114
steps {
113115
sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_denorm/output_${TIME} && \
114-
cd tools/text_processing_deployment && python pynini_export.py --output=$DENORM_OUTPUT_DIR --grammars=itn_grammars --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-3 --language=en && ls -R $DENORM_OUTPUT_DIR && echo ".far files created "|| exit 1'
116+
cd tools/text_processing_deployment && python pynini_export.py --output=$DENORM_OUTPUT_DIR --grammars=itn_grammars --cache_dir ${EN_TN_CACHE} --language=en && ls -R $DENORM_OUTPUT_DIR && echo ".far files created "|| exit 1'
115117
sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_denorm/output_${TIME} && mkdir $DENORM_OUTPUT_DIR && \
116118
cd nemo_text_processing/inverse_text_normalization/ && python inverse_normalize.py --input_file=/home/jenkinsci/TestData/text_denorm/ci/test.txt --language=en --output_file=$DENORM_OUTPUT_DIR/test.pynini.txt --verbose && \
117119
cmp --silent $DENORM_OUTPUT_DIR/test.pynini.txt /home/jenkinsci/TestData/text_denorm/ci/test_goal_py.txt || exit 1 && \

nemo_text_processing/inverse_text_normalization/en/data/measurements.tsv

Lines changed: 31 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,8 @@
1-
f fahrenheit
2-
c celsius
1+
°f fahrenheit
2+
°c celsius
3+
°f degree fahrenheit
4+
°c degree celsius
5+
k kelvin
36
km kilometer
47
m meter
58
cm centimeter
@@ -35,7 +38,6 @@ mv milli volt
3538
mw megawatt
3639
μm micrometer
3740
" inch
38-
tb terabyte
3941
cc c c
4042
g gram
4143
da dalton
@@ -47,13 +49,38 @@ oz ounce
4749
hl hecto liter
4850
μg microgram
4951
pg petagram
50-
gb gigabyte
5152
kb kilobit
53+
mb megabit
54+
gb gigabit
55+
tb terabit
56+
pb petabit
5257
ev electron volt
5358
mb megabyte
5459
kb kilobyte
60+
gb gigabyte
61+
tb terabyte
62+
pb peta byte
63+
bps bit per second
5564
kbps kilobit per second
5665
mbps megabit per second
66+
gbps gigabit per second
67+
kbps kilo bit per second
68+
mbps mega bit per second
69+
gbps	giga bit per second
70+
tbps terabit per second
71+
tbps tera bit per second
72+
pbps petabit per second
73+
pbps peta bit per second
74+
kb/s kilobyte per second
75+
kb/s kilo byte per second
76+
mb/s megabyte per second
77+
mb/s mega byte per second
78+
gb/s gigabyte per second
79+
gb/s giga byte per second
80+
tb/s terabyte per second
81+
tb/s tera byte per second
82+
pb/s petabyte per second
83+
pb/s peta byte per second
5784
st stone
5885
kl kilo liter
5986
tj tera joule
@@ -86,7 +113,6 @@ ms milli second
86113
dm deci meter
87114
dm³ cubic deci meter
88115
amu atomic mass unit
89-
mb megabit
90116
mf mega farad
91117
bq becquerel
92118
pb petabit
@@ -100,7 +126,6 @@ tl tera liter
100126
ms mega second
101127
mpa megapascal
102128
pm peta meter
103-
pb peta byte
104129
gwh giga watt hour
105130
kcal kilo calory
106131
gy gray

nemo_text_processing/text_normalization/en/data/suppletive.tsv

Lines changed: 30 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -36,9 +36,38 @@ revolution per minute revolutions per minute
3636
mile per hour miles per hour
3737
megabit per second megabits per second
3838
square foot square feet
39+
centimeter per second centimeters per second
40+
meter per second meters per second
41+
kilometer per second kilometers per second
42+
meter per hour meters per hour
43+
bit per second bits per second
44+
kilometer per hour kilometers per hour
3945
kilobit per second kilobits per second
46+
kilo bit per second kilo bits per second
47+
megabit per second megabits per second
48+
mega bit per second mega bits per second
49+
gigabit per second gigabits per second
50+
giga bit per second giga bits per second
51+
terabit per second terabits per second
52+
tera bit per second tera bits per second
53+
petabit per second petabits per second
54+
peta bit per second peta bits per second
55+
byte per second bytes per second
56+
kilobyte per second kilobytes per second
57+
kilo byte per second kilo bytes per second
58+
megabyte per second megabytes per second
59+
mega byte per second mega bytes per second
60+
gigabyte per second gigabytes per second
61+
giga byte per second giga bytes per second
62+
terabyte per second terabytes per second
63+
tera byte per second tera bytes per second
64+
petabyte per second petabytes per second
65+
peta byte per second peta bytes per second
4066
degree Celsius degrees Celsius
4167
degree Fahrenheit degrees Fahrenheit
68+
degree celsius degrees celsius
69+
degree fahrenheit degrees fahrenheit
70+
kelvin kelvin
4271
ATM
4372
AU
4473
BQ
@@ -80,4 +109,4 @@ PS
80109
S
81110
TB
82111
YB
83-
ZB
112+
ZB

nemo_text_processing/text_normalization/en/taggers/date.py

Lines changed: 20 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -138,8 +138,22 @@ def _get_year_graph(cardinal_graph, deterministic: bool = True):
138138

139139

140140
def _get_two_digit_year(cardinal_graph, single_digits_graph):
141-
wo_digit_year = NEMO_DIGIT ** (2) @ plurals._priority_union(cardinal_graph, single_digits_graph, NEMO_SIGMA)
142-
return wo_digit_year
141+
two_digit_year = NEMO_DIGIT ** (2) @ plurals._priority_union(cardinal_graph, single_digits_graph, NEMO_SIGMA)
142+
return two_digit_year
143+
144+
145+
def _get_financial_period_graph():
146+
# 1H23 -> first half of twenty three
147+
# 3Q22 -> third quarter of twenty two
148+
149+
h_ordinals = pynini.cross('1', 'first') | pynini.cross('2', 'second')
150+
q_ordinals = h_ordinals | pynini.cross('3', 'third') | pynini.cross('4', 'fourth')
151+
152+
h_graph = h_ordinals + pynini.cross('H', ' half')
153+
q_graph = q_ordinals + pynini.cross('Q', ' quarter')
154+
period_graph = h_graph | q_graph
155+
156+
return period_graph
143157

144158

145159
class DateFst(GraphFst):
@@ -297,7 +311,10 @@ def __init__(self, cardinal: GraphFst, deterministic: bool, lm: bool = False):
297311
else:
298312
final_graph += pynutil.insert(" preserve_order: true")
299313

300-
final_graph |= graph_ymd | year_graph
314+
period_fy = pynutil.insert("period: \"") + _get_financial_period_graph() + pynutil.insert("\"")
315+
graph_fy = period_fy + insert_space + two_digit_year
316+
317+
final_graph |= graph_ymd | year_graph | graph_fy
301318

302319
if not deterministic or lm:
303320
ymd_to_mdy_graph = None

nemo_text_processing/text_normalization/en/verbalizers/date.py

Lines changed: 9 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,7 @@ class DateFst(GraphFst):
3939
def __init__(self, ordinal: GraphFst, deterministic: bool = True, lm: bool = False):
4040
super().__init__(name="date", kind="verbalize", deterministic=deterministic)
4141

42-
month = pynini.closure(NEMO_NOT_QUOTE, 1)
42+
phrase = pynini.closure(NEMO_NOT_QUOTE, 1)
4343
day_cardinal = (
4444
pynutil.delete("day:")
4545
+ delete_space
@@ -48,8 +48,8 @@ def __init__(self, ordinal: GraphFst, deterministic: bool = True, lm: bool = Fal
4848
+ pynutil.delete("\"")
4949
)
5050
day = day_cardinal @ ordinal.suffix
51-
52-
month = pynutil.delete("month:") + delete_space + pynutil.delete("\"") + month + pynutil.delete("\"")
51+
period = pynutil.delete("period:") + delete_space + pynutil.delete("\"") + phrase + pynutil.delete("\"")
52+
month = pynutil.delete("month:") + delete_space + pynutil.delete("\"") + phrase + pynutil.delete("\"")
5353

5454
year = (
5555
pynutil.delete("year:")
@@ -60,6 +60,11 @@ def __init__(self, ordinal: GraphFst, deterministic: bool = True, lm: bool = Fal
6060
+ pynutil.delete("\"")
6161
)
6262

63+
# financial period
64+
graph_fy = (
65+
pynutil.insert("the ") + period + pynutil.insert(" of ") + pynini.closure(delete_extra_space + year, 0, 1)
66+
)
67+
6368
# month (day) year
6469
graph_mdy = (
6570
month + pynini.closure(delete_extra_space + day, 0, 1) + pynini.closure(delete_extra_space + year, 0, 1)
@@ -93,7 +98,7 @@ def __init__(self, ordinal: GraphFst, deterministic: bool = True, lm: bool = Fal
9398
)
9499

95100
final_graph = (
96-
(plurals._priority_union(graph_mdy, pynutil.add_weight(graph_dmy, 0.0001), NEMO_SIGMA) | year)
101+
(plurals._priority_union(graph_mdy, pynutil.add_weight(graph_dmy, 0.0001), NEMO_SIGMA) | year | graph_fy)
97102
+ delete_space
98103
+ optional_preserve_order
99104
)

tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_measure.txt

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -110,3 +110,8 @@ eight point five megawatts~8.5 mw
110110
eight point five meters~8.5 m
111111
eight point five two percent~8.52 %
112112
eight point four four percent~8.44 %
113+
one gigabit per second~1 gbps
114+
nine gigabits per second~9 gbps
115+
five degrees celsius~5 °c
116+
seventy two degrees fahrenheit~72 °f
117+
two hundred seventy three kelvin~273 k

tests/nemo_text_processing/en/data_text_normalization/test_cases_date.txt

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -50,4 +50,5 @@ Jan-15~january fifteenth
5050
15-01-2020~the fifteenth of january twenty twenty
5151
15.01.2020~the fifteenth of january twenty twenty
5252
340 A.D~three forty AD
53-
1998/2/30~february thirtieth nineteen ninety eight
53+
1998/2/30~february thirtieth nineteen ninety eight
54+
We have seen YoY growth in 2Q22~We have seen YoY growth in the second quarter of twenty two

0 commit comments

Comments
 (0)