Skip to content

Commit 0a72b79

Browse files
authored
Eng TN - update urls to handle dictionary words (#27)
* wip el words Signed-off-by: ekmb <ebakhturina@nvidia.com> * wip el words Signed-off-by: ekmb <ebakhturina@nvidia.com> * wip Signed-off-by: ekmb <ebakhturina@nvidia.com> * electronic pass Signed-off-by: ekmb <ebakhturina@nvidia.com> * test pass Signed-off-by: ekmb <ebakhturina@nvidia.com> * clean up Signed-off-by: ekmb <ebakhturina@nvidia.com> * clean up Signed-off-by: ekmb <ebakhturina@nvidia.com> * remove unused imports Signed-off-by: ekmb <ebakhturina@nvidia.com> * add deterministic option normalized options Signed-off-by: ekmb <ebakhturina@nvidia.com> * update jenkins grammar folder Signed-off-by: ekmb <ebakhturina@nvidia.com> * clean up, update for SH Signed-off-by: ekmb <ebakhturina@nvidia.com> * update jenkins dir Signed-off-by: ekmb <ebakhturina@nvidia.com> * clean up Signed-off-by: ekmb <ebakhturina@nvidia.com> * reduce cardinal graph Signed-off-by: ekmb <ebakhturina@nvidia.com> * jenkins dir Signed-off-by: ekmb <ebakhturina@nvidia.com> * add weight for sh Signed-off-by: ekmb <ebakhturina@nvidia.com> --------- Signed-off-by: ekmb <ebakhturina@nvidia.com>
1 parent 90a61e6 commit 0a72b79

11 files changed

Lines changed: 142 additions & 59 deletions

File tree

Jenkinsfile

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,6 @@ pipeline {
1414

1515
stage('Add git safe directory'){
1616
steps{
17-
// sh 'git config --global user.name "jenkinsci"'
18-
// sh 'git config --global user.email "$(whoami)@$(hostname)"'
1917
sh 'git config --global --add safe.directory /var/lib/jenkins/workspace/NTP_$GIT_BRANCH'
2018
sh 'git config --global --add safe.directory /home/jenkinsci/workspace/NTP_$GIT_BRANCH'
2119
}
@@ -55,17 +53,17 @@ pipeline {
5553
parallel {
5654
stage('En TN grammars') {
5755
steps {
58-
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --text="1" --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/01-30-23'
56+
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --text="1" --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-1'
5957
}
6058
}
6159
stage('En ITN grammars') {
6260
steps {
63-
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en --text="twenty" --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/01-30-23'
61+
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en --text="twenty" --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-1'
6462
}
6563
}
6664
stage('Test En non-deterministic TN & Run all En TN/ITN tests (restore grammars from cache)') {
6765
steps {
68-
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/01-30-23'
66+
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-1'
6967
}
7068
}
7169

@@ -84,7 +82,7 @@ pipeline {
8482
stage('L2: Eng TN') {
8583
steps {
8684
sh 'TIME=`date +"%Y-%m-%d-%T"` && NORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_norm/output_${TIME} && \
87-
cd tools/text_processing_deployment && python pynini_export.py --output=$NORM_OUTPUT_DIR --grammars=tn_grammars --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/01-30-23 --language=en && ls -R $NORM_OUTPUT_DIR && echo ".far files created "|| exit 1'
85+
cd tools/text_processing_deployment && python pynini_export.py --output=$NORM_OUTPUT_DIR --grammars=tn_grammars --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-1 --language=en && ls -R $NORM_OUTPUT_DIR && echo ".far files created "|| exit 1'
8886
sh 'TIME=`date +"%Y-%m-%d-%T"` && NORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_norm/output_${TIME} && mkdir $NORM_OUTPUT_DIR && \
8987
cd nemo_text_processing/text_normalization/ && python normalize.py --input_file=/home/jenkinsci/TestData/text_norm/ci/test.txt --input_case="lower_cased" --language=en --output_file=$NORM_OUTPUT_DIR/test.pynini.txt --verbose && \
9088
cat $NORM_OUTPUT_DIR/test.pynini.txt && \
@@ -96,7 +94,7 @@ pipeline {
9694
stage('L2: Eng ITN export') {
9795
steps {
9896
sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_denorm/output_${TIME} && \
99-
cd tools/text_processing_deployment && python pynini_export.py --output=$DENORM_OUTPUT_DIR --grammars=itn_grammars --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/01-30-23 --language=en && ls -R $DENORM_OUTPUT_DIR && echo ".far files created "|| exit 1'
97+
cd tools/text_processing_deployment && python pynini_export.py --output=$DENORM_OUTPUT_DIR --grammars=itn_grammars --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-1 --language=en && ls -R $DENORM_OUTPUT_DIR && echo ".far files created "|| exit 1'
10098
sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_denorm/output_${TIME} && mkdir $DENORM_OUTPUT_DIR && \
10199
cd nemo_text_processing/inverse_text_normalization/ && python inverse_normalize.py --input_file=/home/jenkinsci/TestData/text_denorm/ci/test.txt --language=en --output_file=$DENORM_OUTPUT_DIR/test.pynini.txt --verbose && \
102100
cmp --silent $DENORM_OUTPUT_DIR/test.pynini.txt /home/jenkinsci/TestData/text_denorm/ci/test_goal_py.txt || exit 1 && \

nemo_text_processing/text_normalization/en/data/electronic/symbol.tsv

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,6 @@ $ dollar sign
1414
? question mark
1515
^ circumflex
1616
` right single quote
17-
{ left brace
1817
| vertical bar
19-
} right brace
2018
~ tilde
2119
, comma
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
drive
2+
sim
3+
early
4+
access
5+
program
6+
rtx RTX
7+
developer
8+
basepod BASEPOD
9+
cuda CUDA
10+
cv
11+
enterprise
12+
services
13+
nvidia NVIDIA
14+
dgx DGX
15+
pro
16+
help

nemo_text_processing/text_normalization/en/taggers/cardinal.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -89,21 +89,20 @@ def __init__(self, deterministic: bool = True, lm: bool = False):
8989

9090
if deterministic:
9191
long_numbers = pynini.compose(NEMO_DIGIT ** (5, ...), self.single_digits_graph).optimize()
92-
final_graph = plurals._priority_union(long_numbers, self.graph_with_and, NEMO_SIGMA).optimize()
92+
self.long_numbers = plurals._priority_union(long_numbers, self.graph_with_and, NEMO_SIGMA).optimize()
9393
cardinal_with_leading_zeros = pynini.compose(
9494
pynini.accep("0") + pynini.closure(NEMO_DIGIT), self.single_digits_graph
9595
)
96-
final_graph |= cardinal_with_leading_zeros
96+
final_graph = self.long_numbers | cardinal_with_leading_zeros
9797
else:
9898
leading_zeros = pynini.compose(pynini.closure(pynini.accep("0"), 1), self.single_digits_graph)
9999
cardinal_with_leading_zeros = (
100100
leading_zeros + pynutil.insert(" ") + pynini.compose(pynini.closure(NEMO_DIGIT), self.graph_with_and)
101101
)
102-
102+
self.long_numbers = self.graph_with_and | pynutil.add_weight(self.single_digits_graph, 0.0001)
103103
# add small weight to non-default graphs to make sure the deterministic option is listed first
104104
final_graph = (
105-
self.graph_with_and
106-
| pynutil.add_weight(self.single_digits_graph, 0.0001)
105+
self.long_numbers
107106
| get_four_digit_year_graph() # allows e.g. 4567 be pronounced as forty five sixty seven
108107
| pynutil.add_weight(single_digits_graph_with_commas, 0.0001)
109108
| cardinal_with_leading_zeros

nemo_text_processing/text_normalization/en/taggers/electronic.py

Lines changed: 55 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,12 @@
1515

1616
import pynini
1717
from nemo_text_processing.text_normalization.en.graph_utils import (
18+
MIN_NEG_WEIGHT,
1819
NEMO_ALPHA,
1920
NEMO_DIGIT,
20-
NEMO_SIGMA,
21+
NEMO_NOT_SPACE,
22+
NEMO_UPPER,
23+
TO_UPPER,
2124
GraphFst,
2225
get_abs_path,
2326
insert_space,
@@ -31,56 +34,84 @@ class ElectronicFst(GraphFst):
3134
e.g. cdf1@abc.edu -> tokens { electronic { username: "cdf1" domain: "abc.edu" } }
3235
3336
Args:
37+
cardinal: CardinalFst
3438
deterministic: if True will provide a single transduction option,
3539
for False multiple transductions are generated (used for audio-based normalization)
3640
"""
3741

38-
def __init__(self, deterministic: bool = True):
42+
def __init__(self, cardinal: GraphFst, deterministic: bool = True):
3943
super().__init__(name="electronic", kind="classify", deterministic=deterministic)
4044

45+
if deterministic:
46+
numbers = NEMO_DIGIT
47+
else:
48+
numbers = pynutil.insert(" ") + cardinal.long_numbers + pynutil.insert(" ")
49+
4150
accepted_symbols = pynini.project(pynini.string_file(get_abs_path("data/electronic/symbol.tsv")), "input")
4251
accepted_common_domains = pynini.project(
4352
pynini.string_file(get_abs_path("data/electronic/domain.tsv")), "input"
4453
)
45-
all_accepted_symbols = NEMO_ALPHA + pynini.closure(NEMO_ALPHA | NEMO_DIGIT | accepted_symbols)
54+
55+
dict_words = pynutil.add_weight(pynini.string_file(get_abs_path("data/electronic/words.tsv")), MIN_NEG_WEIGHT)
56+
57+
dict_words_without_delimiter = dict_words + pynini.closure(
58+
pynutil.add_weight(pynutil.insert(" ") + dict_words, MIN_NEG_WEIGHT), 1
59+
)
60+
dict_words_graph = dict_words_without_delimiter | dict_words
61+
62+
all_accepted_symbols_start = (
63+
dict_words_graph | pynini.closure(TO_UPPER) | pynini.closure(NEMO_UPPER) | accepted_symbols
64+
).optimize()
65+
66+
all_accepted_symbols_end = (
67+
dict_words_graph | numbers | pynini.closure(TO_UPPER) | pynini.closure(NEMO_UPPER) | accepted_symbols
68+
).optimize()
69+
4670
graph_symbols = pynini.string_file(get_abs_path("data/electronic/symbol.tsv")).optimize()
71+
username = (NEMO_ALPHA | dict_words_graph) + pynini.closure(
72+
NEMO_ALPHA | numbers | accepted_symbols | dict_words_graph
73+
)
74+
75+
username = pynutil.insert("username: \"") + username + pynutil.insert("\"") + pynini.cross('@', ' ')
4776

48-
username = (
49-
pynutil.insert("username: \"") + all_accepted_symbols + pynutil.insert("\"") + pynini.cross('@', ' ')
77+
domain_graph = all_accepted_symbols_start + pynini.closure(
78+
all_accepted_symbols_end | pynutil.add_weight(accepted_common_domains, MIN_NEG_WEIGHT)
5079
)
51-
domain_graph = all_accepted_symbols + pynini.accep('.') + all_accepted_symbols + NEMO_ALPHA
52-
protocol_symbols = pynini.closure((graph_symbols | pynini.cross(":", "semicolon")) + pynutil.insert(" "))
80+
81+
protocol_symbols = pynini.closure((graph_symbols | pynini.cross(":", "colon")) + pynutil.insert(" "))
5382
protocol_start = (pynini.cross("https", "HTTPS ") | pynini.cross("http", "HTTP ")) + (
5483
pynini.accep("://") @ protocol_symbols
5584
)
5685
protocol_file_start = pynini.accep("file") + insert_space + (pynini.accep(":///") @ protocol_symbols)
5786

58-
protocol_end = pynini.cross("www", "WWW ") + pynini.accep(".") @ protocol_symbols
87+
protocol_end = pynutil.add_weight(pynini.cross("www", "WWW ") + pynini.accep(".") @ protocol_symbols, -1000)
5988
protocol = protocol_file_start | protocol_start | protocol_end | (protocol_start + protocol_end)
6089

61-
domain_graph = (
62-
pynutil.insert("domain: \"")
63-
+ pynini.difference(domain_graph, pynini.project(protocol, "input") + NEMO_SIGMA)
64-
+ pynutil.insert("\"")
65-
)
66-
domain_common_graph = (
90+
domain_graph_with_class_tags = (
6791
pynutil.insert("domain: \"")
68-
+ pynini.difference(
69-
all_accepted_symbols
70-
+ accepted_common_domains
71-
+ pynini.closure(accepted_symbols + pynini.closure(NEMO_ALPHA | NEMO_DIGIT | accepted_symbols), 0, 1),
72-
pynini.project(protocol, "input") + NEMO_SIGMA,
73-
)
92+
+ pynini.compose(
93+
NEMO_ALPHA + pynini.closure(NEMO_NOT_SPACE) + (NEMO_ALPHA | NEMO_DIGIT | pynini.accep("/")),
94+
domain_graph,
95+
).optimize()
7496
+ pynutil.insert("\"")
7597
)
7698

77-
protocol = pynutil.insert("protocol: \"") + protocol + pynutil.insert("\"")
99+
protocol = pynutil.insert("protocol: \"") + pynutil.add_weight(protocol, MIN_NEG_WEIGHT) + pynutil.insert("\"")
78100
# email
79-
graph = username + domain_graph
101+
graph = username + domain_graph_with_class_tags
102+
80103
# abc.com, abc.com/123-sm
81-
graph |= domain_common_graph
104+
# when only a domain is given, make sure it starts with NEMO_ALPHA and contains a common domain (e.g. ".com")
105+
graph |= (
106+
pynutil.insert("domain: \"")
107+
+ pynini.compose(
108+
NEMO_ALPHA + pynini.closure(NEMO_NOT_SPACE) + accepted_common_domains + pynini.closure(NEMO_NOT_SPACE),
109+
domain_graph,
110+
).optimize()
111+
+ pynutil.insert("\"")
112+
)
82113
# www.abc.com/sdafsdf, or https://www.abc.com/asdfad or www.abc.abc/asdfad
83-
graph |= protocol + pynutil.insert(" ") + domain_graph
114+
graph |= protocol + pynutil.insert(" ") + domain_graph_with_class_tags
84115

85116
final_graph = self.add_tokens(graph)
86117

nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ def __init__(
123123
logging.debug(f"telephone: {time.time() - start_time: .2f}s -- {telephone_graph.num_states()} nodes")
124124

125125
start_time = time.time()
126-
electonic_graph = ElectronicFst(deterministic=deterministic).fst
126+
electonic_graph = ElectronicFst(cardinal=cardinal, deterministic=deterministic).fst
127127
logging.debug(f"electronic: {time.time() - start_time: .2f}s -- {electonic_graph.num_states()} nodes")
128128

129129
start_time = time.time()

nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify_lm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ def __init__(
120120
word_graph = WordFst(punctuation=punctuation, deterministic=deterministic).graph
121121
time_graph = TimeFst(cardinal=cardinal, deterministic=True).fst
122122
telephone_graph = TelephoneFst(deterministic=True).fst
123-
electronic_graph = ElectronicFst(deterministic=True).fst
123+
electronic_graph = ElectronicFst(cardinal=cardinal, deterministic=True).fst
124124
money_graph = MoneyFst(cardinal=cardinal, decimal=decimal, deterministic=False).fst
125125
whitelist = WhiteListFst(input_case=input_case, deterministic=False, input_file=whitelist)
126126
whitelist_graph = whitelist.graph

nemo_text_processing/text_normalization/en/taggers/tokenize_and_classify_with_audio.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ def __init__(
117117
word_graph = WordFst(punctuation=punctuation, deterministic=deterministic).graph
118118
time_graph = TimeFst(cardinal=cardinal, deterministic=deterministic).fst
119119
telephone_graph = TelephoneFst(deterministic=deterministic).fst
120-
electronic_graph = ElectronicFst(deterministic=deterministic).fst
120+
electronic_graph = ElectronicFst(cardinal=cardinal, deterministic=deterministic).fst
121121
money_graph = MoneyFst(cardinal=cardinal, decimal=decimal, deterministic=deterministic).fst
122122
whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist)
123123
whitelist_graph = whitelist.graph

nemo_text_processing/text_normalization/en/verbalizers/electronic.py

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,14 @@
1414

1515
import pynini
1616
from nemo_text_processing.text_normalization.en.graph_utils import (
17+
MIN_NEG_WEIGHT,
18+
NEMO_ALPHA,
19+
NEMO_CHAR,
20+
NEMO_LOWER,
1721
NEMO_NOT_QUOTE,
18-
NEMO_NOT_SPACE,
1922
NEMO_SIGMA,
20-
TO_UPPER,
23+
NEMO_SPACE,
24+
TO_LOWER,
2125
GraphFst,
2226
delete_extra_space,
2327
delete_space,
@@ -42,39 +46,55 @@ def __init__(self, deterministic: bool = True):
4246
super().__init__(name="electronic", kind="verbalize", deterministic=deterministic)
4347
graph_digit_no_zero = pynini.invert(pynini.string_file(get_abs_path("data/number/digit.tsv"))).optimize()
4448
graph_zero = pynini.cross("0", "zero")
49+
long_numbers = pynutil.add_weight(graph_digit_no_zero + pynini.cross("000", " thousand"), MIN_NEG_WEIGHT)
4550

4651
if not deterministic:
4752
graph_zero |= pynini.cross("0", "o") | pynini.cross("0", "oh")
4853

4954
graph_digit = graph_digit_no_zero | graph_zero
5055
graph_symbols = pynini.string_file(get_abs_path("data/electronic/symbol.tsv")).optimize()
5156

57+
NEMO_NOT_BRACKET = pynini.difference(NEMO_CHAR, pynini.union("{", "}")).optimize()
58+
dict_words = pynini.project(pynini.string_file(get_abs_path("data/electronic/words.tsv")), "output")
5259
default_chars_symbols = pynini.cdrewrite(
53-
pynutil.insert(" ") + (graph_symbols | graph_digit) + pynutil.insert(" "), "", "", NEMO_SIGMA
60+
pynutil.insert(" ") + (graph_symbols | graph_digit | long_numbers) + pynutil.insert(" "),
61+
"",
62+
"",
63+
NEMO_SIGMA,
5464
)
5565
default_chars_symbols = pynini.compose(
56-
pynini.closure(NEMO_NOT_SPACE), default_chars_symbols.optimize()
66+
pynini.closure(NEMO_NOT_BRACKET), default_chars_symbols.optimize()
5767
).optimize()
5868

69+
# this is for cases when the user name was split by dictionary words, e.g. "servicepart@ab.com" -> "service part"
70+
space_separated_dict_words = pynutil.add_weight(
71+
NEMO_ALPHA
72+
+ pynini.closure(NEMO_ALPHA | NEMO_SPACE)
73+
+ NEMO_SPACE
74+
+ pynini.closure(NEMO_ALPHA | NEMO_SPACE),
75+
MIN_NEG_WEIGHT,
76+
)
77+
5978
user_name = (
6079
pynutil.delete("username:")
6180
+ delete_space
6281
+ pynutil.delete("\"")
63-
+ default_chars_symbols
82+
+ (default_chars_symbols | space_separated_dict_words).optimize()
6483
+ pynutil.delete("\"")
6584
)
6685

6786
domain_common = pynini.string_file(get_abs_path("data/electronic/domain.tsv"))
6887

6988
domain = (
70-
default_chars_symbols
89+
pynini.compose(
90+
default_chars_symbols,
91+
pynini.closure(TO_LOWER | NEMO_LOWER | NEMO_SPACE | pynutil.add_weight(dict_words, MIN_NEG_WEIGHT)),
92+
)
7193
+ insert_space
7294
+ plurals._priority_union(
7395
domain_common, pynutil.add_weight(pynini.cross(".", "dot"), weight=0.0001), NEMO_SIGMA
7496
)
75-
+ pynini.closure(
76-
insert_space + (pynini.cdrewrite(TO_UPPER, "", "", NEMO_SIGMA) @ default_chars_symbols), 0, 1
77-
)
97+
+ pynini.closure(insert_space + default_chars_symbols, 0, 1)
7898
)
7999
domain = (
80100
pynutil.delete("domain:")

nemo_text_processing/text_normalization/normalize_with_audio.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
from nemo_text_processing.text_normalization.data_loader_utils import post_process_punct, pre_process
2424
from nemo_text_processing.text_normalization.normalize import Normalizer
2525
from nemo_text_processing.text_normalization.utils_audio_based import get_alignment
26-
from pynini import Far
2726
from pynini.lib import rewrite
2827

2928

@@ -188,6 +187,14 @@ def normalize(
188187
def normalize_non_deterministic(
189188
self, text: str, n_tagged: int, punct_post_process: bool = True, verbose: bool = False
190189
):
190+
# get deterministic option
191+
if self.tagger:
192+
deterministic_form = super().normalize(
193+
text=text, verbose=verbose, punct_pre_process=False, punct_post_process=punct_post_process
194+
)
195+
else:
196+
deterministic_form = None
197+
191198
original_text = text
192199

193200
text = pre_process(text) # to handle []
@@ -244,6 +251,9 @@ def normalize_non_deterministic(
244251
normalized_texts, weights = zip(*remove_dup)
245252
return list(normalized_texts), weights
246253

254+
if deterministic_form is not None:
255+
normalized_texts.append(deterministic_form)
256+
247257
normalized_texts = set(normalized_texts)
248258
return normalized_texts
249259

0 commit comments

Comments
 (0)