Skip to content

Commit 2f6f2f6

Browse files
Tn en astronomical no (#28)
* Add support for large numbers (>999,999,999,999,999)
  Signed-off-by: Anand Joseph <anajoseph@nvidia.com>
* Update cache folder in Jenkinsfile
  Signed-off-by: Anand Joseph <anajoseph@nvidia.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* Increase mem size for CI tests
  Signed-off-by: Anand Joseph <anajoseph@nvidia.com>
* Updating shmem for docker to deal with memory overflow
  Signed-off-by: Anand Joseph <anajoseph@nvidia.com>
* Ensure large au cardinal graph is used only if deterministic
  Signed-off-by: Anand Joseph <anajoseph@nvidia.com>
* Make comma mandatory in cardinals
  Signed-off-by: Anand Joseph <anajoseph@nvidia.com>
* Run FST cache generation and Pytests in separate stages
  Signed-off-by: Anand Joseph <anajoseph@nvidia.com>
* Fix stage
  Signed-off-by: Anand Joseph <anajoseph@nvidia.com>
* Change cache folder
  Signed-off-by: Anand Joseph <anajoseph@nvidia.com>
---------
Signed-off-by: Anand Joseph <anajoseph@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 0a72b79 commit 2f6f2f6

4 files changed

Lines changed: 30 additions & 10 deletions

File tree

Jenkinsfile

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ pipeline {
22
agent {
33
docker {
44
image 'nvcr.io/nvidia/pytorch:22.12-py3'
5-
args '--user 0:128 -v /home/jenkinsci:/home/jenkinsci -v $HOME/.cache:/root/.cache --shm-size=3g --entrypoint=""'
5+
args '--user 0:128 -v /home/jenkinsci:/home/jenkinsci -v $HOME/.cache:/root/.cache --shm-size=4g --entrypoint=""'
66
}
77
}
88
options {
@@ -42,7 +42,7 @@ pipeline {
4242

4343

4444

45-
stage('L0: TN/ITN Tests CPU') {
45+
stage('L0: TN/ITN Grammars') {
4646
when {
4747
anyOf {
4848
branch 'main'
@@ -51,24 +51,41 @@ pipeline {
5151
}
5252
failFast true
5353
parallel {
54-
stage('En TN grammars') {
54+
stage('L0: En TN grammars') {
5555
steps {
56-
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --text="1" --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-1'
56+
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --text="1" --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-3'
5757
}
5858
}
59-
stage('En ITN grammars') {
59+
stage('L0: En TN non-deterministic grammars') {
6060
steps {
61-
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en --text="twenty" --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-1'
61+
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text="1" --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-3'
6262
}
6363
}
64-
stage('Test En non-deterministic TN & Run all En TN/ITN tests (restore grammars from cache)') {
64+
stage('L0: En ITN grammars') {
6565
steps {
66-
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-1'
66+
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en --text="twenty" --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-3'
6767
}
6868
}
6969

7070
}
7171
}
72+
73+
stage('L1: TN/ITN Tests CPU') {
74+
when {
75+
anyOf {
76+
branch 'main'
77+
changeRequest target: 'main'
78+
}
79+
}
80+
failFast true
81+
parallel {
82+
stage('L1: Test En non-deterministic TN & Run all En TN/ITN tests (restore grammars from cache)') {
83+
steps {
84+
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-3'
85+
}
86+
}
87+
}
88+
}
7289

7390
stage('L2: NeMo text processing') {
7491
when {
@@ -82,7 +99,7 @@ pipeline {
8299
stage('L2: Eng TN') {
83100
steps {
84101
sh 'TIME=`date +"%Y-%m-%d-%T"` && NORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_norm/output_${TIME} && \
85-
cd tools/text_processing_deployment && python pynini_export.py --output=$NORM_OUTPUT_DIR --grammars=tn_grammars --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-1 --language=en && ls -R $NORM_OUTPUT_DIR && echo ".far files created "|| exit 1'
102+
cd tools/text_processing_deployment && python pynini_export.py --output=$NORM_OUTPUT_DIR --grammars=tn_grammars --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-3 --language=en && ls -R $NORM_OUTPUT_DIR && echo ".far files created "|| exit 1'
86103
sh 'TIME=`date +"%Y-%m-%d-%T"` && NORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_norm/output_${TIME} && mkdir $NORM_OUTPUT_DIR && \
87104
cd nemo_text_processing/text_normalization/ && python normalize.py --input_file=/home/jenkinsci/TestData/text_norm/ci/test.txt --input_case="lower_cased" --language=en --output_file=$NORM_OUTPUT_DIR/test.pynini.txt --verbose && \
88105
cat $NORM_OUTPUT_DIR/test.pynini.txt && \
@@ -94,7 +111,7 @@ pipeline {
94111
stage('L2: Eng ITN export') {
95112
steps {
96113
sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_denorm/output_${TIME} && \
97-
cd tools/text_processing_deployment && python pynini_export.py --output=$DENORM_OUTPUT_DIR --grammars=itn_grammars --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-1 --language=en && ls -R $DENORM_OUTPUT_DIR && echo ".far files created "|| exit 1'
114+
cd tools/text_processing_deployment && python pynini_export.py --output=$DENORM_OUTPUT_DIR --grammars=itn_grammars --cache_dir /home/jenkinsci/TestData/text_norm/ci/grammars/02-01-23-3 --language=en && ls -R $DENORM_OUTPUT_DIR && echo ".far files created "|| exit 1'
98115
sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_denorm/output_${TIME} && mkdir $DENORM_OUTPUT_DIR && \
99116
cd nemo_text_processing/inverse_text_normalization/ && python inverse_normalize.py --input_file=/home/jenkinsci/TestData/text_denorm/ci/test.txt --language=en --output_file=$DENORM_OUTPUT_DIR/test.pynini.txt --verbose && \
100117
cmp --silent $DENORM_OUTPUT_DIR/test.pynini.txt /home/jenkinsci/TestData/text_denorm/ci/test_goal_py.txt || exit 1 && \
Binary file not shown.

nemo_text_processing/text_normalization/en/taggers/cardinal.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ def __init__(self, deterministic: bool = True, lm: bool = False):
4444
self.deterministic = deterministic
4545
# TODO replace to have "oh" as a default for "0"
4646
graph = pynini.Far(get_abs_path("data/number/cardinal_number_name.far")).get_fst()
47+
graph_au = pynini.Far(get_abs_path("data/number/cardinal_number_name_au.far")).get_fst()
4748
self.graph_hundred_component_at_least_one_none_zero_digit = (
4849
pynini.closure(NEMO_DIGIT, 2, 3) | pynini.difference(NEMO_DIGIT, pynini.accep("0"))
4950
) @ graph
@@ -94,6 +95,7 @@ def __init__(self, deterministic: bool = True, lm: bool = False):
9495
pynini.accep("0") + pynini.closure(NEMO_DIGIT), self.single_digits_graph
9596
)
9697
final_graph = self.long_numbers | cardinal_with_leading_zeros
98+
final_graph |= self.add_optional_and(graph_au)
9799
else:
98100
leading_zeros = pynini.compose(pynini.closure(pynini.accep("0"), 1), self.single_digits_graph)
99101
cardinal_with_leading_zeros = (

tests/nemo_text_processing/en/data_text_normalization/test_cases_cardinal.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,4 @@
1515
1234567890123124~one two three four five six seven eight nine zero one two three one two four
1616
978-0~nine hundred and seventy eight - zero
1717
004~zero zero four
18+
124,444,234,854,823,834,553~one hundred twenty four quintillion four hundred forty four quadrillion two hundred thirty four trillion eight hundred fifty four billion eight hundred twenty three million eight hundred thirty four thousand five hundred and fifty three

0 commit comments

Comments
 (0)