Skip to content

Commit 36fdbf4

Browse files
authored
Align ci test (#51)
* added jenkins tests for alignment Signed-off-by: Yang Zhang <yangzhang@nvidia.com> * added test to pr doc Signed-off-by: Yang Zhang <yangzhang@nvidia.com> * fix ci test Signed-off-by: Yang Zhang <yangzhang@nvidia.com> * fix ci test Signed-off-by: Yang Zhang <yangzhang@nvidia.com> * fix ci test Signed-off-by: Yang Zhang <yangzhang@nvidia.com> * fix ci Signed-off-by: Yang Zhang <yangzhang@nvidia.com> * fix ci Signed-off-by: Yang Zhang <yangzhang@nvidia.com> * fix ci Signed-off-by: Yang Zhang <yangzhang@nvidia.com> * fix Signed-off-by: Yang Zhang <yangzhang@nvidia.com> * fix Signed-off-by: Yang Zhang <yangzhang@nvidia.com> --------- Signed-off-by: Yang Zhang <yangzhang@nvidia.com>
1 parent 2fe04a2 commit 36fdbf4

3 files changed

Lines changed: 28 additions & 4 deletions

File tree

.github/PULL_REQUEST_TEMPLATE.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,5 +24,6 @@ Add a one line overview of what this PR aims to accomplish.
2424
- [ ] New Feature
2525
- [ ] Bugfix
2626
- [ ] Documentation
27+
- [ ] Test
2728

2829
If you haven't finished some of the above items you can still open "Draft" PR.

Jenkinsfile

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ pipeline {
6565
}
6666
failFast true
6767
parallel {
68+
6869
stage('L0: En TN grammars') {
6970
steps {
7071
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --text="1" --cache_dir ${EN_TN_CACHE}'
@@ -375,7 +376,26 @@ pipeline {
375376
sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_denorm/output_${TIME} && mkdir $DENORM_OUTPUT_DIR && \
376377
cd nemo_text_processing/inverse_text_normalization/ && python inverse_normalize.py --input_file=/home/jenkinsci/TestData/text_denorm/ci/test.txt --language=en --output_file=$DENORM_OUTPUT_DIR/test.pynini.txt --verbose && \
377378
cmp --silent $DENORM_OUTPUT_DIR/test.pynini.txt /home/jenkinsci/TestData/text_denorm/ci/test_goal_py.txt || exit 1 && \
378-
rm -rf $OUTPUT_DIR'
379+
rm -rf $DENORM_OUTPUT_DIR'
380+
}
381+
}
382+
383+
384+
stage('L2: Eng alignment TN') {
385+
steps {
386+
sh 'TIME=`date +"%Y-%m-%d-%T"` && NORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_norm/output_${TIME} && mkdir $NORM_OUTPUT_DIR && \
387+
cd nemo_text_processing/fst_alignment && python alignment.py --text="2615 Forest Av, 90501 CA, Santa Clara. 10kg, 12/16/2018" --grammar=tn --rule=tokenize_and_classify --fst=${EN_TN_CACHE}/en_tn_True_deterministic_cased__tokenize.far 2>&1 | tee $NORM_OUTPUT_DIR/pred.txt && \
388+
cmp --silent $NORM_OUTPUT_DIR/pred.txt /home/jenkinsci/TestData/text_norm/ci/alignment_gold.txt || exit 1 && \
389+
rm -rf $NORM_OUTPUT_DIR'
390+
}
391+
}
392+
393+
stage('L2: Eng alignment ITN') {
394+
steps {
395+
sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_denorm/output_${TIME} && mkdir $DENORM_OUTPUT_DIR && \
396+
cd nemo_text_processing/fst_alignment && python alignment.py --text="one million twenty three thousand two hundred eleven ten kilograms one hundred twenty three dollars and twenty five cents" --grammar=itn --rule=tokenize_and_classify --fst=${EN_TN_CACHE}/en_itn_lower_cased.far 2>&1 | tee $DENORM_OUTPUT_DIR/pred.txt && \
397+
cmp --silent $DENORM_OUTPUT_DIR/pred.txt /home/jenkinsci/TestData/text_denorm/ci/alignment_gold.txt || exit 1 && \
398+
rm -rf $DENORM_OUTPUT_DIR'
379399
}
380400
}
381401

nemo_text_processing/fst_alignment/alignment.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515

1616
import logging
17+
import string
1718
from argparse import ArgumentParser
1819
from typing import List
1920

@@ -99,6 +100,7 @@ def parse_args():
99100
WHITE_SPACE = "\u23B5"
100101
ITN_MODE = "itn"
101102
TN_MODE = "tn"
103+
# Symbols checked against the second element of alignment entries when an
# aligned span is extended: ASCII letters, ASCII digits, and the punctuation
# characters "$", "\", ":", "+", "-", "=".
# The backslash is spelled "\\" because "\:" is not a valid escape sequence;
# Python keeps the backslash but emits a DeprecationWarning/SyntaxWarning.
# The resulting list is identical to the one produced by the old literal.
tn_itn_symbols = list(string.ascii_letters + string.digits) + list("$\\:+-=")
102104

103105

104106
def get_word_segments(text: str) -> List[List[int]]:
@@ -210,26 +212,27 @@ def indexed_map_to_output(alignment: List[tuple], start: int, end: int, mode: st
210212
aligned_end = _get_aligned_index(alignment, end - 1) # inclusive
211213

212214
logging.debug(f"0: |{list(map(remove, [x[0] for x in alignment[aligned_start:aligned_end+1]]))}|")
215+
logging.debug(f"1: |{aligned_start}:{aligned_end+1}|")
213216

214217
# extend aligned_start to left
215218

216219
while (
217220
aligned_start - 1 > 0
218221
and alignment[aligned_start - 1][0] == EPS
219-
and (alignment[aligned_start - 1][1].isalnum() or alignment[aligned_start - 1][1] == EPS)
222+
and (alignment[aligned_start - 1][1] in tn_itn_symbols or alignment[aligned_start - 1][1] == EPS)
220223
):
221224
aligned_start -= 1
222225

223226
while (
224227
aligned_end + 1 < len(alignment)
225228
and alignment[aligned_end + 1][0] == EPS
226-
and (alignment[aligned_end + 1][1].isalnum() or alignment[aligned_end + 1][1] == EPS)
229+
and (alignment[aligned_end + 1][1] in tn_itn_symbols or alignment[aligned_end + 1][1] == EPS)
227230
):
228231
aligned_end += 1
229232

230233
if mode == TN_MODE:
231234
while (aligned_end + 1) < len(alignment) and (
232-
alignment[aligned_end + 1][1].isalnum() or alignment[aligned_end + 1][1] == EPS
235+
alignment[aligned_end + 1][1] in tn_itn_symbols or alignment[aligned_end + 1][1] == EPS
233236
):
234237
aligned_end += 1
235238

0 commit comments

Comments
 (0)