Skip to content

Commit 45c04c4

Browse files
Zhitn 0727 (#93)
* updates on itn grammar to pass sparrowhawk tests Signed-off-by: BuyuanCui <alexcui1994@gmail.com> * updates for sparrowhawk tests Signed-off-by: BuyuanCui <alexcui1994@gmail.com> * updates for sparrowhawk tests Signed-off-by: BuyuanCui <alexcui1994@gmail.com> * coding style fix Signed-off-by: BuyuanCui <alexcui1994@gmail.com> * updates for coding style and sparrowhawk test Signed-off-by: BuyuanCui <alexcui1994@gmail.com> * updated classes for tests on whitelist and word grammar Signed-off-by: BuyuanCui <alexcui1994@gmail.com> * added for tests on whitelist Signed-off-by: BuyuanCui <alexcui1994@gmail.com> * added for test on word Signed-off-by: BuyuanCui <alexcui1994@gmail.com> * added to run test on whitelist Signed-off-by: BuyuanCui <alexcui1994@gmail.com> * added to run test on word Signed-off-by: BuyuanCui <alexcui1994@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update test_word.py Removed unused import. Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update test_word.py Removed imports according to CodeQL Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update test_whitelist.py Removing imports according to CodeQL Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update test_whitelist.py Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update Jenkinsfile changed zh cache to 07-27-23 as it is the latest update. Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> --------- Signed-off-by: BuyuanCui <alexcui1994@gmail.com> Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent a95fa25 commit 45c04c4

9 files changed

Lines changed: 142 additions & 74 deletions

File tree

Jenkinsfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ pipeline {
2222
RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
2323
VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
2424
SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
25-
ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-12-23-0'
25+
ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-27-23-0'
2626
DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
2727

2828
}

nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,11 @@ def __init__(self):
7878
| (graph_digits + delete_ten_thousands + pynini.cross(pynini.closure("零"), "00") + graph_all)
7979
| (graph_digits + delete_ten_thousands + pynini.cross(pynini.closure("零"), "000") + graph_digits)
8080
)
81-
graph_ten_thousands = graph_ten_thousands_simple | graph_ten_thousands_complex | pynutil.insert("00000")
81+
graph_ten_thousands = (
82+
pynutil.add_weight(graph_ten_thousands_simple, -1.0)
83+
| graph_ten_thousands_complex
84+
| pynutil.insert("00000")
85+
)
8286

8387
# grammmar for hundred thousands 十万
8488
graph_hundred_thousands_simple = graph_all + closure_ten_thousands
@@ -88,8 +92,10 @@ def __init__(self):
8892
| (graph_all + delete_ten_thousands + pynini.cross(pynini.closure("零"), "00") + graph_all)
8993
| (graph_all + delete_ten_thousands + pynini.cross(pynini.closure("零"), "000") + graph_digits)
9094
)
91-
graph_hundred_thousands = (graph_hundred_thousands_simple | graph_hundred_thousands_complex) | pynutil.insert(
92-
"000000"
95+
graph_hundred_thousands = (
96+
pynutil.add_weight(graph_hundred_thousands_simple, -1.0)
97+
| graph_hundred_thousands_complex
98+
| pynutil.insert("000000")
9399
)
94100

95101
# grammar for millions 百万
@@ -168,7 +174,9 @@ def __init__(self):
168174
| (graph_digits + delete_hundred_millions + pynini.cross(pynini.closure("零"), "0000000") + graph_digits)
169175
)
170176
graph_hundred_millions = (
171-
graph_hundred_millions_simple | graph_hundred_millions_complex | pynutil.insert("000000000")
177+
pynutil.add_weight(graph_hundred_millions_simple, -1.0)
178+
| graph_hundred_millions_complex
179+
| pynutil.insert("000000000")
172180
)
173181

174182
# grammar for billions 十亿
@@ -203,7 +211,9 @@ def __init__(self):
203211
| (graph_all + delete_hundred_millions + pynini.cross(pynini.closure("零"), "000000") + graph_all)
204212
| (graph_all + delete_hundred_millions + pynini.cross(pynini.closure("零"), "0000000") + graph_digits)
205213
)
206-
graph_billions = graph_billions_simple | graph_billions_complex | pynutil.insert("0000000000")
214+
graph_billions = (
215+
pynutil.add_weight(graph_billions_simple, -1.0) | graph_billions_complex | pynutil.insert("0000000000")
216+
)
207217

208218
# grammar for ten billions 百亿
209219
graph_ten_billions_simple = graph_hundreds_complex + closure_hundred_millions
@@ -252,7 +262,11 @@ def __init__(self):
252262
+ graph_digits
253263
)
254264
)
255-
graph_ten_billions = graph_ten_billions_simple | graph_ten_billions_complex | pynutil.insert("00000000000")
265+
graph_ten_billions = (
266+
pynutil.add_weight(graph_ten_billions_simple, -1.0)
267+
| graph_ten_billions_complex
268+
| pynutil.insert("00000000000")
269+
)
256270

257271
# grammar for hundred billions 千亿
258272
graph_hundred_billions_simple = graph_thousands_complex + closure_hundred_millions
@@ -301,7 +315,9 @@ def __init__(self):
301315
+ graph_digits
302316
)
303317
)
304-
graph_hundred_billions = graph_hundred_billions_simple | graph_hundred_billions_complex
318+
graph_hundred_billions = (
319+
pynutil.add_weight(graph_hundred_billions_simple, -1.0) | graph_hundred_billions_complex
320+
)
305321

306322
# combining grammar; output for cardinal grammar
307323
graph = pynini.union(

nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py

Lines changed: 11 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -68,35 +68,20 @@ class DecimalFst(GraphFst):
6868
def __init__(self, cardinal: GraphFst):
6969
super().__init__(name="decimal", kind="classify")
7070

71-
cardinal_after_decimal = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv"))
72-
cardinal_before_decimal = cardinal.just_cardinals | (pynini.closure(pynini.cross("零", "0"), 0, 1))
73-
74-
delete_decimal = pynutil.delete("点") | pynutil.delete(
75-
"點"
76-
) # delete decimal character, 'point' in english in 'one point two for 1.2'
77-
78-
# grammar for integer part
79-
graph_integer = (
80-
pynutil.insert('integer_part: "')
81-
+ (cardinal_before_decimal | (pynini.closure(pynini.cross("零", "0"), 0, 1)))
82-
+ pynutil.insert('" ')
83-
) # tokenization on just numbers
84-
graph_integer_or_none = graph_integer | pynutil.insert('integer_part: "0" ', weight=0.01) # integer or zero
85-
86-
# grammar for fractional part
87-
delete_zero = pynini.closure(pynini.cross("零", "0"))
88-
graph_string_of_cardinals = cardinal_after_decimal
89-
graph_string_of_cardinals = pynini.closure(
90-
(pynini.closure(graph_string_of_cardinals) + delete_zero + pynini.closure(graph_string_of_cardinals)), 1
91-
)
92-
graph_fractional = pynini.closure(
93-
pynutil.insert('fractional_part: "') + graph_string_of_cardinals + pynutil.insert('"'), 1
71+
cardinal_after_decimal = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv")) | pynini.closure(
72+
pynini.cross("零", "0")
9473
)
74+
cardinal_before_decimal = cardinal.just_cardinals | pynini.cross("零", "0")
75+
76+
delete_decimal = pynutil.delete("点") | pynutil.delete("點")
77+
78+
graph_integer = pynutil.insert('integer_part: "') + cardinal_before_decimal + pynutil.insert('" ')
79+
80+
graph_string_of_cardinals = pynini.closure(cardinal_after_decimal, 1)
81+
graph_fractional = pynutil.insert('fractional_part: "') + graph_string_of_cardinals + pynutil.insert('"')
9582

96-
# grammar for decimal: integer+delete character+part after decimal point
97-
graph_decimal_no_sign = pynini.closure((graph_integer_or_none + delete_decimal + graph_fractional), 1)
83+
graph_decimal_no_sign = pynini.closure((graph_integer + delete_decimal + graph_fractional), 1)
9884

99-
# New Grammar added for Money
10085
self.final_graph_wo_negative = graph_decimal_no_sign | get_quantity(
10186
graph_decimal_no_sign, cardinal.just_cardinals
10287
)

nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -32,14 +32,6 @@ def __init__(self):
3232

3333
# insert a "," for every three numbers before decimal point
3434
space_every_three_integer = at_most_three_digits + (pynutil.insert(",") + exactly_three_digits).closure()
35-
# insert a "," for every three numbers after decimal point
36-
space_every_three_decimal = (
37-
pynini.accep(".") + (exactly_three_digits + pynutil.insert(",")).closure() + at_most_three_digits
38-
)
39-
40-
# combine both
41-
group_by_threes = space_every_three_integer | space_every_three_decimal
42-
self.group_by_threes = group_by_threes
4335

4436
# removing tokenizations, 'negative: '
4537
optional_sign = pynini.closure(
@@ -56,10 +48,10 @@ def __init__(self):
5648
pynutil.delete("integer_part:")
5749
+ delete_space
5850
+ pynutil.delete('"')
59-
+ pynini.closure(NEMO_NOT_QUOTE, 1)
51+
+ pynini.closure(NEMO_DIGIT, 1)
6052
+ pynutil.delete('"')
6153
)
62-
integer = integer @ group_by_threes
54+
integer = integer @ space_every_three_integer
6355
optional_integer = pynini.closure(integer + delete_space, 0, 1)
6456

6557
# removing tokenizations, 'fractionl_part'
@@ -81,10 +73,11 @@ def __init__(self):
8173
+ pynini.closure(NEMO_NOT_QUOTE, 1)
8274
+ pynutil.delete('"')
8375
)
84-
optional_quantity = pynini.closure(quantity + delete_space)
76+
optional_quantity = pynini.closure(delete_space + quantity)
8577

8678
# combining graphs removing tokenizations *3
8779
graph = (optional_integer + optional_fractional + optional_quantity).optimize()
80+
8881
graph = optional_sign + graph # add optional sign for negative number
8982
self.numebrs = graph
9083
delete_tokens = self.delete_tokens(graph)
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
人力资源~HR
2+
自动取款机~ATM
3+
人力资源~HR
4+
首席执行官~CEO
5+
美国研究生入学考试~GRE
6+
研究生管理专业入学考试~GMAT
7+
全球定位系统~GPS
8+
刷卡机~POS机
9+
数位多功能光碟~DVD
10+
镭射唱片~CD
11+
通用串行总线~USB
12+
统一资源定位符~URL
13+
虚拟专用网络~VPN
14+
网络互联协议~IP
15+
脱氧核糖核酸~DNA
16+
核糖核酸~RNA
17+
平均学分绩点~GPA
18+
发光二极管~LED
19+
可移植文档格式~PDF
20+
社会性网络服务~SNS
21+
博士~PhD
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
你好~你好
2+
年级~年级
3+
秘密~秘密
4+
键盘~键盘
5+
借口~借口
6+
学生~学生
7+
人力~人力
8+
转移~转移
9+
徘徊~徘徊
10+
冤枉~冤枉
11+
浏览~浏览
12+
珍藏~珍藏
13+
患难 ~患难
14+
湿~湿
15+
眼眶~眼眶
16+
遗产~遗产
17+
流浪~流浪
18+
信仰~信仰
19+
戒指~戒指
20+
义无反顾~义无反顾
21+
交换~交换

tests/nemo_text_processing/zh/test_sparrowhawk_inverse_text_normalization.sh

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -21,62 +21,62 @@ runtest () {
2121
}
2222

2323
testITNCardinal() {
24-
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_cardinal.txt
24+
input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_cardinal.txt
2525
runtest $input
2626
}
2727

2828
testITNDate() {
29-
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_date.txt
29+
input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_date.txt
3030
runtest $input
3131
}
3232

3333
testITNDecimal() {
34-
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_decimal.txt
34+
input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_decimal.txt
3535
runtest $input
3636
}
3737

3838
testITNOrdinal() {
39-
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_ordinal.txt
39+
input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_ordinal.txt
4040
runtest $input
4141
}
4242

4343
testITNFraction() {
44-
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_fraction.txt
44+
input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_fraction.txt
4545
runtest $input
4646
}
4747

4848
testITNTime() {
49-
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_time.txt
49+
input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_time.txt
5050
runtest $input
5151
}
5252

53-
testITNMeasure() {
54-
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_measure.txt
55-
runtest $input
56-
}
53+
#testITNMeasure() {
54+
# input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_measure.txt
55+
# runtest $input
56+
#}
5757

5858
testITNMoney() {
59-
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_money.txt
59+
input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_money.txt
6060
runtest $input
6161
}
6262

6363
testITNWhitelist() {
64-
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_whitelist.txt
64+
input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_whitelist.txt
6565
runtest $input
6666
}
6767

68-
testITNTelephone() {
69-
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_telephone.txt
70-
runtest $input
71-
}
68+
#testITNTelephone() {
69+
# input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_telephone.txt
70+
# runtest $input
71+
#}
7272

73-
testITNElectronic() {
74-
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_electronic.txt
75-
runtest $input
76-
}
73+
#testITNElectronic() {
74+
# input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_electronic.txt
75+
# runtest $input
76+
#}
7777

7878
testITNWord() {
79-
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_word.txt
79+
input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_word.txt
8080
runtest $input
8181
}
8282

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
import pytest
17+
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
18+
from parameterized import parameterized
19+
20+
from ..utils import CACHE_DIR, parse_test_case_file
21+
22+
23+
class TestWhitelist:
24+
inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False)
25+
26+
@parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_whitelist.txt'))
27+
@pytest.mark.run_only_on('CPU')
28+
@pytest.mark.unit
29+
def test_denorm(self, test_input, expected):
30+
pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
31+
assert pred == expected
Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -12,19 +12,20 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
1516
import pytest
16-
from nemo_text_processing.text_normalization.normalize import Normalizer
17+
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
1718
from parameterized import parameterized
1819

1920
from ..utils import CACHE_DIR, parse_test_case_file
2021

2122

22-
class TestChar:
23-
normalizer_zh = Normalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased')
23+
class TestWord:
24+
inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False)
2425

25-
@parameterized.expand(parse_test_case_file('zh/data_text_normalization/test_cases_word.txt'))
26+
@parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_word.txt'))
2627
@pytest.mark.run_only_on('CPU')
2728
@pytest.mark.unit
28-
def test_norm_char(self, test_input, expected):
29-
preds = self.normalizer_zh.normalize(test_input)
30-
assert expected == preds
29+
def test_denorm(self, test_input, expected):
30+
pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
31+
assert pred == expected

0 commit comments

Comments
 (0)