Skip to content

Commit 45c04c4

Browse files
Zhitn 0727 (#93)
* updates on itn grammar to pass sparrowhawk tests Signed-off-by: BuyuanCui <alexcui1994@gmail.com> * updates for sparrowhawk tests Signed-off-by: BuyuanCui <alexcui1994@gmail.com> * updates for sparrowhawk tests Signed-off-by: BuyuanCui <alexcui1994@gmail.com> * coding style fix Signed-off-by: BuyuanCui <alexcui1994@gmail.com> * updates for coding style and sparrowhawk test Signed-off-by: BuyuanCui <alexcui1994@gmail.com> * updated classes for tests on whitelist and word grammar Signed-off-by: BuyuanCui <alexcui1994@gmail.com> * added for tests on whitelist Signed-off-by: BuyuanCui <alexcui1994@gmail.com> * added for test on word Signed-off-by: BuyuanCui <alexcui1994@gmail.com> * added to run test on whitelist Signed-off-by: BuyuanCui <alexcui1994@gmail.com> * added to run test on word Signed-off-by: BuyuanCui <alexcui1994@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update test_word.py Removed unused import. Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update test_word.py Removed imports according to CodeQL Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update test_whitelist.py Removing imports according to CodeQL Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update test_whitelist.py Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> * Update Jenkinsfile changed zh cache to 07-27-23 as it is the latest update. Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> --------- Signed-off-by: BuyuanCui <alexcui1994@gmail.com> Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent a95fa25 commit 45c04c4

9 files changed

Lines changed: 142 additions & 74 deletions

File tree

Jenkinsfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ pipeline {
2222
RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
2323
VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
2424
SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
25-
ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-12-23-0'
25+
ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-27-23-0'
2626
DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
2727

2828
}

nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,11 @@ def __init__(self):
7878
| (graph_digits + delete_ten_thousands + pynini.cross(pynini.closure("零"), "00") + graph_all)
7979
| (graph_digits + delete_ten_thousands + pynini.cross(pynini.closure("零"), "000") + graph_digits)
8080
)
81-
graph_ten_thousands = graph_ten_thousands_simple | graph_ten_thousands_complex | pynutil.insert("00000")
81+
graph_ten_thousands = (
82+
pynutil.add_weight(graph_ten_thousands_simple, -1.0)
83+
| graph_ten_thousands_complex
84+
| pynutil.insert("00000")
85+
)
8286

8387
# grammmar for hundred thousands 十万
8488
graph_hundred_thousands_simple = graph_all + closure_ten_thousands
@@ -88,8 +92,10 @@ def __init__(self):
8892
| (graph_all + delete_ten_thousands + pynini.cross(pynini.closure("零"), "00") + graph_all)
8993
| (graph_all + delete_ten_thousands + pynini.cross(pynini.closure("零"), "000") + graph_digits)
9094
)
91-
graph_hundred_thousands = (graph_hundred_thousands_simple | graph_hundred_thousands_complex) | pynutil.insert(
92-
"000000"
95+
graph_hundred_thousands = (
96+
pynutil.add_weight(graph_hundred_thousands_simple, -1.0)
97+
| graph_hundred_thousands_complex
98+
| pynutil.insert("000000")
9399
)
94100

95101
# grammar for millions 百万
@@ -168,7 +174,9 @@ def __init__(self):
168174
| (graph_digits + delete_hundred_millions + pynini.cross(pynini.closure("零"), "0000000") + graph_digits)
169175
)
170176
graph_hundred_millions = (
171-
graph_hundred_millions_simple | graph_hundred_millions_complex | pynutil.insert("000000000")
177+
pynutil.add_weight(graph_hundred_millions_simple, -1.0)
178+
| graph_hundred_millions_complex
179+
| pynutil.insert("000000000")
172180
)
173181

174182
# grammar for billions 十亿
@@ -203,7 +211,9 @@ def __init__(self):
203211
| (graph_all + delete_hundred_millions + pynini.cross(pynini.closure("零"), "000000") + graph_all)
204212
| (graph_all + delete_hundred_millions + pynini.cross(pynini.closure("零"), "0000000") + graph_digits)
205213
)
206-
graph_billions = graph_billions_simple | graph_billions_complex | pynutil.insert("0000000000")
214+
graph_billions = (
215+
pynutil.add_weight(graph_billions_simple, -1.0) | graph_billions_complex | pynutil.insert("0000000000")
216+
)
207217

208218
# grammar for ten billions 百亿
209219
graph_ten_billions_simple = graph_hundreds_complex + closure_hundred_millions
@@ -252,7 +262,11 @@ def __init__(self):
252262
+ graph_digits
253263
)
254264
)
255-
graph_ten_billions = graph_ten_billions_simple | graph_ten_billions_complex | pynutil.insert("00000000000")
265+
graph_ten_billions = (
266+
pynutil.add_weight(graph_ten_billions_simple, -1.0)
267+
| graph_ten_billions_complex
268+
| pynutil.insert("00000000000")
269+
)
256270

257271
# grammar for hundred billions 千亿
258272
graph_hundred_billions_simple = graph_thousands_complex + closure_hundred_millions
@@ -301,7 +315,9 @@ def __init__(self):
301315
+ graph_digits
302316
)
303317
)
304-
graph_hundred_billions = graph_hundred_billions_simple | graph_hundred_billions_complex
318+
graph_hundred_billions = (
319+
pynutil.add_weight(graph_hundred_billions_simple, -1.0) | graph_hundred_billions_complex
320+
)
305321

306322
# combining grammar; output for cardinal grammar
307323
graph = pynini.union(

nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py

Lines changed: 11 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -68,35 +68,20 @@ class DecimalFst(GraphFst):
6868
def __init__(self, cardinal: GraphFst):
6969
super().__init__(name="decimal", kind="classify")
7070

71-
cardinal_after_decimal = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv"))
72-
cardinal_before_decimal = cardinal.just_cardinals | (pynini.closure(pynini.cross("零", "0"), 0, 1))
73-
74-
delete_decimal = pynutil.delete("点") | pynutil.delete(
75-
"點"
76-
) # delete decimal character, 'point' in english in 'one point two for 1.2'
77-
78-
# grammar for integer part
79-
graph_integer = (
80-
pynutil.insert('integer_part: "')
81-
+ (cardinal_before_decimal | (pynini.closure(pynini.cross("零", "0"), 0, 1)))
82-
+ pynutil.insert('" ')
83-
) # tokenization on just numbers
84-
graph_integer_or_none = graph_integer | pynutil.insert('integer_part: "0" ', weight=0.01) # integer or zero
85-
86-
# grammar for fractional part
87-
delete_zero = pynini.closure(pynini.cross("零", "0"))
88-
graph_string_of_cardinals = cardinal_after_decimal
89-
graph_string_of_cardinals = pynini.closure(
90-
(pynini.closure(graph_string_of_cardinals) + delete_zero + pynini.closure(graph_string_of_cardinals)), 1
91-
)
92-
graph_fractional = pynini.closure(
93-
pynutil.insert('fractional_part: "') + graph_string_of_cardinals + pynutil.insert('"'), 1
71+
cardinal_after_decimal = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv")) | pynini.closure(
72+
pynini.cross("零", "0")
9473
)
74+
cardinal_before_decimal = cardinal.just_cardinals | pynini.cross("零", "0")
75+
76+
delete_decimal = pynutil.delete("点") | pynutil.delete("點")
77+
78+
graph_integer = pynutil.insert('integer_part: "') + cardinal_before_decimal + pynutil.insert('" ')
79+
80+
graph_string_of_cardinals = pynini.closure(cardinal_after_decimal, 1)
81+
graph_fractional = pynutil.insert('fractional_part: "') + graph_string_of_cardinals + pynutil.insert('"')
9582

96-
# grammar for decimal: integer+delete character+part after decimal point
97-
graph_decimal_no_sign = pynini.closure((graph_integer_or_none + delete_decimal + graph_fractional), 1)
83+
graph_decimal_no_sign = pynini.closure((graph_integer + delete_decimal + graph_fractional), 1)
9884

99-
# New Grammar added for Money
10085
self.final_graph_wo_negative = graph_decimal_no_sign | get_quantity(
10186
graph_decimal_no_sign, cardinal.just_cardinals
10287
)

nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -32,14 +32,6 @@ def __init__(self):
3232

3333
# insert a "," for every three numbers before decimal point
3434
space_every_three_integer = at_most_three_digits + (pynutil.insert(",") + exactly_three_digits).closure()
35-
# insert a "," for every three numbers after decimal point
36-
space_every_three_decimal = (
37-
pynini.accep(".") + (exactly_three_digits + pynutil.insert(",")).closure() + at_most_three_digits
38-
)
39-
40-
# combine both
41-
group_by_threes = space_every_three_integer | space_every_three_decimal
42-
self.group_by_threes = group_by_threes
4335

4436
# removing tokenizations, 'negative: '
4537
optional_sign = pynini.closure(
@@ -56,10 +48,10 @@ def __init__(self):
5648
pynutil.delete("integer_part:")
5749
+ delete_space
5850
+ pynutil.delete('"')
59-
+ pynini.closure(NEMO_NOT_QUOTE, 1)
51+
+ pynini.closure(NEMO_DIGIT, 1)
6052
+ pynutil.delete('"')
6153
)
62-
integer = integer @ group_by_threes
54+
integer = integer @ space_every_three_integer
6355
optional_integer = pynini.closure(integer + delete_space, 0, 1)
6456

6557
# removing tokenizations, 'fractionl_part'
@@ -81,10 +73,11 @@ def __init__(self):
8173
+ pynini.closure(NEMO_NOT_QUOTE, 1)
8274
+ pynutil.delete('"')
8375
)
84-
optional_quantity = pynini.closure(quantity + delete_space)
76+
optional_quantity = pynini.closure(delete_space + quantity)
8577

8678
# combining graphs removing tokenizations *3
8779
graph = (optional_integer + optional_fractional + optional_quantity).optimize()
80+
8881
graph = optional_sign + graph # add optional sign for negative number
8982
self.numebrs = graph
9083
delete_tokens = self.delete_tokens(graph)
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
人力资源~HR
2+
自动取款机~ATM
3+
人力资源~HR
4+
首席执行官~CEO
5+
美国研究生入学考试~GRE
6+
研究生管理专业入学考试~GMAT
7+
全球定位系统~GPS
8+
刷卡机~POS机
9+
数位多功能光碟~DVD
10+
镭射唱片~CD
11+
通用串行总线~USB
12+
统一资源定位符~URL
13+
虚拟专用网络~VPN
14+
网络互联协议~IP
15+
脱氧核糖核酸~DNA
16+
核糖核酸~RNA
17+
平均学分绩点~GPA
18+
发光二极管~LED
19+
可移植文档格式~PDF
20+
社会性网络服务~SNS
21+
博士~PhD
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
你好~你好
2+
年级~年级
3+
秘密~秘密
4+
键盘~键盘
5+
借口~借口
6+
学生~学生
7+
人力~人力
8+
转移~转移
9+
徘徊~徘徊
10+
冤枉~冤枉
11+
浏览~浏览
12+
珍藏~珍藏
13+
患难 ~患难
14+
湿~湿
15+
眼眶~眼眶
16+
遗产~遗产
17+
流浪~流浪
18+
信仰~信仰
19+
戒指~戒指
20+
义无反顾~义无反顾
21+
交换~交换

tests/nemo_text_processing/zh/test_sparrowhawk_inverse_text_normalization.sh

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -21,62 +21,62 @@ runtest () {
2121
}
2222

2323
testITNCardinal() {
24-
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_cardinal.txt
24+
input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_cardinal.txt
2525
runtest $input
2626
}
2727

2828
testITNDate() {
29-
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_date.txt
29+
input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_date.txt
3030
runtest $input
3131
}
3232

3333
testITNDecimal() {
34-
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_decimal.txt
34+
input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_decimal.txt
3535
runtest $input
3636
}
3737

3838
testITNOrdinal() {
39-
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_ordinal.txt
39+
input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_ordinal.txt
4040
runtest $input
4141
}
4242

4343
testITNFraction() {
44-
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_fraction.txt
44+
input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_fraction.txt
4545
runtest $input
4646
}
4747

4848
testITNTime() {
49-
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_time.txt
49+
input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_time.txt
5050
runtest $input
5151
}
5252

53-
testITNMeasure() {
54-
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_measure.txt
55-
runtest $input
56-
}
53+
#testITNMeasure() {
54+
# input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_measure.txt
55+
# runtest $input
56+
#}
5757

5858
testITNMoney() {
59-
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_money.txt
59+
input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_money.txt
6060
runtest $input
6161
}
6262

6363
testITNWhitelist() {
64-
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_whitelist.txt
64+
input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_whitelist.txt
6565
runtest $input
6666
}
6767

68-
testITNTelephone() {
69-
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_telephone.txt
70-
runtest $input
71-
}
68+
#testITNTelephone() {
69+
# input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_telephone.txt
70+
# runtest $input
71+
#}
7272

73-
testITNElectronic() {
74-
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_electronic.txt
75-
runtest $input
76-
}
73+
#testITNElectronic() {
74+
# input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_electronic.txt
75+
# runtest $input
76+
#}
7777

7878
testITNWord() {
79-
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_word.txt
79+
input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_word.txt
8080
runtest $input
8181
}
8282

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
import pytest
17+
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
18+
from parameterized import parameterized
19+
20+
from ..utils import CACHE_DIR, parse_test_case_file
21+
22+
23+
class TestWhitelist:
24+
inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False)
25+
26+
@parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_whitelist.txt'))
27+
@pytest.mark.run_only_on('CPU')
28+
@pytest.mark.unit
29+
def test_denorm(self, test_input, expected):
30+
pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
31+
assert pred == expected
Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -12,19 +12,20 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
1516
import pytest
16-
from nemo_text_processing.text_normalization.normalize import Normalizer
17+
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
1718
from parameterized import parameterized
1819

1920
from ..utils import CACHE_DIR, parse_test_case_file
2021

2122

22-
class TestChar:
23-
normalizer_zh = Normalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased')
23+
class TestWord:
24+
inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False)
2425

25-
@parameterized.expand(parse_test_case_file('zh/data_text_normalization/test_cases_word.txt'))
26+
@parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_word.txt'))
2627
@pytest.mark.run_only_on('CPU')
2728
@pytest.mark.unit
28-
def test_norm_char(self, test_input, expected):
29-
preds = self.normalizer_zh.normalize(test_input)
30-
assert expected == preds
29+
def test_denorm(self, test_input, expected):
30+
pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
31+
assert pred == expected

0 commit comments

Comments
 (0)