Fix space issue with ZH ITN (#244)

zoobereq · anand-nv · mgrafu · commit cd610a172c8c · 2026-03-13T15:23:09.000-07:00
* Fix space issue with ZH ITN

Signed-off-by: Anand Joseph <anajoseph@nvidia.com>

* Update Jenkinsfile

Update FST paths

Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com>

---------

Signed-off-by: Anand Joseph <anajoseph@nvidia.com>
Signed-off-by: anand-nv <105917641+anand-nv@users.noreply.github.com>
Signed-off-by: Simon Zuberek <simon@zuberek.net>
Co-authored-by: Anand Joseph <anajoseph@nvidia.com>
Co-authored-by: anand-nv <105917641+anand-nv@users.noreply.github.com>
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -22,7 +22,7 @@ pipeline {
     RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
     VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
     SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
-    ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-30-24-0'
+    ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-13-24-0'
     IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-22-24-0'
     HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0'
     MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1'
diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py
@@ -21,6 +21,7 @@
     GraphFst,
     delete_extra_space,
     delete_space,
+    delete_zero_or_one_space,
     generator_main,
 )
 from nemo_text_processing.inverse_text_normalization.zh.taggers.cardinal import CardinalFst
@@ -92,12 +93,12 @@ def __init__(
             )
 
             punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }")
-            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
+            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" } ")
             token_plus_punct = (
                 pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct)
             )
 
-            graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)
+            graph = token_plus_punct + pynini.closure(delete_zero_or_one_space + token_plus_punct)
             graph = delete_space + graph + delete_space
 
             self.fst = graph.optimize()
diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/word.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/word.py
@@ -26,5 +26,5 @@ class WordFst(GraphFst):
 
     def __init__(self):
         super().__init__(name="word", kind="classify")
-        word = pynutil.insert('name: "') + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert('"')
+        word = pynutil.insert('name: "') + NEMO_NOT_SPACE + pynutil.insert('"')
         self.fst = word.optimize()
diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/verbalize_final.py
@@ -40,5 +40,5 @@ def __init__(self):
             + delete_space
             + pynutil.delete("}")
         )
-        graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space
+        graph = delete_space + pynini.closure(graph + delete_space) + graph + delete_space
         self.fst = graph

Original file line number	Diff line number	Diff line change
`@@ -21,6 +21,7 @@`
`21`	`21`	`GraphFst,`
`22`	`22`	`delete_extra_space,`
`23`	`23`	`delete_space,`
	`24`	`+ delete_zero_or_one_space,`
`24`	`25`	`generator_main,`
`25`	`26`	`)`
`26`	`27`	`from nemo_text_processing.inverse_text_normalization.zh.taggers.cardinal import CardinalFst`
`@@ -92,12 +93,12 @@ def __init__(`
`92`	`93`	`)`
`93`	`94`
`94`	`95`	`punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }")`
`95`		`- token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")`
	`96`	`+ token = pynutil.insert("tokens { ") + classify + pynutil.insert(" } ")`
`96`	`97`	`token_plus_punct = (`
`97`	`98`	`pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct)`
`98`	`99`	`)`
`99`	`100`
`100`		`- graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)`
	`101`	`+ graph = token_plus_punct + pynini.closure(delete_zero_or_one_space + token_plus_punct)`
`101`	`102`	`graph = delete_space + graph + delete_space`
`102`	`103`
`103`	`104`	`self.fst = graph.optimize()`
Original file line number	Diff line number	Diff line change
`@@ -40,5 +40,5 @@ def __init__(self):`
`40`	`40`	`+ delete_space`
`41`	`41`	`+ pynutil.delete("}")`
`42`	`42`	`)`
`43`		`- graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space`
	`43`	`+ graph = delete_space + pynini.closure(graph + delete_space) + graph + delete_space`
`44`	`44`	`self.fst = graph`