NVIDIA
diff --git a/Jenkinsfile b/Jenkinsfile
Lines changed: 14 additions & 5 deletions
diff --git a/nemo_text_processing/inverse_text_normalization/hi_en/__init__.py b/nemo_text_processing/inverse_text_normalization/hi_en/__init__.py
Lines changed: 17 additions & 0 deletions
diff --git a/nemo_text_processing/inverse_text_normalization/hi_en/graph_utils.py b/nemo_text_processing/inverse_text_normalization/hi_en/graph_utils.py
Lines changed: 13 additions & 0 deletions
diff --git a/nemo_text_processing/inverse_text_normalization/hi_en/taggers/__init__.py b/nemo_text_processing/inverse_text_normalization/hi_en/taggers/__init__.py
Lines changed: 13 additions & 0 deletions
diff --git a/nemo_text_processing/inverse_text_normalization/hi_en/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/hi_en/taggers/tokenize_and_classify.py
Lines changed: 173 additions & 0 deletions
diff --git a/nemo_text_processing/inverse_text_normalization/hi_en/utils.py b/nemo_text_processing/inverse_text_normalization/hi_en/utils.py
Lines changed: 27 additions & 0 deletions
diff --git a/nemo_text_processing/inverse_text_normalization/hi_en/verbalizers/__init__.py b/nemo_text_processing/inverse_text_normalization/hi_en/verbalizers/__init__.py
Lines changed: 13 additions & 0 deletions
@@ -12,9 +12,10 @@ pipeline {
   environment {
     AR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-24-24-0'
     DE_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-23-24-0'
-    EN_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/09-25-25-0'
+    EN_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-04-26-3'
     ES_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/09-25-24-0'
     ES_EN_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/08-30-24-0'
+    HI_EN_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-04-26-4'
     FR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-07-25-0'
     HU_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/07-16-24-0'
     PT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/05-01-26-1'
@@ -27,8 +28,8 @@ pipeline {
     HY_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-0'
     MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1'
     JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1'
-    KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0'
-    HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0'
+    HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-04-26-5'
+    KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-04-25-6'
     DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0'
   }
   stages {
@@ -104,7 +105,11 @@ pipeline {
                 sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=hi --text="एक" --cache_dir ${HI_TN_CACHE}'
             }
         }
-        
+        stage('L0: Codeswitched HI/EN ITN grammars') {
+          steps {
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=hi_en --text="एक" --cache_dir ${HI_EN_TN_CACHE}'
+          }
+        }
       }
     }
 
@@ -168,7 +173,6 @@ pipeline {
             sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=ar --text="اثنان " --cache_dir ${AR_TN_CACHE}'
           }
         }
-
       }
     }
 
@@ -409,6 +413,11 @@ pipeline {
             sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/es_en/ -m "not pleasefixme" --cpu --tn_cache_dir ${ES_EN_TN_CACHE}'
           }
         }
+        stage('L1: Run all Codeswitched HI/EN TN/ITN tests (restore grammars from cache)') {
+          steps {
+            sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/hi_en/ -m "not pleasefixme" --cpu --tn_cache_dir ${HI_EN_TN_CACHE}'
+          }
+        }
         stage('L1: Run all AR TN/ITN tests (restore grammars from cache)') {
           steps {
             sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/ar/ -m "not pleasefixme" --cpu --tn_cache_dir ${AR_TN_CACHE}'
 
@@ -0,0 +1,17 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nemo_text_processing.inverse_text_normalization.hi_en.taggers.tokenize_and_classify import ClassifyFst
+from nemo_text_processing.inverse_text_normalization.hi_en.verbalizers.verbalize import VerbalizeFst
+from nemo_text_processing.inverse_text_normalization.hi_en.verbalizers.verbalize_final import VerbalizeFinalFst
@@ -0,0 +1,13 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
@@ -0,0 +1,13 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
@@ -0,0 +1,173 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.inverse_text_normalization.en.taggers.cardinal import CardinalFst as EnCardinalFst
+from nemo_text_processing.inverse_text_normalization.en.taggers.date import DateFst as EnDateFst
+from nemo_text_processing.inverse_text_normalization.en.taggers.decimal import DecimalFst as EnDecimalFst
+from nemo_text_processing.inverse_text_normalization.en.taggers.electronic import ElectronicFst as EnElectronicFst
+from nemo_text_processing.inverse_text_normalization.en.taggers.measure import MeasureFst as EnMeasureFst
+from nemo_text_processing.inverse_text_normalization.en.taggers.money import MoneyFst as EnMoneyFst
+from nemo_text_processing.inverse_text_normalization.en.taggers.ordinal import OrdinalFst as EnOrdinalFst
+from nemo_text_processing.inverse_text_normalization.en.taggers.punctuation import PunctuationFst as EnPunctuationFst
+from nemo_text_processing.inverse_text_normalization.en.taggers.telephone import TelephoneFst as EnTelephoneFst
+from nemo_text_processing.inverse_text_normalization.en.taggers.time import TimeFst as EnTimeFst
+from nemo_text_processing.inverse_text_normalization.en.taggers.whitelist import WhiteListFst as EnWhiteListFst
+from nemo_text_processing.inverse_text_normalization.en.taggers.word import WordFst as EnWordFst
+from nemo_text_processing.inverse_text_normalization.hi.taggers.cardinal import CardinalFst
+from nemo_text_processing.inverse_text_normalization.hi.taggers.date import DateFst
+from nemo_text_processing.inverse_text_normalization.hi.taggers.decimal import DecimalFst
+from nemo_text_processing.inverse_text_normalization.hi.taggers.fraction import FractionFst
+from nemo_text_processing.inverse_text_normalization.hi.taggers.measure import MeasureFst
+from nemo_text_processing.inverse_text_normalization.hi.taggers.money import MoneyFst
+from nemo_text_processing.inverse_text_normalization.hi.taggers.ordinal import OrdinalFst
+from nemo_text_processing.inverse_text_normalization.hi.taggers.punctuation import PunctuationFst
+from nemo_text_processing.inverse_text_normalization.hi.taggers.telephone import TelephoneFst
+from nemo_text_processing.inverse_text_normalization.hi.taggers.time import TimeFst
+from nemo_text_processing.inverse_text_normalization.hi.taggers.whitelist import WhiteListFst
+from nemo_text_processing.inverse_text_normalization.hi.taggers.word import WordFst
+from nemo_text_processing.text_normalization.en.graph_utils import (
+    INPUT_LOWER_CASED,
+    GraphFst,
+    delete_extra_space,
+    delete_space,
+    generator_main,
+)
+from nemo_text_processing.utils.logging import logger
+
+
+class ClassifyFst(GraphFst):
+    """
+    Final class that composes all Hindi and English classification grammars for code-switched Hindi/English
+    inverse text normalization. This class can process an entire sentence that is lower cased.
+    For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
+    More details on deployment at NeMo/tools/text_processing_deployment.
+
+    When ``cache_dir`` is given, the compiled grammar is stored in / restored from
+    ``<cache_dir>/hi_en_itn_<input_case>.far`` under the key ``tokenize_and_classify``.
+
+    Args:
+        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
+        overwrite_cache: set to True to overwrite .far files
+        whitelist: path to a file with Hindi whitelist replacements. If None, defaults to the Hindi whitelist at
+            nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist.tsv
+        en_whitelist: path to a file with English whitelist replacements. If None, defaults to the English whitelist at
+            nemo_text_processing/inverse_text_normalization/en/data/whitelist.tsv
+        input_case: accepting either "lower_cased" or "cased" input.
+    """
+
+    def __init__(
+        self,
+        cache_dir: str = None,
+        overwrite_cache: bool = False,
+        whitelist: str = None,
+        en_whitelist: str = None,
+        input_case: str = INPUT_LOWER_CASED,
+    ):
+        super().__init__(name="tokenize_and_classify", kind="classify")
+
+        # The literal string "None" is treated like a missing cache dir
+        # (presumably because the value can arrive as text from a command line -- confirm against callers).
+        far_file = None
+        if cache_dir is not None and cache_dir != "None":
+            os.makedirs(cache_dir, exist_ok=True)
+            far_file = os.path.join(cache_dir, f"hi_en_itn_{input_case}.far")
+        if not overwrite_cache and far_file and os.path.exists(far_file):
+            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
+            logger.info(f"ClassifyFst.fst was restored from {far_file}.")
+        else:
+            logger.info("Creating ClassifyFst grammars.")
+
+            # Hindi taggers. The cardinal/ordinal/decimal objects are kept because other taggers are built from them.
+            cardinal = CardinalFst()
+            cardinal_graph = cardinal.fst
+
+            ordinal = OrdinalFst(cardinal)
+            ordinal_graph = ordinal.fst
+
+            decimal = DecimalFst(cardinal)
+            decimal_graph = decimal.fst
+
+            fraction = FractionFst(cardinal)
+            fraction_graph = fraction.fst
+
+            measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal).fst
+            date_graph = DateFst(cardinal, ordinal).fst
+            word_graph = WordFst().fst
+            time_graph = TimeFst(cardinal).fst
+            money_graph = MoneyFst(cardinal=cardinal, decimal=decimal).fst
+            whitelist_graph = WhiteListFst(input_file=whitelist).fst
+            punct_graph = PunctuationFst().fst
+            telephone_graph = TelephoneFst(cardinal).fst
+
+            # English taggers, built with the requested input case.
+            en_cardinal = EnCardinalFst(input_case=input_case)
+            en_cardinal_graph = en_cardinal.fst
+
+            en_ordinal = EnOrdinalFst(cardinal=en_cardinal, input_case=input_case)
+            en_ordinal_graph = en_ordinal.fst
+
+            en_decimal = EnDecimalFst(cardinal=en_cardinal, input_case=input_case)
+            en_decimal_graph = en_decimal.fst
+
+            en_measure_graph = EnMeasureFst(cardinal=en_cardinal, decimal=en_decimal, input_case=input_case).fst
+            en_date_graph = EnDateFst(ordinal=en_ordinal, input_case=input_case).fst
+            en_word_graph = EnWordFst().fst
+            en_time_graph = EnTimeFst(input_case=input_case).fst
+            en_money_graph = EnMoneyFst(cardinal=en_cardinal, decimal=en_decimal, input_case=input_case).fst
+            en_whitelist_graph = EnWhiteListFst(input_file=en_whitelist, input_case=input_case).fst
+            en_punct_graph = EnPunctuationFst().fst
+            en_electronic_graph = EnElectronicFst(input_case=input_case).fst
+            en_telephone_graph = EnTelephoneFst(cardinal=en_cardinal, input_case=input_case).fst
+
+            # Union of every semiotic class from both languages.
+            # NOTE(review): the weights presumably rank competing analyses (lower weight preferred when the
+            # best path is selected); the plain word taggers carry the largest weights -- confirm the intended order.
+            classify = (
+                pynutil.add_weight(whitelist_graph, 1.01)
+                | pynutil.add_weight(en_whitelist_graph, 1.01)
+                | pynutil.add_weight(time_graph, 1.1)
+                | pynutil.add_weight(en_time_graph, 1.1)
+                | pynutil.add_weight(date_graph, 1.09)
+                | pynutil.add_weight(en_date_graph, 1.09)
+                | pynutil.add_weight(decimal_graph, 1.09)
+                | pynutil.add_weight(en_decimal_graph, 1.09)
+                | pynutil.add_weight(fraction_graph, 1.09)
+                | pynutil.add_weight(measure_graph, 1.6)
+                | pynutil.add_weight(en_measure_graph, 1.1)
+                | pynutil.add_weight(cardinal_graph, 1.6)
+                | pynutil.add_weight(en_cardinal_graph, 1.1)
+                | pynutil.add_weight(ordinal_graph, 1.6)
+                | pynutil.add_weight(en_ordinal_graph, 1.09)
+                | pynutil.add_weight(money_graph, 1.6)
+                | pynutil.add_weight(en_money_graph, 1.1)
+                | pynutil.add_weight(telephone_graph, 1.6)
+                | pynutil.add_weight(en_telephone_graph, 1.1)
+                | pynutil.add_weight(en_electronic_graph, 1.1)
+                | pynutil.add_weight(word_graph, 100)
+                | pynutil.add_weight(en_word_graph, 120)
+            )
+
+            # Every classified item and every punctuation mark is wrapped as its own `tokens { ... }` entry.
+            punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }")
+            en_punct = (
+                pynutil.insert("tokens { ") + pynutil.add_weight(en_punct_graph, weight=1.3) + pynutil.insert(" }")
+            )
+            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
+            # The union must be parenthesized: `+` binds tighter than `|`, so without the parentheses the
+            # separating space would be inserted only before Hindi punctuation and English punctuation tokens
+            # would be glued directly onto the preceding token.
+            # NOTE(review): leading punctuation is matched with the Hindi grammar only -- confirm this is intended.
+            token_plus_punct = (
+                pynini.closure(punct + pynutil.insert(" "))
+                + token
+                + pynini.closure(pynutil.insert(" ") + (punct | en_punct))
+            )
+
+            # One or more tokens separated by (collapsed) spaces, with surrounding spaces removed.
+            graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)
+            graph = delete_space + graph + delete_space
+
+            self.fst = graph.optimize()
+
+            if far_file:
+                generator_main(far_file, {"tokenize_and_classify": self.fst})
+                logger.info(f"ClassifyFst grammars are saved to {far_file}.")
@@ -0,0 +1,27 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+
+def get_abs_path(rel_path):
+    """
+    Resolve a path given relative to the directory that contains this file.
+
+    Args:
+        rel_path: path relative to this file's directory
+
+    Returns the directory of this file joined to ``rel_path`` with a single '/'
+    """
+    module_dir = os.path.dirname(os.path.abspath(__file__))
+    return "/".join((module_dir, rel_path))
@@ -0,0 +1,13 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
Original file line number	Diff line number	Diff line change
`@@ -12,9 +12,10 @@ pipeline {`
`12`	`12`	`environment {`
`13`	`13`	`AR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-24-24-0'`
`14`	`14`	`DE_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-23-24-0'`
`15`		`- EN_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/09-25-25-0'`
	`15`	`+ EN_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-04-26-3'`
`16`	`16`	`ES_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/09-25-24-0'`
`17`	`17`	`ES_EN_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/08-30-24-0'`
	`18`	`+ HI_EN_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-04-26-4'`
`18`	`19`	`FR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-07-25-0'`
`19`	`20`	`HU_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/07-16-24-0'`
`20`	`21`	`PT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/05-01-26-1'`
`@@ -27,8 +28,8 @@ pipeline {`
`27`	`28`	`HY_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-0'`
`28`	`29`	`MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1'`
`29`	`30`	`JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1'`
`30`		`- KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0'`
`31`		`- HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0'`
	`31`	`+ HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-04-26-5'`
	`32`	`+ KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-04-25-6'`
`32`	`33`	`DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0'`
`33`	`34`	`}`
`34`	`35`	`stages {`
`@@ -104,7 +105,11 @@ pipeline {`
`104`	`105`	`sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=hi --text="एक" --cache_dir ${HI_TN_CACHE}'`
`105`	`106`	`}`
`106`	`107`	`}`
`107`		`-`
	`108`	`+ stage('L0: Codeswitched HI/EN ITN grammars') {`
	`109`	`+ steps {`
	`110`	`+ sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=hi_en --text="एक" --cache_dir ${HI_EN_TN_CACHE}'`
	`111`	`+ }`
	`112`	`+ }`
`108`	`113`	`}`
`109`	`114`	`}`
`110`	`115`
`@@ -168,7 +173,6 @@ pipeline {`
`168`	`173`	`sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=ar --text="اثنان " --cache_dir ${AR_TN_CACHE}'`
`169`	`174`	`}`
`170`	`175`	`}`
`171`		`-`
`172`	`176`	`}`
`173`	`177`	`}`
`174`	`178`
`@@ -409,6 +413,11 @@ pipeline {`
`409`	`413`	`sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/es_en/ -m "not pleasefixme" --cpu --tn_cache_dir ${ES_EN_TN_CACHE}'`
`410`	`414`	`}`
`411`	`415`	`}`
	`416`	`+ stage('L1: Run all Codeswitched HI/EN TN/ITN tests (restore grammars from cache)') {`
	`417`	`+ steps {`
	`418`	`+ sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/hi_en/ -m "not pleasefixme" --cpu --tn_cache_dir ${HI_EN_TN_CACHE}'`
	`419`	`+ }`
	`420`	`+ }`
`412`	`421`	`stage('L1: Run all AR TN/ITN tests (restore grammars from cache)') {`
`413`	`422`	`steps {`
`414`	`423`	`sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/ar/ -m "not pleasefixme" --cpu --tn_cache_dir ${AR_TN_CACHE}'`