Add whitelist param to ITN (#30)

ekmb · web-flow · commit 4cc07ee1c4fa · 2023-02-02T19:19:23.000-05:00
* add whitelist param to itn

Signed-off-by: ekmb <ebakhturina@nvidia.com>

* add whitelist to export

Signed-off-by: ekmb <ebakhturina@nvidia.com>

* update docstrings

Signed-off-by: ekmb <ebakhturina@nvidia.com>

---------

Signed-off-by: ekmb <ebakhturina@nvidia.com>
diff --git a/nemo_text_processing/inverse_text_normalization/ar/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ar/taggers/tokenize_and_classify.py
@@ -41,9 +41,10 @@ class ClassifyFst(GraphFst):
     Args:
         cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
         overwrite_cache: set to True to overwrite .far files
+        whitelist: path to a file with whitelist replacements
     """
 
-    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
+    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False, whitelist: str = None):
         super().__init__(name="tokenize_and_classify", kind="classify")
 
         far_file = None
diff --git a/nemo_text_processing/inverse_text_normalization/de/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/de/taggers/tokenize_and_classify.py
@@ -57,9 +57,12 @@ class ClassifyFst(GraphFst):
     Args:
         cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
         overwrite_cache: set to True to overwrite .far files
+        whitelist: path to a file with whitelist replacements
     """
 
-    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False, deterministic: bool = True):
+    def __init__(
+        self, cache_dir: str = None, overwrite_cache: bool = False, deterministic: bool = True, whitelist: str = None
+    ):
         super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)
 
         far_file = None
@@ -80,7 +83,7 @@ def __init__(self, cache_dir: str = None, overwrite_cache: bool = False, determi
             tn_date_verbalizer = TNDateVerbalizer(ordinal=tn_ordinal_verbalizer, deterministic=False)
             tn_electronic_tagger = TNElectronicTagger(deterministic=False)
             tn_electronic_verbalizer = TNElectronicVerbalizer(deterministic=False)
-            tn_whitelist_tagger = TNWhitelistTagger(input_case="cased", deterministic=False)
+            tn_whitelist_tagger = TNWhitelistTagger(input_case="cased", deterministic=False, input_file=whitelist)
 
             cardinal = CardinalFst(tn_cardinal_tagger=tn_cardinal_tagger)
             cardinal_graph = cardinal.fst
diff --git a/nemo_text_processing/inverse_text_normalization/de/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/de/taggers/whitelist.py
@@ -24,11 +24,17 @@ class WhiteListFst(GraphFst):
         e.g. misses -> tokens { name: "Mrs." }
     Args:
         tn_whitelist_tagger: TN whitelist tagger
+        input_file: path to a file with whitelist replacements (each line of the file: written_form\tspoken_form\n),
+        e.g. nemo_text_processing/inverse_text_normalization/en/data/whitelist.tsv
     """
 
-    def __init__(self, tn_whitelist_tagger: GraphFst, deterministic: bool = True):
+    def __init__(self, tn_whitelist_tagger: GraphFst, deterministic: bool = True, input_file: str = None):
         super().__init__(name="whitelist", kind="classify", deterministic=deterministic)
 
-        whitelist = pynini.invert(tn_whitelist_tagger.graph)
+        if input_file:
+            whitelist = pynini.string_file(input_file).invert()
+        else:
+            whitelist = pynini.invert(tn_whitelist_tagger.graph)
+
         graph = pynutil.insert("name: \"") + convert_space(whitelist) + pynutil.insert("\"")
         self.fst = graph.optimize()
diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/en/taggers/tokenize_and_classify.py
@@ -47,9 +47,10 @@ class ClassifyFst(GraphFst):
     Args:
         cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
         overwrite_cache: set to True to overwrite .far files
+        whitelist: path to a file with whitelist replacements
     """
 
-    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
+    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False, whitelist: str = None):
         super().__init__(name="tokenize_and_classify", kind="classify")
 
         far_file = None
@@ -75,7 +76,7 @@ def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
             word_graph = WordFst().fst
             time_graph = TimeFst().fst
             money_graph = MoneyFst(cardinal=cardinal, decimal=decimal).fst
-            whitelist_graph = WhiteListFst().fst
+            whitelist_graph = WhiteListFst(input_file=whitelist).fst
             punct_graph = PunctuationFst().fst
             electronic_graph = ElectronicFst().fst
             telephone_graph = TelephoneFst(cardinal).fst
diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/en/taggers/whitelist.py
@@ -13,6 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import logging
+
 import pynini
 from nemo_text_processing.inverse_text_normalization.en.utils import get_abs_path
 from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space
@@ -23,12 +25,20 @@ class WhiteListFst(GraphFst):
     """
     Finite state transducer for classifying whitelisted tokens
         e.g. misses -> tokens { name: "mrs." }
-    This class has highest priority among all classifier grammars. Whitelisted tokens are defined and loaded from "data/whitelist.tsv".
+    This class has highest priority among all classifier grammars.
+    Whitelisted tokens are defined and loaded from "data/whitelist.tsv" (unless input_file specified).
+
+    Args:
+        input_file: path to a file with whitelist replacements (each line of the file: written_form\tspoken_form\n),
+        e.g. nemo_text_processing/inverse_text_normalization/en/data/whitelist.tsv
     """
 
-    def __init__(self):
+    def __init__(self, input_file: str = None):
         super().__init__(name="whitelist", kind="classify")
 
-        whitelist = pynini.string_file(get_abs_path("data/whitelist.tsv")).invert()
+        if input_file:
+            whitelist = pynini.string_file(input_file).invert()
+        else:
+            whitelist = pynini.string_file(get_abs_path("data/whitelist.tsv")).invert()
         graph = pynutil.insert("name: \"") + convert_space(whitelist) + pynutil.insert("\"")
         self.fst = graph.optimize()
diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/es/taggers/tokenize_and_classify.py
@@ -47,9 +47,10 @@ class ClassifyFst(GraphFst):
     Args:
         cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
         overwrite_cache: set to True to overwrite .far files
+        whitelist: path to a file with whitelist replacements
     """
 
-    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
+    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False, whitelist: str = None):
         super().__init__(name="tokenize_and_classify", kind="classify")
 
         far_file = None
@@ -79,7 +80,7 @@ def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
             word_graph = WordFst().fst
             time_graph = TimeFst().fst
             money_graph = MoneyFst(cardinal=cardinal, decimal=decimal).fst
-            whitelist_graph = WhiteListFst().fst
+            whitelist_graph = WhiteListFst(input_file=whitelist).fst
             punct_graph = PunctuationFst().fst
             electronic_graph = ElectronicFst().fst
             telephone_graph = TelephoneFst().fst
diff --git a/nemo_text_processing/inverse_text_normalization/es/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/es/taggers/whitelist.py
@@ -22,12 +22,21 @@ class WhiteListFst(GraphFst):
     """
     Finite state transducer for classifying whitelisted tokens
         e.g. usted -> tokens { name: "ud." }
-    This class has highest priority among all classifier grammars. Whitelisted tokens are defined and loaded from "data/whitelist.tsv".
+    This class has highest priority among all classifier grammars.
+
+    Whitelisted tokens are defined and loaded from "data/whitelist.tsv" (unless input_file specified).
+
+    Args:
+        input_file: path to a file with whitelist replacements (each line of the file: written_form\tspoken_form\n),
+        e.g. nemo_text_processing/inverse_text_normalization/es/data/whitelist.tsv
     """
 
-    def __init__(self):
+    def __init__(self, input_file: str = None):
         super().__init__(name="whitelist", kind="classify")
 
-        whitelist = pynini.string_file(get_abs_path("data/whitelist.tsv")).invert()
+        if input_file:
+            whitelist = pynini.string_file(input_file).invert()
+        else:
+            whitelist = pynini.string_file(get_abs_path("data/whitelist.tsv")).invert()
         graph = pynutil.insert("name: \"") + convert_space(whitelist) + pynutil.insert("\"")
         self.fst = graph.optimize()
diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/tokenize_and_classify.py
@@ -47,9 +47,10 @@ class ClassifyFst(GraphFst):
     Args:
         cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
         overwrite_cache: set to True to overwrite .far files
+        whitelist: path to a file with whitelist replacements
     """
 
-    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
+    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False, whitelist: str = None):
         super().__init__(name="tokenize_and_classify", kind="classify")
 
         far_file = None
@@ -79,7 +80,7 @@ def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
             word_graph = WordFst().fst
             time_graph = TimeFst().fst
             money_graph = MoneyFst(cardinal, decimal).fst
-            whitelist_graph = WhiteListFst().fst
+            whitelist_graph = WhiteListFst(input_file=whitelist).fst
             punct_graph = PunctuationFst().fst
             electronic_graph = ElectronicFst().fst
             telephone_graph = TelephoneFst().fst
diff --git a/nemo_text_processing/inverse_text_normalization/fr/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/fr/taggers/whitelist.py
@@ -22,12 +22,20 @@ class WhiteListFst(GraphFst):
     """
     Finite state transducer for classifying whitelisted tokens
         e.g. misses -> tokens { name: "mrs." }
-    This class has highest priority among all classifier grammars. Whitelisted tokens are defined and loaded from "data/whitelist.tsv".
+    This class has highest priority among all classifier grammars.
+    Whitelisted tokens are defined and loaded from "data/whitelist.tsv" (unless input_file specified).
+
+    Args:
+        input_file: path to a file with whitelist replacements (each line of the file: written_form\tspoken_form\n),
+        e.g. nemo_text_processing/inverse_text_normalization/fr/data/whitelist.tsv
     """
 
-    def __init__(self):
+    def __init__(self, input_file: str = None):
         super().__init__(name="whitelist", kind="classify")
 
-        whitelist = pynini.string_file(get_abs_path("data/whitelist.tsv"))
+        if input_file:
+            whitelist = pynini.string_file(input_file).invert()
+        else:
+            whitelist = pynini.string_file(get_abs_path("data/whitelist.tsv"))
         graph = pynutil.insert("name: \"") + convert_space(whitelist) + pynutil.insert("\"")
         self.fst = graph.optimize()
diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 from argparse import ArgumentParser
 from time import perf_counter
 from typing import List
@@ -28,6 +29,8 @@ class InverseNormalizer(Normalizer):
 
     Args:
         lang: language specifying the ITN
+        whitelist: path to a file with whitelist replacements. (each line of the file: written_form\tspoken_form\n),
+            e.g. nemo_text_processing/inverse_text_normalization/en/data/whitelist.tsv
         cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
         overwrite_cache: set to True to overwrite .far files
         max_number_of_permutations_per_split: a maximum number
@@ -37,6 +40,7 @@ class InverseNormalizer(Normalizer):
     def __init__(
         self,
         lang: str = 'en',
+        whitelist: str = None,
         cache_dir: str = None,
         overwrite_cache: bool = False,
         max_number_of_permutations_per_split: int = 729,
@@ -87,7 +91,7 @@ def __init__(
                 VerbalizeFinalFst,
             )
 
-        self.tagger = ClassifyFst(cache_dir=cache_dir, overwrite_cache=overwrite_cache)
+        self.tagger = ClassifyFst(cache_dir=cache_dir, whitelist=whitelist, overwrite_cache=overwrite_cache)
         self.verbalizer = VerbalizeFinalFst()
         self.parser = TokenParser()
         self.lang = lang
@@ -128,6 +132,12 @@ def parse_args():
     parser.add_argument(
         "--language", help="language", choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'vi'], default="en", type=str
     )
+    parser.add_argument(
+        "--whitelist",
+        help="Path to a file with with whitelist replacements," "e.g., inverse_normalization/en/data/whitelist.tsv",
+        default=None,
+        type=str,
+    )
     parser.add_argument("--verbose", help="print info for debugging", action='store_true')
     parser.add_argument("--overwrite_cache", help="set to True to re-create .far grammar files", action="store_true")
     parser.add_argument(
@@ -141,9 +151,11 @@ def parse_args():
 
 if __name__ == "__main__":
     args = parse_args()
+
+    whitelist = os.path.abspath(args.whitelist) if args.whitelist else None
     start_time = perf_counter()
     inverse_normalizer = InverseNormalizer(
-        lang=args.language, cache_dir=args.cache_dir, overwrite_cache=args.overwrite_cache
+        lang=args.language, cache_dir=args.cache_dir, overwrite_cache=args.overwrite_cache, whitelist=whitelist,
     )
     print(f'Time to generate graph: {round(perf_counter() - start_time, 2)} sec')
 
diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/tokenize_and_classify.py
@@ -46,9 +46,10 @@ class ClassifyFst(GraphFst):
     Args:
         cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
         overwrite_cache: set to True to overwrite .far files
+        whitelist: path to a file with whitelist replacements
     """
 
-    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
+    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False, whitelist: str = None):
         super().__init__(name="tokenize_and_classify", kind="classify")
 
         far_file = None
@@ -74,7 +75,7 @@ def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
             word_graph = WordFst().fst
             time_graph = TimeFst().fst
             money_graph = MoneyFst(cardinal=cardinal, decimal=decimal).fst
-            whitelist_graph = WhiteListFst().fst
+            whitelist_graph = WhiteListFst(input_file=whitelist).fst
             punct_graph = PunctuationFst().fst
             electronic_graph = ElectronicFst().fst
             telephone_graph = TelephoneFst().fst
diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/whitelist.py
@@ -22,12 +22,21 @@ class WhiteListFst(GraphFst):
     """
     Finite state transducer for classifying whitelisted tokens
         e.g. usted -> tokens { name: "ud." }
-    This class has highest priority among all classifier grammars. Whitelisted tokens are defined and loaded from "data/whitelist.tsv".
+
+    This class has highest priority among all classifier grammars.
+    Whitelisted tokens are defined and loaded from "data/whitelist.tsv" (unless input_file specified).
+
+    Args:
+        input_file: path to a file with whitelist replacements (each line of the file: written_form\tspoken_form\n),
+        e.g. nemo_text_processing/inverse_text_normalization/pt/data/whitelist.tsv
     """
 
-    def __init__(self):
+    def __init__(self, input_file: str = None):
         super().__init__(name="whitelist", kind="classify")
 
-        whitelist = pynini.string_file(get_abs_path("data/whitelist.tsv")).invert()
+        if input_file:
+            whitelist = pynini.string_file(input_file).invert()
+        else:
+            whitelist = pynini.string_file(get_abs_path("data/whitelist.tsv")).invert()
         graph = pynutil.insert("name: \"") + convert_space(whitelist) + pynutil.insert("\"")
         self.fst = graph.optimize()
diff --git a/nemo_text_processing/inverse_text_normalization/ru/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ru/taggers/tokenize_and_classify.py
@@ -47,9 +47,10 @@ class ClassifyFst(GraphFst):
     Args:
         cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
         overwrite_cache: set to True to overwrite .far files
+        whitelist: path to a file with whitelist replacements
     """
 
-    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
+    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False, whitelist: str = None):
         super().__init__(name="tokenize_and_classify", kind="classify")
 
         far_file = None
@@ -78,7 +79,7 @@ def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
             word_graph = WordFst().fst
             time_graph = TimeFst(tn_time=tn_classify.time).fst
             money_graph = MoneyFst(tn_money=tn_classify.money).fst
-            whitelist_graph = WhiteListFst().fst
+            whitelist_graph = WhiteListFst(input_file=whitelist).fst
             punct_graph = PunctuationFst().fst
             electronic_graph = ElectronicFst(tn_electronic=tn_classify.electronic).fst
             telephone_graph = TelephoneFst(tn_telephone=tn_classify.telephone).fst
diff --git a/nemo_text_processing/inverse_text_normalization/ru/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/ru/taggers/whitelist.py
@@ -27,10 +27,16 @@ class WhiteListFst(GraphFst):
     Args:
         deterministic: if True will provide a single transduction option,
             for False multiple transduction are generated (used for audio-based normalization)
+        input_file: path to a file with whitelist replacements (each line of the file: written_form\tspoken_form\n),
+            e.g. nemo_text_processing/inverse_text_normalization/en/data/whitelist.tsv
     """
 
-    def __init__(self, deterministic: bool = True):
+    def __init__(self, deterministic: bool = True, input_file: str = None):
         super().__init__(name="whitelist", kind="classify", deterministic=deterministic)
-        whitelist = pynini.string_file(get_abs_path("data/whitelist.tsv")).invert()
+
+        if input_file:
+            whitelist = pynini.string_file(input_file).invert()
+        else:
+            whitelist = pynini.string_file(get_abs_path("data/whitelist.tsv")).invert()
         graph = pynutil.insert("name: \"") + convert_space(whitelist) + pynutil.insert("\"")
         self.fst = graph.optimize()
diff --git a/nemo_text_processing/inverse_text_normalization/vi/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/vi/taggers/tokenize_and_classify.py
@@ -48,9 +48,10 @@ class ClassifyFst(GraphFst):
     Args:
         cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
         overwrite_cache: set to True to overwrite .far files
+        whitelist: path to a file with whitelist replacements
     """
 
-    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
+    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False, whitelist: str = None):
         super().__init__(name="tokenize_and_classify", kind="classify")
 
         far_file = None
@@ -79,7 +80,7 @@ def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
             word_graph = WordFst().fst
             time_graph = TimeFst().fst
             money_graph = MoneyFst(cardinal=cardinal, decimal=decimal).fst
-            whitelist_graph = WhiteListFst().fst
+            whitelist_graph = WhiteListFst(input_file=whitelist).fst
             punct_graph = PunctuationFst().fst
             electronic_graph = ElectronicFst().fst
             telephone_graph = TelephoneFst().fst
diff --git a/nemo_text_processing/inverse_text_normalization/vi/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/vi/taggers/whitelist.py
@@ -23,12 +23,21 @@ class WhiteListFst(GraphFst):
     """
     Finite state transducer for classifying whitelisted tokens
         e.g. misses -> tokens { name: "mrs." }
-    This class has highest priority among all classifier grammars. Whitelisted tokens are defined and loaded from "data/whitelist.tsv".
+
+    This class has highest priority among all classifier grammars.
+    Whitelisted tokens are defined and loaded from "data/whitelist.tsv" (unless input_file specified).
+
+    Args:
+        input_file: path to a file with whitelist replacements (each line of the file: written_form\tspoken_form\n),
+        e.g. nemo_text_processing/inverse_text_normalization/en/data/whitelist.tsv
     """
 
-    def __init__(self):
+    def __init__(self, input_file: str = None):
         super().__init__(name="whitelist", kind="classify")
 
-        whitelist = pynini.string_file(get_abs_path("data/whitelist.tsv")).invert()
+        if input_file:
+            whitelist = pynini.string_file(input_file).invert()
+        else:
+            whitelist = pynini.string_file(get_abs_path("data/whitelist.tsv")).invert()
         graph = pynutil.insert('name: "') + convert_space(whitelist) + pynutil.insert('"')
         self.fst = graph.optimize()
diff --git a/tools/text_processing_deployment/export_grammars.sh b/tools/text_processing_deployment/export_grammars.sh
diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py