Skip to content

Commit 80d2dcb

Browse files
karpnv, pre-commit-ci[bot], and ekmb
authored
Print warning instead of raising an exception (#97)
* raise text Signed-off-by: Nikolay Karpov <karpnv@gmail.com> * text arg Signed-off-by: Nikolay Karpov <karpnv@gmail.com> * Failed text Signed-off-by: Nikolay Karpov <karpnv@gmail.com> * add logger Signed-off-by: Nikolay Karpov <karpnv@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * rm raise Signed-off-by: Nikolay Karpov <karpnv@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * logger Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com> * NeMo-text-processing Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com> * info level Signed-off-by: Nikolay Karpov <karpnv@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * rm raise Signed-off-by: Nikolay Karpov <karpnv@gmail.com> * verbose Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Normalizer.select_verbalizer Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com> * Exception Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com> * verbose Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * restart ci Signed-off-by: Evelina <ebakhturina@nvidia.com> --------- Signed-off-by: Nikolay Karpov <karpnv@gmail.com> Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com> Signed-off-by: Evelina <ebakhturina@nvidia.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Nikolay Karpov <nkarpov@nvidia.com> Co-authored-by: Evelina <ebakhturina@nvidia.com>
1 parent f96e37d commit 80d2dcb

3 files changed

Lines changed: 77 additions & 37 deletions

File tree

nemo_text_processing/logging.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import logging
15+
16+
# overriding with the library specific logger, so that it's possible to
17+
# customize in any downstream applications
18+
logger = logging.getLogger("NeMo-text-processing")
19+
logger.setLevel(logging.INFO)

nemo_text_processing/text_normalization/normalize.py

Lines changed: 45 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import regex
3030
import tqdm
3131
from joblib import Parallel, delayed
32+
from nemo_text_processing.logging import logger
3233
from nemo_text_processing.text_normalization.data_loader_utils import (
3334
load_file,
3435
post_process_punct,
@@ -96,6 +97,7 @@ class Normalizer:
9697
Note: punct_post_process flag in normalize() supports all languages.
9798
max_number_of_permutations_per_split: a maximum number
9899
of permutations which can be generated from input sequence of tokens.
100+
verbose: whether to print intermediate meta information
99101
"""
100102

101103
def __init__(
@@ -312,15 +314,15 @@ def normalize(
312314
313315
Args:
314316
text: string that may include semiotic classes
315-
verbose: whether to print intermediate meta information
316317
punct_pre_process: whether to perform punctuation pre-processing, for example, [25] -> [ 25 ]
317318
punct_post_process: whether to normalize punctuation
319+
verbose: whether to print intermediate meta information
318320
319321
Returns: spoken form
320322
"""
321323
if len(text.split()) > 500:
322-
print(
323-
"WARNING! Your input is too long and could take a long time to normalize."
324+
logger.warning(
325+
"Your input is too long and could take a long time to normalize. "
324326
"Use split_text_into_sentences() to make the input shorter and then call normalize_list()."
325327
)
326328
original_text = text
@@ -329,29 +331,35 @@ def normalize(
329331
text = text.strip()
330332
if not text:
331333
if verbose:
332-
print(text)
334+
logger.info(text)
333335
return text
334336
text = pynini.escape(text)
335337
tagged_lattice = self.find_tags(text)
336338
tagged_text = Normalizer.select_tag(tagged_lattice)
337339
if verbose:
338-
print(tagged_text)
340+
logger.info(tagged_text)
339341
self.parser(tagged_text)
340342
tokens = self.parser.parse()
341343
split_tokens = self._split_tokens_to_reduce_number_of_permutations(tokens)
342344
output = ""
343345
for s in split_tokens:
344-
tags_reordered = self.generate_permutations(s)
345-
verbalizer_lattice = None
346-
for tagged_text in tags_reordered:
347-
tagged_text = pynini.escape(tagged_text)
348-
349-
verbalizer_lattice = self.find_verbalizer(tagged_text)
350-
if verbalizer_lattice.num_states() != 0:
351-
break
352-
if verbalizer_lattice is None:
353-
raise ValueError(f"No permutations were generated from tokens {s}")
354-
output += ' ' + Normalizer.select_verbalizer(verbalizer_lattice)
346+
try:
347+
tags_reordered = self.generate_permutations(s)
348+
verbalizer_lattice = None
349+
for tagged_text in tags_reordered:
350+
tagged_text = pynini.escape(tagged_text)
351+
352+
verbalizer_lattice = self.find_verbalizer(tagged_text)
353+
if verbalizer_lattice.num_states() != 0:
354+
break
355+
if verbalizer_lattice is None:
356+
logger.warning(f"No permutations were generated from tokens {s}")
357+
return text
358+
output += ' ' + Normalizer.select_verbalizer(verbalizer_lattice)
359+
except Exception as e:
360+
if verbose:
361+
logger.warning("Failed text: " + text + str(e))
362+
return text
355363
output = SPACE_DUP.sub(' ', output[1:])
356364

357365
if self.lang == "en" and hasattr(self, 'post_processor'):
@@ -405,6 +413,7 @@ def normalize_manifest(
405413
batch_size: int,
406414
output_filename: Optional[str] = None,
407415
text_field: str = "text",
416+
verbose: bool = False,
408417
**kwargs,
409418
):
410419
"""
@@ -427,6 +436,7 @@ def _process_batch(
427436
batch_idx: int,
428437
batch: List[str],
429438
dir_name: str,
439+
verbose=verbose,
430440
punct_pre_process=False,
431441
punct_post_process=True,
432442
text_field: str = "text",
@@ -443,7 +453,7 @@ def _process_batch(
443453
normalized_lines = [
444454
self.normalize_line(
445455
line=line,
446-
verbose=False,
456+
verbose=verbose,
447457
punct_post_process=punct_post_process,
448458
punct_pre_process=punct_pre_process,
449459
text_field=text_field,
@@ -455,17 +465,22 @@ def _process_batch(
455465

456466
with open(f"{dir_name}/{batch_idx:06}.json", "w") as f_out:
457467
for line in normalized_lines:
468+
if isinstance(line[output_field], set):
469+
if len(line[output_field]) > 1:
470+
logger.warning("Len of " + str(line[output_field]) + " > 1 ")
471+
line[output_field] = line[output_field].pop()
472+
458473
f_out.write(json.dumps(line, ensure_ascii=False) + '\n')
459474

460-
print(f"Batch -- {batch_idx} -- is complete")
475+
logger.info(f"Batch -- {batch_idx} -- is complete")
461476

462477
if output_filename is None:
463478
output_filename = manifest.replace('.json', '_normalized.json')
464479

465480
with open(manifest, 'r') as f:
466481
lines = f.readlines()
467482

468-
print(f'Normalizing {len(lines)} line(s) of {manifest}...')
483+
logger.warning(f'Normalizing {len(lines)} line(s) of {manifest}...')
469484

470485
# to save intermediate results to a file
471486
batch = min(len(lines), batch_size)
@@ -481,6 +496,7 @@ def _process_batch(
481496
lines[i : i + batch],
482497
tmp_dir,
483498
text_field=text_field,
499+
verbose=verbose,
484500
punct_pre_process=punct_pre_process,
485501
punct_post_process=punct_post_process,
486502
**kwargs,
@@ -495,7 +511,7 @@ def _process_batch(
495511
lines = f_in.read()
496512
f_out.write(lines)
497513

498-
print(f'Normalized version saved at {output_filename}')
514+
logger.warning(f'Normalized version saved at {output_filename}')
499515

500516
def split_text_into_sentences(self, text: str, additional_split_symbols: str = "") -> List[str]:
501517
"""
@@ -557,7 +573,7 @@ def _permute(self, d: OrderedDict) -> List[str]:
557573
elif isinstance(v, bool):
558574
subl = ["".join(x) for x in itertools.product(subl, [f"{k}: true "])]
559575
else:
560-
raise ValueError()
576+
raise ValueError("Key: " + str(k) + " Value: " + str(v))
561577
l.extend(subl)
562578
return l
563579

@@ -636,6 +652,7 @@ def select_verbalizer(lattice: 'pynini.FstLike') -> str:
636652
637653
Args:
638654
lattice: verbalization lattice
655+
text: full text line to raise in case of an exception
639656
640657
Returns: shortest path
641658
"""
@@ -756,7 +773,7 @@ def parse_args():
756773
)
757774
start_time = perf_counter()
758775
if args.input_string:
759-
print(
776+
logger.info(
760777
normalizer.normalize(
761778
args.input_string,
762779
verbose=args.verbose,
@@ -775,13 +792,14 @@ def parse_args():
775792
text_field=args.manifest_text_field,
776793
output_field=args.output_field,
777794
output_filename=args.output_file,
795+
verbose=args.verbose,
778796
)
779797

780798
else:
781-
print("Loading data: " + args.input_file)
799+
logger.warning("Loading data: " + args.input_file)
782800
data = load_file(args.input_file)
783801

784-
print("- Data: " + str(len(data)) + " sentences")
802+
logger.warning("- Data: " + str(len(data)) + " sentences")
785803
normalizer_prediction = normalizer.normalize_list(
786804
data,
787805
verbose=args.verbose,
@@ -790,8 +808,8 @@ def parse_args():
790808
)
791809
if args.output_file:
792810
write_file(args.output_file, normalizer_prediction)
793-
print(f"- Normalized. Writing out to {args.output_file}")
811+
logger.warning(f"- Normalized. Writing out to {args.output_file}")
794812
else:
795-
print(normalizer_prediction)
813+
logger.warning(normalizer_prediction)
796814

797-
print(f"Execution time: {perf_counter() - start_time:.02f} sec")
815+
logger.warning(f"Execution time: {perf_counter() - start_time:.02f} sec")

nemo_text_processing/text_normalization/normalize_with_audio.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
import editdistance
2222
import pynini
23+
from nemo_text_processing.logging import logger
2324
from nemo_text_processing.text_normalization.data_loader_utils import post_process_punct, pre_process
2425
from nemo_text_processing.text_normalization.normalize import Normalizer
2526
from nemo_text_processing.text_normalization.utils_audio_based import get_alignment
@@ -172,7 +173,7 @@ def normalize(
172173
if cer_threshold > 0 and cer > cer_threshold:
173174
best_option = cur_deter_norm
174175
if verbose and True:
175-
print(
176+
logger.info(
176177
f"CER of the best normalization option is above cer_theshold, using determinictis option. CER: {cer}"
177178
)
178179
except:
@@ -201,8 +202,8 @@ def normalize_non_deterministic(
201202
text = pre_process(text) # to handle []
202203
text = text.strip()
203204
if not text:
204-
if verbose:
205-
print(text)
205+
if self.verbose:
206+
logger.info(text)
206207
return text
207208

208209
text = pynini.escape(text)
@@ -237,7 +238,8 @@ def normalize_non_deterministic(
237238
self._verbalize(tagged_text, normalized_texts, n_tagged, verbose=verbose)
238239

239240
if len(normalized_texts) == 0:
240-
raise ValueError()
241+
logger.warning("Failed text: " + text + ", normalized_texts: " + str(normalized_texts))
242+
return text
241243

242244
if punct_post_process:
243245
# do post-processing based on Moses detokenizer
@@ -358,7 +360,7 @@ def get_verbalized_text(tagged_text):
358360
tagged_text_reordered = pynini.escape(tagged_text_reordered)
359361
normalized_texts.extend(get_verbalized_text(tagged_text_reordered))
360362
if verbose:
361-
print(tagged_text_reordered)
363+
logger.info(tagged_text_reordered)
362364

363365
except pynini.lib.rewrite.Error:
364366
continue
@@ -383,10 +385,10 @@ def select_best_match(
383385
normalized_text, cer, idx = normalized_texts_cer[0]
384386

385387
if verbose:
386-
print('-' * 30)
388+
logger.info('-' * 30)
387389
for option in normalized_texts:
388-
print(option)
389-
print('-' * 30)
390+
logger.info(option)
391+
logger.info('-' * 30)
390392
return normalized_text, cer, idx
391393

392394

@@ -510,7 +512,7 @@ def parse_args():
510512
verbose=args.verbose,
511513
)
512514
for option in options:
513-
print(option)
515+
logger.info(option)
514516
elif args.manifest.endswith('.json'):
515517
normalizer = NormalizerWithAudio(
516518
input_case=args.input_case,
@@ -532,10 +534,11 @@ def parse_args():
532534
text_field=args.manifest_text_field,
533535
asr_pred_field=args.manifest_asr_pred_field,
534536
cer_threshold=args.cer_threshold,
537+
verbose=args.verbose,
535538
)
536539
else:
537540
raise ValueError(
538541
"Provide either path to .json manifest with '--manifest' OR "
539542
+ "an input text with '--text' (for debugging without audio)"
540543
)
541-
print(f'Execution time: {round((perf_counter() - start)/60, 2)} min.')
544+
logger.warning(f'Execution time: {round((perf_counter() - start)/60, 2)} min.')

0 commit comments

Comments
 (0)