Skip to content

Commit 80d2dcb

Browse files
karpnv, pre-commit-ci[bot], and ekmb
authored
Print warning instead of raising an exception (#97)
* raise text Signed-off-by: Nikolay Karpov <karpnv@gmail.com> * text arg Signed-off-by: Nikolay Karpov <karpnv@gmail.com> * Failed text Signed-off-by: Nikolay Karpov <karpnv@gmail.com> * add logger Signed-off-by: Nikolay Karpov <karpnv@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * rm raise Signed-off-by: Nikolay Karpov <karpnv@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * logger Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com> * NeMo-text-processing Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com> * info level Signed-off-by: Nikolay Karpov <karpnv@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * rm raise Signed-off-by: Nikolay Karpov <karpnv@gmail.com> * verbose Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Normalizer.select_verbalizer Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com> * Exception Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com> * verbose Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * restart ci Signed-off-by: Evelina <ebakhturina@nvidia.com> --------- Signed-off-by: Nikolay Karpov <karpnv@gmail.com> Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com> Signed-off-by: Evelina <ebakhturina@nvidia.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Nikolay Karpov <nkarpov@nvidia.com> Co-authored-by: Evelina <ebakhturina@nvidia.com>
1 parent f96e37d commit 80d2dcb

3 files changed

Lines changed: 77 additions & 37 deletions

File tree

nemo_text_processing/logging.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import logging
15+
16+
# overriding with the library specific logger, so that it's possible to
17+
# customize in any downstream applications
18+
logger = logging.getLogger("NeMo-text-processing")
19+
logger.setLevel(logging.INFO)

nemo_text_processing/text_normalization/normalize.py

Lines changed: 45 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import regex
3030
import tqdm
3131
from joblib import Parallel, delayed
32+
from nemo_text_processing.logging import logger
3233
from nemo_text_processing.text_normalization.data_loader_utils import (
3334
load_file,
3435
post_process_punct,
@@ -96,6 +97,7 @@ class Normalizer:
9697
Note: punct_post_process flag in normalize() supports all languages.
9798
max_number_of_permutations_per_split: a maximum number
9899
of permutations which can be generated from input sequence of tokens.
100+
verbose: whether to print intermediate meta information
99101
"""
100102

101103
def __init__(
@@ -312,15 +314,15 @@ def normalize(
312314
313315
Args:
314316
text: string that may include semiotic classes
315-
verbose: whether to print intermediate meta information
316317
punct_pre_process: whether to perform punctuation pre-processing, for example, [25] -> [ 25 ]
317318
punct_post_process: whether to normalize punctuation
319+
verbose: whether to print intermediate meta information
318320
319321
Returns: spoken form
320322
"""
321323
if len(text.split()) > 500:
322-
print(
323-
"WARNING! Your input is too long and could take a long time to normalize."
324+
logger.warning(
325+
"Your input is too long and could take a long time to normalize. "
324326
"Use split_text_into_sentences() to make the input shorter and then call normalize_list()."
325327
)
326328
original_text = text
@@ -329,29 +331,35 @@ def normalize(
329331
text = text.strip()
330332
if not text:
331333
if verbose:
332-
print(text)
334+
logger.info(text)
333335
return text
334336
text = pynini.escape(text)
335337
tagged_lattice = self.find_tags(text)
336338
tagged_text = Normalizer.select_tag(tagged_lattice)
337339
if verbose:
338-
print(tagged_text)
340+
logger.info(tagged_text)
339341
self.parser(tagged_text)
340342
tokens = self.parser.parse()
341343
split_tokens = self._split_tokens_to_reduce_number_of_permutations(tokens)
342344
output = ""
343345
for s in split_tokens:
344-
tags_reordered = self.generate_permutations(s)
345-
verbalizer_lattice = None
346-
for tagged_text in tags_reordered:
347-
tagged_text = pynini.escape(tagged_text)
348-
349-
verbalizer_lattice = self.find_verbalizer(tagged_text)
350-
if verbalizer_lattice.num_states() != 0:
351-
break
352-
if verbalizer_lattice is None:
353-
raise ValueError(f"No permutations were generated from tokens {s}")
354-
output += ' ' + Normalizer.select_verbalizer(verbalizer_lattice)
346+
try:
347+
tags_reordered = self.generate_permutations(s)
348+
verbalizer_lattice = None
349+
for tagged_text in tags_reordered:
350+
tagged_text = pynini.escape(tagged_text)
351+
352+
verbalizer_lattice = self.find_verbalizer(tagged_text)
353+
if verbalizer_lattice.num_states() != 0:
354+
break
355+
if verbalizer_lattice is None:
356+
logger.warning(f"No permutations were generated from tokens {s}")
357+
return text
358+
output += ' ' + Normalizer.select_verbalizer(verbalizer_lattice)
359+
except Exception as e:
360+
if verbose:
361+
logger.warning("Failed text: " + text + str(e))
362+
return text
355363
output = SPACE_DUP.sub(' ', output[1:])
356364

357365
if self.lang == "en" and hasattr(self, 'post_processor'):
@@ -405,6 +413,7 @@ def normalize_manifest(
405413
batch_size: int,
406414
output_filename: Optional[str] = None,
407415
text_field: str = "text",
416+
verbose: bool = False,
408417
**kwargs,
409418
):
410419
"""
@@ -427,6 +436,7 @@ def _process_batch(
427436
batch_idx: int,
428437
batch: List[str],
429438
dir_name: str,
439+
verbose=verbose,
430440
punct_pre_process=False,
431441
punct_post_process=True,
432442
text_field: str = "text",
@@ -443,7 +453,7 @@ def _process_batch(
443453
normalized_lines = [
444454
self.normalize_line(
445455
line=line,
446-
verbose=False,
456+
verbose=verbose,
447457
punct_post_process=punct_post_process,
448458
punct_pre_process=punct_pre_process,
449459
text_field=text_field,
@@ -455,17 +465,22 @@ def _process_batch(
455465

456466
with open(f"{dir_name}/{batch_idx:06}.json", "w") as f_out:
457467
for line in normalized_lines:
468+
if isinstance(line[output_field], set):
469+
if len(line[output_field]) > 1:
470+
logger.warning("Len of " + str(line[output_field]) + " > 1 ")
471+
line[output_field] = line[output_field].pop()
472+
458473
f_out.write(json.dumps(line, ensure_ascii=False) + '\n')
459474

460-
print(f"Batch -- {batch_idx} -- is complete")
475+
logger.info(f"Batch -- {batch_idx} -- is complete")
461476

462477
if output_filename is None:
463478
output_filename = manifest.replace('.json', '_normalized.json')
464479

465480
with open(manifest, 'r') as f:
466481
lines = f.readlines()
467482

468-
print(f'Normalizing {len(lines)} line(s) of {manifest}...')
483+
logger.warning(f'Normalizing {len(lines)} line(s) of {manifest}...')
469484

470485
# to save intermediate results to a file
471486
batch = min(len(lines), batch_size)
@@ -481,6 +496,7 @@ def _process_batch(
481496
lines[i : i + batch],
482497
tmp_dir,
483498
text_field=text_field,
499+
verbose=verbose,
484500
punct_pre_process=punct_pre_process,
485501
punct_post_process=punct_post_process,
486502
**kwargs,
@@ -495,7 +511,7 @@ def _process_batch(
495511
lines = f_in.read()
496512
f_out.write(lines)
497513

498-
print(f'Normalized version saved at {output_filename}')
514+
logger.warning(f'Normalized version saved at {output_filename}')
499515

500516
def split_text_into_sentences(self, text: str, additional_split_symbols: str = "") -> List[str]:
501517
"""
@@ -557,7 +573,7 @@ def _permute(self, d: OrderedDict) -> List[str]:
557573
elif isinstance(v, bool):
558574
subl = ["".join(x) for x in itertools.product(subl, [f"{k}: true "])]
559575
else:
560-
raise ValueError()
576+
raise ValueError("Key: " + str(k) + " Value: " + str(v))
561577
l.extend(subl)
562578
return l
563579

@@ -636,6 +652,7 @@ def select_verbalizer(lattice: 'pynini.FstLike') -> str:
636652
637653
Args:
638654
lattice: verbalization lattice
655+
text: full text line to raise in case of an exception
639656
640657
Returns: shortest path
641658
"""
@@ -756,7 +773,7 @@ def parse_args():
756773
)
757774
start_time = perf_counter()
758775
if args.input_string:
759-
print(
776+
logger.info(
760777
normalizer.normalize(
761778
args.input_string,
762779
verbose=args.verbose,
@@ -775,13 +792,14 @@ def parse_args():
775792
text_field=args.manifest_text_field,
776793
output_field=args.output_field,
777794
output_filename=args.output_file,
795+
verbose=args.verbose,
778796
)
779797

780798
else:
781-
print("Loading data: " + args.input_file)
799+
logger.warning("Loading data: " + args.input_file)
782800
data = load_file(args.input_file)
783801

784-
print("- Data: " + str(len(data)) + " sentences")
802+
logger.warning("- Data: " + str(len(data)) + " sentences")
785803
normalizer_prediction = normalizer.normalize_list(
786804
data,
787805
verbose=args.verbose,
@@ -790,8 +808,8 @@ def parse_args():
790808
)
791809
if args.output_file:
792810
write_file(args.output_file, normalizer_prediction)
793-
print(f"- Normalized. Writing out to {args.output_file}")
811+
logger.warning(f"- Normalized. Writing out to {args.output_file}")
794812
else:
795-
print(normalizer_prediction)
813+
logger.warning(normalizer_prediction)
796814

797-
print(f"Execution time: {perf_counter() - start_time:.02f} sec")
815+
logger.warning(f"Execution time: {perf_counter() - start_time:.02f} sec")

nemo_text_processing/text_normalization/normalize_with_audio.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
import editdistance
2222
import pynini
23+
from nemo_text_processing.logging import logger
2324
from nemo_text_processing.text_normalization.data_loader_utils import post_process_punct, pre_process
2425
from nemo_text_processing.text_normalization.normalize import Normalizer
2526
from nemo_text_processing.text_normalization.utils_audio_based import get_alignment
@@ -172,7 +173,7 @@ def normalize(
172173
if cer_threshold > 0 and cer > cer_threshold:
173174
best_option = cur_deter_norm
174175
if verbose and True:
175-
print(
176+
logger.info(
176177
f"CER of the best normalization option is above cer_theshold, using determinictis option. CER: {cer}"
177178
)
178179
except:
@@ -201,8 +202,8 @@ def normalize_non_deterministic(
201202
text = pre_process(text) # to handle []
202203
text = text.strip()
203204
if not text:
204-
if verbose:
205-
print(text)
205+
if self.verbose:
206+
logger.info(text)
206207
return text
207208

208209
text = pynini.escape(text)
@@ -237,7 +238,8 @@ def normalize_non_deterministic(
237238
self._verbalize(tagged_text, normalized_texts, n_tagged, verbose=verbose)
238239

239240
if len(normalized_texts) == 0:
240-
raise ValueError()
241+
logger.warning("Failed text: " + text + ", normalized_texts: " + str(normalized_texts))
242+
return text
241243

242244
if punct_post_process:
243245
# do post-processing based on Moses detokenizer
@@ -358,7 +360,7 @@ def get_verbalized_text(tagged_text):
358360
tagged_text_reordered = pynini.escape(tagged_text_reordered)
359361
normalized_texts.extend(get_verbalized_text(tagged_text_reordered))
360362
if verbose:
361-
print(tagged_text_reordered)
363+
logger.info(tagged_text_reordered)
362364

363365
except pynini.lib.rewrite.Error:
364366
continue
@@ -383,10 +385,10 @@ def select_best_match(
383385
normalized_text, cer, idx = normalized_texts_cer[0]
384386

385387
if verbose:
386-
print('-' * 30)
388+
logger.info('-' * 30)
387389
for option in normalized_texts:
388-
print(option)
389-
print('-' * 30)
390+
logger.info(option)
391+
logger.info('-' * 30)
390392
return normalized_text, cer, idx
391393

392394

@@ -510,7 +512,7 @@ def parse_args():
510512
verbose=args.verbose,
511513
)
512514
for option in options:
513-
print(option)
515+
logger.info(option)
514516
elif args.manifest.endswith('.json'):
515517
normalizer = NormalizerWithAudio(
516518
input_case=args.input_case,
@@ -532,10 +534,11 @@ def parse_args():
532534
text_field=args.manifest_text_field,
533535
asr_pred_field=args.manifest_asr_pred_field,
534536
cer_threshold=args.cer_threshold,
537+
verbose=args.verbose,
535538
)
536539
else:
537540
raise ValueError(
538541
"Provide either path to .json manifest with '--manifest' OR "
539542
+ "an input text with '--text' (for debugging without audio)"
540543
)
541-
print(f'Execution time: {round((perf_counter() - start)/60, 2)} min.')
544+
logger.warning(f'Execution time: {round((perf_counter() - start)/60, 2)} min.')

0 commit comments

Comments
 (0)