Skip to content

Commit f806134

Browse files
update ITN to work after Punctuation capitalization model (#22)
* add cases with capitalization, cardinal, decimal pass
  Signed-off-by: ekmb <ebakhturina@nvidia.com>
* fix telephone, ordinal
  Signed-off-by: ekmb <ebakhturina@nvidia.com>
* restarting ci
  Signed-off-by: ekmb <ebakhturina@nvidia.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* restarting ci
  Signed-off-by: ekmb <ebakhturina@nvidia.com>
* restarting ci
  Signed-off-by: ekmb <ebakhturina@nvidia.com>
* restarting ci
  Signed-off-by: ekmb <ebakhturina@nvidia.com>
* update electronic
  Signed-off-by: ekmb <ebakhturina@nvidia.com>
* review feedback, update whitelist
  Signed-off-by: ekmb <ebakhturina@nvidia.com>
* rename capitalize func
  Signed-off-by: ekmb <ebakhturina@nvidia.com>
* fix SH tests
  Signed-off-by: ekmb <ebakhturina@nvidia.com>
* fix tests
  Signed-off-by: ekmb <ebakhturina@nvidia.com>
* update jenkins folder name
  Signed-off-by: ekmb <ebakhturina@nvidia.com>
* added cased arg to ITN
  Signed-off-by: ekmb <ebakhturina@nvidia.com>
* add input_case arg to other lang
  Signed-off-by: ekmb <ebakhturina@nvidia.com>
* jenkins dirs update
  Signed-off-by: ekmb <ebakhturina@nvidia.com>
* update test
  Signed-off-by: ekmb <ebakhturina@nvidia.com>
* update test
  Signed-off-by: ekmb <ebakhturina@nvidia.com>
* fix codeql errors
  Signed-off-by: ekmb <ebakhturina@nvidia.com>
* fix sh
  Signed-off-by: ekmb <ebakhturina@nvidia.com>
* review
  Signed-off-by: ekmb <ebakhturina@nvidia.com>
* update jenkins dir
  Signed-off-by: ekmb <ebakhturina@nvidia.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* fix default value
  Signed-off-by: ekmb <ebakhturina@nvidia.com>
---------
Signed-off-by: ekmb <ebakhturina@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 56574c1 commit f806134

65 files changed

Lines changed: 1524 additions & 149 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

Jenkinsfile

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,19 +10,17 @@ pipeline {
1010
disableConcurrentBuilds(abortPrevious: true)
1111
}
1212
environment {
13-
AR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-09-23-1'
14-
DE_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-09-23-2'
15-
EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-03-23-1'
16-
ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-09-23-1'
13+
AR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
14+
DE_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
15+
EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-16-23-0'
16+
ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
1717
FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-16-23-1'
18-
PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-09-23-1'
19-
RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-09-23-1'
20-
VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-09-23-1'
21-
SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-09-23-1'
22-
ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-09-23-1'
23-
DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-09-23-1'
24-
25-
18+
PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
19+
RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
20+
VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
21+
SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
22+
ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
23+
DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
2624
}
2725
stages {
2826

nemo_text_processing/hybrid/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030

3131
DELIMITER = '~~'
3232

33-
cardinal_graph = CardinalFst().graph_no_exception
33+
cardinal_graph = CardinalFst(input_case="cased").graph_no_exception
3434
cardinal_graph = (
3535
pynini.closure(pynini.union("In ", "in ")) + cardinal_graph + pynini.closure(pynini.accep(" ") + cardinal_graph)
3636
)

nemo_text_processing/inverse_text_normalization/ar/taggers/tokenize_and_classify.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
generator_main,
3030
)
3131
from nemo_text_processing.text_normalization.ar.taggers.tokenize_and_classify import ClassifyFst as TNClassifyFst
32+
from nemo_text_processing.text_normalization.en.graph_utils import INPUT_LOWER_CASED
3233
from pynini.lib import pynutil
3334

3435

@@ -42,15 +43,22 @@ class ClassifyFst(GraphFst):
4243
cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
4344
overwrite_cache: set to True to overwrite .far files
4445
whitelist: path to a file with whitelist replacements
46+
input_case: accepting either "lower_cased" or "cased" input.
4547
"""
4648

47-
def __init__(self, cache_dir: str = None, overwrite_cache: bool = False, whitelist: str = None):
49+
def __init__(
50+
self,
51+
cache_dir: str = None,
52+
overwrite_cache: bool = False,
53+
whitelist: str = None,
54+
input_case: str = INPUT_LOWER_CASED,
55+
):
4856
super().__init__(name="tokenize_and_classify", kind="classify")
4957

5058
far_file = None
5159
if cache_dir is not None and cache_dir != "None":
5260
os.makedirs(cache_dir, exist_ok=True)
53-
far_file = os.path.join(cache_dir, "_en_itn.far")
61+
far_file = os.path.join(cache_dir, f"ar_itn_{input_case}.far")
5462
if not overwrite_cache and far_file and os.path.exists(far_file):
5563
self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
5664
logging.info(f"ClassifyFst.fst was restored from {far_file}.")

nemo_text_processing/inverse_text_normalization/de/taggers/tokenize_and_classify.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -40,6 +40,7 @@
4040
from nemo_text_processing.text_normalization.de.verbalizers.ordinal import OrdinalFst as TNOrdinalVerbalizer
4141
from nemo_text_processing.text_normalization.de.verbalizers.time import TimeFst as TNTimeVerbalizer
4242
from nemo_text_processing.text_normalization.en.graph_utils import (
43+
INPUT_LOWER_CASED,
4344
GraphFst,
4445
delete_extra_space,
4546
delete_space,
@@ -58,17 +59,23 @@ class ClassifyFst(GraphFst):
5859
cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
5960
overwrite_cache: set to True to overwrite .far files
6061
whitelist: path to a file with whitelist replacements
62+
input_case: accepting either "lower_cased" or "cased" input.
6163
"""
6264

6365
def __init__(
64-
self, cache_dir: str = None, overwrite_cache: bool = False, deterministic: bool = True, whitelist: str = None
66+
self,
67+
cache_dir: str = None,
68+
overwrite_cache: bool = False,
69+
deterministic: bool = True,
70+
whitelist: str = None,
71+
input_case: str = INPUT_LOWER_CASED,
6572
):
6673
super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)
6774

6875
far_file = None
6976
if cache_dir is not None and cache_dir != 'None':
7077
os.makedirs(cache_dir, exist_ok=True)
71-
far_file = os.path.join(cache_dir, "_de_itn.far")
78+
far_file = os.path.join(cache_dir, f"de_itn_{input_case}.far")
7279
if not overwrite_cache and far_file and os.path.exists(far_file):
7380
self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
7481
logging.info(f"ClassifyFst.fst was restored from {far_file}.")

nemo_text_processing/inverse_text_normalization/en/data/measurements.tsv

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
°f fahrenheit
2-
°c celsius
3-
°f degree fahrenheit
4-
°c degree celsius
5-
k kelvin
1+
°F fahrenheit
2+
°C celsius
3+
°F degree fahrenheit
4+
°C degree celsius
5+
K kelvin
66
km kilometer
77
m meter
88
cm centimeter

nemo_text_processing/inverse_text_normalization/en/data/months.tsv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,4 @@ august
99
september
1010
october
1111
november
12-
december
12+
december
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
January
2+
February
3+
March
4+
April
5+
May
6+
June
7+
July
8+
August
9+
September
10+
October
11+
November
12+
December
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
P M P.M.
2+
PM P.M.
3+
P.M.
4+
P.M P.M.
5+
AM A.M.
6+
A.M.
7+
A.M A.M.
8+
A M A.M.
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
CST C S T
2+
CET C E T
3+
PST P S T
4+
EST E S T
5+
PT P T
6+
ET E T
7+
GMT G M T

nemo_text_processing/inverse_text_normalization/en/data/time/to_hour.tsv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ four 3
55
five 4
66
six 5
77
seven 6
8-
eigh 7
8+
eight 7
99
nine 8
1010
ten 9
1111
eleven 10

0 commit comments

Comments
 (0)