Skip to content

Commit 3d3bfd4

Browse files
committed
Add tutorial
1 parent 48ca992 commit 3d3bfd4

File tree

12 files changed

+432
-0
lines changed

12 files changed

+432
-0
lines changed
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.

nemo_text_processing/text_normalization/fr/data/dates/months.tsv

Whitespace-only changes.
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import pynini
2+
from pynini.lib import pynutil
3+
from nemo_text_processing.text_normalization.fr.utils import get_abs_path
4+
5+
6+
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst
7+
8+
# TODO: add articles? 'le...'
9+
10+
# Month lookup table loaded from the data file that ships next to the grammars.
# NOTE(review): presumably maps month numbers to month names -- confirm against months.tsv.
month_numbers = pynini.string_file(get_abs_path("data/dates/months.tsv"))

# Accepts exactly two digits: a leading "0" is deleted, any other leading digit
# is kept, and the second digit is passed through unchanged.
delete_leading_zero = pynini.union(pynutil.delete("0"), pynini.difference(NEMO_DIGIT, "0")) + NEMO_DIGIT
12+
13+
class DateFst(GraphFst):
    '''
    Finite state transducer for classifying dates, e.g.:
        '02.03.2003' -> date {day: 'deux' month: 'mars' year: 'deux mille trois' preserve order: true }

    Args:
        cardinal: cardinal tagger; its ``graph`` attribute verbalizes the day and year digits
        deterministic: forwarded to GraphFst
    '''

    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        super().__init__(name="dates", kind="classify", deterministic=deterministic)

        # TODO: actually fix for 1, 'premier'
        numbers = cardinal.graph

        # One or two digits with any leading zero removed: '01' -> '1', '7' -> '7'.
        optional_leading_zero = delete_leading_zero | NEMO_DIGIT

        # '01' -> 'un': only day numbers 1..31 are let through to the cardinal graph.
        valid_days = pynini.union(*[str(day) for day in range(1, 32)])
        digit_day = optional_leading_zero @ valid_days @ numbers

        # '03' -> 'mars': only month numbers 1..12 are let through to the lookup table.
        # NOTE(review): assumes months.tsv maps month numbers to month names -- confirm.
        valid_months = pynini.union(*[str(month) for month in range(1, 13)])
        digit_month = optional_leading_zero @ valid_months
        number_to_month = digit_month @ month_numbers.optimize()

        # Formatting for '{month: mars}'
        month_graph = pynutil.insert("month: \"") + number_to_month + pynutil.insert("\"")

        # Two to four digits with a non-zero first digit, verbalized by the cardinal graph.
        digit_year = (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 1, 3)
        year_graph = pynutil.insert("year: \"") + (digit_year @ numbers) + pynutil.insert("\"")

        # NOTE(review): digit_day, month_graph and year_graph are not yet combined
        # into self.fst; the final date graph still has to be assembled.
38+
39+
40+
41+
42+
43+
44+
45+
46+
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
zéro 0
2+
un 1
3+
une 1
4+
deux 2
5+
trois 3
6+
quatre 4
7+
cinq 5
8+
six 6
9+
sept 7
10+
huit 8
11+
neuf 9
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
Mᵐᵉ madame
2+
Mᵐᵉˢ mesdames
3+
Mˡˡᵉ mademoiselle
4+
Mˡˡᵉˢ mesdemoiselles
5+
Dʳ docteur
6+
Dʳˢ docteurs
7+
Dʳᵉ docteure
8+
Dʳᵉˢ docteures
9+
apr. J.-C. après Jésus-Christ
10+
av. J.-C. avant Jésus-Christ
11+
le hon. l’honorable
12+
le très hon. le très honorable
13+
% pour cent
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
import pynini
2+
from pynini.lib import pynutil
3+
4+
from nemo_text_processing.text_normalization.fr.utils import get_abs_path
5+
6+
def apply_fst(text, fst):
    """ Print the output string found on the lowest-weight path
    obtained by composing the input string with the given FST.
    If pynini raises FstOpError (no valid path accepts the
    input string), an error message is printed instead.
    Nothing is returned.
    """
    try:
        lattice = text @ fst
        best_path = pynini.shortestpath(lattice)
        print(best_path.string())
    except pynini.FstOpError:
        print(f"Error: No valid output with given input: '{text}'")
16+
17+
# French only pronounces zeroes as stand alone
zero = pynini.string_map([("zéro", "0")])

# pynini function that creates explicit input-output mappings for a WFST
digits_map = pynini.string_map(
    [
        ("un", "1"),
        ("une", "1"),
        ("deux", "2"),
        ("trois", "3"),
        ("quatre", "4"),
        ("cinq", "5"),
        ("six", "6"),
        ("sept", "7"),
        ("huit", "8"),
        ("neuf", "9"),
    ]
)

# Same kind of mapping, loaded from a data file. The path is resolved through
# get_abs_path so that loading does not depend on the current working directory.
# NOTE(review): assumes digits.tsv lives under the fr package's data/numbers -- confirm.
digits = pynini.string_file(get_abs_path("data/numbers/digits.tsv"))

# 11-16 have their own words in French; 17-19 are built from "dix" plus a digit.
teens = pynini.string_map(
    [
        ("onze", "11"),
        ("douze", "12"),
        ("treize", "13"),
        ("quatorze", "14"),
        ("quinze", "15"),
        ("seize", "16"),
    ]
)

tens = pynini.string_map([("dix", "1")])
# Applies a closure from 0-1 of operation. Equivalent to regex /?/
delete_hyphen = pynini.closure(pynutil.delete("-"), 0, 1)

graph_tens = tens + delete_hyphen + digits
graph_tens_and_teens = graph_tens | teens

# A digit word, or an inserted "0" when no digit word is present.
graph_digits = digits | pynutil.insert("0")

# "un" is neither a teen nor "dix" + digit, so this prints the error message.
apply_fst("un", graph_tens_and_teens)
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import os
16+
17+
import pynini
18+
from pynini.lib import pynutil
19+
20+
from nemo_text_processing.text_normalization.en.graph_utils import (
21+
NEMO_WHITE_SPACE,
22+
GraphFst,
23+
delete_extra_space,
24+
delete_space,
25+
generator_main,
26+
)
27+
from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst
28+
from nemo_text_processing.text_normalization.fr.taggers.cardinal import CardinalFst
29+
from nemo_text_processing.text_normalization.fr.taggers.decimals import DecimalFst
30+
from nemo_text_processing.text_normalization.fr.taggers.fraction import FractionFst
31+
from nemo_text_processing.text_normalization.fr.taggers.ordinal import OrdinalFst
32+
from nemo_text_processing.text_normalization.fr.taggers.whitelist import WhiteListFst
33+
from nemo_text_processing.text_normalization.fr.taggers.word import WordFst
34+
from nemo_text_processing.utils.logging import logger
35+
36+
37+
class ClassifyFst(GraphFst):
    """
    Top-level tagger that combines all the other classification grammars, so that a whole
    (lower cased) sentence can be tokenized and classified by a single graph.
    For deployment, this grammar is compiled and exported to an OpenFst Finite State Archive (FAR) file.
    More details to deployment at NeMo-text-processing/tools/text_processing_deployment.

    Args:
        input_case: accepting either "lower_cased" or "cased" input.
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
        overwrite_cache: set to True to overwrite .far files
        whitelist: path to a file with whitelist replacements
    """

    def __init__(
        self,
        input_case: str,
        deterministic: bool = False,
        cache_dir: str = None,
        overwrite_cache: bool = False,
        whitelist: str = None,
    ):
        super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)

        # Decide where the compiled grammar is cached; stays None when caching is off.
        far_path = None
        caching_enabled = cache_dir is not None and cache_dir != "None"
        if caching_enabled:
            os.makedirs(cache_dir, exist_ok=True)
            whitelist_name = os.path.basename(whitelist) if whitelist else ""
            far_name = f"_{input_case}_fr_tn_{deterministic}_deterministic{whitelist_name}.far"
            far_path = os.path.join(cache_dir, far_name)

        # Reuse a previously exported grammar unless the caller asked for a rebuild.
        if far_path and not overwrite_cache and os.path.exists(far_path):
            self.fst = pynini.Far(far_path, mode="r")["tokenize_and_classify"]
            logger.info(f"ClassifyFst.fst was restored from {far_path}.")
            return

        logger.info(f"Creating ClassifyFst grammars. This might take some time...")

        # Build the individual taggers; ordinal, decimal and fraction reuse the cardinal tagger.
        self.cardinal = CardinalFst(deterministic=deterministic)
        self.ordinal = OrdinalFst(cardinal=self.cardinal, deterministic=deterministic)
        self.decimal = DecimalFst(cardinal=self.cardinal, deterministic=deterministic)
        self.fraction = FractionFst(cardinal=self.cardinal, ordinal=self.ordinal, deterministic=deterministic,)
        word_tagger = WordFst(deterministic=deterministic)
        self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist)
        punctuation_tagger = PunctuationFst(deterministic=deterministic)

        # Each tagger is given its own weight before being unioned; the word
        # tagger carries by far the largest one.
        weighted_taggers = [
            (self.whitelist.fst, 1.01),
            (self.cardinal.fst, 1.1),
            (self.fraction.fst, 1.09),
            (self.ordinal.fst, 1.1),
            (self.decimal.fst, 1.1),
            (word_tagger.fst, 200),
        ]
        classify = None
        for tagger_fst, weight in weighted_taggers:
            weighted = pynutil.add_weight(tagger_fst, weight)
            classify = weighted if classify is None else classify | weighted

        # A punctuation token, and a run of one or more such tokens and/or collapsed whitespace.
        punct_token = (
            pynutil.insert("tokens { ")
            + pynutil.add_weight(punctuation_tagger.fst, weight=2.1)
            + pynutil.insert(" }")
        )
        collapsed_whitespace = pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
        punct = pynini.closure(collapsed_whitespace | (pynutil.insert(" ") + punct_token), 1)

        # A classified token optionally surrounded by punctuation runs.
        token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
        leading_punct = pynini.closure(punct + pynutil.insert(" "))
        trailing_punct = pynini.closure(pynutil.insert(" ") + punct)
        token_plus_punct = leading_punct + token + trailing_punct

        # A sentence is one or more such tokens, or punctuation only.
        following_tokens = pynini.closure((delete_extra_space).ques + token_plus_punct)
        graph = token_plus_punct + following_tokens
        graph = delete_space + graph + delete_space
        graph |= punct

        self.fst = graph.optimize()

        # Export the freshly built grammar when a cache location is configured.
        if far_path:
            generator_main(far_path, {"tokenize_and_classify": self.fst})
            logger.info(f"ClassifyFst grammars are saved to {far_path}.")
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import csv
16+
import os
17+
18+
19+
def get_abs_path(rel_path):
    """
    Build a path anchored at the directory that contains this file

    Args:
        rel_path: path relative to the directory of this file

    Returns the directory of this file and rel_path, joined by "/"
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    return "/".join((module_dir, rel_path))
29+
30+
31+
def load_labels(abs_path):
    """
    loads a tab-separated file as a list of rows

    Args:
        abs_path: absolute path

    Returns list of rows, each row being the list of its tab-separated columns
    """
    # The context manager closes the file even if parsing raises; UTF-8 is
    # requested explicitly so accented labels do not depend on the platform's
    # default encoding.
    with open(abs_path, encoding="utf-8") as label_tsv:
        return list(csv.reader(label_tsv, delimiter="\t"))

0 commit comments

Comments
 (0)