Skip to content

Commit fd3e7bf

Browse files
committed
Add date parser
1 parent 3d3bfd4 commit fd3e7bf

File tree

8 files changed

+226
-19
lines changed

8 files changed

+226
-19
lines changed
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
20s twenties
2+
30s thirties
3+
40s forties
4+
50s fifties
5+
60s sixties
6+
70s seventies
7+
80s eighties
8+
90s nineties
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
1 janvier
2+
2 février
3+
3 mars
4+
4 avril
5+
5 mai
6+
6 juin
7+
7 juillet
8+
8 août
9+
9 septembre
10+
10 octobre
11+
11 novembre
12+
12 décembre

nemo_text_processing/text_normalization/fr/taggers/date.py

Lines changed: 110 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,45 +2,137 @@
22
from pynini.lib import pynutil
33
from nemo_text_processing.text_normalization.fr.utils import get_abs_path
44

5+
from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, NEMO_DIGIT
56

6-
from nemo_text_processing.text_normalization.en.graph_utils import GraphFst
77

88
# TODO: add articles? 'le...'
99

1010
month_numbers = pynini.string_file(get_abs_path("data/dates/months.tsv"))
11+
eras = pynini.string_file(get_abs_path("data/dates/eras.tsv"))
1112
# Matches exactly two characters and drops a leading "0" when there is one:
# "07" -> "7", while "17" -> "17" passes through unchanged.
# NEMO_DIGIT is imported from en.graph_utils (presumably an acceptor for a single digit - confirm there).
delete_leading_zero = (pynutil.delete("0") | (NEMO_DIGIT - "0")) + NEMO_DIGIT
1213

1314
class DateFst(GraphFst):
    '''Finite state transducer for classifying French dates, e.g.:
        '02.03.2003'        -> date { day: "deux" month: "mars" year: "deux mille trois" preserve_order: true }
        '10 mars 2023'      -> date { day: "dix" month: "mars" year: "deux mille vingt-trois" preserve_order: true }
        '80s'               -> date { decade: "eighties" preserve_order: true }
        'les 17/18/19 juin' -> date { day: "les dix-sept" day: "dix-huit" day: "dix-neuf" month: "juin" preserve_order: true }

    An optional leading determiner ('le ' / 'les ') is accepted and kept inside
    the ``day`` field, as the last example shows.

    Args:
        cardinal: cardinal tagger; its ``all_nums_no_tokens`` graph spells out day and year numbers
        deterministic: if True, the first day of the month is only tagged as "premier";
            if False, the plain cardinal reading of "1" is kept as an alternative
    '''

    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        # Forward `deterministic` so the base class sees the same mode as the caller requested.
        super().__init__(name="dates", kind="classify", deterministic=deterministic)

        cardinal_graph = cardinal.all_nums_no_tokens

        # Optional determiner, passed through unchanged: 'le ' -> 'le ', 'les ' -> 'les '
        le_determiner = pynini.accep("le ") | pynini.accep("les ")
        self.optional_le = pynini.closure(le_determiner, 0, 1)

        # Day of month: '01' -> 'premier', '02' -> 'deux', '12' -> 'douze'
        optional_leading_zero = delete_leading_zero | NEMO_DIGIT
        premier = pynini.string_map([("1", "premier")])
        # In deterministic mode "1" must have a single reading, so the cardinal
        # branch only covers 2..31; otherwise both readings of "1" stay available.
        first_cardinal_day = 2 if deterministic else 1
        cardinal_days = pynini.union(*[str(x) for x in range(first_cardinal_day, 32)]) @ cardinal_graph
        day_number_to_word = premier | cardinal_days

        digit_to_day = self.optional_le + (optional_leading_zero @ day_number_to_word)
        self.day_graph = pynutil.insert("day: \"") + digit_to_day + pynutil.insert("\"")

        # Month number: '03' -> 'mars'
        normalize_month_number = optional_leading_zero @ pynini.union(*[str(x) for x in range(1, 13)])
        month_graph = normalize_month_number @ month_numbers
        self.month_graph = pynutil.insert("month: \"") + month_graph + pynutil.insert("\"")

        # Month name: accepts "janvier", "février", etc.
        # pynini.project() returns a new FST. The Fst.project() method would
        # rewrite the shared module-level `month_numbers` in place and break
        # the number -> name mapping for every later DateFst instance.
        month_names = pynini.project(month_numbers, "output")
        month_name_graph = pynutil.insert("month: \"") + month_names + pynutil.insert("\"")

        # Year: 2025 -> deux mille vingt-cinq (2 to 4 digits, no leading zero)
        accept_year_digits = (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 1, 3)
        digits_to_year = accept_year_digits @ cardinal_graph
        self.year_graph = pynutil.insert("year: \"") + digits_to_year + pynutil.insert("\"")

        open_token = pynutil.insert("date { ")
        close_token = pynutil.insert(" preserve_order: true }")
        optional_named_year = pynini.closure(pynini.accep(" ") + self.year_graph, 0, 1)

        # Collect the alternatives and union them once; seeding the union with
        # an empty-string acceptor would make the tagger accept empty input.
        graphs = []

        # Fully numeric dates: "02.03.2003", "02/03/2003", "02-03-2003", "02.03"
        for separator in ["/", ".", "-"]:
            separator_to_space = pynutil.delete(separator) + pynutil.insert(" ")
            graphs.append(
                open_token
                + self.day_graph
                + separator_to_space
                + self.month_graph
                + pynini.closure(separator_to_space + self.year_graph, 0, 1)
                + close_token
            )

        # Day + month name (+ year): "17 janvier", "10 mars 2023"
        graphs.append(
            open_token
            + self.day_graph
            + pynini.accep(" ")
            + month_name_graph
            + optional_named_year
            + close_token
        )

        # Decades: "70s", "80s", etc
        graphs.append(pynutil.insert("date { decade: \"") + eras + pynutil.insert("\" preserve_order: true }"))

        # Day ranges: "17-18-19 juin" -> date { day: "dix-sept" day: "dix-huit" day: "dix-neuf" month: "juin" ... }
        for separator in ["-", "/"]:
            separator_to_space = pynutil.delete(separator) + pynutil.insert(" ")
            graphs.append(
                open_token
                + pynini.closure(self.day_graph + separator_to_space, 1)
                + self.day_graph
                + pynini.accep(" ")
                + month_name_graph
                + optional_named_year
                + close_token
            )

        self.fst = pynini.union(*graphs).optimize()
90+
91+
92+
93+
def apply_fst(text, fst):
    """Print the shortest-path output of ``fst`` for ``text``, or an error line if there is none."""
    try:
        rewritten = pynini.shortestpath(text @ fst).string()
    except pynini.FstOpError:
        print(f"Error: No valid output with given input: '{text}'")
    else:
        print(text, "-->", rewritten)
98+
99+
if __name__ == "__main__":
    from nemo_text_processing.text_normalization.fr.taggers.cardinal import CardinalFst

    fst = DateFst(CardinalFst())

    # Manual smoke run: (section header, graph under test, sample inputs), printed in order.
    demo_sections = (
        ('DETERMINER', fst.optional_le, ("le ", "")),
        ("\nDAY GRAPH", fst.day_graph, ("01", "02", "3", "12", "le 01", "le 12")),
        ("\nMONTH GRAPH", fst.month_graph, ("1", "3", "06")),
        ("\nYEAR", fst.year_graph, ("2025",)),
        (
            "\nDATE",
            fst.fst,
            (
                "02.03.2003",
                "02/03/2003",
                "02-03-2003",
                "le 02.03.2003",
                "02.03",
                "17 janvier",
                "10 mars 2023",
                "le 10 mars 2023",
            ),
        ),
        ("\nERAS", fst.fst, ("80s",)),
        # returns: date { day: "les dix-sept" day: "dix-huit" day: "dix-neuf" month: "juin" preserve_order: true }
        ("\nDATE RANGES", fst.fst, ("les 17/18/19 juin",)),
    )

    for header, graph, samples in demo_sections:
        print(header)
        for sample in samples:
            apply_fst(sample, graph)

nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
from nemo_text_processing.text_normalization.fr.taggers.ordinal import OrdinalFst
3232
from nemo_text_processing.text_normalization.fr.taggers.whitelist import WhiteListFst
3333
from nemo_text_processing.text_normalization.fr.taggers.word import WordFst
34+
from nemo_text_processing.text_normalization.fr.taggers.date import DateFst
3435
from nemo_text_processing.utils.logging import logger
3536

3637

@@ -85,9 +86,13 @@ def __init__(
8586
self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist)
8687
whitelist_graph = self.whitelist.fst
8788
punct_graph = PunctuationFst(deterministic=deterministic).fst
89+
90+
self.date = DateFst(self.cardinal, deterministic=deterministic)
91+
date_graph = self.date.fst
8892

8993
classify = (
9094
pynutil.add_weight(whitelist_graph, 1.01)
95+
| pynutil.add_weight(date_graph, 1.1)
9196
| pynutil.add_weight(cardinal_graph, 1.1)
9297
| pynutil.add_weight(fraction_graph, 1.09)
9398
| pynutil.add_weight(ordinal_graph, 1.1)
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pynini
16+
from pynini.lib import pynutil
17+
18+
from nemo_text_processing.text_normalization.en.graph_utils import (
19+
NEMO_NOT_QUOTE,
20+
NEMO_SPACE,
21+
GraphFst,
22+
delete_preserve_order
23+
)
24+
25+
class DateFst(GraphFst):
    """
    Finite state transducer for verbalizing date, e.g.
        date {day: "deux" month: "mars" year: "deux mille trois" preserve_order: true} -> deux mars deux mille trois

    Three layouts are accepted: one or more days followed by a month and an
    optional year, a month followed by a year, and a lone decade.

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="date", kind="verbalize", deterministic=deterministic)

        def quoted_field(field_name: str):
            # Strip `<field_name>: "` and the closing quote, keeping the non-empty quoted text.
            return (
                pynutil.delete(field_name + ": \"")
                + pynini.closure(NEMO_NOT_QUOTE, 1)
                + pynutil.delete("\"")
            )

        day = quoted_field("day")
        month = quoted_field("month")
        year = quoted_field("year")
        decade = quoted_field("decade")

        one_or_more_days = pynini.closure(day + NEMO_SPACE, 1)
        optional_year = pynini.closure(NEMO_SPACE + year, 0, 1)

        graph_dmy = one_or_more_days + month + optional_year + delete_preserve_order
        graph_my = month + NEMO_SPACE + year + delete_preserve_order
        graph_decade = decade + delete_preserve_order

        self.graph = graph_dmy | graph_my | graph_decade
        self.fst = self.delete_tokens(self.graph).optimize()
52+
53+
54+
def apply_fst(text, fst):
    """Print the shortest-path output of ``fst`` for ``text``, or an error line if there is none."""
    try:
        rewritten = pynini.shortestpath(text @ fst).string()
    except pynini.FstOpError:
        print(f"Error: No valid output with given input: '{text}'")
    else:
        print(text, "-->", rewritten)
59+
60+
if __name__ == "__main__":
    # Manual smoke run: feed the verbalizer the tagger output for "les 17/18/19 juin".
    tagged_range = 'date { day: "les dix-sept" day: "dix-huit" day: "dix-neuf" month: "juin" preserve_order: true }'
    fst = DateFst()
    apply_fst(tagged_range, fst.fst)

nemo_text_processing/text_normalization/fr/verbalizers/verbalize.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from nemo_text_processing.text_normalization.fr.verbalizers.decimals import DecimalFst
1818
from nemo_text_processing.text_normalization.fr.verbalizers.fraction import FractionFst
1919
from nemo_text_processing.text_normalization.fr.verbalizers.ordinal import OrdinalFst
20+
from nemo_text_processing.text_normalization.fr.verbalizers.date import DateFst
2021

2122

2223
class VerbalizeFst(GraphFst):
@@ -40,6 +41,8 @@ def __init__(self, deterministic: bool = True):
4041
fraction = FractionFst(ordinal=ordinal, deterministic=deterministic)
4142
fraction_graph = fraction.fst
4243
whitelist_graph = WhiteListFst(deterministic=deterministic).fst
44+
date = DateFst(deterministic=deterministic)
45+
date_graph = date.fst
4346

44-
graph = cardinal_graph | decimal_graph | ordinal_graph | fraction_graph | whitelist_graph
47+
graph = cardinal_graph | decimal_graph | ordinal_graph | fraction_graph | whitelist_graph | date_graph
4548
self.fst = graph
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
02.03.2003~deux mars deux mille trois
2+
02/03/2003~deux mars deux mille trois
3+
02-03-2003~deux mars deux mille trois
4+
le 02.03.2003~le deux mars deux mille trois
5+
17.06~dix-sept juin
6+
17 janvier~dix-sept janvier
7+
10 mars 2023~dix mars deux mille vingt-trois
8+
le 10 mars 2023~le dix mars deux mille vingt-trois
9+
les 80s~les eighties
10+
les 17/18 juin~les dix-sept dix-huit juin
11+
les 17/18/19 mars~les dix-sept dix-huit dix-neuf mars
12+
les 17-18-19 juin~les dix-sept dix-huit dix-neuf juin
13+
les 17-18-19 juin 2025~les dix-sept dix-huit dix-neuf juin deux mille vingt-cinq

tests/nemo_text_processing/fr/test_date.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from parameterized import parameterized
1717

1818
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
19+
from nemo_text_processing.text_normalization.normalize import Normalizer
1920

2021
from ..utils import CACHE_DIR, parse_test_case_file
2122

@@ -29,3 +30,12 @@ class TestDate:
2930
def test_denorm(self, test_input, expected):
3031
pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
3132
assert pred == expected
33+
34+
normalizer = Normalizer(input_case='cased', lang='fr', cache_dir=CACHE_DIR, overwrite_cache=False)
35+
36+
@parameterized.expand(parse_test_case_file('fr/data_text_normalization/test_cases_date.txt'))
37+
@pytest.mark.run_only_on('CPU')
38+
@pytest.mark.unit
39+
def test_norm(self, test_input, expected):
40+
pred = self.normalizer.normalize(test_input, verbose=False)
41+
assert pred == expected

0 commit comments

Comments
 (0)