22from pynini .lib import pynutil
33from nemo_text_processing .text_normalization .fr .utils import get_abs_path
44
5+ from nemo_text_processing .text_normalization .en .graph_utils import GraphFst , NEMO_DIGIT
56
6- from nemo_text_processing .text_normalization .en .graph_utils import GraphFst
77
88# TODO: add articles? 'le...'
99
# Lookup tables loaded from the French date data files: month number -> month
# name, and the decade/era spellings (e.g. "80s").
month_numbers = pynini.string_file(get_abs_path("data/dates/months.tsv"))
eras = pynini.string_file(get_abs_path("data/dates/eras.tsv"))

# Accepts exactly two digits and drops the first one when it is "0"
# ("03" -> "3", "12" -> "12"). NEMO_DIGIT matches any single digit.
delete_leading_zero = pynini.union(pynutil.delete("0"), NEMO_DIGIT - "0") + NEMO_DIGIT
1213
class DateFst(GraphFst):
    """Finite state transducer for classifying French dates, e.g.:

        '02.03.2003' -> date { day: "deux" month: "mars" year: "deux mille trois" preserve_order: true }

    Accepted input shapes:
        * numeric dates ``DD<sep>MM`` or ``DD<sep>MM<sep>YYYY`` where ``<sep>``
          is one of ``/``, ``.`` or ``-`` (the same separator throughout);
        * a day followed by a month name and an optional year ("10 mars 2023");
        * the decade spellings listed in ``data/dates/eras.tsv`` ("80s");
        * several days joined by ``-`` or ``/`` followed by a month name and
          an optional year ("17/18/19 juin").

    A leading "le " or "les " is accepted in front of any day number and is
    kept inside that day field.

    Args:
        cardinal: cardinal tagger; its ``all_nums_no_tokens`` graph is used to
            spell out day and year numbers.
        deterministic: if True, day "1" is only rendered as "premier"; if
            False, the cardinal reading of "1" is offered as well.

    Attributes:
        optional_le: acceptor for an optional "le " / "les " determiner.
        day_graph: day number (optionally preceded by the determiner) ->
            ``day: "<words>"``.
        month_graph: month number -> ``month: "<name>"``.
        year_graph: 2 to 4 digit year not starting with "0" ->
            ``year: "<words>"``.
        fst: the optimized union of every date pattern above.
    """

    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        super().__init__(name="dates", kind="classify")

        cardinal_graph = cardinal.all_nums_no_tokens

        # 'le ' -> 'le ', 'les ' -> 'les '
        le_determiner = pynini.accep("le ") | pynini.accep("les ")
        self.optional_le = pynini.closure(le_determiner, 0, 1)

        # '01' -> '1'; single digits pass through unchanged.
        optional_leading_zero = delete_leading_zero | NEMO_DIGIT

        # Day 1 is the ordinal "premier"; days 2..31 are plain cardinals.
        # "1" is kept out of the cardinal branch so the deterministic graph
        # has exactly one output for the first of the month.
        premier = pynini.string_map([("1", "premier")])
        cardinal_days = pynini.union(*[str(day) for day in range(2, 32)]) @ cardinal_graph
        day_number_to_word = premier | cardinal_days
        if not deterministic:
            day_number_to_word |= pynini.accep("1") @ cardinal_graph

        digit_to_day = self.optional_le + (optional_leading_zero @ day_number_to_word)
        self.day_graph = pynutil.insert("day: \"") + digit_to_day + pynutil.insert("\"")

        # '03' -> 'mars'
        valid_month_number = pynini.union(*[str(month) for month in range(1, 13)])
        month_graph = optional_leading_zero @ valid_month_number @ month_numbers
        self.month_graph = pynutil.insert("month: \"") + month_graph + pynutil.insert("\"")

        # 2025 -> deux mille vingt cinq
        accept_year_digits = (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 1, 3)
        digits_to_year = accept_year_digits @ cardinal_graph
        self.year_graph = pynutil.insert("year: \"") + digits_to_year + pynutil.insert("\"")

        # Accepts "janvier", "février", etc.
        # pynini.project() returns a new FST; the .project() method would
        # rewrite the shared module-level table in place and break any
        # DateFst built afterwards.
        month_names = pynini.project(month_numbers, "output")
        month_name_graph = pynutil.insert("month: \"") + month_names + pynutil.insert("\"")

        optional_spaced_year = pynini.closure(pynini.accep(" ") + self.year_graph, 0, 1)

        # Putting it all together: collect the body of every pattern, then
        # wrap the union once so all branches share the same opening and
        # closing text.
        date_bodies = []

        # "02.03.2003", "02/03", ...
        for separator in ["/", ".", "-"]:
            separator_to_space = pynutil.delete(separator) + pynutil.insert(" ")
            date_bodies.append(
                self.day_graph
                + separator_to_space
                + self.month_graph
                + pynini.closure(separator_to_space + self.year_graph, 0, 1)
            )

        # "17 janvier", "10 mars 2023"
        date_bodies.append(self.day_graph + pynini.accep(" ") + month_name_graph + optional_spaced_year)

        # Accepts "70s", "80s", etc.
        date_bodies.append(pynutil.insert("decade: \"") + eras + pynutil.insert("\""))

        # Accepts date ranges,
        # "17-18-19 juin" -> date { day: "dix-sept" day: "dix-huit" day: "dix-neuf" month: "juin" ... }
        for separator in ["-", "/"]:
            separator_to_space = pynutil.delete(separator) + pynutil.insert(" ")
            date_bodies.append(
                pynini.closure(self.day_graph + separator_to_space, 1)
                + self.day_graph
                + pynini.accep(" ")
                + month_name_graph
                + optional_spaced_year
            )

        self.fst = (
            pynutil.insert("date { ")
            + pynini.union(*date_bodies)
            + pynutil.insert(" preserve_order: true }")
        ).optimize()
90+
91+
92+
def apply_fst(text, fst):
    """Print ``text --> <best rewrite>`` for ``text`` under ``fst``.

    If pynini raises ``FstOpError`` (no output string can be extracted),
    an error line naming the input is printed instead.
    """
    try:
        best_rewrite = pynini.shortestpath(text @ fst).string()
    except pynini.FstOpError:
        print(f"Error: No valid output with given input: '{text}'")
    else:
        print(text, "-->", best_rewrite)
98+
if __name__ == "__main__":
    # Manual smoke test: run every sub-graph and the full tagger on sample
    # inputs and print the rewrites.
    from nemo_text_processing.text_normalization.fr.taggers.cardinal import CardinalFst

    fst = DateFst(CardinalFst())

    # (heading, graph under test, sample inputs), printed in this order.
    demo_sections = [
        ("DETERMINER", fst.optional_le, ["le ", ""]),
        ("\nDAY GRAPH", fst.day_graph, ["01", "02", "3", "12", "le 01", "le 12"]),
        ("\nMONTH GRAPH", fst.month_graph, ["1", "3", "06"]),
        ("\nYEAR", fst.year_graph, ["2025"]),
        (
            "\nDATE",
            fst.fst,
            [
                "02.03.2003",
                "02/03/2003",
                "02-03-2003",
                "le 02.03.2003",
                "02.03",
                "17 janvier",
                "10 mars 2023",
                "le 10 mars 2023",
            ],
        ),
        ("\nERAS", fst.fst, ["80s"]),
        # returns: date { day: "les dix-sept" day: "dix-huit" day: "dix-neuf" month: "juin" preserve_order: true }
        ("\nDATE RANGES", fst.fst, ["les 17/18/19 juin"]),
    ]

    for heading, graph, samples in demo_sections:
        print(heading)
        for sample in samples:
            apply_fst(sample, graph)
0 commit comments