@@ -36,117 +36,120 @@ class CardinalFst(GraphFst):
3636
3737 def __init__ (self ):
3838 super ().__init__ (name = "cardinal" , kind = "classify" )
39- graph_zero = pynini .string_file (get_abs_path ("data/numbers/zero.tsv" ))
40- graph_digit = pynini .string_file (get_abs_path ("data/numbers/digit.tsv" ))
4139 graph_ties = pynini .string_file (get_abs_path ("data/numbers/ties.tsv" ))
4240 graph_teen = pynini .string_file (get_abs_path ("data/numbers/teen.tsv" ))
4341
42+ thousand_words = pynini .union ("ngàn" , "nghìn" )
43+ negative_words = pynini .union ("âm" , "trừ" )
44+
45+ graph_hundred = pynini .cross ("trăm" , "" )
46+ graph_ten = pynini .cross ("mươi" , "" )
47+ zero = pynini .cross (pynini .union ("linh" , "lẻ" ), "0" )
48+
49+ graph_zero = pynini .string_file (get_abs_path ("data/numbers/zero.tsv" ))
50+ graph_digit = pynini .string_file (get_abs_path ("data/numbers/digit.tsv" ))
4451 graph_one = pynini .cross ("mốt" , "1" )
4552 graph_four = pynini .cross ("tư" , "4" )
4653 graph_five = pynini .cross ("lăm" , "5" )
4754 graph_half = pynini .cross ("rưỡi" , "5" )
48- graph_hundred = pynini .cross ("trăm" , "" )
49- graph_ten = pynini .cross ("mươi" , "" )
50- zero = pynini .cross (pynini .union ("linh" , "lẻ" ), "0" )
5155
5256 optional_ten = pynini .closure (delete_space + graph_ten , 0 , 1 )
5357 last_digit_exception = pynini .project (pynini .cross ("năm" , "5" ), "input" )
54- last_digit = pynini .union (
58+ self . last_digit = pynini .union (
5559 (pynini .project (graph_digit , "input" ) - last_digit_exception .arcsort ()) @ graph_digit ,
5660 graph_one ,
5761 graph_four ,
5862 graph_five ,
5963 )
60-
61- graph_hundred_ties_component = (graph_digit | graph_zero ) + delete_space + graph_hundred
62- graph_hundred_ties_component += delete_space
63- graph_hundred_ties_component += pynini .union (
64+ last_digit = self .last_digit
65+ # Build hundreds component (e.g., "một trăm", "hai trăm")
66+ graph_hundreds_component = (graph_digit | graph_zero ) + delete_space + graph_hundred
67+ graph_hundreds_component += delete_space
68+ graph_hundreds_component += pynini .union (
6469 graph_teen ,
65- (graph_half | graph_four | graph_one ) + pynutil .insert ("0" ),
66- graph_ties + optional_ten + ((delete_space + last_digit ) | pynutil .insert ("0" )),
67- zero + delete_space + (graph_digit | graph_four ),
68- pynutil .insert ("00" ),
69- )
70- graph_hundred_ties_component |= (
70+ (graph_half | graph_four | graph_one ) + pynutil .insert ("0" , weight = 0.1 ),
71+ graph_ties + optional_ten + ((delete_space + last_digit ) | pynutil .insert ("0" , weight = 0.1 )),
72+ zero + delete_space + (graph_digit | graph_four | graph_five ),
73+ pynutil .insert ("00" , weight = 0.1 ),
74+ ). optimize ()
75+ graph_hundreds_component |= (
7176 pynutil .insert ("0" )
7277 + delete_space
7378 + pynini .union (
7479 graph_teen ,
7580 graph_ties + optional_ten + delete_space + last_digit ,
76- graph_ties + delete_space + graph_ten + pynutil .insert ("0" ),
77- zero + delete_space + (graph_digit | graph_four ),
78- )
81+ graph_ties + delete_space + graph_ten + pynutil .insert ("0" , weight = 0.1 ),
82+ zero + delete_space + (graph_digit | graph_four | graph_five ),
83+ ).optimize ()
84+ )
85+ graph_hundred_component = graph_hundreds_component | (
86+ pynutil .insert ("00" , weight = 0.1 ) + delete_space + graph_digit
7987 )
80- graph_hundred_component = graph_hundred_ties_component | (pynutil .insert ("00" ) + delete_space + graph_digit )
8188
8289 graph_hundred_component_at_least_one_none_zero_digit = graph_hundred_component @ (
8390 pynini .closure (NEMO_DIGIT ) + (NEMO_DIGIT - "0" ) + pynini .closure (NEMO_DIGIT )
8491 )
8592 self .graph_hundred_component_at_least_one_none_zero_digit = (
86- graph_hundred_component_at_least_one_none_zero_digit
93+ graph_hundred_component_at_least_one_none_zero_digit . optimize ()
8794 )
88- graph_hundred_ties_zero = graph_hundred_ties_component | pynutil .insert ("000" )
95+ graph_hundreds_zero = graph_hundreds_component | pynutil .insert ("000" , weight = 0.1 )
8996
9097 graph_thousands = pynini .union (
91- graph_hundred_component_at_least_one_none_zero_digit
92- + delete_space
93- + pynutil .delete (pynini .union ("nghìn" , "ngàn" )),
98+ graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil .delete (thousand_words ),
9499 pynutil .insert ("000" , weight = 0.1 ),
95- )
96-
97- graph_ten_thousand = pynini .union (
98- graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil .delete ("vạn" ),
99- pynutil .insert ("0000" , weight = 0.1 ),
100- )
101-
102- graph_ten_thousand_suffix = pynini .union (
103- graph_digit + delete_space + pynutil .delete (pynini .union ("nghìn" , "ngàn" )),
104- pynutil .insert ("0" , weight = 0.1 ),
105- )
100+ ).optimize ()
106101
107102 graph_million = pynini .union (
108103 graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil .delete ("triệu" ),
109104 pynutil .insert ("000" , weight = 0.1 ),
110- )
105+ ). optimize ()
111106 graph_billion = pynini .union (
112107 graph_hundred_component_at_least_one_none_zero_digit
113108 + delete_space
114109 + pynutil .delete (pynini .union ("tỉ" , "tỷ" )),
115110 pynutil .insert ("000" , weight = 0.1 ),
116- )
111+ ). optimize ()
117112
113+ # Main graph combining all magnitude levels
118114 graph = pynini .union (
115+ # Full format: billion + million + thousand + hundred
119116 graph_billion
120117 + delete_space
121118 + graph_million
122119 + delete_space
123120 + graph_thousands
124121 + delete_space
125- + graph_hundred_ties_zero ,
126- graph_ten_thousand + delete_space + graph_ten_thousand_suffix + delete_space + graph_hundred_ties_zero ,
122+ + graph_hundreds_zero ,
123+ # Thousands group followed by either a bare trailing digit / "rưỡi" (half) padded with "00", or a regular hundreds group
127124 graph_hundred_component_at_least_one_none_zero_digit
128125 + delete_space
129- + pynutil .delete (pynini . union ( "nghìn" , "ngàn" ) )
126+ + pynutil .delete (thousand_words )
130127 + delete_space
131- + (((last_digit | graph_half ) + pynutil .insert ("00" )) | graph_hundred_ties_zero ),
128+ + pynini .union (
129+ pynini .union (last_digit , graph_half ) + pynutil .insert ("00" , weight = 0.1 ), graph_hundreds_zero
130+ ),
131+ # Single digits and zero: kept in graph_no_exception, excluded from self.graph below
132132 graph_digit ,
133133 graph_zero ,
134134 )
135135
136- graph = graph @ pynini .union (
137- pynutil .delete (pynini .closure ("0" )) + pynini .difference (NEMO_DIGIT , "0" ) + pynini .closure (NEMO_DIGIT ),
138- "0" ,
136+ graph = (
137+ graph
138+ @ pynini .union (
139+ pynutil .delete (pynini .closure ("0" )) + pynini .difference (NEMO_DIGIT , "0" ) + pynini .closure (NEMO_DIGIT ),
140+ "0" ,
141+ ).optimize ()
139142 )
140143
141144 # don't convert cardinals from zero to nine inclusive
142- graph_exception = pynini .project (pynini .union (graph_digit , graph_zero ), "input" )
145+ single_digits = pynini .project (pynini .union (graph_digit , graph_zero ), "input" ). optimize ( )
143146
144147 self .graph_no_exception = graph
145148
146- self .graph = (pynini .project (graph , "input" ) - graph_exception . arcsort () ) @ graph
149+ self .graph = pynini . difference (pynini .project (graph , "input" ), single_digits ) @ graph
147150
148151 optional_minus_graph = pynini .closure (
149- pynutil .insert ("negative: " ) + pynini .cross (pynini . union ( "âm" , "trừ" ) , '"-"' ) + NEMO_SPACE ,
152+ pynutil .insert ("negative: " ) + pynini .cross (negative_words , '"-"' ) + NEMO_SPACE ,
150153 0 ,
151154 1 ,
152155 )
0 commit comments