Skip to content

Commit 39704ac

Browse files
Time - semiotic class for Vietnamese TN (#302)
* Time - semiotic class for Vietnamese TN Signed-off-by: folivoramanh <palasek182@gmail.com> * remove irrelevant import and comment Signed-off-by: folivoramanh <palasek182@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add comment and refactor pattern Signed-off-by: folivoramanh <palasek182@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Change the spaces to NEMO_SPACE for maintenance. Signed-off-by: folivoramanh <palasek182@gmail.com> * Change the spaces to NEMO_SPACE for maintenance. Signed-off-by: folivoramanh <palasek182@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Change the spaces to NEMO_SPACE for maintenance. - remove quote Signed-off-by: folivoramanh <palasek182@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: folivoramanh <palasek182@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 0ddd5ea commit 39704ac

File tree

10 files changed

+521
-21
lines changed

10 files changed

+521
-21
lines changed
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
GMT GMT
2+
UTC UTC
3+
CST CST
4+
PST PST
5+
EST EST
6+
JST JST
7+
PT PT
8+
ET ET
9+
CET CET
10+
gmt GMT
11+
utc UTC
12+
cst CST
13+
pst PST
14+
est EST
15+
jst JST
16+
pt PT
17+
et ET
18+
cet CET

nemo_text_processing/text_normalization/vi/graph_utils.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,16 @@
4848
delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1))
4949
insert_space = pynutil.insert(" ")
5050
delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")
51+
# Deletes any number of trailing ordering hints from a serialized token:
# either the literal ` preserve_order: true` flag or a ` field_order: "<symbol>"` entry.
# NOTE(review): NEMO_NOT_QUOTE is concatenated directly (neither closed nor deleted),
# so the field_order branch matches a single symbol and passes it through — confirm
# this is the intended behavior.
delete_preserve_order = pynini.closure(
    pynini.union(
        pynutil.delete(" preserve_order: true"),
        pynutil.delete(' field_order: "') + NEMO_NOT_QUOTE + pynutil.delete('"'),
    )
)

# Zero or more NEMO_NOT_QUOTE symbols, i.e. the contents of a quoted field value.
quoted_text = pynini.closure(NEMO_NOT_QUOTE)
57+
58+
59+
def extract_field(field_name):
    """
    Build an FST that strips the `<field_name>: "..."` wrapper of a serialized field.

    The key, the optional whitespace after the colon and both quotes are deleted;
    only the quoted value (``quoted_text``) is kept on the output side.

    Args:
        field_name: name of the field as it appears before the colon, e.g. "hours"

    Returns:
        FST mapping `<field_name>: "<value>"` to `<value>`
    """
    drop_key = pynutil.delete(f"{field_name}:")
    drop_quote = pynutil.delete("\"")
    return drop_key + delete_space + drop_quote + quoted_text + drop_quote
5161

5262

5363
def convert_space(fst) -> "pynini.FstLike":
Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pynini
16+
from pynini.lib import pynutil
17+
18+
from nemo_text_processing.text_normalization.vi.graph_utils import (
19+
NEMO_DIGIT,
20+
NEMO_SPACE,
21+
GraphFst,
22+
convert_space,
23+
insert_space,
24+
)
25+
from nemo_text_processing.text_normalization.vi.utils import get_abs_path
26+
27+
28+
class TimeFst(GraphFst):
    """
    Finite state transducer for classifying time in Vietnamese.

    Supports various formats including:
    - Digital formats: "8:30", "14:45", "5:20:35"
    - Vietnamese formats: "14 giờ 30 phút", "2 giờ 15 phút 10 giây"
    - Abbreviated formats: "9h", "9g", "14h30", "14g30", "3p20s"
    - With time zones: "8:23 gmt", "15h cst"

    Serialized fields (``hours``, ``minutes``, ``seconds``, ``zone``) are always
    separated by a single space, e.g. ``hours: "chín" minutes: "ba mươi"``.

    Args:
        cardinal: CardinalFst for number conversion
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        super().__init__(name="time", kind="classify", deterministic=deterministic)

        time_zone = pynini.string_file(get_abs_path("data/time/time_zones.tsv"))
        digit = NEMO_DIGIT
        # One or two digits; a leading "0" of a two-digit value is dropped ("08" -> "8").
        delete_leading_zero = (pynutil.delete("0").ques | (digit - "0")) + digit
        cardinal_graph = cardinal.graph

        # Accepted numeric ranges: hours 0..24 (24 included), minutes/seconds 0..59.
        hours = pynini.union(*[str(x) for x in range(0, 25)])
        minutes_seconds = pynini.union(*[str(x) for x in range(0, 60)])

        def label(name, graph):
            """Wrap the output of `graph` as a serialized field: `name: "<output>"`."""
            return pynutil.insert(f'{name}: "') + graph + pynutil.insert('"')

        hour = label('hours', delete_leading_zero @ hours @ cardinal_graph)
        minute = label('minutes', delete_leading_zero @ minutes_seconds @ cardinal_graph)
        second = label('seconds', delete_leading_zero @ minutes_seconds @ cardinal_graph)
        zone = label('zone', convert_space(time_zone))

        h_suffix = pynini.union(pynutil.delete("h"), pynutil.delete("g"))
        h_word = pynutil.delete(" giờ")
        m_word = pynutil.delete(" phút")
        s_word = pynutil.delete(" giây")

        # Input space between two written components, normalized to the single
        # space that separates two serialized fields.
        word_gap = pynutil.delete(NEMO_SPACE) + insert_space

        # Zone preceded by a space in the input: the space is kept as the field separator.
        opt_zone_space = pynini.closure(pynini.accep(NEMO_SPACE) + zone, 0, 1)
        # Zone glued to the time in the input: a field separator has to be inserted.
        opt_zone = pynini.closure(insert_space + zone, 0, 1)
        preserve = pynutil.insert(" preserve_order: true")

        # Digital formats
        pattern_hour_minute = hour + pynutil.delete(":") + insert_space + minute + opt_zone_space

        pattern_hour_minute_second = (
            hour
            + pynutil.delete(":")
            + insert_space
            + minute
            + pynutil.delete(":")
            + insert_space
            + second
            + opt_zone_space
            + preserve
        )

        # Abbreviated formats
        pattern_hour_suffix = hour + h_suffix + opt_zone_space
        pattern_hour_suffix_minute = hour + h_suffix + insert_space + minute + opt_zone
        pattern_minute_p = minute + pynutil.delete("p")
        pattern_second_s = second + pynutil.delete("s")
        pattern_minute_p_second_s = minute + pynutil.delete("p") + insert_space + second + pynutil.delete("s")

        # Vietnamese word formats
        pattern_hour_word = hour + h_word + opt_zone_space

        pattern_hour_word_minute = hour + h_word + word_gap + minute + m_word + opt_zone_space

        pattern_hour_word_minute_second = (
            hour
            + h_word
            + word_gap
            + minute
            + m_word
            + word_gap
            + second
            + s_word
            + opt_zone_space
            + preserve
        )

        pattern_minute_word = minute + m_word
        pattern_minute_word_second = minute + m_word + word_gap + second + s_word
        pattern_second_word = second + s_word

        # Time zone written directly after the hour suffix, without a space.
        # (The "space + zone" variant is already covered by pattern_hour_suffix.)
        pattern_hour_suffix_zone = hour + h_suffix + insert_space + zone

        patterns = [
            pattern_hour_minute,
            pattern_hour_minute_second,
            pattern_hour_suffix,
            pattern_hour_suffix_minute,
            pattern_minute_p,
            pattern_second_s,
            pattern_minute_p_second_s,
            pattern_hour_word,
            pattern_hour_word_minute,
            pattern_hour_word_minute_second,
            pattern_minute_word,
            pattern_minute_word_second,
            pattern_second_word,
            pattern_hour_suffix_zone,
        ]

        final_graph = pynini.union(*patterns).optimize()

        self.fst = self.add_tokens(final_graph).optimize()

nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
from nemo_text_processing.text_normalization.vi.taggers.ordinal import OrdinalFst
3232
from nemo_text_processing.text_normalization.vi.taggers.punctuation import PunctuationFst
3333
from nemo_text_processing.text_normalization.vi.taggers.roman import RomanFst
34+
from nemo_text_processing.text_normalization.vi.taggers.time import TimeFst
3435
from nemo_text_processing.text_normalization.vi.taggers.whitelist import WhiteListFst
3536
from nemo_text_processing.text_normalization.vi.taggers.word import WordFst
3637
from nemo_text_processing.utils.logging import logger
@@ -104,6 +105,11 @@ def __init__(
104105
roman_graph = roman.fst
105106
logger.debug(f"roman: {time.time() - start_time: .2f}s -- {roman_graph.num_states()} nodes")
106107

108+
start_time = time.time()
109+
time_fst = TimeFst(cardinal=cardinal, deterministic=deterministic)
110+
time_graph = time_fst.fst
111+
logger.debug(f"time: {time.time() - start_time: .2f}s -- {time_graph.num_states()} nodes")
112+
107113
classify = (
108114
pynutil.add_weight(whitelist_graph, 1.01)
109115
| pynutil.add_weight(roman_graph, 1.1)
@@ -112,6 +118,7 @@ def __init__(
112118
| pynutil.add_weight(ordinal_graph, 1.1)
113119
| pynutil.add_weight(decimal_graph, 1.1)
114120
| pynutil.add_weight(fraction_graph, 1.1)
121+
| pynutil.add_weight(time_graph, 1.1)
115122
| pynutil.add_weight(word_graph, 100)
116123
)
117124
punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, 1.1) + pynutil.insert(" }")
Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pynini
16+
from pynini.lib import pynutil
17+
18+
from nemo_text_processing.text_normalization.vi.graph_utils import (
19+
NEMO_NOT_QUOTE,
20+
NEMO_SPACE,
21+
GraphFst,
22+
convert_space,
23+
delete_preserve_order,
24+
delete_space,
25+
extract_field,
26+
)
27+
from nemo_text_processing.text_normalization.vi.utils import get_abs_path
28+
29+
30+
class TimeFst(GraphFst):
    """
    Finite state transducer for verbalizing Vietnamese time.

    Converts tagged time entities into spoken form, e.g.:
    - time { hours: "tám" minutes: "ba mươi" } -> tám giờ ba mươi phút
    - time { hours: "mười bốn" minutes: "mười lăm" } -> mười bốn giờ mười lăm phút
    - time { hours: "chín" } -> chín giờ
    - time { minutes: "ba" seconds: "hai mươi" } -> ba phút hai mươi giây
    - time { hours: "tám" minutes: "hai mươi ba" zone: "GMT" } -> tám giờ hai mươi ba phút GMT
    - time { hours: "tám" minutes: "không" } -> tám giờ

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="time", kind="verbalize", deterministic=deterministic)

        time_zone = convert_space(pynini.string_file(get_abs_path("data/time/time_zones.tsv")))

        # Extract components
        hour_component = extract_field("hours")
        # The zone value must be one of the keys of the time zone table.
        timezone_component = extract_field("zone") @ time_zone

        # Zero minutes/seconds: the whole field is deleted and nothing is spoken.
        zero_minute_component = pynutil.delete("minutes:") + delete_space + pynutil.delete("\"không\"")
        zero_second_component = pynutil.delete("seconds:") + delete_space + pynutil.delete("\"không\"")

        # Any quoted minutes/seconds value. This deliberately also matches "không":
        # a zero that follows another component is dropped by the rewrite rules at
        # the end, while a standalone zero (e.g. minutes only) is still verbalized.
        minute_component = extract_field("minutes")
        second_component = extract_field("seconds")

        # Components with units
        hour_with_unit = hour_component + pynutil.insert(" giờ")
        minute_with_unit = minute_component + pynutil.insert(" phút")
        second_with_unit = second_component + pynutil.insert(" giây")

        # Optional components
        optional_timezone = pynini.closure(delete_space + pynutil.insert(NEMO_SPACE) + timezone_component, 0, 1)
        optional_preserve_order = pynini.closure(delete_space + delete_preserve_order, 0, 1)

        # Pattern 1: hours + optional zero minutes/seconds + optional timezone
        pattern_hours_only = (
            hour_with_unit
            + pynini.closure(delete_space + zero_minute_component, 0, 1)
            + pynini.closure(delete_space + zero_second_component, 0, 1)
            + optional_timezone
            + optional_preserve_order
        )

        # Pattern 2: hours + minutes + optional zero seconds + optional timezone
        pattern_hours_minutes = (
            hour_with_unit
            + delete_space
            + pynutil.insert(NEMO_SPACE)
            + minute_with_unit
            + pynini.closure(delete_space + zero_second_component, 0, 1)
            + optional_timezone
            + optional_preserve_order
        )

        # Pattern 3: hours + zero minutes + seconds + optional timezone
        pattern_hours_seconds = (
            hour_with_unit
            + delete_space
            + zero_minute_component
            + delete_space
            + pynutil.insert(NEMO_SPACE)
            + second_with_unit
            + optional_timezone
            + optional_preserve_order
        )

        # Pattern 4: hours + minutes + seconds + optional timezone
        pattern_hours_minutes_seconds = (
            hour_with_unit
            + delete_space
            + pynutil.insert(NEMO_SPACE)
            + minute_with_unit
            + delete_space
            + pynutil.insert(NEMO_SPACE)
            + second_with_unit
            + optional_timezone
            + optional_preserve_order
        )

        # Pattern 5: minutes only + optional zero seconds
        pattern_minutes_only = minute_with_unit + pynini.closure(delete_space + zero_second_component, 0, 1)

        # Pattern 6: minutes + seconds
        pattern_minutes_seconds = minute_with_unit + delete_space + pynutil.insert(NEMO_SPACE) + second_with_unit

        # Pattern 7: seconds only
        pattern_seconds_only = second_with_unit

        patterns = [
            pattern_hours_only,
            pattern_hours_minutes,
            pattern_hours_seconds,
            pattern_hours_minutes_seconds,
            pattern_minutes_only,
            pattern_minutes_seconds,
            pattern_seconds_only,
        ]

        final_graph = pynini.union(*patterns)

        if not deterministic:
            # Add special case for half hour ("rưỡi")
            half_hour = (
                pynutil.delete("minutes:") + delete_space + pynutil.delete("\"ba mươi\"") + pynutil.insert("rưỡi")
            )
            half_hour_pattern = (
                hour_with_unit
                + delete_space
                + pynutil.insert(NEMO_SPACE)
                + half_hour
                + optional_timezone
                + optional_preserve_order
            )
            self.graph = pynini.union(final_graph, half_hour_pattern)
        else:
            self.graph = final_graph

        # Remove zero minutes and seconds from output. Both rules require a leading
        # space, so only a zero that follows another component is removed.
        remove_zero_minutes = pynini.cdrewrite(pynutil.delete(" không phút"), "", "", pynini.closure(NEMO_NOT_QUOTE))
        remove_zero_seconds = pynini.cdrewrite(pynutil.delete(" không giây"), "", "", pynini.closure(NEMO_NOT_QUOTE))

        # Optimize once more after composing with the rewrite rules.
        self.fst = (
            self.delete_tokens(self.graph + optional_preserve_order).optimize()
            @ remove_zero_minutes
            @ remove_zero_seconds
        ).optimize()

0 commit comments

Comments
 (0)