Skip to content

Commit c6e2713

Browse files
Ordinal and Decimal for Vietnamese TN (#290)
* Add Vietnamese text normalization for ordinal and decimal semiotic classes Signed-off-by: folivoramanh <palasek182@gmail.com> * update sparrowhawk Signed-off-by: folivoramanh <palasek182@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * refactor decimal code and docstring Signed-off-by: folivoramanh <palasek182@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: folivoramanh <palasek182@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 2c22d73 commit c6e2713

File tree

13 files changed

+415
-38
lines changed

13 files changed

+415
-38
lines changed
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
1 nhất
2+
4	tư
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pynini
16+
from pynini.lib import pynutil
17+
18+
from nemo_text_processing.text_normalization.en.graph_utils import GraphFst
19+
from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels
20+
21+
22+
class DecimalFst(GraphFst):
    """
    Finite state transducer for classifying Vietnamese decimal numbers, e.g.
        -12,5 tỷ -> decimal { negative: "true" integer_part: "mười hai" fractional_part: "năm" quantity: "tỷ" }
        818,303 -> decimal { integer_part: "tám trăm mười tám" fractional_part: "ba không ba" }
        0,2 triệu -> decimal { integer_part: "không" fractional_part: "hai" quantity: "triệu" }

    The comma is the decimal separator. The integer part before it is optional,
    the digits after it are read one by one, and a magnitude word listed in
    ``data/numbers/magnitudes.tsv`` may follow as a quantity. A plain integer
    followed by a quantity (no comma) is accepted as well.

    Args:
        cardinal: CardinalFst instance for processing integer parts
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
    """

    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        super().__init__(name="decimal", kind="classify", deterministic=deterministic)

        cardinal_graph = cardinal.graph_with_and
        self.graph = cardinal.single_digits_graph.optimize()
        if not deterministic:
            self.graph = self.graph | cardinal_graph

        # Digit-by-digit reading used for the fractional part (includes zero).
        single_digit_map = pynini.union(
            *[pynini.cross(k, v) for k, v in load_labels(get_abs_path("data/numbers/digit.tsv"))],
            *[pynini.cross(k, v) for k, v in load_labels(get_abs_path("data/numbers/zero.tsv"))]
        )

        # Only the second column of the magnitudes table is accepted as a quantity word.
        quantity_units = pynini.union(*[v for _, v in load_labels(get_abs_path("data/numbers/magnitudes.tsv"))])

        integer_part = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"")
        fractional_part = (
            pynutil.insert("fractional_part: \"")
            + (single_digit_map + pynini.closure(pynutil.insert(" ") + single_digit_map))
            + pynutil.insert("\"")
        )

        # The field separator is emitted together with the optional integer part,
        # so exactly one space sits between `integer_part` and `fractional_part`
        # and no stray leading space appears when the integer part is missing.
        decimal_pattern = (integer_part + pynutil.insert(" ")).ques + pynutil.delete(",") + fractional_part

        quantity_suffix = (
            pynutil.delete(" ").ques + pynutil.insert(" quantity: \"") + quantity_units + pynutil.insert("\"")
        )

        decimal_with_quantity = decimal_pattern + quantity_suffix
        cardinal_with_quantity = integer_part + quantity_suffix

        # A leading "-" becomes `negative: "true" ` (the trailing space separates it from the next field).
        negative = (pynutil.insert("negative: ") + pynini.cross("-", "\"true\" ")).ques
        final_graph = negative + pynini.union(decimal_pattern, decimal_with_quantity, cardinal_with_quantity)

        self.fst = self.add_tokens(final_graph).optimize()
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import os
16+
import pynini
17+
from pynini.lib import pynutil
18+
19+
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst
20+
from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels
21+
22+
23+
class OrdinalFst(GraphFst):
    """
    Finite state transducer for classifying Vietnamese ordinals, e.g.
        thứ 1 -> ordinal { integer: "nhất" }
        thứ 4 -> ordinal { integer: "tư" }
        thứ 15 -> ordinal { integer: "mười lăm" }

    Numbers listed in ``data/ordinal/ordinal_exceptions.tsv`` use their special
    ordinal word; every other digit string is read through the cardinal graph.

    Args:
        cardinal: CardinalFst for number conversion
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
    """

    def __init__(self, cardinal, deterministic: bool = True):
        super().__init__(name="ordinal", kind="classify", deterministic=deterministic)

        prefix = "thứ "
        number_pattern = pynini.closure(NEMO_DIGIT, 1)

        # Dict keeps a single reading per digit string (last row wins on duplicates).
        ordinal_exceptions = {
            row[0]: row[1] for row in load_labels(get_abs_path("data/ordinal/ordinal_exceptions.tsv"))
        }
        exception_patterns = [pynini.cross(digit, word) for digit, word in ordinal_exceptions.items()]

        combined_graph = cardinal.graph
        if exception_patterns:
            exception_graph = pynini.union(*exception_patterns)
            if deterministic:
                # Exceptions must override the cardinal reading. A plain union leaves two
                # competing paths for the same input, so the exception inputs are removed
                # from the domain of the cardinal fallback.
                exception_inputs = pynini.project(exception_graph, "input").optimize()
                regular_numbers = pynini.difference(number_pattern, exception_inputs)
                combined_graph = pynini.union(exception_graph, pynini.compose(regular_numbers, cardinal.graph))
            else:
                # Non-deterministic mode keeps both the ordinal and the cardinal reading.
                combined_graph = pynini.union(exception_graph, cardinal.graph)

        self.graph = (
            pynutil.delete(prefix)
            + pynutil.insert("integer: \"")
            + pynini.compose(number_pattern, combined_graph)
            + pynutil.insert("\"")
        )

        self.fst = self.add_tokens(self.graph).optimize()

nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
generator_main,
2626
)
2727
from nemo_text_processing.text_normalization.vi.taggers.cardinal import CardinalFst
28+
from nemo_text_processing.text_normalization.vi.taggers.decimal import DecimalFst
29+
from nemo_text_processing.text_normalization.vi.taggers.ordinal import OrdinalFst
2830
from nemo_text_processing.text_normalization.vi.taggers.punctuation import PunctuationFst
2931
from nemo_text_processing.text_normalization.vi.taggers.whitelist import WhiteListFst
3032
from nemo_text_processing.text_normalization.vi.taggers.word import WordFst
@@ -74,8 +76,20 @@ def __init__(
7476
word_graph = WordFst(deterministic=deterministic).fst
7577
logger.debug(f"word: {time.time() - start_time: .2f}s -- {word_graph.num_states()} nodes")
7678

79+
start_time = time.time()
80+
ordinal = OrdinalFst(cardinal=cardinal, deterministic=deterministic)
81+
ordinal_graph = ordinal.fst
82+
logger.debug(f"ordinal: {time.time() - start_time: .2f}s -- {ordinal_graph.num_states()} nodes")
83+
84+
start_time = time.time()
85+
decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic)
86+
decimal_graph = decimal.fst
87+
logger.debug(f"decimal: {time.time() - start_time: .2f}s -- {decimal_graph.num_states()} nodes")
88+
7789
classify = (
7890
pynutil.add_weight(whitelist_graph, 0.8)
91+
| pynutil.add_weight(ordinal_graph, 0.81)
92+
| pynutil.add_weight(decimal_graph, 0.85)
7993
| pynutil.add_weight(cardinal_graph, 0.9)
8094
| pynutil.add_weight(word_graph, 100)
8195
)
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pynini
16+
from pynini.lib import pynutil
17+
18+
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space
19+
20+
21+
class DecimalFst(GraphFst):
    """
    Finite state transducer for verbalizing Vietnamese decimal numbers, e.g.
        decimal { negative: "true" integer_part: "mười hai" fractional_part: "năm" quantity: "tỷ" } -> âm mười hai phẩy năm tỷ
        decimal { integer_part: "tám trăm mười tám" fractional_part: "ba không ba" } -> tám trăm mười tám phẩy ba không ba
        decimal { integer_part: "không" fractional_part: "hai" quantity: "triệu" } -> không phẩy hai triệu

    Args:
        cardinal: cardinal verbalizer whose ``integer`` graph is reused for the integer part
        deterministic: if True will provide a single transduction option,
            for False multiple transductions are generated (used for audio-based normalization)
    """

    def __init__(self, cardinal, deterministic: bool = True):
        super().__init__(name="decimal", kind="verbalize", deterministic=deterministic)

        # Negative sign: "âm" always; "trừ" is offered as an alternative in non-deterministic mode.
        negative_field = "negative: \"true\""
        sign = pynini.cross(negative_field, "âm ")
        if not deterministic:
            sign = sign | pynini.cross(negative_field, "trừ ")
        self.optional_sign = pynini.closure(sign + delete_space, 0, 1)

        # Integer part is delegated to the cardinal verbalizer.
        self.integer = pynutil.delete("integer_part:") + cardinal.integer
        self.optional_integer = pynini.closure(self.integer + delete_space + insert_space, 0, 1)

        # A non-empty value with its surrounding double quotes stripped.
        strip_quote = pynutil.delete("\"")
        quoted_value = strip_quote + pynini.closure(NEMO_NOT_QUOTE, 1) + strip_quote

        # Fractional part is introduced by "phẩy" (the spoken decimal comma).
        self.fractional_default = pynutil.delete("fractional_part:") + delete_space + quoted_value
        self.fractional = pynutil.insert("phẩy ") + self.fractional_default

        # Quantity word, separated from what precedes it by exactly one space.
        self.quantity = delete_space + insert_space + pynutil.delete("quantity:") + delete_space + quoted_value
        self.optional_quantity = pynini.closure(self.quantity, 0, 1)

        number_forms = pynini.union(
            self.integer,
            self.integer + self.quantity,
            self.optional_integer + self.fractional + self.optional_quantity,
        )
        graph = self.optional_sign + number_forms

        self.numbers = graph
        self.fst = self.delete_tokens(graph).optimize()
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pynini
16+
from pynini.lib import pynutil
17+
18+
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space
19+
20+
21+
class OrdinalFst(GraphFst):
    """
    Finite state transducer for verbalizing Vietnamese ordinals, e.g.
        ordinal { integer: "nhất" } -> thứ nhất
        ordinal { integer: "tư" } -> thứ tư
        ordinal { integer: "mười lăm" } -> thứ mười lăm
        ordinal { integer: "một trăm" } -> thứ một trăm

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic)

        # Require at least one character so an empty `integer: ""` is rejected
        # instead of being verbalized as a bare "thứ ".
        quoted_content = pynini.closure(NEMO_NOT_QUOTE, 1)

        integer = (
            pynutil.delete("integer:") + delete_space + pynutil.delete("\"") + quoted_content + pynutil.delete("\"")
        )

        # Re-attach the "thứ " prefix that the tagger removed.
        ordinal_pattern = pynutil.insert("thứ ") + integer

        self.ordinal_graph = ordinal_pattern

        delete_tokens = self.delete_tokens(self.ordinal_graph)
        self.fst = delete_tokens.optimize()

nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
from nemo_text_processing.text_normalization.en.graph_utils import GraphFst
1616
from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst
1717
from nemo_text_processing.text_normalization.vi.verbalizers.cardinal import CardinalFst
18+
from nemo_text_processing.text_normalization.vi.verbalizers.decimal import DecimalFst
19+
from nemo_text_processing.text_normalization.vi.verbalizers.ordinal import OrdinalFst
1820
from nemo_text_processing.text_normalization.vi.verbalizers.whitelist import WhiteListFst
1921

2022

@@ -32,7 +34,13 @@ def __init__(self, deterministic: bool = True):
3234
word = WordFst(deterministic=deterministic)
3335
word_graph = word.fst
3436

37+
ordinal = OrdinalFst(deterministic=deterministic)
38+
ordinal_graph = ordinal.fst
39+
40+
decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic)
41+
decimal_graph = decimal.fst
42+
3543
# Combine all verbalizers
36-
graph = cardinal_graph | whitelist_graph | word_graph
44+
graph = cardinal_graph | whitelist_graph | word_graph | ordinal_graph | decimal_graph
3745

3846
self.fst = graph
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
0,2 triệu~không phẩy hai triệu
2+
18 vạn~mười tám vạn
3+
818,303~tám trăm mười tám phẩy ba không ba
4+
-99,95 tỷ~âm chín mươi chín phẩy chín năm tỷ
5+
60,240~sáu mươi phẩy hai bốn không
6+
-0,007~âm không phẩy không không bảy
7+
123,000~một trăm hai mươi ba phẩy không không không
8+
1,5 triệu~một phẩy năm triệu
9+
3,14 tỷ~ba phẩy một bốn tỷ
10+
10,01 vạn~mười phẩy không một vạn
11+
-12,5~âm mười hai phẩy năm
12+
0,0001~không phẩy không không không một
13+
999,999~chín trăm chín mươi chín phẩy chín chín chín
14+
1,01~một phẩy không một
15+
-1,01~âm một phẩy không một
16+
15,6~mười lăm phẩy sáu
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
thứ 1~thứ nhất
2+
hôm nay là thứ hai~hôm nay là thứ hai
3+
thứ 3 là ngày giữa tuần~thứ ba là ngày giữa tuần
4+
thứ 4 nên làm gì~thứ tư nên làm gì
5+
thứ 7~thứ bảy
6+
con giáp thứ 13~con giáp thứ mười ba
7+
thứ 1~thứ nhất
8+
thứ 4~thứ tư
9+
thứ 2~thứ hai
10+
thứ 3~thứ ba
11+
thứ 5~thứ năm
12+
thứ 6~thứ sáu
13+
thứ 7~thứ bảy
14+
thứ 8~thứ tám
15+
thứ 9~thứ chín
16+
thứ 10~thứ mười
17+
thứ 11~thứ mười một
18+
thứ 12~thứ mười hai
19+
thứ 15~thứ mười lăm
20+
thứ 21~thứ hai mươi mốt
21+
thứ 24~thứ hai mươi tư
22+
thứ 34~thứ ba mươi tư
23+
thứ 100~thứ một trăm
24+
thứ 101~thứ một trăm linh một
25+
thứ 104~thứ một trăm linh bốn
26+
thứ 234~thứ hai trăm ba mươi tư
27+
thứ 1000~thứ một nghìn
28+
thứ 1234~thứ một nghìn hai trăm ba mươi tư
29+
hôm nay thứ 2~hôm nay thứ hai
30+
đứng thứ 15~đứng thứ mười lăm

0 commit comments

Comments
 (0)