Skip to content

Commit aacba0f

Browse files
Korean TN Decimal Support (NVIDIA#303)
* feat(ko/decimal): add Korean decimal TN support Signed-off-by: Jinwoo Bae <bbae7050@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * feat(ko): Add fraction tagger and verbalizer with tests Signed-off-by: Jinwoo Bae <bbae7050@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix(ko): Update decimal and fraction taggers Signed-off-by: Jinwoo Bae <bbae7050@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Jinwoo Bae <bbae7050@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Jinwoo Bae <bbae7050@gmail.com>
1 parent de28dce commit aacba0f

File tree

11 files changed

+415
-1
lines changed

11 files changed

+415
-1
lines changed
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pynini
16+
from pynini.lib import pynutil
17+
18+
from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst
19+
from nemo_text_processing.text_normalization.ko.utils import get_abs_path
20+
21+
22+
class DecimalFst(GraphFst):
    """
    Finite state transducer for classifying decimal numbers in Korean, e.g.
        1.23 -> decimal { integer_part: "일" fractional_part: "이삼" }
        -0.5 -> decimal { negative: "마이너스" integer_part: "영" fractional_part: "오" }

    Besides ``self.fst`` (the tokenized classifier), the unsigned and untokenized
    graph is exposed as ``self.just_decimal``.

    Args:
        cardinal: cardinal tagger; its ``graph`` attribute reads the part before the point
        deterministic: passed through to ``GraphFst.__init__``
    """

    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        super().__init__(name="decimal", kind="classify", deterministic=deterministic)

        integer_reader = cardinal.graph
        digit_reader = pynini.string_file(get_abs_path("data/number/digit.tsv"))
        zero_reader = pynini.string_file(get_abs_path("data/number/zero.tsv"))

        # Small insertion helpers shared by every field below.
        close_quote = pynutil.insert('"')
        field_gap = pynutil.insert(NEMO_SPACE)

        # The part before the point is handed to the cardinal graph as a whole.
        integer_field = pynutil.insert('integer_part: "') + integer_reader + close_quote

        # The part after the point is a run of one or more entries taken from
        # digit.tsv / zero.tsv.
        fractional_field = (
            pynutil.insert('fractional_part: "') + pynini.closure(digit_reader | zero_reader, 1) + close_quote
        )

        # Unsigned decimal (e.g., 2.5): the "." is consumed and replaced by a field separator.
        unsigned = integer_field + pynutil.delete('.') + field_gap + fractional_field

        # Sign (e.g., -2.5 or 마이너스2.5): "-" is rewritten, the Korean word is kept as is.
        minus_word = pynini.union(pynini.cross("-", "마이너스"), pynini.accep("마이너스"))
        negative_field = pynutil.insert('negative: "') + minus_word + close_quote

        signed_or_unsigned = pynini.union(unsigned, negative_field + field_gap + unsigned)

        # For internal use without tokens
        self.just_decimal = unsigned.optimize()

        # Final graph with tokens
        self.fst = self.add_tokens(signed_or_unsigned).optimize()
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pynini
16+
from pynini.lib import pynutil
17+
18+
from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst
19+
from nemo_text_processing.text_normalization.ko.utils import get_abs_path
20+
21+
22+
class FractionFst(GraphFst):
    """
    Finite state transducer for classifying Korean fractions, e.g.
        3/5 → tokens { fraction { numerator: "삼" denominator: "오" } }
        2과7/9 → tokens { fraction { integer_part: "이과" numerator: "칠" denominator: "구" } }
        마이너스3/5 → tokens { fraction { negative: "마이너스" numerator: "삼" denominator: "오" } }
        4분의3 → tokens { fraction { denominator: "사" morphosyntactic_features: "분의" numerator: "삼" } }

    The connecting particle (과 / 와) of a mixed fraction is kept inside the
    ``integer_part`` value. Numerator, denominator and integer part may each be
    a cardinal or a decimal, optionally prefixed with "√" (which is passed
    through unchanged by this tagger).

    Args:
        cardinal: cardinal tagger; its ``graph`` attribute reads whole numbers
        deterministic: passed through to ``GraphFst.__init__``
    """

    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        super().__init__(name="fraction", kind="classify", deterministic=deterministic)

        cardinal_graph = cardinal.graph
        graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv"))
        graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv"))

        DOUBLE_QUOTE = '"'
        slash = pynutil.delete('/')
        root = pynini.accep('√')

        # Decimal number (e.g., 1.23 → 일점이삼).
        # At least one digit is required after the point so that a bare "1."
        # is not read as "일점"; this matches the decimal tagger.
        decimal_number = cardinal_graph + pynini.cross(".", "점") + pynini.closure(graph_digit | graph_zero, 1)

        # Accept cardinal / root + cardinal / decimal / root + decimal
        numeral = cardinal_graph | (root + cardinal_graph) | decimal_number | (root + decimal_number)

        # Integer part (e.g., 2과, 1와); the particle stays inside the quoted value
        integer_component = (
            pynutil.insert(f'integer_part: {DOUBLE_QUOTE}')
            + numeral
            + (pynini.accep("과") | pynini.accep("와"))
            + pynutil.insert(DOUBLE_QUOTE)
        )

        integer_component_with_space = integer_component + pynutil.insert(NEMO_SPACE)

        # Denominator and numerator
        denominator_component = pynutil.insert(f'denominator: {DOUBLE_QUOTE}') + numeral + pynutil.insert(DOUBLE_QUOTE)

        numerator_component = pynutil.insert(f'numerator: {DOUBLE_QUOTE}') + numeral + pynutil.insert(DOUBLE_QUOTE)

        # Format 1: 3/4 style (numerator is written first)
        graph_fraction_slash = (
            pynini.closure(integer_component_with_space, 0, 1)
            + numerator_component
            + slash
            + pynutil.insert(NEMO_SPACE)
            + denominator_component
        )

        # Format 2: Korean native "4분의3" style (denominator is written first)
        graph_fraction_word = (
            pynini.closure(integer_component_with_space, 0, 1)
            + denominator_component
            + pynutil.delete("분의")
            + pynutil.insert(NEMO_SPACE)
            + pynutil.insert('morphosyntactic_features: "분의"')
            + pynutil.insert(NEMO_SPACE)
            + numerator_component
        )

        # Optional minus sign: "-" is rewritten, the Korean word is kept as is
        optional_sign = (
            pynutil.insert(f'negative: {DOUBLE_QUOTE}')
            + (pynini.accep("마이너스") | pynini.cross("-", "마이너스"))
            + pynutil.insert(DOUBLE_QUOTE)
            + pynutil.insert(NEMO_SPACE)
        )

        # Combine full graph
        graph = pynini.closure(optional_sign, 0, 1) + (graph_fraction_slash | graph_fraction_word)
        final_graph = self.add_tokens(graph)
        self.fst = final_graph.optimize()

nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst, generator_main
2121
from nemo_text_processing.text_normalization.ko.taggers.cardinal import CardinalFst
2222
from nemo_text_processing.text_normalization.ko.taggers.decimal import DecimalFst
23+
from nemo_text_processing.text_normalization.ko.taggers.fraction import FractionFst
2324
from nemo_text_processing.text_normalization.ko.taggers.ordinal import OrdinalFst
2425
from nemo_text_processing.text_normalization.ko.taggers.word import WordFst
2526
from nemo_text_processing.utils.logging import logger
@@ -62,9 +63,11 @@ def __init__(
6263
ordinal = OrdinalFst(cardinal=cardinal, deterministic=deterministic)
6364
word = WordFst(deterministic=deterministic)
6465
decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic)
66+
fraction = FractionFst(cardinal=cardinal, deterministic=deterministic)
6567

6668
classify = pynini.union(
6769
pynutil.add_weight(cardinal.fst, 1.1),
70+
pynutil.add_weight(fraction.fst, 1.0),
6871
pynutil.add_weight(ordinal.fst, 1.1),
6972
pynutil.add_weight(decimal.fst, 3.05),
7073
pynutil.add_weight(word.fst, 100),
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pynini
16+
from pynini.lib import pynutil
17+
18+
from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst
19+
20+
21+
class DecimalFst(GraphFst):
    """
    Finite state transducer for verbalizing Korean decimals, e.g.
        decimal { integer_part: "일" fractional_part: "이삼" } -> 일점이삼
        decimal { negative: "마이너스" integer_part: "영" fractional_part: "오" } -> 마이너스 영점오

    Args:
        deterministic: passed through to ``GraphFst.__init__``
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="decimal", kind="verbalize", deterministic=deterministic)

        # Pieces reused by every field: a non-empty quoted value, the closing
        # quote, and the single space that separates two fields.
        quoted_value = pynini.closure(NEMO_NOT_QUOTE, 1)
        closing_quote = pynutil.delete('"')
        field_gap = pynutil.delete(" ")

        # integer_part: "<x>" -> <x>
        spoken_integer = pynutil.delete('integer_part: "') + quoted_value + closing_quote

        # fractional_part: "<y>" -> 점<y>
        spoken_fraction = pynutil.delete('fractional_part: "') + pynutil.insert("점") + quoted_value + closing_quote

        # Unsigned decimal: the two fields are joined with nothing in between.
        unsigned = spoken_integer + field_gap + spoken_fraction

        # negative: "마이너스" -> 마이너스, followed by one space before the number.
        spoken_minus = pynutil.delete('negative: "') + pynini.accep("마이너스") + closing_quote + field_gap
        signed = spoken_minus + pynutil.insert(" ") + unsigned

        self.fst = self.delete_tokens(pynini.union(unsigned, signed)).optimize()
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pynini
16+
from pynini.lib import pynutil
17+
18+
from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst, delete_space
19+
20+
21+
class FractionFst(GraphFst):
    """
    Finite state transducer for verbalizing Korean fractions. The denominator is
    spoken first, then "분의", then the numerator, with no spaces in between, e.g.
        fraction { denominator: "오" numerator: "삼" } → 오분의삼
        fraction { integer_part: "이과" denominator: "구" numerator: "칠" } → 이과구분의칠
        fraction { denominator: "√팔" numerator: "사" } → 루트팔분의사
        fraction { negative: "마이너스" denominator: "십일" numerator: "십" } → 마이너스십일분의십

    A leading "√" in the numerator or denominator is spoken as "루트". The
    ``integer_part`` and ``negative`` values are emitted as they are.

    Args:
        deterministic: passed through to ``GraphFst.__init__``
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="fraction", kind="verbalize", deterministic=deterministic)

        # A value is either "√..." (the leading symbol becomes "루트") or a
        # run of characters that contains no "√" at all.
        rooted_value = pynini.cross("√", "루트") + pynini.closure(NEMO_NOT_QUOTE)
        plain_value = pynini.closure(NEMO_NOT_QUOTE - "√")
        field_value = rooted_value | plain_value
        close_quote = pynutil.delete('"')

        # Strip the field names and quotes, keep the values.
        denominator_field = pynutil.delete('denominator: "') + field_value + close_quote
        numerator_field = pynutil.delete('numerator: "') + field_value + close_quote

        # morphosyntactic_features: "분의" is dropped when present; "분의" is
        # inserted unconditionally further down.
        morph_field = (
            pynutil.delete('morphosyntactic_features:') + delete_space + pynutil.delete('"분의"') + delete_space
        )

        # denominator + 분의 + numerator
        fraction_body = (
            denominator_field
            + pynutil.delete(NEMO_SPACE)
            + pynini.closure(morph_field, 0, 1)
            + pynutil.insert("분의")
            + numerator_field
        )

        # integer_part value (e.g., "일과" in 일과이분의일) is copied through.
        integer_field = (
            pynutil.delete('integer_part:')
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(pynini.union("√", ".", NEMO_NOT_QUOTE - '"'))
            + close_quote
        )

        # negative value (e.g., "마이너스") is copied through.
        sign_field = (
            pynutil.delete('negative:')
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(NEMO_NOT_QUOTE - '"')
            + close_quote
            + delete_space
        )

        # Optional sign, then either integer + fraction or the fraction alone.
        body = pynini.union(integer_field + delete_space + fraction_body, fraction_body)
        graph = pynini.closure(sign_field, 0, 1) + body

        self.fst = self.delete_tokens(graph).optimize()

nemo_text_processing/text_normalization/ko/verbalizers/verbalize.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst
1818
from nemo_text_processing.text_normalization.ko.verbalizers.cardinal import CardinalFst
1919
from nemo_text_processing.text_normalization.ko.verbalizers.decimal import DecimalFst
20+
from nemo_text_processing.text_normalization.ko.verbalizers.fraction import FractionFst
2021
from nemo_text_processing.text_normalization.ko.verbalizers.ordinal import OrdinalFst
2122
from nemo_text_processing.text_normalization.ko.verbalizers.word import WordFst
2223

@@ -39,7 +40,8 @@ def __init__(self, deterministic: bool = True):
3940
ordinal = OrdinalFst(deterministic=deterministic)
4041
decimal = DecimalFst(deterministic=deterministic)
4142
word = WordFst(deterministic=deterministic)
43+
fraction = FractionFst(deterministic=deterministic)
4244

43-
graph = pynini.union(cardinal.fst, ordinal.fst, word.fst, decimal.fst)
45+
graph = pynini.union(cardinal.fst, ordinal.fst, word.fst, decimal.fst, fraction.fst)
4446

4547
self.fst = graph.optimize()
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
-0.1~마이너스 영점일
2+
-0.5~마이너스 영점오
3+
-1.1~마이너스 일점일
4+
-2.5~마이너스 이점오
5+
-4.2~마이너스 사점이
6+
-11.99~마이너스 십일점구구
7+
-15.8~마이너스 십오점팔
8+
-25.3~마이너스 이십오점삼
9+
-30.8~마이너스 삼십점팔
10+
-72.4~마이너스 칠십이점사
11+
-100.5~마이너스 백점오
12+
0.1~영점일
13+
0.5~영점오
14+
1.1~일점일
15+
2.5~이점오
16+
4.2~사점이
17+
11.99~십일점구구
18+
15.8~십오점팔
19+
25.3~이십오점삼
20+
30.8~삼십점팔
21+
42.75~사십이점칠오
22+
72.4~칠십이점사
23+
100.5~백점오
24+
123.99~백이십삼점구구
25+
165.4~백육십오점사
26+
999.99~구백구십구점구구
27+
1000.01~천점영일
28+
123456.2234~십이만삼천사백오십육점이이삼사
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
1/2~이분의일
2+
-1/2~마이너스이분의일
3+
1과1/2~일과이분의일
4+
2와12/33~이와삼십삼분의십이
5+
-1과1/2~마이너스일과이분의일
6+
마이너스1과1/2~마이너스일과이분의일
7+
마이너스1과√1/2~마이너스일과이분의루트일
8+
-1과√1/2~마이너스일과이분의루트일
9+
1과√1/2~일과이분의루트일
10+
1과1/√3~일과루트삼분의일
11+
1과1/3~일과삼분의일
12+
1과√1/4~일과사분의루트일
13+
3분의1~삼분의일
14+
121분의3221~백이십일분의삼천이백이십일

0 commit comments

Comments
 (0)