Skip to content

Commit 4a3d52b

Browse files
Korean TN for Money and Telephone (NVIDIA#324)
* feat(ko/money): Korean Money TN only; add data & tests; wire tagger/verbalizer Signed-off-by: Jinwoo Bae <bbae7050@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix(ko/money): polish tagger/verbalizer & expand tests Signed-off-by: Jinwoo Bae <bbae7050@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * ko: add Telephone TN (tagger+verbalizer) + wire + tests; include money/test updates Signed-off-by: Jinwoo Bae <bbae7050@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * ko: refactor money/telephone taggers & verbalizers Signed-off-by: Jinwoo Bae <bbae7050@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * ko/money: use NEMO_NOT_QUOTE, lowercase space helper, trim mid optimizes Signed-off-by: Jinwoo Bae <bbae7050@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * ko: update money/telephone taggers and telephone verbalizer Signed-off-by: Jinwoo Bae <bbae7050@gmail.com> * ko: update telephone taggers Signed-off-by: Jinwoo Bae <bbae7050@gmail.com> --------- Signed-off-by: Jinwoo Bae <bbae7050@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent acb1cd8 commit 4a3d52b

File tree

13 files changed

+593
-0
lines changed

13 files changed

+593
-0
lines changed
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
2+
KRW
3+
krw
4+
$ 달러
5+
US$ 달러
6+
HK$ 홍콩 달러
7+
hk$ 홍콩 달러
8+
유로
9+
EUR 유로
10+
¥
11+
JPY
12+
CAD 캐나다 달러
13+
cad 캐나다 달러
14+
NZD 뉴질랜드 달러
15+
nzd 뉴질랜드 달러
16+
CHF 스위스 프랑
17+
chf 스위스 프랑
18+
AED 아랍에미리트 디르함
19+
aed 아랍에미리트 디르함
20+
Dh 디르함
21+
DH 디르함
22+
Dhs. 디르함
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
# Copyright (c) 2025 NVIDIA CORPORATION.  All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
#     http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pynini
16+
from pynini.lib import pynutil
17+
18+
from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_DIGIT, GraphFst, delete_space, insert_space
19+
from nemo_text_processing.text_normalization.ko.utils import get_abs_path, load_labels
20+
21+
22+
class MoneyFst(GraphFst):
    """
    Tagger (classifier) WFST for Korean money expressions.

    Intended mappings, as documented by the grammar authors:
        ₩350       -> money { currency_maj: "원" integer_part: "삼백오십" }
        350원      -> money { integer_part: "삼백오십" currency_maj: "원" }
        KRW 12,050 -> money { currency_maj: "원" integer_part: "일만이천오십" }
        ₩10.25     -> money { currency_maj: "원" integer_part: "십" minor_part: "이십오" }
        0원        -> money { integer_part: "영" currency_maj: "원" }

    The spoken form of every number comes from ``cardinal.graph``; this class
    only decides which digit strings are accepted and how fields are labelled.

    Args:
        cardinal: CardinalFst whose ``graph`` verbalizes digit strings.
        deterministic: If True, provide a single transduction;
            if False, allow multiple transductions.
    """

    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        super().__init__(name="money", kind="classify", deterministic=deterministic)

        cardinal_graph = cardinal.graph
        # Repetition of `delete_space`, used between a prefix currency and its number.
        skip_spaces = pynini.closure(delete_space)

        # ---- numeric pieces ----
        # Either a lone "0", or a non-zero digit followed by any mix of digits
        # and thousands separators; the commas are deleted (e.g. 18,925,000).
        nonzero_digit = NEMO_DIGIT - "0"
        digits_or_commas = pynini.closure(NEMO_DIGIT | pynutil.delete(","))
        amount_digits = pynini.union("0", nonzero_digit + digits_or_commas)

        open_integer = pynutil.insert('integer_part: "')
        close_field = pynutil.insert('" ')

        # Bare integer -> integer_part: "<spoken number>"
        plain_integer = open_integer + (amount_digits @ cardinal_graph) + close_field

        # Exactly two fractional digits -> minor_part: "<spoken number>"
        two_digits = NEMO_DIGIT**2
        minor_field = pynutil.insert('minor_part: "') + (two_digits @ cardinal_graph) + close_field

        # Integer followed by a scale word (만/억/조); number and scale word
        # share a single integer_part field.
        scale_word = pynini.union("만", "억", "조")
        scaled_value = (amount_digits @ cardinal_graph) + scale_word
        scaled_integer = (open_integer + scaled_value + close_field).optimize()

        # Bare integer with an optional ".<dd>" tail (the dot itself is deleted).
        optional_minor = pynini.closure(pynutil.delete(".") + minor_field, 0, 1)
        plain_with_minor = plain_integer + optional_minor
        number_component = (scaled_integer | plain_with_minor).optimize()

        # ---- currency pieces ----
        # Each row of currency_major.tsv is unpacked as (surface form, spoken unit), e.g.
        #   ₩    원
        #   KRW  원
        #   원   원
        currency_rows = load_labels(get_abs_path("data/money/currency_major.tsv"))

        # Prefix form (₩, KRW, ...): delete the surface symbol, emit the unit field.
        prefix_options = []
        for surface, unit in currency_rows:
            prefix_options.append(pynutil.delete(surface) + pynutil.insert(f'currency_maj: "{unit}" '))
        currency_prefix = pynini.union(*prefix_options).optimize()

        # Suffix form (...원, ...달러): the unit word itself appears in the text.
        suffix_options = []
        for _, unit in currency_rows:
            suffix_options.append(pynutil.delete(unit) + pynutil.insert(f'currency_maj: "{unit}" '))
        currency_suffix = pynini.union(*suffix_options).optimize()

        # ---- assembly ----
        # NOTE: period markers such as '/월', '/년', '/주', '/일', '/시간' are
        # deliberately not consumed here; they stay outside the money token.
        currency_then_number = (currency_prefix + skip_spaces + number_component).optimize()
        number_then_currency = (number_component + currency_suffix).optimize()

        money_graph = (currency_then_number | number_then_currency).optimize()

        self.fst = self.add_tokens(money_graph).optimize()
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
# Copyright (c) 2025 NVIDIA CORPORATION.  All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
#     http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pynini
16+
from pynini.lib import pynutil
17+
18+
from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SIGMA, GraphFst, delete_space, insert_space
19+
from nemo_text_processing.text_normalization.ko.utils import get_abs_path
20+
21+
22+
class TelephoneFst(GraphFst):
    """
    Tagger (classifier) WFST for Korean telephone numbers.

    Intended mappings, as documented by the grammar authors:
        +82-10-3713-7050  -> telephone { country_code: "플러스 팔 이," number_part: "영일영, 삼칠일삼, 칠영오영" }
        +1 (415) 555-0123 -> telephone { country_code: "플러스 일," number_part: "사일오, 오오오, 영일이삼" }
        (031)371-3700     -> telephone { number_part: "영삼일, 삼칠일, 삼칠영영" }
        010-3713-7050     -> telephone { number_part: "영일영, 삼칠일삼, 칠영오영" }
        010.777.8888      -> telephone { number_part: "영일영, 칠칠칠, 팔팔팔팔" }

    Args:
        deterministic (bool, optional): If True, provide a single transduction;
            if False, allow multiple transductions.
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="telephone", kind="classify", deterministic=deterministic)

        # Spoken digit blocks are joined with ", ".
        block_separator = pynutil.insert(", ")
        # A written "-" or "." between blocks is removed from the output.
        drop_dash_or_dot = pynutil.delete("-") | pynutil.delete(".")

        # Digit-to-Korean mapping. "0" -> "영" is added explicitly rather than
        # relying on the data file (presumably digit.tsv has no zero — confirm).
        tsv_digits = pynini.string_file(get_abs_path("data/number/digit.tsv")).optimize()
        spoken_digit = (tsv_digits | pynini.cross("0", "영")).optimize()

        block_of_three = spoken_digit**3
        block_of_four = spoken_digit**4

        # Country code: "+" followed by 1-3 digits, read out space-separated
        # and terminated with a comma ("+82" -> "플러스 팔 이,").
        leading_digits = pynini.closure(spoken_digit + insert_space, 0, 2)
        country_spoken = pynini.cross("+", "플러스 ") + leading_digits + spoken_digit + pynutil.insert(",")
        country_field = pynutil.insert('country_code: "') + country_spoken + pynutil.insert('"')
        # An optional "-" and any spaces after the code are consumed, then a
        # single space is emitted before the next field.
        optional_dash = pynini.closure(pynutil.delete("-"), 0, 1)
        country_code = country_field + optional_dash + delete_space + insert_space

        # Area block: "123-" | "123." | "(123)" [one optional space] | "(123)-"
        bare_area = block_of_three + drop_dash_or_dot
        close_paren = (pynutil.delete(")") + pynini.closure(pynutil.delete(" "), 0, 1)) | pynutil.delete(")-")
        bracketed_area = pynutil.delete("(") + block_of_three + close_paren
        area_block = (bare_area | bracketed_area) + block_separator

        # Middle block has 3 or 4 digits (4 is needed for 010-3713-7050);
        # the final block always has 4.
        middle_block = pynini.union(block_of_three, block_of_four)

        digits_spoken = area_block + middle_block + drop_dash_or_dot + block_separator + block_of_four
        number_field = pynutil.insert('number_part: "') + digits_spoken + pynutil.insert('"')

        # Accept the number with or without a leading country code.
        telephone_graph = pynini.union(country_code + number_field, number_field).optimize()

        self.fst = self.add_tokens(telephone_graph).optimize()

nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,10 @@
2222
from nemo_text_processing.text_normalization.ko.taggers.date import DateFst
2323
from nemo_text_processing.text_normalization.ko.taggers.decimal import DecimalFst
2424
from nemo_text_processing.text_normalization.ko.taggers.fraction import FractionFst
25+
from nemo_text_processing.text_normalization.ko.taggers.money import MoneyFst
2526
from nemo_text_processing.text_normalization.ko.taggers.ordinal import OrdinalFst
2627
from nemo_text_processing.text_normalization.ko.taggers.punctuation import PunctuationFst
28+
from nemo_text_processing.text_normalization.ko.taggers.telephone import TelephoneFst
2729
from nemo_text_processing.text_normalization.ko.taggers.time import TimeFst
2830
from nemo_text_processing.text_normalization.ko.taggers.whitelist import WhiteListFst
2931
from nemo_text_processing.text_normalization.ko.taggers.word import WordFst
@@ -72,6 +74,8 @@ def __init__(
7274
fraction = FractionFst(cardinal=cardinal, deterministic=deterministic)
7375
whitelist = WhiteListFst(deterministic=deterministic)
7476
punctuation = PunctuationFst(deterministic=deterministic)
77+
money = MoneyFst(cardinal=cardinal, deterministic=deterministic)
78+
telephone = TelephoneFst(deterministic=deterministic)
7579

7680
classify = pynini.union(
7781
pynutil.add_weight(cardinal.fst, 1.1),
@@ -81,8 +85,10 @@ def __init__(
8185
pynutil.add_weight(ordinal.fst, 1.1),
8286
pynutil.add_weight(decimal.fst, 3.05),
8387
pynutil.add_weight(word.fst, 100),
88+
pynutil.add_weight(money.fst, 1.1),
8489
pynutil.add_weight(punctuation.fst, 1.0),
8590
pynutil.add_weight(whitelist.fst, 1.1),
91+
pynutil.add_weight(telephone.fst, 1.1),
8692
)
8793

8894
token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
#     http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pynini
16+
from pynini.lib import pynutil
17+
18+
from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space
19+
20+
# ===== whitespace & token helpers =====
21+
# `sp`: closure over `delete_space` (imported from ko.graph_utils), i.e. zero or
# more repetitions of it; the field helpers below prepend it to skip separators.
sp = pynini.closure(delete_space) # absorb 0+ spaces
22+
# `FIELD_VAL`: one or more NEMO_NOT_QUOTE symbols, used as the body of a quoted
# field value (presumably "any character except a double quote" — confirm in graph_utils).
FIELD_VAL = pynini.closure(NEMO_NOT_QUOTE, 1)
23+
24+
25+
def del_key_val(key: str):
    """
    Build an FST that unwraps one token field, keeping only its value.

    Accepted input: [sp] key: "<VAL>"
    Emitted output: <VAL>

    Example:
        'integer_part: "삼백오십"' -> '삼백오십'

    Args:
        key: field name to strip (e.g. "integer_part").

    Returns:
        An optimized FST that deletes the key, colon and both quotes while
        passing the quoted value through unchanged.
    """
    opening = pynutil.delete(f'{key}: "')
    closing = pynutil.delete('"')
    unwrap = sp + opening + FIELD_VAL + closing
    return unwrap.optimize()
37+
38+
39+
def drop_key_val(key: str):
    """
    Build an FST that deletes an entire key-value pair, including the value.

    Accepted input: [sp] key: "<ANY>"
    Emitted output: (nothing)

    Example:
        'minor_part: "십"' -> ''

    Args:
        key: field name whose whole `key: "value"` span should be removed.

    Returns:
        An optimized FST mapping the field to the empty string.
    """
    # The value itself must be wrapped in pynutil.delete. Passing FIELD_VAL
    # through unchanged would make this helper identical to del_key_val and
    # leak the value (e.g. the minor part) into the verbalized output.
    return (
        sp + pynutil.delete(f'{key}: "') + pynutil.delete(FIELD_VAL) + pynutil.delete('"')
    ).optimize()
51+
52+
53+
def drop_key_exact(key: str, val: str):
    """
    Build an FST that deletes one field only when it holds exactly `val`.

    Accepted input: [sp] key: "val"
    Emitted output: (nothing)

    Example:
        drop_key_exact("currency_maj", "원") maps 'currency_maj: "원"' -> ''

    Args:
        key: field name to match.
        val: the exact quoted value the field must contain.

    Returns:
        An optimized FST that deletes the literal `key: "val"` text.
    """
    literal_field = pynutil.delete(f'{key}: "{val}"')
    return (sp + literal_field).optimize()
65+
66+
67+
class MoneyFst(GraphFst):
    """
    Verbalizer WFST for Korean money tokens.

    Input tokens:
        tokens { money { integer_part: "..." currency_maj: "..." [minor_part: "..."] } }

    Period suffixes (e.g., /월, /년, …) are intentionally NOT handled here.

    Output examples:
        integer_part: "십" currency_maj: "원"       -> "십원"
        integer_part: "삼십억" currency_maj: "원"   -> "삼십억원"
        integer_part: "이백" currency_maj: "달러"   -> "이백 달러"

    Args:
        deterministic: If True, provide a single transduction;
            if False, allow multiple transductions.
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="money", kind="verbalize", deterministic=deterministic)

        # ---- field readers ----
        amount = del_key_val("integer_part")
        any_currency = del_key_val("currency_maj")  # e.g. "원", "달러", "유로"
        won_field = drop_key_exact("currency_maj", "원")  # consume the KRW field silently
        # An optional trailing minor_part field is routed through drop_key_val.
        optional_minor = pynini.closure(drop_key_val("minor_part"), 0, 1)

        # ---- KRW (원): the unit is glued to the number ----
        # field order [integer][원]
        won_number_first = amount + sp + won_field + pynutil.insert("원")
        # field order [원][integer]
        won_currency_first = won_field + sp + amount + pynutil.insert("원")
        won_graph = ((won_number_first | won_currency_first) + optional_minor).optimize()

        # ---- any currency: "{integer} {currency}" with a space in between ----
        spaced_graph = (amount + insert_space + any_currency).optimize()
        spaced_graph = (spaced_graph + optional_minor).optimize()

        # The lower weight makes the glued KRW reading win over the generic
        # spaced reading when both match.
        money_graph = (pynutil.add_weight(won_graph, 0.0) | pynutil.add_weight(spaced_graph, 0.5)).optimize()

        # Remove the surrounding token wrapper.
        self.fst = self.delete_tokens(money_graph).optimize()

0 commit comments

Comments
 (0)