Skip to content

Commit 44cd225

Browse files
Korean TN fixes: cardinal, decimal, fraction, date (NVIDIA#374)
* Korean TN fixes: cardinal, decimal, fraction, date
  Signed-off-by: Jinwoo Bae <bbae7050@gmail.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* Add ko electronic extensions and improve electronic/telephone normalization
  Signed-off-by: Jinwoo Bae <bbae7050@gmail.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* Fix Korean TN issues and update test cases
  Signed-off-by: Jinwoo Bae <bbae7050@gmail.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* Fix Korean TN electronic and post-processing issues
  Signed-off-by: Jinwoo Bae <bbae7050@gmail.com>
* Fix Korean TN spacing and electronic/cardinal handling
  Signed-off-by: Jinwoo Bae <bbae7050@gmail.com>
* Fix optional token separator and remove redundant whitespace normalization
  Signed-off-by: Jinwoo Bae <bbae7050@gmail.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* Remove unused KO post_processing and update exporter
  Signed-off-by: Jinwoo Bae <bbae7050@gmail.com>
---------
Signed-off-by: Jinwoo Bae <bbae7050@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent a515acd commit 44cd225

File tree

24 files changed

+289
-254
lines changed

24 files changed

+289
-254
lines changed

nemo_text_processing/text_normalization/ko/data/electronic/cc_cues.tsv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
카드 끝자리 카드 끝자리
22
카드 마지막 네자리 카드 마지막 네자리
3-
카드 마지막 4자리 카드 마지막 4자리
3+
카드 마지막 4자리 카드 마지막 네자리
44
신용카드 번호 신용카드 번호
55
신용카드 신용카드
66
체크카드 번호 체크카드 번호

nemo_text_processing/text_normalization/ko/data/electronic/domain.tsv

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,11 @@
1818
.or.kr 닷 오알 닷 케이알
1919
.go.kr 닷 지오 닷 케이알
2020
.re.kr 닷 알이 닷 케이알
21-
.jp 닷 제이피
2221
.cn 닷 씨엔
2322
.fr 닷 에프알
2423
.de 닷 디이
2524
.it 닷 아이티
2625
.uk 닷 유케이
2726
.br 닷 비알
2827
.in 닷 아이엔
29-
.ru 닷 알유
30-
.jpg 닷 제이피지
31-
.png 닷 피엔지
32-
.pdf 닷 피디에프
28+
.ru 닷 알유

nemo_text_processing/text_normalization/ko/data/electronic/extensions.tsv

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
.jpg 닷 제이피지
2+
.png 닷 피엔지
3+
.pdf 닷 피디에프
4+
.JPG 닷 제이피지
5+
.PNG 닷 피엔지
6+
.PDF 닷 피디에프

nemo_text_processing/text_normalization/ko/taggers/cardinal.py

Lines changed: 38 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,17 @@
1616
import pynini
1717
from pynini.lib import pynutil
1818

19-
from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_DIGIT, GraphFst
19+
from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_DIGIT, NEMO_SIGMA, NEMO_SPACE, GraphFst
2020
from nemo_text_processing.text_normalization.ko.utils import get_abs_path
2121

2222

2323
class CardinalFst(GraphFst):
2424
def __init__(self, deterministic: bool = True):
2525
super().__init__(name="cardinal", kind="classify", deterministic=deterministic)
26+
27+
# Optional small whitespace inside parentheses or after signs
28+
ws = pynini.closure(NEMO_SPACE, 0, 2)
29+
2630
# Load base .tsv files
2731
graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv"))
2832
graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv"))
@@ -43,8 +47,9 @@ def __init__(self, deterministic: bool = True):
4347
graph_hundred = hundreds @ graph_hundred_component
4448

4549
thousands = NEMO_DIGIT**4
46-
graph_thousand_component = (
47-
pynini.cross('1', '천') | (graph_digit_no_zero_one + pynutil.insert('천'))
50+
graph_thousand_component = pynini.union(
51+
pynini.cross('1', '천'),
52+
graph_digit_no_zero_one + pynutil.insert('천'),
4853
) + pynini.union(
4954
pynini.closure(pynutil.delete('0')),
5055
graph_hundred_component,
@@ -53,7 +58,10 @@ def __init__(self, deterministic: bool = True):
5358
graph_thousand = thousands @ graph_thousand_component
5459

5560
ten_thousands = NEMO_DIGIT**5
56-
graph_ten_thousand_component = (pynini.cross('1', '만') | (graph_digit + pynutil.insert('만'))) + pynini.union(
61+
graph_ten_thousand_component = pynini.union(
62+
pynini.cross('1', '만'),
63+
graph_digit_no_zero_one + pynutil.insert('만'),
64+
) + pynini.union(
5765
pynini.closure(pynutil.delete('0')),
5866
graph_thousand_component,
5967
(pynutil.delete('0') + graph_hundred_component),
@@ -62,12 +70,11 @@ def __init__(self, deterministic: bool = True):
6270
graph_ten_thousand = ten_thousands @ graph_ten_thousand_component
6371

6472
hundred_thousands = NEMO_DIGIT**6
65-
66-
graph_hundred_thousand_component = ((NEMO_DIGIT**2 @ graph_1_to_99) + pynutil.insert('만')) + pynini.union(
67-
pynini.closure(pynutil.delete('0')),
73+
graph_hundred_thousand_component = ((NEMO_DIGIT**2 @ graph_1_to_99) + pynutil.insert("만")) + pynini.union(
74+
pynini.closure(pynutil.delete("0")),
6875
graph_thousand_component,
69-
(pynutil.delete('0') + graph_hundred_component),
70-
(pynini.closure(pynutil.delete('0')) + graph_1_to_99),
76+
(pynutil.delete("0") + graph_hundred_component),
77+
(pynini.closure(pynutil.delete("0")) + graph_1_to_99),
7178
)
7279
graph_hundred_thousand = hundred_thousands @ graph_hundred_thousand_component
7380

@@ -268,8 +275,27 @@ def __init__(self, deterministic: bool = True):
268275
).optimize()
269276

270277
# Sign and final formatting
271-
optional_sign = pynini.closure(pynutil.insert('negative: "true" ') + pynini.cross("-", ""), 0, 1)
272-
final_graph = optional_sign + pynutil.insert('integer: "') + graph_num + pynutil.insert('"')
278+
# Build the integer token (integer: "...")
279+
integer_token = pynutil.insert('integer: "') + graph_num + pynutil.insert('"')
280+
281+
# Sign handling:
282+
# - minus sets negative flag
283+
# - plus is ignored (positive number)
284+
minus_prefix = pynutil.insert('negative: "true" ') + pynutil.delete("-")
285+
plus_prefix = pynutil.delete("+")
286+
287+
# Accounting negative: "( 1,234 )" -> negative + integer:"1234"
288+
paren_negative = (
289+
pynutil.insert('negative: "true" ') + pynutil.delete("(") + ws + integer_token + ws + pynutil.delete(")")
290+
)
291+
292+
# Signed number: optional (+|-) + integer
293+
signed_integer = (minus_prefix | plus_prefix).ques + integer_token
294+
295+
# Accept either the accounting form or the signed form (the union is unweighted; the leading "(" selects the branch)
296+
final_graph = paren_negative | signed_integer
297+
298+
# Wrap with class tokens and finalize
273299
final_graph = self.add_tokens(final_graph)
274300
self.fst = final_graph.optimize()
275-
self.graph = graph_num.optimize()
301+
self.graph = graph_num

nemo_text_processing/text_normalization/ko/taggers/decimal.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import pynini
1616
from pynini.lib import pynutil
1717

18-
from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst
18+
from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SIGMA, NEMO_SPACE, GraphFst
1919
from nemo_text_processing.text_normalization.ko.utils import get_abs_path
2020

2121

@@ -32,7 +32,14 @@ class DecimalFst(GraphFst):
3232
def __init__(self, cardinal: GraphFst, deterministic: bool = True):
3333
super().__init__(name="decimal", kind="classify", deterministic=deterministic)
3434

35-
cardinal_before_decimal = cardinal.graph
35+
# Use the base cardinal graph for the integer part
36+
base_integer_graph = cardinal.graph
37+
# Only special-case 10000 -> 만 for decimal integer part (if needed)
38+
specials_input = pynini.cross("10000", "만")
39+
40+
# Accept either the special mapping or the normal cardinal (an unweighted union imposes no priority order)
41+
cardinal_before_decimal = (specials_input | base_integer_graph).optimize()
42+
3643
cardinal_after_decimal = pynini.string_file(get_abs_path("data/number/digit.tsv"))
3744
zero = pynini.string_file(get_abs_path("data/number/zero.tsv"))
3845

nemo_text_processing/text_normalization/ko/taggers/electronic.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -121,11 +121,24 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
121121
dollar_accep = pynini.accep("$")
122122
excluded_symbols = DOT | dollar_accep | AT
123123
filtered_symbols = pynini.difference(accepted_symbols, excluded_symbols)
124-
accepted_characters = ASCII_ALNUM | filtered_symbols
125124
# Domain core graph
126125
graph_domain = (pynutil.insert('domain: "') + domain_core + pynutil.insert('"')).optimize()
127126
graph |= graph_domain
128127

128+
known_extensions = pynini.project(
129+
pynini.string_file(get_abs_path("data/electronic/extensions.tsv")),
130+
"input",
131+
)
132+
133+
filename_stem = pynini.closure(
134+
pynini.difference(NEMO_NOT_SPACE, pynini.union(SLASH, DOT)),
135+
1,
136+
)
137+
138+
file_with_extension = filename_stem + known_extensions
139+
140+
graph |= (pynutil.insert('domain: "') + file_with_extension + pynutil.insert('"')).optimize()
141+
129142
# (3) URL with protocol
130143
graph |= protocol + insert_space + domain_graph_with_class_tags
131144

@@ -144,9 +157,8 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
144157

145158
four = pynini.closure(NEMO_DIGIT, 4, 4)
146159
sep_token = pynini.union(HYPHEN, NEMO_SPACE)
147-
sep_del = pynutil.delete(pynini.closure(sep_token, 1)) # allow mix of - or space
148-
149-
cc16_grouped = four + sep_del + four + sep_del + four + sep_del + four
160+
cc16_grouped = (four + pynini.cross(sep_token, " ")) ** 3 + four
161+
cc16_grouped = cc16_grouped + delete_space
150162

151163
cc16_no_cue = (
152164
pynutil.insert('protocol: "신용카드 " ')

nemo_text_processing/text_normalization/ko/taggers/fraction.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import pynini
1616
from pynini.lib import pynutil
1717

18-
from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst
18+
from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst, delete_space
1919
from nemo_text_processing.text_normalization.ko.utils import get_abs_path
2020

2121

@@ -53,7 +53,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
5353
+ pynutil.insert(DOUBLE_QUOTE)
5454
)
5555

56-
integer_component_with_space = integer_component + pynutil.insert(NEMO_SPACE)
56+
integer_component_with_space = integer_component + delete_space + pynutil.insert(NEMO_SPACE)
5757

5858
# Denominator and numerator
5959
denominator_component = pynutil.insert(f'denominator: {DOUBLE_QUOTE}') + numeral + pynutil.insert(DOUBLE_QUOTE)

nemo_text_processing/text_normalization/ko/taggers/ordinal.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,5 +66,20 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
6666

6767
graph_ordinal = (graph_ordinal_1to39 | graph_ordinal_from40).optimize() # Handles 1-39 # Handles 40+
6868

69-
final_graph = pynutil.insert('integer: "') + graph_ordinal + pynutil.insert('"')
69+
# Single-character particles (가, 이, 은, 는, 로, 도 ...)
70+
josa_single = pynini.union("가", "이", "은", "는", "를", "을", "로", "도", "다")
71+
72+
# Multi-character particles (부터, 까지)
73+
josa_multi = pynini.union("부터", "까지")
74+
75+
# Allow patterns like:
76+
# 번째 + (optional single-josa) + (optional multi-josa)
77+
josa = (josa_single.ques + josa_multi.ques).optimize()
78+
79+
# Final ordinal graph with optional particles
80+
graph_ordinal_with_josa = (graph_ordinal + josa).optimize()
81+
82+
# Build the “integer: …” token structure
83+
final_graph = pynutil.insert('integer: "') + graph_ordinal_with_josa + pynutil.insert('"')
84+
7085
self.fst = self.add_tokens(final_graph).optimize()

nemo_text_processing/text_normalization/ko/taggers/telephone.py

Lines changed: 25 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,11 @@ class TelephoneFst(GraphFst):
2424
Finite state transducer for classifying Korean telephone numbers.
2525
2626
Example inputs → tokens:
27-
+82-10-3713-7050 -> telephone { country_code: "플러스 팔 이," number_part: "영일영, 삼칠일삼, 칠영오영" }
28-
+1 (415) 555-0123 -> telephone { country_code: "플러스 일," number_part: "사일오, 오오오, 영일이삼" }
29-
(031)371-3700 -> telephone { number_part: "영삼일, 삼칠일, 삼칠영영" }
30-
010-3713-7050 -> telephone { number_part: "영일영, 삼칠일삼, 칠영오영" }
31-
010.777.8888 -> telephone { number_part: "영일영, 칠칠칠, 팔팔팔팔" }
27+
+82 010-3713-7050 -> telephone { country_code: "국가번호 팔이," number_part: "영일영 삼칠일삼 칠영오영" }
28+
+1 (415) 555-0123 -> telephone { country_code: "국가번호 일," number_part: "사일오 오오오 영일이삼" }
29+
(031)371-3700 -> telephone { number_part: "영삼일 삼칠일 삼칠영영" }
30+
010-3713-7050 -> telephone { number_part: "영일영 삼칠일삼 칠영오영" }
31+
010.777.8888 -> telephone { number_part: "영일영 칠칠칠 팔팔팔팔" }
3232
3333
Args:
3434
deterministic (bool, optional): If True, provide a single transduction;
@@ -37,8 +37,10 @@ class TelephoneFst(GraphFst):
3737

3838
def __init__(self, deterministic: bool = True):
3939
super().__init__(name="telephone", kind="classify", deterministic=deterministic)
40-
41-
add_sep = pynutil.insert(", ") # standard block separator ", "
40+
# Separator between digit blocks (e.g., "-" or ".")
41+
delete_sep = pynutil.delete("-") | pynutil.delete(".")
42+
# Optional space inserted between blocks
43+
insert_block_space = insert_space
4244

4345
# 1) safe digit mapping: force 0 -> "영" (do not rely on zero.tsv invert)
4446
digit = pynini.string_file(get_abs_path("data/number/digit.tsv")).optimize()
@@ -49,35 +51,39 @@ def __init__(self, deterministic: bool = True):
4951
four_digits = digit_ko**4
5052

5153
# country code: "+1", "+82", "+1-"
52-
country_core = (
53-
pynini.cross("+", "플러스 ")
54-
+ pynini.closure(digit_ko + insert_space, 0, 2)
55-
+ digit_ko
56-
+ pynutil.insert(",")
54+
cc_digits = pynini.closure(digit_ko, 1, 3)
55+
56+
country_code = (
57+
pynutil.delete("+")
58+
+ pynutil.insert('country_code: "')
59+
+ cc_digits
60+
+ pynutil.insert('"')
61+
+ pynini.closure(pynutil.delete("-") | pynutil.delete(" "), 0, 1)
62+
+ delete_space
5763
)
58-
country_code = pynutil.insert('country_code: "') + country_core + pynutil.insert('"')
59-
country_code = country_code + pynini.closure(pynutil.delete("-"), 0, 1) + delete_space + insert_space
6064

6165
# area part: "123-" | "123." | "(123)" [space?] or "(123)-"
6266
area_core = three_digits
6367
area_part = (
64-
(area_core + (pynutil.delete("-") | pynutil.delete(".")))
68+
(area_core + delete_sep)
6569
| (
6670
pynutil.delete("(")
6771
+ area_core
68-
+ ((pynutil.delete(")") + pynini.closure(pynutil.delete(" "), 0, 1)) | pynutil.delete(")-"))
72+
+ pynutil.delete(")")
73+
+ pynini.closure(pynutil.delete(" "), 0, 1)
74+
+ pynini.closure(delete_sep, 0, 1)
6975
)
70-
) + add_sep
76+
) + insert_block_space
7177

7278
# 2) allow 3 **or 4** digits in the middle block (to support 010-3713-7050)
7379
mid = pynini.union(three_digits, four_digits)
7480
last4 = four_digits
7581

7682
# consume '-' or '.' between middle and last blocks
77-
number_part_core = area_part + mid + (pynutil.delete("-") | pynutil.delete(".")) + add_sep + last4
83+
number_part_core = area_part + mid + delete_sep + insert_block_space + last4
7884
number_part = pynutil.insert('number_part: "') + number_part_core + pynutil.insert('"')
7985

8086
# final graph: with or without country code
81-
graph = pynini.union(country_code + number_part, number_part).optimize()
87+
graph = pynini.union(country_code + insert_space + number_part, number_part).optimize()
8288

8389
self.fst = self.add_tokens(graph).optimize()

nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,13 @@
1717
import pynini
1818
from pynini.lib import pynutil
1919

20-
from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst, generator_main
20+
from nemo_text_processing.text_normalization.ko.graph_utils import (
21+
NEMO_WHITE_SPACE,
22+
GraphFst,
23+
delete_extra_space,
24+
delete_space,
25+
generator_main,
26+
)
2127
from nemo_text_processing.text_normalization.ko.taggers.cardinal import CardinalFst
2228
from nemo_text_processing.text_normalization.ko.taggers.date import DateFst
2329
from nemo_text_processing.text_normalization.ko.taggers.decimal import DecimalFst
@@ -98,9 +104,12 @@ def __init__(
98104
)
99105

100106
token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
101-
tagger = pynini.closure(token, 1)
102107

103-
self.fst = tagger.optimize()
108+
graph = (
109+
delete_space + token + pynini.closure((delete_extra_space | pynini.accep("")) + token) + delete_space
110+
)
111+
112+
self.fst = graph.optimize()
104113

105114
if far_file:
106115
generator_main(far_file, {"tokenize_and_classify": self.fst})

0 commit comments

Comments
 (0)