Skip to content

Commit 1a38df0

Browse files
mgrafubbae0312pre-commit-ci[bot]
authored
Add Korean TN post-processing rules for particle agreement and month … (#413)
* Add Korean TN post-processing rules for particle agreement and month … (#409) * Add Korean TN post-processing rules for particle agreement and month handling Signed-off-by: Jinwoo Bae <bbae7050@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add Korean TN fraction test cases for particle agreement Signed-off-by: Jinwoo Bae <bbae7050@gmail.com> * Fix Korean fraction verbalization with particle-aware handling and remove post_processing dependency Signed-off-by: Jinwoo Bae <bbae7050@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix date and fraction normalization issues based on review feedback Signed-off-by: Jinwoo Bae <bbae7050@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Jinwoo Bae <bbae7050@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Jenkins fix (#419) Signed-off-by: Mariana Graterol Fuenmayor <marianag@nvidia.com> * update jenkins cache Signed-off-by: Mariana Graterol Fuenmayor <marianag@nvidia.com> --------- Signed-off-by: Jinwoo Bae <bbae7050@gmail.com> Signed-off-by: Mariana Graterol Fuenmayor <marianag@nvidia.com> Signed-off-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: Jinwoo Bae <bbae7050@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 34743dd commit 1a38df0

5 files changed

Lines changed: 147 additions & 25 deletions

File tree

Jenkinsfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@ pipeline {
2727
HY_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-0'
2828
MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1'
2929
JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1'
30+
KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0'
3031
HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0'
31-
KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-22-26-0'
3232
DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0'
3333
}
3434
stages {

nemo_text_processing/text_normalization/ko/taggers/date.py

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -226,36 +226,34 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
226226
+ insert_space
227227
+ pynutil.insert("year: \"")
228228
+ (YEAR_ERA_1TO4 @ graph_cardinal)
229-
+ pynutil.delete("년")
230-
+ pynutil.insert("년")
229+
+ pynini.accep("년")
231230
+ pynutil.insert("\"")
232231
)
233232
|
234233
# no era: 1~4 digits, no leading zero
235234
(
236235
pynutil.insert("year: \"")
237236
+ (YEAR_NO_ERA_1TO4 @ graph_cardinal)
238-
+ pynutil.delete("년")
239-
+ pynutil.insert("년")
237+
+ pynini.accep("년")
240238
+ pynutil.insert("\"")
241239
)
242240
).optimize()
243241

244242
individual_month_component = (
245-
pynutil.insert("month: \"")
246-
+ month_cardinal
247-
+ pynutil.delete("월")
248-
+ pynutil.insert("월")
249-
+ pynutil.insert("\"")
243+
pynutil.insert("month: \"") + month_cardinal + pynini.accep("월") + pynutil.insert("\"")
250244
)
251245

252-
individual_day_component = (
253-
pynutil.insert("day: \"")
254-
+ cardinal_lz
255-
+ pynutil.delete("일")
256-
+ pynutil.insert("일")
257-
+ pynutil.insert("\"")
258-
)
246+
month_josa = pynini.union("에", "은", "는", "에는")
247+
248+
individual_month_component_with_josa = (
249+
pynutil.insert('month: "')
250+
+ month_cardinal
251+
+ pynini.accep("월")
252+
+ pynini.closure(month_josa, 0, 1)
253+
+ pynutil.insert('"')
254+
).optimize()
255+
256+
individual_day_component = pynutil.insert("day: \"") + cardinal_lz + pynini.accep("일") + pynutil.insert("\"")
259257

260258
week_full_word_acceptor = pynini.project(week, "output")
261259
week_component_full_word = pynutil.insert("weekday: \"") + week_full_word_acceptor + pynutil.insert("\"")
@@ -272,6 +270,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
272270
day_and_weekday_component
273271
| month_and_weekday_component
274272
| individual_year_component
273+
| individual_month_component_with_josa
275274
| individual_month_component
276275
| individual_day_component
277276
| week_component

nemo_text_processing/text_normalization/ko/taggers/fraction.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,23 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
8181
+ numerator_component
8282
)
8383

84+
# Optional particles following the fraction
85+
particle_subject = pynutil.insert('morphosyntactic_features: "분의_subject"') + (
86+
pynutil.delete("이") | pynutil.delete("가")
87+
)
88+
particle_topic = pynutil.insert('morphosyntactic_features: "분의_topic"') + (
89+
pynutil.delete("은") | pynutil.delete("는")
90+
)
91+
particle_object = pynutil.insert('morphosyntactic_features: "분의_object"') + (
92+
pynutil.delete("을") | pynutil.delete("를")
93+
)
94+
95+
optional_particle = pynini.closure(
96+
pynutil.insert(NEMO_SPACE) + (particle_subject | particle_topic | particle_object),
97+
0,
98+
1,
99+
)
100+
84101
# Optional minus sign
85102
optional_sign = (
86103
pynutil.insert(f'negative: {DOUBLE_QUOTE}')
@@ -90,7 +107,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
90107
)
91108

92109
# Combine full graph
93-
graph = pynini.closure(optional_sign, 0, 1) + (graph_fraction_slash | graph_fraction_word)
110+
graph = pynini.closure(optional_sign, 0, 1) + (graph_fraction_slash | graph_fraction_word) + optional_particle
94111
self.graph = graph.optimize()
95112
final_graph = self.add_tokens(graph)
96113
self.fst = final_graph.optimize()

nemo_text_processing/text_normalization/ko/verbalizers/fraction.py

Lines changed: 98 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,43 @@ def __init__(self, deterministic: bool = True):
6060
+ numerator_component
6161
)
6262

63-
# Match and delete integer_part field (e.g., "2" in "2과3분의1")
63+
# Handle subject particle feature (분의_subject)
64+
# Insert default particle "이" (will be corrected later via rewrite rules)
65+
subject_suffix = (
66+
pynutil.delete(NEMO_SPACE)
67+
+ pynutil.delete('morphosyntactic_features:')
68+
+ delete_space
69+
+ pynutil.delete('"분의_subject"')
70+
+ delete_space
71+
+ pynutil.insert("이") # default form for now
72+
)
73+
74+
# Handle topic particle feature (분의_topic)
75+
topic_suffix = (
76+
pynutil.delete(NEMO_SPACE)
77+
+ pynutil.delete('morphosyntactic_features:')
78+
+ delete_space
79+
+ pynutil.delete('"분의_topic"')
80+
+ delete_space
81+
+ pynutil.insert("은")
82+
)
83+
84+
# Handle object particle feature (분의_object)
85+
object_suffix = (
86+
pynutil.delete(NEMO_SPACE)
87+
+ pynutil.delete('morphosyntactic_features:')
88+
+ delete_space
89+
+ pynutil.delete('"분의_object"')
90+
+ delete_space
91+
+ pynutil.insert("을")
92+
)
93+
94+
# Combine fraction + optional particle suffix
95+
# The particle is always emitted in its default form (이/은/을) first; the cdrewrite rules applied after delete_tokens switch it to 가/는/를 after the numerals they list
96+
graph_fraction_all = graph_fraction + pynini.closure(subject_suffix | topic_suffix | object_suffix, 0, 1)
97+
98+
# Handle integer + fraction (e.g., "2과 3/4")
99+
# integer_part is removed and replaced with proper spacing
64100
graph_integer = (
65101
pynutil.delete('integer_part:')
66102
+ delete_space
@@ -69,9 +105,10 @@ def __init__(self, deterministic: bool = True):
69105
+ pynutil.delete('"')
70106
+ pynutil.insert(NEMO_SPACE)
71107
)
72-
graph_integer_fraction = graph_integer + delete_space + graph_fraction
108+
# Combine integer part with fraction
109+
graph_integer_fraction = graph_integer + delete_space + graph_fraction_all
73110

74-
# Match and delete optional negative field (e.g., "마이너스")
111+
# Handle optional negative prefix (e.g., "마이너스")
75112
optional_sign = (
76113
pynutil.delete('negative:')
77114
+ delete_space
@@ -82,9 +119,64 @@ def __init__(self, deterministic: bool = True):
82119
+ pynutil.insert(NEMO_SPACE)
83120
)
84121

85-
# Final graph handles optional negative + (integer + fraction | fraction only)
86-
graph = pynini.closure(optional_sign, 0, 1) + (graph_integer_fraction | graph_fraction)
122+
# Final structure:
123+
# [optional negative] + (integer + fraction OR fraction only)
124+
graph = pynini.closure(optional_sign, 0, 1) + (graph_integer_fraction | graph_fraction_all)
87125

88-
# Final optimized verbalizer FST
126+
# Remove token wrappers
89127
final_graph = self.delete_tokens(graph)
128+
129+
# Sigma for rewrite context (entire string)
130+
sigma = pynini.closure(NEMO_NOT_QUOTE | NEMO_SPACE)
131+
132+
# Fix subject particle agreement (이 → 가 for vowel-ending numerals)
133+
# e.g., 사이 → 사가, 구이 → 구가
134+
subject_rewrite = pynini.cdrewrite(
135+
pynini.string_map(
136+
[
137+
("이이", "이가"),
138+
("사이", "사가"),
139+
("오이", "오가"),
140+
("구이", "구가"),
141+
]
142+
),
143+
"",
144+
"",
145+
sigma,
146+
)
147+
148+
# Fix topic particle agreement (은 → 는)
149+
# e.g., 이은 → 이는, 사은 → 사는
150+
topic_rewrite = pynini.cdrewrite(
151+
pynini.string_map(
152+
[
153+
("이은", "이는"),
154+
("사은", "사는"),
155+
("오은", "오는"),
156+
("구은", "구는"),
157+
]
158+
),
159+
"",
160+
"",
161+
sigma,
162+
)
163+
164+
# Fix object particle agreement (을 → 를)
165+
# e.g., 오을 → 오를, 이을 → 이를
166+
object_rewrite = pynini.cdrewrite(
167+
pynini.string_map(
168+
[
169+
("이을", "이를"),
170+
("사을", "사를"),
171+
("오을", "오를"),
172+
("구을", "구를"),
173+
]
174+
),
175+
"",
176+
"",
177+
sigma,
178+
)
179+
180+
# Apply the rewrite rules in sequence (subject, then topic, then object); the composed graph is optimized into the final FST
181+
final_graph = final_graph @ subject_rewrite @ topic_rewrite @ object_rewrite
90182
self.fst = final_graph.optimize()

tests/nemo_text_processing/ko/data_text_normalization/test_cases_fraction.txt

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,18 @@
1111
1과1/3~일과 삼분의 일
1212
1과√1/4~일과 사분의 루트 일
1313
3분의1~삼분의 일
14-
121분의3221~백이십일분의 삼천이백이십일
14+
121분의3221~백이십일분의 삼천이백이십일
15+
이번 경기의 3/5이 중요하다~이번 경기의 오분의 삼이 중요하다
16+
전체 구역의 4/7이 통제되었다~전체 구역의 칠분의 사가 통제되었다
17+
설문 응답자의 9/10이 찬성했다~설문 응답자의 십분의 구가 찬성했다
18+
그 중 2/3은 성공했다~그 중 삼분의 이는 성공했다
19+
참가자의 5/8이 탈락했다~참가자의 팔분의 오가 탈락했다
20+
참가자의 6/7 이 통과했다~참가자의 칠분의 육 이 통과했다
21+
전체의 3/4 이 감소했다~전체의 사분의 삼 이 감소했다
22+
응답자의 2/5이 반대했다~응답자의 오분의 이가 반대했다
23+
학생의 7/9 이 합격했다~학생의 구분의 칠 이 합격했다
24+
전체의 1/2 이 남았다~전체의 이분의 일 이 남았다
25+
그 중 4/5이 성공했다~그 중 오분의 사가 성공했다
26+
전체의 5/6이 완료되었다~전체의 육분의 오가 완료되었다
27+
참가자의 3/8이 탈락했다~참가자의 팔분의 삼이 탈락했다
28+
응답자의 6/10 이 동의했다~응답자의 십분의 육 이 동의했다

0 commit comments

Comments
 (0)