diff --git a/nemo_text_processing/text_normalization/ko/taggers/date.py b/nemo_text_processing/text_normalization/ko/taggers/date.py index 4f2da5702..9748abc49 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/date.py +++ b/nemo_text_processing/text_normalization/ko/taggers/date.py @@ -226,8 +226,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): + insert_space + pynutil.insert("year: \"") + (YEAR_ERA_1TO4 @ graph_cardinal) - + pynutil.delete("년") - + pynutil.insert("년") + + pynini.accep("년") + pynutil.insert("\"") ) | @@ -235,27 +234,26 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): ( pynutil.insert("year: \"") + (YEAR_NO_ERA_1TO4 @ graph_cardinal) - + pynutil.delete("년") - + pynutil.insert("년") + + pynini.accep("년") + pynutil.insert("\"") ) ).optimize() individual_month_component = ( - pynutil.insert("month: \"") - + month_cardinal - + pynutil.delete("월") - + pynutil.insert("월") - + pynutil.insert("\"") + pynutil.insert("month: \"") + month_cardinal + pynini.accep("월") + pynutil.insert("\"") ) - individual_day_component = ( - pynutil.insert("day: \"") - + cardinal_lz - + pynutil.delete("일") - + pynutil.insert("일") - + pynutil.insert("\"") - ) + month_josa = pynini.union("에", "은", "는", "에는") + + individual_month_component_with_josa = ( + pynutil.insert('month: "') + + month_cardinal + + pynini.accep("월") + + pynini.closure(month_josa, 0, 1) + + pynutil.insert('"') + ).optimize() + + individual_day_component = pynutil.insert("day: \"") + cardinal_lz + pynini.accep("일") + pynutil.insert("\"") week_full_word_acceptor = pynini.project(week, "output") week_component_full_word = pynutil.insert("weekday: \"") + week_full_word_acceptor + pynutil.insert("\"") @@ -272,6 +270,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): day_and_weekday_component | month_and_weekday_component | individual_year_component + | individual_month_component_with_josa | individual_month_component | individual_day_component | week_component diff --git a/nemo_text_processing/text_normalization/ko/taggers/fraction.py b/nemo_text_processing/text_normalization/ko/taggers/fraction.py index 2163f5f7f..64ea0c56e 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/ko/taggers/fraction.py @@ -81,6 +81,23 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): + numerator_component ) + # Optional particles following the fraction + particle_subject = pynutil.insert('morphosyntactic_features: "분의_subject"') + ( + pynutil.delete("이") | pynutil.delete("가") + ) + particle_topic = pynutil.insert('morphosyntactic_features: "분의_topic"') + ( + pynutil.delete("은") | pynutil.delete("는") + ) + particle_object = pynutil.insert('morphosyntactic_features: "분의_object"') + ( + pynutil.delete("을") | pynutil.delete("를") + ) + + optional_particle = pynini.closure( + pynutil.insert(NEMO_SPACE) + (particle_subject | particle_topic | particle_object), + 0, + 1, + ) + # Optional minus sign optional_sign = ( pynutil.insert(f'negative: {DOUBLE_QUOTE}') @@ -90,7 +107,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): ) # Combine full graph - graph = pynini.closure(optional_sign, 0, 1) + (graph_fraction_slash | graph_fraction_word) + graph = pynini.closure(optional_sign, 0, 1) + (graph_fraction_slash | graph_fraction_word) + optional_particle self.graph = graph.optimize() final_graph = self.add_tokens(graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py b/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py index bafbf133d..472b8a86d 100644 --- a/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py @@ -60,7 +60,43 @@ def __init__(self, deterministic: bool = True): + numerator_component ) - # Match and delete integer_part field (e.g., "2" in "2과3분의1") + # Handle subject particle feature (분의_subject) + # Insert default particle "이" (will be corrected later via rewrite rules) + subject_suffix = ( + pynutil.delete(NEMO_SPACE) + + pynutil.delete('morphosyntactic_features:') + + delete_space + + pynutil.delete('"분의_subject"') + + delete_space + + pynutil.insert("이") # 일단 기본값 + ) + + # Handle topic particle feature (분의_topic) + topic_suffix = ( + pynutil.delete(NEMO_SPACE) + + pynutil.delete('morphosyntactic_features:') + + delete_space + + pynutil.delete('"분의_topic"') + + delete_space + + pynutil.insert("은") + ) + + # Handle object particle feature (분의_object) + object_suffix = ( + pynutil.delete(NEMO_SPACE) + + pynutil.delete('morphosyntactic_features:') + + delete_space + + pynutil.delete('"분의_object"') + + delete_space + + pynutil.insert("을") + ) + + # Combine fraction + optional particle suffix + # Particle is always inserted first in default form and later corrected + graph_fraction_all = graph_fraction + pynini.closure(subject_suffix | topic_suffix | object_suffix, 0, 1) + + # Handle integer + fraction (e.g., "2과 3/4") + # integer_part is removed and replaced with proper spacing graph_integer = ( pynutil.delete('integer_part:') + delete_space @@ -69,9 +105,10 @@ def __init__(self, deterministic: bool = True): + pynutil.delete('"') + pynutil.insert(NEMO_SPACE) ) - graph_integer_fraction = graph_integer + delete_space + graph_fraction + # Combine integer part with fraction + graph_integer_fraction = graph_integer + delete_space + graph_fraction_all - # Match and delete optional negative field (e.g., "마이너스") + # Handle optional negative prefix (e.g., "마이너스") optional_sign = ( pynutil.delete('negative:') + delete_space @@ -82,9 +119,64 @@ def __init__(self, deterministic: bool = True): + pynutil.insert(NEMO_SPACE) ) - # Final graph handles optional negative + (integer + fraction | fraction only) - graph = pynini.closure(optional_sign, 0, 1) + (graph_integer_fraction | graph_fraction) + # Final structure: + # [optional negative] + (integer + fraction OR fraction only) + graph = pynini.closure(optional_sign, 0, 1) + (graph_integer_fraction | graph_fraction_all) - # Final optimized verbalizer FST + # Remove token wrappers final_graph = self.delete_tokens(graph) + + # Sigma for rewrite context (entire string) + sigma = pynini.closure(NEMO_NOT_QUOTE | NEMO_SPACE) + + # Fix subject particle agreement (이 → 가 for vowel-ending numerals) + # e.g., 사이 → 사가, 구이 → 구가 + subject_rewrite = pynini.cdrewrite( + pynini.string_map( + [ + ("이이", "이가"), + ("사이", "사가"), + ("오이", "오가"), + ("구이", "구가"), + ] + ), + "", + "", + sigma, + ) + + # Fix topic particle agreement (은 → 는) + # e.g., 이은 → 이는, 사은 → 사는 + topic_rewrite = pynini.cdrewrite( + pynini.string_map( + [ + ("이은", "이는"), + ("사은", "사는"), + ("오은", "오는"), + ("구은", "구는"), + ] + ), + "", + "", + sigma, + ) + + # Fix object particle agreement (을 → 를) + # e.g., 오을 → 오를, 이을 → 이를 + object_rewrite = pynini.cdrewrite( + pynini.string_map( + [ + ("이을", "이를"), + ("사을", "사를"), + ("오을", "오를"), + ("구을", "구를"), + ] + ), + "", + "", + sigma, + ) + + # Apply all rewrite rules sequentially and final optimized FST + final_graph = final_graph @ subject_rewrite @ topic_rewrite @ object_rewrite self.fst = final_graph.optimize() diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_fraction.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_fraction.txt index a183be59b..65e5049b8 100644 --- a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_fraction.txt +++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_fraction.txt @@ -11,4 +11,18 @@ 1과1/3~일과 삼분의 일 1과√1/4~일과 사분의 루트 일 3분의1~삼분의 일 -121분의3221~백이십일분의 삼천이백이십일 \ No newline at end of file +121분의3221~백이십일분의 삼천이백이십일 +이번 경기의 3/5이 중요하다~이번 경기의 오분의 삼이 중요하다 +전체 구역의 4/7이 통제되었다~전체 구역의 칠분의 사가 통제되었다 +설문 응답자의 9/10이 찬성했다~설문 응답자의 십분의 구가 찬성했다 +그 중 2/3은 성공했다~그 중 삼분의 이는 성공했다 +참가자의 5/8이 탈락했다~참가자의 팔분의 오가 탈락했다 +참가자의 6/7 이 통과했다~참가자의 칠분의 육 이 통과했다 +전체의 3/4 이 감소했다~전체의 사분의 삼 이 감소했다 +응답자의 2/5이 반대했다~응답자의 오분의 이가 반대했다 +학생의 7/9 이 합격했다~학생의 구분의 칠 이 합격했다 +전체의 1/2 이 남았다~전체의 이분의 일 이 남았다 +그 중 4/5이 성공했다~그 중 오분의 사가 성공했다 +전체의 5/6이 완료되었다~전체의 육분의 오가 완료되었다 +참가자의 3/8이 탈락했다~참가자의 팔분의 삼이 탈락했다 +응답자의 6/10 이 동의했다~응답자의 십분의 육 이 동의했다 \ No newline at end of file