diff --git a/Jenkinsfile b/Jenkinsfile index 51ce37a10..8a6fd760b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-22-25-0' + HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-25-25-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv index eaddf930a..6bdfb34f8 100644 --- a/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv +++ b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv @@ -4,9 +4,11 @@ h घंटे min मिनट doz दर्जन yr साल -yr वर्ष hp हॉर्सपॉवर d दिन month महीना months महीने -हफ़्ते हफ़्ते \ No newline at end of file +हफ़्ते +सप्ताह +सदियां +सदियों \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv b/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv index 189512687..4065bc86b 100644 --- a/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv +++ b/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv @@ -134,7 +134,6 @@ KHz किलोहर्ट्ज़ N न्यूटन dB डेसीबल yr साल -yr वर्ष hp हॉर्सपॉवर d दिन month महीना diff --git a/nemo_text_processing/text_normalization/hi/data/telephone/__init__.py b/nemo_text_processing/text_normalization/hi/data/telephone/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/telephone/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/hi/data/telephone/country_codes.tsv b/nemo_text_processing/text_normalization/hi/data/telephone/country_codes.tsv new file mode 100644 index 000000000..685a5866d --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/telephone/country_codes.tsv @@ -0,0 +1,2 @@ +९१ नौ एक +91 नौ एक diff --git a/nemo_text_processing/text_normalization/hi/data/telephone/credit_context.tsv b/nemo_text_processing/text_normalization/hi/data/telephone/credit_context.tsv new file mode 100644 index 000000000..46b485af6 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/telephone/credit_context.tsv @@ -0,0 +1,3 @@ +नंबर +कार्ड +क्रेडिट \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/telephone/landline_context.tsv b/nemo_text_processing/text_normalization/hi/data/telephone/landline_context.tsv new file mode 100644 index 000000000..17a123bee --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/telephone/landline_context.tsv @@ -0,0 +1,5 @@ +नंबर +मोबाइल +फोन +लैंडलाइन +कॉल \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/telephone/mobile_context.tsv b/nemo_text_processing/text_normalization/hi/data/telephone/mobile_context.tsv new file mode 100644 index 000000000..f2fa6e52f --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/telephone/mobile_context.tsv @@ 
-0,0 +1,4 @@ +नंबर +मोबाइल +फोन +कॉल \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/telephone/number.tsv b/nemo_text_processing/text_normalization/hi/data/telephone/number.tsv new file mode 100644 index 000000000..e8c04b723 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/telephone/number.tsv @@ -0,0 +1,10 @@ +0 शून्य +1 एक +2 दो +3 तीन +4 चार +5 पाँच +6 छह +7 सात +8 आठ +9 नौ \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/telephone/pincode_context.tsv b/nemo_text_processing/text_normalization/hi/data/telephone/pincode_context.tsv new file mode 100644 index 000000000..322c7248e --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/telephone/pincode_context.tsv @@ -0,0 +1,4 @@ +नंबर +पिन +कोड +पिनकोड \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/whitelist/paune_mappings.tsv b/nemo_text_processing/text_normalization/hi/data/whitelist/paune_mappings.tsv new file mode 100644 index 000000000..3477871e4 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/whitelist/paune_mappings.tsv @@ -0,0 +1,100 @@ +० एक +१ दो +२ तीन +३ चार +४ पाँच +५ छह +६ सात +७ आठ +८ नौ +९ दस +१० ग्यारह +११ बारह +१२ तेरह +१३ चौदह +१४ पंद्रह +१५ सोलह +१६ सत्रह +१७ अठारह +१८ उन्नीस +१९ बीस +२० इक्कीस +२१ बाईस +२२ तेईस +२३ चौबीस +२४ पच्चीस +२५ छब्बीस +२६ सत्ताईस +२७ अट्ठाईस +२८ उनतीस +२९ तीस +३० इकतीस +३१ बत्तीस +३२ तैंतीस +३३ चौंतीस +३४ पैंतीस +३५ छत्तीस +३६ सैंतीस +३७ अड़तीस +३८ उनतालीस +३९ चालीस +४० इकतालीस +४१ बयालीस +४२ तैंतालीस +४३ चौवालीस +४४ पैंतालीस +४५ छियालीस +४६ सैंतालीस +४७ अड़तालीस +४८ उनचास +४९ पचास +५० इक्यावन +५१ बावन +५२ तिरेपन +५३ चौवन +५४ पचपन +५५ छप्पन +५६ सत्तावन +५७ अट्ठावन +५८ उनसठ +५९ साठ +६० इकसठ +६१ बासठ +६२ तिरेसठ +६३ चौंसठ +६४ पैंसठ +६५ छियासठ +६६ सड़सठ +६७ अड़सठ +६८ उनहत्तर +६९ सत्तर +७० इकहत्तर +७१ बहत्तर +७२ तिहत्तर +७३ चौहत्तर +७४ पचहत्तर +७५ छिहत्तर +७६ सतहत्तर +७७ अठहत्तर +७८ उनासी +७९ अस्सी +८० 
इक्यासी +८१ बयासी +८२ तिरासी +८३ चौरासी +८४ पचासी +८५ छियासी +८६ सत्तासी +८७ अट्ठासी +८८ नवासी +८९ नब्बे +९० इक्यानबे +९१ बानबे +९२ तिरानबे +९३ चौरानबे +९४ पंचानबे +९५ छियानबे +९६ सत्तानबे +९७ अट्ठानबे +९८ निन्यानबे +९९ एक सौ diff --git a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py index c50384acf..bc7594ad9 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py @@ -21,12 +21,12 @@ class CardinalFst(GraphFst): """ - Finite state transducer for classifying cardinals, e.g. - -२३ -> cardinal { negative: "true" integer: "तेइस" } } - s - Args: - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) + Finite state transducer for classifying cardinals, e.g. + -२३ -> cardinal { negative: "true" integer: "तेइस" } + + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) """ def __init__(self, deterministic: bool = True, lm: bool = False): @@ -37,6 +37,10 @@ def __init__(self, deterministic: bool = True, lm: bool = False): teens_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")) teens_and_ties = pynutil.add_weight(teens_ties, -0.1) + self.digit = digit + self.zero = zero + self.teens_and_ties = teens_and_ties + def create_graph_suffix(digit_graph, suffix, zeros_counts): zero = pynutil.add_weight(pynutil.delete("०"), -0.1) if zeros_counts == 0: diff --git a/nemo_text_processing/text_normalization/hi/taggers/decimal.py b/nemo_text_processing/text_normalization/hi/taggers/decimal.py index 955e8c0d3..cb21d85b1 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/hi/taggers/decimal.py @@ -58,9 +58,7 @@ class 
DecimalFst(GraphFst): def __init__(self, cardinal: GraphFst, deterministic: bool = True): super().__init__(name="decimal", kind="classify", deterministic=deterministic) - graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) - graph_digit |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")) - + graph_digit = cardinal.digit | cardinal.zero cardinal_graph = cardinal.final_graph self.graph = graph_digit + pynini.closure(insert_space + graph_digit).optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/fraction.py b/nemo_text_processing/text_normalization/hi/taggers/fraction.py index 8971cd3dd..d995608da 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/hi/taggers/fraction.py @@ -16,6 +16,7 @@ from pynini.lib import pynutil from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst +from nemo_text_processing.text_normalization.hi.utils import get_abs_path class FractionFst(GraphFst): @@ -47,13 +48,43 @@ def __init__(self, cardinal, deterministic: bool = True): ) self.denominator = pynutil.insert("denominator: \"") + cardinal_graph + pynutil.insert("\"") - self.graph = ( + dedh_dhai_graph = pynini.string_map([("१ १/२", "डेढ़"), ("२ १/२", "ढाई")]) + + savva_numbers = cardinal_graph + pynini.cross(" १/४", "") + savva_graph = pynutil.insert("सवा ") + savva_numbers + + sadhe_numbers = cardinal_graph + pynini.cross(" १/२", "") + sadhe_graph = pynutil.insert("साढ़े ") + sadhe_numbers + + paune = pynini.string_file(get_abs_path("data/whitelist/paune_mappings.tsv")) + paune_numbers = paune + pynini.cross(" ३/४", "") + paune_graph = pynutil.insert("पौने ") + paune_numbers + + graph_dedh_dhai = pynutil.insert("morphosyntactic_features: \"") + dedh_dhai_graph + pynutil.insert("\" ") + + graph_savva = pynutil.insert("morphosyntactic_features: \"") + savva_graph + pynutil.insert("\" ") + + graph_sadhe = pynutil.insert("morphosyntactic_features: \"") 
+ sadhe_graph + pynutil.insert("\" ") + + graph_paune = pynutil.insert("morphosyntactic_features: \"") + paune_graph + pynutil.insert("\" ") + + final_graph = ( self.optional_graph_negative + pynini.closure(self.integer + pynini.accep(" "), 0, 1) + self.numerator + self.denominator ) + weighted_graph = ( + final_graph + | pynutil.add_weight(graph_dedh_dhai, -0.2) + | pynutil.add_weight(graph_savva, -0.1) + | pynutil.add_weight(graph_sadhe, -0.1) + | pynutil.add_weight(graph_paune, -0.2) + ) + + self.graph = weighted_graph + graph = self.graph - final_graph = self.add_tokens(graph) - self.fst = final_graph.optimize() + graph = self.add_tokens(graph) + self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/measure.py b/nemo_text_processing/text_normalization/hi/taggers/measure.py index 9f1ffbd39..919a69929 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/measure.py +++ b/nemo_text_processing/text_normalization/hi/taggers/measure.py @@ -41,8 +41,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): super().__init__(name="measure", kind="classify") cardinal_graph = ( - digit - | teens_and_ties + cardinal.zero + | cardinal.digit + | cardinal.teens_and_ties | cardinal.graph_hundreds | cardinal.graph_thousands | cardinal.graph_ten_thousands @@ -52,6 +53,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): point = pynutil.delete(".") decimal_integers = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") decimal_graph = decimal_integers + point + insert_space + decimal.graph_fractional + unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv")) quarterly_units_graph = pynini.string_file(get_abs_path("data/measure/quarterly_units.tsv")) @@ -93,10 +95,50 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): + unit ) - graph_quarter = ( + dedh_dhai = pynini.string_map([("१.५", "डेढ़"), ("२.५", "ढाई")]) + dedh_dhai_graph = pynutil.insert("integer: \"") + 
dedh_dhai + pynutil.insert("\"") + + savva_numbers = cardinal_graph + pynini.cross(".२५", "") + savva_graph = pynutil.insert("integer: \"सवा ") + savva_numbers + pynutil.insert("\"") + + sadhe_numbers = cardinal_graph + pynini.cross(".५", "") + sadhe_graph = pynutil.insert("integer: \"साढ़े ") + sadhe_numbers + pynutil.insert("\"") + + paune = pynini.string_file(get_abs_path("data/whitelist/paune_mappings.tsv")) + paune_numbers = paune + pynini.cross(".७५", "") + paune_graph = pynutil.insert("integer: \"पौने ") + paune_numbers + pynutil.insert("\"") + + graph_dedh_dhai = ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + dedh_dhai_graph + + pynutil.insert(" }") + + delete_space + + units + ) + + graph_savva = ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + savva_graph + + pynutil.insert(" }") + + delete_space + + units + ) + + graph_sadhe = ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + sadhe_graph + + pynutil.insert(" }") + + delete_space + + units + ) + + graph_paune = ( pynutil.insert("cardinal { ") + optional_graph_negative - + quarter_graph + + paune_graph + pynutil.insert(" }") + delete_space + units @@ -135,9 +177,12 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): graph = ( pynutil.add_weight(graph_decimal, 0.01) - | pynutil.add_weight(graph_quarter, 0.005) | pynutil.add_weight(graph_cardinal, 0.01) | pynutil.add_weight(graph_exceptions, 0.01) + | pynutil.add_weight(graph_dedh_dhai, 0.001) + | pynutil.add_weight(graph_savva, 0.005) + | pynutil.add_weight(graph_sadhe, 0.005) + | pynutil.add_weight(graph_paune, -0.2) ) self.graph = graph.optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/telephone.py b/nemo_text_processing/text_normalization/hi/taggers/telephone.py new file mode 100644 index 000000000..f47587acf --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/taggers/telephone.py @@ -0,0 +1,216 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.hi.graph_utils import ( + NEMO_CHAR, + NEMO_WHITE_SPACE, + GraphFst, + delete_space, + insert_space, +) +from nemo_text_processing.text_normalization.hi.utils import get_abs_path + +delete_zero = pynutil.delete(pynini.union("0", "०")) +delete_zero_optional = pynini.closure(delete_zero, 0, 1) +insert_shunya = pynutil.insert('शून्य') + insert_space + +# Load the number mappings from the TSV file +digit_to_word = pynini.string_file(get_abs_path("data/telephone/number.tsv")) +digits = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) +zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) +country_codes = pynini.string_file(get_abs_path("data/telephone/country_codes.tsv")) +mobile_context = pynini.string_file(get_abs_path("data/telephone/mobile_context.tsv")) +landline_context = pynini.string_file(get_abs_path("data/telephone/landline_context.tsv")) +credit_context = pynini.string_file(get_abs_path("data/telephone/credit_context.tsv")) +pincode_context = pynini.string_file(get_abs_path("data/telephone/pincode_context.tsv")) + + +def generate_mobile(context_keywords): + context_before, context_after = get_context(context_keywords) + + allowed_digits = pynini.union("६", "७", "८", "९", "6", "7", "8", "9") + + # Filter cardinals to only include allowed digits + mobile_start_digit = 
allowed_digits @ digits | allowed_digits @ digit_to_word + + country_code = ( + pynutil.insert("country_code: \"") + + context_before + + pynini.cross("+", "प्लस") + + insert_space + + country_codes + + pynutil.insert("\" ") + + pynini.closure(delete_space, 0, 1) + ) + + extension_optional = pynini.closure( + pynutil.insert("extension: \"") + + pynini.closure((digit_to_word | digits | zero) + insert_space, 1, 3) + + context_after + + pynutil.insert("\" ") + + delete_space, + 0, + 1, + ) + + number_without_country = ( + pynutil.insert("number_part: \"") + + context_before + + delete_zero_optional + + insert_shunya + + mobile_start_digit + + insert_space + + pynini.closure((digit_to_word | digits | zero) + insert_space, 9) + + context_after + + pynutil.insert("\" ") + + delete_space + ) + + number_with_country = ( + country_code + + pynutil.insert("number_part: \"") + + mobile_start_digit + + insert_space + + pynini.closure((digit_to_word | digits | zero) + insert_space, 9) + + context_after + + pynutil.insert("\" ") + + delete_space + ) + + return (number_with_country | number_without_country) + extension_optional + + +def get_landline(std_length, context_keywords): + context_before, context_after = get_context(context_keywords) + + allowed_digits = pynini.union("२", "३", "४", "६", "2", "3", "4", "6") + + # Filter cardinals to only include allowed digits + landline_start_digit = allowed_digits @ digits | allowed_digits @ digit_to_word + + std_code_graph = ( + delete_zero_optional + + insert_shunya + + pynini.closure((digit_to_word | digits | zero) + insert_space, std_length, std_length) + ) + + landline_digit_count = 9 - std_length + landline_graph = ( + landline_start_digit + + insert_space + + pynini.closure((digit_to_word | digits | zero) + insert_space, landline_digit_count, landline_digit_count) + ) + + separator_optional = pynini.closure(pynini.cross("-", ""), 0, 1) + + return ( + pynutil.insert("number_part: \"") + + context_before + + std_code_graph + + 
separator_optional + + delete_space + + landline_graph + + context_after + + pynutil.insert("\" ") + ) + + +def generate_landline(context_keywords): + graph = ( + get_landline(2, context_keywords) + | get_landline(3, context_keywords) + | get_landline(4, context_keywords) + | get_landline(5, context_keywords) + | get_landline(6, context_keywords) + | get_landline(7, context_keywords) + ) + + return graph + + +def get_context(keywords: list): + + hindi_digits = pynini.union("०", "१", "२", "३", "४", "५", "६", "७", "८", "९") + english_digits = pynini.union("0", "1", "2", "3", "4", "5", "6", "7", "8", "9") + all_digits = pynini.union(hindi_digits, english_digits) + + non_digit_char = pynini.difference(NEMO_CHAR, pynini.union(all_digits, NEMO_WHITE_SPACE)) + word = pynini.closure(non_digit_char, 1) + pynini.accep(" ") + + window = pynini.closure(word, 0, 5) + + before = pynini.closure(keywords + pynini.accep(" ") + window, 0, 1) + + after = pynini.closure(pynutil.delete(" ") + window + keywords, 0, 1) + + return before.optimize(), after.optimize() + + +def generate_credit(context_keywords): + context_before, context_after = get_context(context_keywords) + return ( + pynutil.insert("number_part: \"") + + context_before + + pynini.closure((digit_to_word | digits | zero) + insert_space, 4) + + context_after + + pynutil.insert("\" ") + + delete_space + ) + + +def generate_pincode(context_keywords): + context_before, context_after = get_context(context_keywords) + return ( + pynutil.insert("number_part: \"") + + context_before + + pynini.closure((digit_to_word | digits | zero) + insert_space, 6) + + context_after + + pynutil.insert("\" ") + + delete_space + ) + + +class TelephoneFst(GraphFst): + """ + Finite state transducer for tagging telephone numbers, e.g. 
+ ९१५७११४००७ -> telephone { number_part: "शून्य नौ एक पाँच सात एक एक चार शून्य शून्य सात" } + +९१ ९२१०५१५६०६ -> telephone { country_code: "प्लस नौ एक" number_part: "नौ दो एक शून्य पाँच एक पाँच छह शून्य छह" } + १३७४-३०९९८८ -> telephone { number_part: "शून्य एक तीन सात चार तीन शून्य नौ नौ आठ आठ" } + + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self): + super().__init__(name="telephone", kind="classify") + + mobile_number = generate_mobile(mobile_context) + landline = generate_landline(landline_context) + credit_card = generate_credit(credit_context) + pincode = generate_pincode(pincode_context) + + graph = ( + pynutil.add_weight(mobile_number, 0.7) + | pynutil.add_weight(landline, 0.8) + | pynutil.add_weight(credit_card, 0.9) + | pynutil.add_weight(pincode, 1) + ) + + self.final = graph.optimize() + self.fst = self.add_tokens(self.final) diff --git a/nemo_text_processing/text_normalization/hi/taggers/time.py b/nemo_text_processing/text_normalization/hi/taggers/time.py index 6c87c9aad..e78b31380 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/time.py +++ b/nemo_text_processing/text_normalization/hi/taggers/time.py @@ -36,10 +36,11 @@ class TimeFst(GraphFst): for False multiple transduction are generated (used for audio-based normalization) """ - def __init__(self): + def __init__(self, cardinal: GraphFst): super().__init__(name="time", kind="classify") delete_colon = pynutil.delete(":") + cardinal_graph = cardinal.digit | cardinal.teens_and_ties self.hours = pynutil.insert("hours: \"") + hours_graph + pynutil.insert("\" ") self.minutes = pynutil.insert("minutes: \"") + minutes_graph + pynutil.insert("\" ") @@ -56,7 +57,35 @@ def __init__(self): # hour graph_h = self.hours + delete_colon + pynutil.delete("००") - final_graph = graph_hms | graph_hm | graph_h + dedh_dhai_graph = pynini.string_map([("१:३०", "डेढ़"), 
("२:३०", "ढाई")]) + + savva_numbers = cardinal_graph + pynini.cross(":१५", "") + savva_graph = pynutil.insert("सवा ") + savva_numbers + + sadhe_numbers = cardinal_graph + pynini.cross(":३०", "") + sadhe_graph = pynutil.insert("साढ़े ") + sadhe_numbers + + paune = pynini.string_file(get_abs_path("data/whitelist/paune_mappings.tsv")) + paune_numbers = paune + pynini.cross(":४५", "") + paune_graph = pynutil.insert("पौने ") + paune_numbers + + graph_dedh_dhai = pynutil.insert("morphosyntactic_features: \"") + dedh_dhai_graph + pynutil.insert("\" ") + + graph_savva = pynutil.insert("morphosyntactic_features: \"") + savva_graph + pynutil.insert("\" ") + + graph_sadhe = pynutil.insert("morphosyntactic_features: \"") + sadhe_graph + pynutil.insert("\" ") + + graph_paune = pynutil.insert("morphosyntactic_features: \"") + paune_graph + pynutil.insert("\" ") + + final_graph = ( + graph_hms + | pynutil.add_weight(graph_hm, 0.01) + | pynutil.add_weight(graph_h, 0.01) + | pynutil.add_weight(graph_dedh_dhai, 0.001) + | pynutil.add_weight(graph_savva, 0.005) + | pynutil.add_weight(graph_sadhe, 0.005) + | pynutil.add_weight(graph_paune, 0.001) + ) final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py index b1bbd2a10..08182b34d 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py @@ -33,6 +33,7 @@ from nemo_text_processing.text_normalization.hi.taggers.measure import MeasureFst from nemo_text_processing.text_normalization.hi.taggers.money import MoneyFst from nemo_text_processing.text_normalization.hi.taggers.punctuation import PunctuationFst +from nemo_text_processing.text_normalization.hi.taggers.telephone import TelephoneFst from 
nemo_text_processing.text_normalization.hi.taggers.time import TimeFst from nemo_text_processing.text_normalization.hi.taggers.whitelist import WhiteListFst from nemo_text_processing.text_normalization.hi.taggers.word import WordFst @@ -98,7 +99,7 @@ def __init__( logging.debug(f"date: {time.time() - start_time: .2f}s -- {date_graph.num_states()} nodes") start_time = time.time() - timefst = TimeFst() + timefst = TimeFst(cardinal=cardinal) time_graph = timefst.fst logging.debug(f"time: {time.time() - start_time: .2f}s -- {time_graph.num_states()} nodes") @@ -123,6 +124,11 @@ def __init__( punct_graph = punctuation.fst logging.debug(f"punct: {time.time() - start_time: .2f}s -- {punct_graph.num_states()} nodes") + start_time = time.time() + telephone = TelephoneFst() + telephone_graph = telephone.fst + logging.debug(f"telephone: {time.time() - start_time: .2f}s -- {telephone_graph.num_states()} nodes") + classify = ( pynutil.add_weight(whitelist_graph, 1.01) | pynutil.add_weight(cardinal_graph, 1.1) @@ -132,6 +138,7 @@ def __init__( | pynutil.add_weight(time_graph, 1.1) | pynutil.add_weight(measure_graph, 1.1) | pynutil.add_weight(money_graph, 1.1) + | pynutil.add_weight(telephone_graph, 1.1) ) start_time = time.time() diff --git a/nemo_text_processing/text_normalization/hi/taggers/word.py b/nemo_text_processing/text_normalization/hi/taggers/word.py index bc354232b..151a72e99 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/word.py +++ b/nemo_text_processing/text_normalization/hi/taggers/word.py @@ -43,7 +43,6 @@ def __init__(self, punctuation: PunctuationFst, deterministic: bool = True): *[chr(i) for i in range(ord("ऀ"), ord("ः") + 1)], # Hindi vowels and consonants *[chr(i) for i in range(ord("अ"), ord("ह") + 1)], # More Hindi characters *[chr(i) for i in range(ord("ा"), ord("्") + 1)], # Hindi diacritics - *[chr(i) for i in range(ord("०"), ord("९") + 1)], # Hindi digits ).optimize() # Include punctuation in the graph diff --git 
a/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py b/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py index 7e3b33b7c..a07c41eae 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py @@ -40,6 +40,9 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): denominator = pynutil.delete("denominator: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") insert_bata = pynutil.insert(" बटा ") insert_aur = pynutil.insert(" और ") + graph_quarter = ( + pynutil.delete("morphosyntactic_features: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + ) fraction_default = numerator + insert_bata + denominator @@ -47,7 +50,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): optional_sign + pynini.closure(pynini.closure(integer, 0, 1) + insert_space + insert_aur) + fraction_default - ) + ) | graph_quarter graph = self.graph diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/telephone.py b/nemo_text_processing/text_normalization/hi/verbalizers/telephone.py new file mode 100644 index 000000000..f55cf4241 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/verbalizers/telephone.py @@ -0,0 +1,65 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space + + +class TelephoneFst(GraphFst): + """ + Finite state transducer for verbalizing telephone numbers, e.g. + telephone { country_code: "प्लस नौ एक", number_part: "नौ दो एक शून्य पाँच एक पाँच छह शून्य छह" } -> प्लस नौ एक नौ दो एक शून्य पाँच एक पाँच छह शून्य छह + telephone { number_part: "शून्य एक तीन सात चार तीन शून्य नौ नौ आठ आठ" } -> शून्य एक तीन सात चार तीन शून्य नौ नौ आठ आठ + + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="telephone", kind="verbalize", deterministic=deterministic) + + optional_country_code = pynini.closure( + pynutil.delete("country_code: \"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + + delete_space + + insert_space, + 0, + 1, + ) + + number_part = ( + pynutil.delete("number_part: \"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynini.closure(pynutil.add_weight(pynutil.delete(" "), -0.0001), 0, 1) + + pynutil.delete("\"") + ) + + optional_extension = pynini.closure( + delete_space + + insert_space + + pynutil.delete("extension: \"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\""), + 0, + 1, + ) + + graph = optional_country_code + number_part + optional_extension + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/time.py b/nemo_text_processing/text_normalization/hi/verbalizers/time.py index da10df4a0..df232e3cd 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/time.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/time.py @@ -30,7 +30,7 @@ class TimeFst(GraphFst): for False multiple transduction are generated (used for 
audio-based normalization) """ - def __init__(self): + def __init__(self, cardinal: GraphFst): super().__init__(name="time", kind="verbalize") hour = pynutil.delete("hours: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + insert_space @@ -63,13 +63,17 @@ def __init__(self): + insert_second ) + graph_quarter = ( + pynutil.delete("morphosyntactic_features: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + ) + # hour minute graph_hm = hour + delete_space + insert_bajkar + insert_space + minute + delete_space + insert_minute # hour graph_h = hour + delete_space + insert_baje - self.graph = graph_hms | graph_hm | graph_h + self.graph = graph_hms | graph_hm | graph_h | graph_quarter final_graph = self.graph diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py index e91f0d9f6..f824a075a 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py @@ -19,6 +19,7 @@ from nemo_text_processing.text_normalization.hi.verbalizers.fraction import FractionFst from nemo_text_processing.text_normalization.hi.verbalizers.measure import MeasureFst from nemo_text_processing.text_normalization.hi.verbalizers.money import MoneyFst +from nemo_text_processing.text_normalization.hi.verbalizers.telephone import TelephoneFst from nemo_text_processing.text_normalization.hi.verbalizers.time import TimeFst from nemo_text_processing.text_normalization.hi.verbalizers.whitelist import WhiteListFst @@ -49,7 +50,7 @@ def __init__(self, deterministic: bool = True): date = DateFst() date_graph = date.fst - time = TimeFst() + time = TimeFst(cardinal=cardinal) time_graph = time.fst measure = MeasureFst(cardinal=cardinal, decimal=decimal) @@ -58,6 +59,9 @@ def __init__(self, deterministic: bool = True): money = MoneyFst() money_graph = money.fst + telephone = TelephoneFst() + 
telephone_graph = telephone.fst + whitelist_graph = WhiteListFst(deterministic=deterministic).fst graph = ( @@ -69,6 +73,7 @@ def __init__(self, deterministic: bool = True): | measure_graph | money_graph | whitelist_graph + | telephone_graph ) self.fst = graph diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_cardinal.txt index 6ba21de69..c7015e938 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_cardinal.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_cardinal.txt @@ -142,4 +142,4 @@ १०२२३४५५६७~एक अरब दो करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ ११०२२३४५५६७~ग्यारह अरब दो करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ ५१०२२३४५५६७~इक्यावन अरब दो करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ -२ पॉइंट्स १२ गोल~दो पॉइंट्स बारह गोल +२ पॉइंट्स १२ गोल~दो पॉइंट्स बारह गोल \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt index d1473412e..189e6d4ef 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt @@ -20,4 +20,8 @@ १०००००००००००००००/८~एक पद्म बटा आठ १०००००००००००००००००/४१२~एक शंख बटा चार सौ बारह २ २/७~दो और दो बटा सात -१२० ७५/९०~एक सौ बीस और पचहत्तर बटा नब्बे \ No newline at end of file +१२० ७५/९०~एक सौ बीस और पचहत्तर बटा नब्बे +१ १/२~डेढ़ +२ १/२~ढाई +८ १/२~साढ़े आठ +१०००० १/४~सवा दस हज़ार \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt index 86a824f72..0d31a5833 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt +++ 
b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt @@ -64,3 +64,8 @@ ५x५ का सोफ़ा~पाँच बाई पाँच का सोफ़ा २x२ रुबिक्स क्यूब~दो बाई दो रुबिक्स क्यूब १३x१३ का घर~तेरह बाई तेरह का घर +५७.२५ hp~सवा सत्तावन हॉर्सपॉवर +७.५ हफ़्ते~साढ़े सात हफ़्ते +१०.७५ min~पौने ग्यारह मिनट +१.५ doz~डेढ़ दर्जन +२.५ yr~ढाई साल \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_telephone.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_telephone.txt new file mode 100644 index 000000000..9004cb889 --- /dev/null +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_telephone.txt @@ -0,0 +1,16 @@ +मेरा पुराना नंबर था ९१५७११४००७~मेरा पुराना नंबर था शून्य नौ एक पाँच सात एक एक चार शून्य शून्य सात +इसपे कॉल करो ०३८६२-३५१७९१~इसपे कॉल करो शून्य तीन आठ छह दो तीन पाँच एक सात नौ एक +मेरे इस नंबर पे कॉल करो १३७४-३०९९८८~मेरे इस नंबर पे कॉल करो शून्य एक तीन सात चार तीन शून्य नौ नौ आठ आठ +इसपे कॉल करो ०१६८९११-४५७३~इसपे कॉल करो शून्य एक छह आठ नौ एक एक चार पाँच सात तीन ++९१ ७४४०४३१०८३ मेरे इस नंबर पे कॉल करो~प्लस नौ एक सात चार चार शून्य चार तीन एक शून्य आठ तीन मेरे इस नंबर पे कॉल करो ++९१ ९२१०५१५६०६ मेरे इस नंबर पे कॉल करो~प्लस नौ एक नौ दो एक शून्य पाँच एक पाँच छह शून्य छह मेरे इस नंबर पे कॉल करो +भुगतान के लिए कार्ड के आखिरी अंक १२३४ दर्ज करें~भुगतान के लिए कार्ड के आखिरी अंक एक दो तीन चार दर्ज करें +मेरा पिन कोड ११००२३ है~मेरा पिन कोड एक एक शून्य शून्य दो तीन है +मेरा पुराना नंबर था 9157114007~मेरा पुराना नंबर था शून्य नौ एक पाँच सात एक एक चार शून्य शून्य सात +इसपे कॉल करो 03862-351791~इसपे कॉल करो शून्य तीन आठ छह दो तीन पाँच एक सात नौ एक +मेरे इस नंबर पे कॉल करो 1374-309988~मेरे इस नंबर पे कॉल करो शून्य एक तीन सात चार तीन शून्य नौ नौ आठ आठ +इसपे कॉल करो 0168911-4573~इसपे कॉल करो शून्य एक छह आठ नौ एक एक चार पाँच सात तीन ++91 7440431083 मेरे इस नंबर पे कॉल करो~प्लस नौ एक सात चार चार शून्य चार तीन एक शून्य आठ तीन मेरे इस नंबर पे कॉल करो ++91 9210515606 मेरे इस नंबर पे कॉल करो~प्लस नौ एक 
नौ दो एक शून्य पाँच एक पाँच छह शून्य छह मेरे इस नंबर पे कॉल करो +भुगतान के लिए कार्ड के आखिरी अंक 1234 दर्ज करें~भुगतान के लिए कार्ड के आखिरी अंक एक दो तीन चार दर्ज करें +मेरा पिन कोड 110023 है~मेरा पिन कोड एक एक शून्य शून्य दो तीन है \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_time.txt index 9d670aa8a..6b059a626 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_time.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_time.txt @@ -15,4 +15,9 @@ दोपहर के ३:००~दोपहर के तीन बजे रात के १०:४८:५०~रात के दस बजकर अड़तालीस मिनट पचास सेकंड रात के ११:५०~रात के ग्यारह बजकर पचास मिनट -रात के ८:००~रात के आठ बजे \ No newline at end of file +रात के ८:००~रात के आठ बजे +१:३०~डेढ़ +२:३०~ढाई +४:४५~पौने पाँच +१०:१५~सवा दस +१२:३०~साढ़े बारह \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh index 498443f71..39d710120 100644 --- a/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh @@ -81,10 +81,10 @@ testTNMoney() { # runtest $input #} -#testTNTelephone() { -# input=$PROJECT_DIR/en/data_text_normalization/test_cases_telephone.txt -# runtest $input -#} +testTNTelephone() { + input=$PROJECT_DIR/hi/data_text_normalization/test_cases_telephone.txt + runtest $input +} testTNTime() { input=$PROJECT_DIR/hi/data_text_normalization/test_cases_time.txt diff --git a/tests/nemo_text_processing/hi/test_telephone.py b/tests/nemo_text_processing/hi/test_telephone.py index 7e43f7e82..e7b9f1c3d 100644 --- a/tests/nemo_text_processing/hi/test_telephone.py +++ b/tests/nemo_text_processing/hi/test_telephone.py @@ -16,12 +16,16 @@ from parameterized import parameterized from 
nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file class TestTelephone: inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) + normalizer = Normalizer( + input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True + ) @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_telephone.txt')) @pytest.mark.run_only_on('CPU') @@ -29,3 +33,10 @@ class TestTelephone: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred.strip() == expected.strip() + + @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_telephone.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False, punct_post_process=True) + assert pred == expected