diff --git a/Jenkinsfile b/Jenkinsfile index c94c107c6..51ce37a10 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-03-25-1' + HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-22-25-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/text_normalization/hi/data/date/prefixes.tsv b/nemo_text_processing/text_normalization/hi/data/date/prefixes.tsv new file mode 100644 index 000000000..d4c1ca0b1 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/date/prefixes.tsv @@ -0,0 +1,3 @@ +सन् +सन +साल \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/date/suffixes.tsv b/nemo_text_processing/text_normalization/hi/data/date/suffixes.tsv new file mode 100644 index 000000000..6806d3f12 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/date/suffixes.tsv @@ -0,0 +1,10 @@ + में + का + की + के + से + तक + ईस्वी + शताब्दी + दशक + सदी \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/date/year_suffix.tsv b/nemo_text_processing/text_normalization/hi/data/date/year_suffix.tsv new file mode 100644 index 000000000..acb37d534 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/date/year_suffix.tsv @@ -0,0 +1,2 @@ +ई. पू. ईसा पूर्व +ई. ईसवी \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv new file mode 100644 index 000000000..eaddf930a --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv @@ -0,0 +1,12 @@ +s सेकंड +hr घंटा +h घंटे +min मिनट +doz दर्जन +yr साल +yr वर्ष +hp हॉर्सपॉवर +d दिन +month महीना +months महीने +हफ़्ते हफ़्ते \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv b/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv index 0bf561379..189512687 100644 --- a/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv +++ b/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv @@ -141,14 +141,16 @@ month महीना months महीने ct कैरेट pH पीएच +km/h किलोमीटर प्रति घंटा km/hr किलोमीटर प्रति घंटा km/min किलोमीटर प्रति मिनट +m/h मीटर प्रति घंटा m/hr मीटर प्रति घंटा mi/s मील प्रति सेकंड +mi/h मील प्रति घंटा mi/hr मील प्रति घंटा mi/min मील प्रति मिनट ₹/ac रुपए प्रति एकड़ x बाई X बाई * बाई -- से diff --git a/nemo_text_processing/text_normalization/hi/data/money/currency.tsv b/nemo_text_processing/text_normalization/hi/data/money/currency.tsv index 88633ec7c..8f4a955cc 100644 --- a/nemo_text_processing/text_normalization/hi/data/money/currency.tsv +++ b/nemo_text_processing/text_normalization/hi/data/money/currency.tsv @@ -1,5 +1,4 @@ ₹ रुपए -P पैसे £ पाउंड ₩ वॉन $ डॉलर @@ -7,4 +6,4 @@ $ डॉलर ৳ टका ¥ येन ₦ नाइरा -€ यूरो +€ यूरो \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/money/major_minor_currencies.tsv b/nemo_text_processing/text_normalization/hi/data/money/major_minor_currencies.tsv new file mode 100644 index 000000000..cf62891d1 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/money/major_minor_currencies.tsv @@ -0,0 +1,9 @@ +रुपए पैसे +पाउंड पेंस +वॉन जिओन +डॉलर सेंट +लीरा कुरस +टका पैसे +येन सेन +नाइरा कोबो +यूरो सेंट diff --git a/nemo_text_processing/text_normalization/hi/data/numbers/teens_and_ties.tsv b/nemo_text_processing/text_normalization/hi/data/numbers/teens_and_ties.tsv index 1d61c77b7..fbf248266 100644 --- a/nemo_text_processing/text_normalization/hi/data/numbers/teens_and_ties.tsv +++ b/nemo_text_processing/text_normalization/hi/data/numbers/teens_and_ties.tsv @@ -79,12 +79,12 @@ ८८ अट्ठासी ८९ नवासी ९० नब्बे -९१ इक्यानबे -९२ बानबे -९३ तिरानबे -९४ चौरानबे -९५ पंचानबे -९६ छियानबे -९७ सत्तानबे -९८ अट्ठानबे +९१ इक्यानबे +९२ बानबे +९३ तिरानबे +९४ चौरानबे +९५ पंचानबे +९६ छियानबे +९७ सत्तानबे +९८ अट्ठानबे ९९ निन्यानबे diff --git a/nemo_text_processing/text_normalization/hi/data/time/hours.tsv b/nemo_text_processing/text_normalization/hi/data/time/hours.tsv index d5e85a784..dd8623284 100644 --- a/nemo_text_processing/text_normalization/hi/data/time/hours.tsv +++ b/nemo_text_processing/text_normalization/hi/data/time/hours.tsv @@ -1,3 +1,4 @@ +० शून्य १ एक २ दो ३ तीन diff --git a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py index f6a8bdd65..c50384acf 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py @@ -80,6 +80,7 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph): graph_ten_thousands |= create_larger_number_graph(teens_and_ties, suffix_thousands, 1, teens_ties) graph_ten_thousands |= create_larger_number_graph(teens_and_ties, suffix_thousands, 0, graph_hundreds) graph_ten_thousands.optimize() + self.graph_ten_thousands = graph_ten_thousands # Lakhs graph and ten lakhs graph suffix_lakhs = pynutil.insert(" लाख") @@ -90,6 +91,7 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph): graph_lakhs |= create_larger_number_graph(digit, suffix_lakhs, 1, graph_thousands) graph_lakhs |= create_larger_number_graph(digit, suffix_lakhs, 0, graph_ten_thousands) graph_lakhs.optimize() + self.graph_lakhs = graph_lakhs graph_ten_lakhs = create_graph_suffix(teens_and_ties, suffix_lakhs, 5) graph_ten_lakhs |= create_larger_number_graph(teens_and_ties, suffix_lakhs, 4, digit) @@ -98,6 +100,7 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph): graph_ten_lakhs |= create_larger_number_graph(teens_and_ties, suffix_lakhs, 1, graph_thousands) graph_ten_lakhs |= create_larger_number_graph(teens_and_ties, suffix_lakhs, 0, graph_ten_thousands) graph_ten_lakhs.optimize() + self.graph_ten_lakhs = graph_ten_lakhs # Crores graph ten crores graph suffix_crores = pynutil.insert(" करोड़") diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index 42135add7..37b192165 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -26,6 +26,20 @@ days = pynini.string_file(get_abs_path("data/date/days.tsv")) months = pynini.string_file(get_abs_path("data/date/months.tsv")) +year_suffix = pynini.string_file(get_abs_path("data/date/year_suffix.tsv")) +digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) +teens_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")) +teens_and_ties = pynutil.add_weight(teens_ties, -0.1) + +# Read suffixes from file into a list +with open(get_abs_path("data/date/suffixes.tsv"), "r", encoding="utf-8") as f: + suffixes_list = f.read().splitlines() +with open(get_abs_path("data/date/prefixes.tsv"), "r", encoding="utf-8") as f: + prefixes_list = f.read().splitlines() + +# Create union of suffixes and prefixes +suffix_union = pynini.union(*suffixes_list) +prefix_union = pynini.union(*prefixes_list) class DateFst(GraphFst): @@ -51,6 +65,10 @@ def __init__(self, cardinal: GraphFst): (NEMO_HI_DIGIT + NEMO_HI_NON_ZERO + NEMO_HI_DIGIT + NEMO_HI_DIGIT), cardinal.graph_hundreds_as_thousand ) + cardinal_graph = ( + digit | teens_and_ties | cardinal.graph_hundreds | graph_year_thousands | graph_year_hundreds_as_thousands + ) + graph_year = graph_year_thousands | graph_year_hundreds_as_thousands delete_dash = pynutil.delete("-") @@ -68,6 +86,22 @@ def __init__(self, cardinal: GraphFst): graph_mm_dd += pynutil.insert(" preserve_order: true ") + # Graph for era + era_graph = pynutil.insert("era: \"") + year_suffix + pynutil.insert("\"") + insert_space + + range_graph = pynini.cross("-", "से") + + # Graph for year + century_number = pynini.compose(pynini.closure(NEMO_HI_DIGIT, 1), cardinal_graph) + pynini.accep("वीं") + century_text = pynutil.insert("era: \"") + century_number + pynutil.insert("\"") + insert_space + + # Updated logic to use suffix_union + year_number = graph_year + suffix_union + year_text = pynutil.insert("era: \"") + year_number + pynutil.insert("\"") + insert_space + + # Updated logic to use prefix_union + year_prefix = pynutil.insert("era: \"") + prefix_union + insert_space + graph_year + pynutil.insert("\"") + graph_dd_mm_yyyy = ( days_graph + (delete_dash | delete_slash) + months_graph + (delete_dash | delete_slash) + years_graph ) @@ -78,7 +112,20 @@ def __init__(self, cardinal: GraphFst): graph_mm_dd_yyyy += pynutil.insert(" preserve_order: true ") - graph_mm_yyyy = months_graph + delete_dash + years_graph + graph_mm_yyyy = months_graph + delete_dash + insert_space + years_graph + + graph_year_suffix = era_graph + + graph_range = ( + pynutil.insert("era: \"") + + cardinal_graph + + insert_space + + range_graph + + insert_space + + cardinal_graph + + pynutil.insert("\"") + + pynutil.insert(" preserve_order: true ") + ) # default assume dd_mm_yyyy @@ -87,7 +134,12 @@ def __init__(self, cardinal: GraphFst): | graph_mm_dd | pynutil.add_weight(graph_dd_mm_yyyy, -0.001) | graph_mm_dd_yyyy - | graph_mm_yyyy + | pynutil.add_weight(graph_mm_yyyy, -0.2) + | pynutil.add_weight(graph_year_suffix, -0.001) + | pynutil.add_weight(graph_range, -0.005) + | pynutil.add_weight(century_text, -0.001) + | pynutil.add_weight(year_text, -0.001) + | pynutil.add_weight(year_prefix, -0.009) ) self.final_graph = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/measure.py b/nemo_text_processing/text_normalization/hi/taggers/measure.py index 55279f4da..9f1ffbd39 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/measure.py +++ b/nemo_text_processing/text_normalization/hi/taggers/measure.py @@ -19,6 +19,11 @@ from nemo_text_processing.text_normalization.hi.utils import get_abs_path +digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) +teens_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")) +teens_and_ties = pynutil.add_weight(teens_ties, -0.1) + + class MeasureFst(GraphFst): """ Finite state transducer for classifying measure, suppletive aware, e.g. @@ -35,9 +40,20 @@ class MeasureFst(GraphFst): def __init__(self, cardinal: GraphFst, decimal: GraphFst): super().__init__(name="measure", kind="classify") - cardinal_graph = cardinal.final_graph - decimal_graph = decimal.final_graph_wo_negative + cardinal_graph = ( + digit + | teens_and_ties + | cardinal.graph_hundreds + | cardinal.graph_thousands + | cardinal.graph_ten_thousands + | cardinal.graph_lakhs + | cardinal.graph_ten_lakhs + ) + point = pynutil.delete(".") + decimal_integers = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") + decimal_graph = decimal_integers + point + insert_space + decimal.graph_fractional unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv")) + quarterly_units_graph = pynini.string_file(get_abs_path("data/measure/quarterly_units.tsv")) optional_graph_negative = pynini.closure( pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, @@ -45,18 +61,48 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): 1, ) + # Define the quarterly measurements + quarter = pynini.string_map( + [ + (".५", "साढ़े"), + ("१.५", "डेढ़"), + ("२.५", "ढाई"), + ] + ) + quarter_graph = pynutil.insert("integer_part: \"") + quarter + pynutil.insert("\"") + # Define the unit handling - self.unit = pynutil.insert("units: \"") + unit_graph + pynutil.insert("\" ") + unit = pynutil.insert(" units: \"") + unit_graph + pynutil.insert("\" ") + units = pynutil.insert(" units: \"") + quarterly_units_graph + pynutil.insert("\" ") + + # Handling symbols like x, X, * + symbol_graph = pynini.string_map( + [ + ("x", "बाई"), + ("X", "बाई"), + ("*", "बाई"), + ] + ) - graph_measurements = ( + graph_decimal = ( pynutil.insert("decimal { ") + optional_graph_negative + decimal_graph + pynutil.insert(" }") + delete_space - + self.unit + + unit ) - graph_measurements |= ( + + graph_quarter = ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + quarter_graph + + pynutil.insert(" }") + + delete_space + + units + ) + + graph_cardinal = ( pynutil.insert("cardinal { ") + optional_graph_negative + pynutil.insert("integer: \"") @@ -64,10 +110,35 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): + pynutil.insert("\"") + pynutil.insert(" }") + delete_space - + self.unit + + unit ) - graph = graph_measurements + # Handling cardinal clubbed with symbol as single token + graph_exceptions = ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + pynutil.insert("integer: \"") + + cardinal_graph + + pynutil.insert("\"") + + pynutil.insert(" }") + + pynutil.insert(" units: \"") + + symbol_graph + + pynutil.insert("\" ") + + pynutil.insert("} }") + + insert_space + + pynutil.insert("tokens { cardinal { ") + + optional_graph_negative + + pynutil.insert("integer: \"") + + cardinal_graph + + pynutil.insert("\"") + ) + + graph = ( + pynutil.add_weight(graph_decimal, 0.01) + | pynutil.add_weight(graph_quarter, 0.005) + | pynutil.add_weight(graph_cardinal, 0.01) + | pynutil.add_weight(graph_exceptions, 0.01) + ) self.graph = graph.optimize() final_graph = self.add_tokens(graph) diff --git a/nemo_text_processing/text_normalization/hi/taggers/money.py b/nemo_text_processing/text_normalization/hi/taggers/money.py index 7446b77e5..01e46352f 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/money.py +++ b/nemo_text_processing/text_normalization/hi/taggers/money.py @@ -24,8 +24,10 @@ class MoneyFst(GraphFst): """ Finite state transducer for classifying money, suppletive aware, e.g. - ₹1 -> money { currency: "रुपए" integer_part: "एक" } - ₹1.2 -> money { currency: "रुपए" integer_part: "एक" fractional_part: "दो" } + ₹५० -> money { money { currency_maj: "रुपए" integer_part: "पचास" } + ₹५०.५० -> money { currency_maj: "रुपए" integer_part: "पचास" fractional_part: "पचास" currency_min: "centiles" } + ₹०.५० -> money { currency_maj: "रुपए" integer_part: "शून्य" fractional_part: "पचास" currency_min: "centiles" } + Note that the 'centiles' string is a placeholder to handle by the verbalizer by applying the corresponding minor currency denomination Args: cardinal: CardinalFst @@ -34,7 +36,7 @@ class MoneyFst(GraphFst): for False multiple transduction are generated (used for audio-based normalization) """ - def __init__(self, cardinal: GraphFst, decimal: GraphFst): + def __init__(self, cardinal: GraphFst): super().__init__(name="money", kind="classify") cardinal_graph = cardinal.final_graph @@ -44,21 +46,25 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): 0, 1, ) - self.currency = pynutil.insert("currency: \"") + currency_graph + pynutil.insert("\" ") - self.interger = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\" ") - self.fraction = pynutil.insert("fractional_part: \"") + cardinal_graph + pynutil.insert("\" ") + currency_major = pynutil.insert('currency_maj: "') + currency_graph + pynutil.insert('"') + integer = pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"') + fraction = pynutil.insert('fractional_part: "') + cardinal_graph + pynutil.insert('"') + currency_minor = pynutil.insert('currency_min: "') + pynutil.insert("centiles") + pynutil.insert('"') - graph_currencies = optional_graph_negative + self.currency + insert_space + self.interger - graph_currencies |= ( + graph_major_only = optional_graph_negative + currency_major + insert_space + integer + graph_major_and_minor = ( optional_graph_negative - + self.currency + + currency_major + insert_space - + self.interger - + pynutil.delete(".") + + integer + + pynini.cross(".", " ") + + fraction + insert_space - + self.fraction + + currency_minor ) - graph = graph_currencies - self.graph = graph.optimize() + + graph_currencies = graph_major_only | graph_major_and_minor + + graph = graph_currencies.optimize() final_graph = self.add_tokens(graph) self.fst = final_graph diff --git a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py index cc22a99f5..b1bbd2a10 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py @@ -68,11 +68,12 @@ def __init__( os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" far_file = os.path.join( - cache_dir, f"hi_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far" + cache_dir, + f"hi_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far", ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] - logging.info(f'ClassifyFst.fst was restored from {far_file}.') + logging.info(f"ClassifyFst.fst was restored from {far_file}.") else: logging.info(f"Creating ClassifyFst grammars.") @@ -107,7 +108,7 @@ def __init__( logging.debug(f"measure: {time.time() - start_time: .2f}s -- {measure_graph.num_states()} nodes") start_time = time.time() - money = MoneyFst(cardinal=cardinal, decimal=decimal) + money = MoneyFst(cardinal=cardinal) money_graph = money.fst logging.debug(f"money: {time.time() - start_time: .2f}s -- {money_graph.num_states()} nodes") diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/date.py b/nemo_text_processing/text_normalization/hi/verbalizers/date.py index f0af1a2d4..8904f63c8 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/date.py @@ -39,6 +39,8 @@ def __init__(self): year = pynutil.delete("year: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + graph_era = pynutil.delete("era: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + graph_dd_mm = day + NEMO_SPACE + month graph_mm_dd = month + NEMO_SPACE + day @@ -60,7 +62,7 @@ def __init__(self): ) self.graph = ( - (graph_dd_mm | graph_mm_dd | graph_dd_mm_yyyy | graph_mm_dd_yyyy | graph_mm_yyyy) + (graph_dd_mm | graph_mm_dd | graph_dd_mm_yyyy | graph_mm_dd_yyyy | graph_mm_yyyy | graph_era) + delete_space + optional_preserve_order ) diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py b/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py index 39b16b423..7e3b33b7c 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py @@ -39,10 +39,15 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): numerator = pynutil.delete("numerator: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\" ") denominator = pynutil.delete("denominator: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") insert_bata = pynutil.insert(" बटा ") + insert_aur = pynutil.insert(" और ") fraction_default = numerator + insert_bata + denominator - self.graph = optional_sign + pynini.closure(pynini.closure(integer, 0, 1) + insert_space) + fraction_default + self.graph = ( + optional_sign + + pynini.closure(pynini.closure(integer, 0, 1) + insert_space + insert_aur) + + fraction_default + ) graph = self.graph diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/money.py b/nemo_text_processing/text_normalization/hi/verbalizers/money.py index d5cab33d8..048140295 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/money.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/money.py @@ -15,14 +15,26 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space +major_minor_currencies = { + "रुपए": "पैसे", + "पाउंड": "पेंस", + "वॉन": "जिओन", + "डॉलर": "सेंट", + "लीरा": "कुरस", + "टका": "पैसे", + "येन": "सेन", + "नाइरा": "कोबो", + "यूरो": "सेंट", +} +from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst class MoneyFst(GraphFst): """ Finite state transducer for verbalizing money, e.g. - money { integer_part: "बारह" currency: "रुपए" } -> बारह रुपए - money { integer_part: "बारह" currency: "रुपए" fractional_part: "पचास" currency: "पैसे" } -> बारह रुपए पचास पैसे + money { integer_part: "बारह" currency_maj: "रुपए" } -> बारह रुपए + money { integer_part: "बारह" currency_maj: "रुपए" fractional_part: "पचास" currency_min: "centiles" } -> बारह रुपए पचास पैसे + money { currency_maj: "रुपए" integer_part: "शून्य" fractional_part: "पचास" currency_min: "centiles" } -> पचास पैसे Args: cardinal: CardinalFst @@ -31,33 +43,58 @@ class MoneyFst(GraphFst): for False multiple transduction are generated (used for audio-based normalization) """ - def __init__(self, cardinal: GraphFst, decimal: GraphFst): + def __init__(self): super().__init__(name="money", kind="verbalize") - insert_paise = pynutil.insert("पैसे") + currency_major = pynutil.delete('currency_maj: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') - currency = ( - pynutil.delete('currency: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('" ') + insert_space - ) - - integer_part = ( - pynutil.delete('integer_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('" ') + insert_space - ) + integer_part = pynutil.delete('integer_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') fractional_part = ( - pynutil.delete('fractional_part: "') - + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.delete('" ') - + insert_space + pynutil.delete('fractional_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') ) - graph_integer = integer_part + delete_space + currency + # Handles major denominations only + graph_major_only = integer_part + pynini.accep(NEMO_SPACE) + currency_major - graph_interger_fraction = ( - integer_part + delete_space + currency + delete_space + fractional_part + delete_space + insert_paise - ) + # Handles both major and minor denominations + major_minor_graphs = [] + + # Handles minor denominations only + minor_graphs = [] + + # Logic for handling minor denominations + for major, minor in major_minor_currencies.items(): + graph_major = pynutil.delete('currency_maj: "') + pynini.accep(major) + pynutil.delete('"') + graph_minor = pynutil.delete('currency_min: "') + pynini.cross("centiles", minor) + pynutil.delete('"') + graph_major_minor_partial = ( + integer_part + + pynini.accep(NEMO_SPACE) + + graph_major + + pynini.accep(NEMO_SPACE) + + fractional_part + + pynini.accep(NEMO_SPACE) + + graph_minor + ) + major_minor_graphs.append(graph_major_minor_partial) + + graph_minor_partial = ( + pynutil.delete('integer_part: "शून्य"') + + pynutil.delete(NEMO_SPACE) + + pynutil.delete('currency_maj: "') + + pynutil.delete(major) + + pynutil.delete('"') + + pynutil.delete(NEMO_SPACE) + + fractional_part + + pynini.accep(NEMO_SPACE) + + graph_minor + ) + minor_graphs.append(graph_minor_partial) + + graph_major_minor = pynini.union(*major_minor_graphs) + graph_minor_only = pynini.union(*minor_graphs) - graph = graph_integer | graph_interger_fraction + graph = graph_major_only | graph_major_minor | pynutil.add_weight(graph_minor_only, -0.1) delete_tokens = self.delete_tokens(graph) self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py index ca06fc9c3..e91f0d9f6 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py @@ -20,8 +20,7 @@ from nemo_text_processing.text_normalization.hi.verbalizers.measure import MeasureFst from nemo_text_processing.text_normalization.hi.verbalizers.money import MoneyFst from nemo_text_processing.text_normalization.hi.verbalizers.time import TimeFst - -# from nemo_text_processing.text_normalization.hi.verbalizers.whitelist import WhiteListFst +from nemo_text_processing.text_normalization.hi.verbalizers.whitelist import WhiteListFst class VerbalizeFst(GraphFst): @@ -56,11 +55,20 @@ def __init__(self, deterministic: bool = True): measure = MeasureFst(cardinal=cardinal, decimal=decimal) measure_graph = measure.fst - money = MoneyFst(cardinal=cardinal, decimal=decimal) + money = MoneyFst() money_graph = money.fst - # whitelist_graph = WhiteListFst(deterministic=deterministic).fst - - graph = cardinal_graph | decimal_graph | fraction_graph | date_graph | time_graph | measure_graph | money_graph + whitelist_graph = WhiteListFst(deterministic=deterministic).fst + + graph = ( + cardinal_graph + | decimal_graph + | fraction_graph + | date_graph + | time_graph + | measure_graph + | money_graph + | whitelist_graph + ) self.fst = graph diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/whitelist.py b/nemo_text_processing/text_normalization/hi/verbalizers/whitelist.py index 58dbc9583..d846dfa58 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/whitelist.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/whitelist.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + + import pynini from pynini.lib import pynutil diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt index d92a53852..a4b3caf07 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt @@ -17,3 +17,18 @@ ११-२०२४~नवंबर दो हज़ार चौबीस २०७०~दो हज़ार सत्तर २०२४~दो हज़ार चौबीस +१२० ई. पू.~एक सौ बीस ईसा पूर्व +२९७-२७२ ई. पू.~दो सौ सत्तानबे से दो सौ बहत्तर ईसा पूर्व +३२७वीं सदी~तीन सौ सत्ताईसवीं सदी +१८वीं शताब्दी~अठारहवीं शताब्दी +१९वीं दशक~उन्नीसवीं दशक +१९९९ में~उन्नीस सौ निन्यानबे में +१९९० का~उन्नीस सौ नब्बे का +१९९२ की~उन्नीस सौ बानबे की +१९६० के अभिनेता है~उन्नीस सौ साठ के अभिनेता है +१७८८ से~सत्रह सौ अट्ठासी से +१९५४ तक~उन्नीस सौ चौवन तक +सन १९९९~सन उन्नीस सौ निन्यानबे +सन् १९२०~सन् उन्नीस सौ बीस +साल १९७१~साल उन्नीस सौ इकहत्तर +१९२०-२६ तक~उन्नीस सौ बीस से छब्बीस तक \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt index 25c18b777..d1473412e 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt @@ -1,5 +1,5 @@ ९९/९९~निन्यानबे बटा निन्यानबे -२२ ३१/१७~बाईस इकतीस बटा सत्रह +२२ ३१/१७~बाईस और इकतीस बटा सत्रह ९७/०~सत्तानबे बटा शून्य २५६३/४१२~दो हज़ार पाँच सौ तिरेसठ बटा चार सौ बारह ७२८६०/७०~बहत्तर हज़ार आठ सौ साठ बटा सत्तर @@ -19,3 +19,5 @@ १०००००००००००००/३~एक नील बटा तीन १०००००००००००००००/८~एक पद्म बटा आठ १०००००००००००००००००/४१२~एक शंख बटा चार सौ बारह +२ २/७~दो और दो बटा सात +१२० ७५/९०~एक सौ बीस और पचहत्तर बटा नब्बे \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt index 453369f82..86a824f72 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt @@ -60,3 +60,7 @@ ९९.५ oz~निन्यानबे दशमलव पाँच आउन्स ८५ q~पचासी क्विंटल ८५.९९ q~पचासी दशमलव नौ नौ क्विंटल +२००x१० के गद्दे~दो सौ बाई दस के गद्दे +५x५ का सोफ़ा~पाँच बाई पाँच का सोफ़ा +२x२ रुबिक्स क्यूब~दो बाई दो रुबिक्स क्यूब +१३x१३ का घर~तेरह बाई तेरह का घर diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt index c7b32628b..b576dac38 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt @@ -97,4 +97,22 @@ $२८२१~दो हज़ार आठ सौ इक्कीस डॉल ₹५४५~पाँच सौ पैंतालीस रुपए ₹१८४५~एक हज़ार आठ सौ पैंतालीस रुपए ₹३७२~तीन सौ बहत्तर रुपए -$९८~अट्ठानबे डॉलर \ No newline at end of file +$९८~अट्ठानबे डॉलर +₹१२३.५७~एक सौ तेईस रुपए सत्तावन पैसे +₹९९९.५०~नौ सौ निन्यानबे रुपए पचास पैसे +£१५०.२९~एक सौ पचास पाउंड उनतीस पेंस +£८०.३१~अस्सी पाउंड इकतीस पेंस +₩२३४५.१०~दो हज़ार तीन सौ पैंतालीस वॉन दस जिओन +₩१००.२५~एक सौ वॉन पच्चीस जिओन +$१२५.७०~एक सौ पच्चीस डॉलर सत्तर सेंट +$९.९९~नौ डॉलर निन्यानबे सेंट +₺८०.३६~अस्सी लीरा छत्तीस कुरस +₺१२३४.७८~एक हज़ार दो सौ चौंतीस लीरा अठहत्तर कुरस +৳१००.४२~एक सौ टका बयालीस पैसे +৳३०२५.८७~तीन हज़ार पच्चीस टका सत्तासी पैसे +¥१००.४८~एक सौ येन अड़तालीस सेन +¥७७७.२३~सात सौ सतहत्तर येन तेईस सेन +₦८७६.५३~आठ सौ छिहत्तर नाइरा तिरेपन कोबो +₦१०.२७~दस नाइरा सत्ताईस कोबो +€२००.९०~दो सौ यूरो नब्बे सेंट +€१२३४.७५~एक हज़ार दो सौ चौंतीस यूरो पचहत्तर सेंट