diff --git a/Jenkinsfile b/Jenkinsfile index 82a0a4799..c94c107c6 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -16,7 +16,7 @@ pipeline { EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/09-04-24-0' ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/09-25-24-0' ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-30-24-0' - FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-04-24-0' + FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-07-25-0' HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0' PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' diff --git a/nemo_text_processing/text_normalization/fr/data/dates/__init__.py b/nemo_text_processing/text_normalization/fr/data/dates/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/text_normalization/fr/data/dates/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/text_normalization/fr/data/dates/eras.tsv b/nemo_text_processing/text_normalization/fr/data/dates/eras.tsv new file mode 100644 index 000000000..6127bea93 --- /dev/null +++ b/nemo_text_processing/text_normalization/fr/data/dates/eras.tsv @@ -0,0 +1,8 @@ +20s twenties +30s thirties +40s forties +50s fifties +60s sixties +70s seventies +80s eighties +90s nineties \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/fr/data/dates/months.tsv b/nemo_text_processing/text_normalization/fr/data/dates/months.tsv new file mode 100644 index 000000000..98a4e7d5d --- /dev/null +++ b/nemo_text_processing/text_normalization/fr/data/dates/months.tsv @@ -0,0 +1,12 @@ +1 janvier +2 février +3 mars +4 avril +5 mai +6 juin +7 juillet +8 août +9 septembre +10 octobre +11 novembre +12 décembre \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/fr/taggers/date.py b/nemo_text_processing/text_normalization/fr/taggers/date.py new file mode 100644 index 000000000..91e83c40c --- /dev/null +++ b/nemo_text_processing/text_normalization/fr/taggers/date.py @@ -0,0 +1,97 @@ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst +from nemo_text_processing.text_normalization.fr.utils import get_abs_path + +# TODO: add articles? 'le...' 
+
+month_numbers = pynini.string_file(get_abs_path("data/dates/months.tsv"))
+eras = pynini.string_file(get_abs_path("data/dates/eras.tsv"))
+delete_leading_zero = (
+    pynutil.delete("0") | (NEMO_DIGIT - "0")
+) + NEMO_DIGIT  # reminder, NEMO_DIGIT = filter on digits
+
+
+class DateFst(GraphFst):
+    ''' Finite state transducer for classifying dates, e.g.:
+        '02.03.2003' -> date {day: 'deux' month: 'mars' year: 'deux mille trois' preserve_order: true}
+    '''
+
+    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
+        super().__init__(name="dates", kind="classify")
+
+        cardinal_graph = cardinal.all_nums_no_tokens
+
+        # 'le' -> 'le', 'les' -> 'les'
+        le_determiner = pynini.accep("le ") | pynini.accep("les ")
+        self.optional_le = pynini.closure(le_determiner, 0, 1)
+
+        # '01' -> 'un'
+        optional_leading_zero = delete_leading_zero | NEMO_DIGIT
+        valid_day_number = pynini.union(*[str(x) for x in range(1, 32)])
+        premier = pynini.string_map([("1", "premier")])
+        day_number_to_word = premier | cardinal_graph
+
+        digit_to_day = self.optional_le + optional_leading_zero @ valid_day_number @ day_number_to_word
+        self.day_graph = pynutil.insert("day: \"") + digit_to_day + pynutil.insert("\"")
+
+        # '03' -> 'mars'
+        normalize_month_number = optional_leading_zero @ pynini.union(*[str(x) for x in range(1, 13)])
+        number_to_month = month_numbers.optimize()
+        month_graph = normalize_month_number @ number_to_month
+        self.month_graph = pynutil.insert("month: \"") + month_graph + pynutil.insert("\"")
+
+        # 2025 -> deux mille vingt cinq
+        accept_year_digits = (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 1, 3)
+        digits_to_year = accept_year_digits @ cardinal_graph
+        self.year_graph = pynutil.insert("year: \"") + digits_to_year + pynutil.insert("\"")
+
+        # Putting it all together
+        self.fst = pynini.accep("")
+
+        for separator in ["/", ".", "-"]:
+            self.fst |= (
+                pynutil.insert("date { ")
+                + self.day_graph
+                + pynutil.delete(separator)
+                + pynutil.insert(" ")
+                +
self.month_graph + + pynini.closure(pynutil.delete(separator) + pynutil.insert(" ") + self.year_graph, 0, 1) + + pynutil.insert(" preserve_order: true }") + ) + + # Accepts "janvier", "février", etc + month_name_graph = pynutil.insert("month: \"") + month_numbers.project("output") + pynutil.insert("\"") + + self.fst |= ( + pynutil.insert("date { ") + + self.day_graph + + pynini.accep(" ") + + month_name_graph + + pynini.closure(pynini.accep(" ") + self.year_graph, 0, 1) + + pynutil.insert(" preserve_order: true}") + ) + + # Accepts "70s", "80s", etc + self.fst |= pynutil.insert("date { year: \"") + eras + pynutil.insert("\" preserve_order: true }") + + # Accepts date ranges, "17-18-19 juin" -> date { day: "17" day: "18": day: "19"} + for separator in ["-", "/"]: + day_range_graph = ( + pynutil.insert("day: \"") + + pynini.closure(digit_to_day + pynutil.delete(separator) + pynutil.insert(" "), 1) + + digit_to_day + + pynutil.insert("\"") + ) + + self.fst |= ( + pynutil.insert("date { ") + + day_range_graph + + pynini.accep(" ") + + month_name_graph + + pynini.closure(pynini.accep(" ") + self.year_graph, 0, 1) + + pynutil.insert(" preserve_order: true }") + ) + + self.fst = self.fst.optimize() diff --git a/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py index de9a0b047..cacc94bcf 100644 --- a/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py @@ -26,6 +26,7 @@ ) from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst from nemo_text_processing.text_normalization.fr.taggers.cardinal import CardinalFst +from nemo_text_processing.text_normalization.fr.taggers.date import DateFst from nemo_text_processing.text_normalization.fr.taggers.decimals import DecimalFst from nemo_text_processing.text_normalization.fr.taggers.fraction 
import FractionFst from nemo_text_processing.text_normalization.fr.taggers.ordinal import OrdinalFst @@ -86,8 +87,12 @@ def __init__( whitelist_graph = self.whitelist.fst punct_graph = PunctuationFst(deterministic=deterministic).fst + self.date = DateFst(self.cardinal, deterministic=deterministic) + date_graph = self.date.fst + classify = ( pynutil.add_weight(whitelist_graph, 1.01) + | pynutil.add_weight(date_graph, 1.1) | pynutil.add_weight(cardinal_graph, 1.1) | pynutil.add_weight(fraction_graph, 1.09) | pynutil.add_weight(ordinal_graph, 1.1) diff --git a/nemo_text_processing/text_normalization/fr/verbalizers/date.py b/nemo_text_processing/text_normalization/fr/verbalizers/date.py new file mode 100644 index 000000000..8c8c1aa21 --- /dev/null +++ b/nemo_text_processing/text_normalization/fr/verbalizers/date.py @@ -0,0 +1,51 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_NOT_QUOTE, + NEMO_SPACE, + GraphFst, + delete_preserve_order, +) + + +class DateFst(GraphFst): + """ + Finite state transducer for verbalizing date, e.g. 
+    date {day: "deux" month: "mars" year: "deux mille trois" preserve_order: true} -> deux mars deux mille trois
+    Args:
+        deterministic: if True will provide a single transduction option,
+            for False multiple transductions are generated
+            (used for audio-based normalization)
+    """
+
+    def __init__(self, deterministic: bool = True):
+        super().__init__(name="date", kind="verbalize", deterministic=deterministic)
+
+        day = pynutil.delete("day: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
+        month = pynutil.delete("month: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
+        year = pynutil.delete("year: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
+        decade = pynutil.delete("year: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
+
+        graph_dmy = day + NEMO_SPACE + month + pynini.closure(NEMO_SPACE + year, 0, 1) + delete_preserve_order
+        graph_my = month + NEMO_SPACE + year + delete_preserve_order
+        graph_decade = decade + delete_preserve_order
+
+        self.graph = graph_dmy | graph_my | graph_decade
+
+        delete_tokens = self.delete_tokens(self.graph)
+        self.fst = delete_tokens.optimize()
diff --git a/nemo_text_processing/text_normalization/fr/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/fr/verbalizers/verbalize.py
index 02510ea5f..3ea0117af 100644
--- a/nemo_text_processing/text_normalization/fr/verbalizers/verbalize.py
+++ b/nemo_text_processing/text_normalization/fr/verbalizers/verbalize.py
@@ -14,6 +14,7 @@
 from nemo_text_processing.text_normalization.en.graph_utils import GraphFst
 from nemo_text_processing.text_normalization.en.verbalizers.whitelist import WhiteListFst
 from nemo_text_processing.text_normalization.fr.verbalizers.cardinal import CardinalFst
+from nemo_text_processing.text_normalization.fr.verbalizers.date import DateFst
 from nemo_text_processing.text_normalization.fr.verbalizers.decimals import DecimalFst
 from
nemo_text_processing.text_normalization.fr.verbalizers.fraction import FractionFst from nemo_text_processing.text_normalization.fr.verbalizers.ordinal import OrdinalFst @@ -40,6 +41,8 @@ def __init__(self, deterministic: bool = True): fraction = FractionFst(ordinal=ordinal, deterministic=deterministic) fraction_graph = fraction.fst whitelist_graph = WhiteListFst(deterministic=deterministic).fst + date = DateFst(deterministic=deterministic) + date_graph = date.fst - graph = cardinal_graph | decimal_graph | ordinal_graph | fraction_graph | whitelist_graph + graph = cardinal_graph | decimal_graph | ordinal_graph | fraction_graph | whitelist_graph | date_graph self.fst = graph diff --git a/tests/nemo_text_processing/fr/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/fr/data_text_normalization/test_cases_date.txt new file mode 100644 index 000000000..3b4f09154 --- /dev/null +++ b/tests/nemo_text_processing/fr/data_text_normalization/test_cases_date.txt @@ -0,0 +1,13 @@ +02.03.2003~deux mars deux mille trois +02/03/2003~deux mars deux mille trois +02-03-2003~deux mars deux mille trois +le 02.03.2003~le deux mars deux mille trois +17.06~dix-sept juin +17 janvier~dix-sept janvier +10 mars 2023~dix mars deux mille vingt-trois +le 10 mars 2023~le dix mars deux mille vingt-trois +les 80s~les eighties +les 17/18 juin~les dix-sept dix-huit juin +les 17/18/19 mars~les dix-sept dix-huit dix-neuf mars +les 17-18-19 juin~les dix-sept dix-huit dix-neuf juin +les 17-18-19 juin 2025~les dix-sept dix-huit dix-neuf juin deux mille vingt-cinq \ No newline at end of file diff --git a/tests/nemo_text_processing/fr/test_date.py b/tests/nemo_text_processing/fr/test_date.py index 614ed0e24..35e3086cd 100644 --- a/tests/nemo_text_processing/fr/test_date.py +++ b/tests/nemo_text_processing/fr/test_date.py @@ -16,6 +16,7 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from 
nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file @@ -29,3 +30,12 @@ class TestDate: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + normalizer = Normalizer(input_case='cased', lang='fr', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('fr/data_text_normalization/test_cases_date.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/fr/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/fr/test_sparrowhawk_normalization.sh index 009032118..71f3f4759 100644 --- a/tests/nemo_text_processing/fr/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/fr/test_sparrowhawk_normalization.sh @@ -27,6 +27,11 @@ testTNCardinal() { runtest $input } +testTNDate() { + input=$PROJECT_DIR/fr/data_text_normalization/test_cases_date.txt + runtest $input +} + testTNDecimal() { input=$PROJECT_DIR/fr/data_text_normalization/test_cases_decimal.txt runtest $input