Skip to content

Commit c5be241

Browse files
Vietnamese TN - Fraction (#296)
* Fraction class for Vietnamese TN Signed-off-by: folivoramanh <palasek182@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Remove irrelevant test case Signed-off-by: folivoramanh <palasek182@gmail.com> * Remove irrelevant test case Signed-off-by: folivoramanh <palasek182@gmail.com> --------- Signed-off-by: folivoramanh <palasek182@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent c6e2713 commit c5be241

File tree

9 files changed

+195
-20
lines changed

9 files changed

+195
-20
lines changed
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
4
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pynini
16+
from pynini.lib import pynutil
17+
18+
from nemo_text_processing.text_normalization.en.graph_utils import GraphFst
19+
from nemo_text_processing.text_normalization.vi.taggers.cardinal import CardinalFst
20+
from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels
21+
22+
23+
class FractionFst(GraphFst):
    """
    Finite state transducer for classifying Vietnamese fraction numbers, e.g.
        23 1/5 -> fraction { integer_part: "hai mươi ba" numerator: "một" denominator: "năm" }
        3/9 -> fraction { numerator: "ba" denominator: "chín" }
        1/4 -> fraction { numerator: "một" denominator: "tư" }

    A leading "-" is accepted and tagged as negative: "true". A mixed fraction is
    an integer, exactly one space, then "numerator/denominator".

    Args:
        cardinal: CardinalFst for converting numbers to Vietnamese words
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
    """

    def __init__(self, cardinal: CardinalFst, deterministic: bool = True):
        super().__init__(name="fraction", kind="classify", deterministic=deterministic)

        cardinal_graph = cardinal.graph
        digit = pynini.union(*[str(i) for i in range(10)])
        number = pynini.closure(digit, 1)

        # Each row maps a written denominator (column 0) to its special spoken form (column 1).
        denominator_exceptions = {
            row[0]: row[1] for row in load_labels(get_abs_path("data/fraction/denominator_exceptions.tsv"))
        }

        denominator_graph = cardinal_graph
        if denominator_exceptions:
            denominator_exception_graph = pynini.union(
                *[pynini.cross(written, spoken) for written, spoken in denominator_exceptions.items()]
            )
            # If the cardinal graph also accepts an exception input, an unweighted union
            # leaves two equally cheap readings and the deterministic result would depend
            # on shortest-path tie-breaking. A small penalty on the cardinal branch makes
            # the exception win while keeping the cardinal reading as an alternative.
            cardinal_fallback_weight = 0.01
            denominator_graph = pynini.union(
                denominator_exception_graph, pynutil.add_weight(cardinal_graph, cardinal_fallback_weight)
            )

        numerator = (
            pynutil.insert("numerator: \"") + (number @ cardinal_graph) + pynutil.insert("\" ") + pynutil.delete("/")
        )
        denominator = pynutil.insert("denominator: \"") + (number @ denominator_graph) + pynutil.insert("\"")
        integer_part = pynutil.insert("integer_part: \"") + (number @ cardinal_graph) + pynutil.insert("\" ")

        simple_fraction = numerator + denominator
        # The single space between the integer and the fraction is consumed here.
        mixed_fraction = integer_part + pynutil.delete(" ") + numerator + denominator
        optional_graph_negative = (pynutil.insert("negative: ") + pynini.cross("-", "\"true\" ")).ques

        self.fst = self.add_tokens(optional_graph_negative + (simple_fraction | mixed_fraction)).optimize()

nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
)
2727
from nemo_text_processing.text_normalization.vi.taggers.cardinal import CardinalFst
2828
from nemo_text_processing.text_normalization.vi.taggers.decimal import DecimalFst
29+
from nemo_text_processing.text_normalization.vi.taggers.fraction import FractionFst
2930
from nemo_text_processing.text_normalization.vi.taggers.ordinal import OrdinalFst
3031
from nemo_text_processing.text_normalization.vi.taggers.punctuation import PunctuationFst
3132
from nemo_text_processing.text_normalization.vi.taggers.whitelist import WhiteListFst
@@ -86,11 +87,17 @@ def __init__(
8687
decimal_graph = decimal.fst
8788
logger.debug(f"decimal: {time.time() - start_time: .2f}s -- {decimal_graph.num_states()} nodes")
8889

90+
start_time = time.time()
91+
fraction = FractionFst(cardinal=cardinal, deterministic=deterministic)
92+
fraction_graph = fraction.fst
93+
logger.debug(f"fraction: {time.time() - start_time: .2f}s -- {fraction_graph.num_states()} nodes")
94+
8995
classify = (
9096
pynutil.add_weight(whitelist_graph, 0.8)
9197
| pynutil.add_weight(ordinal_graph, 0.81)
9298
| pynutil.add_weight(decimal_graph, 0.85)
9399
| pynutil.add_weight(cardinal_graph, 0.9)
100+
| pynutil.add_weight(fraction_graph, 1.0)
94101
| pynutil.add_weight(word_graph, 100)
95102
)
96103
punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }")
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pynini
16+
from pynini.lib import pynutil
17+
18+
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space
19+
20+
21+
class FractionFst(GraphFst):
    """
    Finite state transducer for verbalizing Vietnamese fraction numbers, e.g.
        fraction { negative: "true" integer_part: "hai mươi ba" numerator: "một" denominator: "năm" } -> âm hai mươi ba và một phần năm
        fraction { numerator: "ba" denominator: "chín" } -> ba phần chín
        fraction { integer_part: "một trăm" numerator: "hai" denominator: "ba" } -> một trăm và hai phần ba

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="fraction", kind="verbalize", deterministic=deterministic)

        # Sign: "âm" always; "trừ" is offered as an extra reading in non-deterministic mode.
        sign = pynini.cross("negative: \"true\"", "âm ")
        if not deterministic:
            sign = sign | pynini.cross("negative: \"true\"", "trừ ")
        sign = pynini.closure(sign + delete_space, 0, 1)

        # A quoted field value: drop the surrounding quotes, keep the text between them.
        quoted_value = (
            delete_space + pynutil.delete("\"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"")
        )

        def strip_field(tag: str):
            """Remove the field tag and return the unquoted value that follows it."""
            return pynutil.delete(tag) + quoted_value

        whole = strip_field("integer_part:")
        top = strip_field("numerator:")
        bottom = strip_field("denominator:")

        # "<numerator> phần <denominator>", optionally preceded by "<integer> và ".
        plain = top + delete_space + pynutil.insert(" phần ") + bottom
        with_integer = whole + delete_space + pynutil.insert(" và ") + plain

        self.fst = self.delete_tokens(sign + (plain | with_integer)).optimize()

nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst
1717
from nemo_text_processing.text_normalization.vi.verbalizers.cardinal import CardinalFst
1818
from nemo_text_processing.text_normalization.vi.verbalizers.decimal import DecimalFst
19+
from nemo_text_processing.text_normalization.vi.verbalizers.fraction import FractionFst
1920
from nemo_text_processing.text_normalization.vi.verbalizers.ordinal import OrdinalFst
2021
from nemo_text_processing.text_normalization.vi.verbalizers.whitelist import WhiteListFst
2122

@@ -40,7 +41,10 @@ def __init__(self, deterministic: bool = True):
4041
decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic)
4142
decimal_graph = decimal.fst
4243

44+
fraction = FractionFst(deterministic=deterministic)
45+
fraction_graph = fraction.fst
46+
4347
# Combine all verbalizers
44-
graph = cardinal_graph | whitelist_graph | word_graph | ordinal_graph | decimal_graph
48+
graph = cardinal_graph | whitelist_graph | word_graph | ordinal_graph | decimal_graph | fraction_graph
4549

4650
self.fst = graph
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
1/2~một phần hai
2+
4/9~bốn phần chín
3+
9/4~chín phần tư
4+
1/4~một phần tư
5+
3/4~ba phần tư
6+
15/5~mười lăm phần năm
7+
1/3~một phần ba
8+
2/10~hai phần mười
9+
23 1/5~hai mươi ba và một phần năm
10+
-3/4~âm ba phần tư
11+
-12 1/4 nha~âm mười hai và một phần tư nha
12+
-5 2/3~âm năm và hai phần ba
13+
5 1/2~năm và một phần hai

tests/nemo_text_processing/vi/test_fraction.py

Lines changed: 32 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,32 +12,49 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
15+
# pytest tests/nemo_text_processing/vi/test_fraction.py --cpu --cache-clear
1616
import pytest
1717
from parameterized import parameterized
1818

19-
from ..utils import CACHE_DIR, parse_test_case_file
20-
21-
try:
22-
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
19+
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
20+
from nemo_text_processing.text_normalization.normalize import Normalizer
21+
from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio
2322

24-
PYNINI_AVAILABLE = True
25-
except (ImportError, ModuleNotFoundError):
26-
PYNINI_AVAILABLE = False
23+
from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file
2724

2825

2926
class TestFraction:
30-
inverse_normalizer = (
31-
InverseNormalizer(lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False) if PYNINI_AVAILABLE else None
32-
)
27+
28+
inverse_normalizer = InverseNormalizer(lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False)
3329

3430
@parameterized.expand(parse_test_case_file('vi/data_inverse_text_normalization/test_cases_fraction.txt'))
35-
@pytest.mark.skipif(
36-
not PYNINI_AVAILABLE,
37-
reason="`pynini` not installed, please install via nemo_text_processing/pynini_install.sh",
38-
)
3931
@pytest.mark.run_only_on('CPU')
4032
@pytest.mark.unit
4133
def test_denorm(self, test_input, expected):
4234
pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
4335
assert pred == expected
36+
37+
normalizer = Normalizer(
38+
input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True
39+
)
40+
41+
normalizer_with_audio = (
42+
NormalizerWithAudio(input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False)
43+
if CACHE_DIR and RUN_AUDIO_BASED_TESTS
44+
else None
45+
)
46+
47+
@parameterized.expand(parse_test_case_file('vi/data_text_normalization/test_cases_fraction.txt'))
48+
@pytest.mark.run_only_on('CPU')
49+
@pytest.mark.unit
50+
def test_norm(self, test_input, expected):
51+
pred = self.normalizer.normalize(test_input, verbose=False, punct_post_process=False)
52+
assert pred == expected, f"input: {test_input}"
53+
54+
if self.normalizer_with_audio:
55+
pred_non_deterministic = self.normalizer_with_audio.normalize(
56+
test_input,
57+
n_tagged=30,
58+
punct_post_process=False,
59+
)
60+
assert expected in pred_non_deterministic, f"input: {test_input}"

tests/nemo_text_processing/vi/test_sparrowhawk_normalization.sh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,10 @@ testTNOrdinal() {
4343
runtest $input
4444
}
4545

46-
# testTNFraction() {
47-
# input=$PROJECT_DIR/vi/data_text_normalization/test_cases_fraction.txt
48-
# runtest $input
49-
# }
46+
testTNFraction() {
47+
input=$PROJECT_DIR/vi/data_text_normalization/test_cases_fraction.txt
48+
runtest $input
49+
}
5050

5151
# testTNTime() {
5252
# input=$PROJECT_DIR/vi/data_text_normalization/test_cases_time.txt

0 commit comments

Comments
 (0)