Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Comment thread
mayuris-00 marked this conversation as resolved.
Empty file.
Comment thread
mayuris-00 marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
% प्रतिशत
% परसेंट
% फ़ीसदी
% फीसदी
Comment thread
mayuris-00 marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.hi.graph_utils import (
INPUT_LOWER_CASED,
GraphFst,
delete_space,
)
from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path


class PercentageFst(GraphFst):
    """Classifier for spoken percentages: a cardinal followed by a percent word.

    The percent words come from ``data/percentage/percent_symbol.tsv``; the
    mapping loaded from that file is inverted so that the spoken word is the
    input side and the symbol is the output side. The accepted input is
    rewritten into two fields, ``integer: "..."`` and ``percent: "..."``,
    and then passed through ``add_tokens``.

    Args:
        cardinal: cardinal tagger; only its ``graph_no_exception`` attribute
            is read here.
        input_case: accepted in the signature but not referenced inside this
            class.
    """

    def __init__(self, cardinal, input_case: str = INPUT_LOWER_CASED):
        super().__init__(name="percentage", kind="classify")

        # The TSV is inverted after loading so the lookup runs word -> symbol.
        symbol_path = get_abs_path("data/percentage/percent_symbol.tsv")
        word_to_symbol = pynini.string_file(symbol_path).invert()

        # Number field: the cardinal graph's output wrapped in quotes.
        number_field = (
            pynutil.insert('integer: "')
            + cardinal.graph_no_exception
            + pynutil.insert('"')
        )

        # Percent field: the symbol produced for the matched percent word.
        symbol_field = (
            pynutil.insert(' percent: "')
            + word_to_symbol
            + pynutil.insert('"')
        )

        # <number> <percent word>, with delete_space applied between them,
        # wrapped by add_tokens under this grammar's name.
        tagged = self.add_tokens(number_field + delete_space + symbol_field)

        self.fst = tagged.optimize()
Comment thread
mayuris-00 marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from nemo_text_processing.inverse_text_normalization.hi.taggers.measure import MeasureFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.money import MoneyFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.ordinal import OrdinalFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.percentage import PercentageFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.punctuation import PunctuationFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.telephone import TelephoneFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.time import TimeFst
Expand Down Expand Up @@ -79,6 +80,8 @@ def __init__(
decimal_graph = decimal.fst
fraction = FractionFst(cardinal)
fraction_graph = fraction.fst
percentage = PercentageFst(cardinal)
percentage_graph = percentage.fst
date = DateFst(cardinal)
date_graph = date.fst
time = TimeFst()
Expand All @@ -98,6 +101,7 @@ def __init__(
| pynutil.add_weight(ordinal_graph, 1.1)
| pynutil.add_weight(decimal_graph, 1.1)
| pynutil.add_weight(fraction_graph, 1.1)
| pynutil.add_weight(percentage_graph, 1.1)
| pynutil.add_weight(date_graph, 1.1)
| pynutil.add_weight(time_graph, 1.1)
| pynutil.add_weight(measure_graph, 1.1)
Expand All @@ -120,4 +124,4 @@ def __init__(

if far_file:
generator_main(far_file, {"tokenize_and_classify": self.fst})
logging.info(f"ClassifyFst grammars are saved to {far_file}.")
logging.info(f"ClassifyFst grammars are saved to {far_file}.")
Comment thread
mayuris-00 marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.hi.graph_utils import (
NEMO_NOT_QUOTE,
GraphFst,
delete_space,
)


class PercentageFst(GraphFst):
    """Verbalizer for percentage tokens.

    Consumes an ``integer: "..."`` field followed by a ``percent: "..."``
    field, drops the field labels and the surrounding quotes, and emits the
    two quoted values back to back. The result is passed through
    ``delete_tokens`` before being stored on ``self.fst``.
    """

    def __init__(self):
        super().__init__(name="percentage", kind="verbalize")

        # A quoted, non-empty value: both quotes are deleted, the content
        # between them is kept as-is.
        quote = pynutil.delete("\"")
        quoted_value = quote + pynini.closure(NEMO_NOT_QUOTE, 1) + quote

        # integer: "<value>"  ->  <value>
        number = pynutil.delete("integer:") + delete_space + quoted_value

        # percent: "<value>"  ->  <value>
        symbol = pynutil.delete("percent:") + delete_space + quoted_value

        # The two fields are joined with delete_space between them, then the
        # outer token wrapper is removed.
        unwrapped = self.delete_tokens(number + delete_space + symbol)

        self.fst = unwrapped.optimize()
Comment thread
mayuris-00 marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from nemo_text_processing.inverse_text_normalization.hi.verbalizers.measure import MeasureFst
from nemo_text_processing.inverse_text_normalization.hi.verbalizers.money import MoneyFst
from nemo_text_processing.inverse_text_normalization.hi.verbalizers.ordinal import OrdinalFst
from nemo_text_processing.inverse_text_normalization.hi.verbalizers.percentage import PercentageFst
from nemo_text_processing.inverse_text_normalization.hi.verbalizers.telephone import TelephoneFst
from nemo_text_processing.inverse_text_normalization.hi.verbalizers.time import TimeFst
from nemo_text_processing.inverse_text_normalization.hi.verbalizers.whitelist import WhiteListFst
Expand All @@ -38,15 +39,16 @@ def __init__(self):
super().__init__(name="verbalize", kind="verbalize")
cardinal = CardinalFst()
cardinal_graph = cardinal.fst
ordinal_graph = OrdinalFst().fst
decimal = DecimalFst()
ordinal_graph = OrdinalFst().fst # takes nothing
decimal = DecimalFst() # takes nothing
decimal_graph = decimal.fst
fraction_graph = FractionFst().fst
date_graph = DateFst().fst
time_graph = TimeFst().fst
measure_graph = MeasureFst(cardinal, decimal).fst
money_graph = MoneyFst(cardinal, decimal).fst
telephone_graph = TelephoneFst(cardinal).fst
fraction_graph = FractionFst().fst # takes nothing
percentage_graph = PercentageFst().fst # takes nothing
date_graph = DateFst().fst # takes nothing
time_graph = TimeFst().fst # takes nothing
measure_graph = MeasureFst(cardinal, decimal).fst # takes cardinal, decimal
money_graph = MoneyFst(cardinal, decimal).fst # takes cardinal, decimal
telephone_graph = TelephoneFst(cardinal).fst # takes cardinal
word_graph = WordFst().fst
whitelist_graph = WhiteListFst().fst

Expand All @@ -57,10 +59,11 @@ def __init__(self):
| ordinal_graph
| decimal_graph
| fraction_graph
| percentage_graph
| date_graph
| time_graph
| measure_graph
| money_graph
| telephone_graph
)
self.fst = graph
self.fst = graph
5 changes: 4 additions & 1 deletion nemo_text_processing/text_normalization/utils_audio_based.py
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This cdifflib fallback fix is unrelated to the percentage class. Even if it was needed to get your environment working, it should be a separate commit or a separate PR. Mixing unrelated fixes into a feature PR makes the review harder and the git history messier. Please remove this change from this PR and raise it separately if needed.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reverted the cdifflib change from this PR. Will raise it separately if needed.

Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@

from typing import Dict

from cdifflib import CSequenceMatcher
try:
from cdifflib import CSequenceMatcher
except ImportError:
from difflib import SequenceMatcher as CSequenceMatcher

from nemo_text_processing.utils.logging import logger

Expand Down
29 changes: 29 additions & 0 deletions test_percentage.py
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This file should not be here. You already have the correct copy at `tests/nemo_text_processing/hi/test_percentage.py`. This root-level version will not work: the relative import `from ..utils import CACHE_DIR` requires the file to be inside the `tests/nemo_text_processing/hi/` package. Please delete this file.

Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest
from parameterized import parameterized
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
from ..utils import CACHE_DIR, parse_test_case_file


class TestPercentage:
    """Checks Hindi inverse text normalization against the percentage test cases."""

    # NOTE(review): this module imports CACHE_DIR and parse_test_case_file via
    # a relative import (``from ..utils import ...``), which needs the file to
    # live inside a package; confirm whether this root-level copy should exist.

    # Built once at class-definition time and shared by every generated case.
    inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False)

    @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_percentage.txt'))
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        # One generated test per entry returned by parse_test_case_file.
        actual = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
        assert actual == expected
Comment thread
RajanPutty marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
बीस प्रतिशत~२०%
पचास प्रतिशत~५०%
दस प्रतिशत~१०%
सौ प्रतिशत~१००%
पच्चीस प्रतिशत~२५%
पाँच प्रतिशत~५%
तीन प्रतिशत~३%
सत्तर परसेंट~७०%
एक प्रतिशत~१%
शून्य प्रतिशत~०%
पाँच सौ फ़ीसदी~५००%
तेरह प्रतिशत~१३%
29 changes: 29 additions & 0 deletions tests/nemo_text_processing/hi/test_percentage.py
Comment thread
RajanPutty marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest
from parameterized import parameterized
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
from ..utils import CACHE_DIR, parse_test_case_file


class TestPercentage:
    """Checks Hindi inverse text normalization against the percentage test cases."""

    # Built once at class-definition time and shared by every generated case.
    inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False)

    @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_percentage.txt'))
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        # One generated test per entry returned by parse_test_case_file.
        actual = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
        assert actual == expected
2 changes: 1 addition & 1 deletion tests/nemo_text_processing/utils.py
Comment thread
RajanPutty marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def parse_test_case_file(file_name: str):
Prepares tests pairs for ITN and TN tests
"""
test_pairs = []
with open(os.path.dirname(os.path.abspath(__file__)) + os.path.sep + file_name, 'r') as f:
with open(os.path.dirname(os.path.abspath(__file__)) + os.path.sep + file_name, 'r', encoding='utf-8') as f:
for line in f:
components = line.strip("\n").split("~")
spoken = components[0]
Expand Down