Skip to content

Commit bb3e4a3

Browse files
tarushi2k2 and pre-commit-ci[bot]
authored and committed
Hindi ITN - Addition of Whitelist and Word (#248)
* Addition of whitelist and word classes
  Signed-off-by: Tarushi V <tarushiv@nvidia.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* Updation of Jenkins date
  Signed-off-by: Tarushi V <tarushiv@nvidia.com>
* Cleanup
  Signed-off-by: Tarushi V <tarushiv@nvidia.com>
* Updation
  Signed-off-by: Tarushi V <tarushiv@nvidia.com>
* Updation
  Signed-off-by: Tarushi V <tarushiv@nvidia.com>
---------
Signed-off-by: Tarushi V <tarushiv@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 416c644 commit bb3e4a3

13 files changed

Lines changed: 77 additions & 12 deletions

File tree

Jenkinsfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ pipeline {
2727
HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0'
2828
MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1'
2929
JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1'
30-
HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-13-24-0'
30+
HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-29-24-1'
3131
DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
3232
}
3333
stages {
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
१/४ पाव
2+
१/२ आधा
3+
३/४ पौन
4+
१:३० डेढ़ बजे
5+
२:३० ढाई बजे
6+
१.५ डेढ़
7+
२.५ ढाई
8+
कु. कुमारी
9+
स्मि. श्रीमती
10+
श्री. श्री
11+
श्री. श्रीमान
12+
मा. मास्टर
13+
डॉ. डॉक्टर

nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist_fraction.tsv

Lines changed: 0 additions & 3 deletions
This file was deleted.

nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist_time.tsv

Lines changed: 0 additions & 2 deletions
This file was deleted.

nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,8 @@
3434
from nemo_text_processing.inverse_text_normalization.hi.taggers.ordinal import OrdinalFst
3535
from nemo_text_processing.inverse_text_normalization.hi.taggers.punctuation import PunctuationFst
3636
from nemo_text_processing.inverse_text_normalization.hi.taggers.time import TimeFst
37+
from nemo_text_processing.inverse_text_normalization.hi.taggers.whitelist import WhiteListFst
3738
from nemo_text_processing.inverse_text_normalization.hi.taggers.word import WordFst
38-
from nemo_text_processing.inverse_text_normalization.hi.verbalizers.whitelist import WhiteListFst
3939

4040

4141
class ClassifyFst(GraphFst):
@@ -83,7 +83,7 @@ def __init__(
8383
money = MoneyFst(cardinal, decimal)
8484
money_graph = money.fst
8585
punct_graph = PunctuationFst().fst
86-
# whitelist_graph = WhiteListFst(input_file=whitelist).fst
86+
whitelist_graph = WhiteListFst().fst
8787
word_graph = WordFst().fst
8888

8989
classify = (
@@ -96,7 +96,7 @@ def __init__(
9696
| pynutil.add_weight(measure_graph, 1.1)
9797
| pynutil.add_weight(money_graph, 1.1)
9898
| pynutil.add_weight(word_graph, 100)
99-
# | pynutil.add_weight(whitelist_graph, 1.01)
99+
| pynutil.add_weight(whitelist_graph, 1.01)
100100
)
101101

102102
punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }")

nemo_text_processing/inverse_text_normalization/hi/taggers/whitelist.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED, input_file: str = None):
4747
super().__init__(name="whitelist", kind="classify")
4848

4949
if input_file is None:
50-
input_file = get_abs_path("data/whitelist.tsv")
50+
input_file = get_abs_path("data/whitelist/whitelist.tsv")
5151

5252
if not os.path.exists(input_file):
5353
raise ValueError(f"Whitelist file {input_file} not found")

nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from nemo_text_processing.inverse_text_normalization.hi.verbalizers.ordinal import OrdinalFst
2424
from nemo_text_processing.inverse_text_normalization.hi.verbalizers.time import TimeFst
2525
from nemo_text_processing.inverse_text_normalization.hi.verbalizers.whitelist import WhiteListFst
26+
from nemo_text_processing.inverse_text_normalization.hi.verbalizers.word import WordFst
2627

2728

2829
class VerbalizeFst(GraphFst):
@@ -44,12 +45,13 @@ def __init__(self):
4445
time_graph = TimeFst().fst
4546
measure_graph = MeasureFst(cardinal, decimal).fst
4647
money_graph = MoneyFst(cardinal, decimal).fst
47-
48+
word_graph = WordFst().fst
4849
whitelist_graph = WhiteListFst().fst
4950

5051
graph = (
5152
cardinal_graph
5253
| whitelist_graph
54+
| word_graph
5355
| ordinal_graph
5456
| decimal_graph
5557
| fraction_graph
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
डेढ़ बजे~१:३०
2+
ढाई बजे~२:३०
3+
मास्टर निखिल तनिष~मा. निखिल तनिष
4+
पाव~१/४
5+
श्रीमती ज्योत्सना~स्मि. ज्योत्सना
6+
डॉक्टर~डॉ.
7+
आधा कप चाय~१/२ कप चाय
8+
श्रीमान भारत कुमार~श्री. भारत कुमार
9+
डॉक्टर प्रशांत~डॉ. प्रशांत
10+
डेढ़~१.५
11+
कुमारी~कु.
12+
ढाई~२.५
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
नींद~नींद
2+
याहू!~याहू!
3+
-~-
4+
आआआ~आआआ
5+
आकाशगंगा~आकाशगंगा
6+
लटरपटर~लटरपटर
7+
कच्चा-पक्का~कच्चा-पक्का
8+
गुब्बारा~गुब्बारा
9+
चिट्ठी~चिट्ठी
10+
ढूंढना~ढूंढना
11+
लोहे का!~लोहे का!
12+
टाटा~टाटा
13+
~
14+
झ~झ
15+
संगीत~संगीत

tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,16 @@ testITNMoney() {
6363
runtest $input
6464
}
6565

66+
testITNWord() {
67+
input=$PROJECT_DIR/hi/data_inverse_text_normalization/test_cases_word.txt
68+
runtest $input
69+
}
70+
71+
testITNWhiteList() {
72+
input=$PROJECT_DIR/hi/data_inverse_text_normalization/test_cases_whitelist.txt
73+
runtest $input
74+
}
75+
6676

6777
# Load shUnit2
6878
. $PROJECT_DIR/../shunit2/shunit2

0 commit comments

Comments
 (0)