Skip to content

Commit f893d89

Browse files
committed
Modified/removed files
Signed-off-by: hmlee245 <hmlee245@gmail.com>
2 parents 9f7e876 + 77da79d commit f893d89

13 files changed

Lines changed: 70 additions & 47 deletions

File tree

nemo_text_processing/inverse_text_normalization/inverse_normalize.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ def __init__(
135135
from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst
136136
from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import (
137137
VerbalizeFinalFst,
138-
)
138+
)
139139

140140
self.tagger = ClassifyFst(
141141
cache_dir=cache_dir, whitelist=whitelist, overwrite_cache=overwrite_cache, input_case=input_case
@@ -180,7 +180,7 @@ def parse_args():
180180
parser.add_argument(
181181
"--language",
182182
help="language",
183-
choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja','ko'],
183+
choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja', 'ko'],
184184
default="en",
185185
type=str,
186186
)

nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py

Lines changed: 29 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_DIGIT, GraphFst, delete_space
2020
from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path
2121

22+
2223
class CardinalFst(GraphFst):
2324
"""
2425
Finite state transducer for classifying cardinals
@@ -34,13 +35,19 @@ def __init__(self):
3435
graph_zero = pynini.cross("영", "0")
3536
graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
3637

38+
<<<<<<< HEAD
39+
=======
40+
graph_negative = pynini.cross("마이너스", "-")
41+
graph_negative += delete_space
42+
43+
>>>>>>> 77da79d12b1378502cc2b382cd6933b02e7c2545
3744
ten = pynutil.delete("십")
3845
ten_alt = pynini.cross("십", "1")
3946
### Handles the tens digit of a two-digit number, e.g. the 2 in 20
4047
graph_ten_component = pynini.union((graph_digit + ten) | ten_alt, pynutil.insert("0"))
4148
### Handles the ones digit of the number, e.g. 1, 2, 3, 4, 5, ...
4249
graph_ten_component += graph_digit | pynutil.insert("0")
43-
50+
4451
hundred = pynutil.delete("백")
4552
hundred_alt = pynini.cross("백", "1")
4653
graph_hundred_component = pynini.union(((graph_digit + hundred) | hundred_alt), pynutil.insert("0"))
@@ -55,46 +62,55 @@ def __init__(self):
5562
tenthousand_alt = pynini.cross("만", "1")
5663
### "만" can express the next four digits of a number, up to the next unit "억", so insert "0000" to reserve four digits' worth of space
5764
### From "만" upward, keep adding four digits followed by graph_thousand_component (0000-9999), because Korean number units increase every four digits
58-
graph_tenthousand_component = pynini.union(((graph_thousand_component + tenthousand) | tenthousand_alt), pynutil.insert("0000"))
65+
graph_tenthousand_component = pynini.union(
66+
((graph_thousand_component + tenthousand) | tenthousand_alt), pynutil.insert("0000")
67+
)
5968
graph_tenthousand_component += graph_thousand_component
6069

6170
hundredmillion = pynutil.delete("억")
6271
hundredmillion_alt = pynini.cross("억", "1")
63-
graph_hundredmillion_component = pynini.union(((graph_thousand_component + hundredmillion) | hundredmillion_alt), pynutil.insert("0000"))
64-
graph_hundredmillion_component += graph_tenthousand_component
65-
72+
graph_hundredmillion_component = pynini.union(
73+
((graph_thousand_component + hundredmillion) | hundredmillion_alt), pynutil.insert("0000")
74+
)
75+
graph_hundredmillion_component += graph_tenthousand_component
76+
6677
trillion = pynutil.delete("조")
6778
trillion_alt = pynini.cross("조", "1")
68-
graph_trillion_component = pynini.union(((graph_thousand_component + trillion) | trillion_alt), pynutil.insert("0000"))
79+
graph_trillion_component = pynini.union(
80+
((graph_thousand_component + trillion) | trillion_alt), pynutil.insert("0000")
81+
)
6982
graph_trillion_component += graph_hundredmillion_component
7083

7184
tenquadrillion = pynutil.delete("경")
7285
tenquadrillion_alt = pynini.cross("경", "1")
73-
graph_tenquadrillion_component = pynini.union(((graph_thousand_component + tenquadrillion) | tenquadrillion_alt), pynutil.insert("0000"))
86+
graph_tenquadrillion_component = pynini.union(
87+
((graph_thousand_component + tenquadrillion) | tenquadrillion_alt), pynutil.insert("0000")
88+
)
7489
graph_tenquadrillion_component += graph_trillion_component
7590

76-
7791
graph = pynini.union(
7892
### From biggest unit to smallest, everything is included
79-
graph_tenquadrillion_component|
80-
graph_zero
93+
graph_tenquadrillion_component
94+
| graph_zero
8195
)
8296

8397
leading_zero = (
8498
pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT)
8599
)
86100
graph_nonzero = graph @ leading_zero
87101
graph = pynini.union(graph_nonzero, graph_zero)
88-
102+
89103
graph = graph @ leading_zero | graph_zero
90104

91105
self.just_cardinals = graph
92106

93-
optional_sign = pynini.closure((pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"')) + delete_space,0, 1)
107+
optional_sign = pynini.closure(
108+
(pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"')) + delete_space, 0, 1
109+
)
94110

95111
final_graph = (
96112
optional_sign + pynutil.insert(" ") + pynutil.insert("integer: \"") + graph + pynutil.insert("\"")
97113
) | (pynutil.insert("integer: \"") + graph + pynutil.insert("\""))
98114

99115
final_graph = self.add_tokens(final_graph)
100-
self.fst = final_graph.optimize()
116+
self.fst = final_graph.optimize()

nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,13 @@
1919
import pynini
2020
from pynini.lib import pynutil
2121

22-
from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst
23-
from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst
24-
from nemo_text_processing.inverse_text_normalization.ko.graph_utils import (
22+
from nemo_text_processing.inverse_text_normalization.ko.graph_utils import (
2523
INPUT_LOWER_CASED,
2624
GraphFst,
2725
generator_main,
2826
)
27+
from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst
28+
from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst
2929

3030

3131
class ClassifyFst(GraphFst):
@@ -62,13 +62,13 @@ def __init__(
6262
cardinal = CardinalFst()
6363
cardinal_graph = cardinal.fst
6464
word_graph = WordFst().fst
65-
classify = (pynutil.add_weight(cardinal_graph, 1.1)| pynutil.add_weight(word_graph, 100))
66-
65+
classify = pynutil.add_weight(cardinal_graph, 1.1) | pynutil.add_weight(word_graph, 100)
66+
6767
token = pynutil.insert("tokens { ") + classify + pynutil.insert(" } ")
6868
tagger = pynini.closure(token, 1)
6969

7070
self.fst = tagger
7171

7272
if far_file:
7373
generator_main(far_file, {"tokenize_and_classify": self.fst})
74-
logging.info(f"ClassifyFst grammars are saved to {far_file}.")
74+
logging.info(f"ClassifyFst grammars are saved to {far_file}.")

nemo_text_processing/inverse_text_normalization/ko/taggers/word.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,5 @@ class WordFst(GraphFst):
2727

2828
def __init__(self):
2929
super().__init__(name="word", kind="classify")
30-
word = pynutil.insert(
31-
"name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"")
30+
word = pynutil.insert("name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"")
3231
self.fst = word.optimize()

nemo_text_processing/inverse_text_normalization/ko/utils.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,6 @@
1515
import os
1616

1717

18-
1918
def get_abs_path(rel_path):
2019

2120
return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path
22-
23-

nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,10 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14+
<<<<<<< HEAD
15+
=======
16+
17+
from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst
18+
from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst
19+
from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst
20+
>>>>>>> 77da79d12b1378502cc2b382cd6933b02e7c2545

nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,7 @@
1515
import pynini
1616
from pynini.lib import pynutil
1717

18-
from nemo_text_processing.inverse_text_normalization.ko.graph_utils import (
19-
NEMO_NOT_QUOTE,
20-
GraphFst,
21-
delete_space,
22-
)
18+
from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space
2319

2420

2521
class CardinalFst(GraphFst):
@@ -34,21 +30,17 @@ def __init__(self):
3430
pynutil.delete("negative:")
3531
+ delete_space
3632
+ pynutil.delete("\"")
37-
+ pynini.accep("-")
33+
+ pynini.accep("-")
3834
+ pynutil.delete("\"")
3935
)
4036

4137
optional_sign_output = pynini.closure(negative_sign + delete_space, 0, 1)
4238

43-
digits_from_tag = pynini.closure(NEMO_NOT_QUOTE, 1)
39+
digits_from_tag = pynini.closure(NEMO_NOT_QUOTE, 1)
4440
integer_cardinal = (
45-
pynutil.delete("integer:")
46-
+ delete_space
47-
+ pynutil.delete("\"")
48-
+ digits_from_tag
49-
+ pynutil.delete("\"")
41+
pynutil.delete("integer:") + delete_space + pynutil.delete("\"") + digits_from_tag + pynutil.delete("\"")
5042
)
5143

5244
graph = integer_cardinal
5345
final_graph = optional_sign_output + graph
54-
self.fst = self.delete_tokens(final_graph).optimize()
46+
self.fst = self.delete_tokens(final_graph).optimize()

nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16+
from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst
1617
from nemo_text_processing.inverse_text_normalization.ko.verbalizers.cardinal import CardinalFst
1718
from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst
18-
from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst
1919

2020

2121
class VerbalizeFst(GraphFst):
@@ -30,7 +30,6 @@ def __init__(self):
3030
cardinal = CardinalFst()
3131
cardinal_graph = cardinal.fst
3232
word_graph = WordFst().fst
33-
34-
graph = (cardinal_graph|word_graph)
33+
34+
graph = cardinal_graph | word_graph
3535
self.fst = graph
36-

nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,21 @@
1818
import pynini
1919
from pynini.lib import pynutil
2020

21+
from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, delete_space, generator_main
2122
from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst
23+
<<<<<<< HEAD
2224
from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, generator_main, delete_space
25+
=======
26+
from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst
27+
>>>>>>> 77da79d12b1378502cc2b382cd6933b02e7c2545
2328

2429

2530
class VerbalizeFinalFst(GraphFst):
2631
"""
2732
Finite state transducer that verbalizes an entire sentence, e.g.
2833
tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now
2934
"""
35+
3036
def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False):
3137
super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic)
3238
far_file = None

nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst
1919

2020

21-
2221
class WordFst(GraphFst):
2322
'''
2423
tokens { name: "一" } -> 一

0 commit comments

Comments
 (0)