Skip to content

Commit 2db73f5

Browse files
Audio-based TN for Swedish (#49)
* Audio-based TN for Swedish, for Språkbanken Tal. Replaces #48. Signed-off-by: Jim O'Regan <joregan@kth.se>
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* Updating cache directory (not entirely sure what the pattern is). Signed-off-by: Jim O’Regan <joregan@kth.se>
* Delete tokenize_and_classify_lm.py. Signed-off-by: Jim O’Regan <joregan@kth.se>
* Fraction fix from ITN branch. Signed-off-by: Jim O'Regan <joregan@kth.se>
---------
Signed-off-by: Jim O'Regan <joregan@kth.se>
Signed-off-by: Jim O’Regan <joregan@kth.se>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 36fdbf4 commit 2db73f5

28 files changed

Lines changed: 431 additions & 27 deletions

Jenkinsfile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@ pipeline {
1919
PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
2020
RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
2121
VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
22-
SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
22+
SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-16-23-0'
2323
ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
2424
DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-15-23-0'
2525

nemo_text_processing/text_normalization/normalize_with_audio.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -418,7 +418,7 @@ def parse_args():
418418
"--input_case", help="input capitalization", choices=["lower_cased", "cased"], default="cased", type=str
419419
)
420420
parser.add_argument(
421-
"--language", help="Select target language", choices=["en", "ru", "de", "es"], default="en", type=str
421+
"--language", help="Select target language", choices=["en", "ru", "de", "es", "sv"], default="en", type=str
422422
)
423423
parser.add_argument("--manifest", default=None, help="path to .json manifest")
424424
parser.add_argument(

nemo_text_processing/text_normalization/sv/graph_utils.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,7 @@
3333
bos_or_space = pynini.union("[BOS]", " ")
3434
eos_or_space = pynini.union("[EOS]", " ")
3535

36-
ensure_space = pynini.closure(delete_space, 0, 1) + insert_space
36+
ensure_space = pynini.cross(pynini.closure(delete_space, 0, 1), " ")
3737

3838

3939
def roman_to_int(fst: 'pynini.FstLike') -> 'pynini.FstLike':

nemo_text_processing/text_normalization/sv/taggers/cardinal.py

Lines changed: 26 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -46,9 +46,12 @@ def make_million(number: str, non_zero_no_one: 'pynini.FstLike', deterministic:
4646
for one in ["en", "ett"]:
4747
graph |= pynutil.add_weight(pynini.cross("001", f"{one} {number}"), -0.001)
4848
graph |= pynutil.add_weight(pynini.cross("001", f"{one} {old_orth}"), -0.001)
49+
graph |= pynutil.add_weight(pynini.cross("001", f"{one}{number}"), -0.001)
50+
graph |= pynutil.add_weight(pynini.cross("001", f"{one}{old_orth}"), -0.001)
4951
graph |= non_zero_no_one + pynutil.insert(f" {number}er")
5052
if not deterministic:
5153
graph |= pynutil.add_weight(non_zero_no_one + pynutil.insert(f" {old_orth}er"), -0.001)
54+
graph |= pynutil.add_weight(non_zero_no_one + pynutil.insert(f"{old_orth}er"), -0.001)
5255
graph |= pynutil.delete("000")
5356
graph += insert_space
5457
return graph
@@ -100,17 +103,17 @@ def __init__(self, deterministic: bool = True):
100103
digit = pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv")))
101104
teen = pynini.invert(pynini.string_file(get_abs_path("data/numbers/teen.tsv")))
102105
ties = pynini.invert(pynini.string_file(get_abs_path("data/numbers/ties.tsv")))
103-
ett_to_en = pynini.string_map([("ett", "en")])
106+
ett_to_en = pynini.cross("ett", "en")
104107
ties_alt_endings = pynini.string_map([("go", "gi"), ("tio", "ti")])
105108

106109
# Any single digit
107110
graph_digit = digit
108111
digits_no_one = (NEMO_DIGIT - "1") @ graph_digit
109-
both_ones = pynini.cross("1", "en") | pynini.cross("1", "ett")
110112
if deterministic:
111113
final_digit = digit
112114
else:
113-
final_digit = digits_no_one | both_ones
115+
final_digit = digit | pynini.cross("1", "en")
116+
graph_digit = final_digit
114117
self.digit = final_digit
115118

116119
single_digits_graph = graph_digit | zero
@@ -131,14 +134,13 @@ def __init__(self, deterministic: bool = True):
131134
else:
132135
graph_tens |= pynutil.add_weight(pynini.cross("18", "aderton"), -0.001)
133136
graph_tens |= pynutil.add_weight(
134-
graph_ties + (pynutil.delete('0') | (graph_digit | pynutil.insert(' ') + graph_digit)), -0.001
137+
graph_ties + (pynutil.delete('0') | (graph_digit | insert_space + graph_digit)), -0.001
135138
)
136139

137140
hundreds = digits_no_one + pynutil.insert("hundra")
138141
hundreds |= pynini.cross("1", "hundra")
139142
if not deterministic:
140143
hundreds |= pynutil.add_weight(pynini.cross("1", "etthundra"), -0.001)
141-
hundreds |= pynutil.add_weight(pynini.cross("1", "ett hundra"), -0.001)
142144
hundreds |= pynutil.add_weight(digit + pynutil.insert(NEMO_SPACE) + pynutil.insert("hundra"), -0.001)
143145

144146
self.tens = graph_tens.optimize()
@@ -180,6 +182,7 @@ def __init__(self, deterministic: bool = True):
180182
graph_hundreds_component_at_least_one_non_zero_digit = graph_hundreds_component | (
181183
pynutil.delete("00") + graph_digit
182184
)
185+
183186
graph_hundreds_component_at_least_one_non_zero_digit_no_one = graph_hundreds_component | (
184187
pynutil.delete("00") + digits_no_one
185188
)
@@ -192,18 +195,21 @@ def __init__(self, deterministic: bool = True):
192195
if not deterministic:
193196
tusen |= pynutil.add_weight(pynutil.insert(" tusen"), -0.001)
194197
etttusen = tusen
195-
etttusen |= pynutil.add_weight(pynutil.insert("ettusen"), -0.001)
196-
etttusen |= pynutil.add_weight(pynutil.insert(" ettusen"), -0.001)
198+
etttusen |= pynutil.add_weight(pynutil.insert("etttusen"), -0.001)
199+
etttusen |= pynutil.add_weight(pynutil.insert(" etttusen"), -0.001)
197200
etttusen |= pynutil.add_weight(pynutil.insert("ett tusen"), -0.001)
198201
etttusen |= pynutil.add_weight(pynutil.insert(" ett tusen"), -0.001)
199202

203+
following_hundred = insert_space + graph_hundreds_component_at_least_one_non_zero_digit
204+
if not deterministic:
205+
following_hundred |= graph_hundreds_component_at_least_one_non_zero_digit
206+
200207
graph_thousands_component_at_least_one_non_zero_digit = pynini.union(
201208
pynutil.delete("000") + graph_hundreds_component_at_least_one_non_zero_digit,
202209
graph_hundreds_component_at_least_one_non_zero_digit_no_one
203210
+ tusen
204-
+ ((insert_space + graph_hundreds_component_at_least_one_non_zero_digit) | pynutil.delete("000")),
205-
pynini.cross("001", etttusen)
206-
+ ((insert_space + graph_hundreds_component_at_least_one_non_zero_digit) | pynutil.delete("000")),
211+
+ (following_hundred | pynutil.delete("000")),
212+
pynini.cross("001", etttusen) + (following_hundred | pynutil.delete("000")),
207213
)
208214
self.graph_thousands_component_at_least_one_non_zero_digit = (
209215
graph_thousands_component_at_least_one_non_zero_digit.optimize()
@@ -213,9 +219,8 @@ def __init__(self, deterministic: bool = True):
213219
pynutil.delete("000") + graph_hundreds_component_at_least_one_non_zero_digit_no_one,
214220
graph_hundreds_component_at_least_one_non_zero_digit_no_one
215221
+ tusen
216-
+ ((insert_space + graph_hundreds_component_at_least_one_non_zero_digit) | pynutil.delete("000")),
217-
pynini.cross("001", etttusen)
218-
+ ((insert_space + graph_hundreds_component_at_least_one_non_zero_digit) | pynutil.delete("000")),
222+
+ (following_hundred | pynutil.delete("000")),
223+
pynini.cross("001", etttusen) + (following_hundred | pynutil.delete("000")),
219224
)
220225
self.graph_thousands_component_at_least_one_non_zero_digit_no_one = (
221226
graph_thousands_component_at_least_one_non_zero_digit_no_one.optimize()
@@ -326,11 +331,19 @@ def __init__(self, deterministic: bool = True):
326331

327332
self.graph |= zero
328333

334+
self.graph_unfiltered = self.graph
329335
self.graph = filter_punctuation(self.graph).optimize()
330336
self.graph_en = self.graph @ pynini.cdrewrite(ett_to_en, "", "[EOS]", NEMO_SIGMA)
331337
self.graph_no_one = (pynini.project(self.graph, "input") - "1") @ self.graph
332338
self.graph_no_one_en = (pynini.project(self.graph_en, "input") - "1") @ self.graph_en
333339

340+
joiner_chars = pynini.union("-", "–", "—")
341+
joiner = pynini.cross(joiner_chars, " till ")
342+
self.range = self.graph + joiner + self.graph
343+
if not deterministic:
344+
either_one = self.graph | self.graph_en
345+
self.range = either_one + joiner + either_one
346+
334347
optional_minus_graph = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)
335348

336349
final_graph = optional_minus_graph + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"")

nemo_text_processing/text_normalization/sv/taggers/date.py

Lines changed: 15 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -68,16 +68,28 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool):
6868

6969
# prefer cardinal over year
7070
year_first = ((NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0, 1)) @ numbers
71-
year_second = (
72-
pynini.union((NEMO_DIGIT - "0") + (NEMO_DIGIT - "0"), "0" + (NEMO_DIGIT - "0"), (NEMO_DIGIT - "0") + "0")
73-
@ numbers
71+
year_second = pynini.union(
72+
((NEMO_DIGIT - "0") + (NEMO_DIGIT - "0")) @ numbers,
73+
pynini.cross("0", "hundra") + ((NEMO_DIGIT - "0") @ numbers),
74+
((NEMO_DIGIT - "0") + "0") @ numbers,
7475
)
76+
year_hundra = year_first + pynutil.insert("hundra") + year_second
77+
year_hundra |= year_first + pynutil.insert(" hundra") + year_second
78+
year_hundra |= year_first + pynutil.insert(" hundra ") + year_second
79+
year_hundra |= year_first + pynutil.insert("hundra ") + year_second
7580
year_second |= pynini.cross("00", "hundra")
7681
year_cardinal = ((NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 1, 3)) @ numbers
7782
year = pynini.union(year_first + year_second, year_first) # 90, 990, 1990
7883
if not deterministic:
7984
year |= year_cardinal
85+
year |= year_hundra
8086
self.year = year
87+
self.year_cardinal = year_cardinal
88+
sou_number = self.year + pynini.cross(":", " kolon ") + numbers
89+
sou_word = pynini.accep("SOU")
90+
if not deterministic:
91+
sou_word |= pynini.cross("SOU", "statens offentliga utredningar")
92+
self.sou = sou_word + NEMO_SPACE + sou_number
8193

8294
year_second_decades = ((NEMO_DIGIT - "0") + "0") @ numbers
8395
year_second_decades |= pynini.cross("00", "hundra")

nemo_text_processing/text_normalization/sv/taggers/fraction.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,7 @@
1414
# limitations under the License.
1515
import pynini
1616
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, GraphFst
17+
from nemo_text_processing.text_normalization.sv.graph_utils import ensure_space
1718
from nemo_text_processing.text_normalization.sv.utils import get_abs_path
1819
from pynini.lib import pynutil
1920

@@ -86,7 +87,7 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool =
8687
denominator = pynutil.insert("denominator: \"") + fractions + pynutil.insert("\"")
8788

8889
graph = pynini.closure(integer + pynini.accep(" "), 0, 1) + (numerator + denominator)
89-
graph |= pynini.closure(integer + (pynini.accep(" ") | pynutil.insert(" ")), 0, 1) + pynini.compose(
90+
graph |= pynini.closure(integer + ensure_space, 0, 1) + pynini.compose(
9091
pynini.string_file(get_abs_path("data/numbers/fraction.tsv")), (numerator + denominator)
9192
)
9293

nemo_text_processing/text_normalization/sv/taggers/ordinal.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -194,9 +194,13 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
194194

195195
self.suffixed_to_words = self.suffixed_ordinal @ self.graph
196196

197+
self.bare_ordinals = cleaned_graph
198+
kapitlet_word = pynini.union("kapitlet", pynini.cross("kap", "kapitlet"))
199+
kapitlet = cleaned_graph + NEMO_SPACE + kapitlet_word
200+
197201
tok_graph = (
198202
pynutil.insert("integer: \"")
199-
+ (cleaned_graph + pynutil.delete(".") | self.suffixed_to_words)
203+
+ (cleaned_graph + pynutil.delete(".") | self.suffixed_to_words | kapitlet)
200204
+ pynutil.insert("\"")
201205
)
202206

0 commit comments

Comments
 (0)