Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,30 +22,30 @@ ci:

repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0
rev: v5.0.0
hooks:
- id: check-yaml
- id: check-case-conflict
- id: detect-private-key
- id: requirements-txt-fixer

- repo: https://github.com/PyCQA/flake8
rev: 7.1.1
rev: 7.2.0
hooks:
- id: flake8
args:
- --select=W605

- repo: https://github.com/PyCQA/isort
rev: 5.12.0
rev: 6.0.1
hooks:
- id: isort
name: Format imports
args: [ --multi-line=3, --trailing-comma, --force-grid-wrap=0, --use-parentheses, --line-width=119, -rc, -ws ]
exclude: docs/

- repo: https://github.com/psf/black
rev: 19.10b0
rev: 25.1.0
hooks:
- id: black
name: Format code
Expand Down
2 changes: 1 addition & 1 deletion nemo_text_processing/fst_alignment/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def parse_args():


EPS = "<eps>"
WHITE_SPACE = "\u23B5"
WHITE_SPACE = "\u23b5"
ITN_MODE = "itn"
TN_MODE = "tn"
tn_item_special_chars = ["$", "\\", ":", "+", "-", "="]
Expand Down
17 changes: 14 additions & 3 deletions nemo_text_processing/hybrid/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -482,7 +482,7 @@ def remove_punctuation(text: str, remove_spaces=True, do_lower=True, lang="en",

text = re.sub(r" +", " ", text)
if remove_spaces:
text = text.replace(" ", "").replace("\u00A0", "").strip()
text = text.replace(" ", "").replace("\u00a0", "").strip()

if do_lower:
text = text.lower()
Expand Down Expand Up @@ -515,7 +515,11 @@ def _relax_diff(text):
return acceptable


def get_labels(targets: List[str], norm_texts_weights: List[Tuple[str, str]], lang="en",) -> List[List[str]]:
def get_labels(
targets: List[str],
norm_texts_weights: List[Tuple[str, str]],
lang="en",
) -> List[List[str]]:
"""
Assign labels to generated normalization options (1 - for ground truth, 0 - other options)
Args:
Expand Down Expand Up @@ -605,7 +609,14 @@ def print_df(df):
prints data frame
"""
with pd.option_context(
"display.max_rows", None, "display.max_columns", None, "display.width", 1000, "display.max_colwidth", 400,
"display.max_rows",
None,
"display.max_columns",
None,
"display.width",
1000,
"display.max_colwidth",
400,
):
print(df)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ def __init__(self, tn_cardinal):
self.graph = pynini.invert(tn_cardinal.cardinal_numbers).optimize()

optional_minus_graph = pynini.closure(
pynutil.insert("negative: ") + pynini.cross("سالب", '"-"') + NEMO_SPACE, 0, 1,
pynutil.insert("negative: ") + pynini.cross("سالب", '"-"') + NEMO_SPACE,
0,
1,
)

final_graph = optional_minus_graph + pynutil.insert('integer: "') + self.graph + pynutil.insert('"')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@ def __init__(self, tn_decimal):
super().__init__(name="decimal", kind="classify")

optional_graph_negative = pynini.closure(
pynutil.insert("negative: ") + pynini.cross("سالب", '"true"') + delete_extra_space, 0, 1,
pynutil.insert("negative: ") + pynini.cross("سالب", '"true"') + delete_extra_space,
0,
1,
)

graph_fractional_part = pynini.invert(tn_decimal.graph_fractional).optimize()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,6 @@ def __init__(self):
super().__init__(name="word", kind="verbalize")
chars = pynini.closure(NEMO_CHAR - " ", 1)
char = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + chars + pynutil.delete("\"")
graph = char @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA)
graph = char @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA)

self.fst = graph.optimize()
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,10 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED):
+ graph_in_thousands
)

graph = pynini.union((graph_int | graph_ind) + delete_space + graph_hundreds, graph_zero,)
graph = pynini.union(
(graph_int | graph_ind) + delete_space + graph_hundreds,
graph_zero,
)

graph = graph @ pynini.union(
pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,11 @@ def __init__(self, ordinal: GraphFst, input_case: str):
+ pynutil.add_weight(year_graph, -YEAR_WEIGHT)
+ pynutil.insert("\"")
)
optional_graph_year = pynini.closure(graph_year, 0, 1,)
optional_graph_year = pynini.closure(
graph_year,
0,
1,
)
graph_mdy = month_graph + (
(delete_extra_space + day_graph) | graph_year | (delete_extra_space + day_graph + graph_year)
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,9 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED):
point = pynutil.delete("point")

optional_graph_negative = pynini.closure(
pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space, 0, 1,
pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space,
0,
1,
)

graph_fractional = pynutil.insert("fractional_part: \"") + graph_decimal + pynutil.insert("\"")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,13 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED):
delete_extra_space
+ url_symbols
+ delete_extra_space
+ (domain | pynini.closure(accepted_username + delete_extra_space,) + accepted_username)
+ (
domain
| pynini.closure(
accepted_username + delete_extra_space,
)
+ accepted_username
)
)

protocol_default = (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, input_case: str = INPU
graph_unit_plural = pynini.compose(casing_graph, graph_unit_plural).optimize()

optional_graph_negative = pynini.closure(
pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space, 0, 1,
pynutil.insert("negative: ") + pynini.cross(MINUS, "\"true\"") + delete_extra_space,
0,
1,
)

unit_singular = convert_space(graph_unit_singular)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, input_case: str = INPU
# "one fifty" -> "one hundred fifty"
with_hundred = pynini.compose(
pynini.closure(NEMO_NOT_SPACE) + pynini.accep(" ") + pynutil.insert("hundred ") + NEMO_SIGMA,
pynini.compose(cardinal_graph, NEMO_DIGIT ** 3),
pynini.compose(cardinal_graph, NEMO_DIGIT**3),
)
cardinal_graph |= with_hundred
graph_decimal_final = decimal.final_graph_wo_negative
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def get_serial_number(cardinal):
"""

digit = pynini.compose(cardinal.graph_no_exception, NEMO_DIGIT)
two_digit = pynutil.add_weight(pynini.compose(cardinal.graph_two_digit, NEMO_DIGIT ** 2), 0.002)
two_digit = pynutil.add_weight(pynini.compose(cardinal.graph_two_digit, NEMO_DIGIT**2), 0.002)
character = digit | two_digit | NEMO_ALPHA
sequence = (NEMO_LOWER_NOT_A | digit) + pynini.closure(pynutil.delete(" ") + character, 2)
sequence |= character + pynini.closure(pynutil.delete(" ") + (digit | NEMO_ALPHA), 2)
Expand Down Expand Up @@ -116,7 +116,7 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED):
triple_digit.invert()

# to handle cases like "one twenty three"
two_digit_cardinal = pynini.compose(cardinal.graph_no_exception, NEMO_DIGIT ** 2)
two_digit_cardinal = pynini.compose(cardinal.graph_no_exception, NEMO_DIGIT**2)
double_digit_to_digit = (
pynini.compose(double_digit, str_to_digit + pynutil.delete(" ") + str_to_digit) | two_digit_cardinal
)
Expand All @@ -139,7 +139,7 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED):

number_part = pynini.compose(
single_double_or_triple_digit,
NEMO_DIGIT ** 3 + pynutil.insert("-") + NEMO_DIGIT ** 3 + pynutil.insert("-") + NEMO_DIGIT ** 4,
NEMO_DIGIT**3 + pynutil.insert("-") + NEMO_DIGIT**3 + pynutil.insert("-") + NEMO_DIGIT**4,
).optimize()
number_part = pynutil.insert("number_part: \"") + number_part.optimize() + pynutil.insert("\"")

Expand All @@ -156,24 +156,24 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED):
graph = optional_country_code + number_part

# credit card number
space_four_digits = insert_space + NEMO_DIGIT ** 4
space_four_digits = insert_space + NEMO_DIGIT**4
space_five_digits = space_four_digits + NEMO_DIGIT
space_six_digits = space_five_digits + NEMO_DIGIT
credit_card_graph = pynini.compose(
single_double_or_triple_digit,
NEMO_DIGIT ** 4 + (space_six_digits | (space_four_digits ** 2)) + space_four_digits,
NEMO_DIGIT**4 + (space_six_digits | (space_four_digits**2)) + space_four_digits,
).optimize()

credit_card_graph |= pynini.compose(
single_double_or_triple_digit, NEMO_DIGIT ** 4 + space_six_digits + space_five_digits
single_double_or_triple_digit, NEMO_DIGIT**4 + space_six_digits + space_five_digits
).optimize()

graph |= pynutil.insert("number_part: \"") + credit_card_graph.optimize() + pynutil.insert("\"")

# SSN
ssn_graph = pynini.compose(
single_double_or_triple_digit,
NEMO_DIGIT ** 3 + pynutil.insert("-") + NEMO_DIGIT ** 2 + pynutil.insert("-") + NEMO_DIGIT ** 4,
NEMO_DIGIT**3 + pynutil.insert("-") + NEMO_DIGIT**2 + pynutil.insert("-") + NEMO_DIGIT**4,
).optimize()
graph |= pynutil.insert("number_part: \"") + ssn_graph.optimize() + pynutil.insert("\"")

Expand Down
22 changes: 20 additions & 2 deletions nemo_text_processing/inverse_text_normalization/en/taggers/time.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,14 +71,32 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED):
graph_minute_double = pynini.union(*labels_minute_double) @ cardinal

graph_minute_verbose = pynini.cross("half", "30") | pynini.cross("quarter", "15")
oclock = pynini.cross(pynini.union("o' clock", "o clock", "o'clock", "oclock", "hundred hours",), "",)
oclock = pynini.cross(
pynini.union(
"o' clock",
"o clock",
"o'clock",
"oclock",
"hundred hours",
),
"",
)

if input_case == INPUT_CASED:
minute_to_graph = capitalized_input_graph(minute_to_graph)
graph_minute_single = capitalized_input_graph(graph_minute_single)
graph_minute_double = capitalized_input_graph(graph_minute_double)
graph_minute_verbose |= pynini.cross("Half", "30") | pynini.cross("Quarter", "15")
oclock |= pynini.cross(pynini.union("O' clock", "O clock", "O'clock", "Oclock", "Hundred hours",), "",)
oclock |= pynini.cross(
pynini.union(
"O' clock",
"O clock",
"O'clock",
"Oclock",
"Hundred hours",
),
"",
)

final_graph_hour = pynutil.insert("hours: \"") + graph_hour + pynutil.insert("\"")
graph_minute = (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,5 @@ def __init__(self):
+ pynini.closure(NEMO_CHAR - " ", 1)
+ pynutil.delete("\"")
)
graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA)
graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA)
self.fst = graph.optimize()
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,6 @@ def __init__(self):
super().__init__(name="word", kind="verbalize")
chars = pynini.closure(NEMO_CHAR - " ", 1)
char = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + chars + pynutil.delete("\"")
graph = char @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA)
graph = char @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA)

self.fst = graph.optimize()
Original file line number Diff line number Diff line change
Expand Up @@ -160,18 +160,13 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED):
self.graph_no_exception = graph.optimize()

# save self.numbers_up_to_thousand for use in DecimalFst
digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT ** 2) | (NEMO_DIGIT ** 3)
digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3)
numbers_up_to_thousand = pynini.compose(self.graph_no_exception, digits_up_to_thousand).optimize()
self.numbers_up_to_thousand = numbers_up_to_thousand.optimize()

# save self.numbers_up_to_million for use in DecimalFst
digits_up_to_million = (
NEMO_DIGIT
| (NEMO_DIGIT ** 2)
| (NEMO_DIGIT ** 3)
| (NEMO_DIGIT ** 4)
| (NEMO_DIGIT ** 5)
| (NEMO_DIGIT ** 6)
NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3) | (NEMO_DIGIT**4) | (NEMO_DIGIT**5) | (NEMO_DIGIT**6)
)
numbers_up_to_million = pynini.compose(graph, digits_up_to_million).optimize()
self.numbers_up_to_million = numbers_up_to_million.optimize()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,13 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED):
delete_extra_space
+ symbols
+ delete_extra_space
+ (domain | pynini.closure(accepted_username + delete_extra_space,) + accepted_username)
+ (
domain
| pynini.closure(
accepted_username + delete_extra_space,
)
+ accepted_username
)
)

protocol_default = (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,13 @@ def __init__(self, cardinal: GraphFst, input_case: str = INPUT_LOWER_CASED):

full_graph_ties = graph_ties | (graph_ties + pynini.cross(" ", "y") + graph_digit)

ordinal_graph_union = pynini.union(graph_digit, graph_teens, graph_twenties, full_graph_ties, graph_hundreds,)
ordinal_graph_union = pynini.union(
graph_digit,
graph_teens,
graph_twenties,
full_graph_ties,
graph_hundreds,
)

accept_o_endings = NEMO_SIGMA + pynini.accep("o")
accept_a_endings = NEMO_SIGMA + pynini.accep("a")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,10 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED):
# Denormalized phone numbers are grouped in sets of 3 or 4 digits
group_of_two = pynini.union(doubled_digit, digit_twice, double_digits)

group_of_three = pynini.union(tripled_digit, single_digits + pynutil.delete(" ") + group_of_two,)
group_of_three = pynini.union(
tripled_digit,
single_digits + pynutil.delete(" ") + group_of_two,
)

group_of_four = pynini.union(
group_of_two + pynutil.delete(" ") + group_of_two,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,5 +34,5 @@ def __init__(self):
+ pynini.closure(NEMO_CHAR - " ", 1)
+ pynutil.delete("\"")
)
graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA)
graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA)
self.fst = graph.optimize()
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,6 @@ def __init__(self):
super().__init__(name="word", kind="verbalize")
chars = pynini.closure(NEMO_CHAR - " ", 1)
char = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + chars + pynutil.delete("\"")
graph = char @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA)
graph = char @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA)

self.fst = graph.optimize()
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@
NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize()
NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize()
NEMO_HEX = pynini.union(*string.hexdigits).optimize()
NEMO_NON_BREAKING_SPACE = u"\u00A0"
NEMO_NON_BREAKING_SPACE = u"\u00a0"
NEMO_SPACE = " "
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize()
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00a0").optimize()
NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()

Expand Down Expand Up @@ -188,4 +188,4 @@ def delete_tokens(self, fst) -> 'pynini.FstLike':
+ delete_space
+ pynutil.delete("}")
)
return res @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA)
return res @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA)
Loading
Loading