Skip to content

Commit e89d485

Browse files
committed
When NER crashes on a TSV cell, just output "NER error" into that cell
1 parent 9051e3e commit e89d485

2 files changed

Lines changed: 13 additions & 10 deletions

File tree

Anonymise.py

Lines changed: 11 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -91,7 +91,7 @@ def get_segments_from_ner(phase, user_input, reserved_entities_dict, ner_entitie
9191
replacement = (
9292
"Number"
9393
if anonymise_numbers
94-
# and len(text_normalised) > 2 # # do not anonymise short number since they are likely ordinals too
94+
# and len(text_normalised) > 2 # # do not anonymise short numbers since they are likely ordinals too
9595
and regex.search(r"(\d|\s)", text_normalised) is not None # if it is a one-word textual representation of a number then do not normalise it. It might be phrase like "one-sided" etc, which is actually not a number
9696
else None
9797
)
@@ -318,6 +318,9 @@ def __init__(self, user_input):
318318
def anonymise(user_input, anonymise_names, anonymise_numbers, anonymise_dates, anonymise_titles_of_work, anonymise_title_cased_word_sequences, anonymise_urls, anonymise_emails, anonymise_phone_numbers, ner_model, use_only_numeric_replacements = False, state = None):
319319
global spacy_loaded #, spacy
320320

321+
# TODO!!! speed up the process by detecting cells with same content as before and applying same result
322+
# TODO!!! speed up the process by skipping empty cells
323+
321324
if True: # for debugging regex-based entities
322325
# if not spacy_loaded:
323326
with Timer("Loading Spacy", quiet=spacy_loaded):
@@ -365,12 +368,16 @@ def anonymise(user_input, anonymise_names, anonymise_numbers, anonymise_dates, a
365368
# TODO: preserve "someword(s)" sequence only in English text
366369
# TODO: restore original character locations later
367370

368-
bracket_or_dash_re = r'(\p{Ll})((?!\(s\))[' + left_brac + ']|' + dash_between_words_re + r'\s|[\/\\]\s?)' # include / and \ chars here as well but do not require space after it
371+
bracket_or_dash_re = r'(\p{Ll})((?!\(s\))[' + left_brac + r']|' + dash_between_words_re + r'\s|[\/\\]\s?)' # include / and \ chars here as well but do not require space after it
369372
user_input = regex.sub(bracket_or_dash_re, r'\1 \2', user_input)
370373

371374

372375
with Timer("Running NER", quiet=True):
373-
ner_entities = NER(user_input)
376+
try:
377+
ner_entities = NER(user_input)
378+
except ValueError as ex: # for some content, NER fails with message like "ValueError: Shape mismatch for blis.gemm: (1, 0), (768, 49)"
379+
result = "NER error" # TODO: make the error message configurable
380+
return result, state
374381

375382
else:
376383
ner_entities = DummyNer(user_input)
@@ -445,7 +452,7 @@ def anonymise(user_input, anonymise_names, anonymise_numbers, anonymise_dates, a
445452
# detect any pre-existing anonymous entities like Person A, Person B in the input text and reserve these letters in the dict so that they are not reused
446453

447454
with Timer("Running regexes", quiet=True):
448-
re_matches = regex.findall(r"(^|\s)(" + active_replacements + ")(\s+)([" + regex.escape(letters) + "]|[0-9]+)(\s|:|$)", user_input) # NB! capture also numbers starting with 0 so that for example number 09 still ends up reserving number 9.
455+
re_matches = regex.findall(r"(^|\s)(" + active_replacements + r")(\s+)([" + regex.escape(letters) + r"]|[0-9]+)(\s|:|$)", user_input) # NB! capture also numbers starting with 0 so that for example number 09 still ends up reserving number 9.
449456

450457
for re_match in re_matches:
451458

@@ -477,10 +484,6 @@ def anonymise(user_input, anonymise_names, anonymise_numbers, anonymise_dates, a
477484

478485

479486

480-
if user_input == 'Threshold Glasgow Day Opportunities':
481-
qqq = True
482-
483-
484487
for phase in range(0, 2): # Two phases: 1) counting unique entities, 2) replacing them. Phase 1 is needed so that same entity will have same replacement in all places.
485488
for segment in get_segments_including_custom_replacements(phase, user_input, reserved_entities_dict, ner_entities, anonymise_names, anonymise_numbers, anonymise_dates, anonymise_titles_of_work, anonymise_title_cased_word_sequences, anonymise_urls, anonymise_emails, anonymise_phone_numbers, ner_model):
486489

Anonymiser.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -282,7 +282,7 @@ async def anonymiser(argv = None):
282282
if encoding == "auto":
283283
encoding = await detect_encoding(input_filename)
284284

285-
user_input = pd.read_csv(fullfilename, delimiter="\t", dtype=str, na_filter=False, encoding=encoding, encoding_errors="ignore", on_bad_lines="warn", header=None if csv_anonymise_header else 0)
285+
user_input = pd.read_csv(fullfilename, delimiter="\t", dtype=str, na_filter=False, encoding=encoding, encoding_errors="ignore", on_bad_lines="warn", header=None if csv_anonymise_header else 0, index_col=False)
286286

287287
elif file_extension in sheet_extensions: # NB! only first sheet is processed
288288
is_table = True
@@ -337,7 +337,7 @@ async def anonymiser(argv = None):
337337
# TODO: add "quoting" and "skipinitialspace" parameters as well?
338338
# TODO: add "dialect" parameter? (But note: .to_csv() does not have support for dialect parameter).
339339

340-
user_input = pd.read_csv(fullfilename, delimiter=csv_delimiter, dtype=str, na_filter=False, quotechar=csv_quotechar, doublequote=csv_doublequote, escapechar=csv_escapechar, encoding=encoding, encoding_errors="ignore", on_bad_lines="warn", header=None if csv_anonymise_header else 0)
340+
user_input = pd.read_csv(fullfilename, delimiter=csv_delimiter, dtype=str, na_filter=False, quotechar=csv_quotechar, doublequote=csv_doublequote, escapechar=csv_escapechar, encoding=encoding, encoding_errors="ignore", on_bad_lines="warn", header=None if csv_anonymise_header else 0, index_col=False)
341341

342342
elif file_extension == ".txt":
343343
is_table = False

0 commit comments

Comments
 (0)