You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
# and len(text_normalised) > 2 # # do not anonymise short number since they are likely ordinals too
94
+
# and len(text_normalised) > 2 # # do not anonymise short numbers since they are likely ordinals too
95
95
andregex.search(r"(\d|\s)", text_normalised) isnotNone# if it is a one-word textual representation of a number then do not normalise it. It might be phrase like "one-sided" etc, which is actually not a number
# TODO!!! speed up the process by detecting cells with same content as before and applying same result
322
+
# TODO!!! speed up the process by skipping empty cells
323
+
321
324
ifTrue: # for debugging regex-based entities
322
325
# if not spacy_loaded:
323
326
withTimer("Loading Spacy", quiet=spacy_loaded):
@@ -365,12 +368,16 @@ def anonymise(user_input, anonymise_names, anonymise_numbers, anonymise_dates, a
365
368
# TODO: preserve "someword(s)" sequence only in English text
366
369
# TODO: restore original character locations later
367
370
368
-
bracket_or_dash_re=r'(\p{Ll})((?!\(s\))['+left_brac+']|'+dash_between_words_re+r'\s|[\/\\]\s?)'# include / and \ chars here as well but do not require space after it
371
+
bracket_or_dash_re=r'(\p{Ll})((?!\(s\))['+left_brac+r']|'+dash_between_words_re+r'\s|[\/\\]\s?)'# include / and \ chars here as well but do not require space after it
exceptValueErrorasex: # for some content, NER fails with message like "ValueError: Shape mismatch for blis.gemm: (1, 0), (768, 49)"
379
+
result="NER error"# TODO: make the error message configurable
380
+
returnresult, state
374
381
375
382
else:
376
383
ner_entities=DummyNer(user_input)
@@ -445,7 +452,7 @@ def anonymise(user_input, anonymise_names, anonymise_numbers, anonymise_dates, a
445
452
# detect any pre-existing anonymous entities like Person A, Person B in the input text and reserve these letters in the dict so that they are not reused
446
453
447
454
withTimer("Running regexes", quiet=True):
448
-
re_matches=regex.findall(r"(^|\s)("+active_replacements+")(\s+)(["+regex.escape(letters) +"]|[0-9]+)(\s|:|$)", user_input) # NB! capture also numbers starting with 0 so that for example number 09 still ends up reserving number 9.
455
+
re_matches=regex.findall(r"(^|\s)("+active_replacements+r")(\s+)(["+regex.escape(letters) +r"]|[0-9]+)(\s|:|$)", user_input) # NB! capture also numbers starting with 0 so that for example number 09 still ends up reserving number 9.
449
456
450
457
forre_matchinre_matches:
451
458
@@ -477,10 +484,6 @@ def anonymise(user_input, anonymise_names, anonymise_numbers, anonymise_dates, a
477
484
478
485
479
486
480
-
ifuser_input=='Threshold Glasgow Day Opportunities':
481
-
qqq=True
482
-
483
-
484
487
forphaseinrange(0, 2): # Two phases: 1) counting unique entities, 2) replacing them. Phase 1 is needed so that same entity will have same replacement in all places.
0 commit comments