forked from BCHSI/philter-ucsf
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
129 lines (114 loc) · 5.51 KB
/
main.py
File metadata and controls
129 lines (114 loc) · 5.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import argparse
import distutils.util
import re
import pickle
from philter import Philter
import gzip
import json
def _str_to_bool(value):
    """Convert a command-line truth string into a bool.

    Accepts the same case-insensitive spellings that
    ``distutils.util.strtobool`` accepted, so existing invocations keep
    working even though distutils was removed from the standard library in
    Python 3.12.

    Args:
        value: The raw string taken from the command line.

    Returns:
        True for y/yes/t/true/on/1, False for n/no/f/false/off/0.

    Raises:
        argparse.ArgumentTypeError: If the value is not a recognised truth
            string; argparse turns this into a usage error (exit status 2).
    """
    normalized = value.lower()
    if normalized in ("y", "yes", "t", "true", "on", "1"):
        return True
    if normalized in ("n", "no", "f", "false", "off", "0"):
        return False
    raise argparse.ArgumentTypeError(
        "invalid truth value %r (expected one of "
        "y/yes/t/true/on/1 or n/no/f/false/off/0)" % (value,))


def main(argv=None):
    """Parse the command line, run Philter over the notes and optionally evaluate.

    The function builds the configuration dictionary handed to ``Philter``,
    maps the filter coordinates, transforms the input notes and, when
    evaluation is enabled and the output format is "asterisk", calls the
    evaluation step.

    Args:
        argv: Optional list of argument strings. When None (the default),
            argparse reads ``sys.argv[1:]`` exactly as before.

    Returns:
        None.
    """
    # get input/output/filename
    help_str = """ Philter -- PHI filter for clinical notes """
    ap = argparse.ArgumentParser(description=help_str)
    ap.add_argument("-i", "--input", default="./data/i2b2_notes/",
                    help="Path to the directory or the file that contains the PHI note, the default is ./data/i2b2_notes/",
                    type=str)
    ap.add_argument("-a", "--anno", default="./data/i2b2_anno/",
                    help="Path to the directory or the file that contains the PHI annotation, the default is ./data/i2b2_anno/",
                    type=str)
    ap.add_argument("-o", "--output", default="./data/i2b2_results/",
                    help="Path to the directory to save the PHI-reduced notes in, the default is ./data/i2b2_results/",
                    type=str)
    ap.add_argument("-f", "--filters", default="./configs/integration_1.json",
                    help="Path to our config file, the default is ./configs/integration_1.json",
                    type=str)
    ap.add_argument("-x", "--xml", default="./data/phi_notes.json",
                    help="Path to the json file that contains all xml data",
                    type=str)
    ap.add_argument("-c", "--coords", default="./data/coordinates.json",
                    help="Path to the json file that contains the coordinate map data",
                    type=str)
    ap.add_argument("--eval_output", default="./data/phi/",
                    help="Path to the directory that the detailed eval files will be outputted to",
                    type=str)
    ap.add_argument("-v", "--verbose", default=True,
                    help="When verbose is true, will emit messages about script progress",
                    type=_str_to_bool)
    ap.add_argument("-e", "--run_eval", default=True,
                    help="When run_eval is true, will run our eval script and emit summarized results to terminal",
                    type=_str_to_bool)
    ap.add_argument("-t", "--freq_table", default=False,
                    help="When freqtable is true, will output a unigram/bigram frequency table of all note words and their PHI/non-PHI counts",
                    type=_str_to_bool)
    ap.add_argument("-n", "--initials", default=True,
                    help="When initials is true, will include initials PHI in recall/precision calculations",
                    type=_str_to_bool)
    # The help text now states the real default ("i2b2"); it previously
    # claimed "asterisk", which contradicted the default= value.
    ap.add_argument("--outputformat", default="i2b2",
                    help="Define format of annotation, allowed values are \"asterisk\", \"i2b2\". Default is \"i2b2\"",
                    type=str)
    ap.add_argument("--ucsfformat", default=False,
                    help="When ucsfformat is true, will adjust eval script for slightly different xml format",
                    type=_str_to_bool)
    ap.add_argument("--prod", default=False,
                    help="When prod is true, this will run the script with output in i2b2 xml format without running the eval script",
                    type=_str_to_bool)
    ap.add_argument("--cachepos", default=None,
                    help="Path to a directoy to store/load the pos data for all notes. If no path is specified then memory caching will be used.",
                    type=str)

    args = ap.parse_args(argv)

    run_eval = args.run_eval
    verbose = args.verbose

    if args.prod:
        # Production runs never evaluate and stay quiet, regardless of the
        # --run_eval / --verbose values given on the command line.
        run_eval = False
        verbose = False

        philter_config = {
            "verbose": verbose,
            "run_eval": run_eval,
            "finpath": args.input,
            "foutpath": args.output,
            "outformat": args.outputformat,
            "filters": args.filters,
            "cachepos": args.cachepos
        }
    else:
        # Outside prod mode the locals still equal the parsed arguments.
        philter_config = {
            "verbose": verbose,
            "run_eval": run_eval,
            "freq_table": args.freq_table,
            "initials": args.initials,
            "finpath": args.input,
            "foutpath": args.output,
            "outformat": args.outputformat,
            "ucsfformat": args.ucsfformat,
            "anno_folder": args.anno,
            "filters": args.filters,
            "xml": args.xml,
            "coords": args.coords,
            "eval_out": args.eval_output,
            "cachepos": args.cachepos
        }

    if verbose:
        print("RUNNING ", philter_config['filters'])

    filterer = Philter(philter_config)

    #map any sets, pos and regex groups we have in our config
    filterer.map_coordinates()

    #transform the data
    #Priority order is maintained in the pattern list
    filterer.transform()

    #evaluate the effectiveness
    # Evaluation only runs for the "asterisk" output format.
    # NOTE(review): the fn/fp/summary paths below are fixed and do not follow
    # --eval_output -- confirm whether that is intended.
    if run_eval and args.outputformat == "asterisk":
        filterer.eval(
            philter_config,
            in_path=args.output,
            anno_path=args.anno,
            anno_suffix=".txt",
            fn_output="data/phi/fn.txt",
            fp_output="data/phi/fp.txt",
            summary_output="./data/phi/summary.json",
            # raw string: "\*" in a plain literal is an invalid escape sequence
            phi_matcher=re.compile(r"\*+"),
            pre_process=r":|\,|\-|\/|_|~", #characters we're going to strip from our notes to analyze against anno
            only_digits=False,
            pre_process2=r"[^a-zA-Z0-9]",
            punctuation_matcher=re.compile(r"[^a-zA-Z0-9\*]"))

    # error analysis
# Run the command-line interface only when this file is executed as a
# script; importing the module does not trigger a run.
if __name__ == "__main__":
    main()