-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathprocess_share.py
More file actions
113 lines (75 loc) · 2.97 KB
/
Copy pathprocess_share.py
File metadata and controls
113 lines (75 loc) · 2.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import csv
import json
import os
import re
import process_funtions as process
import read_files as read
def build_dataset(input_path,
                  output_path="data/share/processed/data_raw/test.tsv"):
    """Normalise the concept column of a mention file and save it as TSV.

    Every line of ``input_path`` is split on tabs and must yield exactly two
    fields, ``mention`` and ``concept`` (any other field count raises
    ``ValueError`` during unpacking). Dots are removed from the concept and
    surrounding whitespace is stripped, then:

    * a concept found in the ontology is kept as is;
    * a concept equal to "cui-less" (case-insensitive) becomes "CUI-less";
    * anything else is replaced by the first ontology entry and the original
      concept is recorded as unseen.

    The ontology size, the unseen concepts, their count and their distinct
    count are printed as a quick sanity report.

    Args:
        input_path: Path of the raw tab-separated ``mention<TAB>concept`` file.
        output_path: Destination TSV. Defaults to the path that used to be
            hard-coded, so existing calls behave as before.
    """
    rows = [line.split("\t") for line in read.textfile2list(input_path)]
    ontology = read.read_from_json("data/share/umls/cui_umls_for_share")
    print(len(ontology))

    # Build the membership index once instead of scanning the ontology for
    # every row. NOTE(review): assumes the ontology entries are hashable
    # (CUI strings) -- confirm against the JSON file.
    known_concepts = set(ontology)

    processed = []
    unseen = []
    for mention, concept in rows:
        concept = concept.replace(".", "").strip()
        if concept in known_concepts:
            processed.append([mention, concept])
        elif concept.lower() == "cui-less":
            processed.append([mention, "CUI-less"])
        else:
            # Unknown concept: fall back to the first ontology entry so the
            # row is still emitted, and remember what could not be resolved.
            processed.append([mention, ontology[0]])
            unseen.append(concept)

    print(unseen)
    print(len(unseen))
    print(len(set(unseen)))
    read.save_in_tsv(output_path, processed)
# input_path = "data/share/raw/train.txt"
# input_path = "data/share/raw/dev.txt"
# input_path = "data/share/raw/test.txt"
# build_dataset(input_path)
def data_st():
    """Attach a semantic-type token and entity markers to every test row.

    Reads ``[mention, concept]`` rows from the raw test TSV, looks up each
    concept with ``process.get_st_cui`` in the semantic-type table (extended
    with an entry for "CUI-less"), and writes
    ``[semantic_type_token, concept, "<e> mention </e>"]`` rows to the
    processed test TSV. Spaces inside the looked-up value are replaced by
    underscores.
    """
    rows = read.read_from_tsv("data/share/processed/data_raw/test.tsv")
    st_table = read.read_from_json("data/share/umls/cui_share_st")
    # Give the placeholder concept its own entry in the lookup table.
    st_table['CUI-less'] = ['CUI_less']

    def build_row(mention, concept):
        # presumably get_st_cui returns a space-separated string; verify
        # against process_funtions.
        st_text = process.get_st_cui(st_table, concept)
        st_token = '_'.join(st_text.split(" "))
        marked_mention = "<e> " + mention + " </e>"
        return [st_token, concept, marked_mention]

    tagged_rows = [build_row(mention, concept) for mention, concept in rows]
    read.save_in_tsv("data/share/processed/data/test.tsv", tagged_rows)
# data_st()
def combine_train_dev(train_repeats=10):
    """Write a training file made of the ontology rows plus repeated train rows.

    Loads the ontology TSV and the processed training TSV, appends the
    training rows ``train_repeats`` times after the ontology rows, and saves
    the result to ``data/share/processed/snomed+data/train.tsv``.

    Args:
        train_repeats: How many copies of the training rows to append.
            Defaults to 10, the factor that was previously hard-coded.
    """
    umls = read.read_from_tsv("data/share/snomed/ontology.tsv")
    train = read.read_from_tsv("data/share/processed/data/train.tsv")
    # Repeating the training rows weights them more heavily relative to the
    # ontology rows in the combined file.
    combined = umls + train * train_repeats
    read.save_in_tsv("data/share/processed/snomed+data/train.tsv", combined)
# combine_train_dev()
def share_input():
    """Build the flat synonym list and the per-CUI index ranges into it.

    Rows of the ontology TSV and the processed training TSV are pooled; each
    row is unpacked as ``[_, norm, mention]`` and the mention is registered
    under its ``norm`` via ``read.add_dict``. For every CUI in the CUI list
    the distinct mentions are appended to one flat list, and the half-open
    range ``(start, end)`` they occupy is stored under that CUI.

    The synonym list is saved as a one-column TSV and the range mapping as
    JSON under ``data/share/snomed_concept_train/``.

    Returns:
        A tuple ``(mentions, cui_mention_idx)`` where ``mentions`` is a list
        of single-element lists and ``cui_mention_idx`` maps each CUI to its
        ``(start, end)`` range in ``mentions``.

    Raises:
        KeyError: If a CUI from the CUI list has no mention in the pooled rows.
    """
    ontology = read.read_from_tsv("data/share/snomed/ontology.tsv")
    train = read.read_from_tsv("data/share/processed/data/train.tsv")
    rows = ontology + train
    cuis = read.read_from_json("data/share/umls/cui_umls_for_share")

    norm_mention = {}
    for _, norm, mention in rows:
        read.add_dict(norm_mention, norm, mention)

    mentions = []
    cui_mention_idx = {}
    start = 0
    for cui in cuis:
        # Sort the de-duplicated mentions: plain set iteration order changes
        # between interpreter runs, which made the saved files
        # non-reproducible.
        cui_mentions = sorted(set(norm_mention[cui]))
        mentions.extend(cui_mentions)
        end = start + len(cui_mentions)
        cui_mention_idx[cui] = (start, end)
        start = end

    # One synonym per TSV row.
    mentions = [[syn] for syn in mentions]
    print(len(cui_mention_idx))
    read.save_in_tsv("data/share/snomed_concept_train/ontology_synonyms.tsv", mentions)
    read.save_in_json("data/share/snomed_concept_train/ontology_concept_synonyms_idx",
                      cui_mention_idx)
    return mentions, cui_mention_idx
# share_input()