-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata.py
More file actions
55 lines (38 loc) · 1.49 KB
/
data.py
File metadata and controls
55 lines (38 loc) · 1.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import glob
import json
import os
import re
def load_json_from_file(file_path):
    """Read a JSON document from disk and return the decoded Python object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Pin the encoding: without it open() falls back to the platform locale,
    # which misdecodes (or rejects) non-ASCII text on non-UTF-8 systems.
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)
def open_content(path):
    """Load the 'train', 'dev', 'test' and 'labels' JSON files from a directory.

    Every ``*.json`` file directly inside ``path`` is loaded into each split
    whose name occurs in the file's base name (e.g. ``my_train.json`` fills
    the 'train' split).

    Args:
        path: Directory that contains the JSON files.

    Returns:
        A ``(train, dev, test, labels)`` tuple. A split with no matching file
        is returned as ``None``.
    """
    datasets = {'train': None, 'dev': None, 'test': None, 'labels': None}
    # glob returns files in arbitrary order; sorting makes the result
    # deterministic when more than one file matches the same split (the
    # last one in sorted order wins).
    for p in sorted(glob.glob(os.path.join(path, "*.json"))):
        # Match against the file name only. Testing the full path would let a
        # parent directory such as ".../train_data/" make every file look
        # like the 'train' split.
        file_name = os.path.basename(p)
        for key in datasets:
            if key in file_name:
                datasets[key] = load_json_from_file(p)
    return datasets['train'], datasets['dev'], datasets['test'], datasets['labels']
def get_word_positions(start_char, end_char, words):
    """Map a character span onto word indices.

    The words are laid out as if joined by exactly one separator character:
    word ``i`` starts where word ``i - 1`` ended plus one.

    Args:
        start_char: Character offset at which the span begins.
        end_char: Character offset just past the last character of the span.
        words: Sequence of word strings.

    Returns:
        A ``(start_word, end_word)`` tuple of word indices. ``start_word`` is
        the index of the word that begins exactly at ``start_char`` and
        ``end_word`` the index of the word that ends exactly at ``end_char``;
        either is ``None`` when no word lines up. Scanning stops at the word
        that matches ``end_char``.
    """
    first_idx = None
    last_idx = None
    offset = 0  # character offset at which the current word begins
    for idx, token in enumerate(words):
        token_end = offset + len(token)
        if offset == start_char:
            first_idx = idx
        if token_end == end_char:
            last_idx = idx
            break
        # Step over the word and the single separator that follows it.
        offset = token_end + 1
    return first_idx, last_idx
def process(data):
    """Convert one character-annotated sample into token-level NER format.

    Args:
        data: Mapping with a 'sentence' string and an 'entities' list. Each
            entity is a mapping with 'pos', a ``(start_char, end_char)`` pair
            of character offsets into the sentence (end exclusive), and
            'type', the entity label.

    Returns:
        ``{"tokens": [...], "ner": [(start_word, end_word, type), ...]}``
        where the tokens are the whitespace-separated words of the sentence
        and the indices refer to positions in that token list. An index is
        ``None`` when the corresponding offset does not fall on a token
        boundary.
    """
    # Tokenise with a regex instead of str.split() so that each token keeps
    # its real character span. Reconstructing offsets from the split words
    # assumes exactly one separator between tokens and drifts as soon as the
    # sentence has leading or repeated whitespace.
    token_matches = list(re.finditer(r"\S+", data['sentence']))
    words = [match.group() for match in token_matches]

    entities = []
    for entity in data['entities']:
        start_char, end_char = entity['pos']
        start_word, end_word = None, None
        for i, match in enumerate(token_matches):
            if match.start() == start_char:
                start_word = i
            if match.end() == end_char:
                end_word = i
                # Nothing after the closing token can belong to this entity.
                break
        entities.append((start_word, end_word, entity['type']))
    return {"tokens": words, "ner": entities}
def create_dataset(path):
    """Load the raw splits found under ``path`` and convert each sample.

    Args:
        path: Directory handed to ``open_content``.

    Returns:
        A ``(train_dataset, dev_dataset, test_dataset, labels)`` tuple. The
        first three are lists holding the ``process`` output for every sample
        of the respective split; ``labels`` is passed through exactly as
        ``open_content`` returned it.
    """
    raw_train, raw_dev, raw_test, label_set = open_content(path)
    # Convert the splits in a fixed order: train, then dev, then test.
    converted = [
        [process(sample) for sample in split]
        for split in (raw_train, raw_dev, raw_test)
    ]
    train_part, dev_part, test_part = converted
    return train_part, dev_part, test_part, label_set