forked from dbpedia/fact-extractor
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreate_crowdflower_input.py
More file actions
111 lines (100 loc) · 4.74 KB
/
create_crowdflower_input.py
File metadata and controls
111 lines (100 loc) · 4.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/env python
# coding: utf-8
import codecs
import csv
import json
import os
import re
import sys
from collections import defaultdict
from nltk import RegexpParser
from get_meaningful_sentences import CHUNKER_GRAMMAR, load_pos_data
# Path of the LU -> frame map, resolved against the current working directory.
LU_FRAME_MAP_LOCATION = 'resources/lu-frame-map.json'
# Loaded once at import time. As used in this file, each key is a lexical unit
# (LU) whose value is a dict holding an optional 'tokens' list plus one entry
# per frame name, each mapping to a list of frame element (FE) names.
# The context manager closes the file handle as soon as parsing is done.
with open(LU_FRAME_MAP_LOCATION) as _lu_frame_map_file:
    LU_FRAME_MAP = json.load(_lu_frame_map_file)
# Flat list of every LU token; stays empty until load_all_tokens() is called.
TOKENS = []
def load_all_tokens():
    """Append every token listed in LU_FRAME_MAP to the module-level TOKENS.

    Entries without a 'tokens' value (missing or empty) contribute nothing.
    The list is extended in place, so calling this twice duplicates tokens.
    """
    for lu_entry in LU_FRAME_MAP.values():
        lu_tokens = lu_entry.get('tokens')
        if lu_tokens:
            TOKENS.extend(lu_tokens)
def add_chunk_data(pos_data):
    """Chunk every POS-tagged sentence and collect its 'SN' chunks.

    :param pos_data: dict mapping a sentence ID to the tagged data handed to
        the chunker (presumably a list of (token, pos) pairs -- confirm
        against load_pos_data)
    :return: dict mapping each sentence ID to ``{'chunks': [...]}``, where
        every chunk is the space-joined tokens of one subtree labelled 'SN'
    """
    parser = RegexpParser(CHUNKER_GRAMMAR)

    def is_sn(subtree):
        # Only subtrees labelled 'SN' by the grammar are kept
        return subtree.label() == 'SN'

    chunks_by_sentence = {}
    for sentence_id, tagged in pos_data.iteritems():
        tree = parser.parse(tagged)
        phrases = []
        for subtree in tree.subtrees(is_sn):
            words = [word for word, _tag in subtree.leaves()]
            phrases.append(' '.join(words))
        chunks_by_sentence[sentence_id] = {'chunks': phrases}
    return chunks_by_sentence
def _find_lu_frames(token):
    """Return ``(lu, frame_names)`` for the LU whose token list holds *token*.

    When several LUs list the token, the last one in iteration order wins.
    Returns ``(None, [])`` when no LU lists it.
    """
    found_lu, frames = None, []
    for lu, entry in LU_FRAME_MAP.items():
        tokens = entry.get('tokens')
        # Check if the LU token exists in the preloaded LU_FRAME_MAP
        if tokens and token in tokens:
            found_lu = lu
            # Every key except 'tokens' is a frame name
            frames = [frame for frame in entry if frame != 'tokens']
    return found_lu, frames


def _chunk_columns(sentence, chunks, linked_entities):
    """Build the frame-independent fe{j} / entity{j} / type{j_k} columns."""
    columns = {}
    for j, current_np in enumerate(chunks):
        columns['fe' + str(j)] = current_np
        for linked in linked_entities:
            # Retrieve entity in the sentence based on indices
            entity_string = sentence[linked['start']:linked['end']]
            if entity_string in current_np:
                # A later entity found in the same chunk overwrites this one
                columns['entity' + str(j)] = entity_string
                for k, entity_type in enumerate(linked['types']):
                    # Drops the first 28 characters of the type; presumably
                    # the 'http://dbpedia.org/ontology/' prefix -- TODO confirm
                    columns['type' + str(j) + '_' + str(k)] = entity_type[28:]
    return columns


# Assumes entity linked sentences have the <strong> tag
def prepare_crowdflower_input(entity_linked_dir, chunk_data):
    """Build the CrowdFlower input rows, one per (sentence, frame) pair.

    Walks ``entity_linked_dir`` recursively. A file is used when its name, up
    to the first dot, is a sentence ID present in ``chunk_data``. Each file is
    read as UTF-8 JSON; its first key is taken as the sentence and the value
    under that key as the list of linked entities (dicts with 'start', 'end'
    and 'types').

    The LU token is the text inside the first ``<strong>`` tag of the
    sentence. Sentences without the tag are reported and skipped; previously
    they raised a NameError or silently reused the token of the previous file.

    :param entity_linked_dir: root directory of the entity linked JSON files
    :param chunk_data: dict mapping sentence IDs to ``{'chunks': [...]}``
    :return: list of dicts with keys id, sentence, lu, frame, fe_name{i},
        fe{j}, entity{j}, type{j_k}; all values are UTF-8 encoded
    """
    lu_pattern = re.compile(r'<strong>([^<]+)</strong>')
    input_data = []
    # Walk into entity linked dir
    for path, subdirs, files in os.walk(entity_linked_dir):
        for name in files:
            # Get sentence ID based on file naming convention {number}.json
            row_id = name.split('.')[0]
            if row_id not in chunk_data:
                continue
            file_path = os.path.join(path, name)
            # Load entity linked JSON file, closing the handle right away
            with codecs.open(file_path, 'rb', 'utf-8') as handle:
                entity_linked_data = json.load(handle)
            if not entity_linked_data:
                print("Empty entity linked data -> %s" % file_path)
                continue
            sentence = next(iter(entity_linked_data))
            # Extract LU token based on the <strong> tag
            match = lu_pattern.search(sentence)
            if not match:
                # Without a token no LU can be looked up for this sentence
                print("No match in sentence -> %s" % sentence)
                continue
            lu, frames = _find_lu_frames(match.group(1))
            if not frames:
                continue
            # Chunk and entity columns do not depend on the frame: build once
            shared_columns = _chunk_columns(
                sentence, chunk_data[row_id]['chunks'],
                entity_linked_data[sentence])
            for frame in frames:
                # Fresh row per frame, so fe_name{i} columns of a previous
                # frame with more FEs cannot leak into this one
                input_row = {'id': row_id, 'sentence': sentence,
                             'lu': lu, 'frame': frame}
                # Store FE, chunks and linked entities with incremental numbers
                # fe_name{i}, fe{j}, entity{j}, type{j_k}
                for i, fe_name in enumerate(LU_FRAME_MAP[lu][frame]):
                    input_row['fe_name' + str(i)] = fe_name
                input_row.update(shared_columns)
                # Prepare input for DictWriter, since it won't write UTF-8
                input_data.append(
                    {k: v.encode('utf-8') for k, v in input_row.items()})
    return input_data
def write_input_spreadsheet(input_data, outfile='input-data.csv'):
    """Write the input rows to a CSV spreadsheet.

    The header is the sorted union of the keys of all rows plus a '_golden'
    column, which no row fills here (presumably reserved for CrowdFlower gold
    units -- confirm). Rows lacking a column get an empty cell.

    :param input_data: list of dicts, as built by prepare_crowdflower_input
    :param outfile: path of the CSV file to create or overwrite
    :return: 0
    """
    # Merge all the keys to prepare the CSV headers
    fields = {key for row in input_data for key in row}
    fields.add('_golden')
    # 'wb' is the Python 2 csv convention; the context manager guarantees the
    # rows are flushed and the handle is closed before returning
    with open(outfile, 'wb') as handle:
        writer = csv.DictWriter(handle, sorted(fields))
        writer.writeheader()
        writer.writerows(input_data)
    return 0
if __name__ == "__main__":
    # Accepted forms: script POS_DATA ENTITY_LINKED_DIR [OUTPUT_FILE]
    argc = len(sys.argv)
    if argc not in (3, 4):
        print("Usage: python %s <POS_DATA_DIR> <ENTITY_LINKED_DATA_DIR> [OUTPUT_FILE]" % __file__)
        sys.exit(1)
    pos_data = load_pos_data(sys.argv[1])
    chunk_data = add_chunk_data(pos_data)
    input_data = prepare_crowdflower_input(sys.argv[2], chunk_data)
    if argc == 4:
        outfile = sys.argv[3]
        write_input_spreadsheet(input_data, outfile)
    else:
        # No output path given: fall back to the function's default file name
        write_input_spreadsheet(input_data)