-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathMAIN_MAGIC_DB.py
More file actions
98 lines (83 loc) · 3.79 KB
/
MAIN_MAGIC_DB.py
File metadata and controls
98 lines (83 loc) · 3.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
from MAGIC import Magic
from rdflib_hdt import HDTStore
from ink.base.connectors import AbstractConnector
from rdflib import Graph
import awena
import glob
import pandas as pd
from tqdm import tqdm
import json
import requests
# IRIs / IRI prefixes handed to the Magic base class as its skip list
# (see DBMagic.__init__). Presumably these are predicates and namespaces the
# annotator should ignore -- confirm against MAGIC.Magic.
skip_list_db = [
    'http://schema.org/',
    'http://www.w3.org/2004/02/skos/core',
    'http://www.w3.org/2002/07/owl#sameAs',
    'http://dbpedia.org/property/wikiPageUsesTemplate',
    'http://dbpedia.org/ontology/wikiPageWikiLink',
    'http://purl.org/dc/terms/subject',
    'http://www.w3.org/2002/07/owl#Thing',
]

# NOTE: a `global` statement at module scope has no effect; it only signals
# that `g` (the rdflib graph read by HDTConnector) is a module-level name,
# which is bound in the `__main__` section of this file.
global g
class DBMagic(Magic):
    """Magic annotator configured with DBpedia-specific constants.

    The constructor forwards the table settings to ``Magic`` together with
    the DBpedia property namespace, an rdf:type key and ``skip_list_db``.
    ``search_entity_api`` resolves a cell value to candidate DBpedia
    resource IRIs through a REST service on ``localhost:2222``.
    """

    def __init__(self, connector, structured_file, header, index_col, main_col):
        """Initialise the base ``Magic`` with DBpedia settings.

        All five arguments are passed through unchanged to
        ``Magic.__init__``; their exact meaning is defined there.
        """
        super().__init__(
            connector,
            structured_file,
            header,
            index_col,
            main_col,
            'http://dbpedia.org/property/',
            # NOTE(review): the trailing '§' is kept verbatim; it looks like
            # a separator the base class expects -- confirm before changing.
            'http://www.w3.org/1999/02/22-rdf-syntax-ns#type§',
            skip_list_db,
        )

    @staticmethod
    def _as_list(value):
        """Return ``value`` unchanged if it is a list, else wrap it in one."""
        return value if isinstance(value, list) else [value]

    def search_entity_api(self, entity):
        """Return candidate DBpedia resource IRIs for ``entity``.

        Posts ``entity`` (with confidence 0.1) to
        ``http://localhost:2222/rest/candidates`` and collects every
        ``@uri`` found under ``annotation -> surfaceForm -> resource``,
        prefixed with ``http://dbpedia.org/resource/``.

        The lookup is best-effort: any failure (service unreachable,
        non-JSON reply, missing keys) yields an empty list rather than an
        exception.

        Args:
            entity: Text to look up.

        Returns:
            A list of resource IRIs; empty when nothing was found or the
            lookup failed.
        """
        payload = {'confidence': 0.1, 'text': entity}
        headers = {
            'Accept': 'application/json',
        }
        try:
            response = requests.post(
                'http://localhost:2222/rest/candidates',
                headers=headers,
                data=payload,
            )
            surface_forms = json.loads(response.text)['annotation']['surfaceForm']
            candidates = []
            # The service returns a single object instead of a one-element
            # list for both 'surfaceForm' and 'resource'; normalise both.
            for form in self._as_list(surface_forms):
                candidates.extend(
                    'http://dbpedia.org/resource/' + resource['@uri']
                    for resource in self._as_list(form['resource'])
                )
        except Exception:
            # Deliberately broad to keep the lookup best-effort, but no
            # longer a bare `except:` so KeyboardInterrupt / SystemExit
            # still stop a long batch run.
            return []
        return candidates
class HDTConnector(AbstractConnector):
    """Connector that runs SPARQL queries against the module-level graph ``g``.

    ``g`` is not created here; it must be bound at module level before any
    query method is called (the ``__main__`` section of this file does so).
    """

    def __init__(self):
        """Tag this connector with the 'rdflib' database type."""
        self.db_type = 'rdflib'

    def query(self, q_str):
        """Run ``q_str`` on ``g`` and return the JSON result bindings.

        The result is serialised to JSON and the ``results -> bindings``
        list of that document is returned.
        """
        outcome = g.query(q_str)
        serialized = outcome.serialize(format="json")
        return json.loads(serialized)['results']['bindings']

    def query_relation(self, ind, rel):
        """Fetch objects of ``<ind> <rel> ?o`` together with their English labels.

        Returns the bindings for ``?o`` and ``?l``, where ``?l`` is an
        rdfs:label of ``?o`` whose language tag matches "EN".
        """
        q_str = (
            "\n"
            "SELECT ?o ?l WHERE {\n"
            f"<{ind!s}> <{rel!s}> ?o.\n"
            "?o <http://www.w3.org/2000/01/rdf-schema#label> ?l .\n"
            'FILTER (langMatches( lang(?l), "EN" ) )\n'
            "}\n"
        )
        return self.query(q_str)

    def query_column(self, rel):
        """Fetch up to 250 randomly ordered ``?s <rel> ?o`` pairs.

        ``?l`` is listed in the SELECT clause but never bound by the
        pattern, exactly as in the original query text.
        """
        q_str = (
            "\n"
            "SELECT ?s ?o ?l WHERE {\n"
            f"?s <{rel!s}> ?o.\n"
            "}\n"
            "ORDER BY RAND()\n"
            "LIMIT 250\n"
        )
        return self.query(q_str)
if __name__ == '__main__':
    # Open the DBpedia HDT dump and expose it as the module-level graph `g`
    # that HDTConnector's query methods read.
    store = HDTStore("/users/bsteenwi/dbpedia_hdt/dbpedia2016-10.hdt")
    g = Graph(store)
    connector = HDTConnector()

    # Target file without a header row: column 0 is compared against the
    # table name and column 1 supplies the `main_col` values to annotate.
    cta_targets = pd.read_csv("CTA_DBP_Round1_Targets.csv", header=None)

    # Names of tables already processed, so a rerun can skip them.
    # A missing or empty progress file means nothing has been done yet;
    # previously this crashed the script before any table was processed.
    try:
        done = set(pd.read_csv("DBPediaR1_DB_cta.txt", header=None)[0].values)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        done = set()

    for file in tqdm(glob.glob('/users/bsteenwi/dbpedia_hdt/code/DBPediaR1/*.csv')):
        # Table name = file name up to its first dot.
        name = file.split('/')[-1].split('.')[0]
        print(name)
        if name in done:
            continue
        for main_col in cta_targets[cta_targets[0] == name][1].values:
            annotator = DBMagic(connector, file, 0, None, main_col)
            annotator.annotate()
            annotator.export_files("DBPediaR1_DB")