Skip to content

Commit 552b9e5

Browse files
committed
2 parents 1fe0294 + 796f29a commit 552b9e5

7 files changed

Lines changed: 151 additions & 0 deletions

File tree

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,3 +118,6 @@ GTgraph/
118118
log
119119

120120
tmp.txt
121+
122+
# MacOS directory structure meta
123+
/**/.DS_Store

tools/LUBM/README.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# LUBM dataset
2+
3+
## Folder structure
4+
- config.txt - configures mappings from IRI names to short string names (for our purposes)
5+
- download.sh - downloads and unzips the LUBM generator from the official website
6+
- generate.sh - generates dataset for 10 universities (this could be changed in the script: set -univ N)
7+
- prepare.py - renames the generated dataset files by replacing a file name prefix (optional)
8+
- converter.py - converts a set of database files into a single file with mappings applied
9+
10+
## Usage
11+
12+
Download the LUBM Java-based generator tool. Run dataset generation for 10 universities, and then
13+
generate a database file with mappings applied for 3 merged universities (indices 0..2) as an example.
14+
15+
```
16+
$ bash download.sh
17+
$ bash generate.sh
18+
$ python3 converter.py univ/University 3 config.txt
19+
```

tools/LUBM/config.txt

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#subOrganizationOf SO
2+
http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#worksFor WF
3+
http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#headOf HF
4+
http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#teacherOf TF
5+
http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#memberOf MF
6+
http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#advisor AD
7+
http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#takesCourse TC
8+
http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#publicationAuthor PA
9+
http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#undergraduateDegreeFrom UG
10+
http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#mastersDegreeFrom MD
11+
http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#doctoralDegreeFrom DD
12+
http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#emailAddress EA
13+
http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#name NM
14+
http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#researchInterest RI
15+
http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#telephone TP
16+
http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#teachingAssistantOf TA

tools/LUBM/converter.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
"""
2+
RDF merger and converter for LUBM dataset.
3+
Database is stored in files <prefix><id>_<sub_id>.owl
4+
These files are merged for the specified number of universities (ids range),
5+
and edges are replaced with specified mapping.
6+
Vertex labels are also replaced with integer-based names.
7+
8+
Usage:
9+
- Create a conversion configuration file. Each line must contain an IRI,
10+
a whitespace character and a string to replace the IRI by.
11+
- Run converter.py <prefix> <count> <config>
12+
- Result will have name <prefix><count>v<vertices count>e<edges count>.xml
13+
14+
The graph will contain explicit inverted edges, labelled with an appended 'R'.
15+
"""
16+
17+
import rdflib, sys, os
18+
19+
URI_PREFIX = 'http://yacc/'
20+
MAX_FILES_PER_UNI = 30
21+
22+
# RDF serialization
23+
def write_to_rdf(target, graph):
    """Serialize ``graph`` in RDF/XML format to the file ``target`` + '.xml'.

    ``target`` is the output path without extension; ``graph`` is any object
    exposing ``serialize(destination, format=...)``.
    """
    destination = target + '.xml'
    graph.serialize(destination, format='xml')
25+
26+
# Edge addition (graph construction)
27+
def add_rdf_edge(subj, pred, obj, graph):
    """Add the edge ``subj --pred--> obj`` to ``graph``.

    Both endpoints become blank nodes named 'id-<value>', and the predicate
    becomes a URI reference built from URI_PREFIX followed by ``pred``.
    """
    triple = (
        rdflib.BNode('id-%s' % (subj)),
        rdflib.URIRef(URI_PREFIX + pred),
        rdflib.BNode('id-%s' % (obj)),
    )
    graph.add(triple)
32+
33+
# All three positional arguments are required: sys.argv[3] (the config path)
# is read further down, so fewer than 4 argv entries would raise IndexError.
if len(sys.argv) < 4:
    print('Usage: converter.py <prefix> <count> <config>')
    # Non-zero status so that calling scripts can detect the usage error.
    sys.exit(1)
36+
37+
# Map for replacing predicates: IRI (as rdflib.URIRef) -> short label string.
replace = {}
config = sys.argv[3]
# 'with' guarantees the config file is closed once it has been read.
with open(config, 'r') as config_file:
    for line in config_file:
        # Split on any run of whitespace so tabs, repeated spaces and CRLF
        # line endings do not leak into the IRI or the label.
        pair = line.split()
        if len(pair) < 2:
            # Blank or incomplete line: nothing to map.
            continue
        replace[rdflib.URIRef(pair[0])] = pair[1]

print(replace)
46+
47+
res = {}  # map from resources (subjects and objects) to integer ids, kept as strings
next_id = 0  # id counter; also the number of distinct resources seen so far
edges_count = 0  # total number of edges passed to add_rdf_edge

graph = rdflib.Graph()
prefix = sys.argv[1]
count = int(sys.argv[2])

processed = []  # names of the files that were merged
notreplaced = set()  # predicates that have no mapping in the config

for i in range(count):
    for j in range(MAX_FILES_PER_UNI):
        filename = prefix + str(i) + '_' + str(j) + '.owl'

        # Every index up to MAX_FILES_PER_UNI is probed, so a missing file
        # is expected and skipped silently.
        if not os.path.isfile(filename):
            continue

        g = rdflib.Graph()
        try:
            g.parse(filename)
        except Exception as err:
            # Keep the merge best-effort, but report the file that was
            # dropped instead of hiding the failure.
            print('Failed to parse:', filename, '-', err, file=sys.stderr)
            continue

        for s, p, o in g:
            # Assign the next free integer id to every resource not seen before.
            for r in (s, o):
                if r not in res:
                    res[r] = str(next_id)
                    next_id += 1

            if p in replace:
                add_rdf_edge(res[s], replace[p], res[o], graph)
                # The inverted edge runs from object to subject and carries
                # the same label with an 'R' suffix.
                add_rdf_edge(res[o], replace[p] + 'R', res[s], graph)
                edges_count += 2
            else:
                # Predicates without a mapping share the 'OTHER' label and
                # are collected for the final report.
                add_rdf_edge(res[s], 'OTHER', res[o], graph)
                edges_count += 1
                notreplaced.add(p)

        processed.append(filename)
        print('Merged:', filename)
84+
85+
# Output file name (without extension): <prefix><count>v<vertices>e<edges>
target = '%s%dv%de%d' % (prefix, count, next_id, edges_count)
write_to_rdf(target, graph)

# Summary of the conversion, one caption/value pair per line.
summary = (
    ('Total vertices:', next_id),
    ('Total edges:', edges_count),
    ('Processed files:\n', processed),
    ('Not replaced labels:', notreplaced),
)
for caption, value in summary:
    print(caption, value)

tools/LUBM/download.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Download the LUBM generator archive and unpack it into ./univ.
# '&&' keeps unzip from running when the download failed.
wget http://swat.cse.lehigh.edu/projects/lubm/uba1.7.zip &&
unzip uba1.7.zip -d univ

tools/LUBM/generate.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Run the LUBM generator from inside ./univ for 10 universities (-univ N).
# The subshell restores the caller's working directory automatically, and
# the script aborts if ./univ does not exist instead of running java elsewhere.
(
  cd univ || exit 1
  java -cp classes edu.lehigh.swat.bench.uba.Generator -univ 10 -onto http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl
)

tools/LUBM/prepare.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import os, sys
2+
3+
# Rename every file in the current directory whose name starts with <prefix>,
# substituting <new> for that leading prefix.

# Both <prefix> and <new> are required: sys.argv[2] is read below, so fewer
# than 3 argv entries would raise IndexError.
if len(sys.argv) < 3:
    print('Usage: prepare.py <prefix> <new>')
    # Non-zero status so that calling scripts can detect the usage error.
    sys.exit(1)

prefix = sys.argv[1]
new = sys.argv[2]

for f in os.listdir():
    if not f.startswith(prefix):
        continue

    # Swap only the leading prefix; later occurrences of the same text in
    # the file name are left untouched.
    name = new + f[len(prefix):]
    try:
        os.rename(f, name)
    except OSError:
        # Best effort: report the failure and continue with the other files.
        print('Failed to rename file: ' + f + ' to ' + name)

0 commit comments

Comments
 (0)