Merge branch 'master' of https://github.com/JetBrains-Research/CFPQ_Data

gsvgit · gsvgit · commit 552b9e512e1a · 2020-07-06T20:32:19.000+03:00
diff --git a/.gitignore b/.gitignore
@@ -118,3 +118,6 @@ GTgraph/
 log
 
 tmp.txt
+
+# MacOS directory structure meta
+/**/.DS_Store
diff --git a/tools/LUBM/README.md b/tools/LUBM/README.md
@@ -0,0 +1,19 @@
+# LUBM dataset
+
+## Folder structure
+- config.txt - configures mappings from IRI names to short string names (for our purposes)
+- download.sh - downloads and unzips the LUBM generator from the official website
+- generate.sh - generates dataset for 10 universities (this could be changed in the script: set -univ N)
+- prepare.py - renames the files of the generated dataset (optional)
+- converter.py - converts a set of database files into a single file with the mappings applied
+
+## Usage
+
+Download the Java-based LUBM generator, generate the dataset for 10 universities, and then,
+as an example, build a single database file with the mappings applied by merging 3 universities (indices 0..2).
+
+```
+$ bash download.sh
+$ bash generate.sh
+$ python3 converter.py univ\\University 3 config.txt
+```
diff --git a/tools/LUBM/config.txt b/tools/LUBM/config.txt
@@ -0,0 +1,16 @@
+http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#subOrganizationOf SO
+http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#worksFor WF
+http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#headOf HF
+http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#teacherOf TF
+http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#memberOf MF
+http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#advisor AD
+http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#takesCourse TC
+http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#publicationAuthor PA
+http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#undergraduateDegreeFrom UG
+http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#mastersDegreeFrom MD
+http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#doctoralDegreeFrom DD
+http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#emailAddress EA
+http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#name NM
+http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#researchInterest RI
+http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#telephone TP
+http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl#teachingAssistantOf TA
diff --git a/tools/LUBM/converter.py b/tools/LUBM/converter.py
@@ -0,0 +1,91 @@
+"""
+    RDF merger and converter for LUBM dataset.
+    Database is stored in files <prefix><id>_<sub_id>.owl
+    These files are merged for the specified number of universities (ids range),
+    and edge labels are replaced according to the specified mapping.
+    Vertex labels are also replaced with integer-based names.
+    
+    Usage:
+    - Create a conversion configuration file. Each line must contain an IRI,
+    a whitespace character and a string to replace the IRI by.
+    - Run converter.py <prefix> <count> <config>
+    - Result will have name <prefix><count>v<vertices count>e<edges count>.xml
+    
+    The graph will contain explicit inverted edges, labelled with an 'R' suffix.
+    """
+
+import rdflib, sys, os
+
+URI_PREFIX = 'http://yacc/'
+MAX_FILES_PER_UNI = 30
+
+# RDF serialization
+def write_to_rdf(target, graph):
+    graph.serialize(target + '.xml', format='xml')
+
+# Edge addition (grapf constructing)
+def add_rdf_edge(subj, pred, obj, graph):
+    s = rdflib.BNode('id-%s' % (subj))
+    p = rdflib.URIRef(URI_PREFIX + pred)
+    o = rdflib.BNode('id-%s' % (obj))
+    graph.add((s, p, o))
+
+if len(sys.argv) < 3:
+    print('Usage: converter.py <prefix> <count> <config>')
+    exit()
+
+replace = {} # map for replacing predicates
+config = sys.argv[3]
+for l in open(config,'r').readlines():
+  pair = l.split(' ')
+  old = rdflib.URIRef(pair[0].strip(' '))
+  new = pair[1].strip('\n').strip(' ')
+  replace[old] = new
+
+print(replace)
+
+res = {}        # map from resources to integer ids
+next_id = 0     # id counter
+edges_count = 0 # Total edges
+
+graph = rdflib.Graph()
+prefix = sys.argv[1]
+count = int(sys.argv[2])
+
+processed = []
+notreplaced = set()
+
+for i in range(0,count):
+  for j in range(0,MAX_FILES_PER_UNI):
+    filename = prefix + str(i) + '_' + str(j) + '.owl'
+    try:
+      g = rdflib.Graph()
+      g.parse(filename)
+      
+      for s,p,o in g:
+        for r in [s,o]:
+          if r not in res:
+            res[r] = str(next_id)
+            next_id += 1
+
+        if p in replace:
+          add_rdf_edge(res[s], replace[p], res[o], graph)
+          add_rdf_edge(res[s], replace[p] + 'R', res[o], graph)
+          edges_count += 2
+        else:
+          add_rdf_edge(res[s], 'OTHER', res[o], graph)
+          edges_count += 1
+          notreplaced.add(p)
+
+      processed.append(filename)
+      print('Merged:', filename)
+    except Exception:
+      pass
+
+target = prefix + str(count) + 'v' + str(next_id) + 'e' + str(edges_count)  # output file
+write_to_rdf(target,graph)
+
+print('Total vertices:', next_id)
+print('Total edges:', edges_count)
+print('Processed files:\n', processed)
+print('Not replaced labels:', notreplaced)
diff --git a/tools/LUBM/download.sh b/tools/LUBM/download.sh
@@ -0,0 +1,2 @@
+# Download the LUBM generator archive and unpack it into ./univ.
+# Unpacking is skipped when the download fails.
+wget http://swat.cse.lehigh.edu/projects/lubm/uba1.7.zip && unzip uba1.7.zip -d univ
diff --git a/tools/LUBM/generate.sh b/tools/LUBM/generate.sh
@@ -0,0 +1,3 @@
+# Run the LUBM generator from inside ./univ; stop if the directory is missing
+# so the generator is never started in the wrong location.
+cd univ || exit 1
+java -cp classes edu.lehigh.swat.bench.uba.Generator -univ 10 -onto http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl
+cd ..
diff --git a/tools/LUBM/prepare.py b/tools/LUBM/prepare.py
@@ -0,0 +1,17 @@
+import os, sys
+
+if len(sys.argv) < 2:
+    print('Usage: prepare.py <prefix> <new>')
+    exit()
+
+prefix = sys.argv[1]
+new = sys.argv[2]
+files = os.listdir()
+
+for f in files:
+  if f.startswith(prefix):
+    name = f.replace(prefix,new)
+    try:
+      os.rename(f, name)
+    except Exception:
+      print('Failed to rename file: ' + f + ' to ' + name)

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+wget http://swat.cse.lehigh.edu/projects/lubm/uba1.7.zip`
	`2`	`+unzip uba1.7.zip -d univ`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+cd univ`
	`2`	`+java -cp classes edu.lehigh.swat.bench.uba.Generator -univ 10 -onto http://www.lehigh.edu/~zhp2/2004/0401/univ-bench.owl`
	`3`	`+cd ..`