forked from gaolk/graph-database-benchmark
-
Notifications
You must be signed in to change notification settings - Fork 12
Expand file tree
/
Copy pathgenerate_twitter_inputs.py
More file actions
executable file
·62 lines (48 loc) · 1.86 KB
/
generate_twitter_inputs.py
File metadata and controls
executable file
·62 lines (48 loc) · 1.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import os
import sys
# Read the node input file and translate the input IDs into a contiguous range.
# Then, read the relation input file and translate all source and destination
# node IDs to their updated contiguous values. Finally, translate the IDs in
# the seed file the same way.

# Input filenames (looked up inside the user-provided input directory)
nodefile = 'twitter_rv.net_unique_node'
relfile = 'twitter_rv.net'
seedfile = 'twitter_rv.net-seed'

# Output data directory (relative to the current working directory)
datadir = 'data'


def _output_path(filename):
    """Return the path of the output file generated for input *filename*.

    The output lives in ``datadir`` and has every '.' in the input name
    replaced by '_'.
    """
    return os.path.join(datadir, filename.replace('.', '_'))


def _ensure_datadir():
    """Create ``datadir`` if it doesn't exist.

    Raises:
        OSError: if the directory cannot be created and does not already
            exist (e.g. permission denied, or a regular file is in the way).
    """
    try:
        os.mkdir(datadir)
    except OSError:
        # An already-existing directory is fine; anything else is a real error
        # that would otherwise only surface later as a confusing open() failure.
        if not os.path.isdir(datadir):
            raise


def _translate_nodes(lines, out):
    """Map every node ID to its (zero-based) position and write the new IDs.

    Writes an ``id`` header row followed by one updated ID per input node.
    Blank lines are skipped and do not consume an ID.

    Returns:
        dict mapping each original integer node ID to its updated ID.

    Raises:
        ValueError: if a non-blank line is not an integer.
    """
    placement = {}
    updated_id = 0
    out.write('id\n')  # Output a header row
    for line in lines:
        if not line.strip():
            continue
        placement[int(line)] = updated_id
        out.write('%d\n' % updated_id)
        updated_id += 1
    return placement


def _translate_relations(lines, placement, out):
    """Rewrite whitespace-separated ``src dst`` edges as ``src,dst`` rows.

    Both endpoints are replaced by their updated IDs from *placement*.
    Blank lines are skipped.

    Raises:
        ValueError: if a non-blank line does not hold exactly two integers.
        KeyError: if an endpoint does not appear in *placement*.
    """
    for line in lines:
        if not line.strip():
            continue
        # Tokenize every line and convert the data to ints
        src, dst = map(int, line.split())
        out.write('%d,%d\n' % (placement[src], placement[dst]))


def _translate_seeds(text, placement, out):
    """Write the whitespace-separated seed IDs in *text* as updated IDs.

    The output is a single space-separated line with no trailing newline.

    Raises:
        ValueError: if a token is not an integer.
        KeyError: if a seed ID does not appear in *placement*.
    """
    out.write(' '.join(str(placement[int(i)]) for i in text.split()))


def main(argv=None):
    """Generate the updated node, relation and seed files.

    Args:
        argv: argument vector; ``argv[1]`` is the input data directory.
            Defaults to ``sys.argv``.

    Returns:
        Process exit status: 0 on success, 1 when the input directory
        argument is missing or is not an existing directory.
    """
    if argv is None:
        argv = sys.argv

    # User-provided input data directory
    if len(argv) < 2 or not os.path.isdir(argv[1]):
        print("Usage: generate_inputs.py [path_to_inputs]")
        return 1
    inputdir = argv[1]

    # Create updated data directory if it doesn't exist
    _ensure_datadir()

    # Each input is opened before its output so that a missing input does not
    # truncate a previously generated output file; `with` guarantees every
    # handle is flushed and closed even when a translation step raises.
    with open(os.path.join(inputdir, nodefile)) as f:
        with open(_output_path(nodefile), 'w') as updated_node_file:
            placement = _translate_nodes(f, updated_node_file)

    with open(os.path.join(inputdir, relfile)) as f:
        with open(_output_path(relfile), 'w') as updated_relation_file:
            _translate_relations(f, placement, updated_relation_file)

    with open(os.path.join(inputdir, seedfile)) as f:
        with open(_output_path(seedfile), 'w') as updated_seed_file:
            _translate_seeds(f.read(), placement, updated_seed_file)

    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))