-
Notifications
You must be signed in to change notification settings - Fork 26
Expand file tree
/
Copy pathEdgelistMaker.py
More file actions
48 lines (37 loc) · 1.33 KB
/
EdgelistMaker.py
File metadata and controls
48 lines (37 loc) · 1.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
"""
This script maps integers 0...n to all nodes in the csv containing the edges for a graph.
It then saves all of these mappings and a converted graph into csvs
"""
import pandas as pd
import numpy as np
# Input edge list (the graph downloaded from the neo4j instance) and the two
# files this script produces.
GRAPH_CSV = 'data/graph.csv'
EDGELIST_PATH = 'data/rel.edgelist'
MAP_CSV = 'data/map.csv'


def build_node_map(edges):
    """Return a dict mapping every curie in *edges* to a consecutive integer id.

    Ids start at 0. All distinct values of the ``source`` column are numbered
    first, in order of first appearance, followed by the values of the
    ``target`` column that never occur as a source, also in order of first
    appearance.

    Args:
        edges: DataFrame with ``source`` and ``target`` columns.

    Returns:
        dict of curie -> integer id.
    """
    node_ids = {}
    for column in ('source', 'target'):
        # Series.unique() drops duplicates while keeping first-appearance
        # order, so the numbering is deterministic for a given input file.
        for curie in edges[column].unique():
            # setdefault leaves an already-numbered curie untouched; a new
            # one receives the next free id, which is the current dict size.
            node_ids.setdefault(curie, len(node_ids))
    return node_ids


def convert_edges(edges, node_ids):
    """Return a copy of *edges* with curies replaced by their integer ids.

    Only the ``source`` and ``target`` columns are rewritten; any other
    column is carried over unchanged and the column order is preserved.
    The input DataFrame is not modified.

    Args:
        edges: DataFrame with ``source`` and ``target`` columns.
        node_ids: dict of curie -> integer id, as built by build_node_map.

    Returns:
        A new DataFrame holding the converted edge list.
    """
    # Series.map performs the dict lookup for the whole column at once, which
    # replaces a per-row Python loop and does not depend on the index labels.
    return edges.assign(
        source=edges['source'].map(node_ids),
        target=edges['target'].map(node_ids),
    )


def main():
    """Read the graph, number its nodes and write the edge list and the map."""
    edges = pd.read_csv(GRAPH_CSV)
    node_ids = build_node_map(edges)
    converted = convert_edges(edges, node_ids)

    # Saves the curie -> integer id map and the converted graph
    map_df = pd.DataFrame(list(node_ids.items()), columns=['curie', 'id'])
    converted.to_csv(EDGELIST_PATH, sep=' ', header=False, index=False)
    map_df.to_csv(MAP_CSV, index=False)


# Run the conversion only when executed as a script, so the helpers above can
# be imported without touching the data files.
if __name__ == '__main__':
    main()