-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCreateDataset.h
More file actions
161 lines (148 loc) · 7.04 KB
/
CreateDataset.h
File metadata and controls
161 lines (148 loc) · 7.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#pragma once //this line tells the compiler to only inlcude this file once
//#include <bits/stdc++.h>
#include <iostream> // cout, endl
#include <fstream> // ifstream, ofstream
#include <sstream> // stringstream
#include <string> // std::string
#include <set> // std::set
#include <random> // random_device, mt19937, distributions
#include <iomanip> // std::fixed, std::setprecision
#include "Constant.h"
//this code creates edge.csv ({U,V,Weight})
//this code creates nodes.csv ({id,domain,strength})
//strength --> indicator of a node's strength in their skill
//domain --> domain of their work
// Generates the edges and nodes CSV datasets from a raw edge list.
//
// The input is a text file with one edge per line, written as two
// whitespace-separated node ids ("u v"). Constructing a CreateData object
// immediately writes:
//   - EDGESFILEPATH : header "u,v,weight", then one row per edge with a random
//                     integer weight in [1,4]
//   - NODESFILEPATH : header "id,domain,strength", then one row per distinct
//                     node id with a random domain in [1,4] and a random
//                     strength in [0.0,10.0)
// EDGESFILEPATH, NODESFILEPATH, UNDIRECTED_GRAPH and the colour names
// (RED, GREEN, CYAN, YELLOW, RESET) are not declared in this file; presumably
// they come from Constant.h.
class CreateData
{
public:
    // Runs both generation steps on the edge list at FIlePath.
    // Failures are reported on std::cout; nothing is thrown by this class itself.
    CreateData (const std::string &FIlePath)
    {
        createEdgesCSV(FIlePath);
        createNodesCSV(FIlePath);
    }

private:
    // Splits one input line into its two endpoint ids.
    // Returns true only when both ids are present. u and v are cleared first so
    // that a short or blank line can never leak values from the previous line.
    static bool parseEdge(const std::string &line, std::string &u, std::string &v)
    {
        u.clear();
        v.clear();
        std::stringstream ss(line);
        ss >> u >> v;
        return !u.empty() && !v.empty();
    }

    // Reads the edge list at filePath and writes EDGESFILEPATH.
    // Lines that do not contain two ids are skipped.
    void createEdgesCSV(const std::string &filePath)
    {
        // Open the input first: opening the output truncates it, so a bad input
        // path must be detected before an existing edges file is destroyed.
        std::ifstream data(filePath);
        if(!data.is_open())
        {
            std::cout << RED << filePath << " not opened" << RESET << std::endl;
            return;
        }
        std::ofstream edges(EDGESFILEPATH);
        if(!edges.is_open())
        {
            std::cout << RED << "cannot open edges.txt file " << "Try Again" << RESET << "\n";
            return;
        }

        // <random> is used instead of rand()/srand() because rand() has predictable patterns
        std::random_device rd;                            // seed source
        std::mt19937 gen(rd());                           // pseudo-random engine
        std::uniform_int_distribution<> distribute(1,4);  // weight in [1,4], both inclusive

        std::cout << CYAN << "Reading from " << filePath << " for Edges" << RESET << std::endl;
        std::cout << CYAN << "-----------------" << RESET << std::endl;
        edges << "u,v,weight" << '\n'; // header

        std::string line;
        std::string u;
        std::string v;
        while(std::getline(data,line))
        {
            if(!parseEdge(line,u,v))
                continue;
            /* Weight chart:
               1 = LinkedIn / Acquaintance (Weak tie)
               2 = Colleague / Business Partner (Moderate tie)
               3 = Friend (Strong tie)
               4 = Close Friend / Confidant (Very strong tie) */
            const int weight = distribute(gen);
            edges << u << ',' << v << ',' << weight << '\n';
            // an undirected graph stores the reverse edge too, with the same weight
            if(UNDIRECTED_GRAPH)
                edges << v << ',' << u << ',' << weight << '\n';
        }

        // close() flushes; any earlier write error is still visible in the stream state
        edges.close();
        data.close();
        if(edges.fail())
        {
            std::cout << RED << "Writing the edges file failed" << RESET << std::endl;
            return;
        }
        std::cout << GREEN << "Succesfully wrote the edges.txt file" << RESET << std::endl;
    }

    // Collects every distinct node id that appears in a well-formed edge line of
    // FilePath into UniqueID. Uses the same line filter as createEdgesCSV, so
    // every collected id also appears in the edges file.
    // Returns false when FilePath cannot be opened, true otherwise.
    bool countUniqueIds(const std::string &FilePath,std::set<std::string> &UniqueID)
    {
        std::cout << CYAN << "Starting the counting of nodes" << RESET << std::endl;
        std::ifstream data(FilePath);
        if(!data.is_open())
        {
            std::cout << RED << "Counting of unique IDS failed" << RESET <<"\n";
            return false;
        }

        long long linesProcessed=0; // number of input lines read so far, valid or not
        std::string line;
        std::string node1;
        std::string node2;
        while(std::getline(data,line))
        {
            if(parseEdge(line,node1,node2))
            {
                UniqueID.insert(node1);
                UniqueID.insert(node2);
            }
            ++linesProcessed;
            if(linesProcessed%10000 == 0)
            {
                // progress report every 10000 lines
                std::cout << YELLOW << "Processed ..." << linesProcessed << " lines" << RESET << "\n";
            }
        }
        std::cout << GREEN << "Finished counting nodes" << RESET << '\n';
        return true;
    }

    // Writes NODESFILEPATH with one row per distinct node id found in filePath.
    // Rows follow std::set<std::string> ordering, i.e. ids sorted as strings.
    void createNodesCSV(const std::string &filePath)
    {
        // Gather the ids before touching the output so that an unreadable input
        // does not leave a truncated, header-only nodes file behind.
        std::set<std::string> UniqueID;
        if(!countUniqueIds(filePath,UniqueID))
            return;

        std::ofstream nodes(NODESFILEPATH);
        if(!nodes.is_open())
        {
            std::cout << RED << "Dataset/nodes.txt cannot be opened" << RESET << std::endl;
            return;
        }

        // <random> is used instead of rand()/srand() because rand() has predictable patterns
        std::random_device rd;                                           // seed source
        std::mt19937 gen(rd());                                          // pseudo-random engine
        std::uniform_int_distribution<> distribute_domain(1,4);          // domain in [1,4], both inclusive
        std::uniform_real_distribution<> distribute_strength(0.0,10.0);  // strength in [0.0,10.0), upper bound excluded

        std::cout << CYAN << "Reading from " << filePath << " for Nodes" << RESET << std::endl;
        std::cout << CYAN << "-----------------" << RESET << std::endl;
        nodes << "id,domain,strength" << '\n'; // header
        // formatting flags are sticky, so set them once: strength is printed with 2 decimals
        nodes << std::fixed << std::setprecision(2);

        for(const std::string &id : UniqueID)
        {
            /* Domain chart:
               1 = Finance
               2 = Health
               3 = AI / ML
               4 = Academia */
            const int random_domain = distribute_domain(gen);
            const double random_strength = distribute_strength(gen);
            nodes << id << ',' << random_domain << ',' << random_strength << '\n';
        }

        // close() flushes; any earlier write error is still visible in the stream state
        nodes.close();
        if(nodes.fail())
        {
            std::cout << RED << "Writing the nodes file failed" << RESET << std::endl;
            return;
        }
        std::cout << GREEN << "Succesfully wrote the nodes.txt file" << RESET << std::endl;
    }
};