-
Notifications
You must be signed in to change notification settings - Fork 90
Expand file tree
/
Copy pathCommunity_Finder.cpp
More file actions
390 lines (308 loc) · 13.1 KB
/
Community_Finder.cpp
File metadata and controls
390 lines (308 loc) · 13.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
/*
* Community_Finder.cpp
*
* Created on: 04-Dec-2009
* Author: freid
*/
//Release canidate.
#include "Community_Finder.h"
//TIMEKEEPING
time_t t0 = clock();;
//REPORTING
#define REPORTING_OUTPUT_FREQUENCY 1000
//PARAMETER STORAGE
float Community_Finder::minimumOverlapToMerge;
float Community_Finder::numberOfTimesRequiredToBeSpokenFor;
float Community_Finder::spokenForThresholdOfUniqueness;
//SORT PREDICATES
//Note - flipped order in sort function - returns them sorted in reverse order - biggest first
bool sizeSortFunctionLargestFirst (Seed * two, Seed * one)
{
return one->getNumberOfNodes() < two->getNumberOfNodes();
}
bool sizeSortFunctionSmallestFirst (Seed * one, Seed * two)
{
return one->getNumberOfNodes() < two->getNumberOfNodes();
}
bool vectorSizeSortFunctionLargestFirst (vector <V> two, vector <V> one)
{
return one.size() < two.size();
}
bool vectorSizeSortFunctionSmallestFirst (vector <V> one, vector <V> two)
{
return one.size() < two.size();
}
//END SORT PREDICATES
//CONSTRUCTURORS
Community_Finder::Community_Finder(const char * filename, int minimumCliqueSize, float minimumOverlapToMerge, float alphaValueForFitness, float numberOfTimesRequiredToBeSpokenFor, float spokenForThresholdOfUniqueness) {
Community_Finder::minimumOverlapToMerge = minimumOverlapToMerge;
Seed::minimumOverlapToMerge = minimumOverlapToMerge;
Seed::alphaValueForFitness = alphaValueForFitness;
Community_Finder::numberOfTimesRequiredToBeSpokenFor = numberOfTimesRequiredToBeSpokenFor;
Community_Finder::spokenForThresholdOfUniqueness = spokenForThresholdOfUniqueness;
this->initialiseSeeds(filename, minimumCliqueSize);
}
Community_Finder::~Community_Finder() {
for (vector< Seed * >::iterator seedItr = this->seeds.begin(); seedItr != this->seeds.end(); ++seedItr)
{
delete (*seedItr);
}
}
//INTERFACE WITH CLIQUE FINDING CODE
int numberOfCliquesProcessed = 0;
void Community_Finder::operator () (const vector<V> & clique) { // V is simply typedef'd to an int. I suppose it might become a long if we deal with graphs with over 2 billion nodes/edges.
// WARNING: Do NOT take a pointer to the clique. It will be invalidated by the clique finding code upon returning. You must copy the data into your own structures.
vector <V > temp;
for ( vector<V>::const_iterator cliqueVertexIterator = (clique).begin(); cliqueVertexIterator != (clique).end(); ++cliqueVertexIterator)
{
temp.push_back(*cliqueVertexIterator);
}
this->cliques.push_back(temp);
numberOfCliquesProcessed++;
if (numberOfCliquesProcessed % REPORTING_OUTPUT_FREQUENCY == 0)
{
fprintf(stderr, "%.2fs: ",(double)(clock()-t0)/CLOCKS_PER_SEC);
cerr << "Processed: " << numberOfCliquesProcessed << " cliques..." << endl;
}
}
//Responsible for taking the initialised structures, and operating the algorithm on them.
void Community_Finder::run()
{
// print graph
/*
for (int i = 0; i < theGlobalGraph.vertex_count;i++)
{
pair <V*,V*> startAndEnd = theGlobalGraph.neighbours(i);
cerr << "Neighbours of vertex " << i << " : ";
for (V * otherVertex = startAndEnd.first; otherVertex != startAndEnd.second; otherVertex++)
{
cerr << " " << *otherVertex;
}
cerr << endl;
}
*/
fprintf(stderr, "%.2fs: ",(double)(clock()-t0)/CLOCKS_PER_SEC);
fprintf(stderr, "%.2fs: ",(double)(clock()-t0)/CLOCKS_PER_SEC);
cerr << "--------------------------------" << endl;
cerr << "Number of seeds: " << this->seeds.size() << endl;
sort(this->seeds.begin(),this->seeds.end(),sizeSortFunctionLargestFirst);
vector< Seed* > resultsVec;
int numberSeedsDiscardedBeforeExpansion = 0;
int numberSeedsDiscardedAfterExpansion = 0;
int numberSeedsKept = 0;
int numberSeedsProcessed = 0;
//for each seed
bool issueAlphaWarning = false;
for (vector< Seed * >::iterator seedItr = this->seeds.begin(); seedItr != this->seeds.end(); ++seedItr)
{
(*seedItr)->putIntoNodeToSeedsCache();
if (numberSeedsProcessed % REPORTING_OUTPUT_FREQUENCY == 0)
{
fprintf(stderr, "%.2fs: ",(double)(clock()-t0)/CLOCKS_PER_SEC);
cerr << "Processed: " << numberSeedsProcessed << " seeds\n";
if(issueAlphaWarning)
{
cerr << "Warning: size of growing communities exceeds probable size: try increasing Alpha value." << endl;
}
issueAlphaWarning = false;
}
numberSeedsProcessed++;
bool alreadyCounted = false;
alreadyCounted = (*seedItr)->overlapsAlreadyAcceptedSeed();
if(!alreadyCounted)
{
(*seedItr)->updateCachedEdgeValuesFromScratch();
(*seedItr)->updateFrontierFromScratch();
//expand to first peak fitness, within threshold
while ( (*seedItr)->addBestNodeFromFrontierToSeed() > 0)
{
}
if( (*seedItr)->getNumberOfNodes() > (theGlobalGraph.vertex_count / 4 ))
{
if(!issueAlphaWarning)
{
cerr << "Warning: size of growing communities exceeds probable size: try increasing Alpha value." << endl;
issueAlphaWarning = true;
}
}
alreadyCounted = (*seedItr)->overlapsAlreadyAcceptedSeed();
if(!alreadyCounted)
{
// cerr << "Was not a duplicate\n";
//add it to results
numberSeedsKept++;
resultsVec.push_back((*seedItr));
}
else
{
// cerr << "Was a duplicate\n";
numberSeedsDiscardedAfterExpansion++;
}
(*seedItr)->clearCaches();
}
else
{
numberSeedsDiscardedBeforeExpansion++;
}
if(alreadyCounted)
{
(*seedItr)->dead = true;
(*seedItr)->removeFromNodeToSeedsList();
}
}
fprintf(stderr, "%.2fs: ",(double)(clock()-t0)/CLOCKS_PER_SEC);
cerr << "Number seeds discarded before expansion: " << numberSeedsDiscardedBeforeExpansion << " seeds\n";
cerr << "Number seeds discarded after expansion: " << numberSeedsDiscardedAfterExpansion << " seeds\n";
cerr << "Number seeds kept: " << numberSeedsKept << " seeds\n";
this->sweepTheDead();
//TODO make this more efficient in future.
this->seeds.assign(resultsVec.begin(), resultsVec.end());
}
//This is the CCH implementation, which gets the number of times each node is 'spoken for' by another clique.
//In our release, we always use '1' as the value for this number.
//It also checks what percentage of each candidate clique has then being 'spoken for' or 'covered'. If this percentage
//is above a threshold, then the clique is rejected as being insufficiently distinct.
//This function uses the following static parameters: Community_Finder::numberOfTimesRequiredToBeSpokenFor (normally set to 1)
// and Community_Finder::spokenForThresholdOfUniqueness, normally set to around .75
void Community_Finder::doSpokenForPruning()
{
sort(this->seeds.begin(),this->seeds.end(),sizeSortFunctionLargestFirst);
cerr << "Cliques sorted. About to run spokenFor pruning..." << "number of seeds before: " << this->seeds.size()<< endl;
vector <int> spokenForTimes( theGlobalGraph.vertex_count );
int numberOfCliquesProcessed = 0;
for (vector< Seed * >::iterator seedItr = this->seeds.begin(); seedItr != this->seeds.end(); ++seedItr)
{
numberOfCliquesProcessed++;
if (numberOfCliquesProcessed % REPORTING_OUTPUT_FREQUENCY == 0)
{
fprintf(stderr, "%.2fs: ",(double)(clock()-t0)/CLOCKS_PER_SEC);
cerr << "Processed: " << numberOfCliquesProcessed << " cliques..." << endl;
}
int numberOfNodesAlreadySpokenFor = 0;
for (set<V>::iterator nodeItr = (*seedItr)->getNodes().begin(); nodeItr != (*seedItr)->getNodes().end(); ++nodeItr)
{
int numberOfTimesRequiredToBeSpokenFor = Community_Finder::numberOfTimesRequiredToBeSpokenFor;
if (spokenForTimes[(*nodeItr)] > numberOfTimesRequiredToBeSpokenFor)
{
numberOfNodesAlreadySpokenFor++;
}
}
//As the amount of nodes spoken for gets large, this value increases towards 1
float proportionOfNodesAlreadySpokenFor = (float)numberOfNodesAlreadySpokenFor / (float)(*seedItr)->getNodes().size();
if (proportionOfNodesAlreadySpokenFor >= Community_Finder::spokenForThresholdOfUniqueness)
{
//if the proportion of nodes that have been taken is greater than the threshold, then discard this clique.
// cerr << "pruning, number of nodes spoken for: " << (float)numberOfNodesAlreadySpokenFor << " size of seed: " << (float)(*seedItr)->getNodes().size() << "value: " << (float)numberOfNodesAlreadySpokenFor / (float)(*seedItr)->getNodes().size() << endl;
(*seedItr)->dead = true;
}
else
{
//this clique is good
//so tag its nodes as taken.
for (set<V>::iterator nodeItr = (*seedItr)->getNodes().begin(); nodeItr != (*seedItr)->getNodes().end(); ++nodeItr)
{
spokenForTimes[(*nodeItr)]++;
}
}
}
this->sweepTheDead();
cerr << "SpokenFor complete. Number of seeds after: "<< this->seeds.size() << endl;
}
//Responsible for using the clique finding code (which is an implementation of bron-kerbosch) to
//generate the maximal cliques in the graph.
//These cliques are then stored (in the callback) into the 'cliques' vector.
//This 'cliques' vector is then loaded into our 'Seed' objects, which are again stored in a vector.
//The Seed objects at this point are really just responsible for storing the initial, core, cliques.
//We then do the 'spoken for pruning' - or the CCH as it is also called, to prune out some of the highly
//degenerate cliques.
//After this, we consider the set of seeds to be initialised, and this function returns.
//Next, the 'run' function will be called to
//grow them into the candidate communities.
//TODO this implementation stores _every_ clique into memory before any are pruned using the CCH.
//this should later be optimised to run a clique at a time, only storing those not found to be discarded by the CCH, to save memory.
void Community_Finder::initialiseSeeds(const char * filename, int minimumCliqueSize)
{
graph_loading::loadSimpleIntGraphFromFile(theGlobalGraph, filename);
cliques::findCliquesJustIDs(theGlobalGraph, *this, minimumCliqueSize);
cerr << "Loaded : " << this->cliques.size() << " cliques" << endl;
//cliques now has all the cliques. sort it by size.
nodeToSeeds.clear();
nodeToSeeds.resize(theGlobalGraph.vertex_count);
numberOfCliquesProcessed = 0;
for ( vector < vector <V> >::iterator cliqueItr = this->cliques.begin(); cliqueItr != this->cliques.end(); ++cliqueItr)
{
Seed * mySeed = new Seed();
for (vector<V>::iterator innerCliqueItr = (*cliqueItr).begin(); innerCliqueItr != (*cliqueItr).end(); ++innerCliqueItr)
{
//mySeed->addNode((*innerCliqueItr));
mySeed->addNodeNoCaching((*innerCliqueItr));
}
this->seeds.push_back(mySeed);
numberOfCliquesProcessed++;
if (numberOfCliquesProcessed % REPORTING_OUTPUT_FREQUENCY == 0)
{
fprintf(stderr, "%.2fs: ",(double)(clock()-t0)/CLOCKS_PER_SEC);
cerr << "Processed: " << numberOfCliquesProcessed << " cliques..." << endl;
cerr << "Current size of clique being processed: " << (*cliqueItr).size() << endl;
}
}
//remove initial clique storage
this->cliques.clear();
//Run spoken for optimisation.
this->doSpokenForPruning();
fprintf(stderr, "%.2fs: ",(double)(clock()-t0)/CLOCKS_PER_SEC);
cerr << "Total number of seeds remaining: " << this->seeds.size() << endl;
}
//Utility function which discards seeds that have been marked as dead.
//Remove all the seeds that have previously been marked as 'dead' from the vector storing the seeds
//This is necessary because we store the seeds in a vector.
void Community_Finder::sweepTheDead()
{
//makes use of temporary vector
//This temporary vector means we can write all the seeds to the new vector without having to delete and reallocate as we go.
vector <Seed*> newVector;
cerr << "Sweeping the seeds marked for deletion\n";
for (vector< Seed * >::iterator seedItr = this->seeds.begin(); seedItr != this->seeds.end();++seedItr)
{
if ((*seedItr)->dead)
{
delete *seedItr;
}
else
{
newVector.push_back(*seedItr);
}
}
this->seeds.assign(newVector.begin(), newVector.end());
cerr << "Done\n";
}
//Seeds have some cached values.
//They cache a map of nodes that make up their frontier.
//The values in this map are the number of internal and external edges that each of these nodes has - ie, the internal and external degree of the frontier node.
//We also cache the total internal and external degree of all the nodes inside the seed not on the frontier.
void Community_Finder::refreshAllSeedInternalCaches()
{
for (vector< Seed * >::iterator seedItr = this->seeds.begin(); seedItr != this->seeds.end(); ++seedItr)
{
(*seedItr)->updateCachedEdgeValuesFromScratch();
(*seedItr)->updateFrontierFromScratch();
}
}
//TODO look up node names in map, here
void Community_Finder::rawPrint()
{
for (vector< Seed * >::iterator seedItr = this->seeds.begin(); seedItr != this->seeds.end(); ++seedItr)
{
//(*seedItr)->rawPrint();
(*seedItr)->rawPrintInOrderOfAddition();
//(*seedItr)->prettyPrintFrontier();
//(*seedItr)->prettyPrintFrontier();
}
}
void Community_Finder::printSeeds()
{
for (vector< Seed * >::iterator seedItr = this->seeds.begin(); seedItr != this->seeds.end(); ++seedItr)
{
(*seedItr)->prettyPrint();
}
}