-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbuilder.h
More file actions
1947 lines (1663 loc) · 102 KB
/
Copy pathbuilder.h
File metadata and controls
1947 lines (1663 loc) · 102 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#pragma once
#include <algorithm>
#include <array>
#include <atomic>
#include <chrono>
#include <functional>
#include <iostream>
#include <mutex>
#include <random>
#include <span>
#include <thread>
#include <unordered_map>
#include <unordered_set>
#include "analysis.h"
#include "concurrent.h"
#include "graph.h"
namespace deglib::builder {
/**
* A UnionFind class to represent disjoint set.
* https://www.tutorialspoint.com/cplusplus-program-to-implement-disjoint-set-data-structure
**/
class UnionFind {
private:
uint32_t default_value;
std::unordered_map<uint32_t, uint32_t> parents; // TODO replace with robin_map
public:
/**
* Reserves space in the internal map
*/
UnionFind(int expected_size) {
parents.reserve(expected_size);
default_value = std::numeric_limits<uint32_t>::max();
}
/**
* get the default value if an element in not in the unsion
*/
uint32_t getDefaultValue() { return default_value; }
/**
* Find the root of the set in which element belongs
*/
uint32_t Find(uint32_t l) const {
auto it = parents.find(l);
if (it == parents.end()) return default_value;
auto entry = it->second;
if (entry == l) // if l is root
return l;
return Find(entry); // recurs for parent till we find root
}
/**
* perform Union of two subsets element1 and element2
*/
void Union(uint32_t m, uint32_t n) {
uint32_t x = Find(m);
uint32_t y = Find(n);
Update(x, y);
}
/**
* If the parents are known via find this method can be called instead of union
*/
void Update(uint32_t element, uint32_t parent) { parents[element] = parent; }
};
/**
* A group of vertices which can reach each other. Some of them might be missing edges.
* A vertex index is associated with this group to make it unique.
*/
struct ReachableGroup {
uint32_t vertex_index_;
std::unordered_set<uint32_t> missing_edges_; // TODO replace with robin_set
std::unordered_set<uint32_t> reachable_vertices_; // TODO replace with robin_set
ReachableGroup(uint32_t vertex_index, uint32_t expected_size) : vertex_index_(vertex_index) {
missing_edges_.reserve(expected_size);
reachable_vertices_.reserve(expected_size);
missing_edges_.insert(vertex_index);
reachable_vertices_.insert(vertex_index);
}
/**
* removed the element from the list of vertices with missing edges
*/
void hasEdge(uint32_t element) { missing_edges_.erase(element); }
/**
* return the vertex associated with this group
*/
uint32_t getVertexIndex() const { return vertex_index_; }
/**
* get the number of vertices which can be reached by this group
*/
size_t size() const { return reachable_vertices_.size(); }
/**
* get the number of vertices in this group which are missing an edge
*/
size_t getMissingEdgeSize() const { return missing_edges_.size(); }
/**
* get the vertices which are missing an edges
*/
const auto& getMissingEdges() { return missing_edges_; }
/**
* Copy the data from the other group to this group
*/
void copyFrom(ReachableGroup& otherGroup) {
// skip if both are the same object
if (vertex_index_ == otherGroup.vertex_index_) return;
missing_edges_.insert(otherGroup.missing_edges_.begin(), otherGroup.missing_edges_.end());
reachable_vertices_.insert(otherGroup.reachable_vertices_.begin(), otherGroup.reachable_vertices_.end());
// std::copy(otherGroup.missing_edges_.begin(), otherGroup.missing_edges_.end(), std::back_inserter(missing_edges_));
// std::copy(otherGroup.reachable_vertices_.begin(), otherGroup.reachable_vertices_.end(), std::back_inserter(reachable_vertices_));
}
};
/**
* Information about an graph edge. The edge might be no longer part of the graph
*/
struct GraphEdge {
uint32_t from_vertex;
uint32_t to_vertex;
float weight;
GraphEdge(uint32_t from_vertex, uint32_t to_vertex, float weight) : from_vertex(from_vertex), to_vertex(to_vertex), weight(weight) {}
};
/**
* Task to add a vertex to the graph
*/
struct BuilderAddTask {
uint32_t label;
uint64_t manipulation_index;
std::vector<std::byte> feature;
BuilderAddTask(uint32_t lbl, uint64_t index, std::vector<std::byte> feat)
: label(lbl), manipulation_index(index), feature(std::move(feat)) {}
};
/**
* Task to remove a vertex to the graph
*/
struct BuilderRemoveTask {
uint32_t label;
uint64_t manipulation_index;
BuilderRemoveTask(uint32_t lbl, uint64_t index) : label(lbl), manipulation_index(index) {}
};
/**
* Every graph change can be document with this struct. Needed to eventually revert back same changed.
*/
struct BuilderChange {
uint32_t internal_index;
uint32_t from_neighbor_index;
float from_neighbor_weight;
uint32_t to_neighbor_index;
float to_neighbor_weight;
BuilderChange(uint32_t internalIdx, uint32_t fromIdx, float fromWeight, uint32_t toIdx, float toWeight)
: internal_index(internalIdx),
from_neighbor_index(fromIdx),
from_neighbor_weight(fromWeight),
to_neighbor_index(toIdx),
to_neighbor_weight(toWeight) {}
};
/**
* Status of the build process.
* The process performs within a so called "step" a series of changes.
* A step is either a series of graph improvement tries or the
* addition/deletion of a vertex followed be the improvement tries.
* The build process can only be stopped between two steps.
*/
struct BuilderStatus {
uint64_t step; // number of graph manipulation steps
uint64_t added; // number of added vertices
uint64_t deleted; // number of deleted vertices
uint64_t improved; // number of successful improvement
uint64_t tries; // number of improvement tries
};
/**
* Information about the data distribution help to switch between different graph extension strategies.
* Below 15 = Low (schemeD in the paper)
* Above 15 = High (schemeC in the paper)
* For data with distribution Shifts Unknown is better.
*/
enum OptimizationTarget {
StreamingData_SchemeA, // Streaming or shifting distributions (just for the ablation study)
StreamingData_SchemeB, // Streaming or shifting distributions (just for the ablation study)
StreamingData_SchemeC, // Streaming or shifting distributions (default in the paper)
StreamingData_SchemeD, // Streaming or shifting distributions (just for the ablation study)
HighLID, // Optimized for datasets with high local intrinsic dimensionality (schemeC in the paper)
LowLID, // Optimized for datasets with low local intrinsic dimensionality (schemeD in the paper)
SchemeA, // Only for research purposes
SchemeB // Only for research purposes
};
class EvenRegularGraphBuilder {
// Optimization target for the graph (StreamingData, HighLID, LowLID)
const OptimizationTarget optimizationTarget_;
const uint8_t extend_k_; // k value for extending the graph
const float extend_eps_; // eps value for extending the graph
const uint8_t improve_k_; // k value for improving the graph
const float improve_eps_; // eps value for improving the graph
const uint8_t max_path_length_; // max amount of changes before canceling an improvement try
const uint32_t swap_tries_; // number of improvement attempts per build step
const uint32_t additional_swap_tries_; // additional improvement attempts after a successful improvement
const bool use_rng_; // Enable RNG (Relative Neighborhood Graph) pruning during graph extension
const bool use_path_verification_; // Enable path verification during graph restoration
const bool use_simple_edge_swaps_; // Enable simple edge swaps during graph improvement
std::mt19937& rnd_; // Reference to a random number generator used for randomized operations
deglib::graph::MutableGraph& graph_; // Reference to the mutable graph being built and optimized
BuilderStatus build_status_; // Status of the build process (steps, added, deleted, improved, tries)
std::atomic<uint64_t> manipulation_counter_; // Counter for graph manipulations (add/remove operations)
std::deque<BuilderAddTask> new_entry_queue_; // Queue of new entries to be added to the graph
std::queue<BuilderRemoveTask> remove_entry_queue_; // Queue of entries to be removed from the graph
// the batch_size should be thread_count * thread_task_count thread_task_size
uint32_t extend_batch_size = 32; // the overall number of elements per batch
uint32_t extend_thread_count = 1; // number of concurrent threads
uint32_t extend_thread_task_size = 32; // each thread processed 32 elements per task
uint32_t extend_thread_task_count = 10; // there are 10 tasks per thread per batch
// Mutex for synchronizing graph extension in multi-threaded scenarios
mutable std::mutex extend_mutex;
// Flag to indicate if the build loop should stop
bool stop_building_ = false;
public:
/**
* @brief Constructs an EvenRegularGraphBuilder for building and optimizing a regular graph.
*
* @param graph Reference to a MutableGraph object to be built and optimized.
* @param rnd Reference to a random number generator (std::mt19937) used for randomized operations.
* @param optimization_target Optimization target, determines the graph extension strategy (StreamingData, HighLID, LowLID).
* @param extend_k Number of neighbors to consider when extending the graph.
* @param extend_eps Epsilon value for neighbor search during graph extension.
* @param improve_k Number of neighbors to consider when improving the graph.
* @param improve_eps Epsilon value for neighbor search during graph improvement.
* @param max_path_length Maximum number of edge swaps in a single improvement attempt (default: 5).
* @param swap_tries Number of improvement attempts per build step (default: 0).
* @param additional_swap_tries Additional improvement attempts after a successful improvement (default: 0).
* @param use_rng Enable RNG (Relative Neighborhood Graph) pruning during graph extension (default: true).
*
* This constructor initializes the builder with the provided parameters and sets up internal
* batching and threading parameters for efficient graph construction and optimization.
*/
EvenRegularGraphBuilder(deglib::graph::MutableGraph& graph,
std::mt19937& rnd,
const OptimizationTarget optimization_target,
const uint8_t extend_k,
const float extend_eps,
const uint8_t improve_k,
const float improve_eps,
const uint8_t max_path_length = 5,
const uint32_t swap_tries = 0,
const uint32_t additional_swap_tries = 0,
const bool use_rng = true,
const bool use_path_verification = false,
const bool use_simple_edge_swaps = false)
: optimizationTarget_(optimization_target),
extend_k_(extend_k),
extend_eps_(extend_eps),
improve_k_(improve_k),
improve_eps_(improve_eps),
max_path_length_(max_path_length),
swap_tries_(swap_tries),
additional_swap_tries_(additional_swap_tries),
use_rng_(use_rng),
use_path_verification_(use_path_verification),
use_simple_edge_swaps_(use_simple_edge_swaps),
rnd_(rnd),
graph_(graph),
build_status_() {
// each core processes extend_thread_batch_size element per tasks, there are 10 tasks per threads
extend_thread_count = std::thread::hardware_concurrency() / 2;
extend_batch_size = extend_thread_count * extend_thread_task_count * extend_thread_task_size;
}
/**
* @brief Constructs an EvenRegularGraphBuilder with default parameters for streaming data.
*
* @param graph Reference to a MutableGraph object to be built and optimized.
* @param rnd Reference to a random number generator (std::mt19937) used for randomized operations.
* @param swaps Number of improvement attempts per build step (used for both swap_tries and additional_swap_tries).
*
* This constructor is a convenience overload for quickly creating a builder for streaming data
* with default extension and improvement parameters.
*/
EvenRegularGraphBuilder(deglib::graph::MutableGraph& graph, std::mt19937& rnd, const uint32_t swaps)
: EvenRegularGraphBuilder(
graph, rnd, OptimizationTarget::StreamingData_SchemeC, graph.getEdgesPerVertex(), 0.1f, 0, 0.0f, 5, swaps, swaps) {}
/**
* @brief Constructs an EvenRegularGraphBuilder with default parameters and a single swap attempt.
*
* @param graph Reference to a MutableGraph object to be built and optimized.
* @param rnd Reference to a random number generator (std::mt19937) used for randomized operations.
*
* This constructor is a convenience overload for quickly creating a builder with minimal configuration.
*/
EvenRegularGraphBuilder(deglib::graph::MutableGraph& graph, std::mt19937& rnd) : EvenRegularGraphBuilder(graph, rnd, 1) {}
/**
* Provide the builder a new entry which it will append to the graph in the build() process.
*/
void addEntry(const uint32_t label, std::vector<std::byte> feature) {
auto manipulation_index = manipulation_counter_.fetch_add(1);
new_entry_queue_.emplace_back(label, manipulation_index, std::move(feature));
}
/**
* Command the builder to remove a vertex from the graph as fast as possible.
*/
void removeEntry(const uint32_t label) {
auto manipulation_index = manipulation_counter_.fetch_add(1);
remove_entry_queue_.emplace(label, manipulation_index);
}
/**
* Numbers of entries which will be added to the graph
*/
size_t getNumNewEntries() { return new_entry_queue_.size(); }
/**
* Numbers of entries which will be removed from the graph
*/
size_t getNumRemoveEntries() { return remove_entry_queue_.size(); }
/**
* Set the number of threads used to extend the graph during building.
*
* When the thread count is greater than 1 and the optimization target is not StreamingData,
* the builder will utilize multiple threads to add elements to the graph in parallel.
* By default, all available CPU cores/threads are used unless specified.
* Note: The order in which elements are added is not guaranteed when using multiple threads.
*
* @param thread_count Number of threads which are used to extend the graph.
*/
void setThreadCount(uint32_t thread_count) {
extend_thread_count = thread_count;
extend_batch_size = extend_thread_count * extend_thread_task_count * extend_thread_task_size;
}
/**
* When adding multiple elements to the graph the builder processes them in batches.
* The elements in a batch are only available after the batch is fully processed.
*
* Depending on the thread count, optimization target and the desired latency
* (how fast changed are added to the graph), specify the batch size should be considered.
*
* e.g.
* thread count = 1 and batch size = 1: low throuput, medium latency, order of elements is guaranteed
* thread count > 1 and batch size = 1: high throuput, low latency, order of elements is not guaranteed
* thread count > 1 and batch size > 1: highest throuput, highest latency, order of elements is not guaranteed
* * Please note that the optimization target StreamingData only uses a thread count of 1.
*
* The batch size is calculated as:
* batch_size = thread_count * tasks_per_batch * task_size
* where
* thread_count = number of threads which are used to extend the graph
* tasks_per_batch = number of tasks in each batch
* task_size = number of elements each thread processes in one task
*
* A low tasks_per_batch improves the latency but reduces the throughput.
* Therefore it is recommended to use a higher task_size value.
*
* @param tasks_per_batch Number of tasks for each thread in one batch. (default: 32)
* @param task_size Number of elements each thread processes in one task. (default: 10)
*/
void setBatchSize(uint32_t tasks_per_batch, uint32_t task_size) {
extend_thread_task_size = task_size;
extend_thread_task_count = tasks_per_batch;
extend_batch_size = extend_thread_count * extend_thread_task_count * extend_thread_task_size;
}
/*
* Get the current batch size.
*/
uint32_t getBatchSize() const { return extend_batch_size; }
private:
/**
* Convert the queue into a vector with ascending distance order.
*
* @param queue The result set queue to convert.
* @return A vector of ObjectDistance sorted in ascending order.
*/
static auto topListAscending(deglib::search::ResultSet& queue) {
const auto size = (int32_t)queue.size();
auto topList = std::vector<deglib::search::ObjectDistance>(size);
for (int32_t i = size - 1; i >= 0; i--) {
topList[i] = queue.top();
queue.pop();
}
return topList;
}
/**
* Convert the queue into a vector with descending distance order.
*
* @param queue The result set queue to convert.
* @return A vector of ObjectDistance sorted in descending order.
*/
static auto topListDescending(deglib::search::ResultSet& queue) {
const auto size = queue.size();
auto topList = std::vector<deglib::search::ObjectDistance>(size);
for (size_t i = 0; i < size; i++) {
topList[i] = std::move(const_cast<deglib::search::ObjectDistance&>(queue.top()));
queue.pop();
}
return topList;
}
/**
* Extend the graph with a new vertex. Finds good existing vertices to which this new vertex gets connected.
*
* @param add_tasks The list of tasks describing vertices to add.
*/
void extendGraph(const std::vector<BuilderAddTask>& add_tasks) {
auto& graph = this->graph_;
// for computing distances to neighbors not in the result queue
const auto dist_func = graph.getFeatureSpace().get_dist_func();
const auto dist_func_param = graph.getFeatureSpace().get_dist_func_param();
// fully connect all vertices
uint32_t index = 0;
const auto edges_per_vertex = uint32_t(graph.getEdgesPerVertex());
while (graph.size() < edges_per_vertex + 1 && index < add_tasks.size()) {
const auto& add_task = add_tasks[index++];
const auto external_label = add_task.label;
// graph should not contain a vertex with the same label
if (graph.hasVertex(external_label)) {
std::fprintf(stderr, "graph contains vertex %u already. can not add it again \n", external_label);
std::perror("");
std::abort();
}
// add an empty vertex to the graph (no neighbor information yet)
const auto new_vertex_feature = add_task.feature.data();
const auto internal_index = graph.addVertex(external_label, new_vertex_feature);
// connect the new vertex to all other vertices in the graph
for (uint32_t i = 0; i < graph.size(); i++) {
if (i != internal_index) {
const auto dist = dist_func(new_vertex_feature, graph.getFeatureVector(i), dist_func_param);
graph.changeEdge(i, i, internal_index, dist);
graph.changeEdge(internal_index, internal_index, i, dist);
}
}
}
// LID is unknown for streaming data, so we add the new vertices one-by-one single threaded.
if (this->optimizationTarget_ == OptimizationTarget::StreamingData_SchemeA ||
this->optimizationTarget_ == OptimizationTarget::StreamingData_SchemeB ||
this->optimizationTarget_ == OptimizationTarget::StreamingData_SchemeC ||
this->optimizationTarget_ == OptimizationTarget::StreamingData_SchemeD) {
while (index < add_tasks.size()) extendGraphUnknownLID(add_tasks[index++]);
} else {
const auto remaining_add_tasks = std::vector<BuilderAddTask>(add_tasks.begin() + index, add_tasks.end());
auto batchExtendGraphKnownLID = [&](const std::vector<BuilderAddTask>& tasks, size_t task_index) {
const auto start_index = task_index * this->extend_thread_task_size;
const auto end_index = std::min(tasks.size(), (task_index + 1) * this->extend_thread_task_size);
for (size_t i = start_index; i < end_index; i++) extendGraphKnownLID(tasks[i]);
};
size_t task_count =
(remaining_add_tasks.size() / extend_thread_task_size) +
((remaining_add_tasks.size() % extend_thread_task_size != 0) ? 1 : 0); // +1, if n_queries % batch_size != 0
deglib::concurrent::parallel_for(0, task_count, extend_thread_count, 1, [&](size_t task_index, size_t thread_id) {
batchExtendGraphKnownLID(remaining_add_tasks, task_index);
});
}
}
/**
* The OptimizationTarget is StreamingData and we do not know the LID,
* add the new data one-by-one single threaded.
*
* @param add_task The task describing the vertex to add.
*/
void extendGraphUnknownLID(const BuilderAddTask& add_task) {
auto& graph = this->graph_;
const auto external_label = add_task.label;
const auto new_vertex_feature = add_task.feature.data();
const auto edges_per_vertex = uint32_t(graph.getEdgesPerVertex());
// find good neighbor candidates for the new vertex
auto distrib = std::uniform_int_distribution<uint32_t>(0, uint32_t(graph.size() - 1));
const std::vector<uint32_t> entry_vertex_indices = {distrib(this->rnd_)};
auto top_list = graph.search(
entry_vertex_indices,
new_vertex_feature,
this->extend_eps_,
std::max(uint32_t(this->extend_k_), edges_per_vertex * 2)); // need 2x otherwise it might lock during neighbor selection
const auto candidates = topListAscending(top_list);
// their should always be enough neighbors (search candidates), otherwise the graph would be broken
if (candidates.size() < edges_per_vertex) {
std::cerr << "the graph search for the new vertex " << external_label << "did only provide " << candidates.size()
<< " candidates" << std::endl;
std::perror("");
std::abort();
}
// add an empty vertex to the graph (no neighbor information yet)
const auto internal_index = graph.addVertex(external_label, new_vertex_feature);
// adding neighbors happens in two phases, the first tries to retain RNG, the second adds them without checking
bool check_rng_phase = this->use_rng_; // true = activated, false = deactived
// list of potential isolates vertices
auto isolated_vertices = std::vector<uint32_t>();
isolated_vertices.emplace_back(internal_index); // self loop needed for restore phase
// remove an edge of a good neighbor candidate and connect the candidate with the new vertex
auto slots = (uint32_t)edges_per_vertex - 1; // the new vertex will get an additional neighbor during the restore phase
while (slots > 0) {
for (size_t i = 0; i < candidates.size() && slots > 0; i++) {
const auto candidate_index = candidates[i].getInternalIndex();
const auto candidate_weight = candidates[i].getDistance();
// check if the vertex is already in the edge list of the new vertex (added during a previous loop-run)
// since all edges are undirected and the edge information of the new vertex does not yet exist, we search the other way
// around.
if (graph.hasEdge(candidate_index, internal_index)) continue;
// does the candidate has a neighbor which is connected to the new vertex and has a lower distance?
if (check_rng_phase &&
deglib::analysis::checkRNG(graph, edges_per_vertex, candidate_index, internal_index, candidate_weight) == false)
continue;
// the vertex is already missing an edge (one of its longer edges was removed during a previous iteration),
// just add a new edge between the candidate and the new vertex
if (graph.hasEdge(candidate_index, candidate_index)) {
graph.changeEdge(candidate_index, candidate_index, internal_index, candidate_weight);
graph.changeEdge(internal_index, internal_index, candidate_index, candidate_weight);
slots--;
continue;
}
// This version is good for high LID datasets or small graphs with a lot of equidistant vertices during ANNS
uint32_t new_neighbor_index = 0;
if (this->optimizationTarget_ == OptimizationTarget::StreamingData_SchemeA) {
// SchemeA: Connect the new vertex v to the most similar neighbor of the candidate.
// Select neighbor n with minimum distance to v.
const auto dist_func = graph.getFeatureSpace().get_dist_func();
const auto dist_func_param = graph.getFeatureSpace().get_dist_func_param();
float best_neighbor_distance = std::numeric_limits<float>::max();
const auto neighbor_indices = graph.getNeighborIndices(candidate_index);
for (size_t edge_idx = 0; edge_idx < edges_per_vertex; edge_idx++) {
const auto neighbor_index = neighbor_indices[edge_idx];
// the suggested neighbor might already be in the edge list of the new vertex
if (graph.hasEdge(neighbor_index, internal_index)) continue;
// is the neighbor already missing an edge?
if (graph.hasEdge(neighbor_index, neighbor_index)) continue;
const auto neighbor_distance =
dist_func(new_vertex_feature, graph.getFeatureVector(neighbor_index), dist_func_param);
if (neighbor_distance < best_neighbor_distance) {
best_neighbor_distance = neighbor_distance;
new_neighbor_index = neighbor_index;
}
}
// This can happens as each selected neighbor will be added to the neighor list of internal_index and removes an edges
// from another vertex. Which mean in each round this algorthm can eliminte two potential neighbor_index candidates of
// the next candidate. After 15 candiates 30 potential neighbor_index are already blocked and if all of them are
// neighbor of the current candidate_index than we will find nothing here.
if (best_neighbor_distance == std::numeric_limits<float>::max()) continue;
} else if (this->optimizationTarget_ == OptimizationTarget::StreamingData_SchemeB) {
// SchemeB: Remove the shortest edge of the candidate.
// Select neighbor n with minimum edge weight to the candidate.
float min_edge_weight = std::numeric_limits<float>::max();
const auto neighbor_indices = graph.getNeighborIndices(candidate_index);
const auto neighbor_weights = graph.getNeighborWeights(candidate_index);
for (size_t edge_idx = 0; edge_idx < edges_per_vertex; edge_idx++) {
const auto neighbor_index = neighbor_indices[edge_idx];
// the suggested neighbor might already be in the edge list of the new vertex
if (graph.hasEdge(neighbor_index, internal_index)) continue;
// is the neighbor already missing an edge?
if (graph.hasEdge(neighbor_index, neighbor_index)) continue;
const auto neighbor_weight = neighbor_weights[edge_idx];
if (neighbor_weight < min_edge_weight) {
min_edge_weight = neighbor_weight;
new_neighbor_index = neighbor_index;
}
}
// This can happens as each selected neighbor will be added to the neighor list of internal_index and removes an edges
// from another vertex. Which mean in each round this algorthm can eliminte two potential neighbor_index candidates of
// the next candidate. After 15 candiates 30 potential neighbor_index are already blocked and if all of them are
// neighbor of the current candidate_index than we will find nothing here.
if (min_edge_weight == std::numeric_limits<float>::max()) continue;
} else if (this->optimizationTarget_ == OptimizationTarget::StreamingData_SchemeC) {
// find the worst edge of the new neighbor
float new_neighbor_weight = std::numeric_limits<float>::lowest();
const auto neighbor_indices = graph.getNeighborIndices(candidate_index);
const auto neighbor_weights = graph.getNeighborWeights(candidate_index);
for (size_t edge_idx = 0; edge_idx < edges_per_vertex; edge_idx++) {
const auto neighbor_index = neighbor_indices[edge_idx];
// the suggested neighbor might already be in the edge list of the new vertex
if (graph.hasEdge(neighbor_index, internal_index)) continue;
// is the neighbor already missing an edge?
if (graph.hasEdge(neighbor_index, neighbor_index)) continue;
// find highest weighted neighbor
const auto neighbor_weight = neighbor_weights[edge_idx];
if (neighbor_weight > new_neighbor_weight) {
new_neighbor_weight = neighbor_weight;
new_neighbor_index = neighbor_index;
}
}
// This can happens as each selected neighbor will be added to the neighor list of internal_index and removes an edges
// from another vertex. Which mean in each round this algorthm can eliminte two potential neighbor_index candidates of
// the next candidate. After 15 candiates 30 potential neighbor_index are already blocked and if all of them are
// neighbor of the current candidate_index than we will find nothing here.
if (new_neighbor_weight == std::numeric_limits<float>::lowest()) continue;
} else if (this->optimizationTarget_ == OptimizationTarget::StreamingData_SchemeD) {
// SchemeD: find the edge which improves the distortion the most: (distance_new_edge1 + distance_new_edge2) -
// distance_removed_edge
const auto dist_func = graph.getFeatureSpace().get_dist_func();
const auto dist_func_param = graph.getFeatureSpace().get_dist_func_param();
float best_distortion = std::numeric_limits<float>::max();
const auto neighbor_indices = graph.getNeighborIndices(candidate_index);
const auto neighbor_weights = graph.getNeighborWeights(candidate_index);
for (size_t edge_idx = 0; edge_idx < edges_per_vertex; edge_idx++) {
const auto neighbor_index = neighbor_indices[edge_idx];
// the suggested neighbor might already be in the edge list of the new vertex
if (graph.hasEdge(neighbor_index, internal_index)) continue;
// is the neighbor already missing an edge?
if (graph.hasEdge(neighbor_index, neighbor_index)) continue;
// take the neighbor with the best distortion improvement
const auto neighbor_distance =
dist_func(new_vertex_feature, graph.getFeatureVector(neighbor_index), dist_func_param);
float distortion = (candidate_weight + neighbor_distance) - neighbor_weights[edge_idx];
if (distortion < best_distortion) {
best_distortion = distortion;
new_neighbor_index = neighbor_index;
}
}
// This can happens as each selected neighbor will be added to the neighor list of internal_index and removes an edges
// from another vertex. Which mean in each round this algorthm can eliminte two potential neighbor_index candidates of
// the next candidate. After 15 candiates 30 potential neighbor_index are already blocked and if all of them are
// neighbor of the current candidate_index than we will find nothing here.
if (best_distortion == std::numeric_limits<float>::max()) continue;
}
// place the new vertex in the edge list of the candidate_index and the new vertex internal_index
graph.changeEdge(candidate_index, new_neighbor_index, internal_index, candidate_weight);
graph.changeEdge(internal_index, internal_index, candidate_index, candidate_weight);
slots--;
// replace the edge to the candidate_index from the edge list of new_neighbor_index with a self-reference
graph.changeEdge(new_neighbor_index, candidate_index, new_neighbor_index, 0);
isolated_vertices.emplace_back(new_neighbor_index);
}
check_rng_phase = false;
}
// get all vertices which are missing an edge
isolated_vertices.erase(
std::remove_if(
isolated_vertices.begin(), isolated_vertices.end(), [&graph](int val) { return graph.hasEdge(val, val) == false; }),
isolated_vertices.end());
// restore the potential disconnected graph componenten
restoreGraph(isolated_vertices, false);
}
/**
* The OptimizationTarget is a known LID, use multi-threading to build the graph.
*
* @param add_task The task describing the vertex to add.
*/
void extendGraphKnownLID(const BuilderAddTask& add_task) {
auto& graph = this->graph_;
const auto external_label = add_task.label;
const auto new_vertex_feature = add_task.feature.data();
const auto edges_per_vertex = uint32_t(graph.getEdgesPerVertex());
// for computing distances to neighbors not in the result queue
const auto dist_func = graph.getFeatureSpace().get_dist_func();
const auto dist_func_param = graph.getFeatureSpace().get_dist_func_param();
// find good neighbors for the new vertex
// auto distrib = std::uniform_int_distribution<uint32_t>(0, uint32_t(graph.size() - 1));
// const std::vector<uint32_t> entry_vertex_indices = { distrib(this->rnd_) };
const std::vector<uint32_t> entry_vertex_indices = {0};
auto top_list = graph.search(
entry_vertex_indices, new_vertex_feature, this->extend_eps_, std::max(uint32_t(this->extend_k_), edges_per_vertex));
const auto results = topListAscending(top_list);
// their should always be enough neighbors (search results), otherwise the graph would be broken
if (results.size() < edges_per_vertex) {
std::fprintf(stderr, "the graph search for the new vertex %u did only provided %zu results \n", external_label, results.size());
std::perror("");
std::abort();
}
uint32_t internal_index = 0;
{
std::lock_guard<std::mutex> lock(this->extend_mutex);
std::atomic_thread_fence(std::memory_order_acquire);
// graph should not contain a vertex with the same label
if (graph.hasVertex(external_label)) {
std::fprintf(stderr, "graph contains vertex %u already. can not add it again\n", external_label);
perror("");
abort();
}
// add an empty vertex to the graph (no neighbor information yet)
internal_index = graph.addVertex(external_label, new_vertex_feature);
std::atomic_thread_fence(std::memory_order_release);
}
// adding neighbors happens in two phases, the first tries to retain RNG, the second adds them without checking
bool check_rng_phase = this->use_rng_; // true = activated, false = deactived
// remove an edge of the good neighbors and connect them with this new vertex
auto new_neighbors = std::vector<std::pair<uint32_t, float>>();
while (new_neighbors.size() < edges_per_vertex) {
for (size_t i = 0; i < results.size() && new_neighbors.size() < edges_per_vertex; i++) {
const auto candidate_index = results[i].getInternalIndex();
const auto candidate_weight = results[i].getDistance();
// check if the vertex is already in the edge list of the new vertex (added during a previous loop-run)
// since all edges are undirected and the edge information of the new vertex does not yet exist, we search the other way
// around.
if (graph.hasEdge(candidate_index, internal_index)) continue;
// does the candidate has a neighbor which is connected to the new vertex and has a lower distance?
if (check_rng_phase &&
deglib::analysis::checkRNG(graph, edges_per_vertex, candidate_index, internal_index, candidate_weight) == false)
continue;
// SchemeC: This version is good for high OptimizationTarget datasets or small graphs with low distance count limit during
// ANNS
uint32_t new_neighbor_index = 0;
float new_neighbor_distance = std::numeric_limits<float>::lowest();
if (this->optimizationTarget_ == HighLID) {
// find the worst edge of the new neighbor
float new_neighbor_weight = std::numeric_limits<float>::lowest();
const auto neighbor_indices = graph.getNeighborIndices(candidate_index);
const auto neighbor_weights = graph.getNeighborWeights(candidate_index);
for (size_t edge_idx = 0; edge_idx < edges_per_vertex; edge_idx++) {
const auto neighbor_index = neighbor_indices[edge_idx];
// if another thread is building the candidate_index at the moment, than its neighbor list contains self references
if (candidate_index == neighbor_index) continue;
// the suggested neighbor might already be in the edge list of the new vertex
if (graph.hasEdge(neighbor_index, internal_index)) continue;
// the weight of the neighbor might not be worst than the current worst one
const auto neighbor_weight = neighbor_weights[edge_idx];
if (neighbor_weight > new_neighbor_weight) {
new_neighbor_weight = neighbor_weight;
new_neighbor_index = neighbor_index;
}
}
// Single Threaded: should not be possible, otherwise the new vertex is connected to every vertex in the neighbor-list
// of the result-vertex and still has space for more Multi Threaded: can happen if the mutation threads replaced an edge
// in the neighbor list at the same time
if (new_neighbor_weight == std::numeric_limits<float>::lowest()) {
std::cerr << "could not find a new neighbor for candidate " << candidate_index << " when adding vertex "
<< external_label << (this->extend_thread_count > 1 ? " (this can happen in multi-threaded mode)" : "") << std::endl;
continue;
}
new_neighbor_distance = dist_func(new_vertex_feature, graph.getFeatureVector(new_neighbor_index), dist_func_param);
} else if (this->optimizationTarget_ == LowLID) {
// find the edge which improves the distortion the most: (distance_new_edge1 + distance_new_edge2) -
// distance_removed_edge
float best_distortion = std::numeric_limits<float>::max();
const auto neighbor_indices = graph.getNeighborIndices(candidate_index);
const auto neighbor_weights = graph.getNeighborWeights(candidate_index);
for (size_t edge_idx = 0; edge_idx < edges_per_vertex; edge_idx++) {
const auto neighbor_index = neighbor_indices[edge_idx];
// if another thread is building the candidate_index at the moment, than its neighbor list contains self references
if (candidate_index == neighbor_index) continue;
// the suggested neighbor might already be in the edge list of the new vertex
if (graph.hasEdge(neighbor_index, internal_index)) continue;
// take the neighbor with the best distance to the new vertex, which might already be in its edge list
const auto neighbor_distance =
dist_func(new_vertex_feature, graph.getFeatureVector(neighbor_index), dist_func_param);
float distortion = (candidate_weight + neighbor_distance) - neighbor_weights[edge_idx]; // version D in the paper
if (distortion < best_distortion) {
best_distortion = distortion;
new_neighbor_index = neighbor_index;
new_neighbor_distance = neighbor_distance;
}
}
// Single Threaded: should not be possible, otherwise the new vertex is connected to every vertex in the neighbor-list
// of the result-vertex and still has space for more Multi Threaded: can happen if the mutation threads replaced an edge
// in the neighbor list at the same time
if (best_distortion == std::numeric_limits<float>::max()) {
std::cerr << "could not find a new neighbor for candidate " << candidate_index << " when adding vertex "
<< external_label << (this->extend_thread_count > 1 ? " (this can happen in multi-threaded mode)" : "") << std::endl;
continue;
}
} else if (this->optimizationTarget_ == SchemeA) {
// Scheme A: Connect the new vertex v to the most similar neighbor of b.
// Select neighbor n with minimum distance to v.
float best_neighbor_distance = std::numeric_limits<float>::max();
const auto neighbor_indices = graph.getNeighborIndices(candidate_index);
for (size_t edge_idx = 0; edge_idx < edges_per_vertex; edge_idx++) {
const auto neighbor_index = neighbor_indices[edge_idx];
// if another thread is building the candidate_index at the moment, than its neighbor list contains self references
if (candidate_index == neighbor_index) continue;
// the suggested neighbor might already be in the edge list of the new vertex
if (graph.hasEdge(neighbor_index, internal_index)) continue;
const auto neighbor_distance =
dist_func(new_vertex_feature, graph.getFeatureVector(neighbor_index), dist_func_param);
if (neighbor_distance < best_neighbor_distance) {
best_neighbor_distance = neighbor_distance;
new_neighbor_index = neighbor_index;
new_neighbor_distance = neighbor_distance;
}
}
// Single Threaded: should not be possible, otherwise the new vertex is connected to every vertex in the neighbor-list
// of the result-vertex and still has space for more Multi Threaded: can happen if the mutation threads replaced an edge
// in the neighbor list at the same time
if (best_neighbor_distance == std::numeric_limits<float>::max()) {
std::cerr << "could not find a new neighbor for candidate " << candidate_index << " when adding vertex "
<< external_label << (this->extend_thread_count > 1 ? " (this can happen in multi-threaded mode)" : "") << std::endl;
continue;
}
} else if (this->optimizationTarget_ == SchemeB) {
// Scheme B: Remove the shortest edge of b.
// Select neighbor n with minimum distance to b.
float min_edge_weight = std::numeric_limits<float>::max();
const auto neighbor_indices = graph.getNeighborIndices(candidate_index);
const auto neighbor_weights = graph.getNeighborWeights(candidate_index);
for (size_t edge_idx = 0; edge_idx < edges_per_vertex; edge_idx++) {
const auto neighbor_index = neighbor_indices[edge_idx];
// if another thread is building the candidate_index at the moment, than its neighbor list contains self references
if (candidate_index == neighbor_index) continue;
// the suggested neighbor might already be in the edge list of the new vertex
if (graph.hasEdge(neighbor_index, internal_index)) continue;
const auto neighbor_weight = neighbor_weights[edge_idx];
if (neighbor_weight < min_edge_weight) {
min_edge_weight = neighbor_weight;
new_neighbor_index = neighbor_index;
}
}
// Single Threaded: should not be possible, otherwise the new vertex is connected to every vertex in the neighbor-list
// of the result-vertex and still has space for more Multi Threaded: can happen if the mutation threads replaced an edge
// in the neighbor list at the same time
if (min_edge_weight == std::numeric_limits<float>::max()) {
std::cerr << "could not find a new neighbor for candidate " << candidate_index << " when adding vertex "
<< external_label << (this->extend_thread_count > 1 ? " (this can happen in multi-threaded mode)" : "") << std::endl;
continue;
}
if (min_edge_weight != std::numeric_limits<float>::max()) {
new_neighbor_distance = dist_func(new_vertex_feature, graph.getFeatureVector(new_neighbor_index), dist_func_param);
}
}
// this should not be possible, otherwise the new vertex is connected to every vertex in the neighbor-list of the
// result-vertex and still has space for more
if (new_neighbor_distance == std::numeric_limits<float>::lowest()) continue;
// update all edges
{
std::lock_guard<std::mutex> lock(this->extend_mutex);
std::atomic_thread_fence(std::memory_order_acquire);
// other threads might have already changed the edges of the new_neighbor_index
if (graph.hasEdge(candidate_index, new_neighbor_index) && graph.hasEdge(new_neighbor_index, candidate_index) &&
graph.hasEdge(internal_index, candidate_index) == false &&
graph.hasEdge(candidate_index, internal_index) == false &&
graph.hasEdge(internal_index, new_neighbor_index) == false &&
graph.hasEdge(new_neighbor_index, internal_index) == false) {
// update edge list of the new vertex
graph.changeEdge(internal_index, internal_index, candidate_index, candidate_weight);
graph.changeEdge(internal_index, internal_index, new_neighbor_index, new_neighbor_distance);
new_neighbors.emplace_back(candidate_index, candidate_weight);
new_neighbors.emplace_back(new_neighbor_index, new_neighbor_distance);
// place the new vertex in the edge list of the result-vertex
graph.changeEdge(candidate_index, new_neighbor_index, internal_index, candidate_weight);
// place the new vertex in the edge list of the best edge neighbor
graph.changeEdge(new_neighbor_index, candidate_index, internal_index, new_neighbor_distance);
}
std::atomic_thread_fence(std::memory_order_release);
}
}
check_rng_phase = false;
}
if (new_neighbors.size() < edges_per_vertex) {
std::fprintf(stderr,
"could find only %zu good neighbors for the new vertex %u need %u\n",
new_neighbors.size(),
internal_index,
edges_per_vertex);
std::perror("");
std::abort();
}
}
/**
* Removing a vertex from the graph.
*
* @param del_task The task describing the vertex to remove.
*/
void reduceGraph(const BuilderRemoveTask& del_task) {
auto& graph = this->graph_;
const auto edges_per_vertex = std::min(graph.size(), uint32_t(graph.getEdgesPerVertex()));
// 1 remove the vertex and collect the vertices which are missing an edge
const auto involved_indices = graph.removeVertex(del_task.label);
// 1.1 handle the use case where the graph does not have enough vertices to fulfill the edgesPerVertex requirement
// and just remove the vertex without reconnecting the involved vertices because they are all fully connected
if (graph.size() <= edges_per_vertex) return;
restoreGraph(involved_indices, true);
}
/**
* Reconnect the vertices indicated in the list of involved_indices.
* All these vertices are missing an edge.
*
* @param involved_indices The indices of vertices missing an edge.
* @param improve_edges Whether to attempt to improve the new edges after reconnecting.
*/
void restoreGraph(const std::vector<uint32_t>& involved_indices, bool improve_edges) {
auto& graph = this->graph_;
const auto edges_per_vertex = std::min(graph.size(), uint32_t(graph.getEdgesPerVertex()));
const auto& feature_space = graph.getFeatureSpace();
const auto dist_func = feature_space.get_dist_func();
const auto dist_func_param = feature_space.get_dist_func_param();
// Shortcut if its only two involved vertices: just connect them
if (involved_indices.size() == 2) {
const auto vertex_a = involved_indices[0];
const auto vertex_b = involved_indices[1];
const auto distance = dist_func(graph.getFeatureVector(vertex_a), graph.getFeatureVector(vertex_b), dist_func_param);