Skip to content

Commit b2c7198

Browse files
committed
graph stats
1 parent 4e4c00c commit b2c7198

2 files changed

Lines changed: 142 additions & 78 deletions

File tree

apps/benchmark/include/dataset.h

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,6 @@ struct DatasetInfo
138138
size_t query_count;
139139
uint32_t dims;
140140
uint32_t scale;
141-
uint32_t explore_depth;
142141

143142
std::string base_file;
144143
std::string query_file;
@@ -160,7 +159,7 @@ struct DatasetInfo
160159

161160
inline DatasetInfo make_dataset_info(const DatasetName &ds)
162161
{
163-
DatasetInfo info{ds, diskann::Metric::L2, 0, 0, 0, 1, 2, {}, {}, {}};
162+
DatasetInfo info{ds, diskann::Metric::L2, 0, 0, 0, 1, {}, {}, {}};
164163

165164
std::string name = ds.name();
166165

@@ -190,21 +189,18 @@ inline DatasetInfo make_dataset_info(const DatasetName &ds)
190189
info.query_count = 10000;
191190
info.dims = 100;
192191
info.scale = 100;
193-
info.metric = diskann::Metric::COSINE;
194192
}
195193
else if (ds == DatasetName::AUDIO)
196194
{
197195
info.base_count = 53387;
198196
info.query_count = 200;
199197
info.dims = 192;
200-
info.explore_depth = 1;
201198
}
202199
else if (ds == DatasetName::ENRON)
203200
{
204201
info.base_count = 94987;
205202
info.query_count = 200;
206203
info.dims = 1369;
207-
info.explore_depth = 1;
208204
}
209205

210206
return info;

apps/benchmark/src/diskann_build_and_test.cpp

Lines changed: 141 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,8 @@ std::string get_index_path(const Dataset &ds, const DatasetConfig &conf)
111111
return prefix;
112112
}
113113

114+
void generate_graph_stats(const std::string &graph_file);
115+
114116
void run_create_index(const std::string &index_path, const Dataset &ds, const DatasetConfig &conf, uint32_t num_threads)
115117
{
116118
log("Building DiskANN index: %s\n", index_path.c_str());
@@ -156,8 +158,16 @@ void run_create_index(const std::string &index_path, const Dataset &ds, const Da
156158
auto index = index_factory.create_instance();
157159
index->set_start_points_at_random(static_cast<float>(0));
158160

159-
StopW timer;
161+
log("\nConstruction Parameters:\n");
162+
log("----------------------------------------\n");
163+
log("R (Max Degree) : %u\n", build_params.R);
164+
log("L (Build List Size): %u\n", build_params.L);
165+
log("Alpha : %.2f\n", build_params.alpha);
166+
log("PQ Chunks : %u\n", build_params.build_PQ_bytes);
167+
log("OPQ : %s\n", build_params.use_opq ? "Yes" : "No");
168+
log("----------------------------------------\n");
160169

170+
StopW timer;
161171
log("Building graph in one go...\n");
162172
index->build(data, data_num, tags);
163173
log("Graph built after %.2f seconds.\n", (timer.getElapsedTimeMicro() / 1000000.0));
@@ -169,6 +179,66 @@ void run_create_index(const std::string &index_path, const Dataset &ds, const Da
169179
// -----------------------------------------------------------------------------
170180
// Helpers
171181
// -----------------------------------------------------------------------------
182+
void generate_graph_stats(const std::string &graph_file)
183+
{
184+
std::ifstream in;
185+
in.exceptions(std::ios::badbit | std::ios::failbit);
186+
187+
try
188+
{
189+
in.open(graph_file, std::ios::binary);
190+
}
191+
catch (const std::exception &)
192+
{
193+
log("Warning: Could not open graph file %s for statistics calculation.\n", graph_file.c_str());
194+
return;
195+
}
196+
197+
size_t expected_file_size;
198+
uint32_t max_observed_degree;
199+
uint32_t start;
200+
size_t file_frozen_pts;
201+
202+
in.read((char *)&expected_file_size, sizeof(size_t));
203+
in.read((char *)&max_observed_degree, sizeof(uint32_t));
204+
in.read((char *)&start, sizeof(uint32_t));
205+
in.read((char *)&file_frozen_pts, sizeof(size_t));
206+
207+
size_t num_nodes = 0;
208+
size_t min_degree = std::numeric_limits<size_t>::max();
209+
size_t max_degree = 0;
210+
size_t total_degree = 0;
211+
size_t nodes_with_less_than_2_degree = 0;
212+
213+
size_t bytes_read = sizeof(size_t) + sizeof(uint32_t) + sizeof(uint32_t) + sizeof(size_t);
214+
215+
while (bytes_read < expected_file_size)
216+
{
217+
uint32_t k;
218+
in.read((char *)&k, sizeof(uint32_t));
219+
220+
in.seekg(k * sizeof(uint32_t), std::ios::cur);
221+
bytes_read += sizeof(uint32_t) * (k + 1);
222+
223+
min_degree = std::min(min_degree, (size_t)k);
224+
max_degree = std::max(max_degree, (size_t)k);
225+
total_degree += k;
226+
if (k < 2)
227+
nodes_with_less_than_2_degree++;
228+
229+
num_nodes++;
230+
}
231+
232+
log("\n----------------------------------------\n");
233+
log("Graph Statistics:\n");
234+
log("----------------------------------------\n");
235+
log("Total Nodes : %zu\n", num_nodes);
236+
log("Max Degree : %zu\n", max_degree);
237+
log("Min Degree : %zu\n", min_degree);
238+
log("Average Degree : %.2f\n", num_nodes > 0 ? (float)total_degree / num_nodes : 0.0f);
239+
log("Count (Degree<2) : %zu\n", nodes_with_less_than_2_degree);
240+
log("----------------------------------------\n\n");
241+
}
172242
std::unique_ptr<diskann::AbstractIndex> load_index(const std::string &index_path, const Dataset &ds,
173243
uint32_t num_threads, uint32_t scratch_size)
174244
{
@@ -311,12 +381,6 @@ void run_dynamic_data_test(const Dataset &ds, const DatasetConfig &conf, bool fo
311381
std::vector<uint32_t> tags(data_num);
312382
std::iota(tags.begin(), tags.end(), 1);
313383

314-
auto query_data = ds.load_query();
315-
size_t query_num = ds.info().query_count;
316-
size_t query_dim = ds.info().dims;
317-
318-
auto ground_truth = ds.load_groundtruth(conf.anns_k, true);
319-
320384
std::vector<DynamicScenario> scenarios = {DynamicScenario::AddHalf, DynamicScenario::AddAllRemoveHalf,
321385
DynamicScenario::AddHalfRemoveAndAddOneAtATime};
322386

@@ -372,88 +436,89 @@ void run_dynamic_data_test(const Dataset &ds, const DatasetConfig &conf, bool fo
372436
.is_concurrent_consolidate(false)
373437
.build();
374438

375-
auto index_factory = diskann::IndexFactory(config);
376-
auto index = index_factory.create_instance();
377-
index->set_start_points_at_random(static_cast<float>(0));
439+
{
440+
auto index_factory = diskann::IndexFactory(config);
441+
auto index = index_factory.create_instance();
442+
index->set_start_points_at_random(static_cast<float>(0));
378443

379-
const size_t max_elements = data_num;
380-
const size_t half_elements = max_elements / 2;
444+
log("\nConstruction Parameters:\n");
445+
log("----------------------------------------\n");
446+
log("R (Max Degree) : %u\n", build_params.R);
447+
log("L (Build List Size): %u\n", build_params.L);
448+
log("Alpha : %.2f\n", build_params.alpha);
449+
log("PQ Chunks : %u\n", build_params.build_PQ_bytes);
450+
log("OPQ : %s\n", build_params.use_opq ? "Yes" : "No");
451+
log("----------------------------------------\n");
381452

382-
StopW scenario_timer;
383-
log("\n--- Dynamic updates ---\n");
453+
const size_t max_elements = data_num;
454+
const size_t half_elements = max_elements / 2;
384455

385-
if (scenario == DynamicScenario::AddHalf)
386-
{
387-
StopW add_timer;
388-
for (size_t i = 0; i < half_elements; ++i)
389-
{
390-
index->insert_point(&data[i * data_dim], tags[i]);
391-
}
392-
log("Add time: %.2f s\n", (add_timer.getElapsedTimeMicro() / 1e6));
393-
}
394-
else if (scenario == DynamicScenario::AddAllRemoveHalf)
395-
{
396-
StopW add_timer;
397-
for (size_t i = 0; i < max_elements; ++i)
456+
StopW scenario_timer;
457+
log("\n--- Dynamic updates ---\n");
458+
459+
if (scenario == DynamicScenario::AddHalf)
398460
{
399-
index->insert_point(&data[i * data_dim], tags[i]);
461+
StopW add_timer;
462+
for (size_t i = 0; i < half_elements; ++i)
463+
{
464+
index->insert_point(&data[i * data_dim], tags[i]);
465+
}
466+
log("Add time: %.2f s\n", (add_timer.getElapsedTimeMicro() / 1e6));
400467
}
401-
log("Add time: %.2f s\n", (add_timer.getElapsedTimeMicro() / 1e6));
402-
403-
StopW del_stopw;
404-
for (size_t i = half_elements; i < max_elements; ++i)
468+
else if (scenario == DynamicScenario::AddAllRemoveHalf)
405469
{
406-
index->lazy_delete(tags[i]);
470+
StopW add_timer;
471+
for (size_t i = 0; i < max_elements; ++i)
472+
{
473+
index->insert_point(&data[i * data_dim], tags[i]);
474+
}
475+
log("Add time: %.2f s\n", (add_timer.getElapsedTimeMicro() / 1e6));
476+
477+
StopW del_stopw;
478+
for (size_t i = half_elements; i < max_elements; ++i)
479+
{
480+
index->lazy_delete(tags[i]);
481+
}
482+
log("Delete time: %.2f s\n", (del_stopw.getElapsedTimeMicro() / 1e6));
407483
}
408-
log("Delete time: %.2f s\n", (del_stopw.getElapsedTimeMicro() / 1e6));
409-
}
410-
else if (scenario == DynamicScenario::AddHalfRemoveAndAddOneAtATime)
411-
{
412-
// IMPORTANT: The half-dataset ground truth files correspond to the first half of labels [0..half-1].
413-
// For this scenario we want to end up with exactly that active set.
414-
// Therefore: start with the SECOND half in the index, then swap it out one-by-one.
415-
StopW add_timer;
416-
for (size_t i = half_elements; i < max_elements; ++i)
484+
else if (scenario == DynamicScenario::AddHalfRemoveAndAddOneAtATime)
417485
{
418-
index->insert_point(&data[i * data_dim], tags[i]);
486+
// IMPORTANT: The half-dataset ground truth files correspond to the first half of labels
487+
// [0..half-1]. For this scenario we want to end up with exactly that active set. Therefore:
488+
// start with the SECOND half in the index, then swap it out one-by-one.
489+
StopW add_timer;
490+
for (size_t i = half_elements; i < max_elements; ++i)
491+
{
492+
index->insert_point(&data[i * data_dim], tags[i]);
493+
}
494+
log("Add (second half) time: %.2f s\n", (add_timer.getElapsedTimeMicro() / 1e6));
495+
496+
StopW update_stopw;
497+
for (size_t i = 0; i < half_elements; ++i)
498+
{
499+
index->lazy_delete(tags[i + half_elements]);
500+
index->insert_point(&data[i * data_dim], tags[i]);
501+
}
502+
log("Update (Delete + Add) time: %.2f s\n", (update_stopw.getElapsedTimeMicro() / 1e6));
419503
}
420-
log("Add (second half) time: %.2f s\n", (add_timer.getElapsedTimeMicro() / 1e6));
421504

422-
StopW update_stopw;
423-
for (size_t i = 0; i < half_elements; ++i)
505+
if (scenario != DynamicScenario::AddHalf)
424506
{
425-
index->lazy_delete(tags[i + half_elements]);
426-
index->insert_point(&data[i * data_dim], tags[i]);
507+
StopW cons_stopw;
508+
index->consolidate_deletes(index_build_params);
509+
log("Consolidate time: %.2f s\n", (cons_stopw.getElapsedTimeMicro() / 1e6));
427510
}
428-
log("Update (Delete + Add) time: %.2f s\n", (update_stopw.getElapsedTimeMicro() / 1e6));
429-
}
430511

431-
if (scenario != DynamicScenario::AddHalf)
432-
{
433-
StopW cons_stopw;
434-
index->consolidate_deletes(index_build_params);
435-
log("Consolidate time: %.2f s\n", (cons_stopw.getElapsedTimeMicro() / 1e6));
436-
}
512+
log("Gesamt Zeit (Dynamic Graph Construction): %.2f s\n", (scenario_timer.getElapsedTimeMicro() / 1e6));
437513

438-
log("Gesamt Zeit (Dynamic Graph Construction): %.2f s\n", (scenario_timer.getElapsedTimeMicro() / 1e6));
514+
index->save(index_path.c_str(), true);
515+
}
439516

440-
index->save(index_path.c_str(), true);
517+
// Generate Graph Statistics (after index object is destroyed)
518+
generate_graph_stats(index_path);
441519

442-
// Test the index in memory directly without reload
443-
auto typed_index = dynamic_cast<diskann::Index<float, uint32_t, uint32_t> *>(index.get());
444-
if (typed_index)
445-
{
446-
if (ds.info().metric == diskann::FAST_L2)
447-
{
448-
typed_index->optimize_index_layout();
449-
}
450-
log("----------------------------------------\n");
451-
log("Running ANNS Tests (k=%u)\n", conf.anns_k);
452-
log("----------------------------------------\n");
453-
test_diskann_anns<float, uint32_t, uint32_t>(typed_index, query_data.data, query_num, query_dim,
454-
query_dim, ground_truth, conf.anns_k, conf.Lvec,
455-
num_threads);
456-
}
520+
// Test the index by loading it from disk (out-of-context testing)
521+
run_anns_test(index_path, ds, conf, num_threads);
457522

458523
log("%s: Log written to: %s\n", scenario_name.c_str(), log_file.c_str());
459524
}
@@ -500,10 +565,13 @@ void run_test_suite(const Dataset &ds, const DatasetConfig &conf, bool force_tes
500565
!diskann::benchmark::file_exists(index_path + ".data"))
501566
{
502567
run_create_index(index_path, ds, conf, num_threads);
568+
// Generate Graph Statistics after build (out-of-context)
569+
generate_graph_stats(index_path);
503570
}
504571
else if (!only_test)
505572
{
506573
log("Index files already exist at %s. Skipping build.\n", index_path.c_str());
574+
generate_graph_stats(index_path);
507575
}
508576

509577
run_anns_test(index_path, ds, conf, num_threads);

0 commit comments

Comments
 (0)