Skip to content

Commit 78f25d3

Browse files
committed
Add repro target and benchmark changes for graph compaction bug in dynamic indices
1 parent 9bfc1aa commit 78f25d3

2 files changed

Lines changed: 38 additions & 11 deletions

File tree

apps/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ set(DISKANN_TOOLS
1616
range_search_disk_index
1717
test_streaming_scenario
1818
test_insert_deletes_consolidate
19+
test_insert_bug_repro
1920
diskann_build_and_test
2021
)
2122

@@ -54,6 +55,9 @@ target_link_libraries(test_streaming_scenario ${PROJECT_NAME} ${DISKANN_TOOLS_TC
5455
add_executable(test_insert_deletes_consolidate test_insert_deletes_consolidate.cpp)
5556
target_link_libraries(test_insert_deletes_consolidate ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::program_options)
5657

58+
add_executable(test_insert_bug_repro test_insert_bug_repro.cpp)
59+
target_link_libraries(test_insert_bug_repro ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS})
60+
5761
add_executable(diskann_build_and_test benchmark/src/diskann_build_and_test.cpp)
5862
target_include_directories(diskann_build_and_test PRIVATE benchmark/include)
5963
target_link_libraries(diskann_build_and_test ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} Boost::program_options)

apps/benchmark/src/diskann_build_and_test.cpp

Lines changed: 34 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -50,14 +50,14 @@ static DatasetConfig get_dataset_config(const DatasetName &dataset_name)
5050
// https://github.com/erikbern/ann-benchmarks/blob/main/ann_benchmarks/algorithms/diskann/config.yml
5151
if (dataset_name == DatasetName::SIFT1M)
5252
{
53-
conf.build_params.R = 32;
53+
conf.build_params.R = 64;
5454
conf.build_params.L = 125;
5555
conf.build_params.alpha = 1.2f;
5656
conf.Lvec = {100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 250, 300};
5757
}
5858
else if (dataset_name == DatasetName::DEEP1M)
5959
{
60-
conf.build_params.R = 32;
60+
conf.build_params.R = 64;
6161
conf.build_params.L = 125;
6262
conf.build_params.alpha = 1.2f;
6363
conf.anns_k = 100;
@@ -66,14 +66,14 @@ static DatasetConfig get_dataset_config(const DatasetName &dataset_name)
6666
else if (dataset_name == DatasetName::GLOVE)
6767
{
6868
conf.build_params.R = 32;
69-
conf.build_params.L = 125;
69+
conf.build_params.L = 100;
7070
conf.build_params.alpha = 1.2f;
7171
conf.anns_k = 100;
72-
conf.Lvec = {100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 250, 300};
72+
conf.Lvec = {100, 250, 500, 1000, 1500, 2500, 5000, 10000};
7373
}
7474
else if (dataset_name == DatasetName::AUDIO)
7575
{
76-
conf.build_params.R = 32;
76+
conf.build_params.R = 64;
7777
conf.build_params.L = 125;
7878
conf.build_params.alpha = 1.2f;
7979
conf.anns_k = 20;
@@ -83,7 +83,7 @@ static DatasetConfig get_dataset_config(const DatasetName &dataset_name)
8383
else if (dataset_name == DatasetName::ENRON)
8484
{
8585
// https://github.com/microsoft/DiskANN/blob/7762821dbfe91e838ee7f6db93d010f48f4c4d6d/diskann-benchmark/perf_test_inputs/async_scalar_mimir_enron.json
86-
conf.build_params.R = 32;
86+
conf.build_params.R = 64;
8787
conf.build_params.L = 125;
8888
conf.build_params.alpha = 1.2f;
8989
conf.anns_k = 100;
@@ -126,6 +126,7 @@ void run_create_index(const std::string &index_path, const Dataset &ds, const Da
126126

127127
std::vector<uint32_t> tags(data_num);
128128
std::iota(tags.begin(), tags.end(), 1); // tag 0 is reserved for hidden points
129+
diskann::cout << "Tags from " << tags[0] << " to " << tags[data_num - 1] << std::endl;
129130

130131
auto index_build_params = diskann::IndexWriteParametersBuilder(build_params.L, build_params.R)
131132
.with_max_occlusion_size(build_params.max_occlusion_size)
@@ -377,9 +378,11 @@ void run_dynamic_tests(const Dataset &ds, const DatasetConfig &conf, bool force_
377378

378379
std::vector<uint32_t> tags(data_num);
379380
std::iota(tags.begin(), tags.end(), 1);
381+
diskann::cout << "Tags from " << tags[0] << " to " << tags[data_num - 1] << std::endl;
380382

381-
std::vector<DynamicScenario> scenarios = {DynamicScenario::AddHalf, DynamicScenario::AddAllRemoveHalf,
382-
DynamicScenario::AddHalfRemoveAndAddOneAtATime};
383+
std::vector<DynamicScenario> scenarios = {DynamicScenario::AddHalf};
384+
// std::vector<DynamicScenario> scenarios = {DynamicScenario::AddHalf, DynamicScenario::AddAllRemoveHalf,
385+
// DynamicScenario::AddHalfRemoveAndAddOneAtATime};
383386

384387
for (auto scenario : scenarios)
385388
{
@@ -460,6 +463,9 @@ void run_dynamic_tests(const Dataset &ds, const DatasetConfig &conf, bool force_
460463
for (size_t i = 0; i < half_elements; ++i)
461464
{
462465
index->insert_point(&data[i * data_dim], tags[i]);
466+
467+
if (i % 100000 == 0 && i > 0)
468+
log("Inserted %zu points...\n", i);
463469
}
464470
log("Add time: %.2f s\n", (add_timer.getElapsedTimeMicro() / 1e6));
465471
}
@@ -469,13 +475,22 @@ void run_dynamic_tests(const Dataset &ds, const DatasetConfig &conf, bool force_
469475
for (size_t i = 0; i < max_elements; ++i)
470476
{
471477
index->insert_point(&data[i * data_dim], tags[i]);
478+
479+
if (i % 100000 == 0 && i > 0)
480+
log("Inserted %zu points...\n", i);
472481
}
473482
log("Add time: %.2f s\n", (add_timer.getElapsedTimeMicro() / 1e6));
474483

475484
StopW del_stopw;
476485
for (size_t i = half_elements; i < max_elements; ++i)
477486
{
478487
index->lazy_delete(tags[i]);
488+
489+
if ((i - half_elements) % 100000 == 0 && i > half_elements)
490+
log("Deleted %zu points...\n", (i - half_elements));
491+
size_t del_count = i - half_elements + 1;
492+
if (del_count > 0 && (del_count % (half_elements / 10)) == 0)
493+
index->consolidate_deletes(index_build_params);
479494
}
480495
log("Delete time: %.2f s\n", (del_stopw.getElapsedTimeMicro() / 1e6));
481496
}
@@ -488,14 +503,22 @@ void run_dynamic_tests(const Dataset &ds, const DatasetConfig &conf, bool force_
488503
for (size_t i = half_elements; i < max_elements; ++i)
489504
{
490505
index->insert_point(&data[i * data_dim], tags[i]);
506+
507+
if ((i - half_elements) % 100000 == 0 && i > half_elements)
508+
log("Inserted %zu points...\n", (i - half_elements));
491509
}
492510
log("Add (second half) time: %.2f s\n", (add_timer.getElapsedTimeMicro() / 1e6));
493511

494512
StopW update_stopw;
495513
for (size_t i = 0; i < half_elements; ++i)
496514
{
497-
index->lazy_delete(tags[i + half_elements]);
498-
index->insert_point(&data[i * data_dim], tags[i]);
515+
index->lazy_delete(tags[i + half_elements]); // delete second half
516+
index->insert_point(&data[i * data_dim], tags[i]); // add first half
517+
518+
if (i % 100000 == 0 && i > 0)
519+
log("Updated %zu points...\n", i);
520+
if (i > 0 && (i % (half_elements / 10)) == 0)
521+
index->consolidate_deletes(index_build_params);
499522
}
500523
log("Update (Delete + Add) time: %.2f s\n", (update_stopw.getElapsedTimeMicro() / 1e6));
501524
}
@@ -655,7 +678,7 @@ int main(int argc, char **argv)
655678
{
656679
Dataset dataset(ds_name_to_run, data_root);
657680
DatasetConfig conf = get_dataset_config(ds_name_to_run);
658-
run_static_tests(dataset, conf, force_test, num_threads);
681+
// run_static_tests(dataset, conf, force_test, num_threads);
659682
run_dynamic_tests(dataset, conf, force_test, num_threads);
660683
}
661684

0 commit comments

Comments
 (0)