Skip to content

Commit 9bfc1aa

Browse files
committed
fix alignment and fast L2 bug
1 parent b2c7198 commit 9bfc1aa

2 files changed

Lines changed: 43 additions & 60 deletions

File tree

apps/benchmark/include/benchmark.h

Lines changed: 6 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -15,9 +15,8 @@ namespace diskann::benchmark
1515

1616
template <typename T, typename TagT, typename LabelT>
1717
static void test_diskann_anns(diskann::Index<T, TagT, LabelT> *index, const T *query_data, size_t query_num,
18-
size_t query_dim, size_t query_aligned_dim,
19-
const std::vector<std::vector<uint32_t>> &ground_truth, const uint32_t k,
20-
const std::vector<uint32_t> &Lvec, uint32_t num_threads)
18+
size_t query_dim, const std::vector<std::vector<uint32_t>> &ground_truth,
19+
const uint32_t k, const std::vector<uint32_t> &Lvec, uint32_t num_threads)
2120
{
2221
std::vector<TagT> query_result_tags(k * query_num);
2322
std::vector<float> latency_stats(query_num, 0);
@@ -40,7 +39,7 @@ static void test_diskann_anns(diskann::Index<T, TagT, LabelT> *index, const T *q
4039
std::vector<float> distances(k);
4140

4241
// Always search with tags as they represent the original point IDs.
43-
index->search_with_tags(query_data + i * query_aligned_dim, k, L, query_result_tags.data() + i * k,
42+
index->search_with_tags(query_data + i * query_dim, k, L, query_result_tags.data() + i * k,
4443
distances.data(), res_vectors);
4544

4645
auto qe = std::chrono::high_resolution_clock::now();
@@ -88,7 +87,7 @@ static void test_diskann_anns(diskann::Index<T, TagT, LabelT> *index, const T *q
8887

8988
template <typename T, typename TagT, typename LabelT>
9089
static void test_diskann_explore(diskann::Index<T, TagT, LabelT> *index, const T *explore_query_data,
91-
size_t explore_query_num, size_t explore_query_dim, size_t explore_query_aligned_dim,
90+
size_t explore_query_num, size_t explore_query_dim,
9291
const std::vector<std::vector<uint32_t>> &ground_truth,
9392
const std::vector<std::vector<uint32_t>> &entry_node_indices, const uint32_t k)
9493
{
@@ -114,9 +113,8 @@ static void test_diskann_explore(diskann::Index<T, TagT, LabelT> *index, const T
114113
std::vector<TagT> results(k);
115114
std::vector<float> dists(k);
116115

117-
index->explore_with_tags(explore_query_data + q * explore_query_aligned_dim, (uint64_t)k,
118-
max_distance_count, max_distance_count, entry_point, results.data(),
119-
dists.data());
116+
index->explore_with_tags(explore_query_data + q * explore_query_dim, (uint64_t)k, max_distance_count,
117+
max_distance_count, entry_point, results.data(), dists.data());
120118

121119
if (q < ground_truth.size())
122120
{

apps/benchmark/src/diskann_build_and_test.cpp

Lines changed: 37 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -66,7 +66,7 @@ static DatasetConfig get_dataset_config(const DatasetName &dataset_name)
6666
else if (dataset_name == DatasetName::GLOVE)
6767
{
6868
conf.build_params.R = 32;
69-
conf.build_params.L = 100;
69+
conf.build_params.L = 125;
7070
conf.build_params.alpha = 1.2f;
7171
conf.anns_k = 100;
7272
conf.Lvec = {100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 250, 300};
@@ -119,6 +119,7 @@ void run_create_index(const std::string &index_path, const Dataset &ds, const Da
119119
auto build_params = conf.build_params;
120120

121121
size_t data_num = ds.info().base_count;
122+
size_t data_dim = ds.info().dims;
122123

123124
auto data_wrapper = ds.load_base();
124125
float *data = data_wrapper.data;
@@ -146,7 +147,7 @@ void run_create_index(const std::string &index_path, const Dataset &ds, const Da
146147
.with_label_type("uint")
147148
.with_index_write_params(index_build_params)
148149
.with_index_search_params(index_search_params)
149-
.is_dynamic_index(true) // TODO can be false
150+
.is_dynamic_index(true)
150151
.is_enable_tags(true)
151152
.is_use_opq(build_params.use_opq)
152153
.is_pq_dist_build(build_params.build_PQ_bytes > 0)
@@ -162,18 +163,26 @@ void run_create_index(const std::string &index_path, const Dataset &ds, const Da
162163
log("----------------------------------------\n");
163164
log("R (Max Degree) : %u\n", build_params.R);
164165
log("L (Build List Size): %u\n", build_params.L);
166+
log("Max Occlusion Size : %u\n", build_params.max_occlusion_size);
165167
log("Alpha : %.2f\n", build_params.alpha);
166168
log("PQ Chunks : %u\n", build_params.build_PQ_bytes);
167169
log("OPQ : %s\n", build_params.use_opq ? "Yes" : "No");
168170
log("----------------------------------------\n");
169171

170172
StopW timer;
171173
log("Building graph in one go...\n");
172-
index->build(data, data_num, tags);
174+
for (size_t i = 0; i < data_num; i++)
175+
{
176+
index->insert_point(&data[i * data_dim], tags[i]);
177+
if (i > 0 && i % 100000 == 0)
178+
{
179+
log("added %zu after %.2f seconds.\n", i, (timer.getElapsedTimeMicro() / 1000000.0));
180+
}
181+
}
173182
log("Graph built after %.2f seconds.\n", (timer.getElapsedTimeMicro() / 1000000.0));
174183

175184
// Save dynamic index
176-
index->save(index_path.c_str(), true);
185+
index->save(index_path.c_str());
177186
}
178187

179188
// -----------------------------------------------------------------------------
@@ -208,7 +217,7 @@ void generate_graph_stats(const std::string &graph_file)
208217
size_t min_degree = std::numeric_limits<size_t>::max();
209218
size_t max_degree = 0;
210219
size_t total_degree = 0;
211-
size_t nodes_with_less_than_2_degree = 0;
220+
size_t vertex_with_degree1 = 0;
212221

213222
size_t bytes_read = sizeof(size_t) + sizeof(uint32_t) + sizeof(uint32_t) + sizeof(size_t);
214223

@@ -223,8 +232,8 @@ void generate_graph_stats(const std::string &graph_file)
223232
min_degree = std::min(min_degree, (size_t)k);
224233
max_degree = std::max(max_degree, (size_t)k);
225234
total_degree += k;
226-
if (k < 2)
227-
nodes_with_less_than_2_degree++;
235+
if (k == 1)
236+
vertex_with_degree1++;
228237

229238
num_nodes++;
230239
}
@@ -236,7 +245,7 @@ void generate_graph_stats(const std::string &graph_file)
236245
log("Max Degree : %zu\n", max_degree);
237246
log("Min Degree : %zu\n", min_degree);
238247
log("Average Degree : %.2f\n", num_nodes > 0 ? (float)total_degree / num_nodes : 0.0f);
239-
log("Count (Degree<2) : %zu\n", nodes_with_less_than_2_degree);
248+
log("Count (Degree<2) : %zu\n", vertex_with_degree1);
240249
log("----------------------------------------\n\n");
241250
}
242251
std::unique_ptr<diskann::AbstractIndex> load_index(const std::string &index_path, const Dataset &ds,
@@ -277,21 +286,11 @@ void run_anns_test(const std::string &index_path, const Dataset &ds, const Datas
277286
log("Running ANNS Tests (k=%u)\n", conf.anns_k);
278287
log("----------------------------------------\n");
279288

280-
if (ds.info().metric == diskann::FAST_L2)
281-
{
282-
log("Optimizing index layout for FAST_L2...\n");
283-
auto typed_index = dynamic_cast<diskann::Index<float, uint32_t, uint32_t> *>(index.get());
284-
if (typed_index)
285-
{
286-
typed_index->optimize_index_layout();
287-
}
288-
}
289-
290289
auto typed_index = dynamic_cast<diskann::Index<float, uint32_t, uint32_t> *>(index.get());
291290
if (typed_index)
292291
{
293-
test_diskann_anns<float, uint32_t, uint32_t>(typed_index, query_data.data, query_num, query_dim, query_dim,
294-
ground_truth, conf.anns_k, conf.Lvec, num_threads);
292+
test_diskann_anns<float, uint32_t, uint32_t>(typed_index, query_data.data, query_num, query_dim, ground_truth,
293+
conf.anns_k, conf.Lvec, num_threads);
295294
}
296295
else
297296
{
@@ -333,14 +332,12 @@ void run_explore_test(const std::string &index_path, const Dataset &ds, const Da
333332
unsigned num_explore = 0, dim_explore = 0;
334333
float *explore_queries =
335334
load_fvecs((ds.files_dir() / ds.info().explore_query_file).string().c_str(), num_explore, dim_explore);
336-
size_t aligned_dim_explore = dim_explore;
337335

338336
auto typed_index = dynamic_cast<diskann::Index<float, uint32_t, uint32_t> *>(index.get());
339337
if (typed_index && explore_queries)
340338
{
341339
test_diskann_explore<float, uint32_t, uint32_t>(typed_index, explore_queries, num_explore, dim_explore,
342-
aligned_dim_explore, explore_gt_vec, entry_indices,
343-
conf.explore_k);
340+
explore_gt_vec, entry_indices, conf.explore_k);
344341
}
345342

346343
if (explore_queries)
@@ -370,7 +367,7 @@ inline const char *dynamic_scenario_str(DynamicScenario scenario)
370367
}
371368
}
372369

373-
void run_dynamic_data_test(const Dataset &ds, const DatasetConfig &conf, bool force_test, uint32_t num_threads)
370+
void run_dynamic_tests(const Dataset &ds, const DatasetConfig &conf, bool force_test, uint32_t num_threads)
374371
{
375372
auto build_params = conf.build_params;
376373
size_t data_num = ds.info().base_count;
@@ -445,6 +442,7 @@ void run_dynamic_data_test(const Dataset &ds, const DatasetConfig &conf, bool fo
445442
log("----------------------------------------\n");
446443
log("R (Max Degree) : %u\n", build_params.R);
447444
log("L (Build List Size): %u\n", build_params.L);
445+
log("Max Occlusion Size : %u\n", build_params.max_occlusion_size);
448446
log("Alpha : %.2f\n", build_params.alpha);
449447
log("PQ Chunks : %u\n", build_params.build_PQ_bytes);
450448
log("OPQ : %s\n", build_params.use_opq ? "Yes" : "No");
@@ -514,19 +512,19 @@ void run_dynamic_data_test(const Dataset &ds, const DatasetConfig &conf, bool fo
514512
index->save(index_path.c_str(), true);
515513
}
516514

517-
// Generate Graph Statistics (after index object is destroyed)
518-
generate_graph_stats(index_path);
519-
520-
// Test the index by loading it from disk (out-of-context testing)
521-
run_anns_test(index_path, ds, conf, num_threads);
522-
523515
log("%s: Log written to: %s\n", scenario_name.c_str(), log_file.c_str());
524516
}
525517
catch (const std::exception &e)
526518
{
527519
log("Exception in dynamic test '%s': %s\n", scenario_name.c_str(), e.what());
528520
}
529521

522+
// Generate Graph Statistics (after index object is destroyed)
523+
generate_graph_stats(index_path);
524+
525+
// Test the index by loading it from disk (out-of-context testing)
526+
run_anns_test(index_path, ds, conf, num_threads);
527+
530528
detach_cout_from_log();
531529
reset_log_to_console();
532530
}
@@ -538,20 +536,20 @@ void run_common_tests(const std::string &index_path, const Dataset &ds, const Da
538536
run_explore_test(index_path, ds, conf, false, num_threads);
539537
}
540538

541-
void run_test_suite(const Dataset &ds, const DatasetConfig &conf, bool force_test, bool only_test, uint32_t num_threads)
539+
void run_static_tests(const Dataset &ds, const DatasetConfig &conf, bool force_test, uint32_t num_threads)
542540
{
543541
std::string index_path = get_index_path(ds, conf);
544542

545543
ensure_directory(ds.dataset_dir() / "diskann");
546544
std::string log_file = index_path + "_benchmark.log";
547545

548-
if (!force_test && !only_test && diskann::benchmark::file_exists(log_file))
546+
if (!force_test && diskann::benchmark::file_exists(log_file))
549547
{
550548
log("Log file %s already exists. Skipping.\n", log_file.c_str());
551549
return;
552550
}
553551

554-
set_log_file(log_file, force_test || only_test);
552+
set_log_file(log_file, force_test);
555553
attach_cout_to_log();
556554

557555
log("================================================================================\n");
@@ -560,20 +558,14 @@ void run_test_suite(const Dataset &ds, const DatasetConfig &conf, bool force_tes
560558

561559
try
562560
{
563-
if (!only_test && !diskann::benchmark::file_exists(index_path + "_pq_pivots.bin") &&
561+
if (!diskann::benchmark::file_exists(index_path + "_pq_pivots.bin") &&
564562
!diskann::benchmark::file_exists(index_path + "_sample_data.bin") &&
565563
!diskann::benchmark::file_exists(index_path + ".data"))
566564
{
567565
run_create_index(index_path, ds, conf, num_threads);
568-
// Generate Graph Statistics after build (out-of-context)
569-
generate_graph_stats(index_path);
570-
}
571-
else if (!only_test)
572-
{
573-
log("Index files already exist at %s. Skipping build.\n", index_path.c_str());
574-
generate_graph_stats(index_path);
575566
}
576567

568+
generate_graph_stats(index_path);
577569
run_anns_test(index_path, ds, conf, num_threads);
578570
run_explore_test(index_path, ds, conf, false, num_threads);
579571
}
@@ -584,9 +576,6 @@ void run_test_suite(const Dataset &ds, const DatasetConfig &conf, bool force_tes
584576

585577
detach_cout_from_log();
586578
reset_log_to_console();
587-
588-
// Now run the dynamic tests, appending to the single unified process
589-
// run_dynamic_data_test(ds, conf, force_test, num_threads);
590579
}
591580

592581
int main(int argc, char **argv)
@@ -603,18 +592,13 @@ int main(int argc, char **argv)
603592

604593
std::string data_root = DATA_PATH;
605594
DatasetName ds_name = DatasetName::GLOVE;
606-
bool only_test = false;
607-
bool force_test = false;
595+
bool force_test = true;
608596
uint32_t num_threads = 1;
609597

610598
for (int i = 1; i < argc; ++i)
611599
{
612600
std::string arg = argv[i];
613-
if (arg == "--only-test" || arg == "-t")
614-
{
615-
only_test = true;
616-
}
617-
else if (arg == "--force-test" || arg == "-f")
601+
if (arg == "--force-test" || arg == "-f")
618602
{
619603
force_test = true;
620604
}
@@ -671,7 +655,8 @@ int main(int argc, char **argv)
671655
{
672656
Dataset dataset(ds_name_to_run, data_root);
673657
DatasetConfig conf = get_dataset_config(ds_name_to_run);
674-
run_test_suite(dataset, conf, force_test, only_test, num_threads);
658+
run_static_tests(dataset, conf, force_test, num_threads);
659+
run_dynamic_tests(dataset, conf, force_test, num_threads);
675660
}
676661

677662
return 0;

0 commit comments

Comments
 (0)