From 36abd64adcbbc141f1e5156dc9cf956b9ab52e0a Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 15 Apr 2026 15:02:28 +0200 Subject: [PATCH 1/5] Add variable graph degree option to cagra optimize --- cpp/include/cuvs/neighbors/cagra.hpp | 15 ++ cpp/src/core/setup.cpp | 140 ++++++++++++++++++ cpp/src/neighbors/cagra.cuh | 6 +- .../neighbors/detail/cagra/cagra_build.cuh | 24 ++- .../detail/cagra/cagra_serialize.cuh | 10 +- cpp/src/neighbors/detail/cagra/graph_core.cuh | 91 +++++++++--- cpp/src/neighbors/detail/hnsw.hpp | 48 ++++-- 7 files changed, 295 insertions(+), 39 deletions(-) create mode 100644 cpp/src/core/setup.cpp diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp index a7e1249677..3c9f9b5546 100644 --- a/cpp/include/cuvs/neighbors/cagra.hpp +++ b/cpp/include/cuvs/neighbors/cagra.hpp @@ -144,6 +144,21 @@ struct index_params : cuvs::neighbors::index_params { size_t intermediate_graph_degree = 128; /** Degree of output graph. */ size_t graph_degree = 64; + /** + * Fraction of output graph_degree used as the target for low-detour edges + * during the pruning step. Must be in (0, 1]. The default value of 1.0 + * disables variable-degree logic (original CAGRA behavior). Values < 1.0 + * enable variable-degree graphs: the optimize step finds the minimum detour + * threshold that covers at least ceil(graph_degree * fraction) edges per node, + * then lets reverse edges expand the degree further. Unused slots are filled + * with a sentinel value (IdxT(-1)). + * + * This is intended for the CAGRA-to-HNSW conversion pipeline: the resulting + * graph, when imported into hnswlib, produces variable-degree neighbor lists + * similar to natively-built HNSW graphs. Do not use this with CAGRA's native + * GPU search. + */ + double variable_graph_degree_fraction = 1.0; /** * Specify compression parameters if compression is desired. If set, overrides the * attach_dataset_on_build (and the compressed dataset is always added to the index). diff --git a/cpp/src/core/setup.cpp b/cpp/src/core/setup.cpp new file mode 100644 index 0000000000..1f05390d51 --- /dev/null +++ b/cpp/src/core/setup.cpp @@ -0,0 +1,140 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +/** Report a more verbose error with a backtrace when OOM occurs on RMM side. */ +inline auto rmm_oom_callback(std::size_t bytes, void*) -> bool +{ + auto cuda_status = cudaGetLastError(); + size_t free = 0; + size_t total = 0; + RAFT_CUDA_TRY_NO_THROW(cudaMemGetInfo(&free, &total)); + RAFT_FAIL( + "[cuVS Performance] Failed to allocate %zu bytes using RMM memory resource. " + "NB: latest cuda status = %s, free memory = %zu, total memory = %zu.", + bytes, + cudaGetErrorName(cuda_status), + free, + total); +} + +/** Helper class to setup a pool memory resource for a single device. */ +class global_mem_resource { + public: + using pool_mr_type = rmm::mr::pool_memory_resource; + using mr_type = rmm::mr::failure_callback_resource_adaptor; + using large_mr_type = rmm::mr::managed_memory_resource; + + constexpr static size_t kInitialSize = 1024ull * 1024ull * 1024ull; + constexpr static double kMaxMemoryUsage = 0.8; + constexpr static double kMaxInitialMemoryUsage = 0.5; + + global_mem_resource() + try + : orig_resource_{rmm::mr::get_current_device_resource()}, + pool_resource_(orig_resource_, compute_initial_size(), compute_max_size()), + resource_(&pool_resource_, rmm_oom_callback, nullptr) { + rmm::mr::set_current_device_resource(&resource_); + } catch (const std::exception& e) { + auto cuda_status = cudaGetLastError(); + size_t free = 0; + size_t total = 0; + RAFT_CUDA_TRY_NO_THROW(cudaMemGetInfo(&free, &total)); + RAFT_FAIL( + "Failed to initialize shared raft resources (NB: latest cuda status = %s, free memory = %zu, " + "total memory = %zu): %s", + cudaGetErrorName(cuda_status), + free, + total, + e.what()); + } + + global_mem_resource(global_mem_resource&&) = delete; + auto operator=(global_mem_resource&&) -> global_mem_resource& = delete; + global_mem_resource(const global_mem_resource& res) = delete; + auto operator=(const global_mem_resource& other) -> global_mem_resource& = delete; + + ~global_mem_resource() noexcept { rmm::mr::set_current_device_resource(orig_resource_); } + + private: + rmm::mr::device_memory_resource* orig_resource_; + pool_mr_type pool_resource_; + mr_type resource_; + + static auto compute_initial_size() -> size_t + { + size_t free_bytes = 0; + size_t total_bytes = 0; + if (cudaMemGetInfo(&free_bytes, &total_bytes) != cudaSuccess) { free_bytes = kInitialSize; } + auto limit = static_cast(free_bytes * kMaxInitialMemoryUsage); + return raft::round_up_safe(std::min(kInitialSize, limit), 256ull); + } + + static auto compute_max_size() -> size_t + { + size_t free_bytes = 0; + size_t total_bytes = 0; + if (cudaMemGetInfo(&free_bytes, &total_bytes) != cudaSuccess) { total_bytes = kInitialSize; } + return raft::round_up_safe(total_bytes * kMaxMemoryUsage, 256ull); + } +}; + +/** Remember and restore the current device. */ +struct keep_current_device_raii { + int initial_device_id = 0; + keep_current_device_raii() { cudaGetDevice(&initial_device_id); } + ~keep_current_device_raii() { cudaSetDevice(initial_device_id); } +}; + +/** Handles for each RMM memory resource. */ +static std::array, 8> global_mem_resource_; + +// Initialize at the moment libcuvs.so is loaded. +__attribute__((constructor)) void cuvs_performance_init() +{ + int device_count = 0; + if (cudaGetDeviceCount(&device_count) != cudaSuccess || device_count <= 0) { return; } + + // Start CUDA profiler (ignore errors if not supported in this context) + (void)cudaProfilerStart(); + + keep_current_device_raii keep_current_device; + + // == DISABLED FOR NOW TO TEST ACTUAL PERFORMANCE == + // Configure each device with a pool memory resource + // for (int device_id = 0; device_id < device_count; ++device_id) { + // if (cudaSetDevice(device_id) != cudaSuccess) { continue; } + // global_mem_resource_[device_id].emplace(); + // } +} + +// Cleanup before unloading libcuvs.so. +__attribute__((destructor)) void cuvs_performance_cleanup() +{ + (void)cudaProfilerStop(); + + keep_current_device_raii keep_current_device; + for (auto& global_mem_resource : global_mem_resource_) { + global_mem_resource.reset(); + } +} diff --git a/cpp/src/neighbors/cagra.cuh b/cpp/src/neighbors/cagra.cuh index 73c3794d39..b761563cce 100644 --- a/cpp/src/neighbors/cagra.cuh +++ b/cpp/src/neighbors/cagra.cuh @@ -257,9 +257,11 @@ void optimize( raft::resources const& res, raft::mdspan, raft::row_major, g_accessor> knn_graph, raft::host_matrix_view new_graph, - const bool guarantee_connectivity = false) + const bool guarantee_connectivity = false, + const double variable_graph_degree_fraction = 1.0) { - detail::optimize(res, knn_graph, new_graph, guarantee_connectivity); + detail::optimize( + res, knn_graph, new_graph, guarantee_connectivity, variable_graph_degree_fraction); } template , raft::row_major, g_accessor> knn_graph, raft::host_matrix_view new_graph, - const bool guarantee_connectivity = false) + const bool guarantee_connectivity = false, + const double variable_graph_degree_fraction = 1.0) { using internal_IdxT = typename std::make_unsigned::type; @@ -1921,8 +1922,12 @@ void optimize( knn_graph.extent(0), knn_graph.extent(1)); - cagra::detail::graph::optimize( - res, knn_graph_internal, new_graph_internal, guarantee_connectivity); + cagra::detail::graph::optimize(res, + knn_graph_internal, + new_graph_internal, + guarantee_connectivity, + true, + variable_graph_degree_fraction); } // RAII wrapper for allocating memory with Transparent HugePage @@ -2141,8 +2146,11 @@ auto iterative_build_graph( auto next_graph_size = curr_query_size; cagra_graph = raft::make_host_matrix(0, 0); // delete existing grahp cagra_graph = raft::make_host_matrix(next_graph_size, next_graph_degree); - optimize( - res, neighbors_view, cagra_graph.view(), flag_last ? params.guarantee_connectivity : 0); + optimize(res, + neighbors_view, + cagra_graph.view(), + flag_last ? params.guarantee_connectivity : false, + flag_last ? params.variable_graph_degree_fraction : 1.0); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_ms = std::chrono::duration_cast(end - start).count(); @@ -2262,7 +2270,11 @@ index build( cagra_graph = raft::make_host_matrix(dataset.extent(0), graph_degree); RAFT_LOG_TRACE("optimizing graph"); - optimize(res, knn_graph->view(), cagra_graph.view(), params.guarantee_connectivity); + optimize(res, + knn_graph->view(), + cagra_graph.view(), + params.guarantee_connectivity, + params.variable_graph_degree_fraction); // free intermediate graph before trying to create the index knn_graph.reset(); diff --git a/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh b/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh index 323184e757..1bb48c4787 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh @@ -196,8 +196,14 @@ void serialize_to_hnswlib( size_t bytes_written = 0; float GiB = 1 << 30; for (std::size_t i = 0; i < index_.size(); i++) { - auto graph_degree = static_cast(index_.graph_degree()); - os.write(reinterpret_cast(&graph_degree), sizeof(int)); + int actual_degree = static_cast(index_.graph_degree()); + for (int j = 0; j < actual_degree; j++) { + if (host_graph(i, j) == static_cast(-1)) { + actual_degree = j; + break; + } + } + os.write(reinterpret_cast(&actual_degree), sizeof(int)); IdxT* graph_row = &host_graph(i, 0); os.write(reinterpret_cast(graph_row), sizeof(IdxT) * index_.graph_degree()); diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index 8546ad307e..eec9b1179f 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -1151,11 +1151,10 @@ void optimize( raft::resources const& res, raft::mdspan, raft::row_major, g_accessor> knn_graph, raft::host_matrix_view new_graph, - const bool guarantee_connectivity = true, - const bool use_gpu = true) + const bool guarantee_connectivity = true, + const bool use_gpu = true, + const double variable_graph_degree_fraction = 1.0) { - RAFT_LOG_DEBUG( - "# Pruning kNN graph (size=%lu, degree=%lu)\n", knn_graph.extent(0), knn_graph.extent(1)); auto large_tmp_mr = raft::resource::get_large_workspace_resource(res); RAFT_EXPECTS(knn_graph.extent(0) == new_graph.extent(0), @@ -1163,14 +1162,23 @@ void optimize( RAFT_EXPECTS(new_graph.extent(1) <= knn_graph.extent(1), "output graph cannot have more columns than input graph"); // const uint64_t input_graph_degree = knn_graph.extent(1); - const uint64_t knn_graph_degree = knn_graph.extent(1); - const uint64_t output_graph_degree = new_graph.extent(1); - const uint64_t graph_size = new_graph.extent(0); + const uint64_t knn_graph_degree = knn_graph.extent(1); + const uint64_t output_graph_degree = new_graph.extent(1); + const uint64_t graph_size = new_graph.extent(0); + const uint64_t target_pruned_degree = std::max( + 1, static_cast(std::ceil(output_graph_degree * variable_graph_degree_fraction))); // auto input_graph_ptr = knn_graph.data_handle(); auto output_graph_ptr = new_graph.data_handle(); + RAFT_LOG_INFO("# Pruning kNN graph (size=%lu, degree=%lu, target_pruned_degree=%lu)\n", + graph_size, + knn_graph_degree, + target_pruned_degree); raft::common::nvtx::range fun_scope( "cagra::graph::optimize(%zu, %zu, %u)", graph_size, knn_graph_degree, output_graph_degree); + const bool variable_graph_degree = (target_pruned_degree < output_graph_degree); + auto natural_degree_vec = raft::make_host_vector(graph_size); + // MST optimization auto mst_graph = raft::make_host_matrix(0, 0); auto mst_graph_num_edges = raft::make_host_vector(graph_size); @@ -1333,8 +1341,9 @@ void optimize( for (uint64_t i = 0; i < graph_size; i++) { // Find the `output_graph_degree` smallest detourable count nodes by checking the detourable // count of the neighbors while increasing the target detourable count from zero. - uint64_t pk = 0; - uint32_t num_detour = 0; + uint64_t pk = 0; + uint32_t num_detour = 0; + uint64_t num_low_detour = 0; for (uint32_t l = 0; l < knn_graph_degree && pk < output_graph_degree; l++) { uint32_t next_num_detour = std::numeric_limits::max(); for (uint64_t k = 0; k < knn_graph_degree; k++) { @@ -1362,6 +1371,7 @@ void optimize( } if (pk >= output_graph_degree) break; } + if (num_low_detour < target_pruned_degree) { num_low_detour = pk; } if (pk >= output_graph_degree) break; if (next_num_detour == std::numeric_limits::max()) { @@ -1379,6 +1389,9 @@ void optimize( i); invalid_neighbor_list = true; } + natural_degree_vec(i) = variable_graph_degree + ? std::min(num_low_detour, output_graph_degree) + : output_graph_degree; } RAFT_EXPECTS( !invalid_neighbor_list, @@ -1460,6 +1473,8 @@ void optimize( bool check_num_protected_edges = true; #pragma omp parallel for for (uint64_t i = 0; i < graph_size; i++) { + auto effective_degree = variable_graph_degree ? natural_degree_vec(i) : output_graph_degree; + auto my_rev_graph = rev_graph.data_handle() + (output_graph_degree * i); auto my_out_graph = output_graph_ptr + (output_graph_degree * i); @@ -1499,9 +1514,22 @@ void optimize( } const auto num_protected_edges = - std::max(mst_graph_num_edges_ptr[i], output_graph_degree / 2); - if (num_protected_edges > output_graph_degree) { check_num_protected_edges = false; } - if (num_protected_edges == output_graph_degree) continue; + std::max(mst_graph_num_edges_ptr[i], effective_degree / 2); + if (num_protected_edges > effective_degree) { check_num_protected_edges = false; } + if (num_protected_edges == effective_degree) { + if (variable_graph_degree) { + for (uint32_t j = effective_degree; j < output_graph_degree; j++) { + my_out_graph[j] = static_cast(-1); + } + natural_degree_vec(i) = effective_degree; + } + if (guarantee_connectivity) { + for (uint32_t j = 0; j < output_graph_degree; j++) { + output_graph_ptr[(output_graph_degree * i) + j] = my_out_graph[j]; + } + } + continue; + } // Replace some edges of the output graph with edges of the reverse graph. auto kr = std::min(rev_graph_count.data_handle()[i], output_graph_degree); @@ -1510,15 +1538,31 @@ void optimize( if (my_rev_graph[kr] < graph_size) { uint64_t pos = pos_in_array(my_rev_graph[kr], my_out_graph, output_graph_degree); if (pos < num_protected_edges) { continue; } - uint64_t num_shift = pos - num_protected_edges; - if (pos >= output_graph_degree) { - num_shift = output_graph_degree - num_protected_edges - 1; + + if (pos < effective_degree) { + uint64_t num_shift = pos - num_protected_edges; + shift_array(my_out_graph + num_protected_edges, num_shift); + my_out_graph[num_protected_edges] = my_rev_graph[kr]; + } else if (pos < output_graph_degree) { + continue; + } else if (effective_degree < output_graph_degree) { + my_out_graph[effective_degree] = my_rev_graph[kr]; + effective_degree++; + } else { + uint64_t num_shift = effective_degree - num_protected_edges - 1; + shift_array(my_out_graph + num_protected_edges, num_shift); + my_out_graph[num_protected_edges] = my_rev_graph[kr]; } - shift_array(my_out_graph + num_protected_edges, num_shift); - my_out_graph[num_protected_edges] = my_rev_graph[kr]; } } + if (variable_graph_degree) { + for (uint32_t j = effective_degree; j < output_graph_degree; j++) { + my_out_graph[j] = static_cast(-1); + } + natural_degree_vec(i) = effective_degree; + } + // If guarantee_connectivity == true, move the output neighbor list from the temporal list to // the output list. If false, the copy is not needed because my_out_graph is a pointer to the // output buffer. @@ -1532,6 +1576,17 @@ void optimize( "Failed to merge the MST, pruned, and reverse edge graphs. Some nodes have too " "many MST optimization edges."); + if (variable_graph_degree) { + uint64_t total_natural = 0; +#pragma omp parallel for reduction(+ : total_natural) + for (uint64_t i = 0; i < graph_size; i++) { + total_natural += natural_degree_vec(i); + } + RAFT_LOG_INFO("# Variable graph degree: avg natural degree = %.2f / %u", + static_cast(total_natural) / graph_size, + output_graph_degree); + } + const double time_replace_end = cur_time(); RAFT_LOG_DEBUG("# Replacing edges time: %.1lf ms", (time_replace_end - time_replace_start) * 1000.0); @@ -1607,6 +1662,8 @@ void optimize( for (uint32_t j = 0; j < output_graph_degree; j++) { const auto neighbor_a = my_out_graph[j]; + if (neighbor_a == static_cast(-1)) { continue; } + // Check oor if (neighbor_a > graph_size) { num_oor++; diff --git a/cpp/src/neighbors/detail/hnsw.hpp b/cpp/src/neighbors/detail/hnsw.hpp index 4914a0fa1b..5052fc538f 100644 --- a/cpp/src/neighbors/detail/hnsw.hpp +++ b/cpp/src/neighbors/detail/hnsw.hpp @@ -279,12 +279,16 @@ std::enable_if_t>> fro for (size_t i = 0; i < static_cast(host_graph_view.extent(0)); ++i) { auto hnsw_internal_id = appr_algo->label_lookup_.find(i)->second; auto ll_i = appr_algo->get_linklist0(hnsw_internal_id); - appr_algo->setListCount(ll_i, host_graph_view.extent(1)); - auto* data = (uint32_t*)(ll_i + 1); + size_t actual_count = 0; + auto* data = (uint32_t*)(ll_i + 1); for (size_t j = 0; j < static_cast(host_graph_view.extent(1)); ++j) { - auto neighbor_internal_id = appr_algo->label_lookup_.find(host_graph(i, j))->second; - data[j] = neighbor_internal_id; + auto neighbor_id = host_graph(i, j); + if (neighbor_id == static_cast(-1)) { break; } + auto neighbor_internal_id = appr_algo->label_lookup_.find(neighbor_id)->second; + data[actual_count] = neighbor_internal_id; + actual_count++; } + appr_algo->setListCount(ll_i, actual_count); } hnsw_index->set_index(std::move(appr_algo)); @@ -676,9 +680,16 @@ void serialize_to_hnswlib_from_disk(raft::resources const& res, for (int64_t batch_idx = 0; batch_idx < current_batch_size; batch_idx++) { const int64_t i = batch_start + batch_idx; - os.write(reinterpret_cast(&graph_degree_int), sizeof(int)); - + int actual_degree_int = graph_degree_int; const IdxT* graph_row = &graph_buffer(batch_idx, 0); + for (int gj = 0; gj < graph_degree_int; gj++) { + if (graph_row[gj] == static_cast(-1)) { + actual_degree_int = gj; + break; + } + } + os.write(reinterpret_cast(&actual_degree_int), sizeof(int)); + os.write(reinterpret_cast(graph_row), sizeof(IdxT) * graph_degree_int); if (odd_graph_degree) { @@ -1039,12 +1050,16 @@ std::enable_if_t>> fro common::nvtx::range copy_scope("get_linklist0"); #pragma omp parallel for num_threads(num_threads) for (int64_t i = 0; i < n_rows; i++) { - auto ll_i = appr_algo->get_linklist0(i); - appr_algo->setListCount(ll_i, degree); - auto* data = (uint32_t*)(ll_i + 1); + auto ll_i = appr_algo->get_linklist0(i); + auto* data = (uint32_t*)(ll_i + 1); + int64_t actual_count = 0; for (int64_t j = 0; j < degree; j++) { - data[j] = graph_ptr[i * degree + j]; + auto neighbor_id = graph_ptr[i * degree + j]; + if (neighbor_id == static_cast(-1)) { break; } + data[actual_count] = neighbor_id; + actual_count++; } + appr_algo->setListCount(ll_i, actual_count); } } else { common::nvtx::range copy_scope("get_linklist0"); @@ -1056,11 +1071,20 @@ std::enable_if_t>> fro n_rows, cudaMemcpyDefault, raft::resource::get_cuda_stream(res))); + raft::resource::sync_stream(res); #pragma omp parallel for num_threads(num_threads) for (int64_t i = 0; i < n_rows; i++) { - appr_algo->setListCount(appr_algo->get_linklist0(i), degree); + auto ll_i = appr_algo->get_linklist0(i); + auto* data = (uint32_t*)(ll_i + 1); + int64_t actual_count = degree; + for (int64_t j = 0; j < degree; j++) { + if (data[j] == static_cast(-1)) { + actual_count = j; + break; + } + } + appr_algo->setListCount(ll_i, actual_count); } - raft::resource::sync_stream(res); } hnsw_index->set_index(std::move(appr_algo)); return hnsw_index; From 261fc3d64a58ed26a741115f255fa6900fc7801d Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 16 Apr 2026 07:20:45 +0200 Subject: [PATCH 2/5] Improve the trimming logic --- cpp/include/cuvs/neighbors/cagra.hpp | 2 +- cpp/src/neighbors/detail/cagra/graph_core.cuh | 42 +++++-------------- 2 files changed, 12 insertions(+), 32 deletions(-) diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp index 3c9f9b5546..1ae3334fcc 100644 --- a/cpp/include/cuvs/neighbors/cagra.hpp +++ b/cpp/include/cuvs/neighbors/cagra.hpp @@ -147,7 +147,7 @@ struct index_params : cuvs::neighbors::index_params { /** * Fraction of output graph_degree used as the target for low-detour edges * during the pruning step. Must be in (0, 1]. The default value of 1.0 - * disables variable-degree logic (original CAGRA behavior). Values < 1.0 + * disables variable-degree logic (normal CAGRA behavior). Values < 1.0 * enable variable-degree graphs: the optimize step finds the minimum detour * threshold that covers at least ceil(graph_degree * fraction) edges per node, * then lets reverse edges expand the degree further. Unused slots are filled diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index eec9b1179f..cf69455fed 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -1431,8 +1431,9 @@ void optimize( for (uint64_t k = 0; k < output_graph_degree; k++) { #pragma omp parallel for for (uint64_t i = 0; i < graph_size; i++) { - // dest_nodes.data_handle()[i] = output_graph_ptr[k + (output_graph_degree * i)]; - dest_nodes(i) = output_graph_ptr[k + (output_graph_degree * i)]; + dest_nodes(i) = (k < natural_degree_vec(i)) + ? output_graph_ptr[k + (output_graph_degree * i)] + : static_cast(graph_size); } raft::resource::sync_stream(res); @@ -1513,23 +1514,9 @@ void optimize( } } - const auto num_protected_edges = - std::max(mst_graph_num_edges_ptr[i], effective_degree / 2); + const auto num_protected_edges = std::max( + mst_graph_num_edges_ptr[i], std::min(effective_degree, output_graph_degree / 2)); if (num_protected_edges > effective_degree) { check_num_protected_edges = false; } - if (num_protected_edges == effective_degree) { - if (variable_graph_degree) { - for (uint32_t j = effective_degree; j < output_graph_degree; j++) { - my_out_graph[j] = static_cast(-1); - } - natural_degree_vec(i) = effective_degree; - } - if (guarantee_connectivity) { - for (uint32_t j = 0; j < output_graph_degree; j++) { - output_graph_ptr[(output_graph_degree * i) + j] = my_out_graph[j]; - } - } - continue; - } // Replace some edges of the output graph with edges of the reverse graph. auto kr = std::min(rev_graph_count.data_handle()[i], output_graph_degree); @@ -1539,20 +1526,13 @@ void optimize( uint64_t pos = pos_in_array(my_rev_graph[kr], my_out_graph, output_graph_degree); if (pos < num_protected_edges) { continue; } - if (pos < effective_degree) { - uint64_t num_shift = pos - num_protected_edges; - shift_array(my_out_graph + num_protected_edges, num_shift); - my_out_graph[num_protected_edges] = my_rev_graph[kr]; - } else if (pos < output_graph_degree) { - continue; - } else if (effective_degree < output_graph_degree) { - my_out_graph[effective_degree] = my_rev_graph[kr]; - effective_degree++; - } else { - uint64_t num_shift = effective_degree - num_protected_edges - 1; - shift_array(my_out_graph + num_protected_edges, num_shift); - my_out_graph[num_protected_edges] = my_rev_graph[kr]; + uint64_t num_shift = pos - num_protected_edges; + if (pos >= output_graph_degree) { + num_shift = output_graph_degree - num_protected_edges - 1; } + shift_array(my_out_graph + num_protected_edges, num_shift); + my_out_graph[num_protected_edges] = my_rev_graph[kr]; + if (effective_degree < output_graph_degree) { effective_degree++; } } } From b642179b8685d47ccfcd9eefef7f5600d4f5dbed Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 16 Apr 2026 07:27:41 +0200 Subject: [PATCH 3/5] Improve the wording --- cpp/include/cuvs/neighbors/cagra.hpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp index 1ae3334fcc..b93cb8b0f1 100644 --- a/cpp/include/cuvs/neighbors/cagra.hpp +++ b/cpp/include/cuvs/neighbors/cagra.hpp @@ -145,7 +145,10 @@ struct index_params : cuvs::neighbors::index_params { /** Degree of output graph. */ size_t graph_degree = 64; /** - * Fraction of output graph_degree used as the target for low-detour edges + * Fraction of output graph_degree to define the minimum output graph degree, + * allowing variable-degree neighbor graphs. + * + * This fraction is used as the target for low-detour edges * during the pruning step. Must be in (0, 1]. The default value of 1.0 * disables variable-degree logic (normal CAGRA behavior). Values < 1.0 * enable variable-degree graphs: the optimize step finds the minimum detour From de0bcaa6c9668e9bf8a6d734ed5abe6cf3c4fec7 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 16 Apr 2026 10:31:08 +0200 Subject: [PATCH 4/5] Remove irrelevant file --- cpp/src/core/setup.cpp | 140 ----------------------------------------- 1 file changed, 140 deletions(-) delete mode 100644 cpp/src/core/setup.cpp diff --git a/cpp/src/core/setup.cpp b/cpp/src/core/setup.cpp deleted file mode 100644 index 1f05390d51..0000000000 --- a/cpp/src/core/setup.cpp +++ /dev/null @@ -1,140 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include - -/** Report a more verbose error with a backtrace when OOM occurs on RMM side. */ -inline auto rmm_oom_callback(std::size_t bytes, void*) -> bool -{ - auto cuda_status = cudaGetLastError(); - size_t free = 0; - size_t total = 0; - RAFT_CUDA_TRY_NO_THROW(cudaMemGetInfo(&free, &total)); - RAFT_FAIL( - "[cuVS Performance] Failed to allocate %zu bytes using RMM memory resource. " - "NB: latest cuda status = %s, free memory = %zu, total memory = %zu.", - bytes, - cudaGetErrorName(cuda_status), - free, - total); -} - -/** Helper class to setup a pool memory resource for a single device. */ -class global_mem_resource { - public: - using pool_mr_type = rmm::mr::pool_memory_resource; - using mr_type = rmm::mr::failure_callback_resource_adaptor; - using large_mr_type = rmm::mr::managed_memory_resource; - - constexpr static size_t kInitialSize = 1024ull * 1024ull * 1024ull; - constexpr static double kMaxMemoryUsage = 0.8; - constexpr static double kMaxInitialMemoryUsage = 0.5; - - global_mem_resource() - try - : orig_resource_{rmm::mr::get_current_device_resource()}, - pool_resource_(orig_resource_, compute_initial_size(), compute_max_size()), - resource_(&pool_resource_, rmm_oom_callback, nullptr) { - rmm::mr::set_current_device_resource(&resource_); - } catch (const std::exception& e) { - auto cuda_status = cudaGetLastError(); - size_t free = 0; - size_t total = 0; - RAFT_CUDA_TRY_NO_THROW(cudaMemGetInfo(&free, &total)); - RAFT_FAIL( - "Failed to initialize shared raft resources (NB: latest cuda status = %s, free memory = %zu, " - "total memory = %zu): %s", - cudaGetErrorName(cuda_status), - free, - total, - e.what()); - } - - global_mem_resource(global_mem_resource&&) = delete; - auto operator=(global_mem_resource&&) -> global_mem_resource& = delete; - global_mem_resource(const global_mem_resource& res) = delete; - auto operator=(const global_mem_resource& other) -> global_mem_resource& = delete; - - ~global_mem_resource() noexcept { rmm::mr::set_current_device_resource(orig_resource_); } - - private: - rmm::mr::device_memory_resource* orig_resource_; - pool_mr_type pool_resource_; - mr_type resource_; - - static auto compute_initial_size() -> size_t - { - size_t free_bytes = 0; - size_t total_bytes = 0; - if (cudaMemGetInfo(&free_bytes, &total_bytes) != cudaSuccess) { free_bytes = kInitialSize; } - auto limit = static_cast(free_bytes * kMaxInitialMemoryUsage); - return raft::round_up_safe(std::min(kInitialSize, limit), 256ull); - } - - static auto compute_max_size() -> size_t - { - size_t free_bytes = 0; - size_t total_bytes = 0; - if (cudaMemGetInfo(&free_bytes, &total_bytes) != cudaSuccess) { total_bytes = kInitialSize; } - return raft::round_up_safe(total_bytes * kMaxMemoryUsage, 256ull); - } -}; - -/** Remember and restore the current device. */ -struct keep_current_device_raii { - int initial_device_id = 0; - keep_current_device_raii() { cudaGetDevice(&initial_device_id); } - ~keep_current_device_raii() { cudaSetDevice(initial_device_id); } -}; - -/** Handles for each RMM memory resource. */ -static std::array, 8> global_mem_resource_; - -// Initialize at the moment libcuvs.so is loaded. -__attribute__((constructor)) void cuvs_performance_init() -{ - int device_count = 0; - if (cudaGetDeviceCount(&device_count) != cudaSuccess || device_count <= 0) { return; } - - // Start CUDA profiler (ignore errors if not supported in this context) - (void)cudaProfilerStart(); - - keep_current_device_raii keep_current_device; - - // == DISABLED FOR NOW TO TEST ACTUAL PERFORMANCE == - // Configure each device with a pool memory resource - // for (int device_id = 0; device_id < device_count; ++device_id) { - // if (cudaSetDevice(device_id) != cudaSuccess) { continue; } - // global_mem_resource_[device_id].emplace(); - // } -} - -// Cleanup before unloading libcuvs.so. -__attribute__((destructor)) void cuvs_performance_cleanup() -{ - (void)cudaProfilerStop(); - - keep_current_device_raii keep_current_device; - for (auto& global_mem_resource : global_mem_resource_) { - global_mem_resource.reset(); - } -} From a0f6379699d6fb68e87e4880d47dfec71778d6bc Mon Sep 17 00:00:00 2001 From: achirkin Date: Fri, 17 Apr 2026 15:54:25 +0200 Subject: [PATCH 5/5] Set defaults for HNSW heuristics and fix parameter parsing --- .../src/cuvs/cuvs_ann_bench_param_parser.h | 60 +++++++++++-------- cpp/src/neighbors/cagra.cpp | 12 ++-- 2 files changed, 41 insertions(+), 31 deletions(-) diff --git a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h index 2eaf3123a0..32ca6040a0 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h +++ b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h @@ -265,24 +265,52 @@ void parse_build_param(const nlohmann::json& conf, cuvs::neighbors::cagra::index params.guarantee_connectivity = conf.at("guarantee_connectivity"); } - // Override the graph_build_algo if requested explicitly + if (conf.contains("variable_graph_degree_fraction")) { + params.variable_graph_degree_fraction = conf.at("variable_graph_degree_fraction"); + } + + // Extract build-algo-specific parameters + nlohmann::json ivf_pq_build_conf = collect_conf_with_prefix(conf, "ivf_pq_build_"); + nlohmann::json ivf_pq_search_conf = collect_conf_with_prefix(conf, "ivf_pq_search_"); + nlohmann::json nn_descent_conf = collect_conf_with_prefix(conf, "nn_descent_"); + nlohmann::json ace_conf = collect_conf_with_prefix(conf, "ace_"); + + // Determine and initialize graph build algorithm. + // Priority 1: explicit "graph_build_algo" config key. + // Priority 2: infer from algorithm-specific prefixed config keys (only when monostate). + // Priority 3: leave as-is (from prior heuristics or monostate for AUTO at build time). + std::string graph_build_algo; if (conf.contains("graph_build_algo")) { - if (conf.at("graph_build_algo") == "IVF_PQ") { + graph_build_algo = conf.at("graph_build_algo"); + } else if (std::holds_alternative(params.graph_build_params)) { + if (!ivf_pq_build_conf.empty() || !ivf_pq_search_conf.empty()) { + graph_build_algo = "IVF_PQ"; + } else if (!nn_descent_conf.empty()) { + graph_build_algo = "NN_DESCENT"; + } else if (!ace_conf.empty()) { + graph_build_algo = "ACE"; + } + // else: leave as monostate → AUTO in cagra_build.cuh + } + + if (!graph_build_algo.empty()) { + if (graph_build_algo == "IVF_PQ") { if (!std::holds_alternative( params.graph_build_params)) { params.graph_build_params = cuvs::neighbors::graph_build_params::ivf_pq_params{}; } - } else if (conf.at("graph_build_algo") == "NN_DESCENT") { + } else if (graph_build_algo == "NN_DESCENT") { if (!std::holds_alternative( params.graph_build_params)) { - params.graph_build_params = cuvs::neighbors::graph_build_params::nn_descent_params{}; + params.graph_build_params = cuvs::neighbors::graph_build_params::nn_descent_params( + params.intermediate_graph_degree, params.metric); } - } else if (conf.at("graph_build_algo") == "ACE") { + } else if (graph_build_algo == "ACE") { if (!std::holds_alternative( params.graph_build_params)) { params.graph_build_params = cuvs::neighbors::graph_build_params::ace_params{}; } - } else if (conf.at("graph_build_algo") == "ITERATIVE_SEARCH") { + } else if (graph_build_algo == "ITERATIVE_SEARCH") { if (!std::holds_alternative( params.graph_build_params)) { params.graph_build_params = cuvs::neighbors::graph_build_params::iterative_search_params{}; @@ -290,26 +318,6 @@ void parse_build_param(const nlohmann::json& conf, cuvs::neighbors::cagra::index } } - // Parse build-algo-specific parameters and use them to decide on the algo type - nlohmann::json ivf_pq_build_conf = collect_conf_with_prefix(conf, "ivf_pq_build_"); - nlohmann::json ivf_pq_search_conf = collect_conf_with_prefix(conf, "ivf_pq_search_"); - nlohmann::json nn_descent_conf = collect_conf_with_prefix(conf, "nn_descent_"); - nlohmann::json ace_conf = collect_conf_with_prefix(conf, "ace_"); - - // When graph_build_algo is not specified, leave graph_build_params as monostate so the - // CAGRA build uses AUTO selection (NN_DESCENT or IVF_PQ based on dataset/heuristics). - // Only infer from algo-specific config keys when present. - if (std::holds_alternative(params.graph_build_params)) { - if (!ivf_pq_build_conf.empty() || !ivf_pq_search_conf.empty()) { - params.graph_build_params = cuvs::neighbors::graph_build_params::ivf_pq_params{}; - } else if (!nn_descent_conf.empty()) { - params.graph_build_params = cuvs::neighbors::graph_build_params::nn_descent_params{}; - } else if (!ace_conf.empty()) { - params.graph_build_params = cuvs::neighbors::graph_build_params::ace_params{}; - } - // else: leave as monostate → AUTO in cagra_build.cuh - } - // Apply build-algo-specific parameters std::visit( [&](auto& arg) { diff --git a/cpp/src/neighbors/cagra.cpp b/cpp/src/neighbors/cagra.cpp index 6aa5737e36..cd63de9a24 100644 --- a/cpp/src/neighbors/cagra.cpp +++ b/cpp/src/neighbors/cagra.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -40,13 +40,15 @@ cagra::index_params index_params::from_hnsw_params(raft::matrix_extent cagra::index_params params; switch (heuristic) { case hnsw_heuristic_type::SAME_GRAPH_FOOTPRINT: - params.graph_degree = M * 2; - params.intermediate_graph_degree = M * 3; + params.graph_degree = M * 2; + params.intermediate_graph_degree = M * 3; + params.variable_graph_degree_fraction = 0.35; break; case hnsw_heuristic_type::SIMILAR_SEARCH_PERFORMANCE: default: - params.graph_degree = 2 + M * 2 / 3; - params.intermediate_graph_degree = M + M * ef_construction / 256; + params.graph_degree = M; + params.intermediate_graph_degree = M + M * ef_construction / 256; + params.variable_graph_degree_fraction = 0.7; break; } params.graph_build_params =