From 36abd64adcbbc141f1e5156dc9cf956b9ab52e0a Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 15 Apr 2026 15:02:28 +0200
Subject: [PATCH 1/5] Add variable graph degree option to cagra optimize

---
 cpp/include/cuvs/neighbors/cagra.hpp          |  15 ++
 cpp/src/core/setup.cpp                        | 140 ++++++++++++++++++
 cpp/src/neighbors/cagra.cuh                   |   6 +-
 .../neighbors/detail/cagra/cagra_build.cuh    |  24 ++-
 .../detail/cagra/cagra_serialize.cuh          |  10 +-
 cpp/src/neighbors/detail/cagra/graph_core.cuh |  91 +++++++++---
 cpp/src/neighbors/detail/hnsw.hpp             |  48 ++++--
 7 files changed, 295 insertions(+), 39 deletions(-)
 create mode 100644 cpp/src/core/setup.cpp
diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp
index a7e1249677..3c9f9b5546 100644
--- a/cpp/include/cuvs/neighbors/cagra.hpp
+++ b/cpp/include/cuvs/neighbors/cagra.hpp
@@ -144,6 +144,21 @@ struct index_params : cuvs::neighbors::index_params {
   size_t intermediate_graph_degree = 128;
   /** Degree of output graph. */
   size_t graph_degree = 64;
+  /**
+   * Fraction of output graph_degree used as the target for low-detour edges
+   * during the pruning step. Must be in (0, 1]. The default value of 1.0
+   * disables variable-degree logic (original CAGRA behavior). Values < 1.0
+   * enable variable-degree graphs: the optimize step finds the minimum detour
+   * threshold that covers at least ceil(graph_degree * fraction) edges per node,
+   * then lets reverse edges expand the degree further. Unused slots are filled
+   * with a sentinel value (IdxT(-1)).
+   *
+   * This is intended for the CAGRA-to-HNSW conversion pipeline: the resulting
+   * graph, when imported into hnswlib, produces variable-degree neighbor lists
+   * similar to natively-built HNSW graphs. Do not use this with CAGRA's native
+   * GPU search.
+   */
+  double variable_graph_degree_fraction = 1.0;
   /**
    * Specify compression parameters if compression is desired. If set, overrides the
    * attach_dataset_on_build (and the compressed dataset is always added to the index).
diff --git a/cpp/src/core/setup.cpp b/cpp/src/core/setup.cpp
new file mode 100644
index 0000000000..1f05390d51
--- /dev/null
+++ b/cpp/src/core/setup.cpp
@@ -0,0 +1,140 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <cuda_profiler_api.h>
+#include <cuda_runtime_api.h>
+
+#include <rmm/mr/device/cuda_memory_resource.hpp>
+#include <rmm/mr/device/failure_callback_resource_adaptor.hpp>
+#include <rmm/mr/device/managed_memory_resource.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+#include <raft/core/logger.hpp>
+#include <raft/util/cuda_rt_essentials.hpp>
+#include <raft/util/integer_utils.hpp>
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <exception>
+#include <optional>
+
+/** Report a more verbose error with a backtrace when OOM occurs on RMM side. */
+inline auto rmm_oom_callback(std::size_t bytes, void*) -> bool
+{
+  auto cuda_status = cudaGetLastError();
+  size_t free      = 0;
+  size_t total     = 0;
+  RAFT_CUDA_TRY_NO_THROW(cudaMemGetInfo(&free, &total));
+  RAFT_FAIL(
+    "[cuVS Performance] Failed to allocate %zu bytes using RMM memory resource. "
+    "NB: latest cuda status = %s, free memory = %zu, total memory = %zu.",
+    bytes,
+    cudaGetErrorName(cuda_status),
+    free,
+    total);
+}
+
+/** Helper class to setup a pool memory resource for a single device. */
+class global_mem_resource {
+ public:
+  using pool_mr_type  = rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>;
+  using mr_type       = rmm::mr::failure_callback_resource_adaptor<pool_mr_type>;
+  using large_mr_type = rmm::mr::managed_memory_resource;
+
+  constexpr static size_t kInitialSize           = 1024ull * 1024ull * 1024ull;
+  constexpr static double kMaxMemoryUsage        = 0.8;
+  constexpr static double kMaxInitialMemoryUsage = 0.5;
+
+  global_mem_resource()
+  try
+    : orig_resource_{rmm::mr::get_current_device_resource()},
+      pool_resource_(orig_resource_, compute_initial_size(), compute_max_size()),
+      resource_(&pool_resource_, rmm_oom_callback, nullptr) {
+    rmm::mr::set_current_device_resource(&resource_);
+  } catch (const std::exception& e) {
+    auto cuda_status = cudaGetLastError();
+    size_t free      = 0;
+    size_t total     = 0;
+    RAFT_CUDA_TRY_NO_THROW(cudaMemGetInfo(&free, &total));
+    RAFT_FAIL(
+      "Failed to initialize shared raft resources (NB: latest cuda status = %s, free memory = %zu, "
+      "total memory = %zu): %s",
+      cudaGetErrorName(cuda_status),
+      free,
+      total,
+      e.what());
+  }
+
+  global_mem_resource(global_mem_resource&&)                               = delete;
+  auto operator=(global_mem_resource&&) -> global_mem_resource&            = delete;
+  global_mem_resource(const global_mem_resource& res)                      = delete;
+  auto operator=(const global_mem_resource& other) -> global_mem_resource& = delete;
+
+  ~global_mem_resource() noexcept { rmm::mr::set_current_device_resource(orig_resource_); }
+
+ private:
+  rmm::mr::device_memory_resource* orig_resource_;
+  pool_mr_type pool_resource_;
+  mr_type resource_;
+
+  static auto compute_initial_size() -> size_t
+  {
+    size_t free_bytes  = 0;
+    size_t total_bytes = 0;
+    if (cudaMemGetInfo(&free_bytes, &total_bytes) != cudaSuccess) { free_bytes = kInitialSize; }
+    auto limit = static_cast<size_t>(free_bytes * kMaxInitialMemoryUsage);
+    return raft::round_up_safe<size_t>(std::min<size_t>(kInitialSize, limit), 256ull);
+  }
+
+  static auto compute_max_size() -> size_t
+  {
+    size_t free_bytes  = 0;
+    size_t total_bytes = 0;
+    if (cudaMemGetInfo(&free_bytes, &total_bytes) != cudaSuccess) { total_bytes = kInitialSize; }
+    return raft::round_up_safe<size_t>(total_bytes * kMaxMemoryUsage, 256ull);
+  }
+};
+
+/** Remember and restore the current device. */
+struct keep_current_device_raii {
+  int initial_device_id = 0;
+  keep_current_device_raii() { cudaGetDevice(&initial_device_id); }
+  ~keep_current_device_raii() { cudaSetDevice(initial_device_id); }
+};
+
+/** Handles for each RMM memory resource. */
+static std::array<std::optional<global_mem_resource>, 8> global_mem_resource_;
+
+// Initialize at the moment libcuvs.so is loaded.
+__attribute__((constructor)) void cuvs_performance_init()
+{
+  int device_count = 0;
+  if (cudaGetDeviceCount(&device_count) != cudaSuccess || device_count <= 0) { return; }
+
+  // Start CUDA profiler (ignore errors if not supported in this context)
+  (void)cudaProfilerStart();
+
+  keep_current_device_raii keep_current_device;
+
+  // == DISABLED FOR NOW TO TEST ACTUAL PERFORMANCE ==
+  // Configure each device with a pool memory resource
+  // for (int device_id = 0; device_id < device_count; ++device_id) {
+  //   if (cudaSetDevice(device_id) != cudaSuccess) { continue; }
+  //   global_mem_resource_[device_id].emplace();
+  // }
+}
+
+// Cleanup before unloading libcuvs.so.
+__attribute__((destructor)) void cuvs_performance_cleanup()
+{
+  (void)cudaProfilerStop();
+
+  keep_current_device_raii keep_current_device;
+  for (auto& global_mem_resource : global_mem_resource_) {
+    global_mem_resource.reset();
+  }
+}
diff --git a/cpp/src/neighbors/cagra.cuh b/cpp/src/neighbors/cagra.cuh
index 73c3794d39..b761563cce 100644
--- a/cpp/src/neighbors/cagra.cuh
+++ b/cpp/src/neighbors/cagra.cuh
@@ -257,9 +257,11 @@ void optimize(
   raft::resources const& res,
   raft::mdspan<IdxT, raft::matrix_extent<int64_t>, raft::row_major, g_accessor> knn_graph,
   raft::host_matrix_view<IdxT, int64_t, raft::row_major> new_graph,
-  const bool guarantee_connectivity = false)
+  const bool guarantee_connectivity           = false,
+  const double variable_graph_degree_fraction = 1.0)
 {
-  detail::optimize(res, knn_graph, new_graph, guarantee_connectivity);
+  detail::optimize(
+    res, knn_graph, new_graph, guarantee_connectivity, variable_graph_degree_fraction);
 }
 
 template <typename T,
diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index 415ebc635c..54e5a994db 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -1904,7 +1904,8 @@ void optimize(
   raft::resources const& res,
   raft::mdspan<IdxT, raft::matrix_extent<int64_t>, raft::row_major, g_accessor> knn_graph,
   raft::host_matrix_view<IdxT, int64_t, raft::row_major> new_graph,
-  const bool guarantee_connectivity = false)
+  const bool guarantee_connectivity           = false,
+  const double variable_graph_degree_fraction = 1.0)
 {
   using internal_IdxT = typename std::make_unsigned<IdxT>::type;
 
@@ -1921,8 +1922,12 @@ void optimize(
       knn_graph.extent(0),
       knn_graph.extent(1));
 
-  cagra::detail::graph::optimize(
-    res, knn_graph_internal, new_graph_internal, guarantee_connectivity);
+  cagra::detail::graph::optimize(res,
+                                 knn_graph_internal,
+                                 new_graph_internal,
+                                 guarantee_connectivity,
+                                 true,
+                                 variable_graph_degree_fraction);
 }
 
 // RAII wrapper for allocating memory with Transparent HugePage
@@ -2141,8 +2146,11 @@ auto iterative_build_graph(
     auto next_graph_size = curr_query_size;
     cagra_graph          = raft::make_host_matrix<IdxT, int64_t>(0, 0);  // delete existing grahp
     cagra_graph = raft::make_host_matrix<IdxT, int64_t>(next_graph_size, next_graph_degree);
-    optimize<IdxT>(
-      res, neighbors_view, cagra_graph.view(), flag_last ? params.guarantee_connectivity : 0);
+    optimize<IdxT>(res,
+                   neighbors_view,
+                   cagra_graph.view(),
+                   flag_last ? params.guarantee_connectivity : false,
+                   flag_last ? params.variable_graph_degree_fraction : 1.0);
 
     auto end        = std::chrono::high_resolution_clock::now();
     auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
@@ -2262,7 +2270,11 @@ index<T, IdxT> build(
     cagra_graph = raft::make_host_matrix<IdxT, int64_t>(dataset.extent(0), graph_degree);
 
     RAFT_LOG_TRACE("optimizing graph");
-    optimize<IdxT>(res, knn_graph->view(), cagra_graph.view(), params.guarantee_connectivity);
+    optimize<IdxT>(res,
+                   knn_graph->view(),
+                   cagra_graph.view(),
+                   params.guarantee_connectivity,
+                   params.variable_graph_degree_fraction);
 
     // free intermediate graph before trying to create the index
     knn_graph.reset();
diff --git a/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh b/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh
index 323184e757..1bb48c4787 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh
@@ -196,8 +196,14 @@ void serialize_to_hnswlib(
   size_t bytes_written = 0;
   float GiB            = 1 << 30;
   for (std::size_t i = 0; i < index_.size(); i++) {
-    auto graph_degree = static_cast<int>(index_.graph_degree());
-    os.write(reinterpret_cast<char*>(&graph_degree), sizeof(int));
+    int actual_degree = static_cast<int>(index_.graph_degree());
+    for (int j = 0; j < actual_degree; j++) {
+      if (host_graph(i, j) == static_cast<IdxT>(-1)) {
+        actual_degree = j;
+        break;
+      }
+    }
+    os.write(reinterpret_cast<char*>(&actual_degree), sizeof(int));
 
     IdxT* graph_row = &host_graph(i, 0);
     os.write(reinterpret_cast<char*>(graph_row), sizeof(IdxT) * index_.graph_degree());
diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index 8546ad307e..eec9b1179f 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -1151,11 +1151,10 @@ void optimize(
   raft::resources const& res,
   raft::mdspan<IdxT, raft::matrix_extent<int64_t>, raft::row_major, g_accessor> knn_graph,
   raft::host_matrix_view<IdxT, int64_t, raft::row_major> new_graph,
-  const bool guarantee_connectivity = true,
-  const bool use_gpu                = true)
+  const bool guarantee_connectivity           = true,
+  const bool use_gpu                          = true,
+  const double variable_graph_degree_fraction = 1.0)
 {
-  RAFT_LOG_DEBUG(
-    "# Pruning kNN graph (size=%lu, degree=%lu)\n", knn_graph.extent(0), knn_graph.extent(1));
   auto large_tmp_mr = raft::resource::get_large_workspace_resource(res);
 
   RAFT_EXPECTS(knn_graph.extent(0) == new_graph.extent(0),
@@ -1163,14 +1162,23 @@ void optimize(
   RAFT_EXPECTS(new_graph.extent(1) <= knn_graph.extent(1),
                "output graph cannot have more columns than input graph");
   // const uint64_t input_graph_degree  = knn_graph.extent(1);
-  const uint64_t knn_graph_degree    = knn_graph.extent(1);
-  const uint64_t output_graph_degree = new_graph.extent(1);
-  const uint64_t graph_size          = new_graph.extent(0);
+  const uint64_t knn_graph_degree     = knn_graph.extent(1);
+  const uint64_t output_graph_degree  = new_graph.extent(1);
+  const uint64_t graph_size           = new_graph.extent(0);
+  const uint64_t target_pruned_degree = std::max<uint64_t>(
+    1, static_cast<uint64_t>(std::ceil(output_graph_degree * variable_graph_degree_fraction)));
   // auto input_graph_ptr               = knn_graph.data_handle();
   auto output_graph_ptr = new_graph.data_handle();
+  RAFT_LOG_INFO("# Pruning kNN graph (size=%lu, degree=%lu, target_pruned_degree=%lu)\n",
+                graph_size,
+                knn_graph_degree,
+                target_pruned_degree);
   raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
     "cagra::graph::optimize(%zu, %zu, %u)", graph_size, knn_graph_degree, output_graph_degree);
 
+  const bool variable_graph_degree = (target_pruned_degree < output_graph_degree);
+  auto natural_degree_vec          = raft::make_host_vector<uint32_t, int64_t>(graph_size);
+
   // MST optimization
   auto mst_graph               = raft::make_host_matrix<IdxT, int64_t, raft::row_major>(0, 0);
   auto mst_graph_num_edges     = raft::make_host_vector<uint32_t, int64_t>(graph_size);
@@ -1333,8 +1341,9 @@ void optimize(
     for (uint64_t i = 0; i < graph_size; i++) {
       // Find the `output_graph_degree` smallest detourable count nodes by checking the detourable
       // count of the neighbors while increasing the target detourable count from zero.
-      uint64_t pk         = 0;
-      uint32_t num_detour = 0;
+      uint64_t pk             = 0;
+      uint32_t num_detour     = 0;
+      uint64_t num_low_detour = 0;
       for (uint32_t l = 0; l < knn_graph_degree && pk < output_graph_degree; l++) {
         uint32_t next_num_detour = std::numeric_limits<uint32_t>::max();
         for (uint64_t k = 0; k < knn_graph_degree; k++) {
@@ -1362,6 +1371,7 @@ void optimize(
           }
           if (pk >= output_graph_degree) break;
         }
+        if (num_low_detour < target_pruned_degree) { num_low_detour = pk; }
         if (pk >= output_graph_degree) break;
 
         if (next_num_detour == std::numeric_limits<uint32_t>::max()) {
@@ -1379,6 +1389,9 @@ void optimize(
           i);
         invalid_neighbor_list = true;
       }
+      natural_degree_vec(i) = variable_graph_degree
+                                ? std::min<uint64_t>(num_low_detour, output_graph_degree)
+                                : output_graph_degree;
     }
     RAFT_EXPECTS(
       !invalid_neighbor_list,
@@ -1460,6 +1473,8 @@ void optimize(
     bool check_num_protected_edges = true;
 #pragma omp parallel for
     for (uint64_t i = 0; i < graph_size; i++) {
+      auto effective_degree = variable_graph_degree ? natural_degree_vec(i) : output_graph_degree;
+
       auto my_rev_graph = rev_graph.data_handle() + (output_graph_degree * i);
       auto my_out_graph = output_graph_ptr + (output_graph_degree * i);
 
@@ -1499,9 +1514,22 @@ void optimize(
       }
 
       const auto num_protected_edges =
-        std::max<uint64_t>(mst_graph_num_edges_ptr[i], output_graph_degree / 2);
-      if (num_protected_edges > output_graph_degree) { check_num_protected_edges = false; }
-      if (num_protected_edges == output_graph_degree) continue;
+        std::max<uint64_t>(mst_graph_num_edges_ptr[i], effective_degree / 2);
+      if (num_protected_edges > effective_degree) { check_num_protected_edges = false; }
+      if (num_protected_edges == effective_degree) {
+        if (variable_graph_degree) {
+          for (uint32_t j = effective_degree; j < output_graph_degree; j++) {
+            my_out_graph[j] = static_cast<IdxT>(-1);
+          }
+          natural_degree_vec(i) = effective_degree;
+        }
+        if (guarantee_connectivity) {
+          for (uint32_t j = 0; j < output_graph_degree; j++) {
+            output_graph_ptr[(output_graph_degree * i) + j] = my_out_graph[j];
+          }
+        }
+        continue;
+      }
 
       // Replace some edges of the output graph with edges of the reverse graph.
       auto kr = std::min<uint32_t>(rev_graph_count.data_handle()[i], output_graph_degree);
@@ -1510,15 +1538,31 @@ void optimize(
         if (my_rev_graph[kr] < graph_size) {
           uint64_t pos = pos_in_array<IdxT>(my_rev_graph[kr], my_out_graph, output_graph_degree);
           if (pos < num_protected_edges) { continue; }
-          uint64_t num_shift = pos - num_protected_edges;
-          if (pos >= output_graph_degree) {
-            num_shift = output_graph_degree - num_protected_edges - 1;
+
+          if (pos < effective_degree) {
+            uint64_t num_shift = pos - num_protected_edges;
+            shift_array<IdxT>(my_out_graph + num_protected_edges, num_shift);
+            my_out_graph[num_protected_edges] = my_rev_graph[kr];
+          } else if (pos < output_graph_degree) {
+            continue;
+          } else if (effective_degree < output_graph_degree) {
+            my_out_graph[effective_degree] = my_rev_graph[kr];
+            effective_degree++;
+          } else {
+            uint64_t num_shift = effective_degree - num_protected_edges - 1;
+            shift_array<IdxT>(my_out_graph + num_protected_edges, num_shift);
+            my_out_graph[num_protected_edges] = my_rev_graph[kr];
           }
-          shift_array<IdxT>(my_out_graph + num_protected_edges, num_shift);
-          my_out_graph[num_protected_edges] = my_rev_graph[kr];
         }
       }
 
+      if (variable_graph_degree) {
+        for (uint32_t j = effective_degree; j < output_graph_degree; j++) {
+          my_out_graph[j] = static_cast<IdxT>(-1);
+        }
+        natural_degree_vec(i) = effective_degree;
+      }
+
       // If guarantee_connectivity == true, move the output neighbor list from the temporal list to
       // the output list. If false, the copy is not needed because my_out_graph is a pointer to the
       // output buffer.
@@ -1532,6 +1576,17 @@ void optimize(
                  "Failed to merge the MST, pruned, and reverse edge graphs. Some nodes have too "
                  "many MST optimization edges.");
 
+    if (variable_graph_degree) {
+      uint64_t total_natural = 0;
+#pragma omp parallel for reduction(+ : total_natural)
+      for (uint64_t i = 0; i < graph_size; i++) {
+        total_natural += natural_degree_vec(i);
+      }
+      RAFT_LOG_INFO("# Variable graph degree: avg natural degree = %.2f / %u",
+                    static_cast<double>(total_natural) / graph_size,
+                    output_graph_degree);
+    }
+
     const double time_replace_end = cur_time();
     RAFT_LOG_DEBUG("# Replacing edges time: %.1lf ms",
                    (time_replace_end - time_replace_start) * 1000.0);
@@ -1607,6 +1662,8 @@ void optimize(
       for (uint32_t j = 0; j < output_graph_degree; j++) {
         const auto neighbor_a = my_out_graph[j];
 
+        if (neighbor_a == static_cast<IdxT>(-1)) { continue; }
+
         // Check oor
         if (neighbor_a > graph_size) {
           num_oor++;
diff --git a/cpp/src/neighbors/detail/hnsw.hpp b/cpp/src/neighbors/detail/hnsw.hpp
index 4914a0fa1b..5052fc538f 100644
--- a/cpp/src/neighbors/detail/hnsw.hpp
+++ b/cpp/src/neighbors/detail/hnsw.hpp
@@ -279,12 +279,16 @@ std::enable_if_t<hierarchy == HnswHierarchy::CPU, std::unique_ptr<index<T>>> fro
   for (size_t i = 0; i < static_cast<size_t>(host_graph_view.extent(0)); ++i) {
     auto hnsw_internal_id = appr_algo->label_lookup_.find(i)->second;
     auto ll_i             = appr_algo->get_linklist0(hnsw_internal_id);
-    appr_algo->setListCount(ll_i, host_graph_view.extent(1));
-    auto* data = (uint32_t*)(ll_i + 1);
+    size_t actual_count   = 0;
+    auto* data            = (uint32_t*)(ll_i + 1);
     for (size_t j = 0; j < static_cast<size_t>(host_graph_view.extent(1)); ++j) {
-      auto neighbor_internal_id = appr_algo->label_lookup_.find(host_graph(i, j))->second;
-      data[j]                   = neighbor_internal_id;
+      auto neighbor_id = host_graph(i, j);
+      if (neighbor_id == static_cast<uint32_t>(-1)) { break; }
+      auto neighbor_internal_id = appr_algo->label_lookup_.find(neighbor_id)->second;
+      data[actual_count]        = neighbor_internal_id;
+      actual_count++;
     }
+    appr_algo->setListCount(ll_i, actual_count);
   }
 
   hnsw_index->set_index(std::move(appr_algo));
@@ -676,9 +680,16 @@ void serialize_to_hnswlib_from_disk(raft::resources const& res,
     for (int64_t batch_idx = 0; batch_idx < current_batch_size; batch_idx++) {
       const int64_t i = batch_start + batch_idx;
 
-      os.write(reinterpret_cast<char*>(&graph_degree_int), sizeof(int));
-
+      int actual_degree_int = graph_degree_int;
       const IdxT* graph_row = &graph_buffer(batch_idx, 0);
+      for (int gj = 0; gj < graph_degree_int; gj++) {
+        if (graph_row[gj] == static_cast<IdxT>(-1)) {
+          actual_degree_int = gj;
+          break;
+        }
+      }
+      os.write(reinterpret_cast<char*>(&actual_degree_int), sizeof(int));
+
       os.write(reinterpret_cast<const char*>(graph_row), sizeof(IdxT) * graph_degree_int);
 
       if (odd_graph_degree) {
@@ -1039,12 +1050,16 @@ std::enable_if_t<hierarchy == HnswHierarchy::GPU, std::unique_ptr<index<T>>> fro
     common::nvtx::range<common::nvtx::domain::cuvs> copy_scope("get_linklist0<host>");
 #pragma omp parallel for num_threads(num_threads)
     for (int64_t i = 0; i < n_rows; i++) {
-      auto ll_i = appr_algo->get_linklist0(i);
-      appr_algo->setListCount(ll_i, degree);
-      auto* data = (uint32_t*)(ll_i + 1);
+      auto ll_i            = appr_algo->get_linklist0(i);
+      auto* data           = (uint32_t*)(ll_i + 1);
+      int64_t actual_count = 0;
       for (int64_t j = 0; j < degree; j++) {
-        data[j] = graph_ptr[i * degree + j];
+        auto neighbor_id = graph_ptr[i * degree + j];
+        if (neighbor_id == static_cast<uint32_t>(-1)) { break; }
+        data[actual_count] = neighbor_id;
+        actual_count++;
       }
+      appr_algo->setListCount(ll_i, actual_count);
     }
   } else {
     common::nvtx::range<common::nvtx::domain::cuvs> copy_scope("get_linklist0<device>");
@@ -1056,11 +1071,20 @@ std::enable_if_t<hierarchy == HnswHierarchy::GPU, std::unique_ptr<index<T>>> fro
                                     n_rows,
                                     cudaMemcpyDefault,
                                     raft::resource::get_cuda_stream(res)));
+    raft::resource::sync_stream(res);
 #pragma omp parallel for num_threads(num_threads)
     for (int64_t i = 0; i < n_rows; i++) {
-      appr_algo->setListCount(appr_algo->get_linklist0(i), degree);
+      auto ll_i            = appr_algo->get_linklist0(i);
+      auto* data           = (uint32_t*)(ll_i + 1);
+      int64_t actual_count = degree;
+      for (int64_t j = 0; j < degree; j++) {
+        if (data[j] == static_cast<uint32_t>(-1)) {
+          actual_count = j;
+          break;
+        }
+      }
+      appr_algo->setListCount(ll_i, actual_count);
     }
-    raft::resource::sync_stream(res);
   }
   hnsw_index->set_index(std::move(appr_algo));
   return hnsw_index;

From 261fc3d64a58ed26a741115f255fa6900fc7801d Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 16 Apr 2026 07:20:45 +0200
Subject: [PATCH 2/5] Improve the trimming logic

---
 cpp/include/cuvs/neighbors/cagra.hpp          |  2 +-
 cpp/src/neighbors/detail/cagra/graph_core.cuh | 42 +++++--------------
 2 files changed, 12 insertions(+), 32 deletions(-)

diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp
index 3c9f9b5546..1ae3334fcc 100644
--- a/cpp/include/cuvs/neighbors/cagra.hpp
+++ b/cpp/include/cuvs/neighbors/cagra.hpp
@@ -147,7 +147,7 @@ struct index_params : cuvs::neighbors::index_params {
   /**
    * Fraction of output graph_degree used as the target for low-detour edges
    * during the pruning step. Must be in (0, 1]. The default value of 1.0
-   * disables variable-degree logic (original CAGRA behavior). Values < 1.0
+   * disables variable-degree logic (normal CAGRA behavior). Values < 1.0
    * enable variable-degree graphs: the optimize step finds the minimum detour
    * threshold that covers at least ceil(graph_degree * fraction) edges per node,
    * then lets reverse edges expand the degree further. Unused slots are filled
diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index eec9b1179f..cf69455fed 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -1431,8 +1431,9 @@ void optimize(
     for (uint64_t k = 0; k < output_graph_degree; k++) {
 #pragma omp parallel for
       for (uint64_t i = 0; i < graph_size; i++) {
-        // dest_nodes.data_handle()[i] = output_graph_ptr[k + (output_graph_degree * i)];
-        dest_nodes(i) = output_graph_ptr[k + (output_graph_degree * i)];
+        dest_nodes(i) = (k < natural_degree_vec(i))
+                          ? output_graph_ptr[k + (output_graph_degree * i)]
+                          : static_cast<IdxT>(graph_size);
       }
       raft::resource::sync_stream(res);
 
@@ -1513,23 +1514,9 @@ void optimize(
         }
       }
 
-      const auto num_protected_edges =
-        std::max<uint64_t>(mst_graph_num_edges_ptr[i], effective_degree / 2);
+      const auto num_protected_edges = std::max<uint64_t>(
+        mst_graph_num_edges_ptr[i], std::min<uint64_t>(effective_degree, output_graph_degree / 2));
       if (num_protected_edges > effective_degree) { check_num_protected_edges = false; }
-      if (num_protected_edges == effective_degree) {
-        if (variable_graph_degree) {
-          for (uint32_t j = effective_degree; j < output_graph_degree; j++) {
-            my_out_graph[j] = static_cast<IdxT>(-1);
-          }
-          natural_degree_vec(i) = effective_degree;
-        }
-        if (guarantee_connectivity) {
-          for (uint32_t j = 0; j < output_graph_degree; j++) {
-            output_graph_ptr[(output_graph_degree * i) + j] = my_out_graph[j];
-          }
-        }
-        continue;
-      }
 
       // Replace some edges of the output graph with edges of the reverse graph.
       auto kr = std::min<uint32_t>(rev_graph_count.data_handle()[i], output_graph_degree);
@@ -1539,20 +1526,13 @@ void optimize(
           uint64_t pos = pos_in_array<IdxT>(my_rev_graph[kr], my_out_graph, output_graph_degree);
           if (pos < num_protected_edges) { continue; }
 
-          if (pos < effective_degree) {
-            uint64_t num_shift = pos - num_protected_edges;
-            shift_array<IdxT>(my_out_graph + num_protected_edges, num_shift);
-            my_out_graph[num_protected_edges] = my_rev_graph[kr];
-          } else if (pos < output_graph_degree) {
-            continue;
-          } else if (effective_degree < output_graph_degree) {
-            my_out_graph[effective_degree] = my_rev_graph[kr];
-            effective_degree++;
-          } else {
-            uint64_t num_shift = effective_degree - num_protected_edges - 1;
-            shift_array<IdxT>(my_out_graph + num_protected_edges, num_shift);
-            my_out_graph[num_protected_edges] = my_rev_graph[kr];
+          uint64_t num_shift = pos - num_protected_edges;
+          if (pos >= output_graph_degree) {
+            num_shift = output_graph_degree - num_protected_edges - 1;
           }
+          shift_array<IdxT>(my_out_graph + num_protected_edges, num_shift);
+          my_out_graph[num_protected_edges] = my_rev_graph[kr];
+          if (effective_degree < output_graph_degree) { effective_degree++; }
         }
       }
 

From b642179b8685d47ccfcd9eefef7f5600d4f5dbed Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 16 Apr 2026 07:27:41 +0200
Subject: [PATCH 3/5] Improve the wording

---
 cpp/include/cuvs/neighbors/cagra.hpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp
index 1ae3334fcc..b93cb8b0f1 100644
--- a/cpp/include/cuvs/neighbors/cagra.hpp
+++ b/cpp/include/cuvs/neighbors/cagra.hpp
@@ -145,7 +145,10 @@ struct index_params : cuvs::neighbors::index_params {
   /** Degree of output graph. */
   size_t graph_degree = 64;
   /**
-   * Fraction of output graph_degree used as the target for low-detour edges
+   * Fraction of output graph_degree to define the minimum output graph degree,
+   * allowing variable-degree neighbor graphs.
+   *
+   * This fraction is used as the target for low-detour edges
    * during the pruning step. Must be in (0, 1]. The default value of 1.0
    * disables variable-degree logic (normal CAGRA behavior). Values < 1.0
    * enable variable-degree graphs: the optimize step finds the minimum detour

From de0bcaa6c9668e9bf8a6d734ed5abe6cf3c4fec7 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 16 Apr 2026 10:31:08 +0200
Subject: [PATCH 4/5] Remove irrelevant file

---
 cpp/src/core/setup.cpp | 140 -----------------------------------------
 1 file changed, 140 deletions(-)
 delete mode 100644 cpp/src/core/setup.cpp

diff --git a/cpp/src/core/setup.cpp b/cpp/src/core/setup.cpp
deleted file mode 100644
index 1f05390d51..0000000000
--- a/cpp/src/core/setup.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#include <cuda_profiler_api.h>
-#include <cuda_runtime_api.h>
-
-#include <rmm/mr/device/cuda_memory_resource.hpp>
-#include <rmm/mr/device/failure_callback_resource_adaptor.hpp>
-#include <rmm/mr/device/managed_memory_resource.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/mr/device/pool_memory_resource.hpp>
-
-#include <raft/core/logger.hpp>
-#include <raft/util/cuda_rt_essentials.hpp>
-#include <raft/util/integer_utils.hpp>
-
-#include <algorithm>
-#include <array>
-#include <cstddef>
-#include <exception>
-#include <optional>
-
-/** Report a more verbose error with a backtrace when OOM occurs on RMM side. */
-inline auto rmm_oom_callback(std::size_t bytes, void*) -> bool
-{
-  auto cuda_status = cudaGetLastError();
-  size_t free      = 0;
-  size_t total     = 0;
-  RAFT_CUDA_TRY_NO_THROW(cudaMemGetInfo(&free, &total));
-  RAFT_FAIL(
-    "[cuVS Performance] Failed to allocate %zu bytes using RMM memory resource. "
-    "NB: latest cuda status = %s, free memory = %zu, total memory = %zu.",
-    bytes,
-    cudaGetErrorName(cuda_status),
-    free,
-    total);
-}
-
-/** Helper class to setup a pool memory resource for a single device. */
-class global_mem_resource {
- public:
-  using pool_mr_type  = rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>;
-  using mr_type       = rmm::mr::failure_callback_resource_adaptor<pool_mr_type>;
-  using large_mr_type = rmm::mr::managed_memory_resource;
-
-  constexpr static size_t kInitialSize           = 1024ull * 1024ull * 1024ull;
-  constexpr static double kMaxMemoryUsage        = 0.8;
-  constexpr static double kMaxInitialMemoryUsage = 0.5;
-
-  global_mem_resource()
-  try
-    : orig_resource_{rmm::mr::get_current_device_resource()},
-      pool_resource_(orig_resource_, compute_initial_size(), compute_max_size()),
-      resource_(&pool_resource_, rmm_oom_callback, nullptr) {
-    rmm::mr::set_current_device_resource(&resource_);
-  } catch (const std::exception& e) {
-    auto cuda_status = cudaGetLastError();
-    size_t free      = 0;
-    size_t total     = 0;
-    RAFT_CUDA_TRY_NO_THROW(cudaMemGetInfo(&free, &total));
-    RAFT_FAIL(
-      "Failed to initialize shared raft resources (NB: latest cuda status = %s, free memory = %zu, "
-      "total memory = %zu): %s",
-      cudaGetErrorName(cuda_status),
-      free,
-      total,
-      e.what());
-  }
-
-  global_mem_resource(global_mem_resource&&)                               = delete;
-  auto operator=(global_mem_resource&&) -> global_mem_resource&            = delete;
-  global_mem_resource(const global_mem_resource& res)                      = delete;
-  auto operator=(const global_mem_resource& other) -> global_mem_resource& = delete;
-
-  ~global_mem_resource() noexcept { rmm::mr::set_current_device_resource(orig_resource_); }
-
- private:
-  rmm::mr::device_memory_resource* orig_resource_;
-  pool_mr_type pool_resource_;
-  mr_type resource_;
-
-  static auto compute_initial_size() -> size_t
-  {
-    size_t free_bytes  = 0;
-    size_t total_bytes = 0;
-    if (cudaMemGetInfo(&free_bytes, &total_bytes) != cudaSuccess) { free_bytes = kInitialSize; }
-    auto limit = static_cast<size_t>(free_bytes * kMaxInitialMemoryUsage);
-    return raft::round_up_safe<size_t>(std::min<size_t>(kInitialSize, limit), 256ull);
-  }
-
-  static auto compute_max_size() -> size_t
-  {
-    size_t free_bytes  = 0;
-    size_t total_bytes = 0;
-    if (cudaMemGetInfo(&free_bytes, &total_bytes) != cudaSuccess) { total_bytes = kInitialSize; }
-    return raft::round_up_safe<size_t>(total_bytes * kMaxMemoryUsage, 256ull);
-  }
-};
-
-/** Remember and restore the current device. */
-struct keep_current_device_raii {
-  int initial_device_id = 0;
-  keep_current_device_raii() { cudaGetDevice(&initial_device_id); }
-  ~keep_current_device_raii() { cudaSetDevice(initial_device_id); }
-};
-
-/** Handles for each RMM memory resource. */
-static std::array<std::optional<global_mem_resource>, 8> global_mem_resource_;
-
-// Initialize at the moment libcuvs.so is loaded.
-__attribute__((constructor)) void cuvs_performance_init()
-{
-  int device_count = 0;
-  if (cudaGetDeviceCount(&device_count) != cudaSuccess || device_count <= 0) { return; }
-
-  // Start CUDA profiler (ignore errors if not supported in this context)
-  (void)cudaProfilerStart();
-
-  keep_current_device_raii keep_current_device;
-
-  // == DISABLED FOR NOW TO TEST ACTUAL PERFORMANCE ==
-  // Configure each device with a pool memory resource
-  // for (int device_id = 0; device_id < device_count; ++device_id) {
-  //   if (cudaSetDevice(device_id) != cudaSuccess) { continue; }
-  //   global_mem_resource_[device_id].emplace();
-  // }
-}
-
-// Cleanup before unloading libcuvs.so.
-__attribute__((destructor)) void cuvs_performance_cleanup()
-{
-  (void)cudaProfilerStop();
-
-  keep_current_device_raii keep_current_device;
-  for (auto& global_mem_resource : global_mem_resource_) {
-    global_mem_resource.reset();
-  }
-}

From a0f6379699d6fb68e87e4880d47dfec71778d6bc Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 17 Apr 2026 15:54:25 +0200
Subject: [PATCH 5/5] Set defaults for HNSW heuristics and fix parameter
 parsing

---
 .../src/cuvs/cuvs_ann_bench_param_parser.h    | 60 +++++++++++--------
 cpp/src/neighbors/cagra.cpp                   | 12 ++--
 2 files changed, 41 insertions(+), 31 deletions(-)

diff --git a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
index 2eaf3123a0..32ca6040a0 100644
--- a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
+++ b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
@@ -265,24 +265,52 @@ void parse_build_param(const nlohmann::json& conf, cuvs::neighbors::cagra::index
     params.guarantee_connectivity = conf.at("guarantee_connectivity");
   }
 
-  // Override the graph_build_algo if requested explicitly
+  if (conf.contains("variable_graph_degree_fraction")) {
+    params.variable_graph_degree_fraction = conf.at("variable_graph_degree_fraction");
+  }
+
+  // Extract build-algo-specific parameters
+  nlohmann::json ivf_pq_build_conf  = collect_conf_with_prefix(conf, "ivf_pq_build_");
+  nlohmann::json ivf_pq_search_conf = collect_conf_with_prefix(conf, "ivf_pq_search_");
+  nlohmann::json nn_descent_conf    = collect_conf_with_prefix(conf, "nn_descent_");
+  nlohmann::json ace_conf           = collect_conf_with_prefix(conf, "ace_");
+
+  // Determine and initialize graph build algorithm.
+  // Priority 1: explicit "graph_build_algo" config key.
+  // Priority 2: infer from algorithm-specific prefixed config keys (only when monostate).
+  // Priority 3: leave as-is (from prior heuristics or monostate for AUTO at build time).
+  std::string graph_build_algo;
   if (conf.contains("graph_build_algo")) {
-    if (conf.at("graph_build_algo") == "IVF_PQ") {
+    graph_build_algo = conf.at("graph_build_algo");
+  } else if (std::holds_alternative<std::monostate>(params.graph_build_params)) {
+    if (!ivf_pq_build_conf.empty() || !ivf_pq_search_conf.empty()) {
+      graph_build_algo = "IVF_PQ";
+    } else if (!nn_descent_conf.empty()) {
+      graph_build_algo = "NN_DESCENT";
+    } else if (!ace_conf.empty()) {
+      graph_build_algo = "ACE";
+    }
+    // else: leave as monostate → AUTO in cagra_build.cuh
+  }
+
+  if (!graph_build_algo.empty()) {
+    if (graph_build_algo == "IVF_PQ") {
       if (!std::holds_alternative<cuvs::neighbors::graph_build_params::ivf_pq_params>(
             params.graph_build_params)) {
         params.graph_build_params = cuvs::neighbors::graph_build_params::ivf_pq_params{};
       }
-    } else if (conf.at("graph_build_algo") == "NN_DESCENT") {
+    } else if (graph_build_algo == "NN_DESCENT") {
       if (!std::holds_alternative<cuvs::neighbors::graph_build_params::nn_descent_params>(
             params.graph_build_params)) {
-        params.graph_build_params = cuvs::neighbors::graph_build_params::nn_descent_params{};
+        params.graph_build_params = cuvs::neighbors::graph_build_params::nn_descent_params(
+          params.intermediate_graph_degree, params.metric);
       }
-    } else if (conf.at("graph_build_algo") == "ACE") {
+    } else if (graph_build_algo == "ACE") {
       if (!std::holds_alternative<cuvs::neighbors::graph_build_params::ace_params>(
             params.graph_build_params)) {
         params.graph_build_params = cuvs::neighbors::graph_build_params::ace_params{};
       }
-    } else if (conf.at("graph_build_algo") == "ITERATIVE_SEARCH") {
+    } else if (graph_build_algo == "ITERATIVE_SEARCH") {
       if (!std::holds_alternative<cuvs::neighbors::graph_build_params::iterative_search_params>(
             params.graph_build_params)) {
         params.graph_build_params = cuvs::neighbors::graph_build_params::iterative_search_params{};
@@ -290,26 +318,6 @@ void parse_build_param(const nlohmann::json& conf, cuvs::neighbors::cagra::index
     }
   }
 
-  // Parse build-algo-specific parameters and use them to decide on the algo type
-  nlohmann::json ivf_pq_build_conf  = collect_conf_with_prefix(conf, "ivf_pq_build_");
-  nlohmann::json ivf_pq_search_conf = collect_conf_with_prefix(conf, "ivf_pq_search_");
-  nlohmann::json nn_descent_conf    = collect_conf_with_prefix(conf, "nn_descent_");
-  nlohmann::json ace_conf           = collect_conf_with_prefix(conf, "ace_");
-
-  // When graph_build_algo is not specified, leave graph_build_params as monostate so the
-  // CAGRA build uses AUTO selection (NN_DESCENT or IVF_PQ based on dataset/heuristics).
-  // Only infer from algo-specific config keys when present.
-  if (std::holds_alternative<std::monostate>(params.graph_build_params)) {
-    if (!ivf_pq_build_conf.empty() || !ivf_pq_search_conf.empty()) {
-      params.graph_build_params = cuvs::neighbors::graph_build_params::ivf_pq_params{};
-    } else if (!nn_descent_conf.empty()) {
-      params.graph_build_params = cuvs::neighbors::graph_build_params::nn_descent_params{};
-    } else if (!ace_conf.empty()) {
-      params.graph_build_params = cuvs::neighbors::graph_build_params::ace_params{};
-    }
-    // else: leave as monostate → AUTO in cagra_build.cuh
-  }
-
   // Apply build-algo-specific parameters
   std::visit(
     [&](auto& arg) {
diff --git a/cpp/src/neighbors/cagra.cpp b/cpp/src/neighbors/cagra.cpp
index 6aa5737e36..cd63de9a24 100644
--- a/cpp/src/neighbors/cagra.cpp
+++ b/cpp/src/neighbors/cagra.cpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -40,13 +40,15 @@ cagra::index_params index_params::from_hnsw_params(raft::matrix_extent<int64_t>
   cagra::index_params params;
   switch (heuristic) {
     case hnsw_heuristic_type::SAME_GRAPH_FOOTPRINT:
-      params.graph_degree              = M * 2;
-      params.intermediate_graph_degree = M * 3;
+      params.graph_degree                   = M * 2;
+      params.intermediate_graph_degree      = M * 3;
+      params.variable_graph_degree_fraction = 0.35;
       break;
     case hnsw_heuristic_type::SIMILAR_SEARCH_PERFORMANCE:
     default:
-      params.graph_degree              = 2 + M * 2 / 3;
-      params.intermediate_graph_degree = M + M * ef_construction / 256;
+      params.graph_degree                   = M;
+      params.intermediate_graph_degree      = M + M * ef_construction / 256;
+      params.variable_graph_degree_fraction = 0.7;
       break;
   }
   params.graph_build_params =