11/*
2- * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
2+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
33 * SPDX-License-Identifier: Apache-2.0
44 */
55#pragma once
@@ -351,15 +351,9 @@ void bench_search(::benchmark::State& state,
351351
352352 // Each thread calculates recall on their partition of queries.
353353 // evaluate recall
354- if (dataset->max_k () >= k) {
355- const std::int32_t * gt = dataset->gt_set ();
356- const std::uint32_t * filter_bitset = dataset->filter_bitset (MemoryType::kHostMmap );
357- auto filter = [filter_bitset](std::int32_t i) -> bool {
358- if (filter_bitset == nullptr ) { return true ; }
359- auto word = filter_bitset[i >> 5 ];
360- return word & (1 << (i & 31 ));
361- };
362- const std::uint32_t max_k = dataset->max_k ();
354+ if (dataset->max_k () >= k && dataset->gt_maps ().has_value ()) {
355+ // gt_maps[i] is a hash map of {id, neighbor_rank} for query i
356+ const auto & gt_maps = dataset->gt_maps ();
363357 result_buf.transfer_data (MemoryType::kHost , current_algo_props->query_memory_type );
364358 auto * neighbors_host = reinterpret_cast <index_type*>(result_buf.data (MemoryType::kHost ));
365359 std::size_t rows = std::min (queries_processed, query_set_size);
@@ -369,39 +363,49 @@ void bench_search(::benchmark::State& state,
369363 // We go through the groundtruth with same stride as the benchmark loop.
370364 size_t out_offset = 0 ;
371365 size_t batch_offset = (state.thread_index () * n_queries) % query_set_size;
366+ // Avoid CPU oversubscription when parallelizing recall calculation loop
367+ int num_recall_calculation_worker_threads =
368+ std::thread::hardware_concurrency () / benchmark_n_threads - 1 ; // -1 for the main thread
369+ // ensure non-negative number of workers (possible if hardware_concurrency()
370+ // does not return an expected value) by clamping to 0
371+ if (num_recall_calculation_worker_threads < 0 ) { num_recall_calculation_worker_threads = 0 ; }
372372 while (out_offset < rows) {
373- for (std::size_t i = 0 ; i < n_queries; i++) {
374- size_t i_orig_idx = batch_offset + i;
375- size_t i_out_idx = out_offset + i;
376- if (i_out_idx < rows) {
377- /* NOTE: recall correctness & filtering
378-
379- In the loop below, we filter the ground truth values on-the-fly.
380- We need enough ground truth values to compute recall correctly though.
381- But the ground truth file only contains `max_k` values per row; if there are less valid
382- values than k among them, we overestimate the recall. Essentially, we compare the first
383- `filter_pass_count` values of the algorithm output, and this counter can be less than `k`.
384- In the extreme case of very high filtering rate, we may be bypassing entire rows of
385- results. However, this is still better than no recall estimate at all.
386-
387- TODO: consider generating the filtered ground truth on-the-fly
388- */
389- uint32_t filter_pass_count = 0 ;
390- for (std::uint32_t l = 0 ; l < max_k && filter_pass_count < k; l++) {
391- auto exp_idx = gt[i_orig_idx * max_k + l];
392- if (!filter (exp_idx)) { continue ; }
393- filter_pass_count++;
394- for (std::uint32_t j = 0 ; j < k; j++) {
395- auto act_idx = static_cast <std::int32_t >(neighbors_host[i_out_idx * k + j]);
396- if (act_idx == exp_idx) {
397- match_count++;
398- break ;
399- }
400- }
373+ std::vector<std::thread> recall_calculation_workers;
374+ recall_calculation_workers.reserve (num_recall_calculation_worker_threads);
375+ std::vector<std::size_t > local_match_count (num_recall_calculation_worker_threads + 1 );
376+ std::vector<std::size_t > local_total_count (num_recall_calculation_worker_threads + 1 );
377+ int chunk_size =
378+ n_queries / (num_recall_calculation_worker_threads + 1 ); // +1 for the main thread
379+ int remainder = n_queries % (num_recall_calculation_worker_threads + 1 );
380+ auto recall_calculation = [&](int start, int end, int tid) -> void {
381+ for (int i = start; i < end; ++i) {
382+ size_t i_orig_idx = batch_offset + i;
383+ size_t i_out_idx = out_offset + i;
384+ if (i_out_idx < rows) {
385+ auto * candidates = neighbors_host + i_out_idx * k;
386+ auto [matching, total] = gt_maps->count_matches (i_orig_idx, candidates, k);
387+ local_match_count[tid] += matching;
388+ local_total_count[tid] += total;
401389 }
402- total_count += filter_pass_count;
403390 }
391+ };
392+ // launch worker threads
393+ int start = 0 ;
394+ for (int tid = 0 ; tid < num_recall_calculation_worker_threads; tid++) {
395+ int end = start + chunk_size;
396+ if (tid < remainder) { ++end; }
397+ recall_calculation_workers.emplace_back (recall_calculation, start, end, tid);
398+ start = end;
404399 }
400+ // main thread works on last chunk
401+ recall_calculation (start, n_queries, num_recall_calculation_worker_threads);
402+ // join all worker threads
403+ for (auto & worker : recall_calculation_workers) {
404+ worker.join ();
405+ }
406+ match_count += std::accumulate (local_match_count.begin (), local_match_count.end (), 0 );
407+ total_count += std::accumulate (local_total_count.begin (), local_total_count.end (), 0 );
408+
405409 out_offset += n_queries;
406410 batch_offset = (batch_offset + queries_stride) % query_set_size;
407411 }
0 commit comments