Skip to content

Commit fe095bb

Browse files
authored
Merge branch 'main' into 26.06-kmeans-fit
2 parents df1c56e + 3f77e46 commit fe095bb

87 files changed

Lines changed: 2132 additions & 1192 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ repos:
9292
files: rust/.*
9393
language: rust
9494
- repo: https://github.com/codespell-project/codespell
95-
rev: v2.2.2
95+
rev: v2.4.1
9696
hooks:
9797
- id: codespell
9898
additional_dependencies: [tomli]

c/tests/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# =============================================================================
22
# cmake-format: off
3-
# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
3+
# SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION.
44
# SPDX-License-Identifier: Apache-2.0
55
# cmake-format: on
66
# =============================================================================

ci/build_go.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#!/bin/bash
2-
# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
2+
# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
33
# SPDX-License-Identifier: Apache-2.0
44

55
set -euo pipefail

cpp/CMakeLists.txt

Lines changed: 22 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ message(VERBOSE "cuVS: Build CPU only components: ${BUILD_CPU_ONLY}")
111111
message(VERBOSE "cuVS: Build ANN benchmarks: ${BUILD_CUVS_BENCH}")
112112
message(VERBOSE "cuVS: Build only the shared library: ${CUVS_COMPILE_DYNAMIC_ONLY}")
113113
message(VERBOSE "cuVS: Enable detection of conda environment for dependencies: ${DETECT_CONDA_ENV}")
114-
message(VERBOSE "cuVS: Disable depreaction warnings " ${DISABLE_DEPRECATION_WARNINGS})
114+
message(VERBOSE "cuVS: Disable deprecation warnings " ${DISABLE_DEPRECATION_WARNINGS})
115115
message(VERBOSE "cuVS: Disable OpenMP: ${DISABLE_OPENMP}")
116116
message(VERBOSE "cuVS: Enable kernel resource usage info: ${CUDA_ENABLE_KERNELINFO}")
117117
message(VERBOSE "cuVS: Enable lineinfo in nvcc: ${CUDA_ENABLE_LINEINFO}")
@@ -359,6 +359,7 @@ if(NOT BUILD_CPU_ONLY)
359359

360360
set(JIT_LTO_TARGET_ARCHITECTURE "")
361361
set(JIT_LTO_COMPILATION OFF)
362+
set(jit_lto_files)
362363
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
363364
set(JIT_LTO_TARGET_ARCHITECTURE "75-real")
364365
set(JIT_LTO_COMPILATION ON)
@@ -436,28 +437,16 @@ if(NOT BUILD_CPU_ONLY)
436437
)
437438
endblock()
438439

439-
add_library(
440-
cuvs_jit_lto_kernels STATIC
441-
${interleaved_scan_files}
442-
${metric_files}
443-
${filter_files}
444-
${post_lambda_files}
445-
src/detail/jit_lto/AlgorithmLauncher.cu
446-
src/detail/jit_lto/AlgorithmPlanner.cu
447-
src/detail/jit_lto/FragmentDatabase.cu
448-
src/detail/jit_lto/FragmentEntry.cu
449-
src/detail/jit_lto/nvjitlink_checker.cpp
440+
set(jit_lto_files
441+
${interleaved_scan_files}
442+
${metric_files}
443+
${filter_files}
444+
${post_lambda_files}
445+
src/detail/jit_lto/AlgorithmLauncher.cpp
446+
src/detail/jit_lto/AlgorithmPlanner.cpp
447+
src/detail/jit_lto/FragmentEntry.cpp
448+
src/detail/jit_lto/nvjitlink_checker.cpp
450449
)
451-
set_target_properties(
452-
cuvs_jit_lto_kernels PROPERTIES POSITION_INDEPENDENT_CODE ON CXX_STANDARD 20
453-
)
454-
target_include_directories(
455-
cuvs_jit_lto_kernels
456-
PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include" "${CMAKE_CURRENT_SOURCE_DIR}/src"
457-
"${CMAKE_CURRENT_SOURCE_DIR}/../c/include"
458-
)
459-
target_link_libraries(cuvs_jit_lto_kernels PRIVATE raft::raft)
460-
add_library(cuvs::cuvs_jit_lto_kernels ALIAS cuvs_jit_lto_kernels)
461450
endif()
462451

463452
add_library(
@@ -667,6 +656,7 @@ if(NOT BUILD_CPU_ONLY)
667656
src/stats/silhouette_score.cu
668657
src/stats/trustworthiness_score.cu
669658
${CUVS_MG_ALGOS}
659+
${jit_lto_files}
670660
)
671661

672662
set_target_properties(
@@ -778,12 +768,8 @@ if(NOT BUILD_CPU_ONLY)
778768
$<BUILD_LOCAL_INTERFACE:$<TARGET_NAME_IF_EXISTS:NCCL::NCCL>>
779769
$<BUILD_LOCAL_INTERFACE:$<TARGET_NAME_IF_EXISTS:hnswlib::hnswlib>>
780770
$<$<BOOL:${CUVS_NVTX}>:CUDA::nvtx3>
781-
PRIVATE
782-
$<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
783-
$<COMPILE_ONLY:nvidia::cutlass::cutlass>
784-
$<COMPILE_ONLY:cuco::cuco>
785-
$<$<BOOL:${JIT_LTO_COMPILATION}>:CUDA::nvJitLink>
786-
$<$<BOOL:${JIT_LTO_COMPILATION}>:$<LINK_LIBRARY:WHOLE_ARCHIVE,cuvs::cuvs_jit_lto_kernels>>
771+
PRIVATE $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX> $<COMPILE_ONLY:nvidia::cutlass::cutlass>
772+
$<COMPILE_ONLY:cuco::cuco> $<$<BOOL:${JIT_LTO_COMPILATION}>:CUDA::nvJitLink>
787773
)
788774

789775
# ensure CUDA symbols aren't relocated to the middle of the debug build binaries
@@ -840,13 +826,11 @@ SECTIONS
840826
${CUVS_CTK_MATH_DEPENDENCIES}
841827
$<TARGET_NAME_IF_EXISTS:NCCL::NCCL> # needs to be public for DT_NEEDED
842828
$<BUILD_LOCAL_INTERFACE:$<TARGET_NAME_IF_EXISTS:hnswlib::hnswlib>> # header only
843-
PRIVATE
844-
$<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
845-
$<$<BOOL:${JIT_LTO_COMPILATION}>:CUDA::nvJitLink>
846-
$<$<BOOL:${CUVS_NVTX}>:CUDA::nvtx3>
847-
$<COMPILE_ONLY:nvidia::cutlass::cutlass>
848-
$<COMPILE_ONLY:cuco::cuco>
849-
$<$<BOOL:${JIT_LTO_COMPILATION}>:$<LINK_LIBRARY:WHOLE_ARCHIVE,cuvs::cuvs_jit_lto_kernels>>
829+
PRIVATE $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
830+
$<$<BOOL:${JIT_LTO_COMPILATION}>:CUDA::nvJitLink>
831+
$<$<BOOL:${CUVS_NVTX}>:CUDA::nvtx3>
832+
$<COMPILE_ONLY:nvidia::cutlass::cutlass>
833+
$<COMPILE_ONLY:cuco::cuco>
850834
)
851835
endif()
852836

@@ -887,11 +871,9 @@ target_compile_definitions(cuvs::cuvs INTERFACE $<$<BOOL:${CUVS_NVTX}>:NVTX_ENAB
887871
include(GNUInstallDirs)
888872
include(CPack)
889873

890-
set(target_names cuvs cuvs_static cuvs_jit_lto_kernels cuvs_cpp_headers cuvs_c)
891-
set(component_names cuvs_shared cuvs_static cuvs_static cuvs_cpp_headers c_api)
892-
set(export_names cuvs-shared-exports cuvs-static-exports cuvs-static-exports
893-
cuvs-cpp-headers-exports cuvs-c-exports
894-
)
874+
set(target_names cuvs cuvs_static cuvs_cpp_headers cuvs_c)
875+
set(component_names cuvs_shared cuvs_static cuvs_cpp_headers c_api)
876+
set(export_names cuvs-shared-exports cuvs-static-exports cuvs-cpp-headers-exports cuvs-c-exports)
895877
foreach(target component export IN ZIP_LISTS target_names component_names export_names)
896878
if(TARGET ${target})
897879
install(

cpp/bench/ann/src/common/benchmark.hpp

Lines changed: 43 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
2+
* SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55
#pragma once
@@ -351,15 +351,9 @@ void bench_search(::benchmark::State& state,
351351

352352
// Each thread calculates recall on their partition of queries.
353353
// evaluate recall
354-
if (dataset->max_k() >= k) {
355-
const std::int32_t* gt = dataset->gt_set();
356-
const std::uint32_t* filter_bitset = dataset->filter_bitset(MemoryType::kHostMmap);
357-
auto filter = [filter_bitset](std::int32_t i) -> bool {
358-
if (filter_bitset == nullptr) { return true; }
359-
auto word = filter_bitset[i >> 5];
360-
return word & (1 << (i & 31));
361-
};
362-
const std::uint32_t max_k = dataset->max_k();
354+
if (dataset->max_k() >= k && dataset->gt_maps().has_value()) {
355+
// gt_maps[i] is a hash map of {id, neighbor_rank} for query i
356+
const auto& gt_maps = dataset->gt_maps();
363357
result_buf.transfer_data(MemoryType::kHost, current_algo_props->query_memory_type);
364358
auto* neighbors_host = reinterpret_cast<index_type*>(result_buf.data(MemoryType::kHost));
365359
std::size_t rows = std::min(queries_processed, query_set_size);
@@ -369,39 +363,49 @@ void bench_search(::benchmark::State& state,
369363
// We go through the groundtruth with same stride as the benchmark loop.
370364
size_t out_offset = 0;
371365
size_t batch_offset = (state.thread_index() * n_queries) % query_set_size;
366+
// Avoid CPU oversubscription when parallelizing recall calculation loop
367+
int num_recall_calculation_worker_threads =
368+
std::thread::hardware_concurrency() / benchmark_n_threads - 1; // -1 for the main thread
369+
// ensure non-negative number of workers (possible if hardware_concurrency()
370+
// does not return an expected value) by clamping to 0
371+
if (num_recall_calculation_worker_threads < 0) { num_recall_calculation_worker_threads = 0; }
372372
while (out_offset < rows) {
373-
for (std::size_t i = 0; i < n_queries; i++) {
374-
size_t i_orig_idx = batch_offset + i;
375-
size_t i_out_idx = out_offset + i;
376-
if (i_out_idx < rows) {
377-
/* NOTE: recall correctness & filtering
378-
379-
In the loop below, we filter the ground truth values on-the-fly.
380-
We need enough ground truth values to compute recall correctly though.
381-
But the ground truth file only contains `max_k` values per row; if there are less valid
382-
values than k among them, we overestimate the recall. Essentially, we compare the first
383-
`filter_pass_count` values of the algorithm output, and this counter can be less than `k`.
384-
In the extreme case of very high filtering rate, we may be bypassing entire rows of
385-
results. However, this is still better than no recall estimate at all.
386-
387-
TODO: consider generating the filtered ground truth on-the-fly
388-
*/
389-
uint32_t filter_pass_count = 0;
390-
for (std::uint32_t l = 0; l < max_k && filter_pass_count < k; l++) {
391-
auto exp_idx = gt[i_orig_idx * max_k + l];
392-
if (!filter(exp_idx)) { continue; }
393-
filter_pass_count++;
394-
for (std::uint32_t j = 0; j < k; j++) {
395-
auto act_idx = static_cast<std::int32_t>(neighbors_host[i_out_idx * k + j]);
396-
if (act_idx == exp_idx) {
397-
match_count++;
398-
break;
399-
}
400-
}
373+
std::vector<std::thread> recall_calculation_workers;
374+
recall_calculation_workers.reserve(num_recall_calculation_worker_threads);
375+
std::vector<std::size_t> local_match_count(num_recall_calculation_worker_threads + 1);
376+
std::vector<std::size_t> local_total_count(num_recall_calculation_worker_threads + 1);
377+
int chunk_size =
378+
n_queries / (num_recall_calculation_worker_threads + 1); // +1 for the main thread
379+
int remainder = n_queries % (num_recall_calculation_worker_threads + 1);
380+
auto recall_calculation = [&](int start, int end, int tid) -> void {
381+
for (int i = start; i < end; ++i) {
382+
size_t i_orig_idx = batch_offset + i;
383+
size_t i_out_idx = out_offset + i;
384+
if (i_out_idx < rows) {
385+
auto* candidates = neighbors_host + i_out_idx * k;
386+
auto [matching, total] = gt_maps->count_matches(i_orig_idx, candidates, k);
387+
local_match_count[tid] += matching;
388+
local_total_count[tid] += total;
401389
}
402-
total_count += filter_pass_count;
403390
}
391+
};
392+
// launch worker threads
393+
int start = 0;
394+
for (int tid = 0; tid < num_recall_calculation_worker_threads; tid++) {
395+
int end = start + chunk_size;
396+
if (tid < remainder) { ++end; }
397+
recall_calculation_workers.emplace_back(recall_calculation, start, end, tid);
398+
start = end;
404399
}
400+
// main thread works on last chunk
401+
recall_calculation(start, n_queries, num_recall_calculation_worker_threads);
402+
// join all worker threads
403+
for (auto& worker : recall_calculation_workers) {
404+
worker.join();
405+
}
406+
match_count += std::accumulate(local_match_count.begin(), local_match_count.end(), 0);
407+
total_count += std::accumulate(local_total_count.begin(), local_total_count.end(), 0);
408+
405409
out_offset += n_queries;
406410
batch_offset = (batch_offset + queries_stride) % query_set_size;
407411
}

0 commit comments

Comments
 (0)