Skip to content

Commit c95625b

Browse files
JIT+LTO IVF-PQ compute similarity (#1957)
Binary size savings: CUDA 12 x86 Conda: 2307418890 - 2186096243 = 121MB (5%); CUDA 13 x86 Conda: 1820176852 - 1765064926 = 55MB (3%); CUDA 12 x86 wheel: 479856846 - 416689911 = 63MB (13%); CUDA 13 x86 wheel: 277521287 - 259940426 = 17.5MB (6.3%). Authors: Kyle Edwards (https://github.com/KyleFromNVIDIA). Approvers: James Lamb (https://github.com/jameslamb), Divye Gala (https://github.com/divyegala). URL: #1957
1 parent c780765 commit c95625b

48 files changed

Lines changed: 2270 additions & 682 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

ci/validate_wheel.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,11 @@ PYDISTCHECK_ARGS=(
2121
if [[ "${package_dir}" == "python/libcuvs" ]]; then
2222
if [[ "${RAPIDS_CUDA_MAJOR}" == "12" ]]; then
2323
PYDISTCHECK_ARGS+=(
24-
--max-allowed-size-compressed '800Mi'
24+
--max-allowed-size-compressed '400Mi'
2525
)
2626
else
2727
PYDISTCHECK_ARGS+=(
28-
--max-allowed-size-compressed '450Mi'
28+
--max-allowed-size-compressed '250Mi'
2929
)
3030
fi
3131
elif [[ "${package_dir}" != "python/cuvs" ]]; then

cpp/CMakeLists.txt

Lines changed: 175 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -399,13 +399,14 @@ if(NOT BUILD_CPU_ONLY)
399399
target_compile_features(jit_lto_kernel_usage_requirements INTERFACE cuda_std_20)
400400
target_link_libraries(jit_lto_kernel_usage_requirements INTERFACE rmm::rmm raft::raft CCCL::CCCL)
401401

402-
block(PROPAGATE interleaved_scan_files metric_files filter_files post_lambda_files)
402+
block(PROPAGATE jit_lto_files)
403+
set(jit_lto_files)
403404
set(CMAKE_CUDA_ARCHITECTURES ${JIT_LTO_TARGET_ARCHITECTURE})
404405
set(ivf_flat_ns "cuvs::neighbors::ivf_flat::detail")
405406
generate_jit_lto_kernels(
406-
interleaved_scan_files
407+
jit_lto_files
407408
NAME_FORMAT
408-
"interleaved_scan_capacity_@capacity@_veclen_@veclen@_@ascending_descending@_@compute_norm_name@_data_@type_abbrev@_acc_@acc_abbrev@_idx_@idx_abbrev@"
409+
"ivf_flat_interleaved_scan_capacity_@capacity@_veclen_@veclen@_@ascending_descending@_@compute_norm_name@_data_@type_abbrev@_acc_@acc_abbrev@_idx_@idx_abbrev@"
409410
MATRIX_JSON_FILE
410411
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_matrix.json"
411412
KERNEL_INPUT_FILE
@@ -414,12 +415,12 @@ if(NOT BUILD_CPU_ONLY)
414415
"${ivf_flat_ns}::fragment_tag_interleaved_scan<${ivf_flat_ns}::tag_@type_abbrev@, ${ivf_flat_ns}::tag_acc_@acc_abbrev@, ${ivf_flat_ns}::tag_idx_@idx_abbrev@, @capacity@, @veclen@, @ascending_value@, @compute_norm_value@>"
415416
FRAGMENT_TAG_HEADER_FILES "<cuvs/detail/jit_lto/ivf_flat/interleaved_scan_fragments.hpp>"
416417
"<cuvs/detail/jit_lto/ivf_flat/interleaved_scan_tags.hpp>"
417-
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/interleaved_scan"
418+
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_flat/interleaved_scan"
418419
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
419420
)
420421
generate_jit_lto_kernels(
421-
metric_files
422-
NAME_FORMAT "metric_@metric_name@_veclen_@veclen@_data_@type_abbrev@_acc_@acc_abbrev@"
422+
jit_lto_files
423+
NAME_FORMAT "ivf_flat_metric_@metric_name@_veclen_@veclen@_data_@type_abbrev@_acc_@acc_abbrev@"
423424
MATRIX_JSON_FILE
424425
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/metric_matrix.json"
425426
KERNEL_INPUT_FILE
@@ -428,12 +429,12 @@ if(NOT BUILD_CPU_ONLY)
428429
"${ivf_flat_ns}::fragment_tag_metric<@veclen@, ${ivf_flat_ns}::tag_@type_abbrev@, ${ivf_flat_ns}::tag_acc_@acc_abbrev@, ${ivf_flat_ns}::tag_metric_@metric_name@>"
429430
FRAGMENT_TAG_HEADER_FILES "<cuvs/detail/jit_lto/ivf_flat/interleaved_scan_fragments.hpp>"
430431
"<cuvs/detail/jit_lto/ivf_flat/interleaved_scan_tags.hpp>"
431-
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/metric"
432+
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_flat/metric"
432433
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
433434
)
434435
generate_jit_lto_kernels(
435-
filter_files
436-
NAME_FORMAT "@filter_name@"
436+
jit_lto_files
437+
NAME_FORMAT "ivf_flat_@filter_name@"
437438
MATRIX_JSON_FILE
438439
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/filter_matrix.json"
439440
KERNEL_INPUT_FILE
@@ -442,12 +443,12 @@ if(NOT BUILD_CPU_ONLY)
442443
"${ivf_flat_ns}::fragment_tag_filter<${ivf_flat_ns}::tag_filter<${ivf_flat_ns}::tag_idx_l, ${ivf_flat_ns}::tag_@filter_name@_impl>>"
443444
FRAGMENT_TAG_HEADER_FILES "<cuvs/detail/jit_lto/ivf_flat/interleaved_scan_fragments.hpp>"
444445
"<cuvs/detail/jit_lto/ivf_flat/interleaved_scan_tags.hpp>"
445-
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/filter"
446+
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_flat/filter"
446447
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
447448
)
448449
generate_jit_lto_kernels(
449-
post_lambda_files
450-
NAME_FORMAT "@post_lambda_name@"
450+
jit_lto_files
451+
NAME_FORMAT "ivf_flat_@post_lambda_name@"
451452
MATRIX_JSON_FILE
452453
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_matrix.json"
453454
KERNEL_INPUT_FILE
@@ -456,21 +457,155 @@ if(NOT BUILD_CPU_ONLY)
456457
"${ivf_flat_ns}::fragment_tag_post_lambda<${ivf_flat_ns}::tag_@post_lambda_name@>"
457458
FRAGMENT_TAG_HEADER_FILES "<cuvs/detail/jit_lto/ivf_flat/interleaved_scan_fragments.hpp>"
458459
"<cuvs/detail/jit_lto/ivf_flat/interleaved_scan_tags.hpp>"
459-
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/post_lambda"
460+
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_flat/post_lambda"
460461
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
461462
)
462-
endblock()
463-
464-
set(jit_lto_files
465-
${interleaved_scan_files}
466-
${metric_files}
467-
${filter_files}
468-
${post_lambda_files}
469-
src/detail/jit_lto/AlgorithmLauncher.cpp
470-
src/detail/jit_lto/AlgorithmPlanner.cpp
471-
src/detail/jit_lto/FragmentEntry.cpp
472-
src/detail/jit_lto/nvjitlink_checker.cpp
463+
set(ivf_pq_ns "cuvs::neighbors::ivf_pq::detail")
464+
generate_jit_lto_kernels(
465+
jit_lto_files
466+
NAME_FORMAT "ivf_pq_compute_similarity_out_@out_abbrev@_lut_@lut_abbrev@"
467+
MATRIX_JSON_FILE
468+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_pq/detail/jit_lto_kernels/compute_similarity_matrix.json"
469+
KERNEL_INPUT_FILE
470+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_pq/detail/jit_lto_kernels/compute_similarity_kernel.cu.in"
471+
FRAGMENT_TAG_FORMAT
472+
"${ivf_pq_ns}::fragment_tag_compute_similarity<${ivf_pq_ns}::tag_out_@out_abbrev@, ${ivf_pq_ns}::tag_lut_@lut_abbrev@>"
473+
FRAGMENT_TAG_HEADER_FILES "<cuvs/detail/jit_lto/ivf_pq/compute_similarity_fragments.hpp>"
474+
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_pq/compute_similarity"
475+
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
476+
)
477+
generate_jit_lto_kernels(
478+
jit_lto_files
479+
NAME_FORMAT "ivf_pq_prepare_lut_lut_@lut_abbrev@_@enable_smem_lut_str@_@pq_bits@pq"
480+
MATRIX_JSON_FILE
481+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_pq/detail/jit_lto_kernels/prepare_lut_matrix.json"
482+
KERNEL_INPUT_FILE
483+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_pq/detail/jit_lto_kernels/prepare_lut_kernel.cu.in"
484+
FRAGMENT_TAG_FORMAT
485+
"${ivf_pq_ns}::fragment_tag_prepare_lut<${ivf_pq_ns}::tag_lut_@lut_abbrev@, @enable_smem_lut@, @pq_bits@>"
486+
FRAGMENT_TAG_HEADER_FILES "<cuvs/detail/jit_lto/ivf_pq/compute_similarity_fragments.hpp>"
487+
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_pq/prepare_lut"
488+
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
489+
)
490+
generate_jit_lto_kernels(
491+
jit_lto_files
492+
NAME_FORMAT "ivf_pq_store_calculated_distances_out_@out_abbrev@_@k_manage_local_topk_str@"
493+
MATRIX_JSON_FILE
494+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_pq/detail/jit_lto_kernels/store_calculated_distances_matrix.json"
495+
KERNEL_INPUT_FILE
496+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_pq/detail/jit_lto_kernels/store_calculated_distances_kernel.cu.in"
497+
FRAGMENT_TAG_FORMAT
498+
"${ivf_pq_ns}::fragment_tag_store_calculated_distances<${ivf_pq_ns}::tag_out_@out_abbrev@, @k_manage_local_topk@>"
499+
FRAGMENT_TAG_HEADER_FILES "<cuvs/detail/jit_lto/ivf_pq/compute_similarity_fragments.hpp>"
500+
OUTPUT_DIRECTORY
501+
"${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_pq/store_calculated_distances"
502+
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
503+
)
504+
generate_jit_lto_kernels(
505+
jit_lto_files
506+
NAME_FORMAT "ivf_pq_precompute_base_diff_metric_@metric@"
507+
MATRIX_JSON_FILE
508+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_pq/detail/jit_lto_kernels/precompute_base_diff_matrix.json"
509+
KERNEL_INPUT_FILE
510+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_pq/detail/jit_lto_kernels/precompute_base_diff_kernel.cu.in"
511+
FRAGMENT_TAG_FORMAT
512+
"${ivf_pq_ns}::fragment_tag_precompute_base_diff<${ivf_pq_ns}::tag_metric_@metric@>"
513+
FRAGMENT_TAG_HEADER_FILES "<cuvs/detail/jit_lto/ivf_pq/compute_similarity_fragments.hpp>"
514+
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_pq/precompute_base_diff"
515+
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
516+
)
517+
generate_jit_lto_kernels(
518+
jit_lto_files
519+
NAME_FORMAT
520+
"ivf_pq_create_lut_lut_@lut_abbrev@_@precomp_base_diff_str@_@pq_bits@pq_metric_@metric@"
521+
MATRIX_JSON_FILE
522+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_pq/detail/jit_lto_kernels/create_lut_matrix.json"
523+
KERNEL_INPUT_FILE
524+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_pq/detail/jit_lto_kernels/create_lut_kernel.cu.in"
525+
FRAGMENT_TAG_FORMAT
526+
"${ivf_pq_ns}::fragment_tag_create_lut<${ivf_pq_ns}::tag_lut_@lut_abbrev@, ${ivf_pq_ns}::tag_metric_@metric@, @precomp_base_diff@, @pq_bits@>"
527+
FRAGMENT_TAG_HEADER_FILES "<cuvs/detail/jit_lto/ivf_pq/compute_similarity_fragments.hpp>"
528+
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_pq/create_lut"
529+
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
530+
)
531+
generate_jit_lto_kernels(
532+
jit_lto_files
533+
NAME_FORMAT "ivf_pq_compute_distances_out_@out_abbrev@_lut_@lut_abbrev@_capacity_@capacity@"
534+
MATRIX_JSON_FILE
535+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_pq/detail/jit_lto_kernels/compute_distances_matrix.json"
536+
KERNEL_INPUT_FILE
537+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_pq/detail/jit_lto_kernels/compute_distances_kernel.cu.in"
538+
FRAGMENT_TAG_FORMAT
539+
"${ivf_pq_ns}::fragment_tag_compute_distances<${ivf_pq_ns}::tag_out_@out_abbrev@, ${ivf_pq_ns}::tag_lut_@lut_abbrev@, @capacity@>"
540+
FRAGMENT_TAG_HEADER_FILES "<cuvs/detail/jit_lto/ivf_pq/compute_similarity_fragments.hpp>"
541+
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_pq/compute_distances"
542+
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
543+
)
544+
generate_jit_lto_kernels(
545+
jit_lto_files
546+
NAME_FORMAT "ivf_pq_get_early_stop_limit_out_@out_abbrev@_metric_@metric@"
547+
MATRIX_JSON_FILE
548+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_pq/detail/jit_lto_kernels/get_early_stop_limit_matrix.json"
549+
KERNEL_INPUT_FILE
550+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_pq/detail/jit_lto_kernels/get_early_stop_limit_kernel.cu.in"
551+
FRAGMENT_TAG_FORMAT
552+
"${ivf_pq_ns}::fragment_tag_get_early_stop_limit<${ivf_pq_ns}::tag_out_@out_abbrev@, ${ivf_pq_ns}::tag_metric_@metric@>"
553+
FRAGMENT_TAG_HEADER_FILES "<cuvs/detail/jit_lto/ivf_pq/compute_similarity_fragments.hpp>"
554+
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_pq/get_early_stop_limit"
555+
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
473556
)
557+
generate_jit_lto_kernels(
558+
jit_lto_files
559+
NAME_FORMAT "ivf_pq_sample_filter_@filter_name@"
560+
MATRIX_JSON_FILE
561+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_pq/detail/jit_lto_kernels/sample_filter_matrix.json"
562+
KERNEL_INPUT_FILE
563+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_pq/detail/jit_lto_kernels/sample_filter_kernel.cu.in"
564+
FRAGMENT_TAG_FORMAT
565+
"${ivf_pq_ns}::fragment_tag_sample_filter<${ivf_pq_ns}::tag_filter_@filter_name@>"
566+
FRAGMENT_TAG_HEADER_FILES "<cuvs/detail/jit_lto/ivf_pq/compute_similarity_fragments.hpp>"
567+
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_pq/sample_filter"
568+
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
569+
)
570+
generate_jit_lto_kernels(
571+
jit_lto_files
572+
NAME_FORMAT "ivf_pq_get_line_width_@pq_bits@pq"
573+
MATRIX_JSON_FILE
574+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_pq/detail/jit_lto_kernels/get_line_width_matrix.json"
575+
KERNEL_INPUT_FILE
576+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_pq/detail/jit_lto_kernels/get_line_width_kernel.cu.in"
577+
FRAGMENT_TAG_FORMAT "${ivf_pq_ns}::fragment_tag_get_line_width<@pq_bits@>"
578+
FRAGMENT_TAG_HEADER_FILES "<cuvs/detail/jit_lto/ivf_pq/compute_similarity_fragments.hpp>"
579+
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_pq/get_line_width"
580+
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
581+
)
582+
generate_jit_lto_kernels(
583+
jit_lto_files
584+
NAME_FORMAT "ivf_pq_compute_score_out_@out_abbrev@_lut_@lut_abbrev@_@pq_bits@pq"
585+
MATRIX_JSON_FILE
586+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_pq/detail/jit_lto_kernels/compute_score_matrix.json"
587+
KERNEL_INPUT_FILE
588+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_pq/detail/jit_lto_kernels/compute_score_kernel.cu.in"
589+
FRAGMENT_TAG_FORMAT
590+
"${ivf_pq_ns}::fragment_tag_compute_score<${ivf_pq_ns}::tag_out_@out_abbrev@, ${ivf_pq_ns}::tag_lut_@lut_abbrev@, @pq_bits@>"
591+
FRAGMENT_TAG_HEADER_FILES "<cuvs/detail/jit_lto/ivf_pq/compute_similarity_fragments.hpp>"
592+
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_pq/compute_score"
593+
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
594+
)
595+
generate_jit_lto_kernels(
596+
jit_lto_files
597+
NAME_FORMAT "ivf_pq_increment_score_out_@out_abbrev@_@increment_str@"
598+
MATRIX_JSON_FILE
599+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_pq/detail/jit_lto_kernels/increment_score_matrix.json"
600+
KERNEL_INPUT_FILE
601+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_pq/detail/jit_lto_kernels/increment_score_kernel.cu.in"
602+
FRAGMENT_TAG_FORMAT
603+
"${ivf_pq_ns}::fragment_tag_increment_score<${ivf_pq_ns}::tag_out_@out_abbrev@, @increment@>"
604+
FRAGMENT_TAG_HEADER_FILES "<cuvs/detail/jit_lto/ivf_pq/compute_similarity_fragments.hpp>"
605+
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_pq/increment_score"
606+
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
607+
)
608+
endblock()
474609

475610
# Note that this matrix contains an `arch_includes` placeholder, since we don't currently have a
476611
# way to do an item-wise transform on a list after computing the matrix product and before
@@ -586,11 +721,20 @@ if(NOT BUILD_CPU_ONLY)
586721
generate_inst_matrix(
587722
ivf_pq_compute_similarity_inst_files
588723
MATRIX_JSON_FILE
589-
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_matrix.json"
724+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_select_matrix.json"
725+
INPUT_FILE
726+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_select_inst.cu.in"
727+
OUTPUT_FILE_FORMAT
728+
"${CMAKE_CURRENT_BINARY_DIR}/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_select_inst_out_@out_abbrev@_lut_@lut_abbrev@_filter_@filter_name@_metric_@metric@_@increment_score_str@.cu"
729+
)
730+
generate_inst_matrix(
731+
ivf_pq_compute_similarity_inst_files
732+
MATRIX_JSON_FILE
733+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_run_matrix.json"
590734
INPUT_FILE
591-
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_inst.cu.in"
735+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_run_inst.cu.in"
592736
OUTPUT_FILE_FORMAT
593-
"${CMAKE_CURRENT_BINARY_DIR}/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_inst_out_@out_abbrev@_lut_@lut_abbrev@.cu"
737+
"${CMAKE_CURRENT_BINARY_DIR}/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_run_inst_out_@out_abbrev@_lut_@lut_abbrev@.cu"
594738
)
595739
generate_inst_matrix(
596740
ivf_pq_search_inst_files
@@ -671,6 +815,10 @@ if(NOT BUILD_CPU_ONLY)
671815
src/core/omp_wrapper.cpp
672816
src/util/file_io.cpp
673817
src/util/host_memory.cpp
818+
src/detail/jit_lto/AlgorithmLauncher.cpp
819+
src/detail/jit_lto/AlgorithmPlanner.cpp
820+
src/detail/jit_lto/FragmentEntry.cpp
821+
src/detail/jit_lto/nvjitlink_checker.cpp
674822
src/distance/detail/kernels/gram_matrix.cu
675823
src/distance/detail/kernels/kernel_factory.cu
676824
src/distance/detail/kernels/kernel_matrices.cu
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
#pragma once
7+
8+
#include <cstdint>
9+
10+
namespace cuvs::neighbors::ivf_pq::detail {
11+
12+
struct tag_out_f {};
13+
struct tag_out_h {};
14+
15+
struct tag_lut_f {};
16+
struct tag_lut_h {};
17+
struct tag_lut_fp8_signed {};
18+
struct tag_lut_fp8_unsigned {};
19+
20+
struct tag_filter_none {};
21+
struct tag_filter_bitset {};
22+
23+
struct tag_metric_none {};
24+
struct tag_metric_euclidean {};
25+
struct tag_metric_inner_product {};
26+
27+
template <typename OutTag, typename LutTag>
28+
struct fragment_tag_compute_similarity {};
29+
30+
template <typename LutTag, bool EnableSMemLut, uint32_t PqBits>
31+
struct fragment_tag_prepare_lut {};
32+
33+
template <typename OutTag, bool kManageLocalTopK>
34+
struct fragment_tag_store_calculated_distances {};
35+
36+
template <typename MetricTag>
37+
struct fragment_tag_precompute_base_diff {};
38+
39+
template <typename LutTag, typename MetricTag, bool PrecompBaseDiff, uint32_t PqBits>
40+
struct fragment_tag_create_lut {};
41+
42+
template <typename OutTag, typename LutTag, int Capacity>
43+
struct fragment_tag_compute_distances {};
44+
45+
template <typename OutTag, typename MetricTag>
46+
struct fragment_tag_get_early_stop_limit {};
47+
48+
template <typename FilterTag>
49+
struct fragment_tag_sample_filter {};
50+
51+
template <uint32_t PqBits>
52+
struct fragment_tag_get_line_width {};
53+
54+
template <typename OutTag, typename LutTag, uint32_t PqBits>
55+
struct fragment_tag_compute_score {};
56+
57+
template <typename OutTag, bool Increment>
58+
struct fragment_tag_increment_score {};
59+
60+
} // namespace cuvs::neighbors::ivf_pq::detail

cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_inst.cu.in renamed to cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_run_inst.cu.in

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55

66
#include <neighbors/ivf_pq/ivf_pq_compute_similarity_impl.cuh>
77
#include <neighbors/ivf_pq/ivf_pq_fp_8bit.cuh>
8-
#include <neighbors/sample_filter.cuh>
98

109
namespace {
1110

@@ -16,17 +15,6 @@ using lut_t = @lut_type@;
1615

1716
namespace cuvs::neighbors::ivf_pq::detail {
1817

19-
template auto compute_similarity_select<out_t, lut_t>(const cudaDeviceProp& dev_props,
20-
bool manage_local_topk,
21-
int locality_hint,
22-
double preferred_shmem_carveout,
23-
uint32_t pq_bits,
24-
uint32_t pq_dim,
25-
uint32_t precomp_data_count,
26-
uint32_t n_queries,
27-
uint32_t n_probes,
28-
uint32_t topk) -> selected<out_t, lut_t>;
29-
3018
template void cuvs::neighbors::ivf_pq::detail::compute_similarity_run<out_t, lut_t>(
3119
cuvs::neighbors::ivf_pq::detail::selected<out_t, lut_t> s,
3220
rmm::cuda_stream_view stream,
@@ -35,7 +23,6 @@ template void cuvs::neighbors::ivf_pq::detail::compute_similarity_run<out_t, lut
3523
uint32_t pq_dim,
3624
uint32_t n_queries,
3725
uint32_t queries_offset,
38-
cuvs::distance::DistanceType metric,
3926
cuvs::neighbors::ivf_pq::codebook_gen codebook_kind,
4027
uint32_t topk,
4128
uint32_t max_samples,
@@ -47,7 +34,10 @@ template void cuvs::neighbors::ivf_pq::detail::compute_similarity_run<out_t, lut
4734
const float* queries,
4835
const uint32_t* index_list,
4936
float* query_kths,
50-
const cuvs::neighbors::filtering::base_filter& sample_filter,
37+
const int64_t* const* inds_ptrs,
38+
uint32_t* bitset_ptr,
39+
int64_t bitset_len,
40+
int64_t original_nbits,
5141
lut_t* lut_scores,
5242
out_t* _out_scores,
5343
uint32_t* _out_indices);

0 commit comments

Comments
 (0)