diff --git a/ci/pod/e2e-arm-cpu.yaml b/ci/pod/e2e-arm-cpu.yaml index 346496f32..c9d30f000 100644 --- a/ci/pod/e2e-arm-cpu.yaml +++ b/ci/pod/e2e-arm-cpu.yaml @@ -19,11 +19,11 @@ spec: args: ["cat"] resources: requests: - memory: "12Gi" - cpu: "3" + memory: "16Gi" + cpu: "4" limits: - memory: "12Gi" - cpu: "3" + memory: "16Gi" + cpu: "4" volumeMounts: - mountPath: /home/data name: db-data diff --git a/cmake/libs/libfaiss.cmake b/cmake/libs/libfaiss.cmake index 311e5cf19..47fd43db2 100644 --- a/cmake/libs/libfaiss.cmake +++ b/cmake/libs/libfaiss.cmake @@ -37,18 +37,31 @@ knowhere_file_glob( FAISS_AVX512_SRCS thirdparty/faiss/faiss/cppcontrib/knowhere/impl/*avx512.cpp ) -# AVX512 vanilla Faiss dynamic dispatch related files +# AVX512 vanilla Faiss dynamic dispatch related files. Baseline +# sq-avx512.cpp is replaced by a knowhere-local prelude file that declares +# a fast DCTemplate specialization for QT_4bit_uniform + L2 and then +# textually #includes the baseline sq-avx512.cpp — see +# cppcontrib/knowhere/impl/sq-avx512-fastpath.cpp for the full design note. knowhere_file_glob( GLOB FAISS_DD_AVX512_SRCS thirdparty/faiss/faiss/impl/fast_scan/impl-avx512.cpp thirdparty/faiss/faiss/impl/hnsw/avx512.cpp thirdparty/faiss/faiss/impl/pq_code_distance/pq_code_distance-avx512.cpp - thirdparty/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp + thirdparty/faiss/faiss/cppcontrib/knowhere/impl/sq-avx512-fastpath.cpp thirdparty/faiss/faiss/utils/distances_fused/avx512.cpp thirdparty/faiss/faiss/utils/simd_impl/distances_avx512.cpp thirdparty/faiss/faiss/utils/simd_impl/rabitq_avx512.cpp ) +# Baseline sq-avx512.cpp is pulled in textually by the prelude file, not +# compiled directly. Remove it from the generic list so it is not picked +# up as a stand-alone TU (which would duplicate symbols). 
+knowhere_file_glob( + GLOB + FAISS_SQ_AVX512_EXCLUDE + thirdparty/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +) +list(REMOVE_ITEM FAISS_SRCS ${FAISS_SQ_AVX512_EXCLUDE}) # combine files list(APPEND FAISS_AVX512_SRCS ${FAISS_DD_AVX512_SRCS}) # remove platform files from general files @@ -61,7 +74,8 @@ knowhere_file_glob( FAISS_AVX2_SRCS thirdparty/faiss/faiss/cppcontrib/knowhere/impl/*avx.cpp ) -# AVX2 vanilla Faiss dynamic dispatch related files +# AVX2 vanilla Faiss dynamic dispatch related files. sq-avx2.cpp is +# textually wrapped by sq-avx2-fastpath.cpp (see design note there). knowhere_file_glob( GLOB FAISS_DD_AVX2_SRCS @@ -69,11 +83,18 @@ knowhere_file_glob( thirdparty/faiss/faiss/impl/fast_scan/impl-avx2.cpp thirdparty/faiss/faiss/impl/hnsw/avx2.cpp thirdparty/faiss/faiss/impl/pq_code_distance/pq_code_distance-avx2.cpp - thirdparty/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp + thirdparty/faiss/faiss/cppcontrib/knowhere/impl/sq-avx2-fastpath.cpp thirdparty/faiss/faiss/utils/distances_fused/simdlib_based.cpp thirdparty/faiss/faiss/utils/simd_impl/distances_avx2.cpp + thirdparty/faiss/faiss/utils/simd_impl/partitioning_avx2.cpp thirdparty/faiss/faiss/utils/simd_impl/rabitq_avx2.cpp ) +knowhere_file_glob( + GLOB + FAISS_SQ_AVX2_EXCLUDE + thirdparty/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +) +list(REMOVE_ITEM FAISS_SRCS ${FAISS_SQ_AVX2_EXCLUDE}) # combine files list(APPEND FAISS_AVX2_SRCS ${FAISS_DD_AVX2_SRCS}) # remove platform files from general files @@ -104,17 +125,25 @@ knowhere_file_glob( FAISS_NEON_SRCS thirdparty/faiss/faiss/cppcontrib/knowhere/impl/*neon.cpp ) -# NEON vanilla Faiss dynamic dispatch related files +# NEON vanilla Faiss dynamic dispatch related files. sq-neon.cpp is +# textually wrapped by sq-neon-fastpath.cpp (see design note there). 
knowhere_file_glob( GLOB FAISS_DD_NEON_SRCS thirdparty/faiss/faiss/impl/approx_topk/neon.cpp thirdparty/faiss/faiss/impl/fast_scan/impl-neon.cpp - thirdparty/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp + thirdparty/faiss/faiss/cppcontrib/knowhere/impl/sq-neon-fastpath.cpp thirdparty/faiss/faiss/utils/distances_fused/simdlib_based_neon.cpp thirdparty/faiss/faiss/utils/simd_impl/distances_aarch64.cpp + thirdparty/faiss/faiss/utils/simd_impl/partitioning_neon.cpp thirdparty/faiss/faiss/utils/simd_impl/rabitq_neon.cpp ) +knowhere_file_glob( + GLOB + FAISS_SQ_NEON_EXCLUDE + thirdparty/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +) +list(REMOVE_ITEM FAISS_SRCS ${FAISS_SQ_NEON_EXCLUDE}) # combine files list(APPEND FAISS_NEON_SRCS ${FAISS_DD_NEON_SRCS}) # remove platform files from general files diff --git a/src/index/data_view_dense_index/refine_computer.h b/src/index/data_view_dense_index/refine_computer.h index 08f409cf4..0390b856d 100644 --- a/src/index/data_view_dense_index/refine_computer.h +++ b/src/index/data_view_dense_index/refine_computer.h @@ -12,9 +12,9 @@ // knowhere-specific indices #pragma once -#include "faiss/cppcontrib/knowhere/impl/ScalarQuantizer.h" #include "faiss/cppcontrib/knowhere/invlists/InvertedLists.h" #include "faiss/impl/DistanceComputer.h" +#include "faiss/impl/ScalarQuantizer.h" #include "knowhere/comp/index_param.h" #include "knowhere/object.h" #include "knowhere/operands.h" @@ -63,16 +63,13 @@ struct QuantRefine { } switch (refine_type) { case RefineType::UINT8_QUANT: - quantizer = new faiss::cppcontrib::knowhere::ScalarQuantizer( - d, faiss::cppcontrib::knowhere::ScalarQuantizer::QuantizerType::QT_8bit); + quantizer = new faiss::ScalarQuantizer(d, faiss::ScalarQuantizer::QuantizerType::QT_8bit); break; case RefineType::BFLOAT16_QUANT: - quantizer = new faiss::cppcontrib::knowhere::ScalarQuantizer( - d, faiss::cppcontrib::knowhere::ScalarQuantizer::QuantizerType::QT_bf16); + quantizer = new faiss::ScalarQuantizer(d, 
faiss::ScalarQuantizer::QuantizerType::QT_bf16); break; case RefineType::FLOAT16_QUANT: - quantizer = new faiss::cppcontrib::knowhere::ScalarQuantizer( - d, faiss::cppcontrib::knowhere::ScalarQuantizer::QuantizerType::QT_fp16); + quantizer = new faiss::ScalarQuantizer(d, faiss::ScalarQuantizer::QuantizerType::QT_fp16); break; default: throw std::runtime_error("Fail to generate quant for refiner if refine_type == RefineType::DATA_VIEW"); @@ -118,9 +115,9 @@ struct QuantRefine { GetMetric() { return metric_type; } - std::unique_ptr + std::unique_ptr GetQuantComputer() { - return std::unique_ptr( + return std::unique_ptr( quantizer->get_distance_computer(metric_type)); } DataFormatEnum @@ -141,7 +138,7 @@ struct QuantRefine { static constexpr size_t key = 0; static constexpr size_t list_num = 1; static constexpr size_t segment_size = 48; - faiss::cppcontrib::knowhere::ScalarQuantizer* quantizer = nullptr; + faiss::ScalarQuantizer* quantizer = nullptr; faiss::cppcontrib::knowhere::InvertedLists* storage = nullptr; faiss::MetricType metric_type; DataFormatEnum origin_data_type; @@ -153,7 +150,7 @@ template struct QuantDataDistanceComputer : faiss::DistanceComputer { std::vector query_buf; std::shared_ptr quant_data; - std::unique_ptr qc; + std::unique_ptr qc; float q_norm; size_t dim; diff --git a/src/index/hnsw/faiss_hnsw.cc b/src/index/hnsw/faiss_hnsw.cc index 1cf64c1f3..0f9c52d02 100644 --- a/src/index/hnsw/faiss_hnsw.cc +++ b/src/index/hnsw/faiss_hnsw.cc @@ -9,8 +9,10 @@ // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express // or implied. See the License for the specific language governing permissions and limitations under the License. 
+#include #include #include +#include #include #include #include @@ -32,7 +34,6 @@ #include "common/metric.h" #include "faiss/cppcontrib/knowhere/IndexHNSW.h" #include "faiss/cppcontrib/knowhere/IndexRefine.h" -#include "faiss/cppcontrib/knowhere/impl/ScalarQuantizer.h" #include "faiss/cppcontrib/knowhere/index_io.h" #include "faiss/impl/mapped_io.h" #include "index/clustering_config.h" @@ -546,10 +547,10 @@ convert_rows_to_fp32(const void* const __restrict src_in, float* const __restric // where each query_row has ((dim + 7) / 8) * 8 bits, and the total is nrows * ((dim + 7) / 8) * 8 bits. // But the final format required is nrows * dim * 32 bits (float). // There are actually two conversions happening here: - // 1. Each uint8_t value must be converted to float (in `BinarySQDistanceComputerWrapper::set_query` - // and `ScalarQuantizer::compute_codes`), it will be converted back to uint8_t). [same as int8] + // 1. Each uint8_t value must be converted to float (in `BinaryFlatCodesDC::set_query` inside + // IndexBinaryScalarQuantizer, it will be converted back to uint8_t). [same as int8] // 2. Each row must occupy dim * 32 bits of space, even if not all bits are filled; - // this is required by the convention set in `ScalarQuantizer::compute_codes`. + // this is required by the convention set by IndexBinaryScalarQuantizer::sa_encode. const knowhere::bin1* const src = reinterpret_cast(src_in); auto uint8_dim = (dim + 7) / 8; for (size_t i = 0; i < nrows; i++) { @@ -711,20 +712,26 @@ get_index_data_format(const faiss::Index* index) { return DataFormatEnum::fp32; } - // is it sq? 
- // note: IndexScalarQuantizerCosine preserves the original data, no cosine norm is appliesd - auto index_sq = dynamic_cast(index); - if (index_sq != nullptr) { - if (index_sq->sq.qtype == faiss::cppcontrib::knowhere::ScalarQuantizer::QT_bf16) { - return DataFormatEnum::bf16; - } else if (index_sq->sq.qtype == faiss::cppcontrib::knowhere::ScalarQuantizer::QT_fp16) { - return DataFormatEnum::fp16; - } else if (index_sq->sq.qtype == faiss::cppcontrib::knowhere::ScalarQuantizer::QT_8bit_direct_signed) { - return DataFormatEnum::int8; - } else if (index_sq->sq.qtype == faiss::cppcontrib::knowhere::ScalarQuantizer::QT_1bit_direct) { - return DataFormatEnum::bin1; - } else { - return std::nullopt; + // is it binary (1-bit-direct)? Routed through + // IndexBinaryScalarQuantizer, which replaces the legacy + // IndexScalarQuantizer(QT_1bit_direct) path. + if (dynamic_cast(index) != nullptr) { + return DataFormatEnum::bin1; + } + + // is it sq? All SQ storage produced by knowhere now inherits from + // baseline faiss::IndexScalarQuantizer (Cosine/SQ4U wrappers, + // plain IndexHNSWSQ, and refine). 
+ if (auto* index_sq = dynamic_cast(index)) { + switch (index_sq->sq.qtype) { + case faiss::ScalarQuantizer::QT_bf16: + return DataFormatEnum::bf16; + case faiss::ScalarQuantizer::QT_fp16: + return DataFormatEnum::fp16; + case faiss::ScalarQuantizer::QT_8bit_direct_signed: + return DataFormatEnum::int8; + default: + return std::nullopt; } } @@ -2068,9 +2075,8 @@ class BaseFaissRegularIndexHNSWFlatNode : public BaseFaissRegularIndexHNSWNode { if (is_binary) { if (metric.value() == faiss::MetricType::METRIC_Hamming || metric.value() == faiss::MetricType::METRIC_Jaccard) { - hnsw_index = std::make_unique( - dim, faiss::cppcontrib::knowhere::ScalarQuantizer::QT_1bit_direct, hnsw_cfg.M.value(), - metric.value()); + hnsw_index = std::make_unique(dim, hnsw_cfg.M.value(), + metric.value()); } else { LOG_KNOWHERE_ERROR_ << "Unsupported metric for binary data: " << hnsw_cfg.metric_type.value(); return Status::invalid_metric_type; @@ -2082,14 +2088,13 @@ class BaseFaissRegularIndexHNSWFlatNode : public BaseFaissRegularIndexHNSWNode { std::make_unique(dim, hnsw_cfg.M.value()); } else if (data_format == DataFormatEnum::fp16) { hnsw_index = std::make_unique( - dim, faiss::cppcontrib::knowhere::ScalarQuantizer::QT_fp16, hnsw_cfg.M.value()); + dim, faiss::ScalarQuantizer::QT_fp16, hnsw_cfg.M.value()); } else if (data_format == DataFormatEnum::bf16) { hnsw_index = std::make_unique( - dim, faiss::cppcontrib::knowhere::ScalarQuantizer::QT_bf16, hnsw_cfg.M.value()); + dim, faiss::ScalarQuantizer::QT_bf16, hnsw_cfg.M.value()); } else if (data_format == DataFormatEnum::int8) { hnsw_index = std::make_unique( - dim, faiss::cppcontrib::knowhere::ScalarQuantizer::QT_8bit_direct_signed, - hnsw_cfg.M.value()); + dim, faiss::ScalarQuantizer::QT_8bit_direct_signed, hnsw_cfg.M.value()); } else { LOG_KNOWHERE_ERROR_ << "Unsupported metric type: " << hnsw_cfg.metric_type.value(); return Status::invalid_metric_type; @@ -2100,16 +2105,13 @@ class BaseFaissRegularIndexHNSWFlatNode : public 
BaseFaissRegularIndexHNSWNode { dim, hnsw_cfg.M.value(), metric.value()); } else if (data_format == DataFormatEnum::fp16) { hnsw_index = std::make_unique( - dim, faiss::cppcontrib::knowhere::ScalarQuantizer::QT_fp16, hnsw_cfg.M.value(), - metric.value()); + dim, faiss::ScalarQuantizer::QT_fp16, hnsw_cfg.M.value(), metric.value()); } else if (data_format == DataFormatEnum::bf16) { hnsw_index = std::make_unique( - dim, faiss::cppcontrib::knowhere::ScalarQuantizer::QT_bf16, hnsw_cfg.M.value(), - metric.value()); + dim, faiss::ScalarQuantizer::QT_bf16, hnsw_cfg.M.value(), metric.value()); } else if (data_format == DataFormatEnum::int8) { hnsw_index = std::make_unique( - dim, faiss::cppcontrib::knowhere::ScalarQuantizer::QT_8bit_direct_signed, - hnsw_cfg.M.value(), metric.value()); + dim, faiss::ScalarQuantizer::QT_8bit_direct_signed, hnsw_cfg.M.value(), metric.value()); } else { LOG_KNOWHERE_ERROR_ << "Unsupported metric type: " << hnsw_cfg.metric_type.value(); return Status::invalid_metric_type; @@ -2548,7 +2550,7 @@ class BaseFaissRegularIndexHNSWSQNode : public BaseFaissRegularIndexHNSWNode { // create an index const bool is_cosine = IsMetricType(hnsw_cfg.metric_type.value(), metric::COSINE); - const bool is_sq4u = sq_type.value() == faiss::cppcontrib::knowhere::ScalarQuantizer::QT_4bit_uniform; + const bool is_sq4u = sq_type.value() == faiss::ScalarQuantizer::QT_4bit_uniform; // should refine be used? std::unique_ptr final_index; @@ -2570,6 +2572,17 @@ class BaseFaissRegularIndexHNSWSQNode : public BaseFaissRegularIndexHNSWNode { } else { hnsw_index = std::make_unique( dim, sq_type.value(), hnsw_cfg.M.value(), metric.value()); + // QT_4bit_uniform + L2 benefits from quantile-based range + // estimation. This used to be hard-coded inside the fork + // IndexScalarQuantizer ctor; moved here so that ctor is + // behaviorally equivalent to baseline. 
+ if (is_sq4u) { + auto* idx_sq = dynamic_cast(hnsw_index->storage); + if (idx_sq != nullptr) { + idx_sq->sq.rangestat = faiss::ScalarQuantizer::RS_quantiles; + idx_sq->sq.rangestat_arg = 0.01; + } + } } hnsw_index->hnsw.efConstruction = hnsw_cfg.efConstruction.value(); diff --git a/src/index/ivf/ivf.cc b/src/index/ivf/ivf.cc index ab47304bb..97fe824fd 100644 --- a/src/index/ivf/ivf.cc +++ b/src/index/ivf/ivf.cc @@ -507,19 +507,19 @@ to_index_flat(std::unique_ptr&& index) { return std::make_unique(std::move(*index)); } -expected +expected get_ivf_sq_quantizer_type(int code_size) { switch (code_size) { case 4: - return faiss::cppcontrib::knowhere::ScalarQuantizer::QuantizerType::QT_4bit; + return faiss::ScalarQuantizer::QuantizerType::QT_4bit; case 6: - return faiss::cppcontrib::knowhere::ScalarQuantizer::QuantizerType::QT_6bit; + return faiss::ScalarQuantizer::QuantizerType::QT_6bit; case 8: - return faiss::cppcontrib::knowhere::ScalarQuantizer::QuantizerType::QT_8bit; + return faiss::ScalarQuantizer::QuantizerType::QT_8bit; case 16: - return faiss::cppcontrib::knowhere::ScalarQuantizer::QuantizerType::QT_fp16; + return faiss::ScalarQuantizer::QuantizerType::QT_fp16; default: - return expected::Err( + return expected::Err( Status::invalid_args, fmt::format("current code size {} not in (4, 6, 8, 16)", code_size)); } } diff --git a/src/index/ivf/ivf_wrapper.cc b/src/index/ivf/ivf_wrapper.cc index 72a3eaf9d..04ca212df 100644 --- a/src/index/ivf/ivf_wrapper.cc +++ b/src/index/ivf/ivf_wrapper.cc @@ -196,15 +196,15 @@ IndexIvfFactory::create_for_sq(faiss::cppcontrib::knowhere::IndexFlat* qzr_raw_p // create IndexIVFSQ // Index does not own qzr - faiss::cppcontrib::knowhere::ScalarQuantizer::QuantizerType quantizer_type; + faiss::ScalarQuantizer::QuantizerType quantizer_type; // ivf_sq_cfg.sq_type.value() has already been guaranteed to be legal in CheckAndAdjust std::string quantizer_type_tolower = str_to_lower(ivf_sq_cfg.sq_type.value()); if (quantizer_type_tolower == 
"sq4") { - quantizer_type = faiss::cppcontrib::knowhere::ScalarQuantizer::QuantizerType::QT_4bit; + quantizer_type = faiss::ScalarQuantizer::QuantizerType::QT_4bit; } else if (quantizer_type_tolower == "sq6") { - quantizer_type = faiss::cppcontrib::knowhere::ScalarQuantizer::QuantizerType::QT_6bit; + quantizer_type = faiss::ScalarQuantizer::QuantizerType::QT_6bit; } else { - quantizer_type = faiss::cppcontrib::knowhere::ScalarQuantizer::QuantizerType::QT_8bit; + quantizer_type = faiss::ScalarQuantizer::QuantizerType::QT_8bit; } auto index = std::make_unique(qzr_raw_ptr, d, nlist, quantizer_type, metric); diff --git a/src/index/refine/refine_utils.cc b/src/index/refine/refine_utils.cc index 25f81e74b..3c32f1dc3 100644 --- a/src/index/refine/refine_utils.cc +++ b/src/index/refine/refine_utils.cc @@ -7,8 +7,8 @@ #include #include +#include "faiss/IndexScalarQuantizer.h" #include "faiss/cppcontrib/knowhere/IndexRefine.h" -#include "faiss/cppcontrib/knowhere/IndexScalarQuantizer.h" #include "fmt/format.h" #include "knowhere/log.h" #include "knowhere/tolower.h" @@ -16,21 +16,18 @@ namespace knowhere { // a supporting function -expected +expected get_sq_quantizer_type(const std::string& sq_type) { - std::map sq_types = { - {"sq4u", faiss::cppcontrib::knowhere::ScalarQuantizer::QT_4bit_uniform}, - {"sq6", faiss::cppcontrib::knowhere::ScalarQuantizer::QT_6bit}, - {"sq8", faiss::cppcontrib::knowhere::ScalarQuantizer::QT_8bit}, - {"fp16", faiss::cppcontrib::knowhere::ScalarQuantizer::QT_fp16}, - {"bf16", faiss::cppcontrib::knowhere::ScalarQuantizer::QT_bf16}, - {"int8", faiss::cppcontrib::knowhere::ScalarQuantizer::QT_8bit_direct_signed}}; + std::map sq_types = { + {"sq4u", faiss::ScalarQuantizer::QT_4bit_uniform}, {"sq6", faiss::ScalarQuantizer::QT_6bit}, + {"sq8", faiss::ScalarQuantizer::QT_8bit}, {"fp16", faiss::ScalarQuantizer::QT_fp16}, + {"bf16", faiss::ScalarQuantizer::QT_bf16}, {"int8", faiss::ScalarQuantizer::QT_8bit_direct_signed}}; // todo: tolower auto 
sq_type_tolower = str_to_lower(sq_type); auto itr = sq_types.find(sq_type_tolower); if (itr == sq_types.cend()) { - return expected::Err( + return expected::Err( Status::invalid_args, fmt::format("invalid scalar quantizer type ({})", sq_type_tolower)); } @@ -61,8 +58,7 @@ is_flat_refine(const std::optional& refine_type) { } bool -has_lossless_quant(const expected& quant_type, - DataFormatEnum dataFormat) { +has_lossless_quant(const expected& quant_type, DataFormatEnum dataFormat) { if (!quant_type.has_value()) { return false; } @@ -72,11 +68,11 @@ has_lossless_quant(const expected>::Err( Status::invalid_args, "fp16 input data does not accept bf16 or fp32 as a refine index."); @@ -127,7 +123,7 @@ pick_refine_index(const DataFormatEnum data_format, const std::optional>::Err( Status::invalid_args, "bf16 input data does not accept fp16 or fp32 as a refine index."); @@ -159,9 +155,22 @@ pick_refine_index(const DataFormatEnum data_format, const std::optional( - base_d, refine_sq_type.value(), base_metric_type); + // create an sq. Baseline faiss::IndexScalarQuantizer — the fork + // variant's ctor is now behavior-identical (see fork + // IndexScalarQuantizer.cpp), and fork index_write.cpp recognises + // baseline IxSQ via an overload (see §5). + auto sq_refine = + std::make_unique(base_d, refine_sq_type.value(), base_metric_type); + + // QT_4bit_uniform + L2 benefits from quantile-based range + // estimation. Previously applied inside the fork + // IndexScalarQuantizer ctor for SQ4U+L2; now applied explicitly + // at the call site so the fork ctor is behavior-identical to + // baseline and this call site can use either.
+ if (refine_sq_type.value() == faiss::ScalarQuantizer::QT_4bit_uniform && base_metric_type == faiss::METRIC_L2) { + sq_refine->sq.rangestat = faiss::ScalarQuantizer::RS_quantiles; + sq_refine->sq.rangestat_arg = 0.01; + } auto refine_index = std::make_unique(local_index.get(), sq_refine.get()); diff --git a/src/index/refine/refine_utils.h b/src/index/refine/refine_utils.h index e11e425a3..c34e6e5e8 100644 --- a/src/index/refine/refine_utils.h +++ b/src/index/refine/refine_utils.h @@ -7,21 +7,25 @@ #include #include "faiss/Index.h" -#include "faiss/cppcontrib/knowhere/impl/ScalarQuantizer.h" +#include "faiss/impl/ScalarQuantizer.h" #include "knowhere/expected.h" #include "knowhere/operands.h" namespace knowhere { -expected +// Returns a baseline faiss::ScalarQuantizer::QuantizerType. The integer +// values for every qtype this function returns match the fork's enum, +// so static_cast at the boundary of a fork IndexScalarQuantizer ctor is +// lossless. The fork enum is retired at the knowhere layer; fork ctors +// are the only remaining consumers. 
+expected get_sq_quantizer_type(const std::string& sq_type); expected is_flat_refine(const std::optional& refine_type); bool -has_lossless_quant(const expected& quant_type, - DataFormatEnum dataFormat); +has_lossless_quant(const expected& quant_type, DataFormatEnum dataFormat); bool has_lossless_refine_index(const std::optional& refine, const std::optional& refine_type, diff --git a/src/simd/hook.cc b/src/simd/hook.cc index ab18fd8c6..43d4c69ca 100644 --- a/src/simd/hook.cc +++ b/src/simd/hook.cc @@ -11,7 +11,6 @@ #include "hook.h" -#include #include #include @@ -578,7 +577,6 @@ fvec_hook(std::string& simd_type) { static int init_hook_ = []() { std::string simd_type; fvec_hook(simd_type); - faiss::cppcontrib::knowhere::sq_hook(); return 0; }(); diff --git a/thirdparty/faiss/.github/actions/build_cmake/action.yml b/thirdparty/faiss/.github/actions/build_cmake/action.yml index 3a1306d2e..7b9d2c7bb 100644 --- a/thirdparty/faiss/.github/actions/build_cmake/action.yml +++ b/thirdparty/faiss/.github/actions/build_cmake/action.yml @@ -100,27 +100,43 @@ runs: sudo apt-get -qq update >/dev/null sudo apt-get -qq install -y kmod wget gpg >/dev/null + # Download, prepare, and install the package signing key + mkdir --parents --mode=0755 /etc/apt/keyrings + wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null + + - name: Add rocm repository + if: inputs.rocm == 'ON' + shell: bash + run: | # Get UBUNTU version name UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'` # Set ROCm version ROCM_VERSION="6.2" - # Download, prepare, and install the package signing key - mkdir --parents --mode=0755 /etc/apt/keyrings - wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null + rocm_baseurl="https://repo.radeon.com/rocm/apt/${ROCM_VERSION}" + sudo mkdir -p /etc/apt/keyrings + wget -qO /tmp/rocm.gpg.key 
https://repo.radeon.com/rocm/rocm.gpg.key + echo "2de99e2354646a90d9903e2a669fc4e36b02c1bbff7075c481e12d7edab2c88b /tmp/rocm.gpg.key" | sha256sum --check - # Add rocm repository - wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | sudo apt-key add - - rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}" - echo "deb [arch=amd64] ${rocm_baseurl} ${UBUNTU_VERSION_NAME} main" | sudo tee /etc/apt/sources.list.d/rocm.list - sudo apt-get -qq update --allow-insecure-repositories >/dev/null - sudo apt-get -qq install -y --allow-unauthenticated \ - "rocm-dev${ROCM_VERSION}" "rocm-utils${ROCM_VERSION}" \ - "rocm-libs${ROCM_VERSION}" >/dev/null + echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] ${rocm_baseurl} ${UBUNTU_VERSION_NAME} main" | sudo tee /etc/apt/sources.list.d/rocm.list + + sudo apt-get -qq update >/dev/null + sudo apt-get -qq install -y \ + "rocm-dev${ROCM_VERSION}" "rocm-utils${ROCM_VERSION}" "rocm-libs${ROCM_VERSION}" >/dev/null + + + - name: Pin BLAS/LAPACK versions + if: inputs.rocm == 'ON' + shell: bash + run: | + conda install -y \ + "libblas=3.9.0=35_*" \ + "libcblas=3.9.0=35_*" \ + "liblapack=3.9.0=35_*" # Fake presence of MI200-class accelerators - echo "gfx90a" | sudo tee /opt/rocm/bin/target.lst + echo "gfx942" | sudo tee /opt/rocm/bin/target.lst # Cleanup sudo apt-get -qq autoclean >/dev/null @@ -135,10 +151,14 @@ runs: sudo ln -s /lib/x86_64-linux-gnu/libc_nonshared.a /usr/lib64/libc_nonshared.a sudo ln -s /usr/lib/x86_64-linux-gnu/libpthread.so.0 /lib64/libpthread.so.0 sudo ln -s $HOME/miniconda3/x86_64-conda-linux-gnu/sysroot/usr/lib64/libpthread_nonshared.a /usr/lib64/libpthread_nonshared.a - - name: Print GPU info - if: inputs.gpu == 'ON' + - name: Print NVIDIA GPU info + if: inputs.gpu == 'ON' && inputs.rocm != 'ON' shell: bash run: nvidia-smi + - name: Print AMD GPU info + if: inputs.gpu == 'ON' && inputs.rocm == 'ON' + shell: bash + run: rocm-smi - name: Build all targets shell: bash run: | diff --git 
a/thirdparty/faiss/.github/workflows/build-pull-request.yml b/thirdparty/faiss/.github/workflows/build-pull-request.yml index 00e5491ab..05efb6010 100644 --- a/thirdparty/faiss/.github/workflows/build-pull-request.yml +++ b/thirdparty/faiss/.github/workflows/build-pull-request.yml @@ -96,6 +96,29 @@ jobs: uses: ./.github/actions/build_cmake with: gpu: ON + linux-x86_64-GPU-w-ROCm-cmake: + name: Linux x86_64 GPU w/ ROCm (cmake) + needs: linux-x86_64-cmake + runs-on: linux-amd-rocm-mi325-ubuntu-24 + container: + image: ubuntu:24.04 + options: --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size 16G --group-add video --cap-add=SYS_PTRACE --cap-add=SYS_ADMIN + steps: + - name: Container setup + run: | + if [ -f /.dockerenv ]; then + apt-get update && apt-get install -y sudo && apt-get install -y git + git config --global --add safe.directory '*' + else + echo 'Skipping. Current job is not running inside a container.' + fi + - name: Checkout + uses: actions/checkout@v4 + - name: Build and Test (cmake) + uses: ./.github/actions/build_cmake + with: + gpu: ON + rocm: ON linux-x86_64-GPU-w-CUVS-cmake: name: Linux x86_64 GPU w/ cuVS (cmake) needs: linux-x86_64-cmake diff --git a/thirdparty/faiss/.gitignore b/thirdparty/faiss/.gitignore index 52a99e8fc..0033491a7 100644 --- a/thirdparty/faiss/.gitignore +++ b/thirdparty/faiss/.gitignore @@ -21,3 +21,13 @@ faiss/python/swigfaiss_avx2.swig faiss/python/swigfaiss_avx512.swig faiss/python/swigfaiss_avx512_spr.swig faiss/python/swigfaiss_sve.swig + +# Local intermediate build artifacts (CUDA 13.2 scripts) +/_build/ +/_build_python_*/ +/_libfaiss_stage/ +/build_output/ + +# Python package build outputs +/dist/ +/*.egg-info/ diff --git a/thirdparty/faiss/CONTRIBUTING.md b/thirdparty/faiss/CONTRIBUTING.md index 10fc8152f..2ac8ad935 100644 --- a/thirdparty/faiss/CONTRIBUTING.md +++ b/thirdparty/faiss/CONTRIBUTING.md @@ -44,7 +44,7 @@ outlined on that page and do not file a public issue. 
* 4 spaces for indentation in C++ (no tabs) * 80 character line length (both for C++ and Python) -* C++ language level: C++17 +* C++ language level: C++20 ## License diff --git a/thirdparty/faiss/INSTALL.md b/thirdparty/faiss/INSTALL.md index b233ddf01..b3276c2ae 100644 --- a/thirdparty/faiss/INSTALL.md +++ b/thirdparty/faiss/INSTALL.md @@ -248,7 +248,7 @@ $ make -C build demo_ivfpq_indexing_gpu $ ./build/demos/demo_ivfpq_indexing_gpu ``` -This produce the GPU code equivalent to the CPU `demo_ivfpq_indexing`. It also +This produces the GPU code equivalent to the CPU `demo_ivfpq_indexing`. It also shows how to translate indexes from/to a GPU. ### A real-life benchmark diff --git a/thirdparty/faiss/README.md b/thirdparty/faiss/README.md index 1a6949ab4..df490b17a 100644 --- a/thirdparty/faiss/README.md +++ b/thirdparty/faiss/README.md @@ -35,7 +35,7 @@ The optional GPU implementation provides what is likely (as of March 2017) the f The following are entry points for documentation: -- the full documentation can be found on the [wiki page](http://github.com/facebookresearch/faiss/wiki), including a [tutorial](https://github.com/facebookresearch/faiss/wiki/Getting-started), a [FAQ](https://github.com/facebookresearch/faiss/wiki/FAQ) and a [troubleshooting section](https://github.com/facebookresearch/faiss/wiki/Troubleshooting) +- the full documentation can be found on the [wiki page](https://github.com/facebookresearch/faiss/wiki), including a [tutorial](https://github.com/facebookresearch/faiss/wiki/Getting-started), a [FAQ](https://github.com/facebookresearch/faiss/wiki/FAQ) and a [troubleshooting section](https://github.com/facebookresearch/faiss/wiki/Troubleshooting) - the [doxygen documentation](https://faiss.ai/) gives per-class information extracted from code comments - to reproduce results from our research papers, [Polysemous codes](https://arxiv.org/abs/1609.01882) and [Billion-scale similarity search with GPUs](https://arxiv.org/abs/1702.08734), refer to the 
[benchmarks README](benchs/README.md). For [ Link and code: Fast indexing with graphs and compact regression codes](https://arxiv.org/abs/1804.09996), see the [link_and_code README](benchs/link_and_code) @@ -82,7 +82,7 @@ For the GPU version of Faiss, please cite: For public discussion of Faiss or for questions, visit https://github.com/facebookresearch/faiss/discussions. -We monitor the [issues page](http://github.com/facebookresearch/faiss/issues) of the repository. +We monitor the [issues page](https://github.com/facebookresearch/faiss/issues) of the repository. You can report bugs, ask questions, etc. ## Legal diff --git a/thirdparty/faiss/benchs/README.md b/thirdparty/faiss/benchs/README.md index b50dad80c..e9da262da 100644 --- a/thirdparty/faiss/benchs/README.md +++ b/thirdparty/faiss/benchs/README.md @@ -226,7 +226,7 @@ The run produces two warnings: - the add() function complains that there is an inefficient memory allocation, but this is a concern only when it happens often, and we are not benchmarking the add time anyways. -To index small datasets, it is more efficient to use a `GpuIVFFlat`, which just stores the full vectors in the inverted lists. We did not mention this in the the paper because it is not as scalable. To experiment with this setting, change the `index_factory` string from "IVF4096,PQ64" to "IVF16384,Flat". This gives: +To index small datasets, it is more efficient to use a `GpuIVFFlat`, which just stores the full vectors in the inverted lists. We did not mention this in the paper because it is not as scalable. To experiment with this setting, change the `index_factory` string from "IVF4096,PQ64" to "IVF16384,Flat". This gives: ``` nprobe= 1 0.025 s recalls= 0.4084 0.4105 0.4105 diff --git a/thirdparty/faiss/benchs/bench_gpu_1bn.py b/thirdparty/faiss/benchs/bench_gpu_1bn.py index fc17e9de7..f56a669f2 100644 --- a/thirdparty/faiss/benchs/bench_gpu_1bn.py +++ b/thirdparty/faiss/benchs/bench_gpu_1bn.py @@ -1,4 +1,4 @@ -#! 
/usr/bin/env python2 +#! /usr/bin/env python3 # Copyright (c) Meta Platforms, Inc. and affiliates. # # This source code is licensed under the MIT license found in the diff --git a/thirdparty/faiss/benchs/bench_vector_ops.py b/thirdparty/faiss/benchs/bench_vector_ops.py index 96aa97057..85ecc7231 100644 --- a/thirdparty/faiss/benchs/bench_vector_ops.py +++ b/thirdparty/faiss/benchs/bench_vector_ops.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python3 # Copyright (c) Meta Platforms, Inc. and affiliates. # # This source code is licensed under the MIT license found in the diff --git a/thirdparty/faiss/benchs/kmeans_mnist.py b/thirdparty/faiss/benchs/kmeans_mnist.py index fa070483a..72b00c38a 100644 --- a/thirdparty/faiss/benchs/kmeans_mnist.py +++ b/thirdparty/faiss/benchs/kmeans_mnist.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python3 # Copyright (c) Meta Platforms, Inc. and affiliates. # # This source code is licensed under the MIT license found in the diff --git a/thirdparty/faiss/c_api/CMakeLists.txt b/thirdparty/faiss/c_api/CMakeLists.txt index 3d445dfd5..67a945d52 100644 --- a/thirdparty/faiss/c_api/CMakeLists.txt +++ b/thirdparty/faiss/c_api/CMakeLists.txt @@ -76,7 +76,7 @@ endif() if(NOT WIN32) # Architecture mode to support AVX512 extensions available since Intel(R) Sapphire Rapids. 
# Ref: https://networkbuilders.intel.com/solutionslibrary/intel-avx-512-fp16-instruction-set-for-intel-xeon-processor-based-products-technology-guide - target_compile_options(faiss_c_avx512_spr PRIVATE $<$:-march=sapphirerapids -mtune=sapphirerapids>) + target_compile_options(faiss_c_avx512_spr PRIVATE $<$:-mavx2 -mfma -mf16c -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mavx512vpopcntdq -mpopcnt -mavx512fp16 -mavx512bf16>) else() target_compile_options(faiss_c_avx512_spr PRIVATE $<$:/arch:AVX512>) endif() diff --git a/thirdparty/faiss/cmake/link_to_faiss_lib.cmake b/thirdparty/faiss/cmake/link_to_faiss_lib.cmake index 7ca19fa8a..da81e8429 100644 --- a/thirdparty/faiss/cmake/link_to_faiss_lib.cmake +++ b/thirdparty/faiss/cmake/link_to_faiss_lib.cmake @@ -31,7 +31,7 @@ function(link_to_faiss_lib target) if(NOT WIN32) # Architecture mode to support AVX512 extensions available since Intel (R) Sapphire Rapids. # Ref: https://networkbuilders.intel.com/solutionslibrary/intel-avx-512-fp16-instruction-set-for-intel-xeon-processor-based-products-technology-guide - target_compile_options(${target} PRIVATE $<$:-march=sapphirerapids -mtune=sapphirerapids>) + target_compile_options(${target} PRIVATE $<$:-mavx2 -mfma -mf16c -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mavx512vpopcntdq -mpopcnt -mavx512fp16 -mavx512bf16>) else() target_compile_options(${target} PRIVATE $<$:/arch:AVX512>) endif() diff --git a/thirdparty/faiss/contrib/README.md b/thirdparty/faiss/contrib/README.md index f2b7d0f84..8b9052c44 100644 --- a/thirdparty/faiss/contrib/README.md +++ b/thirdparty/faiss/contrib/README.md @@ -8,8 +8,6 @@ The contrib directory contains helper modules for Faiss for various tasks. The contrib directory gets compiled in the module faiss.contrib. Note that although some of the modules may depend on additional modules (eg. GPU Faiss, pytorch, hdf5), they are not necessarily compiled in to avoid adding dependencies. It is the user's responsibility to provide them. 
-In contrib, we are progressively dropping python2 support. - ## List of contrib modules ### rpc.py diff --git a/thirdparty/faiss/contrib/clustering.py b/thirdparty/faiss/contrib/clustering.py index 6222fb39b..00bc4f08b 100644 --- a/thirdparty/faiss/contrib/clustering.py +++ b/thirdparty/faiss/contrib/clustering.py @@ -121,6 +121,78 @@ def train_ivf_index_with_2level(index, xt, **args): index.train(xt) +def balanced_assignment_with_penalties(x, centroids, alpha = 0.03, num_iter = 20, maxk = 100): + """ + Assign vectors x to centroids with a balance constraint. + + Iteratively adjusts per-cluster penalties so that oversized clusters + become less attractive. At each iteration the penalized distance for + cluster c is ``d(x, c)^2 + penalty_c^2`` and the penalty is updated as + ``penalty_c *= (binsize_c / n_opt) ** alpha`` where ``n_opt = n / nc``. + + A single kNN call (with *maxk* neighbors) is done upfront; subsequent + iterations only re-weight among those candidates, making the routine + fast even for large datasets. + + Reference: "Balancing clusters to reduce response time variability in + large scale image search", Tavenard et al., CBMI 2011. + https://inria.hal.science/inria-00576886/document + See also notebook N10159950. + + Args: + x: (n, d) float32 array of vectors to assign. + centroids: (nc, d) float32 array of cluster centroids. + alpha: exponent that controls how aggressively penalties grow. + Higher values yield more balanced clusters at the cost + of higher MSE. Typical range: 0.01 – 0.1. + num_iter: number of penalty-update iterations. + maxk: number of nearest centroids to consider per vector. + Must be <= nc. + + Returns: + assign: (n,) int64 array of centroid indices. 
+ stats: dict with keys + + - *imf*: imbalance factor (1.0 = perfectly balanced) + - *mse*: mean squared error of the assignment + - *binsize_min*, *binsize_max*: smallest / largest cluster + - *penalty_min*, *penalty_max*: penalty value range + - *alpha*: the alpha value used + """ + + nc = len(centroids) + n = len(x) + nopt = n / nc # targed bin sizes + + # we assign to the top-maxk clusters. The final assignment will pick among these clusters. + full_d2, full_assign = faiss.knn(x, centroids, maxk) + + # scalar penalty for each cluster + penalties = np.ones(nc, dtype=np.float32) + + for it in range(num_iter): + # compute penalized assignment + penalties2 = penalties ** 2 + full_d2_penalized = full_d2 + penalties2[full_assign] + a0 = full_d2_penalized.argmin(axis=1) + assign = np.take_along_axis(full_assign, a0[:, None], axis=1).ravel() + binsizes = np.bincount(assign, minlength=nc) + # print(imbalance_factor(nc, assign), mse, int(binsizes.min()), int(binsizes.max())) + penalties *= (binsizes / nopt) ** alpha + + stats = dict( + alpha=alpha, + imf=imbalance_factor(nc, assign), + mse = ((x - centroids[assign]) ** 2).sum(1).mean(), # recompute MSE + binsize_min=int(binsizes.min()), + binsize_max=int(binsizes.max()), + penalty_min=penalties.min(), + penalty_max=penalties.max(), + ) + + return assign, stats + + ############################################################################### # K-means implementation in Python # diff --git a/thirdparty/faiss/faiss/AutoTune.cpp b/thirdparty/faiss/faiss/AutoTune.cpp index 5c01e2c30..33044776b 100644 --- a/thirdparty/faiss/faiss/AutoTune.cpp +++ b/thirdparty/faiss/faiss/AutoTune.cpp @@ -135,7 +135,7 @@ bool OperatingPoints::add( break; } } - assert(i < a.size()); + FAISS_THROW_IF_NOT(i < a.size()); if (t < a[i].t) { if (a[i].perf == perf) { a[i] = op; diff --git a/thirdparty/faiss/faiss/CMakeLists.txt b/thirdparty/faiss/faiss/CMakeLists.txt index 4fc463992..48ad5073c 100644 --- a/thirdparty/faiss/faiss/CMakeLists.txt +++ 
b/thirdparty/faiss/faiss/CMakeLists.txt @@ -15,6 +15,7 @@ set(FAISS_SIMD_AVX2_SRC impl/scalar_quantizer/sq-avx2.cpp impl/approx_topk/avx2.cpp utils/simd_impl/distances_avx2.cpp + utils/simd_impl/partitioning_avx2.cpp utils/distances_fused/simdlib_based.cpp utils/simd_impl/rabitq_avx2.cpp ) @@ -32,6 +33,7 @@ set(FAISS_SIMD_NEON_SRC impl/scalar_quantizer/sq-neon.cpp impl/approx_topk/neon.cpp utils/simd_impl/distances_aarch64.cpp + utils/simd_impl/partitioning_neon.cpp utils/distances_fused/simdlib_based_neon.cpp utils/simd_impl/rabitq_neon.cpp ) @@ -39,11 +41,16 @@ set(FAISS_SIMD_SVE_SRC impl/pq_code_distance/pq_code_distance-sve.cpp utils/simd_impl/distances_arm_sve.cpp ) +set(FAISS_SIMD_RVV_SRC + impl/scalar_quantizer/sq-rvv.cpp +) # Select SIMD sources based on target architecture if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64|amd64|AMD64)") set(FAISS_SIMD_SRC ${FAISS_SIMD_AVX2_SRC} ${FAISS_SIMD_AVX512_SRC}) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(aarch64|arm64|ARM64)") set(FAISS_SIMD_SRC ${FAISS_SIMD_NEON_SRC} ${FAISS_SIMD_SVE_SRC}) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(riscv64|riscv)") + set(FAISS_SIMD_SRC ${FAISS_SIMD_RVV_SRC}) else() set(FAISS_SIMD_SRC "") endif() @@ -118,7 +125,6 @@ set(FAISS_SRC impl/AdditiveQuantizer.cpp impl/RaBitQuantizer.cpp impl/RaBitQuantizerMultiBit.cpp - impl/RaBitQStats.cpp impl/RaBitQUtils.cpp impl/ResidualQuantizer.cpp impl/LocalSearchQuantizer.cpp @@ -156,6 +162,7 @@ set(FAISS_SRC utils/simd_levels.cpp utils/distances_fused/distances_fused.cpp factory_tools.cpp + # build.cpp excluded due to build errors on Windows ) if(FAISS_ENABLE_SVS) @@ -251,7 +258,6 @@ set(FAISS_HEADERS impl/Quantizer.h impl/RaBitQuantizer.h impl/RaBitQuantizerMultiBit.h - impl/RaBitQStats.h impl/RaBitQUtils.h impl/ResidualQuantizer.h impl/ResultHandler.h @@ -298,7 +304,6 @@ set(FAISS_HEADERS utils/NeuralNet.h utils/WorkerThread.h utils/distances.h - utils/distances_dispatch.h utils/extra_distances.h utils/fp16-fp16c.h utils/fp16-inl.h @@ -405,7 +410,7 @@ 
endif() if(NOT WIN32) # Architecture mode to support AVX512 extensions available since Intel(R) Sapphire Rapids. # Ref: https://networkbuilders.intel.com/solutionslibrary/intel-avx-512-fp16-instruction-set-for-intel-xeon-processor-based-products-technology-guide - target_compile_options(faiss_avx512_spr PRIVATE $<$:-march=sapphirerapids -mtune=sapphirerapids>) + target_compile_options(faiss_avx512_spr PRIVATE $<$:-mavx2 -mfma -mf16c -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mavx512vpopcntdq -mpopcnt -mavx512fp16 -mavx512bf16>) else() target_compile_options(faiss_avx512_spr PRIVATE $<$:/arch:AVX512>) # we need bigobj for the swig wrapper @@ -454,7 +459,8 @@ if(FAISS_OPT_LEVEL STREQUAL "dd") target_compile_definitions(faiss PRIVATE FAISS_ENABLE_DD) # Architecture-specific SIMD definitions for Dynamic Dispatch if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64|amd64|AMD64)") - target_compile_definitions(faiss PRIVATE COMPILE_SIMD_AVX2 COMPILE_SIMD_AVX512) + target_compile_definitions(faiss PRIVATE + COMPILE_SIMD_AVX2 COMPILE_SIMD_AVX512 COMPILE_SIMD_AVX512_SPR) # Baseline flags for common files (prevents auto-vectorization) target_compile_options(faiss PRIVATE $<$:-mpopcnt -msse4 -mno-avx -mno-avx2>) @@ -476,6 +482,14 @@ if(FAISS_OPT_LEVEL STREQUAL "dd") TARGET_DIRECTORY faiss PROPERTIES COMPILE_OPTIONS "-march=armv8.2-a+sve" ) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(riscv64|riscv)") + target_compile_definitions(faiss PRIVATE COMPILE_SIMD_RISCV_RVV) + if(FAISS_SIMD_RVV_SRC) + set_source_files_properties(${FAISS_SIMD_RVV_SRC} + TARGET_DIRECTORY faiss + PROPERTIES COMPILE_OPTIONS "-march=rv64gcv_zvfhmin;-mabi=lp64d" + ) + endif() endif() endif() endif() @@ -488,6 +502,19 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(aarch64|arm64|ARM64)") target_sources(faiss PRIVATE ${FAISS_SIMD_NEON_SRC}) endif() +# RVV is the baseline SIMD on rv64 builds compiled with rv64gcv. Compile RVV +# sources into the main faiss target, mirroring the ARM NEON story on aarch64. 
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "(riscv64|riscv)") + target_compile_definitions(faiss PRIVATE COMPILE_SIMD_RISCV_RVV) + target_sources(faiss PRIVATE ${FAISS_SIMD_RVV_SRC}) + if(NOT WIN32 AND FAISS_SIMD_RVV_SRC) + set_source_files_properties(${FAISS_SIMD_RVV_SRC} + TARGET_DIRECTORY faiss + PROPERTIES COMPILE_OPTIONS "-march=rv64gcv_zvfhmin;-mabi=lp64d" + ) + endif() +endif() + if(FAISS_ENABLE_SVS) find_package(svs_runtime REQUIRED) diff --git a/thirdparty/faiss/faiss/Clustering.cpp b/thirdparty/faiss/faiss/Clustering.cpp index fde41e2d6..8d5558539 100644 --- a/thirdparty/faiss/faiss/Clustering.cpp +++ b/thirdparty/faiss/faiss/Clustering.cpp @@ -163,7 +163,8 @@ void compute_centroids( for (size_t i = 0; i < n; i++) { int64_t ci = assign[i]; - assert(ci >= 0 && ci < k + k_frozen); + FAISS_THROW_IF_NOT_MSG( + ci >= 0 && ci < k + k_frozen, "invalid cluster assignment"); ci -= k_frozen; if (ci >= static_cast(c0) && ci < static_cast(c1)) { diff --git a/thirdparty/faiss/faiss/IVFlib.cpp b/thirdparty/faiss/faiss/IVFlib.cpp index 03a975512..10766faf7 100644 --- a/thirdparty/faiss/faiss/IVFlib.cpp +++ b/thirdparty/faiss/faiss/IVFlib.cpp @@ -125,7 +125,7 @@ void search_centroid( index = index_pre->index; } faiss::IndexIVF* index_ivf = dynamic_cast(index); - assert(index_ivf); + FAISS_THROW_IF_NOT_MSG(index_ivf, "could not extract IVF index"); index_ivf->quantizer->assign(n, x, centroid_ids); } @@ -146,7 +146,7 @@ void search_and_return_centroids( index = index_pre->index; } faiss::IndexIVF* index_ivf = dynamic_cast(index); - assert(index_ivf); + FAISS_THROW_IF_NOT_MSG(index_ivf, "could not extract IVF index"); size_t nprobe = index_ivf->nprobe; std::vector cent_nos(n * nprobe); diff --git a/thirdparty/faiss/faiss/Index.cpp b/thirdparty/faiss/faiss/Index.cpp index 42fa3fa13..327cef890 100644 --- a/thirdparty/faiss/faiss/Index.cpp +++ b/thirdparty/faiss/faiss/Index.cpp @@ -168,7 +168,7 @@ struct GenericDistanceComputer : DistanceComputer { size_t d; const Index& storage; 
std::vector buf; - const float* q; + const float* q = nullptr; explicit GenericDistanceComputer(const Index& storage_in) : storage(storage_in) { diff --git a/thirdparty/faiss/faiss/Index2Layer.cpp b/thirdparty/faiss/faiss/Index2Layer.cpp index ee942d0c2..33d855f7f 100644 --- a/thirdparty/faiss/faiss/Index2Layer.cpp +++ b/thirdparty/faiss/faiss/Index2Layer.cpp @@ -13,10 +13,6 @@ #include #include -#ifdef __SSE3__ -#include -#endif - #include #include @@ -138,9 +134,10 @@ struct Distance2Level : DistanceComputer { size_t d; const Index2Layer& storage; std::vector buf; - const float* q; + const float* q = nullptr; - const float *pq_l1_tab, *pq_l2_tab; + const float* pq_l1_tab = nullptr; + const float* pq_l2_tab = nullptr; explicit Distance2Level(const Index2Layer& storage_) : storage(storage_) { d = storage_.d; @@ -162,7 +159,8 @@ struct Distance2Level : DistanceComputer { // well optimized for xNN+PQNN struct DistanceXPQ4 : Distance2Level { - int M, k; + int M = 0; + int k = 0; explicit DistanceXPQ4(const Index2Layer& storage_) : Distance2Level(storage_) { @@ -175,34 +173,26 @@ struct DistanceXPQ4 : Distance2Level { } float operator()(idx_t i) override { -#ifdef __SSE3__ const uint8_t* code = storage.codes.data() + i * storage.code_size; idx_t key = 0; memcpy(&key, code, storage.code_size_1); code += storage.code_size_1; - // walking pointers const float* qa = q; - const __m128* l1_t = (const __m128*)(pq_l1_tab + d * key); - const __m128* pq_l2_t = (const __m128*)pq_l2_tab; - __m128 accu = _mm_setzero_ps(); + const float* l1 = pq_l1_tab + d * key; + const float* l2 = pq_l2_tab; + float accu = 0; for (int m = 0; m < M; m++) { - __m128 qi = _mm_loadu_ps(qa); - __m128 recons = _mm_add_ps(l1_t[m], pq_l2_t[*code++]); - __m128 diff = _mm_sub_ps(qi, recons); - accu = _mm_add_ps(accu, _mm_mul_ps(diff, diff)); - pq_l2_t += 256; + for (int j = 0; j < 4; j++) { + float diff = qa[j] - (l1[m * 4 + j] + l2[*code * 4 + j]); + accu += diff * diff; + } + code++; + l2 += 256 * 4; qa 
+= 4; } - - accu = _mm_hadd_ps(accu, accu); - accu = _mm_hadd_ps(accu, accu); - return _mm_cvtss_f32(accu); -#else - (void)i; - FAISS_THROW_MSG("not implemented for non-x64 platforms"); -#endif + return accu; } }; @@ -227,42 +217,36 @@ struct Distance2xXPQ4 : Distance2Level { int64_t key01 = 0; memcpy(&key01, code, storage.code_size_1); code += storage.code_size_1; -#ifdef __SSE3__ - // walking pointers const float* qa = q; - const __m128* pq_l1_t = (const __m128*)pq_l1_tab; - const __m128* pq_l2_t = (const __m128*)pq_l2_tab; - __m128 accu = _mm_setzero_ps(); + const float* l1 = pq_l1_tab; + const float* l2 = pq_l2_tab; + float accu = 0; for (int mi_m = 0; mi_m < 2; mi_m++) { int64_t l1_idx = key01 & (((int64_t)1 << mi_nbits) - 1); - const __m128* pq_l1 = pq_l1_t + M_2 * l1_idx; + const float* l1_sub = l1 + M_2 * l1_idx * 4; for (int m = 0; m < M_2; m++) { - __m128 qi = _mm_loadu_ps(qa); - __m128 recons = _mm_add_ps(pq_l1[m], pq_l2_t[*code++]); - __m128 diff = _mm_sub_ps(qi, recons); - accu = _mm_add_ps(accu, _mm_mul_ps(diff, diff)); - pq_l2_t += 256; + for (int j = 0; j < 4; j++) { + float diff = + qa[j] - (l1_sub[m * 4 + j] + l2[*code * 4 + j]); + accu += diff * diff; + } + code++; + l2 += 256 * 4; qa += 4; } - pq_l1_t += M_2 << mi_nbits; + l1 += (M_2 << mi_nbits) * 4; key01 >>= mi_nbits; } - accu = _mm_hadd_ps(accu, accu); - accu = _mm_hadd_ps(accu, accu); - return _mm_cvtss_f32(accu); -#else - FAISS_THROW_MSG("not implemented for non-x64 platforms"); -#endif + return accu; } }; } // namespace DistanceComputer* Index2Layer::get_distance_computer() const { -#ifdef __SSE3__ const MultiIndexQuantizer* mi = dynamic_cast(q1.quantizer); @@ -275,7 +259,6 @@ DistanceComputer* Index2Layer::get_distance_computer() const { if (fl && pq.dsub == 4) { return new DistanceXPQ4(*this); } -#endif return Index::get_distance_computer(); } diff --git a/thirdparty/faiss/faiss/IndexAdditiveQuantizer.cpp b/thirdparty/faiss/faiss/IndexAdditiveQuantizer.cpp index f68a40d9b..14d717c0c 
100644 --- a/thirdparty/faiss/faiss/IndexAdditiveQuantizer.cpp +++ b/thirdparty/faiss/faiss/IndexAdditiveQuantizer.cpp @@ -52,7 +52,7 @@ struct AQDistanceComputerDecompress : FlatCodesDistanceComputer { vd(vd_), d(iaq.d) {} - const float* q; + const float* q = nullptr; void set_query(const float* x) final { q = x; } @@ -83,7 +83,7 @@ struct AQDistanceComputerLUT : FlatCodesDistanceComputer { aq(*iaq.aq), d(iaq.d) {} - float bias; + float bias = 0.0f; void set_query(const float* x) final { q = x; // this is quite sub-optimal for multiple queries diff --git a/thirdparty/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp b/thirdparty/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp index fe322e968..9e4f67ba2 100644 --- a/thirdparty/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +++ b/thirdparty/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp @@ -217,6 +217,10 @@ void IndexAdditiveQuantizerFastScan::sa_decode( aq->decode(bytes, x, n); } +size_t IndexAdditiveQuantizerFastScan::fast_scan_code_size() const { + return M2 / 2; +} + /************************************************************************************** * IndexResidualQuantizerFastScan **************************************************************************************/ diff --git a/thirdparty/faiss/faiss/IndexAdditiveQuantizerFastScan.h b/thirdparty/faiss/faiss/IndexAdditiveQuantizerFastScan.h index f76e2e133..d1fda0b2d 100644 --- a/thirdparty/faiss/faiss/IndexAdditiveQuantizerFastScan.h +++ b/thirdparty/faiss/faiss/IndexAdditiveQuantizerFastScan.h @@ -28,7 +28,7 @@ namespace faiss { */ struct IndexAdditiveQuantizerFastScan : IndexFastScan { - AdditiveQuantizer* aq; + AdditiveQuantizer* aq = nullptr; using Search_type_t = AdditiveQuantizer::Search_type_t; bool rescale_norm = true; @@ -86,6 +86,9 @@ struct IndexAdditiveQuantizerFastScan : IndexFastScan { * @param x output vectors, size n * d */ void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override; + + /// Packed code size: M2 / 2 bytes (4-bit AQ 
sub-quantizer nibbles) + size_t fast_scan_code_size() const override; }; /** Index based on a residual quantizer. Stored vectors are diff --git a/thirdparty/faiss/faiss/IndexBinary.h b/thirdparty/faiss/faiss/IndexBinary.h index be031d582..2ce811762 100644 --- a/thirdparty/faiss/faiss/IndexBinary.h +++ b/thirdparty/faiss/faiss/IndexBinary.h @@ -58,7 +58,7 @@ struct IndexBinary { } else { FAISS_THROW_MSG("IndexBinary::train: unsupported numeric type"); } - }; + } /** Add n vectors of dimension d to the index. * @@ -72,7 +72,7 @@ struct IndexBinary { } else { FAISS_THROW_MSG("IndexBinary::add: unsupported numeric type"); } - }; + } /** Same as add, but stores xids instead of sequential ids. * @@ -93,7 +93,7 @@ struct IndexBinary { FAISS_THROW_MSG( "IndexBinary::add_with_ids: unsupported numeric type"); } - }; + } /** Query n vectors of dimension d to the index. * @@ -129,7 +129,7 @@ struct IndexBinary { } else { FAISS_THROW_MSG("IndexBinary::search: unsupported numeric type"); } - }; + } /** Query n vectors of dimension d to the index. * diff --git a/thirdparty/faiss/faiss/IndexBinaryHNSW.cpp b/thirdparty/faiss/faiss/IndexBinaryHNSW.cpp index 92c78161e..21f10e53f 100644 --- a/thirdparty/faiss/faiss/IndexBinaryHNSW.cpp +++ b/thirdparty/faiss/faiss/IndexBinaryHNSW.cpp @@ -236,7 +236,7 @@ void IndexBinaryHNSW::search( dis->set_query((float*)(x + i * code_size)); // Given that IndexBinaryHNSW is not an IndexHNSW, we pass nullptr // as the index parameter. This state does not get used in the - // search function, as it is merely there to to enable Panorama + // search function, as it is merely there to enable Panorama // execution for IndexHNSWFlatPanorama. 
HNSWStats stats = hnsw.search(*dis, nullptr, res, vt, params_in); n1 += stats.n1; diff --git a/thirdparty/faiss/faiss/IndexBinaryHNSW.h b/thirdparty/faiss/faiss/IndexBinaryHNSW.h index a43bb398b..9bca05a31 100644 --- a/thirdparty/faiss/faiss/IndexBinaryHNSW.h +++ b/thirdparty/faiss/faiss/IndexBinaryHNSW.h @@ -19,7 +19,7 @@ namespace faiss { * link structure built on top */ struct IndexBinaryHNSW : IndexBinary { - typedef HNSW::storage_idx_t storage_idx_t; + using storage_idx_t = HNSW::storage_idx_t; // the link structure HNSW hnsw; diff --git a/thirdparty/faiss/faiss/IndexBinaryHash.cpp b/thirdparty/faiss/faiss/IndexBinaryHash.cpp index b8c0092ea..dbd19586c 100644 --- a/thirdparty/faiss/faiss/IndexBinaryHash.cpp +++ b/thirdparty/faiss/faiss/IndexBinaryHash.cpp @@ -305,7 +305,7 @@ IndexBinaryMultiHash::~IndexBinaryMultiHash() { void IndexBinaryMultiHash::reset() { storage->reset(); ntotal = 0; - for (auto map : maps) { + for (auto& map : maps) { map.clear(); } } @@ -459,7 +459,7 @@ void IndexBinaryMultiHash::search( size_t IndexBinaryMultiHash::hashtable_size() const { size_t tot = 0; - for (auto map : maps) { + for (const auto& map : maps) { tot += map.size(); } diff --git a/thirdparty/faiss/faiss/IndexBinaryHash.h b/thirdparty/faiss/faiss/IndexBinaryHash.h index 78240c0c5..32a8c4cad 100644 --- a/thirdparty/faiss/faiss/IndexBinaryHash.h +++ b/thirdparty/faiss/faiss/IndexBinaryHash.h @@ -66,10 +66,10 @@ struct IndexBinaryHash : IndexBinary { }; struct IndexBinaryHashStats { - size_t nq; // nb of queries run - size_t n0; // nb of empty lists - size_t nlist; // nb of non-empty inverted lists scanned - size_t ndis{}; // nb of distances computed + size_t nq = 0; // nb of queries run + size_t n0 = 0; // nb of empty lists + size_t nlist = 0; // nb of non-empty inverted lists scanned + size_t ndis = 0; // nb of distances computed IndexBinaryHashStats() { reset(); diff --git a/thirdparty/faiss/faiss/IndexBinaryIVF.cpp b/thirdparty/faiss/faiss/IndexBinaryIVF.cpp index 
0c8c4dee4..6de40b41a 100644 --- a/thirdparty/faiss/faiss/IndexBinaryIVF.cpp +++ b/thirdparty/faiss/faiss/IndexBinaryIVF.cpp @@ -64,7 +64,7 @@ void IndexBinaryIVF::add_core( const idx_t* xids, const idx_t* precomputed_idx) { FAISS_THROW_IF_NOT(is_trained); - assert(invlists); + FAISS_THROW_IF_NOT_MSG(invlists, "invlists not initialized"); direct_map.check_can_add(xids); const idx_t* idx; @@ -354,7 +354,7 @@ struct IVFBinaryScannerL2 : BinaryInvertedListScanner { hc.set(query_vector, code_size); } - idx_t list_no; + idx_t list_no = 0; void set_list(idx_t list_no_2, uint8_t /* coarse_dis */) override { this->list_no = list_no_2; } @@ -611,10 +611,10 @@ template struct BlockSearch { HammingComputer hcs[NQ]; // heaps to update for each query - int32_t* distances[NQ]; - idx_t* labels[NQ]; + int32_t* distances[NQ] = {}; + idx_t* labels[NQ] = {}; // curent top of heap - int32_t heap_tops[NQ]; + int32_t heap_tops[NQ] = {}; BlockSearch( size_t code_size, @@ -648,10 +648,10 @@ struct BlockSearchVariableK { int k; HammingComputer hcs[NQ]; // heaps to update for each query - int32_t* distances[NQ]; - idx_t* labels[NQ]; + int32_t* distances[NQ] = {}; + idx_t* labels[NQ] = {}; // curent top of heap - int32_t heap_tops[NQ]; + int32_t heap_tops[NQ] = {}; BlockSearchVariableK( size_t code_size, diff --git a/thirdparty/faiss/faiss/IndexFastScan.cpp b/thirdparty/faiss/faiss/IndexFastScan.cpp index 2a433f20e..1dc93f05f 100644 --- a/thirdparty/faiss/faiss/IndexFastScan.cpp +++ b/thirdparty/faiss/faiss/IndexFastScan.cpp @@ -528,7 +528,14 @@ void IndexFastScan::search_implem_14( const FastScanDistancePostProcessing& context) const { FAISS_THROW_IF_NOT(bbs % 32 == 0); - int qbs2 = qbs == 0 ? 4 : qbs; + // The accumulate loop dispatch table only instantiates certain + // (nq, BB) pairs where BB = bbs/32. Cap the query batch size to + // the maximum nq instantiated for the current BB so the caller + // doesn't have to know about internal template constraints. 
+ // BB=1 → nq up to 4, BB=2 → nq up to 2, BB>=3 → nq=1 + int BB = bbs / 32; + int max_qbs = BB <= 1 ? 4 : BB == 2 ? 2 : 1; + int qbs2 = std::min(qbs == 0 ? 4 : qbs, max_qbs); // handle qbs2 blocking by recursive call if (n > qbs2) { diff --git a/thirdparty/faiss/faiss/IndexFastScan.h b/thirdparty/faiss/faiss/IndexFastScan.h index 26d6a47f1..8f73e4322 100644 --- a/thirdparty/faiss/faiss/IndexFastScan.h +++ b/thirdparty/faiss/faiss/IndexFastScan.h @@ -44,8 +44,8 @@ struct IndexFastScan : Index { // vector quantizer size_t M; - size_t nbits; - size_t ksub; + size_t nbits = 0; + size_t ksub = 0; size_t code_size; // packed version of the codes @@ -236,6 +236,18 @@ struct IndexFastScan : Index { void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override { compute_codes(bytes, n, x); } + + /** Get the size of the code portion packed by pq4_pack_codes. + * + * Returns the number of bytes per vector that are interleaved into + * SIMD blocks by pq4_pack_codes, excluding any embedded metadata + * (e.g., RaBitQ factors). The meaning of these bytes depends on the + * quantizer: for PQ/AQ they are 4-bit sub-quantizer nibbles, for + * RaBitQ they are 1-bit-per-dimension sign bits packed into nibbles. + * + * Must be implemented by all derived classes. 
+ */ + virtual size_t fast_scan_code_size() const = 0; }; struct FastScanStats { diff --git a/thirdparty/faiss/faiss/IndexHNSW.cpp b/thirdparty/faiss/faiss/IndexHNSW.cpp index a0ff524c1..50141443a 100644 --- a/thirdparty/faiss/faiss/IndexHNSW.cpp +++ b/thirdparty/faiss/faiss/IndexHNSW.cpp @@ -284,6 +284,7 @@ void hnsw_search( ndis += stats.ndis; nhops += stats.nhops; res.end(); + vt.advance(); } } InterruptCallback::check(); @@ -1042,7 +1043,7 @@ void IndexHNSWCagra::search( std::vector nearest(n); std::vector nearest_d(n); -#pragma omp for +#pragma omp parallel for for (idx_t i = 0; i < n; i++) { std::unique_ptr dis( storage_distance_computer(this->storage)); @@ -1080,6 +1081,70 @@ void IndexHNSWCagra::search( } } +void IndexHNSWCagra::range_search( + idx_t n, + const float* x, + float radius, + RangeSearchResult* result, + const SearchParameters* params) const { + if (!base_level_only) { + IndexHNSW::range_search(n, x, radius, result, params); + return; + } + + const HNSW& hnsw = this->hnsw; + size_t n1 = 0, n2 = 0, ndis = 0, nhops = 0; + float threshold = is_similarity_metric(metric_type) ? 
-radius : radius; + RangeSearchPartialResult pres(result); + + for (idx_t i = 0; i < n; i++) { + std::unique_ptr dis( + storage_distance_computer(storage)); + dis->set_query(x + i * d); + + storage_idx_t nearest = -1; + float nearest_d = std::numeric_limits::max(); + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution distrib(0, ntotal - 1); + + for (idx_t j = 0; j < num_base_level_search_entrypoints; j++) { + auto idx = distrib(gen); + auto distance = (*dis)(idx); + if (distance < nearest_d) { + nearest = idx; + nearest_d = distance; + } + } + FAISS_THROW_IF_NOT_MSG( + nearest >= 0, "Could not find a valid entrypoint."); + + RangeQueryResult& qres = pres.new_result(i); + RangeResultHandler res(&qres, threshold); + VisitedTable vt(ntotal, hnsw.use_visited_hashset); + HNSWStats stats; + hnsw.search_level_0( + *dis, res, 1, &nearest, &nearest_d, 1, stats, vt, params); + n1 += stats.n1; + n2 += stats.n2; + ndis += stats.ndis; + nhops += stats.nhops; + } + + pres.set_lims(); + result->do_allocation(); + pres.copy_result(); + + hnsw_stats.combine({n1, n2, ndis, nhops}); + + if (is_similarity_metric(metric_type)) { + for (size_t i = 0; i < result->lims[result->nq]; i++) { + result->distances[i] = -result->distances[i]; + } + } +} + faiss::NumericType IndexHNSWCagra::get_numeric_type() const { return numeric_type_; } diff --git a/thirdparty/faiss/faiss/IndexHNSW.h b/thirdparty/faiss/faiss/IndexHNSW.h index a43828d42..c1eb9f268 100644 --- a/thirdparty/faiss/faiss/IndexHNSW.h +++ b/thirdparty/faiss/faiss/IndexHNSW.h @@ -28,7 +28,7 @@ struct IndexHNSW; * link structure built on top */ struct IndexHNSW : Index { - typedef HNSW::storage_idx_t storage_idx_t; + using storage_idx_t = HNSW::storage_idx_t; // the link structure HNSW hnsw; @@ -259,9 +259,16 @@ struct IndexHNSWCagra : IndexHNSW { idx_t* labels, const SearchParameters* params = nullptr) const override; + void range_search( + idx_t n, + const float* x, + float radius, + RangeSearchResult* 
result, + const SearchParameters* params = nullptr) const override; + faiss::NumericType get_numeric_type() const; void set_numeric_type(faiss::NumericType numeric_type); - NumericType numeric_type_; + NumericType numeric_type_ = Float32; }; } // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexIVF.cpp b/thirdparty/faiss/faiss/IndexIVF.cpp index c32f0d80e..3d1d251e8 100644 --- a/thirdparty/faiss/faiss/IndexIVF.cpp +++ b/thirdparty/faiss/faiss/IndexIVF.cpp @@ -828,103 +828,119 @@ void IndexIVF::range_search_preassigned( #pragma omp parallel if (do_parallel) reduction(+ : nlistv, ndis) { - RangeSearchPartialResult pres(result); - std::unique_ptr scanner( - get_InvertedListScanner(store_pairs, sel, params)); - FAISS_THROW_IF_NOT(scanner.get()); - all_pres[omp_get_thread_num()] = &pres; - - // prepare the list scanning function - - auto scan_list_func = [&](size_t i, size_t ik, RangeQueryResult& qres) { - idx_t key = keys[i * cur_nprobe + ik]; /* select the list */ - if (key < 0) { - return; - } - FAISS_THROW_IF_NOT_FMT( - key < (idx_t)nlist, - "Invalid key=%" PRId64 " at ik=%zd nlist=%zd\n", - key, - ik, - nlist); - - if (invlists->is_empty(key, inverted_list_context)) { - return; - } + try { + RangeSearchPartialResult pres(result); + std::unique_ptr scanner( + get_InvertedListScanner(store_pairs, sel, params)); + FAISS_THROW_IF_NOT(scanner.get()); + all_pres[omp_get_thread_num()] = &pres; - try { - size_t list_size = 0; - scanner->set_list(key, coarse_dis[i * cur_nprobe + ik]); - if (invlists->use_iterator) { - std::unique_ptr it( - invlists->get_iterator(key, inverted_list_context)); + // prepare the list scanning function - scanner->iterate_codes_range( - it.get(), radius, qres, list_size); - } else { - InvertedLists::ScopedCodes scodes(invlists, key); - InvertedLists::ScopedIds ids(invlists, key); - list_size = invlists->list_size(key); + auto scan_list_func = [&](size_t i, + size_t ik, + RangeQueryResult& qres) { + try { + idx_t key = keys[i * cur_nprobe 
+ ik]; /* select the list */ + if (key < 0) { + return; + } - scanner->scan_codes_range( - list_size, scodes.get(), ids.get(), radius, qres); + FAISS_THROW_IF_NOT_FMT( + key < (idx_t)nlist, + "Invalid key=%" PRId64 " at ik=%zd nlist=%zd\n", + key, + ik, + nlist); + + if (invlists->is_empty(key, inverted_list_context)) { + return; + } + + size_t list_size = 0; + scanner->set_list(key, coarse_dis[i * cur_nprobe + ik]); + if (invlists->use_iterator) { + std::unique_ptr it( + invlists->get_iterator( + key, inverted_list_context)); + + scanner->iterate_codes_range( + it.get(), radius, qres, list_size); + } else { + InvertedLists::ScopedCodes scodes(invlists, key); + InvertedLists::ScopedIds ids(invlists, key); + list_size = invlists->list_size(key); + + scanner->scan_codes_range( + list_size, + scodes.get(), + ids.get(), + radius, + qres); + } + nlistv++; + ndis += list_size; + } catch (const std::exception& e) { + std::lock_guard lock(exception_mutex); + exception_string = demangle_cpp_symbol(typeid(e).name()) + + " " + e.what(); + interrupt = true; } - nlistv++; - ndis += list_size; - } catch (const std::exception& e) { - std::lock_guard lock(exception_mutex); - exception_string = - demangle_cpp_symbol(typeid(e).name()) + " " + e.what(); - interrupt = true; - } - }; + }; - if (parallel_mode == 0) { + if (parallel_mode == 0) { #pragma omp for - for (idx_t i = 0; i < nx; i++) { - scanner->set_query(x + i * d); + for (idx_t i = 0; i < nx; i++) { + scanner->set_query(x + i * d); - RangeQueryResult& qres = pres.new_result(i); + RangeQueryResult& qres = pres.new_result(i); - for (idx_t ik = 0; ik < cur_nprobe; ik++) { - scan_list_func(i, ik, qres); + for (idx_t ik = 0; ik < cur_nprobe; ik++) { + scan_list_func(i, ik, qres); + } } - } - } else if (parallel_mode == 1) { - for (idx_t i = 0; i < nx; i++) { - scanner->set_query(x + i * d); + } else if (parallel_mode == 1) { + for (idx_t i = 0; i < nx; i++) { + scanner->set_query(x + i * d); - RangeQueryResult& qres = 
pres.new_result(i); + RangeQueryResult& qres = pres.new_result(i); #pragma omp for schedule(dynamic) - for (int64_t ik = 0; ik < cur_nprobe; ik++) { - scan_list_func(i, ik, qres); + for (int64_t ik = 0; ik < cur_nprobe; ik++) { + scan_list_func(i, ik, qres); + } } - } - } else if (parallel_mode == 2) { - RangeQueryResult* qres = nullptr; + } else if (parallel_mode == 2) { + RangeQueryResult* qres = nullptr; #pragma omp for schedule(dynamic) - for (idx_t iik = 0; iik < nx * (idx_t)cur_nprobe; iik++) { - idx_t i = iik / (idx_t)cur_nprobe; - idx_t ik = iik % (idx_t)cur_nprobe; - if (qres == nullptr || qres->qno != i) { - qres = &pres.new_result(i); - scanner->set_query(x + i * d); + for (idx_t iik = 0; iik < nx * (idx_t)cur_nprobe; iik++) { + idx_t i = iik / (idx_t)cur_nprobe; + idx_t ik = iik % (idx_t)cur_nprobe; + if (qres == nullptr || qres->qno != i) { + qres = &pres.new_result(i); + scanner->set_query(x + i * d); + } + scan_list_func(i, ik, *qres); } - scan_list_func(i, ik, *qres); + } else { + FAISS_THROW_FMT( + "parallel_mode %d not supported\n", parallel_mode); } - } else { - FAISS_THROW_FMT("parallel_mode %d not supported\n", parallel_mode); - } - if (parallel_mode == 0) { - pres.finalize(); - } else { + if (parallel_mode == 0) { + pres.finalize(); + } else { #pragma omp barrier #pragma omp single - RangeSearchPartialResult::merge(all_pres, false); + RangeSearchPartialResult::merge(all_pres, false); #pragma omp barrier + } + } catch (const std::exception& e) { + std::lock_guard lock(exception_mutex); + exception_string = + demangle_cpp_symbol(typeid(e).name()) + " " + e.what(); + interrupt = true; } } @@ -976,6 +992,11 @@ void IndexIVF::search1( for (size_t i = 0; i < cur_nprobe; i++) { idx_t key = keys[i]; + FAISS_THROW_IF_NOT_FMT( + key < (idx_t)nlist, + "Invalid key=%" PRId64 " nlist=%zd\n", + key, + nlist); if (key < 0 || invlists->is_empty(key)) { continue; } @@ -1411,11 +1432,19 @@ size_t InvertedListScanner::iterate_codes( size_t nup = 0; list_size = 
0; + const bool has_cb = it->has_search_callbacks_; + if (!keep_max) { for (; it->is_available(); it->next()) { auto id_and_codes = it->get_id_and_codes(); float dis = distance_to_code(id_and_codes.second); + if (has_cb) { + it->on_distance_computed(id_and_codes.first, dis); + } if (dis < simi[0]) { + if (has_cb) { + it->on_heap_changed(id_and_codes.first, idxi[0]); + } maxheap_replace_top(k, simi, idxi, dis, id_and_codes.first); nup++; } @@ -1425,7 +1454,13 @@ size_t InvertedListScanner::iterate_codes( for (; it->is_available(); it->next()) { auto id_and_codes = it->get_id_and_codes(); float dis = distance_to_code(id_and_codes.second); + if (has_cb) { + it->on_distance_computed(id_and_codes.first, dis); + } if (dis > simi[0]) { + if (has_cb) { + it->on_heap_changed(id_and_codes.first, idxi[0]); + } minheap_replace_top(k, simi, idxi, dis, id_and_codes.first); nup++; } diff --git a/thirdparty/faiss/faiss/IndexIVFAdditiveQuantizer.cpp b/thirdparty/faiss/faiss/IndexIVFAdditiveQuantizer.cpp index 6f132b9ab..24b63cf3c 100644 --- a/thirdparty/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +++ b/thirdparty/faiss/faiss/IndexIVFAdditiveQuantizer.cpp @@ -181,14 +181,14 @@ struct AQInvertedListScanner : InvertedListScanner { tmp.resize(ia.d); } - const float* q0; + const float* q0 = nullptr; /// from now on we handle this query. 
void set_query(const float* query_vector) override { q0 = query_vector; } - const float* q; + const float* q = nullptr; /// following codes come from this inverted list void set_list(idx_t list_no_, float /*coarse_dis*/) override { this->list_no = list_no_; diff --git a/thirdparty/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp b/thirdparty/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp index 81ded84fe..e32e6601a 100644 --- a/thirdparty/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +++ b/thirdparty/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp @@ -129,6 +129,10 @@ IndexIVFAdditiveQuantizerFastScan::IndexIVFAdditiveQuantizerFastScan() { IndexIVFAdditiveQuantizerFastScan::~IndexIVFAdditiveQuantizerFastScan() = default; +size_t IndexIVFAdditiveQuantizerFastScan::fast_scan_code_size() const { + return M2 / 2; +} + /********************************************************* * Training *********************************************************/ diff --git a/thirdparty/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h b/thirdparty/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h index 73c2ca044..eeb699b8a 100644 --- a/thirdparty/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +++ b/thirdparty/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h @@ -70,6 +70,9 @@ struct IndexIVFAdditiveQuantizerFastScan : IndexIVFFastScan { const IndexIVFAdditiveQuantizer& orig, int bbs = 32); + /// Packed code size: M2 / 2 bytes (4-bit AQ sub-quantizer nibbles) + size_t fast_scan_code_size() const override; + void train_encoder(idx_t n, const float* x, const idx_t* assign) override; idx_t train_encoder_num_vectors() const override; diff --git a/thirdparty/faiss/faiss/IndexIVFFastScan.cpp b/thirdparty/faiss/faiss/IndexIVFFastScan.cpp index b9402ccec..2962328c0 100644 --- a/thirdparty/faiss/faiss/IndexIVFFastScan.cpp +++ b/thirdparty/faiss/faiss/IndexIVFFastScan.cpp @@ -88,7 +88,11 @@ void IndexIVFFastScan::init_fastscan( void IndexIVFFastScan::init_code_packer() { auto bil = 
dynamic_cast(invlists); - FAISS_THROW_IF_NOT(bil); + if (!bil) { + // invlists is not block-packed (e.g., when own_invlists=false). + // Nothing to do — the caller manages inverted lists externally. + return; + } delete bil->packer; // in case there was one before bil->packer = get_CodePacker(); } diff --git a/thirdparty/faiss/faiss/IndexIVFFastScan.h b/thirdparty/faiss/faiss/IndexIVFFastScan.h index 852f8f76b..fa7662852 100644 --- a/thirdparty/faiss/faiss/IndexIVFFastScan.h +++ b/thirdparty/faiss/faiss/IndexIVFFastScan.h @@ -41,14 +41,14 @@ struct Quantizer; struct IndexIVFFastScan : IndexIVF { // size of the kernel - int bbs; // set at build time + int bbs = 0; // set at build time - size_t M; - size_t nbits; - size_t ksub; + size_t M = 0; + size_t nbits = 0; + size_t ksub = 0; // M rounded up to a multiple of 2 - size_t M2; + size_t M2 = 0; // search-time implementation int implem = 0; @@ -156,7 +156,7 @@ struct IndexIVFFastScan : IndexIVF { * @param context processing context containing query factors * processor */ - void compute_LUT_uint8( + virtual void compute_LUT_uint8( size_t n, const float* x, const CoarseQuantized& cq, @@ -347,6 +347,18 @@ struct IndexIVFFastScan : IndexIVF { */ void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override; + /** Get the size of the code portion packed by pq4_pack_codes. + * + * Returns the number of bytes per vector that are interleaved into + * SIMD blocks by pq4_pack_codes, excluding any embedded metadata + * (e.g., RaBitQ factors). The meaning of these bytes depends on the + * quantizer: for PQ/AQ they are 4-bit sub-quantizer nibbles, for + * RaBitQ they are 1-bit-per-dimension sign bits packed into nibbles. + * + * Must be implemented by all derived classes. + */ + virtual size_t fast_scan_code_size() const = 0; + protected: /** Get stride for interpreting codes during SIMD packing. 
* diff --git a/thirdparty/faiss/faiss/IndexIVFFlat.cpp b/thirdparty/faiss/faiss/IndexIVFFlat.cpp index 1ada94713..f649316c5 100644 --- a/thirdparty/faiss/faiss/IndexIVFFlat.cpp +++ b/thirdparty/faiss/faiss/IndexIVFFlat.cpp @@ -66,7 +66,7 @@ void IndexIVFFlat::add_core( FAISS_THROW_IF_NOT(is_trained); FAISS_THROW_IF_NOT(coarse_idx); FAISS_THROW_IF_NOT(!by_residual); - assert(invlists); + FAISS_THROW_IF_NOT_MSG(invlists, "invlists not initialized"); direct_map.check_can_add(xids); int64_t n_add = 0; @@ -215,7 +215,7 @@ void IndexIVFFlatDedup::add_with_ids( const float* x, const idx_t* xids) { FAISS_THROW_IF_NOT(is_trained); - assert(invlists); + FAISS_THROW_IF_NOT_MSG(invlists, "invlists not initialized"); FAISS_THROW_IF_NOT_MSG( direct_map.no(), "IVFFlatDedup not implemented with direct_map"); std::unique_ptr idx(new int64_t[na]); diff --git a/thirdparty/faiss/faiss/IndexIVFFlat.h b/thirdparty/faiss/faiss/IndexIVFFlat.h index ae74794e5..1665a06e6 100644 --- a/thirdparty/faiss/faiss/IndexIVFFlat.h +++ b/thirdparty/faiss/faiss/IndexIVFFlat.h @@ -76,7 +76,7 @@ struct IVFFlatScanner : InvertedListScanner { code_size = vd.d * sizeof(float); } - const float* xi; + const float* xi = nullptr; void set_query(const float* query) override { this->xi = query; } diff --git a/thirdparty/faiss/faiss/IndexIVFPQ.cpp b/thirdparty/faiss/faiss/IndexIVFPQ.cpp index 3af6fddff..7d2a439cc 100644 --- a/thirdparty/faiss/faiss/IndexIVFPQ.cpp +++ b/thirdparty/faiss/faiss/IndexIVFPQ.cpp @@ -9,7 +9,6 @@ #include -#include #include #include #include @@ -17,7 +16,6 @@ #include -#include #include #include @@ -556,7 +554,7 @@ struct QueryTables { *****************************************************/ // field specific to query - const float* qi; + const float* qi = nullptr; // query-specific initialization void init_query(const float* qi_in) { @@ -587,8 +585,8 @@ struct QueryTables { *****************************************************/ // fields specific to list - idx_t key; - float coarse_dis; + 
idx_t key = 0; + float coarse_dis = 0.0f; std::vector q_code; uint64_t init_list_cycles; @@ -804,18 +802,18 @@ struct WrappedSearchResult { template struct IVFPQScannerT : QueryTables { using PQDecoder = typename PQCodeDist::PQDecoder; - const uint8_t* list_codes; + const uint8_t* list_codes = nullptr; const IDType* list_ids; - size_t list_size; + size_t list_size = 0; IVFPQScannerT( const IndexIVFPQ& ivfpq_in, const IVFSearchParameters* params_in) : QueryTables(ivfpq_in, params_in) { - assert(METRIC_TYPE == metric_type); + FAISS_THROW_IF_NOT(METRIC_TYPE == metric_type); } - float dis0; + float dis0 = 0.0f; void init_list(idx_t list_no, float coarse_dis_in, int mode) { this->key = list_no; @@ -1217,7 +1215,7 @@ struct IVFPQScanner : IVFPQScannerT, } float distance_to_code(const uint8_t* code) const override { - assert(precompute_mode == 2); + FAISS_THROW_IF_NOT(precompute_mode == 2); float dis = this->dis0 + PQCodeDist::distance_single_code( this->pq.M, this->pq.nbits, this->sim_table, code); @@ -1236,7 +1234,7 @@ struct IVFPQScanner : IVFPQScannerT, handler); if (this->polysemous_ht > 0) { - assert(precompute_mode == 2); + FAISS_THROW_IF_NOT(precompute_mode == 2); this->scan_list_polysemous(ncode, codes, res); } else if (precompute_mode == 2) { this->scan_list_with_table(ncode, codes, res); diff --git a/thirdparty/faiss/faiss/IndexIVFPQFastScan.cpp b/thirdparty/faiss/faiss/IndexIVFPQFastScan.cpp index 1fa49d1a3..bcfdf41c1 100644 --- a/thirdparty/faiss/faiss/IndexIVFPQFastScan.cpp +++ b/thirdparty/faiss/faiss/IndexIVFPQFastScan.cpp @@ -8,7 +8,6 @@ #include #include -#include #include #include @@ -118,6 +117,10 @@ IndexIVFPQFastScan::IndexIVFPQFastScan(const IndexIVFPQ& orig, int bbs_in) orig_invlists = orig.invlists; } +size_t IndexIVFPQFastScan::fast_scan_code_size() const { + return M2 / 2; +} + /********************************************************* * Training *********************************************************/ @@ -187,16 +190,19 @@ void 
IndexIVFPQFastScan::encode_vectors( * Look-Up Table functions *********************************************************/ +// Explicit SIMD-level alias (no global bare aliases). +using simd8float32 = simd8float32_tpl; + void fvec_madd_simd( size_t n, const float* a, float bf, const float* b, float* c) { - assert(is_aligned_pointer(a)); - assert(is_aligned_pointer(b)); - assert(is_aligned_pointer(c)); - assert(n % 8 == 0); + FAISS_THROW_IF_NOT_MSG(is_aligned_pointer(a), "pointer a is not aligned"); + FAISS_THROW_IF_NOT_MSG(is_aligned_pointer(b), "pointer b is not aligned"); + FAISS_THROW_IF_NOT_MSG(is_aligned_pointer(c), "pointer c is not aligned"); + FAISS_THROW_IF_NOT_MSG(n % 8 == 0, "n must be a multiple of 8"); simd8float32 bf8(bf); n /= 8; for (size_t i = 0; i < n; i++) { @@ -313,7 +319,8 @@ namespace { struct IVFPQFastScanScanner : InvertedListScanner { using InvertedListScanner::scan_codes; - static constexpr int impl = 10; // based on search_implem_10 + [[maybe_unused]] static constexpr int impl = + 10; // based on search_implem_10 static constexpr size_t nq = 1; // 1 query at a time. 
const IndexIVFPQFastScan& index; AlignedTable dis_tables; diff --git a/thirdparty/faiss/faiss/IndexIVFPQFastScan.h b/thirdparty/faiss/faiss/IndexIVFPQFastScan.h index b47a5e105..bd1cdbfe6 100644 --- a/thirdparty/faiss/faiss/IndexIVFPQFastScan.h +++ b/thirdparty/faiss/faiss/IndexIVFPQFastScan.h @@ -55,6 +55,9 @@ struct IndexIVFPQFastScan : IndexIVFFastScan { // built from an IndexIVFPQ explicit IndexIVFPQFastScan(const IndexIVFPQ& orig, int bbs = 32); + /// Packed code size: M2 / 2 bytes (4-bit PQ sub-quantizer nibbles) + size_t fast_scan_code_size() const override; + void train_encoder(idx_t n, const float* x, const idx_t* assign) override; idx_t train_encoder_num_vectors() const override; diff --git a/thirdparty/faiss/faiss/IndexIVFPQR.cpp b/thirdparty/faiss/faiss/IndexIVFPQR.cpp index d0d78ba5c..e0fc34dd0 100644 --- a/thirdparty/faiss/faiss/IndexIVFPQR.cpp +++ b/thirdparty/faiss/faiss/IndexIVFPQR.cpp @@ -176,9 +176,12 @@ void IndexIVFPQR::search_preassigned( int list_no = lo_listno(sl); int ofs = lo_offset(sl); - assert(list_no >= 0 && static_cast(list_no) < nlist); - assert(ofs >= 0 && - static_cast(ofs) < invlists->list_size(list_no)); + FAISS_THROW_IF_NOT( + list_no >= 0 && static_cast(list_no) < nlist); + FAISS_THROW_IF_NOT( + ofs >= 0 && + static_cast(ofs) < + invlists->list_size(list_no)); // 1st level residual quantizer->compute_residual(xq, residual_1.get(), list_no); @@ -193,7 +196,7 @@ void IndexIVFPQR::search_preassigned( // 3rd level residual's approximation idx_t id = invlists->get_single_id(list_no, ofs); - assert(0 <= id && id < ntotal); + FAISS_THROW_IF_NOT(0 <= id && id < ntotal); refine_pq.decode( &refine_codes[id * refine_pq.code_size], residual_1.get()); @@ -220,7 +223,7 @@ void IndexIVFPQR::reconstruct_from_offset( IndexIVFPQ::reconstruct_from_offset(list_no, offset, recons); idx_t id = invlists->get_single_id(list_no, offset); - assert(0 <= id && id < ntotal); + FAISS_THROW_IF_NOT(0 <= id && id < ntotal); std::vector r3(d); 
refine_pq.decode(&refine_codes[id * refine_pq.code_size], r3.data()); diff --git a/thirdparty/faiss/faiss/IndexIVFRaBitQ.cpp b/thirdparty/faiss/faiss/IndexIVFRaBitQ.cpp index c520dfea3..d651f66c6 100644 --- a/thirdparty/faiss/faiss/IndexIVFRaBitQ.cpp +++ b/thirdparty/faiss/faiss/IndexIVFRaBitQ.cpp @@ -231,12 +231,6 @@ struct RaBitInvertedListScanner : InvertedListScanner { // Multi-bit: Two-stage search with adaptive filtering size_t nup = 0; - // Stats tracking for multi-bit two-stage search - // n_1bit_evaluations: candidates evaluated using 1-bit lower bound - // n_multibit_evaluations: candidates requiring full multi-bit distance - size_t local_1bit_evaluations = 0; - size_t local_multibit_evaluations = 0; - for (size_t j = 0; j < list_size; j++) { if (sel != nullptr) { int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; @@ -246,16 +240,8 @@ struct RaBitInvertedListScanner : InvertedListScanner { } } - local_1bit_evaluations++; - - // Stage 1: Compute distance bound using 1-bit codes - // For L2 (min-heap): use lower_bound to safely skip if it's - // already worse than heap worst - // For IP (max-heap): use upper_bound because with a lower bound, - // we can't safely skip any candidate float est_distance = rabitq_dc->distance_to_code_1bit(codes); - // Extract f_error and g_error for filtering size_t code_size_base = (ivf_rabitq.d + 7) / 8; const rabitq_utils::SignBitFactorsWithError* base_fac = reinterpret_cast< @@ -269,8 +255,6 @@ struct RaBitInvertedListScanner : InvertedListScanner { handler.threshold, keep_max); if (should_refine) { - local_multibit_evaluations++; - // Lower bound is promising, compute full distance float dis = distance_to_code(codes); int64_t id = store_pairs ? 
lo_build(list_no, j) : ids[j]; @@ -281,12 +265,6 @@ struct RaBitInvertedListScanner : InvertedListScanner { codes += code_size; } - // Update global stats atomically -#pragma omp atomic - rabitq_stats.n_1bit_evaluations += local_1bit_evaluations; -#pragma omp atomic - rabitq_stats.n_multibit_evaluations += local_multibit_evaluations; - return nup; } diff --git a/thirdparty/faiss/faiss/IndexIVFRaBitQ.h b/thirdparty/faiss/faiss/IndexIVFRaBitQ.h index a70507006..8a39a8c94 100644 --- a/thirdparty/faiss/faiss/IndexIVFRaBitQ.h +++ b/thirdparty/faiss/faiss/IndexIVFRaBitQ.h @@ -13,7 +13,6 @@ #include #include -#include #include namespace faiss { diff --git a/thirdparty/faiss/faiss/IndexIVFRaBitQFastScan.cpp b/thirdparty/faiss/faiss/IndexIVFRaBitQFastScan.cpp index 6e76bb399..2ae1ef6c6 100644 --- a/thirdparty/faiss/faiss/IndexIVFRaBitQFastScan.cpp +++ b/thirdparty/faiss/faiss/IndexIVFRaBitQFastScan.cpp @@ -107,6 +107,10 @@ size_t IndexIVFRaBitQFastScan::compute_per_vector_storage_size() const { return rabitq_utils::compute_per_vector_storage_size(rabitq.nb_bits, d); } +size_t IndexIVFRaBitQFastScan::fast_scan_code_size() const { + return (d + 7) / 8; +} + size_t IndexIVFRaBitQFastScan::code_packing_stride() const { // Use code_size as stride to skip embedded factor data during packing return code_size; @@ -267,85 +271,133 @@ bool IndexIVFRaBitQFastScan::lookup_table_is_3d() const { return true; } +// out[code] = base + sum of v_i for each set bit in code. 
+inline void write_subset_sum_lut( + float* out, + float base, + float v0, + float v1, + float v2, + float v3) { + out[0] = base; + out[1] = base + v0; + out[2] = base + v1; + out[3] = base + v0 + v1; + out[4] = base + v2; + out[5] = base + v0 + v2; + out[6] = base + v1 + v2; + out[7] = base + v0 + v1 + v2; + out[8] = base + v3; + out[9] = base + v0 + v3; + out[10] = base + v1 + v3; + out[11] = base + v0 + v1 + v3; + out[12] = base + v2 + v3; + out[13] = base + v0 + v2 + v3; + out[14] = base + v1 + v2 + v3; + out[15] = base + v0 + v1 + v2 + v3; +} + // Computes lookup table for residual vectors in RaBitQ FastScan format void IndexIVFRaBitQFastScan::compute_residual_LUT( - const float* residual, + const float* query, + idx_t centroid_id, QueryFactorsData& query_factors, float* lut_out, - const float* original_query) const { - FAISS_THROW_IF_NOT(qb > 0 && qb <= 8); - - std::vector rotated_q(d); - std::vector rotated_qq(d); + uint8_t qb_param, + bool centered_param, + std::vector& rotated_q, + std::vector& centroid_buf) const { + const size_t d_val = static_cast(d); + FAISS_THROW_IF_NOT(d_val > 0); + rotated_q.resize(d_val); + centroid_buf.resize(d_val); + std::vector rotated_qq(d_val); + + // Compute residual + quantizer->reconstruct(centroid_id, centroid_buf.data()); + for (size_t i = 0; i < d_val; i++) { + rotated_q[i] = query[i] - centroid_buf[i]; + } - // Use RaBitQUtils to compute query factors - eliminates code duplication + // Compute query factors using shared utility query_factors = rabitq_utils::compute_query_factors( - residual, - d, + rotated_q.data(), + d_val, nullptr, - qb, - centered, + qb_param, + centered_param, metric_type, rotated_q, rotated_qq); - if (metric_type == MetricType::METRIC_INNER_PRODUCT && - original_query != nullptr) { - query_factors.qr_norm_L2sqr = fvec_norm_L2sqr(original_query, d); - query_factors.q_dot_c = query_factors.qr_norm_L2sqr - - fvec_inner_product(original_query, residual, d); + if (metric_type == 
MetricType::METRIC_INNER_PRODUCT) { + query_factors.qr_norm_L2sqr = fvec_norm_L2sqr(query, d_val); + query_factors.q_dot_c = + fvec_inner_product(query, centroid_buf.data(), d_val); } - const size_t ex_bits = rabitq.nb_bits - 1; - if (ex_bits > 0) { + if (rabitq.nb_bits > 1) { query_factors.rotated_q = rotated_q; } - if (centered) { - const float max_code_value = (1 << qb) - 1; - - for (size_t m = 0; m < M; m++) { - const size_t dim_start = m * 4; - - for (int code_val = 0; code_val < 16; code_val++) { - float xor_contribution = 0.0f; + // Build LUT using branchless subset-sum construction + const size_t d_sz = d_val; - for (size_t dim_offset = 0; dim_offset < 4; dim_offset++) { - const size_t dim_idx = dim_start + dim_offset; + if (centered_param) { + const float mcv = static_cast((1 << qb_param) - 1); - if (dim_idx < static_cast(d)) { - const bool db_bit = (code_val >> dim_offset) & 1; - const float query_value = rotated_qq[dim_idx]; - - xor_contribution += db_bit - ? (max_code_value - query_value) - : query_value; - } - } - - lut_out[m * 16 + code_val] = xor_contribution; + for (size_t m = 0; m < M; m++) { + const size_t ds = m * 4; + float* out = lut_out + m * 16; + + float base = 0.0f; + float v0 = 0.0f, v1 = 0.0f, v2 = 0.0f, v3 = 0.0f; + if (ds + 0 < d_sz) { + float q = rotated_qq[ds + 0]; + base += q; + v0 = mcv - 2.0f * q; } + if (ds + 1 < d_sz) { + float q = rotated_qq[ds + 1]; + base += q; + v1 = mcv - 2.0f * q; + } + if (ds + 2 < d_sz) { + float q = rotated_qq[ds + 2]; + base += q; + v2 = mcv - 2.0f * q; + } + if (ds + 3 < d_sz) { + float q = rotated_qq[ds + 3]; + base += q; + v3 = mcv - 2.0f * q; + } + + write_subset_sum_lut(out, base, v0, v1, v2, v3); } } else { - for (size_t m = 0; m < M; m++) { - const size_t dim_start = m * 4; + const float c1 = query_factors.c1; + const float c2 = query_factors.c2; - for (int code_val = 0; code_val < 16; code_val++) { - float inner_product = 0.0f; - int popcount = 0; - - for (size_t dim_offset = 0; dim_offset < 4; 
dim_offset++) { - const size_t dim_idx = dim_start + dim_offset; + for (size_t m = 0; m < M; m++) { + const size_t ds = m * 4; + float* out = lut_out + m * 16; - if (dim_idx < static_cast(d) && - ((code_val >> dim_offset) & 1)) { - inner_product += rotated_qq[dim_idx]; - popcount++; - } - } - lut_out[m * 16 + code_val] = query_factors.c1 * inner_product + - query_factors.c2 * popcount; + float v0 = 0.0f, v1 = 0.0f, v2 = 0.0f, v3 = 0.0f; + if (ds + 0 < d_sz) { + v0 = c1 * rotated_qq[ds + 0] + c2; + } + if (ds + 1 < d_sz) { + v1 = c1 * rotated_qq[ds + 1] + c2; } + if (ds + 2 < d_sz) { + v2 = c1 * rotated_qq[ds + 2] + c2; + } + if (ds + 3 < d_sz) { + v3 = c1 * rotated_qq[ds + 3] + c2; + } + + write_subset_sum_lut(out, 0.0f, v0, v1, v2, v3); } } } @@ -368,15 +420,24 @@ void IndexIVFRaBitQFastScan::search_preassigned( FAISS_THROW_IF_NOT_MSG(!stats, "stats not supported for this index"); size_t cur_nprobe = this->nprobe; + uint8_t used_qb = qb; + bool used_centered = centered; if (params) { FAISS_THROW_IF_NOT(params->max_codes == 0); cur_nprobe = params->nprobe; + if (auto rparams = + dynamic_cast(params)) { + used_qb = rparams->qb; + used_centered = rparams->centered; + } } std::vector query_factors_storage(n * cur_nprobe); FastScanDistancePostProcessing context; context.query_factors = query_factors_storage.data(); context.nprobe = cur_nprobe; + context.qb = used_qb; + context.centered = used_centered; const CoarseQuantized cq = {cur_nprobe, centroid_dis, assign}; search_dispatch_implem(n, x, k, distances, labels, cq, context, params); @@ -392,6 +453,10 @@ void IndexIVFRaBitQFastScan::compute_LUT( FAISS_THROW_IF_NOT(is_trained); FAISS_THROW_IF_NOT(by_residual); + // Use overridden qb/centered from context if provided, else index defaults + const uint8_t used_qb = context.qb > 0 ? context.qb : qb; + const bool used_centered = context.qb > 0 ? 
context.centered : centered; + size_t cq_nprobe = cq.nprobe; size_t dim12 = 16 * M; @@ -402,34 +467,145 @@ void IndexIVFRaBitQFastScan::compute_LUT( if (n * cq_nprobe > 0) { memset(biases.get(), 0, sizeof(float) * n * cq_nprobe); } - std::unique_ptr xrel(new float[n * cq_nprobe * d]); #pragma omp parallel for if (n * cq_nprobe > 1000) for (idx_t ij = 0; ij < static_cast(n * cq_nprobe); ij++) { idx_t i = ij / cq_nprobe; - float* xij = &xrel[ij * d]; idx_t cij = cq.ids[ij]; if (cij >= 0) { - quantizer->compute_residual(x + i * d, xij, cij); - - // Create QueryFactorsData for this query-list combination + std::vector rotated_q(d); + std::vector centroid_buf(d); QueryFactorsData query_factors_data; compute_residual_LUT( - xij, + x + i * d, + cij, query_factors_data, dis_tables.get() + ij * dim12, - x + i * d); + used_qb, + used_centered, + rotated_q, + centroid_buf); - // Store query factors using compact indexing (ij directly) if (context.query_factors != nullptr) { context.query_factors[ij] = query_factors_data; } } else { - memset(xij, -1, sizeof(float) * d); - memset(dis_tables.get() + ij * dim12, -1, sizeof(float) * dim12); + memset(dis_tables.get() + ij * dim12, 0, sizeof(float) * dim12); + } + } +} + +void IndexIVFRaBitQFastScan::compute_LUT_uint8( + size_t n, + const float* x, + const CoarseQuantized& cq, + AlignedTable& dis_tables, + AlignedTable& biases, + float* normalizers, + const FastScanDistancePostProcessing& context) const { + FAISS_THROW_IF_NOT(is_trained); + FAISS_THROW_IF_NOT(by_residual); + + const uint8_t used_qb = context.qb > 0 ? context.qb : qb; + const bool used_centered = context.qb > 0 ? 
context.centered : centered; + const size_t cur_nprobe = cq.nprobe; + const size_t dim12 = 16 * M; + const size_t dim12_2 = 16 * M2; + + // Allocate only the uint8 output table (no full float table) + dis_tables.resize(n * cur_nprobe * dim12_2); + biases.resize(n * cur_nprobe); + +#pragma omp parallel if (n > 1) + { + // Per-thread buffers reused across queries + AlignedTable lut_float(cur_nprobe * dim12); + std::vector rotated_q(d); + std::vector centroid_buf(d); + std::vector all_mins(cur_nprobe * M); + std::vector probe_b(cur_nprobe); + +#pragma omp for schedule(dynamic) + for (int64_t i = 0; i < static_cast(n); i++) { + const float* xi = x + i * d; + + // Compute float LUT for all probes using fused path + for (size_t j = 0; j < cur_nprobe; j++) { + const size_t ij = i * cur_nprobe + j; + idx_t cij = cq.ids[ij]; + + if (cij >= 0) { + QueryFactorsData qf; + compute_residual_LUT( + xi, + cij, + qf, + lut_float.get() + j * dim12, + used_qb, + used_centered, + rotated_q, + centroid_buf); + + if (context.query_factors != nullptr) { + context.query_factors[ij] = qf; + } + } else { + memset(lut_float.get() + j * dim12, + 0, + sizeof(float) * dim12); + } + } + + // Quantize float LUT to uint8 inline. + // Mirrors quantize_LUT_and_bias 3D path with zero biases. + // Single pass: find per-sub-q mins, max span, and per-probe b. 
+ float glob_max_span = -HUGE_VAL; + float glob_max_dis = -HUGE_VAL; + float glob_b = HUGE_VAL; + for (size_t j2 = 0; j2 < cur_nprobe; j2++) { + float b_j = 0; + float span_j = 0; + for (size_t m = 0; m < M; m++) { + const float* tab = lut_float.get() + j2 * dim12 + m * ksub; + float mn = tab[0], mx = tab[0]; + for (size_t s = 1; s < ksub; s++) { + mn = std::min(mn, tab[s]); + mx = std::max(mx, tab[s]); + } + all_mins[j2 * M + m] = mn; + float span = mx - mn; + glob_max_span = std::max(glob_max_span, span); + b_j += mn; + span_j += span; + } + probe_b[j2] = b_j; + glob_max_dis = std::max(glob_max_dis, span_j); + glob_b = std::min(glob_b, b_j); + } + float a = std::min(255.0f / glob_max_span, 65535.0f / glob_max_dis); + + // Second pass: quantize LUT and compute biasq + uint8_t* out_base = dis_tables.get() + i * cur_nprobe * dim12_2; + uint16_t* bq = biases.get() + i * cur_nprobe; + for (size_t j2 = 0; j2 < cur_nprobe; j2++) { + for (size_t m = 0; m < M; m++) { + const float* tab = lut_float.get() + j2 * dim12 + m * ksub; + float mn = all_mins[j2 * M + m]; + uint8_t* out = out_base + j2 * dim12_2 + m * ksub; + for (size_t s = 0; s < ksub; s++) { + out[s] = static_cast( + std::roundf(a * (tab[s] - mn))); + } + } + memset(out_base + j2 * dim12_2 + M * ksub, 0, (M2 - M) * ksub); + bq[j2] = static_cast( + std::roundf(a * (probe_b[j2] - glob_b))); + } + normalizers[2 * i] = a; + normalizers[2 * i + 1] = glob_b; } } } @@ -567,71 +743,127 @@ std::unique_ptr IndexIVFRaBitQFastScan::make_knn_scanner( namespace { /// Provides IVF scanner interface using FastScan's SIMD batch processing. +/// Buffers are allocated once and reused across set_list + scan_codes calls. 
struct IVFRaBitQFastScanScanner : InvertedListScanner { using InvertedListScanner::scan_codes; - static constexpr int impl = 10; static constexpr size_t nq = 1; const IndexIVFRaBitQFastScan& index; + const uint8_t qb; + const bool centered; + + const float* xi = nullptr; + // Reusable buffers (allocated once in constructor) AlignedTable dis_tables; AlignedTable biases; - /// [scale, offset] for converting uint16 to float std::array normalizers{}; - - const float* xi = nullptr; - + AlignedTable lut_float; + std::vector rotated_q; + std::vector centroid_buf; QueryFactorsData query_factors; FastScanDistancePostProcessing context; + std::vector probe_map; + std::vector mins_buf; + // Distance computer for distance_to_code (created in set_list) std::unique_ptr dc; - std::vector centroid; IVFRaBitQFastScanScanner( const IndexIVFRaBitQFastScan& index_in, bool store_pairs_in, - const IDSelector* sel_in) - : InvertedListScanner(store_pairs_in, sel_in), index(index_in) { + const IDSelector* sel_in, + uint8_t qb_in, + bool centered_in) + : InvertedListScanner(store_pairs_in, sel_in), + index(index_in), + qb(qb_in), + centered(centered_in), + lut_float(16 * index_in.M), + rotated_q(index_in.d), + centroid_buf(index_in.d), + probe_map({0}), + mins_buf(index_in.M) { this->keep_max = is_similarity_metric(index_in.metric_type); + this->code_size = index_in.code_size; + + // Pre-allocate output tables for single probe + dis_tables.resize(16 * index_in.M2); + biases.resize(1); + + // Set up context once + context.query_factors = &query_factors; + context.nprobe = 1; + context.qb = qb; + context.centered = centered; } void set_query(const float* query) override { this->xi = query; } - void set_list(idx_t list_no_in, float coarse_dis_in) override { + void set_list(idx_t list_no_in, float /*coarse_dis_in*/) override { this->list_no = list_no_in; - IndexIVFFastScan::CoarseQuantized cq{ - .nprobe = 1, - .dis = &coarse_dis_in, - .ids = &list_no_in, - }; + index.compute_residual_LUT( + xi, 
+ list_no_in, + query_factors, + lut_float.get(), + qb, + centered, + rotated_q, + centroid_buf); + + // Single-probe quantization (simplified inline, no OMP, no 3D) + const size_t M = index.M; + const size_t M2 = index.M2; + const size_t ksub = index.ksub; + + float max_span = -HUGE_VAL; + float max_dis = 0; + float b = 0; + float* mins = mins_buf.data(); - // Set up context for use in scan_codes - context = FastScanDistancePostProcessing{}; - context.query_factors = &query_factors; - context.nprobe = 1; + for (size_t m = 0; m < M; m++) { + const float* tab = lut_float.get() + m * ksub; + float mn = tab[0], mx = tab[0]; + for (size_t s = 1; s < ksub; s++) { + mn = std::min(mn, tab[s]); + mx = std::max(mx, tab[s]); + } + mins[m] = mn; + float span = mx - mn; + max_span = std::max(max_span, span); + max_dis += span; + b += mn; + } - index.compute_LUT_uint8( - 1, xi, cq, dis_tables, biases, &normalizers[0], context); + float a = std::min(255.0f / max_span, 65535.0f / max_dis); + uint8_t* out = dis_tables.get(); + for (size_t m = 0; m < M; m++) { + const float* tab = lut_float.get() + m * ksub; + for (size_t s = 0; s < ksub; s++) { + out[m * ksub + s] = static_cast( + std::roundf(a * (tab[s] - mins[m]))); + } + } + memset(out + M * ksub, 0, (M2 - M) * ksub); + biases[0] = 0; + normalizers[0] = a; + normalizers[1] = b; - // Set up distance computer for distance_to_code - centroid.resize(index.d); - index.quantizer->reconstruct(list_no, centroid.data()); + // Create distance computer (reuses centroid_buf from + // compute_residual_LUT) dc.reset(index.rabitq.get_distance_computer( - index.qb, centroid.data(), index.centered)); + qb, centroid_buf.data(), centered)); dc->set_query(xi); } float distance_to_code(const uint8_t* code) const override { - FAISS_THROW_IF_NOT_MSG( - dc, - "set_query and set_list must be called before distance_to_code"); return dc->distance_to_code(code); } - public: size_t scan_codes( size_t ntotal, const uint8_t* codes, @@ -639,8 +871,6 @@ struct 
IVFRaBitQFastScanScanner : InvertedListScanner { float* distances, idx_t* labels, size_t k) const override { - // initialize the current iteration heap to the worst possible value of - // the prior loop std::vector curr_dists(k, distances[0]); std::vector curr_labels(k, labels[0]); @@ -658,15 +888,14 @@ struct IVFRaBitQFastScanScanner : InvertedListScanner { int qmap1[1] = {0}; handler->q_map = qmap1; handler->begin(&normalizers[0]); - - const uint8_t* LUT = dis_tables.get(); handler->dbias = biases.get(); handler->ntotal = ntotal; handler->id_map = ids; - // RaBitQ needs list context for factor lookup - std::vector probe_map = {0}; handler->set_list_context(list_no, probe_map); + if (!handler->list_codes_ptr) { + handler->list_codes_ptr = codes; + } scanner->accumulate_loop( 1, @@ -674,11 +903,10 @@ struct IVFRaBitQFastScanScanner : InvertedListScanner { index.bbs, static_cast(index.M2), codes, - LUT, + dis_tables.get(), 0, index.get_block_stride()); - // Combine results across iterations handler->end(); if (keep_max) { minheap_addn( @@ -697,7 +925,6 @@ struct IVFRaBitQFastScanScanner : InvertedListScanner { curr_labels.data(), k); } - return handler->num_updates(); } }; @@ -707,8 +934,16 @@ struct IVFRaBitQFastScanScanner : InvertedListScanner { InvertedListScanner* IndexIVFRaBitQFastScan::get_InvertedListScanner( bool store_pairs, const IDSelector* sel, - const IVFSearchParameters*) const { - return new IVFRaBitQFastScanScanner(*this, store_pairs, sel); + const IVFSearchParameters* search_params_in) const { + uint8_t used_qb = qb; + bool used_centered = centered; + if (auto params = dynamic_cast( + search_params_in)) { + used_qb = params->qb; + used_centered = params->centered; + } + return new IVFRaBitQFastScanScanner( + *this, store_pairs, sel, used_qb, used_centered); } } // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexIVFRaBitQFastScan.h b/thirdparty/faiss/faiss/IndexIVFRaBitQFastScan.h index 40368479b..b5979e938 100644 --- 
a/thirdparty/faiss/faiss/IndexIVFRaBitQFastScan.h +++ b/thirdparty/faiss/faiss/IndexIVFRaBitQFastScan.h @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -83,6 +82,10 @@ struct IndexIVFRaBitQFastScan : IndexIVFFastScan { uint8_t* codes, bool include_listnos = false) const override; + /// Packed code size: (d + 7) / 8 bytes (1-bit-per-dimension sign bits, + /// excluding factors) + size_t fast_scan_code_size() const override; + protected: /// Return code_size as stride to skip embedded factor data during packing size_t code_packing_stride() const override; @@ -108,15 +111,29 @@ struct IndexIVFRaBitQFastScan : IndexIVFFastScan { /// Compute per-vector auxiliary storage size based on nb_bits size_t compute_per_vector_storage_size() const; - private: - /// Compute query factors and lookup table for a residual vector - /// (similar to IndexRaBitQFastScan::compute_float_LUT) + /// Override: compute and quantize LUT per-query to avoid O(n*nprobe*M*16) + /// float table allocation. + void compute_LUT_uint8( + size_t n, + const float* x, + const CoarseQuantized& cq, + AlignedTable& dis_tables, + AlignedTable& biases, + float* normalizers, + const FastScanDistancePostProcessing& context) const override; + + /// Compute residual, query factors, and float LUT in two passes over d. 
void compute_residual_LUT( - const float* residual, + const float* query, + idx_t centroid_id, QueryFactorsData& query_factors, float* lut_out, - const float* original_query = nullptr) const; + uint8_t qb_param, + bool centered_param, + std::vector& rotated_q, + std::vector& centroid_buf) const; + private: /// Decode FastScan code to RaBitQ residual vector with explicit /// dp_multiplier void decode_fastscan_to_residual( @@ -200,8 +217,7 @@ IVFRaBitQHeapHandler::IVFRaBitQHeapHandler( storage_size(idx->compute_per_vector_storage_size()), packed_block_size(((idx->M2 + 1) / 2) * idx->bbs), full_block_size(idx->get_block_stride()), - packer(idx->get_CodePacker()), - unpack_buf(idx->code_size) { + unpack_buf((idx->d + 7) / 8) { current_list_no = 0; probe_indices.clear(); for (int64_t q = 0; q < static_cast(nq); q++) { @@ -209,6 +225,9 @@ IVFRaBitQHeapHandler::IVFRaBitQHeapHandler( } } +// Explicit alias — must match SIMDResultHandler::handle() signature. +using simd16uint16 = simd16uint16_tpl; + template void IVFRaBitQHeapHandler::handle( size_t q, @@ -237,23 +256,31 @@ void IVFRaBitQHeapHandler::handle( "Query factors not available: FastScanDistancePostProcessing with query_factors required"); } - size_t probe_rank = probe_indices[local_q]; - size_t nprobe = context->nprobe > 0 ? context->nprobe : index->nprobe; - size_t storage_idx = q * nprobe + probe_rank; + const size_t probe_rank = probe_indices[local_q]; + const size_t storage_idx = q * cached_nprobe + probe_rank; const auto& query_factors = context->query_factors[storage_idx]; const float one_a = this->normalizers ? (1.0f / this->normalizers[2 * q]) : 1.0f; const float bias = this->normalizers ? 
this->normalizers[2 * q + 1] : 0.0f; - uint64_t idx_base = this->j0 + b * 32; + const uint64_t idx_base = this->j0 + b * 32; if (idx_base >= this->ntotal) { return; } - size_t max_positions = std::min(32, this->ntotal - idx_base); + const size_t max_positions = std::min(32, this->ntotal - idx_base); + + // Hoist aux pointer base out of loop: all 32 elements in this block share + // the same block base. Only the per-element offset (j * storage_size) + // varies. + const uint8_t* aux_base = this->list_codes_ptr + + (idx_base / index->bbs) * full_block_size + packed_block_size; - size_t local_1bit_evaluations = 0; - size_t local_multibit_evaluations = 0; + // Cache index fields used in the inner loop. + // Use overridden qb/centered from context if provided, else index defaults. + const bool centered = context->qb > 0 ? context->centered : index->centered; + const size_t qb = context->qb > 0 ? context->qb : index->qb; + const size_t d = index->d; for (size_t j = 0; j < max_positions; j++) { const int64_t result_id = this->adjust_id(b, j); @@ -267,16 +294,9 @@ void IVFRaBitQHeapHandler::handle( this->scan_cnt++; const float normalized_distance = d32tab[j] * one_a + bias; - const uint8_t* base_ptr = rabitq_utils::get_block_aux_ptr( - list_codes_ptr, - idx_base + j, - index->bbs, - packed_block_size, - full_block_size, - storage_size); + const uint8_t* base_ptr = aux_base + j * storage_size; if (is_multibit) { - local_1bit_evaluations++; const SignBitFactorsWithError& full_factors = *reinterpret_cast(base_ptr); @@ -284,12 +304,10 @@ void IVFRaBitQHeapHandler::handle( normalized_distance, full_factors, query_factors, - index->centered, - index->qb, - index->d); + centered, + qb, + d); - const bool is_similarity = - index->metric_type == MetricType::METRIC_INNER_PRODUCT; bool should_refine = rabitq_utils::should_refine_candidate( dist_1bit, full_factors.f_error, @@ -297,10 +315,9 @@ void IVFRaBitQHeapHandler::handle( heap_dis[0], is_similarity); if (should_refine) { - 
local_multibit_evaluations++; - size_t local_offset = this->j0 + b * 32 + j; + size_t local_offset = idx_base + j; float dist_full = compute_full_multibit_distance( - result_id, local_q, q, local_offset); + local_q, q, local_offset, base_ptr); if (Cfloat::cmp(heap_dis[0], dist_full)) { heap_replace_top( k, heap_dis, heap_ids, dist_full, result_id); @@ -315,9 +332,9 @@ void IVFRaBitQHeapHandler::handle( normalized_distance, db_factors, query_factors, - index->centered, - index->qb, - index->d); + centered, + qb, + d); if (Cfloat::cmp(heap_dis[0], adjusted_distance)) { heap_replace_top( k, heap_dis, heap_ids, adjusted_distance, result_id); @@ -325,11 +342,6 @@ void IVFRaBitQHeapHandler::handle( } } } - -#pragma omp atomic - rabitq_stats.n_1bit_evaluations += local_1bit_evaluations; -#pragma omp atomic - rabitq_stats.n_multibit_evaluations += local_multibit_evaluations; } template @@ -338,7 +350,12 @@ void IVFRaBitQHeapHandler::set_list_context( const std::vector& probe_map) { current_list_no = list_no; probe_indices = probe_map; - list_codes_ptr = index->invlists->get_codes(list_no); + cached_nprobe = + context && context->nprobe > 0 ? 
context->nprobe : index->nprobe; + is_similarity = index->metric_type == MetricType::METRIC_INNER_PRODUCT; + if (index->invlists) { + this->list_codes_ptr = index->invlists->get_codes(list_no); + } } template @@ -356,44 +373,36 @@ void IVFRaBitQHeapHandler::end() { template float IVFRaBitQHeapHandler::compute_full_multibit_distance( - size_t /*db_idx*/, size_t local_q, size_t global_q, - size_t local_offset) { + size_t local_offset, + const uint8_t* aux_ptr) { const size_t ex_bits = index->rabitq.nb_bits - 1; const size_t dim = index->d; - const uint8_t* base_ptr = rabitq_utils::get_block_aux_ptr( - list_codes_ptr, - local_offset, - index->bbs, - packed_block_size, - full_block_size, - storage_size); - const size_t ex_code_size = (dim * ex_bits + 7) / 8; - const uint8_t* ex_code = base_ptr + sizeof(SignBitFactorsWithError); + const uint8_t* ex_code = aux_ptr + sizeof(SignBitFactorsWithError); const ExtraBitsFactors& ex_fac = *reinterpret_cast( - base_ptr + sizeof(SignBitFactorsWithError) + ex_code_size); + aux_ptr + sizeof(SignBitFactorsWithError) + ex_code_size); - size_t probe_rank = probe_indices[local_q]; - size_t nprobe_val = context->nprobe > 0 ? context->nprobe : index->nprobe; - size_t storage_idx_val = global_q * nprobe_val + probe_rank; + const size_t probe_rank = probe_indices[local_q]; + const size_t storage_idx_val = global_q * cached_nprobe + probe_rank; const auto& query_factors = context->query_factors[storage_idx_val]; - // Use list_codes_ptr (already set by set_list_context) and the - // pre-allocated unpack_buf to avoid per-refinement ScopedCodes - // re-acquisition and heap allocation. 
- packer->unpack_1(list_codes_ptr, local_offset, unpack_buf.data()); + rabitq_utils::unpack_sign_bits_from_packed( + this->list_codes_ptr, + index->bbs, + index->M2, + local_offset, + full_block_size, + unpack_buf.data()); return rabitq_utils::compute_full_multibit_distance( unpack_buf.data(), ex_code, ex_fac, query_factors.rotated_q.data(), - (index->metric_type == MetricType::METRIC_INNER_PRODUCT) - ? query_factors.q_dot_c - : query_factors.qr_to_c_L2sqr, + is_similarity ? query_factors.q_dot_c : query_factors.qr_to_c_L2sqr, dim, ex_bits, index->metric_type); diff --git a/thirdparty/faiss/faiss/IndexIVFSpectralHash.cpp b/thirdparty/faiss/faiss/IndexIVFSpectralHash.cpp index f0e53d36e..523ef8afb 100644 --- a/thirdparty/faiss/faiss/IndexIVFSpectralHash.cpp +++ b/thirdparty/faiss/faiss/IndexIVFSpectralHash.cpp @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -159,7 +160,7 @@ void binarize_with_freq( memset(codes, 0, (nbit + 7) / 8); for (size_t i = 0; i < nbit; i++) { float xf = (x[i] - c[i]); - int64_t xi = int64_t(floor(xf * freq)); + int64_t xi = int64_t(std::floor(xf * freq)); int64_t bit = xi & 1; codes[i >> 3] |= bit << (i & 7); } diff --git a/thirdparty/faiss/faiss/IndexLSH.cpp b/thirdparty/faiss/faiss/IndexLSH.cpp index 2e3217859..fc07f6cbd 100644 --- a/thirdparty/faiss/faiss/IndexLSH.cpp +++ b/thirdparty/faiss/faiss/IndexLSH.cpp @@ -49,7 +49,11 @@ const float* IndexLSH::apply_preprocess(idx_t n, const float* x) const { // also applies bias if exists xt = rrot.apply(n, x); } else if (d != nbits) { - assert(nbits < d); + FAISS_THROW_IF_NOT_FMT( + nbits < d, + "nbits (%d) must be less than d (%d)", + nbits, + (int)d); xt = new float[nbits * n]; float* xp = xt; for (idx_t i = 0; i < n; i++) { diff --git a/thirdparty/faiss/faiss/IndexPQ.cpp b/thirdparty/faiss/faiss/IndexPQ.cpp index de220ff36..3d3ff5467 100644 --- a/thirdparty/faiss/faiss/IndexPQ.cpp +++ b/thirdparty/faiss/faiss/IndexPQ.cpp @@ -640,7 +640,7 @@ struct SemiSortedArray { using HC 
= CMax; std::vector perm; - int k; // k elements are sorted + int k = 0; // k elements are sorted int initial_k, k_factor; @@ -689,7 +689,7 @@ struct SemiSortedArray { // remap orders counted from smallest to indices in array int get_ord(int n) { - assert(n < k); + FAISS_THROW_IF_NOT(n < k); return perm[n]; } }; @@ -732,7 +732,8 @@ struct MinSumK { * terms involved in the sum. */ using HC = CMin; - size_t heap_capacity, heap_size; + size_t heap_capacity = 0; + size_t heap_size = 0; T* bh_val; int64_t* bh_ids; @@ -747,7 +748,7 @@ struct MinSumK { MinSumK(int K_in, int M_in, int nbit_in, int N_in) : K(K_in), M(M_in), nbit(nbit_in), N(N_in) { heap_capacity = K_in * M_in; - assert(N_in <= (1 << nbit_in)); + FAISS_THROW_IF_NOT(N_in <= (1 << nbit_in)); // we'll do k steps, each step pushes at most M vals bh_val = new T[heap_capacity]; @@ -805,11 +806,11 @@ struct MinSumK { // pop smallest value from heap if (use_seen) { // skip already seen elements while (is_seen(bh_ids[0])) { - assert(heap_size > 0); + FAISS_THROW_IF_NOT(heap_size > 0); heap_pop(heap_size--, bh_val, bh_ids); } } - assert(heap_size > 0); + FAISS_THROW_IF_NOT(heap_size > 0); T sum = sums[k] = bh_val[0]; int64_t ti = terms[k] = bh_ids[0]; @@ -1076,7 +1077,7 @@ void MultiIndexQuantizer2::search( if (K == 1) { // simple version that just finds the min in each table - assert(k2 == 1); + FAISS_THROW_IF_NOT(k2 == 1); for (idx_t i = 0; i < n; i++) { float dis = 0; diff --git a/thirdparty/faiss/faiss/IndexPQ.h b/thirdparty/faiss/faiss/IndexPQ.h index 7c854d795..75603efc7 100644 --- a/thirdparty/faiss/faiss/IndexPQ.h +++ b/thirdparty/faiss/faiss/IndexPQ.h @@ -121,7 +121,8 @@ struct IndexPQStats { size_t nq; // nb of queries run size_t ncode; // nb of codes visited - size_t n_hamming_pass; // nb of passed Hamming distance tests (for polysemy) + size_t n_hamming_pass = 0; // nb of passed Hamming distance tests (for + // polysemy) IndexPQStats() { reset(); diff --git a/thirdparty/faiss/faiss/IndexPQFastScan.cpp 
b/thirdparty/faiss/faiss/IndexPQFastScan.cpp index 96567540b..a92b4a542 100644 --- a/thirdparty/faiss/faiss/IndexPQFastScan.cpp +++ b/thirdparty/faiss/faiss/IndexPQFastScan.cpp @@ -72,4 +72,8 @@ void IndexPQFastScan::sa_decode(idx_t n, const uint8_t* bytes, float* x) const { pq.decode(bytes, x, n); } +size_t IndexPQFastScan::fast_scan_code_size() const { + return M2 / 2; +} + } // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexPQFastScan.h b/thirdparty/faiss/faiss/IndexPQFastScan.h index ef56256bc..5ec2b1ac1 100644 --- a/thirdparty/faiss/faiss/IndexPQFastScan.h +++ b/thirdparty/faiss/faiss/IndexPQFastScan.h @@ -52,6 +52,9 @@ struct IndexPQFastScan : IndexFastScan { const FastScanDistancePostProcessing& context) const override; void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override; + + /// Packed code size: M2 / 2 bytes (4-bit PQ sub-quantizer nibbles) + size_t fast_scan_code_size() const override; }; } // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexRaBitQ.cpp b/thirdparty/faiss/faiss/IndexRaBitQ.cpp index 5ae817333..6fab12357 100644 --- a/thirdparty/faiss/faiss/IndexRaBitQ.cpp +++ b/thirdparty/faiss/faiss/IndexRaBitQ.cpp @@ -106,15 +106,8 @@ struct Run_search_with_dc_res { resi.begin(q); dc_base->set_query(xq + d * q); - // Stats tracking for multi-bit two-stage search only - // n_1bit_evaluations: candidates evaluated using 1-bit lower - // bound n_multibit_evaluations: candidates requiring full - // multi-bit distance - size_t local_1bit_evaluations = 0; - size_t local_multibit_evaluations = 0; - if (ex_bits == 0) { - // 1-bit: Standard single-stage search (no stats tracking) + // 1-bit: Standard single-stage search for (size_t i = 0; i < ntotal; i++) { if (res.is_in_selection(i)) { float dis = (*dc_base)(i); @@ -133,7 +126,6 @@ struct Run_search_with_dc_res { dc != nullptr, "Failed to cast to RaBitQDistanceComputer for two-stage search"); - // Use appropriate comparison based on metric type bool is_similarity = 
is_similarity_metric(index->metric_type); @@ -142,16 +134,9 @@ struct Run_search_with_dc_res { const uint8_t* code = index->codes.data() + i * index->code_size; - local_1bit_evaluations++; - - // Stage 1: Compute distance bound using 1-bit codes - // For L2 (min-heap): use lower_bound (est - - // error) For IP (max-heap): use upper_bound (est - // + error) float est_distance = dc->distance_to_code_1bit(code); - // Extract f_error for filtering size_t code_size_base = (index->d + 7) / 8; const rabitq_utils::SignBitFactorsWithError* base_fac = reinterpret_cast< @@ -159,7 +144,6 @@ struct Run_search_with_dc_res { SignBitFactorsWithError*>( code + code_size_base); - // Stage 2: Adaptive filtering bool should_refine = rabitq_utils::should_refine_candidate( est_distance, @@ -168,8 +152,6 @@ struct Run_search_with_dc_res { resi.threshold, is_similarity); if (should_refine) { - local_multibit_evaluations++; - // Compute full multi-bit distance float dist_full = dc->distance_to_code_full(code); resi.add_result(dist_full, i); @@ -178,13 +160,6 @@ struct Run_search_with_dc_res { } } - // Update global stats atomically -#pragma omp atomic - rabitq_stats.n_1bit_evaluations += local_1bit_evaluations; -#pragma omp atomic - rabitq_stats.n_multibit_evaluations += - local_multibit_evaluations; - resi.end(); } } diff --git a/thirdparty/faiss/faiss/IndexRaBitQ.h b/thirdparty/faiss/faiss/IndexRaBitQ.h index dfa92705a..e6a48a6e0 100644 --- a/thirdparty/faiss/faiss/IndexRaBitQ.h +++ b/thirdparty/faiss/faiss/IndexRaBitQ.h @@ -8,7 +8,6 @@ #pragma once #include -#include #include namespace faiss { diff --git a/thirdparty/faiss/faiss/IndexRaBitQFastScan.cpp b/thirdparty/faiss/faiss/IndexRaBitQFastScan.cpp index a98df2e02..88c62c1a5 100644 --- a/thirdparty/faiss/faiss/IndexRaBitQFastScan.cpp +++ b/thirdparty/faiss/faiss/IndexRaBitQFastScan.cpp @@ -467,6 +467,10 @@ void IndexRaBitQFastScan::compute_float_LUT( } } +size_t IndexRaBitQFastScan::fast_scan_code_size() const { + return (d + 7) / 8; 
+} + void IndexRaBitQFastScan::sa_decode(idx_t n, const uint8_t* bytes, float* x) const { const float* centroid_in = diff --git a/thirdparty/faiss/faiss/IndexRaBitQFastScan.h b/thirdparty/faiss/faiss/IndexRaBitQFastScan.h index 90e997b72..449c9a2b1 100644 --- a/thirdparty/faiss/faiss/IndexRaBitQFastScan.h +++ b/thirdparty/faiss/faiss/IndexRaBitQFastScan.h @@ -12,7 +12,6 @@ #include #include -#include #include #include #include @@ -78,6 +77,10 @@ struct IndexRaBitQFastScan : IndexFastScan { void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override; + /// Packed code size: (d + 7) / 8 bytes (1-bit-per-dimension sign bits, + /// excluding factors) + size_t fast_scan_code_size() const override; + /// Return CodePackerRaBitQ with enlarged block size CodePacker* get_CodePacker() const override; @@ -147,10 +150,7 @@ struct RaBitQHeapHandler const size_t storage_size; const size_t packed_block_size; const size_t full_block_size; - std::unique_ptr packer; // cached for unpack in hot path - // Handler-local scratch reused across refinements. This assumes a handler - // instance is confined to one search slice and not entered concurrently. 
- std::vector unpack_buf; // reusable buffer for unpack_1 + std::vector unpack_buf; // sign bits scratch buffer // Use float-based comparator for heap operations using Cfloat = typename std::conditional< @@ -178,8 +178,7 @@ struct RaBitQHeapHandler storage_size(index->compute_per_vector_storage_size()), packed_block_size(((index->M2 + 1) / 2) * index->bbs), full_block_size(index->get_block_stride()), - packer(index->get_CodePacker()), - unpack_buf(index->code_size) { + unpack_buf((index->d + 7) / 8) { #pragma omp parallel for if (nq > 100) for (int64_t q = 0; q < static_cast(nq); q++) { float* heap_dis = heap_distances + q * k; @@ -213,17 +212,12 @@ struct RaBitQHeapHandler const uint8_t* aux_base = rabitq_index->codes.get() + block_idx * full_block_size + packed_block_size; - size_t local_1bit_evaluations = 0; - size_t local_multibit_evaluations = 0; - for (size_t i = 0; i < max_vectors; i++) { const size_t db_idx = base_db_idx + i; const float normalized_distance = d32tab[i] * one_a + bias; const uint8_t* base_ptr = aux_base + i * storage_size; if (is_multi_bit) { - local_1bit_evaluations++; - const SignBitFactorsWithError& full_factors = *reinterpret_cast( base_ptr); @@ -248,7 +242,6 @@ struct RaBitQHeapHandler is_similarity); if (should_refine) { - local_multibit_evaluations++; float dist_full = compute_full_multibit_distance(db_idx, q); if (Cfloat::cmp(heap_dis[0], dist_full)) { @@ -276,11 +269,6 @@ struct RaBitQHeapHandler } } } - -#pragma omp atomic - rabitq_stats.n_1bit_evaluations += local_1bit_evaluations; -#pragma omp atomic - rabitq_stats.n_multibit_evaluations += local_multibit_evaluations; } void begin(const float* norms) override { @@ -319,9 +307,13 @@ struct RaBitQHeapHandler const rabitq_utils::QueryFactorsData& query_factors = context->query_factors[q]; - // Reuse pre-allocated unpack_buf to avoid per-refinement heap - // allocation. 
- packer->unpack_1(rabitq_index->codes.get(), db_idx, unpack_buf.data()); + rabitq_utils::unpack_sign_bits_from_packed( + rabitq_index->codes.get(), + rabitq_index->bbs, + rabitq_index->M2, + db_idx, + full_block_size, + unpack_buf.data()); const uint8_t* sign_bits = unpack_buf.data(); return rabitq_utils::compute_full_multibit_distance( diff --git a/thirdparty/faiss/faiss/IndexRefine.cpp b/thirdparty/faiss/faiss/IndexRefine.cpp index 2025483fa..9d76cbc17 100644 --- a/thirdparty/faiss/faiss/IndexRefine.cpp +++ b/thirdparty/faiss/faiss/IndexRefine.cpp @@ -100,7 +100,7 @@ void IndexRefine::search( n, x, k_base, base_distances, base_labels, base_index_params); for (int i = 0; i < n * k_base; i++) { - assert(base_labels[i] >= -1 && base_labels[i] < ntotal); + FAISS_THROW_IF_NOT(base_labels[i] >= -1 && base_labels[i] < ntotal); } // parallelize over queries @@ -125,12 +125,12 @@ void IndexRefine::search( // sort and store result if (metric_type == METRIC_L2) { - typedef CMax C; + using C = CMax; reorder_2_heaps( n, k, labels, distances, k_base, base_labels, base_distances); } else if (metric_type == METRIC_INNER_PRODUCT) { - typedef CMin C; + using C = CMin; reorder_2_heaps( n, k, labels, distances, k_base, base_labels, base_distances); } else { @@ -287,7 +287,7 @@ void IndexRefineFlat::search( n, x, k_base, base_distances, base_labels, base_index_params); for (int i = 0; i < n * k_base; i++) { - assert(base_labels[i] >= -1 && base_labels[i] < ntotal); + FAISS_THROW_IF_NOT(base_labels[i] >= -1 && base_labels[i] < ntotal); } // compute refined distances @@ -298,12 +298,12 @@ void IndexRefineFlat::search( // sort and store result if (metric_type == METRIC_L2) { - typedef CMax C; + using C = CMax; reorder_2_heaps( n, k, labels, distances, k_base, base_labels, base_distances); } else if (metric_type == METRIC_INNER_PRODUCT) { - typedef CMin C; + using C = CMin; reorder_2_heaps( n, k, labels, distances, k_base, base_labels, base_distances); } else { @@ -353,7 +353,7 @@ void 
IndexRefinePanorama::search( n, x, k_base, base_distances, base_labels, base_index_params); for (int i = 0; i < n * k_base; i++) { - assert(base_labels[i] >= -1 && base_labels[i] < ntotal); + FAISS_THROW_IF_NOT(base_labels[i] >= -1 && base_labels[i] < ntotal); } refine_index->search_subset( diff --git a/thirdparty/faiss/faiss/IndexRowwiseMinMax.cpp b/thirdparty/faiss/faiss/IndexRowwiseMinMax.cpp index 9c7de1424..9694c5f99 100644 --- a/thirdparty/faiss/faiss/IndexRowwiseMinMax.cpp +++ b/thirdparty/faiss/faiss/IndexRowwiseMinMax.cpp @@ -166,11 +166,6 @@ void sa_decode_impl( ? chunk_size : static_cast(n_input)) * old_code_size); - std::vector minmax( - (chunk_size < static_cast(n_input) - ? chunk_size - : static_cast(n_input))); - // all the elements to process size_t n_left = n_input; @@ -231,7 +226,7 @@ void train_inplace_impl( std::vector minmax(n); // normalize -#pragma omp for +#pragma omp parallel for for (idx_t i = 0; i < n; i++) { // compute min & max values float minv = std::numeric_limits::max(); @@ -269,6 +264,7 @@ void train_inplace_impl( sub_index->train(n, x); // rescale data back +#pragma omp parallel for for (idx_t i = 0; i < n; i++) { float scaler = 0; float minv = 0; @@ -294,7 +290,7 @@ void train_impl(IndexRowwiseMinMaxBase* const index, idx_t n, const float* x) { // temp buffer std::vector tmp(n * d); -#pragma omp for +#pragma omp parallel for for (idx_t i = 0; i < n; i++) { // compute min & max values float minv = std::numeric_limits::max(); @@ -309,7 +305,7 @@ void train_impl(IndexRowwiseMinMaxBase* const index, idx_t n, const float* x) { const float scaler = maxv - minv; // save the coefficients - StorageMinMaxT storage; + StorageMinMaxT storage = {}; storage.from_floats(scaler, minv); // and load them back, because the coefficients might diff --git a/thirdparty/faiss/faiss/IndexScalarQuantizer.cpp b/thirdparty/faiss/faiss/IndexScalarQuantizer.cpp index ffa2c9312..bcdbac952 100644 --- a/thirdparty/faiss/faiss/IndexScalarQuantizer.cpp +++ 
b/thirdparty/faiss/faiss/IndexScalarQuantizer.cpp @@ -133,6 +133,10 @@ IndexIVFScalarQuantizer::IndexIVFScalarQuantizer( invlists->code_size = code_size; } is_trained = false; + if (qtype == ScalarQuantizer::QT_0bit) { + by_residual = false; + is_trained = true; // no training needed + } } IndexIVFScalarQuantizer::IndexIVFScalarQuantizer() : IndexIVF() { @@ -156,6 +160,19 @@ void IndexIVFScalarQuantizer::encode_vectors( const idx_t* list_nos, uint8_t* codes, bool include_listnos) const { + if (sq.code_size == 0) { + // QT_0bit: nothing to encode, but handle coarse codes if needed + if (include_listnos) { + size_t coarse_size = coarse_code_size(); + for (idx_t i = 0; i < n; i++) { + int64_t list_no = list_nos[i]; + if (list_no >= 0) { + encode_listno(list_no, codes + i * coarse_size); + } + } + } + return; + } std::unique_ptr squant(sq.select_quantizer()); size_t coarse_size = include_listnos ? coarse_code_size() : 0; memset(codes, 0, (code_size + coarse_size) * n); @@ -186,14 +203,42 @@ void IndexIVFScalarQuantizer::encode_vectors( void IndexIVFScalarQuantizer::decode_vectors( idx_t n, const uint8_t* codes, - const idx_t*, + const idx_t* list_nos, float* x) const { + if (sq.code_size == 0 && list_nos) { + // QT_0bit: reconstruct centroids if list_nos provided + for (idx_t i = 0; i < n; i++) { + quantizer->reconstruct(list_nos[i], x + i * d); + } + return; + } FAISS_THROW_IF_NOT(is_trained); - return sq.decode(codes, x, n); + sq.decode(codes, x, n); + if (by_residual) { + FAISS_THROW_IF_NOT_MSG( + list_nos, "decode_vectors with by_residual requires list_nos"); +#pragma omp parallel for if (n > 1000) + for (idx_t i = 0; i < n; i++) { + std::vector centroid(d); + quantizer->reconstruct(list_nos[i], centroid.data()); + for (size_t j = 0; j < static_cast(d); j++) { + x[i * d + j] += centroid[j]; + } + } + } } void IndexIVFScalarQuantizer::sa_decode(idx_t n, const uint8_t* codes, float* x) const { + if (sq.code_size == 0) { + size_t coarse_size = coarse_code_size(); + 
for (idx_t i = 0; i < n; i++) { + const uint8_t* code = codes + i * coarse_size; + int64_t list_no = decode_listno(code); + quantizer->reconstruct(list_no, x + i * d); + } + return; + } std::unique_ptr squant(sq.select_quantizer()); size_t coarse_size = coarse_code_size(); @@ -224,6 +269,23 @@ void IndexIVFScalarQuantizer::add_core( const idx_t* coarse_idx, void* inverted_list_context) { FAISS_THROW_IF_NOT(is_trained); + if (sq.code_size == 0) { + // QT_0bit: just add IDs with empty codes + uint8_t dummy_code = 0; + DirectMapAdd dm_add(direct_map, n, xids); + for (idx_t i = 0; i < n; i++) { + int64_t list_no = coarse_idx[i]; + if (list_no >= 0) { + int64_t id = xids ? xids[i] : ntotal + i; + size_t ofs = invlists->add_entry(list_no, id, &dummy_code); + dm_add.add(i, list_no, ofs); + } else { + dm_add.add(i, -1, 0); + } + } + ntotal += n; + return; + } std::unique_ptr squant(sq.select_quantizer()); @@ -277,6 +339,11 @@ void IndexIVFScalarQuantizer::reconstruct_from_offset( int64_t list_no, int64_t offset, float* recons) const { + if (sq.code_size == 0) { + // QT_0bit: reconstruct from centroid + quantizer->reconstruct(list_no, recons); + return; + } const uint8_t* code = invlists->get_single_code(list_no, offset); if (by_residual) { diff --git a/thirdparty/faiss/faiss/VectorTransform.cpp b/thirdparty/faiss/faiss/VectorTransform.cpp index 7300940b3..219920e88 100644 --- a/thirdparty/faiss/faiss/VectorTransform.cpp +++ b/thirdparty/faiss/faiss/VectorTransform.cpp @@ -150,7 +150,9 @@ void VectorTransform::reverse_transform(idx_t, const float*, float*) const { } void VectorTransform::check_identical(const VectorTransform& other) const { - FAISS_THROW_IF_NOT(other.d_in == d_in && other.d_in == d_in); + FAISS_THROW_IF_NOT_MSG( + other.d_in == d_in && other.d_out == d_out, + "transforms must have matching d_in and d_out"); } /********************************************* @@ -277,7 +279,7 @@ void LinearTransform::set_is_orthonormal() { float v = ATA[i + j * d_out]; if (i == 
j) v -= 1; - if (fabs(v) > eps) { + if (std::fabs(v) > eps) { is_orthonormal = false; } } @@ -303,7 +305,9 @@ void LinearTransform::print_if_verbose( if (!verbose) return; printf("matrix %s: %d*%d [\n", name, n, d); - FAISS_THROW_IF_NOT(mat.size() >= static_cast(n) * d); + FAISS_THROW_IF_NOT_MSG( + mat.size() >= static_cast(n) * d, + "matrix size is too small for the given dimensions"); for (int i = 0; i < n; i++) { for (int j = 0; j < d; j++) { printf("%10.5g ", mat[i * d + j]); @@ -316,8 +320,10 @@ void LinearTransform::print_if_verbose( void LinearTransform::check_identical(const VectorTransform& other_in) const { VectorTransform::check_identical(other_in); auto other = dynamic_cast(&other_in); - FAISS_THROW_IF_NOT(other); - FAISS_THROW_IF_NOT(other->A == A && other->b == b); + FAISS_THROW_IF_NOT_MSG(other, "failed to cast to LinearTransform"); + FAISS_THROW_IF_NOT_MSG( + other->A == A && other->b == b, + "LinearTransform matrix A and bias vector b must match"); } /********************************************* @@ -390,7 +396,8 @@ static void generate_signs( std::vector& s1, std::vector& s2, std::vector& s3) { - FAISS_THROW_IF_NOT(p > 0); + FAISS_THROW_IF_NOT_MSG( + p > 0, "number of Hadamard factors p must be positive"); SplitMix64RandomGenerator rng(seed); s1.resize(p); s2.resize(p); @@ -426,9 +433,15 @@ void HadamardRotation::apply_noalloc(idx_t n, const float* x, float* xt) const { size_t d = d_in; size_t p = d_out; - FAISS_THROW_IF_NOT(signs1.size() == p); - FAISS_THROW_IF_NOT(signs2.size() == p); - FAISS_THROW_IF_NOT(signs3.size() == p); + FAISS_THROW_IF_NOT_MSG( + signs1.size() == p, + "sign-flip vector 1 size must match output dimension"); + FAISS_THROW_IF_NOT_MSG( + signs2.size() == p, + "sign-flip vector 2 size must match output dimension"); + FAISS_THROW_IF_NOT_MSG( + signs3.size() == p, + "sign-flip vector 3 size must match output dimension"); // Each unnormalized FWHT scales norms by sqrt(p). // Three rounds scale by p^(3/2). 
Normalize once at the end. @@ -468,10 +481,14 @@ void HadamardRotation::apply_noalloc(idx_t n, const float* x, float* xt) const { void HadamardRotation::check_identical(const VectorTransform& other) const { auto* hr = dynamic_cast(&other); - FAISS_THROW_IF_NOT(hr); - FAISS_THROW_IF_NOT(d_in == hr->d_in); - FAISS_THROW_IF_NOT(d_out == hr->d_out); - FAISS_THROW_IF_NOT(seed == hr->seed); + FAISS_THROW_IF_NOT_MSG(hr, "failed to cast to HadamardRotation"); + FAISS_THROW_IF_NOT_MSG( + d_in == hr->d_in, "HadamardRotation input dimensions must match"); + FAISS_THROW_IF_NOT_MSG( + d_out == hr->d_out, + "HadamardRotation output dimensions must match"); + FAISS_THROW_IF_NOT_MSG( + seed == hr->seed, "HadamardRotation seeds must match"); } /********************************************* @@ -731,7 +748,9 @@ void PCAMatrix::train(idx_t n, const float* x_in) { } void PCAMatrix::copy_from(const PCAMatrix& other) { - FAISS_THROW_IF_NOT(other.is_trained); + FAISS_THROW_IF_NOT_MSG( + other.is_trained, + "source PCAMatrix must be trained before copying"); mean = other.mean; eigenvalues = other.eigenvalues; PCAMat = other.PCAMat; @@ -754,14 +773,16 @@ void PCAMatrix::prepare_Ab() { if (eigen_power != 0) { float* ai = A.data(); for (int i = 0; i < d_out; i++) { - float factor = pow(eigenvalues[i] + epsilon, eigen_power); + float factor = std::pow(eigenvalues[i] + epsilon, eigen_power); for (int j = 0; j < d_in; j++) *ai++ *= factor; } } if (balanced_bins != 0) { - FAISS_THROW_IF_NOT(d_out % balanced_bins == 0); + FAISS_THROW_IF_NOT_MSG( + d_out % balanced_bins == 0, + "output dimension must be divisible by balanced_bins"); int dsub = d_out / balanced_bins; std::vector Ain; std::swap(A, Ain); @@ -945,7 +966,8 @@ void ITQMatrix::train(idx_t n, const float* xf) { &lwork, &info); - FAISS_THROW_IF_NOT(info == 0); + FAISS_THROW_IF_NOT_MSG( + info == 0, "LAPACK dgesvd workspace query failed"); lwork = size_t(lwork1); std::vector work(lwork); dgesvd_("A", @@ -1001,14 +1023,17 @@ 
ITQTransform::ITQTransform(int din, int dout, bool do_pca_in) itq(dout), pca_then_itq(din, dout, false) { if (!do_pca_in) { - FAISS_THROW_IF_NOT(din == dout); + FAISS_THROW_IF_NOT_MSG( + din == dout, + "input and output dimensions must match when PCA is disabled"); } max_train_per_dim = 10; is_trained = false; } void ITQTransform::train(idx_t n, const float* x_in) { - FAISS_THROW_IF_NOT(!is_trained); + FAISS_THROW_IF_NOT_MSG( + !is_trained, "ITQTransform has already been trained"); size_t max_train_points = std::max(d_in * max_train_per_dim, 32768); const float* x = @@ -1100,9 +1125,10 @@ void ITQTransform::apply_noalloc(idx_t n, const float* x, float* xt) const { void ITQTransform::check_identical(const VectorTransform& other_in) const { VectorTransform::check_identical(other_in); auto other = dynamic_cast(&other_in); - FAISS_THROW_IF_NOT(other); + FAISS_THROW_IF_NOT_MSG(other, "failed to cast to ITQTransform"); pca_then_itq.check_identical(other->pca_then_itq); - FAISS_THROW_IF_NOT(other->mean == mean); + FAISS_THROW_IF_NOT_MSG( + other->mean == mean, "ITQTransform mean vectors must match"); } /********************************************* @@ -1184,7 +1210,8 @@ void OPQMatrix::train(idx_t n, const float* x_in) { // we use only the d * d2 upper part of the matrix A.resize(d * d2); } else { - FAISS_THROW_IF_NOT(A.size() == d * d2); + FAISS_THROW_IF_NOT_MSG( + A.size() == d * d2, "rotation matrix A has incorrect size"); rotation = A.data(); } @@ -1360,8 +1387,9 @@ void NormalizationTransform::check_identical( const VectorTransform& other_in) const { VectorTransform::check_identical(other_in); auto other = dynamic_cast(&other_in); - FAISS_THROW_IF_NOT(other); - FAISS_THROW_IF_NOT(other->norm == norm); + FAISS_THROW_IF_NOT_MSG(other, "failed to cast to NormalizationTransform"); + FAISS_THROW_IF_NOT_MSG( + other->norm == norm, "normalization type must match"); } /********************************************* @@ -1389,7 +1417,8 @@ void CenteringTransform::train(idx_t n, 
const float* x) { void CenteringTransform::apply_noalloc(idx_t n, const float* x, float* xt) const { - FAISS_THROW_IF_NOT(is_trained); + FAISS_THROW_IF_NOT_MSG( + is_trained, "CenteringTransform has not been trained"); for (idx_t i = 0; i < n; i++) { for (int j = 0; j < d_in; j++) { @@ -1400,7 +1429,8 @@ void CenteringTransform::apply_noalloc(idx_t n, const float* x, float* xt) void CenteringTransform::reverse_transform(idx_t n, const float* xt, float* x) const { - FAISS_THROW_IF_NOT(is_trained); + FAISS_THROW_IF_NOT_MSG( + is_trained, "CenteringTransform has not been trained"); for (idx_t i = 0; i < n; i++) { for (int j = 0; j < d_in; j++) { @@ -1413,8 +1443,9 @@ void CenteringTransform::check_identical( const VectorTransform& other_in) const { VectorTransform::check_identical(other_in); auto other = dynamic_cast(&other_in); - FAISS_THROW_IF_NOT(other); - FAISS_THROW_IF_NOT(other->mean == mean); + FAISS_THROW_IF_NOT_MSG(other, "failed to cast to CenteringTransform"); + FAISS_THROW_IF_NOT_MSG( + other->mean == mean, "CenteringTransform mean vectors must match"); } /********************************************* @@ -1429,7 +1460,9 @@ RemapDimensionsTransform::RemapDimensionsTransform( map.resize(dout); for (int i = 0; i < dout; i++) { map[i] = map_in[i]; - FAISS_THROW_IF_NOT(map[i] == -1 || (map[i] >= 0 && map[i] < din)); + FAISS_THROW_IF_NOT_MSG( + map[i] == -1 || (map[i] >= 0 && map[i] < din), + "map entries must be -1 (unused) or valid input dimension indices"); } } @@ -1486,6 +1519,7 @@ void RemapDimensionsTransform::check_identical( const VectorTransform& other_in) const { VectorTransform::check_identical(other_in); auto other = dynamic_cast(&other_in); - FAISS_THROW_IF_NOT(other); - FAISS_THROW_IF_NOT(other->map == map); + FAISS_THROW_IF_NOT_MSG(other, "failed to cast to RemapDimensionsTransform"); + FAISS_THROW_IF_NOT_MSG( + other->map == map, "RemapDimensionsTransform maps must match"); } diff --git a/thirdparty/faiss/faiss/build.cpp 
b/thirdparty/faiss/faiss/build.cpp new file mode 100644 index 000000000..c14dc6d05 --- /dev/null +++ b/thirdparty/faiss/faiss/build.cpp @@ -0,0 +1,23 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "faiss/build.h" + +namespace faiss { + +bool has_omp() { + int omp = 1; + // Detect whether OpenMP is enabled by using the 'max' reduction to render + // the below assignment a no-op. This works: + // 1) without starting any threads + // 2) irrespective of the current thread limit +#pragma omp parallel reduction(max : omp) num_threads(1) + omp = 0; + return omp != 0; +} + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/build.h b/thirdparty/faiss/faiss/build.h new file mode 100644 index 000000000..0a1ddab4e --- /dev/null +++ b/thirdparty/faiss/faiss/build.h @@ -0,0 +1,15 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +namespace faiss { + +// Returns true iff `faiss` was compiled with non-mocked OpenMP support. 
+bool has_omp(); + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/FaissHook.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/FaissHook.cpp deleted file mode 100644 index 065c54e05..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/FaissHook.cpp +++ /dev/null @@ -1,83 +0,0 @@ - -// -*- c++ -*- - -#include -#include - -#include -#include -#include -#include -#include -#include - - - -namespace faiss::cppcontrib::knowhere { - -sq_get_distance_computer_func_ptr sq_get_distance_computer = - sq_get_distance_computer_ref; -sq_sel_quantizer_func_ptr sq_sel_quantizer = sq_select_quantizer_ref; -sq_sel_inv_list_scanner_func_ptr sq_sel_inv_list_scanner = - sq_select_inverted_list_scanner_ref; - -// Note: The Hamming computer implementation is selected at compile time -// based on the instruction set in `hamdis-inl.h`, not by runtime hook. -sq_get_distance_computer_func_ptr sq_get_hamming_distance_computer = - sq_get_hamming_distance_computer_ref; - -// Note: The Jaccard distance computer uses `__builtin_popcount` for -// computation. This function is efficiently implemented by the -// compiler and automatically utilizes the best available instruction set. -// Therefore, there is no need to manually adjust or hook the Jaccard computer -// for different SIMD instruction sets. 
-sq_get_distance_computer_func_ptr sq_get_jaccard_distance_computer = - sq_get_jaccard_distance_computer_ref; - -void sq_hook() { - // SQ8 always hook best SIMD -#ifdef __x86_64__ - static std::mutex hook_mutex; - std::lock_guard lock(hook_mutex); - - if (use_avx512 && cpu_support_avx512()) { - /* for IVFSQ */ - sq_get_distance_computer = sq_get_distance_computer_avx512; - sq_sel_quantizer = sq_select_quantizer_avx512; - sq_sel_inv_list_scanner = sq_select_inverted_list_scanner_avx512; - } else if (use_avx2 && cpu_support_avx2()) { - /* for IVFSQ */ - sq_get_distance_computer = sq_get_distance_computer_avx; - sq_sel_quantizer = sq_select_quantizer_avx; - sq_sel_inv_list_scanner = sq_select_inverted_list_scanner_avx; - } else if (use_sse4_2 && cpu_support_sse4_2()) { - /* for IVFSQ */ - sq_get_distance_computer = sq_get_distance_computer_ref; - sq_sel_quantizer = sq_select_quantizer_ref; - sq_sel_inv_list_scanner = sq_select_inverted_list_scanner_ref; - } -#endif - -#if defined(__ARM_NEON) - static std::mutex hook_mutex; - std::lock_guard lock(hook_mutex); - - /* for IVFSQ */ - sq_get_distance_computer = sq_get_distance_computer_neon; - sq_sel_quantizer = sq_select_quantizer_neon; - sq_sel_inv_list_scanner = sq_select_inverted_list_scanner_neon; -#endif - -#if defined(__riscv_vector) - static std::mutex hook_mutex; - std::lock_guard lock(hook_mutex); - /* for IVFSQ */ - sq_get_distance_computer = sq_get_distance_computer_rvv; - sq_sel_quantizer = sq_select_quantizer_rvv; - sq_sel_inv_list_scanner = sq_select_inverted_list_scanner_rvv; -#endif -} - -} - - diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/FaissHook.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/FaissHook.h deleted file mode 100644 index 5e0d5b816..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/FaissHook.h +++ /dev/null @@ -1,48 +0,0 @@ - -// -*- c++ -*- - -#pragma once - -#include - -#include -#include -#include -#include "simd/hook.h" - -#include - -namespace faiss { -namespace 
cppcontrib { -namespace knowhere { - -// todo aguzhva: replace FaissHook.h with simd/hook.h - -typedef ScalarQuantizer::SQDistanceComputer* (*sq_get_distance_computer_func_ptr)( - MetricType, - ScalarQuantizer::QuantizerType, - size_t, - const std::vector&); -typedef ScalarQuantizer::SQuantizer* (*sq_sel_quantizer_func_ptr)( - ScalarQuantizer::QuantizerType, - size_t, - const std::vector&); -typedef InvertedListScanner* (*sq_sel_inv_list_scanner_func_ptr)( - MetricType, - const ScalarQuantizer*, - const Index*, - size_t, - bool, - const IDSelector*, - bool); - -extern sq_get_distance_computer_func_ptr sq_get_distance_computer; -extern sq_get_distance_computer_func_ptr sq_get_hamming_distance_computer; -extern sq_get_distance_computer_func_ptr sq_get_jaccard_distance_computer; -extern sq_sel_quantizer_func_ptr sq_sel_quantizer; -extern sq_sel_inv_list_scanner_func_ptr sq_sel_inv_list_scanner; -void sq_hook(); - -} -} -} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IVFIteratorWorkspace.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/IVFIteratorWorkspace.cpp index 60a952c64..bf80155e1 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IVFIteratorWorkspace.cpp +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IVFIteratorWorkspace.cpp @@ -12,6 +12,7 @@ #include #include +#include #include namespace faiss::cppcontrib::knowhere { diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexBinaryScalarQuantizer.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexBinaryScalarQuantizer.cpp new file mode 100644 index 000000000..f82766f96 --- /dev/null +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexBinaryScalarQuantizer.cpp @@ -0,0 +1,177 @@ +// Copyright (C) 2019-2025 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. 
You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. + +#include + +#include +#include +#include + +#include +#include +#include + +namespace faiss { +namespace cppcontrib { +namespace knowhere { + +namespace { + +// Adapter that binds one of baseline's HammingComputerN / JaccardComputerN +// size-specialized primitives into the FlatCodesDistanceComputer interface. +template +struct BinaryFlatCodesDC : faiss::FlatCodesDistanceComputer { + BinaryComputer binary_computer; + std::vector tmp; + + BinaryFlatCodesDC(const uint8_t* codes_in, size_t code_size_in) + : FlatCodesDistanceComputer(codes_in, code_size_in), + tmp(code_size_in) {} + + void set_query(const float* x) final { + // Legacy convention: each float is an integer in [0, 255]; cast + // to uint8 to recover the bit-packed query byte. Same pattern + // used by the fork's BinarySQDistanceComputerWrapper. 
+ for (size_t i = 0; i < code_size; ++i) { + tmp[i] = static_cast(x[i]); + } + binary_computer.set(tmp.data(), code_size); + } + + float distance_to_code(const uint8_t* code) final { + return binary_computer.compute(code); + } + + float symmetric_dis(idx_t i, idx_t j) final { + BinaryComputer temp; + temp.set(codes + i * code_size, code_size); + return temp.compute(codes + j * code_size); + } +}; + +faiss::FlatCodesDistanceComputer* +make_hamming_dc(const uint8_t* codes, size_t code_size) { + switch (code_size) { + case 4: + return new BinaryFlatCodesDC(codes, code_size); + case 8: + return new BinaryFlatCodesDC(codes, code_size); + case 16: + return new BinaryFlatCodesDC(codes, code_size); + case 20: + return new BinaryFlatCodesDC(codes, code_size); + case 32: + return new BinaryFlatCodesDC(codes, code_size); + case 64: + return new BinaryFlatCodesDC(codes, code_size); + default: + return new BinaryFlatCodesDC( + codes, code_size); + } +} + +faiss::FlatCodesDistanceComputer* +make_jaccard_dc(const uint8_t* codes, size_t code_size) { + switch (code_size) { + case 8: + return new BinaryFlatCodesDC(codes, code_size); + case 16: + return new BinaryFlatCodesDC(codes, code_size); + case 32: + return new BinaryFlatCodesDC(codes, code_size); + case 64: + return new BinaryFlatCodesDC(codes, code_size); + case 128: + return new BinaryFlatCodesDC( + codes, code_size); + case 256: + return new BinaryFlatCodesDC( + codes, code_size); + case 512: + return new BinaryFlatCodesDC( + codes, code_size); + default: + return new BinaryFlatCodesDC( + codes, code_size); + } +} + +} // namespace + +IndexBinaryScalarQuantizer::IndexBinaryScalarQuantizer() : IndexFlatCodes() {} + +IndexBinaryScalarQuantizer::IndexBinaryScalarQuantizer(int d, MetricType metric) + : IndexFlatCodes(static_cast((d + 7) / 8), d, metric) { + FAISS_THROW_IF_NOT_MSG( + metric == METRIC_Hamming || metric == METRIC_Jaccard || + metric == METRIC_Substructure || + metric == METRIC_Superstructure, + 
"IndexBinaryScalarQuantizer: unsupported metric (expected Hamming, " + "Jaccard, Substructure, or Superstructure)"); + is_trained = true; +} + +void IndexBinaryScalarQuantizer::sa_encode( + idx_t n, const float* x, uint8_t* bytes) const { + // Follows the legacy Quantizer1bitDirect convention byte-for-byte: + // each vector has d floats, but only the first code_size are read; + // each is cast to uint8 to form the code byte. + const size_t cs = code_size; + for (idx_t vi = 0; vi < n; ++vi) { + const float* src = x + vi * static_cast(d); + uint8_t* dst = bytes + vi * cs; + for (size_t i = 0; i < cs; ++i) { + dst[i] = static_cast(src[i]); + } + } +} + +void IndexBinaryScalarQuantizer::sa_decode( + idx_t n, const uint8_t* bytes, float* x) const { + // Mirror of sa_encode. Output stride is d (matching baseline + // ScalarQuantizer::decode) but only the first code_size lanes of + // each d-float slot are written. Trailing lanes are left untouched + // by design: callers that only need the meaningful bytes (see + // faiss_hnsw.cc GetVectorByIds, bin1 branch) allocate exactly + // code_size floats per vector and rely on the decoder not writing + // past that. Zero-filling the tail would overrun those buffers. 
+ const size_t cs = code_size; + for (idx_t vi = 0; vi < n; ++vi) { + float* dst = x + vi * static_cast(d); + const uint8_t* src = bytes + vi * cs; + for (size_t i = 0; i < cs; ++i) { + dst[i] = static_cast(src[i]); + } + } +} + +faiss::FlatCodesDistanceComputer* +IndexBinaryScalarQuantizer::get_FlatCodesDistanceComputer() const { + switch (metric_type) { + case METRIC_Hamming: + case METRIC_Substructure: + case METRIC_Superstructure: + return make_hamming_dc(codes.data(), code_size); + case METRIC_Jaccard: + return make_jaccard_dc(codes.data(), code_size); + default: + FAISS_THROW_MSG( + "IndexBinaryScalarQuantizer: unsupported metric in " + "get_FlatCodesDistanceComputer"); + } +} + +} // namespace knowhere +} // namespace cppcontrib +} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexBinaryScalarQuantizer.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexBinaryScalarQuantizer.h new file mode 100644 index 000000000..64103b9c1 --- /dev/null +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexBinaryScalarQuantizer.h @@ -0,0 +1,63 @@ +// Copyright (C) 2019-2025 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +namespace faiss { +namespace cppcontrib { +namespace knowhere { + +/** + * Storage class for 1-bit-per-dimension binary vectors, with Hamming / + * Jaccard / Substructure / Superstructure semantics. 
Acts as the storage + * under the fork's IndexHNSW family — a direct replacement for the legacy + * path that routed binary data through faiss::ScalarQuantizer with + * qtype == QT_1bit_direct. + * + * Input/output convention for sa_encode / sa_decode and set_query: the + * `float*` buffer carries per-byte integer values (0..255) that together + * represent the bit-packed binary vector. The first code_size entries of + * each d-float "vector" are meaningful; any remaining lanes are ignored + * on both encode and decode — decode does not touch them, so callers may + * allocate exactly code_size floats per vector. + */ +struct IndexBinaryScalarQuantizer : faiss::IndexFlatCodes { + IndexBinaryScalarQuantizer(); + + /// d is the number of binary dimensions. code_size is (d + 7) / 8. + /// metric must be one of METRIC_Hamming, METRIC_Jaccard, + /// METRIC_Substructure, METRIC_Superstructure. The index is + /// considered trained immediately after construction. + IndexBinaryScalarQuantizer(int d, MetricType metric); + + void + sa_encode(idx_t n, const float* x, uint8_t* bytes) const override; + + void + sa_decode(idx_t n, const uint8_t* bytes, float* x) const override; + + /// Returns a size-specialized Hamming or Jaccard computer wired into + /// the FlatCodesDistanceComputer interface. Uses baseline FAISS + /// primitives from faiss/utils/hamming.h. 
+ faiss::FlatCodesDistanceComputer* + get_FlatCodesDistanceComputer() const override; +}; + +} // namespace knowhere +} // namespace cppcontrib +} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexCosine.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexCosine.cpp index 22a4a6f23..fa16ecfd5 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexCosine.cpp +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexCosine.cpp @@ -18,7 +18,7 @@ #include -#include +#include "simd/hook.h" #include #include @@ -335,11 +335,12 @@ FlatCodesDistanceComputer* IndexFlatCosine::get_FlatCodesDistanceComputer() cons IndexScalarQuantizerCosine::IndexScalarQuantizerCosine( int d, - ScalarQuantizer::QuantizerType qtype) - : IndexScalarQuantizer(d, qtype, MetricType::METRIC_INNER_PRODUCT) { + ::faiss::ScalarQuantizer::QuantizerType qtype) + : ::faiss::IndexScalarQuantizer(d, qtype, MetricType::METRIC_INNER_PRODUCT) { } -IndexScalarQuantizerCosine::IndexScalarQuantizerCosine() : IndexScalarQuantizer() { +IndexScalarQuantizerCosine::IndexScalarQuantizerCosine() + : ::faiss::IndexScalarQuantizer() { metric_type = MetricType::METRIC_INNER_PRODUCT; } @@ -349,12 +350,12 @@ void IndexScalarQuantizerCosine::add(idx_t n, const float* x) { return; } - IndexScalarQuantizer::add(n, x); + ::faiss::IndexScalarQuantizer::add(n, x); inverse_norms_storage.add(x, n, d); } void IndexScalarQuantizerCosine::reset() { - IndexScalarQuantizer::reset(); + ::faiss::IndexScalarQuantizer::reset(); inverse_norms_storage.reset(); } @@ -366,7 +367,8 @@ DistanceComputer* IndexScalarQuantizerCosine::get_distance_computer() const { return new WithCosineNormDistanceComputer( this->get_inverse_l2_norms(), this->d, - std::unique_ptr(IndexScalarQuantizer::get_FlatCodesDistanceComputer()) + std::unique_ptr( + ::faiss::IndexScalarQuantizer::get_FlatCodesDistanceComputer()) ); } @@ -483,7 +485,7 @@ IndexHNSWSQCosine::IndexHNSWSQCosine() { IndexHNSWSQCosine::IndexHNSWSQCosine( int d, - 
ScalarQuantizer::QuantizerType qtype, + ::faiss::ScalarQuantizer::QuantizerType qtype, int M) : IndexHNSW(new IndexScalarQuantizerCosine(d, qtype), M) { diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexCosine.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexCosine.h index 4075fa675..56d73b7c1 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexCosine.h +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexCosine.h @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include @@ -128,12 +128,13 @@ struct IndexFlatCosine : IndexFlat, HasInverseL2Norms { }; // -struct IndexScalarQuantizerCosine : IndexScalarQuantizer, HasInverseL2Norms { +struct IndexScalarQuantizerCosine : ::faiss::IndexScalarQuantizer, + HasInverseL2Norms { L2NormsStorage inverse_norms_storage; IndexScalarQuantizerCosine( int d, - ScalarQuantizer::QuantizerType qtype); + ::faiss::ScalarQuantizer::QuantizerType qtype); IndexScalarQuantizerCosine(); @@ -195,7 +196,7 @@ struct IndexHNSWSQCosine : IndexHNSW, HasInverseL2Norms { IndexHNSWSQCosine(); IndexHNSWSQCosine( int d, - ScalarQuantizer::QuantizerType qtype, + ::faiss::ScalarQuantizer::QuantizerType qtype, int M); const float* get_inverse_l2_norms() const override; diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexFlat.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexFlat.cpp index 3f09ce1ad..f032088b3 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexFlat.cpp +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexFlat.cpp @@ -11,7 +11,7 @@ #include -#include +#include "simd/hook.h" #include #include diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexHNSW.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexHNSW.cpp index 6f69022ff..51835d798 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexHNSW.cpp +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexHNSW.cpp @@ -685,10 +685,10 @@ void IndexHNSWPQ::train(idx_t n, const float* x) { 
IndexHNSWSQ::IndexHNSWSQ( int d, - ScalarQuantizer::QuantizerType qtype, + ::faiss::ScalarQuantizer::QuantizerType qtype, int M, MetricType metric) - : IndexHNSW(new IndexScalarQuantizer(d, qtype, metric), M) { + : IndexHNSW(new ::faiss::IndexScalarQuantizer(d, qtype, metric), M) { is_trained = this->storage->is_trained; own_fields = true; } diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexHNSW.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexHNSW.h index 7a2566748..010801edd 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexHNSW.h +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexHNSW.h @@ -13,7 +13,7 @@ #include #include -#include +#include #include #include @@ -145,7 +145,7 @@ struct IndexHNSWSQ : IndexHNSW { IndexHNSWSQ(); IndexHNSWSQ( int d, - ScalarQuantizer::QuantizerType qtype, + ::faiss::ScalarQuantizer::QuantizerType qtype, int M, MetricType metric = METRIC_L2); }; diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexHNSWBinary.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexHNSWBinary.cpp new file mode 100644 index 000000000..7de16b739 --- /dev/null +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexHNSWBinary.cpp @@ -0,0 +1,33 @@ +// Copyright (C) 2019-2025 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. 
+ +#include + +#include + +namespace faiss { +namespace cppcontrib { +namespace knowhere { + +IndexHNSWBinary::IndexHNSWBinary() = default; + +IndexHNSWBinary::IndexHNSWBinary(int d, int M, MetricType metric) + : IndexHNSW(new IndexBinaryScalarQuantizer(d, metric), M) { + is_trained = this->storage->is_trained; + own_fields = true; +} + +} // namespace knowhere +} // namespace cppcontrib +} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexHNSWBinary.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexHNSWBinary.h new file mode 100644 index 000000000..f3b614609 --- /dev/null +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexHNSWBinary.h @@ -0,0 +1,47 @@ +// Copyright (C) 2019-2025 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +namespace faiss { +namespace cppcontrib { +namespace knowhere { + +/** + * HNSW index with a IndexBinaryScalarQuantizer storage — a replacement for + * the legacy IndexHNSWSQ(QT_1bit_direct, metric) path. Inherits from + * IndexHNSW directly (not IndexHNSWSQ) so ctor delegation goes straight + * to the Index*-storage form. + * + * On disk, instances serialize with the same fourcc ("IHNs") and byte + * layout as IndexHNSWSQ with an inner QT_1bit_direct ScalarQuantizer. 
+ * Readers materialize either IndexHNSWSQ (for non-binary SQ qtypes) or + * IndexHNSWBinary (for QT_1bit_direct) depending on the inner storage's + * qtype — see fork's impl/index_read.cpp for the dispatch. + */ +struct IndexHNSWBinary : IndexHNSW { + IndexHNSWBinary(); + + /// d is the number of binary dimensions (not bytes). metric must be + /// supported by IndexBinaryScalarQuantizer: METRIC_Hamming, METRIC_Jaccard, + /// METRIC_Substructure, METRIC_Superstructure. + IndexHNSWBinary(int d, int M, MetricType metric = METRIC_Hamming); +}; + +} // namespace knowhere +} // namespace cppcontrib +} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFFlat.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFFlat.cpp index 37ecfa930..8e2a414d3 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFFlat.cpp +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFFlat.cpp @@ -21,7 +21,6 @@ #include -#include #include #include @@ -32,6 +31,8 @@ #include +#include "simd/hook.h" + namespace faiss::cppcontrib::knowhere { diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFPQ.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFPQ.cpp index 522639050..0c34c5944 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFPQ.cpp +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFPQ.cpp @@ -22,8 +22,8 @@ #include #include -#include #include +#include "simd/hook.h" #include diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFPQFastScan.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFPQFastScan.cpp index affa74d0f..c4500da57 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFPQFastScan.cpp +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFPQFastScan.cpp @@ -176,6 +176,9 @@ void IndexIVFPQFastScan::encode_vectors( * Look-Up Table functions *********************************************************/ +// Explicit SIMD-level alias (no global bare aliases). 
+using simd8float32 = simd8float32_tpl; + void fvec_madd_simd_internal( size_t n, const float* __restrict a, diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFScalarQuantizerCC.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFScalarQuantizerCC.cpp index 839063b52..b771c7bb9 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFScalarQuantizerCC.cpp +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFScalarQuantizerCC.cpp @@ -14,7 +14,7 @@ IndexIVFScalarQuantizerCC::IndexIVFScalarQuantizerCC( size_t d, size_t nlist, size_t ssize, - ScalarQuantizer::QuantizerType qtype, + ::faiss::ScalarQuantizer::QuantizerType qtype, MetricType metric, bool by_residual, std::optional raw_data_prefix_path) @@ -52,7 +52,7 @@ void IndexIVFScalarQuantizerCC::add_core( FAISS_THROW_IF_NOT(is_trained); size_t nadd = 0; - std::unique_ptr squant(sq.select_quantizer()); + std::unique_ptr<::faiss::ScalarQuantizer::SQuantizer> squant(sq.select_quantizer()); DirectMapAdd dm_add(direct_map, n, xids); @@ -123,7 +123,7 @@ IndexIVFScalarQuantizerCCCosine::IndexIVFScalarQuantizerCCCosine( size_t d, size_t nlist, size_t ssize, - ScalarQuantizer::QuantizerType qtype, + ::faiss::ScalarQuantizer::QuantizerType qtype, MetricType metric, bool by_residual, std::optional raw_data_prefix_path) @@ -153,7 +153,7 @@ void IndexIVFScalarQuantizerCCCosine::add_core( const float* base_x = x_normalized.get(); size_t nadd = 0; - std::unique_ptr squant(sq.select_quantizer()); + std::unique_ptr<::faiss::ScalarQuantizer::SQuantizer> squant(sq.select_quantizer()); DirectMapAdd dm_add(direct_map, n, xids); diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFScalarQuantizerCC.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFScalarQuantizerCC.h index b153ec1a4..9317d0942 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFScalarQuantizerCC.h +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFScalarQuantizerCC.h @@ -25,7 +25,7 @@ struct 
IndexIVFScalarQuantizerCC : IndexIVFScalarQuantizer { size_t d, size_t nlist, size_t ssize, - ScalarQuantizer::QuantizerType qtype, + ::faiss::ScalarQuantizer::QuantizerType qtype, MetricType metric = METRIC_L2, bool by_residual = false, std::optional raw_data_prefix_path = std::nullopt); @@ -57,7 +57,7 @@ struct IndexIVFScalarQuantizerCCCosine : IndexIVFScalarQuantizerCC, HasInverseL2 size_t d, size_t nlist, size_t ssize, - ScalarQuantizer::QuantizerType qtype, + ::faiss::ScalarQuantizer::QuantizerType qtype, MetricType metric = METRIC_L2, bool by_residual = false, std::optional raw_data_prefix_path = std::nullopt); diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexSQ4Uniform.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexSQ4Uniform.cpp index ec8cb0145..7c723b9ee 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexSQ4Uniform.cpp +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexSQ4Uniform.cpp @@ -21,7 +21,7 @@ #include #include -#include +#include "simd/hook.h" #include #include #include @@ -159,20 +159,20 @@ float WithSQ4UniformNormIPDistanceComputer::symmetric_dis(idx_t i, idx_t j) { IndexScalarQuantizer4bitUniformCosine::IndexScalarQuantizer4bitUniformCosine( int d) - : IndexScalarQuantizer( + : ::faiss::IndexScalarQuantizer( d, - ScalarQuantizer::QT_4bit_uniform, + ::faiss::ScalarQuantizer::QT_4bit_uniform, METRIC_INNER_PRODUCT) { - sq.rangestat = ScalarQuantizer::RS_quantiles; + sq.rangestat = ::faiss::ScalarQuantizer::RS_quantiles; sq.rangestat_arg = 0.01; } IndexScalarQuantizer4bitUniformCosine::IndexScalarQuantizer4bitUniformCosine() - : IndexScalarQuantizer() { + : ::faiss::IndexScalarQuantizer() { metric_type = METRIC_INNER_PRODUCT; - sq.rangestat = ScalarQuantizer::RS_quantiles; + sq.rangestat = ::faiss::ScalarQuantizer::RS_quantiles; sq.rangestat_arg = 0.01; } @@ -194,7 +194,7 @@ void IndexScalarQuantizer4bitUniformCosine::add(idx_t n, const float* x) { auto normalized_data = ::knowhere::CopyAndNormalizeVecs(x, n, d); // 
Add normalized data - IndexScalarQuantizer::add(n, normalized_data.get()); + ::faiss::IndexScalarQuantizer::add(n, normalized_data.get()); // Store inverse L2 norms from ORIGINAL vectors (not normalized) // This is needed for refine to work correctly with COSINE metric @@ -203,10 +203,19 @@ void IndexScalarQuantizer4bitUniformCosine::add(idx_t n, const float* x) { DistanceComputer* IndexScalarQuantizer4bitUniformCosine::get_distance_computer() const { - std::unique_ptr base_dc( - IndexScalarQuantizer::get_distance_computer()); - - return new SQ4UniformCosineDistanceComputer(d, std::move(base_dc)); + // The DC wrapper does `cosine = 1 - 0.5 * L2^2`, so the inner DC + // must compute L2^2 regardless of this index's metric_type (which is + // METRIC_INNER_PRODUCT because the index is semantically IP-based). + // Ask the SQ directly for an L2 DC rather than going through the + // baseline helper, which would pick a DC from `metric_type` and hand + // us an IP-computing DC. Pre-migration this was hidden by the fork's + // DistanceComputerSQ4UByte which always returned L2 regardless of + // Similarity. 
+ auto* base_dc = sq.get_distance_computer(METRIC_L2); + base_dc->code_size = sq.code_size; + base_dc->codes = codes.data(); + return new SQ4UniformCosineDistanceComputer( + d, std::unique_ptr(base_dc)); } const float* IndexScalarQuantizer4bitUniformCosine::get_inverse_l2_norms() @@ -215,7 +224,7 @@ const float* IndexScalarQuantizer4bitUniformCosine::get_inverse_l2_norms() } void IndexScalarQuantizer4bitUniformCosine::reset() { - IndexScalarQuantizer::reset(); + ::faiss::IndexScalarQuantizer::reset(); inverse_norms_storage.reset(); } @@ -224,20 +233,20 @@ void IndexScalarQuantizer4bitUniformCosine::reset() { ////////////////////////////////////////////////////////////////////////////////// IndexScalarQuantizer4bitUniformIP::IndexScalarQuantizer4bitUniformIP(int d) - : IndexScalarQuantizer( + : ::faiss::IndexScalarQuantizer( d, - ScalarQuantizer::QT_4bit_uniform, + ::faiss::ScalarQuantizer::QT_4bit_uniform, METRIC_INNER_PRODUCT) { } IndexScalarQuantizer4bitUniformIP::IndexScalarQuantizer4bitUniformIP() - : IndexScalarQuantizer() { + : ::faiss::IndexScalarQuantizer() { metric_type = METRIC_INNER_PRODUCT; } void IndexScalarQuantizer4bitUniformIP::add(idx_t n, const float* x) { FAISS_THROW_IF_NOT(is_trained); - IndexScalarQuantizer::add(n, x); + ::faiss::IndexScalarQuantizer::add(n, x); // Compute and store norms squared for IP distance computation for (idx_t i = 0; i < n; i++) { @@ -248,17 +257,22 @@ void IndexScalarQuantizer4bitUniformIP::add(idx_t n, const float* x) { } void IndexScalarQuantizer4bitUniformIP::reset() { - IndexScalarQuantizer::reset(); + ::faiss::IndexScalarQuantizer::reset(); l2_norms_sqr.clear(); } DistanceComputer* IndexScalarQuantizer4bitUniformIP::get_distance_computer() const { - std::unique_ptr base_dc( - IndexScalarQuantizer::get_distance_computer()); - + // See IndexScalarQuantizer4bitUniformCosine::get_distance_computer + // for why we force METRIC_L2 here. 
The wrapper DC does + // `IP = 0.5 * (||q||^2 + ||b||^2 - L2^2)`, which only holds if the + // inner DC actually returns L2^2. + auto* base_dc = sq.get_distance_computer(METRIC_L2); + base_dc->code_size = sq.code_size; + base_dc->codes = codes.data(); return new WithSQ4UniformNormIPDistanceComputer( - get_l2_norms_sqr(), d, std::move(base_dc)); + get_l2_norms_sqr(), d, + std::unique_ptr(base_dc)); } const float* IndexScalarQuantizer4bitUniformIP::get_l2_norms_sqr() const { @@ -274,11 +288,11 @@ IndexHNSWSQ4UniformCosine::IndexHNSWSQ4UniformCosine() : IndexHNSW() { IndexHNSWSQ4UniformCosine::IndexHNSWSQ4UniformCosine( int d, - ScalarQuantizer::QuantizerType qtype, + ::faiss::ScalarQuantizer::QuantizerType qtype, int M) : IndexHNSW(new IndexScalarQuantizer4bitUniformCosine(d), M) { FAISS_THROW_IF_NOT_MSG( - qtype == ScalarQuantizer::QT_4bit_uniform, + qtype == ::faiss::ScalarQuantizer::QT_4bit_uniform, "IndexHNSWSQ4UniformCosine only supports QT_4bit_uniform"); is_trained = this->storage->is_trained; @@ -299,11 +313,11 @@ IndexHNSWSQ4UniformIP::IndexHNSWSQ4UniformIP() : IndexHNSW() { IndexHNSWSQ4UniformIP::IndexHNSWSQ4UniformIP( int d, - ScalarQuantizer::QuantizerType qtype, + ::faiss::ScalarQuantizer::QuantizerType qtype, int M) : IndexHNSW(new IndexScalarQuantizer4bitUniformIP(d), M) { FAISS_THROW_IF_NOT_MSG( - qtype == ScalarQuantizer::QT_4bit_uniform, + qtype == ::faiss::ScalarQuantizer::QT_4bit_uniform, "IndexHNSWSQ4UniformIP only supports QT_4bit_uniform"); is_trained = this->storage->is_trained; diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexSQ4Uniform.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexSQ4Uniform.h index def5e0466..c94aae829 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexSQ4Uniform.h +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexSQ4Uniform.h @@ -23,7 +23,7 @@ #include #include -#include +#include #include namespace faiss { @@ -128,7 +128,7 @@ struct WithSQ4UniformNormIPDistanceComputer : DistanceComputer { 
* modifying caller's data. Query normalization handled by knowhere layer. * Implements HasInverseL2Norms. */ -struct IndexScalarQuantizer4bitUniformCosine : IndexScalarQuantizer, +struct IndexScalarQuantizer4bitUniformCosine : ::faiss::IndexScalarQuantizer, HasInverseL2Norms { L2NormsStorage inverse_norms_storage; @@ -155,7 +155,7 @@ struct IndexScalarQuantizer4bitUniformCosine : IndexScalarQuantizer, * Scalar Quantizer specialized for 4-bit uniform quantization with IP metric. * Stores L2 norms squared of vectors to convert L2^2 distances to IP. */ -struct IndexScalarQuantizer4bitUniformIP : IndexScalarQuantizer { +struct IndexScalarQuantizer4bitUniformIP : ::faiss::IndexScalarQuantizer { /// Storage for L2 norms squared (||x||^2) std::vector l2_norms_sqr; @@ -184,7 +184,7 @@ struct IndexHNSWSQ4UniformCosine : IndexHNSW, HasInverseL2Norms { IndexHNSWSQ4UniformCosine( int d, - ScalarQuantizer::QuantizerType qtype, + ::faiss::ScalarQuantizer::QuantizerType qtype, int M); const float* get_inverse_l2_norms() const override; @@ -193,7 +193,10 @@ struct IndexHNSWSQ4UniformCosine : IndexHNSW, HasInverseL2Norms { struct IndexHNSWSQ4UniformIP : IndexHNSW { IndexHNSWSQ4UniformIP(); - IndexHNSWSQ4UniformIP(int d, ScalarQuantizer::QuantizerType qtype, int M); + IndexHNSWSQ4UniformIP( + int d, + ::faiss::ScalarQuantizer::QuantizerType qtype, + int M); }; } diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexScaNN.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexScaNN.cpp index 7b53e575c..15deb73a3 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexScaNN.cpp +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexScaNN.cpp @@ -7,7 +7,6 @@ #include #include -#include #include #include #include diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexScalarQuantizer.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexScalarQuantizer.cpp index 281b464db..8c4a6abaf 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexScalarQuantizer.cpp +++ 
b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexScalarQuantizer.cpp @@ -11,116 +11,22 @@ #include #include +#include #include +#include #include #include #include -#include +#include +#include #include namespace faiss::cppcontrib::knowhere { -/******************************************************************* - * IndexScalarQuantizer implementation - ********************************************************************/ - -IndexScalarQuantizer::IndexScalarQuantizer( - int d, - ScalarQuantizer::QuantizerType qtype, - MetricType metric) - : IndexFlatCodes(0, d, metric), sq(d, qtype) { - if (qtype == ScalarQuantizer::QT_4bit_uniform && metric == METRIC_L2) { - sq.rangestat = ScalarQuantizer::RS_quantiles; - sq.rangestat_arg = 0.01; - } - is_trained = qtype == ScalarQuantizer::QT_fp16 || - qtype == ScalarQuantizer::QT_8bit_direct || - qtype == ScalarQuantizer::QT_bf16 || - qtype == ScalarQuantizer::QT_8bit_direct_signed || - qtype == ScalarQuantizer::QT_1bit_direct; - code_size = sq.code_size; -} - -IndexScalarQuantizer::IndexScalarQuantizer() - : IndexScalarQuantizer(0, ScalarQuantizer::QT_8bit) {} - -void IndexScalarQuantizer::train(idx_t n, const float* x) { - sq.train(n, x); - is_trained = true; -} - -void IndexScalarQuantizer::search( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels, - const SearchParameters* params) const { - const IDSelector* sel = params ? 
params->sel : nullptr; - - FAISS_THROW_IF_NOT(k > 0); - FAISS_THROW_IF_NOT(is_trained); - FAISS_THROW_IF_NOT( - metric_type == METRIC_L2 || metric_type == METRIC_INNER_PRODUCT); - -#pragma omp parallel - { - std::unique_ptr scanner( - sq.select_InvertedListScanner(metric_type, nullptr, true, sel)); - - scanner->list_no = 0; // directly the list number - -#pragma omp for - for (idx_t i = 0; i < n; i++) { - float* D = distances + k * i; - idx_t* I = labels + k * i; - // re-order heap - if (metric_type == METRIC_L2) { - maxheap_heapify(k, D, I); - } else { - minheap_heapify(k, D, I); - } - scanner->set_query(x + i * d); - size_t scan_cnt = 0; - scanner->scan_codes(ntotal, codes.data(), nullptr, nullptr, D, I, k, scan_cnt); - - // re-order heap - if (metric_type == METRIC_L2) { - maxheap_reorder(k, D, I); - } else { - minheap_reorder(k, D, I); - } - } - } -} - -FlatCodesDistanceComputer* IndexScalarQuantizer::get_FlatCodesDistanceComputer() - const { - ScalarQuantizer::SQDistanceComputer* dc = - sq.get_distance_computer(metric_type); - dc->code_size = sq.code_size; - dc->codes = codes.data(); - return dc; -} - -/* Codec interface */ - -void IndexScalarQuantizer::sa_encode(idx_t n, const float* x, uint8_t* bytes) - const { - FAISS_THROW_IF_NOT(is_trained); - sq.compute_codes(x, bytes, n); -} - -void IndexScalarQuantizer::sa_decode(idx_t n, const uint8_t* bytes, float* x) - const { - FAISS_THROW_IF_NOT(is_trained); - sq.decode(bytes, x, n); -} - /******************************************************************* * IndexIVFScalarQuantizer implementation ********************************************************************/ @@ -129,7 +35,7 @@ IndexIVFScalarQuantizer::IndexIVFScalarQuantizer( Index* quantizer, size_t d, size_t nlist, - ScalarQuantizer::QuantizerType qtype, + ::faiss::ScalarQuantizer::QuantizerType qtype, MetricType metric, bool by_residual) : IndexIVF(quantizer, d, nlist, 0, metric), sq(d, qtype) { @@ -161,7 +67,7 @@ void 
IndexIVFScalarQuantizer::encode_vectors( const idx_t* list_nos, uint8_t* codes, bool include_listnos) const { - std::unique_ptr squant(sq.select_quantizer()); + std::unique_ptr<::faiss::ScalarQuantizer::SQuantizer> squant(sq.select_quantizer()); size_t coarse_size = include_listnos ? coarse_code_size() : 0; memset(codes, 0, (code_size + coarse_size) * n); @@ -190,7 +96,7 @@ void IndexIVFScalarQuantizer::encode_vectors( void IndexIVFScalarQuantizer::sa_decode(idx_t n, const uint8_t* codes, float* x) const { - std::unique_ptr squant(sq.select_quantizer()); + std::unique_ptr<::faiss::ScalarQuantizer::SQuantizer> squant(sq.select_quantizer()); size_t coarse_size = coarse_code_size(); #pragma omp parallel if (n > 1000) @@ -222,7 +128,7 @@ void IndexIVFScalarQuantizer::add_core( void* inverted_list_context) { FAISS_THROW_IF_NOT(is_trained); - std::unique_ptr squant(sq.select_quantizer()); + std::unique_ptr<::faiss::ScalarQuantizer::SQuantizer> squant(sq.select_quantizer()); DirectMapAdd dm_add(direct_map, n, xids); @@ -262,12 +168,253 @@ void IndexIVFScalarQuantizer::add_core( ntotal += n; } +namespace { + +// Adapter scanners that implement the fork InvertedListScanner interface +// but delegate distance computation to a baseline SQDistanceComputer. +// Two variants are needed because the IP / L2 paths differ in how the +// coarse-centroid residual is folded into the distance: +// IP: dis = coarse_dis + dc.query_to_code(code) +// L2: the query is shifted into the centroid frame in set_list(), and +// the DC already produces the final L2 distance on every code. +// +// scan_cnt is a fork-side out-param that fork's own SQ scanners never +// increment (only IVFFlat/FastScan do), so we match that behavior and +// leave it untouched. 
+ +class BaselineIVFSQScannerIP : public InvertedListScanner { + public: + BaselineIVFSQScannerIP( + std::unique_ptr<::faiss::ScalarQuantizer::SQDistanceComputer> dc, + size_t code_size_in, + bool store_pairs_in, + const IDSelector* sel_in, + bool by_residual_in) + : dc_(std::move(dc)), by_residual_(by_residual_in) { + store_pairs = store_pairs_in; + sel = sel_in; + code_size = code_size_in; + keep_max = true; + } + + void set_query(const float* query) override { + dc_->set_query(query); + } + + void set_list(idx_t list_no_in, float coarse_dis) override { + this->list_no = list_no_in; + accu0_ = by_residual_ ? coarse_dis : 0.0f; + } + + float distance_to_code(const uint8_t* code) const override { + return accu0_ + dc_->query_to_code(code); + } + + size_t scan_codes( + size_t list_size, + const uint8_t* codes, + const float* /*code_norms*/, + const idx_t* ids, + float* simi, + idx_t* idxi, + size_t k, + size_t& /*scan_cnt*/) const override { + size_t nup = 0; + for (size_t j = 0; j < list_size; j++) { + if (!selector_accepts(j, ids)) { + continue; + } + float dis = accu0_ + dc_->query_to_code(codes + j * code_size); + if (dis > simi[0]) { + int64_t id = store_pairs ? 
lo_build(list_no, j) : ids[j]; + minheap_replace_top(k, simi, idxi, dis, id); + nup++; + } + } + return nup; + } + + void scan_codes_and_return( + size_t list_size, + const uint8_t* codes, + const float* /*code_norms*/, + const idx_t* ids, + std::vector<::knowhere::DistId>& out) const override { + for (size_t j = 0; j < list_size; j++) { + if (!selector_accepts(j, ids)) { + continue; + } + float dis = accu0_ + dc_->query_to_code(codes + j * code_size); + out.emplace_back(ids[j], dis); + } + } + + void scan_codes_range( + size_t list_size, + const uint8_t* codes, + const float* /*code_norms*/, + const idx_t* ids, + float radius, + RangeQueryResult& res) const override { + for (size_t j = 0; j < list_size; j++) { + if (!selector_accepts(j, ids)) { + continue; + } + float dis = accu0_ + dc_->query_to_code(codes + j * code_size); + if (dis > radius) { + int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; + res.add(dis, id); + } + } + } + + private: + bool selector_accepts(size_t j, const idx_t* ids) const { + if (!sel) { + return true; + } + return sel->is_member(store_pairs ? 
static_cast(j) : ids[j]); + } + + std::unique_ptr<::faiss::ScalarQuantizer::SQDistanceComputer> dc_; + bool by_residual_; + float accu0_ = 0.0f; +}; + +class BaselineIVFSQScannerL2 : public InvertedListScanner { + public: + BaselineIVFSQScannerL2( + std::unique_ptr<::faiss::ScalarQuantizer::SQDistanceComputer> dc, + int d_in, + size_t code_size_in, + const Index* quantizer_in, + bool store_pairs_in, + const IDSelector* sel_in, + bool by_residual_in) + : dc_(std::move(dc)), + by_residual_(by_residual_in), + quantizer_(quantizer_in), + tmp_(d_in) { + store_pairs = store_pairs_in; + sel = sel_in; + code_size = code_size_in; + keep_max = false; + } + + void set_query(const float* query) override { + x_ = query; + if (!by_residual_) { + dc_->set_query(query); + } + } + + void set_list(idx_t list_no_in, float /*coarse_dis*/) override { + this->list_no = list_no_in; + if (by_residual_) { + quantizer_->compute_residual(x_, tmp_.data(), list_no_in); + dc_->set_query(tmp_.data()); + } + } + + float distance_to_code(const uint8_t* code) const override { + return dc_->query_to_code(code); + } + + size_t scan_codes( + size_t list_size, + const uint8_t* codes, + const float* /*code_norms*/, + const idx_t* ids, + float* simi, + idx_t* idxi, + size_t k, + size_t& /*scan_cnt*/) const override { + size_t nup = 0; + for (size_t j = 0; j < list_size; j++) { + if (!selector_accepts(j, ids)) { + continue; + } + float dis = dc_->query_to_code(codes + j * code_size); + if (dis < simi[0]) { + int64_t id = store_pairs ? 
lo_build(list_no, j) : ids[j]; + maxheap_replace_top(k, simi, idxi, dis, id); + nup++; + } + } + return nup; + } + + void scan_codes_and_return( + size_t list_size, + const uint8_t* codes, + const float* /*code_norms*/, + const idx_t* ids, + std::vector<::knowhere::DistId>& out) const override { + for (size_t j = 0; j < list_size; j++) { + if (!selector_accepts(j, ids)) { + continue; + } + float dis = dc_->query_to_code(codes + j * code_size); + out.emplace_back(ids[j], dis); + } + } + + void scan_codes_range( + size_t list_size, + const uint8_t* codes, + const float* /*code_norms*/, + const idx_t* ids, + float radius, + RangeQueryResult& res) const override { + for (size_t j = 0; j < list_size; j++) { + if (!selector_accepts(j, ids)) { + continue; + } + float dis = dc_->query_to_code(codes + j * code_size); + if (dis < radius) { + int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; + res.add(dis, id); + } + } + } + + private: + bool selector_accepts(size_t j, const idx_t* ids) const { + if (!sel) { + return true; + } + return sel->is_member(store_pairs ? 
static_cast(j) : ids[j]); + } + + std::unique_ptr<::faiss::ScalarQuantizer::SQDistanceComputer> dc_; + bool by_residual_; + const Index* quantizer_; + const float* x_ = nullptr; + std::vector tmp_; +}; + +} // namespace + InvertedListScanner* IndexIVFScalarQuantizer::get_InvertedListScanner( bool store_pairs, const IDSelector* sel, const IVFSearchParameters*) const { - return sq.select_InvertedListScanner( - metric_type, quantizer, store_pairs, sel, by_residual); + FAISS_THROW_IF_NOT( + metric_type == METRIC_L2 || metric_type == METRIC_INNER_PRODUCT); + std::unique_ptr<::faiss::ScalarQuantizer::SQDistanceComputer> dc( + sq.get_distance_computer(metric_type)); + if (metric_type == METRIC_INNER_PRODUCT) { + return new BaselineIVFSQScannerIP( + std::move(dc), code_size, store_pairs, sel, by_residual); + } + return new BaselineIVFSQScannerL2( + std::move(dc), + static_cast(d), + code_size, + quantizer, + store_pairs, + sel, + by_residual); } void IndexIVFScalarQuantizer::reconstruct_from_offset( diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexScalarQuantizer.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexScalarQuantizer.h index 085379097..dbaa358a3 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexScalarQuantizer.h +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexScalarQuantizer.h @@ -13,54 +13,14 @@ #include #include +#include #include #include -#include -#include - namespace faiss { namespace cppcontrib { namespace knowhere { -/** - * Flat index built on a scalar quantizer. - */ -struct IndexScalarQuantizer : IndexFlatCodes { - /// Used to encode the vectors - ScalarQuantizer sq; - - /** Constructor. 
- * - * @param d dimensionality of the input vectors - * @param M number of subquantizers - * @param nbits number of bit per subvector index - */ - IndexScalarQuantizer( - int d, - ScalarQuantizer::QuantizerType qtype, - MetricType metric = METRIC_L2); - - IndexScalarQuantizer(); - - void train(idx_t n, const float* x) override; - - void search( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels, - const SearchParameters* params = nullptr) const override; - - FlatCodesDistanceComputer* get_FlatCodesDistanceComputer() const override; - - /* standalone codec interface */ - void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override; - - void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override; -}; - /** An IVF implementation where the components of the residuals are * encoded with a scalar quantizer. All distance computations * are asymmetric, so the encoded vectors are decoded and approximate @@ -68,13 +28,20 @@ struct IndexScalarQuantizer : IndexFlatCodes { */ struct IndexIVFScalarQuantizer : IndexIVF { - ScalarQuantizer sq; + // Baseline scalar quantizer value-type. Fork IVF still inherits + // from fork IndexIVF (needed for ConcurrentArrayInvertedLists, + // extended search params, and the 8-arg scan_codes interface), but + // the SQ state itself is the upstream struct and the scanner + // returned from get_InvertedListScanner is a fork-interface adapter + // that forwards distance computation to a baseline + // SQDistanceComputer. 
+ ::faiss::ScalarQuantizer sq; IndexIVFScalarQuantizer( Index* quantizer, size_t d, size_t nlist, - ScalarQuantizer::QuantizerType qtype, + ::faiss::ScalarQuantizer::QuantizerType qtype, MetricType metric = METRIC_L2, bool by_residual = true); diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/RaBitQuantizer.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/RaBitQuantizer.cpp index 65bd3d9c6..350396987 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/RaBitQuantizer.cpp +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/RaBitQuantizer.cpp @@ -7,10 +7,10 @@ #include #include -#include #include #include #include +#include "simd/hook.h" diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizer.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizer.cpp deleted file mode 100644 index 055662bed..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizer.cpp +++ /dev/null @@ -1,210 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -// -*- c++ -*- - -#include - -#include -#include - -#include -#include - -#ifdef __SSE__ -#include -#endif - -#include -#include -#include - -#include -#include -#include -#include -#include - - - -namespace faiss::cppcontrib::knowhere { - -using QuantizerType = ScalarQuantizer::QuantizerType; -using RangeStat = ScalarQuantizer::RangeStat; -using SQDistanceComputer = ScalarQuantizer::SQDistanceComputer; - -/******************************************************************* - * ScalarQuantizer implementation - * - * The main source of complexity is to support combinations of 4 - * variants without incurring runtime tests or virtual function calls: - * - * - 4 / 8 bits per code component - * - uniform / non-uniform - * - IP / L2 distance search - * - scalar / AVX distance computation - * - * The appropriate Quantizer object is returned via select_quantizer - * that hides the template mess. - ********************************************************************/ - -#ifdef __AVX2__ -#ifdef __F16C__ -#define USE_F16C -#else -#warning \ - "Cannot enable AVX optimizations in scalar quantizer if -mf16c is not set as well" -#endif -#endif - -/******************************************************************* - * ScalarQuantizer implementation - ********************************************************************/ - -ScalarQuantizer::ScalarQuantizer(size_t d, QuantizerType qtype) - : Quantizer(d), qtype(qtype) { - set_derived_sizes(); -} - -ScalarQuantizer::ScalarQuantizer() {} - -void ScalarQuantizer::set_derived_sizes() { - switch (qtype) { - case QT_8bit: - case QT_8bit_uniform: - case QT_8bit_direct: - case QT_8bit_direct_signed: - code_size = d; - bits = 8; - break; - case QT_4bit: - case QT_4bit_uniform: - code_size = (d + 1) / 2; - bits = 4; - break; - case QT_6bit: - code_size = (d * 6 + 7) / 8; - bits = 6; - break; - case QT_fp16: - code_size = d * 2; - bits = 16; - break; - case QT_bf16: - code_size = d * 2; - bits = 16; - break; - case QT_1bit_direct: - 
code_size = (d + 7) / 8; - bits = 1; - break; - } -} - -void ScalarQuantizer::train(size_t n, const float* x) { - int bit_per_dim = qtype == QT_4bit_uniform ? 4 - : qtype == QT_4bit ? 4 - : qtype == QT_6bit ? 6 - : qtype == QT_8bit_uniform ? 8 - : qtype == QT_8bit ? 8 - : qtype == QT_1bit_direct ? 1 - : -1; - - switch (qtype) { - case QT_4bit_uniform: - case QT_8bit_uniform: - train_Uniform( - rangestat, - rangestat_arg, - n * d, - 1 << bit_per_dim, - x, - trained); - break; - case QT_4bit: - case QT_8bit: - case QT_6bit: - train_NonUniform( - rangestat, - rangestat_arg, - n, - d, - 1 << bit_per_dim, - x, - trained); - break; - case QT_fp16: - case QT_8bit_direct: - case QT_bf16: - case QT_8bit_direct_signed: - case QT_1bit_direct: - // no training necessary - break; - } -} - -ScalarQuantizer::SQuantizer* ScalarQuantizer::select_quantizer() const { - /* use hook to decide use AVX512 or not */ - return sq_sel_quantizer(qtype, d, trained); -} - -void ScalarQuantizer::compute_codes(const float* x, uint8_t* codes, size_t n) - const { - std::unique_ptr squant(select_quantizer()); - - memset(codes, 0, code_size * n); -#pragma omp parallel for if (n > 1) - for (int64_t i = 0; i < n; i++) - squant->encode_vector(x + i * d, codes + i * code_size); -} - -void ScalarQuantizer::decode(const uint8_t* codes, float* x, size_t n) const { - std::unique_ptr squant(select_quantizer()); - -#pragma omp parallel for if (n > 1) - for (int64_t i = 0; i < n; i++) - squant->decode_vector(codes + i * code_size, x + i * d); -} - -SQDistanceComputer* ScalarQuantizer::get_distance_computer( - MetricType metric) const { - FAISS_THROW_IF_NOT( - metric == METRIC_L2 || metric == METRIC_INNER_PRODUCT || - metric == METRIC_Hamming || metric == METRIC_Jaccard); - /* use hook to decide use AVX512 or not */ - if (metric == METRIC_Hamming) { - assert(qtype == QT_1bit_direct); - return sq_get_hamming_distance_computer(metric, qtype, d, trained); - } - if (metric == METRIC_Jaccard) { - assert(qtype == 
QT_1bit_direct); - return sq_get_jaccard_distance_computer(metric, qtype, d, trained); - } - return sq_get_distance_computer(metric, qtype, d, trained); -} - -/******************************************************************* - * IndexScalarQuantizer/IndexIVFScalarQuantizer scanner object - * - * It is an InvertedListScanner, but is designed to work with - * IndexScalarQuantizer as well. - ********************************************************************/ - -InvertedListScanner* ScalarQuantizer::select_InvertedListScanner( - MetricType mt, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool by_residual) const { - /* use hook to decide use AVX512 or not */ - return sq_sel_inv_list_scanner(mt, this, quantizer, d, store_pairs, - sel, by_residual); -} - -} - - diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizer.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizer.h deleted file mode 100644 index 43a2b900e..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizer.h +++ /dev/null @@ -1,146 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -// -*- c++ -*- - -#pragma once - -#include -#include -#include - -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -struct InvertedListScanner; - -/** - * The uniform quantizer has a range [vmin, vmax]. The range can be - * the same for all dimensions (uniform) or specific per dimension - * (default). 
- */ - -struct ScalarQuantizer : Quantizer { - enum QuantizerType { - QT_8bit, ///< 8 bits per component - QT_4bit, ///< 4 bits per component - QT_8bit_uniform, ///< same, shared range for all dimensions - QT_4bit_uniform, - QT_fp16, - QT_8bit_direct, ///< fast indexing of uint8s - QT_6bit, ///< 6 bits per component, - QT_bf16, - QT_8bit_direct_signed, ///< fast indexing of signed int8s ranging from - ///< [-128 to 127] - QT_1bit_direct, ///< fast indexing of 1 bit per component - }; - - QuantizerType qtype = QT_8bit; - - /** The uniform encoder can estimate the range of representable - * values of the unform encoder using different statistics. Here - * rs = rangestat_arg */ - - // rangestat_arg. - enum RangeStat { - RS_minmax, ///< [min - rs*(max-min), max + rs*(max-min)] - RS_meanstd, ///< [mean - std * rs, mean + std * rs] - RS_quantiles, ///< [Q(rs), Q(1-rs)] - RS_optim, ///< alternate optimization of reconstruction error - }; - - RangeStat rangestat = RS_minmax; - float rangestat_arg = 0; - - /// bits per scalar code - size_t bits = 0; - - /// trained values (including the range) - std::vector trained; - - ScalarQuantizer(size_t d, QuantizerType qtype); - ScalarQuantizer(); - - /// updates internal values based on qtype and d - void set_derived_sizes(); - - void train(size_t n, const float* x) override; - - /** Encode a set of vectors - * - * @param x vectors to encode, size n * d - * @param codes output codes, size n * code_size - */ - void compute_codes(const float* x, uint8_t* codes, size_t n) const override; - - /** Decode a set of vectors - * - * @param codes codes to decode, size n * code_size - * @param x output vectors, size n * d - */ - void decode(const uint8_t* code, float* x, size_t n) const override; - - /***************************************************** - * Objects that provide methods for encoding/decoding, distance - * computation and inverted list scanning - *****************************************************/ - - struct SQuantizer { - 
// encodes one vector. Assumes code is filled with 0s on input! - virtual void encode_vector(const float* x, uint8_t* code) const = 0; - virtual void decode_vector(const uint8_t* code, float* x) const = 0; - - virtual ~SQuantizer() {} - }; - - SQuantizer* select_quantizer() const; - - struct SQDistanceComputer : FlatCodesDistanceComputer { - const float* q; - - SQDistanceComputer() : q(nullptr) {} - - virtual float query_to_code(const uint8_t* code) const = 0; - - float distance_to_code(const uint8_t* code) final { - return query_to_code(code); - } - - virtual void query_to_codes_batch_4( - const uint8_t* __restrict code_0, - const uint8_t* __restrict code_1, - const uint8_t* __restrict code_2, - const uint8_t* __restrict code_3, - float& dis0, - float& dis1, - float& dis2, - float& dis3 - ) const { - dis0 = this->query_to_code(code_0); - dis1 = this->query_to_code(code_1); - dis2 = this->query_to_code(code_2); - dis3 = this->query_to_code(code_3); - } - }; - - SQDistanceComputer* get_distance_computer( - MetricType metric = METRIC_L2) const; - - InvertedListScanner* select_InvertedListScanner( - MetricType mt, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool by_residual = false) const; -}; - -} -} -} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec.h deleted file mode 100644 index e848f16fe..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec.h +++ /dev/null @@ -1,979 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -using QuantizerType = ScalarQuantizer::QuantizerType; -using RangeStat = ScalarQuantizer::RangeStat; -using SQDistanceComputer = ScalarQuantizer::SQDistanceComputer; -using SQuantizer = ScalarQuantizer::SQuantizer; - -/******************************************************************* - * Codec: converts between values in [0, 1] and an index in a code - * array. The "i" parameter is the vector component index (not byte - * index). - */ - -struct Codec8bit { - static FAISS_ALWAYS_INLINE void encode_component( - float x, - uint8_t* code, - int i) { - code[i] = (int)(255 * x); - } - - static FAISS_ALWAYS_INLINE float decode_component( - const uint8_t* code, - int i) { - return (code[i] + 0.5f) / 255.0f; - } -}; - -struct Codec4bit { - static FAISS_ALWAYS_INLINE void encode_component( - float x, - uint8_t* code, - int i) { - code[i / 2] |= (int)(x * 15.0) << ((i & 1) << 2); - } - - static FAISS_ALWAYS_INLINE float decode_component( - const uint8_t* code, - int i) { - return (((code[i / 2] >> ((i & 1) << 2)) & 0xf) + 0.5f) / 15.0f; - } -}; - -struct Codec6bit { - static FAISS_ALWAYS_INLINE void encode_component( - float x, - uint8_t* code, - int i) { - int bits = (int)(x * 63.0); - code += (i >> 2) * 3; - switch (i & 3) { - case 0: - code[0] |= bits; - break; - case 1: - code[0] |= bits << 6; - code[1] |= bits >> 2; - break; - case 2: - code[1] |= bits << 4; - code[2] |= bits >> 4; - break; - case 3: - code[2] |= bits << 2; - break; - } - } - - static FAISS_ALWAYS_INLINE float decode_component( - const uint8_t* code, - int i) { - uint8_t bits = 0x00; - code += (i >> 2) * 3; - switch (i & 3) { - case 0: - bits = code[0] & 0x3f; - break; - case 1: - bits = code[0] >> 6; - bits |= (code[1] & 0xf) << 2; - break; - case 2: - bits = code[1] >> 4; - bits |= (code[2] & 3) << 4; - 
break; - case 3: - bits = code[2] >> 2; - break; - } - return (bits + 0.5f) / 63.0f; - } -}; - -/******************************************************************* - * Quantizer: normalizes scalar vector components, then passes them - * through a codec - *******************************************************************/ - -enum class QuantizerTemplateScaling { - UNIFORM = 0, - NON_UNIFORM = 1 -}; - -template -struct QuantizerTemplate {}; - -template -struct QuantizerTemplate : SQuantizer { - const size_t d; - const float vmin, vdiff; - - QuantizerTemplate(size_t d, const std::vector& trained) - : d(d), vmin(trained[0]), vdiff(trained[1]) {} - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - float xi = 0; - if (vdiff != 0) { - xi = (x[i] - vmin) / vdiff; - if (xi < 0) { - xi = 0; - } - if (xi > 1.0) { - xi = 1.0; - } - } - Codec::encode_component(xi, code, i); - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - float xi = Codec::decode_component(code, i); - x[i] = vmin + xi * vdiff; - } - } - - FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) - const { - float xi = Codec::decode_component(code, i); - return vmin + xi * vdiff; - } -}; - -template -struct QuantizerTemplate : SQuantizer { - const size_t d; - const float *vmin, *vdiff; - - QuantizerTemplate(size_t d, const std::vector& trained) - : d(d), vmin(trained.data()), vdiff(trained.data() + d) {} - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - float xi = 0; - if (vdiff[i] != 0) { - xi = (x[i] - vmin[i]) / vdiff[i]; - if (xi < 0) { - xi = 0; - } - if (xi > 1.0) { - xi = 1.0; - } - } - Codec::encode_component(xi, code, i); - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - float xi = Codec::decode_component(code, i); - x[i] = vmin[i] + xi * vdiff[i]; - } 
- } - - FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) - const { - float xi = Codec::decode_component(code, i); - return vmin[i] + xi * vdiff[i]; - } -}; - -/******************************************************************* - * FP16 quantizer - *******************************************************************/ - -template -struct QuantizerFP16 {}; - -template <> -struct QuantizerFP16<1> : SQuantizer { - const size_t d; - - QuantizerFP16(size_t d, const std::vector& /* unused */) : d(d) {} - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - ((uint16_t*)code)[i] = encode_fp16(x[i]); - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - x[i] = decode_fp16(((uint16_t*)code)[i]); - } - } - - FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) - const { - return decode_fp16(((uint16_t*)code)[i]); - } -}; - -/******************************************************************* - * BF16 quantizer - *******************************************************************/ - -template -struct QuantizerBF16 {}; - -template <> -struct QuantizerBF16<1> : ScalarQuantizer::SQuantizer { - const size_t d; - - QuantizerBF16(size_t d, const std::vector& /* unused */) : d(d) {} - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - ((uint16_t*)code)[i] = encode_bf16(x[i]); - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - x[i] = decode_bf16(((uint16_t*)code)[i]); - } - } - - FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) - const { - return decode_bf16(((uint16_t*)code)[i]); - } -}; - -/******************************************************************* - * Specialized QuantizerTemplate for SQ4U (base version) - *******************************************************************/ - 
-template <> -struct QuantizerTemplate - : SQuantizer { - const size_t d; - const float vmin, vdiff; - float final_scale; - float final_bias; - - QuantizerTemplate(size_t d, const std::vector& trained) - : d(d), vmin(trained[0]), vdiff(trained[1]) { - final_scale = vdiff / 15.0f; - final_bias = vmin + vdiff * 0.5f / 15.0f; - } - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - float xi = 0; - if (vdiff != 0) { - xi = (x[i] - vmin) / vdiff; - if (xi < 0) { - xi = 0; - } - if (xi > 1.0) { - xi = 1.0; - } - } - Codec4bit::encode_component(xi, code, i); - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - float xi = Codec4bit::decode_component(code, i); - x[i] = vmin + xi * vdiff; - } - } - - FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) - const { - float xi = Codec4bit::decode_component(code, i); - return vmin + xi * vdiff; - } -}; - -/******************************************************************* - * 8bit_direct quantizer - *******************************************************************/ - -template -struct Quantizer8bitDirect {}; - -template <> -struct Quantizer8bitDirect<1> : SQuantizer { - const size_t d; - - Quantizer8bitDirect(size_t d, const std::vector& /* unused */) - : d(d) {} - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - code[i] = (uint8_t)x[i]; - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - x[i] = code[i]; - } - } - - FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) - const { - return code[i]; - } -}; - -/******************************************************************* - * 8bit_direct_signed quantizer - *******************************************************************/ - -template -struct Quantizer8bitDirectSigned {}; - -template <> -struct 
Quantizer8bitDirectSigned<1> : ScalarQuantizer::SQuantizer { - const size_t d; - - Quantizer8bitDirectSigned(size_t d, const std::vector& /* unused */) - : d(d) {} - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - code[i] = (uint8_t)(x[i] + 128); - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - x[i] = code[i] - 128; - } - } - - FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) - const { - return code[i] - 128; - } -}; - -/******************************************************************* - * 1bit_direct quantizer - * - * Note: The 1bit_direct quantizer currently does not support the - *`reconstruct_component` method and does not provide SIMDWIDTH support. - *******************************************************************/ - -struct Quantizer1bitDirect : SQuantizer { - const size_t d; - - Quantizer1bitDirect(size_t d, const std::vector& /* unused */) - : d(d) {} - - void encode_vector(const float* x, uint8_t* code) const final { - size_t code_size = (d + 7) / 8; - for (size_t i = 0; i < code_size; i++) { - code[i] = (uint8_t)x[i]; - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - size_t code_size = (d + 7) / 8; - for (size_t i = 0; i < code_size; i++) { - x[i] = (float)code[i]; - } - } -}; - -template -SQuantizer* select_quantizer_1( - QuantizerType qtype, - size_t d, - const std::vector& trained) { - switch (qtype) { - case ScalarQuantizer::QT_8bit: - return new QuantizerTemplate( - d, trained); - case ScalarQuantizer::QT_6bit: - return new QuantizerTemplate( - d, trained); - case ScalarQuantizer::QT_4bit: - return new QuantizerTemplate( - d, trained); - case ScalarQuantizer::QT_8bit_uniform: - return new QuantizerTemplate( - d, trained); - case ScalarQuantizer::QT_4bit_uniform: - return new QuantizerTemplate( - d, trained); - case ScalarQuantizer::QT_fp16: - return new QuantizerFP16(d, 
trained); - case ScalarQuantizer::QT_bf16: - return new QuantizerBF16(d, trained); - case ScalarQuantizer::QT_8bit_direct: - return new Quantizer8bitDirect(d, trained); - case ScalarQuantizer::QT_8bit_direct_signed: - return new Quantizer8bitDirectSigned(d, trained); - case ScalarQuantizer::QT_1bit_direct: - return new Quantizer1bitDirect(d, trained); - } - FAISS_THROW_MSG("unknown qtype"); -} - -/******************************************************************* - * DistanceComputerSQ4UByte: specialized distance computer for SQ4U - * Always computes L2 distance in quantized space regardless of Similarity - *******************************************************************/ - -template -struct DistanceComputerSQ4UByte : SQDistanceComputer { - using Quantizer = - QuantizerTemplate; - Quantizer quant; - - // Quantized query codes - uint8_t* q_codes; - - DistanceComputerSQ4UByte(size_t d, const std::vector& trained) - : quant(d, trained) { - q_codes = new uint8_t[(d + 1) / 2]; - } - - ~DistanceComputerSQ4UByte() { - delete[] q_codes; - } - - void set_query(const float* x) override { - // Quantize query to 4-bit codes - // Database layout: low nibble = even index, high nibble = odd index - float inv_scale = 1.0f / quant.final_scale; - float offset = quant.vmin; - - for (size_t i = 0; i < quant.d; i += 2) { - // Quantize first component (even index -> low nibble) - float val0 = (x[i] - offset) * inv_scale; - int q0 = static_cast(std::floor(val0)); - q0 = std::max(0, std::min(15, q0)); - - // Quantize second component (odd index -> high nibble) - int q1 = 0; - if (i + 1 < quant.d) { - float val1 = (x[i + 1] - offset) * inv_scale; - q1 = static_cast(std::floor(val1)); - q1 = std::max(0, std::min(15, q1)); - } - - // Pack: low nibble = q0 (even), high nibble = q1 (odd) - q_codes[i / 2] = q0 | (q1 << 4); - } - } - - // Compute L2 distance between query and database code - float compute_distance_l2(const uint8_t* code8) const { - int32_t accu = 0; - const uint8_t* qc = 
q_codes; - - for (size_t i = 0; i < quant.d; i += 2) { - uint8_t qbyte = *qc++; - uint8_t dbyte = *code8++; - - // Extract nibbles: low nibble = even index, high nibble = odd index - int q0 = qbyte & 15; // even (low nibble) - int q1 = qbyte >> 4; // odd (high nibble) - int d0 = dbyte & 15; // even (low nibble) - int d1 = dbyte >> 4; // odd (high nibble) - - // Compute differences - int diff0 = q0 - d0; - int diff1 = q1 - d1; - - // Accumulate squared differences - accu += diff0 * diff0 + diff1 * diff1; - } - - // Scale to floating point - float scale = quant.final_scale; - return accu * scale * scale; - } - - // Compute L2 distance between two codes - float compute_code_distance_l2(const uint8_t* code1, const uint8_t* code2) - const { - int32_t accu = 0; - - for (size_t i = 0; i < quant.d; i += 2) { - uint8_t byte1 = *code1++; - uint8_t byte2 = *code2++; - - // Extract nibbles: low nibble = even index, high nibble = odd index - int c1_0 = byte1 & 15; // even (low nibble) - int c1_1 = byte1 >> 4; // odd (high nibble) - int c2_0 = byte2 & 15; // even (low nibble) - int c2_1 = byte2 >> 4; // odd (high nibble) - - // Compute differences - int diff0 = c1_0 - c2_0; - int diff1 = c1_1 - c2_1; - - // Accumulate squared differences - accu += diff0 * diff0 + diff1 * diff1; - } - - // Scale to floating point - float scale = quant.final_scale; - return accu * scale * scale; - } - - float query_to_code(const uint8_t* code) const override { - return compute_distance_l2(code); - } - - float symmetric_dis(idx_t i, idx_t j) override { - const uint8_t* code_i = codes + i * code_size; - const uint8_t* code_j = codes + j * code_size; - return compute_code_distance_l2(code_i, code_j); - } -}; - -/******************************************************************* - * Similarity: gets vector components and computes a similarity wrt. a - * query vector stored in the object. The data fields just encapsulate - * an accumulator. 
- */ - -template -struct SimilarityL2 {}; - -template <> -struct SimilarityL2<1> { - static constexpr int simdwidth = 1; - static constexpr MetricType metric_type = METRIC_L2; - - const float *y, *yi; - - explicit SimilarityL2(const float* y) : y(y) {} - - /******* scalar accumulator *******/ - - float accu; - - FAISS_ALWAYS_INLINE void begin() { - accu = 0; - yi = y; - } - - FAISS_ALWAYS_INLINE void add_component(float x) { - float tmp = *yi++ - x; - accu += tmp * tmp; - } - - FAISS_ALWAYS_INLINE void add_component_2(float x1, float x2) { - float tmp = x1 - x2; - accu += tmp * tmp; - } - - FAISS_ALWAYS_INLINE float result() { - return accu; - } -}; - -template -struct SimilarityIP {}; - -template <> -struct SimilarityIP<1> { - static constexpr int simdwidth = 1; - static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; - const float *y, *yi; - - float accu; - - explicit SimilarityIP(const float* y) : y(y) {} - - FAISS_ALWAYS_INLINE void begin() { - accu = 0; - yi = y; - } - - FAISS_ALWAYS_INLINE void add_component(float x) { - accu += *yi++ * x; - } - - FAISS_ALWAYS_INLINE void add_component_2(float x1, float x2) { - accu += x1 * x2; - } - - FAISS_ALWAYS_INLINE float result() { - return accu; - } -}; - -/******************************************************************* - * DistanceComputer: combines a similarity and a quantizer to do - * code-to-vector or code-to-code comparisons - *******************************************************************/ - -template -struct DCTemplate : SQDistanceComputer {}; - -template -struct DCTemplate : SQDistanceComputer { - using Sim = Similarity; - - Quantizer quant; - - DCTemplate(size_t d, const std::vector& trained) - : quant(d, trained) {} - - float compute_distance(const float* x, const uint8_t* code) const { - Similarity sim(x); - sim.begin(); - for (size_t i = 0; i < quant.d; i++) { - float xi = quant.reconstruct_component(code, i); - sim.add_component(xi); - } - return sim.result(); - } - - float 
compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - Similarity sim(nullptr); - sim.begin(); - for (size_t i = 0; i < quant.d; i++) { - float x1 = quant.reconstruct_component(code1, i); - float x2 = quant.reconstruct_component(code2, i); - sim.add_component_2(x1, x2); - } - return sim.result(); - } - - void set_query(const float* x) final { - q = x; - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const override final { - return compute_distance(q, code); - } -}; - -/******************************************************************* - * DistanceComputerByte: computes distances in the integer domain - *******************************************************************/ - -template -struct DistanceComputerByte : SQDistanceComputer {}; - -template -struct DistanceComputerByte : SQDistanceComputer { - using Sim = Similarity; - - int d; - std::vector tmp; - - DistanceComputerByte(int d, const std::vector&) : d(d), tmp(d) {} - - int compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - int accu = 0; - for (int i = 0; i < d; i++) { - if (Sim::metric_type == METRIC_INNER_PRODUCT) { - accu += int(code1[i]) * code2[i]; - } else { - int diff = int(code1[i]) - code2[i]; - accu += diff * diff; - } - } - return accu; - } - - void set_query(const float* x) final { - for (int i = 0; i < d; i++) { - tmp[i] = int(x[i]); - } - } - - int compute_distance(const float* x, const uint8_t* code) { - set_query(x); - return compute_code_distance(tmp.data(), code); - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const override final { - return compute_code_distance(tmp.data(), code); - } -}; - -/******************************************************************* - * 
select_distance_computer: runtime selection of template - * specialization - *******************************************************************/ - -template -SQDistanceComputer* select_distance_computer( - QuantizerType qtype, - size_t d, - const std::vector& trained) { - constexpr int SIMDWIDTH = Sim::simdwidth; - switch (qtype) { - case ScalarQuantizer::QT_8bit_uniform: - return new DCTemplate< - QuantizerTemplate, - Sim, - SIMDWIDTH>(d, trained); - - case ScalarQuantizer::QT_4bit_uniform: - return new DistanceComputerSQ4UByte(d, trained); - - case ScalarQuantizer::QT_8bit: - return new DCTemplate< - QuantizerTemplate, - Sim, - SIMDWIDTH>(d, trained); - - case ScalarQuantizer::QT_6bit: - return new DCTemplate< - QuantizerTemplate, - Sim, - SIMDWIDTH>(d, trained); - - case ScalarQuantizer::QT_4bit: - return new DCTemplate< - QuantizerTemplate, - Sim, - SIMDWIDTH>(d, trained); - - case ScalarQuantizer::QT_fp16: - return new DCTemplate, Sim, SIMDWIDTH>( - d, trained); - - case ScalarQuantizer::QT_bf16: - return new DCTemplate, Sim, SIMDWIDTH>( - d, trained); - - case ScalarQuantizer::QT_8bit_direct: - if (d % 16 == 0) { - return new DistanceComputerByte(d, trained); - } else { - return new DCTemplate< - Quantizer8bitDirect, - Sim, - SIMDWIDTH>(d, trained); - } - - case ScalarQuantizer::QT_8bit_direct_signed: - return new DCTemplate< - Quantizer8bitDirectSigned, - Sim, - SIMDWIDTH>(d, trained); - } - FAISS_THROW_MSG("unknown qtype"); - return nullptr; -} - -// This wrapper adapts Jaccard and Hamming binary computers to the -// SQDistanceComputer interface -template -struct BinarySQDistanceComputerWrapper : SQDistanceComputer { - BinaryComputerType binary_computer; - size_t code_size; - std::vector tmp; - - BinarySQDistanceComputerWrapper(size_t code_size, const std::vector&) - : code_size(code_size), tmp(code_size) {} - - void set_query(const float* x) final { - for (size_t i = 0; i < code_size; ++i) { - tmp[i] = (uint8_t)x[i]; - } - binary_computer.set(tmp.data(), 
code_size); - } - - float query_to_code(const uint8_t* code) const override final { - return binary_computer.compute(code); - } - - float symmetric_dis(idx_t i, idx_t j) override { - const uint8_t* code_i = codes + i * code_size; - const uint8_t* code_j = codes + j * code_size; - - BinaryComputerType temp_computer; - temp_computer.set(code_i, code_size); - return temp_computer.compute(code_j); - } -}; - -SQDistanceComputer* select_hamming_distance_computer( - size_t d, - const std::vector& trained); - -SQDistanceComputer* select_jaccard_distance_computer( - size_t d, - const std::vector& trained); - -template -InvertedListScanner* sel3_InvertedListScanner( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - if (DCClass::Sim::metric_type == METRIC_L2) { - return new IVFSQScannerL2( - sq->d, - sq->trained, - sq->code_size, - quantizer, - store_pairs, - sel, - r); - } else if (DCClass::Sim::metric_type == METRIC_INNER_PRODUCT) { - return new IVFSQScannerIP( - sq->d, sq->trained, sq->code_size, store_pairs, sel, r); - } else { - FAISS_THROW_MSG("unsupported metric type"); - } -} - -template -InvertedListScanner* sel2_InvertedListScanner( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - if (sel) { - if (store_pairs) { - return sel3_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); - } else { - return sel3_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); - } - } else { - return sel3_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); - } -} - -template -InvertedListScanner* sel12_InvertedListScanner( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - constexpr int SIMDWIDTH = Similarity::simdwidth; - using QuantizerClass = QuantizerTemplate; - using DCClass = DCTemplate; - return sel2_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); -} - 
-template -InvertedListScanner* sel1_InvertedListScanner( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - constexpr int SIMDWIDTH = Similarity::simdwidth; - switch (sq->qtype) { - case ScalarQuantizer::QT_8bit_uniform: - return sel12_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); - case ScalarQuantizer::QT_4bit_uniform: - return sel12_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); - case ScalarQuantizer::QT_8bit: - return sel12_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); - case ScalarQuantizer::QT_4bit: - return sel12_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); - case ScalarQuantizer::QT_6bit: - return sel12_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); - case ScalarQuantizer::QT_fp16: - return sel2_InvertedListScanner, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - case ScalarQuantizer::QT_bf16: - return sel2_InvertedListScanner, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - case ScalarQuantizer::QT_8bit_direct: - if (sq->d % 16 == 0) { - return sel2_InvertedListScanner< - DistanceComputerByte>( - sq, quantizer, store_pairs, sel, r); - } else { - return sel2_InvertedListScanner, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - } - case ScalarQuantizer::QT_8bit_direct_signed: - return sel2_InvertedListScanner, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - } - - FAISS_THROW_MSG("unknown qtype"); - return nullptr; -} - -template -InvertedListScanner* sel0_InvertedListScanner( - MetricType mt, - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool by_residual) { - if (mt == METRIC_L2) { - return sel1_InvertedListScanner>( - sq, quantizer, store_pairs, sel, by_residual); - } else if (mt == METRIC_INNER_PRODUCT) { - return sel1_InvertedListScanner>( - sq, quantizer, store_pairs, sel, by_residual); - } 
else { - FAISS_THROW_MSG("unsupported metric type"); - } -} - -} -} -} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec_avx.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec_avx.h deleted file mode 100644 index ef02f9df9..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec_avx.h +++ /dev/null @@ -1,1230 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include -#include -#include -#include - -#include -#include -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -using QuantizerType = ScalarQuantizer::QuantizerType; -using RangeStat = ScalarQuantizer::RangeStat; -using SQDistanceComputer = ScalarQuantizer::SQDistanceComputer; -using SQuantizer = ScalarQuantizer::SQuantizer; - -/******************************************************************* - * Codec: converts between values in [0, 1] and an index in a code - * array. The "i" parameter is the vector component index (not byte - * index). 
- */ - -struct Codec8bit_avx : public Codec8bit { - static FAISS_ALWAYS_INLINE __m256 - decode_8_components(const uint8_t* code, int i) { - const uint64_t c8 = *(uint64_t*)(code + i); - - const __m128i i8 = _mm_set1_epi64x(c8); - const __m256i i32 = _mm256_cvtepu8_epi32(i8); - const __m256 f8 = _mm256_cvtepi32_ps(i32); - const __m256 half_one_255 = _mm256_set1_ps(0.5f / 255.f); - const __m256 one_255 = _mm256_set1_ps(1.f / 255.f); - return _mm256_fmadd_ps(f8, one_255, half_one_255); - } -}; - -struct Codec4bit_avx : public Codec4bit { - static FAISS_ALWAYS_INLINE __m256 - decode_8_components(const uint8_t* code, int i) { - uint32_t c4 = *(uint32_t*)(code + (i >> 1)); - uint32_t mask = 0x0f0f0f0f; - uint32_t c4ev = c4 & mask; - uint32_t c4od = (c4 >> 4) & mask; - - // the 8 lower bytes of c8 contain the values - __m128i c8 = - _mm_unpacklo_epi8(_mm_set1_epi32(c4ev), _mm_set1_epi32(c4od)); - __m128i c4lo = _mm_cvtepu8_epi32(c8); - __m128i c4hi = _mm_cvtepu8_epi32(_mm_srli_si128(c8, 4)); - __m256i i8 = _mm256_castsi128_si256(c4lo); - i8 = _mm256_insertf128_si256(i8, c4hi, 1); - __m256 f8 = _mm256_cvtepi32_ps(i8); - __m256 half = _mm256_set1_ps(0.5f); - f8 = _mm256_add_ps(f8, half); - __m256 one_255 = _mm256_set1_ps(1.f / 15.f); - return _mm256_mul_ps(f8, one_255); - } - - static FAISS_ALWAYS_INLINE __m256i - decode_8_components_int(const uint8_t* code, int i) { - // Load 4 bytes containing 8 nibbles - uint32_t c4 = *(uint32_t*)(code + (i >> 1)); - uint32_t mask = 0x0f0f0f0f; - uint32_t c4ev = c4 & mask; // Even nibbles - uint32_t c4od = (c4 >> 4) & mask; // Odd nibbles - - // Interleave even and odd nibbles - __m128i c8 = - _mm_unpacklo_epi8(_mm_set1_epi32(c4ev), _mm_set1_epi32(c4od)); - - // Convert to 8x32-bit integers - __m128i c4lo = _mm_cvtepu8_epi32(c8); - __m128i c4hi = _mm_cvtepu8_epi32(_mm_srli_si128(c8, 4)); - __m256i result = _mm256_castsi128_si256(c4lo); - result = _mm256_insertf128_si256(result, c4hi, 1); - - return result; - } -}; - -struct Codec6bit_avx 
: public Codec6bit { - /* Load 6 bytes that represent 8 6-bit values, return them as a - * 8*32 bit vector register */ - static FAISS_ALWAYS_INLINE __m256i load6(const uint16_t* code16) { - const __m128i perm = _mm_set_epi8( - -1, 5, 5, 4, 4, 3, -1, 3, -1, 2, 2, 1, 1, 0, -1, 0); - const __m256i shifts = _mm256_set_epi32(2, 4, 6, 0, 2, 4, 6, 0); - - // load 6 bytes - __m128i c1 = - _mm_set_epi16(0, 0, 0, 0, 0, code16[2], code16[1], code16[0]); - - // put in 8 * 32 bits - __m128i c2 = _mm_shuffle_epi8(c1, perm); - __m256i c3 = _mm256_cvtepi16_epi32(c2); - - // shift and mask out useless bits - __m256i c4 = _mm256_srlv_epi32(c3, shifts); - __m256i c5 = _mm256_and_si256(_mm256_set1_epi32(63), c4); - return c5; - } - - static FAISS_ALWAYS_INLINE __m256 - decode_8_components(const uint8_t* code, int i) { - // // Faster code for Intel CPUs or AMD Zen3+, just keeping it here - // // for the reference, maybe, it becomes used oned day. - // const uint16_t* data16 = (const uint16_t*)(code + (i >> 2) * 3); - // const uint32_t* data32 = (const uint32_t*)data16; - // const uint64_t val = *data32 + ((uint64_t)data16[2] << 32); - // const uint64_t vext = _pdep_u64(val, 0x3F3F3F3F3F3F3F3FULL); - // const __m128i i8 = _mm_set1_epi64x(vext); - // const __m256i i32 = _mm256_cvtepi8_epi32(i8); - // const __m256 f8 = _mm256_cvtepi32_ps(i32); - // const __m256 half_one_255 = _mm256_set1_ps(0.5f / 63.f); - // const __m256 one_255 = _mm256_set1_ps(1.f / 63.f); - // return _mm256_fmadd_ps(f8, one_255, half_one_255); - - __m256i i8 = load6((const uint16_t*)(code + (i >> 2) * 3)); - __m256 f8 = _mm256_cvtepi32_ps(i8); - // this could also be done with bit manipulations but it is - // not obviously faster - const __m256 half_one_255 = _mm256_set1_ps(0.5f / 63.f); - const __m256 one_255 = _mm256_set1_ps(1.f / 63.f); - return _mm256_fmadd_ps(f8, one_255, half_one_255); - } -}; - -/******************************************************************* - * Quantizer: normalizes scalar vector 
components, then passes them - * through a codec - *******************************************************************/ - -template -struct QuantizerTemplate_avx {}; - -template -struct QuantizerTemplate_avx - : public QuantizerTemplate { - QuantizerTemplate_avx(size_t d, const std::vector& trained) - : QuantizerTemplate(d, trained) {} -}; - -template -struct QuantizerTemplate_avx - : public QuantizerTemplate { - QuantizerTemplate_avx(size_t d, const std::vector& trained) - : QuantizerTemplate(d, trained) {} - - FAISS_ALWAYS_INLINE __m256 - reconstruct_8_components(const uint8_t* code, int i) const { - __m256 xi = Codec::decode_8_components(code, i); - return _mm256_fmadd_ps( - xi, _mm256_set1_ps(this->vdiff), _mm256_set1_ps(this->vmin)); - } -}; - -template -struct QuantizerTemplate_avx - : public QuantizerTemplate { - QuantizerTemplate_avx(size_t d, const std::vector& trained) - : QuantizerTemplate(d, trained) {} -}; - -template -struct QuantizerTemplate_avx - : public QuantizerTemplate { - QuantizerTemplate_avx(size_t d, const std::vector& trained) - : QuantizerTemplate(d, trained) {} - - FAISS_ALWAYS_INLINE __m256 - reconstruct_8_components(const uint8_t* code, int i) const { - __m256 xi = Codec::decode_8_components(code, i); - return _mm256_fmadd_ps( - xi, - _mm256_loadu_ps(this->vdiff + i), - _mm256_loadu_ps(this->vmin + i)); - } -}; - -/******************************************************************* - * FP16 quantizer - *******************************************************************/ - -template -struct QuantizerFP16_avx {}; - -template <> -struct QuantizerFP16_avx<1> : public QuantizerFP16<1> { - QuantizerFP16_avx(size_t d, const std::vector& unused) - : QuantizerFP16<1>(d, unused) {} -}; - -template <> -struct QuantizerFP16_avx<8> : public QuantizerFP16<1> { - QuantizerFP16_avx(size_t d, const std::vector& trained) - : QuantizerFP16<1>(d, trained) {} - - FAISS_ALWAYS_INLINE __m256 - reconstruct_8_components(const uint8_t* code, int i) const { - 
__m128i codei = _mm_loadu_si128((const __m128i*)(code + 2 * i)); - return _mm256_cvtph_ps(codei); - } -}; - -/******************************************************************* - * BF16 quantizer - *******************************************************************/ - -template -struct QuantizerBF16_avx {}; - -template <> -struct QuantizerBF16_avx<1> : public QuantizerBF16<1> { - QuantizerBF16_avx(size_t d, const std::vector& unused) - : QuantizerBF16<1>(d, unused) {} -}; - -template <> -struct QuantizerBF16_avx<8> : public QuantizerBF16<1> { - QuantizerBF16_avx(size_t d, const std::vector& trained) - : QuantizerBF16<1>(d, trained) {} - - FAISS_ALWAYS_INLINE __m256 - reconstruct_8_components(const uint8_t* code, int i) const { - __m128i code_128i = _mm_loadu_si128((const __m128i*)(code + 2 * i)); - __m256i code_256i = _mm256_cvtepu16_epi32(code_128i); - code_256i = _mm256_slli_epi32(code_256i, 16); - return _mm256_castsi256_ps(code_256i); - } -}; - -/******************************************************************* - * Specialized QuantizerTemplate for SQ4U - *******************************************************************/ - -template <> -struct QuantizerTemplate_avx< - Codec4bit_avx, - QuantizerTemplateScaling::UNIFORM, - 8> - : public QuantizerTemplate< - Codec4bit_avx, - QuantizerTemplateScaling::UNIFORM, - 1> { - float final_scale; - float final_bias; - - QuantizerTemplate_avx(size_t d, const std::vector& trained) - : QuantizerTemplate< - Codec4bit_avx, - QuantizerTemplateScaling::UNIFORM, - 1>(d, trained) { - final_scale = this->vdiff / 15.0f; - final_bias = this->vmin + this->vdiff * 0.5f / 15.0f; - } - - FAISS_ALWAYS_INLINE __m256 - reconstruct_8_components(const uint8_t* code, int i) const { - __m256i nibbles = Codec4bit_avx::decode_8_components_int(code, i); - __m256 nibbles_f = _mm256_cvtepi32_ps(nibbles); - - return _mm256_fmadd_ps( - nibbles_f, - _mm256_set1_ps(final_scale), - _mm256_set1_ps(final_bias)); - } -}; - 
-/******************************************************************* - * 8bit_direct quantizer - *******************************************************************/ - -template -struct Quantizer8bitDirect_avx {}; - -template <> -struct Quantizer8bitDirect_avx<1> : public Quantizer8bitDirect<1> { - Quantizer8bitDirect_avx(size_t d, const std::vector& unused) - : Quantizer8bitDirect(d, unused) {} -}; - -template <> -struct Quantizer8bitDirect_avx<8> : public Quantizer8bitDirect<1> { - Quantizer8bitDirect_avx(size_t d, const std::vector& trained) - : Quantizer8bitDirect<1>(d, trained) {} - - FAISS_ALWAYS_INLINE __m256 - reconstruct_8_components(const uint8_t* code, int i) const { - __m128i x8 = _mm_loadl_epi64((__m128i*)(code + i)); // 8 * int8 - __m256i y8 = _mm256_cvtepu8_epi32(x8); // 8 * int32 - return _mm256_cvtepi32_ps(y8); // 8 * float32 - } -}; - -/******************************************************************* - * 8bit_direct_signed quantizer - *******************************************************************/ - -template -struct Quantizer8bitDirectSigned_avx {}; - -template <> -struct Quantizer8bitDirectSigned_avx<1> : public Quantizer8bitDirectSigned<1> { - Quantizer8bitDirectSigned_avx(size_t d, const std::vector& unused) - : Quantizer8bitDirectSigned(d, unused) {} -}; - -template <> -struct Quantizer8bitDirectSigned_avx<8> : public Quantizer8bitDirectSigned<1> { - Quantizer8bitDirectSigned_avx(size_t d, const std::vector& trained) - : Quantizer8bitDirectSigned<1>(d, trained) {} - - FAISS_ALWAYS_INLINE __m256 - reconstruct_8_components(const uint8_t* code, int i) const { - __m128i x8 = _mm_loadl_epi64((__m128i*)(code + i)); // 8 * int8 - __m256i y8 = _mm256_cvtepu8_epi32(x8); // 8 * int32 - __m256i c8 = _mm256_set1_epi32(128); - __m256i z8 = _mm256_sub_epi32(y8, c8); // subtract 128 from all lanes - return _mm256_cvtepi32_ps(z8); // 8 * float32 - } -}; - -template -SQuantizer* select_quantizer_1_avx( - QuantizerType qtype, - size_t d, - const 
std::vector& trained) { - switch (qtype) { - case QuantizerType::QT_8bit: - return new QuantizerTemplate_avx( - d, trained); - case QuantizerType::QT_6bit: - return new QuantizerTemplate_avx( - d, trained); - case QuantizerType::QT_4bit: - return new QuantizerTemplate_avx( - d, trained); - case QuantizerType::QT_8bit_uniform: - return new QuantizerTemplate_avx( - d, trained); - case QuantizerType::QT_4bit_uniform: - return new QuantizerTemplate_avx( - d, trained); - case QuantizerType::QT_fp16: - return new QuantizerFP16_avx(d, trained); - case QuantizerType::QT_bf16: - return new QuantizerBF16_avx(d, trained); - case QuantizerType::QT_8bit_direct: - return new Quantizer8bitDirect_avx(d, trained); - case QuantizerType::QT_8bit_direct_signed: - return new Quantizer8bitDirectSigned_avx(d, trained); - case QuantizerType::QT_1bit_direct: - // todo: add more SIMDWIDTH support for avx if needed - return new Quantizer1bitDirect(d, trained); - } - FAISS_THROW_MSG("unknown qtype"); -} - -/******************************************************************* - * Similarity: gets vector components and computes a similarity wrt. a - * query vector stored in the object. The data fields just encapsulate - * an accumulator. 
- */ - -template -struct SimilarityL2_avx {}; - -template <> -struct SimilarityL2_avx<1> : public SimilarityL2<1> { - static constexpr int simdwidth = 1; - static constexpr MetricType metric_type = METRIC_L2; - - explicit SimilarityL2_avx(const float* y) : SimilarityL2<1>(y) {} -}; - -template <> -struct SimilarityL2_avx<8> { - static constexpr int simdwidth = 8; - static constexpr MetricType metric_type = METRIC_L2; - - const float *y, *yi; - - explicit SimilarityL2_avx(const float* y) : y(y) {} - __m256 accu8; - - FAISS_ALWAYS_INLINE void begin_8() { - accu8 = _mm256_setzero_ps(); - yi = y; - } - - FAISS_ALWAYS_INLINE void add_8_components(__m256 x) { - __m256 yiv = _mm256_loadu_ps(yi); - yi += 8; - __m256 tmp = _mm256_sub_ps(yiv, x); - accu8 = _mm256_fmadd_ps(tmp, tmp, accu8); - } - - FAISS_ALWAYS_INLINE void add_8_components_2(__m256 x, __m256 y_2) { - __m256 tmp = _mm256_sub_ps(y_2, x); - accu8 = _mm256_fmadd_ps(tmp, tmp, accu8); - } - - FAISS_ALWAYS_INLINE float result_8() { - const __m128 sum = _mm_add_ps( - _mm256_castps256_ps128(accu8), _mm256_extractf128_ps(accu8, 1)); - const __m128 v0 = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2)); - const __m128 v1 = _mm_add_ps(sum, v0); - __m128 v2 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(0, 0, 0, 1)); - const __m128 v3 = _mm_add_ps(v1, v2); - return _mm_cvtss_f32(v3); - } -}; - -template -struct SimilarityIP_avx {}; - -template <> -struct SimilarityIP_avx<1> : public SimilarityIP<1> { - static constexpr int simdwidth = 1; - static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; - - explicit SimilarityIP_avx(const float* y) : SimilarityIP<1>(y) {} -}; - -template <> -struct SimilarityIP_avx<8> { - static constexpr int simdwidth = 8; - static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; - - const float *y, *yi; - - float accu; - - explicit SimilarityIP_avx(const float* y) : y(y) {} - - __m256 accu8; - - FAISS_ALWAYS_INLINE void begin_8() { - accu8 = _mm256_setzero_ps(); - yi = y; - } - - 
FAISS_ALWAYS_INLINE void add_8_components(__m256 x) { - __m256 yiv = _mm256_loadu_ps(yi); - yi += 8; - accu8 = _mm256_fmadd_ps(yiv, x, accu8); - } - - FAISS_ALWAYS_INLINE void add_8_components_2(__m256 x1, __m256 x2) { - accu8 = _mm256_fmadd_ps(x1, x2, accu8); - } - - FAISS_ALWAYS_INLINE float result_8() { - const __m128 sum = _mm_add_ps( - _mm256_castps256_ps128(accu8), _mm256_extractf128_ps(accu8, 1)); - const __m128 v0 = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2)); - const __m128 v1 = _mm_add_ps(sum, v0); - __m128 v2 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(0, 0, 0, 1)); - const __m128 v3 = _mm_add_ps(v1, v2); - return _mm_cvtss_f32(v3); - } -}; - -/******************************************************************* - * SQ4U specialized distance computer (AVX2 version) - *******************************************************************/ - -template -struct DistanceComputerSQ4UByte_avx : SQDistanceComputer { - using Quantizer = QuantizerTemplate_avx< - Codec4bit_avx, - QuantizerTemplateScaling::UNIFORM, - 8>; - using Sim = Similarity; - - Quantizer quant; - std::vector q_lo; - std::vector q_hi; - float final_scale_sq; - - DistanceComputerSQ4UByte_avx(size_t d, const std::vector& trained) - : quant(d, trained), - q_lo((d + 1) / 2 + 32, 0), - q_hi((d + 1) / 2 + 32, 0) { - final_scale_sq = quant.final_scale * quant.final_scale; - } - - void set_query(const float* x) final { - float inv_scale = 1.0f / quant.final_scale; - float offset = quant.vmin; - - for (size_t i = 0; i < quant.d; i++) { - float val = (x[i] - offset) * inv_scale; - int code = (int)std::floor(val); - if (code < 0) - code = 0; - if (code > 15) - code = 15; - - if (i % 2 == 0) { - q_lo[i / 2] = (uint8_t)code; - } else { - q_hi[i / 2] = (uint8_t)code; - } - } - } - - // Only computes L2 distance - float compute_distance(const float* x, const uint8_t* code) const { - return compute_distance_l2(code); - } - - float compute_distance_l2(const uint8_t* code) const { - const size_t d = quant.d; - const 
uint8_t* q_lo_ptr = q_lo.data(); - const uint8_t* q_hi_ptr = q_hi.data(); - - __m256i acc = _mm256_setzero_si256(); - const __m256i mask_f = _mm256_set1_epi8(0xF); - const __m256i one = _mm256_set1_epi16(1); - - size_t i = 0; - // Process 64 dimensions per iteration (32 bytes = 64 nibbles) - for (; i + 64 <= d; i += 64) { - __m256i c256 = _mm256_loadu_si256((const __m256i*)(code + i / 2)); - - __m256i nibbles_lo = _mm256_and_si256(c256, mask_f); - __m256i nibbles_hi = - _mm256_and_si256(_mm256_srli_epi16(c256, 4), mask_f); - - __m256i q_lo_vec = - _mm256_loadu_si256((const __m256i*)(q_lo_ptr + i / 2)); - __m256i q_hi_vec = - _mm256_loadu_si256((const __m256i*)(q_hi_ptr + i / 2)); - - // Compute absolute differences - __m256i diff_lo = _mm256_sub_epi8(q_lo_vec, nibbles_lo); - __m256i diff_hi = _mm256_sub_epi8(q_hi_vec, nibbles_hi); - - // AVX2 doesn't have _mm256_abs_epi8, so we use max(x, -x) - diff_lo = _mm256_max_epi8( - diff_lo, _mm256_sub_epi8(_mm256_setzero_si256(), diff_lo)); - diff_hi = _mm256_max_epi8( - diff_hi, _mm256_sub_epi8(_mm256_setzero_si256(), diff_hi)); - - // Square using maddubs: treats input as unsigned bytes - __m256i sq_lo = _mm256_maddubs_epi16(diff_lo, diff_lo); - __m256i sq_hi = _mm256_maddubs_epi16(diff_hi, diff_hi); - - // Accumulate to 32-bit - __m256i sum_lo = _mm256_madd_epi16(sq_lo, one); - __m256i sum_hi = _mm256_madd_epi16(sq_hi, one); - - acc = _mm256_add_epi32(acc, sum_lo); - acc = _mm256_add_epi32(acc, sum_hi); - } - - // Horizontal reduction of acc - __m128i acc_lo = _mm256_castsi256_si128(acc); - __m128i acc_hi = _mm256_extracti128_si256(acc, 1); - acc_lo = _mm_add_epi32(acc_lo, acc_hi); - acc_lo = _mm_hadd_epi32(acc_lo, acc_lo); - acc_lo = _mm_hadd_epi32(acc_lo, acc_lo); - int32_t sum = _mm_cvtsi128_si32(acc_lo); - - // Handle remaining dimensions scalar - for (; i < d; i++) { - uint8_t c = code[i / 2]; - uint8_t nibble; - if (i % 2 == 0) { - nibble = c & 0xF; - } else { - nibble = (c >> 4) & 0xF; - } - - int diff; - if (i % 
2 == 0) { - diff = (int)q_lo[i / 2] - (int)nibble; - } else { - diff = (int)q_hi[i / 2] - (int)nibble; - } - sum += diff * diff; - } - - return sum * final_scale_sq; - } - - float compute_code_distance_l2(const uint8_t* code1, const uint8_t* code2) - const { - const size_t d = quant.d; - __m256i acc = _mm256_setzero_si256(); - const __m256i mask_f = _mm256_set1_epi8(0xF); - const __m256i one = _mm256_set1_epi16(1); - - size_t i = 0; - for (; i + 64 <= d; i += 64) { - __m256i c1_256 = - _mm256_loadu_si256((const __m256i*)(code1 + i / 2)); - __m256i c2_256 = - _mm256_loadu_si256((const __m256i*)(code2 + i / 2)); - - __m256i c1_nibbles_lo = _mm256_and_si256(c1_256, mask_f); - __m256i c1_nibbles_hi = - _mm256_and_si256(_mm256_srli_epi16(c1_256, 4), mask_f); - - __m256i c2_nibbles_lo = _mm256_and_si256(c2_256, mask_f); - __m256i c2_nibbles_hi = - _mm256_and_si256(_mm256_srli_epi16(c2_256, 4), mask_f); - - __m256i diff_lo = _mm256_sub_epi8(c1_nibbles_lo, c2_nibbles_lo); - __m256i diff_hi = _mm256_sub_epi8(c1_nibbles_hi, c2_nibbles_hi); - - diff_lo = _mm256_max_epi8( - diff_lo, _mm256_sub_epi8(_mm256_setzero_si256(), diff_lo)); - diff_hi = _mm256_max_epi8( - diff_hi, _mm256_sub_epi8(_mm256_setzero_si256(), diff_hi)); - - __m256i sq_lo = _mm256_maddubs_epi16(diff_lo, diff_lo); - __m256i sq_hi = _mm256_maddubs_epi16(diff_hi, diff_hi); - - __m256i sum_lo = _mm256_madd_epi16(sq_lo, one); - __m256i sum_hi = _mm256_madd_epi16(sq_hi, one); - - acc = _mm256_add_epi32(acc, sum_lo); - acc = _mm256_add_epi32(acc, sum_hi); - } - - __m128i acc_lo = _mm256_castsi256_si128(acc); - __m128i acc_hi = _mm256_extracti128_si256(acc, 1); - acc_lo = _mm_add_epi32(acc_lo, acc_hi); - acc_lo = _mm_hadd_epi32(acc_lo, acc_lo); - acc_lo = _mm_hadd_epi32(acc_lo, acc_lo); - int32_t sum = _mm_cvtsi128_si32(acc_lo); - - for (; i < d; i++) { - uint8_t c1 = code1[i / 2]; - uint8_t c2 = code2[i / 2]; - uint8_t n1, n2; - if (i % 2 == 0) { - n1 = c1 & 0xF; - n2 = c2 & 0xF; - } else { - n1 = (c1 >> 4) & 0xF; - 
n2 = (c2 >> 4) & 0xF; - } - int diff = (int)n1 - (int)n2; - sum += diff * diff; - } - - return sum * final_scale_sq; - } - - float operator()(idx_t i) final { - return compute_distance(nullptr, codes + i * code_size); - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance_l2( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const override final { - return compute_distance(nullptr, code); - } - - void query_to_codes_batch_4( - const uint8_t* __restrict code_0, - const uint8_t* __restrict code_1, - const uint8_t* __restrict code_2, - const uint8_t* __restrict code_3, - float& dis0, - float& dis1, - float& dis2, - float& dis3) const { - const size_t d = quant.d; - const uint8_t* q_lo_ptr = q_lo.data(); - const uint8_t* q_hi_ptr = q_hi.data(); - - __m256i acc0 = _mm256_setzero_si256(); - __m256i acc1 = _mm256_setzero_si256(); - __m256i acc2 = _mm256_setzero_si256(); - __m256i acc3 = _mm256_setzero_si256(); - - const __m256i mask_f = _mm256_set1_epi8(0xF); - const __m256i one = _mm256_set1_epi16(1); - const __m256i zero = _mm256_setzero_si256(); - - size_t i = 0; - // Process 128 dimensions per outer iteration - for (; i + 128 <= d; i += 128) { - // Chunk 0: first 64 dimensions - __m256i q_lo_0 = - _mm256_loadu_si256((const __m256i*)(q_lo_ptr + i / 2)); - __m256i q_hi_0 = - _mm256_loadu_si256((const __m256i*)(q_hi_ptr + i / 2)); - - auto process_chunk_64 = [&](const uint8_t* code, - __m256i& acc, - __m256i q_lo, - __m256i q_hi, - int offset) { - __m256i c = _mm256_loadu_si256( - (const __m256i*)(code + i / 2 + offset)); - __m256i nibbles_lo = _mm256_and_si256(c, mask_f); - __m256i nibbles_hi = - _mm256_and_si256(_mm256_srli_epi16(c, 4), mask_f); - - __m256i diff_lo = _mm256_sub_epi8(q_lo, nibbles_lo); - __m256i diff_hi = _mm256_sub_epi8(q_hi, nibbles_hi); - - diff_lo = _mm256_max_epi8( - diff_lo, _mm256_sub_epi8(zero, diff_lo)); - diff_hi = _mm256_max_epi8( - diff_hi, _mm256_sub_epi8(zero, 
diff_hi)); - - __m256i sq_lo = _mm256_maddubs_epi16(diff_lo, diff_lo); - __m256i sq_hi = _mm256_maddubs_epi16(diff_hi, diff_hi); - - __m256i sum_lo = _mm256_madd_epi16(sq_lo, one); - __m256i sum_hi = _mm256_madd_epi16(sq_hi, one); - - acc = _mm256_add_epi32(acc, sum_lo); - acc = _mm256_add_epi32(acc, sum_hi); - }; - - process_chunk_64(code_0, acc0, q_lo_0, q_hi_0, 0); - process_chunk_64(code_1, acc1, q_lo_0, q_hi_0, 0); - process_chunk_64(code_2, acc2, q_lo_0, q_hi_0, 0); - process_chunk_64(code_3, acc3, q_lo_0, q_hi_0, 0); - - // Chunk 1: next 64 dimensions - __m256i q_lo_1 = - _mm256_loadu_si256((const __m256i*)(q_lo_ptr + i / 2 + 32)); - __m256i q_hi_1 = - _mm256_loadu_si256((const __m256i*)(q_hi_ptr + i / 2 + 32)); - - process_chunk_64(code_0, acc0, q_lo_1, q_hi_1, 32); - process_chunk_64(code_1, acc1, q_lo_1, q_hi_1, 32); - process_chunk_64(code_2, acc2, q_lo_1, q_hi_1, 32); - process_chunk_64(code_3, acc3, q_lo_1, q_hi_1, 32); - } - - // Handle remaining 64-dimensional chunk - if (i + 64 <= d) { - __m256i q_lo_0 = - _mm256_loadu_si256((const __m256i*)(q_lo_ptr + i / 2)); - __m256i q_hi_0 = - _mm256_loadu_si256((const __m256i*)(q_hi_ptr + i / 2)); - - auto process = [&](const uint8_t* code, __m256i& acc) { - __m256i c = _mm256_loadu_si256((const __m256i*)(code + i / 2)); - __m256i nibbles_lo = _mm256_and_si256(c, mask_f); - __m256i nibbles_hi = - _mm256_and_si256(_mm256_srli_epi16(c, 4), mask_f); - - __m256i diff_lo = _mm256_sub_epi8(q_lo_0, nibbles_lo); - __m256i diff_hi = _mm256_sub_epi8(q_hi_0, nibbles_hi); - - diff_lo = _mm256_max_epi8( - diff_lo, _mm256_sub_epi8(zero, diff_lo)); - diff_hi = _mm256_max_epi8( - diff_hi, _mm256_sub_epi8(zero, diff_hi)); - - __m256i sq_lo = _mm256_maddubs_epi16(diff_lo, diff_lo); - __m256i sq_hi = _mm256_maddubs_epi16(diff_hi, diff_hi); - - __m256i sum_lo = _mm256_madd_epi16(sq_lo, one); - __m256i sum_hi = _mm256_madd_epi16(sq_hi, one); - - acc = _mm256_add_epi32(acc, sum_lo); - acc = _mm256_add_epi32(acc, sum_hi); - }; - - 
process(code_0, acc0); - process(code_1, acc1); - process(code_2, acc2); - process(code_3, acc3); - - i += 64; - } - - // Horizontal reductions - auto reduce = [](const __m256i& acc) -> int32_t { - __m128i acc_lo = _mm256_castsi256_si128(acc); - __m128i acc_hi = _mm256_extracti128_si256(acc, 1); - acc_lo = _mm_add_epi32(acc_lo, acc_hi); - acc_lo = _mm_hadd_epi32(acc_lo, acc_lo); - acc_lo = _mm_hadd_epi32(acc_lo, acc_lo); - return _mm_cvtsi128_si32(acc_lo); - }; - - dis0 = reduce(acc0); - dis1 = reduce(acc1); - dis2 = reduce(acc2); - dis3 = reduce(acc3); - - // Handle remaining dimensions scalar - for (; i < d; i++) { - uint8_t nibble_lo = q_lo[i / 2]; - uint8_t nibble_hi = q_hi[i / 2]; - - auto process_scalar = [&](const uint8_t* code, float& dis) { - uint8_t c = code[i / 2]; - uint8_t nibble; - if (i % 2 == 0) { - nibble = c & 0xF; - } else { - nibble = (c >> 4) & 0xF; - } - int diff; - if (i % 2 == 0) { - diff = (int)nibble_lo - (int)nibble; - } else { - diff = (int)nibble_hi - (int)nibble; - } - dis += diff * diff; - }; - - process_scalar(code_0, dis0); - process_scalar(code_1, dis1); - process_scalar(code_2, dis2); - process_scalar(code_3, dis3); - } - - dis0 *= final_scale_sq; - dis1 *= final_scale_sq; - dis2 *= final_scale_sq; - dis3 *= final_scale_sq; - } - - void distances_batch_4( - const idx_t idx0, - const idx_t idx1, - const idx_t idx2, - const idx_t idx3, - float& dis0, - float& dis1, - float& dis2, - float& dis3) override { - query_to_codes_batch_4( - codes + idx0 * code_size, - codes + idx1 * code_size, - codes + idx2 * code_size, - codes + idx3 * code_size, - dis0, - dis1, - dis2, - dis3); - } -}; - -/******************************************************************* - * DistanceComputer: combines a similarity and a quantizer to do - * code-to-vector or code-to-code comparisons - *******************************************************************/ - -template -struct DCTemplate_avx : SQDistanceComputer {}; - -template -struct DCTemplate_avx - : 
public DCTemplate { - DCTemplate_avx(size_t d, const std::vector& trained) - : DCTemplate(d, trained) {} -}; - -FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN -template -struct DCTemplate_avx : SQDistanceComputer { - using Sim = Similarity; - - Quantizer quant; - - DCTemplate_avx(size_t d, const std::vector& trained) - : quant(d, trained) {} - - float compute_distance(const float* x, const uint8_t* code) const { - Similarity sim(x); - sim.begin_8(); - for (size_t i = 0; i < quant.d; i += 8) { - __m256 xi = quant.reconstruct_8_components(code, i); - sim.add_8_components(xi); - } - return sim.result_8(); - } - - float compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - Similarity sim(nullptr); - sim.begin_8(); - for (size_t i = 0; i < quant.d; i += 8) { - __m256 x1 = quant.reconstruct_8_components(code1, i); - __m256 x2 = quant.reconstruct_8_components(code2, i); - sim.add_8_components_2(x1, x2); - } - return sim.result_8(); - } - - void set_query(const float* x) final { - q = x; - } - - /// compute distance of vector i to current query - float operator()(idx_t i) final { - return query_to_code(codes + i * code_size); - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const override final { - return compute_distance(q, code); - } - - void query_to_codes_batch_4( - const uint8_t* __restrict code_0, - const uint8_t* __restrict code_1, - const uint8_t* __restrict code_2, - const uint8_t* __restrict code_3, - float& dis0, - float& dis1, - float& dis2, - float& dis3) const override final { - Similarity sim0(q); - Similarity sim1(q); - Similarity sim2(q); - Similarity sim3(q); - - sim0.begin_8(); - sim1.begin_8(); - sim2.begin_8(); - sim3.begin_8(); - - for (size_t i = 0; i < quant.d; i += 8) { - __m256 xi0 = quant.reconstruct_8_components(code_0, i); - __m256 xi1 = quant.reconstruct_8_components(code_1, i); - __m256 xi2 = 
quant.reconstruct_8_components(code_2, i); - __m256 xi3 = quant.reconstruct_8_components(code_3, i); - sim0.add_8_components(xi0); - sim1.add_8_components(xi1); - sim2.add_8_components(xi2); - sim3.add_8_components(xi3); - } - - dis0 = sim0.result_8(); - dis1 = sim1.result_8(); - dis2 = sim2.result_8(); - dis3 = sim3.result_8(); - } -}; -FAISS_PRAGMA_IMPRECISE_FUNCTION_END - -/******************************************************************* - * DistanceComputerByte: computes distances in the integer domain - *******************************************************************/ - -template -struct DistanceComputerByte_avx : SQDistanceComputer {}; - -template -struct DistanceComputerByte_avx - : public DistanceComputerByte { - DistanceComputerByte_avx(int d, const std::vector& unused) - : DistanceComputerByte(d, unused) {} -}; - -template -struct DistanceComputerByte_avx : SQDistanceComputer { - using Sim = Similarity; - - int d; - std::vector tmp; - - DistanceComputerByte_avx(int d, const std::vector&) : d(d), tmp(d) {} - - int compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - // __m256i accu = _mm256_setzero_ps (); - __m256i accu = _mm256_setzero_si256(); - for (int i = 0; i < d; i += 16) { - // load 16 bytes, convert to 16 uint16_t - __m256i c1 = _mm256_cvtepu8_epi16( - _mm_loadu_si128((__m128i*)(code1 + i))); - __m256i c2 = _mm256_cvtepu8_epi16( - _mm_loadu_si128((__m128i*)(code2 + i))); - __m256i prod32; - if (Sim::metric_type == METRIC_INNER_PRODUCT) { - prod32 = _mm256_madd_epi16(c1, c2); - } else { - __m256i diff = _mm256_sub_epi16(c1, c2); - prod32 = _mm256_madd_epi16(diff, diff); - } - accu = _mm256_add_epi32(accu, prod32); - } - __m128i sum = _mm256_extractf128_si256(accu, 0); - sum = _mm_add_epi32(sum, _mm256_extractf128_si256(accu, 1)); - sum = _mm_hadd_epi32(sum, sum); - sum = _mm_hadd_epi32(sum, sum); - return _mm_cvtsi128_si32(sum); - } - - void set_query(const float* x) final { - /* - for (int i = 0; i < d; i += 8) { - 
__m256 xi = _mm256_loadu_ps (x + i); - __m256i ci = _mm256_cvtps_epi32(xi); - */ - for (int i = 0; i < d; i++) { - tmp[i] = int(x[i]); - } - } - - int compute_distance(const float* x, const uint8_t* code) { - set_query(x); - return compute_code_distance(tmp.data(), code); - } - - /// compute distance of vector i to current query - float operator()(idx_t i) final { - return query_to_code(codes + i * code_size); - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const override final { - return compute_code_distance(tmp.data(), code); - } -}; - -/******************************************************************* - * select_distance_computer: runtime selection of template - * specialization - *******************************************************************/ - -template -SQDistanceComputer* select_distance_computer_avx( - QuantizerType qtype, - size_t d, - const std::vector& trained) { - constexpr int SIMDWIDTH = Sim::simdwidth; - switch (qtype) { - case QuantizerType::QT_8bit_uniform: - return new DCTemplate_avx< - QuantizerTemplate_avx, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_4bit_uniform: - return new DistanceComputerSQ4UByte_avx(d, trained); - - case QuantizerType::QT_8bit: - return new DCTemplate_avx< - QuantizerTemplate_avx, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_6bit: - return new DCTemplate_avx< - QuantizerTemplate_avx, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_4bit: - return new DCTemplate_avx< - QuantizerTemplate_avx, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_fp16: - return new DCTemplate_avx< - QuantizerFP16_avx, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_bf16: - return new DCTemplate_avx< - QuantizerBF16_avx, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_8bit_direct: - if (d % 16 == 0) { - return new 
DistanceComputerByte_avx(d, trained); - } else { - return new DCTemplate_avx< - Quantizer8bitDirect_avx, - Sim, - SIMDWIDTH>(d, trained); - } - - case ScalarQuantizer::QT_8bit_direct_signed: - return new DCTemplate_avx< - Quantizer8bitDirectSigned_avx, - Sim, - SIMDWIDTH>(d, trained); - } - FAISS_THROW_MSG("unknown qtype"); - return nullptr; -} - -template -InvertedListScanner* sel2_InvertedListScanner_avx( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - return sel2_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); -} - -template -InvertedListScanner* sel12_InvertedListScanner_avx( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - constexpr int SIMDWIDTH = Similarity::simdwidth; - using QuantizerClass = QuantizerTemplate_avx; - using DCClass = DCTemplate_avx; - return sel2_InvertedListScanner_avx( - sq, quantizer, store_pairs, sel, r); -} - -template -InvertedListScanner* sel1_InvertedListScanner_avx( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - constexpr int SIMDWIDTH = Similarity::simdwidth; - switch (sq->qtype) { - case QuantizerType::QT_8bit_uniform: - return sel12_InvertedListScanner_avx< - Similarity, - Codec8bit_avx, - QuantizerTemplateScaling::UNIFORM>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_4bit_uniform: - return sel12_InvertedListScanner_avx< - Similarity, - Codec4bit_avx, - QuantizerTemplateScaling::UNIFORM>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_8bit: - return sel12_InvertedListScanner_avx< - Similarity, - Codec8bit_avx, - QuantizerTemplateScaling::NON_UNIFORM>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_4bit: - return sel12_InvertedListScanner_avx< - Similarity, - Codec4bit_avx, - QuantizerTemplateScaling::NON_UNIFORM>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_6bit: 
- return sel12_InvertedListScanner_avx< - Similarity, - Codec6bit_avx, - QuantizerTemplateScaling::NON_UNIFORM>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_fp16: - return sel2_InvertedListScanner_avx, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_bf16: - return sel2_InvertedListScanner_avx, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_8bit_direct: - if (sq->d % 16 == 0) { - return sel2_InvertedListScanner_avx< - DistanceComputerByte_avx>( - sq, quantizer, store_pairs, sel, r); - } else { - return sel2_InvertedListScanner_avx, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - } - case ScalarQuantizer::QT_8bit_direct_signed: - return sel2_InvertedListScanner_avx, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - } - - FAISS_THROW_MSG("unknown qtype"); - return nullptr; -} - -template -InvertedListScanner* sel0_InvertedListScanner_avx( - MetricType mt, - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool by_residual) { - if (mt == METRIC_L2) { - return sel1_InvertedListScanner_avx>( - sq, quantizer, store_pairs, sel, by_residual); - } else if (mt == METRIC_INNER_PRODUCT) { - return sel1_InvertedListScanner_avx>( - sq, quantizer, store_pairs, sel, by_residual); - } else { - FAISS_THROW_MSG("unsupported metric type"); - } -} - -} -} -} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec_avx512.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec_avx512.h deleted file mode 100644 index 335bd0222..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec_avx512.h +++ /dev/null @@ -1,1518 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -#include -#include -#include -#include -#include - -#include -#include -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -using QuantizerType = ScalarQuantizer::QuantizerType; -using RangeStat = ScalarQuantizer::RangeStat; -using SQDistanceComputer = ScalarQuantizer::SQDistanceComputer; -using SQuantizer = ScalarQuantizer::SQuantizer; - -/******************************************************************* - * Codec: converts between values in [0, 1] and an index in a code - * array. The "i" parameter is the vector component index (not byte - * index). - */ - -struct Codec8bit_avx512 : public Codec8bit_avx { - static FAISS_ALWAYS_INLINE __m512 - decode_16_components(const uint8_t* code, int i) { - const __m128i c8 = _mm_loadu_si128((const __m128i_u*)(code + i)); - const __m512i i32 = _mm512_cvtepu8_epi32(c8); - const __m512 f8 = _mm512_cvtepi32_ps(i32); - const __m512 half_one_255 = _mm512_set1_ps(0.5f / 255.f); - const __m512 one_255 = _mm512_set1_ps(1.f / 255.f); - return _mm512_fmadd_ps(f8, one_255, half_one_255); - } -}; - -struct Codec4bit_avx512 : public Codec4bit_avx { - static FAISS_ALWAYS_INLINE __m512 - decode_16_components(const uint8_t* code, int i) { - uint64_t c8 = *(uint64_t*)(code + (i >> 1)); - uint64_t mask = 0x0f0f0f0f0f0f0f0f; - uint64_t c8ev = c8 & mask; - uint64_t c8od = (c8 >> 4) & mask; - - // the 8 lower bytes of c8 contain the values - __m128i c16 = - _mm_unpacklo_epi8(_mm_set1_epi64x(c8ev), _mm_set1_epi64x(c8od)); - __m256i c8lo = _mm256_cvtepu8_epi32(c16); - __m256i c8hi = _mm256_cvtepu8_epi32(_mm_srli_si128(c16, 8)); - __m512i i16 = _mm512_castsi256_si512(c8lo); - i16 = _mm512_inserti32x8(i16, c8hi, 1); - __m512 f16 = _mm512_cvtepi32_ps(i16); - const __m512 half_one_255 = _mm512_set1_ps(0.5f / 15.f); - const __m512 one_255 = _mm512_set1_ps(1.f / 15.f); - return _mm512_fmadd_ps(f16, one_255, half_one_255); - } - - static FAISS_ALWAYS_INLINE __m512i - decode_16_components_int(const 
uint8_t* code, int i) { - __m128i v8 = _mm_loadl_epi64((const __m128i*)(code + (i >> 1))); - __m128i v16 = _mm_unpacklo_epi8(v8, v8); - __m512i v512 = _mm512_cvtepu8_epi32(v16); - - // Shift right: 0 for even, 4 for odd - const __m512i shift_counts = _mm512_setr_epi32( - 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4); - v512 = _mm512_srlv_epi32(v512, shift_counts); - return _mm512_and_si512(v512, _mm512_set1_epi32(0xF)); - } -}; - -struct Codec6bit_avx512 : public Codec6bit_avx { - // TODO: can be optimized - static FAISS_ALWAYS_INLINE __m512 - decode_16_components(const uint8_t* code, int i) { - /* - // todo aguzhva: the following piece of code is very fast - // for Intel chips. AMD ones will be very slow unless Zen3+ - - const uint16_t* data16_0 = (const uint16_t*)(code + (i >> 2) * 3); - const uint64_t* data64_0 = (const uint64_t*)data16_0; - const uint64_t val_0 = *data64_0; - const uint64_t vext_0 = _pdep_u64(val_0, 0x3F3F3F3F3F3F3F3FULL); - - const uint16_t* data16_1 = data16_0 + 3; - const uint32_t* data32_1 = (const uint32_t*)data16_1; - const uint64_t val_1 = *data32_1 + ((uint64_t)data16_1[2] << 32); - const uint64_t vext_1 = _pdep_u64(val_1, 0x3F3F3F3F3F3F3F3FULL); - - const __m128i i8 = _mm_set_epi64x(vext_1, vext_0); - const __m512i i32 = _mm512_cvtepi8_epi32(i8); - const __m512 f8 = _mm512_cvtepi32_ps(i32); - const __m512 half_one_255 = _mm512_set1_ps(0.5f / 63.f); - const __m512 one_255 = _mm512_set1_ps(1.f / 63.f); - return _mm512_fmadd_ps(f8, one_255, half_one_255); - */ - - /* - // todo aguzhva: another candidate for pdep, which might be faster - const uint16_t* data16_0 = (const uint16_t*)(code + (i >> 2) * 3); - const uint64_t* data64_0 = (const uint64_t*)data16_0; - const uint64_t val_0 = *data64_0; - const uint64_t vext_0 = _pdep_u64(val_0, 0x3F3F3F3F3F3F3F3FULL); - - const uint32_t* data32_1 = (const uint32_t*)data16_0; - const uint64_t val_1 = (val_0 >> 48) | (((uint64_t)data32_1[1]) << 16); - const uint64_t vext_1 = _pdep_u64(val_1, 
0x3F3F3F3F3F3F3F3FULL); - - const __m128i i8 = _mm_set_epi64x(vext_1, vext_0); - const __m512i i32 = _mm512_cvtepi8_epi32(i8); - const __m512 f8 = _mm512_cvtepi32_ps(i32); - const __m512 half_one_255 = _mm512_set1_ps(0.5f / 63.f); - const __m512 one_255 = _mm512_set1_ps(1.f / 63.f); - return _mm512_fmadd_ps(f8, one_255, half_one_255); - */ - - // pure AVX512 implementation, slower than pdep one, but has no problems - // for AMD - - // clang-format off - - // 16 components, 16x6 bit=12 bytes - const __m128i bit_6v = - _mm_maskz_loadu_epi8(0b0000111111111111, code + (i >> 2) * 3); - const __m256i bit_6v_256 = _mm256_broadcast_i32x4(bit_6v); - - // 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F - // 00 01 02 03 - const __m256i shuffle_mask = _mm256_setr_epi16( - 0xFF00, 0x0100, 0x0201, 0xFF02, - 0xFF03, 0x0403, 0x0504, 0xFF05, - 0xFF06, 0x0706, 0x0807, 0xFF08, - 0xFF09, 0x0A09, 0x0B0A, 0xFF0B); - const __m256i shuffled = _mm256_shuffle_epi8(bit_6v_256, shuffle_mask); - - // 0: xxxxxxxx xx543210 - // 1: xxxx5432 10xxxxxx - // 2: xxxxxx54 3210xxxx - // 3: xxxxxxxx 543210xx - const __m256i shift_right_v = _mm256_setr_epi16( - 0x0U, 0x6U, 0x4U, 0x2U, - 0x0U, 0x6U, 0x4U, 0x2U, - 0x0U, 0x6U, 0x4U, 0x2U, - 0x0U, 0x6U, 0x4U, 0x2U); - __m256i shuffled_shifted = _mm256_srlv_epi16(shuffled, shift_right_v); - - // remove unneeded bits - shuffled_shifted = - _mm256_and_si256(shuffled_shifted, _mm256_set1_epi16(0x003F)); - - // scale - const __m512 f8 = - _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(shuffled_shifted)); - const __m512 half_one_255 = _mm512_set1_ps(0.5f / 63.f); - const __m512 one_255 = _mm512_set1_ps(1.f / 63.f); - return _mm512_fmadd_ps(f8, one_255, half_one_255); - - // clang-format on - } -}; - -/******************************************************************* - * Quantizer: normalizes scalar vector components, then passes them - * through a codec - *******************************************************************/ - -template -struct QuantizerTemplate_avx512 
{}; - -template -struct QuantizerTemplate_avx512 - : public QuantizerTemplate_avx { - QuantizerTemplate_avx512(size_t d, const std::vector& trained) - : QuantizerTemplate_avx(d, trained) {} -}; - -template -struct QuantizerTemplate_avx512 - : public QuantizerTemplate_avx { - QuantizerTemplate_avx512(size_t d, const std::vector& trained) - : QuantizerTemplate_avx(d, trained) {} -}; - -template -struct QuantizerTemplate_avx512 - : public QuantizerTemplate_avx { - QuantizerTemplate_avx512(size_t d, const std::vector& trained) - : QuantizerTemplate_avx(d, trained) {} - - FAISS_ALWAYS_INLINE __m512 - reconstruct_16_components(const uint8_t* code, int i) const { - __m512 xi = Codec::decode_16_components(code, i); - return _mm512_fmadd_ps( - xi, _mm512_set1_ps(this->vdiff), _mm512_set1_ps(this->vmin)); - } -}; - -template <> -struct QuantizerTemplate_avx512< - Codec4bit_avx512, - QuantizerTemplateScaling::UNIFORM, - 16> - : public QuantizerTemplate_avx< - Codec4bit_avx512, - QuantizerTemplateScaling::UNIFORM, - 8> { - float final_scale; - float final_bias; - - QuantizerTemplate_avx512(size_t d, const std::vector& trained) - : QuantizerTemplate_avx< - Codec4bit_avx512, - QuantizerTemplateScaling::UNIFORM, - 8>(d, trained) { - final_scale = this->vdiff / 15.0f; - final_bias = this->vmin + this->vdiff * 0.5f / 15.0f; - } - - FAISS_ALWAYS_INLINE __m512 - reconstruct_16_components(const uint8_t* code, int i) const { - __m512i nibbles = Codec4bit_avx512::decode_16_components_int(code, i); - __m512 nibbles_f = _mm512_cvtepi32_ps(nibbles); - - return _mm512_fmadd_ps( - nibbles_f, - _mm512_set1_ps(final_scale), - _mm512_set1_ps(final_bias)); - } -}; - -template -struct QuantizerTemplate_avx512 - : public QuantizerTemplate_avx { - QuantizerTemplate_avx512(size_t d, const std::vector& trained) - : QuantizerTemplate_avx(d, trained) {} -}; - -template -struct QuantizerTemplate_avx512 - : public QuantizerTemplate_avx { - QuantizerTemplate_avx512(size_t d, const std::vector& trained) - 
: QuantizerTemplate_avx(d, trained) {} -}; - -template -struct QuantizerTemplate_avx512 - : public QuantizerTemplate_avx { - QuantizerTemplate_avx512(size_t d, const std::vector& trained) - : QuantizerTemplate_avx(d, trained) {} - - FAISS_ALWAYS_INLINE __m512 - reconstruct_16_components(const uint8_t* code, int i) const { - __m512 xi = Codec::decode_16_components(code, i); - return _mm512_fmadd_ps( - xi, - _mm512_loadu_ps(this->vdiff + i), - _mm512_loadu_ps(this->vmin + i)); - } -}; - -/******************************************************************* - * FP16 quantizer - *******************************************************************/ - -template -struct QuantizerFP16_avx512 {}; - -template <> -struct QuantizerFP16_avx512<1> : public QuantizerFP16_avx<1> { - QuantizerFP16_avx512(size_t d, const std::vector& unused) - : QuantizerFP16_avx<1>(d, unused) {} -}; - -template <> -struct QuantizerFP16_avx512<8> : public QuantizerFP16_avx<8> { - QuantizerFP16_avx512(size_t d, const std::vector& trained) - : QuantizerFP16_avx<8>(d, trained) {} -}; - -template <> -struct QuantizerFP16_avx512<16> : public QuantizerFP16_avx<8> { - QuantizerFP16_avx512(size_t d, const std::vector& trained) - : QuantizerFP16_avx<8>(d, trained) {} - - FAISS_ALWAYS_INLINE __m512 - reconstruct_16_components(const uint8_t* code, int i) const { - __m256i codei = _mm256_loadu_si256((const __m256i*)(code + 2 * i)); - return _mm512_cvtph_ps(codei); - } -}; - -/******************************************************************* - * BF16 quantizer - *******************************************************************/ - -template -struct QuantizerBF16_avx512 {}; - -template <> -struct QuantizerBF16_avx512<1> : public QuantizerBF16_avx<1> { - QuantizerBF16_avx512(size_t d, const std::vector& unused) - : QuantizerBF16_avx<1>(d, unused) {} -}; - -template <> -struct QuantizerBF16_avx512<8> : public QuantizerBF16_avx<8> { - QuantizerBF16_avx512(size_t d, const std::vector& trained) - : 
QuantizerBF16_avx<8>(d, trained) {} -}; - -template <> -struct QuantizerBF16_avx512<16> : public QuantizerBF16_avx<8> { - QuantizerBF16_avx512(size_t d, const std::vector& trained) - : QuantizerBF16_avx<8>(d, trained) {} - - FAISS_ALWAYS_INLINE __m512 - reconstruct_16_components(const uint8_t* code, int i) const { - __m256i code_256i = _mm256_loadu_si256((const __m256i*)(code + 2 * i)); - __m512i code_512i = _mm512_cvtepu16_epi32(code_256i); - code_512i = _mm512_slli_epi32(code_512i, 16); - return _mm512_castsi512_ps(code_512i); - } -}; - -/******************************************************************* - * 8bit_direct quantizer - *******************************************************************/ - -template -struct Quantizer8bitDirect_avx512 {}; - -template <> -struct Quantizer8bitDirect_avx512<1> : public Quantizer8bitDirect_avx<1> { - Quantizer8bitDirect_avx512(size_t d, const std::vector& unused) - : Quantizer8bitDirect_avx<1>(d, unused) {} -}; - -template <> -struct Quantizer8bitDirect_avx512<8> : public Quantizer8bitDirect_avx<8> { - Quantizer8bitDirect_avx512(size_t d, const std::vector& trained) - : Quantizer8bitDirect_avx<8>(d, trained) {} -}; - -template <> -struct Quantizer8bitDirect_avx512<16> : public Quantizer8bitDirect_avx<8> { - Quantizer8bitDirect_avx512(size_t d, const std::vector& trained) - : Quantizer8bitDirect_avx<8>(d, trained) {} - - FAISS_ALWAYS_INLINE __m512 - reconstruct_16_components(const uint8_t* code, int i) const { - __m128i x16 = _mm_loadu_si128((__m128i*)(code + i)); // 16 * int8 - __m512i y16 = _mm512_cvtepu8_epi32(x16); // 16 * int32 - return _mm512_cvtepi32_ps(y16); // 16 * float32 - } -}; - -/******************************************************************* - * 8bit_direct_signed quantizer - *******************************************************************/ - -template -struct Quantizer8bitDirectSigned_avx512 {}; - -template <> -struct Quantizer8bitDirectSigned_avx512<1> - : public Quantizer8bitDirectSigned_avx<1> { - 
Quantizer8bitDirectSigned_avx512(size_t d, const std::vector& unused) - : Quantizer8bitDirectSigned_avx<1>(d, unused) {} -}; - -template <> -struct Quantizer8bitDirectSigned_avx512<8> - : public Quantizer8bitDirectSigned_avx<8> { - Quantizer8bitDirectSigned_avx512( - size_t d, - const std::vector& trained) - : Quantizer8bitDirectSigned_avx<8>(d, trained) {} -}; - -template <> -struct Quantizer8bitDirectSigned_avx512<16> - : public Quantizer8bitDirectSigned_avx<8> { - Quantizer8bitDirectSigned_avx512( - size_t d, - const std::vector& trained) - : Quantizer8bitDirectSigned_avx<8>(d, trained) {} - - FAISS_ALWAYS_INLINE __m512 - reconstruct_16_components(const uint8_t* code, int i) const { - __m128i x16 = _mm_loadu_si128((__m128i*)(code + i)); // 16 * int8 - __m512i y16 = _mm512_cvtepu8_epi32(x16); // 16 * int32 - __m512i c16 = _mm512_set1_epi32(128); - __m512i z16 = _mm512_sub_epi32(y16, c16); // subtract 128 from all lanes - return _mm512_cvtepi32_ps(z16); // 16 * float32 - } -}; - -template -SQuantizer* select_quantizer_1_avx512( - QuantizerType qtype, - size_t d, - const std::vector& trained) { - switch (qtype) { - case QuantizerType::QT_8bit: - return new QuantizerTemplate_avx512< - Codec8bit_avx512, - QuantizerTemplateScaling::NON_UNIFORM, - SIMDWIDTH>(d, trained); - case QuantizerType::QT_6bit: - return new QuantizerTemplate_avx512< - Codec6bit_avx512, - QuantizerTemplateScaling::NON_UNIFORM, - SIMDWIDTH>(d, trained); - case QuantizerType::QT_4bit: - return new QuantizerTemplate_avx512< - Codec4bit_avx512, - QuantizerTemplateScaling::NON_UNIFORM, - SIMDWIDTH>(d, trained); - case QuantizerType::QT_8bit_uniform: - return new QuantizerTemplate_avx512< - Codec8bit_avx512, - QuantizerTemplateScaling::UNIFORM, - SIMDWIDTH>(d, trained); - case QuantizerType::QT_4bit_uniform: - return new QuantizerTemplate_avx512< - Codec4bit_avx512, - QuantizerTemplateScaling::UNIFORM, - SIMDWIDTH>(d, trained); - case QuantizerType::QT_fp16: - return new QuantizerFP16_avx512(d, 
trained); - case QuantizerType::QT_bf16: - return new QuantizerBF16_avx512(d, trained); - case QuantizerType::QT_8bit_direct: - return new Quantizer8bitDirect_avx512(d, trained); - case QuantizerType::QT_8bit_direct_signed: - return new Quantizer8bitDirectSigned_avx512(d, trained); - case QuantizerType::QT_1bit_direct: - // todo: add more SIMDWIDTH support for avx512 if needed - return new Quantizer1bitDirect(d, trained); - } - FAISS_THROW_MSG("unknown qtype"); -} - -/******************************************************************* - * Similarity: gets vector components and computes a similarity wrt. a - * query vector stored in the object. The data fields just encapsulate - * an accumulator. - */ - -template -struct SimilarityL2_avx512 {}; - -template <> -struct SimilarityL2_avx512<1> : public SimilarityL2_avx<1> { - static constexpr int simdwidth = 1; - static constexpr MetricType metric_type = METRIC_L2; - - explicit SimilarityL2_avx512(const float* y) : SimilarityL2_avx<1>(y) {} -}; - -template <> -struct SimilarityL2_avx512<8> : public SimilarityL2_avx<8> { - static constexpr int simdwidth = 8; - static constexpr MetricType metric_type = METRIC_L2; - - explicit SimilarityL2_avx512(const float* y) : SimilarityL2_avx<8>(y) {} -}; - -template <> -struct SimilarityL2_avx512<16> { - static constexpr int simdwidth = 16; - static constexpr MetricType metric_type = METRIC_L2; - - const float *y, *yi; - - explicit SimilarityL2_avx512(const float* y) : y(y) {} - __m512 accu16; - - FAISS_ALWAYS_INLINE void begin_16() { - accu16 = _mm512_setzero_ps(); - yi = y; - } - - FAISS_ALWAYS_INLINE void add_16_components(__m512 x) { - __m512 yiv = _mm512_loadu_ps(yi); - yi += 16; - __m512 tmp = _mm512_sub_ps(yiv, x); - accu16 = _mm512_fmadd_ps(tmp, tmp, accu16); - } - - FAISS_ALWAYS_INLINE void add_16_components_2(__m512 x, __m512 y_2) { - __m512 tmp = _mm512_sub_ps(y_2, x); - accu16 = _mm512_fmadd_ps(tmp, tmp, accu16); - } - - FAISS_ALWAYS_INLINE float result_16() { - return 
_mm512_reduce_add_ps(accu16); - } -}; - -template -struct SimilarityIP_avx512 {}; - -template <> -struct SimilarityIP_avx512<1> : public SimilarityIP_avx<1> { - static constexpr int simdwidth = 1; - static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; - - explicit SimilarityIP_avx512(const float* y) : SimilarityIP_avx<1>(y) {} -}; - -template <> -struct SimilarityIP_avx512<8> : public SimilarityIP_avx<8> { - static constexpr int simdwidth = 8; - static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; - - explicit SimilarityIP_avx512(const float* y) : SimilarityIP_avx<8>(y) {} -}; - -template <> -struct SimilarityIP_avx512<16> { - static constexpr int simdwidth = 16; - static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; - - const float *y, *yi; - - float accu; - - explicit SimilarityIP_avx512(const float* y) : y(y) {} - - __m512 accu16; - - FAISS_ALWAYS_INLINE void begin_16() { - accu16 = _mm512_setzero_ps(); - yi = y; - } - - FAISS_ALWAYS_INLINE void add_16_components(__m512 x) { - __m512 yiv = _mm512_loadu_ps(yi); - yi += 16; - accu16 = _mm512_fmadd_ps(yiv, x, accu16); - } - - FAISS_ALWAYS_INLINE void add_16_components_2(__m512 x1, __m512 x2) { - accu16 = _mm512_fmadd_ps(x1, x2, accu16); - } - - FAISS_ALWAYS_INLINE float result_16() { - return _mm512_reduce_add_ps(accu16); - } -}; - -/******************************************************************* - * DistanceComputer: combines a similarity and a quantizer to do - * code-to-vector or code-to-code comparisons - *******************************************************************/ - -template -struct DCTemplate_avx512 : SQDistanceComputer {}; - -template -struct DCTemplate_avx512 - : public DCTemplate_avx { - DCTemplate_avx512(size_t d, const std::vector& trained) - : DCTemplate_avx(d, trained) {} -}; - -template -struct DCTemplate_avx512 - : public DCTemplate_avx { - DCTemplate_avx512(size_t d, const std::vector& trained) - : DCTemplate_avx(d, trained) {} -}; - -template -struct 
DCTemplate_avx512 : SQDistanceComputer { - using Sim = Similarity; - - Quantizer quant; - - DCTemplate_avx512(size_t d, const std::vector& trained) - : quant(d, trained) {} - - float compute_distance(const float* x, const uint8_t* code) const { - Similarity sim(x); - sim.begin_16(); - for (size_t i = 0; i < quant.d; i += 16) { - __m512 xi = quant.reconstruct_16_components(code, i); - sim.add_16_components(xi); - } - return sim.result_16(); - } - - float compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - Similarity sim(nullptr); - sim.begin_16(); - for (size_t i = 0; i < quant.d; i += 16) { - __m512 x1 = quant.reconstruct_16_components(code1, i); - __m512 x2 = quant.reconstruct_16_components(code2, i); - sim.add_16_components_2(x1, x2); - } - return sim.result_16(); - } - - void set_query(const float* x) final { - q = x; - } - - /// compute distance of vector i to current query - float operator()(idx_t i) final { - return compute_distance(q, codes + i * code_size); - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const override final { - return compute_distance(q, code); - } - - void query_to_codes_batch_4( - const uint8_t* __restrict code_0, - const uint8_t* __restrict code_1, - const uint8_t* __restrict code_2, - const uint8_t* __restrict code_3, - float& dis0, - float& dis1, - float& dis2, - float& dis3) const override final { - Similarity sim0(q); - Similarity sim1(q); - Similarity sim2(q); - Similarity sim3(q); - - sim0.begin_16(); - sim1.begin_16(); - sim2.begin_16(); - sim3.begin_16(); - - for (size_t i = 0; i < quant.d; i += 16) { - __m512 xi0 = quant.reconstruct_16_components(code_0, i); - __m512 xi1 = quant.reconstruct_16_components(code_1, i); - __m512 xi2 = quant.reconstruct_16_components(code_2, i); - __m512 xi3 = quant.reconstruct_16_components(code_3, i); - sim0.add_16_components(xi0); - 
sim1.add_16_components(xi1); - sim2.add_16_components(xi2); - sim3.add_16_components(xi3); - } - - dis0 = sim0.result_16(); - dis1 = sim1.result_16(); - dis2 = sim2.result_16(); - dis3 = sim3.result_16(); - } - - void distances_batch_4( - const idx_t idx0, - const idx_t idx1, - const idx_t idx2, - const idx_t idx3, - float& dis0, - float& dis1, - float& dis2, - float& dis3) override { - query_to_codes_batch_4( - codes + idx0 * code_size, - codes + idx1 * code_size, - codes + idx2 * code_size, - codes + idx3 * code_size, - dis0, - dis1, - dis2, - dis3); - } -}; - -template -struct DistanceComputerSQ4UByte_avx512 : SQDistanceComputer { - using Quantizer = QuantizerTemplate_avx512< - Codec4bit_avx512, - QuantizerTemplateScaling::UNIFORM, - 16>; - using Sim = Similarity; - - Quantizer quant; - std::vector q_lo; - std::vector q_hi; - float final_scale_sq; - - DistanceComputerSQ4UByte_avx512(size_t d, const std::vector& trained) - : quant(d, trained), - q_lo((d + 1) / 2 + 64, 0), - q_hi((d + 1) / 2 + 64, 0) { - final_scale_sq = quant.final_scale * quant.final_scale; - } - - void set_query(const float* x) final { - float inv_scale = 1.0f / quant.final_scale; - float offset = quant.vmin; - - for (size_t i = 0; i < quant.d; i++) { - float val = (x[i] - offset) * inv_scale; - int code = (int)std::floor(val); - if (code < 0) - code = 0; - if (code > 15) - code = 15; - - if (i % 2 == 0) { - q_lo[i / 2] = (uint8_t)code; - } else { - q_hi[i / 2] = (uint8_t)code; - } - } - } - - // Only computes L2 distance - float compute_distance(const float* x, const uint8_t* code) const { - return compute_distance_l2(code); - } - - float compute_distance_l2(const uint8_t* code) const { - __m512i acc = _mm512_setzero_si512(); - const size_t d = quant.d; - const __m512i mask_f = _mm512_set1_epi8(0xF); - const __m512i one = _mm512_set1_epi16(1); - const uint8_t* q_lo_ptr = q_lo.data(); - const uint8_t* q_hi_ptr = q_hi.data(); - - size_t i = 0; - for (; i + 128 <= d; i += 128) { - __m512i c512 = 
_mm512_loadu_si512((const __m512i*)(code + i / 2)); - - __m512i nibbles_lo = _mm512_and_si512(c512, mask_f); - __m512i nibbles_hi = - _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f); - - __m512i q_lo_vec = _mm512_loadu_si512(q_lo_ptr + i / 2); - __m512i q_hi_vec = _mm512_loadu_si512(q_hi_ptr + i / 2); - - __m512i diff_lo = _mm512_sub_epi8(q_lo_vec, nibbles_lo); - __m512i diff_hi = _mm512_sub_epi8(q_hi_vec, nibbles_hi); - - diff_lo = _mm512_abs_epi8(diff_lo); - diff_hi = _mm512_abs_epi8(diff_hi); - - __m512i sq_lo = _mm512_maddubs_epi16(diff_lo, diff_lo); - __m512i sq_hi = _mm512_maddubs_epi16(diff_hi, diff_hi); - - __m512i sq_sum = _mm512_add_epi16(sq_lo, sq_hi); - __m512i sum_32 = _mm512_madd_epi16(sq_sum, one); - - acc = _mm512_add_epi32(acc, sum_32); - } - - // Handle remaining dimensions - if (i < d) { - size_t rem = d - i; - uint64_t mask_even = - (rem + 1) / 2 >= 64 ? -1ULL : (1ULL << ((rem + 1) / 2)) - 1; - uint64_t mask_odd = rem / 2 >= 64 ? -1ULL : (1ULL << (rem / 2)) - 1; - - __m512i c512 = _mm512_maskz_loadu_epi8(mask_even, code + i / 2); - - __m512i nibbles_lo = _mm512_and_si512(c512, mask_f); - __m512i nibbles_hi = - _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f); - - __m512i q_lo_vec = - _mm512_maskz_loadu_epi8(mask_even, q_lo_ptr + i / 2); - __m512i q_hi_vec = - _mm512_maskz_loadu_epi8(mask_odd, q_hi_ptr + i / 2); - - __m512i mask_odd_vec = _mm512_movm_epi8(mask_odd); - nibbles_hi = _mm512_and_si512(nibbles_hi, mask_odd_vec); - - __m512i diff_lo = _mm512_sub_epi8(q_lo_vec, nibbles_lo); - __m512i diff_hi = _mm512_sub_epi8(q_hi_vec, nibbles_hi); - - diff_lo = _mm512_abs_epi8(diff_lo); - diff_hi = _mm512_abs_epi8(diff_hi); - - __m512i sq_lo = _mm512_maddubs_epi16(diff_lo, diff_lo); - __m512i sq_hi = _mm512_maddubs_epi16(diff_hi, diff_hi); - - __m512i sq_sum = _mm512_add_epi16(sq_lo, sq_hi); - __m512i sum_32 = _mm512_madd_epi16(sq_sum, one); - - acc = _mm512_add_epi32(acc, sum_32); - } - - int32_t sum = _mm512_reduce_add_epi32(acc); - return 
sum * final_scale_sq; - } - - float compute_code_distance_l2(const uint8_t* code1, const uint8_t* code2) - const { - __m512i acc = _mm512_setzero_si512(); - const size_t d = quant.d; - - size_t i = 0; - for (; i + 128 <= d; i += 128) { - __m512i c1_512 = - _mm512_loadu_si512((const __m512i*)(code1 + i / 2)); - __m512i c2_512 = - _mm512_loadu_si512((const __m512i*)(code2 + i / 2)); - - __m512i c1_nibbles_lo = - _mm512_and_si512(c1_512, _mm512_set1_epi8(0xF)); - __m512i c1_nibbles_hi = _mm512_and_si512( - _mm512_srli_epi16(c1_512, 4), _mm512_set1_epi8(0xF)); - - __m512i c2_nibbles_lo = - _mm512_and_si512(c2_512, _mm512_set1_epi8(0xF)); - __m512i c2_nibbles_hi = _mm512_and_si512( - _mm512_srli_epi16(c2_512, 4), _mm512_set1_epi8(0xF)); - - __m512i diff_lo = _mm512_sub_epi8(c1_nibbles_lo, c2_nibbles_lo); - __m512i diff_hi = _mm512_sub_epi8(c1_nibbles_hi, c2_nibbles_hi); - - diff_lo = _mm512_abs_epi8(diff_lo); - diff_hi = _mm512_abs_epi8(diff_hi); - - __m512i sq_lo = _mm512_maddubs_epi16(diff_lo, diff_lo); - __m512i sq_hi = _mm512_maddubs_epi16(diff_hi, diff_hi); - - __m512i sum_lo = _mm512_madd_epi16(sq_lo, _mm512_set1_epi16(1)); - __m512i sum_hi = _mm512_madd_epi16(sq_hi, _mm512_set1_epi16(1)); - - acc = _mm512_add_epi32(acc, sum_lo); - acc = _mm512_add_epi32(acc, sum_hi); - } - - // Handle remaining dimensions - if (i < d) { - size_t rem = d - i; - uint64_t mask_even = - (rem + 1) / 2 >= 64 ? -1ULL : (1ULL << ((rem + 1) / 2)) - 1; - uint64_t mask_odd = rem / 2 >= 64 ? 
-1ULL : (1ULL << (rem / 2)) - 1; - - __m512i c1_512 = _mm512_maskz_loadu_epi8(mask_even, code1 + i / 2); - __m512i c2_512 = _mm512_maskz_loadu_epi8(mask_even, code2 + i / 2); - - __m512i c1_nibbles_lo = - _mm512_and_si512(c1_512, _mm512_set1_epi8(0xF)); - __m512i c1_nibbles_hi = _mm512_and_si512( - _mm512_srli_epi16(c1_512, 4), _mm512_set1_epi8(0xF)); - - __m512i c2_nibbles_lo = - _mm512_and_si512(c2_512, _mm512_set1_epi8(0xF)); - __m512i c2_nibbles_hi = _mm512_and_si512( - _mm512_srli_epi16(c2_512, 4), _mm512_set1_epi8(0xF)); - - __m512i mask_odd_vec = _mm512_movm_epi8(mask_odd); - c1_nibbles_hi = _mm512_and_si512(c1_nibbles_hi, mask_odd_vec); - c2_nibbles_hi = _mm512_and_si512(c2_nibbles_hi, mask_odd_vec); - - __m512i diff_lo = _mm512_sub_epi8(c1_nibbles_lo, c2_nibbles_lo); - __m512i diff_hi = _mm512_sub_epi8(c1_nibbles_hi, c2_nibbles_hi); - - diff_lo = _mm512_abs_epi8(diff_lo); - diff_hi = _mm512_abs_epi8(diff_hi); - - __m512i sq_lo = _mm512_maddubs_epi16(diff_lo, diff_lo); - __m512i sq_hi = _mm512_maddubs_epi16(diff_hi, diff_hi); - - __m512i sum_lo = _mm512_madd_epi16(sq_lo, _mm512_set1_epi16(1)); - __m512i sum_hi = _mm512_madd_epi16(sq_hi, _mm512_set1_epi16(1)); - - acc = _mm512_add_epi32(acc, sum_lo); - acc = _mm512_add_epi32(acc, sum_hi); - } - - int32_t sum = _mm512_reduce_add_epi32(acc); - return sum * final_scale_sq; - } - - float operator()(idx_t i) final { - return compute_distance(nullptr, codes + i * code_size); - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance_l2( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const override final { - return compute_distance(nullptr, code); - } - - void query_to_codes_batch_4( - const uint8_t* __restrict code_0, - const uint8_t* __restrict code_1, - const uint8_t* __restrict code_2, - const uint8_t* __restrict code_3, - float& dis0, - float& dis1, - float& dis2, - float& dis3) const override final { - if constexpr (USE_VNNI) { - 
query_to_codes_batch_4_vnni( - code_0, code_1, code_2, code_3, dis0, dis1, dis2, dis3); - } else { - query_to_codes_batch_4_avx512( - code_0, code_1, code_2, code_3, dis0, dis1, dis2, dis3); - } - } - - __attribute__((target("avx512vnni"))) void query_to_codes_batch_4_vnni( - const uint8_t* __restrict code_0, - const uint8_t* __restrict code_1, - const uint8_t* __restrict code_2, - const uint8_t* __restrict code_3, - float& dis0, - float& dis1, - float& dis2, - float& dis3) const { - __m512i acc0 = _mm512_setzero_si512(); - __m512i acc1 = _mm512_setzero_si512(); - __m512i acc2 = _mm512_setzero_si512(); - __m512i acc3 = _mm512_setzero_si512(); - - const size_t d = quant.d; - const __m512i mask_f = _mm512_set1_epi8(0xF); - const uint8_t* q_lo_ptr = q_lo.data(); - const uint8_t* q_hi_ptr = q_hi.data(); - - size_t i = 0; - // 256 dimensions per iteration - for (; i + 256 <= d; i += 256) { - // Chunk 0 - __m512i q_lo_0 = _mm512_loadu_si512(q_lo_ptr + i / 2); - __m512i q_hi_0 = _mm512_loadu_si512(q_hi_ptr + i / 2); - - // Chunk 1 - __m512i q_lo_1 = _mm512_loadu_si512(q_lo_ptr + i / 2 + 64); - __m512i q_hi_1 = _mm512_loadu_si512(q_hi_ptr + i / 2 + 64); - - auto process_chunk = [&]( - const uint8_t* code, - __m512i& acc, - __m512i q_lo, - __m512i q_hi, - int offset) __attribute__((target("avx512vnni"))) { - __m512i c512 = _mm512_loadu_si512( - (const __m512i*)(code + i / 2 + offset)); - __m512i nibbles_lo = _mm512_and_si512(c512, mask_f); - __m512i nibbles_hi = - _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f); - - __m512i diff_lo = _mm512_sub_epi8(q_lo, nibbles_lo); - __m512i diff_hi = _mm512_sub_epi8(q_hi, nibbles_hi); - - diff_lo = _mm512_abs_epi8(diff_lo); - diff_hi = _mm512_abs_epi8(diff_hi); - - acc = _mm512_dpbusd_epi32(acc, diff_lo, diff_lo); - acc = _mm512_dpbusd_epi32(acc, diff_hi, diff_hi); - }; - - process_chunk(code_0, acc0, q_lo_0, q_hi_0, 0); - process_chunk(code_1, acc1, q_lo_0, q_hi_0, 0); - process_chunk(code_2, acc2, q_lo_0, q_hi_0, 0); - 
process_chunk(code_3, acc3, q_lo_0, q_hi_0, 0); - - process_chunk(code_0, acc0, q_lo_1, q_hi_1, 64); - process_chunk(code_1, acc1, q_lo_1, q_hi_1, 64); - process_chunk(code_2, acc2, q_lo_1, q_hi_1, 64); - process_chunk(code_3, acc3, q_lo_1, q_hi_1, 64); - } - - if (i + 128 <= d) { - __m512i q_lo_0 = _mm512_loadu_si512(q_lo_ptr + i / 2); - __m512i q_hi_0 = _mm512_loadu_si512(q_hi_ptr + i / 2); - - auto process_chunk = [&](const uint8_t* code, __m512i& acc) - __attribute__((target("avx512vnni"))) { - __m512i c512 = - _mm512_loadu_si512((const __m512i*)(code + i / 2)); - __m512i nibbles_lo = _mm512_and_si512(c512, mask_f); - __m512i nibbles_hi = - _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f); - - __m512i diff_lo = _mm512_sub_epi8(q_lo_0, nibbles_lo); - __m512i diff_hi = _mm512_sub_epi8(q_hi_0, nibbles_hi); - - diff_lo = _mm512_abs_epi8(diff_lo); - diff_hi = _mm512_abs_epi8(diff_hi); - - acc = _mm512_dpbusd_epi32(acc, diff_lo, diff_lo); - acc = _mm512_dpbusd_epi32(acc, diff_hi, diff_hi); - }; - - process_chunk(code_0, acc0); - process_chunk(code_1, acc1); - process_chunk(code_2, acc2); - process_chunk(code_3, acc3); - - i += 128; - } - - // Handle remaining dimensions - if (i < d) { - size_t rem = d - i; - uint64_t mask_even = - (rem + 1) / 2 >= 64 ? -1ULL : (1ULL << ((rem + 1) / 2)) - 1; - uint64_t mask_odd = rem / 2 >= 64 ? 
-1ULL : (1ULL << (rem / 2)) - 1; - - __m512i q_lo_vec = - _mm512_maskz_loadu_epi8(mask_even, q_lo_ptr + i / 2); - __m512i q_hi_vec = - _mm512_maskz_loadu_epi8(mask_odd, q_hi_ptr + i / 2); - __m512i mask_odd_vec = _mm512_movm_epi8(mask_odd); - - auto process = [&](const uint8_t* code, __m512i& acc) - __attribute__((target("avx512vnni"))) { - __m512i c512 = _mm512_maskz_loadu_epi8(mask_even, code + i / 2); - __m512i nibbles_lo = _mm512_and_si512(c512, mask_f); - __m512i nibbles_hi = - _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f); - - nibbles_hi = _mm512_and_si512(nibbles_hi, mask_odd_vec); - - __m512i diff_lo = _mm512_sub_epi8(q_lo_vec, nibbles_lo); - __m512i diff_hi = _mm512_sub_epi8(q_hi_vec, nibbles_hi); - - diff_lo = _mm512_abs_epi8(diff_lo); - diff_hi = _mm512_abs_epi8(diff_hi); - - acc = _mm512_dpbusd_epi32(acc, diff_lo, diff_lo); - acc = _mm512_dpbusd_epi32(acc, diff_hi, diff_hi); - }; - - process(code_0, acc0); - process(code_1, acc1); - process(code_2, acc2); - process(code_3, acc3); - } - - dis0 = _mm512_reduce_add_epi32(acc0) * final_scale_sq; - dis1 = _mm512_reduce_add_epi32(acc1) * final_scale_sq; - dis2 = _mm512_reduce_add_epi32(acc2) * final_scale_sq; - dis3 = _mm512_reduce_add_epi32(acc3) * final_scale_sq; - } - - void query_to_codes_batch_4_avx512( - const uint8_t* __restrict code_0, - const uint8_t* __restrict code_1, - const uint8_t* __restrict code_2, - const uint8_t* __restrict code_3, - float& dis0, - float& dis1, - float& dis2, - float& dis3) const { - __m512i acc0 = _mm512_setzero_si512(); - __m512i acc1 = _mm512_setzero_si512(); - __m512i acc2 = _mm512_setzero_si512(); - __m512i acc3 = _mm512_setzero_si512(); - - const size_t d = quant.d; - const __m512i mask_f = _mm512_set1_epi8(0xF); - const __m512i one = _mm512_set1_epi16(1); - const uint8_t* q_lo_ptr = q_lo.data(); - const uint8_t* q_hi_ptr = q_hi.data(); - - size_t i = 0; - // 256 dimensions per iteration - for (; i + 256 <= d; i += 256) { - // Chunk 0 - __m512i q_lo_0 = 
_mm512_loadu_si512(q_lo_ptr + i / 2); - __m512i q_hi_0 = _mm512_loadu_si512(q_hi_ptr + i / 2); - - // Chunk 1 - __m512i q_lo_1 = _mm512_loadu_si512(q_lo_ptr + i / 2 + 64); - __m512i q_hi_1 = _mm512_loadu_si512(q_hi_ptr + i / 2 + 64); - - auto process_chunk = [&](const uint8_t* code, - __m512i& acc, - __m512i q_lo, - __m512i q_hi, - int offset) { - __m512i c512 = _mm512_loadu_si512( - (const __m512i*)(code + i / 2 + offset)); - __m512i nibbles_lo = _mm512_and_si512(c512, mask_f); - __m512i nibbles_hi = - _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f); - - __m512i diff_lo = _mm512_sub_epi8(q_lo, nibbles_lo); - __m512i diff_hi = _mm512_sub_epi8(q_hi, nibbles_hi); - - diff_lo = _mm512_abs_epi8(diff_lo); - diff_hi = _mm512_abs_epi8(diff_hi); - - __m512i sq_lo = _mm512_maddubs_epi16(diff_lo, diff_lo); - __m512i sq_hi = _mm512_maddubs_epi16(diff_hi, diff_hi); - - __m512i sum_lo = _mm512_madd_epi16(sq_lo, one); - __m512i sum_hi = _mm512_madd_epi16(sq_hi, one); - - acc = _mm512_add_epi32(acc, sum_lo); - acc = _mm512_add_epi32(acc, sum_hi); - }; - - process_chunk(code_0, acc0, q_lo_0, q_hi_0, 0); - process_chunk(code_1, acc1, q_lo_0, q_hi_0, 0); - process_chunk(code_2, acc2, q_lo_0, q_hi_0, 0); - process_chunk(code_3, acc3, q_lo_0, q_hi_0, 0); - - process_chunk(code_0, acc0, q_lo_1, q_hi_1, 64); - process_chunk(code_1, acc1, q_lo_1, q_hi_1, 64); - process_chunk(code_2, acc2, q_lo_1, q_hi_1, 64); - process_chunk(code_3, acc3, q_lo_1, q_hi_1, 64); - } - - if (i + 128 <= d) { - __m512i q_lo_0 = _mm512_loadu_si512(q_lo_ptr + i / 2); - __m512i q_hi_0 = _mm512_loadu_si512(q_hi_ptr + i / 2); - - auto process_chunk = [&](const uint8_t* code, __m512i& acc) { - __m512i c512 = - _mm512_loadu_si512((const __m512i*)(code + i / 2)); - __m512i nibbles_lo = _mm512_and_si512(c512, mask_f); - __m512i nibbles_hi = - _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f); - - __m512i diff_lo = _mm512_sub_epi8(q_lo_0, nibbles_lo); - __m512i diff_hi = _mm512_sub_epi8(q_hi_0, nibbles_hi); - - 
diff_lo = _mm512_abs_epi8(diff_lo); - diff_hi = _mm512_abs_epi8(diff_hi); - - __m512i sq_lo = _mm512_maddubs_epi16(diff_lo, diff_lo); - __m512i sq_hi = _mm512_maddubs_epi16(diff_hi, diff_hi); - - __m512i sum_lo = _mm512_madd_epi16(sq_lo, one); - __m512i sum_hi = _mm512_madd_epi16(sq_hi, one); - - acc = _mm512_add_epi32(acc, sum_lo); - acc = _mm512_add_epi32(acc, sum_hi); - }; - - process_chunk(code_0, acc0); - process_chunk(code_1, acc1); - process_chunk(code_2, acc2); - process_chunk(code_3, acc3); - - i += 128; - } - - // Handle remaining dimensions - if (i < d) { - size_t rem = d - i; - uint64_t mask_even = - (rem + 1) / 2 >= 64 ? -1ULL : (1ULL << ((rem + 1) / 2)) - 1; - uint64_t mask_odd = rem / 2 >= 64 ? -1ULL : (1ULL << (rem / 2)) - 1; - - __m512i q_lo_vec = - _mm512_maskz_loadu_epi8(mask_even, q_lo_ptr + i / 2); - __m512i q_hi_vec = - _mm512_maskz_loadu_epi8(mask_odd, q_hi_ptr + i / 2); - __m512i mask_odd_vec = _mm512_movm_epi8(mask_odd); - - auto process = [&](const uint8_t* code, __m512i& acc) { - __m512i c512 = _mm512_maskz_loadu_epi8(mask_even, code + i / 2); - __m512i nibbles_lo = _mm512_and_si512(c512, mask_f); - __m512i nibbles_hi = - _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f); - - nibbles_hi = _mm512_and_si512(nibbles_hi, mask_odd_vec); - - __m512i diff_lo = _mm512_sub_epi8(q_lo_vec, nibbles_lo); - __m512i diff_hi = _mm512_sub_epi8(q_hi_vec, nibbles_hi); - - diff_lo = _mm512_abs_epi8(diff_lo); - diff_hi = _mm512_abs_epi8(diff_hi); - - __m512i sq_lo = _mm512_maddubs_epi16(diff_lo, diff_lo); - __m512i sq_hi = _mm512_maddubs_epi16(diff_hi, diff_hi); - - __m512i sum_lo = _mm512_madd_epi16(sq_lo, one); - __m512i sum_hi = _mm512_madd_epi16(sq_hi, one); - - acc = _mm512_add_epi32(acc, sum_lo); - acc = _mm512_add_epi32(acc, sum_hi); - }; - - process(code_0, acc0); - process(code_1, acc1); - process(code_2, acc2); - process(code_3, acc3); - } - - dis0 = _mm512_reduce_add_epi32(acc0) * final_scale_sq; - dis1 = _mm512_reduce_add_epi32(acc1) * 
final_scale_sq; - dis2 = _mm512_reduce_add_epi32(acc2) * final_scale_sq; - dis3 = _mm512_reduce_add_epi32(acc3) * final_scale_sq; - } - - void distances_batch_4( - const idx_t idx0, - const idx_t idx1, - const idx_t idx2, - const idx_t idx3, - float& dis0, - float& dis1, - float& dis2, - float& dis3) override { - query_to_codes_batch_4( - codes + idx0 * code_size, - codes + idx1 * code_size, - codes + idx2 * code_size, - codes + idx3 * code_size, - dis0, - dis1, - dis2, - dis3); - } -}; - -/******************************************************************* - * DistanceComputerByte: computes distances in the integer domain - *******************************************************************/ - -template -struct DistanceComputerByte_avx512 : SQDistanceComputer {}; - -template -struct DistanceComputerByte_avx512 - : public DistanceComputerByte_avx { - DistanceComputerByte_avx512(int d, const std::vector& unused) - : DistanceComputerByte_avx(d, unused) {} -}; - -template -struct DistanceComputerByte_avx512 - : public DistanceComputerByte_avx { - DistanceComputerByte_avx512(int d, const std::vector& unused) - : DistanceComputerByte_avx(d, unused) {} -}; - -template -struct DistanceComputerByte_avx512 : SQDistanceComputer { - using Sim = Similarity; - - int d; - std::vector tmp; - - DistanceComputerByte_avx512(int d, const std::vector&) - : d(d), tmp(d) {} - - int compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - // __m256i accu = _mm256_setzero_ps (); - __m512i accu = _mm512_setzero_si512(); - for (int i = 0; i < d; i += 32) { - // load 32 bytes, convert to 16 uint16_t - __m512i c1 = _mm512_cvtepu8_epi16( - _mm256_loadu_si256((__m256i*)(code1 + i))); - __m512i c2 = _mm512_cvtepu8_epi16( - _mm256_loadu_si256((__m256i*)(code2 + i))); - __m512i prod32; - if (Sim::metric_type == METRIC_INNER_PRODUCT) { - prod32 = _mm512_madd_epi16(c1, c2); - } else { - __m512i diff = _mm512_sub_epi16(c1, c2); - prod32 = _mm512_madd_epi16(diff, diff); - } - accu 
= _mm512_add_epi32(accu, prod32); - } - return _mm512_reduce_add_epi32(accu); - } - - void set_query(const float* x) final { - /* - for (int i = 0; i < d; i += 8) { - __m256 xi = _mm256_loadu_ps (x + i); - __m256i ci = _mm256_cvtps_epi32(xi); - */ - for (int i = 0; i < d; i++) { - tmp[i] = int(x[i]); - } - } - - int compute_distance(const float* x, const uint8_t* code) { - set_query(x); - return compute_code_distance(tmp.data(), code); - } - - /// compute distance of vector i to current query - float operator()(idx_t i) final { - return compute_distance(q, codes + i * code_size); - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const override final { - return compute_code_distance(tmp.data(), code); - } -}; - -/******************************************************************* - * select_distance_computer: runtime selection of template - * specialization - *******************************************************************/ - -template -SQDistanceComputer* select_distance_computer_avx512( - QuantizerType qtype, - size_t d, - const std::vector& trained) { - constexpr int SIMDWIDTH = Sim::simdwidth; - const bool use_vnni = __builtin_cpu_supports("avx512vnni"); - switch (qtype) { - case QuantizerType::QT_8bit_uniform: - return new DCTemplate_avx512< - QuantizerTemplate_avx512, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_4bit_uniform: - if (use_vnni) { - return new DistanceComputerSQ4UByte_avx512( - d, trained); - } else { - return new DistanceComputerSQ4UByte_avx512( - d, trained); - } - - case QuantizerType::QT_8bit: - return new DCTemplate_avx512< - QuantizerTemplate_avx512< - Codec8bit_avx512, - QuantizerTemplateScaling::NON_UNIFORM, - SIMDWIDTH>, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_6bit: - return new DCTemplate_avx512< - QuantizerTemplate_avx512< - Codec6bit_avx512, - 
QuantizerTemplateScaling::NON_UNIFORM, - SIMDWIDTH>, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_4bit: - return new DCTemplate_avx512< - QuantizerTemplate_avx512< - Codec4bit_avx512, - QuantizerTemplateScaling::NON_UNIFORM, - SIMDWIDTH>, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_fp16: - return new DCTemplate_avx512< - QuantizerFP16_avx512, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_bf16: - return new DCTemplate_avx512< - QuantizerBF16_avx512, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_8bit_direct: - if (d % 16 == 0) { - return new DistanceComputerByte_avx512( - d, trained); - } else { - return new DCTemplate_avx512< - Quantizer8bitDirect_avx512, - Sim, - SIMDWIDTH>(d, trained); - } - - case ScalarQuantizer::QT_8bit_direct_signed: - return new DCTemplate_avx512< - Quantizer8bitDirectSigned_avx512, - Sim, - SIMDWIDTH>(d, trained); - } - FAISS_THROW_MSG("unknown qtype"); - return nullptr; -} - -template -InvertedListScanner* sel2_InvertedListScanner_avx512( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - return sel2_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); -} - -template -InvertedListScanner* sel12_InvertedListScanner_avx512( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - constexpr int SIMDWIDTH = Similarity::simdwidth; - using QuantizerClass = QuantizerTemplate_avx512; - using DCClass = DCTemplate_avx512; - return sel2_InvertedListScanner_avx512( - sq, quantizer, store_pairs, sel, r); -} - -template -InvertedListScanner* sel1_InvertedListScanner_avx512( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - constexpr int SIMDWIDTH = Similarity::simdwidth; - switch (sq->qtype) { - case QuantizerType::QT_8bit_uniform: - return sel12_InvertedListScanner_avx512< - Similarity, - Codec8bit_avx512, - 
QuantizerTemplateScaling::UNIFORM>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_4bit_uniform: - return sel12_InvertedListScanner_avx512< - Similarity, - Codec4bit_avx512, - QuantizerTemplateScaling::UNIFORM>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_8bit: - return sel12_InvertedListScanner_avx512< - Similarity, - Codec8bit_avx512, - QuantizerTemplateScaling::NON_UNIFORM>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_4bit: - return sel12_InvertedListScanner_avx512< - Similarity, - Codec4bit_avx512, - QuantizerTemplateScaling::NON_UNIFORM>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_6bit: - return sel12_InvertedListScanner_avx512< - Similarity, - Codec6bit_avx512, - QuantizerTemplateScaling::NON_UNIFORM>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_fp16: - return sel2_InvertedListScanner_avx512, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_bf16: - return sel2_InvertedListScanner_avx512, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_8bit_direct: - if (sq->d % 16 == 0) { - return sel2_InvertedListScanner_avx512< - DistanceComputerByte_avx512>( - sq, quantizer, store_pairs, sel, r); - } else { - return sel2_InvertedListScanner_avx512, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - } - case ScalarQuantizer::QT_8bit_direct_signed: - return sel2_InvertedListScanner_avx512, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - } - - FAISS_THROW_MSG("unknown qtype"); - return nullptr; -} - -template -InvertedListScanner* sel0_InvertedListScanner_avx512( - MetricType mt, - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool by_residual) { - if (mt == METRIC_L2) { - return sel1_InvertedListScanner_avx512>( - sq, quantizer, store_pairs, sel, by_residual); - } else if (mt == METRIC_INNER_PRODUCT) { - return 
sel1_InvertedListScanner_avx512>( - sq, quantizer, store_pairs, sel, by_residual); - } else { - FAISS_THROW_MSG("unsupported metric type"); - } -} - -} -} -} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec_neon.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec_neon.h deleted file mode 100644 index 89040dfa1..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec_neon.h +++ /dev/null @@ -1,1074 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include -#include -#include - -#include -#include -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -using QuantizerType = ScalarQuantizer::QuantizerType; -using RangeStat = ScalarQuantizer::RangeStat; -using SQDistanceComputer = ScalarQuantizer::SQDistanceComputer; -using SQuantizer = ScalarQuantizer::SQuantizer; - -/******************************************************************* - * Codec: converts between values in [0, 1] and an index in a code - * array. The "i" parameter is the vector component index (not byte - * index). 
- */ - -struct Codec8bit_neon : public Codec8bit { - static FAISS_ALWAYS_INLINE float32x4x2_t - decode_8_components(const uint8_t* code, int i) { - float32_t result[8] = {}; - for (size_t j = 0; j < 8; j++) { - result[j] = decode_component(code, i + j); - } - float32x4_t res1 = vld1q_f32(result); - float32x4_t res2 = vld1q_f32(result + 4); - return {res1, res2}; - } -}; - -struct Codec4bit_neon : public Codec4bit { - static FAISS_ALWAYS_INLINE float32x4x2_t - decode_8_components(const uint8_t* code, int i) { - float32_t result[8] = {}; - for (size_t j = 0; j < 8; j++) { - result[j] = decode_component(code, i + j); - } - float32x4_t res1 = vld1q_f32(result); - float32x4_t res2 = vld1q_f32(result + 4); - return {res1, res2}; - } -}; - -struct Codec6bit_neon : public Codec6bit { - static FAISS_ALWAYS_INLINE float32x4x2_t - decode_8_components(const uint8_t* code, int i) { - float32_t result[8] = {}; - for (size_t j = 0; j < 8; j++) { - result[j] = decode_component(code, i + j); - } - float32x4_t res1 = vld1q_f32(result); - float32x4_t res2 = vld1q_f32(result + 4); - return {res1, res2}; - } -}; - -/******************************************************************* - * Quantizer: normalizes scalar vector components, then passes them - * through a codec - *******************************************************************/ - -template -struct QuantizerTemplate_neon {}; - -template -struct QuantizerTemplate_neon - : public QuantizerTemplate { - QuantizerTemplate_neon(size_t d, const std::vector& trained) - : QuantizerTemplate(d, trained) {} -}; - -template -struct QuantizerTemplate_neon - : public QuantizerTemplate { - QuantizerTemplate_neon(size_t d, const std::vector& trained) - : QuantizerTemplate(d, trained) {} - - FAISS_ALWAYS_INLINE float32x4x2_t - reconstruct_8_components(const uint8_t* code, int i) const { - float32x4x2_t xi = Codec::decode_8_components(code, i); - return { - vfmaq_f32( - vdupq_n_f32(this->vmin), - xi.val[0], - vdupq_n_f32(this->vdiff)), - 
vfmaq_f32( - vdupq_n_f32(this->vmin), - xi.val[1], - vdupq_n_f32(this->vdiff)) - }; - } -}; - -template <> -struct QuantizerTemplate_neon< - Codec4bit_neon, - QuantizerTemplateScaling::UNIFORM, - 8> - : public QuantizerTemplate< - Codec4bit_neon, - QuantizerTemplateScaling::UNIFORM, - 1> { - float final_scale; - float final_bias; - - QuantizerTemplate_neon(size_t d, const std::vector& trained) - : QuantizerTemplate< - Codec4bit_neon, - QuantizerTemplateScaling::UNIFORM, - 1>(d, trained) { - final_scale = this->vdiff / 15.0f; - final_bias = this->vmin + this->vdiff * 0.5f / 15.0f; - } - - FAISS_ALWAYS_INLINE float32x4x2_t - reconstruct_8_components(const uint8_t* code, int i) const { - float32x4x2_t xi = Codec4bit_neon::decode_8_components(code, i); - return {vfmaq_f32( - vdupq_n_f32(this->vmin), - xi.val[0], - vdupq_n_f32(this->vdiff)), - vfmaq_f32( - vdupq_n_f32(this->vmin), - xi.val[1], - vdupq_n_f32(this->vdiff))}; - } -}; - -template -struct QuantizerTemplate_neon - : public QuantizerTemplate { - QuantizerTemplate_neon(size_t d, const std::vector& trained) - : QuantizerTemplate(d, trained) {} -}; - -template -struct QuantizerTemplate_neon - : public QuantizerTemplate { - QuantizerTemplate_neon(size_t d, const std::vector& trained) - : QuantizerTemplate(d, trained) {} - - FAISS_ALWAYS_INLINE float32x4x2_t - reconstruct_8_components(const uint8_t* code, int i) const { - float32x4x2_t xi = Codec::decode_8_components(code, i); - - float32x4x2_t vmin_8 = vld1q_f32_x2(this->vmin + i); - float32x4x2_t vdiff_8 = vld1q_f32_x2(this->vdiff + i); - - return { - vfmaq_f32(vmin_8.val[0], xi.val[0], vdiff_8.val[0]), - vfmaq_f32(vmin_8.val[1], xi.val[1], vdiff_8.val[1]) - }; - } -}; - -/******************************************************************* - * FP16 quantizer - *******************************************************************/ - -template -struct QuantizerFP16_neon {}; - -template <> -struct QuantizerFP16_neon<1> : public QuantizerFP16<1> { - 
QuantizerFP16_neon(size_t d, const std::vector& unused) - : QuantizerFP16<1>(d, unused) {} -}; - -template <> -struct QuantizerFP16_neon<8> : public QuantizerFP16<1> { - QuantizerFP16_neon(size_t d, const std::vector& trained) - : QuantizerFP16<1>(d, trained) {} - - FAISS_ALWAYS_INLINE float32x4x2_t - reconstruct_8_components(const uint8_t* code, int i) const { - uint16x4x2_t codei = vld1_u16_x2((const uint16_t*)(code + 2 * i)); - return {vcvt_f32_f16(vreinterpret_f16_u16(codei.val[0])), - vcvt_f32_f16(vreinterpret_f16_u16(codei.val[1]))}; - } -}; - -/******************************************************************* - * BF16 quantizer - *******************************************************************/ - -template -struct QuantizerBF16_neon {}; - -template <> -struct QuantizerBF16_neon<1> : public QuantizerBF16<1> { - QuantizerBF16_neon(size_t d, const std::vector& unused) - : QuantizerBF16<1>(d, unused) {} -}; - -template <> -struct QuantizerBF16_neon<8> : public QuantizerBF16<1> { - QuantizerBF16_neon(size_t d, const std::vector& trained) - : QuantizerBF16<1>(d, trained) {} - - FAISS_ALWAYS_INLINE float32x4x2_t - reconstruct_8_components(const uint8_t* code, int i) const { - uint16x4x2_t codei = vld1_u16_x2((const uint16_t*)(code + 2 * i)); - return {vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(codei.val[0]), 16)), - vreinterpretq_f32_u32( - vshlq_n_u32(vmovl_u16(codei.val[1]), 16))}; - } -}; - -/******************************************************************* - * 8bit_direct quantizer - *******************************************************************/ - -template -struct Quantizer8bitDirect_neon {}; - -template <> -struct Quantizer8bitDirect_neon<1> : public Quantizer8bitDirect<1> { - Quantizer8bitDirect_neon(size_t d, const std::vector& unused) - : Quantizer8bitDirect(d, unused) {} -}; - -template <> -struct Quantizer8bitDirect_neon<8> : public Quantizer8bitDirect<1> { - Quantizer8bitDirect_neon(size_t d, const std::vector& trained) - : 
Quantizer8bitDirect<1>(d, trained) {} - - FAISS_ALWAYS_INLINE float32x4x2_t - reconstruct_8_components(const uint8_t* code, int i) const { - uint8x8_t x8 = vld1_u8((const uint8_t*)(code + i)); - uint16x8_t y8 = vmovl_u8(x8); - uint16x4_t y8_0 = vget_low_u16(y8); - uint16x4_t y8_1 = vget_high_u16(y8); - - // convert uint16 -> uint32 -> fp32 - return {vcvtq_f32_u32(vmovl_u16(y8_0)), vcvtq_f32_u32(vmovl_u16(y8_1))}; - } -}; - -/******************************************************************* - * 8bit_direct_signed quantizer - *******************************************************************/ - -template -struct Quantizer8bitDirectSigned_neon {}; - -template <> -struct Quantizer8bitDirectSigned_neon<1> : public Quantizer8bitDirectSigned<1> { - Quantizer8bitDirectSigned_neon(size_t d, const std::vector& unused) - : Quantizer8bitDirectSigned(d, unused) {} -}; - -template <> -struct Quantizer8bitDirectSigned_neon<8> : public Quantizer8bitDirectSigned<1> { - Quantizer8bitDirectSigned_neon(size_t d, const std::vector& trained) - : Quantizer8bitDirectSigned<1>(d, trained) {} - - FAISS_ALWAYS_INLINE float32x4x2_t - reconstruct_8_components(const uint8_t* code, int i) const { - uint8x8_t x8 = vld1_u8((const uint8_t*)(code + i)); - uint16x8_t y8 = vmovl_u8(x8); // convert uint8 -> uint16 - uint16x4_t y8_0 = vget_low_u16(y8); - uint16x4_t y8_1 = vget_high_u16(y8); - - float32x4_t z8_0 = vcvtq_f32_u32( - vmovl_u16(y8_0)); // convert uint16 -> uint32 -> fp32 - float32x4_t z8_1 = vcvtq_f32_u32(vmovl_u16(y8_1)); - - // subtract 128 to convert into signed numbers - return {vsubq_f32(z8_0, vmovq_n_f32(128.0)), - vsubq_f32(z8_1, vmovq_n_f32(128.0))}; - } -}; - -template -SQuantizer* select_quantizer_1_neon( - QuantizerType qtype, - size_t d, - const std::vector& trained) { - switch (qtype) { - case QuantizerType::QT_8bit: - return new QuantizerTemplate_neon( - d, trained); - case QuantizerType::QT_6bit: - return new QuantizerTemplate_neon( - d, trained); - case 
QuantizerType::QT_4bit: - return new QuantizerTemplate_neon( - d, trained); - case QuantizerType::QT_8bit_uniform: - return new QuantizerTemplate_neon( - d, trained); - case QuantizerType::QT_4bit_uniform: - return new QuantizerTemplate_neon( - d, trained); - case QuantizerType::QT_fp16: - return new QuantizerFP16_neon(d, trained); - case QuantizerType::QT_bf16: - return new QuantizerBF16_neon(d, trained); - case QuantizerType::QT_8bit_direct: - return new Quantizer8bitDirect_neon(d, trained); - case QuantizerType::QT_8bit_direct_signed: - return new Quantizer8bitDirectSigned_neon(d, trained); - case QuantizerType::QT_1bit_direct: - // todo: add more SIMDWIDTH support for neon if needed - return new Quantizer1bitDirect(d, trained); - } - FAISS_THROW_MSG("unknown qtype"); -} - -/******************************************************************* - * Similarity: gets vector components and computes a similarity wrt. a - * query vector stored in the object. The data fields just encapsulate - * an accumulator. 
- */ - -template -struct SimilarityL2_neon {}; - -template <> -struct SimilarityL2_neon<1> : public SimilarityL2<1> { - static constexpr int simdwidth = 1; - static constexpr MetricType metric_type = METRIC_L2; - - explicit SimilarityL2_neon(const float* y) : SimilarityL2<1>(y) {} -}; - -template <> -struct SimilarityL2_neon<8> { - static constexpr int simdwidth = 8; - static constexpr MetricType metric_type = METRIC_L2; - - const float *y, *yi; - - explicit SimilarityL2_neon(const float* y) : y(y) {} - float32x4x2_t accu8; - - FAISS_ALWAYS_INLINE void begin_8() { - accu8 = { vdupq_n_f32(0.0f), vdupq_n_f32(0.0f) }; - yi = y; - } - - FAISS_ALWAYS_INLINE void add_8_components(float32x4x2_t x) { - float32x4x2_t yiv = vld1q_f32_x2(yi); - yi += 8; - - float32x4_t sub0 = vsubq_f32(yiv.val[0], x.val[0]); - float32x4_t sub1 = vsubq_f32(yiv.val[1], x.val[1]); - - float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], sub0, sub0); - float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], sub1, sub1); - - accu8 = {accu8_0, accu8_1}; - } - - FAISS_ALWAYS_INLINE void add_8_components_2( - float32x4x2_t x, - float32x4x2_t y) { - float32x4_t sub0 = vsubq_f32(y.val[0], x.val[0]); - float32x4_t sub1 = vsubq_f32(y.val[1], x.val[1]); - - float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], sub0, sub0); - float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], sub1, sub1); - - accu8 = {accu8_0, accu8_1}; - } - - FAISS_ALWAYS_INLINE float result_8() { - float32x4_t sum_0 = vpaddq_f32(accu8.val[0], accu8.val[0]); - float32x4_t sum_1 = vpaddq_f32(accu8.val[1], accu8.val[1]); - - float32x4_t sum2_0 = vpaddq_f32(sum_0, sum_0); - float32x4_t sum2_1 = vpaddq_f32(sum_1, sum_1); - return vgetq_lane_f32(sum2_0, 0) + vgetq_lane_f32(sum2_1, 0); - } -}; - -template -struct SimilarityIP_neon {}; - -template <> -struct SimilarityIP_neon<1> : public SimilarityIP<1> { - static constexpr int simdwidth = 1; - static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; - - explicit SimilarityIP_neon(const float* y) : SimilarityIP<1>(y) 
{} -}; - -template <> -struct SimilarityIP_neon<8> { - static constexpr int simdwidth = 8; - static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; - - const float *y, *yi; - - float accu; - - explicit SimilarityIP_neon(const float* y) : y(y) {} - - float32x4x2_t accu8; - - FAISS_ALWAYS_INLINE void begin_8() { - accu8 = { vdupq_n_f32(0.0f), vdupq_n_f32(0.0f) }; - yi = y; - } - - FAISS_ALWAYS_INLINE void add_8_components(float32x4x2_t x) { - float32x4x2_t yiv = vld1q_f32_x2(yi); - yi += 8; - - float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], yiv.val[0], x.val[0]); - float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], yiv.val[1], x.val[1]); - accu8 = {accu8_0, accu8_1}; - } - - FAISS_ALWAYS_INLINE void add_8_components_2(float32x4x2_t x1, float32x4x2_t x2) { - float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], x1.val[0], x2.val[0]); - float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], x1.val[1], x2.val[1]); - accu8 = {accu8_0, accu8_1}; - } - - FAISS_ALWAYS_INLINE float result_8() { - float32x4x2_t sum = { - vpaddq_f32(accu8.val[0], accu8.val[0]), - vpaddq_f32(accu8.val[1], accu8.val[1]) - }; - float32x4x2_t sum2 = { - vpaddq_f32(sum.val[0], sum.val[0]), - vpaddq_f32(sum.val[1], sum.val[1]) - }; - return vgetq_lane_f32(sum2.val[0], 0) + vgetq_lane_f32(sum2.val[1], 0); - } -}; - -/******************************************************************* - * DistanceComputer: combines a similarity and a quantizer to do - * code-to-vector or code-to-code comparisons - *******************************************************************/ - -template -struct DCTemplate_neon : SQDistanceComputer {}; - -template -struct DCTemplate_neon - : public DCTemplate { - DCTemplate_neon(size_t d, const std::vector& trained) - : DCTemplate(d, trained) {} -}; - -FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN -template -struct DCTemplate_neon : SQDistanceComputer { - using Sim = Similarity; - - Quantizer quant; - - DCTemplate_neon(size_t d, const std::vector& trained) - : quant(d, trained) {} - - float 
compute_distance(const float* x, const uint8_t* code) const { - Similarity sim(x); - sim.begin_8(); - for (size_t i = 0; i < quant.d; i += 8) { - float32x4x2_t xi = quant.reconstruct_8_components(code, i); - sim.add_8_components(xi); - } - return sim.result_8(); - } - - float compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - Similarity sim(nullptr); - sim.begin_8(); - for (size_t i = 0; i < quant.d; i += 8) { - float32x4x2_t x1 = quant.reconstruct_8_components(code1, i); - float32x4x2_t x2 = quant.reconstruct_8_components(code2, i); - sim.add_8_components_2(x1, x2); - } - return sim.result_8(); - } - - void set_query(const float* x) final { - q = x; - } - - /// compute distance of vector i to current query - float operator()(idx_t i) final { - return query_to_code(codes + i * code_size); - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const override final { - return compute_distance(q, code); - } - - void query_to_codes_batch_4( - const uint8_t* __restrict code_0, - const uint8_t* __restrict code_1, - const uint8_t* __restrict code_2, - const uint8_t* __restrict code_3, - float& dis0, - float& dis1, - float& dis2, - float& dis3) const override final { - Similarity sim0(q); - Similarity sim1(q); - Similarity sim2(q); - Similarity sim3(q); - - sim0.begin_8(); - sim1.begin_8(); - sim2.begin_8(); - sim3.begin_8(); - - for (size_t i = 0; i < quant.d; i += 8) { - float32x4x2_t xi0 = quant.reconstruct_8_components(code_0, i); - float32x4x2_t xi1 = quant.reconstruct_8_components(code_1, i); - float32x4x2_t xi2 = quant.reconstruct_8_components(code_2, i); - float32x4x2_t xi3 = quant.reconstruct_8_components(code_3, i); - sim0.add_8_components(xi0); - sim1.add_8_components(xi1); - sim2.add_8_components(xi2); - sim3.add_8_components(xi3); - } - - dis0 = sim0.result_8(); - dis1 = sim1.result_8(); - dis2 = 
sim2.result_8(); - dis3 = sim3.result_8(); - } -}; -FAISS_PRAGMA_IMPRECISE_FUNCTION_END - -/******************************************************************* - * DistanceComputerByte: computes distances in the integer domain - *******************************************************************/ - -template -struct DistanceComputerByte_neon : SQDistanceComputer {}; - -template -struct DistanceComputerByte_neon - : public DistanceComputerByte { - DistanceComputerByte_neon(int d, const std::vector& unused) - : DistanceComputerByte(d, unused) {} -}; - -template -struct DistanceComputerByte_neon : SQDistanceComputer { - using Sim = Similarity; - - int d; - std::vector tmp; - - DistanceComputerByte_neon(int d, const std::vector&) : d(d), tmp(d) {} - - int compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - int accu = 0; - for (int i = 0; i < d; i++) { - if (Sim::metric_type == METRIC_INNER_PRODUCT) { - accu += int(code1[i]) * code2[i]; - } else { - int diff = int(code1[i]) - code2[i]; - accu += diff * diff; - } - } - return accu; - } - - void set_query(const float* x) final { - for (int i = 0; i < d; i++) { - tmp[i] = int(x[i]); - } - } - - int compute_distance(const float* x, const uint8_t* code) { - set_query(x); - return compute_code_distance(tmp.data(), code); - } - - /// compute distance of vector i to current query - float operator()(idx_t i) final { - return query_to_code(codes + i * code_size); - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const override final { - return compute_code_distance(tmp.data(), code); - } -}; - -template -struct DistanceComputerSQ4UByte_neon : SQDistanceComputer { - using Quantizer = QuantizerTemplate_neon< - Codec4bit_neon, - QuantizerTemplateScaling::UNIFORM, - 8>; - using Similarity = Sim; - - Quantizer quant; - std::vector q_lo; - std::vector q_hi; - float 
final_scale_sq; - - DistanceComputerSQ4UByte_neon(size_t d, const std::vector& trained) - : quant(d, trained), - q_lo((d + 1) / 2 + 64, 0), - q_hi((d + 1) / 2 + 64, 0) { - final_scale_sq = quant.final_scale * quant.final_scale; - } - - void set_query(const float* x) final { - float inv_scale = 1.0f / quant.final_scale; - float offset = quant.vmin; - - for (size_t i = 0; i < quant.d; i++) { - float val = (x[i] - offset) * inv_scale; - int code = (int)std::floor(val); - if (code < 0) - code = 0; - if (code > 15) - code = 15; - - if (i % 2 == 0) { - q_lo[i / 2] = (uint8_t)code; - } else { - q_hi[i / 2] = (uint8_t)code; - } - } - } - - // Only computes L2 distance - float compute_distance(const float* x, const uint8_t* code) const { - return compute_distance_l2(code); - } - - float query_to_code(const uint8_t* code) const override final { - return compute_distance_l2(code); - } - - float compute_distance_l2(const uint8_t* code) const { - uint32x4_t acc = vdupq_n_u32(0); - const size_t d = quant.d; - const uint8x16_t mask_f = vdupq_n_u8(0xF); - const uint8_t* q_lo_ptr = q_lo.data(); - const uint8_t* q_hi_ptr = q_hi.data(); - - size_t i = 0; - for (; i + 32 <= d; i += 32) { - uint8x16_t c = vld1q_u8(code + i / 2); - - uint8x16_t nibbles_lo = vandq_u8(c, mask_f); - uint8x16_t nibbles_hi = vandq_u8(vshrq_n_u8(c, 4), mask_f); - - uint8x16_t q_lo_vec = vld1q_u8(q_lo_ptr + i / 2); - uint8x16_t q_hi_vec = vld1q_u8(q_hi_ptr + i / 2); - - uint8x16_t diff_lo = vabdq_u8(q_lo_vec, nibbles_lo); - uint8x16_t diff_hi = vabdq_u8(q_hi_vec, nibbles_hi); - - uint16x8_t sq_lo_1 = - vmull_u8(vget_low_u8(diff_lo), vget_low_u8(diff_lo)); - uint16x8_t sq_lo_2 = - vmull_u8(vget_high_u8(diff_lo), vget_high_u8(diff_lo)); - uint16x8_t sq_hi_1 = - vmull_u8(vget_low_u8(diff_hi), vget_low_u8(diff_hi)); - uint16x8_t sq_hi_2 = - vmull_u8(vget_high_u8(diff_hi), vget_high_u8(diff_hi)); - - acc = vpadalq_u16(acc, sq_lo_1); - acc = vpadalq_u16(acc, sq_lo_2); - acc = vpadalq_u16(acc, sq_hi_1); - acc = 
vpadalq_u16(acc, sq_hi_2); - } - - uint32_t result = vaddvq_u32(acc); - - if (i < d) { - size_t rem = d - i; - for (size_t j = 0; j < rem; j++) { - size_t idx = i + j; - uint8_t nibble_lo = q_lo[idx / 2]; - uint8_t nibble_hi = q_hi[idx / 2]; - - uint8_t c = code[idx / 2]; - uint8_t nibble; - if (idx % 2 == 0) { - nibble = c & 0xF; - } else { - nibble = (c >> 4) & 0xF; - } - int diff; - if (idx % 2 == 0) { - diff = (int)nibble_lo - (int)nibble; - } else { - diff = (int)nibble_hi - (int)nibble; - } - result += diff * diff; - } - } - - return result * final_scale_sq; - } - - float compute_code_distance_l2(const uint8_t* code1, const uint8_t* code2) - const { - uint32x4_t acc = vdupq_n_u32(0); - const size_t d = quant.d; - const uint8x16_t mask_f = vdupq_n_u8(0xF); - - size_t i = 0; - for (; i + 32 <= d; i += 32) { - uint8x16_t c1 = vld1q_u8(code1 + i / 2); - uint8x16_t c2 = vld1q_u8(code2 + i / 2); - - uint8x16_t n1_lo = vandq_u8(c1, mask_f); - uint8x16_t n1_hi = vandq_u8(vshrq_n_u8(c1, 4), mask_f); - - uint8x16_t n2_lo = vandq_u8(c2, mask_f); - uint8x16_t n2_hi = vandq_u8(vshrq_n_u8(c2, 4), mask_f); - - uint8x16_t diff_lo = vabdq_u8(n1_lo, n2_lo); - uint8x16_t diff_hi = vabdq_u8(n1_hi, n2_hi); - - uint16x8_t sq_lo_1 = - vmull_u8(vget_low_u8(diff_lo), vget_low_u8(diff_lo)); - uint16x8_t sq_lo_2 = - vmull_u8(vget_high_u8(diff_lo), vget_high_u8(diff_lo)); - - uint16x8_t sq_hi_1 = - vmull_u8(vget_low_u8(diff_hi), vget_low_u8(diff_hi)); - uint16x8_t sq_hi_2 = - vmull_u8(vget_high_u8(diff_hi), vget_high_u8(diff_hi)); - - acc = vpadalq_u16(acc, sq_lo_1); - acc = vpadalq_u16(acc, sq_lo_2); - acc = vpadalq_u16(acc, sq_hi_1); - acc = vpadalq_u16(acc, sq_hi_2); - } - - uint32_t result = vaddvq_u32(acc); - - if (i < d) { - size_t rem = d - i; - for (size_t j = 0; j < rem; j++) { - size_t idx = i + j; - uint8_t c1 = code1[idx / 2]; - uint8_t c2 = code2[idx / 2]; - uint8_t n1, n2; - if (idx % 2 == 0) { - n1 = c1 & 0xF; - n2 = c2 & 0xF; - } else { - n1 = (c1 >> 4) & 0xF; - n2 = (c2 
>> 4) & 0xF; - } - int diff = (int)n1 - (int)n2; - result += diff * diff; - } - } - - return result * final_scale_sq; - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance_l2( - codes + i * code_size, codes + j * code_size); - } - - void query_to_codes_batch_4( - const uint8_t* __restrict code_0, - const uint8_t* __restrict code_1, - const uint8_t* __restrict code_2, - const uint8_t* __restrict code_3, - float& dis0, - float& dis1, - float& dis2, - float& dis3) const override final { - uint32x4_t acc0 = vdupq_n_u32(0); - uint32x4_t acc1 = vdupq_n_u32(0); - uint32x4_t acc2 = vdupq_n_u32(0); - uint32x4_t acc3 = vdupq_n_u32(0); - - const size_t d = quant.d; - const uint8x16_t mask_f = vdupq_n_u8(0xF); - const uint8_t* q_lo_ptr = q_lo.data(); - const uint8_t* q_hi_ptr = q_hi.data(); - - size_t i = 0; - for (; i + 32 <= d; i += 32) { - uint8x16_t q_lo_vec = vld1q_u8(q_lo_ptr + i / 2); - uint8x16_t q_hi_vec = vld1q_u8(q_hi_ptr + i / 2); - - auto process = [&](const uint8_t* code, uint32x4_t& acc) { - uint8x16_t c = vld1q_u8(code + i / 2); - uint8x16_t nibbles_lo = vandq_u8(c, mask_f); - uint8x16_t nibbles_hi = vandq_u8(vshrq_n_u8(c, 4), mask_f); - - uint8x16_t diff_lo = vabdq_u8(q_lo_vec, nibbles_lo); - uint8x16_t diff_hi = vabdq_u8(q_hi_vec, nibbles_hi); - - uint16x8_t sq_lo_1 = - vmull_u8(vget_low_u8(diff_lo), vget_low_u8(diff_lo)); - uint16x8_t sq_lo_2 = - vmull_u8(vget_high_u8(diff_lo), vget_high_u8(diff_lo)); - uint16x8_t sq_hi_1 = - vmull_u8(vget_low_u8(diff_hi), vget_low_u8(diff_hi)); - uint16x8_t sq_hi_2 = - vmull_u8(vget_high_u8(diff_hi), vget_high_u8(diff_hi)); - - acc = vpadalq_u16(acc, sq_lo_1); - acc = vpadalq_u16(acc, sq_lo_2); - acc = vpadalq_u16(acc, sq_hi_1); - acc = vpadalq_u16(acc, sq_hi_2); - }; - - process(code_0, acc0); - process(code_1, acc1); - process(code_2, acc2); - process(code_3, acc3); - } - - dis0 = vaddvq_u32(acc0); - dis1 = vaddvq_u32(acc1); - dis2 = vaddvq_u32(acc2); - dis3 = vaddvq_u32(acc3); - - if (i 
< d) { - size_t rem = d - i; - for (size_t j = 0; j < rem; j++) { - size_t idx = i + j; - uint8_t nibble_lo = q_lo[idx / 2]; - uint8_t nibble_hi = q_hi[idx / 2]; - - auto process_scalar = [&](const uint8_t* code, float& dis) { - uint8_t c = code[idx / 2]; - uint8_t nibble; - if (idx % 2 == 0) { - nibble = c & 0xF; - } else { - nibble = (c >> 4) & 0xF; - } - int diff; - if (idx % 2 == 0) { - diff = (int)nibble_lo - (int)nibble; - } else { - diff = (int)nibble_hi - (int)nibble; - } - dis += diff * diff; - }; - - process_scalar(code_0, dis0); - process_scalar(code_1, dis1); - process_scalar(code_2, dis2); - process_scalar(code_3, dis3); - } - } - - dis0 *= final_scale_sq; - dis1 *= final_scale_sq; - dis2 *= final_scale_sq; - dis3 *= final_scale_sq; - } -}; - -/******************************************************************* - * select_distance_computer: runtime selection of template - * specialization - *******************************************************************/ - -template -SQDistanceComputer* select_distance_computer_neon( - QuantizerType qtype, - size_t d, - const std::vector& trained) { - constexpr int SIMDWIDTH = Sim::simdwidth; - switch (qtype) { - case QuantizerType::QT_8bit_uniform: - return new DCTemplate_neon< - QuantizerTemplate_neon, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_4bit_uniform: - return new DistanceComputerSQ4UByte_neon(d, trained); - - case QuantizerType::QT_8bit: - return new DCTemplate_neon< - QuantizerTemplate_neon, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_6bit: - return new DCTemplate_neon< - QuantizerTemplate_neon, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_4bit: - return new DCTemplate_neon< - QuantizerTemplate_neon, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_fp16: - return new DCTemplate_neon< - QuantizerFP16_neon, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_bf16: - return new DCTemplate_neon< - QuantizerBF16_neon, - Sim, - SIMDWIDTH>(d, 
trained); - - case QuantizerType::QT_8bit_direct: - if (d % 16 == 0) { - return new DistanceComputerByte_neon(d, trained); - } else { - return new DCTemplate_neon< - Quantizer8bitDirect_neon, - Sim, - SIMDWIDTH>(d, trained); - } - - case ScalarQuantizer::QT_8bit_direct_signed: - return new DCTemplate_neon< - Quantizer8bitDirectSigned_neon, - Sim, - SIMDWIDTH>(d, trained); - } - FAISS_THROW_MSG("unknown qtype"); - return nullptr; -} - -template -InvertedListScanner* sel2_InvertedListScanner_neon( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - return sel2_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); -} - -template -InvertedListScanner* sel12_InvertedListScanner_neon( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - constexpr int SIMDWIDTH = Similarity::simdwidth; - using QuantizerClass = QuantizerTemplate_neon; - using DCClass = DCTemplate_neon; - return sel2_InvertedListScanner_neon( - sq, quantizer, store_pairs, sel, r); -} - -template -InvertedListScanner* sel1_InvertedListScanner_neon( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - constexpr int SIMDWIDTH = Similarity::simdwidth; - switch (sq->qtype) { - case QuantizerType::QT_8bit_uniform: - return sel12_InvertedListScanner_neon< - Similarity, - Codec8bit_neon, - QuantizerTemplateScaling::UNIFORM>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_4bit_uniform: - return sel12_InvertedListScanner_neon< - Similarity, - Codec4bit_neon, - QuantizerTemplateScaling::UNIFORM>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_8bit: - return sel12_InvertedListScanner_neon< - Similarity, - Codec8bit_neon, - QuantizerTemplateScaling::NON_UNIFORM>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_4bit: - return sel12_InvertedListScanner_neon< - Similarity, - Codec4bit_neon, - 
QuantizerTemplateScaling::NON_UNIFORM>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_6bit: - return sel12_InvertedListScanner_neon< - Similarity, - Codec6bit_neon, - QuantizerTemplateScaling::NON_UNIFORM>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_fp16: - return sel2_InvertedListScanner_neon, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_bf16: - return sel2_InvertedListScanner_neon, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_8bit_direct: - if (sq->d % 16 == 0) { - return sel2_InvertedListScanner_neon< - DistanceComputerByte_neon>( - sq, quantizer, store_pairs, sel, r); - } else { - return sel2_InvertedListScanner_neon, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - } - case ScalarQuantizer::QT_8bit_direct_signed: - return sel2_InvertedListScanner_neon, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - } - - FAISS_THROW_MSG("unknown qtype"); - return nullptr; -} - -template -InvertedListScanner* sel0_InvertedListScanner_neon( - MetricType mt, - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool by_residual) { - if (mt == METRIC_L2) { - return sel1_InvertedListScanner_neon>( - sq, quantizer, store_pairs, sel, by_residual); - } else if (mt == METRIC_INNER_PRODUCT) { - return sel1_InvertedListScanner_neon>( - sq, quantizer, store_pairs, sel, by_residual); - } else { - FAISS_THROW_MSG("unsupported metric type"); - } -} - -} -} -} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec_rvv.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec_rvv.h deleted file mode 100644 index f52ecba81..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec_rvv.h +++ /dev/null @@ -1,1346 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. 
- * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#if defined(__riscv_vector) - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -using QuantizerType = ScalarQuantizer::QuantizerType; -using RangeStat = ScalarQuantizer::RangeStat; -using SQDistanceComputer = ScalarQuantizer::SQDistanceComputer; -using SQuantizer = ScalarQuantizer::SQuantizer; - -inline size_t get_vlen_f32_m1() { - return __riscv_vsetvlmax_e32m1(); -} -inline size_t get_vlen_f32_m2() { - return __riscv_vsetvlmax_e32m2(); -} -inline size_t get_vlen_f32_m4() { - return __riscv_vsetvlmax_e32m4(); -} - -/******************************************************************* - * Codec: converts between values in [0, 1] and an index in a code - * array. The "i" parameter is the vector component index (not byte - * index). 
- */ - -constexpr size_t RVV_CODEC_STACK_THRESHOLD = 512; - -struct Codec8bit_rvv : public Codec8bit { - static FAISS_ALWAYS_INLINE vfloat32m4_t - decode_components(const uint8_t* code, int i, size_t vl) { - vuint8m1_t v_u8 = __riscv_vle8_v_u8m1(code + i, vl); - vuint16m2_t v_u16 = __riscv_vwcvtu_x_x_v_u16m2(v_u8, vl); - vuint32m4_t v_u32 = __riscv_vwcvtu_x_x_v_u32m4(v_u16, vl); - vfloat32m4_t v_f32 = __riscv_vfcvt_f_xu_v_f32m4(v_u32, vl); - vfloat32m4_t one_255 = __riscv_vfmv_v_f_f32m4(1.0f / 255.0f, vl); - vfloat32m4_t half_one_255 = __riscv_vfmv_v_f_f32m4(0.5f / 255.0f, vl); - return __riscv_vfmadd_vv_f32m4(v_f32, one_255, half_one_255, vl); - } -}; - -struct Codec4bit_rvv : public Codec4bit { - static FAISS_ALWAYS_INLINE vfloat32m4_t - decode_components(const uint8_t* code, int i, size_t vl) { - auto process = [&](uint32_t* unpacked_buf) -> vfloat32m4_t { - for (size_t j = 0; j < vl; ++j) { - size_t current_idx = static_cast(i) + j; - const uint8_t byte = code[current_idx / 2]; - unpacked_buf[j] = - (current_idx % 2 == 0) ? 
(byte & 0x0F) : (byte >> 4); - } - vuint32m4_t v_u32 = __riscv_vle32_v_u32m4(unpacked_buf, vl); - vfloat32m4_t v_f32 = __riscv_vfcvt_f_xu_v_f32m4(v_u32, vl); - vfloat32m4_t one_15 = __riscv_vfmv_v_f_f32m4(1.0f / 15.0f, vl); - vfloat32m4_t half = __riscv_vfmv_v_f_f32m4(0.5f, vl); - vfloat32m4_t temp_sum = __riscv_vfadd_vv_f32m4(v_f32, half, vl); - return __riscv_vfmul_vv_f32m4(temp_sum, one_15, vl); - }; - - if (vl <= RVV_CODEC_STACK_THRESHOLD) { - std::array stack_buf{}; - return process(stack_buf.data()); - } else { - std::vector heap_buf(vl); - return process(heap_buf.data()); - } - } -}; - -struct Codec6bit_rvv : public Codec6bit { - static FAISS_ALWAYS_INLINE void decode_components( - const uint8_t* code, - int i, - size_t vl, - float* out) { - const size_t max_chunk = __riscv_vsetvlmax_e32m4(); - - std::array unpacked_buf; - FAISS_THROW_IF_NOT_MSG( - max_chunk <= RVV_CODEC_STACK_THRESHOLD, - "RVV max_chunk exceeds stack buffer"); - - size_t offset = 0; - while (offset < vl) { - const size_t chunk_vl = std::min(vl - offset, max_chunk); - - for (size_t j = 0; j < chunk_vl; ++j) { - size_t abs_i = static_cast(i) + offset + j; - size_t tab = abs_i / 4; - size_t q = abs_i % 4; - const uint8_t* p_grp = code + tab * 3; - uint32_t x4 = 0; - if (q == 0) { - x4 = p_grp[0] & 0x3F; - } else if (q == 1) { - x4 = ((p_grp[0] >> 6) | (p_grp[1] << 2)) & 0x3F; - } else if (q == 2) { - x4 = ((p_grp[1] >> 4) | (p_grp[2] << 4)) & 0x3F; - } else { - x4 = (p_grp[2] >> 2) & 0x3F; - } - unpacked_buf[j] = x4; - } - - vuint32m4_t v_u32 = - __riscv_vle32_v_u32m4(unpacked_buf.data(), chunk_vl); - vfloat32m4_t v_f32 = __riscv_vfcvt_f_xu_v_f32m4(v_u32, chunk_vl); - - vfloat32m4_t one_63 = - __riscv_vfmv_v_f_f32m4(1.0f / 63.0f, chunk_vl); - vfloat32m4_t half_one_63 = - __riscv_vfmv_v_f_f32m4(0.5f / 63.0f, chunk_vl); - - vfloat32m4_t chunk_result = __riscv_vfmadd_vv_f32m4( - v_f32, one_63, half_one_63, chunk_vl); - - __riscv_vse32_v_f32m4(out + offset, chunk_result, chunk_vl); - - offset += 
chunk_vl; - } - } -}; - -/******************************************************************* - * Quantizer: normalizes scalar vector components, then passes them - * through a codec - *******************************************************************/ -template -struct QuantizerTemplate_rvv {}; - -template -struct QuantizerTemplate_rvv - : public QuantizerTemplate< - Codec, - QuantizerTemplateScaling::UNIFORM, - 1> { - QuantizerTemplate_rvv(size_t d, const std::vector& trained) - : QuantizerTemplate( - d, - trained) {} - - FAISS_ALWAYS_INLINE vfloat32m4_t - reconstruct_components(const uint8_t* code, int i, size_t vl) const { - vfloat32m4_t xi = Codec::decode_components(code, i, vl); - - vfloat32m4_t v_vmin = __riscv_vfmv_v_f_f32m4(this->vmin, vl); - vfloat32m4_t v_vdiff = __riscv_vfmv_v_f_f32m4(this->vdiff, vl); - - return __riscv_vfmadd_vv_f32m4(xi, v_vdiff, v_vmin, vl); - } -}; - -template -struct QuantizerTemplate_rvv - : public QuantizerTemplate< - Codec, - QuantizerTemplateScaling::NON_UNIFORM, - 1> { - QuantizerTemplate_rvv(size_t d, const std::vector& trained) - : QuantizerTemplate< - Codec, - QuantizerTemplateScaling::NON_UNIFORM, - 1>(d, trained) {} - FAISS_ALWAYS_INLINE vfloat32m4_t - reconstruct_components(const uint8_t* code, int i, size_t vl) const { - vfloat32m4_t xi = Codec::decode_components(code, i, vl); - - vfloat32m4_t v_vmin = __riscv_vle32_v_f32m4(this->vmin + i, vl); - vfloat32m4_t v_vdiff = __riscv_vle32_v_f32m4(this->vdiff + i, vl); - - return __riscv_vfmadd_vv_f32m4(xi, v_vdiff, v_vmin, vl); - } -}; - -template <> -struct QuantizerTemplate_rvv< - Codec6bit_rvv, - QuantizerTemplateScaling::NON_UNIFORM, - 0> - : public QuantizerTemplate< - Codec6bit_rvv, - QuantizerTemplateScaling::NON_UNIFORM, - 1> { - QuantizerTemplate_rvv(size_t d, const std::vector& trained) - : QuantizerTemplate< - Codec6bit_rvv, - QuantizerTemplateScaling::NON_UNIFORM, - 1>(d, trained) {} - - FAISS_ALWAYS_INLINE void reconstruct_components( - const uint8_t* code, - 
int i, - size_t vl, - float* out) const { - Codec6bit_rvv::decode_components(code, i, vl, out); - - const size_t max_chunk = __riscv_vsetvlmax_e32m4(); - size_t offset = 0; - - while (offset < vl) { - const size_t chunk_vl = std::min(vl - offset, max_chunk); - - vfloat32m4_t xi = __riscv_vle32_v_f32m4(out + offset, chunk_vl); - - vfloat32m4_t v_vmin = - __riscv_vle32_v_f32m4(this->vmin + i + offset, chunk_vl); - vfloat32m4_t v_vdiff = - __riscv_vle32_v_f32m4(this->vdiff + i + offset, chunk_vl); - - vfloat32m4_t result = - __riscv_vfmadd_vv_f32m4(xi, v_vdiff, v_vmin, chunk_vl); - - __riscv_vse32_v_f32m4(out + offset, result, chunk_vl); - - offset += chunk_vl; - } - } -}; - -/******************************************************************* - * FP16 quantizer - *******************************************************************/ - -template -struct QuantizerFP16_rvv {}; - -template <> -struct QuantizerFP16_rvv<1> : public QuantizerFP16<1> { - QuantizerFP16_rvv(size_t d, const std::vector& unused) - : QuantizerFP16<1>(d, unused) {} -}; - -template <> -struct QuantizerFP16_rvv<0> : public QuantizerFP16<1> { - QuantizerFP16_rvv(size_t d, const std::vector& trained) - : QuantizerFP16<1>(d, trained) {} - - FAISS_ALWAYS_INLINE vfloat32m2_t - reconstruct_components(const uint8_t* code, int i, size_t vl) const { - const _Float16* code_ptr = reinterpret_cast( - code + 2 * static_cast(i)); - vfloat16m1_t v_f16 = __riscv_vle16_v_f16m1(code_ptr, vl); - return __riscv_vfwcvt_f_f_v_f32m2(v_f16, vl); - } -}; - -/******************************************************************* - * BF16 quantizer - *******************************************************************/ - -template -struct QuantizerBF16_rvv {}; - -template <> -struct QuantizerBF16_rvv<1> : public QuantizerBF16<1> { - QuantizerBF16_rvv(size_t d, const std::vector& unused) - : QuantizerBF16<1>(d, unused) {} -}; - -template <> -struct QuantizerBF16_rvv<0> : public QuantizerBF16<1> { - QuantizerBF16_rvv(size_t d, const 
std::vector& trained) - : QuantizerBF16<1>(d, trained) {} - - FAISS_ALWAYS_INLINE vfloat32m2_t - reconstruct_components(const uint8_t* code, int i, size_t vl) const { - const uint16_t* code_ptr = reinterpret_cast( - code + 2 * static_cast(i)); - vuint16m1_t v_u16 = __riscv_vle16_v_u16m1(code_ptr, vl); - vuint32m2_t v_u32 = __riscv_vwaddu_vx_u32m2(v_u16, 0, vl); - vuint32m2_t v_shifted = __riscv_vsll_vx_u32m2(v_u32, 16, vl); - return __riscv_vreinterpret_v_u32m2_f32m2(v_shifted); - } -}; - -/******************************************************************* - * 8bit_direct quantizer - *******************************************************************/ - -template -struct Quantizer8bitDirect_rvv {}; -template <> -struct Quantizer8bitDirect_rvv<1> : public Quantizer8bitDirect<1> { - Quantizer8bitDirect_rvv(size_t d, const std::vector& u) - : Quantizer8bitDirect(d, u) {} -}; -template <> -struct Quantizer8bitDirect_rvv<0> : public Quantizer8bitDirect<1> { - Quantizer8bitDirect_rvv(size_t d, const std::vector& t) - : Quantizer8bitDirect<1>(d, t) {} - - FAISS_ALWAYS_INLINE vfloat32m4_t - reconstruct_components(const uint8_t* code, int i, size_t vl) const { - vuint8m1_t v_u8 = __riscv_vle8_v_u8m1(code + i, vl); - vuint16m2_t v_u16 = __riscv_vwcvtu_x_x_v_u16m2(v_u8, vl); - vuint32m4_t v_u32 = __riscv_vwcvtu_x_x_v_u32m4(v_u16, vl); - return __riscv_vfcvt_f_xu_v_f32m4(v_u32, vl); - } -}; - -/******************************************************************* - * 8bit_direct_signed quantizer - *******************************************************************/ -template -struct Quantizer8bitDirectSigned_rvv {}; - -template <> -struct Quantizer8bitDirectSigned_rvv<1> : public Quantizer8bitDirectSigned<1> { - Quantizer8bitDirectSigned_rvv(size_t d, const std::vector& unused) - : Quantizer8bitDirectSigned(d, unused) {} -}; - -template <> -struct Quantizer8bitDirectSigned_rvv<0> : public Quantizer8bitDirectSigned<1> { - Quantizer8bitDirectSigned_rvv(size_t d, const 
std::vector& trained) - : Quantizer8bitDirectSigned<1>(d, trained) {} - - FAISS_ALWAYS_INLINE vfloat32m4_t - reconstruct_components(const uint8_t* code, int i, size_t vl) const { - vuint8m1_t v_u8 = __riscv_vle8_v_u8m1(code + i, vl); - vuint16m2_t v_u16 = __riscv_vwcvtu_x_x_v_u16m2(v_u8, vl); - vuint32m4_t v_u32 = __riscv_vwcvtu_x_x_v_u32m4(v_u16, vl); - vfloat32m4_t v_f32 = __riscv_vfcvt_f_xu_v_f32m4(v_u32, vl); - vfloat32m4_t c128 = __riscv_vfmv_v_f_f32m4(128.0f, vl); - return __riscv_vfsub_vv_f32m4(v_f32, c128, vl); - } -}; - -template -ScalarQuantizer::SQuantizer* select_quantizer_1_rvv( - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained) { - switch (qtype) { - case ScalarQuantizer::QT_8bit: - return new QuantizerTemplate_rvv< - Codec8bit_rvv, - QuantizerTemplateScaling::NON_UNIFORM, - SIMDWIDTH>(dim, trained); - case ScalarQuantizer::QT_8bit_uniform: - return new QuantizerTemplate_rvv< - Codec8bit_rvv, - QuantizerTemplateScaling::UNIFORM, - SIMDWIDTH>(dim, trained); - case ScalarQuantizer::QT_4bit: - return new QuantizerTemplate_rvv< - Codec4bit_rvv, - QuantizerTemplateScaling::NON_UNIFORM, - SIMDWIDTH>(dim, trained); - case ScalarQuantizer::QT_4bit_uniform: - return new QuantizerTemplate_rvv< - Codec4bit_rvv, - QuantizerTemplateScaling::UNIFORM, - SIMDWIDTH>(dim, trained); - case ScalarQuantizer::QT_6bit: - return new QuantizerTemplate_rvv< - Codec6bit_rvv, - QuantizerTemplateScaling::NON_UNIFORM, - SIMDWIDTH>(dim, trained); - case ScalarQuantizer::QT_fp16: - return new QuantizerFP16_rvv(dim, trained); - case ScalarQuantizer::QT_bf16: - return new QuantizerBF16_rvv(dim, trained); - case ScalarQuantizer::QT_8bit_direct: - return new Quantizer8bitDirect_rvv(dim, trained); - case ScalarQuantizer::QT_8bit_direct_signed: - return new Quantizer8bitDirectSigned_rvv(dim, trained); - default: - FAISS_THROW_FMT("Quantizer type %d not supported", qtype); - } - return nullptr; -} - 
-/******************************************************************* - * Similarity "Tags": Used as template parameters to select metric. - * These are now stateless. - *******************************************************************/ - -template -struct SimilarityL2_rvv {}; -template <> -struct SimilarityL2_rvv<0> { - static constexpr int simdwidth = 1; - static constexpr MetricType metric_type = METRIC_L2; -}; - -template -struct SimilarityIP_rvv {}; -template <> -struct SimilarityIP_rvv<0> { - static constexpr int simdwidth = 1; - static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; -}; - -/******************************************************************* - * DistanceComputer: combines a similarity and a quantizer to do - * code-to-vector or code-to-code comparisons - *******************************************************************/ -template -struct DCTemplate_rvv : SQDistanceComputer {}; - -template -struct DCTemplate_rvv - : public DCTemplate { - DCTemplate_rvv(size_t d, const std::vector& trained) - : DCTemplate(d, trained) {} -}; - -FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN -template -struct DCTemplate_rvv : SQDistanceComputer { - using Sim = Similarity; - - Quantizer quant; - - using Quantizer6bitSpecialized = QuantizerTemplate_rvv< - Codec6bit_rvv, - QuantizerTemplateScaling::NON_UNIFORM, - 0>; - - DCTemplate_rvv(size_t d, const std::vector& trained) - : quant(d, trained) {} - - float compute_distance(const float* x, const uint8_t* code) const { - size_t d = quant.d; - size_t i = 0; - const size_t vlmax = __riscv_vsetvlmax_e32m2(); - - vfloat32m2_t vacc0 = __riscv_vfmv_v_f_f32m2(0.0f, vlmax); - vfloat32m2_t vacc1 = __riscv_vfmv_v_f_f32m2(0.0f, vlmax); - vfloat32m2_t vacc2 = __riscv_vfmv_v_f_f32m2(0.0f, vlmax); - vfloat32m2_t vacc3 = __riscv_vfmv_v_f_f32m2(0.0f, vlmax); - - for (; i + 4 * vlmax <= d; i += 4 * vlmax) { - if constexpr ( - std::is_same_v> || - std::is_same_v>) { - vfloat32m2_t x0 = quant.reconstruct_components(code, i, vlmax); 
- vfloat32m2_t x1 = - quant.reconstruct_components(code, i + vlmax, vlmax); - vfloat32m2_t x2 = quant.reconstruct_components( - code, i + 2 * vlmax, vlmax); - vfloat32m2_t x3 = quant.reconstruct_components( - code, i + 3 * vlmax, vlmax); - - const float* y_ptr = x + i; - vfloat32m2_t y0 = __riscv_vle32_v_f32m2(y_ptr, vlmax); - vfloat32m2_t y1 = __riscv_vle32_v_f32m2(y_ptr + vlmax, vlmax); - vfloat32m2_t y2 = - __riscv_vle32_v_f32m2(y_ptr + 2 * vlmax, vlmax); - vfloat32m2_t y3 = - __riscv_vle32_v_f32m2(y_ptr + 3 * vlmax, vlmax); - - if constexpr (Sim::metric_type == METRIC_L2) { - vfloat32m2_t d0 = __riscv_vfsub_vv_f32m2(y0, x0, vlmax); - vfloat32m2_t d1 = __riscv_vfsub_vv_f32m2(y1, x1, vlmax); - vfloat32m2_t d2 = __riscv_vfsub_vv_f32m2(y2, x2, vlmax); - vfloat32m2_t d3 = __riscv_vfsub_vv_f32m2(y3, x3, vlmax); - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, d0, d0, vlmax); - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, d1, d1, vlmax); - vacc2 = __riscv_vfmacc_vv_f32m2(vacc2, d2, d2, vlmax); - vacc3 = __riscv_vfmacc_vv_f32m2(vacc3, d3, d3, vlmax); - } else { - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, y0, x0, vlmax); - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, y1, x1, vlmax); - vacc2 = __riscv_vfmacc_vv_f32m2(vacc2, y2, x2, vlmax); - vacc3 = __riscv_vfmacc_vv_f32m2(vacc3, y3, x3, vlmax); - } - } else { - vfloat32m4_t x_m4_0, x_m4_1; - constexpr size_t buf_len = 4 * 128; - - if constexpr (std::is_same_v< - Quantizer, - Quantizer6bitSpecialized>) { - std::array temp_buf; - FAISS_THROW_IF_NOT_MSG( - 4 * vlmax <= buf_len, - "RVV vlmax too large for stack buffer in DCTemplate_rvv"); - - quant.reconstruct_components( - code, i, 2 * vlmax, temp_buf.data()); - quant.reconstruct_components( - code, - i + 2 * vlmax, - 2 * vlmax, - temp_buf.data() + 2 * vlmax); - - x_m4_0 = __riscv_vle32_v_f32m4(temp_buf.data(), 2 * vlmax); - x_m4_1 = __riscv_vle32_v_f32m4( - temp_buf.data() + 2 * vlmax, 2 * vlmax); - - } else { - (void)__riscv_vsetvl_e32m4(2 * vlmax); - x_m4_0 = 
quant.reconstruct_components(code, i, 2 * vlmax); - x_m4_1 = quant.reconstruct_components( - code, i + 2 * vlmax, 2 * vlmax); - (void)__riscv_vsetvl_e32m2(vlmax); - } - - vfloat32m2_t x0 = __riscv_vget_v_f32m4_f32m2(x_m4_0, 0); - vfloat32m2_t x1 = __riscv_vget_v_f32m4_f32m2(x_m4_0, 1); - vfloat32m2_t x2 = __riscv_vget_v_f32m4_f32m2(x_m4_1, 0); - vfloat32m2_t x3 = __riscv_vget_v_f32m4_f32m2(x_m4_1, 1); - - const float* y_ptr = x + i; - vfloat32m2_t y0 = __riscv_vle32_v_f32m2(y_ptr, vlmax); - vfloat32m2_t y1 = __riscv_vle32_v_f32m2(y_ptr + vlmax, vlmax); - vfloat32m2_t y2 = - __riscv_vle32_v_f32m2(y_ptr + 2 * vlmax, vlmax); - vfloat32m2_t y3 = - __riscv_vle32_v_f32m2(y_ptr + 3 * vlmax, vlmax); - - if constexpr (Sim::metric_type == METRIC_L2) { - vfloat32m2_t d0 = __riscv_vfsub_vv_f32m2(y0, x0, vlmax); - vfloat32m2_t d1 = __riscv_vfsub_vv_f32m2(y1, x1, vlmax); - vfloat32m2_t d2 = __riscv_vfsub_vv_f32m2(y2, x2, vlmax); - vfloat32m2_t d3 = __riscv_vfsub_vv_f32m2(y3, x3, vlmax); - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, d0, d0, vlmax); - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, d1, d1, vlmax); - vacc2 = __riscv_vfmacc_vv_f32m2(vacc2, d2, d2, vlmax); - vacc3 = __riscv_vfmacc_vv_f32m2(vacc3, d3, d3, vlmax); - } else { - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, y0, x0, vlmax); - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, y1, x1, vlmax); - vacc2 = __riscv_vfmacc_vv_f32m2(vacc2, y2, x2, vlmax); - vacc3 = __riscv_vfmacc_vv_f32m2(vacc3, y3, x3, vlmax); - } - } - } - - for (; i < d;) { - size_t vl; - if constexpr ( - std::is_same_v> || - std::is_same_v>) { - vl = __riscv_vsetvl_e32m2(d - i); - vfloat32m2_t xi = quant.reconstruct_components(code, i, vl); - const float* y_ptr = x + i; - vfloat32m2_t y_rem = __riscv_vle32_v_f32m2(y_ptr, vl); - if constexpr (Sim::metric_type == METRIC_L2) { - vfloat32m2_t diff = __riscv_vfsub_vv_f32m2(y_rem, xi, vl); - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, diff, diff, vl); - } else { - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, y_rem, xi, vl); - } - } else { - 
vl = __riscv_vsetvl_e32m4(d - i); - - vfloat32m4_t xi_m4; - - if constexpr (std::is_same_v< - Quantizer, - Quantizer6bitSpecialized>) { - if (vl <= RVV_CODEC_STACK_THRESHOLD) { - std::array temp_buf; - quant.reconstruct_components( - code, i, vl, temp_buf.data()); - xi_m4 = __riscv_vle32_v_f32m4(temp_buf.data(), vl); - } else { - std::vector temp_buf(vl); - quant.reconstruct_components( - code, i, vl, temp_buf.data()); - xi_m4 = __riscv_vle32_v_f32m4(temp_buf.data(), vl); - } - - } else { - xi_m4 = quant.reconstruct_components(code, i, vl); - } - - vfloat32m2_t p0 = __riscv_vget_v_f32m4_f32m2(xi_m4, 0); - vfloat32m2_t p1 = __riscv_vget_v_f32m4_f32m2(xi_m4, 1); - - const float* y_ptr = x + i; - - const size_t vlmax_m2 = __riscv_vsetvlmax_e32m2(); - size_t vl0 = (vl > vlmax_m2) ? vlmax_m2 : vl; - size_t vl1 = vl - vl0; - - if (vl0 > 0) { - vfloat32m2_t y0 = __riscv_vle32_v_f32m2(y_ptr, vl0); - if constexpr (Sim::metric_type == METRIC_L2) { - vfloat32m2_t d0 = __riscv_vfsub_vv_f32m2(y0, p0, vl0); - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, d0, d0, vl0); - } else { - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, y0, p0, vl0); - } - } - if (vl1 > 0) { - vfloat32m2_t y1 = __riscv_vle32_v_f32m2(y_ptr + vl0, vl1); - if constexpr (Sim::metric_type == METRIC_L2) { - vfloat32m2_t d1 = __riscv_vfsub_vv_f32m2(y1, p1, vl1); - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, d1, d1, vl1); - } else { - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, y1, p1, vl1); - } - } - } - i += vl; - } - - vfloat32m1_t sum_scalar = __riscv_vfmv_s_f_f32m1(0.0f, 1); - vfloat32m1_t s0 = - __riscv_vfredusum_vs_f32m2_f32m1(vacc0, sum_scalar, vlmax); - vfloat32m1_t s1 = - __riscv_vfredusum_vs_f32m2_f32m1(vacc1, sum_scalar, vlmax); - vfloat32m1_t s2 = - __riscv_vfredusum_vs_f32m2_f32m1(vacc2, sum_scalar, vlmax); - vfloat32m1_t s3 = - __riscv_vfredusum_vs_f32m2_f32m1(vacc3, sum_scalar, vlmax); - - float f0 = __riscv_vfmv_f_s_f32m1_f32(s0); - float f1 = __riscv_vfmv_f_s_f32m1_f32(s1); - float f2 = __riscv_vfmv_f_s_f32m1_f32(s2); - 
float f3 = __riscv_vfmv_f_s_f32m1_f32(s3); - - return f0 + f1 + f2 + f3; - } - - float compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - size_t d = quant.d; - size_t i = 0; - const size_t vlmax = __riscv_vsetvlmax_e32m2(); - - vfloat32m2_t vacc0 = __riscv_vfmv_v_f_f32m2(0.0f, vlmax); - vfloat32m2_t vacc1 = __riscv_vfmv_v_f_f32m2(0.0f, vlmax); - vfloat32m2_t vacc2 = __riscv_vfmv_v_f_f32m2(0.0f, vlmax); - vfloat32m2_t vacc3 = __riscv_vfmv_v_f_f32m2(0.0f, vlmax); - - for (; i + 4 * vlmax <= d; i += 4 * vlmax) { - if constexpr ( - std::is_same_v> || - std::is_same_v>) { - vfloat32m2_t x1_0 = - quant.reconstruct_components(code1, i, vlmax); - vfloat32m2_t x1_1 = - quant.reconstruct_components(code1, i + vlmax, vlmax); - vfloat32m2_t x1_2 = quant.reconstruct_components( - code1, i + 2 * vlmax, vlmax); - vfloat32m2_t x1_3 = quant.reconstruct_components( - code1, i + 3 * vlmax, vlmax); - vfloat32m2_t x2_0 = - quant.reconstruct_components(code2, i, vlmax); - vfloat32m2_t x2_1 = - quant.reconstruct_components(code2, i + vlmax, vlmax); - vfloat32m2_t x2_2 = quant.reconstruct_components( - code2, i + 2 * vlmax, vlmax); - vfloat32m2_t x2_3 = quant.reconstruct_components( - code2, i + 3 * vlmax, vlmax); - - if constexpr (Sim::metric_type == METRIC_L2) { - vfloat32m2_t d0 = __riscv_vfsub_vv_f32m2(x1_0, x2_0, vlmax); - vfloat32m2_t d1 = __riscv_vfsub_vv_f32m2(x1_1, x2_1, vlmax); - vfloat32m2_t d2 = __riscv_vfsub_vv_f32m2(x1_2, x2_2, vlmax); - vfloat32m2_t d3 = __riscv_vfsub_vv_f32m2(x1_3, x2_3, vlmax); - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, d0, d0, vlmax); - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, d1, d1, vlmax); - vacc2 = __riscv_vfmacc_vv_f32m2(vacc2, d2, d2, vlmax); - vacc3 = __riscv_vfmacc_vv_f32m2(vacc3, d3, d3, vlmax); - } else { - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, x1_0, x2_0, vlmax); - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, x1_1, x2_1, vlmax); - vacc2 = __riscv_vfmacc_vv_f32m2(vacc2, x1_2, x2_2, vlmax); - vacc3 = 
__riscv_vfmacc_vv_f32m2(vacc3, x1_3, x2_3, vlmax); - } - } else { - vfloat32m4_t x1_m4_0, x1_m4_1, x2_m4_0, x2_m4_1; - - if constexpr (std::is_same_v< - Quantizer, - Quantizer6bitSpecialized>) { - constexpr size_t buf_len = 8 * 128; - std::array temp_buf; - FAISS_THROW_IF_NOT_MSG( - 8 * vlmax <= buf_len, - "RVV vlmax too large for stack buffer in DCTemplate_rvv"); - - quant.reconstruct_components( - code1, i, 2 * vlmax, temp_buf.data()); - quant.reconstruct_components( - code1, - i + 2 * vlmax, - 2 * vlmax, - temp_buf.data() + 2 * vlmax); - quant.reconstruct_components( - code2, i, 2 * vlmax, temp_buf.data() + 4 * vlmax); - quant.reconstruct_components( - code2, - i + 2 * vlmax, - 2 * vlmax, - temp_buf.data() + 6 * vlmax); - - x1_m4_0 = __riscv_vle32_v_f32m4(temp_buf.data(), 2 * vlmax); - x1_m4_1 = __riscv_vle32_v_f32m4( - temp_buf.data() + 2 * vlmax, 2 * vlmax); - x2_m4_0 = __riscv_vle32_v_f32m4( - temp_buf.data() + 4 * vlmax, 2 * vlmax); - x2_m4_1 = __riscv_vle32_v_f32m4( - temp_buf.data() + 6 * vlmax, 2 * vlmax); - - } else { - x1_m4_0 = quant.reconstruct_components(code1, i, 2 * vlmax); - x1_m4_1 = quant.reconstruct_components( - code1, i + 2 * vlmax, 2 * vlmax); - x2_m4_0 = quant.reconstruct_components(code2, i, 2 * vlmax); - x2_m4_1 = quant.reconstruct_components( - code2, i + 2 * vlmax, 2 * vlmax); - } - vfloat32m2_t x1_0 = __riscv_vget_v_f32m4_f32m2(x1_m4_0, 0); - vfloat32m2_t x1_1 = __riscv_vget_v_f32m4_f32m2(x1_m4_0, 1); - vfloat32m2_t x1_2 = __riscv_vget_v_f32m4_f32m2(x1_m4_1, 0); - vfloat32m2_t x1_3 = __riscv_vget_v_f32m4_f32m2(x1_m4_1, 1); - vfloat32m2_t x2_0 = __riscv_vget_v_f32m4_f32m2(x2_m4_0, 0); - vfloat32m2_t x2_1 = __riscv_vget_v_f32m4_f32m2(x2_m4_0, 1); - vfloat32m2_t x2_2 = __riscv_vget_v_f32m4_f32m2(x2_m4_1, 0); - vfloat32m2_t x2_3 = __riscv_vget_v_f32m4_f32m2(x2_m4_1, 1); - - if constexpr (Sim::metric_type == METRIC_L2) { - vfloat32m2_t d0 = __riscv_vfsub_vv_f32m2(x1_0, x2_0, vlmax); - vfloat32m2_t d1 = __riscv_vfsub_vv_f32m2(x1_1, x2_1, 
vlmax); - vfloat32m2_t d2 = __riscv_vfsub_vv_f32m2(x1_2, x2_2, vlmax); - vfloat32m2_t d3 = __riscv_vfsub_vv_f32m2(x1_3, x2_3, vlmax); - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, d0, d0, vlmax); - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, d1, d1, vlmax); - vacc2 = __riscv_vfmacc_vv_f32m2(vacc2, d2, d2, vlmax); - vacc3 = __riscv_vfmacc_vv_f32m2(vacc3, d3, d3, vlmax); - } else { - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, x1_0, x2_0, vlmax); - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, x1_1, x2_1, vlmax); - vacc2 = __riscv_vfmacc_vv_f32m2(vacc2, x1_2, x2_2, vlmax); - vacc3 = __riscv_vfmacc_vv_f32m2(vacc3, x1_3, x2_3, vlmax); - } - } - } - - for (; i < d;) { - size_t vl; - if constexpr ( - std::is_same_v> || - std::is_same_v>) { - vl = __riscv_vsetvl_e32m2(d - i); - vfloat32m2_t x1i = quant.reconstruct_components(code1, i, vl); - vfloat32m2_t x2i = quant.reconstruct_components(code2, i, vl); - if constexpr (Sim::metric_type == METRIC_L2) { - vfloat32m2_t diff = __riscv_vfsub_vv_f32m2(x1i, x2i, vl); - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, diff, diff, vl); - } else { - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, x1i, x2i, vl); - } - } else { - vl = __riscv_vsetvl_e32m4(d - i); - - vfloat32m4_t x1i_m4, x2i_m4; - - if constexpr (std::is_same_v< - Quantizer, - Quantizer6bitSpecialized>) { - if (2 * vl <= RVV_CODEC_STACK_THRESHOLD * 2) { - std::array - temp_buf; - quant.reconstruct_components( - code1, i, vl, temp_buf.data()); - quant.reconstruct_components( - code2, i, vl, temp_buf.data() + vl); - x1i_m4 = __riscv_vle32_v_f32m4(temp_buf.data(), vl); - x2i_m4 = - __riscv_vle32_v_f32m4(temp_buf.data() + vl, vl); - } else { - std::vector temp_buf(2 * vl); - quant.reconstruct_components( - code1, i, vl, temp_buf.data()); - quant.reconstruct_components( - code2, i, vl, temp_buf.data() + vl); - x1i_m4 = __riscv_vle32_v_f32m4(temp_buf.data(), vl); - x2i_m4 = - __riscv_vle32_v_f32m4(temp_buf.data() + vl, vl); - } - } else { - x1i_m4 = quant.reconstruct_components(code1, i, vl); - x2i_m4 = 
quant.reconstruct_components(code2, i, vl); - } - - vfloat32m2_t p1_0 = __riscv_vget_v_f32m4_f32m2(x1i_m4, 0); - vfloat32m2_t p1_1 = __riscv_vget_v_f32m4_f32m2(x1i_m4, 1); - vfloat32m2_t p2_0 = __riscv_vget_v_f32m4_f32m2(x2i_m4, 0); - vfloat32m2_t p2_1 = __riscv_vget_v_f32m4_f32m2(x2i_m4, 1); - - const size_t vlmax_m2 = __riscv_vsetvlmax_e32m2(); - size_t vl0 = (vl > vlmax_m2) ? vlmax_m2 : vl; - size_t vl1 = vl - vl0; - - if (vl0 > 0) { - if constexpr (Sim::metric_type == METRIC_L2) { - vfloat32m2_t diff0 = - __riscv_vfsub_vv_f32m2(p1_0, p2_0, vl0); - vacc0 = __riscv_vfmacc_vv_f32m2( - vacc0, diff0, diff0, vl0); - } else { - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, p1_0, p2_0, vl0); - } - } - if (vl1 > 0) { - if constexpr (Sim::metric_type == METRIC_L2) { - vfloat32m2_t diff1 = - __riscv_vfsub_vv_f32m2(p1_1, p2_1, vl1); - vacc1 = __riscv_vfmacc_vv_f32m2( - vacc1, diff1, diff1, vl1); - } else { - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, p1_1, p2_1, vl1); - } - } - } - i += vl; - } - - vfloat32m1_t sum_scalar = __riscv_vfmv_s_f_f32m1(0.0f, 1); - vfloat32m1_t s0 = - __riscv_vfredusum_vs_f32m2_f32m1(vacc0, sum_scalar, vlmax); - vfloat32m1_t s1 = - __riscv_vfredusum_vs_f32m2_f32m1(vacc1, sum_scalar, vlmax); - vfloat32m1_t s2 = - __riscv_vfredusum_vs_f32m2_f32m1(vacc2, sum_scalar, vlmax); - vfloat32m1_t s3 = - __riscv_vfredusum_vs_f32m2_f32m1(vacc3, sum_scalar, vlmax); - - float f0 = __riscv_vfmv_f_s_f32m1_f32(s0); - float f1 = __riscv_vfmv_f_s_f32m1_f32(s1); - float f2 = __riscv_vfmv_f_s_f32m1_f32(s2); - float f3 = __riscv_vfmv_f_s_f32m1_f32(s3); - - return f0 + f1 + f2 + f3; - } - - void set_query(const float* x) final { - this->q = x; - } - - float operator()(idx_t i) final { - return this->query_to_code(this->codes + i * this->code_size); - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - this->codes + i * this->code_size, - this->codes + j * this->code_size); - } - - float query_to_code(const uint8_t* code) const override final { 
- return compute_distance(this->q, code); - } - - void query_to_codes_batch_4( - const uint8_t* __restrict code_0, - const uint8_t* __restrict code_1, - const uint8_t* __restrict code_2, - const uint8_t* __restrict code_3, - float& dis0, - float& dis1, - float& dis2, - float& dis3) const override final { - const size_t vlmax = __riscv_vsetvlmax_e32m2(); - - vfloat32m2_t vacc0 = __riscv_vfmv_v_f_f32m2(0.0f, vlmax); - vfloat32m2_t vacc1 = __riscv_vfmv_v_f_f32m2(0.0f, vlmax); - vfloat32m2_t vacc2 = __riscv_vfmv_v_f_f32m2(0.0f, vlmax); - vfloat32m2_t vacc3 = __riscv_vfmv_v_f_f32m2(0.0f, vlmax); - - size_t d = quant.d; - size_t i = 0; - - for (; i < d;) { - size_t vl; - if constexpr ( - std::is_same_v> || - std::is_same_v>) { - vl = __riscv_vsetvl_e32m2(d - i); - vfloat32m2_t x0 = quant.reconstruct_components(code_0, i, vl); - vfloat32m2_t x1 = quant.reconstruct_components(code_1, i, vl); - vfloat32m2_t x2 = quant.reconstruct_components(code_2, i, vl); - vfloat32m2_t x3 = quant.reconstruct_components(code_3, i, vl); - - vfloat32m2_t y = __riscv_vle32_v_f32m2(this->q + i, vl); - if constexpr (Sim::metric_type == METRIC_L2) { - vfloat32m2_t d0 = __riscv_vfsub_vv_f32m2(y, x0, vl); - vfloat32m2_t d1 = __riscv_vfsub_vv_f32m2(y, x1, vl); - vfloat32m2_t d2 = __riscv_vfsub_vv_f32m2(y, x2, vl); - vfloat32m2_t d3 = __riscv_vfsub_vv_f32m2(y, x3, vl); - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, d0, d0, vl); - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, d1, d1, vl); - vacc2 = __riscv_vfmacc_vv_f32m2(vacc2, d2, d2, vl); - vacc3 = __riscv_vfmacc_vv_f32m2(vacc3, d3, d3, vl); - } else { - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, y, x0, vl); - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, y, x1, vl); - vacc2 = __riscv_vfmacc_vv_f32m2(vacc2, y, x2, vl); - vacc3 = __riscv_vfmacc_vv_f32m2(vacc3, y, x3, vl); - } - } else { - vl = __riscv_vsetvl_e32m4(d - i); - - vfloat32m4_t x0_m4, x1_m4, x2_m4, x3_m4; - - if constexpr (std::is_same_v< - Quantizer, - Quantizer6bitSpecialized>) { - if (4 * vl <= 
RVV_CODEC_STACK_THRESHOLD * 4) { - std::array - temp_buf; - quant.reconstruct_components( - code_0, i, vl, temp_buf.data()); - quant.reconstruct_components( - code_1, i, vl, temp_buf.data() + vl); - quant.reconstruct_components( - code_2, i, vl, temp_buf.data() + 2 * vl); - quant.reconstruct_components( - code_3, i, vl, temp_buf.data() + 3 * vl); - x0_m4 = __riscv_vle32_v_f32m4(temp_buf.data(), vl); - x1_m4 = __riscv_vle32_v_f32m4(temp_buf.data() + vl, vl); - x2_m4 = __riscv_vle32_v_f32m4( - temp_buf.data() + 2 * vl, vl); - x3_m4 = __riscv_vle32_v_f32m4( - temp_buf.data() + 3 * vl, vl); - } else { - std::vector temp_buf(4 * vl); - quant.reconstruct_components( - code_0, i, vl, temp_buf.data()); - quant.reconstruct_components( - code_1, i, vl, temp_buf.data() + vl); - quant.reconstruct_components( - code_2, i, vl, temp_buf.data() + 2 * vl); - quant.reconstruct_components( - code_3, i, vl, temp_buf.data() + 3 * vl); - x0_m4 = __riscv_vle32_v_f32m4(temp_buf.data(), vl); - x1_m4 = __riscv_vle32_v_f32m4(temp_buf.data() + vl, vl); - x2_m4 = __riscv_vle32_v_f32m4( - temp_buf.data() + 2 * vl, vl); - x3_m4 = __riscv_vle32_v_f32m4( - temp_buf.data() + 3 * vl, vl); - } - - } else { - x0_m4 = quant.reconstruct_components(code_0, i, vl); - x1_m4 = quant.reconstruct_components(code_1, i, vl); - x2_m4 = quant.reconstruct_components(code_2, i, vl); - x3_m4 = quant.reconstruct_components(code_3, i, vl); - } - vfloat32m2_t x0_p0 = __riscv_vget_v_f32m4_f32m2(x0_m4, 0); - vfloat32m2_t x0_p1 = __riscv_vget_v_f32m4_f32m2(x0_m4, 1); - vfloat32m2_t x1_p0 = __riscv_vget_v_f32m4_f32m2(x1_m4, 0); - vfloat32m2_t x1_p1 = __riscv_vget_v_f32m4_f32m2(x1_m4, 1); - vfloat32m2_t x2_p0 = __riscv_vget_v_f32m4_f32m2(x2_m4, 0); - vfloat32m2_t x2_p1 = __riscv_vget_v_f32m4_f32m2(x2_m4, 1); - vfloat32m2_t x3_p0 = __riscv_vget_v_f32m4_f32m2(x3_m4, 0); - vfloat32m2_t x3_p1 = __riscv_vget_v_f32m4_f32m2(x3_m4, 1); - - const size_t vlmax_m2 = __riscv_vsetvlmax_e32m2(); - size_t vl0 = (vl > vlmax_m2) ? 
vlmax_m2 : vl; - size_t vl1 = vl - vl0; - - if (vl0 > 0) { - vfloat32m2_t y0 = __riscv_vle32_v_f32m2(this->q + i, vl0); - if constexpr (Sim::metric_type == METRIC_L2) { - vfloat32m2_t d0 = - __riscv_vfsub_vv_f32m2(y0, x0_p0, vl0); - vfloat32m2_t d1 = - __riscv_vfsub_vv_f32m2(y0, x1_p0, vl0); - vfloat32m2_t d2 = - __riscv_vfsub_vv_f32m2(y0, x2_p0, vl0); - vfloat32m2_t d3 = - __riscv_vfsub_vv_f32m2(y0, x3_p0, vl0); - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, d0, d0, vl0); - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, d1, d1, vl0); - vacc2 = __riscv_vfmacc_vv_f32m2(vacc2, d2, d2, vl0); - vacc3 = __riscv_vfmacc_vv_f32m2(vacc3, d3, d3, vl0); - } else { - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, y0, x0_p0, vl0); - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, y0, x1_p0, vl0); - vacc2 = __riscv_vfmacc_vv_f32m2(vacc2, y0, x2_p0, vl0); - vacc3 = __riscv_vfmacc_vv_f32m2(vacc3, y0, x3_p0, vl0); - } - } - if (vl1 > 0) { - size_t offset = i + vl0; - vfloat32m2_t y1 = - __riscv_vle32_v_f32m2(this->q + offset, vl1); - if constexpr (Sim::metric_type == METRIC_L2) { - vfloat32m2_t d0 = - __riscv_vfsub_vv_f32m2(y1, x0_p1, vl1); - vfloat32m2_t d1 = - __riscv_vfsub_vv_f32m2(y1, x1_p1, vl1); - vfloat32m2_t d2 = - __riscv_vfsub_vv_f32m2(y1, x2_p1, vl1); - vfloat32m2_t d3 = - __riscv_vfsub_vv_f32m2(y1, x3_p1, vl1); - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, d0, d0, vl1); - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, d1, d1, vl1); - vacc2 = __riscv_vfmacc_vv_f32m2(vacc2, d2, d2, vl1); - vacc3 = __riscv_vfmacc_vv_f32m2(vacc3, d3, d3, vl1); - } else { - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, y1, x0_p1, vl1); - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, y1, x1_p1, vl1); - vacc2 = __riscv_vfmacc_vv_f32m2(vacc2, y1, x2_p1, vl1); - vacc3 = __riscv_vfmacc_vv_f32m2(vacc3, y1, x3_p1, vl1); - } - } - } - i += vl; - } - - vfloat32m1_t sum_scalar = __riscv_vfmv_s_f_f32m1(0.0f, 1); - dis0 = __riscv_vfmv_f_s_f32m1_f32( - __riscv_vfredusum_vs_f32m2_f32m1(vacc0, sum_scalar, vlmax)); - dis1 = __riscv_vfmv_f_s_f32m1_f32( - 
__riscv_vfredusum_vs_f32m2_f32m1(vacc1, sum_scalar, vlmax)); - dis2 = __riscv_vfmv_f_s_f32m1_f32( - __riscv_vfredusum_vs_f32m2_f32m1(vacc2, sum_scalar, vlmax)); - dis3 = __riscv_vfmv_f_s_f32m1_f32( - __riscv_vfredusum_vs_f32m2_f32m1(vacc3, sum_scalar, vlmax)); - } -}; -FAISS_PRAGMA_IMPRECISE_FUNCTION_END - -/******************************************************************* - * DistanceComputerByte: computes distances in the integer domain - *******************************************************************/ -template -struct DistanceComputerByte_rvv : SQDistanceComputer {}; - -template -struct DistanceComputerByte_rvv - : public DistanceComputerByte { - DistanceComputerByte_rvv(int d, const std::vector& unused) - : DistanceComputerByte(d, unused) {} -}; - -template -struct DistanceComputerByte_rvv : SQDistanceComputer { - using Sim = Similarity; - int d; - std::vector tmp; - - DistanceComputerByte_rvv(int d, const std::vector&) : d(d), tmp(d) {} - - int compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - size_t remaining_d = static_cast(d); - size_t offset = 0; - uint64_t acc64 = 0; - - while (true) { - size_t vl = __riscv_vsetvl_e8m1(remaining_d); - if (vl == 0) - break; - - vuint8m1_t vx_u = __riscv_vle8_v_u8m1(code1 + offset, vl); - vuint8m1_t vy_u = __riscv_vle8_v_u8m1(code2 + offset, vl); - - if constexpr (Sim::metric_type == METRIC_L2) { - vuint16m2_t vx16 = __riscv_vzext_vf2_u16m2(vx_u, vl); - vuint16m2_t vy16 = __riscv_vzext_vf2_u16m2(vy_u, vl); - vuint32m4_t vx32 = __riscv_vzext_vf2_u32m4(vx16, vl); - vuint32m4_t vy32 = __riscv_vzext_vf2_u32m4(vy16, vl); - vint32m4_t sx32 = __riscv_vreinterpret_v_u32m4_i32m4(vx32); - vint32m4_t sy32 = __riscv_vreinterpret_v_u32m4_i32m4(vy32); - vint32m4_t sdiff = __riscv_vsub_vv_i32m4(sx32, sy32, vl); - vint32m4_t sqr = __riscv_vmul_vv_i32m4(sdiff, sdiff, vl); - vuint32m4_t sqr_u = __riscv_vreinterpret_v_i32m4_u32m4(sqr); - vuint32m1_t vsum = __riscv_vmv_s_x_u32m1(0, 1); - vsum = 
__riscv_vredsum_vs_u32m4_u32m1(sqr_u, vsum, vl); - acc64 += static_cast(__riscv_vmv_x_s_u32m1_u32(vsum)); - } else { - vuint16m2_t vprod = __riscv_vwmulu_vv_u16m2(vx_u, vy_u, vl); - vuint32m4_t vprod_w = __riscv_vwaddu_vx_u32m4(vprod, 0, vl); - vuint32m1_t vsum = __riscv_vmv_s_x_u32m1(0, 1); - vsum = __riscv_vredsum_vs_u32m4_u32m1(vprod_w, vsum, vl); - acc64 += static_cast(__riscv_vmv_x_s_u32m1_u32(vsum)); - } - - offset += vl; - remaining_d -= vl; - } - if (acc64 > static_cast(std::numeric_limits::max())) { - return std::numeric_limits::max(); - } - return static_cast(acc64); - } - - void set_query(const float* x) final { - for (int i = 0; i < d; i++) { - tmp[i] = static_cast(x[i]); - } - } - - float operator()(idx_t i) final { - return query_to_code(this->codes + i * this->code_size); - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - this->codes + i * this->code_size, - this->codes + j * this->code_size); - } - - float query_to_code(const uint8_t* code) const override final { - return compute_code_distance(tmp.data(), code); - } -}; - -/******************************************************************* - * select_distance_computer: runtime selection of template - * specialization - *******************************************************************/ - -template -ScalarQuantizer::SQDistanceComputer* select_distance_computer_rvv( - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained) { - switch (qtype) { - case ScalarQuantizer::QT_8bit: - return new DCTemplate_rvv< - QuantizerTemplate_rvv< - Codec8bit_rvv, - QuantizerTemplateScaling::NON_UNIFORM, - 0>, - Similarity, - 0>(dim, trained); - - case ScalarQuantizer::QT_8bit_uniform: - return new DCTemplate_rvv< - QuantizerTemplate_rvv< - Codec8bit_rvv, - QuantizerTemplateScaling::UNIFORM, - 0>, - Similarity, - 0>(dim, trained); - - case ScalarQuantizer::QT_4bit: - return new DCTemplate_rvv< - QuantizerTemplate_rvv< - Codec4bit_rvv, - 
QuantizerTemplateScaling::NON_UNIFORM, - 0>, - Similarity, - 0>(dim, trained); - - case ScalarQuantizer::QT_4bit_uniform: - // Fallback to base class for SQ4U to ensure correct COSINE metric - // handling The generic DCTemplate_rvv computes IP distance for - // INNER_PRODUCT metric, but SQ4U with COSINE metric requires L2 - // distance computation. - // TODO: Implement RVV-optimized DistanceComputerSQ4UByte_rvv - // similar to AVX2 version - return select_distance_computer(qtype, dim, trained); - - case ScalarQuantizer::QT_6bit: - return new DCTemplate_rvv< - QuantizerTemplate_rvv< - Codec6bit_rvv, - QuantizerTemplateScaling::NON_UNIFORM, - 0>, - Similarity, - 0>(dim, trained); - - case ScalarQuantizer::QT_fp16: - return new DCTemplate_rvv, Similarity, 0>( - dim, trained); - - case ScalarQuantizer::QT_bf16: - return new DCTemplate_rvv, Similarity, 0>( - dim, trained); - - case ScalarQuantizer::QT_8bit_direct: - return new DCTemplate_rvv< - Quantizer8bitDirect_rvv<0>, - Similarity, - 0>(dim, trained); - - case ScalarQuantizer::QT_8bit_direct_signed: - return new DCTemplate_rvv< - Quantizer8bitDirectSigned_rvv<0>, - Similarity, - 0>(dim, trained); - - default: - FAISS_THROW_FMT("Quantizer type %d not supported", qtype); - return nullptr; - } -} - -template -InvertedListScanner* sel2_InvertedListScanner( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool by_residual); - -template -InvertedListScanner* sel2_InvertedListScanner_rvv( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - return sel2_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); -} - -template -InvertedListScanner* sel12_InvertedListScanner_rvv( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - constexpr int SIMDWIDTH = 0; - using QuantizerClass = QuantizerTemplate_rvv; - using DCClass = DCTemplate_rvv; - return 
sel2_InvertedListScanner_rvv( - sq, quantizer, store_pairs, sel, r); -} - -template -InvertedListScanner* sel1_InvertedListScanner_rvv( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - constexpr int SIMDWIDTH = 0; - switch (sq->qtype) { - case QuantizerType::QT_8bit: - return sel12_InvertedListScanner_rvv< - Similarity, - Codec8bit_rvv, - QuantizerTemplateScaling::NON_UNIFORM>( - sq, quantizer, store_pairs, sel, r); - - case QuantizerType::QT_8bit_uniform: - return sel12_InvertedListScanner_rvv< - Similarity, - Codec8bit_rvv, - QuantizerTemplateScaling::UNIFORM>( - sq, quantizer, store_pairs, sel, r); - - case QuantizerType::QT_4bit: - return sel12_InvertedListScanner_rvv< - Similarity, - Codec4bit_rvv, - QuantizerTemplateScaling::NON_UNIFORM>( - sq, quantizer, store_pairs, sel, r); - - case QuantizerType::QT_4bit_uniform: - return sel12_InvertedListScanner_rvv< - Similarity, - Codec4bit_rvv, - QuantizerTemplateScaling::UNIFORM>( - sq, quantizer, store_pairs, sel, r); - - case QuantizerType::QT_6bit: - return sel12_InvertedListScanner_rvv< - Similarity, - Codec6bit_rvv, - QuantizerTemplateScaling::NON_UNIFORM>( - sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_fp16: - return sel2_InvertedListScanner_rvv, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - - case QuantizerType::QT_bf16: - return sel2_InvertedListScanner_rvv, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - - case QuantizerType::QT_8bit_direct: - return sel2_InvertedListScanner_rvv, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - - case QuantizerType::QT_8bit_direct_signed: - return sel2_InvertedListScanner_rvv, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - - default: - FAISS_THROW_MSG("unknown qtype"); - return nullptr; - } -} - -template -InvertedListScanner* select_inverted_list_scanner_rvv( - MetricType mt, - const ScalarQuantizer* sq, - const 
Index* quantizer, - size_t /*dim*/, - bool store_pairs, - const IDSelector* sel, - bool by_residual) { - if (mt == METRIC_L2) { - return sel1_InvertedListScanner_rvv>( - sq, quantizer, store_pairs, sel, by_residual); - } else if (mt == METRIC_INNER_PRODUCT) { - return sel1_InvertedListScanner_rvv>( - sq, quantizer, store_pairs, sel, by_residual); - } else { - FAISS_THROW_MSG("unsupported metric type"); - return nullptr; - } -} - -} -} -} // namespace faiss - -#endif // __riscv_vector diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC.cpp deleted file mode 100644 index 6a2db28d9..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC.cpp +++ /dev/null @@ -1,117 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include -#include - - - -namespace faiss::cppcontrib::knowhere { - -/******************************************************************* - * ScalarQuantizer Distance Computer - ********************************************************************/ - -/* SSE */ -ScalarQuantizer::SQDistanceComputer* sq_get_distance_computer_ref( - MetricType metric, - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained) { - if (metric == METRIC_L2) { - return select_distance_computer>(qtype, dim, trained); - } else { - return select_distance_computer>(qtype, dim, trained); - } -} - -ScalarQuantizer::SQDistanceComputer* sq_get_hamming_distance_computer_ref( - MetricType metric, - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained) { - return select_hamming_distance_computer(dim, trained); -} - -SQDistanceComputer* select_hamming_distance_computer( - size_t d, - const std::vector& trained) { - size_t code_size = (d + 7) / 8; 
- switch (code_size) { - case 4: - return new BinarySQDistanceComputerWrapper(code_size, trained); - case 8: - return new BinarySQDistanceComputerWrapper(code_size, trained); - case 16: - return new BinarySQDistanceComputerWrapper(code_size, trained); - case 20: - return new BinarySQDistanceComputerWrapper(code_size, trained); - case 32: - return new BinarySQDistanceComputerWrapper(code_size, trained); - case 64: - return new BinarySQDistanceComputerWrapper(code_size, trained); - default: - return new BinarySQDistanceComputerWrapper(code_size, trained); - } -} - -ScalarQuantizer::SQDistanceComputer* sq_get_jaccard_distance_computer_ref( - MetricType metric, - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained) { - return select_jaccard_distance_computer(dim, trained); -} - -SQDistanceComputer* select_jaccard_distance_computer( - size_t d, - const std::vector& trained) { - size_t code_size = (d + 7) / 8; - switch (code_size) { - case 8: - return new BinarySQDistanceComputerWrapper(code_size, trained); - case 16: - return new BinarySQDistanceComputerWrapper(code_size, trained); - case 32: - return new BinarySQDistanceComputerWrapper(code_size, trained); - case 64: - return new BinarySQDistanceComputerWrapper(code_size, trained); - case 128: - return new BinarySQDistanceComputerWrapper(code_size, trained); - case 256: - return new BinarySQDistanceComputerWrapper(code_size, trained); - case 512: - return new BinarySQDistanceComputerWrapper(code_size, trained); - default: - return new BinarySQDistanceComputerWrapper(code_size, trained); - } -} - -ScalarQuantizer::SQuantizer* sq_select_quantizer_ref( - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained) { - return select_quantizer_1<1>(qtype, dim, trained); -} - -InvertedListScanner* sq_select_inverted_list_scanner_ref( - MetricType mt, - const ScalarQuantizer* sq, - const Index* quantizer, - size_t dim, - bool store_pairs, - const IDSelector* sel, - bool 
by_residual) { - return sel0_InvertedListScanner<1>( - mt, sq, quantizer, store_pairs, sel, by_residual); -} - -} - - diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC.h deleted file mode 100644 index de6008050..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC.h +++ /dev/null @@ -1,53 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -ScalarQuantizer::SQDistanceComputer* sq_get_distance_computer_ref( - MetricType metric, - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained); - -ScalarQuantizer::SQDistanceComputer* sq_get_hamming_distance_computer_ref( - MetricType metric, - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained); - -ScalarQuantizer::SQDistanceComputer* sq_get_jaccard_distance_computer_ref( - MetricType metric, - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained); - -ScalarQuantizer::SQuantizer* sq_select_quantizer_ref( - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained); - -InvertedListScanner* sq_select_inverted_list_scanner_ref( - MetricType mt, - const ScalarQuantizer* sq, - const Index* quantizer, - size_t dim, - bool store_pairs, - const IDSelector* sel, - bool by_residual); - -} -} -} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_avx.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_avx.cpp deleted file mode 100644 index 8eade7715..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_avx.cpp +++ /dev/null @@ -1,73 +0,0 @@ -/** - 
* Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include - - - -namespace faiss::cppcontrib::knowhere { - -/******************************************************************* - * ScalarQuantizer Distance Computer - ********************************************************************/ - -ScalarQuantizer::SQDistanceComputer* sq_get_distance_computer_avx( - MetricType metric, - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained) { - if (metric == METRIC_L2) { - if (dim % 8 == 0) { - return select_distance_computer_avx>( - qtype, dim, trained); - } else { - return select_distance_computer_avx>( - qtype, dim, trained); - } - } else { - if (dim % 8 == 0) { - return select_distance_computer_avx>( - qtype, dim, trained); - } else { - return select_distance_computer_avx>( - qtype, dim, trained); - } - } -} - -ScalarQuantizer::SQuantizer* sq_select_quantizer_avx( - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained) { - if (dim % 8 == 0) { - return select_quantizer_1_avx<8>(qtype, dim, trained); - } else { - return select_quantizer_1_avx<1>(qtype, dim, trained); - } -} - -InvertedListScanner* sq_select_inverted_list_scanner_avx( - MetricType mt, - const ScalarQuantizer *sq, - const Index *quantizer, - size_t dim, - bool store_pairs, - const IDSelector* sel, - bool by_residual) { - if (dim % 8 == 0) { - return sel0_InvertedListScanner_avx<8>( - mt, sq, quantizer, store_pairs, sel, by_residual); - } else { - return sel0_InvertedListScanner_avx<1>( - mt, sq, quantizer, store_pairs, sel, by_residual); - } -} - -} - - diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_avx.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_avx.h deleted file mode 100644 index af44d70ef..000000000 --- 
a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_avx.h +++ /dev/null @@ -1,41 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -ScalarQuantizer::SQDistanceComputer* sq_get_distance_computer_avx( - MetricType metric, - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained); - -ScalarQuantizer::SQuantizer* sq_select_quantizer_avx( - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained); - -InvertedListScanner* sq_select_inverted_list_scanner_avx( - MetricType mt, - const ScalarQuantizer* sq, - const Index* quantizer, - size_t dim, - bool store_pairs, - const IDSelector* sel, - bool by_residual); - -} -} -} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_avx512.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_avx512.cpp deleted file mode 100644 index f291930f4..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_avx512.cpp +++ /dev/null @@ -1,84 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include -#include - - - -namespace faiss::cppcontrib::knowhere { - -/******************************************************************* - * ScalarQuantizer Distance Computer - ********************************************************************/ - -ScalarQuantizer::SQDistanceComputer* sq_get_distance_computer_avx512( - MetricType metric, - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained) { - if (metric == METRIC_L2) { - if (dim % 16 == 0) { - return select_distance_computer_avx512>( - qtype, dim, trained); - } else if (dim % 8 == 0) { - return select_distance_computer_avx512>( - qtype, dim, trained); - } else { - return select_distance_computer_avx512>( - qtype, dim, trained); - } - } else { - if (dim % 16 == 0) { - return select_distance_computer_avx512>( - qtype, dim, trained); - } else if (dim % 8 == 0) { - return select_distance_computer_avx512>( - qtype, dim, trained); - } else { - return select_distance_computer_avx512>( - qtype, dim, trained); - } - } -} - -ScalarQuantizer::SQuantizer* sq_select_quantizer_avx512( - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained) { - if (dim % 16 == 0) { - return select_quantizer_1_avx512<16>(qtype, dim, trained); - } else if (dim % 8 == 0) { - return select_quantizer_1_avx512<8>(qtype, dim, trained); - } else { - return select_quantizer_1_avx512<1>(qtype, dim, trained); - } -} - -InvertedListScanner* sq_select_inverted_list_scanner_avx512( - MetricType mt, - const ScalarQuantizer* sq, - const Index* quantizer, - size_t dim, - bool store_pairs, - const IDSelector* sel, - bool by_residual) { - if (dim % 16 == 0) { - return sel0_InvertedListScanner_avx512<16>( - mt, sq, quantizer, store_pairs, sel, by_residual); - } else if (dim % 8 == 0) { - return sel0_InvertedListScanner_avx512<8>( - mt, sq, quantizer, store_pairs, sel, by_residual); - } else { - return sel0_InvertedListScanner_avx512<1>( - mt, sq, quantizer, store_pairs, sel, by_residual); - } -} - 
-} - - diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_avx512.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_avx512.h deleted file mode 100644 index b0d719008..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_avx512.h +++ /dev/null @@ -1,41 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -ScalarQuantizer::SQDistanceComputer* sq_get_distance_computer_avx512( - MetricType metric, - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained); - -ScalarQuantizer::SQuantizer* sq_select_quantizer_avx512( - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained); - -InvertedListScanner* sq_select_inverted_list_scanner_avx512( - MetricType mt, - const ScalarQuantizer* sq, - const Index* quantizer, - size_t dim, - bool store_pairs, - const IDSelector* sel, - bool by_residual); - -} -} -} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_neon.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_neon.cpp deleted file mode 100644 index d41b470d6..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_neon.cpp +++ /dev/null @@ -1,73 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -/******************************************************************* - * ScalarQuantizer Distance Computer - ********************************************************************/ - -ScalarQuantizer::SQDistanceComputer* sq_get_distance_computer_neon( - MetricType metric, - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained) { - if (metric == METRIC_L2) { - if (dim % 8 == 0) { - return select_distance_computer_neon>( - qtype, dim, trained); - } else { - return select_distance_computer_neon>( - qtype, dim, trained); - } - } else { - if (dim % 8 == 0) { - return select_distance_computer_neon>( - qtype, dim, trained); - } else { - return select_distance_computer_neon>( - qtype, dim, trained); - } - } -} - -ScalarQuantizer::SQuantizer* sq_select_quantizer_neon( - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained) { - if (dim % 8 == 0) { - return select_quantizer_1_neon<8>(qtype, dim, trained); - } else { - return select_quantizer_1_neon<1>(qtype, dim, trained); - } -} - -InvertedListScanner* sq_select_inverted_list_scanner_neon( - MetricType mt, - const ScalarQuantizer *sq, - const Index *quantizer, - size_t dim, - bool store_pairs, - const IDSelector* sel, - bool by_residual) { - if (dim % 8 == 0) { - return sel0_InvertedListScanner_neon<8>( - mt, sq, quantizer, store_pairs, sel, by_residual); - } else { - return sel0_InvertedListScanner_neon<1>( - mt, sq, quantizer, store_pairs, sel, by_residual); - } -} - -} -} -} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_neon.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_neon.h deleted file mode 100644 index dcca6b284..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_neon.h +++ /dev/null @@ -1,41 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. 
- * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -ScalarQuantizer::SQDistanceComputer* sq_get_distance_computer_neon( - MetricType metric, - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained); - -ScalarQuantizer::SQuantizer* sq_select_quantizer_neon( - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained); - -InvertedListScanner* sq_select_inverted_list_scanner_neon( - MetricType mt, - const ScalarQuantizer* sq, - const Index* quantizer, - size_t dim, - bool store_pairs, - const IDSelector* sel, - bool by_residual); - -} -} -} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_rvv.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_rvv.cpp deleted file mode 100644 index 03bb55ea1..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_rvv.cpp +++ /dev/null @@ -1,78 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -/******************************************************************* - * ScalarQuantizer Distance Computer - ********************************************************************/ -ScalarQuantizer::SQDistanceComputer* sq_get_distance_computer_rvv( - MetricType metric, - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained) { -#if defined(__riscv_vector) - - if (metric == METRIC_L2) { - return select_distance_computer_rvv>( - qtype, dim, trained); - } else { - return select_distance_computer_rvv>( - qtype, dim, trained); - } -#else - - if (metric == METRIC_L2) { - return select_distance_computer_rvv>( - qtype, dim, trained); - } else { - return select_distance_computer_rvv>( - qtype, dim, trained); - } -#endif -} - -ScalarQuantizer::SQuantizer* sq_select_quantizer_rvv( - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained) { -#if defined(__riscv_vector) - - return select_quantizer_1_rvv<0>(qtype, dim, trained); -#else - - return select_quantizer_1_rvv<1>(qtype, dim, trained); -#endif -} - -InvertedListScanner* sq_select_inverted_list_scanner_rvv( - MetricType mt, - const ScalarQuantizer* sq, - const Index* quantizer, - size_t dim, - bool store_pairs, - const IDSelector* sel, - bool by_residual) { -#if defined(__riscv_vector) - - return select_inverted_list_scanner_rvv<0>( - mt, sq, quantizer, dim, store_pairs, sel, by_residual); -#else - - return select_inverted_list_scanner_rvv<1>( - mt, sq, quantizer, dim, store_pairs, sel, by_residual); -#endif -} - -} -} -} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_rvv.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_rvv.h deleted file mode 100644 index 75c798aa2..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_rvv.h +++ /dev/null @@ -1,42 +0,0 @@ -/** - * Copyright (c) 
Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -// Forward declaration for RVV-specific implementations. -ScalarQuantizer::SQDistanceComputer* sq_get_distance_computer_rvv( - MetricType metric, - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained); - -ScalarQuantizer::SQuantizer* sq_select_quantizer_rvv( - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained); - -InvertedListScanner* sq_select_inverted_list_scanner_rvv( - MetricType mt, - const ScalarQuantizer* sq, - const Index* quantizer, - size_t dim, - bool store_pairs, - const IDSelector* sel, - bool by_residual); - -} -} -} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerOp.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerOp.cpp deleted file mode 100644 index 705cdd11b..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerOp.cpp +++ /dev/null @@ -1,193 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include -#include - -#include -#include -#include - - - -namespace faiss::cppcontrib::knowhere { - -using RangeStat = ScalarQuantizer::RangeStat; - -/******************************************************************* - * Quantizer range training - */ - -static float sqr(float x) { - return x * x; -} - -void train_Uniform( - RangeStat rs, - float rs_arg, - idx_t n, - int k, - const float* x, - std::vector& trained) { - trained.resize(2); - float& vmin = trained[0]; - float& vmax = trained[1]; - - if (rs == RangeStat::RS_minmax) { - vmin = HUGE_VAL; - vmax = -HUGE_VAL; - for (size_t i = 0; i < n; i++) { - if (x[i] < vmin) - vmin = x[i]; - if (x[i] > vmax) - vmax = x[i]; - } - float vexp = (vmax - vmin) * rs_arg; - vmin -= vexp; - vmax += vexp; - } else if (rs == RangeStat::RS_meanstd) { - double sum = 0, sum2 = 0; - for (size_t i = 0; i < n; i++) { - sum += x[i]; - sum2 += x[i] * x[i]; - } - float mean = sum / n; - float var = sum2 / n - mean * mean; - float std = var <= 0 ? 1.0 : sqrt(var); - - vmin = mean - std * rs_arg; - vmax = mean + std * rs_arg; - } else if (rs == RangeStat::RS_quantiles) { - std::vector x_copy(n); - memcpy(x_copy.data(), x, n * sizeof(*x)); - // TODO just do a quickselect - std::sort(x_copy.begin(), x_copy.end()); - int o = int(rs_arg * n); - if (o < 0) - o = 0; - if (o > n - o) - o = n / 2; - vmin = x_copy[o]; - vmax = x_copy[n - 1 - o]; - - } else if (rs == RangeStat::RS_optim) { - float a, b; - float sx = 0; - { - vmin = HUGE_VAL, vmax = -HUGE_VAL; - for (size_t i = 0; i < n; i++) { - if (x[i] < vmin) - vmin = x[i]; - if (x[i] > vmax) - vmax = x[i]; - sx += x[i]; - } - b = vmin; - a = (vmax - vmin) / (k - 1); - } - int verbose = false; - int niter = 2000; - float last_err = -1; - int iter_last_err = 0; - for (int it = 0; it < niter; it++) { - float sn = 0, sn2 = 0, sxn = 0, err1 = 0; - - for (idx_t i = 0; i < n; i++) { - float xi = x[i]; - float ni = floor((xi - b) / a + 0.5); - if (ni < 0) - ni = 0; - if (ni >= k) - ni = k - 1; 
- err1 += sqr(xi - (ni * a + b)); - sn += ni; - sn2 += ni * ni; - sxn += ni * xi; - } - - if (err1 == last_err) { - iter_last_err++; - if (iter_last_err == 16) - break; - } else { - last_err = err1; - iter_last_err = 0; - } - - float det = sqr(sn) - sn2 * n; - - b = (sn * sxn - sn2 * sx) / det; - a = (sn * sx - n * sxn) / det; - if (verbose) { - printf("it %d, err1=%g \r", it, err1); - fflush(stdout); - } - } - if (verbose) - printf("\n"); - - vmin = b; - vmax = b + a * (k - 1); - - } else { - FAISS_THROW_MSG("Invalid qtype"); - } - vmax -= vmin; -} - -void train_NonUniform( - RangeStat rs, - float rs_arg, - idx_t n, - int d, - int k, - const float* x, - std::vector& trained) { - trained.resize(2 * d); - float* vmin = trained.data(); - float* vmax = trained.data() + d; - if (rs == RangeStat::RS_minmax) { - memcpy(vmin, x, sizeof(*x) * d); - memcpy(vmax, x, sizeof(*x) * d); - for (size_t i = 1; i < n; i++) { - const float* xi = x + i * d; - for (size_t j = 0; j < d; j++) { - if (xi[j] < vmin[j]) - vmin[j] = xi[j]; - if (xi[j] > vmax[j]) - vmax[j] = xi[j]; - } - } - float* vdiff = vmax; - for (size_t j = 0; j < d; j++) { - float vexp = (vmax[j] - vmin[j]) * rs_arg; - vmin[j] -= vexp; - vmax[j] += vexp; - vdiff[j] = vmax[j] - vmin[j]; - } - } else { - // transpose - std::vector xt(n * d); - for (size_t i = 1; i < n; i++) { - const float* xi = x + i * d; - for (size_t j = 0; j < d; j++) { - xt[j * n + i] = xi[j]; - } - } - std::vector trained_d(2); -#pragma omp parallel for - for (int j = 0; j < d; j++) { - train_Uniform(rs, rs_arg, n, k, xt.data() + j * n, trained_d); - vmin[j] = trained_d[0]; - vmax[j] = trained_d[1]; - } - } -} - -} - - diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerOp.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerOp.h deleted file mode 100644 index 8204f5a1b..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerOp.h +++ /dev/null @@ -1,35 +0,0 @@ -/** - * Copyright (c) 
Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -void train_Uniform( - ScalarQuantizer::RangeStat rs, - float rs_arg, - idx_t n, - int k, - const float* x, - std::vector& trained); - -void train_NonUniform( - ScalarQuantizer::RangeStat rs, - float rs_arg, - idx_t n, - int d, - int k, - const float* x, - std::vector& trained); - -} -} -} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerScanner.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerScanner.h deleted file mode 100644 index c3b9c7b79..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerScanner.h +++ /dev/null @@ -1,343 +0,0 @@ -#pragma once - -#include -//#include - -//struct InvertedListScanner; -//struct IDSelector; - -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -/******************************************************************* - * IndexScalarQuantizer/IndexIVFScalarQuantizer scanner object - * - * It is an InvertedListScanner, but is designed to work with - * IndexScalarQuantizer as well. - ********************************************************************/ -template < - // A predicate for filtering elements. - // std::optional Pred(const size_t idx); - // * return true to accept an element. - // * return false to reject an element. - // * return std::nullopt to break the iteration loop. - typename Pred, - // Apply an element. 
- // void Apply(const float dis, const size_t idx); - typename Apply, - typename DCClass> -void fvec_distance_ny_scalar_if( - const DCClass& dc, - const uint8_t* __restrict codes, - const size_t code_size, - const size_t ny, - Pred pred, - Apply apply) { - // compute a distance from the query to 1 element - auto distance1 = [&dc, codes, code_size](const size_t idx) { - return dc.query_to_code(codes + idx * code_size); - }; - - // compute distances from the query to 4 elements - auto distance4 = [&dc, codes, code_size]( - const std::array indices, - std::array& dis) { - dc.query_to_codes_batch_4( - codes + indices[0] * code_size, - codes + indices[1] * code_size, - codes + indices[2] * code_size, - codes + indices[3] * code_size, - dis[0], - dis[1], - dis[2], - dis[3]); - }; - - NoRemapping remapper; - - fvec_distance_ny_if< - Pred, - decltype(distance1), - decltype(distance4), - decltype(remapper), - Apply, - 4, - DEFAULT_BUFFER_SIZE>( - ny, pred, distance1, distance4, remapper, apply); -} - -/* use_sel = 0: don't check selector - * = 1: check on ids[j] - * = 2: check in j directly (normally ids is nullptr and store_pairs) - */ - -template -struct IVFSQScannerIP : InvertedListScanner { - DCClass dc; - bool by_residual; - - float accu0; /// added to all distances - - IVFSQScannerIP( - int d, - const std::vector& trained, - size_t code_size, - bool store_pairs, - const IDSelector* sel, - bool by_residual) - : dc(d, trained), by_residual(by_residual), accu0(0) { - this->store_pairs = store_pairs; - this->sel = sel; - this->code_size = code_size; - this->keep_max = true; - } - - void set_query(const float* query) override { - dc.set_query(query); - } - - void set_list(idx_t list_no, float coarse_dis) override { - this->list_no = list_no; - accu0 = by_residual ? 
coarse_dis : 0; - } - - float distance_to_code(const uint8_t* code) const final { - return accu0 + dc.query_to_code(code); - } - - size_t scan_codes( - size_t list_size, - const uint8_t* codes, - const float* code_norms, - const idx_t* ids, - float* simi, - idx_t* idxi, - size_t k, - size_t& scan_cnt) const override { - size_t nup = 0; - // baseline - // for (size_t j = 0; j < list_size; j++, codes += code_size) { - // if (use_sel && !sel->is_member(use_sel == 1 ? ids[j] : j)) { - // continue; - // } - - // // todo aguzhva: upgrade - // float accu = accu0 + dc.query_to_code(codes); - - // if (accu > simi[0]) { - // int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; - // minheap_replace_top(k, simi, idxi, accu, id); - // nup++; - // } - // } - - // the lambda that filters acceptable elements. - auto filter = [&](const size_t j) { - return (!use_sel || sel->is_member(use_sel == 1 ? ids[j] : j)); - }; - - // the lambda that applies a filtered element. - auto apply = [&](const float dis_in, const size_t j) { - const float dis = accu0 + dis_in; - if (dis > simi[0]) { - int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; - minheap_replace_top(k, simi, idxi, dis, id); - nup++; - } - }; - - // compute distances - fvec_distance_ny_scalar_if( - dc, codes, code_size, list_size, filter, apply); - return nup; - } - - void scan_codes_and_return( - size_t list_size, - const uint8_t* codes, - const float* code_norms, - const idx_t* ids, - std::vector<::knowhere::DistId>& out) const override { - // the lambda that filters acceptable elements. - auto filter = [&](const size_t j) { - return (!use_sel || sel->is_member(use_sel == 1 ? ids[j] : j)); - }; - // the lambda that applies a valid element. 
- auto apply = [&](const float dis_in, const size_t j) { - const float dis = accu0 + dis_in; - out.emplace_back(ids[j], dis); - }; - fvec_distance_ny_scalar_if( - dc, codes, code_size, list_size, filter, apply); - } - - void scan_codes_range( - size_t list_size, - const uint8_t* codes, - const float* code_norms, - const idx_t* ids, - float radius, - RangeQueryResult& res) const override { - for (size_t j = 0; j < list_size; j++, codes += code_size) { - if (use_sel && !sel->is_member(use_sel == 1 ? ids[j] : j)) { - continue; - } - - // todo aguzhva: upgrade - float accu = accu0 + dc.query_to_code(codes); - if (accu > radius) { - int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; - res.add(accu, id); - } - } - } -}; - -/* use_sel = 0: don't check selector - * = 1: check on ids[j] - * = 2: check in j directly (normally ids is nullptr and store_pairs) - */ - -template -struct IVFSQScannerL2 : InvertedListScanner { - DCClass dc; - - bool by_residual; - const Index* quantizer; - const float* x; /// current query - - std::vector tmp; - - IVFSQScannerL2( - int d, - const std::vector& trained, - size_t code_size, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool by_residual) - : dc(d, trained), - by_residual(by_residual), - quantizer(quantizer), - x(nullptr), - tmp(d) { - this->store_pairs = store_pairs; - this->sel = sel; - this->code_size = code_size; - } - - void set_query(const float* query) override { - x = query; - if (!quantizer) { - dc.set_query(query); - } - } - - void set_list(idx_t list_no, float) override { - this->list_no = list_no; - if (by_residual) { - // shift of x_in wrt centroid - quantizer->compute_residual(x, tmp.data(), list_no); - dc.set_query(tmp.data()); - } else { - dc.set_query(x); - } - } - - float distance_to_code(const uint8_t* code) const final { - return dc.query_to_code(code); - } - - size_t scan_codes( - size_t list_size, - const uint8_t* codes, - const float* code_norms, - const idx_t* ids, - float* simi, 
- idx_t* idxi, - size_t k, - size_t& scan_cnt) const override { - size_t nup = 0; - - // // baseline - // for (size_t j = 0; j < list_size; j++, codes += code_size) { - // if (use_sel && !sel->is_member(use_sel == 1 ? ids[j] : j)) { - // continue; - // } - // - // float dis = dc.query_to_code(codes); - // - // if (dis < simi[0]) { - // int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; - // maxheap_replace_top(k, simi, idxi, dis, id); - // nup++; - // } - // } - - // the lambda that filters acceptable elements. - auto filter = - [&](const size_t j) { return (!use_sel || sel->is_member(use_sel == 1 ? ids[j] : j)); }; - - // the lambda that applies a filtered element. - auto apply = - [&](const float dis, const size_t j) { - if (dis < simi[0]) { - int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; - maxheap_replace_top(k, simi, idxi, dis, id); - nup++; - } - }; - - // compute distances - fvec_distance_ny_scalar_if( - dc, codes, code_size, list_size, filter, apply); - - return nup; - } - - void scan_codes_and_return( - size_t list_size, - const uint8_t* codes, - const float* code_norms, - const idx_t* ids, - std::vector<::knowhere::DistId>& out) const override { - // the lambda that filters acceptable elements. - auto filter = [&](const size_t j) { - return (!use_sel || sel->is_member(use_sel == 1 ? ids[j] : j)); - }; - // the lambda that applies a valid element. - auto apply = [&](const float dis_in, const size_t j) { - out.emplace_back(ids[j], dis_in); - }; - fvec_distance_ny_scalar_if( - dc, codes, code_size, list_size, filter, apply); - } - - void scan_codes_range( - size_t list_size, - const uint8_t* codes, - const float* code_norms, - const idx_t* ids, - float radius, - RangeQueryResult& res) const override { - for (size_t j = 0; j < list_size; j++, codes += code_size) { - if (use_sel && !sel->is_member(use_sel == 1 ? 
ids[j] : j)) { - continue; - } - - // todo aguzhva: upgrade - float dis = dc.query_to_code(codes); - if (dis < radius) { - int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; - res.add(dis, id); - } - } - } -}; - -} -} -} \ No newline at end of file diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/index_read.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/index_read.cpp index e2d0e27a5..ae0760929 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/index_read.cpp +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/index_read.cpp @@ -24,9 +24,11 @@ #include #include +#include #include #include #include +#include #include #include #include @@ -38,6 +40,7 @@ #include #include #include +#include #include #include @@ -574,7 +577,9 @@ static void read_ProductLocalSearchQuantizer( } } -static void read_ScalarQuantizer(ScalarQuantizer* ivsc, IOReader* f) { +static void read_ScalarQuantizer( + ::faiss::ScalarQuantizer* ivsc, + IOReader* f) { READ1(ivsc->qtype); READ1(ivsc->rangestat); READ1(ivsc->rangestat_arg); @@ -986,12 +991,30 @@ Index* read_index(IOReader* f, int io_flags) { READVECTOR(idxs->inverse_norms_storage.inverse_l2_norms); idx = idxs; } else if (h == fourcc("IxSQ")) { - IndexScalarQuantizer* idxs = new IndexScalarQuantizer(); + ::faiss::IndexScalarQuantizer* idxs = + new ::faiss::IndexScalarQuantizer(); read_index_header(idxs, f); read_ScalarQuantizer(&idxs->sq, f); read_vector(idxs->codes, f); idxs->code_size = idxs->sq.code_size; - idx = idxs; + // Legacy binary format: the fork used integer 9 as QT_1bit_direct + // for 1-bit HNSW storage; baseline maps the same integer to + // QT_0bit. Compare against the raw integer so we never depend on + // either enum name, and route legacy data to + // IndexBinaryScalarQuantizer. 
+ const int legacy_qt_1bit_direct_marker = 9; + if (static_cast(idxs->sq.qtype) == + legacy_qt_1bit_direct_marker) { + IndexBinaryScalarQuantizer* bsq = new IndexBinaryScalarQuantizer( + static_cast(idxs->d), idxs->metric_type); + bsq->ntotal = idxs->ntotal; + bsq->is_trained = idxs->is_trained; + bsq->codes = std::move(idxs->codes); + delete idxs; + idx = bsq; + } else { + idx = idxs; + } } else if (h == fourcc("IvSQ")) { // legacy IndexIVFScalarQuantizer* ivsc = new IndexIVFScalarQuantizer(); std::vector> ids; @@ -1110,6 +1133,27 @@ Index* read_index(IOReader* f, int io_flags) { if (h == fourcc("IHNp") && !(io_flags & IO_FLAG_PQ_SKIP_SDC_TABLE)) { dynamic_cast(idxhnsw->storage)->pq.compute_sdc_table(); } + // Legacy binary HNSW: IHNs fourcc with an + // IndexBinaryScalarQuantizer inner storage was how the fork used + // to serialize IndexHNSWSQ(QT_1bit_direct, ...). Swap the outer + // wrapper to IndexHNSWBinary so the runtime type reflects the + // data. The on-disk bytes are unchanged by this conversion. 
+ if (h == fourcc("IHNs") && + dynamic_cast(idxhnsw->storage) != + nullptr) { + IndexHNSWBinary* newh = new IndexHNSWBinary(); + newh->d = idxhnsw->d; + newh->ntotal = idxhnsw->ntotal; + newh->is_trained = idxhnsw->is_trained; + newh->metric_type = idxhnsw->metric_type; + newh->hnsw = std::move(idxhnsw->hnsw); + newh->storage = idxhnsw->storage; + newh->own_fields = idxhnsw->own_fields; + idxhnsw->storage = nullptr; + idxhnsw->own_fields = false; + delete idxhnsw; + idxhnsw = newh; + } idx = idxhnsw; } else if (h == fourcc("IwPf")) { IndexIVFPQFastScan* ivpq = new IndexIVFPQFastScan(); diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/index_write.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/index_write.cpp index b1ad3b188..4f0eddbb6 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/index_write.cpp +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/index_write.cpp @@ -23,9 +23,11 @@ #include #include +#include #include #include #include +#include #include #include #include @@ -37,6 +39,7 @@ #include #include #include +#include #include #include @@ -236,7 +239,9 @@ static void write_ProductLocalSearchQuantizer( } } -static void write_ScalarQuantizer(const ScalarQuantizer* ivsc, IOWriter* f) { +static void write_ScalarQuantizer( + const ::faiss::ScalarQuantizer* ivsc, + IOWriter* f) { WRITE1(ivsc->qtype); WRITE1(ivsc->rangestat); WRITE1(ivsc->rangestat_arg); @@ -586,8 +591,39 @@ void write_index(const Index* idx, IOWriter* f, int io_flags) { // inverse norms WRITEVECTOR(idxs->inverse_norms_storage.inverse_l2_norms); } else if ( - const IndexScalarQuantizer* idxs = - dynamic_cast(idx)) { + const IndexBinaryScalarQuantizer* bsq = + dynamic_cast(idx)) { + // Legacy binary serialization: emit the same IxSQ fourcc + SQ + // wire layout that IndexScalarQuantizer(QT_1bit_direct) used to + // produce, so old readers continue to parse it unchanged. The + // trained vector is empty (1-bit-direct has no training data). 
+ // QuantizerType enum integer 9 was fork's QT_1bit_direct; in + // baseline the same integer is QT_0bit. Emit the raw integer so + // the wire format is stable regardless of which enum is in scope. + uint32_t h = fourcc("IxSQ"); + WRITE1(h); + write_index_header(idx, f); + + const int legacy_qt_1bit_direct_marker = 9; + auto legacy_qtype = + static_cast<::faiss::ScalarQuantizer::QuantizerType>( + legacy_qt_1bit_direct_marker); + ::faiss::ScalarQuantizer::RangeStat legacy_rangestat = + ::faiss::ScalarQuantizer::RS_minmax; + float legacy_rangestat_arg = 0.0f; + size_t legacy_d = static_cast(bsq->d); + size_t legacy_code_size = bsq->code_size; + std::vector legacy_trained; + WRITE1(legacy_qtype); + WRITE1(legacy_rangestat); + WRITE1(legacy_rangestat_arg); + WRITE1(legacy_d); + WRITE1(legacy_code_size); + WRITEVECTOR(legacy_trained); + WRITEVECTOR(bsq->codes); + } else if ( + const ::faiss::IndexScalarQuantizer* idxs = + dynamic_cast(idx)) { uint32_t h = fourcc("IxSQ"); WRITE1(h); write_index_header(idx, f); @@ -665,6 +701,11 @@ void write_index(const Index* idx, IOWriter* f, int io_flags) { } else if (const IndexHNSW* idxhnsw = dynamic_cast(idx)) { uint32_t h = dynamic_cast(idx) ? fourcc("IHNf") : dynamic_cast(idx) ? fourcc("IHNp") + // IndexHNSWBinary reuses the legacy IHNs fourcc so + // on-disk bytes match what IndexHNSWSQ(QT_1bit_direct, + // metric) used to produce. Readers dispatch to + // IndexHNSWBinary based on the inner storage type. + : dynamic_cast(idx) ? fourcc("IHNs") : dynamic_cast(idx) ? fourcc("IHNs") : dynamic_cast(idx) ? fourcc("IHN2") : dynamic_cast(idx) ? fourcc("IHNc") diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/sq-avx2-fastpath.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/sq-avx2-fastpath.cpp new file mode 100644 index 000000000..36ea767c7 --- /dev/null +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/sq-avx2-fastpath.cpp @@ -0,0 +1,328 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +/****************************************************************************** + * Knowhere-local prelude over baseline sq-avx2.cpp. + * + * See sq-avx512-fastpath.cpp for the full design note on how this pattern + * works (full DCTemplate specialization declared here; baseline .cpp + * included below; template lookup picks our specialization). + * + * This file ports the AVX2 variant of the fork's DistanceComputerSQ4UByte + * for QT_4bit_uniform + L2. + *****************************************************************************/ + +#ifdef COMPILE_SIMD_AVX2 + +#include +#include +#include +#include + +#include +#include +#include + +namespace faiss { + +namespace scalar_quantizer { + +template <> +struct DCTemplate< + QuantizerTemplate< + Codec4bit, + QuantizerTemplateScaling::UNIFORM, + SIMDLevel::AVX2>, + SimilarityL2, + SIMDLevel::AVX2> : SQDistanceComputer { + using Sim = SimilarityL2; + + size_t d; + float vmin; + float vdiff; + float final_scale_sq; + std::vector q_lo; + std::vector q_hi; + + DCTemplate(size_t d_in, const std::vector& trained) + : d(d_in), + vmin(trained[0]), + vdiff(trained[1]), + // Over-allocate by 32 bytes so full 256-bit loads past the + // logical length read a safe zero-filled tail. + q_lo((d_in + 1) / 2 + 32, 0), + q_hi((d_in + 1) / 2 + 32, 0) { + const float final_scale = vdiff / 15.0f; + final_scale_sq = final_scale * final_scale; + } + + void set_query(const float* x) final { + this->q = x; + const float inv_scale = (vdiff == 0.0f) ? 
0.0f : 15.0f / vdiff; + for (size_t i = 0; i < d; i++) { + float val = (x[i] - vmin) * inv_scale; + int code = static_cast(std::floor(val + 0.5f)); + if (code < 0) { + code = 0; + } + if (code > 15) { + code = 15; + } + if (i % 2 == 0) { + q_lo[i / 2] = static_cast(code); + } else { + q_hi[i / 2] = static_cast(code); + } + } + } + + float query_to_code(const uint8_t* code) const final { + const uint8_t* q_lo_ptr = q_lo.data(); + const uint8_t* q_hi_ptr = q_hi.data(); + + __m256i acc = _mm256_setzero_si256(); + const __m256i mask_f = _mm256_set1_epi8(0xF); + const __m256i one = _mm256_set1_epi16(1); + const __m256i zero = _mm256_setzero_si256(); + + size_t i = 0; + // 64 dims per iteration (32 bytes of packed 4-bit codes). + for (; i + 64 <= d; i += 64) { + __m256i c256 = _mm256_loadu_si256( + reinterpret_cast(code + i / 2)); + + __m256i nibbles_lo = _mm256_and_si256(c256, mask_f); + __m256i nibbles_hi = + _mm256_and_si256(_mm256_srli_epi16(c256, 4), mask_f); + + __m256i q_lo_vec = _mm256_loadu_si256( + reinterpret_cast(q_lo_ptr + i / 2)); + __m256i q_hi_vec = _mm256_loadu_si256( + reinterpret_cast(q_hi_ptr + i / 2)); + + __m256i diff_lo = _mm256_sub_epi8(q_lo_vec, nibbles_lo); + __m256i diff_hi = _mm256_sub_epi8(q_hi_vec, nibbles_hi); + + // |diff| as max(x, -x); equivalent to AVX2's _mm256_abs_epi8. + diff_lo = _mm256_max_epi8(diff_lo, _mm256_sub_epi8(zero, diff_lo)); + diff_hi = _mm256_max_epi8(diff_hi, _mm256_sub_epi8(zero, diff_hi)); + + __m256i sq_lo = _mm256_maddubs_epi16(diff_lo, diff_lo); + __m256i sq_hi = _mm256_maddubs_epi16(diff_hi, diff_hi); + + __m256i sum_lo = _mm256_madd_epi16(sq_lo, one); + __m256i sum_hi = _mm256_madd_epi16(sq_hi, one); + + acc = _mm256_add_epi32(acc, sum_lo); + acc = _mm256_add_epi32(acc, sum_hi); + } + + // Horizontal reduction.
+ __m128i acc_lo = _mm256_castsi256_si128(acc); + __m128i acc_hi = _mm256_extracti128_si256(acc, 1); + acc_lo = _mm_add_epi32(acc_lo, acc_hi); + acc_lo = _mm_hadd_epi32(acc_lo, acc_lo); + acc_lo = _mm_hadd_epi32(acc_lo, acc_lo); + int32_t sum = _mm_cvtsi128_si32(acc_lo); + + // Scalar tail. + for (; i < d; i++) { + uint8_t c = code[i / 2]; + uint8_t nibble = (i % 2 == 0) + ? static_cast(c & 0x0F) + : static_cast(c >> 4); + int q_code = (i % 2 == 0) ? q_lo[i / 2] : q_hi[i / 2]; + int diff = q_code - int(nibble); + sum += diff * diff; + } + + return static_cast(sum) * final_scale_sq; + } + + float symmetric_dis(idx_t i, idx_t j) override { + const uint8_t* c1 = codes + i * code_size; + const uint8_t* c2 = codes + j * code_size; + int64_t acc = 0; + for (size_t k = 0; k < d; k++) { + uint8_t a = (k % 2 == 0) + ? static_cast(c1[k / 2] & 0x0F) + : static_cast(c1[k / 2] >> 4); + uint8_t b = (k % 2 == 0) + ? static_cast(c2[k / 2] & 0x0F) + : static_cast(c2[k / 2] >> 4); + int diff = int(a) - int(b); + acc += diff * diff; + } + return static_cast(acc) * final_scale_sq; + } + + /// Batch-4: 128 dims per outer iter, two 64-dim chunks sharing q_lo/q_hi + /// loads across four input codes. Ported verbatim from the fork's AVX2 + /// DistanceComputerSQ4UByte_avx. |diff| is computed as max(x, -x), + /// which is equivalent to AVX2's _mm256_abs_epi8. + void query_to_codes_batch_4( + const uint8_t* code_0, + const uint8_t* code_1, + const uint8_t* code_2, + const uint8_t* code_3, + float& dis0, + float& dis1, + float& dis2, + float& dis3) const final { + const uint8_t* q_lo_ptr = q_lo.data(); + const uint8_t* q_hi_ptr = q_hi.data(); + + __m256i acc0 = _mm256_setzero_si256(); + __m256i acc1 = _mm256_setzero_si256(); + __m256i acc2 = _mm256_setzero_si256(); + __m256i acc3 = _mm256_setzero_si256(); + + const __m256i mask_f = _mm256_set1_epi8(0x0F); + const __m256i one = _mm256_set1_epi16(1); + const __m256i zero = _mm256_setzero_si256(); + + size_t i = 0; + // 128 dims per outer iter.
+ for (; i + 128 <= d; i += 128) { + __m256i q_lo_0 = _mm256_loadu_si256( + reinterpret_cast(q_lo_ptr + i / 2)); + __m256i q_hi_0 = _mm256_loadu_si256( + reinterpret_cast(q_hi_ptr + i / 2)); + + auto process_chunk_64 = [&](const uint8_t* code, + __m256i& acc, + __m256i q_lo_v, + __m256i q_hi_v, + int offset) { + __m256i c = _mm256_loadu_si256(reinterpret_cast( + code + i / 2 + offset)); + __m256i nibbles_lo = _mm256_and_si256(c, mask_f); + __m256i nibbles_hi = + _mm256_and_si256(_mm256_srli_epi16(c, 4), mask_f); + + __m256i diff_lo = _mm256_sub_epi8(q_lo_v, nibbles_lo); + __m256i diff_hi = _mm256_sub_epi8(q_hi_v, nibbles_hi); + + diff_lo = _mm256_max_epi8( + diff_lo, _mm256_sub_epi8(zero, diff_lo)); + diff_hi = _mm256_max_epi8( + diff_hi, _mm256_sub_epi8(zero, diff_hi)); + + __m256i sq_lo = _mm256_maddubs_epi16(diff_lo, diff_lo); + __m256i sq_hi = _mm256_maddubs_epi16(diff_hi, diff_hi); + + __m256i sum_lo = _mm256_madd_epi16(sq_lo, one); + __m256i sum_hi = _mm256_madd_epi16(sq_hi, one); + + acc = _mm256_add_epi32(acc, sum_lo); + acc = _mm256_add_epi32(acc, sum_hi); + }; + + process_chunk_64(code_0, acc0, q_lo_0, q_hi_0, 0); + process_chunk_64(code_1, acc1, q_lo_0, q_hi_0, 0); + process_chunk_64(code_2, acc2, q_lo_0, q_hi_0, 0); + process_chunk_64(code_3, acc3, q_lo_0, q_hi_0, 0); + + __m256i q_lo_1 = _mm256_loadu_si256( + reinterpret_cast(q_lo_ptr + i / 2 + 32)); + __m256i q_hi_1 = _mm256_loadu_si256( + reinterpret_cast(q_hi_ptr + i / 2 + 32)); + + process_chunk_64(code_0, acc0, q_lo_1, q_hi_1, 32); + process_chunk_64(code_1, acc1, q_lo_1, q_hi_1, 32); + process_chunk_64(code_2, acc2, q_lo_1, q_hi_1, 32); + process_chunk_64(code_3, acc3, q_lo_1, q_hi_1, 32); + } + + // 64-dim remainder chunk. 
+ if (i + 64 <= d) { + __m256i q_lo_0 = _mm256_loadu_si256( + reinterpret_cast(q_lo_ptr + i / 2)); + __m256i q_hi_0 = _mm256_loadu_si256( + reinterpret_cast(q_hi_ptr + i / 2)); + + auto process = [&](const uint8_t* code, __m256i& acc) { + __m256i c = _mm256_loadu_si256( + reinterpret_cast(code + i / 2)); + __m256i nibbles_lo = _mm256_and_si256(c, mask_f); + __m256i nibbles_hi = + _mm256_and_si256(_mm256_srli_epi16(c, 4), mask_f); + + __m256i diff_lo = _mm256_sub_epi8(q_lo_0, nibbles_lo); + __m256i diff_hi = _mm256_sub_epi8(q_hi_0, nibbles_hi); + + diff_lo = _mm256_max_epi8( + diff_lo, _mm256_sub_epi8(zero, diff_lo)); + diff_hi = _mm256_max_epi8( + diff_hi, _mm256_sub_epi8(zero, diff_hi)); + + __m256i sq_lo = _mm256_maddubs_epi16(diff_lo, diff_lo); + __m256i sq_hi = _mm256_maddubs_epi16(diff_hi, diff_hi); + + __m256i sum_lo = _mm256_madd_epi16(sq_lo, one); + __m256i sum_hi = _mm256_madd_epi16(sq_hi, one); + + acc = _mm256_add_epi32(acc, sum_lo); + acc = _mm256_add_epi32(acc, sum_hi); + }; + + process(code_0, acc0); + process(code_1, acc1); + process(code_2, acc2); + process(code_3, acc3); + + i += 64; + } + + auto reduce = [](const __m256i& acc) -> int32_t { + __m128i acc_lo = _mm256_castsi256_si128(acc); + __m128i acc_hi = _mm256_extracti128_si256(acc, 1); + acc_lo = _mm_add_epi32(acc_lo, acc_hi); + acc_lo = _mm_hadd_epi32(acc_lo, acc_lo); + acc_lo = _mm_hadd_epi32(acc_lo, acc_lo); + return _mm_cvtsi128_si32(acc_lo); + }; + + dis0 = static_cast(reduce(acc0)); + dis1 = static_cast(reduce(acc1)); + dis2 = static_cast(reduce(acc2)); + dis3 = static_cast(reduce(acc3)); + + // Scalar tail. + for (; i < d; i++) { + uint8_t nibble_lo = q_lo[i / 2]; + uint8_t nibble_hi = q_hi[i / 2]; + + auto process_scalar = [&](const uint8_t* code, float& dis) { + uint8_t c = code[i / 2]; + uint8_t nibble = (i % 2 == 0) + ? static_cast(c & 0x0F) + : static_cast(c >> 4); + int q_code = (i % 2 == 0) ? 
nibble_lo : nibble_hi; + int diff = q_code - int(nibble); + dis += static_cast(diff * diff); + }; + + process_scalar(code_0, dis0); + process_scalar(code_1, dis1); + process_scalar(code_2, dis2); + process_scalar(code_3, dis3); + } + + dis0 *= final_scale_sq; + dis1 *= final_scale_sq; + dis2 *= final_scale_sq; + dis3 *= final_scale_sq; + } +}; + +} // namespace scalar_quantizer +} // namespace faiss + +#include "../../../impl/scalar_quantizer/sq-avx2.cpp" + +#endif // COMPILE_SIMD_AVX2 diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/sq-avx512-fastpath.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/sq-avx512-fastpath.cpp new file mode 100644 index 000000000..63004ec5f --- /dev/null +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/sq-avx512-fastpath.cpp @@ -0,0 +1,547 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +/****************************************************************************** + * Knowhere-local prelude over baseline sq-avx512.cpp. + * + * What this file does, and why: + * - Declares a FULL template specialization of + * faiss::scalar_quantizer::DCTemplate, AVX512> + * for Q = QuantizerTemplate, UNIFORM, AVX512>. + * - Then textually `#include`s the baseline + * thirdparty/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp. + * - Knowhere's CMake swaps this file in place of that baseline .cpp when + * building the faiss_avx512 object library. + * + * Effect: baseline's sq-dispatch.h dispatcher (included at the bottom of the + * baseline .cpp we pull in below) instantiates DCTemplate<...> for the + * template args the dispatcher writes for QT_4bit_uniform. C++ template + * lookup picks our full specialization because it is strictly more + * specialized than baseline's partial specialization for the AVX512 level. 
+ * Non-matching combinations (other qtypes, IP metric) still resolve to + * baseline's partial specialization — nothing else changes. + * + * IMPORTANT constraint: the full specialization body must NOT contain a + * member of type Quantizer<...>. Inside baseline's sq-avx512.cpp, the AVX512 + * partial specializations of Codec4bit / QuantizerTemplate are declared + * BELOW the point at which we include that file here. At the point of our + * full specialization, those types are incomplete (primary template only), + * so we cannot have a member of that type. Workaround: read `trained[0]` + * and `trained[1]` directly in the constructor. + * + * For a detailed design note see the project plan's §1.2.A. + *****************************************************************************/ + +#ifdef COMPILE_SIMD_AVX512 + +#include +#include +#include +#include + +#include +#include +#include + +namespace faiss { + +namespace scalar_quantizer { + +/************************************************************************* + * QT_4bit_uniform + L2 fast path, AVX512. + * + * Math recap: for UNIFORM 4-bit scaling, + * recon(c) = vmin + vdiff * (c + 0.5) / 15 = final_scale * c + bias + * final_scale = vdiff / 15 + * L2(recon(q), recon(c)) = final_scale^2 * (q_c - c_c)^2 + * + * We pre-nibble the query floats into q_lo / q_hi (even / odd lanes) once + * at set_query time and then compute everything in the int domain, paying + * one float multiply at the end. 
+ ************************************************************************/ + +template <> +struct DCTemplate< + QuantizerTemplate< + Codec4bit, + QuantizerTemplateScaling::UNIFORM, + SIMDLevel::AVX512>, + SimilarityL2, + SIMDLevel::AVX512> : SQDistanceComputer { + using Sim = SimilarityL2; + + size_t d; + float vmin; + float vdiff; + float final_scale_sq; + std::vector q_lo; + std::vector q_hi; + bool has_vnni; + + DCTemplate(size_t d_in, const std::vector& trained) + : d(d_in), + vmin(trained[0]), + vdiff(trained[1]), + // Over-allocate by 64 bytes so full 512-bit loads past the + // logical length are safe (readers mask off unused lanes). + q_lo((d_in + 1) / 2 + 64, 0), + q_hi((d_in + 1) / 2 + 64, 0), + has_vnni(__builtin_cpu_supports("avx512vnni")) { + const float final_scale = vdiff / 15.0f; + final_scale_sq = final_scale * final_scale; + } + + void set_query(const float* x) final { + this->q = x; + const float inv_scale = (vdiff == 0.0f) ? 0.0f : 15.0f / vdiff; + for (size_t i = 0; i < d; i++) { + float val = (x[i] - vmin) * inv_scale; + int code = static_cast(std::floor(val + 0.5f)); + if (code < 0) { + code = 0; + } + if (code > 15) { + code = 15; + } + if (i % 2 == 0) { + q_lo[i / 2] = static_cast(code); + } else { + q_hi[i / 2] = static_cast(code); + } + } + } + + float query_to_code(const uint8_t* code) const final { + __m512i acc = _mm512_setzero_si512(); + const __m512i mask_f = _mm512_set1_epi8(0xF); + const __m512i one = _mm512_set1_epi16(1); + const uint8_t* q_lo_ptr = q_lo.data(); + const uint8_t* q_hi_ptr = q_hi.data(); + + size_t i = 0; + // 128 dims per iteration (64 bytes of packed 4-bit codes). 
+ for (; i + 128 <= d; i += 128) { + __m512i c512 = _mm512_loadu_si512( + reinterpret_cast(code + i / 2)); + + __m512i nibbles_lo = _mm512_and_si512(c512, mask_f); + __m512i nibbles_hi = + _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f); + + __m512i q_lo_vec = _mm512_loadu_si512(q_lo_ptr + i / 2); + __m512i q_hi_vec = _mm512_loadu_si512(q_hi_ptr + i / 2); + + __m512i diff_lo = _mm512_sub_epi8(q_lo_vec, nibbles_lo); + __m512i diff_hi = _mm512_sub_epi8(q_hi_vec, nibbles_hi); + + diff_lo = _mm512_abs_epi8(diff_lo); + diff_hi = _mm512_abs_epi8(diff_hi); + + __m512i sq_lo = _mm512_maddubs_epi16(diff_lo, diff_lo); + __m512i sq_hi = _mm512_maddubs_epi16(diff_hi, diff_hi); + + __m512i sq_sum = _mm512_add_epi16(sq_lo, sq_hi); + __m512i sum_32 = _mm512_madd_epi16(sq_sum, one); + + acc = _mm512_add_epi32(acc, sum_32); + } + + // Tail. q_lo / q_hi are over-allocated so masked loads past the + // logical length read zeros; code is also loaded with mask_even and + // nibbles_hi is masked separately so odd-lane overread is zeroed. + if (i < d) { + size_t rem = d - i; + uint64_t mask_even = (rem + 1) / 2 >= 64 + ? ~0ULL + : (1ULL << ((rem + 1) / 2)) - 1; + uint64_t mask_odd = + rem / 2 >= 64 ? 
~0ULL : (1ULL << (rem / 2)) - 1; + + __m512i c512 = _mm512_maskz_loadu_epi8(mask_even, code + i / 2); + + __m512i nibbles_lo = _mm512_and_si512(c512, mask_f); + __m512i nibbles_hi = + _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f); + + __m512i q_lo_vec = + _mm512_maskz_loadu_epi8(mask_even, q_lo_ptr + i / 2); + __m512i q_hi_vec = + _mm512_maskz_loadu_epi8(mask_odd, q_hi_ptr + i / 2); + + __m512i mask_odd_vec = _mm512_movm_epi8(mask_odd); + nibbles_hi = _mm512_and_si512(nibbles_hi, mask_odd_vec); + + __m512i diff_lo = _mm512_sub_epi8(q_lo_vec, nibbles_lo); + __m512i diff_hi = _mm512_sub_epi8(q_hi_vec, nibbles_hi); + + diff_lo = _mm512_abs_epi8(diff_lo); + diff_hi = _mm512_abs_epi8(diff_hi); + + __m512i sq_lo = _mm512_maddubs_epi16(diff_lo, diff_lo); + __m512i sq_hi = _mm512_maddubs_epi16(diff_hi, diff_hi); + + __m512i sq_sum = _mm512_add_epi16(sq_lo, sq_hi); + __m512i sum_32 = _mm512_madd_epi16(sq_sum, one); + + acc = _mm512_add_epi32(acc, sum_32); + } + + const int32_t sum = _mm512_reduce_add_epi32(acc); + return static_cast(sum) * final_scale_sq; + } + + float symmetric_dis(idx_t i, idx_t j) override { + // Not on the critical query path; scalar version suffices. + const uint8_t* c1 = codes + i * code_size; + const uint8_t* c2 = codes + j * code_size; + int64_t acc = 0; + for (size_t k = 0; k < d; k++) { + uint8_t a = (k % 2 == 0) + ? static_cast(c1[k / 2] & 0x0F) + : static_cast(c1[k / 2] >> 4); + uint8_t b = (k % 2 == 0) + ? static_cast(c2[k / 2] & 0x0F) + : static_cast(c2[k / 2] >> 4); + int diff = int(a) - int(b); + acc += diff * diff; + } + return static_cast(acc) * final_scale_sq; + } + + /// Batch-4 entry point: dispatches to VNNI or non-VNNI path based on + /// runtime CPU capability. Both paths process 256 dims per outer loop + /// iteration by amortizing two q_lo / q_hi chunks across the four + /// input codes. 
+ void query_to_codes_batch_4( + const uint8_t* code_0, + const uint8_t* code_1, + const uint8_t* code_2, + const uint8_t* code_3, + float& dis0, + float& dis1, + float& dis2, + float& dis3) const final { + if (has_vnni) { + query_to_codes_batch_4_vnni( + code_0, code_1, code_2, code_3, dis0, dis1, dis2, dis3); + } else { + query_to_codes_batch_4_avx512( + code_0, code_1, code_2, code_3, dis0, dis1, dis2, dis3); + } + } + + /// VNNI path: uses _mm512_dpbusd_epi32 to fuse square-and-accumulate. + /// Still valid because for 4-bit codes the differences are in [-15, 15], + /// so |diff| fits both dpbusd operands (u8 × s8 → i32) without overflow. + __attribute__((target("avx512vnni"))) void query_to_codes_batch_4_vnni( + const uint8_t* __restrict code_0, + const uint8_t* __restrict code_1, + const uint8_t* __restrict code_2, + const uint8_t* __restrict code_3, + float& dis0, + float& dis1, + float& dis2, + float& dis3) const { + __m512i acc0 = _mm512_setzero_si512(); + __m512i acc1 = _mm512_setzero_si512(); + __m512i acc2 = _mm512_setzero_si512(); + __m512i acc3 = _mm512_setzero_si512(); + + const __m512i mask_f = _mm512_set1_epi8(0x0F); + const uint8_t* q_lo_ptr = q_lo.data(); + const uint8_t* q_hi_ptr = q_hi.data(); + + size_t i = 0; + // 256 dims per iteration — two 128-dim chunks sharing two q loads.
+ for (; i + 256 <= d; i += 256) { + __m512i q_lo_0 = _mm512_loadu_si512(q_lo_ptr + i / 2); + __m512i q_hi_0 = _mm512_loadu_si512(q_hi_ptr + i / 2); + __m512i q_lo_1 = _mm512_loadu_si512(q_lo_ptr + i / 2 + 64); + __m512i q_hi_1 = _mm512_loadu_si512(q_hi_ptr + i / 2 + 64); + + auto process_chunk = [&](const uint8_t* code, + __m512i& acc, + __m512i q_lo_v, + __m512i q_hi_v, + int offset) + __attribute__((target("avx512vnni"))) { + __m512i c512 = _mm512_loadu_si512( + reinterpret_cast(code + i / 2 + offset)); + __m512i nibbles_lo = _mm512_and_si512(c512, mask_f); + __m512i nibbles_hi = + _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f); + + __m512i diff_lo = _mm512_sub_epi8(q_lo_v, nibbles_lo); + __m512i diff_hi = _mm512_sub_epi8(q_hi_v, nibbles_hi); + + diff_lo = _mm512_abs_epi8(diff_lo); + diff_hi = _mm512_abs_epi8(diff_hi); + + acc = _mm512_dpbusd_epi32(acc, diff_lo, diff_lo); + acc = _mm512_dpbusd_epi32(acc, diff_hi, diff_hi); + }; + + process_chunk(code_0, acc0, q_lo_0, q_hi_0, 0); + process_chunk(code_1, acc1, q_lo_0, q_hi_0, 0); + process_chunk(code_2, acc2, q_lo_0, q_hi_0, 0); + process_chunk(code_3, acc3, q_lo_0, q_hi_0, 0); + + process_chunk(code_0, acc0, q_lo_1, q_hi_1, 64); + process_chunk(code_1, acc1, q_lo_1, q_hi_1, 64); + process_chunk(code_2, acc2, q_lo_1, q_hi_1, 64); + process_chunk(code_3, acc3, q_lo_1, q_hi_1, 64); + } + + // 128-dim remainder (one q chunk). 
+ if (i + 128 <= d) { + __m512i q_lo_0 = _mm512_loadu_si512(q_lo_ptr + i / 2); + __m512i q_hi_0 = _mm512_loadu_si512(q_hi_ptr + i / 2); + + auto process_chunk = [&](const uint8_t* code, __m512i& acc) + __attribute__((target("avx512vnni"))) { + __m512i c512 = _mm512_loadu_si512( + reinterpret_cast(code + i / 2)); + __m512i nibbles_lo = _mm512_and_si512(c512, mask_f); + __m512i nibbles_hi = + _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f); + + __m512i diff_lo = _mm512_sub_epi8(q_lo_0, nibbles_lo); + __m512i diff_hi = _mm512_sub_epi8(q_hi_0, nibbles_hi); + + diff_lo = _mm512_abs_epi8(diff_lo); + diff_hi = _mm512_abs_epi8(diff_hi); + + acc = _mm512_dpbusd_epi32(acc, diff_lo, diff_lo); + acc = _mm512_dpbusd_epi32(acc, diff_hi, diff_hi); + }; + + process_chunk(code_0, acc0); + process_chunk(code_1, acc1); + process_chunk(code_2, acc2); + process_chunk(code_3, acc3); + + i += 128; + } + + // Sub-128-dim tail with masked loads. + if (i < d) { + size_t rem = d - i; + uint64_t mask_even = (rem + 1) / 2 >= 64 + ? ~0ULL + : (1ULL << ((rem + 1) / 2)) - 1; + uint64_t mask_odd = + rem / 2 >= 64 ? 
~0ULL : (1ULL << (rem / 2)) - 1; + + __m512i q_lo_vec = + _mm512_maskz_loadu_epi8(mask_even, q_lo_ptr + i / 2); + __m512i q_hi_vec = + _mm512_maskz_loadu_epi8(mask_odd, q_hi_ptr + i / 2); + __m512i mask_odd_vec = _mm512_movm_epi8(mask_odd); + + auto process = [&](const uint8_t* code, __m512i& acc) + __attribute__((target("avx512vnni"))) { + __m512i c512 = + _mm512_maskz_loadu_epi8(mask_even, code + i / 2); + __m512i nibbles_lo = _mm512_and_si512(c512, mask_f); + __m512i nibbles_hi = + _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f); + nibbles_hi = _mm512_and_si512(nibbles_hi, mask_odd_vec); + + __m512i diff_lo = _mm512_sub_epi8(q_lo_vec, nibbles_lo); + __m512i diff_hi = _mm512_sub_epi8(q_hi_vec, nibbles_hi); + + diff_lo = _mm512_abs_epi8(diff_lo); + diff_hi = _mm512_abs_epi8(diff_hi); + + acc = _mm512_dpbusd_epi32(acc, diff_lo, diff_lo); + acc = _mm512_dpbusd_epi32(acc, diff_hi, diff_hi); + }; + + process(code_0, acc0); + process(code_1, acc1); + process(code_2, acc2); + process(code_3, acc3); + } + + dis0 = static_cast(_mm512_reduce_add_epi32(acc0)) * + final_scale_sq; + dis1 = static_cast(_mm512_reduce_add_epi32(acc1)) * + final_scale_sq; + dis2 = static_cast(_mm512_reduce_add_epi32(acc2)) * + final_scale_sq; + dis3 = static_cast(_mm512_reduce_add_epi32(acc3)) * + final_scale_sq; + } + + /// Non-VNNI path: squares via _mm512_maddubs_epi16 (u8×u8 → i16) and + /// accumulates to i32 with _mm512_madd_epi16. Same 256-dim outer loop. 
+ void query_to_codes_batch_4_avx512( + const uint8_t* __restrict code_0, + const uint8_t* __restrict code_1, + const uint8_t* __restrict code_2, + const uint8_t* __restrict code_3, + float& dis0, + float& dis1, + float& dis2, + float& dis3) const { + __m512i acc0 = _mm512_setzero_si512(); + __m512i acc1 = _mm512_setzero_si512(); + __m512i acc2 = _mm512_setzero_si512(); + __m512i acc3 = _mm512_setzero_si512(); + + const __m512i mask_f = _mm512_set1_epi8(0x0F); + const __m512i one = _mm512_set1_epi16(1); + const uint8_t* q_lo_ptr = q_lo.data(); + const uint8_t* q_hi_ptr = q_hi.data(); + + size_t i = 0; + for (; i + 256 <= d; i += 256) { + __m512i q_lo_0 = _mm512_loadu_si512(q_lo_ptr + i / 2); + __m512i q_hi_0 = _mm512_loadu_si512(q_hi_ptr + i / 2); + __m512i q_lo_1 = _mm512_loadu_si512(q_lo_ptr + i / 2 + 64); + __m512i q_hi_1 = _mm512_loadu_si512(q_hi_ptr + i / 2 + 64); + + auto process_chunk = [&](const uint8_t* code, + __m512i& acc, + __m512i q_lo_v, + __m512i q_hi_v, + int offset) { + __m512i c512 = _mm512_loadu_si512( + reinterpret_cast(code + i / 2 + offset)); + __m512i nibbles_lo = _mm512_and_si512(c512, mask_f); + __m512i nibbles_hi = + _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f); + + __m512i diff_lo = _mm512_sub_epi8(q_lo_v, nibbles_lo); + __m512i diff_hi = _mm512_sub_epi8(q_hi_v, nibbles_hi); + + diff_lo = _mm512_abs_epi8(diff_lo); + diff_hi = _mm512_abs_epi8(diff_hi); + + __m512i sq_lo = _mm512_maddubs_epi16(diff_lo, diff_lo); + __m512i sq_hi = _mm512_maddubs_epi16(diff_hi, diff_hi); + + __m512i sum_lo = _mm512_madd_epi16(sq_lo, one); + __m512i sum_hi = _mm512_madd_epi16(sq_hi, one); + + acc = _mm512_add_epi32(acc, sum_lo); + acc = _mm512_add_epi32(acc, sum_hi); + }; + + process_chunk(code_0, acc0, q_lo_0, q_hi_0, 0); + process_chunk(code_1, acc1, q_lo_0, q_hi_0, 0); + process_chunk(code_2, acc2, q_lo_0, q_hi_0, 0); + process_chunk(code_3, acc3, q_lo_0, q_hi_0, 0); + + process_chunk(code_0, acc0, q_lo_1, q_hi_1, 64); + process_chunk(code_1, acc1, 
q_lo_1, q_hi_1, 64); + process_chunk(code_2, acc2, q_lo_1, q_hi_1, 64); + process_chunk(code_3, acc3, q_lo_1, q_hi_1, 64); + } + + if (i + 128 <= d) { + __m512i q_lo_0 = _mm512_loadu_si512(q_lo_ptr + i / 2); + __m512i q_hi_0 = _mm512_loadu_si512(q_hi_ptr + i / 2); + + auto process_chunk = [&](const uint8_t* code, __m512i& acc) { + __m512i c512 = _mm512_loadu_si512( + reinterpret_cast(code + i / 2)); + __m512i nibbles_lo = _mm512_and_si512(c512, mask_f); + __m512i nibbles_hi = + _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f); + + __m512i diff_lo = _mm512_sub_epi8(q_lo_0, nibbles_lo); + __m512i diff_hi = _mm512_sub_epi8(q_hi_0, nibbles_hi); + + diff_lo = _mm512_abs_epi8(diff_lo); + diff_hi = _mm512_abs_epi8(diff_hi); + + __m512i sq_lo = _mm512_maddubs_epi16(diff_lo, diff_lo); + __m512i sq_hi = _mm512_maddubs_epi16(diff_hi, diff_hi); + + __m512i sum_lo = _mm512_madd_epi16(sq_lo, one); + __m512i sum_hi = _mm512_madd_epi16(sq_hi, one); + + acc = _mm512_add_epi32(acc, sum_lo); + acc = _mm512_add_epi32(acc, sum_hi); + }; + + process_chunk(code_0, acc0); + process_chunk(code_1, acc1); + process_chunk(code_2, acc2); + process_chunk(code_3, acc3); + + i += 128; + } + + if (i < d) { + size_t rem = d - i; + uint64_t mask_even = (rem + 1) / 2 >= 64 + ? ~0ULL + : (1ULL << ((rem + 1) / 2)) - 1; + uint64_t mask_odd = + rem / 2 >= 64 ? 
~0ULL : (1ULL << (rem / 2)) - 1; + + __m512i q_lo_vec = + _mm512_maskz_loadu_epi8(mask_even, q_lo_ptr + i / 2); + __m512i q_hi_vec = + _mm512_maskz_loadu_epi8(mask_odd, q_hi_ptr + i / 2); + __m512i mask_odd_vec = _mm512_movm_epi8(mask_odd); + + auto process = [&](const uint8_t* code, __m512i& acc) { + __m512i c512 = + _mm512_maskz_loadu_epi8(mask_even, code + i / 2); + __m512i nibbles_lo = _mm512_and_si512(c512, mask_f); + __m512i nibbles_hi = + _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f); + nibbles_hi = _mm512_and_si512(nibbles_hi, mask_odd_vec); + + __m512i diff_lo = _mm512_sub_epi8(q_lo_vec, nibbles_lo); + __m512i diff_hi = _mm512_sub_epi8(q_hi_vec, nibbles_hi); + + diff_lo = _mm512_abs_epi8(diff_lo); + diff_hi = _mm512_abs_epi8(diff_hi); + + __m512i sq_lo = _mm512_maddubs_epi16(diff_lo, diff_lo); + __m512i sq_hi = _mm512_maddubs_epi16(diff_hi, diff_hi); + + __m512i sum_lo = _mm512_madd_epi16(sq_lo, one); + __m512i sum_hi = _mm512_madd_epi16(sq_hi, one); + + acc = _mm512_add_epi32(acc, sum_lo); + acc = _mm512_add_epi32(acc, sum_hi); + }; + + process(code_0, acc0); + process(code_1, acc1); + process(code_2, acc2); + process(code_3, acc3); + } + + dis0 = static_cast(_mm512_reduce_add_epi32(acc0)) * + final_scale_sq; + dis1 = static_cast(_mm512_reduce_add_epi32(acc1)) * + final_scale_sq; + dis2 = static_cast(_mm512_reduce_add_epi32(acc2)) * + final_scale_sq; + dis3 = static_cast(_mm512_reduce_add_epi32(acc3)) * + final_scale_sq; + } +}; + +} // namespace scalar_quantizer +} // namespace faiss + +// Pull in baseline's sq-avx512.cpp. Its AVX512 partial specializations of +// Codec / QuantizerTemplate / Similarity / DCTemplate, its Similarity +// structs, and its dispatcher instantiation all come online after this +// point. Our full specialization above is already visible, so at the +// instantiation moment inside sq-dispatch.h, C++ template lookup selects +// it over the partial one. 
+#include "../../../impl/scalar_quantizer/sq-avx512.cpp" + +#endif // COMPILE_SIMD_AVX512 diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/sq-neon-fastpath.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/sq-neon-fastpath.cpp new file mode 100644 index 000000000..0158368bf --- /dev/null +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/sq-neon-fastpath.cpp @@ -0,0 +1,247 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +/****************************************************************************** + * Knowhere-local prelude over baseline sq-neon.cpp. + * + * See sq-avx512-fastpath.cpp for the full design note on how this pattern + * works. This file ports the NEON variant of the fork's + * DistanceComputerSQ4UByte for QT_4bit_uniform + L2. + *****************************************************************************/ + +#ifdef COMPILE_SIMD_ARM_NEON + +#include +#include +#include +#include + +#include +#include +#include + +namespace faiss { + +namespace scalar_quantizer { + +template <> +struct DCTemplate< + QuantizerTemplate< + Codec4bit, + QuantizerTemplateScaling::UNIFORM, + SIMDLevel::ARM_NEON>, + SimilarityL2, + SIMDLevel::ARM_NEON> : SQDistanceComputer { + using Sim = SimilarityL2; + + size_t d; + float vmin; + float vdiff; + float final_scale_sq; + std::vector q_lo; + std::vector q_hi; + + DCTemplate(size_t d_in, const std::vector& trained) + : d(d_in), + vmin(trained[0]), + vdiff(trained[1]), + // Over-allocate by 16 zeroed bytes as slack for 128-bit vld1q_u8 + // loads; the readers in this struct finish with a scalar tail instead. + q_lo((d_in + 1) / 2 + 16, 0), + q_hi((d_in + 1) / 2 + 16, 0) { + const float final_scale = vdiff / 15.0f; + final_scale_sq = final_scale * final_scale; + } + + void set_query(const float* x) final { + this->q = x; + const float inv_scale = (vdiff == 0.0f) ?
0.0f : 15.0f / vdiff; + for (size_t i = 0; i < d; i++) { + float val = (x[i] - vmin) * inv_scale; + int code = static_cast(std::floor(val + 0.5f)); + if (code < 0) { + code = 0; + } + if (code > 15) { + code = 15; + } + if (i % 2 == 0) { + q_lo[i / 2] = static_cast(code); + } else { + q_hi[i / 2] = static_cast(code); + } + } + } + + float query_to_code(const uint8_t* code) const final { + const uint8_t* q_lo_ptr = q_lo.data(); + const uint8_t* q_hi_ptr = q_hi.data(); + + uint32x4_t acc = vdupq_n_u32(0); + const uint8x16_t mask_f = vdupq_n_u8(0x0F); + + size_t i = 0; + // 32 dims per iteration (16 bytes of packed 4-bit codes). + for (; i + 32 <= d; i += 32) { + uint8x16_t c = vld1q_u8(code + i / 2); + + uint8x16_t nibbles_lo = vandq_u8(c, mask_f); + uint8x16_t nibbles_hi = vandq_u8(vshrq_n_u8(c, 4), mask_f); + + uint8x16_t q_lo_vec = vld1q_u8(q_lo_ptr + i / 2); + uint8x16_t q_hi_vec = vld1q_u8(q_hi_ptr + i / 2); + + uint8x16_t diff_lo = vabdq_u8(q_lo_vec, nibbles_lo); + uint8x16_t diff_hi = vabdq_u8(q_hi_vec, nibbles_hi); + + // Widen + square — each byte in [0, 15] so squared fits in u16. + uint16x8_t sq_lo_1 = + vmull_u8(vget_low_u8(diff_lo), vget_low_u8(diff_lo)); + uint16x8_t sq_lo_2 = + vmull_u8(vget_high_u8(diff_lo), vget_high_u8(diff_lo)); + uint16x8_t sq_hi_1 = + vmull_u8(vget_low_u8(diff_hi), vget_low_u8(diff_hi)); + uint16x8_t sq_hi_2 = + vmull_u8(vget_high_u8(diff_hi), vget_high_u8(diff_hi)); + + acc = vpadalq_u16(acc, sq_lo_1); + acc = vpadalq_u16(acc, sq_lo_2); + acc = vpadalq_u16(acc, sq_hi_1); + acc = vpadalq_u16(acc, sq_hi_2); + } + + uint32_t sum = vaddvq_u32(acc); + + // Scalar tail. + for (; i < d; i++) { + uint8_t c = code[i / 2]; + uint8_t nibble = (i % 2 == 0) + ? static_cast(c & 0x0F) + : static_cast(c >> 4); + int q_code = (i % 2 == 0) ?
q_lo[i / 2] : q_hi[i / 2]; + int diff = q_code - int(nibble); + sum += diff * diff; + } + + return static_cast(sum) * final_scale_sq; + } + + float symmetric_dis(idx_t i, idx_t j) override { + const uint8_t* c1 = codes + i * code_size; + const uint8_t* c2 = codes + j * code_size; + int64_t acc = 0; + for (size_t k = 0; k < d; k++) { + uint8_t a = (k % 2 == 0) + ? static_cast(c1[k / 2] & 0x0F) + : static_cast(c1[k / 2] >> 4); + uint8_t b = (k % 2 == 0) + ? static_cast(c2[k / 2] & 0x0F) + : static_cast(c2[k / 2] >> 4); + int diff = int(a) - int(b); + acc += diff * diff; + } + return static_cast(acc) * final_scale_sq; + } + + /// Batch-4: 32 dims per outer iter with four parallel u32 accumulators, + /// amortizing the q_lo / q_hi load across four input codes. Ported + /// verbatim from the fork's NEON DistanceComputerSQ4UByte_neon. + void query_to_codes_batch_4( + const uint8_t* code_0, + const uint8_t* code_1, + const uint8_t* code_2, + const uint8_t* code_3, + float& dis0, + float& dis1, + float& dis2, + float& dis3) const final { + uint32x4_t acc0 = vdupq_n_u32(0); + uint32x4_t acc1 = vdupq_n_u32(0); + uint32x4_t acc2 = vdupq_n_u32(0); + uint32x4_t acc3 = vdupq_n_u32(0); + + const uint8x16_t mask_f = vdupq_n_u8(0x0F); + const uint8_t* q_lo_ptr = q_lo.data(); + const uint8_t* q_hi_ptr = q_hi.data(); + + size_t i = 0; + for (; i + 32 <= d; i += 32) { + uint8x16_t q_lo_vec = vld1q_u8(q_lo_ptr + i / 2); + uint8x16_t q_hi_vec = vld1q_u8(q_hi_ptr + i / 2); + + auto process = [&](const uint8_t* code, uint32x4_t& acc) { + uint8x16_t c = vld1q_u8(code + i / 2); + uint8x16_t nibbles_lo = vandq_u8(c, mask_f); + uint8x16_t nibbles_hi = vandq_u8(vshrq_n_u8(c, 4), mask_f); + + uint8x16_t diff_lo = vabdq_u8(q_lo_vec, nibbles_lo); + uint8x16_t diff_hi = vabdq_u8(q_hi_vec, nibbles_hi); + + uint16x8_t sq_lo_1 = + vmull_u8(vget_low_u8(diff_lo), vget_low_u8(diff_lo)); + uint16x8_t sq_lo_2 = + vmull_u8(vget_high_u8(diff_lo), vget_high_u8(diff_lo)); + uint16x8_t sq_hi_1 = +
vmull_u8(vget_low_u8(diff_hi), vget_low_u8(diff_hi)); + uint16x8_t sq_hi_2 = + vmull_u8(vget_high_u8(diff_hi), vget_high_u8(diff_hi)); + + acc = vpadalq_u16(acc, sq_lo_1); + acc = vpadalq_u16(acc, sq_lo_2); + acc = vpadalq_u16(acc, sq_hi_1); + acc = vpadalq_u16(acc, sq_hi_2); + }; + + process(code_0, acc0); + process(code_1, acc1); + process(code_2, acc2); + process(code_3, acc3); + } + + dis0 = static_cast(vaddvq_u32(acc0)); + dis1 = static_cast(vaddvq_u32(acc1)); + dis2 = static_cast(vaddvq_u32(acc2)); + dis3 = static_cast(vaddvq_u32(acc3)); + + // Scalar tail. + if (i < d) { + size_t rem = d - i; + for (size_t j = 0; j < rem; j++) { + size_t idx = i + j; + uint8_t nibble_lo = q_lo[idx / 2]; + uint8_t nibble_hi = q_hi[idx / 2]; + + auto process_scalar = [&](const uint8_t* code, float& dis) { + uint8_t c = code[idx / 2]; + uint8_t nibble = (idx % 2 == 0) + ? static_cast(c & 0x0F) + : static_cast(c >> 4); + int q_code = (idx % 2 == 0) ? nibble_lo : nibble_hi; + int diff = q_code - int(nibble); + dis += static_cast(diff * diff); + }; + + process_scalar(code_0, dis0); + process_scalar(code_1, dis1); + process_scalar(code_2, dis2); + process_scalar(code_3, dis3); + } + } + + dis0 *= final_scale_sq; + dis1 *= final_scale_sq; + dis2 *= final_scale_sq; + dis3 *= final_scale_sq; + } +}; + +} // namespace scalar_quantizer +} // namespace faiss + +#include "../../../impl/scalar_quantizer/sq-neon.cpp" + +#endif // COMPILE_SIMD_ARM_NEON diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/utils/distances.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/utils/distances.cpp index 23ac36f34..ef15f6366 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/utils/distances.cpp +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/utils/distances.cpp @@ -22,8 +22,8 @@ #include "knowhere/bitsetview_idselector.h" #include "knowhere/object.h" -#include #include +#include "simd/hook.h" #include #include #include diff --git a/thirdparty/faiss/faiss/gpu/GpuDistance.cu
b/thirdparty/faiss/faiss/gpu/GpuDistance.cu index 2f330b523..112d9cce1 100644 --- a/thirdparty/faiss/faiss/gpu/GpuDistance.cu +++ b/thirdparty/faiss/faiss/gpu/GpuDistance.cu @@ -318,8 +318,14 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) { cuvs::neighbors::brute_force::index idx( handle, index.view(), norms_view, distance, metric_arg); + cuvs::neighbors::brute_force::search_params search_params_bf; cuvs::neighbors::brute_force::search( - handle, idx, search.view(), inds.view(), dists.view()); + handle, + search_params_bf, + idx, + search.view(), + inds.view(), + dists.view()); } else { auto index = raft::make_readonly_temporary_device_buffer< const float, @@ -357,8 +363,14 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) { cuvs::neighbors::brute_force::index idx( handle, index.view(), norms_view, distance, metric_arg); + cuvs::neighbors::brute_force::search_params search_params_bf; cuvs::neighbors::brute_force::search( - handle, idx, search.view(), inds.view(), dists.view()); + handle, + search_params_bf, + idx, + search.view(), + inds.view(), + dists.view()); } if (args.metric == MetricType::METRIC_Lp) { diff --git a/thirdparty/faiss/faiss/gpu/GpuIndexBinaryCagra.cu b/thirdparty/faiss/faiss/gpu/GpuIndexBinaryCagra.cu index 3acb23714..46b6efb63 100644 --- a/thirdparty/faiss/faiss/gpu/GpuIndexBinaryCagra.cu +++ b/thirdparty/faiss/faiss/gpu/GpuIndexBinaryCagra.cu @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -119,8 +120,6 @@ void GpuIndexBinaryCagra::search( return; } - FAISS_THROW_IF_NOT_MSG(!params, "params not implemented"); - // validateKSelect(k); // The input vectors may be too large for the GPU, but we still @@ -260,7 +259,8 @@ void GpuIndexBinaryCagra::searchImpl_( params->hashmap_min_bitlen, params->hashmap_max_fill_rate, params->num_random_samplings, - params->seed); + params->seed, + params->sel); if (not search_params) { delete params; @@ -281,6 +281,10 @@ void 
GpuIndexBinaryCagra::copyFrom(const faiss::IndexBinaryHNSWCagra* index) { IndexBinaryFlat* flat_storage = dynamic_cast(index->storage); FAISS_ASSERT(flat_storage); + fprintf(stderr, + "WARNING: GpuIndexBinaryCagra::copyFrom uses non-owning CPU " + "storage. Keep the source IndexBinaryHNSWCagra alive for the " + "lifetime of the GpuIndexBinaryCagra.\n"); auto hnsw = index->hnsw; // copy level 0 to a dense knn graph matrix diff --git a/thirdparty/faiss/faiss/gpu/GpuIndexCagra.cu b/thirdparty/faiss/faiss/gpu/GpuIndexCagra.cu index a2d42daab..2843a58a7 100644 --- a/thirdparty/faiss/faiss/gpu/GpuIndexCagra.cu +++ b/thirdparty/faiss/faiss/gpu/GpuIndexCagra.cu @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -228,7 +229,8 @@ void GpuIndexCagra::searchImpl_ex_( params->hashmap_min_bitlen, params->hashmap_max_fill_rate, params->num_random_samplings, - params->seed); + params->seed, + params->sel); } else if (numeric_type == NumericType::Float16) { Tensor queries( @@ -251,7 +253,8 @@ void GpuIndexCagra::searchImpl_ex_( params->hashmap_min_bitlen, params->hashmap_max_fill_rate, params->num_random_samplings, - params->seed); + params->seed, + params->sel); } else if (numeric_type == NumericType::Int8) { Tensor queries( const_cast(static_cast(x)), @@ -274,7 +277,8 @@ void GpuIndexCagra::searchImpl_ex_( params->hashmap_min_bitlen, params->hashmap_max_fill_rate, params->num_random_samplings, - params->seed); + params->seed, + params->sel); } else { FAISS_THROW_MSG("GpuIndexCagra::searchImpl_ unsupported data type"); } @@ -312,9 +316,10 @@ void GpuIndexCagra::copyFrom_ex( GpuIndex::copyFrom(index); auto hnsw = index->hnsw; + // copy level 0 to a dense knn graph matrix std::vector knn_graph; - knn_graph.reserve(index->ntotal * hnsw.nb_neighbors(0)); + knn_graph.resize(index->ntotal * hnsw.nb_neighbors(0)); #pragma omp parallel for for (size_t i = 0; i < index->ntotal; ++i) { @@ -331,6 +336,10 @@ void GpuIndexCagra::copyFrom_ex( auto base_index = 
dynamic_cast(index->storage); FAISS_ASSERT(base_index); auto dataset = base_index->get_xb(); + fprintf(stderr, + "WARNING: GpuIndexCagra::copyFrom uses non-owning CPU storage. " + "Keep the source IndexHNSWCagra alive for the lifetime of the " + "GpuIndexCagra.\n"); index_ = std::make_shared>( this->resources_.get(), @@ -345,7 +354,11 @@ void GpuIndexCagra::copyFrom_ex( } else if (numeric_type == NumericType::Float16) { auto base_index = dynamic_cast(index->storage); FAISS_ASSERT(base_index); - auto dataset = (half*)base_index->codes.data(); + auto dataset = reinterpret_cast(base_index->codes.data()); + fprintf(stderr, + "WARNING: GpuIndexCagra::copyFrom uses non-owning CPU storage. " + "Keep the source IndexHNSWCagra alive for the lifetime of the " + "GpuIndexCagra.\n"); index_ = std::make_shared>( this->resources_.get(), @@ -361,9 +374,11 @@ void GpuIndexCagra::copyFrom_ex( auto base_index = dynamic_cast(index->storage); FAISS_ASSERT(base_index); auto dataset = (uint8_t*)base_index->codes.data(); + fprintf(stderr, + "WARNING: GpuIndexCagra::copyFrom uses non-owning CPU storage. 
" + "Keep the source IndexHNSWCagra alive for the lifetime of the " + "GpuIndexCagra.\n"); - // decode what was encoded by Quantizer8bitDirectSigned in - // ScalarQuantizer int8_t* decoded_train_dataset = new int8_t[index->ntotal * index->d]; for (int i = 0; i < index->ntotal * this->d; i++) { decoded_train_dataset[i] = dataset[i] - 128; diff --git a/thirdparty/faiss/faiss/gpu/GpuIndexCagra.h b/thirdparty/faiss/faiss/gpu/GpuIndexCagra.h index 33d14553a..1f2a124c2 100644 --- a/thirdparty/faiss/faiss/gpu/GpuIndexCagra.h +++ b/thirdparty/faiss/faiss/gpu/GpuIndexCagra.h @@ -28,6 +28,7 @@ #include #include +#include #include "faiss/Index.h" namespace faiss { diff --git a/thirdparty/faiss/faiss/gpu/GpuIndexFlat.cu b/thirdparty/faiss/faiss/gpu/GpuIndexFlat.cu index eb87e082e..456f42fcc 100644 --- a/thirdparty/faiss/faiss/gpu/GpuIndexFlat.cu +++ b/thirdparty/faiss/faiss/gpu/GpuIndexFlat.cu @@ -224,8 +224,16 @@ void GpuIndexFlat::searchImpl_( Tensor outDistances(distances, {n, k}); Tensor outLabels(labels, {n, k}); + const IDSelector* sel = params ? params->sel : nullptr; data_->query( - queries, k, metric_type, metric_arg, outDistances, outLabels, true); + queries, + k, + metric_type, + metric_arg, + outDistances, + outLabels, + true, + sel); } void GpuIndexFlat::reconstruct(idx_t key, float* out) const { diff --git a/thirdparty/faiss/faiss/gpu/GpuIndexIVF.cu b/thirdparty/faiss/faiss/gpu/GpuIndexIVF.cu index a549703c2..8029f0cc3 100644 --- a/thirdparty/faiss/faiss/gpu/GpuIndexIVF.cu +++ b/thirdparty/faiss/faiss/gpu/GpuIndexIVF.cu @@ -342,8 +342,10 @@ void GpuIndexIVF::searchImpl_( Tensor outDistances(distances, {n, k}); Tensor outLabels(const_cast(labels), {n, k}); + const IDSelector* sel = params ? 
params->sel : nullptr; + baseIndex_->search( - quantizer, queries, use_nprobe, k, outDistances, outLabels); + quantizer, queries, use_nprobe, k, outDistances, outLabels, sel); } void GpuIndexIVF::search_preassigned( diff --git a/thirdparty/faiss/faiss/gpu/impl/BinaryCuvsCagra.cu b/thirdparty/faiss/faiss/gpu/impl/BinaryCuvsCagra.cu index 60cbe53fa..cfa178ef4 100644 --- a/thirdparty/faiss/faiss/gpu/impl/BinaryCuvsCagra.cu +++ b/thirdparty/faiss/faiss/gpu/impl/BinaryCuvsCagra.cu @@ -22,10 +22,12 @@ */ #include +#include #include #include #include +#include #include #include #include @@ -202,7 +204,8 @@ void BinaryCuvsCagra::search( idx_t hashmap_min_bitlen, float hashmap_max_fill_rate, idx_t num_random_samplings, - idx_t rand_xor_mask) { + idx_t rand_xor_mask, + const IDSelector* sel) { const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); idx_t numQueries = queries.getSize(0); @@ -255,13 +258,31 @@ void BinaryCuvsCagra::search( raft_handle, numQueries, k_); auto distances_float_view = distances_float.view(); + std::optional> bitset_holder; + std::optional> + bitset_filter; + cuvs::neighbors::filtering::none_sample_filter none_filter; + + if (sel) { + bitset_holder = + cuvs::core::bitset(raft_handle, n_, false); + faiss::gpu::convert_to_bitset(resources_, *sel, bitset_holder->view()); + bitset_filter.emplace(bitset_holder->view()); + } + const cuvs::neighbors::filtering::base_filter& filter_ref = sel + ? 
static_cast( + bitset_filter.value()) + : static_cast( + none_filter); + cuvs::neighbors::cagra::search( raft_handle, search_pams, *cuvs_index, queries_view, indices_copy.view(), - distances_float_view); + distances_float_view, + filter_ref); thrust::copy( raft::resource::get_thrust_policy(raft_handle), diff --git a/thirdparty/faiss/faiss/gpu/impl/BinaryCuvsCagra.cuh b/thirdparty/faiss/faiss/gpu/impl/BinaryCuvsCagra.cuh index a14480bcb..3c7a9eb68 100644 --- a/thirdparty/faiss/faiss/gpu/impl/BinaryCuvsCagra.cuh +++ b/thirdparty/faiss/faiss/gpu/impl/BinaryCuvsCagra.cuh @@ -31,6 +31,7 @@ #include #include +#include #include @@ -80,7 +81,8 @@ class BinaryCuvsCagra { idx_t hashmap_min_bitlen, float hashmap_max_fill_rate, idx_t num_random_samplings, - idx_t rand_xor_mask); + idx_t rand_xor_mask, + const IDSelector* sel = nullptr); void reset(); diff --git a/thirdparty/faiss/faiss/gpu/impl/CuvsCagra.cu b/thirdparty/faiss/faiss/gpu/impl/CuvsCagra.cu index 755817f43..c28046a59 100644 --- a/thirdparty/faiss/faiss/gpu/impl/CuvsCagra.cu +++ b/thirdparty/faiss/faiss/gpu/impl/CuvsCagra.cu @@ -22,10 +22,12 @@ */ #include +#include #include #include #include +#include #include #include #include @@ -33,6 +35,7 @@ #include #include +#include namespace faiss { namespace gpu { @@ -235,7 +238,8 @@ void CuvsCagra::search( idx_t hashmap_min_bitlen, float hashmap_max_fill_rate, idx_t num_random_samplings, - idx_t rand_xor_mask) { + idx_t rand_xor_mask, + const IDSelector* sel) { const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); idx_t numQueries = queries.getSize(0); @@ -286,13 +290,31 @@ void CuvsCagra::search( auto indices_copy = raft::make_device_matrix( raft_handle, numQueries, k_); + std::optional> bitset_holder; + std::optional> + bitset_filter; + cuvs::neighbors::filtering::none_sample_filter none_filter; + + if (sel) { + bitset_holder = + cuvs::core::bitset(raft_handle, n_, false); + faiss::gpu::convert_to_bitset(resources_, *sel, 
bitset_holder->view()); + bitset_filter.emplace(bitset_holder->view()); + } + const cuvs::neighbors::filtering::base_filter& filter_ref = sel + ? static_cast( + bitset_filter.value()) + : static_cast( + none_filter); + cuvs::neighbors::cagra::search( raft_handle, search_pams, *cuvs_index, queries_view, indices_copy.view(), - distances_view); + distances_view, + filter_ref); thrust::copy( raft::resource::get_thrust_policy(raft_handle), indices_copy.data_handle(), diff --git a/thirdparty/faiss/faiss/gpu/impl/CuvsCagra.cuh b/thirdparty/faiss/faiss/gpu/impl/CuvsCagra.cuh index 224f5585a..b4c1989c6 100644 --- a/thirdparty/faiss/faiss/gpu/impl/CuvsCagra.cuh +++ b/thirdparty/faiss/faiss/gpu/impl/CuvsCagra.cuh @@ -30,6 +30,7 @@ #include #include +#include #include #include @@ -103,7 +104,8 @@ class CuvsCagra { idx_t hashmap_min_bitlen, float hashmap_max_fill_rate, idx_t num_random_samplings, - idx_t rand_xor_mask); + idx_t rand_xor_mask, + const IDSelector* sel = nullptr); void reset(); diff --git a/thirdparty/faiss/faiss/gpu/impl/CuvsFlatIndex.cu b/thirdparty/faiss/faiss/gpu/impl/CuvsFlatIndex.cu index d1669831c..d4a2d99fe 100644 --- a/thirdparty/faiss/faiss/gpu/impl/CuvsFlatIndex.cu +++ b/thirdparty/faiss/faiss/gpu/impl/CuvsFlatIndex.cu @@ -21,6 +21,7 @@ * limitations under the License. */ +#include #include #include #include @@ -28,6 +29,8 @@ #include #include +#include +#include #include #include #include @@ -53,26 +56,22 @@ void CuvsFlatIndex::query( float metricArg, Tensor& outDistances, Tensor& outIndices, - bool exactDistance) { - /** - * cuVS doesn't yet support half-precision in bfknn. 
- * Use FlatIndex for float16 for now - */ + bool exactDistance, + const IDSelector* sel) { if (useFloat16_) { - auto stream = resources_->getDefaultStreamCurrentDevice(); - // We need to convert the input to float16 for comparison to ourselves + auto stream = resources_->getDefaultStreamCurrentDevice(); auto inputHalf = convertTensorTemporary( resources_, stream, input); - - FlatIndex::query( + CuvsFlatIndex::query( inputHalf, k, metric, metricArg, outDistances, outIndices, - exactDistance); + exactDistance, + sel); } else { raft::device_resources& handle = resources_->getRaftHandleCurrentDevice(); @@ -99,7 +98,29 @@ void CuvsFlatIndex::query( cuvs::neighbors::brute_force::index idx( handle, index, norms_view, distance, metricArg); - cuvs::neighbors::brute_force::search(handle, idx, search, inds, dists); + + std::optional> bitset_cuvs; + std::optional< + cuvs::neighbors::filtering::bitset_filter> + bitset_filter_cuvs; + cuvs::neighbors::filtering::none_sample_filter none_filter; + + if (sel) { + bitset_cuvs = cuvs::core::bitset( + handle, vectors_.getSize(0), false); + faiss::gpu::convert_to_bitset( + resources_, *sel, bitset_cuvs->view()); + bitset_filter_cuvs.emplace(bitset_cuvs->view()); + } + const cuvs::neighbors::filtering::base_filter& filter_ref = sel + ? 
static_cast( + bitset_filter_cuvs.value()) + : static_cast( + none_filter); + cuvs::neighbors::brute_force::search_params search_params_bf; + + cuvs::neighbors::brute_force::search( + handle, search_params_bf, idx, search, inds, dists, filter_ref); if (metric == MetricType::METRIC_Lp) { raft::linalg::unary_op( @@ -126,18 +147,70 @@ void CuvsFlatIndex::query( float metricArg, Tensor& outDistances, Tensor& outIndices, - bool exactDistance) { + bool exactDistance, + const IDSelector* sel) { FAISS_ASSERT(useFloat16_); - // FIXME: ref https://github.com/rapidsai/raft/issues/1280 - FlatIndex::query( - vecs, - k, - metric, - metricArg, - outDistances, - outIndices, - exactDistance); + raft::device_resources& handle = resources_->getRaftHandleCurrentDevice(); + + auto index = raft::make_device_matrix_view( + vectorsHalf_.data(), + vectorsHalf_.getSize(0), + vectorsHalf_.getSize(1)); + auto search = raft::make_device_matrix_view( + vecs.data(), vecs.getSize(0), vecs.getSize(1)); + + auto inds = raft::make_device_matrix_view( + outIndices.data(), outIndices.getSize(0), outIndices.getSize(1)); + auto dists = raft::make_device_matrix_view( + outDistances.data(), + outDistances.getSize(0), + outDistances.getSize(1)); + + auto distance = metricFaissToCuvs(metric, exactDistance); + + std::optional> norms_view = + raft::make_device_vector_view(norms_.data(), norms_.getSize(0)); + + cuvs::neighbors::brute_force::index idx( + handle, index, norms_view, distance, metricArg); + + std::optional> bitset_cuvs; + std::optional> + bitset_filter_cuvs; + cuvs::neighbors::filtering::none_sample_filter none_filter; + + if (sel) { + bitset_cuvs = cuvs::core::bitset( + handle, vectorsHalf_.getSize(0), false); + faiss::gpu::convert_to_bitset(resources_, *sel, bitset_cuvs->view()); + bitset_filter_cuvs.emplace(bitset_cuvs->view()); + } + const cuvs::neighbors::filtering::base_filter& filter_ref = sel + ? 
static_cast( + bitset_filter_cuvs.value()) + : static_cast( + none_filter); + cuvs::neighbors::brute_force::search_params search_params_bf; + + cuvs::neighbors::brute_force::search( + handle, search_params_bf, idx, search, inds, dists, filter_ref); + + if (metric == MetricType::METRIC_Lp) { + raft::linalg::unary_op( + handle, + raft::make_const_mdspan(dists), + dists, + [metricArg] __device__(const float& a) { + return powf(a, metricArg); + }); + } else if (metric == MetricType::METRIC_JensenShannon) { + raft::linalg::unary_op( + handle, + raft::make_const_mdspan(dists), + dists, + [] __device__(const float& a) { return powf(a, 2); }); + } } } // namespace gpu diff --git a/thirdparty/faiss/faiss/gpu/impl/CuvsFlatIndex.cuh b/thirdparty/faiss/faiss/gpu/impl/CuvsFlatIndex.cuh index b856351cf..f77a67e48 100644 --- a/thirdparty/faiss/faiss/gpu/impl/CuvsFlatIndex.cuh +++ b/thirdparty/faiss/faiss/gpu/impl/CuvsFlatIndex.cuh @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -55,7 +56,8 @@ class CuvsFlatIndex : public FlatIndex { float metricArg, Tensor& outDistances, Tensor& outIndices, - bool exactDistance) override; + bool exactDistance, + const IDSelector* sel) override; void query( Tensor& vecs, @@ -64,7 +66,8 @@ class CuvsFlatIndex : public FlatIndex { float metricArg, Tensor& outDistances, Tensor& outIndices, - bool exactDistance) override; + bool exactDistance, + const IDSelector* sel) override; }; } // namespace gpu diff --git a/thirdparty/faiss/faiss/gpu/impl/CuvsIVFFlat.cu b/thirdparty/faiss/faiss/gpu/impl/CuvsIVFFlat.cu index a605f90c0..59ccb2178 100644 --- a/thirdparty/faiss/faiss/gpu/impl/CuvsIVFFlat.cu +++ b/thirdparty/faiss/faiss/gpu/impl/CuvsIVFFlat.cu @@ -26,14 +26,17 @@ #include #include +#include #include #include #include #include #include +#include #include #include +#include #include #include @@ -100,7 +103,8 @@ void CuvsIVFFlat::search( int nprobe, int k, Tensor& outDistances, - Tensor& outIndices) { + Tensor& outIndices, + 
const IDSelector* sel) { /// NB: The coarse quantizer is ignored here. The user is assumed to have /// called updateQuantizer() to modify the cuVS index if the quantizer was /// modified externally @@ -127,13 +131,31 @@ void CuvsIVFFlat::search( auto out_dists_view = raft::make_device_matrix_view( outDistances.data(), (idx_t)numQueries, (idx_t)k_); + std::optional> bitset_cuvs; + std::optional> + bitset_filter_cuvs; + cuvs::neighbors::filtering::none_sample_filter none_filter; + + if (sel) { + bitset_cuvs = cuvs::core::bitset( + raft_handle, cuvs_index->size(), false); + faiss::gpu::convert_to_bitset(resources_, *sel, bitset_cuvs->view()); + bitset_filter_cuvs.emplace(bitset_cuvs->view()); + } + const cuvs::neighbors::filtering::base_filter& filter_ref = sel + ? static_cast( + bitset_filter_cuvs.value()) + : static_cast( + none_filter); + cuvs::neighbors::ivf_flat::search( raft_handle, pams, *cuvs_index, queries_view, out_inds_view, - out_dists_view); + out_dists_view, + filter_ref); /// Identify NaN rows and mask their nearest neighbors auto nan_flag = raft::make_device_vector(raft_handle, numQueries); diff --git a/thirdparty/faiss/faiss/gpu/impl/CuvsIVFFlat.cuh b/thirdparty/faiss/faiss/gpu/impl/CuvsIVFFlat.cuh index 0ad94f76f..781d02e03 100644 --- a/thirdparty/faiss/faiss/gpu/impl/CuvsIVFFlat.cuh +++ b/thirdparty/faiss/faiss/gpu/impl/CuvsIVFFlat.cuh @@ -24,6 +24,7 @@ #pragma once #include +#include #include #include @@ -63,7 +64,8 @@ class CuvsIVFFlat : public IVFFlat { int nprobe, int k, Tensor& outDistances, - Tensor& outIndices) override; + Tensor& outIndices, + const IDSelector* sel = nullptr) override; /// Performs search when we are already given the IVF cells to look at /// (GpuIndexIVF::search_preassigned implementation) diff --git a/thirdparty/faiss/faiss/gpu/impl/CuvsIVFPQ.cu b/thirdparty/faiss/faiss/gpu/impl/CuvsIVFPQ.cu index f2f293026..1711b2f27 100644 --- a/thirdparty/faiss/faiss/gpu/impl/CuvsIVFPQ.cu +++ 
b/thirdparty/faiss/faiss/gpu/impl/CuvsIVFPQ.cu @@ -23,17 +23,20 @@ #include #include +#include #include #include #include #include +#include #include #include #include #include #include +#include namespace faiss { namespace gpu { @@ -359,7 +362,8 @@ void CuvsIVFPQ::search( int nprobe, int k, Tensor& outDistances, - Tensor& outIndices) { + Tensor& outIndices, + const IDSelector* sel) { uint32_t numQueries = queries.getSize(0); uint32_t cols = queries.getSize(1); idx_t k_ = std::min(static_cast(k), cuvs_index->size()); @@ -383,13 +387,31 @@ void CuvsIVFPQ::search( auto out_dists_view = raft::make_device_matrix_view( outDistances.data(), (idx_t)numQueries, (idx_t)k_); + std::optional> bitset_cuvs; + std::optional> + bitset_filter_cuvs; + cuvs::neighbors::filtering::none_sample_filter none_filter; + + if (sel) { + bitset_cuvs = cuvs::core::bitset( + raft_handle, cuvs_index->size(), false); + faiss::gpu::convert_to_bitset(resources_, *sel, bitset_cuvs->view()); + bitset_filter_cuvs.emplace(bitset_cuvs->view()); + } + const cuvs::neighbors::filtering::base_filter& filter_ref = sel + ? 
static_cast( + bitset_filter_cuvs.value()) + : static_cast( + none_filter); + cuvs::neighbors::ivf_pq::search( raft_handle, pams, *cuvs_index, queries_view, out_inds_view, - out_dists_view); + out_dists_view, + filter_ref); /// Identify NaN rows and mask their nearest neighbors auto nan_flag = raft::make_device_vector(raft_handle, numQueries); diff --git a/thirdparty/faiss/faiss/gpu/impl/CuvsIVFPQ.cuh b/thirdparty/faiss/faiss/gpu/impl/CuvsIVFPQ.cuh index db6c6b626..59ef996a0 100644 --- a/thirdparty/faiss/faiss/gpu/impl/CuvsIVFPQ.cuh +++ b/thirdparty/faiss/faiss/gpu/impl/CuvsIVFPQ.cuh @@ -76,7 +76,8 @@ class CuvsIVFPQ : public IVFPQ { int nprobe, int k, Tensor& outDistances, - Tensor& outIndices) override; + Tensor& outIndices, + const IDSelector* sel = nullptr) override; /// Performs search when we are already given the IVF cells to look at /// (GpuIndexIVF::search_preassigned implementation) diff --git a/thirdparty/faiss/faiss/gpu/impl/FlatIndex.cu b/thirdparty/faiss/faiss/gpu/impl/FlatIndex.cu index 0f2273789..e0b5335cf 100644 --- a/thirdparty/faiss/faiss/gpu/impl/FlatIndex.cu +++ b/thirdparty/faiss/faiss/gpu/impl/FlatIndex.cu @@ -116,7 +116,8 @@ void FlatIndex::query( float metricArg, Tensor& outDistances, Tensor& outIndices, - bool exactDistance) { + bool exactDistance, + const IDSelector* sel) { auto stream = resources_->getDefaultStreamCurrentDevice(); if (useFloat16_) { @@ -130,7 +131,8 @@ void FlatIndex::query( metricArg, outDistances, outIndices, - exactDistance); + exactDistance, + sel); } else { bfKnnOnDevice( resources_, @@ -157,7 +159,8 @@ void FlatIndex::query( float metricArg, Tensor& outDistances, Tensor& outIndices, - bool exactDistance) { + bool exactDistance, + const IDSelector* sel) { FAISS_ASSERT(useFloat16_); bfKnnOnDevice( diff --git a/thirdparty/faiss/faiss/gpu/impl/FlatIndex.cuh b/thirdparty/faiss/faiss/gpu/impl/FlatIndex.cuh index 6e6f3bf45..79e315d2b 100644 --- a/thirdparty/faiss/faiss/gpu/impl/FlatIndex.cuh +++ 
b/thirdparty/faiss/faiss/gpu/impl/FlatIndex.cuh @@ -25,6 +25,7 @@ #include #include +#include #include #include @@ -67,7 +68,8 @@ class FlatIndex { float metricArg, Tensor& outDistances, Tensor& outIndices, - bool exactDistance); + bool exactDistance, + const IDSelector* sel = nullptr); virtual void query( Tensor& vecs, @@ -76,7 +78,8 @@ class FlatIndex { float metricArg, Tensor& outDistances, Tensor& outIndices, - bool exactDistance); + bool exactDistance, + const IDSelector* sel = nullptr); /// Compute residual for set of vectors void computeResidual( diff --git a/thirdparty/faiss/faiss/gpu/impl/IVFBase.cuh b/thirdparty/faiss/faiss/gpu/impl/IVFBase.cuh index 1c8fc4547..e2e1ea6de 100644 --- a/thirdparty/faiss/faiss/gpu/impl/IVFBase.cuh +++ b/thirdparty/faiss/faiss/gpu/impl/IVFBase.cuh @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -95,7 +96,8 @@ class IVFBase { int nprobe, int k, Tensor& outDistances, - Tensor& outIndices) = 0; + Tensor& outIndices, + const IDSelector* sel = nullptr) = 0; /// Performs search when we are already given the IVF cells to look at /// (GpuIndexIVF::search_preassigned implementation) diff --git a/thirdparty/faiss/faiss/gpu/impl/IVFFlat.cu b/thirdparty/faiss/faiss/gpu/impl/IVFFlat.cu index b6a8a3fd9..77b298793 100644 --- a/thirdparty/faiss/faiss/gpu/impl/IVFFlat.cu +++ b/thirdparty/faiss/faiss/gpu/impl/IVFFlat.cu @@ -169,7 +169,8 @@ void IVFFlat::search( int nprobe, int k, Tensor& outDistances, - Tensor& outIndices) { + Tensor& outIndices, + const IDSelector* sel) { auto stream = resources_->getDefaultStreamCurrentDevice(); // These are caught at a higher level diff --git a/thirdparty/faiss/faiss/gpu/impl/IVFFlat.cuh b/thirdparty/faiss/faiss/gpu/impl/IVFFlat.cuh index 27f51b56c..f8b55a58c 100644 --- a/thirdparty/faiss/faiss/gpu/impl/IVFFlat.cuh +++ b/thirdparty/faiss/faiss/gpu/impl/IVFFlat.cuh @@ -37,7 +37,8 @@ class IVFFlat : public IVFBase { int nprobe, int k, Tensor& outDistances, - Tensor& 
outIndices) override; + Tensor& outIndices, + const IDSelector* sel = nullptr) override; /// Performs search when we are already given the IVF cells to look at /// (GpuIndexIVF::search_preassigned implementation) diff --git a/thirdparty/faiss/faiss/gpu/impl/IVFPQ.cu b/thirdparty/faiss/faiss/gpu/impl/IVFPQ.cu index 7cbaf2cd4..0d6bcb2bc 100644 --- a/thirdparty/faiss/faiss/gpu/impl/IVFPQ.cu +++ b/thirdparty/faiss/faiss/gpu/impl/IVFPQ.cu @@ -494,7 +494,8 @@ void IVFPQ::search( int nprobe, int k, Tensor& outDistances, - Tensor& outIndices) { + Tensor& outIndices, + const IDSelector* sel) { // These are caught at a higher level FAISS_ASSERT(nprobe <= GPU_MAX_SELECTION_K); FAISS_ASSERT(k <= GPU_MAX_SELECTION_K); diff --git a/thirdparty/faiss/faiss/gpu/impl/IVFPQ.cuh b/thirdparty/faiss/faiss/gpu/impl/IVFPQ.cuh index 2b20f0ee6..c8ab03236 100644 --- a/thirdparty/faiss/faiss/gpu/impl/IVFPQ.cuh +++ b/thirdparty/faiss/faiss/gpu/impl/IVFPQ.cuh @@ -53,7 +53,8 @@ class IVFPQ : public IVFBase { int nprobe, int k, Tensor& outDistances, - Tensor& outIndices) override; + Tensor& outIndices, + const IDSelector* sel = nullptr) override; /// Performs search when we are already given the IVF cells to look at /// (GpuIndexIVF::search_preassigned implementation) diff --git a/thirdparty/faiss/faiss/gpu/test/TestGpuFilterConvert.cu b/thirdparty/faiss/faiss/gpu/test/TestGpuFilterConvert.cu index 08343097c..7751f28c1 100644 --- a/thirdparty/faiss/faiss/gpu/test/TestGpuFilterConvert.cu +++ b/thirdparty/faiss/faiss/gpu/test/TestGpuFilterConvert.cu @@ -53,6 +53,8 @@ struct Options { }; void run_complex() { + using bitset_t = uint32_t; + using indexing_t = int64_t; Options spec; faiss::gpu::StandardGpuResources res; res.noTempMemory(); @@ -77,7 +79,7 @@ void run_complex() { auto or_selector = faiss::IDSelectorOr(&range_selector, &array_selector); auto bitmap_faiss_cpu = std::vector((spec.bitset_len + 8) / 8); - for (uint32_t i = 0; i < bitmap_faiss_cpu.size(); i++) { + for (indexing_t i = 0; i < 
bitmap_faiss_cpu.size(); i++) { bitmap_faiss_cpu[i] = (uint8_t)faiss::gpu::randVal(0, 255); } auto bitmap_selector = @@ -88,19 +90,19 @@ void run_complex() { faiss::IDSelectorXOr(&or_selector, ¬_bitmap_selector); // convert to cuVS bitset - auto bitset = cuvs::core::bitset( + auto bitset = cuvs::core::bitset( raft_handle, spec.bitset_len, false); faiss::gpu::convert_to_bitset(gpuRes.get(), xor_selector, bitset.view()); // verify auto bitset_converted_cpu = - raft::make_host_vector(bitset.n_elements()); + raft::make_host_vector(bitset.n_elements()); auto bitset_converted_cpu_view = - cuvs::core::bitset_view( + cuvs::core::bitset_view( bitset_converted_cpu.data_handle(), spec.bitset_len); raft::copy(raft_handle, bitset_converted_cpu.view(), bitset.to_mdspan()); raft::resource::sync_stream(raft_handle); - for (uint32_t i = 0; i < spec.bitset_len; i++) { + for (indexing_t i = 0; i < spec.bitset_len; i++) { if (bitset_converted_cpu_view.test(i) != xor_selector.is_member(i)) { ASSERT_TRUE( testing::AssertionFailure() @@ -120,23 +122,24 @@ void run_range() { gpuRes->getRaftHandleCurrentDevice(); // take random imin and imax, check all ids using bitset_t = uint32_t; + using indexing_t = int64_t; auto imin = faiss::gpu::randVal(0, spec.bitset_len - 2); auto imax = faiss::gpu::randVal(1, spec.bitset_len - 1); if (imin > imax) std::swap(imin, imax); auto selector = faiss::IDSelectorRange(imin, imax); - auto bitset = cuvs::core::bitset( + auto bitset = cuvs::core::bitset( raft_handle, spec.bitset_len, false); auto nbits = sizeof(bitset_t) * 8; faiss::gpu::convert_to_bitset(gpuRes.get(), selector, bitset.view()); auto bitset_converted_cpu = - raft::make_host_vector(bitset.n_elements()); + raft::make_host_vector(bitset.n_elements()); raft::copy(raft_handle, bitset_converted_cpu.view(), bitset.to_mdspan()); raft::resource::sync_stream(raft_handle); - auto bitset_view_cpu = cuvs::core::bitset_view( + auto bitset_view_cpu = cuvs::core::bitset_view( 
bitset_converted_cpu.data_handle(), spec.bitset_len); - for (uint64_t i = 0; i < spec.bitset_len; i++) { + for (indexing_t i = 0; i < spec.bitset_len; i++) { if (bitset_view_cpu.test(i) != selector.is_member(i)) { ASSERT_TRUE( testing::AssertionFailure() @@ -154,28 +157,30 @@ void run_bitmap() { faiss::gpu::StandardGpuResources res; res.noTempMemory(); + using bitset_t = uint32_t; + using indexing_t = int64_t; auto gpuRes = res.getResources(); const raft::device_resources& raft_handle = gpuRes->getRaftHandleCurrentDevice(); // generate random bitmap selector auto bitmap_faiss_cpu = std::vector((spec.bitset_len + 8) / 8); - for (uint32_t i = 0; i < bitmap_faiss_cpu.size(); i++) { + for (indexing_t i = 0; i < bitmap_faiss_cpu.size(); i++) { bitmap_faiss_cpu[i] = (uint8_t)faiss::gpu::randVal(0, 255); } auto bitmap_selector = faiss::IDSelectorBitmap(spec.bitset_len, bitmap_faiss_cpu.data()); - auto bitset = cuvs::core::bitset( + auto bitset = cuvs::core::bitset( raft_handle, spec.bitset_len, false); faiss::gpu::convert_to_bitset(gpuRes.get(), bitmap_selector, bitset.view()); auto bitset_converted_cpu = - raft::make_host_vector(bitset.n_elements()); + raft::make_host_vector(bitset.n_elements()); raft::copy(raft_handle, bitset_converted_cpu.view(), bitset.to_mdspan()); raft::resource::sync_stream(raft_handle); auto bitset_converted_cpu_view = - cuvs::core::bitset_view( + cuvs::core::bitset_view( bitset_converted_cpu.data_handle(), spec.bitset_len); - for (uint32_t i = 0; i < spec.bitset_len; i++) { + for (indexing_t i = 0; i < spec.bitset_len; i++) { if (bitset_converted_cpu_view.test(i) != bitmap_selector.is_member(i)) { ASSERT_TRUE( testing::AssertionFailure() @@ -189,6 +194,8 @@ void run_bitmap() { } void run_array() { + using bitset_t = uint32_t; + using indexing_t = int64_t; Options spec; faiss::gpu::StandardGpuResources res; res.noTempMemory(); @@ -203,18 +210,18 @@ void run_array() { } auto array_selector = faiss::IDSelectorArray(n, array_selector_indices.data()); 
- auto bitset = cuvs::core::bitset( + auto bitset = cuvs::core::bitset( raft_handle, spec.bitset_len, false); faiss::gpu::convert_to_bitset(gpuRes.get(), array_selector, bitset.view()); auto bitset_converted_cpu = - raft::make_host_vector(bitset.n_elements()); + raft::make_host_vector(bitset.n_elements()); raft::copy(raft_handle, bitset_converted_cpu.view(), bitset.to_mdspan()); raft::resource::sync_stream(raft_handle); auto bitset_converted_cpu_view = - cuvs::core::bitset_view( + cuvs::core::bitset_view( bitset_converted_cpu.data_handle(), spec.bitset_len); - for (uint32_t i = 0; i < spec.bitset_len; i++) { + for (indexing_t i = 0; i < spec.bitset_len; i++) { if (bitset_converted_cpu_view.test(i) != array_selector.is_member(i)) { ASSERT_TRUE( testing::AssertionFailure() diff --git a/thirdparty/faiss/faiss/gpu/test/TestGpuIndexBinaryCagra.cu b/thirdparty/faiss/faiss/gpu/test/TestGpuIndexBinaryCagra.cu index 79632bef0..93aa734a4 100644 --- a/thirdparty/faiss/faiss/gpu/test/TestGpuIndexBinaryCagra.cu +++ b/thirdparty/faiss/faiss/gpu/test/TestGpuIndexBinaryCagra.cu @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -432,6 +433,74 @@ TEST(TestGpuIndexBinaryCagra, CopyFrom_ITERATIVE_SEARCH) { copyFromTest(faiss::gpu::graph_build_algo::ITERATIVE_SEARCH, 0.98); } +void testIDSelectorBinaryCagra(std::string selectorName) { + Options opt; + auto trainVecs = faiss::gpu::randBinaryVecs(opt.numTrain, opt.dim); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexCagraConfig config; + config.device = opt.device; + config.graph_degree = opt.graphDegree; + config.intermediate_graph_degree = opt.intermediateGraphDegree; + config.build_algo = faiss::gpu::graph_build_algo::NN_DESCENT; + config.nn_descent_niter = 20; + + faiss::gpu::GpuIndexBinaryCagra gpuIndex(&res, opt.dim, config); + gpuIndex.train(opt.numTrain, trainVecs.data()); + + auto queryVecs = faiss::gpu::randBinaryVecs(opt.numQuery, opt.dim); + 
faiss::gpu::TestIDSelectorStruct selector_struct(opt.numTrain); + faiss::gpu::SearchParametersCagra search_params; + // TODO: For CI test purposes only, remove this + // for (auto& [selectorName, selector] : selector_struct.selector_map) { + auto selector = selector_struct.selector_map[selectorName].get(); + search_params.sel = selector; + std::vector distances(opt.numQuery * opt.k, 0); + std::vector labels(opt.numQuery * opt.k, -1); + gpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + distances.data(), + labels.data(), + &search_params); + for (int i = 0; i < opt.numQuery * opt.k; ++i) { + if (labels[i] >= 0) { + EXPECT_TRUE(selector->is_member(labels[i])) + << "Label " << labels[i] << " @ " << i << " not in " + << selectorName << " selector"; + } + } + //} +} + +TEST(TestGpuIndexBinaryCagra, IDSelector_Range) { + testIDSelectorBinaryCagra("Range"); +} +TEST(TestGpuIndexBinaryCagra, IDSelector_Array) { + testIDSelectorBinaryCagra("Array"); +} +TEST(TestGpuIndexBinaryCagra, IDSelector_Batch) { + testIDSelectorBinaryCagra("Batch"); +} +TEST(TestGpuIndexBinaryCagra, IDSelector_Bitmap) { + testIDSelectorBinaryCagra("Bitmap"); +} +TEST(TestGpuIndexBinaryCagra, IDSelector_Not) { + testIDSelectorBinaryCagra("Not"); +} +TEST(TestGpuIndexBinaryCagra, IDSelector_And) { + testIDSelectorBinaryCagra("And"); +} +TEST(TestGpuIndexBinaryCagra, IDSelector_Or) { + testIDSelectorBinaryCagra("Or"); +} +TEST(TestGpuIndexBinaryCagra, IDSelector_XOr) { + testIDSelectorBinaryCagra("XOr"); +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); diff --git a/thirdparty/faiss/faiss/gpu/test/TestGpuIndexCagra.cu b/thirdparty/faiss/faiss/gpu/test/TestGpuIndexCagra.cu index bd1c216d3..1f94b6dfb 100644 --- a/thirdparty/faiss/faiss/gpu/test/TestGpuIndexCagra.cu +++ b/thirdparty/faiss/faiss/gpu/test/TestGpuIndexCagra.cu @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -663,7 +664,7 @@ void copyFromTest(faiss::MetricType metric, 
double expected_recall) { res.noTempMemory(); // convert to gpu index - faiss::gpu::GpuIndexCagra copiedGpuIndex(&res, cpuIndex.d, metric); + faiss::gpu::GpuIndexCagra copiedGpuIndex(&res, opt.dim, metric); copiedGpuIndex.copyFrom(&cpuIndex); // train gpu index @@ -781,7 +782,7 @@ void copyFromTestFP16(faiss::MetricType metric, double expected_recall) { res.noTempMemory(); // convert to gpu index - faiss::gpu::GpuIndexCagra copiedGpuIndex(&res, cpuIndex.d, metric); + faiss::gpu::GpuIndexCagra copiedGpuIndex(&res, opt.dim, metric); copiedGpuIndex.copyFrom_ex(&cpuIndex, faiss::NumericType::Float16); // train gpu index @@ -794,7 +795,7 @@ void copyFromTestFP16(faiss::MetricType metric, double expected_recall) { // faiss::gpu::GpuIndexCagra gpuIndex(&res, opt.dim, metric, config); // gpuIndex.train(opt.numTrain, trainVecs.data()); - faiss::gpu::GpuIndexCagra gpuIndex(&res, cpuIndex.d, metric, config); + faiss::gpu::GpuIndexCagra gpuIndex(&res, opt.dim, metric, config); // Create half vector std::vector<__half> trainVecs_half(trainVecs.size()); @@ -895,6 +896,53 @@ TEST(TestGpuIndexCagra, Float16_CopyFrom_IP) { copyFromTestFP16(faiss::METRIC_INNER_PRODUCT, 0.98); } +void testIDSelectorCagra(faiss::MetricType metricType) { + Options opt; + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + if (metricType == faiss::METRIC_INNER_PRODUCT) { + faiss::fvec_renorm_L2(opt.numTrain, opt.dim, trainVecs.data()); + } + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexCagraConfig config; + config.device = opt.device; + config.graph_degree = opt.graphDegree; + config.intermediate_graph_degree = opt.intermediateGraphDegree; + // Use only IVF_PQ to avoid NN_DESCENT + IP combination + config.build_algo = faiss::gpu::graph_build_algo::IVF_PQ; + + faiss::gpu::GpuIndexCagra gpuIndex(&res, opt.dim, metricType, config); + gpuIndex.train(opt.numTrain, trainVecs.data()); + + auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + 
if (metricType == faiss::METRIC_INNER_PRODUCT) { + faiss::fvec_renorm_L2(opt.numQuery, opt.dim, queryVecs.data()); + } + + faiss::gpu::TestIDSelectorStruct selector_struct(opt.numTrain); + faiss::gpu::SearchParametersCagra search_params; + for (auto& [selectorName, selector] : selector_struct.selector_map) { + search_params.sel = selector.get(); + faiss::gpu::testIDSelectorSearch( + &gpuIndex, + &search_params, + queryVecs, + opt.numQuery, + opt.k, + selectorName); + } +} + +TEST(TestGpuIndexCagra, IDSelector_L2) { + testIDSelectorCagra(faiss::METRIC_L2); +} + +TEST(TestGpuIndexCagra, IDSelector_IP) { + testIDSelectorCagra(faiss::METRIC_INNER_PRODUCT); +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); diff --git a/thirdparty/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp b/thirdparty/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp index 734ab41fd..3b2ca1c19 100644 --- a/thirdparty/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +++ b/thirdparty/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -761,6 +762,51 @@ TEST(TestCuvsGpuIndexFlat, SearchAndReconstruct) { } #endif +void testIDSelectorFlat(faiss::MetricType metricType) { + int numAdd = faiss::gpu::randVal(2000, 5000); + int dim = faiss::gpu::randVal(64, 200); + int numQuery = faiss::gpu::randVal(32, 100); + int k = faiss::gpu::randVal(10, 30); + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + auto queryVecs = faiss::gpu::randVecs(numQuery, dim); + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.use_cuvs = true; + for (bool useFloat16 : {false, true}) { + config.useFloat16 = useFloat16; + faiss::gpu::GpuIndexFlat gpuIndex(&res, dim, metricType, config); + std::vector addVecs = faiss::gpu::randVecs(numAdd, dim); + gpuIndex.add(numAdd, addVecs.data()); + + faiss::gpu::TestIDSelectorStruct selector_struct(numAdd); 
+ faiss::SearchParameters search_params; + for (auto& [selectorName, selector] : selector_struct.selector_map) { + search_params.sel = selector.get(); + faiss::gpu::testIDSelectorSearch( + &gpuIndex, + &search_params, + queryVecs, + numQuery, + k, + selectorName); + } + } +} + +#if defined USE_NVIDIA_CUVS +TEST(TestCuvsGpuIndexFlat, IDSelector_L2) { + testIDSelectorFlat(faiss::METRIC_L2); +} + +TEST(TestCuvsGpuIndexFlat, IDSelector_IP) { + testIDSelectorFlat(faiss::METRIC_INNER_PRODUCT); +} +#endif + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); diff --git a/thirdparty/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/thirdparty/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index bc4eb8980..34b0218f7 100644 --- a/thirdparty/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/thirdparty/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -917,6 +918,53 @@ TEST(TestGpuIndexIVFFlat, Reconstruct_n) { EXPECT_EQ(gpuVals, cpuVals); } +void testIDSelectorIVFFlat(faiss::MetricType metricType) { + Options opt; + + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = opt.device; + config.indicesOptions = faiss::gpu::INDICES_64_BIT; + config.use_cuvs = true; + + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, opt.dim, opt.numCentroids, metricType, config); + gpuIndex.nprobe = opt.nprobe; + + gpuIndex.train(opt.numTrain, trainVecs.data()); + gpuIndex.add(opt.numAdd, addVecs.data()); + + auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + faiss::gpu::TestIDSelectorStruct selector_struct(opt.numAdd); + faiss::SearchParametersIVF search_params; + search_params.nprobe = opt.nprobe; + for (auto& [selectorName, selector] : selector_struct.selector_map) { + 
search_params.sel = selector.get(); + faiss::gpu::testIDSelectorSearch( + &gpuIndex, + &search_params, + queryVecs, + opt.numQuery, + opt.k, + selectorName); + } +} + +#if defined USE_NVIDIA_CUVS +TEST(TestCuvsGpuIndexIVFFlat, IDSelector_L2) { + testIDSelectorIVFFlat(faiss::METRIC_L2); +} + +TEST(TestCuvsGpuIndexIVFFlat, IDSelector_IP) { + testIDSelectorIVFFlat(faiss::METRIC_INNER_PRODUCT); +} +#endif + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); diff --git a/thirdparty/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp b/thirdparty/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp index a9d39e4d7..3af54342e 100644 --- a/thirdparty/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +++ b/thirdparty/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp @@ -6,11 +6,13 @@ */ #include +#include #include #include #include #include #include +#include #include #include #include @@ -102,7 +104,11 @@ struct Options { } float getCompareEpsilon() const { - return 0.035f; + // With very low dimensionality (e.g., dim=4, codes=2 giving + // dimPerSubQuantizer=2), L2 distances can be very small + // (near-zero), causing relative error comparisons to be + // unstable despite tiny absolute differences. + return (dim <= 8) ? 0.15f : 0.035f; } float getPctMaxDiff1() const { @@ -892,6 +898,62 @@ TEST(TestGpuIndexIVFPQ, UnifiedMemory) { #endif } +void testIDSelectorIVFPQ(faiss::MetricType metricType) { + Options opt; + + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::IndexFlatL2 quantizerL2(opt.dim); + faiss::IndexFlatIP quantizerIP(opt.dim); + faiss::Index* quantizer = metricType == faiss::METRIC_L2 + ? 
(faiss::Index*)&quantizerL2 + : (faiss::Index*)&quantizerIP; + + faiss::IndexIVFPQ cpuIndex( + quantizer, opt.dim, opt.numCentroids, opt.codes, opt.bitsPerCode); + cpuIndex.metric_type = metricType; + cpuIndex.train(opt.numTrain, trainVecs.data()); + cpuIndex.add(opt.numAdd, addVecs.data()); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFPQConfig config; + config.device = opt.device; + config.indicesOptions = faiss::gpu::INDICES_64_BIT; + config.interleavedLayout = true; + config.use_cuvs = true; + + faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); + gpuIndex.nprobe = opt.nprobe; + + auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + faiss::gpu::TestIDSelectorStruct selector_struct(opt.numAdd); + faiss::SearchParametersIVF search_params; + search_params.nprobe = opt.nprobe; + for (auto& [selectorName, selector] : selector_struct.selector_map) { + search_params.sel = selector.get(); + faiss::gpu::testIDSelectorSearch( + &gpuIndex, + &search_params, + queryVecs, + opt.numQuery, + opt.k, + selectorName); + } +} + +#if defined USE_NVIDIA_CUVS +TEST(TestCuvsGpuIndexIVFPQ, IDSelector_L2) { + testIDSelectorIVFPQ(faiss::METRIC_L2); +} + +TEST(TestCuvsGpuIndexIVFPQ, IDSelector_IP) { + testIDSelectorIVFPQ(faiss::METRIC_INNER_PRODUCT); +} +#endif + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); diff --git a/thirdparty/faiss/faiss/gpu/test/TestUtils.cpp b/thirdparty/faiss/faiss/gpu/test/TestUtils.cpp index 1357cfcb4..22ddb73a3 100644 --- a/thirdparty/faiss/faiss/gpu/test/TestUtils.cpp +++ b/thirdparty/faiss/faiss/gpu/test/TestUtils.cpp @@ -6,6 +6,7 @@ */ #include +#include #include #include #include @@ -429,5 +430,76 @@ void compareLists( } } +TestIDSelectorStruct::TestIDSelectorStruct(int numAdd) { + // Range selector [20%, 80%] of the database + size_t min_id = numAdd / 5; + size_t max_id = numAdd * 4 / 5; + selector_map["Range"] = + std::make_unique(min_id, max_id); + + // Array 
selector (every 3rd element) + array_ids.clear(); + for (int i = 0; i < numAdd; i += 3) { + array_ids.push_back(i); + } + selector_map["Array"] = std::make_unique( + array_ids.size(), array_ids.data()); + + // Batch selector (every 5th element) + batch_ids.clear(); + for (int i = 1; i < numAdd; i += 5) { + batch_ids.push_back(i); + } + selector_map["Batch"] = std::make_unique( + batch_ids.size(), batch_ids.data()); + + // Bitmap selector (every 4th element selected) + size_t bitmap_size = (numAdd + 7) / 8; + bitmap.resize(bitmap_size, 0); + for (int i = 0; i < numAdd; i += 4) { + int byte_idx = i / 8; + int bit_idx = i % 8; + bitmap[byte_idx] |= (1 << bit_idx); + } + selector_map["Bitmap"] = + std::make_unique(numAdd, bitmap.data()); + + selector_map["Not"] = + std::make_unique(selector_map["Range"].get()); + selector_map["And"] = std::make_unique( + selector_map["Range"].get(), selector_map["Array"].get()); + selector_map["Or"] = std::make_unique( + selector_map["Range"].get(), selector_map["Batch"].get()); + selector_map["XOr"] = std::make_unique( + selector_map["Not"].get(), selector_map["Array"].get()); +} + +void testIDSelectorSearch( + faiss::Index* index, + faiss::SearchParameters* search_params, + const std::vector& queryVecs, + int numQuery, + int k, + const std::string& selectorName) { + FAISS_ASSERT(search_params && search_params->sel); + std::vector distances(numQuery * k, 0); + std::vector labels(numQuery * k, -1); + index->search( + numQuery, + queryVecs.data(), + k, + distances.data(), + labels.data(), + search_params); + faiss::IDSelector* selector = search_params->sel; + for (int i = 0; i < numQuery * k; ++i) { + if (labels[i] >= 0) { + EXPECT_TRUE(selector->is_member(labels[i])) + << "Label " << labels[i] << " @ " << i << " not in " + << selectorName << " selector"; + } + } +} + } // namespace gpu } // namespace faiss diff --git a/thirdparty/faiss/faiss/gpu/test/TestUtils.h b/thirdparty/faiss/faiss/gpu/test/TestUtils.h index 02ef37ccf..45b70da2a 
100644 --- a/thirdparty/faiss/faiss/gpu/test/TestUtils.h +++ b/thirdparty/faiss/faiss/gpu/test/TestUtils.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -141,5 +142,27 @@ void testIVFEquality(A& cpuIndex, B& gpuIndex) { } } +/// Run search with the given search_params and verify all returned labels are +/// members of the selector. Works with any Index that supports search with +/// SearchParameters. +void testIDSelectorSearch( + faiss::Index* index, + faiss::SearchParameters* search_params, + const std::vector& queryVecs, + int numQuery, + int k, + const std::string& selectorName); + +// Structure to hold all IDSelector instances +struct TestIDSelectorStruct { + // Storage for selectors that need it + std::vector array_ids; + std::vector batch_ids; + std::vector bitmap; + std::map> selector_map; + + explicit TestIDSelectorStruct(int numAdd); +}; + } // namespace gpu } // namespace faiss diff --git a/thirdparty/faiss/faiss/gpu/test/test_cagra.py b/thirdparty/faiss/faiss/gpu/test/test_cagra.py index 972ce5e29..d9e1b27a6 100644 --- a/thirdparty/faiss/faiss/gpu/test/test_cagra.py +++ b/thirdparty/faiss/faiss/gpu/test/test_cagra.py @@ -150,6 +150,27 @@ def test_interop_L2_Int8(self): def test_interop_IP_Int8(self): self.do_interop(faiss.METRIC_INNER_PRODUCT, faiss.Int8) + def test_base_level_only_range_search(self): + d = 32 + nb = 1000 + nq = 10 + ds = datasets.SyntheticDataset(d, 0, nb, nq) + data_base = ds.get_database() + data_query = ds.get_queries() + + res = faiss.StandardGpuResources() + index = faiss.GpuIndexCagra(res, d, faiss.METRIC_L2) + index.train(data_base, numeric_type=faiss.Float32) + + cpu_index = faiss.index_gpu_to_cpu(index) + cpu_index.base_level_only = True + cpu_index.num_base_level_search_entrypoints = 8 + + radius = np.float32(1e9) + lims, _, _ = cpu_index.range_search(data_query, radius) + counts = lims[1:] - lims[:-1] + self.assertTrue(np.all(counts > 0)) + @unittest.skipIf( "CUVS" not in 
faiss.get_compile_options(), diff --git a/thirdparty/faiss/faiss/gpu/utils/CuvsFilterConvert.cu b/thirdparty/faiss/faiss/gpu/utils/CuvsFilterConvert.cu index bedbcb642..037ca8600 100644 --- a/thirdparty/faiss/faiss/gpu/utils/CuvsFilterConvert.cu +++ b/thirdparty/faiss/faiss/gpu/utils/CuvsFilterConvert.cu @@ -36,13 +36,13 @@ namespace faiss::gpu { template RAFT_KERNEL set_range_kernel( bitset_t* bitset_data, - uint32_t imin, - uint32_t imax, - uint32_t n_elements_to_set) { - uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x; + int64_t imin, + int64_t imax, + int64_t n_elements_to_set) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; const uint32_t nbits = sizeof(bitset_t) * 8; - uint32_t current_index = (imin / nbits) + idx; + int64_t current_index = (imin / nbits) + idx; bitset_t mask = 0; if (idx < n_elements_to_set) { if (n_elements_to_set == 1) { @@ -70,7 +70,7 @@ RAFT_KERNEL set_range_kernel( void convert_to_bitset_range( raft::resources const& res, const faiss::IDSelectorRange& selector, - cuvs::core::bitset_view bitset) { + cuvs::core::bitset_view bitset) { RAFT_EXPECTS( bitset.size() >= selector.imax, "IDSelectorRange is out of range for the given bitset"); @@ -79,10 +79,10 @@ void convert_to_bitset_range( if (original_nbits == 0) { original_nbits = nbits; } - uint32_t imin = selector.imin; - uint32_t imax = selector.imax; + int64_t imin = selector.imin; + int64_t imax = selector.imax; - uint32_t n_elements_to_set = 1 + (imax + original_nbits) / original_nbits; + int64_t n_elements_to_set = 1 + (imax + original_nbits) / original_nbits; n_elements_to_set -= (imin + original_nbits) / original_nbits; auto stream = raft::resource::get_cuda_stream(res); @@ -107,14 +107,14 @@ void convert_to_bitset_range( void convert_to_bitset_array( raft::resources const& res, const faiss::IDSelectorArray& selector, - cuvs::core::bitset_view bitset) { - uint32_t n = selector.n; + cuvs::core::bitset_view bitset) { + int64_t n = selector.n; auto d_indexes_to_set = - 
raft::make_device_vector(res, n); + raft::make_device_vector(res, n); raft::copy( res, d_indexes_to_set.view(), - raft::make_host_vector_view( + raft::make_host_vector_view( selector.ids, n)); thrust::for_each_n( raft::resource::get_thrust_policy(res), @@ -128,13 +128,13 @@ void convert_to_bitset_array( RAFT_KERNEL set_bitmap_kernel( uint32_t* new_bitset_data, uint8_t* original_bitmap_data, - uint32_t n_elements, - uint32_t bitset_original_nbits) { - uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x; - uint32_t bit_index = 0; - uint32_t bit_offset = 0; + int64_t n_elements, + int64_t bitset_original_nbits) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + int64_t bit_index = 0; + int64_t bit_offset = 0; raft::core::compute_original_nbits_position( - uint32_t{8}, bitset_original_nbits, idx * 8, bit_index, bit_offset); + int64_t{8}, bitset_original_nbits, idx * 8, bit_index, bit_offset); if (idx < n_elements) { uint32_t mask = original_bitmap_data[idx]; atomicOr(&new_bitset_data[bit_index], mask << bit_offset); @@ -144,24 +144,25 @@ RAFT_KERNEL set_bitmap_kernel( void convert_to_bitset_bitmap( raft::resources const& res, const faiss::IDSelectorBitmap& selector, - cuvs::core::bitset_view bitset) { - uint32_t n = selector.n; + cuvs::core::bitset_view bitset) { + auto n = selector.n; auto bitset_original_nbits = bitset.get_original_nbits(); if (bitset_original_nbits == 0) { bitset_original_nbits = sizeof(uint32_t) * 8; } RAFT_EXPECTS( bitset.size() == n, - "IDSelectorBitmap is out of range for the given bitset"); + "IDSelectorBitmap is out of range for the given bitset: %ld != %zu", + bitset.size(), + n); auto stream = raft::resource::get_cuda_stream(res); auto n_elements = (selector.n + 7) / 8; - auto d_bitmap = - raft::make_device_vector(res, n_elements); + auto d_bitmap = raft::make_device_vector(res, n_elements); auto d_bitmap_ptr = d_bitmap.data_handle(); raft::copy( res, d_bitmap.view(), - raft::make_host_vector_view( + raft::make_host_vector_view( 
selector.bitmap, n_elements)); const int threads_per_block = 256; @@ -174,15 +175,15 @@ void convert_to_bitset_bitmap( void convert_to_bitset_bruteforce( raft::resources const& res, const faiss::IDSelector& selector, - cuvs::core::bitset_view bitset, + cuvs::core::bitset_view bitset, int num_threads = 0) { auto bitset_cpu = - raft::make_host_vector(bitset.n_elements()); + raft::make_host_vector(bitset.n_elements()); auto nbits = sizeof(uint32_t) * 8; if (num_threads == 0) num_threads = omp_get_max_threads(); #pragma omp parallel for num_threads(num_threads) - for (uint32_t i = 0; i < bitset.n_elements(); i++) { + for (int64_t i = 0; i < bitset.n_elements(); i++) { uint32_t element = uint32_t{0}; for (uint32_t j = 0; j < nbits; j++) { if (i * nbits + j < bitset.size() && @@ -199,7 +200,7 @@ void convert_to_bitset_bruteforce( void convert_to_bitset( faiss::gpu::GpuResources* res, const faiss::IDSelector& selector, - cuvs::core::bitset_view bitset, + cuvs::core::bitset_view bitset, int num_threads) { raft::device_resources& raft_handle = res->getRaftHandleCurrentDevice(); // If the selector is simple, we can use the specialized functions diff --git a/thirdparty/faiss/faiss/gpu/utils/CuvsFilterConvert.h b/thirdparty/faiss/faiss/gpu/utils/CuvsFilterConvert.h index a88aa1512..d659d6598 100644 --- a/thirdparty/faiss/faiss/gpu/utils/CuvsFilterConvert.h +++ b/thirdparty/faiss/faiss/gpu/utils/CuvsFilterConvert.h @@ -36,6 +36,6 @@ namespace faiss::gpu { void convert_to_bitset( faiss::gpu::GpuResources* res, const faiss::IDSelector& selector, - cuvs::core::bitset_view bitset, + cuvs::core::bitset_view bitset, int num_threads = 0); } // namespace faiss::gpu diff --git a/thirdparty/faiss/faiss/impl/HNSW.cpp b/thirdparty/faiss/faiss/impl/HNSW.cpp index ab82d9b83..54ddeb920 100644 --- a/thirdparty/faiss/faiss/impl/HNSW.cpp +++ b/thirdparty/faiss/faiss/impl/HNSW.cpp @@ -188,7 +188,7 @@ void HNSW::fill_with_random_links(size_t n) { for (size_t ii = 0; ii < elts.size(); ii++) { int i 
= elts[ii]; size_t begin, end; - neighbor_range(i, 0, &begin, &end); + neighbor_range(i, level, &begin, &end); for (size_t j = begin; j < end; j++) { int other = 0; do { @@ -1000,6 +1000,18 @@ int search_from_candidates_panorama( return nres; } +template +void reservePriorityQueue( + std::priority_queue& q, + std::size_t size) { + struct Access : std::priority_queue { + using std::priority_queue::c; + }; + Access access{std::move(q)}; + access.c.reserve(size); + q = std::move(access); +} + std::priority_queue search_from_candidate_unbounded( const HNSW& hnsw, const Node& node, @@ -1009,7 +1021,10 @@ std::priority_queue search_from_candidate_unbounded( HNSWStats& stats) { int ndis = 0; std::priority_queue top_candidates; + reservePriorityQueue(top_candidates, ef); + std::priority_queue, std::greater> candidates; + reservePriorityQueue(candidates, ef); top_candidates.push(node); candidates.push(node); @@ -1048,11 +1063,11 @@ std::priority_queue search_from_candidate_unbounded( auto add_to_heap = [&](const size_t idx, const float dis) { if (top_candidates.top().first > dis || - top_candidates.size() < static_cast(ef)) { + top_candidates.size() < ef) { candidates.emplace(dis, idx); top_candidates.emplace(dis, idx); - if (top_candidates.size() > static_cast(ef)) { + if (top_candidates.size() > ef) { top_candidates.pop(); } } diff --git a/thirdparty/faiss/faiss/impl/LocalSearchQuantizer.cpp b/thirdparty/faiss/faiss/impl/LocalSearchQuantizer.cpp index cf5c8f14b..42f08b38c 100644 --- a/thirdparty/faiss/faiss/impl/LocalSearchQuantizer.cpp +++ b/thirdparty/faiss/faiss/impl/LocalSearchQuantizer.cpp @@ -26,7 +26,7 @@ // this is needed for prefetching -#ifdef __AVX2__ +#ifdef COMPILE_SIMD_AVX2 #include #endif @@ -600,73 +600,75 @@ void LocalSearchQuantizer::icm_encode_step( FAISS_THROW_IF_NOT(M != 0 && K != 0); FAISS_THROW_IF_NOT(binaries != nullptr); + // Resolve SIMD level once, not per iteration of the n × n_iters × M loop. 
+ with_simd_level_256bit([&]() { #pragma omp parallel for schedule(dynamic) - for (int64_t i = 0; i < static_cast(n); i++) { - std::vector objs(K); - - for (size_t iter = 0; iter < n_iters; iter++) { - // condition on the m-th subcode - for (size_t m = 0; m < M; m++) { - // copy - auto u = unaries + m * n * K + i * K; - for (size_t code = 0; code < K; code++) { - objs[code] = u[code]; - } + for (int64_t i = 0; i < static_cast(n); i++) { + std::vector objs(K); - // compute objective function by adding unary - // and binary terms together - for (size_t other_m = 0; other_m < M; other_m++) { - if (other_m == m) { - continue; + for (size_t iter = 0; iter < n_iters; iter++) { + // condition on the m-th subcode + for (size_t m = 0; m < M; m++) { + // copy + auto u = unaries + m * n * K + i * K; + for (size_t code = 0; code < K; code++) { + objs[code] = u[code]; } -#ifdef __AVX2__ - // TODO: add platform-independent compiler-independent - // prefetch utilities. - if (other_m + 1 < M) { - // do a single prefetch - int32_t code2 = codes[i * M + other_m + 1]; - // for (int32_t code = 0; code < K; code += 64) { - int32_t code = 0; - { - size_t binary_idx = (other_m + 1) * M * K * K + - m * K * K + code2 * K + code; - _mm_prefetch( - (const char*)(binaries + binary_idx), - _MM_HINT_T0); + // compute objective function by adding unary + // and binary terms together + for (size_t other_m = 0; other_m < M; other_m++) { + if (other_m == m) { + continue; + } + +#ifdef COMPILE_SIMD_AVX2 + // TODO: add platform-independent compiler-independent + // prefetch utilities. 
+ if (other_m + 1 < M) { + // do a single prefetch + int32_t code2 = codes[i * M + other_m + 1]; + // for (int32_t code = 0; code < K; code += 64) { + int32_t code = 0; + { + size_t binary_idx = (other_m + 1) * M * K * K + + m * K * K + code2 * K + code; + _mm_prefetch( + (const char*)(binaries + binary_idx), + _MM_HINT_T0); + } } - } #endif - for (size_t code = 0; code < K; code++) { - int32_t code2 = codes[i * M + other_m]; - size_t binary_idx = other_m * M * K * K + m * K * K + - code2 * K + code; - // binaries[m, other_m, code, code2]. - // It is symmetric over (m <-> other_m) - // and (code <-> code2). - // So, replace the op with - // binaries[other_m, m, code2, code]. - objs[code] += binaries[binary_idx]; + for (size_t code = 0; code < K; code++) { + int32_t code2 = codes[i * M + other_m]; + size_t binary_idx = other_m * M * K * K + + m * K * K + code2 * K + code; + // binaries[m, other_m, code, code2]. + // It is symmetric over (m <-> other_m) + // and (code <-> code2). + // So, replace the op with + // binaries[other_m, m, code2, code]. + objs[code] += binaries[binary_idx]; + } } - } - // find the optimal value of the m-th subcode - float best_obj = HUGE_VALF; - int32_t best_code = 0; + // find the optimal value of the m-th subcode + float best_obj = HUGE_VALF; + int32_t best_code = 0; - // find one using SIMD. The following operation is similar - // to the search of the smallest element in objs - using C = CMax; - HeapWithBuckets::addn( - K, objs.data(), 1, &best_obj, &best_code); + // find one using SIMD. 
The following operation is similar + // to the search of the smallest element in objs + HeapWithBucketsCMaxFloat<16, 1, SL>::addn( + K, objs.data(), 1, &best_obj, &best_code); - // done - codes[i * M + m] = best_code; + // done + codes[i * M + m] = best_code; - } // loop M + } // loop M + } } - } + }); } void LocalSearchQuantizer::perturb_codes( int32_t* codes, diff --git a/thirdparty/faiss/faiss/impl/NNDescent.cpp b/thirdparty/faiss/faiss/impl/NNDescent.cpp index d0c41d694..e27955754 100644 --- a/thirdparty/faiss/faiss/impl/NNDescent.cpp +++ b/thirdparty/faiss/faiss/impl/NNDescent.cpp @@ -31,25 +31,46 @@ Nhood::Nhood(int /* l */, int s, std::mt19937& rng, int N) { /// Copy operator Nhood& Nhood::operator=(const Nhood& other) { - M = other.M; - std::copy( - other.nn_new.begin(), - other.nn_new.end(), - std::back_inserter(nn_new)); - nn_new.reserve(other.nn_new.capacity()); - pool.reserve(other.pool.capacity()); + if (this != &other) { + M = other.M; + nn_new = other.nn_new; + nn_old = other.nn_old; + rnn_new = other.rnn_new; + rnn_old = other.rnn_old; + pool = other.pool; + } return *this; } /// Copy constructor -Nhood::Nhood(const Nhood& other) { - M = other.M; - std::copy( - other.nn_new.begin(), - other.nn_new.end(), - std::back_inserter(nn_new)); - nn_new.reserve(other.nn_new.capacity()); - pool.reserve(other.pool.capacity()); +Nhood::Nhood(const Nhood& other) + : pool(other.pool), + M(other.M), + nn_old(other.nn_old), + nn_new(other.nn_new), + rnn_old(other.rnn_old), + rnn_new(other.rnn_new) {} + +/// Move constructor +Nhood::Nhood(Nhood&& other) noexcept + : pool(std::move(other.pool)), + M(other.M), + nn_old(std::move(other.nn_old)), + nn_new(std::move(other.nn_new)), + rnn_old(std::move(other.rnn_old)), + rnn_new(std::move(other.rnn_new)) {} + +/// Move assignment operator +Nhood& Nhood::operator=(Nhood&& other) noexcept { + if (this != &other) { + M = other.M; + nn_new = std::move(other.nn_new); + nn_old = std::move(other.nn_old); + rnn_new = 
std::move(other.rnn_new); + rnn_old = std::move(other.rnn_old); + pool = std::move(other.pool); + } + return *this; } /// Insert a point into the candidate pool @@ -90,6 +111,22 @@ void Nhood::join(C callback) const { } void gen_random(std::mt19937& rng, int* addr, const int size, const int N) { + FAISS_THROW_IF_NOT_FMT( + size > 0 && size <= N, + "gen_random: size (%d) must be > 0 and <= N (%d)", + size, + N); + if (size == N) { + // Special case: return all indices in random order + for (int i = 0; i < size; ++i) { + addr[i] = i; + } + for (int i = size - 1; i > 0; --i) { + int j = rng() % (i + 1); + std::swap(addr[i], addr[j]); + } + return; + } for (int i = 0; i < size; ++i) { addr[i] = rng() % (N - size); } @@ -294,7 +331,7 @@ void NNDescent::nndescent(DistanceComputer& qdis, bool verbose) { int num_eval_points = std::min(NUM_EVAL_POINTS, ntotal); std::vector eval_points(num_eval_points); std::vector> acc_eval_set(num_eval_points); - std::mt19937 rng(random_seed * 6577 + omp_get_thread_num()); + std::mt19937 rng(random_seed * 6577); gen_random(rng, eval_points.data(), eval_points.size(), ntotal); generate_eval_set(qdis, eval_points, acc_eval_set, ntotal); for (int it = 0; it < iter; it++) { diff --git a/thirdparty/faiss/faiss/impl/NNDescent.h b/thirdparty/faiss/faiss/impl/NNDescent.h index 60c273df7..d6fafd048 100644 --- a/thirdparty/faiss/faiss/impl/NNDescent.h +++ b/thirdparty/faiss/faiss/impl/NNDescent.h @@ -80,6 +80,10 @@ struct Nhood { Nhood(const Nhood& other); + Nhood(Nhood&& other) noexcept; + + Nhood& operator=(Nhood&& other) noexcept; + void insert(int id, float dist); template diff --git a/thirdparty/faiss/faiss/impl/NSG.cpp b/thirdparty/faiss/faiss/impl/NSG.cpp index 2e45e9798..2826863b9 100644 --- a/thirdparty/faiss/faiss/impl/NSG.cpp +++ b/thirdparty/faiss/faiss/impl/NSG.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -113,7 +114,6 @@ using namespace nsg; NSG::NSG(int R_in) : R(R_in), rng(0x0903) { L = R + 32; C = R 
+ 100; - srand(0x1998); } void NSG::search( @@ -179,7 +179,7 @@ void NSG::build( is_built = true; if (verbose) { - int max = 0, min = 1e6; + int max = 0, min = std::numeric_limits::max(); double avg = 0; for (int i = 0; i < n; i++) { @@ -265,7 +265,7 @@ void NSG::search_on_graph( continue; } - init_ids[i] = id; + init_ids[num_ids] = id; vt.set(id); num_ids += 1; } @@ -397,10 +397,23 @@ void NSG::sync_prune( std::vector result; + if (pool.empty()) { + for (int i = 0; i < R; i++) { + graph.at(q, i).id = EMPTY_ID; + } + return; + } + int start = 0; if (pool[start].id == q) { start++; } + if (start >= static_cast(pool.size())) { + for (int i = 0; i < R; i++) { + graph.at(q, i).id = EMPTY_ID; + } + return; + } result.push_back(pool[start]); while (result.size() < static_cast(R) && diff --git a/thirdparty/faiss/faiss/impl/ProductQuantizer.cpp b/thirdparty/faiss/faiss/impl/ProductQuantizer.cpp index 2eec10d33..403db4526 100644 --- a/thirdparty/faiss/faiss/impl/ProductQuantizer.cpp +++ b/thirdparty/faiss/faiss/impl/ProductQuantizer.cpp @@ -486,7 +486,7 @@ void ProductQuantizer::compute_distance_tables( const float* x, float* dis_tables) const { int64_t nx_signed = nx; -#if defined(__AVX2__) || defined(__aarch64__) +#if defined(COMPILE_SIMD_AVX2) || defined(COMPILE_SIMD_ARM_NEON) if (dsub == 2 && nbits < 8) { // interesting for a narrow range of settings compute_PQ_dis_tables_dsub2( d, ksub, centroids.data(), nx, x, false, dis_tables); @@ -521,7 +521,7 @@ void ProductQuantizer::compute_inner_prod_tables( const float* x, float* dis_tables) const { int64_t nx_signed = nx; -#if defined(__AVX2__) || defined(__aarch64__) +#if defined(COMPILE_SIMD_AVX2) || defined(COMPILE_SIMD_ARM_NEON) if (dsub == 2 && nbits < 8) { compute_PQ_dis_tables_dsub2( d, ksub, centroids.data(), nx, x, true, dis_tables); diff --git a/thirdparty/faiss/faiss/impl/RaBitQStats.cpp b/thirdparty/faiss/faiss/impl/RaBitQStats.cpp deleted file mode 100644 index a629053e7..000000000 --- 
a/thirdparty/faiss/faiss/impl/RaBitQStats.cpp +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace faiss { - -// NOLINTNEXTLINE(facebook-avoid-non-const-global-variables) -RaBitQStats rabitq_stats; - -void RaBitQStats::reset() { - n_1bit_evaluations = 0; - n_multibit_evaluations = 0; -} - -double RaBitQStats::skip_percentage() const { - const size_t copy_n_1bit_evaluations = n_1bit_evaluations; - const size_t copy_n_multibit_evaluations = n_multibit_evaluations; - return copy_n_1bit_evaluations > 0 - ? 100.0 * (copy_n_1bit_evaluations - copy_n_multibit_evaluations) / - copy_n_1bit_evaluations - : 0.0; -} - -} // namespace faiss diff --git a/thirdparty/faiss/faiss/impl/RaBitQStats.h b/thirdparty/faiss/faiss/impl/RaBitQStats.h deleted file mode 100644 index 8096c584c..000000000 --- a/thirdparty/faiss/faiss/impl/RaBitQStats.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include - -namespace faiss { - -/// Statistics for RaBitQ multi-bit two-stage search. -/// -/// These stats are ONLY collected for multi-bit mode (nb_bits > 1). -/// In 1-bit mode, there is no two-stage filtering - all candidates are -/// evaluated with a single distance computation, so there is nothing -/// meaningful to track. For 1-bit mode, both counters remain 0. 
-/// -/// Multi-bit mode uses a two-stage search: -/// Stage 1: Compute 1-bit lower bound distance for all candidates -/// Stage 2: Compute full multi-bit distance only for promising candidates -/// -/// The skip_percentage() metric measures filtering effectiveness: -/// how many candidates were filtered out by the 1-bit lower bound -/// without needing the more expensive multi-bit distance computation. -/// -/// WARNING: Statistics are not robust to internal threading nor to -/// concurrent RaBitQ searches. Use these values in a single-threaded -/// context to accurately gauge RaBitQ's filtering effectiveness. -/// Call reset() before search, then read stats after search completes. -struct RaBitQStats { - /// Number of candidates evaluated using 1-bit (lower bound) distance. - /// This is the first stage of two-stage search in multi-bit mode. - /// Always 0 in 1-bit mode (stats not tracked). - size_t n_1bit_evaluations = 0; - - /// Number of candidates that passed 1-bit filtering and required - /// full multi-bit distance computation (second stage). - /// Always 0 in 1-bit mode (stats not tracked). - size_t n_multibit_evaluations = 0; - - void reset(); - - /// Compute percentage of candidates skipped (filtered out by 1-bit stage). - /// Returns 0 if no candidates were evaluated (including 1-bit mode). - double skip_percentage() const; -}; - -/// Global stats for RaBitQ indexes -// NOLINTNEXTLINE(facebook-avoid-non-const-global-variables) -FAISS_API extern RaBitQStats rabitq_stats; - -} // namespace faiss diff --git a/thirdparty/faiss/faiss/impl/RaBitQUtils.h b/thirdparty/faiss/faiss/impl/RaBitQUtils.h index 1eb2ebf04..e3cc4b9d3 100644 --- a/thirdparty/faiss/faiss/impl/RaBitQUtils.h +++ b/thirdparty/faiss/faiss/impl/RaBitQUtils.h @@ -380,6 +380,41 @@ inline T* get_block_aux_ptr( (vec_pos % bbs) * storage_size; } +/// Extract sign bits from PQ4-interleaved block into flat byte packing. 
+/// Like CodePackerRaBitQ::unpack_1 but sign-bits-only and with the +/// vector's in-block address hoisted out of the per-SQ loop. +inline void unpack_sign_bits_from_packed( + const uint8_t* block, + size_t bbs, + size_t nsq, + size_t offset, + size_t block_stride, + uint8_t* sign_bits_out) { + block += (offset / bbs) * block_stride; + offset = offset % bbs; + + const bool nibble_high = offset > 15; + const size_t vid = offset & 15; + const size_t in_group_addr = + (vid < 8) ? (vid << 1) : (((vid - 8) << 1) + 1); + + const size_t num_pairs = nsq / 2; + for (size_t k = 0; k < num_pairs; k++) { + const size_t base = k * bbs; + const uint8_t raw_even = block[base + in_group_addr]; + const uint8_t raw_odd = block[base + in_group_addr + 16]; + + const uint8_t nib0 = nibble_high ? (raw_even >> 4) : (raw_even & 0xF); + const uint8_t nib1 = nibble_high ? (raw_odd >> 4) : (raw_odd & 0xF); + sign_bits_out[k] = nib0 | (nib1 << 4); + } + + if (nsq & 1) { + const uint8_t raw = block[num_pairs * bbs + in_group_addr]; + sign_bits_out[num_pairs] = nibble_high ? (raw >> 4) : (raw & 0xF); + } +} + /** Compute per-vector auxiliary storage size. * * @param nb_bits number of quantization bits (1 = sign-bit only) diff --git a/thirdparty/faiss/faiss/impl/ScalarQuantizer.cpp b/thirdparty/faiss/faiss/impl/ScalarQuantizer.cpp index 5fb2a97f5..7400626ec 100644 --- a/thirdparty/faiss/faiss/impl/ScalarQuantizer.cpp +++ b/thirdparty/faiss/faiss/impl/ScalarQuantizer.cpp @@ -62,6 +62,10 @@ void ScalarQuantizer::set_derived_sizes() { code_size = d * 2; bits = 16; break; + case QT_0bit: + code_size = 0; + bits = 0; + break; default: break; } @@ -71,6 +75,10 @@ void ScalarQuantizer::train(size_t n, const float* x) { using scalar_quantizer::train_NonUniform; using scalar_quantizer::train_Uniform; + if (qtype == QT_0bit) { + return; // nothing to train for centroid-only mode + } + int bit_per_dim = qtype == QT_4bit_uniform ? 4 : qtype == QT_4bit ? 4 : qtype == QT_6bit ? 
6 @@ -128,6 +136,9 @@ ScalarQuantizer::SQuantizer* ScalarQuantizer::select_quantizer() const { void ScalarQuantizer::compute_codes(const float* x, uint8_t* codes, size_t n) const { + if (code_size == 0) { + return; // QT_0bit: nothing to encode + } std::unique_ptr squant(select_quantizer()); memset(codes, 0, code_size * n); @@ -138,6 +149,10 @@ void ScalarQuantizer::compute_codes(const float* x, uint8_t* codes, size_t n) } void ScalarQuantizer::decode(const uint8_t* codes, float* x, size_t n) const { + if (code_size == 0) { + memset(x, 0, sizeof(float) * d * n); + return; // QT_0bit: no per-vector data, zero-fill + } std::unique_ptr squant(select_quantizer()); #pragma omp parallel for diff --git a/thirdparty/faiss/faiss/impl/ScalarQuantizer.h b/thirdparty/faiss/faiss/impl/ScalarQuantizer.h index 186074db0..8fc44c805 100644 --- a/thirdparty/faiss/faiss/impl/ScalarQuantizer.h +++ b/thirdparty/faiss/faiss/impl/ScalarQuantizer.h @@ -33,6 +33,7 @@ struct ScalarQuantizer : Quantizer { QT_bf16, QT_8bit_direct_signed, ///< fast indexing of signed int8s ranging from ///< [-128 to 127] + QT_0bit, ///< 0 bits per component, centroid-only distance (for IVF) QT_count }; @@ -101,6 +102,25 @@ struct ScalarQuantizer : Quantizer { virtual float query_to_code(const uint8_t* code) const = 0; + /// Compute four query-to-code distances in one call. Default loops + /// query_to_code four times; per-SIMD specializations may batch the + /// inner dim loop across the four codes to amortize query state and + /// expose ILP across independent accumulators. 
+ virtual void query_to_codes_batch_4( + const uint8_t* code_0, + const uint8_t* code_1, + const uint8_t* code_2, + const uint8_t* code_3, + float& dis0, + float& dis1, + float& dis2, + float& dis3) const { + dis0 = query_to_code(code_0); + dis1 = query_to_code(code_1); + dis2 = query_to_code(code_2); + dis3 = query_to_code(code_3); + } + float distance_to_code(const uint8_t* code) final { return query_to_code(code); } diff --git a/thirdparty/faiss/faiss/impl/approx_topk/approx_topk.h b/thirdparty/faiss/faiss/impl/approx_topk/approx_topk.h index 5d7550278..26c0a0884 100644 --- a/thirdparty/faiss/faiss/impl/approx_topk/approx_topk.h +++ b/thirdparty/faiss/faiss/impl/approx_topk/approx_topk.h @@ -212,4 +212,65 @@ struct HeapWithBuckets, NBUCKETS, N> { } }; +// ----------------------------------------------------------------------- +// approx_topk_by_mode: consolidates the mode switch + dispatch pattern +// used by residual_quantizer_encode_steps.cpp and other callers. +// ----------------------------------------------------------------------- + +// SL-parameterized version for callers that have already resolved the +// SIMD level (e.g., inside a with_simd_level_256bit lambda). 
+template +inline void approx_topk_by_mode( + ApproxTopK_mode_t mode, + uint32_t beam_size, + uint32_t n_per_beam, + const float* distances, + uint32_t k, + float* bh_val, + int32_t* bh_ids) { + using C = CMax; + auto approx = [&]() { + HeapWithBucketsCMaxFloat::bs_addn( + beam_size, n_per_beam, distances, k, bh_val, bh_ids); + }; + switch (mode) { + case ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B8_D3: + approx.template operator()<8, 3>(); + break; + case ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B8_D2: + approx.template operator()<8, 2>(); + break; + case ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B16_D2: + approx.template operator()<16, 2>(); + break; + case ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B32_D2: + approx.template operator()<32, 2>(); + break; + default: + heap_addn( + k, + bh_val, + bh_ids, + distances, + nullptr, + beam_size * n_per_beam); + break; + } +} + +// Non-SL wrapper that dispatches via with_simd_level_256bit. +inline void approx_topk_by_mode( + ApproxTopK_mode_t mode, + uint32_t beam_size, + uint32_t n_per_beam, + const float* distances, + uint32_t k, + float* bh_val, + int32_t* bh_ids) { + with_simd_level_256bit([&]() { + approx_topk_by_mode( + mode, beam_size, n_per_beam, distances, k, bh_val, bh_ids); + }); +} + } // namespace faiss diff --git a/thirdparty/faiss/faiss/impl/fast_scan/FastScanDistancePostProcessing.h b/thirdparty/faiss/faiss/impl/fast_scan/FastScanDistancePostProcessing.h index 9a09a2165..72b5b27f6 100644 --- a/thirdparty/faiss/faiss/impl/fast_scan/FastScanDistancePostProcessing.h +++ b/thirdparty/faiss/faiss/impl/fast_scan/FastScanDistancePostProcessing.h @@ -35,6 +35,14 @@ struct FastScanDistancePostProcessing { /// Set to 0 to use index->nprobe as fallback. size_t nprobe = 0; + /// RaBitQ query quantization bits override. + /// Set to 0 to use the index default (index->qb). + uint8_t qb = 0; + + /// RaBitQ centered scalar quantizer override. + /// Only used when qb > 0 (i.e., when params are overridden). 
+ bool centered = false; + /// Default constructor - no processing FastScanDistancePostProcessing() = default; diff --git a/thirdparty/faiss/faiss/impl/fast_scan/kernels_simd256.h b/thirdparty/faiss/faiss/impl/fast_scan/kernels_simd256.h index 21721cc67..cd889e5fd 100644 --- a/thirdparty/faiss/faiss/impl/fast_scan/kernels_simd256.h +++ b/thirdparty/faiss/faiss/impl/fast_scan/kernels_simd256.h @@ -11,6 +11,10 @@ namespace faiss { +// Explicit SIMD-level aliases for this file (no global bare aliases). +using simd16uint16 = simd16uint16_tpl; +using simd32uint8 = simd32uint8_tpl; + /* * Multi-BB variant: accumulates NQ queries x BB*32 database elements. * Used by the search_1 path (bbs > 32). diff --git a/thirdparty/faiss/faiss/impl/fast_scan/kernels_simd512.h b/thirdparty/faiss/faiss/impl/fast_scan/kernels_simd512.h index 7e932c454..6da036030 100644 --- a/thirdparty/faiss/faiss/impl/fast_scan/kernels_simd512.h +++ b/thirdparty/faiss/faiss/impl/fast_scan/kernels_simd512.h @@ -14,6 +14,10 @@ namespace faiss { +// Explicit SIMD-level aliases for this file (no global bare aliases). +using simd32uint16 = simd32uint16_tpl; +using simd64uint8 = simd64uint8_tpl; + // NQ=1 specialization: processes 512-bit chunks aggressively. // a special version for NQ=1. 
// Despite the function being large in the text form, it compiles to a very diff --git a/thirdparty/faiss/faiss/impl/fast_scan/rabitq_result_handler.h b/thirdparty/faiss/faiss/impl/fast_scan/rabitq_result_handler.h index 74364d3d8..59950e8ae 100644 --- a/thirdparty/faiss/faiss/impl/fast_scan/rabitq_result_handler.h +++ b/thirdparty/faiss/faiss/impl/fast_scan/rabitq_result_handler.h @@ -10,9 +10,7 @@ #include #include -#include #include -#include #include #include #include @@ -47,7 +45,6 @@ struct IVFRaBitQHeapHandler : ResultHandlerCompare { int64_t* heap_labels; // [nq * k] const size_t nq, k; size_t current_list_no = 0; - const uint8_t* list_codes_ptr = nullptr; // raw block data for list std::vector probe_indices; // probe index for each query in current batch const FastScanDistancePostProcessing* @@ -59,10 +56,12 @@ struct IVFRaBitQHeapHandler : ResultHandlerCompare { const size_t storage_size; const size_t packed_block_size; const size_t full_block_size; - std::unique_ptr packer; // cached for unpack in hot path - // Handler-local scratch reused across refinements. This assumes a handler - // instance is confined to one search slice and not entered concurrently. 
- std::vector unpack_buf; // reusable buffer for unpack_1 + std::vector unpack_buf; // sign bits scratch buffer + + // Cached per-list values (set in set_list_context, avoid recomputing in + // handle) + size_t cached_nprobe = 0; + bool is_similarity = false; // metric == INNER_PRODUCT // Use float-based comparator for heap operations using Cfloat = typename std::conditional< @@ -98,10 +97,10 @@ struct IVFRaBitQHeapHandler : ResultHandlerCompare { private: float compute_full_multibit_distance( - size_t db_idx, size_t local_q, size_t global_q, - size_t local_offset); + size_t local_offset, + const uint8_t* aux_ptr); }; } // namespace simd_result_handlers diff --git a/thirdparty/faiss/faiss/impl/fast_scan/simd_result_handlers.h b/thirdparty/faiss/faiss/impl/fast_scan/simd_result_handlers.h index b99434680..e7b810350 100644 --- a/thirdparty/faiss/faiss/impl/fast_scan/simd_result_handlers.h +++ b/thirdparty/faiss/faiss/impl/fast_scan/simd_result_handlers.h @@ -83,6 +83,9 @@ inline void for_each_block( } // namespace +// Explicit SIMD-level alias for the virtual interface below. 
+using simd16uint16 = simd16uint16_tpl; + struct SIMDResultHandler { // used to dispatch templates bool is_CMax = false; @@ -118,6 +121,7 @@ struct SIMDResultHandlerToFloat : SIMDResultHandler { /// these fields are used mainly for the IVF variants (with_id_map=true) const idx_t* id_map = nullptr; // map offset in invlist to vector id const int* q_map = nullptr; // map q to global query + const uint8_t* list_codes_ptr = nullptr; // raw block data for current list const uint16_t* dbias = nullptr; // table of biases to add to each query (for IVF L2 search) const float* normalizers = nullptr; // size 2 * nq, to convert @@ -292,7 +296,7 @@ struct ResultHandlerCompare : SIMDResultHandlerToFloat { // compute and adjust idx int64_t adjust_id(size_t b, size_t j) { int64_t idx = j0 + 32 * b + j; - if (with_id_map) { + if (id_map) { idx = id_map[idx]; } return idx; diff --git a/thirdparty/faiss/faiss/impl/hnsw/MinimaxHeap.cpp b/thirdparty/faiss/faiss/impl/hnsw/MinimaxHeap.cpp index 69ad526d5..3bf1ac05a 100644 --- a/thirdparty/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +++ b/thirdparty/faiss/faiss/impl/hnsw/MinimaxHeap.cpp @@ -5,6 +5,8 @@ * LICENSE file in the root directory of this source tree. */ +#include + #include #include @@ -14,6 +16,10 @@ namespace faiss { void MinimaxHeap::push(storage_idx_t i, float v) { + // Treat NaN distances as infinitely far away so heap ordering is preserved. + if (std::isnan(v)) { + v = HC::neutral(); + } if (k == n) { if (v >= dis[0]) { return; diff --git a/thirdparty/faiss/faiss/impl/index_read.cpp b/thirdparty/faiss/faiss/impl/index_read.cpp index abe6746b9..c19c58b51 100644 --- a/thirdparty/faiss/faiss/impl/index_read.cpp +++ b/thirdparty/faiss/faiss/impl/index_read.cpp @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -697,6 +698,38 @@ static void validate_codebooks_size( aq.total_codebook_size); } +// Validate FastScan fields shared by all FastScan index types. 
+// M, ksub, bbs must be positive; bbs must be 32-aligned; M2 must be +// roundup(M, 2); and ksub * M / ksub * M2 must not overflow. +static void validate_fastscan_fields( + size_t M, + size_t M2, + size_t ksub, + int bbs, + const char* index_type) { + FAISS_THROW_IF_NOT_FMT( + M > 0 && ksub > 0, + "%s: invalid quantizer state (M=%zd, ksub=%zd, must be > 0)", + index_type, + M, + ksub); + FAISS_THROW_IF_NOT_FMT( + bbs > 0 && bbs % 32 == 0, + "%s: invalid bbs=%d (must be > 0 and a multiple of 32)", + index_type, + bbs); + size_t expected_M2 = (M + 1) & ~static_cast(1); // roundup(M, 2) + FAISS_THROW_IF_NOT_FMT( + M2 == expected_M2, + "%s: invalid M2=%zd (expected roundup(M=%zd, 2) = %zd)", + index_type, + M2, + M, + expected_M2); + mul_no_overflow(ksub, M, index_type); + mul_no_overflow(ksub, M2, index_type); +} + // Validate that the AdditiveQuantizer dimension matches the index header // dimension. compute_LUT() treats codebooks as a (d, total_codebook_size) // matrix and query vectors are sized for idx_d, so a mismatch leads to @@ -871,6 +904,7 @@ void read_ScalarQuantizer( case ScalarQuantizer::QT_bf16: case ScalarQuantizer::QT_8bit_direct: case ScalarQuantizer::QT_8bit_direct_signed: + case ScalarQuantizer::QT_0bit: case ScalarQuantizer::QT_count: expected = 0; break; @@ -1261,6 +1295,20 @@ static std::unique_ptr read_ivfpq( read_ProductQuantizer(&ivfpqr->refine_pq, f); READVECTOR(ivfpqr->refine_codes); READ1(ivfpqr->k_factor); + // k_factor multiplies k to size search-time allocations + // (n * k * k_factor labels + distances). Defaults are 1 + // (IndexRefine) and 4 (IndexIVFPQR); AutoTune explores + // powers-of-two up to 64. Cap at 1000 to leave ample + // headroom beyond any known usage while still blocking + // OOM from crafted files (same cap as beam_factor in + // ResidualCoarseQuantizer). 
+ FAISS_THROW_IF_NOT_FMT( + std::isfinite(ivfpqr->k_factor) && + ivfpqr->k_factor >= 1.0f && + ivfpqr->k_factor <= 1000.0f, + "k_factor %.6g out of valid range [1, 1000]" + " for IndexIVFPQR", + ivfpqr->k_factor); } } return ivpq; @@ -1512,25 +1560,12 @@ std::unique_ptr read_index_up(IOReader* f, int io_flags) { READVECTOR(idxaqfs->codes); - FAISS_THROW_IF_NOT_FMT( - idxaqfs->M > 0 && idxaqfs->ksub > 0, - "IndexAdditiveQuantizerFastScan: invalid quantizer state " - "(M=%zd, ksub=%zd, must be > 0)", - idxaqfs->M, - idxaqfs->ksub); - FAISS_THROW_IF_NOT_FMT( - idxaqfs->bbs > 0 && idxaqfs->bbs % 32 == 0, - "IndexAdditiveQuantizerFastScan: invalid bbs=%d " - "(must be > 0 and a multiple of 32)", - idxaqfs->bbs); - mul_no_overflow( - idxaqfs->ksub, + validate_fastscan_fields( idxaqfs->M, - "IndexAdditiveQuantizerFastScan ksub * M"); - mul_no_overflow( - idxaqfs->ksub, idxaqfs->M2, - "IndexAdditiveQuantizerFastScan ksub * M2"); + idxaqfs->ksub, + idxaqfs->bbs, + "IndexAdditiveQuantizerFastScan"); idx = std::move(idxaqfs); } else if ( @@ -1588,6 +1623,14 @@ std::unique_ptr read_index_up(IOReader* f, int io_flags) { read_InvertedLists(*ivaqfs, f, io_flags); ivaqfs->init_code_packer(); + + validate_fastscan_fields( + ivaqfs->M, + ivaqfs->M2, + ivaqfs->ksub, + ivaqfs->bbs, + "IndexIVFAdditiveQuantizerFastScan"); + idx = std::move(ivaqfs); } else if (h == fourcc("IvFl") || h == fourcc("IvFL")) { // legacy auto ivfl = std::make_unique(); @@ -1810,6 +1853,12 @@ std::unique_ptr read_index_up(IOReader* f, int io_flags) { auto base = read_index_up(f, io_flags); auto refine = read_index_up(f, io_flags); READ1(idxrf->k_factor); + // Same rationale as IndexIVFPQR k_factor above. 
+ FAISS_THROW_IF_NOT_FMT( + std::isfinite(idxrf->k_factor) && idxrf->k_factor >= 1.0f && + idxrf->k_factor <= 1000.0f, + "k_factor %.6g out of valid range [1, 1000] for IndexRefine", + idxrf->k_factor); if (h == fourcc("IxRP")) { // then make a RefineFlatPanorama with it auto idxrf_new = std::make_unique(); @@ -2032,20 +2081,12 @@ std::unique_ptr read_index_up(IOReader* f, int io_flags) { idxpqfs->ksub = (1 << pq.nbits); idxpqfs->code_size = pq.code_size; - FAISS_THROW_IF_NOT_FMT( - idxpqfs->M > 0 && idxpqfs->ksub > 0, - "IndexPQFastScan: invalid quantizer state " - "(M=%zd, ksub=%zd, must be > 0)", + validate_fastscan_fields( idxpqfs->M, - idxpqfs->ksub); - FAISS_THROW_IF_NOT_FMT( - idxpqfs->bbs > 0 && idxpqfs->bbs % 32 == 0, - "IndexPQFastScan: invalid bbs=%d " - "(must be > 0 and a multiple of 32)", - idxpqfs->bbs); - mul_no_overflow(idxpqfs->ksub, idxpqfs->M, "IndexPQFastScan ksub * M"); - mul_no_overflow( - idxpqfs->ksub, idxpqfs->M2, "IndexPQFastScan ksub * M2"); + idxpqfs->M2, + idxpqfs->ksub, + idxpqfs->bbs, + "IndexPQFastScan"); idx = std::move(idxpqfs); @@ -2069,6 +2110,9 @@ std::unique_ptr read_index_up(IOReader* f, int io_flags) { ivpq->code_size = pq.code_size; ivpq->init_code_packer(); + validate_fastscan_fields( + ivpq->M, ivpq->M2, ivpq->ksub, ivpq->bbs, "IndexIVFPQFastScan"); + idx = std::move(ivpq); } else if (h == fourcc("IRMf")) { auto imm = std::make_unique(); @@ -2117,21 +2161,12 @@ std::unique_ptr read_index_up(IOReader* f, int io_flags) { idxqfs->nbits = nbits_fastscan; idxqfs->ksub = (1 << nbits_fastscan); - FAISS_THROW_IF_NOT_FMT( - idxqfs->M > 0 && idxqfs->ksub > 0, - "IndexRaBitQFastScan: invalid quantizer state " - "(M=%zd, ksub=%zd, must be > 0)", + validate_fastscan_fields( idxqfs->M, - idxqfs->ksub); - FAISS_THROW_IF_NOT_FMT( - idxqfs->bbs > 0 && idxqfs->bbs % 32 == 0, - "IndexRaBitQFastScan: invalid bbs=%d " - "(must be > 0 and a multiple of 32)", - idxqfs->bbs); - mul_no_overflow( - idxqfs->ksub, idxqfs->M, "IndexRaBitQFastScan 
ksub * M"); - mul_no_overflow( - idxqfs->ksub, idxqfs->M2, "IndexRaBitQFastScan ksub * M2"); + idxqfs->M2, + idxqfs->ksub, + idxqfs->bbs, + "IndexRaBitQFastScan"); READVECTOR(idxqfs->codes); @@ -2258,7 +2293,16 @@ std::unique_ptr read_index_up(IOReader* f, int io_flags) { READ1(svs->max_candidate_pool_size); READ1(svs->prune_to); READ1(svs->use_full_search_history); - READ1(svs->storage_kind); + + int sk; + READ1(sk); + FAISS_THROW_IF_NOT_FMT( + sk >= 0 && sk < static_cast(SVS_count), + "invalid SVS storage_kind=%d (must be in [0, %d))", + sk, + static_cast(SVS_count)); + svs->storage_kind = static_cast(sk); + if (h == fourcc("ISVL")) { auto* leanvec = dynamic_cast(svs.get()); FAISS_THROW_IF_NOT_MSG( @@ -2334,6 +2378,13 @@ std::unique_ptr read_index_up(IOReader* f, int io_flags) { ivrqfs->nbits = nbits_fastscan; ivrqfs->ksub = (1 << nbits_fastscan); + validate_fastscan_fields( + ivrqfs->M, + ivrqfs->M2, + ivrqfs->ksub, + ivrqfs->bbs, + "IndexIVFRaBitQFastScan"); + read_InvertedLists(*ivrqfs, f, io_flags); ivrqfs->init_code_packer(); diff --git a/thirdparty/faiss/faiss/impl/mapped_io.cpp b/thirdparty/faiss/faiss/impl/mapped_io.cpp index 8e5ef7ca5..2582e6f57 100644 --- a/thirdparty/faiss/faiss/impl/mapped_io.cpp +++ b/thirdparty/faiss/faiss/impl/mapped_io.cpp @@ -7,7 +7,7 @@ #include -#if defined(__linux__) || defined(__FreeBSD__) +#if defined(__linux__) || defined(__FreeBSD__) || defined(__APPLE__) #include #include @@ -27,7 +27,7 @@ namespace faiss { -#if defined(__linux__) || defined(__FreeBSD__) +#if defined(__linux__) || defined(__FreeBSD__) || defined(__APPLE__) struct MmappedFileMappingOwner::PImpl { void* ptr = nullptr; @@ -60,11 +60,11 @@ struct MmappedFileMappingOwner::PImpl { void* address = mmap( nullptr, filesize, PROT_READ, MAP_SHARED, fileno(f.get()), 0); FAISS_THROW_IF_NOT_FMT( - address != nullptr, "could not mmap(): %s", strerror(errno)); + address != MAP_FAILED, "could not mmap(): %s", strerror(errno)); // btw, fd can be closed here - 
madvise(address, filesize, MADV_RANDOM); + (void)madvise(address, filesize, MADV_RANDOM); // save it ptr = address; @@ -83,11 +83,11 @@ struct MmappedFileMappingOwner::PImpl { void* address = mmap(nullptr, filesize, PROT_READ, MAP_SHARED, fileno(f), 0); FAISS_THROW_IF_NOT_FMT( - address != nullptr, "could not mmap(): %s", strerror(errno)); + address != MAP_FAILED, "could not mmap(): %s", strerror(errno)); // btw, fd can be closed here - madvise(address, filesize, MADV_RANDOM); + (void)madvise(address, filesize, MADV_RANDOM); // save it ptr = address; diff --git a/thirdparty/faiss/faiss/impl/residual_quantizer_encode_steps.cpp b/thirdparty/faiss/faiss/impl/residual_quantizer_encode_steps.cpp index ac0fe85e6..85310a61a 100644 --- a/thirdparty/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +++ b/thirdparty/faiss/faiss/impl/residual_quantizer_encode_steps.cpp @@ -92,111 +92,96 @@ void beam_search_encode_step( } InterruptCallback::check(); + // Resolve SIMD level once, not per iteration of the n-parallel loop. 
+ with_simd_level_256bit([&]() { #pragma omp parallel for if (n > 100) - for (int64_t i = 0; i < static_cast(n); i++) { - const int32_t* codes_i = codes + i * m * beam_size; - int32_t* new_codes_i = new_codes + i * (m + 1) * new_beam_size; - const float* residuals_i = residuals + i * d * beam_size; - float* new_residuals_i = new_residuals + i * d * new_beam_size; - - float* new_distances_i = new_distances + i * new_beam_size; - using C = CMax; - - if (assign_index) { - const float* cent_distances_i = - cent_distances.data() + i * beam_size * new_beam_size; - const idx_t* cent_ids_i = - cent_ids.data() + i * beam_size * new_beam_size; - - // here we could be a tad more efficient by merging sorted arrays - for (size_t j = 0; j < new_beam_size; j++) { - new_distances_i[j] = C::neutral(); - } - std::vector perm(new_beam_size, -1); - heap_addn( - new_beam_size, - new_distances_i, - perm.data(), - cent_distances_i, - nullptr, - beam_size * new_beam_size); - heap_reorder(new_beam_size, new_distances_i, perm.data()); - - for (size_t j = 0; j < new_beam_size; j++) { - int js = perm[j] / new_beam_size; - int ls = cent_ids_i[perm[j]]; - if (m > 0) { - memcpy(new_codes_i, codes_i + js * m, sizeof(*codes) * m); + for (int64_t i = 0; i < static_cast(n); i++) { + const int32_t* codes_i = codes + i * m * beam_size; + int32_t* new_codes_i = new_codes + i * (m + 1) * new_beam_size; + const float* residuals_i = residuals + i * d * beam_size; + float* new_residuals_i = new_residuals + i * d * new_beam_size; + + float* new_distances_i = new_distances + i * new_beam_size; + using C = CMax; + + if (assign_index) { + const float* cent_distances_i = + cent_distances.data() + i * beam_size * new_beam_size; + const idx_t* cent_ids_i = + cent_ids.data() + i * beam_size * new_beam_size; + + // here we could be a tad more efficient by merging sorted + // arrays + for (size_t j = 0; j < new_beam_size; j++) { + new_distances_i[j] = C::neutral(); + } + std::vector perm(new_beam_size, -1); + 
heap_addn( + new_beam_size, + new_distances_i, + perm.data(), + cent_distances_i, + nullptr, + beam_size * new_beam_size); + heap_reorder(new_beam_size, new_distances_i, perm.data()); + + for (size_t j = 0; j < new_beam_size; j++) { + int js = perm[j] / new_beam_size; + int ls = cent_ids_i[perm[j]]; + if (m > 0) { + memcpy(new_codes_i, + codes_i + js * m, + sizeof(*codes) * m); + } + new_codes_i[m] = ls; + new_codes_i += m + 1; + fvec_sub( + d, + residuals_i + js * d, + cent + ls * d, + new_residuals_i); + new_residuals_i += d; } - new_codes_i[m] = ls; - new_codes_i += m + 1; - fvec_sub( - d, - residuals_i + js * d, - cent + ls * d, - new_residuals_i); - new_residuals_i += d; - } - } else { - const float* cent_distances_i = - cent_distances.data() + i * beam_size * K; - // then we have to select the best results - for (size_t j = 0; j < new_beam_size; j++) { - new_distances_i[j] = C::neutral(); - } - std::vector perm(new_beam_size, -1); + } else { + const float* cent_distances_i = + cent_distances.data() + i * beam_size * K; + // then we have to select the best results + for (size_t j = 0; j < new_beam_size; j++) { + new_distances_i[j] = C::neutral(); + } + std::vector perm(new_beam_size, -1); - auto approx = [&]() { - HeapWithBuckets::bs_addn( + approx_topk_by_mode( + approx_topk_mode, beam_size, K, cent_distances_i, new_beam_size, new_distances_i, perm.data()); - }; - switch (approx_topk_mode) { - case ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B8_D3: - approx.template operator()<8, 3>(); - break; - case ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B8_D2: - approx.template operator()<8, 2>(); - break; - case ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B16_D2: - approx.template operator()<16, 2>(); - break; - case ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B32_D2: - approx.template operator()<32, 2>(); - break; - default: - heap_addn( - new_beam_size, - new_distances_i, - perm.data(), - cent_distances_i, - nullptr, - beam_size * K); - } - heap_reorder(new_beam_size, new_distances_i, 
perm.data()); - - for (size_t j = 0; j < new_beam_size; j++) { - int js = perm[j] / K; - int ls = perm[j] % K; - if (m > 0) { - memcpy(new_codes_i, codes_i + js * m, sizeof(*codes) * m); + heap_reorder(new_beam_size, new_distances_i, perm.data()); + + for (size_t j = 0; j < new_beam_size; j++) { + int js = perm[j] / K; + int ls = perm[j] % K; + if (m > 0) { + memcpy(new_codes_i, + codes_i + js * m, + sizeof(*codes) * m); + } + new_codes_i[m] = ls; + new_codes_i += m + 1; + fvec_sub( + d, + residuals_i + js * d, + cent + ls * d, + new_residuals_i); + new_residuals_i += d; } - new_codes_i[m] = ls; - new_codes_i += m + 1; - fvec_sub( - d, - residuals_i + js * d, - cent + ls * d, - new_residuals_i); - new_residuals_i += d; } } - } + }); } // exposed in the faiss namespace @@ -380,20 +365,21 @@ void beam_search_encode_step_tab( { FAISS_THROW_IF_NOT(ldc >= K); + // Resolve SIMD level once, not per iteration of the n-parallel loop. + with_simd_level_256bit([&]() { #pragma omp parallel for if (n > 100) schedule(dynamic) - for (int64_t i = 0; i < static_cast(n); i++) { - std::vector cent_distances(beam_size * K); - std::vector cd_common(K); + for (int64_t i = 0; i < static_cast(n); i++) { + std::vector cent_distances(beam_size * K); + std::vector cd_common(K); - const int32_t* codes_i = codes + i * m * beam_size; - const float* query_cp_i = query_cp + i * ldqc; - const float* distances_i = distances + i * beam_size; + const int32_t* codes_i = codes + i * m * beam_size; + const float* query_cp_i = query_cp + i * ldqc; + const float* distances_i = distances + i * beam_size; - for (size_t k = 0; k < K; k++) { - cd_common[k] = cent_norms_i[k] - 2 * query_cp_i[k]; - } + for (size_t k = 0; k < K; k++) { + cd_common[k] = cent_norms_i[k] - 2 * query_cp_i[k]; + } - with_simd_level_256bit([&]() { if constexpr (SL == SIMDLevel::NONE) { compute_cent_distances_baseline( K, @@ -419,64 +405,40 @@ void beam_search_encode_step_tab( cd_common.data(), cent_distances.data()); } - }); - using C 
= CMax; - int32_t* new_codes_i = new_codes + i * (m + 1) * new_beam_size; - float* new_distances_i = new_distances + i * new_beam_size; - const float* cent_distances_i = cent_distances.data(); + using C = CMax; + int32_t* new_codes_i = new_codes + i * (m + 1) * new_beam_size; + float* new_distances_i = new_distances + i * new_beam_size; - // then we have to select the best results - for (size_t j = 0; j < new_beam_size; j++) { - new_distances_i[j] = C::neutral(); - } - std::vector perm(new_beam_size, -1); + const float* cent_distances_i = cent_distances.data(); + + // then we have to select the best results + for (size_t j = 0; j < new_beam_size; j++) { + new_distances_i[j] = C::neutral(); + } + std::vector perm(new_beam_size, -1); - auto approx = [&]() { - HeapWithBuckets::bs_addn( + approx_topk_by_mode( + approx_topk_mode, beam_size, K, cent_distances_i, new_beam_size, new_distances_i, perm.data()); - }; - switch (approx_topk_mode) { - case ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B8_D3: - approx.template operator()<8, 3>(); - break; - case ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B8_D2: - approx.template operator()<8, 2>(); - break; - case ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B16_D2: - approx.template operator()<16, 2>(); - break; - case ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B32_D2: - approx.template operator()<32, 2>(); - break; - default: - heap_addn( - new_beam_size, - new_distances_i, - perm.data(), - cent_distances_i, - nullptr, - beam_size * K); - break; - } - - heap_reorder(new_beam_size, new_distances_i, perm.data()); + heap_reorder(new_beam_size, new_distances_i, perm.data()); - for (size_t j = 0; j < new_beam_size; j++) { - int js = perm[j] / K; - int ls = perm[j] % K; - if (m > 0) { - memcpy(new_codes_i, codes_i + js * m, sizeof(*codes) * m); + for (size_t j = 0; j < new_beam_size; j++) { + int js = perm[j] / K; + int ls = perm[j] % K; + if (m > 0) { + memcpy(new_codes_i, codes_i + js * m, sizeof(*codes) * m); + } + new_codes_i[m] = ls; + new_codes_i 
+= m + 1; } - new_codes_i[m] = ls; - new_codes_i += m + 1; } - } + }); } /******************************************************************** diff --git a/thirdparty/faiss/faiss/impl/scalar_quantizer/scanners.h b/thirdparty/faiss/faiss/impl/scalar_quantizer/scanners.h index cecb578be..6f63f6e24 100644 --- a/thirdparty/faiss/faiss/impl/scalar_quantizer/scanners.h +++ b/thirdparty/faiss/faiss/impl/scalar_quantizer/scanners.h @@ -159,6 +159,32 @@ InvertedListScanner* sq_select_InvertedListScanner( const IDSelector* sel, bool by_residual); +/// Scanner for QT_0bit / centroid-only distance: always returns the +/// coarse distance that was set via set_list(). +struct IVFCoarseDistanceScanner : InvertedListScanner { + float coarse_dis = 0; + + IVFCoarseDistanceScanner( + bool is_similarity, + bool store_pairs, + const IDSelector* sel) + : InvertedListScanner(store_pairs, sel) { + code_size = 0; + keep_max = is_similarity; + } + + void set_query(const float* /*query_vector*/) override {} + + void set_list(idx_t list_no_in, float coarse_dis_in) override { + this->list_no = list_no_in; + this->coarse_dis = coarse_dis_in; + } + + float distance_to_code(const uint8_t* /*code*/) const override { + return coarse_dis; + } +}; + } // namespace scalar_quantizer } // namespace faiss diff --git a/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp b/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp index ac31c99cc..708288bb3 100644 --- a/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +++ b/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp @@ -387,6 +387,43 @@ struct DCTemplate : SQDistanceComputer { float query_to_code(const uint8_t* code) const final { return compute_distance(q, code); } + + void query_to_codes_batch_4( + const uint8_t* code_0, + const uint8_t* code_1, + const uint8_t* code_2, + const uint8_t* code_3, + float& dis0, + float& dis1, + float& dis2, + float& dis3) const final { + Similarity sim0(q); + Similarity sim1(q); + Similarity 
sim2(q); + Similarity sim3(q); + + sim0.begin_8(); + sim1.begin_8(); + sim2.begin_8(); + sim3.begin_8(); + + for (size_t i = 0; i < quant.d; i += 8) { + const int ii = static_cast(i); + simd8float32 xi0 = quant.reconstruct_8_components(code_0, ii); + simd8float32 xi1 = quant.reconstruct_8_components(code_1, ii); + simd8float32 xi2 = quant.reconstruct_8_components(code_2, ii); + simd8float32 xi3 = quant.reconstruct_8_components(code_3, ii); + sim0.add_8_components(xi0); + sim1.add_8_components(xi1); + sim2.add_8_components(xi2); + sim3.add_8_components(xi3); + } + + dis0 = sim0.result_8(); + dis1 = sim1.result_8(); + dis2 = sim2.result_8(); + dis3 = sim3.result_8(); + } }; template diff --git a/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp b/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp index f85f8bbb8..77abffb36 100644 --- a/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +++ b/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp @@ -368,6 +368,42 @@ struct DCTemplate float query_to_code(const uint8_t* code) const final { return compute_distance(q, code); } + + void query_to_codes_batch_4( + const uint8_t* code_0, + const uint8_t* code_1, + const uint8_t* code_2, + const uint8_t* code_3, + float& dis0, + float& dis1, + float& dis2, + float& dis3) const final { + Similarity sim0(q); + Similarity sim1(q); + Similarity sim2(q); + Similarity sim3(q); + + sim0.begin_16(); + sim1.begin_16(); + sim2.begin_16(); + sim3.begin_16(); + + for (size_t i = 0; i < quant.d; i += 16) { + simd16float32 xi0 = quant.reconstruct_16_components(code_0, i); + simd16float32 xi1 = quant.reconstruct_16_components(code_1, i); + simd16float32 xi2 = quant.reconstruct_16_components(code_2, i); + simd16float32 xi3 = quant.reconstruct_16_components(code_3, i); + sim0.add_16_components(xi0); + sim1.add_16_components(xi1); + sim2.add_16_components(xi2); + sim3.add_16_components(xi3); + } + + dis0 = sim0.result_16(); + dis1 = sim1.result_16(); + dis2 = 
sim2.result_16(); + dis3 = sim3.result_16(); + } }; template diff --git a/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h b/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h index 9a10e75a4..02c09244c 100644 --- a/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +++ b/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h @@ -85,6 +85,9 @@ ScalarQuantizer::SQuantizer* sq_select_quantizer( return new Quantizer8bitDirect(d, trained); case ScalarQuantizer::QT_8bit_direct_signed: return new Quantizer8bitDirectSigned(d, trained); + case ScalarQuantizer::QT_0bit: + FAISS_THROW_MSG( + "QT_0bit does not support standalone quantization, use IndexIVFScalarQuantizer"); default: FAISS_THROW_MSG("unknown qtype"); } @@ -175,6 +178,9 @@ SQDistanceComputer* select_distance_computer_body( case ScalarQuantizer::QT_8bit_direct_signed: return new DCTemplate, Sim, SL2>( d, trained); + case ScalarQuantizer::QT_0bit: + FAISS_THROW_MSG( + "QT_0bit does not support standalone distance computation, use IndexIVFScalarQuantizer"); default: FAISS_THROW_MSG("unknown qtype"); } @@ -309,6 +315,9 @@ InvertedListScanner* sq_select_InvertedListScanner( Quantizer8bitDirectSigned, Similarity, SL2>>(); + case ScalarQuantizer::QT_0bit: + return new IVFCoarseDistanceScanner( + Similarity::metric_type != METRIC_L2, store_pairs, sel); default: FAISS_THROW_MSG("unknown qtype"); } diff --git a/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp b/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp index 7d895941d..9be9f4a31 100644 --- a/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +++ b/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp @@ -358,6 +358,42 @@ struct DCTemplate float query_to_code(const uint8_t* code) const final { return compute_distance(q, code); } + + void query_to_codes_batch_4( + const uint8_t* code_0, + const uint8_t* code_1, + const uint8_t* code_2, + const uint8_t* code_3, + float& dis0, + float& dis1, + float& dis2, + float& 
dis3) const final { + Similarity sim0(q); + Similarity sim1(q); + Similarity sim2(q); + Similarity sim3(q); + + sim0.begin_8(); + sim1.begin_8(); + sim2.begin_8(); + sim3.begin_8(); + + for (size_t i = 0; i < quant.d; i += 8) { + simd8float32 xi0 = quant.reconstruct_8_components(code_0, i); + simd8float32 xi1 = quant.reconstruct_8_components(code_1, i); + simd8float32 xi2 = quant.reconstruct_8_components(code_2, i); + simd8float32 xi3 = quant.reconstruct_8_components(code_3, i); + sim0.add_8_components(xi0); + sim1.add_8_components(xi1); + sim2.add_8_components(xi2); + sim3.add_8_components(xi3); + } + + dis0 = sim0.result_8(); + dis1 = sim1.result_8(); + dis2 = sim2.result_8(); + dis3 = sim3.result_8(); + } }; template diff --git a/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp b/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp new file mode 100644 index 000000000..6fe99f8d1 --- /dev/null +++ b/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp @@ -0,0 +1,311 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifdef COMPILE_SIMD_RISCV_RVV + +#include +#include +#include +#include +#include + +#include +#include + +namespace faiss { + +namespace scalar_quantizer { + +/************************************************************************* + * Marker specializations. + * + * Unlike x86/NEON sq-*.cpp files that expose a fixed 8-wide / 16-wide codec + * interface (reconstruct_8_components / reconstruct_16_components), RVV is + * variable-width: the native vector length is implementation-defined and + * queried at runtime via __riscv_vsetvl. Forcing RVV into a fixed-width + * codec would leave performance on the table on wider hardware. 
+ * + * So the strategy here is: Codec / Quantizer / Similarity classes for + * RISCV_RVV act as opaque TAG TYPES — they only need to be complete types + * so that baseline's sq-dispatch.h can form template arguments like + * `DCTemplate, UNIFORM, RISCV_RVV>, + * SimilarityL2, RISCV_RVV>`. + * + * The real SIMD work lives in full DCTemplate specializations below. + * Unspecialized combinations fall through to scalar via the fallback + * `DCTemplate : DCTemplate`. + ************************************************************************/ + +template <> +struct Codec8bit : Codec8bit {}; + +template <> +struct Codec4bit : Codec4bit {}; + +template <> +struct Codec6bit : Codec6bit {}; + +template +struct QuantizerTemplate< + Codec, + QuantizerTemplateScaling::UNIFORM, + SIMDLevel::RISCV_RVV> + : QuantizerTemplate< + Codec, + QuantizerTemplateScaling::UNIFORM, + SIMDLevel::NONE> { + QuantizerTemplate(size_t d, const std::vector& trained) + : QuantizerTemplate< + Codec, + QuantizerTemplateScaling::UNIFORM, + SIMDLevel::NONE>(d, trained) {} +}; + +template +struct QuantizerTemplate< + Codec, + QuantizerTemplateScaling::NON_UNIFORM, + SIMDLevel::RISCV_RVV> + : QuantizerTemplate< + Codec, + QuantizerTemplateScaling::NON_UNIFORM, + SIMDLevel::NONE> { + QuantizerTemplate(size_t d, const std::vector& trained) + : QuantizerTemplate< + Codec, + QuantizerTemplateScaling::NON_UNIFORM, + SIMDLevel::NONE>(d, trained) {} +}; + +template <> +struct QuantizerFP16 : QuantizerFP16 { + QuantizerFP16(size_t d, const std::vector& trained) + : QuantizerFP16(d, trained) {} +}; + +template <> +struct QuantizerBF16 : QuantizerBF16 { + QuantizerBF16(size_t d, const std::vector& trained) + : QuantizerBF16(d, trained) {} +}; + +template <> +struct Quantizer8bitDirect + : Quantizer8bitDirect { + Quantizer8bitDirect(size_t d, const std::vector& trained) + : Quantizer8bitDirect(d, trained) {} +}; + +template <> +struct Quantizer8bitDirectSigned + : Quantizer8bitDirectSigned { + 
Quantizer8bitDirectSigned(size_t d, const std::vector& trained) + : Quantizer8bitDirectSigned(d, trained) {} +}; + +template <> +struct SimilarityL2 : SimilarityL2 { + using SimilarityL2::SimilarityL2; +}; + +template <> +struct SimilarityIP : SimilarityIP { + using SimilarityIP::SimilarityIP; +}; + +/************************************************************************* + * Fallback DCTemplate / DistanceComputerByte for RISCV_RVV. + * + * Inheriting from the NONE specialization means every (Quantizer, Similarity) + * combination that does NOT have a hand-tuned RVV full specialization below + * falls through to scalar code. Callers and the dispatcher don't know or care. + ************************************************************************/ + +template +struct DCTemplate + : DCTemplate { + using Base = DCTemplate; + using Base::Base; +}; + +template +struct DistanceComputerByte + : DistanceComputerByte { + using Base = DistanceComputerByte; + using Base::Base; +}; + +/************************************************************************* + * Fast path — QT_4bit_uniform + L2 + * + * 4-bit UNIFORM scaling: every component reconstructs as an affine function + * of the 4-bit code, + * recon(c) = vmin + vdiff * (c + 0.5) / 15 = final_scale * c + bias + * where final_scale = vdiff / 15. L2 distance between two reconstructions + * therefore reduces to final_scale^2 * (q_c - c_c)^2 over integer codes, + * so we can stay in the int domain and pay one float multiply at the end. + * + * The RVV path pre-nibbles the query into q_lo / q_hi (even / odd lanes) + * once at set_query time and then processes native-VL-sized chunks of code + * without ever decoding to float. 
+ ************************************************************************/ + +template <> +struct DCTemplate< + QuantizerTemplate< + Codec4bit, + QuantizerTemplateScaling::UNIFORM, + SIMDLevel::RISCV_RVV>, + SimilarityL2, + SIMDLevel::RISCV_RVV> : SQDistanceComputer { + using Sim = SimilarityL2; + + size_t d; + float vmin; + float vdiff; + float final_scale_sq; + std::vector q_lo; + std::vector q_hi; + + DCTemplate(size_t d_in, const std::vector& trained) + : d(d_in), + vmin(trained[0]), + vdiff(trained[1]), + q_lo((d_in + 1) / 2, 0), + q_hi((d_in + 1) / 2, 0) { + const float final_scale = vdiff / 15.0f; + final_scale_sq = final_scale * final_scale; + } + + void set_query(const float* x) final { + this->q = x; + const float inv_scale = (vdiff == 0.0f) ? 0.0f : 15.0f / vdiff; + for (size_t i = 0; i < d; i++) { + float val = (x[i] - vmin) * inv_scale; + int code = static_cast(std::floor(val + 0.5f)); + if (code < 0) { + code = 0; + } + if (code > 15) { + code = 15; + } + if (i % 2 == 0) { + q_lo[i / 2] = static_cast(code); + } else { + q_hi[i / 2] = static_cast(code); + } + } + } + + /// Squared integer-domain L2 between pre-nibbled q and packed 4-bit code. + /// Uses RVV's native VL; no fixed width assumptions. Returns the raw + /// integer sum — caller multiplies by final_scale_sq. + int64_t accumulate_int_l2(const uint8_t* code) const { + int64_t acc = 0; + size_t i = 0; + while (i < d) { + // Process up to vl codes per iteration. Each code byte packs two + // 4-bit codes, so we load (vl + 1) / 2 bytes; keep vl even to + // keep the nibble split aligned with the i % 2 split we used at + // set_query time. 
+ size_t remaining = d - i; + size_t vl = __riscv_vsetvl_e8m1(remaining); + if (vl & 1) { + vl -= 1; // keep even; tail handled on next iter or scalar + } + if (vl == 0) { + break; + } + const size_t byte_vl = vl / 2; + + vuint8m1_t packed = __riscv_vle8_v_u8m1(code + i / 2, byte_vl); + vuint8m1_t ql = __riscv_vle8_v_u8m1(q_lo.data() + i / 2, byte_vl); + vuint8m1_t qh = __riscv_vle8_v_u8m1(q_hi.data() + i / 2, byte_vl); + + vuint8m1_t lo_nib = __riscv_vand_vx_u8m1(packed, 0x0F, byte_vl); + vuint8m1_t hi_nib = __riscv_vsrl_vx_u8m1(packed, 4, byte_vl); + + // |ql - lo| and |qh - hi| fit in u8 (values are in [0, 15]). + vuint8m1_t d_lo = __riscv_vsub_vv_u8m1( + __riscv_vmaxu_vv_u8m1(ql, lo_nib, byte_vl), + __riscv_vminu_vv_u8m1(ql, lo_nib, byte_vl), + byte_vl); + vuint8m1_t d_hi = __riscv_vsub_vv_u8m1( + __riscv_vmaxu_vv_u8m1(qh, hi_nib, byte_vl), + __riscv_vminu_vv_u8m1(qh, hi_nib, byte_vl), + byte_vl); + + // Square via widening multiply (each byte squared fits in u16, + // since max byte value is 15 -> 225). + vuint16m2_t sq_lo = __riscv_vwmulu_vv_u16m2(d_lo, d_lo, byte_vl); + vuint16m2_t sq_hi = __riscv_vwmulu_vv_u16m2(d_hi, d_hi, byte_vl); + vuint16m2_t sq_sum = __riscv_vadd_vv_u16m2(sq_lo, sq_hi, byte_vl); + + // Reduce to a scalar u32 (safe: byte_vl * 450 fits in u32 for + // any realistic d). + vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0, 1); + vuint32m1_t red = + __riscv_vwredsumu_vs_u16m2_u32m1(sq_sum, zero, byte_vl); + acc += __riscv_vmv_x_s_u32m1_u32(red); + + i += vl; + } + // Scalar tail: cover any leftover odd lane (at most one). + for (; i < d; i++) { + uint8_t c_code = + (i % 2 == 0) ? (code[i / 2] & 0x0F) : (code[i / 2] >> 4); + uint8_t q_code = (i % 2 == 0) ? 
q_lo[i / 2] : q_hi[i / 2]; + int diff = int(q_code) - int(c_code); + acc += diff * diff; + } + return acc; + } + + float query_to_code(const uint8_t* code) const final { + return static_cast(accumulate_int_l2(code)) * final_scale_sq; + } + + float symmetric_dis(idx_t i, idx_t j) override { + // Not on the critical path for most workloads; reconstruct both + // codes into nibbles scalar-style and compute squared distance. + const uint8_t* c1 = codes + i * code_size; + const uint8_t* c2 = codes + j * code_size; + int64_t acc = 0; + for (size_t k = 0; k < d; k++) { + uint8_t a = (k % 2 == 0) ? (c1[k / 2] & 0x0F) : (c1[k / 2] >> 4); + uint8_t b = (k % 2 == 0) ? (c2[k / 2] & 0x0F) : (c2[k / 2] >> 4); + int diff = int(a) - int(b); + acc += diff * diff; + } + return static_cast(acc) * final_scale_sq; + } + + void query_to_codes_batch_4( + const uint8_t* code_0, + const uint8_t* code_1, + const uint8_t* code_2, + const uint8_t* code_3, + float& dis0, + float& dis1, + float& dis2, + float& dis3) const final { + // Simple 4x unroll of the single-code path; good enough as a first + // cut — gives ILP across the four independent accumulate loops. 
+ dis0 = static_cast(accumulate_int_l2(code_0)) * final_scale_sq; + dis1 = static_cast(accumulate_int_l2(code_1)) * final_scale_sq; + dis2 = static_cast(accumulate_int_l2(code_2)) * final_scale_sq; + dis3 = static_cast(accumulate_int_l2(code_3)) * final_scale_sq; + } +}; + +} // namespace scalar_quantizer +} // namespace faiss + +#define THE_LEVEL_TO_DISPATCH SIMDLevel::RISCV_RVV +#include + +#endif // COMPILE_SIMD_RISCV_RVV diff --git a/thirdparty/faiss/faiss/impl/simd_dispatch.h b/thirdparty/faiss/faiss/impl/simd_dispatch.h index bfd27bc35..b18bc5b4a 100644 --- a/thirdparty/faiss/faiss/impl/simd_dispatch.h +++ b/thirdparty/faiss/faiss/impl/simd_dispatch.h @@ -101,6 +101,14 @@ inline auto with_selected_simd_levels(LambdaType&& action) { } [[fallthrough]]; #endif + +#ifdef COMPILE_SIMD_RISCV_RVV + case SIMDLevel::RISCV_RVV: + if constexpr (available_levels & (1 << int(SIMDLevel::RISCV_RVV))) { + return action.template operator()(); + } + [[fallthrough]]; +#endif default: return action.template operator()(); } diff --git a/thirdparty/faiss/faiss/impl/simdlib/simdlib_dispatch.h b/thirdparty/faiss/faiss/impl/simdlib/simdlib_dispatch.h index 5a8a0aaba..c4be6732a 100644 --- a/thirdparty/faiss/faiss/impl/simdlib/simdlib_dispatch.h +++ b/thirdparty/faiss/faiss/impl/simdlib/simdlib_dispatch.h @@ -38,23 +38,7 @@ #endif -// Convenience aliases: bare names resolve to the current TU's SIMD level. -// Generic code uses SINGLE_SIMD_LEVEL (= NONE in DD, compiled-in in static). -// Per-SIMD TUs should define their own aliases with the concrete level. 
- -namespace faiss { - -// 256-bit -using simd256bit = simd256bit_tpl; -using simd16uint16 = simd16uint16_tpl; -using simd32uint8 = simd32uint8_tpl; -using simd8uint32 = simd8uint32_tpl; -using simd8float32 = simd8float32_tpl; - -// 512-bit (AVX512_SPR maps to AVX512 — same 512-bit integer ops) -using simd512bit = simd512bit_tpl; -using simd32uint16 = simd32uint16_tpl; -using simd64uint8 = simd64uint8_tpl; -using simd16float32 = simd16float32_tpl; - -} // namespace faiss +// No global bare-name aliases (simd16uint16, simd32uint8, etc.) — each file +// that needs them must declare its own `using` with an explicit SIMD level. +// This prevents per-ISA TUs from accidentally picking up SINGLE_SIMD_LEVEL +// (= NONE in DD mode) when they should use THE_SIMD_LEVEL. diff --git a/thirdparty/faiss/faiss/index_factory.cpp b/thirdparty/faiss/faiss/index_factory.cpp index 1e66227ae..1e0eb5b78 100644 --- a/thirdparty/faiss/faiss/index_factory.cpp +++ b/thirdparty/faiss/faiss/index_factory.cpp @@ -154,9 +154,10 @@ std::map sq_types = { {"SQbf16", ScalarQuantizer::QT_bf16}, {"SQ8_direct_signed", ScalarQuantizer::QT_8bit_direct_signed}, {"SQ8_direct", ScalarQuantizer::QT_8bit_direct}, + {"SQ0", ScalarQuantizer::QT_0bit}, }; const std::string sq_pattern = - "(SQ4|SQ8|SQ6|SQfp16|SQbf16|SQ8_direct_signed|SQ8_direct)"; + "(SQ0|SQ4|SQ8|SQ6|SQfp16|SQbf16|SQ8_direct_signed|SQ8_direct)"; std::map aq_search_type = { {"_Nfloat", AdditiveQuantizer::ST_norm_float}, diff --git a/thirdparty/faiss/faiss/invlists/InvertedLists.cpp b/thirdparty/faiss/faiss/invlists/InvertedLists.cpp index 6f8e88ef3..7e3216c74 100644 --- a/thirdparty/faiss/faiss/invlists/InvertedLists.cpp +++ b/thirdparty/faiss/faiss/invlists/InvertedLists.cpp @@ -289,7 +289,9 @@ size_t ArrayInvertedLists::add_entries( ids[list_no].resize(o + n_entry); memcpy(&ids[list_no][o], ids_in, sizeof(ids_in[0]) * n_entry); codes[list_no].resize((o + n_entry) * code_size); - memcpy(&codes[list_no][o * code_size], code, code_size * n_entry); 
+ if (code_size > 0) { + memcpy(&codes[list_no][o * code_size], code, code_size * n_entry); + } return o; } @@ -328,7 +330,11 @@ void ArrayInvertedLists::update_entries( assert(list_no < nlist); assert(n_entry + offset <= ids[list_no].size()); memcpy(&ids[list_no][offset], ids_in, sizeof(ids_in[0]) * n_entry); - memcpy(&codes[list_no][offset * code_size], codes_in, code_size * n_entry); + if (code_size > 0) { + memcpy(&codes[list_no][offset * code_size], + codes_in, + code_size * n_entry); + } } void ArrayInvertedLists::permute_invlists(const idx_t* map) { diff --git a/thirdparty/faiss/faiss/invlists/InvertedLists.h b/thirdparty/faiss/faiss/invlists/InvertedLists.h index 743bcad62..7f1933143 100644 --- a/thirdparty/faiss/faiss/invlists/InvertedLists.h +++ b/thirdparty/faiss/faiss/invlists/InvertedLists.h @@ -28,6 +28,24 @@ struct InvertedListsIterator { virtual bool is_available() const = 0; virtual void next() = 0; virtual std::pair get_id_and_codes() = 0; + + /// When true, iterate_codes will invoke on_distance_computed() and + /// on_heap_changed() via virtual dispatch. When false (the default), + /// iterate_codes skips the callbacks entirely — the guard branch is + /// perfectly predicted and costs ~0 cycles, so non-callback users + /// pay no overhead. Derived classes that override the callbacks + /// should set this to true in their constructor. + bool has_search_callbacks_ = false; + + /// Called from iterate_codes after distance computation for the vector + /// returned by the most recent get_id_and_codes(). Default: no-op. + /// Only invoked when has_search_callbacks_ is true. + virtual void on_distance_computed(idx_t /* vid */, float /* distance */) {} + + /// Called from iterate_codes when a vector replaces the current worst + /// in the top-K heap. evicted_id is the displaced entry. Default: no-op. + /// Only invoked when has_search_callbacks_ is true. 
+ virtual void on_heap_changed(idx_t /* new_id */, idx_t /* evicted_id */) {} }; /** Table of inverted lists diff --git a/thirdparty/faiss/faiss/python/swigfaiss.swig b/thirdparty/faiss/faiss/python/swigfaiss.swig index 8ff744c05..2e821e9ad 100644 --- a/thirdparty/faiss/faiss/python/swigfaiss.swig +++ b/thirdparty/faiss/faiss/python/swigfaiss.swig @@ -203,7 +203,6 @@ typedef uint64_t size_t; #include #include -#include #include #include #include @@ -668,7 +667,6 @@ void gpu_sync_all_devices() %include %include -%include %include %include %include @@ -827,6 +825,7 @@ void gpu_sync_all_devices() DOWNCAST ( IndexHNSWFlat ) DOWNCAST ( IndexHNSWPQ ) DOWNCAST ( IndexHNSWSQ ) + DOWNCAST ( IndexHNSWCagra ) DOWNCAST ( IndexHNSW ) DOWNCAST ( IndexHNSW2Level ) DOWNCAST ( IndexNNDescentFlat ) diff --git a/thirdparty/faiss/faiss/svs/IndexSVSVamana.h b/thirdparty/faiss/faiss/svs/IndexSVSVamana.h index 081f4c0e4..c4981a9cb 100644 --- a/thirdparty/faiss/faiss/svs/IndexSVSVamana.h +++ b/thirdparty/faiss/faiss/svs/IndexSVSVamana.h @@ -49,6 +49,7 @@ enum SVSStorageKind { SVS_LeanVec4x4, SVS_LeanVec4x8, SVS_LeanVec8x8, + SVS_count, }; inline svs_runtime::StorageKind to_svs_storage_kind(SVSStorageKind kind) { @@ -86,7 +87,7 @@ struct IndexSVSVamana : Index { size_t max_candidate_pool_size = 200; bool use_full_search_history = true; - SVSStorageKind storage_kind; + SVSStorageKind storage_kind = SVS_FP32; IndexSVSVamana(); diff --git a/thirdparty/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h b/thirdparty/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h index 18a232ead..5f9a2063e 100644 --- a/thirdparty/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +++ b/thirdparty/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h @@ -16,6 +16,10 @@ namespace faiss { +// Explicit SIMD-level aliases for this file (no global bare aliases). 
+using simd8uint32 = simd8uint32_tpl; +using simd16uint16 = simd16uint16_tpl; + // HeapWithBucketsForHamming32 uses simd8uint32 under the // hood. diff --git a/thirdparty/faiss/faiss/utils/partitioning.cpp b/thirdparty/faiss/faiss/utils/partitioning.cpp index fd2c6d770..f200cd077 100644 --- a/thirdparty/faiss/faiss/utils/partitioning.cpp +++ b/thirdparty/faiss/faiss/utils/partitioning.cpp @@ -10,9 +10,11 @@ #include #include #include +#include +#include #include -#include +#include #include #include @@ -218,529 +220,9 @@ typename C::T partition_fuzzy_median3( return thresh; } -} // namespace partitioning - -/****************************************************************** - * SIMD routines when vals is an aligned array of uint16_t - ******************************************************************/ - -namespace simd_partitioning { - -void find_minimax( - const uint16_t* vals, - size_t n, - uint16_t& smin, - uint16_t& smax) { - simd16uint16 vmin(0xffff), vmax(0); - for (size_t i = 0; i + 15 < n; i += 16) { - simd16uint16 v(vals + i); - vmin.accu_min(v); - vmax.accu_max(v); - } - - ALIGNED(32) uint16_t tab32[32]; - vmin.store(tab32); - vmax.store(tab32 + 16); - - smin = tab32[0], smax = tab32[16]; - - for (int i = 1; i < 16; i++) { - smin = std::min(smin, tab32[i]); - smax = std::max(smax, tab32[i + 16]); - } - - // missing values - for (size_t i = (n & ~15); i < n; i++) { - smin = std::min(smin, vals[i]); - smax = std::max(smax, vals[i]); - } -} - -// max func differentiates between CMin and CMax (keep lowest or largest) -template -simd16uint16 max_func(simd16uint16 v, simd16uint16 thr16) { - constexpr bool is_max = C::is_max; - if (is_max) { - return max(v, thr16); - } else { - return min(v, thr16); - } -} - -template -void count_lt_and_eq( - const uint16_t* vals, - int n, - uint16_t thresh, - size_t& n_lt, - size_t& n_eq) { - n_lt = n_eq = 0; - simd16uint16 thr16(thresh); - - size_t n1 = n / 16; - - for (size_t i = 0; i < n1; i++) { - simd16uint16 v(vals); - 
vals += 16; - simd16uint16 eqmask = (v == thr16); - simd16uint16 max2 = max_func(v, thr16); - simd16uint16 gemask = (v == max2); - uint32_t bits = get_MSBs(uint16_to_uint8_saturate(eqmask, gemask)); - int i_eq = __builtin_popcount(bits & 0x00ff00ff); - int i_ge = __builtin_popcount(bits) - i_eq; - n_eq += i_eq; - n_lt += 16 - i_ge; - } - - for (int i = n1 * 16; i < n; i++) { - uint16_t v = *vals++; - if (C::cmp(thresh, v)) { - n_lt++; - } else if (v == thresh) { - n_eq++; - } - } -} - -/* compress separated values and ids table, keeping all values < thresh and at - * most n_eq equal values */ -template -int simd_compress_array( - uint16_t* vals, - typename C::TI* ids, - size_t n, - uint16_t thresh, - int n_eq) { - simd16uint16 thr16(thresh); - simd16uint16 mixmask(0xff00); - - int wp = 0; - size_t i0; - - // loop while there are eqs to collect - for (i0 = 0; i0 + 15 < n && n_eq > 0; i0 += 16) { - simd16uint16 v(vals + i0); - simd16uint16 max2 = max_func(v, thr16); - simd16uint16 gemask = (v == max2); - simd16uint16 eqmask = (v == thr16); - uint32_t bits = get_MSBs( - blendv(simd32uint8(eqmask), - simd32uint8(gemask), - simd32uint8(mixmask))); - bits ^= 0xAAAAAAAA; - // bit 2*i : eq - // bit 2*i + 1 : lt - - while (bits) { - int j = __builtin_ctz(bits) & (~1); - bool is_eq = (bits >> j) & 1; - bool is_lt = (bits >> j) & 2; - bits &= ~(3 << j); - j >>= 1; - - if (is_lt) { - vals[wp] = vals[i0 + j]; - ids[wp] = ids[i0 + j]; - wp++; - } else if (is_eq && n_eq > 0) { - vals[wp] = vals[i0 + j]; - ids[wp] = ids[i0 + j]; - wp++; - n_eq--; - } - } - } - - // handle remaining, only strictly lt ones. 
- for (; i0 + 15 < n; i0 += 16) { - simd16uint16 v(vals + i0); - simd16uint16 max2 = max_func(v, thr16); - simd16uint16 gemask = (v == max2); - uint32_t bits = ~get_MSBs(simd32uint8(gemask)); - - while (bits) { - int j = __builtin_ctz(bits); - bits &= ~(3 << j); - j >>= 1; - - vals[wp] = vals[i0 + j]; - ids[wp] = ids[i0 + j]; - wp++; - } - } - - // end with scalar - for (size_t i = (n & ~size_t(15)); i < n; i++) { - if (C::cmp(thresh, vals[i])) { - vals[wp] = vals[i]; - ids[wp] = ids[i]; - wp++; - } else if (vals[i] == thresh && n_eq > 0) { - vals[wp] = vals[i]; - ids[wp] = ids[i]; - wp++; - n_eq--; - } - } - assert(n_eq == 0); - return wp; -} - -// #define MICRO_BENCHMARK - -static uint64_t get_cy() { -#ifdef MICRO_BENCHMARK - uint32_t high, low; - asm volatile("rdtsc \n\t" : "=a"(low), "=d"(high)); - return ((uint64_t)high << 32) | (low); -#else - return 0; -#endif -} - -#define IFV if (false) - -template -uint16_t simd_partition_fuzzy_with_bounds( - uint16_t* vals, - typename C::TI* ids, - size_t n, - size_t q_min, - size_t q_max, - size_t* q_out, - uint16_t s0i, - uint16_t s1i) { - if (q_min == 0) { - if (q_out) { - *q_out = 0; - } - return 0; - } - if (q_max >= n) { - if (q_out) { - *q_out = q_max; - } - return 0xffff; - } - if (s0i == s1i) { - if (q_out) { - *q_out = q_min; - } - return s0i; - } - uint64_t t0 = get_cy(); - - // lower bound inclusive, upper exclusive - size_t s0 = s0i, s1 = s1i + 1; - - IFV printf("bounds: %zu %zu\n", s0, s1 - 1); - - int thresh; - size_t n_eq = 0, n_lt = 0; - size_t q = 0; - - for (int it = 0; it < 200; it++) { - // while(s0 + 1 < s1) { - thresh = (s0 + s1) / 2; - count_lt_and_eq(vals, n, thresh, n_lt, n_eq); - - IFV printf( - " [%zu %zu] thresh=%d n_lt=%zu n_eq=%zu, q=%zu:%zu/%zu\n", - s0, - s1, - thresh, - n_lt, - n_eq, - q_min, - q_max, - n); - if (n_lt <= q_min) { - if (n_lt + n_eq >= q_min) { - q = q_min; - break; - } else { - if (C::is_max) { - s0 = thresh; - } else { - s1 = thresh; - } - } - } else if (n_lt <= q_max) { 
- q = n_lt; - break; - } else { - if (C::is_max) { - s1 = thresh; - } else { - s0 = thresh; - } - } - } - - uint64_t t1 = get_cy(); - - // number of equal values to keep - int64_t n_eq_1 = q - n_lt; - - IFV printf( - "shrink: thresh=%d q=%zu n_eq_1=%" PRId64 "\n", thresh, q, n_eq_1); - if (n_eq_1 < 0) { // happens when > q elements are at lower bound - assert(s0 + 1 == s1); - q = q_min; - if (C::is_max) { - thresh--; - } else { - thresh++; - } - n_eq_1 = q; - IFV printf( - " override: thresh=%d n_eq_1=%" PRId64 "\n", thresh, n_eq_1); - } else { - assert(n_eq_1 <= n_eq); - } - - size_t wp = simd_compress_array(vals, ids, n, thresh, n_eq_1); - - IFV printf("wp=%zu\n", wp); - assert(wp == q); - if (q_out) { - *q_out = q; - } - - uint64_t t2 = get_cy(); - - partition_stats.bisect_cycles += t1 - t0; - partition_stats.compress_cycles += t2 - t1; - - return thresh; -} - -template -uint16_t simd_partition_fuzzy_with_bounds_histogram( - uint16_t* vals, - typename C::TI* ids, - size_t n, - size_t q_min, - size_t q_max, - size_t* q_out, - uint16_t s0i, - uint16_t s1i) { - if (q_min == 0) { - if (q_out) { - *q_out = 0; - } - return 0; - } - if (q_max >= n) { - if (q_out) { - *q_out = q_max; - } - return 0xffff; - } - if (s0i == s1i) { - if (q_out) { - *q_out = q_min; - } - return s0i; - } - - IFV printf( - "partition fuzzy, q=%ld:%ld / %ld, bounds=%d %d\n", - q_min, - q_max, - n, - s0i, - s1i); - - if (!C::is_max) { - IFV printf( - "revert due to CMin, q_min:q_max -> %ld:%ld\n", q_min, q_max); - q_min = n - q_min; - q_max = n - q_max; - } - - // lower and upper bound of range, inclusive - int s0 = s0i, s1 = s1i; - // number of values < s0 and > s1 - size_t n_lt = 0, n_gt = 0; - - // output of loop: - int thresh; // final threshold - uint64_t tot_eq = 0; // total nb of equal values - uint64_t n_eq = 0; // nb of equal values to keep - size_t q; // final quantile - - // buffer for the histograms - int hist[16]; - - for (int it = 0; it < 20; it++) { - // otherwise we would be done 
already - - int shift = 0; - - IFV printf( - " it %d bounds: %d %d n_lt=%ld n_gt=%ld\n", - it, - s0, - s1, - n_lt, - n_gt); - - int maxval = s1 - s0; - - while (maxval > 15) { - shift++; - maxval >>= 1; - } - - IFV printf( - " histogram shift %d maxval %d ?= %d\n", - shift, - maxval, - int((s1 - s0) >> shift)); - - if (maxval > 7) { - simd_histogram_16(vals, n, s0, shift, hist); - } else { - simd_histogram_8(vals, n, s0, shift, hist); - } - IFV { - int sum = n_lt + n_gt; - printf(" n_lt=%ld hist=[", n_lt); - for (int i = 0; i <= maxval; i++) { - printf("%d ", hist[i]); - sum += hist[i]; - } - printf("] n_gt=%ld sum=%d\n", n_gt, sum); - assert(sum == n); - } - - size_t sum_below = n_lt; - int i; - for (i = 0; i <= maxval; i++) { - sum_below += hist[i]; - if (sum_below >= q_min) { - break; - } - } - IFV printf(" i=%d sum_below=%ld\n", i, sum_below); - if (i <= maxval) { - s0 = s0 + (i << shift); - s1 = s0 + (1 << shift) - 1; - n_lt = sum_below - hist[i]; - n_gt = n - sum_below; - } else { - assert(false && "not implemented"); - } - - IFV printf( - " new bin: s0=%d s1=%d n_lt=%ld n_gt=%ld\n", - s0, - s1, - n_lt, - n_gt); - - if (s1 > s0) { - if (n_lt >= q_min && q_max >= n_lt) { - IFV printf(" FOUND1\n"); - thresh = s0; - q = n_lt; - break; - } - - size_t n_lt_2 = n - n_gt; - if (n_lt_2 >= q_min && q_max >= n_lt_2) { - thresh = s1 + 1; - q = n_lt_2; - IFV printf(" FOUND2\n"); - break; - } - } else { - thresh = s0; - q = q_min; - tot_eq = n - n_gt - n_lt; - n_eq = q_min - n_lt; - IFV printf(" FOUND3\n"); - break; - } - } - - IFV printf("end bisection: thresh=%d q=%ld n_eq=%ld\n", thresh, q, n_eq); +#undef IFV - if (!C::is_max) { - if (n_eq == 0) { - thresh--; - } else { - // thresh unchanged - n_eq = tot_eq - n_eq; - } - q = n - q; - IFV printf("revert due to CMin, q->%ld n_eq->%ld\n", q, n_eq); - } - - size_t wp = simd_compress_array(vals, ids, n, thresh, n_eq); - IFV printf("wp=%ld ?= %ld\n", wp, q); - assert(wp == q); - if (q_out) { - *q_out = wp; - } - - return 
thresh; -} - -template -uint16_t simd_partition_fuzzy( - uint16_t* vals, - typename C::TI* ids, - size_t n, - size_t q_min, - size_t q_max, - size_t* q_out) { - assert(is_aligned_pointer(vals)); - - uint16_t s0i, s1i; - find_minimax(vals, n, s0i, s1i); - // QSelect_stats.t0 += get_cy() - t0; - - return simd_partition_fuzzy_with_bounds( - vals, ids, n, q_min, q_max, q_out, s0i, s1i); -} - -template -uint16_t simd_partition( - uint16_t* vals, - typename C::TI* ids, - size_t n, - size_t q) { - assert(is_aligned_pointer(vals)); - - if (q == 0) { - return 0; - } - if (q >= n) { - return 0xffff; - } - - uint16_t s0i, s1i; - find_minimax(vals, n, s0i, s1i); - - return simd_partition_fuzzy_with_bounds( - vals, ids, n, q, q, nullptr, s0i, s1i); -} - -template -uint16_t simd_partition_with_bounds( - uint16_t* vals, - typename C::TI* ids, - size_t n, - size_t q, - uint16_t s0i, - uint16_t s1i) { - return simd_partition_fuzzy_with_bounds( - vals, ids, n, q, q, nullptr, s0i, s1i); -} - -} // namespace simd_partitioning +} // namespace partitioning /****************************************************************** * Driver routine @@ -754,13 +236,20 @@ typename C::T partition_fuzzy( size_t q_min, size_t q_max, size_t* q_out) { -#ifdef __AVX2__ constexpr bool is_uint16 = std::is_same::value; - if (is_uint16 && is_aligned_pointer(vals)) { - return simd_partitioning::simd_partition_fuzzy( - (uint16_t*)vals, ids, n, q_min, q_max, q_out); + if constexpr (is_uint16) { + if (is_aligned_pointer(vals)) { + return with_simd_level_256bit([&]() -> typename C::T { + if constexpr (SL == SIMDLevel::NONE) { + return partitioning::partition_fuzzy_median3( + vals, ids, n, q_min, q_max, q_out); + } else { + return partition_fuzzy_simd( + (uint16_t*)vals, ids, n, q_min, q_max, q_out); + } + }); + } } -#endif return partitioning::partition_fuzzy_median3( vals, ids, n, q_min, q_max, q_out); } @@ -816,457 +305,12 @@ template uint16_t partition_fuzzy>( size_t* q_out); 
/****************************************************************** - * Histogram subroutines + * Histogram subroutines — scalar fallbacks ******************************************************************/ -#if defined(__AVX2__) || defined(__aarch64__) -/// FIXME when MSB of uint16 is set -// this code does not compile properly with GCC 7.4.0 - namespace { -/************************************************************ - * 8 bins - ************************************************************/ - -simd32uint8 accu4to8(simd16uint16 a4) { - simd16uint16 mask4(0x0f0f); - - simd16uint16 a8_0 = a4 & mask4; - simd16uint16 a8_1 = (a4 >> 4) & mask4; - - return simd32uint8(hadd(a8_0, a8_1)); -} - -simd16uint16 accu8to16(simd32uint8 a8) { - simd16uint16 mask8(0x00ff); - - simd16uint16 a8_0 = simd16uint16(a8) & mask8; - simd16uint16 a8_1 = (simd16uint16(a8) >> 8) & mask8; - - return hadd(a8_0, a8_1); -} - -static const simd32uint8 shifts = simd32uint8::create< - 1, - 16, - 0, - 0, - 4, - 64, - 0, - 0, - 0, - 0, - 1, - 16, - 0, - 0, - 4, - 64, - 1, - 16, - 0, - 0, - 4, - 64, - 0, - 0, - 0, - 0, - 1, - 16, - 0, - 0, - 4, - 64>(); - -// 2-bit accumulator: we can add only up to 3 elements -// on output we return 2*4-bit results -// preproc returns either an index in 0..7 or 0xffff -// that yields a 0 when used in the table look-up -template -void compute_accu2( - const uint16_t*& data, - Preproc& pp, - simd16uint16& a4lo, - simd16uint16& a4hi) { - simd16uint16 mask2(0x3333); - simd16uint16 a2((uint16_t)0); // 2-bit accu - for (int j = 0; j < N; j++) { - simd16uint16 v(data); - data += 16; - v = pp(v); - // 0x800 -> force second half of table - simd16uint16 idx = v | (v << 8) | simd16uint16(0x800); - a2 += simd16uint16(shifts.lookup_2_lanes(simd32uint8(idx))); - } - a4lo += a2 & mask2; - a4hi += (a2 >> 2) & mask2; -} - -template -simd16uint16 histogram_8(const uint16_t* data, Preproc pp, size_t n_in) { - assert(n_in % 16 == 0); - int n = n_in / 16; - - simd32uint8 a8lo(0); - 
simd32uint8 a8hi(0); - - for (int i0 = 0; i0 < n; i0 += 15) { - simd16uint16 a4lo(0); // 4-bit accus - simd16uint16 a4hi(0); - - int i1 = std::min(i0 + 15, n); - int i; - for (i = i0; i + 2 < i1; i += 3) { - compute_accu2<3>(data, pp, a4lo, a4hi); // adds 3 max - } - switch (i1 - i) { - case 2: - compute_accu2<2>(data, pp, a4lo, a4hi); - break; - case 1: - compute_accu2<1>(data, pp, a4lo, a4hi); - break; - } - - a8lo += accu4to8(a4lo); - a8hi += accu4to8(a4hi); - } - - // move to 16-bit accu - simd16uint16 a16lo = accu8to16(a8lo); - simd16uint16 a16hi = accu8to16(a8hi); - - simd16uint16 a16 = hadd(a16lo, a16hi); - - // the 2 lanes must still be combined - return a16; -} - -/************************************************************ - * 16 bins - ************************************************************/ - -static const simd32uint8 shifts2 = simd32uint8::create< - 1, - 2, - 4, - 8, - 16, - 32, - 64, - 128, - 1, - 2, - 4, - 8, - 16, - 32, - 64, - 128, - 1, - 2, - 4, - 8, - 16, - 32, - 64, - 128, - 1, - 2, - 4, - 8, - 16, - 32, - 64, - 128>(); - -simd32uint8 shiftr_16(simd32uint8 x, int n) { - return simd32uint8(simd16uint16(x) >> n); -} - -// 2-bit accumulator: we can add only up to 3 elements -// on output we return 2*4-bit results -template -void compute_accu2_16( - const uint16_t*& data, - Preproc pp, - simd32uint8& a4_0, - simd32uint8& a4_1, - simd32uint8& a4_2, - simd32uint8& a4_3) { - simd32uint8 mask1(0x55); - simd32uint8 a2_0; // 2-bit accu - simd32uint8 a2_1; // 2-bit accu - a2_0.clear(); - a2_1.clear(); - - for (int j = 0; j < N; j++) { - simd16uint16 v(data); - data += 16; - v = pp(v); - - simd16uint16 idx = v | (v << 8); - simd32uint8 a1 = shifts2.lookup_2_lanes(simd32uint8(idx)); - // contains 0s for out-of-bounds elements - - simd16uint16 lt8 = (v >> 3) == simd16uint16(0); - lt8 = lt8 ^ simd16uint16(0xff00); - - a1 = a1 & lt8; - - a2_0 += a1 & mask1; - a2_1 += shiftr_16(a1, 1) & mask1; - } - simd32uint8 mask2(0x33); - - a4_0 += a2_0 & mask2; - a4_1 
+= a2_1 & mask2; - a4_2 += shiftr_16(a2_0, 2) & mask2; - a4_3 += shiftr_16(a2_1, 2) & mask2; -} - -simd32uint8 accu4to8_2(simd32uint8 a4_0, simd32uint8 a4_1) { - simd32uint8 mask4(0x0f); - - simd16uint16 a8_0 = combine2x2( - (simd16uint16)(a4_0 & mask4), - (simd16uint16)(shiftr_16(a4_0, 4) & mask4)); - - simd16uint16 a8_1 = combine2x2( - (simd16uint16)(a4_1 & mask4), - (simd16uint16)(shiftr_16(a4_1, 4) & mask4)); - - return simd32uint8(hadd(a8_0, a8_1)); -} - -template -simd16uint16 histogram_16(const uint16_t* data, Preproc pp, size_t n_in) { - assert(n_in % 16 == 0); - int n = n_in / 16; - - simd32uint8 a8lo((uint8_t)0); - simd32uint8 a8hi((uint8_t)0); - - for (int i0 = 0; i0 < n; i0 += 7) { - simd32uint8 a4_0(0); // 0, 4, 8, 12 - simd32uint8 a4_1(0); // 1, 5, 9, 13 - simd32uint8 a4_2(0); // 2, 6, 10, 14 - simd32uint8 a4_3(0); // 3, 7, 11, 15 - - int i1 = std::min(i0 + 7, n); - int i; - for (i = i0; i + 2 < i1; i += 3) { - compute_accu2_16<3>(data, pp, a4_0, a4_1, a4_2, a4_3); - } - switch (i1 - i) { - case 2: - compute_accu2_16<2>(data, pp, a4_0, a4_1, a4_2, a4_3); - break; - case 1: - compute_accu2_16<1>(data, pp, a4_0, a4_1, a4_2, a4_3); - break; - } - - a8lo += accu4to8_2(a4_0, a4_1); - a8hi += accu4to8_2(a4_2, a4_3); - } - - // move to 16-bit accu - simd16uint16 a16lo = accu8to16(a8lo); - simd16uint16 a16hi = accu8to16(a8hi); - - simd16uint16 a16 = hadd(a16lo, a16hi); - - a16 = simd16uint16{simd8uint32{a16}.unzip()}; - - return a16; -} - -struct PreprocNOP { - simd16uint16 operator()(simd16uint16 x) { - return x; - } -}; - -template -struct PreprocMinShift { - simd16uint16 min16; - simd16uint16 max16; - - explicit PreprocMinShift(uint16_t min) { - min16.set1(min); - int vmax0 = std::min((nbin << shift) + min, 65536); - uint16_t vmax = uint16_t(vmax0 - 1 - min); - max16.set1(vmax); // vmax inclusive - } - - simd16uint16 operator()(simd16uint16 x) { - x = x - min16; - simd16uint16 mask = (x == max(x, max16)) - (x == max16); - return (x >> shift) | mask; - } 
-}; - -/* unbounded versions of the functions */ - -void simd_histogram_8_unbounded(const uint16_t* data, int n, int* hist) { - PreprocNOP pp; - simd16uint16 a16 = histogram_8(data, pp, (n & ~15)); - - ALIGNED(32) uint16_t a16_tab[16]; - a16.store(a16_tab); - - for (int i = 0; i < 8; i++) { - hist[i] = a16_tab[i] + a16_tab[i + 8]; - } - - for (int i = (n & ~15); i < n; i++) { - hist[data[i]]++; - } -} - -void simd_histogram_16_unbounded(const uint16_t* data, int n, int* hist) { - simd16uint16 a16 = histogram_16(data, PreprocNOP(), (n & ~15)); - - ALIGNED(32) uint16_t a16_tab[16]; - a16.store(a16_tab); - - for (int i = 0; i < 16; i++) { - hist[i] = a16_tab[i]; - } - - for (int i = (n & ~15); i < n; i++) { - hist[data[i]]++; - } -} - -} // anonymous namespace - -/************************************************************ - * Driver routines - ************************************************************/ - -void simd_histogram_8( - const uint16_t* data, - int n, - uint16_t min, - int shift, - int* hist) { - if (shift < 0) { - simd_histogram_8_unbounded(data, n, hist); - return; - } - - simd16uint16 a16; - -#define DISPATCH(s) \ - case s: \ - a16 = histogram_8(data, PreprocMinShift(min), (n & ~15)); \ - break - - switch (shift) { - DISPATCH(0); - DISPATCH(1); - DISPATCH(2); - DISPATCH(3); - DISPATCH(4); - DISPATCH(5); - DISPATCH(6); - DISPATCH(7); - DISPATCH(8); - DISPATCH(9); - DISPATCH(10); - DISPATCH(11); - DISPATCH(12); - DISPATCH(13); - default: - FAISS_THROW_FMT("dispatch for shift=%d not instantiated", shift); - } -#undef DISPATCH - - ALIGNED(32) uint16_t a16_tab[16]; - a16.store(a16_tab); - - for (int i = 0; i < 8; i++) { - hist[i] = a16_tab[i] + a16_tab[i + 8]; - } - - // complete with remaining bins - for (int i = (n & ~15); i < n; i++) { - if (data[i] < min) { - continue; - } - uint16_t v = data[i] - min; - v >>= shift; - if (v < 8) { - hist[v]++; - } - } -} - -void simd_histogram_16( - const uint16_t* data, - int n, - uint16_t min, - int shift, - int* 
hist) { - if (shift < 0) { - simd_histogram_16_unbounded(data, n, hist); - return; - } - - simd16uint16 a16; - -#define DISPATCH(s) \ - case s: \ - a16 = histogram_16(data, PreprocMinShift(min), (n & ~15)); \ - break - - switch (shift) { - DISPATCH(0); - DISPATCH(1); - DISPATCH(2); - DISPATCH(3); - DISPATCH(4); - DISPATCH(5); - DISPATCH(6); - DISPATCH(7); - DISPATCH(8); - DISPATCH(9); - DISPATCH(10); - DISPATCH(11); - DISPATCH(12); - default: - FAISS_THROW_FMT("dispatch for shift=%d not instantiated", shift); - } -#undef DISPATCH - - ALIGNED(32) uint16_t a16_tab[16]; - a16.store(a16_tab); - - for (int i = 0; i < 16; i++) { - hist[i] = a16_tab[i]; - } - - for (int i = (n & ~15); i < n; i++) { - if (data[i] < min) { - continue; - } - uint16_t v = data[i] - min; - v >>= shift; - if (v < 16) { - hist[v]++; - } - } -} - -// no AVX2 -#else - -void simd_histogram_16( +void simd_histogram_16_scalar( const uint16_t* data, int n, uint16_t min, @@ -1288,18 +332,11 @@ void simd_histogram_16( continue; v >>= shift; hist[v]++; - - /* - if (data[i] < min) continue; - uint16_t v = data[i] - min; - v >>= shift; - if (v < 16) hist[v]++; - */ } } } -void simd_histogram_8( +void simd_histogram_8_scalar( const uint16_t* data, int n, uint16_t min, @@ -1322,7 +359,41 @@ void simd_histogram_8( } } -#endif +} // anonymous namespace + +/****************************************************************** + * Histogram subroutines — dispatch to SIMD or scalar + ******************************************************************/ + +void simd_histogram_8( + const uint16_t* data, + int n, + uint16_t min, + int shift, + int* hist) { + with_simd_level_256bit([&]() { + if constexpr (SL == SIMDLevel::NONE) { + simd_histogram_8_scalar(data, n, min, shift, hist); + } else { + faiss::simd_histogram_8(data, n, min, shift, hist); + } + }); +} + +void simd_histogram_16( + const uint16_t* data, + int n, + uint16_t min, + int shift, + int* hist) { + with_simd_level_256bit([&]() { + if constexpr (SL == 
SIMDLevel::NONE) { + simd_histogram_16_scalar(data, n, min, shift, hist); + } else { + faiss::simd_histogram_16(data, n, min, shift, hist); + } + }); +} void PartitionStats::reset() { memset(this, 0, sizeof(*this)); diff --git a/thirdparty/faiss/faiss/utils/partitioning.h b/thirdparty/faiss/faiss/utils/partitioning.h index 3076f21cc..274de8372 100644 --- a/thirdparty/faiss/faiss/utils/partitioning.h +++ b/thirdparty/faiss/faiss/utils/partitioning.h @@ -11,6 +11,8 @@ #include #include +#include +#include namespace faiss { @@ -58,6 +60,35 @@ void simd_histogram_16( int shift, int* hist); +/** SIMD-dispatched partition for aligned uint16_t arrays. + * Specializations live in per-ISA TUs (partitioning_avx2.cpp, etc.). */ +template +typename C::T partition_fuzzy_simd( + uint16_t* vals, + typename C::TI* ids, + size_t n, + size_t q_min, + size_t q_max, + size_t* q_out); + +/** SIMD-dispatched histogram functions. + * Specializations live in per-ISA TUs. */ +template +void simd_histogram_8( + const uint16_t* data, + int n, + uint16_t min, + int shift, + int* hist); + +template +void simd_histogram_16( + const uint16_t* data, + int n, + uint16_t min, + int shift, + int* hist); + struct PartitionStats { uint64_t bisect_cycles; uint64_t compress_cycles; diff --git a/thirdparty/faiss/faiss/utils/prefetch.h b/thirdparty/faiss/faiss/utils/prefetch.h index 7afe58aba..666b31976 100644 --- a/thirdparty/faiss/faiss/utils/prefetch.h +++ b/thirdparty/faiss/faiss/utils/prefetch.h @@ -9,9 +9,9 @@ // prefetches -#ifdef __AVX__ +#if defined(__x86_64__) || defined(_M_X64) -// AVX +// x86_64 #include diff --git a/thirdparty/faiss/faiss/utils/quantize_lut.h b/thirdparty/faiss/faiss/utils/quantize_lut.h index fc9e2dc01..9adaad39d 100644 --- a/thirdparty/faiss/faiss/utils/quantize_lut.h +++ b/thirdparty/faiss/faiss/utils/quantize_lut.h @@ -50,7 +50,7 @@ void round_uint8_per_column_multi( /** LUT quantization to uint8 and bias to uint16. 
* - * (nprobe, M, ksub, lut_is_3d) determine the size of the the LUT + * (nprobe, M, ksub, lut_is_3d) determine the size of the LUT * * LUT input: * - 2D size (M, ksub): single matrix per probe (lut_is_3d=false) diff --git a/thirdparty/faiss/faiss/utils/simd_impl/partitioning_avx2.cpp b/thirdparty/faiss/faiss/utils/simd_impl/partitioning_avx2.cpp new file mode 100644 index 000000000..197097eba --- /dev/null +++ b/thirdparty/faiss/faiss/utils/simd_impl/partitioning_avx2.cpp @@ -0,0 +1,14 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifdef COMPILE_SIMD_AVX2 + +#define THE_SIMD_LEVEL SIMDLevel::AVX2 +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include + +#endif // COMPILE_SIMD_AVX2 diff --git a/thirdparty/faiss/faiss/utils/simd_impl/partitioning_neon.cpp b/thirdparty/faiss/faiss/utils/simd_impl/partitioning_neon.cpp new file mode 100644 index 000000000..c8bb7bda1 --- /dev/null +++ b/thirdparty/faiss/faiss/utils/simd_impl/partitioning_neon.cpp @@ -0,0 +1,14 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifdef COMPILE_SIMD_ARM_NEON + +#define THE_SIMD_LEVEL SIMDLevel::ARM_NEON +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include + +#endif // COMPILE_SIMD_ARM_NEON diff --git a/thirdparty/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h b/thirdparty/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h new file mode 100644 index 000000000..c70f51da8 --- /dev/null +++ b/thirdparty/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h @@ -0,0 +1,1084 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +/** + * @file partitioning_simdlib256.h + * @brief Per-SIMD TU implementation of partitioning and histogram functions. + * + * This header is included once per SIMD TU with THE_SIMD_LEVEL set to the + * desired SIMDLevel. It uses simdlib 256-bit wrappers (simd16uint16, etc.) + * which are locally aliased to THE_SIMD_LEVEL — the global aliases from + * simdlib_dispatch.h resolve through SINGLE_SIMD_LEVEL (= NONE in DD mode) + * and must NOT be used directly in per-ISA TU code. + * + * Usage (in a per-SIMD .cpp file): + * #define THE_SIMD_LEVEL SIMDLevel::AVX2 + * #include + */ + +#ifndef THE_SIMD_LEVEL +#error "Define THE_SIMD_LEVEL before including this header" +#endif + +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include + +namespace faiss { + +namespace { + +// ── Per-TU SIMD type aliases ────────────────────────────────────────── +// The global aliases (simd16uint16 etc.) from simdlib_dispatch.h resolve +// through SINGLE_SIMD_LEVEL, which is NONE in DD mode on x86 — meaning +// scalar emulation even in per-ISA TUs compiled with -mavx2. +// We shadow them here with THE_SIMD_LEVEL so the implementation actually +// uses the SIMD level this TU was compiled for. 
+static constexpr SIMDLevel THE_SL_256 = + simd256_level_selector::value; +static_assert( + THE_SL_256 == SIMDLevel::NONE || THE_SL_256 == SIMDLevel::AVX2 || + THE_SL_256 == SIMDLevel::ARM_NEON, + "simd256_level_selector must yield a 256-bit (or scalar) level"); +using simd256bit = simd256bit_tpl; +using simd16uint16 = simd16uint16_tpl; +using simd32uint8 = simd32uint8_tpl; +using simd8uint32 = simd8uint32_tpl; +using simd8float32 = simd8float32_tpl; + +/****************************************************************** + * SIMD routines when vals is an aligned array of uint16_t + ******************************************************************/ + +namespace simd_partitioning { + +void find_minimax( + const uint16_t* vals, + size_t n, + uint16_t& smin, + uint16_t& smax) { + simd16uint16 vmin(0xffff), vmax(0); + for (size_t i = 0; i + 15 < n; i += 16) { + simd16uint16 v(vals + i); + vmin.accu_min(v); + vmax.accu_max(v); + } + + ALIGNED(32) uint16_t tab32[32]; + vmin.store(tab32); + vmax.store(tab32 + 16); + + smin = tab32[0], smax = tab32[16]; + + for (int i = 1; i < 16; i++) { + smin = std::min(smin, tab32[i]); + smax = std::max(smax, tab32[i + 16]); + } + + // missing values + for (size_t i = (n & ~15); i < n; i++) { + smin = std::min(smin, vals[i]); + smax = std::max(smax, vals[i]); + } +} + +// max func differentiates between CMin and CMax (keep lowest or largest) +template +simd16uint16 max_func(simd16uint16 v, simd16uint16 thr16) { + constexpr bool is_max = C::is_max; + if (is_max) { + return max(v, thr16); + } else { + return min(v, thr16); + } +} + +template +void count_lt_and_eq( + const uint16_t* vals, + int n, + uint16_t thresh, + size_t& n_lt, + size_t& n_eq) { + n_lt = n_eq = 0; + simd16uint16 thr16(thresh); + + size_t n1 = n / 16; + + for (size_t i = 0; i < n1; i++) { + simd16uint16 v(vals); + vals += 16; + simd16uint16 eqmask = (v == thr16); + simd16uint16 max2 = max_func(v, thr16); + simd16uint16 gemask = (v == max2); + uint32_t bits = 
get_MSBs(uint16_to_uint8_saturate(eqmask, gemask)); + int i_eq = __builtin_popcount(bits & 0x00ff00ff); + int i_ge = __builtin_popcount(bits) - i_eq; + n_eq += i_eq; + n_lt += 16 - i_ge; + } + + for (int i = n1 * 16; i < n; i++) { + uint16_t v = *vals++; + if (C::cmp(thresh, v)) { + n_lt++; + } else if (v == thresh) { + n_eq++; + } + } +} + +/* compress separated values and ids table, keeping all values < thresh and at + * most n_eq equal values */ +template +int simd_compress_array( + uint16_t* vals, + typename C::TI* ids, + size_t n, + uint16_t thresh, + int n_eq) { + simd16uint16 thr16(thresh); + simd16uint16 mixmask(0xff00); + + int wp = 0; + size_t i0; + + // loop while there are eqs to collect + for (i0 = 0; i0 + 15 < n && n_eq > 0; i0 += 16) { + simd16uint16 v(vals + i0); + simd16uint16 max2 = max_func(v, thr16); + simd16uint16 gemask = (v == max2); + simd16uint16 eqmask = (v == thr16); + uint32_t bits = get_MSBs( + blendv(simd32uint8(eqmask), + simd32uint8(gemask), + simd32uint8(mixmask))); + bits ^= 0xAAAAAAAA; + // bit 2*i : eq + // bit 2*i + 1 : lt + + while (bits) { + int j = __builtin_ctz(bits) & (~1); + bool is_eq = (bits >> j) & 1; + bool is_lt = (bits >> j) & 2; + bits &= ~(3 << j); + j >>= 1; + + if (is_lt) { + vals[wp] = vals[i0 + j]; + ids[wp] = ids[i0 + j]; + wp++; + } else if (is_eq && n_eq > 0) { + vals[wp] = vals[i0 + j]; + ids[wp] = ids[i0 + j]; + wp++; + n_eq--; + } + } + } + + // handle remaining, only strictly lt ones. 
+ for (; i0 + 15 < n; i0 += 16) { + simd16uint16 v(vals + i0); + simd16uint16 max2 = max_func(v, thr16); + simd16uint16 gemask = (v == max2); + uint32_t bits = ~get_MSBs(simd32uint8(gemask)); + + while (bits) { + int j = __builtin_ctz(bits); + bits &= ~(3 << j); + j >>= 1; + + vals[wp] = vals[i0 + j]; + ids[wp] = ids[i0 + j]; + wp++; + } + } + + // end with scalar + for (size_t i = (n & ~size_t(15)); i < n; i++) { + if (C::cmp(thresh, vals[i])) { + vals[wp] = vals[i]; + ids[wp] = ids[i]; + wp++; + } else if (vals[i] == thresh && n_eq > 0) { + vals[wp] = vals[i]; + ids[wp] = ids[i]; + wp++; + n_eq--; + } + } + assert(n_eq == 0); + return wp; +} + +// #define MICRO_BENCHMARK + +static uint64_t get_cy() { +#ifdef MICRO_BENCHMARK + uint32_t high, low; + asm volatile("rdtsc \n\t" : "=a"(low), "=d"(high)); + return ((uint64_t)high << 32) | (low); +#else + return 0; +#endif +} + +#define IFV if (false) + +template +uint16_t simd_partition_fuzzy_with_bounds( + uint16_t* vals, + typename C::TI* ids, + size_t n, + size_t q_min, + size_t q_max, + size_t* q_out, + uint16_t s0i, + uint16_t s1i) { + if (q_min == 0) { + if (q_out) { + *q_out = 0; + } + return 0; + } + if (q_max >= n) { + if (q_out) { + *q_out = q_max; + } + return 0xffff; + } + if (s0i == s1i) { + if (q_out) { + *q_out = q_min; + } + return s0i; + } + uint64_t t0 = get_cy(); + + // lower bound inclusive, upper exclusive + size_t s0 = s0i, s1 = s1i + 1; + + IFV printf("bounds: %zu %zu\n", s0, s1 - 1); + + int thresh; + size_t n_eq = 0, n_lt = 0; + size_t q = 0; + + for (int it = 0; it < 200; it++) { + // while(s0 + 1 < s1) { + thresh = (s0 + s1) / 2; + count_lt_and_eq(vals, n, thresh, n_lt, n_eq); + + IFV printf( + " [%zu %zu] thresh=%d n_lt=%zu n_eq=%zu, q=%zu:%zu/%zu\n", + s0, + s1, + thresh, + n_lt, + n_eq, + q_min, + q_max, + n); + if (n_lt <= q_min) { + if (n_lt + n_eq >= q_min) { + q = q_min; + break; + } else { + if (C::is_max) { + s0 = thresh; + } else { + s1 = thresh; + } + } + } else if (n_lt <= q_max) { 
+ q = n_lt; + break; + } else { + if (C::is_max) { + s1 = thresh; + } else { + s0 = thresh; + } + } + } + + uint64_t t1 = get_cy(); + + // number of equal values to keep + int64_t n_eq_1 = q - n_lt; + + IFV printf( + "shrink: thresh=%d q=%zu n_eq_1=%" PRId64 "\n", thresh, q, n_eq_1); + if (n_eq_1 < 0) { // happens when > q elements are at lower bound + assert(s0 + 1 == s1); + q = q_min; + if (C::is_max) { + thresh--; + } else { + thresh++; + } + n_eq_1 = q; + IFV printf( + " override: thresh=%d n_eq_1=%" PRId64 "\n", thresh, n_eq_1); + } else { + assert(n_eq_1 <= n_eq); + } + + size_t wp = simd_compress_array(vals, ids, n, thresh, n_eq_1); + + IFV printf("wp=%zu\n", wp); + assert(wp == q); + if (q_out) { + *q_out = q; + } + + uint64_t t2 = get_cy(); + + partition_stats.bisect_cycles += t1 - t0; + partition_stats.compress_cycles += t2 - t1; + + return thresh; +} + +// Forward declarations of histogram functions defined below +void local_simd_histogram_8( + const uint16_t* data, + int n, + uint16_t min, + int shift, + int* hist); +void local_simd_histogram_16( + const uint16_t* data, + int n, + uint16_t min, + int shift, + int* hist); + +template +uint16_t simd_partition_fuzzy_with_bounds_histogram( + uint16_t* vals, + typename C::TI* ids, + size_t n, + size_t q_min, + size_t q_max, + size_t* q_out, + uint16_t s0i, + uint16_t s1i) { + if (q_min == 0) { + if (q_out) { + *q_out = 0; + } + return 0; + } + if (q_max >= n) { + if (q_out) { + *q_out = q_max; + } + return 0xffff; + } + if (s0i == s1i) { + if (q_out) { + *q_out = q_min; + } + return s0i; + } + + IFV printf( + "partition fuzzy, q=%ld:%ld / %ld, bounds=%d %d\n", + q_min, + q_max, + n, + s0i, + s1i); + + if (!C::is_max) { + IFV printf( + "revert due to CMin, q_min:q_max -> %ld:%ld\n", q_min, q_max); + q_min = n - q_min; + q_max = n - q_max; + } + + // lower and upper bound of range, inclusive + int s0 = s0i, s1 = s1i; + // number of values < s0 and > s1 + size_t n_lt = 0, n_gt = 0; + + // output of loop: + int 
thresh; // final threshold + uint64_t tot_eq = 0; // total nb of equal values + uint64_t n_eq = 0; // nb of equal values to keep + size_t q; // final quantile + + // buffer for the histograms + int hist[16]; + + for (int it = 0; it < 20; it++) { + // otherwise we would be done already + + int shift = 0; + + IFV printf( + " it %d bounds: %d %d n_lt=%ld n_gt=%ld\n", + it, + s0, + s1, + n_lt, + n_gt); + + int maxval = s1 - s0; + + while (maxval > 15) { + shift++; + maxval >>= 1; + } + + IFV printf( + " histogram shift %d maxval %d ?= %d\n", + shift, + maxval, + int((s1 - s0) >> shift)); + + if (maxval > 7) { + local_simd_histogram_16(vals, n, s0, shift, hist); + } else { + local_simd_histogram_8(vals, n, s0, shift, hist); + } + IFV { + int sum = n_lt + n_gt; + printf(" n_lt=%ld hist=[", n_lt); + for (int i = 0; i <= maxval; i++) { + printf("%d ", hist[i]); + sum += hist[i]; + } + printf("] n_gt=%ld sum=%d\n", n_gt, sum); + assert(sum == n); + } + + size_t sum_below = n_lt; + int i; + for (i = 0; i <= maxval; i++) { + sum_below += hist[i]; + if (sum_below >= q_min) { + break; + } + } + IFV printf(" i=%d sum_below=%ld\n", i, sum_below); + if (i <= maxval) { + s0 = s0 + (i << shift); + s1 = s0 + (1 << shift) - 1; + n_lt = sum_below - hist[i]; + n_gt = n - sum_below; + } else { + assert(false && "not implemented"); + } + + IFV printf( + " new bin: s0=%d s1=%d n_lt=%ld n_gt=%ld\n", + s0, + s1, + n_lt, + n_gt); + + if (s1 > s0) { + if (n_lt >= q_min && q_max >= n_lt) { + IFV printf(" FOUND1\n"); + thresh = s0; + q = n_lt; + break; + } + + size_t n_lt_2 = n - n_gt; + if (n_lt_2 >= q_min && q_max >= n_lt_2) { + thresh = s1 + 1; + q = n_lt_2; + IFV printf(" FOUND2\n"); + break; + } + } else { + thresh = s0; + q = q_min; + tot_eq = n - n_gt - n_lt; + n_eq = q_min - n_lt; + IFV printf(" FOUND3\n"); + break; + } + } + + IFV printf("end bisection: thresh=%d q=%ld n_eq=%ld\n", thresh, q, n_eq); + + if (!C::is_max) { + if (n_eq == 0) { + thresh--; + } else { + // thresh unchanged + 
n_eq = tot_eq - n_eq; + } + q = n - q; + IFV printf("revert due to CMin, q->%ld n_eq->%ld\n", q, n_eq); + } + + size_t wp = simd_compress_array(vals, ids, n, thresh, n_eq); + IFV printf("wp=%ld ?= %ld\n", wp, q); + assert(wp == q); + if (q_out) { + *q_out = wp; + } + + return thresh; +} + +template +uint16_t simd_partition_fuzzy( + uint16_t* vals, + typename C::TI* ids, + size_t n, + size_t q_min, + size_t q_max, + size_t* q_out) { + assert(is_aligned_pointer(vals)); + + uint16_t s0i, s1i; + find_minimax(vals, n, s0i, s1i); + // QSelect_stats.t0 += get_cy() - t0; + + return simd_partition_fuzzy_with_bounds( + vals, ids, n, q_min, q_max, q_out, s0i, s1i); +} + +#undef IFV + +} // namespace simd_partitioning + +/****************************************************************** + * Histogram subroutines + ******************************************************************/ + +/************************************************************ + * 8 bins + ************************************************************/ + +simd32uint8 accu4to8(simd16uint16 a4) { + simd16uint16 mask4(0x0f0f); + + simd16uint16 a8_0 = a4 & mask4; + simd16uint16 a8_1 = (a4 >> 4) & mask4; + + return simd32uint8(hadd(a8_0, a8_1)); +} + +simd16uint16 accu8to16(simd32uint8 a8) { + simd16uint16 mask8(0x00ff); + + simd16uint16 a8_0 = simd16uint16(a8) & mask8; + simd16uint16 a8_1 = (simd16uint16(a8) >> 8) & mask8; + + return hadd(a8_0, a8_1); +} + +static const simd32uint8 shifts = simd32uint8::create< + 1, + 16, + 0, + 0, + 4, + 64, + 0, + 0, + 0, + 0, + 1, + 16, + 0, + 0, + 4, + 64, + 1, + 16, + 0, + 0, + 4, + 64, + 0, + 0, + 0, + 0, + 1, + 16, + 0, + 0, + 4, + 64>(); + +// 2-bit accumulator: we can add only up to 3 elements +// on output we return 2*4-bit results +// preproc returns either an index in 0..7 or 0xffff +// that yields a 0 when used in the table look-up +template +void compute_accu2( + const uint16_t*& data, + Preproc& pp, + simd16uint16& a4lo, + simd16uint16& a4hi) { + simd16uint16 
mask2(0x3333); + simd16uint16 a2((uint16_t)0); // 2-bit accu + for (int j = 0; j < N; j++) { + simd16uint16 v(data); + data += 16; + v = pp(v); + // 0x800 -> force second half of table + simd16uint16 idx = v | (v << 8) | simd16uint16(0x800); + a2 += simd16uint16(shifts.lookup_2_lanes(simd32uint8(idx))); + } + a4lo += a2 & mask2; + a4hi += (a2 >> 2) & mask2; +} + +template +simd16uint16 histogram_8(const uint16_t* data, Preproc pp, size_t n_in) { + assert(n_in % 16 == 0); + int n = n_in / 16; + + simd32uint8 a8lo(0); + simd32uint8 a8hi(0); + + for (int i0 = 0; i0 < n; i0 += 15) { + simd16uint16 a4lo(0); // 4-bit accus + simd16uint16 a4hi(0); + + int i1 = std::min(i0 + 15, n); + int i; + for (i = i0; i + 2 < i1; i += 3) { + compute_accu2<3>(data, pp, a4lo, a4hi); // adds 3 max + } + switch (i1 - i) { + case 2: + compute_accu2<2>(data, pp, a4lo, a4hi); + break; + case 1: + compute_accu2<1>(data, pp, a4lo, a4hi); + break; + } + + a8lo += accu4to8(a4lo); + a8hi += accu4to8(a4hi); + } + + // move to 16-bit accu + simd16uint16 a16lo = accu8to16(a8lo); + simd16uint16 a16hi = accu8to16(a8hi); + + simd16uint16 a16 = hadd(a16lo, a16hi); + + // the 2 lanes must still be combined + return a16; +} + +/************************************************************ + * 16 bins + ************************************************************/ + +static const simd32uint8 shifts2 = simd32uint8::create< + 1, + 2, + 4, + 8, + 16, + 32, + 64, + 128, + 1, + 2, + 4, + 8, + 16, + 32, + 64, + 128, + 1, + 2, + 4, + 8, + 16, + 32, + 64, + 128, + 1, + 2, + 4, + 8, + 16, + 32, + 64, + 128>(); + +simd32uint8 shiftr_16(simd32uint8 x, int n) { + return simd32uint8(simd16uint16(x) >> n); +} + +// 2-bit accumulator: we can add only up to 3 elements +// on output we return 2*4-bit results +template +void compute_accu2_16( + const uint16_t*& data, + Preproc pp, + simd32uint8& a4_0, + simd32uint8& a4_1, + simd32uint8& a4_2, + simd32uint8& a4_3) { + simd32uint8 mask1(0x55); + simd32uint8 a2_0; // 2-bit accu 
+ simd32uint8 a2_1; // 2-bit accu + a2_0.clear(); + a2_1.clear(); + + for (int j = 0; j < N; j++) { + simd16uint16 v(data); + data += 16; + v = pp(v); + + simd16uint16 idx = v | (v << 8); + simd32uint8 a1 = shifts2.lookup_2_lanes(simd32uint8(idx)); + // contains 0s for out-of-bounds elements + + simd16uint16 lt8 = (v >> 3) == simd16uint16(0); + lt8 = lt8 ^ simd16uint16(0xff00); + + a1 = a1 & lt8; + + a2_0 += a1 & mask1; + a2_1 += shiftr_16(a1, 1) & mask1; + } + simd32uint8 mask2(0x33); + + a4_0 += a2_0 & mask2; + a4_1 += a2_1 & mask2; + a4_2 += shiftr_16(a2_0, 2) & mask2; + a4_3 += shiftr_16(a2_1, 2) & mask2; +} + +simd32uint8 accu4to8_2(simd32uint8 a4_0, simd32uint8 a4_1) { + simd32uint8 mask4(0x0f); + + simd16uint16 a8_0 = combine2x2( + (simd16uint16)(a4_0 & mask4), + (simd16uint16)(shiftr_16(a4_0, 4) & mask4)); + + simd16uint16 a8_1 = combine2x2( + (simd16uint16)(a4_1 & mask4), + (simd16uint16)(shiftr_16(a4_1, 4) & mask4)); + + return simd32uint8(hadd(a8_0, a8_1)); +} + +template +simd16uint16 histogram_16(const uint16_t* data, Preproc pp, size_t n_in) { + assert(n_in % 16 == 0); + int n = n_in / 16; + + simd32uint8 a8lo((uint8_t)0); + simd32uint8 a8hi((uint8_t)0); + + for (int i0 = 0; i0 < n; i0 += 7) { + simd32uint8 a4_0(0); // 0, 4, 8, 12 + simd32uint8 a4_1(0); // 1, 5, 9, 13 + simd32uint8 a4_2(0); // 2, 6, 10, 14 + simd32uint8 a4_3(0); // 3, 7, 11, 15 + + int i1 = std::min(i0 + 7, n); + int i; + for (i = i0; i + 2 < i1; i += 3) { + compute_accu2_16<3>(data, pp, a4_0, a4_1, a4_2, a4_3); + } + switch (i1 - i) { + case 2: + compute_accu2_16<2>(data, pp, a4_0, a4_1, a4_2, a4_3); + break; + case 1: + compute_accu2_16<1>(data, pp, a4_0, a4_1, a4_2, a4_3); + break; + } + + a8lo += accu4to8_2(a4_0, a4_1); + a8hi += accu4to8_2(a4_2, a4_3); + } + + // move to 16-bit accu + simd16uint16 a16lo = accu8to16(a8lo); + simd16uint16 a16hi = accu8to16(a8hi); + + simd16uint16 a16 = hadd(a16lo, a16hi); + + a16 = simd16uint16{simd8uint32{a16}.unzip()}; + + return a16; +} + 
+struct PreprocNOP { + simd16uint16 operator()(simd16uint16 x) { + return x; + } +}; + +template +struct PreprocMinShift { + simd16uint16 min16; + simd16uint16 max16; + + explicit PreprocMinShift(uint16_t min) { + min16.set1(min); + int vmax0 = std::min((nbin << shift) + min, 65536); + uint16_t vmax = uint16_t(vmax0 - 1 - min); + max16.set1(vmax); // vmax inclusive + } + + simd16uint16 operator()(simd16uint16 x) { + x = x - min16; + simd16uint16 mask = (x == max(x, max16)) - (x == max16); + return (x >> shift) | mask; + } +}; + +/* unbounded versions of the functions */ + +void simd_histogram_8_unbounded(const uint16_t* data, int n, int* hist) { + PreprocNOP pp; + simd16uint16 a16 = histogram_8(data, pp, (n & ~15)); + + ALIGNED(32) uint16_t a16_tab[16]; + a16.store(a16_tab); + + for (int i = 0; i < 8; i++) { + hist[i] = a16_tab[i] + a16_tab[i + 8]; + } + + for (int i = (n & ~15); i < n; i++) { + hist[data[i]]++; + } +} + +void simd_histogram_16_unbounded(const uint16_t* data, int n, int* hist) { + simd16uint16 a16 = histogram_16(data, PreprocNOP(), (n & ~15)); + + ALIGNED(32) uint16_t a16_tab[16]; + a16.store(a16_tab); + + for (int i = 0; i < 16; i++) { + hist[i] = a16_tab[i]; + } + + for (int i = (n & ~15); i < n; i++) { + hist[data[i]]++; + } +} + +/************************************************************ + * Histogram driver routines + ************************************************************/ + +void local_simd_histogram_8( + const uint16_t* data, + int n, + uint16_t min, + int shift, + int* hist) { + if (shift < 0) { + simd_histogram_8_unbounded(data, n, hist); + return; + } + + simd16uint16 a16; + +#define DISPATCH(s) \ + case s: \ + a16 = histogram_8(data, PreprocMinShift(min), (n & ~15)); \ + break + + switch (shift) { + DISPATCH(0); + DISPATCH(1); + DISPATCH(2); + DISPATCH(3); + DISPATCH(4); + DISPATCH(5); + DISPATCH(6); + DISPATCH(7); + DISPATCH(8); + DISPATCH(9); + DISPATCH(10); + DISPATCH(11); + DISPATCH(12); + DISPATCH(13); + default: + 
FAISS_THROW_FMT("dispatch for shift=%d not instantiated", shift); + } +#undef DISPATCH + + ALIGNED(32) uint16_t a16_tab[16]; + a16.store(a16_tab); + + for (int i = 0; i < 8; i++) { + hist[i] = a16_tab[i] + a16_tab[i + 8]; + } + + // complete with remaining bins + for (int i = (n & ~15); i < n; i++) { + if (data[i] < min) { + continue; + } + uint16_t v = data[i] - min; + v >>= shift; + if (v < 8) { + hist[v]++; + } + } +} + +void local_simd_histogram_16( + const uint16_t* data, + int n, + uint16_t min, + int shift, + int* hist) { + if (shift < 0) { + simd_histogram_16_unbounded(data, n, hist); + return; + } + + simd16uint16 a16; + +#define DISPATCH(s) \ + case s: \ + a16 = histogram_16(data, PreprocMinShift(min), (n & ~15)); \ + break + + switch (shift) { + DISPATCH(0); + DISPATCH(1); + DISPATCH(2); + DISPATCH(3); + DISPATCH(4); + DISPATCH(5); + DISPATCH(6); + DISPATCH(7); + DISPATCH(8); + DISPATCH(9); + DISPATCH(10); + DISPATCH(11); + DISPATCH(12); + default: + FAISS_THROW_FMT("dispatch for shift=%d not instantiated", shift); + } +#undef DISPATCH + + ALIGNED(32) uint16_t a16_tab[16]; + a16.store(a16_tab); + + for (int i = 0; i < 16; i++) { + hist[i] = a16_tab[i]; + } + + for (int i = (n & ~15); i < n; i++) { + if (data[i] < min) { + continue; + } + uint16_t v = data[i] - min; + v >>= shift; + if (v < 16) { + hist[v]++; + } + } +} + +} // anonymous namespace + +/****************************************************************** + * Template specializations — entry points called from partitioning.cpp + ******************************************************************/ + +constexpr SIMDLevel SL = THE_SIMD_LEVEL; + +template <> +uint16_t partition_fuzzy_simd>( + uint16_t* vals, + int64_t* ids, + size_t n, + size_t q_min, + size_t q_max, + size_t* q_out) { + return simd_partitioning::simd_partition_fuzzy>( + vals, ids, n, q_min, q_max, q_out); +} + +template <> +uint16_t partition_fuzzy_simd>( + uint16_t* vals, + int64_t* ids, + size_t n, + size_t q_min, + size_t 
q_max, + size_t* q_out) { + return simd_partitioning::simd_partition_fuzzy>( + vals, ids, n, q_min, q_max, q_out); +} + +template <> +uint16_t partition_fuzzy_simd>( + uint16_t* vals, + int* ids, + size_t n, + size_t q_min, + size_t q_max, + size_t* q_out) { + return simd_partitioning::simd_partition_fuzzy>( + vals, ids, n, q_min, q_max, q_out); +} + +template <> +uint16_t partition_fuzzy_simd>( + uint16_t* vals, + int* ids, + size_t n, + size_t q_min, + size_t q_max, + size_t* q_out) { + return simd_partitioning::simd_partition_fuzzy>( + vals, ids, n, q_min, q_max, q_out); +} + +template <> +void simd_histogram_8( + const uint16_t* data, + int n, + uint16_t min, + int shift, + int* hist) { + local_simd_histogram_8(data, n, min, shift, hist); +} + +template <> +void simd_histogram_16( + const uint16_t* data, + int n, + uint16_t min, + int shift, + int* hist) { + local_simd_histogram_16(data, n, min, shift, hist); +} + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/utils/simd_levels.cpp b/thirdparty/faiss/faiss/utils/simd_levels.cpp index 1dc7e74ab..3402555bc 100644 --- a/thirdparty/faiss/faiss/utils/simd_levels.cpp +++ b/thirdparty/faiss/faiss/utils/simd_levels.cpp @@ -190,6 +190,12 @@ SIMDLevel SIMDConfig::auto_detect_simd_level() { } #endif +#if defined(__riscv) && defined(COMPILE_SIMD_RISCV_RVV) + // RVV is always available on RISC-V builds compiled with rv64gcv. 
+ supported_simd_levels |= (1 << static_cast(SIMDLevel::RISCV_RVV)); + detected_level = SIMDLevel::RISCV_RVV; +#endif + return detected_level; } @@ -259,6 +265,8 @@ SIMDLevel SIMDConfig::auto_detect_simd_level() { return SIMDLevel::ARM_SVE; #elif defined(COMPILE_SIMD_ARM_NEON) return SIMDLevel::ARM_NEON; +#elif defined(COMPILE_SIMD_RISCV_RVV) + return SIMDLevel::RISCV_RVV; #else return SIMDLevel::NONE; #endif @@ -289,6 +297,8 @@ std::string to_string(SIMDLevel level) { return "ARM_NEON"; case SIMDLevel::ARM_SVE: return "ARM_SVE"; + case SIMDLevel::RISCV_RVV: + return "RISCV_RVV"; case SIMDLevel::COUNT: default: throw FaissException("Invalid SIMDLevel"); @@ -314,6 +324,9 @@ SIMDLevel to_simd_level(const std::string& level_str) { if (level_str == "ARM_SVE") { return SIMDLevel::ARM_SVE; } + if (level_str == "RISCV_RVV") { + return SIMDLevel::RISCV_RVV; + } throw FaissException("Invalid SIMD level string: " + level_str); } diff --git a/thirdparty/faiss/faiss/utils/simd_levels.h b/thirdparty/faiss/faiss/utils/simd_levels.h index 61d84b55f..9aa367f6d 100644 --- a/thirdparty/faiss/faiss/utils/simd_levels.h +++ b/thirdparty/faiss/faiss/utils/simd_levels.h @@ -25,6 +25,8 @@ enum class SIMDLevel { // arm & aarch64 ARM_NEON, ARM_SVE, // Scalable Vector Extension (ARMv8.2+) + // riscv + RISCV_RVV, // RISC-V Vector Extension (rv64gcv) COUNT }; @@ -58,6 +60,8 @@ inline constexpr SIMDLevel SINGLE_SIMD_LEVEL = SIMDLevel::AVX2; inline constexpr SIMDLevel SINGLE_SIMD_LEVEL = SIMDLevel::ARM_SVE; #elif defined(COMPILE_SIMD_ARM_NEON) inline constexpr SIMDLevel SINGLE_SIMD_LEVEL = SIMDLevel::ARM_NEON; +#elif defined(COMPILE_SIMD_RISCV_RVV) +inline constexpr SIMDLevel SINGLE_SIMD_LEVEL = SIMDLevel::RISCV_RVV; #else inline constexpr SIMDLevel SINGLE_SIMD_LEVEL = SIMDLevel::NONE; #endif @@ -113,6 +117,9 @@ constexpr int simd_width() { static_assert( SL != SIMDLevel::ARM_SVE, "simd_width is not supported: SVE is variable-width"); + static_assert( + SL != SIMDLevel::RISCV_RVV, + "simd_width 
is not supported: RVV is variable-width"); if constexpr (SL == SIMDLevel::AVX512 || SL == SIMDLevel::AVX512_SPR) return 16; else if constexpr (SL == SIMDLevel::AVX2 || SL == SIMDLevel::ARM_NEON) diff --git a/thirdparty/faiss/faiss/utils/sorting.cpp b/thirdparty/faiss/faiss/utils/sorting.cpp index 5b7767700..29ee75024 100644 --- a/thirdparty/faiss/faiss/utils/sorting.cpp +++ b/thirdparty/faiss/faiss/utils/sorting.cpp @@ -134,9 +134,9 @@ void fvec_argsort(size_t n, const float* vals, size_t* perm) { } void fvec_argsort_parallel(size_t n, const float* vals, size_t* perm) { - size_t* perm2 = new size_t[n]; + std::vector perm2(n); // 2 result tables, during merging, flip between them - size_t *permB = perm2, *permA = perm; + size_t *permB = perm2.data(), *permA = perm; int nt = omp_get_max_threads(); { // prepare correct permutation so that the result ends in perm @@ -148,8 +148,8 @@ void fvec_argsort_parallel(size_t n, const float* vals, size_t* perm) { } } -#pragma omp parallel - for (size_t i = 0; i < n; i++) { +#pragma omp parallel for + for (int64_t i = 0; i < static_cast(n); i++) { permA[i] = i; } @@ -184,7 +184,6 @@ void fvec_argsort_parallel(size_t n, const float* vals, size_t* perm) { } else { int t0 = s * sub_nt / sub_nseg1; int t1 = (s + 1) * sub_nt / sub_nseg1; - printf("merge %d %d, %d threads\n", s, s + 1, t1 - t0); parallel_merge( permA, permB, segs[s], segs[s + 1], t1 - t0, comp); } @@ -197,7 +196,6 @@ void fvec_argsort_parallel(size_t n, const float* vals, size_t* perm) { } assert(permA == perm); omp_set_nested(prev_nested); - delete[] perm2; } /***************************************************************************** @@ -816,6 +814,10 @@ void hashtable_int64_to_int64_lookup( size_t k0 = bucket << (log2_capacity - log2_nbucket); size_t k1 = (bucket + 1) << (log2_capacity - log2_nbucket); for (;;) { + if (tab[slot * 2] == -1) { // empty slot, key not in table + vals[i] = -1; + break; + } if (tab[slot * 2] == k) { // found! 
vals[i] = tab[2 * slot + 1]; break; diff --git a/thirdparty/faiss/tests/CMakeLists.txt b/thirdparty/faiss/tests/CMakeLists.txt index 27628e4cb..9d740fdc4 100644 --- a/thirdparty/faiss/tests/CMakeLists.txt +++ b/thirdparty/faiss/tests/CMakeLists.txt @@ -96,7 +96,7 @@ include(../cmake/link_to_faiss_lib.cmake) link_to_faiss_lib(faiss_test) if (FAISS_ENABLE_PYTHON) - target_link_libraries(faiss_test PUBLIC faiss_example_external_module) + add_dependencies(faiss_test faiss_example_external_module) endif() include(FetchContent) diff --git a/thirdparty/faiss/tests/test_NSG_compressed_graph.cpp b/thirdparty/faiss/tests/test_NSG_compressed_graph.cpp index 4afaa582a..bc867e8f3 100644 --- a/thirdparty/faiss/tests/test_NSG_compressed_graph.cpp +++ b/thirdparty/faiss/tests/test_NSG_compressed_graph.cpp @@ -5,7 +5,9 @@ * LICENSE file in the root directory of this source tree. */ +#include #include +#include #include #include #include @@ -83,3 +85,44 @@ TEST(NSGCompressed, test_compressed) { EXPECT_EQ(Iref, I); EXPECT_EQ(Dref, D); } + +// Regression test for sync_prune out-of-bounds bug. +// +// With ntotal=1 and L=1, search_on_graph produces pool = [{id:0}]. +// sync_prune(q=0): pool[0].id == q → start++ → start == pool.size(). +// Old code: pool[start] is out-of-bounds → undefined behavior. +// Fix: guard returns early and fills graph row with EMPTY_ID. +// +// Calls NSG::build() directly (bypassing IndexNSG::check_knn_graph) +// to reach the edge case with ntotal=1. +TEST(NSGBugs, SyncPruneSingleNode) { + constexpr int d = 4; + constexpr int R = 1; + + faiss::IndexFlat storage(d); + float vec[] = {1.0f, 2.0f, 3.0f, 4.0f}; + storage.add(1, vec); + + faiss::idx_t knn_data[] = {-1}; + faiss::nsg::Graph knn_graph(knn_data, 1, 1); + + faiss::NSG nsg_obj(R); + nsg_obj.L = 1; + + // Old code crashes here. Fixed code handles it. 
+ ASSERT_NO_THROW(nsg_obj.build(&storage, 1, knn_graph, false)); + EXPECT_TRUE(nsg_obj.is_built); + EXPECT_EQ(nsg_obj.enterpoint, 0); + + // Search returns the only node + nsg_obj.search_L = 1; + faiss::VisitedTable vt(1); + auto dis = std::unique_ptr( + faiss::nsg::storage_distance_computer(&storage)); + dis->set_query(vec); + + faiss::idx_t label = -1; + float distance = -1; + nsg_obj.search(*dis, 1, &label, &distance, vt); + EXPECT_EQ(label, 0); +} diff --git a/thirdparty/faiss/tests/test_binary_hash.cpp b/thirdparty/faiss/tests/test_binary_hash.cpp index 40b3afa3c..3835ffb3b 100644 --- a/thirdparty/faiss/tests/test_binary_hash.cpp +++ b/thirdparty/faiss/tests/test_binary_hash.cpp @@ -65,3 +65,43 @@ TEST(BinaryHash, MultiHashSmallCodeSizeRoundTrip) { EXPECT_EQ(distances[0], 0); EXPECT_EQ(labels[0], 0); } + +TEST(BinaryHash, MultiHashResetClearsMaps) { + int d = 16; + int nhash = 2; + int b = 4; + faiss::IndexBinaryMultiHash idx(d, nhash, b); + idx.nflip = 0; + + // Add a vector + int n = 1; + std::vector data(n * idx.code_size); + data[0] = 0xAA; + data[1] = 0x55; + idx.add(n, data.data()); + EXPECT_EQ(idx.ntotal, 1); + EXPECT_GT(idx.hashtable_size(), 0u); + + // Reset should clear everything + idx.reset(); + EXPECT_EQ(idx.ntotal, 0); + EXPECT_EQ(idx.hashtable_size(), 0u); + + // Searching for the old vector after reset should not find it + int k = 1; + std::vector distances(k); + std::vector labels(k); + idx.search(1, data.data(), k, distances.data(), labels.data()); + EXPECT_EQ(labels[0], -1); + + // After reset, add a new vector and verify the index is functional + std::vector data2(n * idx.code_size); + data2[0] = 0x55; + data2[1] = 0xAA; + idx.add(n, data2.data()); + EXPECT_EQ(idx.ntotal, 1); + + idx.search(1, data2.data(), k, distances.data(), labels.data()); + EXPECT_EQ(distances[0], 0); + EXPECT_EQ(labels[0], 0); +} diff --git a/thirdparty/faiss/tests/test_build_blocks.py b/thirdparty/faiss/tests/test_build_blocks.py index aa3d4e765..2a18bde3a 100644 --- 
a/thirdparty/faiss/tests/test_build_blocks.py +++ b/thirdparty/faiss/tests/test_build_blocks.py @@ -10,7 +10,7 @@ import faiss import unittest -from common_faiss_tests import get_dataset_2 +from common_faiss_tests import get_dataset_2, for_all_simd_levels class TestPCA(unittest.TestCase): @@ -297,6 +297,7 @@ def test_hash(self): self.assertTrue(cc[0] == cc[1]) +@for_all_simd_levels class TestScalarQuantizer(unittest.TestCase): def test_8bit_equiv(self): @@ -419,6 +420,7 @@ def test_rand_vector(self): self.assertLess(ninter, 460) +@for_all_simd_levels class TestPairwiseDis(unittest.TestCase): def test_L2(self): diff --git a/thirdparty/faiss/tests/test_contrib.py b/thirdparty/faiss/tests/test_contrib.py index dbeacda64..878e631c1 100644 --- a/thirdparty/faiss/tests/test_contrib.py +++ b/thirdparty/faiss/tests/test_contrib.py @@ -586,6 +586,43 @@ def test_ivf_train_2level(self): ndiff = (Iref != Inew).sum() self.assertLess(ndiff.item(), 57) + def test_balanced_clustering(self): + """Test balanced_assignment_with_penalties from notebook N10159950""" + ds = datasets.SyntheticDataset(32, 10000, 20000, 0) + nc = 100 + + # train centroids + km = faiss.Kmeans(ds.d, nc) + km.train(ds.get_train()) + centroids = km.centroids + + # create biased database (shifted by a constant vector) + biased_xb = ds.get_database().copy() + rs = np.random.RandomState(123) + biased_xb += rs.randn(ds.d).astype("float32") * 0.3 + + # unconstrained assignment on biased data + d2, assign_unc = faiss.knn(biased_xb, centroids, 1) + assign_unc = assign_unc.ravel() + imf_unc = clustering.imbalance_factor(nc, assign_unc) + mse_unc = float(d2.mean()) + + # balanced assignment + assign_bal, stats = clustering.balanced_assignment_with_penalties( + biased_xb, centroids, alpha=0.03, num_iter=20 + ) + + # balanced assignment should reduce imbalance factor + self.assertLess(stats["imf"], imf_unc / 1.5) + + # MSE may increase but should not be too much worse (< 1.5x) + self.assertLess(stats["mse"], mse_unc * 1.5)
+ + # all points should be assigned + self.assertEqual(len(assign_bal), len(biased_xb)) + self.assertTrue(np.all(assign_bal >= 0)) + self.assertTrue(np.all(assign_bal < nc)) + class TestBigBatchSearch(unittest.TestCase): diff --git a/thirdparty/faiss/tests/test_extra_distances.py b/thirdparty/faiss/tests/test_extra_distances.py index 38628a7d8..25d4109b3 100644 --- a/thirdparty/faiss/tests/test_extra_distances.py +++ b/thirdparty/faiss/tests/test_extra_distances.py @@ -245,6 +245,7 @@ def test_gower(self): self.assertTrue(np.all(np.isnan(dis_out_of_range))) +@for_all_simd_levels class TestKNN(unittest.TestCase): """ test that the knn search gives the same as distance matrix + argmin """ @@ -280,6 +281,7 @@ def test_Linf(self): self.do_test_knn(faiss.METRIC_Linf) +@for_all_simd_levels class TestHNSW(unittest.TestCase): """ since it has a distance computer, HNSW should work """ @@ -304,6 +306,7 @@ def test_hnsw(self): assert np.all(D[q] == dis[q, I[q]]) +@for_all_simd_levels class TestIVF(unittest.TestCase): """ since it has a distance computer, IVF should work """ diff --git a/thirdparty/faiss/tests/test_fast_scan.py b/thirdparty/faiss/tests/test_fast_scan.py index e5d1a1a37..011ea087a 100644 --- a/thirdparty/faiss/tests/test_fast_scan.py +++ b/thirdparty/faiss/tests/test_fast_scan.py @@ -12,10 +12,13 @@ from faiss.contrib import datasets +from common_faiss_tests import for_all_simd_levels + # the tests tend to timeout in stress modes + dev otherwise faiss.omp_set_num_threads(4) +@for_all_simd_levels class TestSearch(unittest.TestCase): def test_PQ4_accuracy(self): @@ -78,6 +81,7 @@ def test_PQ4_speed(self): self.assertLess(pqfs_t * 4, pq_t) +@for_all_simd_levels class TestRounding(unittest.TestCase): def do_test_rounding(self, implem=4, metric=faiss.METRIC_L2): @@ -133,6 +137,7 @@ def test_implem_14_ip(self): self.do_test_rounding(12, faiss.METRIC_INNER_PRODUCT) +@for_all_simd_levels class TestReconstruct(unittest.TestCase): def test_pqfastscan(self): @@ -312,6 
+317,7 @@ def build_fast_scan_index(self, index, params): return index2 +@for_all_simd_levels class TestImplem12(TestImplems): def build_fast_scan_index(self, index, qbs): @@ -340,6 +346,7 @@ def test_qbs6_odd_dim(self): self.do_with_params(30, 0x33) +@for_all_simd_levels class TestImplem13(TestImplems): def build_fast_scan_index(self, index, qbs): @@ -356,6 +363,7 @@ def test_qbs7_k1(self): self.do_with_params(32, 0x223) +@for_all_simd_levels class TestImplem14(TestImplems): def build_fast_scan_index(self, index, params): @@ -392,6 +400,7 @@ def test_1_64_odd_dim(self): self.do_with_params(30, (1, 64)) +@for_all_simd_levels class TestImplem15(TestImplems): def build_fast_scan_index(self, index, params): @@ -408,6 +417,7 @@ def test_2_64(self): self.do_with_params(32, (2, 64)) +@for_all_simd_levels class TestAdd(unittest.TestCase): def do_test_add(self, d, bbs): @@ -627,7 +637,11 @@ def add_TestAQFastScan_subtest_from_idxaq(implem, metric): add_TestAQFastScan_subtest_from_idxaq(implem, 'L2') add_TestAQFastScan_subtest_from_idxaq(implem, 'IP') +# Apply decorator after dynamic method generation. 
+TestAQFastScan = for_all_simd_levels(TestAQFastScan) + +@for_all_simd_levels class TestPAQFastScan(unittest.TestCase): def subtest_accuracy(self, paq): @@ -688,6 +702,7 @@ def test_io(self): self.subtest_io('PRQ2x3x4fs_Nrq2x4') +@for_all_simd_levels class TestBlockDecode(unittest.TestCase): def test_issue_2739(self): diff --git a/thirdparty/faiss/tests/test_fast_scan_ivf.py b/thirdparty/faiss/tests/test_fast_scan_ivf.py index 67699a127..7e2d74737 100644 --- a/thirdparty/faiss/tests/test_fast_scan_ivf.py +++ b/thirdparty/faiss/tests/test_fast_scan_ivf.py @@ -12,10 +12,13 @@ from faiss.contrib import datasets from faiss.contrib.inspect_tools import get_invlist +from common_faiss_tests import for_all_simd_levels + # the tests tend to timeout in stress modes + dev otherwise faiss.omp_set_num_threads(4) +@for_all_simd_levels class TestLUTQuantization(unittest.TestCase): def compute_dis_float(self, codes, LUT, bias): @@ -166,6 +169,7 @@ def three_metrics(Dref, Iref, Dnew, Inew): ########################################################## +@for_all_simd_levels class TestIVFImplem1(unittest.TestCase): """ Verify implem 1 (search from original invlists) against IndexIVFPQ """ @@ -206,6 +210,7 @@ def test_by_residual_ip(self): self.do_test(True, faiss.METRIC_INNER_PRODUCT) +@for_all_simd_levels class TestIVFImplem2(unittest.TestCase): """ Verify implem 2 (search with original invlists with uint8 LUTs) against IndexIVFPQ. Entails some loss in accuracy. 
""" @@ -254,6 +259,7 @@ def test_qloss_by_residual_ip(self): self.eval_quant_loss(True, faiss.METRIC_INNER_PRODUCT) +@for_all_simd_levels class TestEquivPQ(unittest.TestCase): def test_equiv_pq(self): @@ -411,26 +417,32 @@ def test_by_residual_odd_dim_single_query(self): self.do_test(True, d=30, nq=1) +@for_all_simd_levels class TestIVFImplem10(TestIVFImplem12): IMPLEM = 10 +@for_all_simd_levels class TestIVFImplem11(TestIVFImplem12): IMPLEM = 11 +@for_all_simd_levels class TestIVFImplem13(TestIVFImplem12): IMPLEM = 13 +@for_all_simd_levels class TestIVFImplem14(TestIVFImplem12): IMPLEM = 14 +@for_all_simd_levels class TestIVFImplem15(TestIVFImplem12): IMPLEM = 15 +@for_all_simd_levels class TestAdd(unittest.TestCase): def do_test(self, by_residual=False, metric=faiss.METRIC_L2, d=32, bbs=32): @@ -482,6 +494,7 @@ def test_bbs64(self): self.do_test(bbs=64) +@for_all_simd_levels class TestTraining(unittest.TestCase): def do_test(self, by_residual=False, metric=faiss.METRIC_L2, d=32, bbs=32): @@ -545,6 +558,7 @@ def test_by_residual_odd_dim(self): self.do_test(by_residual=True, d=30) +@for_all_simd_levels class TestReconstruct(unittest.TestCase): """ test reconstruct and sa_encode / sa_decode (also for a few additive quantizer variants) """ @@ -640,6 +654,7 @@ def test_prq(self): self.do_test_generic("PRQ8x2x4fs", metric=faiss.METRIC_INNER_PRODUCT) +@for_all_simd_levels class TestIsTrained(unittest.TestCase): def test_issue_2019(self): @@ -861,7 +876,11 @@ def add_TestIVFAQFastScan_subtest_rescale_accuracy(aq, st, by_residual, implem): add_TestIVFAQFastScan_subtest_rescale_accuracy('LSQ', 'lsq', byr, implem) add_TestIVFAQFastScan_subtest_rescale_accuracy('RQ', 'rq', byr, implem) +# Apply decorator after dynamic method generation. 
+TestIVFAQFastScan = for_all_simd_levels(TestIVFAQFastScan) + +@for_all_simd_levels class TestIVFPAQFastScan(unittest.TestCase): def subtest_accuracy(self, paq): @@ -929,6 +948,7 @@ def test_io(self): self.subtest_io('IVF16,PRQ2x3x4fs_Nrq2x4') +@for_all_simd_levels class TestSearchParams(unittest.TestCase): def test_search_params(self): @@ -991,9 +1011,11 @@ def test_IP(self): self.do_test(metric=faiss.METRIC_INNER_PRODUCT) +@for_all_simd_levels class TestRangeSearchImplem10(TestRangeSearchImplem12): IMPLEM = 10 +@for_all_simd_levels class TestRangeSearchImplem110(TestRangeSearchImplem12): IMPLEM = 110 diff --git a/thirdparty/faiss/tests/test_graph_based.py b/thirdparty/faiss/tests/test_graph_based.py index 80a5daa98..15a61f76a 100644 --- a/thirdparty/faiss/tests/test_graph_based.py +++ b/thirdparty/faiss/tests/test_graph_based.py @@ -9,9 +9,10 @@ import unittest import faiss -from common_faiss_tests import get_dataset_2 +from common_faiss_tests import get_dataset_2, for_all_simd_levels +@for_all_simd_levels class TestHNSW(unittest.TestCase): def __init__(self, *args, **kwargs): @@ -222,6 +223,32 @@ def test_hnsw_reset(self): self.assertEqual(index_hnsw.ntotal, 0) +class TestHNSWNaN(unittest.TestCase): + """Adding a vector with NaN to an IVF+HNSW index used to crash because + NaN distances corrupt the MinimaxHeap ordering in HNSW search. The fix + converts NaN to +inf in MinimaxHeap::push so the heap stays well-ordered. 
+ """ + + def test_add_nan_vector_to_ivf_hnsw(self): + d = 64 + nt = 2000 + nb = 1000 + xt = np.random.default_rng(42).random((nt, d), dtype='float32') + xb = np.random.default_rng(43).random((nb, d), dtype='float32') + + index = faiss.index_factory(d, "IVF256_HNSW32,SQ8") + index.train(xt) + index.add(xb) + + # Create a vector with NaN in the first component + vec = np.zeros((1, d), dtype='float32') + vec[0, 0] = np.nan + + # This should not crash + index.add(vec) + self.assertEqual(index.ntotal, nb + 1) + + class Issue3684(unittest.TestCase): def test_issue3684(self): @@ -581,6 +608,29 @@ def test_order(self): np.testing.assert_array_equal(indices, gt) +class TestNNDescentGenRandom(unittest.TestCase): + """Regression tests for gen_random edge cases in NNDescent.""" + + def test_search_L_equals_ntotal(self): + """gen_random(size, N) crashed with division by zero when size == N. + + In search(), L_2 = max(search_L, topk). When search_L >= ntotal, + gen_random is called with size == N, causing rng() % 0. 
+ """ + d = 32 + nb = 200 # just above NUM_EVAL_POINTS=100 + xb = np.random.default_rng(42).random((nb, d)).astype('float32') + xq = np.random.default_rng(43).random((10, d)).astype('float32') + + index = faiss.IndexNNDescentFlat(d, 32) + index.nndescent.search_L = nb # triggers gen_random(size=nb, N=nb) + index.train(xb) + index.add(xb) + + # This crashed with division by zero before the fix + D, I = index.search(xq, k=1) + + class TestNNDescentKNNG(unittest.TestCase): def test_knng_L2(self): diff --git a/thirdparty/faiss/tests/test_hnsw.cpp b/thirdparty/faiss/tests/test_hnsw.cpp index 9e2b46fbc..44cb77478 100644 --- a/thirdparty/faiss/tests/test_hnsw.cpp +++ b/thirdparty/faiss/tests/test_hnsw.cpp @@ -7,13 +7,16 @@ #include +#include #include #include #include #include #include +#include #include +#include #include #include #include @@ -169,6 +172,52 @@ void test_popmin_identical_distances( ASSERT_EQ(mm_heap.dis, cloned_mm_heap.dis); } +void copy_base_level_only( + const faiss::IndexHNSWCagra& src, + faiss::IndexHNSWCagra& dst) { + auto n = src.ntotal; + auto d = src.d; + auto M = src.hnsw.nb_neighbors(0) / 2; + auto graph_degree = src.hnsw.nb_neighbors(0); + + if (dst.storage && dst.own_fields) { + delete dst.storage; + } + dst.storage = new faiss::IndexFlatL2(d); + dst.own_fields = true; + dst.d = d; + dst.metric_type = src.metric_type; + dst.is_trained = true; + dst.keep_max_size_level0 = true; + + dst.hnsw.reset(); + dst.hnsw.assign_probas.clear(); + dst.hnsw.cum_nneighbor_per_level.clear(); + dst.hnsw.set_default_probas(M, 1.0 / std::log(M)); + + dst.hnsw.prepare_level_tab(n, false); + + auto src_flat = dynamic_cast(src.storage); + FAISS_THROW_IF_NOT(src_flat); + dst.storage->add(n, src_flat->get_xb()); + dst.ntotal = n; + + for (faiss::idx_t i = 0; i < n; i++) { + size_t src_begin, src_end; + src.hnsw.neighbor_range(i, 0, &src_begin, &src_end); + + size_t dst_begin, dst_end; + dst.hnsw.neighbor_range(i, 0, &dst_begin, &dst_end); + + for (size_t j = 0; j < 
graph_degree && j < (dst_end - dst_begin); j++) { + dst.hnsw.neighbors[dst_begin + j] = + src.hnsw.neighbors[src_begin + j]; + } + } + + dst.base_level_only = true; +} + TEST(HNSW, Test_popmin) { std::vector sizes = {1, 2, 3, 4, 5, 7, 9, 11, 16, 27, 32, 64, 128}; for (const size_t size : sizes) { @@ -218,6 +267,36 @@ TEST(HNSW, Test_IndexHNSW_METRIC_Lp) { EXPECT_EQ(label, 0); // Label should be 0 } +TEST(HNSW, Test_IndexHNSWCagra_BaseLevelOnly_RangeSearch) { + int d = 8; + int nb = 100; + int nq = 5; + int M = 4; + + std::vector xb(nb * d); + std::vector xq(nq * d); + faiss::float_rand(xb.data(), xb.size(), 1234); + faiss::float_rand(xq.data(), xq.size(), 4321); + + faiss::IndexHNSWCagra index(d, M, faiss::METRIC_L2); + index.add(nb, xb.data()); + index.base_level_only = true; + index.num_base_level_search_entrypoints = 8; + + faiss::IndexHNSWCagra dst_index; + copy_base_level_only(index, dst_index); + dst_index.num_base_level_search_entrypoints = 8; + + faiss::RangeSearchResult res(nq); + float radius = 1e9f; + dst_index.range_search(nq, xq.data(), radius, &res); + + for (int i = 0; i < nq; i++) { + auto count = res.lims[i + 1] - res.lims[i]; + EXPECT_GT(count, 0); + } +} + class HNSWTest : public testing::Test { protected: HNSWTest() { diff --git a/thirdparty/faiss/tests/test_index_binary.py b/thirdparty/faiss/tests/test_index_binary.py index e3f13a18e..5a846c91f 100644 --- a/thirdparty/faiss/tests/test_index_binary.py +++ b/thirdparty/faiss/tests/test_index_binary.py @@ -10,7 +10,9 @@ import unittest import faiss -from common_faiss_tests import compare_binary_result_lists, make_binary_dataset +from common_faiss_tests import ( + compare_binary_result_lists, for_all_simd_levels, make_binary_dataset +) @@ -25,6 +27,7 @@ def binary_dis(x, y): return sum(faiss.popcount64(int(xi ^ yi)) for xi, yi in zip(x, y)) +@for_all_simd_levels class TestBinaryPQ(unittest.TestCase): """ Use a PQ that mimicks a binary encoder """ @@ -81,6 +84,7 @@ def test_encode_to_binary(self): 
assert 4 * ref_dis == dj +@for_all_simd_levels class TestBinaryFlat(unittest.TestCase): def __init__(self, *args, **kwargs): @@ -155,6 +159,7 @@ def test_reconstruct(self): assert np.all(input_vector[:4] == reconstructed_vector) +@for_all_simd_levels class TestBinaryIVF(unittest.TestCase): def __init__(self, *args, **kwargs): @@ -313,6 +318,7 @@ def test_search_per_invlist(self): compare_binary_result_lists(Dref, Iref, D2, I2) +@for_all_simd_levels class TestHNSW(unittest.TestCase): def __init__(self, *args, **kwargs): @@ -360,6 +366,7 @@ def test_hnsw(self): self.assertTrue((Dref == Dbin).all()) +@for_all_simd_levels class TestReplicasAndShards(unittest.TestCase): @unittest.skipIf(os.name == "posix" and os.uname().sysname == "Darwin", diff --git a/thirdparty/faiss/tests/test_io.py b/thirdparty/faiss/tests/test_io.py index 0733453aa..8d000a3ca 100644 --- a/thirdparty/faiss/tests/test_io.py +++ b/thirdparty/faiss/tests/test_io.py @@ -545,7 +545,7 @@ def test_reader(self): class TestIOFlatMMap(unittest.TestCase): @unittest.skipIf( - platform.system() not in ["Windows", "Linux"], + platform.system() not in ["Windows", "Linux", "Darwin"], "supported OSes only" ) def test_mmap(self): diff --git a/thirdparty/faiss/tests/test_ivf_index.cpp b/thirdparty/faiss/tests/test_ivf_index.cpp index 7a9804bb0..38c58d844 100644 --- a/thirdparty/faiss/tests/test_ivf_index.cpp +++ b/thirdparty/faiss/tests/test_ivf_index.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -16,7 +17,9 @@ #include #include +#include #include +#include namespace { @@ -292,3 +295,197 @@ TEST(IVF, search_preassigned_out_of_range_key) { false), faiss::FaissException); } + +// Test: range_search_preassigned with out-of-range keys throws a catchable +// FaissException instead of calling std::terminate from an uncaught +// exception inside the OpenMP parallel region. 
+TEST(IVF, range_search_preassigned_out_of_range_key) { + int d = 4; + int nlist = 2; + faiss::IndexFlatL2 quantizer(d); + faiss::IndexIVFFlat idx(&quantizer, d, nlist); + idx.own_fields = false; + + std::vector train_data(nlist * d, 0.0f); + for (int i = 0; i < nlist * d; i++) { + train_data[i] = static_cast(i); + } + idx.train(nlist, train_data.data()); + idx.add(nlist, train_data.data()); + + std::vector xq(d, 1.0f); + faiss::RangeSearchResult result(1); + + faiss::idx_t bad_key = nlist; // out of range + float coarse_dis = 0.0f; + + EXPECT_THROW( + idx.range_search_preassigned( + 1, + xq.data(), + std::numeric_limits::max(), + &bad_key, + &coarse_dis, + &result, + false), + faiss::FaissException); +} + +// Minimal ResultHandler that just collects results presented to it. +struct CollectResultHandler : faiss::ResultHandler { + bool add_result(float, faiss::idx_t) override { + return false; + } +}; + +// Test: search1 with a quantizer that returns out-of-range keys throws +// FaissException. +TEST(IVF, search1_out_of_range_key) { + int d = 4; + int nlist = 2; + faiss::IndexFlatL2 quantizer(d); + faiss::IndexIVFFlat idx(&quantizer, d, nlist); + idx.own_fields = false; + + // Train and add vectors so the index is usable. + std::vector train_data(nlist * d, 0.0f); + for (int i = 0; i < nlist * d; i++) { + train_data[i] = static_cast(i); + } + idx.train(nlist, train_data.data()); + idx.add(nlist, train_data.data()); + + // Corrupt the quantizer by adding an extra centroid far away, so it + // can return key == nlist (out of range) for a query near that point. + std::vector extra_centroid(d, 1e6f); + quantizer.add(1, extra_centroid.data()); + // Now quantizer has nlist+1 centroids, but idx.nlist is still nlist. + + // Query near the extra centroid so quantizer returns the bad key. 
+ std::vector xq(d, 1e6f); + CollectResultHandler handler; + handler.threshold = std::numeric_limits::max(); + + EXPECT_THROW(idx.search1(xq.data(), handler), faiss::FaissException); +} + +// Iterator that enables search callbacks and tracks invocations. +class CallbackTrackingIterator : public TestInvertedListIterator { + public: + CallbackTrackingIterator( + size_t list_no, + TestContext* context, + size_t& distance_count, + size_t& heap_count) + : TestInvertedListIterator(list_no, context), + distance_count_{distance_count}, + heap_count_{heap_count} { + has_search_callbacks_ = true; + } + + void on_distance_computed(faiss::idx_t id, float distance) override { + EXPECT_GE(id, 0) << "vector ID should be non-negative"; + EXPECT_GE(distance, 0.0f) << "L2 distance should be non-negative"; + distance_count_++; + } + + void on_heap_changed(faiss::idx_t new_id, faiss::idx_t evicted_id) + override { + EXPECT_GE(new_id, 0) << "new heap entry ID should be non-negative"; + (void)evicted_id; // may be -1 when heap not yet full + heap_count_++; + } + + private: + size_t& distance_count_; + size_t& heap_count_; +}; + +// InvertedLists that uses CallbackTrackingIterator. +class CallbackTrackingInvertedLists : public TestInvertedLists { + public: + CallbackTrackingInvertedLists( + size_t nlist_in, + size_t code_size_in, + size_t& distance_count, + size_t& heap_count) + : TestInvertedLists(nlist_in, code_size_in), + distance_count_{distance_count}, + heap_count_{heap_count} {} + + faiss::InvertedListsIterator* get_iterator(size_t list_no, void* context) + const override { + auto testContext = (TestContext*)context; + testContext->lists_probed.insert(list_no); + return new CallbackTrackingIterator( + list_no, testContext, distance_count_, heap_count_); + } + + private: + size_t& distance_count_; + size_t& heap_count_; +}; + +// Test: on_distance_computed and on_heap_changed fire during search +// when has_search_callbacks_ is true. 
+TEST(IVF, search_callbacks) { + constexpr int d = 8; + constexpr int nb = 200; + constexpr int nlist = 4; + + std::mt19937 rng(42); + std::uniform_real_distribution<> distrib; + + omp_set_num_threads(1); + + faiss::IndexFlatL2 quantizer(d); + faiss::IndexIVFFlat index(&quantizer, d, nlist); + + size_t distance_count = 0; + size_t heap_count = 0; + CallbackTrackingInvertedLists invlists( + nlist, index.code_size, distance_count, heap_count); + index.replace_invlists(&invlists); + + // Train + constexpr size_t nt = 100; + std::vector trainvecs(nt * d); + for (size_t i = 0; i < nt * d; i++) { + trainvecs[i] = distrib(rng); + } + index.train(nt, trainvecs.data()); + + // Populate via context + TestContext context; + std::vector database(nb * d); + for (size_t i = 0; i < nb * d; i++) { + database[i] = distrib(rng); + } + std::vector coarse_idx(nb); + index.quantizer->assign(nb, database.data(), coarse_idx.data()); + std::vector xids(nb, 42); + index.add_core( + nb, database.data(), xids.data(), coarse_idx.data(), &context); + + // Search + constexpr faiss::idx_t k = 5; + constexpr size_t nprobe = 2; + std::vector query(d); + for (int i = 0; i < d; i++) { + query[i] = distrib(rng); + } + std::vector distances(k); + std::vector labels(k); + faiss::SearchParametersIVF params; + params.inverted_list_context = &context; + params.nprobe = nprobe; + + index.search(1, query.data(), k, distances.data(), labels.data(), ¶ms); + + EXPECT_GT(distance_count, 0) + << "on_distance_computed should fire for scored vectors"; + EXPECT_GT(heap_count, 0) + << "on_heap_changed should fire when vectors enter the heap"; + EXPECT_GE(distance_count, heap_count) + << "not every distance computation leads to a heap change"; +} diff --git a/thirdparty/faiss/tests/test_mmap.cpp b/thirdparty/faiss/tests/test_mmap.cpp index ce8ab4657..f1b3f3f18 100644 --- a/thirdparty/faiss/tests/test_mmap.cpp +++ b/thirdparty/faiss/tests/test_mmap.cpp @@ -22,6 +22,13 @@ namespace { +#if defined(_WIN32) || 
defined(__linux__) || defined(__FreeBSD__) || \ + defined(__APPLE__) +constexpr bool kMmapIFCSupported = true; +#else +constexpr bool kMmapIFCSupported = false; +#endif + std::vector make_data(const size_t n, const size_t d, size_t seed) { std::vector database(n * d); std::mt19937 rng(seed); @@ -60,9 +67,9 @@ std::vector make_binary_data( // on top of the existing File1 again TEST(TestMmap, mmap_flatcodes) { -#ifdef _AIX - GTEST_SKIP() << "Skipping test on AIX."; -#endif + if (!kMmapIFCSupported) { + GTEST_SKIP() << "Skipping test on unsupported platform."; + } // generate data const size_t nt = 1000; const size_t nq = 10; @@ -164,9 +171,9 @@ TEST(TestMmap, mmap_flatcodes) { } TEST(TestMmap, mmap_binary_flatcodes) { -#ifdef _AIX - GTEST_SKIP() << "Skipping test on AIX."; -#endif + if (!kMmapIFCSupported) { + GTEST_SKIP() << "Skipping test on unsupported platform."; + } // generate data const size_t nt = 1000; const size_t nq = 10; diff --git a/thirdparty/faiss/tests/test_nndescent.cpp b/thirdparty/faiss/tests/test_nndescent.cpp new file mode 100644 index 000000000..4c78978ac --- /dev/null +++ b/thirdparty/faiss/tests/test_nndescent.cpp @@ -0,0 +1,87 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +using namespace faiss::nndescent; + +/// Helper: build a Nhood with known data in all fields. 
+static Nhood make_populated_nhood() { + std::mt19937 rng(42); + int N = 200; + Nhood nh(/*l=*/50, /*s=*/10, rng, N); + + nh.pool.clear(); + nh.pool.push_back(Neighbor(1, 0.1f, true)); + nh.pool.push_back(Neighbor(2, 0.2f, false)); + nh.pool.push_back(Neighbor(3, 0.3f, true)); + + nh.nn_old = {10, 20, 30}; + nh.rnn_new = {40, 50}; + nh.rnn_old = {60, 70, 80}; + + return nh; +} + +TEST(NhoodCopy, CopyConstructorPreservesAllFields) { + Nhood original = make_populated_nhood(); + Nhood copy(original); + + EXPECT_EQ(copy.M, original.M); + EXPECT_EQ(copy.pool.size(), original.pool.size()); + EXPECT_EQ(copy.nn_new, original.nn_new); + EXPECT_EQ(copy.nn_old, original.nn_old); + EXPECT_EQ(copy.rnn_new, original.rnn_new); + EXPECT_EQ(copy.rnn_old, original.rnn_old); +} + +TEST(NhoodCopy, CopyAssignmentPreservesAllFields) { + Nhood original = make_populated_nhood(); + Nhood assigned; + assigned = original; + + EXPECT_EQ(assigned.M, original.M); + EXPECT_EQ(assigned.pool.size(), original.pool.size()); + EXPECT_EQ(assigned.nn_new, original.nn_new); + EXPECT_EQ(assigned.nn_old, original.nn_old); + EXPECT_EQ(assigned.rnn_new, original.rnn_new); + EXPECT_EQ(assigned.rnn_old, original.rnn_old); +} + +TEST(NhoodCopy, CopyAssignmentSelfAssign) { + Nhood nh = make_populated_nhood(); + auto expected_pool_size = nh.pool.size(); + auto expected_nn_new = nh.nn_new; + + // Use a reference to avoid -Wself-assign-overloaded. + Nhood& ref = nh; + nh = ref; + + EXPECT_EQ(nh.pool.size(), expected_pool_size); + EXPECT_EQ(nh.nn_new, expected_nn_new); +} + +/// Simulates std::vector reallocation during push_back. 
+TEST(NhoodCopy, VectorReallocationPreservesData) { + std::vector vec; + // Do NOT reserve — force reallocation during push_back + for (int i = 0; i < 20; i++) { + Nhood nh = make_populated_nhood(); + nh.pool[0].id = i; + vec.push_back(std::move(nh)); + } + + for (int i = 0; i < 20; i++) { + EXPECT_EQ(vec[i].pool[0].id, i) << "pool lost at index " << i; + EXPECT_EQ(vec[i].pool.size(), 3) << "pool truncated at index " << i; + EXPECT_EQ(vec[i].nn_old.size(), 3) << "nn_old lost at index " << i; + EXPECT_EQ(vec[i].rnn_new.size(), 2) << "rnn_new lost at index " << i; + EXPECT_EQ(vec[i].rnn_old.size(), 3) << "rnn_old lost at index " << i; + } +} diff --git a/thirdparty/faiss/tests/test_partition.py b/thirdparty/faiss/tests/test_partition.py index 48b4abbc0..351d25ea0 100644 --- a/thirdparty/faiss/tests/test_partition.py +++ b/thirdparty/faiss/tests/test_partition.py @@ -8,6 +8,8 @@ import faiss import unittest +from common_faiss_tests import for_all_simd_levels + class PartitionTests: @@ -44,6 +46,7 @@ def pointer_to_minus1(): return np.array([-1], dtype='int64').view("uint64") +@for_all_simd_levels class TestPartitioningFloat(unittest.TestCase, PartitionTests): def do_partition(self, n, q, maxval=None, seed=None): @@ -89,6 +92,7 @@ def do_partition(self, n, q, maxval=None, seed=None): self.assertEqual(n_eq, 0) +@for_all_simd_levels class TestPartitioningFloatMin(unittest.TestCase, PartitionTests): def do_partition(self, n, q, maxval=None, seed=None): @@ -140,6 +144,7 @@ def do_partition(self, n, q, maxval=None, seed=None): self.assertEqual(n_eq, 0) +@for_all_simd_levels class TestPartitioningUint16(unittest.TestCase, PartitionTests): def do_partition(self, n, q, maxval=65536, seed=None): @@ -185,7 +190,7 @@ def do_partition(self, n, q, maxval=65536, seed=None): self.assertEqual(n_eq, 0) - +@for_all_simd_levels class TestPartitioningUint16Min(unittest.TestCase, PartitionTests): def do_partition(self, n, q, maxval=65536, seed=None): @@ -233,6 +238,7 @@ def 
do_partition(self, n, q, maxval=65536, seed=None): self.assertEqual(n_eq, 0) +@for_all_simd_levels class TestHistograms(unittest.TestCase): def do_test(self, nbin, n): diff --git a/thirdparty/faiss/tests/test_product_quantizer.py b/thirdparty/faiss/tests/test_product_quantizer.py index 97fd98d3a..5a12d2c3b 100644 --- a/thirdparty/faiss/tests/test_product_quantizer.py +++ b/thirdparty/faiss/tests/test_product_quantizer.py @@ -10,8 +10,10 @@ import faiss import unittest +from common_faiss_tests import for_all_simd_levels +@for_all_simd_levels class TestProductQuantizer(unittest.TestCase): def test_pq(self): @@ -73,6 +75,7 @@ def test_codec(self): self.do_test_codec(i + 1) +@for_all_simd_levels class TestPQTransposedCentroids(unittest.TestCase): def do_test(self, d, dsub): @@ -111,6 +114,7 @@ def test_dsub4_odd(self): self.do_test(36, 4) +@for_all_simd_levels class TestPQTables(unittest.TestCase): def do_test(self, d, dsub, nbit=8, metric=None): diff --git a/thirdparty/faiss/tests/test_rabitq.py b/thirdparty/faiss/tests/test_rabitq.py index f2afc54d9..8d50bb30f 100644 --- a/thirdparty/faiss/tests/test_rabitq.py +++ b/thirdparty/faiss/tests/test_rabitq.py @@ -9,6 +9,8 @@ import faiss from faiss.contrib import datasets +from common_faiss_tests import for_all_simd_levels + def random_rotation(d, seed=123): rs = np.random.RandomState(seed) @@ -177,6 +179,7 @@ def search(self, x, k): return D, I +@for_all_simd_levels class TestRaBitQ(unittest.TestCase): def do_comparison_vs_pq_test(self, metric_type=faiss.METRIC_L2): ds = datasets.SyntheticDataset(TEST_DIM, TEST_N, TEST_N, 100) @@ -321,6 +324,7 @@ def test_serde_rabitq(self): do_test_serde("RaBitQ") +@for_all_simd_levels class TestIVFRaBitQ(unittest.TestCase): def do_comparison_vs_pq_test(self, metric_type=faiss.METRIC_L2): nlist = 64 @@ -506,6 +510,7 @@ def test_serde_ivfrabitq(self): do_test_serde("IVF16,RaBitQ") +@for_all_simd_levels class TestRaBitQuantizerEncodeDecode(unittest.TestCase): def 
do_test_encode_decode(self, d, metric): # rabitq must precisely reconstruct a vector, @@ -603,6 +608,7 @@ def do_test_serde(description): np.testing.assert_equal(Iref, Inew3) +@for_all_simd_levels class TestMultiBitRaBitQ(unittest.TestCase): """Consolidated tests for multi-bit RaBitQ. @@ -1008,84 +1014,6 @@ def test_degenerate_centroid_distance(self): err_msg=f"nb_bits={nb_bits}") -class TestRaBitQStats(unittest.TestCase): - """Test RaBitQStats tracking for multi-bit two-stage search.""" - - INDEX_TYPES = [ - "IndexRaBitQ", - "IndexIVFRaBitQ", - ] - - @classmethod - def setUpClass(cls): - cls.stats_available = hasattr(faiss, 'cvar') and hasattr( - faiss.cvar, 'rabitq_stats' - ) - if cls.stats_available: - cls.rabitq_stats = faiss.cvar.rabitq_stats - - def test_stats_reset_and_skip_percentage(self): - """Test that stats can be reset and skip_percentage works.""" - if not self.stats_available: - self.skipTest("rabitq_stats not available in Python bindings") - self.rabitq_stats.reset() - self.assertEqual(self.rabitq_stats.n_1bit_evaluations, 0) - self.assertEqual(self.rabitq_stats.n_multibit_evaluations, 0) - self.assertEqual(self.rabitq_stats.skip_percentage(), 0.0) - - def test_stats_collected_multibit_all_index_types(self): - """Test that stats are collected for all multi-bit index types.""" - if not self.stats_available: - self.skipTest("rabitq_stats not available in Python bindings") - ds = datasets.SyntheticDataset(384, 50000, 50000, 10) - nlist = 16 - - for index_type in self.INDEX_TYPES: - for nb_bits in [2, 4]: - with self.subTest(index_type=index_type, nb_bits=nb_bits): - self.rabitq_stats.reset() - - if index_type == "IndexRaBitQ": - index = faiss.IndexRaBitQ( - ds.d, faiss.METRIC_L2, nb_bits - ) - elif index_type == "IndexIVFRaBitQ": - quantizer = faiss.IndexFlat(ds.d, faiss.METRIC_L2) - index = faiss.IndexIVFRaBitQ( - quantizer, ds.d, nlist, faiss.METRIC_L2, - True, nb_bits - ) - index.nprobe = 4 - else: - raise ValueError(f"Unknown index type: 
{index_type}") - - index.train(ds.get_train()) - index.add(ds.get_database()) - index.search(ds.get_queries(), 10) - - self.assertGreater( - self.rabitq_stats.n_1bit_evaluations, 0 - ) - self.assertGreater( - self.rabitq_stats.n_multibit_evaluations, 0 - ) - # For multi-bit, filtering should skip some candidates - self.assertLess( - self.rabitq_stats.n_multibit_evaluations, - self.rabitq_stats.n_1bit_evaluations, - ) - skip_pct = self.rabitq_stats.skip_percentage() - self.assertGreater(skip_pct, 0.0) - self.assertLessEqual(skip_pct, 100.0) - - n_1bit = self.rabitq_stats.n_1bit_evaluations - n_multibit = self.rabitq_stats.n_multibit_evaluations - print( - f"{index_type} nb_bits={nb_bits}: " - f"n_1bit={n_1bit}, " - f"n_multibit={n_multibit}, " - f"skip={skip_pct:.1f}%" - ) if __name__ == "__main__": unittest.main() diff --git a/thirdparty/faiss/tests/test_rabitq_fastscan.py b/thirdparty/faiss/tests/test_rabitq_fastscan.py index 7ea843743..347dfa0ab 100644 --- a/thirdparty/faiss/tests/test_rabitq_fastscan.py +++ b/thirdparty/faiss/tests/test_rabitq_fastscan.py @@ -9,6 +9,8 @@ import faiss from faiss.contrib import datasets +from common_faiss_tests import for_all_simd_levels + def compute_expected_code_size(d, nb_bits): """Helper: Compute expected code size based on formula.""" @@ -23,6 +25,24 @@ def compute_expected_code_size(d, nb_bits): return base_size +def _create_fastscan_index( + d, metric, use_ivf=False, + nlist=16, nprobe=4, bbs=32, nb_bits=1, +): + """Create FastScan index (IVF or non-IVF).""" + if use_ivf: + quantizer = faiss.IndexFlat(d, metric) + index = faiss.IndexIVFRaBitQFastScan( + quantizer, d, nlist, metric, bbs, + True, nb_bits + ) + index.nprobe = nprobe + else: + index = faiss.IndexRaBitQFastScan(d, metric, bbs, nb_bits) + return index + + +@for_all_simd_levels class TestRaBitQFastScan(unittest.TestCase): """Unified tests for IndexRaBitQFastScan and IndexIVFRaBitQFastScan.""" @@ -33,18 +53,11 @@ def _create_index( self, d, metric, use_ivf=False, 
nlist=None, bbs=32, nb_bits=1, ): - """Create FastScan index (IVF or non-IVF).""" - if use_ivf: - nlist = nlist or self.NLIST - quantizer = faiss.IndexFlat(d, metric) - index = faiss.IndexIVFRaBitQFastScan( - quantizer, d, nlist, metric, bbs, - True, nb_bits - ) - index.nprobe = self.NPROBE - else: - index = faiss.IndexRaBitQFastScan(d, metric, bbs, nb_bits) - return index + return _create_fastscan_index( + d, metric, use_ivf=use_ivf, + nlist=nlist or self.NLIST, nprobe=self.NPROBE, + bbs=bbs, nb_bits=nb_bits, + ) def _create_baseline(self, d, metric, use_ivf=False, nlist=None): """Create baseline RaBitQ index (IVF or non-IVF).""" @@ -488,6 +501,7 @@ def test_search_with_parameters(self): self.assertGreater(recall, 0.4) +@for_all_simd_levels class TestIVFRaBitQFastScanFiltering(unittest.TestCase): NLIST = 32 NPROBE = 8 @@ -579,6 +593,7 @@ def test_batch_filter_l2_multibit(self): self._do_test_filter("batch", faiss.METRIC_L2, nb_bits=2) +@for_all_simd_levels class TestMultiBitRaBitQFastScan(unittest.TestCase): """Consolidated tests for multi-bit RaBitQ FastScan. 
@@ -938,8 +953,8 @@ def test_encode_decode_roundtrip(self): d, 500, 100, 10, metric=metric_str ) - index = TestRaBitQFastScan._create_index( - self, d, metric, use_ivf=use_ivf, + index = _create_fastscan_index( + d, metric, use_ivf=use_ivf, nlist=nlist, nb_bits=nb_bits, ) if use_ivf: @@ -1008,16 +1023,17 @@ def test_factory_with_batch_size(self): """Test factory construction with both nb_bits and batch size.""" ds = datasets.SyntheticDataset(64, 150, 200, 10) - factory_str = "RaBitQfs4_64" - index = faiss.index_factory(ds.d, factory_str) - self.assertIsInstance(index, faiss.IndexRaBitQFastScan) - self.assertEqual(index.rabitq.nb_bits, 4) - self.assertEqual(index.bbs, 64) + for bbs in [32, 64]: + factory_str = f"RaBitQfs4_{bbs}" + index = faiss.index_factory(ds.d, factory_str) + self.assertIsInstance(index, faiss.IndexRaBitQFastScan) + self.assertEqual(index.rabitq.nb_bits, 4) + self.assertEqual(index.bbs, bbs) - index.train(ds.get_train()) - index.add(ds.get_database()) - D, I = index.search(ds.get_queries(), 5) - self.assertEqual(D.shape, (ds.nq, 5)) + index.train(ds.get_train()) + index.add(ds.get_database()) + D, I = index.search(ds.get_queries(), 5) + self.assertEqual(D.shape, (ds.nq, 5)) def test_ivf_factory_construction(self): """Test that multi-bit IVF index can be constructed via factory.""" @@ -1054,72 +1070,58 @@ def test_ivf_factory_with_batch_size(self): self.assertEqual(D.shape, (ds.nq, 5)) -class TestRaBitQStatsFastScan(unittest.TestCase): - """Test RaBitQStats tracking for multi-bit two-stage search in FastScan.""" - - NLIST = 16 - NPROBE = 4 - - @classmethod - def setUpClass(cls): - cls.stats_available = hasattr(faiss, 'cvar') and hasattr( - faiss.cvar, 'rabitq_stats' - ) - if cls.stats_available: - cls.rabitq_stats = faiss.cvar.rabitq_stats - - def test_stats_reset_and_skip_percentage(self): - """Test that stats can be reset and skip_percentage works.""" - if not self.stats_available: - self.skipTest("rabitq_stats not available in Python bindings") 
- self.rabitq_stats.reset() - self.assertEqual(self.rabitq_stats.n_1bit_evaluations, 0) - self.assertEqual(self.rabitq_stats.n_multibit_evaluations, 0) - self.assertEqual(self.rabitq_stats.skip_percentage(), 0.0) - - def test_stats_collected_multibit_all_index_types(self): - """Test stats are collected for all multi-bit FastScan index types.""" - if not self.stats_available: - self.skipTest("rabitq_stats not available in Python bindings") - ds = datasets.SyntheticDataset(384, 50000, 50000, 10) +class TestRaBitQFastScanSearchParams(unittest.TestCase): + """Test that IVFRaBitQSearchParameters qb/centered are respected.""" - for use_ivf in [False, True]: - for nb_bits in [2, 4]: - with self.subTest(use_ivf=use_ivf, nb_bits=nb_bits): - self.rabitq_stats.reset() + def test_higher_qb_improves_recall(self): + """Search with qb=4 should give better recall than qb=1.""" + d = 64 + nlist = 16 + nprobe = 4 + k = 10 + ds = datasets.SyntheticDataset(d, 5000, 5000, 50) - index = TestRaBitQFastScan._create_index( - self, ds.d, faiss.METRIC_L2, - use_ivf=use_ivf, nb_bits=nb_bits, - ) - index.train(ds.get_train()) - index.add(ds.get_database()) - index.search(ds.get_queries(), 10) + # Ground truth with flat index + index_flat = faiss.IndexFlatL2(d) + index_flat.add(ds.get_database()) + _, I_gt = index_flat.search(ds.get_queries(), k) - self.assertGreater( - self.rabitq_stats.n_1bit_evaluations, 0 - ) - self.assertGreater( - self.rabitq_stats.n_multibit_evaluations, 0 - ) - self.assertLess( - self.rabitq_stats.n_multibit_evaluations, - self.rabitq_stats.n_1bit_evaluations - ) - skip_pct = self.rabitq_stats.skip_percentage() - self.assertGreater(skip_pct, 0.0) - self.assertLessEqual(skip_pct, 100.0) + # Build IVF RaBitQ FastScan index with default qb=8 + quantizer = faiss.IndexFlat(d, faiss.METRIC_L2) + index = faiss.IndexIVFRaBitQFastScan( + quantizer, d, nlist, faiss.METRIC_L2, 32, True + ) + index.nprobe = nprobe + index.train(ds.get_train()) + index.add(ds.get_database()) - 
index_type = ( - "IndexIVFRaBitQFastScan" if use_ivf - else "IndexRaBitQFastScan" - ) - print( - f"{index_type} nb_bits={nb_bits}: " - f"total={self.rabitq_stats.n_1bit_evaluations}, " - f"refined={self.rabitq_stats.n_multibit_evaluations}, " - f"skip={skip_pct:.1f}%" - ) + # Search with qb=1 (coarse quantization) + params_qb1 = faiss.IVFRaBitQSearchParameters() + params_qb1.nprobe = nprobe + params_qb1.qb = 1 + _, I_qb1 = index.search(ds.get_queries(), k, params=params_qb1) + + # Search with qb=4 (finer quantization) + params_qb4 = faiss.IVFRaBitQSearchParameters() + params_qb4.nprobe = nprobe + params_qb4.qb = 4 + _, I_qb4 = index.search(ds.get_queries(), k, params=params_qb4) + + # Compute recall@k + recall_qb1 = np.mean([ + len(np.intersect1d(I_qb1[i], I_gt[i])) / k + for i in range(ds.nq) + ]) + recall_qb4 = np.mean([ + len(np.intersect1d(I_qb4[i], I_gt[i])) / k + for i in range(ds.nq) + ]) + + self.assertGreater( + recall_qb4, recall_qb1, + f"qb=4 recall ({recall_qb4:.3f}) should be higher " + f"than qb=1 recall ({recall_qb1:.3f})" + ) if __name__ == "__main__": diff --git a/thirdparty/faiss/tests/test_read_index_deserialize.cpp b/thirdparty/faiss/tests/test_read_index_deserialize.cpp index a7938e6d2..31994a706 100644 --- a/thirdparty/faiss/tests/test_read_index_deserialize.cpp +++ b/thirdparty/faiss/tests/test_read_index_deserialize.cpp @@ -22,7 +22,7 @@ #include #include #include - +#include #include #include #include @@ -1569,6 +1569,84 @@ TEST(ReadIndexDeserialize, IndexIVFNullInvlistsAdd) { EXPECT_THROW(idx.add(1, xb.data()), FaissException); } +// ----------------------------------------------------------------------- +// IVF quantizer ntotal / nlist deserialization acceptance tests. +// The quantizer may legitimately have ntotal != nlist (e.g., sharded +// indexes, custom inverted list management, untrained quantizers). +// ----------------------------------------------------------------------- + +// Surplus quantizer centroids: ntotal > nlist. 
Produced by +// shard_ivf_index_centroids(), which distributes all of the original +// quantizer's centroids across shards without adjusting nlist. +// The search-time key < nlist bounds check prevents OOB access if +// the quantizer returns out-of-range keys. +TEST(ReadIndexDeserialize, IVFQuantizerSurplus) { + std::vector buf; + push_fourcc(buf, "IwFl"); + push_index_header(buf, /*d=*/4, /*ntotal=*/0); + push_val(buf, 2); // nlist = 2 + push_val(buf, 1); // nprobe + // Quantizer with ntotal=5 (more centroids than nlist) + push_minimal_flat(buf, /*d=*/4, /*ntotal=*/5); + push_empty_direct_map(buf); + push_null_invlists(buf); + + VectorIOReader reader; + reader.data = buf; + EXPECT_NO_THROW(read_index_up(&reader)); +} + +// Trained quantizer: ntotal == nlist (normal trained IVF). +TEST(ReadIndexDeserialize, IVFQuantizerTrained) { + std::vector buf; + push_fourcc(buf, "IwFl"); + push_index_header(buf, /*d=*/4, /*ntotal=*/0); + push_val(buf, 2); // nlist = 2 + push_val(buf, 1); // nprobe + push_minimal_flat(buf, /*d=*/4, /*ntotal=*/2); + push_empty_direct_map(buf); + push_null_invlists(buf); + + VectorIOReader reader; + reader.data = buf; + EXPECT_NO_THROW(read_index_up(&reader)); +} + +// Sharded quantizer: 0 < ntotal < nlist. Produced by +// shard_ivf_index_centroids(), where each shard's quantizer holds a +// subset of the full index's centroids. +TEST(ReadIndexDeserialize, IVFQuantizerSubset) { + std::vector buf; + push_fourcc(buf, "IwFl"); + push_index_header(buf, /*d=*/4, /*ntotal=*/0); + push_val(buf, 10); // nlist = 10 + push_val(buf, 1); // nprobe + // Quantizer with ntotal=3 (subset of centroids, as in sharding) + push_minimal_flat(buf, /*d=*/4, /*ntotal=*/3); + push_empty_direct_map(buf); + push_null_invlists(buf); + + VectorIOReader reader; + reader.data = buf; + EXPECT_NO_THROW(read_index_up(&reader)); +} + +// Untrained quantizer: ntotal == 0 (custom inverted list management). 
+TEST(ReadIndexDeserialize, IVFQuantizerUntrained) { + std::vector buf; + push_fourcc(buf, "IwFl"); + push_index_header(buf, /*d=*/4, /*ntotal=*/0); + push_val(buf, 10); // nlist = 10 + push_val(buf, 1); // nprobe + push_minimal_flat(buf, /*d=*/4, /*ntotal=*/0); + push_empty_direct_map(buf); + push_null_invlists(buf); + + VectorIOReader reader; + reader.data = buf; + EXPECT_NO_THROW(read_index_up(&reader)); +} + // ----------------------------------------------------------------------- // VectorTransform deserialization validation tests // ----------------------------------------------------------------------- @@ -2547,7 +2625,8 @@ static std::vector build_AQFastScan_buf( size_t fastscan_M = 3, size_t fastscan_ksub = 16, int bbs = 32, - int qbs = 0) { + int qbs = 0, + size_t fastscan_M2 = 0) { std::vector buf; push_fourcc(buf, "IRfs"); push_index_header(buf, /*d=*/4, /*ntotal=*/0); @@ -2567,19 +2646,21 @@ static std::vector build_AQFastScan_buf( push_val(buf, 1); // max_beam_size // FastScan fields (IndexAdditiveQuantizerFastScan): - push_val(buf, 0); // implem - push_val(buf, bbs); // bbs - push_val(buf, qbs); // qbs - push_val(buf, fastscan_M); // M - push_val(buf, 4); // nbits - push_val(buf, fastscan_ksub); // ksub - push_val(buf, 2); // code_size - push_val(buf, 0); // ntotal2 - push_val(buf, fastscan_M + (fastscan_M % 2)); // M2 (rounded up) - push_val(buf, true); // rescale_norm - push_val(buf, 1); // norm_scale - push_val(buf, 48); // max_train_points - push_vector(buf, {}); // codes + push_val(buf, 0); // implem + push_val(buf, bbs); // bbs + push_val(buf, qbs); // qbs + push_val(buf, fastscan_M); // M + push_val(buf, 4); // nbits + push_val(buf, fastscan_ksub); // ksub + push_val(buf, 2); // code_size + push_val(buf, 0); // ntotal2 + // M2: use override if provided, otherwise roundup(M, 2) + size_t M2 = fastscan_M2 ? 
fastscan_M2 : (fastscan_M + 1) & ~size_t{1}; + push_val(buf, M2); + push_val(buf, true); // rescale_norm + push_val(buf, 1); // norm_scale + push_val(buf, 48); // max_train_points + push_vector(buf, {}); // codes return buf; } @@ -2633,15 +2714,34 @@ TEST(ReadIndexDeserialize, IndexPQFastScanBbsNotAligned) { expect_read_throws_with(buf, "invalid bbs"); } +// ----------------------------------------------------------------------- +// IndexPQFastScan deserialization: M2 must equal roundup(M, 2). +// A corrupted file with M2=0 while M>0 causes compute_quantized_LUT +// to write M*ksub bytes into a buffer sized for M2*ksub=0 bytes. +// ----------------------------------------------------------------------- +TEST(ReadIndexDeserialize, IndexPQFastScanM2Zero) { + auto buf = build_IndexPQFastScan_buf(/*bbs=*/32, /*M2=*/0); + expect_read_throws_with(buf, "invalid M2"); +} + +// ----------------------------------------------------------------------- +// IndexPQFastScan deserialization: M2 too small (1 < roundup(2, 2) = 2). +// ----------------------------------------------------------------------- +TEST(ReadIndexDeserialize, IndexPQFastScanM2TooSmall) { + auto buf = build_IndexPQFastScan_buf(/*bbs=*/32, /*M2=*/1); + expect_read_throws_with(buf, "invalid M2"); +} + // ----------------------------------------------------------------------- // IndexPQFastScan deserialization: ksub * M2 overflow. // M2 is read directly from the file and could be corrupted to a huge // value that causes ksub * M2 to overflow size_t. // ----------------------------------------------------------------------- TEST(ReadIndexDeserialize, IndexPQFastScanKsubM2Overflow) { + // M2=SIZE_MAX is now caught by M2 != roundup(M, 2) before overflow. 
auto buf = build_IndexPQFastScan_buf( /*bbs=*/32, /*M2=*/std::numeric_limits::max()); - expect_read_throws_with(buf, "overflow"); + expect_read_throws_with(buf, "invalid M2"); } // ----------------------------------------------------------------------- @@ -2671,6 +2771,20 @@ TEST(ReadIndexDeserialize, AQFastScanBbsZero) { expect_read_throws_with(buf, "invalid bbs"); } +// ----------------------------------------------------------------------- +// IndexAdditiveQuantizerFastScan deserialization: M2 mismatch. +// M2 < M (here 1 vs 3) makes compute_quantized_LUT write out of bounds. +// ----------------------------------------------------------------------- +TEST(ReadIndexDeserialize, AQFastScanM2Mismatch) { + auto buf = build_AQFastScan_buf( + /*fastscan_M=*/3, + /*fastscan_ksub=*/16, + /*bbs=*/32, + /*qbs=*/0, + /*fastscan_M2=*/1); + expect_read_throws_with(buf, "invalid M2"); +} + // ----------------------------------------------------------------------- // IndexAdditiveQuantizerFastScan deserialization: ksub * M overflow. // ----------------------------------------------------------------------- @@ -3109,6 +3223,30 @@ TEST(ReadIndexDeserialize, IndexRQFastScanAQDimensionMismatch) { #ifdef FAISS_ENABLE_SVS +#include + +// An invalid storage_kind value should be rejected at deserialization time +// with a FaissException, not abort via FAISS_ASSERT in to_svs_storage_kind().
+TEST(ReadIndexDeserialize, SVSVamanaInvalidStorageKind) { + std::vector buf; + push_fourcc(buf, "ISVD"); + push_index_header(buf, 8, 0); + push_val(buf, 32); // graph_max_degree + push_val(buf, 1.2f); // alpha + push_val(buf, 10); // search_window_size + push_val(buf, 10); // search_buffer_capacity + push_val(buf, 64); // construction_window_size + push_val(buf, 750); // max_candidate_pool_size + push_val(buf, 28); // prune_to + push_val(buf, false); // use_full_search_history + push_val( + buf, + static_cast(SVS_count)); // storage_kind — first invalid value + push_val(buf, true); // initialized + + expect_read_throws_with(buf, "storage_kind"); +} + // When SVS is enabled, deserializing an SVS Vamana index with invalid SVS // stream data should throw a FaissException (from the SVS runtime load // failure) rather than crashing with a null-pointer dereference. @@ -3163,6 +3301,160 @@ TEST(ReadIndexDeserialize, SVSFlatInvalidStreamThrows) { faiss::FaissException); } +// ----------------------------------------------------------------------- +// Tests: IndexRefine / IndexRefinePanorama k_factor validation. +// +// Format "IxRF" (IndexRefineFlat) and "IxRP" (IndexRefinePanorama): +// fourcc + index_header + base_index + refine_index + k_factor +// +// k_factor must be finite and in [1, 1000]. +// ----------------------------------------------------------------------- + +// Helper: build a minimal IxRF or IxRP payload with the given k_factor. 
+static void push_index_refine( + std::vector& buf, + const char fourcc_str[4], + float k_factor) { + int d = 4; + push_fourcc(buf, fourcc_str); + push_index_header(buf, d, /*ntotal=*/0); + push_minimal_flat(buf, d); // base_index + push_minimal_flat(buf, d); // refine_index + push_val(buf, k_factor); +} + +TEST(ReadIndexDeserialize, IndexRefineKFactorValid) { + std::vector buf; + push_index_refine(buf, "IxRF", 1.0f); + + VectorIOReader reader; + reader.data = buf; + EXPECT_NO_THROW(read_index_up(&reader)); +} + +TEST(ReadIndexDeserialize, IndexRefineKFactorMax) { + std::vector buf; + push_index_refine(buf, "IxRF", 1000.0f); + + VectorIOReader reader; + reader.data = buf; + EXPECT_NO_THROW(read_index_up(&reader)); +} + +TEST(ReadIndexDeserialize, IndexRefineKFactorTooLarge) { + std::vector buf; + push_index_refine(buf, "IxRF", 1001.0f); + expect_read_throws_with(buf, "k_factor"); +} + +TEST(ReadIndexDeserialize, IndexRefineKFactorNegative) { + std::vector buf; + push_index_refine(buf, "IxRF", -1.0f); + expect_read_throws_with(buf, "k_factor"); +} + +TEST(ReadIndexDeserialize, IndexRefineKFactorZero) { + std::vector buf; + push_index_refine(buf, "IxRF", 0.0f); + expect_read_throws_with(buf, "k_factor"); +} + +TEST(ReadIndexDeserialize, IndexRefineKFactorInfinity) { + std::vector buf; + push_index_refine(buf, "IxRF", INFINITY); + expect_read_throws_with(buf, "k_factor"); +} + +TEST(ReadIndexDeserialize, IndexRefineKFactorNaN) { + std::vector buf; + push_index_refine(buf, "IxRF", NAN); + expect_read_throws_with(buf, "k_factor"); +} + +TEST(ReadIndexDeserialize, IndexRefinePanoramaKFactorTooLarge) { + std::vector buf; + push_index_refine(buf, "IxRP", 1e10f); + expect_read_throws_with(buf, "k_factor"); +} + +TEST(ReadIndexDeserialize, IndexRefinePanoramaKFactorValid) { + std::vector buf; + push_index_refine(buf, "IxRP", 4.0f); + + VectorIOReader reader; + reader.data = buf; + EXPECT_NO_THROW(read_index_up(&reader)); +} + +// 
----------------------------------------------------------------------- +// Tests: IndexIVFPQR k_factor validation via round-trip. +// +// Create a real IndexIVFPQR, serialize it, patch the k_factor bytes in +// the serialized blob, and verify deserialization rejects invalid values. +// ----------------------------------------------------------------------- + +// Helper: serialize an index to a byte vector. +static std::vector serialize_index(const Index* idx) { + VectorIOWriter writer; + write_index(idx, &writer); + return writer.data; +} + +// Helper: overwrite the last sizeof(float) bytes of a byte buffer. +// No search is performed; this relies on k_factor being the last field +// written for IVFPQR. +static void patch_last_float(std::vector& buf, float new_val) { + size_t offset = buf.size() - sizeof(float); + std::memcpy(buf.data() + offset, &new_val, sizeof(float)); +} + +// Helper: create a trained IndexIVFPQR, serialize it, return the bytes. +// Uses nbits=4 (16 centroids) so training succeeds with few vectors.
+static std::vector make_ivfpqr_bytes(float k_factor) { + int d = 8; + IndexFlatL2 quantizer(d); + IndexIVFPQR ivfpqr( + &quantizer, + d, + /*nlist=*/1, + /*M=*/2, + /*nbits_per_idx=*/4, + /*M_refine=*/2, + /*nbits_per_idx_refine=*/4); + + int ntrain = 64; + std::vector train_data(d * ntrain, 0.0f); + for (size_t i = 0; i < train_data.size(); i++) { + train_data[i] = float(i) / float(train_data.size()); + } + ivfpqr.train(ntrain, train_data.data()); + ivfpqr.k_factor = k_factor; + return serialize_index(&ivfpqr); +} + +TEST(ReadIndexDeserialize, IndexIVFPQRKFactorTooLarge) { + auto buf = make_ivfpqr_bytes(4.0f); + patch_last_float(buf, 1e10f); + expect_read_throws_with(buf, "k_factor"); +} + +TEST(ReadIndexDeserialize, IndexIVFPQRKFactorNegative) { + auto buf = make_ivfpqr_bytes(4.0f); + patch_last_float(buf, -1.0f); + expect_read_throws_with(buf, "k_factor"); +} + +TEST(ReadIndexDeserialize, IndexIVFPQRKFactorValid) { + auto buf = make_ivfpqr_bytes(64.0f); // AutoTune max + + VectorIOReader reader; + reader.data = buf; + auto idx = read_index_up(&reader); + auto* result = dynamic_cast(idx.get()); + ASSERT_NE(result, nullptr); + EXPECT_FLOAT_EQ(result->k_factor, 64.0f); +} + #else // !FAISS_ENABLE_SVS // When SVS is not enabled, attempting to read an index with an SVS fourcc diff --git a/thirdparty/faiss/tests/test_residual_quantizer.py b/thirdparty/faiss/tests/test_residual_quantizer.py index 07d017527..21b77e0ad 100644 --- a/thirdparty/faiss/tests/test_residual_quantizer.py +++ b/thirdparty/faiss/tests/test_residual_quantizer.py @@ -211,7 +211,7 @@ def test_training(self): pq.train(xt) err_pq = eval_codec(pq, xb) - # in practice RQ is often better than PQ but it does not the case here, so just check + # in practice RQ is often better than PQ but it is not the case here, so just check # that we are within some factor. 
self.assertLess(err_rq, err_pq * 1.2) diff --git a/thirdparty/faiss/tests/test_scalar_quantizer.cpp b/thirdparty/faiss/tests/test_scalar_quantizer.cpp index 2c5631c6c..fdae3d11d 100644 --- a/thirdparty/faiss/tests/test_scalar_quantizer.cpp +++ b/thirdparty/faiss/tests/test_scalar_quantizer.cpp @@ -7,9 +7,15 @@ #include +#include +#include #include +#include +#include +#include #include +#include TEST(ScalarQuantizer, RSQuantilesClamping) { int d = 8; @@ -95,3 +101,111 @@ TEST(ScalarQuantizer, RSQuantilesSmallDataset) { ASSERT_NO_THROW(sq.train(n, x.data())); } + +TEST(TestSQ0bit, CoarseOnlySearch) { + // Test QT_0bit: centroid-only distance + int d = 64; + int nlist = 8; + int nb = 1000; + int nq = 10; + int k = 5; + + std::vector xb(nb * d), xq(nq * d); + for (int i = 0; i < nb * d; i++) { + xb[i] = drand48(); + } + for (int i = 0; i < nq * d; i++) { + xq[i] = drand48(); + } + + faiss::IndexFlatL2 quantizer(d); + faiss::IndexIVFScalarQuantizer index( + &quantizer, + d, + nlist, + faiss::ScalarQuantizer::QT_0bit, + faiss::METRIC_L2, + false); + EXPECT_EQ(index.code_size, 0); + EXPECT_FALSE(index.by_residual); + + index.train(nb, xb.data()); + index.add(nb, xb.data()); + EXPECT_EQ(index.ntotal, nb); + + index.nprobe = nlist; + std::vector distances(nq * k); + std::vector labels(nq * k); + index.search(nq, xq.data(), k, distances.data(), labels.data()); + + // Verify we got results + for (int q = 0; q < nq; q++) { + EXPECT_GE(labels[q * k], 0); + } + + // Compare with direct quantizer search - distances should match + std::vector coarse_dis(nq * nlist); + std::vector coarse_ids(nq * nlist); + quantizer.search( + nq, xq.data(), nlist, coarse_dis.data(), coarse_ids.data()); + + for (int q = 0; q < nq; q++) { + float ivf_dis = distances[q * k]; + bool found = false; + for (int j = 0; j < nlist; j++) { + if (std::abs(ivf_dis - coarse_dis[q * nlist + j]) < 1e-5) { + found = true; + break; + } + } + EXPECT_TRUE(found) << "IVF distance " << ivf_dis + << " not found in 
coarse distances for query " << q; + } +} + +TEST(TestSQ0bit, IndexFactory) { + int d = 32; + std::unique_ptr index(faiss::index_factory(d, "IVF8,SQ0")); + EXPECT_NE(index, nullptr); + auto* ivfsq = dynamic_cast(index.get()); + EXPECT_NE(ivfsq, nullptr); + EXPECT_EQ(ivfsq->sq.qtype, faiss::ScalarQuantizer::QT_0bit); + EXPECT_EQ(ivfsq->code_size, 0); +} + +TEST(TestSQ0bit, InnerProduct) { + int d = 64; + int nlist = 4; + int nb = 500; + int nq = 5; + int k = 3; + + std::vector xb(nb * d), xq(nq * d); + for (int i = 0; i < nb * d; i++) { + xb[i] = drand48(); + } + for (int i = 0; i < nq * d; i++) { + xq[i] = drand48(); + } + + faiss::IndexFlatIP quantizer(d); + faiss::IndexIVFScalarQuantizer index( + &quantizer, + d, + nlist, + faiss::ScalarQuantizer::QT_0bit, + faiss::METRIC_INNER_PRODUCT, + false); + + index.train(nb, xb.data()); + index.add(nb, xb.data()); + + index.nprobe = nlist; + std::vector distances(nq * k); + std::vector labels(nq * k); + index.search(nq, xq.data(), k, distances.data(), labels.data()); + + for (int q = 0; q < nq; q++) { + EXPECT_GE(labels[q * k], 0); + } +} diff --git a/thirdparty/faiss/tests/test_scalar_quantizer_correctness.py b/thirdparty/faiss/tests/test_scalar_quantizer_correctness.py index 41c7b8420..f44f8d449 100644 --- a/thirdparty/faiss/tests/test_scalar_quantizer_correctness.py +++ b/thirdparty/faiss/tests/test_scalar_quantizer_correctness.py @@ -12,7 +12,10 @@ from faiss.contrib.datasets import SyntheticDataset +from common_faiss_tests import for_all_simd_levels + +@for_all_simd_levels class TestScalarQuantizerEncodeDecode(unittest.TestCase): def setUp(self): @@ -42,6 +45,7 @@ def test_4bit_uniform(self): self.do_encode_decode(faiss.ScalarQuantizer.QT_4bit_uniform, 0.1) +@for_all_simd_levels class TestScalarQuantizerSearch(unittest.TestCase): def setUp(self): @@ -70,6 +74,7 @@ def test_SQfp16(self): self.do_search('SQfp16', 0.99) +@for_all_simd_levels class TestScalarQuantizerDistances(unittest.TestCase): def 
test_distance_matches_reconstruct(self): @@ -90,6 +95,7 @@ def test_distance_matches_reconstruct(self): self.assertAlmostEqual(D[0, i], dist, places=4) +@for_all_simd_levels class TestScalarQuantizerEdgeCases(unittest.TestCase): def test_zero_vectors(self): @@ -139,6 +145,7 @@ def test_non_simd_dims(self): self.assertEqual(I.shape, (5, 10)) +@for_all_simd_levels class TestScalarQuantizerIP(unittest.TestCase): def test_inner_product(self): diff --git a/thirdparty/faiss/tests/test_simdlib.cpp b/thirdparty/faiss/tests/test_simdlib.cpp index c634c3440..e7eeaa87c 100644 --- a/thirdparty/faiss/tests/test_simdlib.cpp +++ b/thirdparty/faiss/tests/test_simdlib.cpp @@ -11,6 +11,11 @@ using namespace faiss; +// Explicit SIMD-level aliases (no global bare aliases). +using simd8float32 = simd8float32_tpl; +using simd8uint32 = simd8uint32_tpl; +using simd16uint16 = simd16uint16_tpl; + TEST(TestSIMDLib, TestCmpltAndBlendInplace) { simd8float32 lowestValues(0, 1, 2, 3, 4, 5, 6, 7); simd8uint32 lowestIndices(0, 1, 2, 3, 4, 5, 6, 7); diff --git a/thirdparty/faiss/tests/test_sorting.cpp b/thirdparty/faiss/tests/test_sorting.cpp new file mode 100644 index 000000000..1ccb4a9b2 --- /dev/null +++ b/thirdparty/faiss/tests/test_sorting.cpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include + +#include + +TEST(TestSorting, argsort_parallel_matches_serial) { + // n > 1M to exercise the parallel merge path + size_t n = 2000000; + + std::vector vals(n); + std::mt19937 rng(42); + std::uniform_real_distribution dist(-1000.0f, 1000.0f); + for (size_t i = 0; i < n; i++) { + vals[i] = dist(rng); + } + + std::vector perm_serial(n); + faiss::fvec_argsort(n, vals.data(), perm_serial.data()); + + std::vector perm_parallel(n); + faiss::fvec_argsort_parallel(n, vals.data(), perm_parallel.data()); + + // Permutations may differ on ties, but sorted values must match + for (size_t i = 0; i < n; i++) { + ASSERT_FLOAT_EQ(vals[perm_serial[i]], vals[perm_parallel[i]]) + << "mismatch at position " << i; + } +} + +TEST(TestSorting, hashtable_lookup) { + int log2_capacity = 12; + size_t capacity = (size_t)1 << log2_capacity; + + std::vector tab(capacity * 2); + faiss::hashtable_int64_to_int64_init(log2_capacity, tab.data()); + + size_t n = 200; + std::vector keys(n), vals(n); + for (size_t i = 0; i < n; i++) { + keys[i] = static_cast(i * 3); + vals[i] = static_cast(i + 1); + } + faiss::hashtable_int64_to_int64_add( + log2_capacity, tab.data(), n, keys.data(), vals.data()); + + // Interleave present and absent keys + size_t n_query = n * 2; + std::vector query_keys(n_query); + std::vector expected(n_query); + for (size_t i = 0; i < n; i++) { + query_keys[2 * i] = keys[i]; + expected[2 * i] = vals[i]; + query_keys[2 * i + 1] = + keys[i] + 1; // not a multiple of 3, never inserted + expected[2 * i + 1] = -1; + } + + std::vector result(n_query); + faiss::hashtable_int64_to_int64_lookup( + log2_capacity, + tab.data(), + n_query, + query_keys.data(), + result.data()); + + for (size_t i = 0; i < n_query; i++) { + ASSERT_EQ(result[i], expected[i]) + << "query key " << query_keys[i] << " at index " << i; + } +}