From 0f2f511b4b25330155cf65e051386d9d0c25f6b1 Mon Sep 17 00:00:00 2001 From: Alexandr Guzhva Date: Tue, 21 Apr 2026 12:56:26 -0400 Subject: [PATCH] remove fork's ScalarQuantizer Signed-off-by: Alexandr Guzhva --- cmake/libs/libfaiss.cmake | 39 +- .../data_view_dense_index/refine_computer.h | 19 +- src/index/hnsw/faiss_hnsw.cc | 77 +- src/index/ivf/ivf.cc | 12 +- src/index/ivf/ivf_wrapper.cc | 8 +- src/index/refine/refine_utils.cc | 49 +- src/index/refine/refine_utils.h | 12 +- src/simd/hook.cc | 2 - thirdparty/faiss/faiss/CMakeLists.txt | 26 + .../faiss/cppcontrib/knowhere/FaissHook.cpp | 83 - .../faiss/cppcontrib/knowhere/FaissHook.h | 48 - .../knowhere/IVFIteratorWorkspace.cpp | 1 + .../knowhere/IndexBinaryScalarQuantizer.cpp | 177 ++ .../knowhere/IndexBinaryScalarQuantizer.h | 63 + .../faiss/cppcontrib/knowhere/IndexCosine.cpp | 18 +- .../faiss/cppcontrib/knowhere/IndexCosine.h | 9 +- .../faiss/cppcontrib/knowhere/IndexFlat.cpp | 2 +- .../faiss/cppcontrib/knowhere/IndexHNSW.cpp | 4 +- .../faiss/cppcontrib/knowhere/IndexHNSW.h | 4 +- .../cppcontrib/knowhere/IndexHNSWBinary.cpp | 33 + .../cppcontrib/knowhere/IndexHNSWBinary.h | 47 + .../cppcontrib/knowhere/IndexIVFFlat.cpp | 3 +- .../faiss/cppcontrib/knowhere/IndexIVFPQ.cpp | 2 +- .../knowhere/IndexIVFScalarQuantizerCC.cpp | 8 +- .../knowhere/IndexIVFScalarQuantizerCC.h | 4 +- .../cppcontrib/knowhere/IndexSQ4Uniform.cpp | 64 +- .../cppcontrib/knowhere/IndexSQ4Uniform.h | 13 +- .../faiss/cppcontrib/knowhere/IndexScaNN.cpp | 1 - .../knowhere/IndexScalarQuantizer.cpp | 355 ++-- .../knowhere/IndexScalarQuantizer.h | 53 +- .../knowhere/impl/RaBitQuantizer.cpp | 2 +- .../knowhere/impl/ScalarQuantizer.cpp | 210 --- .../knowhere/impl/ScalarQuantizer.h | 146 -- .../knowhere/impl/ScalarQuantizerCodec.h | 979 ----------- .../knowhere/impl/ScalarQuantizerCodec_avx.h | 1230 ------------- .../impl/ScalarQuantizerCodec_avx512.h | 1518 ----------------- .../knowhere/impl/ScalarQuantizerCodec_neon.h | 1074 ------------ 
.../knowhere/impl/ScalarQuantizerCodec_rvv.h | 1354 --------------- .../knowhere/impl/ScalarQuantizerDC.cpp | 117 -- .../knowhere/impl/ScalarQuantizerDC.h | 53 - .../knowhere/impl/ScalarQuantizerDC_avx.cpp | 73 - .../knowhere/impl/ScalarQuantizerDC_avx.h | 41 - .../impl/ScalarQuantizerDC_avx512.cpp | 84 - .../knowhere/impl/ScalarQuantizerDC_avx512.h | 41 - .../knowhere/impl/ScalarQuantizerDC_neon.cpp | 73 - .../knowhere/impl/ScalarQuantizerDC_neon.h | 41 - .../knowhere/impl/ScalarQuantizerDC_rvv.cpp | 78 - .../knowhere/impl/ScalarQuantizerDC_rvv.h | 42 - .../knowhere/impl/ScalarQuantizerOp.cpp | 193 --- .../knowhere/impl/ScalarQuantizerOp.h | 35 - .../knowhere/impl/ScalarQuantizerScanner.h | 343 ---- .../cppcontrib/knowhere/impl/index_read.cpp | 50 +- .../cppcontrib/knowhere/impl/index_write.cpp | 47 +- .../knowhere/impl/sq-avx2-fastpath.cpp | 328 ++++ .../knowhere/impl/sq-avx512-fastpath.cpp | 547 ++++++ .../knowhere/impl/sq-neon-fastpath.cpp | 247 +++ .../cppcontrib/knowhere/utils/distances.cpp | 2 +- thirdparty/faiss/faiss/impl/ScalarQuantizer.h | 19 + .../faiss/impl/scalar_quantizer/sq-avx2.cpp | 37 + .../faiss/impl/scalar_quantizer/sq-avx512.cpp | 36 + .../faiss/impl/scalar_quantizer/sq-neon.cpp | 36 + .../faiss/impl/scalar_quantizer/sq-rvv.cpp | 311 ++++ thirdparty/faiss/faiss/impl/simd_dispatch.h | 8 + thirdparty/faiss/faiss/utils/simd_levels.cpp | 13 + thirdparty/faiss/faiss/utils/simd_levels.h | 7 + 65 files changed, 2499 insertions(+), 8152 deletions(-) delete mode 100644 thirdparty/faiss/faiss/cppcontrib/knowhere/FaissHook.cpp delete mode 100644 thirdparty/faiss/faiss/cppcontrib/knowhere/FaissHook.h create mode 100644 thirdparty/faiss/faiss/cppcontrib/knowhere/IndexBinaryScalarQuantizer.cpp create mode 100644 thirdparty/faiss/faiss/cppcontrib/knowhere/IndexBinaryScalarQuantizer.h create mode 100644 thirdparty/faiss/faiss/cppcontrib/knowhere/IndexHNSWBinary.cpp create mode 100644 thirdparty/faiss/faiss/cppcontrib/knowhere/IndexHNSWBinary.h delete mode 
100644 thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizer.cpp delete mode 100644 thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizer.h delete mode 100644 thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec.h delete mode 100644 thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec_avx.h delete mode 100644 thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec_avx512.h delete mode 100644 thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec_neon.h delete mode 100644 thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec_rvv.h delete mode 100644 thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC.cpp delete mode 100644 thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC.h delete mode 100644 thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_avx.cpp delete mode 100644 thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_avx.h delete mode 100644 thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_avx512.cpp delete mode 100644 thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_avx512.h delete mode 100644 thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_neon.cpp delete mode 100644 thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_neon.h delete mode 100644 thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_rvv.cpp delete mode 100644 thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_rvv.h delete mode 100644 thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerOp.cpp delete mode 100644 thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerOp.h delete mode 100644 thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerScanner.h create mode 100644 thirdparty/faiss/faiss/cppcontrib/knowhere/impl/sq-avx2-fastpath.cpp create mode 100644 thirdparty/faiss/faiss/cppcontrib/knowhere/impl/sq-avx512-fastpath.cpp 
create mode 100644 thirdparty/faiss/faiss/cppcontrib/knowhere/impl/sq-neon-fastpath.cpp create mode 100644 thirdparty/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp diff --git a/cmake/libs/libfaiss.cmake b/cmake/libs/libfaiss.cmake index a50ff5e3f..47fd43db2 100644 --- a/cmake/libs/libfaiss.cmake +++ b/cmake/libs/libfaiss.cmake @@ -37,18 +37,31 @@ knowhere_file_glob( FAISS_AVX512_SRCS thirdparty/faiss/faiss/cppcontrib/knowhere/impl/*avx512.cpp ) -# AVX512 vanilla Faiss dynamic dispatch related files +# AVX512 vanilla Faiss dynamic dispatch related files. Baseline +# sq-avx512.cpp is replaced by a knowhere-local prelude file that declares +# a fast DCTemplate specialization for QT_4bit_uniform + L2 and then +# textually #includes the baseline sq-avx512.cpp — see +# cppcontrib/knowhere/impl/sq-avx512-fastpath.cpp for the full design note. knowhere_file_glob( GLOB FAISS_DD_AVX512_SRCS thirdparty/faiss/faiss/impl/fast_scan/impl-avx512.cpp thirdparty/faiss/faiss/impl/hnsw/avx512.cpp thirdparty/faiss/faiss/impl/pq_code_distance/pq_code_distance-avx512.cpp - thirdparty/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp + thirdparty/faiss/faiss/cppcontrib/knowhere/impl/sq-avx512-fastpath.cpp thirdparty/faiss/faiss/utils/distances_fused/avx512.cpp thirdparty/faiss/faiss/utils/simd_impl/distances_avx512.cpp thirdparty/faiss/faiss/utils/simd_impl/rabitq_avx512.cpp ) +# Baseline sq-avx512.cpp is pulled in textually by the prelude file, not +# compiled directly. Remove it from the generic list so it is not picked +# up as a stand-alone TU (which would duplicate symbols). 
+knowhere_file_glob( + GLOB + FAISS_SQ_AVX512_EXCLUDE + thirdparty/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +) +list(REMOVE_ITEM FAISS_SRCS ${FAISS_SQ_AVX512_EXCLUDE}) # combine files list(APPEND FAISS_AVX512_SRCS ${FAISS_DD_AVX512_SRCS}) # remove platform files from general files @@ -61,7 +74,8 @@ knowhere_file_glob( FAISS_AVX2_SRCS thirdparty/faiss/faiss/cppcontrib/knowhere/impl/*avx.cpp ) -# AVX2 vanilla Faiss dynamic dispatch related files +# AVX2 vanilla Faiss dynamic dispatch related files. sq-avx2.cpp is +# textually wrapped by sq-avx2-fastpath.cpp (see design note there). knowhere_file_glob( GLOB FAISS_DD_AVX2_SRCS @@ -69,12 +83,18 @@ knowhere_file_glob( thirdparty/faiss/faiss/impl/fast_scan/impl-avx2.cpp thirdparty/faiss/faiss/impl/hnsw/avx2.cpp thirdparty/faiss/faiss/impl/pq_code_distance/pq_code_distance-avx2.cpp - thirdparty/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp + thirdparty/faiss/faiss/cppcontrib/knowhere/impl/sq-avx2-fastpath.cpp thirdparty/faiss/faiss/utils/distances_fused/simdlib_based.cpp thirdparty/faiss/faiss/utils/simd_impl/distances_avx2.cpp thirdparty/faiss/faiss/utils/simd_impl/partitioning_avx2.cpp thirdparty/faiss/faiss/utils/simd_impl/rabitq_avx2.cpp ) +knowhere_file_glob( + GLOB + FAISS_SQ_AVX2_EXCLUDE + thirdparty/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +) +list(REMOVE_ITEM FAISS_SRCS ${FAISS_SQ_AVX2_EXCLUDE}) # combine files list(APPEND FAISS_AVX2_SRCS ${FAISS_DD_AVX2_SRCS}) # remove platform files from general files @@ -105,18 +125,25 @@ knowhere_file_glob( FAISS_NEON_SRCS thirdparty/faiss/faiss/cppcontrib/knowhere/impl/*neon.cpp ) -# NEON vanilla Faiss dynamic dispatch related files +# NEON vanilla Faiss dynamic dispatch related files. sq-neon.cpp is +# textually wrapped by sq-neon-fastpath.cpp (see design note there). 
knowhere_file_glob( GLOB FAISS_DD_NEON_SRCS thirdparty/faiss/faiss/impl/approx_topk/neon.cpp thirdparty/faiss/faiss/impl/fast_scan/impl-neon.cpp - thirdparty/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp + thirdparty/faiss/faiss/cppcontrib/knowhere/impl/sq-neon-fastpath.cpp thirdparty/faiss/faiss/utils/distances_fused/simdlib_based_neon.cpp thirdparty/faiss/faiss/utils/simd_impl/distances_aarch64.cpp thirdparty/faiss/faiss/utils/simd_impl/partitioning_neon.cpp thirdparty/faiss/faiss/utils/simd_impl/rabitq_neon.cpp ) +knowhere_file_glob( + GLOB + FAISS_SQ_NEON_EXCLUDE + thirdparty/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +) +list(REMOVE_ITEM FAISS_SRCS ${FAISS_SQ_NEON_EXCLUDE}) # combine files list(APPEND FAISS_NEON_SRCS ${FAISS_DD_NEON_SRCS}) # remove platform files from general files diff --git a/src/index/data_view_dense_index/refine_computer.h b/src/index/data_view_dense_index/refine_computer.h index 08f409cf4..0390b856d 100644 --- a/src/index/data_view_dense_index/refine_computer.h +++ b/src/index/data_view_dense_index/refine_computer.h @@ -12,9 +12,9 @@ // knowhere-specific indices #pragma once -#include "faiss/cppcontrib/knowhere/impl/ScalarQuantizer.h" #include "faiss/cppcontrib/knowhere/invlists/InvertedLists.h" #include "faiss/impl/DistanceComputer.h" +#include "faiss/impl/ScalarQuantizer.h" #include "knowhere/comp/index_param.h" #include "knowhere/object.h" #include "knowhere/operands.h" @@ -63,16 +63,13 @@ struct QuantRefine { } switch (refine_type) { case RefineType::UINT8_QUANT: - quantizer = new faiss::cppcontrib::knowhere::ScalarQuantizer( - d, faiss::cppcontrib::knowhere::ScalarQuantizer::QuantizerType::QT_8bit); + quantizer = new faiss::ScalarQuantizer(d, faiss::ScalarQuantizer::QuantizerType::QT_8bit); break; case RefineType::BFLOAT16_QUANT: - quantizer = new faiss::cppcontrib::knowhere::ScalarQuantizer( - d, faiss::cppcontrib::knowhere::ScalarQuantizer::QuantizerType::QT_bf16); + quantizer = new faiss::ScalarQuantizer(d, 
faiss::ScalarQuantizer::QuantizerType::QT_bf16); break; case RefineType::FLOAT16_QUANT: - quantizer = new faiss::cppcontrib::knowhere::ScalarQuantizer( - d, faiss::cppcontrib::knowhere::ScalarQuantizer::QuantizerType::QT_fp16); + quantizer = new faiss::ScalarQuantizer(d, faiss::ScalarQuantizer::QuantizerType::QT_fp16); break; default: throw std::runtime_error("Fail to generate quant for refiner if refine_type == RefineType::DATA_VIEW"); @@ -118,9 +115,9 @@ struct QuantRefine { GetMetric() { return metric_type; } - std::unique_ptr + std::unique_ptr GetQuantComputer() { - return std::unique_ptr( + return std::unique_ptr( quantizer->get_distance_computer(metric_type)); } DataFormatEnum @@ -141,7 +138,7 @@ struct QuantRefine { static constexpr size_t key = 0; static constexpr size_t list_num = 1; static constexpr size_t segment_size = 48; - faiss::cppcontrib::knowhere::ScalarQuantizer* quantizer = nullptr; + faiss::ScalarQuantizer* quantizer = nullptr; faiss::cppcontrib::knowhere::InvertedLists* storage = nullptr; faiss::MetricType metric_type; DataFormatEnum origin_data_type; @@ -153,7 +150,7 @@ template struct QuantDataDistanceComputer : faiss::DistanceComputer { std::vector query_buf; std::shared_ptr quant_data; - std::unique_ptr qc; + std::unique_ptr qc; float q_norm; size_t dim; diff --git a/src/index/hnsw/faiss_hnsw.cc b/src/index/hnsw/faiss_hnsw.cc index 1cf64c1f3..0f9c52d02 100644 --- a/src/index/hnsw/faiss_hnsw.cc +++ b/src/index/hnsw/faiss_hnsw.cc @@ -9,8 +9,10 @@ // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express // or implied. See the License for the specific language governing permissions and limitations under the License. 
+#include #include #include +#include #include #include #include @@ -32,7 +34,6 @@ #include "common/metric.h" #include "faiss/cppcontrib/knowhere/IndexHNSW.h" #include "faiss/cppcontrib/knowhere/IndexRefine.h" -#include "faiss/cppcontrib/knowhere/impl/ScalarQuantizer.h" #include "faiss/cppcontrib/knowhere/index_io.h" #include "faiss/impl/mapped_io.h" #include "index/clustering_config.h" @@ -546,10 +547,10 @@ convert_rows_to_fp32(const void* const __restrict src_in, float* const __restric // where each query_row has ((dim + 7) / 8) * 8 bits, and the total is nrows * ((dim + 7) / 8) * 8 bits. // But the final format required is nrows * dim * 32 bits (float). // There are actually two conversions happening here: - // 1. Each uint8_t value must be converted to float (in `BinarySQDistanceComputerWrapper::set_query` - // and `ScalarQuantizer::compute_codes`), it will be converted back to uint8_t). [same as int8] + // 1. Each uint8_t value must be converted to float (in `BinaryFlatCodesDC::set_query` inside + // IndexBinaryScalarQuantizer, it will be converted back to uint8_t). [same as int8] // 2. Each row must occupy dim * 32 bits of space, even if not all bits are filled; - // this is required by the convention set in `ScalarQuantizer::compute_codes`. + // this is required by the convention set by IndexBinaryScalarQuantizer::sa_encode. const knowhere::bin1* const src = reinterpret_cast(src_in); auto uint8_dim = (dim + 7) / 8; for (size_t i = 0; i < nrows; i++) { @@ -711,20 +712,26 @@ get_index_data_format(const faiss::Index* index) { return DataFormatEnum::fp32; } - // is it sq? 
- // note: IndexScalarQuantizerCosine preserves the original data, no cosine norm is appliesd - auto index_sq = dynamic_cast(index); - if (index_sq != nullptr) { - if (index_sq->sq.qtype == faiss::cppcontrib::knowhere::ScalarQuantizer::QT_bf16) { - return DataFormatEnum::bf16; - } else if (index_sq->sq.qtype == faiss::cppcontrib::knowhere::ScalarQuantizer::QT_fp16) { - return DataFormatEnum::fp16; - } else if (index_sq->sq.qtype == faiss::cppcontrib::knowhere::ScalarQuantizer::QT_8bit_direct_signed) { - return DataFormatEnum::int8; - } else if (index_sq->sq.qtype == faiss::cppcontrib::knowhere::ScalarQuantizer::QT_1bit_direct) { - return DataFormatEnum::bin1; - } else { - return std::nullopt; + // is it binary (1-bit-direct)? Routed through + // IndexBinaryScalarQuantizer, which replaces the legacy + // IndexScalarQuantizer(QT_1bit_direct) path. + if (dynamic_cast(index) != nullptr) { + return DataFormatEnum::bin1; + } + + // is it sq? All SQ storage produced by knowhere now inherits from + // baseline faiss::IndexScalarQuantizer (Cosine/SQ4U wrappers, + // plain IndexHNSWSQ, and refine). 
+ if (auto* index_sq = dynamic_cast(index)) { + switch (index_sq->sq.qtype) { + case faiss::ScalarQuantizer::QT_bf16: + return DataFormatEnum::bf16; + case faiss::ScalarQuantizer::QT_fp16: + return DataFormatEnum::fp16; + case faiss::ScalarQuantizer::QT_8bit_direct_signed: + return DataFormatEnum::int8; + default: + return std::nullopt; } } @@ -2068,9 +2075,8 @@ class BaseFaissRegularIndexHNSWFlatNode : public BaseFaissRegularIndexHNSWNode { if (is_binary) { if (metric.value() == faiss::MetricType::METRIC_Hamming || metric.value() == faiss::MetricType::METRIC_Jaccard) { - hnsw_index = std::make_unique( - dim, faiss::cppcontrib::knowhere::ScalarQuantizer::QT_1bit_direct, hnsw_cfg.M.value(), - metric.value()); + hnsw_index = std::make_unique(dim, hnsw_cfg.M.value(), + metric.value()); } else { LOG_KNOWHERE_ERROR_ << "Unsupported metric for binary data: " << hnsw_cfg.metric_type.value(); return Status::invalid_metric_type; @@ -2082,14 +2088,13 @@ class BaseFaissRegularIndexHNSWFlatNode : public BaseFaissRegularIndexHNSWNode { std::make_unique(dim, hnsw_cfg.M.value()); } else if (data_format == DataFormatEnum::fp16) { hnsw_index = std::make_unique( - dim, faiss::cppcontrib::knowhere::ScalarQuantizer::QT_fp16, hnsw_cfg.M.value()); + dim, faiss::ScalarQuantizer::QT_fp16, hnsw_cfg.M.value()); } else if (data_format == DataFormatEnum::bf16) { hnsw_index = std::make_unique( - dim, faiss::cppcontrib::knowhere::ScalarQuantizer::QT_bf16, hnsw_cfg.M.value()); + dim, faiss::ScalarQuantizer::QT_bf16, hnsw_cfg.M.value()); } else if (data_format == DataFormatEnum::int8) { hnsw_index = std::make_unique( - dim, faiss::cppcontrib::knowhere::ScalarQuantizer::QT_8bit_direct_signed, - hnsw_cfg.M.value()); + dim, faiss::ScalarQuantizer::QT_8bit_direct_signed, hnsw_cfg.M.value()); } else { LOG_KNOWHERE_ERROR_ << "Unsupported metric type: " << hnsw_cfg.metric_type.value(); return Status::invalid_metric_type; @@ -2100,16 +2105,13 @@ class BaseFaissRegularIndexHNSWFlatNode : public 
BaseFaissRegularIndexHNSWNode { dim, hnsw_cfg.M.value(), metric.value()); } else if (data_format == DataFormatEnum::fp16) { hnsw_index = std::make_unique( - dim, faiss::cppcontrib::knowhere::ScalarQuantizer::QT_fp16, hnsw_cfg.M.value(), - metric.value()); + dim, faiss::ScalarQuantizer::QT_fp16, hnsw_cfg.M.value(), metric.value()); } else if (data_format == DataFormatEnum::bf16) { hnsw_index = std::make_unique( - dim, faiss::cppcontrib::knowhere::ScalarQuantizer::QT_bf16, hnsw_cfg.M.value(), - metric.value()); + dim, faiss::ScalarQuantizer::QT_bf16, hnsw_cfg.M.value(), metric.value()); } else if (data_format == DataFormatEnum::int8) { hnsw_index = std::make_unique( - dim, faiss::cppcontrib::knowhere::ScalarQuantizer::QT_8bit_direct_signed, - hnsw_cfg.M.value(), metric.value()); + dim, faiss::ScalarQuantizer::QT_8bit_direct_signed, hnsw_cfg.M.value(), metric.value()); } else { LOG_KNOWHERE_ERROR_ << "Unsupported metric type: " << hnsw_cfg.metric_type.value(); return Status::invalid_metric_type; @@ -2548,7 +2550,7 @@ class BaseFaissRegularIndexHNSWSQNode : public BaseFaissRegularIndexHNSWNode { // create an index const bool is_cosine = IsMetricType(hnsw_cfg.metric_type.value(), metric::COSINE); - const bool is_sq4u = sq_type.value() == faiss::cppcontrib::knowhere::ScalarQuantizer::QT_4bit_uniform; + const bool is_sq4u = sq_type.value() == faiss::ScalarQuantizer::QT_4bit_uniform; // should refine be used? std::unique_ptr final_index; @@ -2570,6 +2572,17 @@ class BaseFaissRegularIndexHNSWSQNode : public BaseFaissRegularIndexHNSWNode { } else { hnsw_index = std::make_unique( dim, sq_type.value(), hnsw_cfg.M.value(), metric.value()); + // QT_4bit_uniform + L2 benefits from quantile-based range + // estimation. This used to be hard-coded inside the fork + // IndexScalarQuantizer ctor; moved here so that ctor is + // behaviorally equivalent to baseline. 
+ if (is_sq4u) { + auto* idx_sq = dynamic_cast(hnsw_index->storage); + if (idx_sq != nullptr) { + idx_sq->sq.rangestat = faiss::ScalarQuantizer::RS_quantiles; + idx_sq->sq.rangestat_arg = 0.01; + } + } } hnsw_index->hnsw.efConstruction = hnsw_cfg.efConstruction.value(); diff --git a/src/index/ivf/ivf.cc b/src/index/ivf/ivf.cc index ab47304bb..97fe824fd 100644 --- a/src/index/ivf/ivf.cc +++ b/src/index/ivf/ivf.cc @@ -507,19 +507,19 @@ to_index_flat(std::unique_ptr&& index) { return std::make_unique(std::move(*index)); } -expected +expected get_ivf_sq_quantizer_type(int code_size) { switch (code_size) { case 4: - return faiss::cppcontrib::knowhere::ScalarQuantizer::QuantizerType::QT_4bit; + return faiss::ScalarQuantizer::QuantizerType::QT_4bit; case 6: - return faiss::cppcontrib::knowhere::ScalarQuantizer::QuantizerType::QT_6bit; + return faiss::ScalarQuantizer::QuantizerType::QT_6bit; case 8: - return faiss::cppcontrib::knowhere::ScalarQuantizer::QuantizerType::QT_8bit; + return faiss::ScalarQuantizer::QuantizerType::QT_8bit; case 16: - return faiss::cppcontrib::knowhere::ScalarQuantizer::QuantizerType::QT_fp16; + return faiss::ScalarQuantizer::QuantizerType::QT_fp16; default: - return expected::Err( + return expected::Err( Status::invalid_args, fmt::format("current code size {} not in (4, 6, 8, 16)", code_size)); } } diff --git a/src/index/ivf/ivf_wrapper.cc b/src/index/ivf/ivf_wrapper.cc index 72a3eaf9d..04ca212df 100644 --- a/src/index/ivf/ivf_wrapper.cc +++ b/src/index/ivf/ivf_wrapper.cc @@ -196,15 +196,15 @@ IndexIvfFactory::create_for_sq(faiss::cppcontrib::knowhere::IndexFlat* qzr_raw_p // create IndexIVFSQ // Index does not own qzr - faiss::cppcontrib::knowhere::ScalarQuantizer::QuantizerType quantizer_type; + faiss::ScalarQuantizer::QuantizerType quantizer_type; // ivf_sq_cfg.sq_type.value() has already been guaranteed to be legal in CheckAndAdjust std::string quantizer_type_tolower = str_to_lower(ivf_sq_cfg.sq_type.value()); if (quantizer_type_tolower == 
"sq4") { - quantizer_type = faiss::cppcontrib::knowhere::ScalarQuantizer::QuantizerType::QT_4bit; + quantizer_type = faiss::ScalarQuantizer::QuantizerType::QT_4bit; } else if (quantizer_type_tolower == "sq6") { - quantizer_type = faiss::cppcontrib::knowhere::ScalarQuantizer::QuantizerType::QT_6bit; + quantizer_type = faiss::ScalarQuantizer::QuantizerType::QT_6bit; } else { - quantizer_type = faiss::cppcontrib::knowhere::ScalarQuantizer::QuantizerType::QT_8bit; + quantizer_type = faiss::ScalarQuantizer::QuantizerType::QT_8bit; } auto index = std::make_unique(qzr_raw_ptr, d, nlist, quantizer_type, metric); diff --git a/src/index/refine/refine_utils.cc b/src/index/refine/refine_utils.cc index 25f81e74b..3c32f1dc3 100644 --- a/src/index/refine/refine_utils.cc +++ b/src/index/refine/refine_utils.cc @@ -7,8 +7,8 @@ #include #include +#include "faiss/IndexScalarQuantizer.h" #include "faiss/cppcontrib/knowhere/IndexRefine.h" -#include "faiss/cppcontrib/knowhere/IndexScalarQuantizer.h" #include "fmt/format.h" #include "knowhere/log.h" #include "knowhere/tolower.h" @@ -16,21 +16,18 @@ namespace knowhere { // a supporting function -expected +expected get_sq_quantizer_type(const std::string& sq_type) { - std::map sq_types = { - {"sq4u", faiss::cppcontrib::knowhere::ScalarQuantizer::QT_4bit_uniform}, - {"sq6", faiss::cppcontrib::knowhere::ScalarQuantizer::QT_6bit}, - {"sq8", faiss::cppcontrib::knowhere::ScalarQuantizer::QT_8bit}, - {"fp16", faiss::cppcontrib::knowhere::ScalarQuantizer::QT_fp16}, - {"bf16", faiss::cppcontrib::knowhere::ScalarQuantizer::QT_bf16}, - {"int8", faiss::cppcontrib::knowhere::ScalarQuantizer::QT_8bit_direct_signed}}; + std::map sq_types = { + {"sq4u", faiss::ScalarQuantizer::QT_4bit_uniform}, {"sq6", faiss::ScalarQuantizer::QT_6bit}, + {"sq8", faiss::ScalarQuantizer::QT_8bit}, {"fp16", faiss::ScalarQuantizer::QT_fp16}, + {"bf16", faiss::ScalarQuantizer::QT_bf16}, {"int8", faiss::ScalarQuantizer::QT_8bit_direct_signed}}; // todo: tolower auto 
sq_type_tolower = str_to_lower(sq_type); auto itr = sq_types.find(sq_type_tolower); if (itr == sq_types.cend()) { - return expected::Err( + return expected::Err( Status::invalid_args, fmt::format("invalid scalar quantizer type ({})", sq_type_tolower)); } @@ -61,8 +58,7 @@ is_flat_refine(const std::optional& refine_type) { } bool -has_lossless_quant(const expected& quant_type, - DataFormatEnum dataFormat) { +has_lossless_quant(const expected& quant_type, DataFormatEnum dataFormat) { if (!quant_type.has_value()) { return false; } @@ -72,11 +68,11 @@ has_lossless_quant(const expected>::Err( Status::invalid_args, "fp16 input data does not accept bf16 or fp32 as a refine index."); @@ -127,7 +123,7 @@ pick_refine_index(const DataFormatEnum data_format, const std::optional>::Err( Status::invalid_args, "bf16 input data does not accept fp16 or fp32 as a refine index."); @@ -159,9 +155,22 @@ pick_refine_index(const DataFormatEnum data_format, const std::optional( - base_d, refine_sq_type.value(), base_metric_type); + // create an sq. Baseline faiss::IndexScalarQuantizer — the fork + // variant's ctor is now behavior-identical (see fork + // IndexScalarQuantizer.cpp), and fork index_write.cpp recognises + // baseline IxSQ via an overload (see \u00a75). + auto sq_refine = + std::make_unique(base_d, refine_sq_type.value(), base_metric_type); + + // QT_4bit_uniform + L2 benefits from quantile-based range + // estimation. Previously applied inside the fork + // IndexScalarQuantizer ctor for SQ4U+L2; now applied explicitly + // at the call site so the fork ctor is behavior-identical to + // baseline and this call site can use either. 
+ if (refine_sq_type.value() == faiss::ScalarQuantizer::QT_4bit_uniform && base_metric_type == faiss::METRIC_L2) { + sq_refine->sq.rangestat = faiss::ScalarQuantizer::RS_quantiles; + sq_refine->sq.rangestat_arg = 0.01; + } auto refine_index = std::make_unique(local_index.get(), sq_refine.get()); diff --git a/src/index/refine/refine_utils.h b/src/index/refine/refine_utils.h index e11e425a3..c34e6e5e8 100644 --- a/src/index/refine/refine_utils.h +++ b/src/index/refine/refine_utils.h @@ -7,21 +7,25 @@ #include #include "faiss/Index.h" -#include "faiss/cppcontrib/knowhere/impl/ScalarQuantizer.h" +#include "faiss/impl/ScalarQuantizer.h" #include "knowhere/expected.h" #include "knowhere/operands.h" namespace knowhere { -expected +// Returns a baseline faiss::ScalarQuantizer::QuantizerType. The integer +// values for every qtype this function returns match the fork's enum, +// so static_cast at the boundary of a fork IndexScalarQuantizer ctor is +// lossless. The fork enum is retired at the knowhere layer; fork ctors +// are the only remaining consumers. 
+expected get_sq_quantizer_type(const std::string& sq_type); expected is_flat_refine(const std::optional& refine_type); bool -has_lossless_quant(const expected& quant_type, - DataFormatEnum dataFormat); +has_lossless_quant(const expected& quant_type, DataFormatEnum dataFormat); bool has_lossless_refine_index(const std::optional& refine, const std::optional& refine_type, diff --git a/src/simd/hook.cc b/src/simd/hook.cc index ab18fd8c6..43d4c69ca 100644 --- a/src/simd/hook.cc +++ b/src/simd/hook.cc @@ -11,7 +11,6 @@ #include "hook.h" -#include #include #include @@ -578,7 +577,6 @@ fvec_hook(std::string& simd_type) { static int init_hook_ = []() { std::string simd_type; fvec_hook(simd_type); - faiss::cppcontrib::knowhere::sq_hook(); return 0; }(); diff --git a/thirdparty/faiss/faiss/CMakeLists.txt b/thirdparty/faiss/faiss/CMakeLists.txt index 91af5c7f1..48ad5073c 100644 --- a/thirdparty/faiss/faiss/CMakeLists.txt +++ b/thirdparty/faiss/faiss/CMakeLists.txt @@ -41,11 +41,16 @@ set(FAISS_SIMD_SVE_SRC impl/pq_code_distance/pq_code_distance-sve.cpp utils/simd_impl/distances_arm_sve.cpp ) +set(FAISS_SIMD_RVV_SRC + impl/scalar_quantizer/sq-rvv.cpp +) # Select SIMD sources based on target architecture if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64|amd64|AMD64)") set(FAISS_SIMD_SRC ${FAISS_SIMD_AVX2_SRC} ${FAISS_SIMD_AVX512_SRC}) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(aarch64|arm64|ARM64)") set(FAISS_SIMD_SRC ${FAISS_SIMD_NEON_SRC} ${FAISS_SIMD_SVE_SRC}) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(riscv64|riscv)") + set(FAISS_SIMD_SRC ${FAISS_SIMD_RVV_SRC}) else() set(FAISS_SIMD_SRC "") endif() @@ -477,6 +482,14 @@ if(FAISS_OPT_LEVEL STREQUAL "dd") TARGET_DIRECTORY faiss PROPERTIES COMPILE_OPTIONS "-march=armv8.2-a+sve" ) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(riscv64|riscv)") + target_compile_definitions(faiss PRIVATE COMPILE_SIMD_RISCV_RVV) + if(FAISS_SIMD_RVV_SRC) + set_source_files_properties(${FAISS_SIMD_RVV_SRC} + TARGET_DIRECTORY faiss + PROPERTIES COMPILE_OPTIONS 
"-march=rv64gcv_zvfhmin;-mabi=lp64d" + ) + endif() endif() endif() endif() @@ -489,6 +502,19 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(aarch64|arm64|ARM64)") target_sources(faiss PRIVATE ${FAISS_SIMD_NEON_SRC}) endif() +# RVV is the baseline SIMD on rv64 builds compiled with rv64gcv. Compile RVV +# sources into the main faiss target, mirroring the ARM NEON story on aarch64. +if(CMAKE_SYSTEM_PROCESSOR MATCHES "(riscv64|riscv)") + target_compile_definitions(faiss PRIVATE COMPILE_SIMD_RISCV_RVV) + target_sources(faiss PRIVATE ${FAISS_SIMD_RVV_SRC}) + if(NOT WIN32 AND FAISS_SIMD_RVV_SRC) + set_source_files_properties(${FAISS_SIMD_RVV_SRC} + TARGET_DIRECTORY faiss + PROPERTIES COMPILE_OPTIONS "-march=rv64gcv_zvfhmin;-mabi=lp64d" + ) + endif() +endif() + if(FAISS_ENABLE_SVS) find_package(svs_runtime REQUIRED) diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/FaissHook.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/FaissHook.cpp deleted file mode 100644 index 065c54e05..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/FaissHook.cpp +++ /dev/null @@ -1,83 +0,0 @@ - -// -*- c++ -*- - -#include -#include - -#include -#include -#include -#include -#include -#include - - - -namespace faiss::cppcontrib::knowhere { - -sq_get_distance_computer_func_ptr sq_get_distance_computer = - sq_get_distance_computer_ref; -sq_sel_quantizer_func_ptr sq_sel_quantizer = sq_select_quantizer_ref; -sq_sel_inv_list_scanner_func_ptr sq_sel_inv_list_scanner = - sq_select_inverted_list_scanner_ref; - -// Note: The Hamming computer implementation is selected at compile time -// based on the instruction set in `hamdis-inl.h`, not by runtime hook. -sq_get_distance_computer_func_ptr sq_get_hamming_distance_computer = - sq_get_hamming_distance_computer_ref; - -// Note: The Jaccard distance computer uses `__builtin_popcount` for -// computation. This function is efficiently implemented by the -// compiler and automatically utilizes the best available instruction set. 
-// Therefore, there is no need to manually adjust or hook the Jaccard computer -// for different SIMD instruction sets. -sq_get_distance_computer_func_ptr sq_get_jaccard_distance_computer = - sq_get_jaccard_distance_computer_ref; - -void sq_hook() { - // SQ8 always hook best SIMD -#ifdef __x86_64__ - static std::mutex hook_mutex; - std::lock_guard lock(hook_mutex); - - if (use_avx512 && cpu_support_avx512()) { - /* for IVFSQ */ - sq_get_distance_computer = sq_get_distance_computer_avx512; - sq_sel_quantizer = sq_select_quantizer_avx512; - sq_sel_inv_list_scanner = sq_select_inverted_list_scanner_avx512; - } else if (use_avx2 && cpu_support_avx2()) { - /* for IVFSQ */ - sq_get_distance_computer = sq_get_distance_computer_avx; - sq_sel_quantizer = sq_select_quantizer_avx; - sq_sel_inv_list_scanner = sq_select_inverted_list_scanner_avx; - } else if (use_sse4_2 && cpu_support_sse4_2()) { - /* for IVFSQ */ - sq_get_distance_computer = sq_get_distance_computer_ref; - sq_sel_quantizer = sq_select_quantizer_ref; - sq_sel_inv_list_scanner = sq_select_inverted_list_scanner_ref; - } -#endif - -#if defined(__ARM_NEON) - static std::mutex hook_mutex; - std::lock_guard lock(hook_mutex); - - /* for IVFSQ */ - sq_get_distance_computer = sq_get_distance_computer_neon; - sq_sel_quantizer = sq_select_quantizer_neon; - sq_sel_inv_list_scanner = sq_select_inverted_list_scanner_neon; -#endif - -#if defined(__riscv_vector) - static std::mutex hook_mutex; - std::lock_guard lock(hook_mutex); - /* for IVFSQ */ - sq_get_distance_computer = sq_get_distance_computer_rvv; - sq_sel_quantizer = sq_select_quantizer_rvv; - sq_sel_inv_list_scanner = sq_select_inverted_list_scanner_rvv; -#endif -} - -} - - diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/FaissHook.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/FaissHook.h deleted file mode 100644 index 5e0d5b816..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/FaissHook.h +++ /dev/null @@ -1,48 +0,0 @@ - -// -*- c++ -*- - -#pragma 
once - -#include - -#include -#include -#include -#include "simd/hook.h" - -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -// todo aguzhva: replace FaissHook.h with simd/hook.h - -typedef ScalarQuantizer::SQDistanceComputer* (*sq_get_distance_computer_func_ptr)( - MetricType, - ScalarQuantizer::QuantizerType, - size_t, - const std::vector&); -typedef ScalarQuantizer::SQuantizer* (*sq_sel_quantizer_func_ptr)( - ScalarQuantizer::QuantizerType, - size_t, - const std::vector&); -typedef InvertedListScanner* (*sq_sel_inv_list_scanner_func_ptr)( - MetricType, - const ScalarQuantizer*, - const Index*, - size_t, - bool, - const IDSelector*, - bool); - -extern sq_get_distance_computer_func_ptr sq_get_distance_computer; -extern sq_get_distance_computer_func_ptr sq_get_hamming_distance_computer; -extern sq_get_distance_computer_func_ptr sq_get_jaccard_distance_computer; -extern sq_sel_quantizer_func_ptr sq_sel_quantizer; -extern sq_sel_inv_list_scanner_func_ptr sq_sel_inv_list_scanner; -void sq_hook(); - -} -} -} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IVFIteratorWorkspace.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/IVFIteratorWorkspace.cpp index 60a952c64..bf80155e1 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IVFIteratorWorkspace.cpp +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IVFIteratorWorkspace.cpp @@ -12,6 +12,7 @@ #include #include +#include #include namespace faiss::cppcontrib::knowhere { diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexBinaryScalarQuantizer.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexBinaryScalarQuantizer.cpp new file mode 100644 index 000000000..f82766f96 --- /dev/null +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexBinaryScalarQuantizer.cpp @@ -0,0 +1,177 @@ +// Copyright (C) 2019-2025 Zilliz. All rights reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. + +#include + +#include +#include +#include + +#include +#include +#include + +namespace faiss { +namespace cppcontrib { +namespace knowhere { + +namespace { + +// Adapter that binds one of baseline's HammingComputerN / JaccardComputerN +// size-specialized primitives into the FlatCodesDistanceComputer interface. +template +struct BinaryFlatCodesDC : faiss::FlatCodesDistanceComputer { + BinaryComputer binary_computer; + std::vector tmp; + + BinaryFlatCodesDC(const uint8_t* codes_in, size_t code_size_in) + : FlatCodesDistanceComputer(codes_in, code_size_in), + tmp(code_size_in) {} + + void set_query(const float* x) final { + // Legacy convention: each float is an integer in [0, 255]; cast + // to uint8 to recover the bit-packed query byte. Same pattern + // used by the fork's BinarySQDistanceComputerWrapper. 
+ for (size_t i = 0; i < code_size; ++i) { + tmp[i] = static_cast(x[i]); + } + binary_computer.set(tmp.data(), code_size); + } + + float distance_to_code(const uint8_t* code) final { + return binary_computer.compute(code); + } + + float symmetric_dis(idx_t i, idx_t j) final { + BinaryComputer temp; + temp.set(codes + i * code_size, code_size); + return temp.compute(codes + j * code_size); + } +}; + +faiss::FlatCodesDistanceComputer* +make_hamming_dc(const uint8_t* codes, size_t code_size) { + switch (code_size) { + case 4: + return new BinaryFlatCodesDC(codes, code_size); + case 8: + return new BinaryFlatCodesDC(codes, code_size); + case 16: + return new BinaryFlatCodesDC(codes, code_size); + case 20: + return new BinaryFlatCodesDC(codes, code_size); + case 32: + return new BinaryFlatCodesDC(codes, code_size); + case 64: + return new BinaryFlatCodesDC(codes, code_size); + default: + return new BinaryFlatCodesDC( + codes, code_size); + } +} + +faiss::FlatCodesDistanceComputer* +make_jaccard_dc(const uint8_t* codes, size_t code_size) { + switch (code_size) { + case 8: + return new BinaryFlatCodesDC(codes, code_size); + case 16: + return new BinaryFlatCodesDC(codes, code_size); + case 32: + return new BinaryFlatCodesDC(codes, code_size); + case 64: + return new BinaryFlatCodesDC(codes, code_size); + case 128: + return new BinaryFlatCodesDC( + codes, code_size); + case 256: + return new BinaryFlatCodesDC( + codes, code_size); + case 512: + return new BinaryFlatCodesDC( + codes, code_size); + default: + return new BinaryFlatCodesDC( + codes, code_size); + } +} + +} // namespace + +IndexBinaryScalarQuantizer::IndexBinaryScalarQuantizer() : IndexFlatCodes() {} + +IndexBinaryScalarQuantizer::IndexBinaryScalarQuantizer(int d, MetricType metric) + : IndexFlatCodes(static_cast((d + 7) / 8), d, metric) { + FAISS_THROW_IF_NOT_MSG( + metric == METRIC_Hamming || metric == METRIC_Jaccard || + metric == METRIC_Substructure || + metric == METRIC_Superstructure, + 
"IndexBinaryScalarQuantizer: unsupported metric (expected Hamming, " + "Jaccard, Substructure, or Superstructure)"); + is_trained = true; +} + +void IndexBinaryScalarQuantizer::sa_encode( + idx_t n, const float* x, uint8_t* bytes) const { + // Follows the legacy Quantizer1bitDirect convention byte-for-byte: + // each vector has d floats, but only the first code_size are read; + // each is cast to uint8 to form the code byte. + const size_t cs = code_size; + for (idx_t vi = 0; vi < n; ++vi) { + const float* src = x + vi * static_cast(d); + uint8_t* dst = bytes + vi * cs; + for (size_t i = 0; i < cs; ++i) { + dst[i] = static_cast(src[i]); + } + } +} + +void IndexBinaryScalarQuantizer::sa_decode( + idx_t n, const uint8_t* bytes, float* x) const { + // Mirror of sa_encode. Output stride is d (matching baseline + // ScalarQuantizer::decode) but only the first code_size lanes of + // each d-float slot are written. Trailing lanes are left untouched + // by design: callers that only need the meaningful bytes (see + // faiss_hnsw.cc GetVectorByIds, bin1 branch) allocate exactly + // code_size floats per vector and rely on the decoder not writing + // past that. Zero-filling the tail would overrun those buffers. 
+ const size_t cs = code_size; + for (idx_t vi = 0; vi < n; ++vi) { + float* dst = x + vi * static_cast(d); + const uint8_t* src = bytes + vi * cs; + for (size_t i = 0; i < cs; ++i) { + dst[i] = static_cast(src[i]); + } + } +} + +faiss::FlatCodesDistanceComputer* +IndexBinaryScalarQuantizer::get_FlatCodesDistanceComputer() const { + switch (metric_type) { + case METRIC_Hamming: + case METRIC_Substructure: + case METRIC_Superstructure: + return make_hamming_dc(codes.data(), code_size); + case METRIC_Jaccard: + return make_jaccard_dc(codes.data(), code_size); + default: + FAISS_THROW_MSG( + "IndexBinaryScalarQuantizer: unsupported metric in " + "get_FlatCodesDistanceComputer"); + } +} + +} // namespace knowhere +} // namespace cppcontrib +} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexBinaryScalarQuantizer.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexBinaryScalarQuantizer.h new file mode 100644 index 000000000..64103b9c1 --- /dev/null +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexBinaryScalarQuantizer.h @@ -0,0 +1,63 @@ +// Copyright (C) 2019-2025 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +namespace faiss { +namespace cppcontrib { +namespace knowhere { + +/** + * Storage class for 1-bit-per-dimension binary vectors, with Hamming / + * Jaccard / Substructure / Superstructure semantics. 
Acts as the storage + * under the fork's IndexHNSW family — a direct replacement for the legacy + * path that routed binary data through faiss::ScalarQuantizer with + * qtype == QT_1bit_direct. + * + * Input/output convention for sa_encode / sa_decode and set_query: the + * `float*` buffer carries per-byte integer values (0..255) that together + * represent the bit-packed binary vector. The first code_size entries of + * each d-float "vector" are meaningful; any remaining lanes are ignored + * on both encode and decode — decode does not touch them, so callers may + * allocate exactly code_size floats per vector. + */ +struct IndexBinaryScalarQuantizer : faiss::IndexFlatCodes { + IndexBinaryScalarQuantizer(); + + /// d is the number of binary dimensions. code_size is (d + 7) / 8. + /// metric must be one of METRIC_Hamming, METRIC_Jaccard, + /// METRIC_Substructure, METRIC_Superstructure. The index is + /// considered trained immediately after construction. + IndexBinaryScalarQuantizer(int d, MetricType metric); + + void + sa_encode(idx_t n, const float* x, uint8_t* bytes) const override; + + void + sa_decode(idx_t n, const uint8_t* bytes, float* x) const override; + + /// Returns a size-specialized Hamming or Jaccard computer wired into + /// the FlatCodesDistanceComputer interface. Uses baseline FAISS + /// primitives from faiss/utils/hamming.h. 
+ faiss::FlatCodesDistanceComputer* + get_FlatCodesDistanceComputer() const override; +}; + +} // namespace knowhere +} // namespace cppcontrib +} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexCosine.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexCosine.cpp index 22a4a6f23..fa16ecfd5 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexCosine.cpp +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexCosine.cpp @@ -18,7 +18,7 @@ #include -#include +#include "simd/hook.h" #include #include @@ -335,11 +335,12 @@ FlatCodesDistanceComputer* IndexFlatCosine::get_FlatCodesDistanceComputer() cons IndexScalarQuantizerCosine::IndexScalarQuantizerCosine( int d, - ScalarQuantizer::QuantizerType qtype) - : IndexScalarQuantizer(d, qtype, MetricType::METRIC_INNER_PRODUCT) { + ::faiss::ScalarQuantizer::QuantizerType qtype) + : ::faiss::IndexScalarQuantizer(d, qtype, MetricType::METRIC_INNER_PRODUCT) { } -IndexScalarQuantizerCosine::IndexScalarQuantizerCosine() : IndexScalarQuantizer() { +IndexScalarQuantizerCosine::IndexScalarQuantizerCosine() + : ::faiss::IndexScalarQuantizer() { metric_type = MetricType::METRIC_INNER_PRODUCT; } @@ -349,12 +350,12 @@ void IndexScalarQuantizerCosine::add(idx_t n, const float* x) { return; } - IndexScalarQuantizer::add(n, x); + ::faiss::IndexScalarQuantizer::add(n, x); inverse_norms_storage.add(x, n, d); } void IndexScalarQuantizerCosine::reset() { - IndexScalarQuantizer::reset(); + ::faiss::IndexScalarQuantizer::reset(); inverse_norms_storage.reset(); } @@ -366,7 +367,8 @@ DistanceComputer* IndexScalarQuantizerCosine::get_distance_computer() const { return new WithCosineNormDistanceComputer( this->get_inverse_l2_norms(), this->d, - std::unique_ptr(IndexScalarQuantizer::get_FlatCodesDistanceComputer()) + std::unique_ptr( + ::faiss::IndexScalarQuantizer::get_FlatCodesDistanceComputer()) ); } @@ -483,7 +485,7 @@ IndexHNSWSQCosine::IndexHNSWSQCosine() { IndexHNSWSQCosine::IndexHNSWSQCosine( int d, - 
ScalarQuantizer::QuantizerType qtype, + ::faiss::ScalarQuantizer::QuantizerType qtype, int M) : IndexHNSW(new IndexScalarQuantizerCosine(d, qtype), M) { diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexCosine.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexCosine.h index 4075fa675..56d73b7c1 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexCosine.h +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexCosine.h @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include @@ -128,12 +128,13 @@ struct IndexFlatCosine : IndexFlat, HasInverseL2Norms { }; // -struct IndexScalarQuantizerCosine : IndexScalarQuantizer, HasInverseL2Norms { +struct IndexScalarQuantizerCosine : ::faiss::IndexScalarQuantizer, + HasInverseL2Norms { L2NormsStorage inverse_norms_storage; IndexScalarQuantizerCosine( int d, - ScalarQuantizer::QuantizerType qtype); + ::faiss::ScalarQuantizer::QuantizerType qtype); IndexScalarQuantizerCosine(); @@ -195,7 +196,7 @@ struct IndexHNSWSQCosine : IndexHNSW, HasInverseL2Norms { IndexHNSWSQCosine(); IndexHNSWSQCosine( int d, - ScalarQuantizer::QuantizerType qtype, + ::faiss::ScalarQuantizer::QuantizerType qtype, int M); const float* get_inverse_l2_norms() const override; diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexFlat.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexFlat.cpp index 3f09ce1ad..f032088b3 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexFlat.cpp +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexFlat.cpp @@ -11,7 +11,7 @@ #include -#include +#include "simd/hook.h" #include #include diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexHNSW.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexHNSW.cpp index 6f69022ff..51835d798 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexHNSW.cpp +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexHNSW.cpp @@ -685,10 +685,10 @@ void IndexHNSWPQ::train(idx_t n, const float* x) { 
IndexHNSWSQ::IndexHNSWSQ( int d, - ScalarQuantizer::QuantizerType qtype, + ::faiss::ScalarQuantizer::QuantizerType qtype, int M, MetricType metric) - : IndexHNSW(new IndexScalarQuantizer(d, qtype, metric), M) { + : IndexHNSW(new ::faiss::IndexScalarQuantizer(d, qtype, metric), M) { is_trained = this->storage->is_trained; own_fields = true; } diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexHNSW.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexHNSW.h index 7a2566748..010801edd 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexHNSW.h +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexHNSW.h @@ -13,7 +13,7 @@ #include #include -#include +#include #include #include @@ -145,7 +145,7 @@ struct IndexHNSWSQ : IndexHNSW { IndexHNSWSQ(); IndexHNSWSQ( int d, - ScalarQuantizer::QuantizerType qtype, + ::faiss::ScalarQuantizer::QuantizerType qtype, int M, MetricType metric = METRIC_L2); }; diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexHNSWBinary.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexHNSWBinary.cpp new file mode 100644 index 000000000..7de16b739 --- /dev/null +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexHNSWBinary.cpp @@ -0,0 +1,33 @@ +// Copyright (C) 2019-2025 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. 
+ +#include + +#include + +namespace faiss { +namespace cppcontrib { +namespace knowhere { + +IndexHNSWBinary::IndexHNSWBinary() = default; + +IndexHNSWBinary::IndexHNSWBinary(int d, int M, MetricType metric) + : IndexHNSW(new IndexBinaryScalarQuantizer(d, metric), M) { + is_trained = this->storage->is_trained; + own_fields = true; +} + +} // namespace knowhere +} // namespace cppcontrib +} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexHNSWBinary.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexHNSWBinary.h new file mode 100644 index 000000000..f3b614609 --- /dev/null +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexHNSWBinary.h @@ -0,0 +1,47 @@ +// Copyright (C) 2019-2025 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +namespace faiss { +namespace cppcontrib { +namespace knowhere { + +/** + * HNSW index with a IndexBinaryScalarQuantizer storage — a replacement for + * the legacy IndexHNSWSQ(QT_1bit_direct, metric) path. Inherits from + * IndexHNSW directly (not IndexHNSWSQ) so ctor delegation goes straight + * to the Index*-storage form. + * + * On disk, instances serialize with the same fourcc ("IHNs") and byte + * layout as IndexHNSWSQ with an inner QT_1bit_direct ScalarQuantizer. 
+ * Readers materialize either IndexHNSWSQ (for non-binary SQ qtypes) or + * IndexHNSWBinary (for QT_1bit_direct) depending on the inner storage's + * qtype — see fork's impl/index_read.cpp for the dispatch. + */ +struct IndexHNSWBinary : IndexHNSW { + IndexHNSWBinary(); + + /// d is the number of binary dimensions (not bytes). metric must be + /// supported by IndexBinaryScalarQuantizer: METRIC_Hamming, METRIC_Jaccard, + /// METRIC_Substructure, METRIC_Superstructure. + IndexHNSWBinary(int d, int M, MetricType metric = METRIC_Hamming); +}; + +} // namespace knowhere +} // namespace cppcontrib +} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFFlat.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFFlat.cpp index 37ecfa930..8e2a414d3 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFFlat.cpp +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFFlat.cpp @@ -21,7 +21,6 @@ #include -#include #include #include @@ -32,6 +31,8 @@ #include +#include "simd/hook.h" + namespace faiss::cppcontrib::knowhere { diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFPQ.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFPQ.cpp index 522639050..0c34c5944 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFPQ.cpp +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFPQ.cpp @@ -22,8 +22,8 @@ #include #include -#include #include +#include "simd/hook.h" #include diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFScalarQuantizerCC.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFScalarQuantizerCC.cpp index 839063b52..b771c7bb9 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFScalarQuantizerCC.cpp +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFScalarQuantizerCC.cpp @@ -14,7 +14,7 @@ IndexIVFScalarQuantizerCC::IndexIVFScalarQuantizerCC( size_t d, size_t nlist, size_t ssize, - ScalarQuantizer::QuantizerType qtype, + ::faiss::ScalarQuantizer::QuantizerType 
qtype, MetricType metric, bool by_residual, std::optional raw_data_prefix_path) @@ -52,7 +52,7 @@ void IndexIVFScalarQuantizerCC::add_core( FAISS_THROW_IF_NOT(is_trained); size_t nadd = 0; - std::unique_ptr squant(sq.select_quantizer()); + std::unique_ptr<::faiss::ScalarQuantizer::SQuantizer> squant(sq.select_quantizer()); DirectMapAdd dm_add(direct_map, n, xids); @@ -123,7 +123,7 @@ IndexIVFScalarQuantizerCCCosine::IndexIVFScalarQuantizerCCCosine( size_t d, size_t nlist, size_t ssize, - ScalarQuantizer::QuantizerType qtype, + ::faiss::ScalarQuantizer::QuantizerType qtype, MetricType metric, bool by_residual, std::optional raw_data_prefix_path) @@ -153,7 +153,7 @@ void IndexIVFScalarQuantizerCCCosine::add_core( const float* base_x = x_normalized.get(); size_t nadd = 0; - std::unique_ptr squant(sq.select_quantizer()); + std::unique_ptr<::faiss::ScalarQuantizer::SQuantizer> squant(sq.select_quantizer()); DirectMapAdd dm_add(direct_map, n, xids); diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFScalarQuantizerCC.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFScalarQuantizerCC.h index b153ec1a4..9317d0942 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFScalarQuantizerCC.h +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexIVFScalarQuantizerCC.h @@ -25,7 +25,7 @@ struct IndexIVFScalarQuantizerCC : IndexIVFScalarQuantizer { size_t d, size_t nlist, size_t ssize, - ScalarQuantizer::QuantizerType qtype, + ::faiss::ScalarQuantizer::QuantizerType qtype, MetricType metric = METRIC_L2, bool by_residual = false, std::optional raw_data_prefix_path = std::nullopt); @@ -57,7 +57,7 @@ struct IndexIVFScalarQuantizerCCCosine : IndexIVFScalarQuantizerCC, HasInverseL2 size_t d, size_t nlist, size_t ssize, - ScalarQuantizer::QuantizerType qtype, + ::faiss::ScalarQuantizer::QuantizerType qtype, MetricType metric = METRIC_L2, bool by_residual = false, std::optional raw_data_prefix_path = std::nullopt); diff --git 
a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexSQ4Uniform.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexSQ4Uniform.cpp index ec8cb0145..7c723b9ee 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexSQ4Uniform.cpp +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexSQ4Uniform.cpp @@ -21,7 +21,7 @@ #include #include -#include +#include "simd/hook.h" #include #include #include @@ -159,20 +159,20 @@ float WithSQ4UniformNormIPDistanceComputer::symmetric_dis(idx_t i, idx_t j) { IndexScalarQuantizer4bitUniformCosine::IndexScalarQuantizer4bitUniformCosine( int d) - : IndexScalarQuantizer( + : ::faiss::IndexScalarQuantizer( d, - ScalarQuantizer::QT_4bit_uniform, + ::faiss::ScalarQuantizer::QT_4bit_uniform, METRIC_INNER_PRODUCT) { - sq.rangestat = ScalarQuantizer::RS_quantiles; + sq.rangestat = ::faiss::ScalarQuantizer::RS_quantiles; sq.rangestat_arg = 0.01; } IndexScalarQuantizer4bitUniformCosine::IndexScalarQuantizer4bitUniformCosine() - : IndexScalarQuantizer() { + : ::faiss::IndexScalarQuantizer() { metric_type = METRIC_INNER_PRODUCT; - sq.rangestat = ScalarQuantizer::RS_quantiles; + sq.rangestat = ::faiss::ScalarQuantizer::RS_quantiles; sq.rangestat_arg = 0.01; } @@ -194,7 +194,7 @@ void IndexScalarQuantizer4bitUniformCosine::add(idx_t n, const float* x) { auto normalized_data = ::knowhere::CopyAndNormalizeVecs(x, n, d); // Add normalized data - IndexScalarQuantizer::add(n, normalized_data.get()); + ::faiss::IndexScalarQuantizer::add(n, normalized_data.get()); // Store inverse L2 norms from ORIGINAL vectors (not normalized) // This is needed for refine to work correctly with COSINE metric @@ -203,10 +203,19 @@ void IndexScalarQuantizer4bitUniformCosine::add(idx_t n, const float* x) { DistanceComputer* IndexScalarQuantizer4bitUniformCosine::get_distance_computer() const { - std::unique_ptr base_dc( - IndexScalarQuantizer::get_distance_computer()); - - return new SQ4UniformCosineDistanceComputer(d, std::move(base_dc)); + // The DC wrapper does 
`cosine = 1 - 0.5 * L2^2`, so the inner DC + // must compute L2^2 regardless of this index's metric_type (which is + // METRIC_INNER_PRODUCT because the index is semantically IP-based). + // Ask the SQ directly for an L2 DC rather than going through the + // baseline helper, which would pick a DC from `metric_type` and hand + // us an IP-computing DC. Pre-migration this was hidden by the fork's + // DistanceComputerSQ4UByte which always returned L2 regardless of + // Similarity. + auto* base_dc = sq.get_distance_computer(METRIC_L2); + base_dc->code_size = sq.code_size; + base_dc->codes = codes.data(); + return new SQ4UniformCosineDistanceComputer( + d, std::unique_ptr(base_dc)); } const float* IndexScalarQuantizer4bitUniformCosine::get_inverse_l2_norms() @@ -215,7 +224,7 @@ const float* IndexScalarQuantizer4bitUniformCosine::get_inverse_l2_norms() } void IndexScalarQuantizer4bitUniformCosine::reset() { - IndexScalarQuantizer::reset(); + ::faiss::IndexScalarQuantizer::reset(); inverse_norms_storage.reset(); } @@ -224,20 +233,20 @@ void IndexScalarQuantizer4bitUniformCosine::reset() { ////////////////////////////////////////////////////////////////////////////////// IndexScalarQuantizer4bitUniformIP::IndexScalarQuantizer4bitUniformIP(int d) - : IndexScalarQuantizer( + : ::faiss::IndexScalarQuantizer( d, - ScalarQuantizer::QT_4bit_uniform, + ::faiss::ScalarQuantizer::QT_4bit_uniform, METRIC_INNER_PRODUCT) { } IndexScalarQuantizer4bitUniformIP::IndexScalarQuantizer4bitUniformIP() - : IndexScalarQuantizer() { + : ::faiss::IndexScalarQuantizer() { metric_type = METRIC_INNER_PRODUCT; } void IndexScalarQuantizer4bitUniformIP::add(idx_t n, const float* x) { FAISS_THROW_IF_NOT(is_trained); - IndexScalarQuantizer::add(n, x); + ::faiss::IndexScalarQuantizer::add(n, x); // Compute and store norms squared for IP distance computation for (idx_t i = 0; i < n; i++) { @@ -248,17 +257,22 @@ void IndexScalarQuantizer4bitUniformIP::add(idx_t n, const float* x) { } void 
IndexScalarQuantizer4bitUniformIP::reset() { - IndexScalarQuantizer::reset(); + ::faiss::IndexScalarQuantizer::reset(); l2_norms_sqr.clear(); } DistanceComputer* IndexScalarQuantizer4bitUniformIP::get_distance_computer() const { - std::unique_ptr base_dc( - IndexScalarQuantizer::get_distance_computer()); - + // See IndexScalarQuantizer4bitUniformCosine::get_distance_computer + // for why we force METRIC_L2 here. The wrapper DC does + // `IP = 0.5 * (||q||^2 + ||b||^2 - L2^2)`, which only holds if the + // inner DC actually returns L2^2. + auto* base_dc = sq.get_distance_computer(METRIC_L2); + base_dc->code_size = sq.code_size; + base_dc->codes = codes.data(); return new WithSQ4UniformNormIPDistanceComputer( - get_l2_norms_sqr(), d, std::move(base_dc)); + get_l2_norms_sqr(), d, + std::unique_ptr(base_dc)); } const float* IndexScalarQuantizer4bitUniformIP::get_l2_norms_sqr() const { @@ -274,11 +288,11 @@ IndexHNSWSQ4UniformCosine::IndexHNSWSQ4UniformCosine() : IndexHNSW() { IndexHNSWSQ4UniformCosine::IndexHNSWSQ4UniformCosine( int d, - ScalarQuantizer::QuantizerType qtype, + ::faiss::ScalarQuantizer::QuantizerType qtype, int M) : IndexHNSW(new IndexScalarQuantizer4bitUniformCosine(d), M) { FAISS_THROW_IF_NOT_MSG( - qtype == ScalarQuantizer::QT_4bit_uniform, + qtype == ::faiss::ScalarQuantizer::QT_4bit_uniform, "IndexHNSWSQ4UniformCosine only supports QT_4bit_uniform"); is_trained = this->storage->is_trained; @@ -299,11 +313,11 @@ IndexHNSWSQ4UniformIP::IndexHNSWSQ4UniformIP() : IndexHNSW() { IndexHNSWSQ4UniformIP::IndexHNSWSQ4UniformIP( int d, - ScalarQuantizer::QuantizerType qtype, + ::faiss::ScalarQuantizer::QuantizerType qtype, int M) : IndexHNSW(new IndexScalarQuantizer4bitUniformIP(d), M) { FAISS_THROW_IF_NOT_MSG( - qtype == ScalarQuantizer::QT_4bit_uniform, + qtype == ::faiss::ScalarQuantizer::QT_4bit_uniform, "IndexHNSWSQ4UniformIP only supports QT_4bit_uniform"); is_trained = this->storage->is_trained; diff --git 
a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexSQ4Uniform.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexSQ4Uniform.h index def5e0466..c94aae829 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexSQ4Uniform.h +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexSQ4Uniform.h @@ -23,7 +23,7 @@ #include #include -#include +#include #include namespace faiss { @@ -128,7 +128,7 @@ struct WithSQ4UniformNormIPDistanceComputer : DistanceComputer { * modifying caller's data. Query normalization handled by knowhere layer. * Implements HasInverseL2Norms. */ -struct IndexScalarQuantizer4bitUniformCosine : IndexScalarQuantizer, +struct IndexScalarQuantizer4bitUniformCosine : ::faiss::IndexScalarQuantizer, HasInverseL2Norms { L2NormsStorage inverse_norms_storage; @@ -155,7 +155,7 @@ struct IndexScalarQuantizer4bitUniformCosine : IndexScalarQuantizer, * Scalar Quantizer specialized for 4-bit uniform quantization with IP metric. * Stores L2 norms squared of vectors to convert L2^2 distances to IP. 
*/ -struct IndexScalarQuantizer4bitUniformIP : IndexScalarQuantizer { +struct IndexScalarQuantizer4bitUniformIP : ::faiss::IndexScalarQuantizer { /// Storage for L2 norms squared (||x||^2) std::vector l2_norms_sqr; @@ -184,7 +184,7 @@ struct IndexHNSWSQ4UniformCosine : IndexHNSW, HasInverseL2Norms { IndexHNSWSQ4UniformCosine( int d, - ScalarQuantizer::QuantizerType qtype, + ::faiss::ScalarQuantizer::QuantizerType qtype, int M); const float* get_inverse_l2_norms() const override; @@ -193,7 +193,10 @@ struct IndexHNSWSQ4UniformCosine : IndexHNSW, HasInverseL2Norms { struct IndexHNSWSQ4UniformIP : IndexHNSW { IndexHNSWSQ4UniformIP(); - IndexHNSWSQ4UniformIP(int d, ScalarQuantizer::QuantizerType qtype, int M); + IndexHNSWSQ4UniformIP( + int d, + ::faiss::ScalarQuantizer::QuantizerType qtype, + int M); }; } diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexScaNN.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexScaNN.cpp index 7b53e575c..15deb73a3 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexScaNN.cpp +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexScaNN.cpp @@ -7,7 +7,6 @@ #include #include -#include #include #include #include diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexScalarQuantizer.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexScalarQuantizer.cpp index 281b464db..8c4a6abaf 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexScalarQuantizer.cpp +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexScalarQuantizer.cpp @@ -11,116 +11,22 @@ #include #include +#include #include +#include #include #include #include -#include +#include +#include #include namespace faiss::cppcontrib::knowhere { -/******************************************************************* - * IndexScalarQuantizer implementation - ********************************************************************/ - -IndexScalarQuantizer::IndexScalarQuantizer( - int d, - ScalarQuantizer::QuantizerType qtype, - MetricType metric) - : 
IndexFlatCodes(0, d, metric), sq(d, qtype) { - if (qtype == ScalarQuantizer::QT_4bit_uniform && metric == METRIC_L2) { - sq.rangestat = ScalarQuantizer::RS_quantiles; - sq.rangestat_arg = 0.01; - } - is_trained = qtype == ScalarQuantizer::QT_fp16 || - qtype == ScalarQuantizer::QT_8bit_direct || - qtype == ScalarQuantizer::QT_bf16 || - qtype == ScalarQuantizer::QT_8bit_direct_signed || - qtype == ScalarQuantizer::QT_1bit_direct; - code_size = sq.code_size; -} - -IndexScalarQuantizer::IndexScalarQuantizer() - : IndexScalarQuantizer(0, ScalarQuantizer::QT_8bit) {} - -void IndexScalarQuantizer::train(idx_t n, const float* x) { - sq.train(n, x); - is_trained = true; -} - -void IndexScalarQuantizer::search( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels, - const SearchParameters* params) const { - const IDSelector* sel = params ? params->sel : nullptr; - - FAISS_THROW_IF_NOT(k > 0); - FAISS_THROW_IF_NOT(is_trained); - FAISS_THROW_IF_NOT( - metric_type == METRIC_L2 || metric_type == METRIC_INNER_PRODUCT); - -#pragma omp parallel - { - std::unique_ptr scanner( - sq.select_InvertedListScanner(metric_type, nullptr, true, sel)); - - scanner->list_no = 0; // directly the list number - -#pragma omp for - for (idx_t i = 0; i < n; i++) { - float* D = distances + k * i; - idx_t* I = labels + k * i; - // re-order heap - if (metric_type == METRIC_L2) { - maxheap_heapify(k, D, I); - } else { - minheap_heapify(k, D, I); - } - scanner->set_query(x + i * d); - size_t scan_cnt = 0; - scanner->scan_codes(ntotal, codes.data(), nullptr, nullptr, D, I, k, scan_cnt); - - // re-order heap - if (metric_type == METRIC_L2) { - maxheap_reorder(k, D, I); - } else { - minheap_reorder(k, D, I); - } - } - } -} - -FlatCodesDistanceComputer* IndexScalarQuantizer::get_FlatCodesDistanceComputer() - const { - ScalarQuantizer::SQDistanceComputer* dc = - sq.get_distance_computer(metric_type); - dc->code_size = sq.code_size; - dc->codes = codes.data(); - return dc; -} - -/* Codec 
interface */ - -void IndexScalarQuantizer::sa_encode(idx_t n, const float* x, uint8_t* bytes) - const { - FAISS_THROW_IF_NOT(is_trained); - sq.compute_codes(x, bytes, n); -} - -void IndexScalarQuantizer::sa_decode(idx_t n, const uint8_t* bytes, float* x) - const { - FAISS_THROW_IF_NOT(is_trained); - sq.decode(bytes, x, n); -} - /******************************************************************* * IndexIVFScalarQuantizer implementation ********************************************************************/ @@ -129,7 +35,7 @@ IndexIVFScalarQuantizer::IndexIVFScalarQuantizer( Index* quantizer, size_t d, size_t nlist, - ScalarQuantizer::QuantizerType qtype, + ::faiss::ScalarQuantizer::QuantizerType qtype, MetricType metric, bool by_residual) : IndexIVF(quantizer, d, nlist, 0, metric), sq(d, qtype) { @@ -161,7 +67,7 @@ void IndexIVFScalarQuantizer::encode_vectors( const idx_t* list_nos, uint8_t* codes, bool include_listnos) const { - std::unique_ptr squant(sq.select_quantizer()); + std::unique_ptr<::faiss::ScalarQuantizer::SQuantizer> squant(sq.select_quantizer()); size_t coarse_size = include_listnos ? 
coarse_code_size() : 0; memset(codes, 0, (code_size + coarse_size) * n); @@ -190,7 +96,7 @@ void IndexIVFScalarQuantizer::encode_vectors( void IndexIVFScalarQuantizer::sa_decode(idx_t n, const uint8_t* codes, float* x) const { - std::unique_ptr squant(sq.select_quantizer()); + std::unique_ptr<::faiss::ScalarQuantizer::SQuantizer> squant(sq.select_quantizer()); size_t coarse_size = coarse_code_size(); #pragma omp parallel if (n > 1000) @@ -222,7 +128,7 @@ void IndexIVFScalarQuantizer::add_core( void* inverted_list_context) { FAISS_THROW_IF_NOT(is_trained); - std::unique_ptr squant(sq.select_quantizer()); + std::unique_ptr<::faiss::ScalarQuantizer::SQuantizer> squant(sq.select_quantizer()); DirectMapAdd dm_add(direct_map, n, xids); @@ -262,12 +168,253 @@ void IndexIVFScalarQuantizer::add_core( ntotal += n; } +namespace { + +// Adapter scanners that implement the fork InvertedListScanner interface +// but delegate distance computation to a baseline SQDistanceComputer. +// Two variants are needed because the IP / L2 paths differ in how the +// coarse-centroid residual is folded into the distance: +// IP: dis = coarse_dis + dc.query_to_code(code) +// L2: the query is shifted into the centroid frame in set_list(), and +// the DC already produces the final L2 distance on every code. +// +// scan_cnt is a fork-side out-param that fork's own SQ scanners never +// increment (only IVFFlat/FastScan do), so we match that behavior and +// leave it untouched. 
+ +class BaselineIVFSQScannerIP : public InvertedListScanner { + public: + BaselineIVFSQScannerIP( + std::unique_ptr<::faiss::ScalarQuantizer::SQDistanceComputer> dc, + size_t code_size_in, + bool store_pairs_in, + const IDSelector* sel_in, + bool by_residual_in) + : dc_(std::move(dc)), by_residual_(by_residual_in) { + store_pairs = store_pairs_in; + sel = sel_in; + code_size = code_size_in; + keep_max = true; + } + + void set_query(const float* query) override { + dc_->set_query(query); + } + + void set_list(idx_t list_no_in, float coarse_dis) override { + this->list_no = list_no_in; + accu0_ = by_residual_ ? coarse_dis : 0.0f; + } + + float distance_to_code(const uint8_t* code) const override { + return accu0_ + dc_->query_to_code(code); + } + + size_t scan_codes( + size_t list_size, + const uint8_t* codes, + const float* /*code_norms*/, + const idx_t* ids, + float* simi, + idx_t* idxi, + size_t k, + size_t& /*scan_cnt*/) const override { + size_t nup = 0; + for (size_t j = 0; j < list_size; j++) { + if (!selector_accepts(j, ids)) { + continue; + } + float dis = accu0_ + dc_->query_to_code(codes + j * code_size); + if (dis > simi[0]) { + int64_t id = store_pairs ? 
lo_build(list_no, j) : ids[j]; + minheap_replace_top(k, simi, idxi, dis, id); + nup++; + } + } + return nup; + } + + void scan_codes_and_return( + size_t list_size, + const uint8_t* codes, + const float* /*code_norms*/, + const idx_t* ids, + std::vector<::knowhere::DistId>& out) const override { + for (size_t j = 0; j < list_size; j++) { + if (!selector_accepts(j, ids)) { + continue; + } + float dis = accu0_ + dc_->query_to_code(codes + j * code_size); + out.emplace_back(ids[j], dis); + } + } + + void scan_codes_range( + size_t list_size, + const uint8_t* codes, + const float* /*code_norms*/, + const idx_t* ids, + float radius, + RangeQueryResult& res) const override { + for (size_t j = 0; j < list_size; j++) { + if (!selector_accepts(j, ids)) { + continue; + } + float dis = accu0_ + dc_->query_to_code(codes + j * code_size); + if (dis > radius) { + int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; + res.add(dis, id); + } + } + } + + private: + bool selector_accepts(size_t j, const idx_t* ids) const { + if (!sel) { + return true; + } + return sel->is_member(store_pairs ? 
static_cast(j) : ids[j]); + } + + std::unique_ptr<::faiss::ScalarQuantizer::SQDistanceComputer> dc_; + bool by_residual_; + float accu0_ = 0.0f; +}; + +class BaselineIVFSQScannerL2 : public InvertedListScanner { + public: + BaselineIVFSQScannerL2( + std::unique_ptr<::faiss::ScalarQuantizer::SQDistanceComputer> dc, + int d_in, + size_t code_size_in, + const Index* quantizer_in, + bool store_pairs_in, + const IDSelector* sel_in, + bool by_residual_in) + : dc_(std::move(dc)), + by_residual_(by_residual_in), + quantizer_(quantizer_in), + tmp_(d_in) { + store_pairs = store_pairs_in; + sel = sel_in; + code_size = code_size_in; + keep_max = false; + } + + void set_query(const float* query) override { + x_ = query; + if (!by_residual_) { + dc_->set_query(query); + } + } + + void set_list(idx_t list_no_in, float /*coarse_dis*/) override { + this->list_no = list_no_in; + if (by_residual_) { + quantizer_->compute_residual(x_, tmp_.data(), list_no_in); + dc_->set_query(tmp_.data()); + } + } + + float distance_to_code(const uint8_t* code) const override { + return dc_->query_to_code(code); + } + + size_t scan_codes( + size_t list_size, + const uint8_t* codes, + const float* /*code_norms*/, + const idx_t* ids, + float* simi, + idx_t* idxi, + size_t k, + size_t& /*scan_cnt*/) const override { + size_t nup = 0; + for (size_t j = 0; j < list_size; j++) { + if (!selector_accepts(j, ids)) { + continue; + } + float dis = dc_->query_to_code(codes + j * code_size); + if (dis < simi[0]) { + int64_t id = store_pairs ? 
lo_build(list_no, j) : ids[j]; + maxheap_replace_top(k, simi, idxi, dis, id); + nup++; + } + } + return nup; + } + + void scan_codes_and_return( + size_t list_size, + const uint8_t* codes, + const float* /*code_norms*/, + const idx_t* ids, + std::vector<::knowhere::DistId>& out) const override { + for (size_t j = 0; j < list_size; j++) { + if (!selector_accepts(j, ids)) { + continue; + } + float dis = dc_->query_to_code(codes + j * code_size); + out.emplace_back(ids[j], dis); + } + } + + void scan_codes_range( + size_t list_size, + const uint8_t* codes, + const float* /*code_norms*/, + const idx_t* ids, + float radius, + RangeQueryResult& res) const override { + for (size_t j = 0; j < list_size; j++) { + if (!selector_accepts(j, ids)) { + continue; + } + float dis = dc_->query_to_code(codes + j * code_size); + if (dis < radius) { + int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; + res.add(dis, id); + } + } + } + + private: + bool selector_accepts(size_t j, const idx_t* ids) const { + if (!sel) { + return true; + } + return sel->is_member(store_pairs ? 
static_cast(j) : ids[j]); + } + + std::unique_ptr<::faiss::ScalarQuantizer::SQDistanceComputer> dc_; + bool by_residual_; + const Index* quantizer_; + const float* x_ = nullptr; + std::vector tmp_; +}; + +} // namespace + InvertedListScanner* IndexIVFScalarQuantizer::get_InvertedListScanner( bool store_pairs, const IDSelector* sel, const IVFSearchParameters*) const { - return sq.select_InvertedListScanner( - metric_type, quantizer, store_pairs, sel, by_residual); + FAISS_THROW_IF_NOT( + metric_type == METRIC_L2 || metric_type == METRIC_INNER_PRODUCT); + std::unique_ptr<::faiss::ScalarQuantizer::SQDistanceComputer> dc( + sq.get_distance_computer(metric_type)); + if (metric_type == METRIC_INNER_PRODUCT) { + return new BaselineIVFSQScannerIP( + std::move(dc), code_size, store_pairs, sel, by_residual); + } + return new BaselineIVFSQScannerL2( + std::move(dc), + static_cast(d), + code_size, + quantizer, + store_pairs, + sel, + by_residual); } void IndexIVFScalarQuantizer::reconstruct_from_offset( diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexScalarQuantizer.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexScalarQuantizer.h index 085379097..dbaa358a3 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexScalarQuantizer.h +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/IndexScalarQuantizer.h @@ -13,54 +13,14 @@ #include #include +#include #include #include -#include -#include - namespace faiss { namespace cppcontrib { namespace knowhere { -/** - * Flat index built on a scalar quantizer. - */ -struct IndexScalarQuantizer : IndexFlatCodes { - /// Used to encode the vectors - ScalarQuantizer sq; - - /** Constructor. 
- * - * @param d dimensionality of the input vectors - * @param M number of subquantizers - * @param nbits number of bit per subvector index - */ - IndexScalarQuantizer( - int d, - ScalarQuantizer::QuantizerType qtype, - MetricType metric = METRIC_L2); - - IndexScalarQuantizer(); - - void train(idx_t n, const float* x) override; - - void search( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels, - const SearchParameters* params = nullptr) const override; - - FlatCodesDistanceComputer* get_FlatCodesDistanceComputer() const override; - - /* standalone codec interface */ - void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override; - - void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override; -}; - /** An IVF implementation where the components of the residuals are * encoded with a scalar quantizer. All distance computations * are asymmetric, so the encoded vectors are decoded and approximate @@ -68,13 +28,20 @@ struct IndexScalarQuantizer : IndexFlatCodes { */ struct IndexIVFScalarQuantizer : IndexIVF { - ScalarQuantizer sq; + // Baseline scalar quantizer value-type. Fork IVF still inherits + // from fork IndexIVF (needed for ConcurrentArrayInvertedLists, + // extended search params, and the 8-arg scan_codes interface), but + // the SQ state itself is the upstream struct and the scanner + // returned from get_InvertedListScanner is a fork-interface adapter + // that forwards distance computation to a baseline + // SQDistanceComputer. 
+ ::faiss::ScalarQuantizer sq; IndexIVFScalarQuantizer( Index* quantizer, size_t d, size_t nlist, - ScalarQuantizer::QuantizerType qtype, + ::faiss::ScalarQuantizer::QuantizerType qtype, MetricType metric = METRIC_L2, bool by_residual = true); diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/RaBitQuantizer.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/RaBitQuantizer.cpp index 65bd3d9c6..350396987 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/RaBitQuantizer.cpp +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/RaBitQuantizer.cpp @@ -7,10 +7,10 @@ #include #include -#include #include #include #include +#include "simd/hook.h" diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizer.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizer.cpp deleted file mode 100644 index 055662bed..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizer.cpp +++ /dev/null @@ -1,210 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -// -*- c++ -*- - -#include - -#include -#include - -#include -#include - -#ifdef __SSE__ -#include -#endif - -#include -#include -#include - -#include -#include -#include -#include -#include - - - -namespace faiss::cppcontrib::knowhere { - -using QuantizerType = ScalarQuantizer::QuantizerType; -using RangeStat = ScalarQuantizer::RangeStat; -using SQDistanceComputer = ScalarQuantizer::SQDistanceComputer; - -/******************************************************************* - * ScalarQuantizer implementation - * - * The main source of complexity is to support combinations of 4 - * variants without incurring runtime tests or virtual function calls: - * - * - 4 / 8 bits per code component - * - uniform / non-uniform - * - IP / L2 distance search - * - scalar / AVX distance computation - * - * The appropriate Quantizer object is returned via select_quantizer - * that hides the template mess. - ********************************************************************/ - -#ifdef __AVX2__ -#ifdef __F16C__ -#define USE_F16C -#else -#warning \ - "Cannot enable AVX optimizations in scalar quantizer if -mf16c is not set as well" -#endif -#endif - -/******************************************************************* - * ScalarQuantizer implementation - ********************************************************************/ - -ScalarQuantizer::ScalarQuantizer(size_t d, QuantizerType qtype) - : Quantizer(d), qtype(qtype) { - set_derived_sizes(); -} - -ScalarQuantizer::ScalarQuantizer() {} - -void ScalarQuantizer::set_derived_sizes() { - switch (qtype) { - case QT_8bit: - case QT_8bit_uniform: - case QT_8bit_direct: - case QT_8bit_direct_signed: - code_size = d; - bits = 8; - break; - case QT_4bit: - case QT_4bit_uniform: - code_size = (d + 1) / 2; - bits = 4; - break; - case QT_6bit: - code_size = (d * 6 + 7) / 8; - bits = 6; - break; - case QT_fp16: - code_size = d * 2; - bits = 16; - break; - case QT_bf16: - code_size = d * 2; - bits = 16; - break; - case QT_1bit_direct: - 
code_size = (d + 7) / 8; - bits = 1; - break; - } -} - -void ScalarQuantizer::train(size_t n, const float* x) { - int bit_per_dim = qtype == QT_4bit_uniform ? 4 - : qtype == QT_4bit ? 4 - : qtype == QT_6bit ? 6 - : qtype == QT_8bit_uniform ? 8 - : qtype == QT_8bit ? 8 - : qtype == QT_1bit_direct ? 1 - : -1; - - switch (qtype) { - case QT_4bit_uniform: - case QT_8bit_uniform: - train_Uniform( - rangestat, - rangestat_arg, - n * d, - 1 << bit_per_dim, - x, - trained); - break; - case QT_4bit: - case QT_8bit: - case QT_6bit: - train_NonUniform( - rangestat, - rangestat_arg, - n, - d, - 1 << bit_per_dim, - x, - trained); - break; - case QT_fp16: - case QT_8bit_direct: - case QT_bf16: - case QT_8bit_direct_signed: - case QT_1bit_direct: - // no training necessary - break; - } -} - -ScalarQuantizer::SQuantizer* ScalarQuantizer::select_quantizer() const { - /* use hook to decide use AVX512 or not */ - return sq_sel_quantizer(qtype, d, trained); -} - -void ScalarQuantizer::compute_codes(const float* x, uint8_t* codes, size_t n) - const { - std::unique_ptr squant(select_quantizer()); - - memset(codes, 0, code_size * n); -#pragma omp parallel for if (n > 1) - for (int64_t i = 0; i < n; i++) - squant->encode_vector(x + i * d, codes + i * code_size); -} - -void ScalarQuantizer::decode(const uint8_t* codes, float* x, size_t n) const { - std::unique_ptr squant(select_quantizer()); - -#pragma omp parallel for if (n > 1) - for (int64_t i = 0; i < n; i++) - squant->decode_vector(codes + i * code_size, x + i * d); -} - -SQDistanceComputer* ScalarQuantizer::get_distance_computer( - MetricType metric) const { - FAISS_THROW_IF_NOT( - metric == METRIC_L2 || metric == METRIC_INNER_PRODUCT || - metric == METRIC_Hamming || metric == METRIC_Jaccard); - /* use hook to decide use AVX512 or not */ - if (metric == METRIC_Hamming) { - assert(qtype == QT_1bit_direct); - return sq_get_hamming_distance_computer(metric, qtype, d, trained); - } - if (metric == METRIC_Jaccard) { - assert(qtype == 
QT_1bit_direct); - return sq_get_jaccard_distance_computer(metric, qtype, d, trained); - } - return sq_get_distance_computer(metric, qtype, d, trained); -} - -/******************************************************************* - * IndexScalarQuantizer/IndexIVFScalarQuantizer scanner object - * - * It is an InvertedListScanner, but is designed to work with - * IndexScalarQuantizer as well. - ********************************************************************/ - -InvertedListScanner* ScalarQuantizer::select_InvertedListScanner( - MetricType mt, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool by_residual) const { - /* use hook to decide use AVX512 or not */ - return sq_sel_inv_list_scanner(mt, this, quantizer, d, store_pairs, - sel, by_residual); -} - -} - - diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizer.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizer.h deleted file mode 100644 index 43a2b900e..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizer.h +++ /dev/null @@ -1,146 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -// -*- c++ -*- - -#pragma once - -#include -#include -#include - -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -struct InvertedListScanner; - -/** - * The uniform quantizer has a range [vmin, vmax]. The range can be - * the same for all dimensions (uniform) or specific per dimension - * (default). 
- */ - -struct ScalarQuantizer : Quantizer { - enum QuantizerType { - QT_8bit, ///< 8 bits per component - QT_4bit, ///< 4 bits per component - QT_8bit_uniform, ///< same, shared range for all dimensions - QT_4bit_uniform, - QT_fp16, - QT_8bit_direct, ///< fast indexing of uint8s - QT_6bit, ///< 6 bits per component, - QT_bf16, - QT_8bit_direct_signed, ///< fast indexing of signed int8s ranging from - ///< [-128 to 127] - QT_1bit_direct, ///< fast indexing of 1 bit per component - }; - - QuantizerType qtype = QT_8bit; - - /** The uniform encoder can estimate the range of representable - * values of the unform encoder using different statistics. Here - * rs = rangestat_arg */ - - // rangestat_arg. - enum RangeStat { - RS_minmax, ///< [min - rs*(max-min), max + rs*(max-min)] - RS_meanstd, ///< [mean - std * rs, mean + std * rs] - RS_quantiles, ///< [Q(rs), Q(1-rs)] - RS_optim, ///< alternate optimization of reconstruction error - }; - - RangeStat rangestat = RS_minmax; - float rangestat_arg = 0; - - /// bits per scalar code - size_t bits = 0; - - /// trained values (including the range) - std::vector trained; - - ScalarQuantizer(size_t d, QuantizerType qtype); - ScalarQuantizer(); - - /// updates internal values based on qtype and d - void set_derived_sizes(); - - void train(size_t n, const float* x) override; - - /** Encode a set of vectors - * - * @param x vectors to encode, size n * d - * @param codes output codes, size n * code_size - */ - void compute_codes(const float* x, uint8_t* codes, size_t n) const override; - - /** Decode a set of vectors - * - * @param codes codes to decode, size n * code_size - * @param x output vectors, size n * d - */ - void decode(const uint8_t* code, float* x, size_t n) const override; - - /***************************************************** - * Objects that provide methods for encoding/decoding, distance - * computation and inverted list scanning - *****************************************************/ - - struct SQuantizer { - 
// encodes one vector. Assumes code is filled with 0s on input! - virtual void encode_vector(const float* x, uint8_t* code) const = 0; - virtual void decode_vector(const uint8_t* code, float* x) const = 0; - - virtual ~SQuantizer() {} - }; - - SQuantizer* select_quantizer() const; - - struct SQDistanceComputer : FlatCodesDistanceComputer { - const float* q; - - SQDistanceComputer() : q(nullptr) {} - - virtual float query_to_code(const uint8_t* code) const = 0; - - float distance_to_code(const uint8_t* code) final { - return query_to_code(code); - } - - virtual void query_to_codes_batch_4( - const uint8_t* __restrict code_0, - const uint8_t* __restrict code_1, - const uint8_t* __restrict code_2, - const uint8_t* __restrict code_3, - float& dis0, - float& dis1, - float& dis2, - float& dis3 - ) const { - dis0 = this->query_to_code(code_0); - dis1 = this->query_to_code(code_1); - dis2 = this->query_to_code(code_2); - dis3 = this->query_to_code(code_3); - } - }; - - SQDistanceComputer* get_distance_computer( - MetricType metric = METRIC_L2) const; - - InvertedListScanner* select_InvertedListScanner( - MetricType mt, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool by_residual = false) const; -}; - -} -} -} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec.h deleted file mode 100644 index e848f16fe..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec.h +++ /dev/null @@ -1,979 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -using QuantizerType = ScalarQuantizer::QuantizerType; -using RangeStat = ScalarQuantizer::RangeStat; -using SQDistanceComputer = ScalarQuantizer::SQDistanceComputer; -using SQuantizer = ScalarQuantizer::SQuantizer; - -/******************************************************************* - * Codec: converts between values in [0, 1] and an index in a code - * array. The "i" parameter is the vector component index (not byte - * index). - */ - -struct Codec8bit { - static FAISS_ALWAYS_INLINE void encode_component( - float x, - uint8_t* code, - int i) { - code[i] = (int)(255 * x); - } - - static FAISS_ALWAYS_INLINE float decode_component( - const uint8_t* code, - int i) { - return (code[i] + 0.5f) / 255.0f; - } -}; - -struct Codec4bit { - static FAISS_ALWAYS_INLINE void encode_component( - float x, - uint8_t* code, - int i) { - code[i / 2] |= (int)(x * 15.0) << ((i & 1) << 2); - } - - static FAISS_ALWAYS_INLINE float decode_component( - const uint8_t* code, - int i) { - return (((code[i / 2] >> ((i & 1) << 2)) & 0xf) + 0.5f) / 15.0f; - } -}; - -struct Codec6bit { - static FAISS_ALWAYS_INLINE void encode_component( - float x, - uint8_t* code, - int i) { - int bits = (int)(x * 63.0); - code += (i >> 2) * 3; - switch (i & 3) { - case 0: - code[0] |= bits; - break; - case 1: - code[0] |= bits << 6; - code[1] |= bits >> 2; - break; - case 2: - code[1] |= bits << 4; - code[2] |= bits >> 4; - break; - case 3: - code[2] |= bits << 2; - break; - } - } - - static FAISS_ALWAYS_INLINE float decode_component( - const uint8_t* code, - int i) { - uint8_t bits = 0x00; - code += (i >> 2) * 3; - switch (i & 3) { - case 0: - bits = code[0] & 0x3f; - break; - case 1: - bits = code[0] >> 6; - bits |= (code[1] & 0xf) << 2; - break; - case 2: - bits = code[1] >> 4; - bits |= (code[2] & 3) << 4; - 
break; - case 3: - bits = code[2] >> 2; - break; - } - return (bits + 0.5f) / 63.0f; - } -}; - -/******************************************************************* - * Quantizer: normalizes scalar vector components, then passes them - * through a codec - *******************************************************************/ - -enum class QuantizerTemplateScaling { - UNIFORM = 0, - NON_UNIFORM = 1 -}; - -template -struct QuantizerTemplate {}; - -template -struct QuantizerTemplate : SQuantizer { - const size_t d; - const float vmin, vdiff; - - QuantizerTemplate(size_t d, const std::vector& trained) - : d(d), vmin(trained[0]), vdiff(trained[1]) {} - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - float xi = 0; - if (vdiff != 0) { - xi = (x[i] - vmin) / vdiff; - if (xi < 0) { - xi = 0; - } - if (xi > 1.0) { - xi = 1.0; - } - } - Codec::encode_component(xi, code, i); - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - float xi = Codec::decode_component(code, i); - x[i] = vmin + xi * vdiff; - } - } - - FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) - const { - float xi = Codec::decode_component(code, i); - return vmin + xi * vdiff; - } -}; - -template -struct QuantizerTemplate : SQuantizer { - const size_t d; - const float *vmin, *vdiff; - - QuantizerTemplate(size_t d, const std::vector& trained) - : d(d), vmin(trained.data()), vdiff(trained.data() + d) {} - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - float xi = 0; - if (vdiff[i] != 0) { - xi = (x[i] - vmin[i]) / vdiff[i]; - if (xi < 0) { - xi = 0; - } - if (xi > 1.0) { - xi = 1.0; - } - } - Codec::encode_component(xi, code, i); - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - float xi = Codec::decode_component(code, i); - x[i] = vmin[i] + xi * vdiff[i]; - } 
- } - - FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) - const { - float xi = Codec::decode_component(code, i); - return vmin[i] + xi * vdiff[i]; - } -}; - -/******************************************************************* - * FP16 quantizer - *******************************************************************/ - -template -struct QuantizerFP16 {}; - -template <> -struct QuantizerFP16<1> : SQuantizer { - const size_t d; - - QuantizerFP16(size_t d, const std::vector& /* unused */) : d(d) {} - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - ((uint16_t*)code)[i] = encode_fp16(x[i]); - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - x[i] = decode_fp16(((uint16_t*)code)[i]); - } - } - - FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) - const { - return decode_fp16(((uint16_t*)code)[i]); - } -}; - -/******************************************************************* - * BF16 quantizer - *******************************************************************/ - -template -struct QuantizerBF16 {}; - -template <> -struct QuantizerBF16<1> : ScalarQuantizer::SQuantizer { - const size_t d; - - QuantizerBF16(size_t d, const std::vector& /* unused */) : d(d) {} - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - ((uint16_t*)code)[i] = encode_bf16(x[i]); - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - x[i] = decode_bf16(((uint16_t*)code)[i]); - } - } - - FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) - const { - return decode_bf16(((uint16_t*)code)[i]); - } -}; - -/******************************************************************* - * Specialized QuantizerTemplate for SQ4U (base version) - *******************************************************************/ - 
-template <> -struct QuantizerTemplate - : SQuantizer { - const size_t d; - const float vmin, vdiff; - float final_scale; - float final_bias; - - QuantizerTemplate(size_t d, const std::vector& trained) - : d(d), vmin(trained[0]), vdiff(trained[1]) { - final_scale = vdiff / 15.0f; - final_bias = vmin + vdiff * 0.5f / 15.0f; - } - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - float xi = 0; - if (vdiff != 0) { - xi = (x[i] - vmin) / vdiff; - if (xi < 0) { - xi = 0; - } - if (xi > 1.0) { - xi = 1.0; - } - } - Codec4bit::encode_component(xi, code, i); - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - float xi = Codec4bit::decode_component(code, i); - x[i] = vmin + xi * vdiff; - } - } - - FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) - const { - float xi = Codec4bit::decode_component(code, i); - return vmin + xi * vdiff; - } -}; - -/******************************************************************* - * 8bit_direct quantizer - *******************************************************************/ - -template -struct Quantizer8bitDirect {}; - -template <> -struct Quantizer8bitDirect<1> : SQuantizer { - const size_t d; - - Quantizer8bitDirect(size_t d, const std::vector& /* unused */) - : d(d) {} - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - code[i] = (uint8_t)x[i]; - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - x[i] = code[i]; - } - } - - FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) - const { - return code[i]; - } -}; - -/******************************************************************* - * 8bit_direct_signed quantizer - *******************************************************************/ - -template -struct Quantizer8bitDirectSigned {}; - -template <> -struct 
Quantizer8bitDirectSigned<1> : ScalarQuantizer::SQuantizer { - const size_t d; - - Quantizer8bitDirectSigned(size_t d, const std::vector& /* unused */) - : d(d) {} - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - code[i] = (uint8_t)(x[i] + 128); - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - x[i] = code[i] - 128; - } - } - - FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) - const { - return code[i] - 128; - } -}; - -/******************************************************************* - * 1bit_direct quantizer - * - * Note: The 1bit_direct quantizer currently does not support the - *`reconstruct_component` method and does not provide SIMDWIDTH support. - *******************************************************************/ - -struct Quantizer1bitDirect : SQuantizer { - const size_t d; - - Quantizer1bitDirect(size_t d, const std::vector& /* unused */) - : d(d) {} - - void encode_vector(const float* x, uint8_t* code) const final { - size_t code_size = (d + 7) / 8; - for (size_t i = 0; i < code_size; i++) { - code[i] = (uint8_t)x[i]; - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - size_t code_size = (d + 7) / 8; - for (size_t i = 0; i < code_size; i++) { - x[i] = (float)code[i]; - } - } -}; - -template -SQuantizer* select_quantizer_1( - QuantizerType qtype, - size_t d, - const std::vector& trained) { - switch (qtype) { - case ScalarQuantizer::QT_8bit: - return new QuantizerTemplate( - d, trained); - case ScalarQuantizer::QT_6bit: - return new QuantizerTemplate( - d, trained); - case ScalarQuantizer::QT_4bit: - return new QuantizerTemplate( - d, trained); - case ScalarQuantizer::QT_8bit_uniform: - return new QuantizerTemplate( - d, trained); - case ScalarQuantizer::QT_4bit_uniform: - return new QuantizerTemplate( - d, trained); - case ScalarQuantizer::QT_fp16: - return new QuantizerFP16(d, 
trained); - case ScalarQuantizer::QT_bf16: - return new QuantizerBF16(d, trained); - case ScalarQuantizer::QT_8bit_direct: - return new Quantizer8bitDirect(d, trained); - case ScalarQuantizer::QT_8bit_direct_signed: - return new Quantizer8bitDirectSigned(d, trained); - case ScalarQuantizer::QT_1bit_direct: - return new Quantizer1bitDirect(d, trained); - } - FAISS_THROW_MSG("unknown qtype"); -} - -/******************************************************************* - * DistanceComputerSQ4UByte: specialized distance computer for SQ4U - * Always computes L2 distance in quantized space regardless of Similarity - *******************************************************************/ - -template -struct DistanceComputerSQ4UByte : SQDistanceComputer { - using Quantizer = - QuantizerTemplate; - Quantizer quant; - - // Quantized query codes - uint8_t* q_codes; - - DistanceComputerSQ4UByte(size_t d, const std::vector& trained) - : quant(d, trained) { - q_codes = new uint8_t[(d + 1) / 2]; - } - - ~DistanceComputerSQ4UByte() { - delete[] q_codes; - } - - void set_query(const float* x) override { - // Quantize query to 4-bit codes - // Database layout: low nibble = even index, high nibble = odd index - float inv_scale = 1.0f / quant.final_scale; - float offset = quant.vmin; - - for (size_t i = 0; i < quant.d; i += 2) { - // Quantize first component (even index -> low nibble) - float val0 = (x[i] - offset) * inv_scale; - int q0 = static_cast(std::floor(val0)); - q0 = std::max(0, std::min(15, q0)); - - // Quantize second component (odd index -> high nibble) - int q1 = 0; - if (i + 1 < quant.d) { - float val1 = (x[i + 1] - offset) * inv_scale; - q1 = static_cast(std::floor(val1)); - q1 = std::max(0, std::min(15, q1)); - } - - // Pack: low nibble = q0 (even), high nibble = q1 (odd) - q_codes[i / 2] = q0 | (q1 << 4); - } - } - - // Compute L2 distance between query and database code - float compute_distance_l2(const uint8_t* code8) const { - int32_t accu = 0; - const uint8_t* qc = 
q_codes; - - for (size_t i = 0; i < quant.d; i += 2) { - uint8_t qbyte = *qc++; - uint8_t dbyte = *code8++; - - // Extract nibbles: low nibble = even index, high nibble = odd index - int q0 = qbyte & 15; // even (low nibble) - int q1 = qbyte >> 4; // odd (high nibble) - int d0 = dbyte & 15; // even (low nibble) - int d1 = dbyte >> 4; // odd (high nibble) - - // Compute differences - int diff0 = q0 - d0; - int diff1 = q1 - d1; - - // Accumulate squared differences - accu += diff0 * diff0 + diff1 * diff1; - } - - // Scale to floating point - float scale = quant.final_scale; - return accu * scale * scale; - } - - // Compute L2 distance between two codes - float compute_code_distance_l2(const uint8_t* code1, const uint8_t* code2) - const { - int32_t accu = 0; - - for (size_t i = 0; i < quant.d; i += 2) { - uint8_t byte1 = *code1++; - uint8_t byte2 = *code2++; - - // Extract nibbles: low nibble = even index, high nibble = odd index - int c1_0 = byte1 & 15; // even (low nibble) - int c1_1 = byte1 >> 4; // odd (high nibble) - int c2_0 = byte2 & 15; // even (low nibble) - int c2_1 = byte2 >> 4; // odd (high nibble) - - // Compute differences - int diff0 = c1_0 - c2_0; - int diff1 = c1_1 - c2_1; - - // Accumulate squared differences - accu += diff0 * diff0 + diff1 * diff1; - } - - // Scale to floating point - float scale = quant.final_scale; - return accu * scale * scale; - } - - float query_to_code(const uint8_t* code) const override { - return compute_distance_l2(code); - } - - float symmetric_dis(idx_t i, idx_t j) override { - const uint8_t* code_i = codes + i * code_size; - const uint8_t* code_j = codes + j * code_size; - return compute_code_distance_l2(code_i, code_j); - } -}; - -/******************************************************************* - * Similarity: gets vector components and computes a similarity wrt. a - * query vector stored in the object. The data fields just encapsulate - * an accumulator. 
- */ - -template -struct SimilarityL2 {}; - -template <> -struct SimilarityL2<1> { - static constexpr int simdwidth = 1; - static constexpr MetricType metric_type = METRIC_L2; - - const float *y, *yi; - - explicit SimilarityL2(const float* y) : y(y) {} - - /******* scalar accumulator *******/ - - float accu; - - FAISS_ALWAYS_INLINE void begin() { - accu = 0; - yi = y; - } - - FAISS_ALWAYS_INLINE void add_component(float x) { - float tmp = *yi++ - x; - accu += tmp * tmp; - } - - FAISS_ALWAYS_INLINE void add_component_2(float x1, float x2) { - float tmp = x1 - x2; - accu += tmp * tmp; - } - - FAISS_ALWAYS_INLINE float result() { - return accu; - } -}; - -template -struct SimilarityIP {}; - -template <> -struct SimilarityIP<1> { - static constexpr int simdwidth = 1; - static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; - const float *y, *yi; - - float accu; - - explicit SimilarityIP(const float* y) : y(y) {} - - FAISS_ALWAYS_INLINE void begin() { - accu = 0; - yi = y; - } - - FAISS_ALWAYS_INLINE void add_component(float x) { - accu += *yi++ * x; - } - - FAISS_ALWAYS_INLINE void add_component_2(float x1, float x2) { - accu += x1 * x2; - } - - FAISS_ALWAYS_INLINE float result() { - return accu; - } -}; - -/******************************************************************* - * DistanceComputer: combines a similarity and a quantizer to do - * code-to-vector or code-to-code comparisons - *******************************************************************/ - -template -struct DCTemplate : SQDistanceComputer {}; - -template -struct DCTemplate : SQDistanceComputer { - using Sim = Similarity; - - Quantizer quant; - - DCTemplate(size_t d, const std::vector& trained) - : quant(d, trained) {} - - float compute_distance(const float* x, const uint8_t* code) const { - Similarity sim(x); - sim.begin(); - for (size_t i = 0; i < quant.d; i++) { - float xi = quant.reconstruct_component(code, i); - sim.add_component(xi); - } - return sim.result(); - } - - float 
compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - Similarity sim(nullptr); - sim.begin(); - for (size_t i = 0; i < quant.d; i++) { - float x1 = quant.reconstruct_component(code1, i); - float x2 = quant.reconstruct_component(code2, i); - sim.add_component_2(x1, x2); - } - return sim.result(); - } - - void set_query(const float* x) final { - q = x; - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const override final { - return compute_distance(q, code); - } -}; - -/******************************************************************* - * DistanceComputerByte: computes distances in the integer domain - *******************************************************************/ - -template -struct DistanceComputerByte : SQDistanceComputer {}; - -template -struct DistanceComputerByte : SQDistanceComputer { - using Sim = Similarity; - - int d; - std::vector tmp; - - DistanceComputerByte(int d, const std::vector&) : d(d), tmp(d) {} - - int compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - int accu = 0; - for (int i = 0; i < d; i++) { - if (Sim::metric_type == METRIC_INNER_PRODUCT) { - accu += int(code1[i]) * code2[i]; - } else { - int diff = int(code1[i]) - code2[i]; - accu += diff * diff; - } - } - return accu; - } - - void set_query(const float* x) final { - for (int i = 0; i < d; i++) { - tmp[i] = int(x[i]); - } - } - - int compute_distance(const float* x, const uint8_t* code) { - set_query(x); - return compute_code_distance(tmp.data(), code); - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const override final { - return compute_code_distance(tmp.data(), code); - } -}; - -/******************************************************************* - * 
select_distance_computer: runtime selection of template - * specialization - *******************************************************************/ - -template -SQDistanceComputer* select_distance_computer( - QuantizerType qtype, - size_t d, - const std::vector& trained) { - constexpr int SIMDWIDTH = Sim::simdwidth; - switch (qtype) { - case ScalarQuantizer::QT_8bit_uniform: - return new DCTemplate< - QuantizerTemplate, - Sim, - SIMDWIDTH>(d, trained); - - case ScalarQuantizer::QT_4bit_uniform: - return new DistanceComputerSQ4UByte(d, trained); - - case ScalarQuantizer::QT_8bit: - return new DCTemplate< - QuantizerTemplate, - Sim, - SIMDWIDTH>(d, trained); - - case ScalarQuantizer::QT_6bit: - return new DCTemplate< - QuantizerTemplate, - Sim, - SIMDWIDTH>(d, trained); - - case ScalarQuantizer::QT_4bit: - return new DCTemplate< - QuantizerTemplate, - Sim, - SIMDWIDTH>(d, trained); - - case ScalarQuantizer::QT_fp16: - return new DCTemplate, Sim, SIMDWIDTH>( - d, trained); - - case ScalarQuantizer::QT_bf16: - return new DCTemplate, Sim, SIMDWIDTH>( - d, trained); - - case ScalarQuantizer::QT_8bit_direct: - if (d % 16 == 0) { - return new DistanceComputerByte(d, trained); - } else { - return new DCTemplate< - Quantizer8bitDirect, - Sim, - SIMDWIDTH>(d, trained); - } - - case ScalarQuantizer::QT_8bit_direct_signed: - return new DCTemplate< - Quantizer8bitDirectSigned, - Sim, - SIMDWIDTH>(d, trained); - } - FAISS_THROW_MSG("unknown qtype"); - return nullptr; -} - -// This wrapper adapts Jaccard and Hamming binary computers to the -// SQDistanceComputer interface -template -struct BinarySQDistanceComputerWrapper : SQDistanceComputer { - BinaryComputerType binary_computer; - size_t code_size; - std::vector tmp; - - BinarySQDistanceComputerWrapper(size_t code_size, const std::vector&) - : code_size(code_size), tmp(code_size) {} - - void set_query(const float* x) final { - for (size_t i = 0; i < code_size; ++i) { - tmp[i] = (uint8_t)x[i]; - } - binary_computer.set(tmp.data(), 
code_size); - } - - float query_to_code(const uint8_t* code) const override final { - return binary_computer.compute(code); - } - - float symmetric_dis(idx_t i, idx_t j) override { - const uint8_t* code_i = codes + i * code_size; - const uint8_t* code_j = codes + j * code_size; - - BinaryComputerType temp_computer; - temp_computer.set(code_i, code_size); - return temp_computer.compute(code_j); - } -}; - -SQDistanceComputer* select_hamming_distance_computer( - size_t d, - const std::vector& trained); - -SQDistanceComputer* select_jaccard_distance_computer( - size_t d, - const std::vector& trained); - -template -InvertedListScanner* sel3_InvertedListScanner( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - if (DCClass::Sim::metric_type == METRIC_L2) { - return new IVFSQScannerL2( - sq->d, - sq->trained, - sq->code_size, - quantizer, - store_pairs, - sel, - r); - } else if (DCClass::Sim::metric_type == METRIC_INNER_PRODUCT) { - return new IVFSQScannerIP( - sq->d, sq->trained, sq->code_size, store_pairs, sel, r); - } else { - FAISS_THROW_MSG("unsupported metric type"); - } -} - -template -InvertedListScanner* sel2_InvertedListScanner( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - if (sel) { - if (store_pairs) { - return sel3_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); - } else { - return sel3_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); - } - } else { - return sel3_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); - } -} - -template -InvertedListScanner* sel12_InvertedListScanner( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - constexpr int SIMDWIDTH = Similarity::simdwidth; - using QuantizerClass = QuantizerTemplate; - using DCClass = DCTemplate; - return sel2_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); -} - 
-template -InvertedListScanner* sel1_InvertedListScanner( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - constexpr int SIMDWIDTH = Similarity::simdwidth; - switch (sq->qtype) { - case ScalarQuantizer::QT_8bit_uniform: - return sel12_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); - case ScalarQuantizer::QT_4bit_uniform: - return sel12_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); - case ScalarQuantizer::QT_8bit: - return sel12_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); - case ScalarQuantizer::QT_4bit: - return sel12_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); - case ScalarQuantizer::QT_6bit: - return sel12_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); - case ScalarQuantizer::QT_fp16: - return sel2_InvertedListScanner, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - case ScalarQuantizer::QT_bf16: - return sel2_InvertedListScanner, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - case ScalarQuantizer::QT_8bit_direct: - if (sq->d % 16 == 0) { - return sel2_InvertedListScanner< - DistanceComputerByte>( - sq, quantizer, store_pairs, sel, r); - } else { - return sel2_InvertedListScanner, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - } - case ScalarQuantizer::QT_8bit_direct_signed: - return sel2_InvertedListScanner, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - } - - FAISS_THROW_MSG("unknown qtype"); - return nullptr; -} - -template -InvertedListScanner* sel0_InvertedListScanner( - MetricType mt, - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool by_residual) { - if (mt == METRIC_L2) { - return sel1_InvertedListScanner>( - sq, quantizer, store_pairs, sel, by_residual); - } else if (mt == METRIC_INNER_PRODUCT) { - return sel1_InvertedListScanner>( - sq, quantizer, store_pairs, sel, by_residual); - } 
else { - FAISS_THROW_MSG("unsupported metric type"); - } -} - -} -} -} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec_avx.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec_avx.h deleted file mode 100644 index ef02f9df9..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec_avx.h +++ /dev/null @@ -1,1230 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include -#include -#include -#include - -#include -#include -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -using QuantizerType = ScalarQuantizer::QuantizerType; -using RangeStat = ScalarQuantizer::RangeStat; -using SQDistanceComputer = ScalarQuantizer::SQDistanceComputer; -using SQuantizer = ScalarQuantizer::SQuantizer; - -/******************************************************************* - * Codec: converts between values in [0, 1] and an index in a code - * array. The "i" parameter is the vector component index (not byte - * index). 
- */ - -struct Codec8bit_avx : public Codec8bit { - static FAISS_ALWAYS_INLINE __m256 - decode_8_components(const uint8_t* code, int i) { - const uint64_t c8 = *(uint64_t*)(code + i); - - const __m128i i8 = _mm_set1_epi64x(c8); - const __m256i i32 = _mm256_cvtepu8_epi32(i8); - const __m256 f8 = _mm256_cvtepi32_ps(i32); - const __m256 half_one_255 = _mm256_set1_ps(0.5f / 255.f); - const __m256 one_255 = _mm256_set1_ps(1.f / 255.f); - return _mm256_fmadd_ps(f8, one_255, half_one_255); - } -}; - -struct Codec4bit_avx : public Codec4bit { - static FAISS_ALWAYS_INLINE __m256 - decode_8_components(const uint8_t* code, int i) { - uint32_t c4 = *(uint32_t*)(code + (i >> 1)); - uint32_t mask = 0x0f0f0f0f; - uint32_t c4ev = c4 & mask; - uint32_t c4od = (c4 >> 4) & mask; - - // the 8 lower bytes of c8 contain the values - __m128i c8 = - _mm_unpacklo_epi8(_mm_set1_epi32(c4ev), _mm_set1_epi32(c4od)); - __m128i c4lo = _mm_cvtepu8_epi32(c8); - __m128i c4hi = _mm_cvtepu8_epi32(_mm_srli_si128(c8, 4)); - __m256i i8 = _mm256_castsi128_si256(c4lo); - i8 = _mm256_insertf128_si256(i8, c4hi, 1); - __m256 f8 = _mm256_cvtepi32_ps(i8); - __m256 half = _mm256_set1_ps(0.5f); - f8 = _mm256_add_ps(f8, half); - __m256 one_255 = _mm256_set1_ps(1.f / 15.f); - return _mm256_mul_ps(f8, one_255); - } - - static FAISS_ALWAYS_INLINE __m256i - decode_8_components_int(const uint8_t* code, int i) { - // Load 4 bytes containing 8 nibbles - uint32_t c4 = *(uint32_t*)(code + (i >> 1)); - uint32_t mask = 0x0f0f0f0f; - uint32_t c4ev = c4 & mask; // Even nibbles - uint32_t c4od = (c4 >> 4) & mask; // Odd nibbles - - // Interleave even and odd nibbles - __m128i c8 = - _mm_unpacklo_epi8(_mm_set1_epi32(c4ev), _mm_set1_epi32(c4od)); - - // Convert to 8x32-bit integers - __m128i c4lo = _mm_cvtepu8_epi32(c8); - __m128i c4hi = _mm_cvtepu8_epi32(_mm_srli_si128(c8, 4)); - __m256i result = _mm256_castsi128_si256(c4lo); - result = _mm256_insertf128_si256(result, c4hi, 1); - - return result; - } -}; - -struct Codec6bit_avx 
: public Codec6bit { - /* Load 6 bytes that represent 8 6-bit values, return them as a - * 8*32 bit vector register */ - static FAISS_ALWAYS_INLINE __m256i load6(const uint16_t* code16) { - const __m128i perm = _mm_set_epi8( - -1, 5, 5, 4, 4, 3, -1, 3, -1, 2, 2, 1, 1, 0, -1, 0); - const __m256i shifts = _mm256_set_epi32(2, 4, 6, 0, 2, 4, 6, 0); - - // load 6 bytes - __m128i c1 = - _mm_set_epi16(0, 0, 0, 0, 0, code16[2], code16[1], code16[0]); - - // put in 8 * 32 bits - __m128i c2 = _mm_shuffle_epi8(c1, perm); - __m256i c3 = _mm256_cvtepi16_epi32(c2); - - // shift and mask out useless bits - __m256i c4 = _mm256_srlv_epi32(c3, shifts); - __m256i c5 = _mm256_and_si256(_mm256_set1_epi32(63), c4); - return c5; - } - - static FAISS_ALWAYS_INLINE __m256 - decode_8_components(const uint8_t* code, int i) { - // // Faster code for Intel CPUs or AMD Zen3+, just keeping it here - // // for the reference, maybe, it becomes used oned day. - // const uint16_t* data16 = (const uint16_t*)(code + (i >> 2) * 3); - // const uint32_t* data32 = (const uint32_t*)data16; - // const uint64_t val = *data32 + ((uint64_t)data16[2] << 32); - // const uint64_t vext = _pdep_u64(val, 0x3F3F3F3F3F3F3F3FULL); - // const __m128i i8 = _mm_set1_epi64x(vext); - // const __m256i i32 = _mm256_cvtepi8_epi32(i8); - // const __m256 f8 = _mm256_cvtepi32_ps(i32); - // const __m256 half_one_255 = _mm256_set1_ps(0.5f / 63.f); - // const __m256 one_255 = _mm256_set1_ps(1.f / 63.f); - // return _mm256_fmadd_ps(f8, one_255, half_one_255); - - __m256i i8 = load6((const uint16_t*)(code + (i >> 2) * 3)); - __m256 f8 = _mm256_cvtepi32_ps(i8); - // this could also be done with bit manipulations but it is - // not obviously faster - const __m256 half_one_255 = _mm256_set1_ps(0.5f / 63.f); - const __m256 one_255 = _mm256_set1_ps(1.f / 63.f); - return _mm256_fmadd_ps(f8, one_255, half_one_255); - } -}; - -/******************************************************************* - * Quantizer: normalizes scalar vector 
components, then passes them - * through a codec - *******************************************************************/ - -template -struct QuantizerTemplate_avx {}; - -template -struct QuantizerTemplate_avx - : public QuantizerTemplate { - QuantizerTemplate_avx(size_t d, const std::vector& trained) - : QuantizerTemplate(d, trained) {} -}; - -template -struct QuantizerTemplate_avx - : public QuantizerTemplate { - QuantizerTemplate_avx(size_t d, const std::vector& trained) - : QuantizerTemplate(d, trained) {} - - FAISS_ALWAYS_INLINE __m256 - reconstruct_8_components(const uint8_t* code, int i) const { - __m256 xi = Codec::decode_8_components(code, i); - return _mm256_fmadd_ps( - xi, _mm256_set1_ps(this->vdiff), _mm256_set1_ps(this->vmin)); - } -}; - -template -struct QuantizerTemplate_avx - : public QuantizerTemplate { - QuantizerTemplate_avx(size_t d, const std::vector& trained) - : QuantizerTemplate(d, trained) {} -}; - -template -struct QuantizerTemplate_avx - : public QuantizerTemplate { - QuantizerTemplate_avx(size_t d, const std::vector& trained) - : QuantizerTemplate(d, trained) {} - - FAISS_ALWAYS_INLINE __m256 - reconstruct_8_components(const uint8_t* code, int i) const { - __m256 xi = Codec::decode_8_components(code, i); - return _mm256_fmadd_ps( - xi, - _mm256_loadu_ps(this->vdiff + i), - _mm256_loadu_ps(this->vmin + i)); - } -}; - -/******************************************************************* - * FP16 quantizer - *******************************************************************/ - -template -struct QuantizerFP16_avx {}; - -template <> -struct QuantizerFP16_avx<1> : public QuantizerFP16<1> { - QuantizerFP16_avx(size_t d, const std::vector& unused) - : QuantizerFP16<1>(d, unused) {} -}; - -template <> -struct QuantizerFP16_avx<8> : public QuantizerFP16<1> { - QuantizerFP16_avx(size_t d, const std::vector& trained) - : QuantizerFP16<1>(d, trained) {} - - FAISS_ALWAYS_INLINE __m256 - reconstruct_8_components(const uint8_t* code, int i) const { - 
__m128i codei = _mm_loadu_si128((const __m128i*)(code + 2 * i)); - return _mm256_cvtph_ps(codei); - } -}; - -/******************************************************************* - * BF16 quantizer - *******************************************************************/ - -template -struct QuantizerBF16_avx {}; - -template <> -struct QuantizerBF16_avx<1> : public QuantizerBF16<1> { - QuantizerBF16_avx(size_t d, const std::vector& unused) - : QuantizerBF16<1>(d, unused) {} -}; - -template <> -struct QuantizerBF16_avx<8> : public QuantizerBF16<1> { - QuantizerBF16_avx(size_t d, const std::vector& trained) - : QuantizerBF16<1>(d, trained) {} - - FAISS_ALWAYS_INLINE __m256 - reconstruct_8_components(const uint8_t* code, int i) const { - __m128i code_128i = _mm_loadu_si128((const __m128i*)(code + 2 * i)); - __m256i code_256i = _mm256_cvtepu16_epi32(code_128i); - code_256i = _mm256_slli_epi32(code_256i, 16); - return _mm256_castsi256_ps(code_256i); - } -}; - -/******************************************************************* - * Specialized QuantizerTemplate for SQ4U - *******************************************************************/ - -template <> -struct QuantizerTemplate_avx< - Codec4bit_avx, - QuantizerTemplateScaling::UNIFORM, - 8> - : public QuantizerTemplate< - Codec4bit_avx, - QuantizerTemplateScaling::UNIFORM, - 1> { - float final_scale; - float final_bias; - - QuantizerTemplate_avx(size_t d, const std::vector& trained) - : QuantizerTemplate< - Codec4bit_avx, - QuantizerTemplateScaling::UNIFORM, - 1>(d, trained) { - final_scale = this->vdiff / 15.0f; - final_bias = this->vmin + this->vdiff * 0.5f / 15.0f; - } - - FAISS_ALWAYS_INLINE __m256 - reconstruct_8_components(const uint8_t* code, int i) const { - __m256i nibbles = Codec4bit_avx::decode_8_components_int(code, i); - __m256 nibbles_f = _mm256_cvtepi32_ps(nibbles); - - return _mm256_fmadd_ps( - nibbles_f, - _mm256_set1_ps(final_scale), - _mm256_set1_ps(final_bias)); - } -}; - 
-/******************************************************************* - * 8bit_direct quantizer - *******************************************************************/ - -template -struct Quantizer8bitDirect_avx {}; - -template <> -struct Quantizer8bitDirect_avx<1> : public Quantizer8bitDirect<1> { - Quantizer8bitDirect_avx(size_t d, const std::vector& unused) - : Quantizer8bitDirect(d, unused) {} -}; - -template <> -struct Quantizer8bitDirect_avx<8> : public Quantizer8bitDirect<1> { - Quantizer8bitDirect_avx(size_t d, const std::vector& trained) - : Quantizer8bitDirect<1>(d, trained) {} - - FAISS_ALWAYS_INLINE __m256 - reconstruct_8_components(const uint8_t* code, int i) const { - __m128i x8 = _mm_loadl_epi64((__m128i*)(code + i)); // 8 * int8 - __m256i y8 = _mm256_cvtepu8_epi32(x8); // 8 * int32 - return _mm256_cvtepi32_ps(y8); // 8 * float32 - } -}; - -/******************************************************************* - * 8bit_direct_signed quantizer - *******************************************************************/ - -template -struct Quantizer8bitDirectSigned_avx {}; - -template <> -struct Quantizer8bitDirectSigned_avx<1> : public Quantizer8bitDirectSigned<1> { - Quantizer8bitDirectSigned_avx(size_t d, const std::vector& unused) - : Quantizer8bitDirectSigned(d, unused) {} -}; - -template <> -struct Quantizer8bitDirectSigned_avx<8> : public Quantizer8bitDirectSigned<1> { - Quantizer8bitDirectSigned_avx(size_t d, const std::vector& trained) - : Quantizer8bitDirectSigned<1>(d, trained) {} - - FAISS_ALWAYS_INLINE __m256 - reconstruct_8_components(const uint8_t* code, int i) const { - __m128i x8 = _mm_loadl_epi64((__m128i*)(code + i)); // 8 * int8 - __m256i y8 = _mm256_cvtepu8_epi32(x8); // 8 * int32 - __m256i c8 = _mm256_set1_epi32(128); - __m256i z8 = _mm256_sub_epi32(y8, c8); // subtract 128 from all lanes - return _mm256_cvtepi32_ps(z8); // 8 * float32 - } -}; - -template -SQuantizer* select_quantizer_1_avx( - QuantizerType qtype, - size_t d, - const 
std::vector& trained) { - switch (qtype) { - case QuantizerType::QT_8bit: - return new QuantizerTemplate_avx( - d, trained); - case QuantizerType::QT_6bit: - return new QuantizerTemplate_avx( - d, trained); - case QuantizerType::QT_4bit: - return new QuantizerTemplate_avx( - d, trained); - case QuantizerType::QT_8bit_uniform: - return new QuantizerTemplate_avx( - d, trained); - case QuantizerType::QT_4bit_uniform: - return new QuantizerTemplate_avx( - d, trained); - case QuantizerType::QT_fp16: - return new QuantizerFP16_avx(d, trained); - case QuantizerType::QT_bf16: - return new QuantizerBF16_avx(d, trained); - case QuantizerType::QT_8bit_direct: - return new Quantizer8bitDirect_avx(d, trained); - case QuantizerType::QT_8bit_direct_signed: - return new Quantizer8bitDirectSigned_avx(d, trained); - case QuantizerType::QT_1bit_direct: - // todo: add more SIMDWIDTH support for avx if needed - return new Quantizer1bitDirect(d, trained); - } - FAISS_THROW_MSG("unknown qtype"); -} - -/******************************************************************* - * Similarity: gets vector components and computes a similarity wrt. a - * query vector stored in the object. The data fields just encapsulate - * an accumulator. 
- */ - -template -struct SimilarityL2_avx {}; - -template <> -struct SimilarityL2_avx<1> : public SimilarityL2<1> { - static constexpr int simdwidth = 1; - static constexpr MetricType metric_type = METRIC_L2; - - explicit SimilarityL2_avx(const float* y) : SimilarityL2<1>(y) {} -}; - -template <> -struct SimilarityL2_avx<8> { - static constexpr int simdwidth = 8; - static constexpr MetricType metric_type = METRIC_L2; - - const float *y, *yi; - - explicit SimilarityL2_avx(const float* y) : y(y) {} - __m256 accu8; - - FAISS_ALWAYS_INLINE void begin_8() { - accu8 = _mm256_setzero_ps(); - yi = y; - } - - FAISS_ALWAYS_INLINE void add_8_components(__m256 x) { - __m256 yiv = _mm256_loadu_ps(yi); - yi += 8; - __m256 tmp = _mm256_sub_ps(yiv, x); - accu8 = _mm256_fmadd_ps(tmp, tmp, accu8); - } - - FAISS_ALWAYS_INLINE void add_8_components_2(__m256 x, __m256 y_2) { - __m256 tmp = _mm256_sub_ps(y_2, x); - accu8 = _mm256_fmadd_ps(tmp, tmp, accu8); - } - - FAISS_ALWAYS_INLINE float result_8() { - const __m128 sum = _mm_add_ps( - _mm256_castps256_ps128(accu8), _mm256_extractf128_ps(accu8, 1)); - const __m128 v0 = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2)); - const __m128 v1 = _mm_add_ps(sum, v0); - __m128 v2 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(0, 0, 0, 1)); - const __m128 v3 = _mm_add_ps(v1, v2); - return _mm_cvtss_f32(v3); - } -}; - -template -struct SimilarityIP_avx {}; - -template <> -struct SimilarityIP_avx<1> : public SimilarityIP<1> { - static constexpr int simdwidth = 1; - static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; - - explicit SimilarityIP_avx(const float* y) : SimilarityIP<1>(y) {} -}; - -template <> -struct SimilarityIP_avx<8> { - static constexpr int simdwidth = 8; - static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; - - const float *y, *yi; - - float accu; - - explicit SimilarityIP_avx(const float* y) : y(y) {} - - __m256 accu8; - - FAISS_ALWAYS_INLINE void begin_8() { - accu8 = _mm256_setzero_ps(); - yi = y; - } - - 
FAISS_ALWAYS_INLINE void add_8_components(__m256 x) { - __m256 yiv = _mm256_loadu_ps(yi); - yi += 8; - accu8 = _mm256_fmadd_ps(yiv, x, accu8); - } - - FAISS_ALWAYS_INLINE void add_8_components_2(__m256 x1, __m256 x2) { - accu8 = _mm256_fmadd_ps(x1, x2, accu8); - } - - FAISS_ALWAYS_INLINE float result_8() { - const __m128 sum = _mm_add_ps( - _mm256_castps256_ps128(accu8), _mm256_extractf128_ps(accu8, 1)); - const __m128 v0 = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2)); - const __m128 v1 = _mm_add_ps(sum, v0); - __m128 v2 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(0, 0, 0, 1)); - const __m128 v3 = _mm_add_ps(v1, v2); - return _mm_cvtss_f32(v3); - } -}; - -/******************************************************************* - * SQ4U specialized distance computer (AVX2 version) - *******************************************************************/ - -template -struct DistanceComputerSQ4UByte_avx : SQDistanceComputer { - using Quantizer = QuantizerTemplate_avx< - Codec4bit_avx, - QuantizerTemplateScaling::UNIFORM, - 8>; - using Sim = Similarity; - - Quantizer quant; - std::vector q_lo; - std::vector q_hi; - float final_scale_sq; - - DistanceComputerSQ4UByte_avx(size_t d, const std::vector& trained) - : quant(d, trained), - q_lo((d + 1) / 2 + 32, 0), - q_hi((d + 1) / 2 + 32, 0) { - final_scale_sq = quant.final_scale * quant.final_scale; - } - - void set_query(const float* x) final { - float inv_scale = 1.0f / quant.final_scale; - float offset = quant.vmin; - - for (size_t i = 0; i < quant.d; i++) { - float val = (x[i] - offset) * inv_scale; - int code = (int)std::floor(val); - if (code < 0) - code = 0; - if (code > 15) - code = 15; - - if (i % 2 == 0) { - q_lo[i / 2] = (uint8_t)code; - } else { - q_hi[i / 2] = (uint8_t)code; - } - } - } - - // Only computes L2 distance - float compute_distance(const float* x, const uint8_t* code) const { - return compute_distance_l2(code); - } - - float compute_distance_l2(const uint8_t* code) const { - const size_t d = quant.d; - const 
uint8_t* q_lo_ptr = q_lo.data(); - const uint8_t* q_hi_ptr = q_hi.data(); - - __m256i acc = _mm256_setzero_si256(); - const __m256i mask_f = _mm256_set1_epi8(0xF); - const __m256i one = _mm256_set1_epi16(1); - - size_t i = 0; - // Process 64 dimensions per iteration (32 bytes = 64 nibbles) - for (; i + 64 <= d; i += 64) { - __m256i c256 = _mm256_loadu_si256((const __m256i*)(code + i / 2)); - - __m256i nibbles_lo = _mm256_and_si256(c256, mask_f); - __m256i nibbles_hi = - _mm256_and_si256(_mm256_srli_epi16(c256, 4), mask_f); - - __m256i q_lo_vec = - _mm256_loadu_si256((const __m256i*)(q_lo_ptr + i / 2)); - __m256i q_hi_vec = - _mm256_loadu_si256((const __m256i*)(q_hi_ptr + i / 2)); - - // Compute absolute differences - __m256i diff_lo = _mm256_sub_epi8(q_lo_vec, nibbles_lo); - __m256i diff_hi = _mm256_sub_epi8(q_hi_vec, nibbles_hi); - - // AVX2 doesn't have _mm256_abs_epi8, so we use max(x, -x) - diff_lo = _mm256_max_epi8( - diff_lo, _mm256_sub_epi8(_mm256_setzero_si256(), diff_lo)); - diff_hi = _mm256_max_epi8( - diff_hi, _mm256_sub_epi8(_mm256_setzero_si256(), diff_hi)); - - // Square using maddubs: treats input as unsigned bytes - __m256i sq_lo = _mm256_maddubs_epi16(diff_lo, diff_lo); - __m256i sq_hi = _mm256_maddubs_epi16(diff_hi, diff_hi); - - // Accumulate to 32-bit - __m256i sum_lo = _mm256_madd_epi16(sq_lo, one); - __m256i sum_hi = _mm256_madd_epi16(sq_hi, one); - - acc = _mm256_add_epi32(acc, sum_lo); - acc = _mm256_add_epi32(acc, sum_hi); - } - - // Horizontal reduction of acc - __m128i acc_lo = _mm256_castsi256_si128(acc); - __m128i acc_hi = _mm256_extracti128_si256(acc, 1); - acc_lo = _mm_add_epi32(acc_lo, acc_hi); - acc_lo = _mm_hadd_epi32(acc_lo, acc_lo); - acc_lo = _mm_hadd_epi32(acc_lo, acc_lo); - int32_t sum = _mm_cvtsi128_si32(acc_lo); - - // Handle remaining dimensions scalar - for (; i < d; i++) { - uint8_t c = code[i / 2]; - uint8_t nibble; - if (i % 2 == 0) { - nibble = c & 0xF; - } else { - nibble = (c >> 4) & 0xF; - } - - int diff; - if (i % 
2 == 0) { - diff = (int)q_lo[i / 2] - (int)nibble; - } else { - diff = (int)q_hi[i / 2] - (int)nibble; - } - sum += diff * diff; - } - - return sum * final_scale_sq; - } - - float compute_code_distance_l2(const uint8_t* code1, const uint8_t* code2) - const { - const size_t d = quant.d; - __m256i acc = _mm256_setzero_si256(); - const __m256i mask_f = _mm256_set1_epi8(0xF); - const __m256i one = _mm256_set1_epi16(1); - - size_t i = 0; - for (; i + 64 <= d; i += 64) { - __m256i c1_256 = - _mm256_loadu_si256((const __m256i*)(code1 + i / 2)); - __m256i c2_256 = - _mm256_loadu_si256((const __m256i*)(code2 + i / 2)); - - __m256i c1_nibbles_lo = _mm256_and_si256(c1_256, mask_f); - __m256i c1_nibbles_hi = - _mm256_and_si256(_mm256_srli_epi16(c1_256, 4), mask_f); - - __m256i c2_nibbles_lo = _mm256_and_si256(c2_256, mask_f); - __m256i c2_nibbles_hi = - _mm256_and_si256(_mm256_srli_epi16(c2_256, 4), mask_f); - - __m256i diff_lo = _mm256_sub_epi8(c1_nibbles_lo, c2_nibbles_lo); - __m256i diff_hi = _mm256_sub_epi8(c1_nibbles_hi, c2_nibbles_hi); - - diff_lo = _mm256_max_epi8( - diff_lo, _mm256_sub_epi8(_mm256_setzero_si256(), diff_lo)); - diff_hi = _mm256_max_epi8( - diff_hi, _mm256_sub_epi8(_mm256_setzero_si256(), diff_hi)); - - __m256i sq_lo = _mm256_maddubs_epi16(diff_lo, diff_lo); - __m256i sq_hi = _mm256_maddubs_epi16(diff_hi, diff_hi); - - __m256i sum_lo = _mm256_madd_epi16(sq_lo, one); - __m256i sum_hi = _mm256_madd_epi16(sq_hi, one); - - acc = _mm256_add_epi32(acc, sum_lo); - acc = _mm256_add_epi32(acc, sum_hi); - } - - __m128i acc_lo = _mm256_castsi256_si128(acc); - __m128i acc_hi = _mm256_extracti128_si256(acc, 1); - acc_lo = _mm_add_epi32(acc_lo, acc_hi); - acc_lo = _mm_hadd_epi32(acc_lo, acc_lo); - acc_lo = _mm_hadd_epi32(acc_lo, acc_lo); - int32_t sum = _mm_cvtsi128_si32(acc_lo); - - for (; i < d; i++) { - uint8_t c1 = code1[i / 2]; - uint8_t c2 = code2[i / 2]; - uint8_t n1, n2; - if (i % 2 == 0) { - n1 = c1 & 0xF; - n2 = c2 & 0xF; - } else { - n1 = (c1 >> 4) & 0xF; - 
n2 = (c2 >> 4) & 0xF; - } - int diff = (int)n1 - (int)n2; - sum += diff * diff; - } - - return sum * final_scale_sq; - } - - float operator()(idx_t i) final { - return compute_distance(nullptr, codes + i * code_size); - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance_l2( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const override final { - return compute_distance(nullptr, code); - } - - void query_to_codes_batch_4( - const uint8_t* __restrict code_0, - const uint8_t* __restrict code_1, - const uint8_t* __restrict code_2, - const uint8_t* __restrict code_3, - float& dis0, - float& dis1, - float& dis2, - float& dis3) const { - const size_t d = quant.d; - const uint8_t* q_lo_ptr = q_lo.data(); - const uint8_t* q_hi_ptr = q_hi.data(); - - __m256i acc0 = _mm256_setzero_si256(); - __m256i acc1 = _mm256_setzero_si256(); - __m256i acc2 = _mm256_setzero_si256(); - __m256i acc3 = _mm256_setzero_si256(); - - const __m256i mask_f = _mm256_set1_epi8(0xF); - const __m256i one = _mm256_set1_epi16(1); - const __m256i zero = _mm256_setzero_si256(); - - size_t i = 0; - // Process 128 dimensions per outer iteration - for (; i + 128 <= d; i += 128) { - // Chunk 0: first 64 dimensions - __m256i q_lo_0 = - _mm256_loadu_si256((const __m256i*)(q_lo_ptr + i / 2)); - __m256i q_hi_0 = - _mm256_loadu_si256((const __m256i*)(q_hi_ptr + i / 2)); - - auto process_chunk_64 = [&](const uint8_t* code, - __m256i& acc, - __m256i q_lo, - __m256i q_hi, - int offset) { - __m256i c = _mm256_loadu_si256( - (const __m256i*)(code + i / 2 + offset)); - __m256i nibbles_lo = _mm256_and_si256(c, mask_f); - __m256i nibbles_hi = - _mm256_and_si256(_mm256_srli_epi16(c, 4), mask_f); - - __m256i diff_lo = _mm256_sub_epi8(q_lo, nibbles_lo); - __m256i diff_hi = _mm256_sub_epi8(q_hi, nibbles_hi); - - diff_lo = _mm256_max_epi8( - diff_lo, _mm256_sub_epi8(zero, diff_lo)); - diff_hi = _mm256_max_epi8( - diff_hi, _mm256_sub_epi8(zero, 
diff_hi)); - - __m256i sq_lo = _mm256_maddubs_epi16(diff_lo, diff_lo); - __m256i sq_hi = _mm256_maddubs_epi16(diff_hi, diff_hi); - - __m256i sum_lo = _mm256_madd_epi16(sq_lo, one); - __m256i sum_hi = _mm256_madd_epi16(sq_hi, one); - - acc = _mm256_add_epi32(acc, sum_lo); - acc = _mm256_add_epi32(acc, sum_hi); - }; - - process_chunk_64(code_0, acc0, q_lo_0, q_hi_0, 0); - process_chunk_64(code_1, acc1, q_lo_0, q_hi_0, 0); - process_chunk_64(code_2, acc2, q_lo_0, q_hi_0, 0); - process_chunk_64(code_3, acc3, q_lo_0, q_hi_0, 0); - - // Chunk 1: next 64 dimensions - __m256i q_lo_1 = - _mm256_loadu_si256((const __m256i*)(q_lo_ptr + i / 2 + 32)); - __m256i q_hi_1 = - _mm256_loadu_si256((const __m256i*)(q_hi_ptr + i / 2 + 32)); - - process_chunk_64(code_0, acc0, q_lo_1, q_hi_1, 32); - process_chunk_64(code_1, acc1, q_lo_1, q_hi_1, 32); - process_chunk_64(code_2, acc2, q_lo_1, q_hi_1, 32); - process_chunk_64(code_3, acc3, q_lo_1, q_hi_1, 32); - } - - // Handle remaining 64-dimensional chunk - if (i + 64 <= d) { - __m256i q_lo_0 = - _mm256_loadu_si256((const __m256i*)(q_lo_ptr + i / 2)); - __m256i q_hi_0 = - _mm256_loadu_si256((const __m256i*)(q_hi_ptr + i / 2)); - - auto process = [&](const uint8_t* code, __m256i& acc) { - __m256i c = _mm256_loadu_si256((const __m256i*)(code + i / 2)); - __m256i nibbles_lo = _mm256_and_si256(c, mask_f); - __m256i nibbles_hi = - _mm256_and_si256(_mm256_srli_epi16(c, 4), mask_f); - - __m256i diff_lo = _mm256_sub_epi8(q_lo_0, nibbles_lo); - __m256i diff_hi = _mm256_sub_epi8(q_hi_0, nibbles_hi); - - diff_lo = _mm256_max_epi8( - diff_lo, _mm256_sub_epi8(zero, diff_lo)); - diff_hi = _mm256_max_epi8( - diff_hi, _mm256_sub_epi8(zero, diff_hi)); - - __m256i sq_lo = _mm256_maddubs_epi16(diff_lo, diff_lo); - __m256i sq_hi = _mm256_maddubs_epi16(diff_hi, diff_hi); - - __m256i sum_lo = _mm256_madd_epi16(sq_lo, one); - __m256i sum_hi = _mm256_madd_epi16(sq_hi, one); - - acc = _mm256_add_epi32(acc, sum_lo); - acc = _mm256_add_epi32(acc, sum_hi); - }; - - 
process(code_0, acc0); - process(code_1, acc1); - process(code_2, acc2); - process(code_3, acc3); - - i += 64; - } - - // Horizontal reductions - auto reduce = [](const __m256i& acc) -> int32_t { - __m128i acc_lo = _mm256_castsi256_si128(acc); - __m128i acc_hi = _mm256_extracti128_si256(acc, 1); - acc_lo = _mm_add_epi32(acc_lo, acc_hi); - acc_lo = _mm_hadd_epi32(acc_lo, acc_lo); - acc_lo = _mm_hadd_epi32(acc_lo, acc_lo); - return _mm_cvtsi128_si32(acc_lo); - }; - - dis0 = reduce(acc0); - dis1 = reduce(acc1); - dis2 = reduce(acc2); - dis3 = reduce(acc3); - - // Handle remaining dimensions scalar - for (; i < d; i++) { - uint8_t nibble_lo = q_lo[i / 2]; - uint8_t nibble_hi = q_hi[i / 2]; - - auto process_scalar = [&](const uint8_t* code, float& dis) { - uint8_t c = code[i / 2]; - uint8_t nibble; - if (i % 2 == 0) { - nibble = c & 0xF; - } else { - nibble = (c >> 4) & 0xF; - } - int diff; - if (i % 2 == 0) { - diff = (int)nibble_lo - (int)nibble; - } else { - diff = (int)nibble_hi - (int)nibble; - } - dis += diff * diff; - }; - - process_scalar(code_0, dis0); - process_scalar(code_1, dis1); - process_scalar(code_2, dis2); - process_scalar(code_3, dis3); - } - - dis0 *= final_scale_sq; - dis1 *= final_scale_sq; - dis2 *= final_scale_sq; - dis3 *= final_scale_sq; - } - - void distances_batch_4( - const idx_t idx0, - const idx_t idx1, - const idx_t idx2, - const idx_t idx3, - float& dis0, - float& dis1, - float& dis2, - float& dis3) override { - query_to_codes_batch_4( - codes + idx0 * code_size, - codes + idx1 * code_size, - codes + idx2 * code_size, - codes + idx3 * code_size, - dis0, - dis1, - dis2, - dis3); - } -}; - -/******************************************************************* - * DistanceComputer: combines a similarity and a quantizer to do - * code-to-vector or code-to-code comparisons - *******************************************************************/ - -template -struct DCTemplate_avx : SQDistanceComputer {}; - -template -struct DCTemplate_avx - : 
public DCTemplate { - DCTemplate_avx(size_t d, const std::vector& trained) - : DCTemplate(d, trained) {} -}; - -FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN -template -struct DCTemplate_avx : SQDistanceComputer { - using Sim = Similarity; - - Quantizer quant; - - DCTemplate_avx(size_t d, const std::vector& trained) - : quant(d, trained) {} - - float compute_distance(const float* x, const uint8_t* code) const { - Similarity sim(x); - sim.begin_8(); - for (size_t i = 0; i < quant.d; i += 8) { - __m256 xi = quant.reconstruct_8_components(code, i); - sim.add_8_components(xi); - } - return sim.result_8(); - } - - float compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - Similarity sim(nullptr); - sim.begin_8(); - for (size_t i = 0; i < quant.d; i += 8) { - __m256 x1 = quant.reconstruct_8_components(code1, i); - __m256 x2 = quant.reconstruct_8_components(code2, i); - sim.add_8_components_2(x1, x2); - } - return sim.result_8(); - } - - void set_query(const float* x) final { - q = x; - } - - /// compute distance of vector i to current query - float operator()(idx_t i) final { - return query_to_code(codes + i * code_size); - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const override final { - return compute_distance(q, code); - } - - void query_to_codes_batch_4( - const uint8_t* __restrict code_0, - const uint8_t* __restrict code_1, - const uint8_t* __restrict code_2, - const uint8_t* __restrict code_3, - float& dis0, - float& dis1, - float& dis2, - float& dis3) const override final { - Similarity sim0(q); - Similarity sim1(q); - Similarity sim2(q); - Similarity sim3(q); - - sim0.begin_8(); - sim1.begin_8(); - sim2.begin_8(); - sim3.begin_8(); - - for (size_t i = 0; i < quant.d; i += 8) { - __m256 xi0 = quant.reconstruct_8_components(code_0, i); - __m256 xi1 = quant.reconstruct_8_components(code_1, i); - __m256 xi2 = 
quant.reconstruct_8_components(code_2, i); - __m256 xi3 = quant.reconstruct_8_components(code_3, i); - sim0.add_8_components(xi0); - sim1.add_8_components(xi1); - sim2.add_8_components(xi2); - sim3.add_8_components(xi3); - } - - dis0 = sim0.result_8(); - dis1 = sim1.result_8(); - dis2 = sim2.result_8(); - dis3 = sim3.result_8(); - } -}; -FAISS_PRAGMA_IMPRECISE_FUNCTION_END - -/******************************************************************* - * DistanceComputerByte: computes distances in the integer domain - *******************************************************************/ - -template -struct DistanceComputerByte_avx : SQDistanceComputer {}; - -template -struct DistanceComputerByte_avx - : public DistanceComputerByte { - DistanceComputerByte_avx(int d, const std::vector& unused) - : DistanceComputerByte(d, unused) {} -}; - -template -struct DistanceComputerByte_avx : SQDistanceComputer { - using Sim = Similarity; - - int d; - std::vector tmp; - - DistanceComputerByte_avx(int d, const std::vector&) : d(d), tmp(d) {} - - int compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - // __m256i accu = _mm256_setzero_ps (); - __m256i accu = _mm256_setzero_si256(); - for (int i = 0; i < d; i += 16) { - // load 16 bytes, convert to 16 uint16_t - __m256i c1 = _mm256_cvtepu8_epi16( - _mm_loadu_si128((__m128i*)(code1 + i))); - __m256i c2 = _mm256_cvtepu8_epi16( - _mm_loadu_si128((__m128i*)(code2 + i))); - __m256i prod32; - if (Sim::metric_type == METRIC_INNER_PRODUCT) { - prod32 = _mm256_madd_epi16(c1, c2); - } else { - __m256i diff = _mm256_sub_epi16(c1, c2); - prod32 = _mm256_madd_epi16(diff, diff); - } - accu = _mm256_add_epi32(accu, prod32); - } - __m128i sum = _mm256_extractf128_si256(accu, 0); - sum = _mm_add_epi32(sum, _mm256_extractf128_si256(accu, 1)); - sum = _mm_hadd_epi32(sum, sum); - sum = _mm_hadd_epi32(sum, sum); - return _mm_cvtsi128_si32(sum); - } - - void set_query(const float* x) final { - /* - for (int i = 0; i < d; i += 8) { - 
__m256 xi = _mm256_loadu_ps (x + i); - __m256i ci = _mm256_cvtps_epi32(xi); - */ - for (int i = 0; i < d; i++) { - tmp[i] = int(x[i]); - } - } - - int compute_distance(const float* x, const uint8_t* code) { - set_query(x); - return compute_code_distance(tmp.data(), code); - } - - /// compute distance of vector i to current query - float operator()(idx_t i) final { - return query_to_code(codes + i * code_size); - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const override final { - return compute_code_distance(tmp.data(), code); - } -}; - -/******************************************************************* - * select_distance_computer: runtime selection of template - * specialization - *******************************************************************/ - -template -SQDistanceComputer* select_distance_computer_avx( - QuantizerType qtype, - size_t d, - const std::vector& trained) { - constexpr int SIMDWIDTH = Sim::simdwidth; - switch (qtype) { - case QuantizerType::QT_8bit_uniform: - return new DCTemplate_avx< - QuantizerTemplate_avx, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_4bit_uniform: - return new DistanceComputerSQ4UByte_avx(d, trained); - - case QuantizerType::QT_8bit: - return new DCTemplate_avx< - QuantizerTemplate_avx, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_6bit: - return new DCTemplate_avx< - QuantizerTemplate_avx, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_4bit: - return new DCTemplate_avx< - QuantizerTemplate_avx, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_fp16: - return new DCTemplate_avx< - QuantizerFP16_avx, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_bf16: - return new DCTemplate_avx< - QuantizerBF16_avx, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_8bit_direct: - if (d % 16 == 0) { - return new 
DistanceComputerByte_avx(d, trained); - } else { - return new DCTemplate_avx< - Quantizer8bitDirect_avx, - Sim, - SIMDWIDTH>(d, trained); - } - - case ScalarQuantizer::QT_8bit_direct_signed: - return new DCTemplate_avx< - Quantizer8bitDirectSigned_avx, - Sim, - SIMDWIDTH>(d, trained); - } - FAISS_THROW_MSG("unknown qtype"); - return nullptr; -} - -template -InvertedListScanner* sel2_InvertedListScanner_avx( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - return sel2_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); -} - -template -InvertedListScanner* sel12_InvertedListScanner_avx( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - constexpr int SIMDWIDTH = Similarity::simdwidth; - using QuantizerClass = QuantizerTemplate_avx; - using DCClass = DCTemplate_avx; - return sel2_InvertedListScanner_avx( - sq, quantizer, store_pairs, sel, r); -} - -template -InvertedListScanner* sel1_InvertedListScanner_avx( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - constexpr int SIMDWIDTH = Similarity::simdwidth; - switch (sq->qtype) { - case QuantizerType::QT_8bit_uniform: - return sel12_InvertedListScanner_avx< - Similarity, - Codec8bit_avx, - QuantizerTemplateScaling::UNIFORM>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_4bit_uniform: - return sel12_InvertedListScanner_avx< - Similarity, - Codec4bit_avx, - QuantizerTemplateScaling::UNIFORM>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_8bit: - return sel12_InvertedListScanner_avx< - Similarity, - Codec8bit_avx, - QuantizerTemplateScaling::NON_UNIFORM>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_4bit: - return sel12_InvertedListScanner_avx< - Similarity, - Codec4bit_avx, - QuantizerTemplateScaling::NON_UNIFORM>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_6bit: 
- return sel12_InvertedListScanner_avx< - Similarity, - Codec6bit_avx, - QuantizerTemplateScaling::NON_UNIFORM>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_fp16: - return sel2_InvertedListScanner_avx, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_bf16: - return sel2_InvertedListScanner_avx, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_8bit_direct: - if (sq->d % 16 == 0) { - return sel2_InvertedListScanner_avx< - DistanceComputerByte_avx>( - sq, quantizer, store_pairs, sel, r); - } else { - return sel2_InvertedListScanner_avx, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - } - case ScalarQuantizer::QT_8bit_direct_signed: - return sel2_InvertedListScanner_avx, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - } - - FAISS_THROW_MSG("unknown qtype"); - return nullptr; -} - -template -InvertedListScanner* sel0_InvertedListScanner_avx( - MetricType mt, - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool by_residual) { - if (mt == METRIC_L2) { - return sel1_InvertedListScanner_avx>( - sq, quantizer, store_pairs, sel, by_residual); - } else if (mt == METRIC_INNER_PRODUCT) { - return sel1_InvertedListScanner_avx>( - sq, quantizer, store_pairs, sel, by_residual); - } else { - FAISS_THROW_MSG("unsupported metric type"); - } -} - -} -} -} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec_avx512.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec_avx512.h deleted file mode 100644 index 335bd0222..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec_avx512.h +++ /dev/null @@ -1,1518 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -#include -#include -#include -#include -#include - -#include -#include -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -using QuantizerType = ScalarQuantizer::QuantizerType; -using RangeStat = ScalarQuantizer::RangeStat; -using SQDistanceComputer = ScalarQuantizer::SQDistanceComputer; -using SQuantizer = ScalarQuantizer::SQuantizer; - -/******************************************************************* - * Codec: converts between values in [0, 1] and an index in a code - * array. The "i" parameter is the vector component index (not byte - * index). - */ - -struct Codec8bit_avx512 : public Codec8bit_avx { - static FAISS_ALWAYS_INLINE __m512 - decode_16_components(const uint8_t* code, int i) { - const __m128i c8 = _mm_loadu_si128((const __m128i_u*)(code + i)); - const __m512i i32 = _mm512_cvtepu8_epi32(c8); - const __m512 f8 = _mm512_cvtepi32_ps(i32); - const __m512 half_one_255 = _mm512_set1_ps(0.5f / 255.f); - const __m512 one_255 = _mm512_set1_ps(1.f / 255.f); - return _mm512_fmadd_ps(f8, one_255, half_one_255); - } -}; - -struct Codec4bit_avx512 : public Codec4bit_avx { - static FAISS_ALWAYS_INLINE __m512 - decode_16_components(const uint8_t* code, int i) { - uint64_t c8 = *(uint64_t*)(code + (i >> 1)); - uint64_t mask = 0x0f0f0f0f0f0f0f0f; - uint64_t c8ev = c8 & mask; - uint64_t c8od = (c8 >> 4) & mask; - - // the 8 lower bytes of c8 contain the values - __m128i c16 = - _mm_unpacklo_epi8(_mm_set1_epi64x(c8ev), _mm_set1_epi64x(c8od)); - __m256i c8lo = _mm256_cvtepu8_epi32(c16); - __m256i c8hi = _mm256_cvtepu8_epi32(_mm_srli_si128(c16, 8)); - __m512i i16 = _mm512_castsi256_si512(c8lo); - i16 = _mm512_inserti32x8(i16, c8hi, 1); - __m512 f16 = _mm512_cvtepi32_ps(i16); - const __m512 half_one_255 = _mm512_set1_ps(0.5f / 15.f); - const __m512 one_255 = _mm512_set1_ps(1.f / 15.f); - return _mm512_fmadd_ps(f16, one_255, half_one_255); - } - - static FAISS_ALWAYS_INLINE __m512i - decode_16_components_int(const 
uint8_t* code, int i) { - __m128i v8 = _mm_loadl_epi64((const __m128i*)(code + (i >> 1))); - __m128i v16 = _mm_unpacklo_epi8(v8, v8); - __m512i v512 = _mm512_cvtepu8_epi32(v16); - - // Shift right: 0 for even, 4 for odd - const __m512i shift_counts = _mm512_setr_epi32( - 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4); - v512 = _mm512_srlv_epi32(v512, shift_counts); - return _mm512_and_si512(v512, _mm512_set1_epi32(0xF)); - } -}; - -struct Codec6bit_avx512 : public Codec6bit_avx { - // TODO: can be optimized - static FAISS_ALWAYS_INLINE __m512 - decode_16_components(const uint8_t* code, int i) { - /* - // todo aguzhva: the following piece of code is very fast - // for Intel chips. AMD ones will be very slow unless Zen3+ - - const uint16_t* data16_0 = (const uint16_t*)(code + (i >> 2) * 3); - const uint64_t* data64_0 = (const uint64_t*)data16_0; - const uint64_t val_0 = *data64_0; - const uint64_t vext_0 = _pdep_u64(val_0, 0x3F3F3F3F3F3F3F3FULL); - - const uint16_t* data16_1 = data16_0 + 3; - const uint32_t* data32_1 = (const uint32_t*)data16_1; - const uint64_t val_1 = *data32_1 + ((uint64_t)data16_1[2] << 32); - const uint64_t vext_1 = _pdep_u64(val_1, 0x3F3F3F3F3F3F3F3FULL); - - const __m128i i8 = _mm_set_epi64x(vext_1, vext_0); - const __m512i i32 = _mm512_cvtepi8_epi32(i8); - const __m512 f8 = _mm512_cvtepi32_ps(i32); - const __m512 half_one_255 = _mm512_set1_ps(0.5f / 63.f); - const __m512 one_255 = _mm512_set1_ps(1.f / 63.f); - return _mm512_fmadd_ps(f8, one_255, half_one_255); - */ - - /* - // todo aguzhva: another candidate for pdep, which might be faster - const uint16_t* data16_0 = (const uint16_t*)(code + (i >> 2) * 3); - const uint64_t* data64_0 = (const uint64_t*)data16_0; - const uint64_t val_0 = *data64_0; - const uint64_t vext_0 = _pdep_u64(val_0, 0x3F3F3F3F3F3F3F3FULL); - - const uint32_t* data32_1 = (const uint32_t*)data16_0; - const uint64_t val_1 = (val_0 >> 48) | (((uint64_t)data32_1[1]) << 16); - const uint64_t vext_1 = _pdep_u64(val_1, 
0x3F3F3F3F3F3F3F3FULL); - - const __m128i i8 = _mm_set_epi64x(vext_1, vext_0); - const __m512i i32 = _mm512_cvtepi8_epi32(i8); - const __m512 f8 = _mm512_cvtepi32_ps(i32); - const __m512 half_one_255 = _mm512_set1_ps(0.5f / 63.f); - const __m512 one_255 = _mm512_set1_ps(1.f / 63.f); - return _mm512_fmadd_ps(f8, one_255, half_one_255); - */ - - // pure AVX512 implementation, slower than pdep one, but has no problems - // for AMD - - // clang-format off - - // 16 components, 16x6 bit=12 bytes - const __m128i bit_6v = - _mm_maskz_loadu_epi8(0b0000111111111111, code + (i >> 2) * 3); - const __m256i bit_6v_256 = _mm256_broadcast_i32x4(bit_6v); - - // 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F - // 00 01 02 03 - const __m256i shuffle_mask = _mm256_setr_epi16( - 0xFF00, 0x0100, 0x0201, 0xFF02, - 0xFF03, 0x0403, 0x0504, 0xFF05, - 0xFF06, 0x0706, 0x0807, 0xFF08, - 0xFF09, 0x0A09, 0x0B0A, 0xFF0B); - const __m256i shuffled = _mm256_shuffle_epi8(bit_6v_256, shuffle_mask); - - // 0: xxxxxxxx xx543210 - // 1: xxxx5432 10xxxxxx - // 2: xxxxxx54 3210xxxx - // 3: xxxxxxxx 543210xx - const __m256i shift_right_v = _mm256_setr_epi16( - 0x0U, 0x6U, 0x4U, 0x2U, - 0x0U, 0x6U, 0x4U, 0x2U, - 0x0U, 0x6U, 0x4U, 0x2U, - 0x0U, 0x6U, 0x4U, 0x2U); - __m256i shuffled_shifted = _mm256_srlv_epi16(shuffled, shift_right_v); - - // remove unneeded bits - shuffled_shifted = - _mm256_and_si256(shuffled_shifted, _mm256_set1_epi16(0x003F)); - - // scale - const __m512 f8 = - _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(shuffled_shifted)); - const __m512 half_one_255 = _mm512_set1_ps(0.5f / 63.f); - const __m512 one_255 = _mm512_set1_ps(1.f / 63.f); - return _mm512_fmadd_ps(f8, one_255, half_one_255); - - // clang-format on - } -}; - -/******************************************************************* - * Quantizer: normalizes scalar vector components, then passes them - * through a codec - *******************************************************************/ - -template -struct QuantizerTemplate_avx512 
{}; - -template -struct QuantizerTemplate_avx512 - : public QuantizerTemplate_avx { - QuantizerTemplate_avx512(size_t d, const std::vector& trained) - : QuantizerTemplate_avx(d, trained) {} -}; - -template -struct QuantizerTemplate_avx512 - : public QuantizerTemplate_avx { - QuantizerTemplate_avx512(size_t d, const std::vector& trained) - : QuantizerTemplate_avx(d, trained) {} -}; - -template -struct QuantizerTemplate_avx512 - : public QuantizerTemplate_avx { - QuantizerTemplate_avx512(size_t d, const std::vector& trained) - : QuantizerTemplate_avx(d, trained) {} - - FAISS_ALWAYS_INLINE __m512 - reconstruct_16_components(const uint8_t* code, int i) const { - __m512 xi = Codec::decode_16_components(code, i); - return _mm512_fmadd_ps( - xi, _mm512_set1_ps(this->vdiff), _mm512_set1_ps(this->vmin)); - } -}; - -template <> -struct QuantizerTemplate_avx512< - Codec4bit_avx512, - QuantizerTemplateScaling::UNIFORM, - 16> - : public QuantizerTemplate_avx< - Codec4bit_avx512, - QuantizerTemplateScaling::UNIFORM, - 8> { - float final_scale; - float final_bias; - - QuantizerTemplate_avx512(size_t d, const std::vector& trained) - : QuantizerTemplate_avx< - Codec4bit_avx512, - QuantizerTemplateScaling::UNIFORM, - 8>(d, trained) { - final_scale = this->vdiff / 15.0f; - final_bias = this->vmin + this->vdiff * 0.5f / 15.0f; - } - - FAISS_ALWAYS_INLINE __m512 - reconstruct_16_components(const uint8_t* code, int i) const { - __m512i nibbles = Codec4bit_avx512::decode_16_components_int(code, i); - __m512 nibbles_f = _mm512_cvtepi32_ps(nibbles); - - return _mm512_fmadd_ps( - nibbles_f, - _mm512_set1_ps(final_scale), - _mm512_set1_ps(final_bias)); - } -}; - -template -struct QuantizerTemplate_avx512 - : public QuantizerTemplate_avx { - QuantizerTemplate_avx512(size_t d, const std::vector& trained) - : QuantizerTemplate_avx(d, trained) {} -}; - -template -struct QuantizerTemplate_avx512 - : public QuantizerTemplate_avx { - QuantizerTemplate_avx512(size_t d, const std::vector& trained) - 
: QuantizerTemplate_avx(d, trained) {} -}; - -template -struct QuantizerTemplate_avx512 - : public QuantizerTemplate_avx { - QuantizerTemplate_avx512(size_t d, const std::vector& trained) - : QuantizerTemplate_avx(d, trained) {} - - FAISS_ALWAYS_INLINE __m512 - reconstruct_16_components(const uint8_t* code, int i) const { - __m512 xi = Codec::decode_16_components(code, i); - return _mm512_fmadd_ps( - xi, - _mm512_loadu_ps(this->vdiff + i), - _mm512_loadu_ps(this->vmin + i)); - } -}; - -/******************************************************************* - * FP16 quantizer - *******************************************************************/ - -template -struct QuantizerFP16_avx512 {}; - -template <> -struct QuantizerFP16_avx512<1> : public QuantizerFP16_avx<1> { - QuantizerFP16_avx512(size_t d, const std::vector& unused) - : QuantizerFP16_avx<1>(d, unused) {} -}; - -template <> -struct QuantizerFP16_avx512<8> : public QuantizerFP16_avx<8> { - QuantizerFP16_avx512(size_t d, const std::vector& trained) - : QuantizerFP16_avx<8>(d, trained) {} -}; - -template <> -struct QuantizerFP16_avx512<16> : public QuantizerFP16_avx<8> { - QuantizerFP16_avx512(size_t d, const std::vector& trained) - : QuantizerFP16_avx<8>(d, trained) {} - - FAISS_ALWAYS_INLINE __m512 - reconstruct_16_components(const uint8_t* code, int i) const { - __m256i codei = _mm256_loadu_si256((const __m256i*)(code + 2 * i)); - return _mm512_cvtph_ps(codei); - } -}; - -/******************************************************************* - * BF16 quantizer - *******************************************************************/ - -template -struct QuantizerBF16_avx512 {}; - -template <> -struct QuantizerBF16_avx512<1> : public QuantizerBF16_avx<1> { - QuantizerBF16_avx512(size_t d, const std::vector& unused) - : QuantizerBF16_avx<1>(d, unused) {} -}; - -template <> -struct QuantizerBF16_avx512<8> : public QuantizerBF16_avx<8> { - QuantizerBF16_avx512(size_t d, const std::vector& trained) - : 
QuantizerBF16_avx<8>(d, trained) {} -}; - -template <> -struct QuantizerBF16_avx512<16> : public QuantizerBF16_avx<8> { - QuantizerBF16_avx512(size_t d, const std::vector& trained) - : QuantizerBF16_avx<8>(d, trained) {} - - FAISS_ALWAYS_INLINE __m512 - reconstruct_16_components(const uint8_t* code, int i) const { - __m256i code_256i = _mm256_loadu_si256((const __m256i*)(code + 2 * i)); - __m512i code_512i = _mm512_cvtepu16_epi32(code_256i); - code_512i = _mm512_slli_epi32(code_512i, 16); - return _mm512_castsi512_ps(code_512i); - } -}; - -/******************************************************************* - * 8bit_direct quantizer - *******************************************************************/ - -template -struct Quantizer8bitDirect_avx512 {}; - -template <> -struct Quantizer8bitDirect_avx512<1> : public Quantizer8bitDirect_avx<1> { - Quantizer8bitDirect_avx512(size_t d, const std::vector& unused) - : Quantizer8bitDirect_avx<1>(d, unused) {} -}; - -template <> -struct Quantizer8bitDirect_avx512<8> : public Quantizer8bitDirect_avx<8> { - Quantizer8bitDirect_avx512(size_t d, const std::vector& trained) - : Quantizer8bitDirect_avx<8>(d, trained) {} -}; - -template <> -struct Quantizer8bitDirect_avx512<16> : public Quantizer8bitDirect_avx<8> { - Quantizer8bitDirect_avx512(size_t d, const std::vector& trained) - : Quantizer8bitDirect_avx<8>(d, trained) {} - - FAISS_ALWAYS_INLINE __m512 - reconstruct_16_components(const uint8_t* code, int i) const { - __m128i x16 = _mm_loadu_si128((__m128i*)(code + i)); // 16 * int8 - __m512i y16 = _mm512_cvtepu8_epi32(x16); // 16 * int32 - return _mm512_cvtepi32_ps(y16); // 16 * float32 - } -}; - -/******************************************************************* - * 8bit_direct_signed quantizer - *******************************************************************/ - -template -struct Quantizer8bitDirectSigned_avx512 {}; - -template <> -struct Quantizer8bitDirectSigned_avx512<1> - : public Quantizer8bitDirectSigned_avx<1> { - 
Quantizer8bitDirectSigned_avx512(size_t d, const std::vector& unused) - : Quantizer8bitDirectSigned_avx<1>(d, unused) {} -}; - -template <> -struct Quantizer8bitDirectSigned_avx512<8> - : public Quantizer8bitDirectSigned_avx<8> { - Quantizer8bitDirectSigned_avx512( - size_t d, - const std::vector& trained) - : Quantizer8bitDirectSigned_avx<8>(d, trained) {} -}; - -template <> -struct Quantizer8bitDirectSigned_avx512<16> - : public Quantizer8bitDirectSigned_avx<8> { - Quantizer8bitDirectSigned_avx512( - size_t d, - const std::vector& trained) - : Quantizer8bitDirectSigned_avx<8>(d, trained) {} - - FAISS_ALWAYS_INLINE __m512 - reconstruct_16_components(const uint8_t* code, int i) const { - __m128i x16 = _mm_loadu_si128((__m128i*)(code + i)); // 16 * int8 - __m512i y16 = _mm512_cvtepu8_epi32(x16); // 16 * int32 - __m512i c16 = _mm512_set1_epi32(128); - __m512i z16 = _mm512_sub_epi32(y16, c16); // subtract 128 from all lanes - return _mm512_cvtepi32_ps(z16); // 16 * float32 - } -}; - -template -SQuantizer* select_quantizer_1_avx512( - QuantizerType qtype, - size_t d, - const std::vector& trained) { - switch (qtype) { - case QuantizerType::QT_8bit: - return new QuantizerTemplate_avx512< - Codec8bit_avx512, - QuantizerTemplateScaling::NON_UNIFORM, - SIMDWIDTH>(d, trained); - case QuantizerType::QT_6bit: - return new QuantizerTemplate_avx512< - Codec6bit_avx512, - QuantizerTemplateScaling::NON_UNIFORM, - SIMDWIDTH>(d, trained); - case QuantizerType::QT_4bit: - return new QuantizerTemplate_avx512< - Codec4bit_avx512, - QuantizerTemplateScaling::NON_UNIFORM, - SIMDWIDTH>(d, trained); - case QuantizerType::QT_8bit_uniform: - return new QuantizerTemplate_avx512< - Codec8bit_avx512, - QuantizerTemplateScaling::UNIFORM, - SIMDWIDTH>(d, trained); - case QuantizerType::QT_4bit_uniform: - return new QuantizerTemplate_avx512< - Codec4bit_avx512, - QuantizerTemplateScaling::UNIFORM, - SIMDWIDTH>(d, trained); - case QuantizerType::QT_fp16: - return new QuantizerFP16_avx512(d, 
trained); - case QuantizerType::QT_bf16: - return new QuantizerBF16_avx512(d, trained); - case QuantizerType::QT_8bit_direct: - return new Quantizer8bitDirect_avx512(d, trained); - case QuantizerType::QT_8bit_direct_signed: - return new Quantizer8bitDirectSigned_avx512(d, trained); - case QuantizerType::QT_1bit_direct: - // todo: add more SIMDWIDTH support for avx512 if needed - return new Quantizer1bitDirect(d, trained); - } - FAISS_THROW_MSG("unknown qtype"); -} - -/******************************************************************* - * Similarity: gets vector components and computes a similarity wrt. a - * query vector stored in the object. The data fields just encapsulate - * an accumulator. - */ - -template -struct SimilarityL2_avx512 {}; - -template <> -struct SimilarityL2_avx512<1> : public SimilarityL2_avx<1> { - static constexpr int simdwidth = 1; - static constexpr MetricType metric_type = METRIC_L2; - - explicit SimilarityL2_avx512(const float* y) : SimilarityL2_avx<1>(y) {} -}; - -template <> -struct SimilarityL2_avx512<8> : public SimilarityL2_avx<8> { - static constexpr int simdwidth = 8; - static constexpr MetricType metric_type = METRIC_L2; - - explicit SimilarityL2_avx512(const float* y) : SimilarityL2_avx<8>(y) {} -}; - -template <> -struct SimilarityL2_avx512<16> { - static constexpr int simdwidth = 16; - static constexpr MetricType metric_type = METRIC_L2; - - const float *y, *yi; - - explicit SimilarityL2_avx512(const float* y) : y(y) {} - __m512 accu16; - - FAISS_ALWAYS_INLINE void begin_16() { - accu16 = _mm512_setzero_ps(); - yi = y; - } - - FAISS_ALWAYS_INLINE void add_16_components(__m512 x) { - __m512 yiv = _mm512_loadu_ps(yi); - yi += 16; - __m512 tmp = _mm512_sub_ps(yiv, x); - accu16 = _mm512_fmadd_ps(tmp, tmp, accu16); - } - - FAISS_ALWAYS_INLINE void add_16_components_2(__m512 x, __m512 y_2) { - __m512 tmp = _mm512_sub_ps(y_2, x); - accu16 = _mm512_fmadd_ps(tmp, tmp, accu16); - } - - FAISS_ALWAYS_INLINE float result_16() { - return 
_mm512_reduce_add_ps(accu16); - } -}; - -template -struct SimilarityIP_avx512 {}; - -template <> -struct SimilarityIP_avx512<1> : public SimilarityIP_avx<1> { - static constexpr int simdwidth = 1; - static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; - - explicit SimilarityIP_avx512(const float* y) : SimilarityIP_avx<1>(y) {} -}; - -template <> -struct SimilarityIP_avx512<8> : public SimilarityIP_avx<8> { - static constexpr int simdwidth = 8; - static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; - - explicit SimilarityIP_avx512(const float* y) : SimilarityIP_avx<8>(y) {} -}; - -template <> -struct SimilarityIP_avx512<16> { - static constexpr int simdwidth = 16; - static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; - - const float *y, *yi; - - float accu; - - explicit SimilarityIP_avx512(const float* y) : y(y) {} - - __m512 accu16; - - FAISS_ALWAYS_INLINE void begin_16() { - accu16 = _mm512_setzero_ps(); - yi = y; - } - - FAISS_ALWAYS_INLINE void add_16_components(__m512 x) { - __m512 yiv = _mm512_loadu_ps(yi); - yi += 16; - accu16 = _mm512_fmadd_ps(yiv, x, accu16); - } - - FAISS_ALWAYS_INLINE void add_16_components_2(__m512 x1, __m512 x2) { - accu16 = _mm512_fmadd_ps(x1, x2, accu16); - } - - FAISS_ALWAYS_INLINE float result_16() { - return _mm512_reduce_add_ps(accu16); - } -}; - -/******************************************************************* - * DistanceComputer: combines a similarity and a quantizer to do - * code-to-vector or code-to-code comparisons - *******************************************************************/ - -template -struct DCTemplate_avx512 : SQDistanceComputer {}; - -template -struct DCTemplate_avx512 - : public DCTemplate_avx { - DCTemplate_avx512(size_t d, const std::vector& trained) - : DCTemplate_avx(d, trained) {} -}; - -template -struct DCTemplate_avx512 - : public DCTemplate_avx { - DCTemplate_avx512(size_t d, const std::vector& trained) - : DCTemplate_avx(d, trained) {} -}; - -template -struct 
DCTemplate_avx512 : SQDistanceComputer { - using Sim = Similarity; - - Quantizer quant; - - DCTemplate_avx512(size_t d, const std::vector& trained) - : quant(d, trained) {} - - float compute_distance(const float* x, const uint8_t* code) const { - Similarity sim(x); - sim.begin_16(); - for (size_t i = 0; i < quant.d; i += 16) { - __m512 xi = quant.reconstruct_16_components(code, i); - sim.add_16_components(xi); - } - return sim.result_16(); - } - - float compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - Similarity sim(nullptr); - sim.begin_16(); - for (size_t i = 0; i < quant.d; i += 16) { - __m512 x1 = quant.reconstruct_16_components(code1, i); - __m512 x2 = quant.reconstruct_16_components(code2, i); - sim.add_16_components_2(x1, x2); - } - return sim.result_16(); - } - - void set_query(const float* x) final { - q = x; - } - - /// compute distance of vector i to current query - float operator()(idx_t i) final { - return compute_distance(q, codes + i * code_size); - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const override final { - return compute_distance(q, code); - } - - void query_to_codes_batch_4( - const uint8_t* __restrict code_0, - const uint8_t* __restrict code_1, - const uint8_t* __restrict code_2, - const uint8_t* __restrict code_3, - float& dis0, - float& dis1, - float& dis2, - float& dis3) const override final { - Similarity sim0(q); - Similarity sim1(q); - Similarity sim2(q); - Similarity sim3(q); - - sim0.begin_16(); - sim1.begin_16(); - sim2.begin_16(); - sim3.begin_16(); - - for (size_t i = 0; i < quant.d; i += 16) { - __m512 xi0 = quant.reconstruct_16_components(code_0, i); - __m512 xi1 = quant.reconstruct_16_components(code_1, i); - __m512 xi2 = quant.reconstruct_16_components(code_2, i); - __m512 xi3 = quant.reconstruct_16_components(code_3, i); - sim0.add_16_components(xi0); - 
sim1.add_16_components(xi1); - sim2.add_16_components(xi2); - sim3.add_16_components(xi3); - } - - dis0 = sim0.result_16(); - dis1 = sim1.result_16(); - dis2 = sim2.result_16(); - dis3 = sim3.result_16(); - } - - void distances_batch_4( - const idx_t idx0, - const idx_t idx1, - const idx_t idx2, - const idx_t idx3, - float& dis0, - float& dis1, - float& dis2, - float& dis3) override { - query_to_codes_batch_4( - codes + idx0 * code_size, - codes + idx1 * code_size, - codes + idx2 * code_size, - codes + idx3 * code_size, - dis0, - dis1, - dis2, - dis3); - } -}; - -template -struct DistanceComputerSQ4UByte_avx512 : SQDistanceComputer { - using Quantizer = QuantizerTemplate_avx512< - Codec4bit_avx512, - QuantizerTemplateScaling::UNIFORM, - 16>; - using Sim = Similarity; - - Quantizer quant; - std::vector q_lo; - std::vector q_hi; - float final_scale_sq; - - DistanceComputerSQ4UByte_avx512(size_t d, const std::vector& trained) - : quant(d, trained), - q_lo((d + 1) / 2 + 64, 0), - q_hi((d + 1) / 2 + 64, 0) { - final_scale_sq = quant.final_scale * quant.final_scale; - } - - void set_query(const float* x) final { - float inv_scale = 1.0f / quant.final_scale; - float offset = quant.vmin; - - for (size_t i = 0; i < quant.d; i++) { - float val = (x[i] - offset) * inv_scale; - int code = (int)std::floor(val); - if (code < 0) - code = 0; - if (code > 15) - code = 15; - - if (i % 2 == 0) { - q_lo[i / 2] = (uint8_t)code; - } else { - q_hi[i / 2] = (uint8_t)code; - } - } - } - - // Only computes L2 distance - float compute_distance(const float* x, const uint8_t* code) const { - return compute_distance_l2(code); - } - - float compute_distance_l2(const uint8_t* code) const { - __m512i acc = _mm512_setzero_si512(); - const size_t d = quant.d; - const __m512i mask_f = _mm512_set1_epi8(0xF); - const __m512i one = _mm512_set1_epi16(1); - const uint8_t* q_lo_ptr = q_lo.data(); - const uint8_t* q_hi_ptr = q_hi.data(); - - size_t i = 0; - for (; i + 128 <= d; i += 128) { - __m512i c512 = 
_mm512_loadu_si512((const __m512i*)(code + i / 2)); - - __m512i nibbles_lo = _mm512_and_si512(c512, mask_f); - __m512i nibbles_hi = - _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f); - - __m512i q_lo_vec = _mm512_loadu_si512(q_lo_ptr + i / 2); - __m512i q_hi_vec = _mm512_loadu_si512(q_hi_ptr + i / 2); - - __m512i diff_lo = _mm512_sub_epi8(q_lo_vec, nibbles_lo); - __m512i diff_hi = _mm512_sub_epi8(q_hi_vec, nibbles_hi); - - diff_lo = _mm512_abs_epi8(diff_lo); - diff_hi = _mm512_abs_epi8(diff_hi); - - __m512i sq_lo = _mm512_maddubs_epi16(diff_lo, diff_lo); - __m512i sq_hi = _mm512_maddubs_epi16(diff_hi, diff_hi); - - __m512i sq_sum = _mm512_add_epi16(sq_lo, sq_hi); - __m512i sum_32 = _mm512_madd_epi16(sq_sum, one); - - acc = _mm512_add_epi32(acc, sum_32); - } - - // Handle remaining dimensions - if (i < d) { - size_t rem = d - i; - uint64_t mask_even = - (rem + 1) / 2 >= 64 ? -1ULL : (1ULL << ((rem + 1) / 2)) - 1; - uint64_t mask_odd = rem / 2 >= 64 ? -1ULL : (1ULL << (rem / 2)) - 1; - - __m512i c512 = _mm512_maskz_loadu_epi8(mask_even, code + i / 2); - - __m512i nibbles_lo = _mm512_and_si512(c512, mask_f); - __m512i nibbles_hi = - _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f); - - __m512i q_lo_vec = - _mm512_maskz_loadu_epi8(mask_even, q_lo_ptr + i / 2); - __m512i q_hi_vec = - _mm512_maskz_loadu_epi8(mask_odd, q_hi_ptr + i / 2); - - __m512i mask_odd_vec = _mm512_movm_epi8(mask_odd); - nibbles_hi = _mm512_and_si512(nibbles_hi, mask_odd_vec); - - __m512i diff_lo = _mm512_sub_epi8(q_lo_vec, nibbles_lo); - __m512i diff_hi = _mm512_sub_epi8(q_hi_vec, nibbles_hi); - - diff_lo = _mm512_abs_epi8(diff_lo); - diff_hi = _mm512_abs_epi8(diff_hi); - - __m512i sq_lo = _mm512_maddubs_epi16(diff_lo, diff_lo); - __m512i sq_hi = _mm512_maddubs_epi16(diff_hi, diff_hi); - - __m512i sq_sum = _mm512_add_epi16(sq_lo, sq_hi); - __m512i sum_32 = _mm512_madd_epi16(sq_sum, one); - - acc = _mm512_add_epi32(acc, sum_32); - } - - int32_t sum = _mm512_reduce_add_epi32(acc); - return 
sum * final_scale_sq; - } - - float compute_code_distance_l2(const uint8_t* code1, const uint8_t* code2) - const { - __m512i acc = _mm512_setzero_si512(); - const size_t d = quant.d; - - size_t i = 0; - for (; i + 128 <= d; i += 128) { - __m512i c1_512 = - _mm512_loadu_si512((const __m512i*)(code1 + i / 2)); - __m512i c2_512 = - _mm512_loadu_si512((const __m512i*)(code2 + i / 2)); - - __m512i c1_nibbles_lo = - _mm512_and_si512(c1_512, _mm512_set1_epi8(0xF)); - __m512i c1_nibbles_hi = _mm512_and_si512( - _mm512_srli_epi16(c1_512, 4), _mm512_set1_epi8(0xF)); - - __m512i c2_nibbles_lo = - _mm512_and_si512(c2_512, _mm512_set1_epi8(0xF)); - __m512i c2_nibbles_hi = _mm512_and_si512( - _mm512_srli_epi16(c2_512, 4), _mm512_set1_epi8(0xF)); - - __m512i diff_lo = _mm512_sub_epi8(c1_nibbles_lo, c2_nibbles_lo); - __m512i diff_hi = _mm512_sub_epi8(c1_nibbles_hi, c2_nibbles_hi); - - diff_lo = _mm512_abs_epi8(diff_lo); - diff_hi = _mm512_abs_epi8(diff_hi); - - __m512i sq_lo = _mm512_maddubs_epi16(diff_lo, diff_lo); - __m512i sq_hi = _mm512_maddubs_epi16(diff_hi, diff_hi); - - __m512i sum_lo = _mm512_madd_epi16(sq_lo, _mm512_set1_epi16(1)); - __m512i sum_hi = _mm512_madd_epi16(sq_hi, _mm512_set1_epi16(1)); - - acc = _mm512_add_epi32(acc, sum_lo); - acc = _mm512_add_epi32(acc, sum_hi); - } - - // Handle remaining dimensions - if (i < d) { - size_t rem = d - i; - uint64_t mask_even = - (rem + 1) / 2 >= 64 ? -1ULL : (1ULL << ((rem + 1) / 2)) - 1; - uint64_t mask_odd = rem / 2 >= 64 ? 
-1ULL : (1ULL << (rem / 2)) - 1; - - __m512i c1_512 = _mm512_maskz_loadu_epi8(mask_even, code1 + i / 2); - __m512i c2_512 = _mm512_maskz_loadu_epi8(mask_even, code2 + i / 2); - - __m512i c1_nibbles_lo = - _mm512_and_si512(c1_512, _mm512_set1_epi8(0xF)); - __m512i c1_nibbles_hi = _mm512_and_si512( - _mm512_srli_epi16(c1_512, 4), _mm512_set1_epi8(0xF)); - - __m512i c2_nibbles_lo = - _mm512_and_si512(c2_512, _mm512_set1_epi8(0xF)); - __m512i c2_nibbles_hi = _mm512_and_si512( - _mm512_srli_epi16(c2_512, 4), _mm512_set1_epi8(0xF)); - - __m512i mask_odd_vec = _mm512_movm_epi8(mask_odd); - c1_nibbles_hi = _mm512_and_si512(c1_nibbles_hi, mask_odd_vec); - c2_nibbles_hi = _mm512_and_si512(c2_nibbles_hi, mask_odd_vec); - - __m512i diff_lo = _mm512_sub_epi8(c1_nibbles_lo, c2_nibbles_lo); - __m512i diff_hi = _mm512_sub_epi8(c1_nibbles_hi, c2_nibbles_hi); - - diff_lo = _mm512_abs_epi8(diff_lo); - diff_hi = _mm512_abs_epi8(diff_hi); - - __m512i sq_lo = _mm512_maddubs_epi16(diff_lo, diff_lo); - __m512i sq_hi = _mm512_maddubs_epi16(diff_hi, diff_hi); - - __m512i sum_lo = _mm512_madd_epi16(sq_lo, _mm512_set1_epi16(1)); - __m512i sum_hi = _mm512_madd_epi16(sq_hi, _mm512_set1_epi16(1)); - - acc = _mm512_add_epi32(acc, sum_lo); - acc = _mm512_add_epi32(acc, sum_hi); - } - - int32_t sum = _mm512_reduce_add_epi32(acc); - return sum * final_scale_sq; - } - - float operator()(idx_t i) final { - return compute_distance(nullptr, codes + i * code_size); - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance_l2( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const override final { - return compute_distance(nullptr, code); - } - - void query_to_codes_batch_4( - const uint8_t* __restrict code_0, - const uint8_t* __restrict code_1, - const uint8_t* __restrict code_2, - const uint8_t* __restrict code_3, - float& dis0, - float& dis1, - float& dis2, - float& dis3) const override final { - if constexpr (USE_VNNI) { - 
query_to_codes_batch_4_vnni( - code_0, code_1, code_2, code_3, dis0, dis1, dis2, dis3); - } else { - query_to_codes_batch_4_avx512( - code_0, code_1, code_2, code_3, dis0, dis1, dis2, dis3); - } - } - - __attribute__((target("avx512vnni"))) void query_to_codes_batch_4_vnni( - const uint8_t* __restrict code_0, - const uint8_t* __restrict code_1, - const uint8_t* __restrict code_2, - const uint8_t* __restrict code_3, - float& dis0, - float& dis1, - float& dis2, - float& dis3) const { - __m512i acc0 = _mm512_setzero_si512(); - __m512i acc1 = _mm512_setzero_si512(); - __m512i acc2 = _mm512_setzero_si512(); - __m512i acc3 = _mm512_setzero_si512(); - - const size_t d = quant.d; - const __m512i mask_f = _mm512_set1_epi8(0xF); - const uint8_t* q_lo_ptr = q_lo.data(); - const uint8_t* q_hi_ptr = q_hi.data(); - - size_t i = 0; - // 256 dimensions per iteration - for (; i + 256 <= d; i += 256) { - // Chunk 0 - __m512i q_lo_0 = _mm512_loadu_si512(q_lo_ptr + i / 2); - __m512i q_hi_0 = _mm512_loadu_si512(q_hi_ptr + i / 2); - - // Chunk 1 - __m512i q_lo_1 = _mm512_loadu_si512(q_lo_ptr + i / 2 + 64); - __m512i q_hi_1 = _mm512_loadu_si512(q_hi_ptr + i / 2 + 64); - - auto process_chunk = [&]( - const uint8_t* code, - __m512i& acc, - __m512i q_lo, - __m512i q_hi, - int offset) __attribute__((target("avx512vnni"))) { - __m512i c512 = _mm512_loadu_si512( - (const __m512i*)(code + i / 2 + offset)); - __m512i nibbles_lo = _mm512_and_si512(c512, mask_f); - __m512i nibbles_hi = - _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f); - - __m512i diff_lo = _mm512_sub_epi8(q_lo, nibbles_lo); - __m512i diff_hi = _mm512_sub_epi8(q_hi, nibbles_hi); - - diff_lo = _mm512_abs_epi8(diff_lo); - diff_hi = _mm512_abs_epi8(diff_hi); - - acc = _mm512_dpbusd_epi32(acc, diff_lo, diff_lo); - acc = _mm512_dpbusd_epi32(acc, diff_hi, diff_hi); - }; - - process_chunk(code_0, acc0, q_lo_0, q_hi_0, 0); - process_chunk(code_1, acc1, q_lo_0, q_hi_0, 0); - process_chunk(code_2, acc2, q_lo_0, q_hi_0, 0); - 
process_chunk(code_3, acc3, q_lo_0, q_hi_0, 0); - - process_chunk(code_0, acc0, q_lo_1, q_hi_1, 64); - process_chunk(code_1, acc1, q_lo_1, q_hi_1, 64); - process_chunk(code_2, acc2, q_lo_1, q_hi_1, 64); - process_chunk(code_3, acc3, q_lo_1, q_hi_1, 64); - } - - if (i + 128 <= d) { - __m512i q_lo_0 = _mm512_loadu_si512(q_lo_ptr + i / 2); - __m512i q_hi_0 = _mm512_loadu_si512(q_hi_ptr + i / 2); - - auto process_chunk = [&](const uint8_t* code, __m512i& acc) - __attribute__((target("avx512vnni"))) { - __m512i c512 = - _mm512_loadu_si512((const __m512i*)(code + i / 2)); - __m512i nibbles_lo = _mm512_and_si512(c512, mask_f); - __m512i nibbles_hi = - _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f); - - __m512i diff_lo = _mm512_sub_epi8(q_lo_0, nibbles_lo); - __m512i diff_hi = _mm512_sub_epi8(q_hi_0, nibbles_hi); - - diff_lo = _mm512_abs_epi8(diff_lo); - diff_hi = _mm512_abs_epi8(diff_hi); - - acc = _mm512_dpbusd_epi32(acc, diff_lo, diff_lo); - acc = _mm512_dpbusd_epi32(acc, diff_hi, diff_hi); - }; - - process_chunk(code_0, acc0); - process_chunk(code_1, acc1); - process_chunk(code_2, acc2); - process_chunk(code_3, acc3); - - i += 128; - } - - // Handle remaining dimensions - if (i < d) { - size_t rem = d - i; - uint64_t mask_even = - (rem + 1) / 2 >= 64 ? -1ULL : (1ULL << ((rem + 1) / 2)) - 1; - uint64_t mask_odd = rem / 2 >= 64 ? 
-1ULL : (1ULL << (rem / 2)) - 1; - - __m512i q_lo_vec = - _mm512_maskz_loadu_epi8(mask_even, q_lo_ptr + i / 2); - __m512i q_hi_vec = - _mm512_maskz_loadu_epi8(mask_odd, q_hi_ptr + i / 2); - __m512i mask_odd_vec = _mm512_movm_epi8(mask_odd); - - auto process = [&](const uint8_t* code, __m512i& acc) - __attribute__((target("avx512vnni"))) { - __m512i c512 = _mm512_maskz_loadu_epi8(mask_even, code + i / 2); - __m512i nibbles_lo = _mm512_and_si512(c512, mask_f); - __m512i nibbles_hi = - _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f); - - nibbles_hi = _mm512_and_si512(nibbles_hi, mask_odd_vec); - - __m512i diff_lo = _mm512_sub_epi8(q_lo_vec, nibbles_lo); - __m512i diff_hi = _mm512_sub_epi8(q_hi_vec, nibbles_hi); - - diff_lo = _mm512_abs_epi8(diff_lo); - diff_hi = _mm512_abs_epi8(diff_hi); - - acc = _mm512_dpbusd_epi32(acc, diff_lo, diff_lo); - acc = _mm512_dpbusd_epi32(acc, diff_hi, diff_hi); - }; - - process(code_0, acc0); - process(code_1, acc1); - process(code_2, acc2); - process(code_3, acc3); - } - - dis0 = _mm512_reduce_add_epi32(acc0) * final_scale_sq; - dis1 = _mm512_reduce_add_epi32(acc1) * final_scale_sq; - dis2 = _mm512_reduce_add_epi32(acc2) * final_scale_sq; - dis3 = _mm512_reduce_add_epi32(acc3) * final_scale_sq; - } - - void query_to_codes_batch_4_avx512( - const uint8_t* __restrict code_0, - const uint8_t* __restrict code_1, - const uint8_t* __restrict code_2, - const uint8_t* __restrict code_3, - float& dis0, - float& dis1, - float& dis2, - float& dis3) const { - __m512i acc0 = _mm512_setzero_si512(); - __m512i acc1 = _mm512_setzero_si512(); - __m512i acc2 = _mm512_setzero_si512(); - __m512i acc3 = _mm512_setzero_si512(); - - const size_t d = quant.d; - const __m512i mask_f = _mm512_set1_epi8(0xF); - const __m512i one = _mm512_set1_epi16(1); - const uint8_t* q_lo_ptr = q_lo.data(); - const uint8_t* q_hi_ptr = q_hi.data(); - - size_t i = 0; - // 256 dimensions per iteration - for (; i + 256 <= d; i += 256) { - // Chunk 0 - __m512i q_lo_0 = 
_mm512_loadu_si512(q_lo_ptr + i / 2); - __m512i q_hi_0 = _mm512_loadu_si512(q_hi_ptr + i / 2); - - // Chunk 1 - __m512i q_lo_1 = _mm512_loadu_si512(q_lo_ptr + i / 2 + 64); - __m512i q_hi_1 = _mm512_loadu_si512(q_hi_ptr + i / 2 + 64); - - auto process_chunk = [&](const uint8_t* code, - __m512i& acc, - __m512i q_lo, - __m512i q_hi, - int offset) { - __m512i c512 = _mm512_loadu_si512( - (const __m512i*)(code + i / 2 + offset)); - __m512i nibbles_lo = _mm512_and_si512(c512, mask_f); - __m512i nibbles_hi = - _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f); - - __m512i diff_lo = _mm512_sub_epi8(q_lo, nibbles_lo); - __m512i diff_hi = _mm512_sub_epi8(q_hi, nibbles_hi); - - diff_lo = _mm512_abs_epi8(diff_lo); - diff_hi = _mm512_abs_epi8(diff_hi); - - __m512i sq_lo = _mm512_maddubs_epi16(diff_lo, diff_lo); - __m512i sq_hi = _mm512_maddubs_epi16(diff_hi, diff_hi); - - __m512i sum_lo = _mm512_madd_epi16(sq_lo, one); - __m512i sum_hi = _mm512_madd_epi16(sq_hi, one); - - acc = _mm512_add_epi32(acc, sum_lo); - acc = _mm512_add_epi32(acc, sum_hi); - }; - - process_chunk(code_0, acc0, q_lo_0, q_hi_0, 0); - process_chunk(code_1, acc1, q_lo_0, q_hi_0, 0); - process_chunk(code_2, acc2, q_lo_0, q_hi_0, 0); - process_chunk(code_3, acc3, q_lo_0, q_hi_0, 0); - - process_chunk(code_0, acc0, q_lo_1, q_hi_1, 64); - process_chunk(code_1, acc1, q_lo_1, q_hi_1, 64); - process_chunk(code_2, acc2, q_lo_1, q_hi_1, 64); - process_chunk(code_3, acc3, q_lo_1, q_hi_1, 64); - } - - if (i + 128 <= d) { - __m512i q_lo_0 = _mm512_loadu_si512(q_lo_ptr + i / 2); - __m512i q_hi_0 = _mm512_loadu_si512(q_hi_ptr + i / 2); - - auto process_chunk = [&](const uint8_t* code, __m512i& acc) { - __m512i c512 = - _mm512_loadu_si512((const __m512i*)(code + i / 2)); - __m512i nibbles_lo = _mm512_and_si512(c512, mask_f); - __m512i nibbles_hi = - _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f); - - __m512i diff_lo = _mm512_sub_epi8(q_lo_0, nibbles_lo); - __m512i diff_hi = _mm512_sub_epi8(q_hi_0, nibbles_hi); - - 
diff_lo = _mm512_abs_epi8(diff_lo); - diff_hi = _mm512_abs_epi8(diff_hi); - - __m512i sq_lo = _mm512_maddubs_epi16(diff_lo, diff_lo); - __m512i sq_hi = _mm512_maddubs_epi16(diff_hi, diff_hi); - - __m512i sum_lo = _mm512_madd_epi16(sq_lo, one); - __m512i sum_hi = _mm512_madd_epi16(sq_hi, one); - - acc = _mm512_add_epi32(acc, sum_lo); - acc = _mm512_add_epi32(acc, sum_hi); - }; - - process_chunk(code_0, acc0); - process_chunk(code_1, acc1); - process_chunk(code_2, acc2); - process_chunk(code_3, acc3); - - i += 128; - } - - // Handle remaining dimensions - if (i < d) { - size_t rem = d - i; - uint64_t mask_even = - (rem + 1) / 2 >= 64 ? -1ULL : (1ULL << ((rem + 1) / 2)) - 1; - uint64_t mask_odd = rem / 2 >= 64 ? -1ULL : (1ULL << (rem / 2)) - 1; - - __m512i q_lo_vec = - _mm512_maskz_loadu_epi8(mask_even, q_lo_ptr + i / 2); - __m512i q_hi_vec = - _mm512_maskz_loadu_epi8(mask_odd, q_hi_ptr + i / 2); - __m512i mask_odd_vec = _mm512_movm_epi8(mask_odd); - - auto process = [&](const uint8_t* code, __m512i& acc) { - __m512i c512 = _mm512_maskz_loadu_epi8(mask_even, code + i / 2); - __m512i nibbles_lo = _mm512_and_si512(c512, mask_f); - __m512i nibbles_hi = - _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f); - - nibbles_hi = _mm512_and_si512(nibbles_hi, mask_odd_vec); - - __m512i diff_lo = _mm512_sub_epi8(q_lo_vec, nibbles_lo); - __m512i diff_hi = _mm512_sub_epi8(q_hi_vec, nibbles_hi); - - diff_lo = _mm512_abs_epi8(diff_lo); - diff_hi = _mm512_abs_epi8(diff_hi); - - __m512i sq_lo = _mm512_maddubs_epi16(diff_lo, diff_lo); - __m512i sq_hi = _mm512_maddubs_epi16(diff_hi, diff_hi); - - __m512i sum_lo = _mm512_madd_epi16(sq_lo, one); - __m512i sum_hi = _mm512_madd_epi16(sq_hi, one); - - acc = _mm512_add_epi32(acc, sum_lo); - acc = _mm512_add_epi32(acc, sum_hi); - }; - - process(code_0, acc0); - process(code_1, acc1); - process(code_2, acc2); - process(code_3, acc3); - } - - dis0 = _mm512_reduce_add_epi32(acc0) * final_scale_sq; - dis1 = _mm512_reduce_add_epi32(acc1) * 
final_scale_sq; - dis2 = _mm512_reduce_add_epi32(acc2) * final_scale_sq; - dis3 = _mm512_reduce_add_epi32(acc3) * final_scale_sq; - } - - void distances_batch_4( - const idx_t idx0, - const idx_t idx1, - const idx_t idx2, - const idx_t idx3, - float& dis0, - float& dis1, - float& dis2, - float& dis3) override { - query_to_codes_batch_4( - codes + idx0 * code_size, - codes + idx1 * code_size, - codes + idx2 * code_size, - codes + idx3 * code_size, - dis0, - dis1, - dis2, - dis3); - } -}; - -/******************************************************************* - * DistanceComputerByte: computes distances in the integer domain - *******************************************************************/ - -template -struct DistanceComputerByte_avx512 : SQDistanceComputer {}; - -template -struct DistanceComputerByte_avx512 - : public DistanceComputerByte_avx { - DistanceComputerByte_avx512(int d, const std::vector& unused) - : DistanceComputerByte_avx(d, unused) {} -}; - -template -struct DistanceComputerByte_avx512 - : public DistanceComputerByte_avx { - DistanceComputerByte_avx512(int d, const std::vector& unused) - : DistanceComputerByte_avx(d, unused) {} -}; - -template -struct DistanceComputerByte_avx512 : SQDistanceComputer { - using Sim = Similarity; - - int d; - std::vector tmp; - - DistanceComputerByte_avx512(int d, const std::vector&) - : d(d), tmp(d) {} - - int compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - // __m256i accu = _mm256_setzero_ps (); - __m512i accu = _mm512_setzero_si512(); - for (int i = 0; i < d; i += 32) { - // load 32 bytes, convert to 16 uint16_t - __m512i c1 = _mm512_cvtepu8_epi16( - _mm256_loadu_si256((__m256i*)(code1 + i))); - __m512i c2 = _mm512_cvtepu8_epi16( - _mm256_loadu_si256((__m256i*)(code2 + i))); - __m512i prod32; - if (Sim::metric_type == METRIC_INNER_PRODUCT) { - prod32 = _mm512_madd_epi16(c1, c2); - } else { - __m512i diff = _mm512_sub_epi16(c1, c2); - prod32 = _mm512_madd_epi16(diff, diff); - } - accu 
= _mm512_add_epi32(accu, prod32); - } - return _mm512_reduce_add_epi32(accu); - } - - void set_query(const float* x) final { - /* - for (int i = 0; i < d; i += 8) { - __m256 xi = _mm256_loadu_ps (x + i); - __m256i ci = _mm256_cvtps_epi32(xi); - */ - for (int i = 0; i < d; i++) { - tmp[i] = int(x[i]); - } - } - - int compute_distance(const float* x, const uint8_t* code) { - set_query(x); - return compute_code_distance(tmp.data(), code); - } - - /// compute distance of vector i to current query - float operator()(idx_t i) final { - return compute_distance(q, codes + i * code_size); - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const override final { - return compute_code_distance(tmp.data(), code); - } -}; - -/******************************************************************* - * select_distance_computer: runtime selection of template - * specialization - *******************************************************************/ - -template -SQDistanceComputer* select_distance_computer_avx512( - QuantizerType qtype, - size_t d, - const std::vector& trained) { - constexpr int SIMDWIDTH = Sim::simdwidth; - const bool use_vnni = __builtin_cpu_supports("avx512vnni"); - switch (qtype) { - case QuantizerType::QT_8bit_uniform: - return new DCTemplate_avx512< - QuantizerTemplate_avx512, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_4bit_uniform: - if (use_vnni) { - return new DistanceComputerSQ4UByte_avx512( - d, trained); - } else { - return new DistanceComputerSQ4UByte_avx512( - d, trained); - } - - case QuantizerType::QT_8bit: - return new DCTemplate_avx512< - QuantizerTemplate_avx512< - Codec8bit_avx512, - QuantizerTemplateScaling::NON_UNIFORM, - SIMDWIDTH>, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_6bit: - return new DCTemplate_avx512< - QuantizerTemplate_avx512< - Codec6bit_avx512, - 
QuantizerTemplateScaling::NON_UNIFORM, - SIMDWIDTH>, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_4bit: - return new DCTemplate_avx512< - QuantizerTemplate_avx512< - Codec4bit_avx512, - QuantizerTemplateScaling::NON_UNIFORM, - SIMDWIDTH>, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_fp16: - return new DCTemplate_avx512< - QuantizerFP16_avx512, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_bf16: - return new DCTemplate_avx512< - QuantizerBF16_avx512, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_8bit_direct: - if (d % 16 == 0) { - return new DistanceComputerByte_avx512( - d, trained); - } else { - return new DCTemplate_avx512< - Quantizer8bitDirect_avx512, - Sim, - SIMDWIDTH>(d, trained); - } - - case ScalarQuantizer::QT_8bit_direct_signed: - return new DCTemplate_avx512< - Quantizer8bitDirectSigned_avx512, - Sim, - SIMDWIDTH>(d, trained); - } - FAISS_THROW_MSG("unknown qtype"); - return nullptr; -} - -template -InvertedListScanner* sel2_InvertedListScanner_avx512( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - return sel2_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); -} - -template -InvertedListScanner* sel12_InvertedListScanner_avx512( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - constexpr int SIMDWIDTH = Similarity::simdwidth; - using QuantizerClass = QuantizerTemplate_avx512; - using DCClass = DCTemplate_avx512; - return sel2_InvertedListScanner_avx512( - sq, quantizer, store_pairs, sel, r); -} - -template -InvertedListScanner* sel1_InvertedListScanner_avx512( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - constexpr int SIMDWIDTH = Similarity::simdwidth; - switch (sq->qtype) { - case QuantizerType::QT_8bit_uniform: - return sel12_InvertedListScanner_avx512< - Similarity, - Codec8bit_avx512, - 
QuantizerTemplateScaling::UNIFORM>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_4bit_uniform: - return sel12_InvertedListScanner_avx512< - Similarity, - Codec4bit_avx512, - QuantizerTemplateScaling::UNIFORM>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_8bit: - return sel12_InvertedListScanner_avx512< - Similarity, - Codec8bit_avx512, - QuantizerTemplateScaling::NON_UNIFORM>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_4bit: - return sel12_InvertedListScanner_avx512< - Similarity, - Codec4bit_avx512, - QuantizerTemplateScaling::NON_UNIFORM>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_6bit: - return sel12_InvertedListScanner_avx512< - Similarity, - Codec6bit_avx512, - QuantizerTemplateScaling::NON_UNIFORM>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_fp16: - return sel2_InvertedListScanner_avx512, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_bf16: - return sel2_InvertedListScanner_avx512, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_8bit_direct: - if (sq->d % 16 == 0) { - return sel2_InvertedListScanner_avx512< - DistanceComputerByte_avx512>( - sq, quantizer, store_pairs, sel, r); - } else { - return sel2_InvertedListScanner_avx512, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - } - case ScalarQuantizer::QT_8bit_direct_signed: - return sel2_InvertedListScanner_avx512, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - } - - FAISS_THROW_MSG("unknown qtype"); - return nullptr; -} - -template -InvertedListScanner* sel0_InvertedListScanner_avx512( - MetricType mt, - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool by_residual) { - if (mt == METRIC_L2) { - return sel1_InvertedListScanner_avx512>( - sq, quantizer, store_pairs, sel, by_residual); - } else if (mt == METRIC_INNER_PRODUCT) { - return 
sel1_InvertedListScanner_avx512>( - sq, quantizer, store_pairs, sel, by_residual); - } else { - FAISS_THROW_MSG("unsupported metric type"); - } -} - -} -} -} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec_neon.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec_neon.h deleted file mode 100644 index 89040dfa1..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec_neon.h +++ /dev/null @@ -1,1074 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include -#include -#include - -#include -#include -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -using QuantizerType = ScalarQuantizer::QuantizerType; -using RangeStat = ScalarQuantizer::RangeStat; -using SQDistanceComputer = ScalarQuantizer::SQDistanceComputer; -using SQuantizer = ScalarQuantizer::SQuantizer; - -/******************************************************************* - * Codec: converts between values in [0, 1] and an index in a code - * array. The "i" parameter is the vector component index (not byte - * index). 
- */ - -struct Codec8bit_neon : public Codec8bit { - static FAISS_ALWAYS_INLINE float32x4x2_t - decode_8_components(const uint8_t* code, int i) { - float32_t result[8] = {}; - for (size_t j = 0; j < 8; j++) { - result[j] = decode_component(code, i + j); - } - float32x4_t res1 = vld1q_f32(result); - float32x4_t res2 = vld1q_f32(result + 4); - return {res1, res2}; - } -}; - -struct Codec4bit_neon : public Codec4bit { - static FAISS_ALWAYS_INLINE float32x4x2_t - decode_8_components(const uint8_t* code, int i) { - float32_t result[8] = {}; - for (size_t j = 0; j < 8; j++) { - result[j] = decode_component(code, i + j); - } - float32x4_t res1 = vld1q_f32(result); - float32x4_t res2 = vld1q_f32(result + 4); - return {res1, res2}; - } -}; - -struct Codec6bit_neon : public Codec6bit { - static FAISS_ALWAYS_INLINE float32x4x2_t - decode_8_components(const uint8_t* code, int i) { - float32_t result[8] = {}; - for (size_t j = 0; j < 8; j++) { - result[j] = decode_component(code, i + j); - } - float32x4_t res1 = vld1q_f32(result); - float32x4_t res2 = vld1q_f32(result + 4); - return {res1, res2}; - } -}; - -/******************************************************************* - * Quantizer: normalizes scalar vector components, then passes them - * through a codec - *******************************************************************/ - -template -struct QuantizerTemplate_neon {}; - -template -struct QuantizerTemplate_neon - : public QuantizerTemplate { - QuantizerTemplate_neon(size_t d, const std::vector& trained) - : QuantizerTemplate(d, trained) {} -}; - -template -struct QuantizerTemplate_neon - : public QuantizerTemplate { - QuantizerTemplate_neon(size_t d, const std::vector& trained) - : QuantizerTemplate(d, trained) {} - - FAISS_ALWAYS_INLINE float32x4x2_t - reconstruct_8_components(const uint8_t* code, int i) const { - float32x4x2_t xi = Codec::decode_8_components(code, i); - return { - vfmaq_f32( - vdupq_n_f32(this->vmin), - xi.val[0], - vdupq_n_f32(this->vdiff)), - 
vfmaq_f32( - vdupq_n_f32(this->vmin), - xi.val[1], - vdupq_n_f32(this->vdiff)) - }; - } -}; - -template <> -struct QuantizerTemplate_neon< - Codec4bit_neon, - QuantizerTemplateScaling::UNIFORM, - 8> - : public QuantizerTemplate< - Codec4bit_neon, - QuantizerTemplateScaling::UNIFORM, - 1> { - float final_scale; - float final_bias; - - QuantizerTemplate_neon(size_t d, const std::vector& trained) - : QuantizerTemplate< - Codec4bit_neon, - QuantizerTemplateScaling::UNIFORM, - 1>(d, trained) { - final_scale = this->vdiff / 15.0f; - final_bias = this->vmin + this->vdiff * 0.5f / 15.0f; - } - - FAISS_ALWAYS_INLINE float32x4x2_t - reconstruct_8_components(const uint8_t* code, int i) const { - float32x4x2_t xi = Codec4bit_neon::decode_8_components(code, i); - return {vfmaq_f32( - vdupq_n_f32(this->vmin), - xi.val[0], - vdupq_n_f32(this->vdiff)), - vfmaq_f32( - vdupq_n_f32(this->vmin), - xi.val[1], - vdupq_n_f32(this->vdiff))}; - } -}; - -template -struct QuantizerTemplate_neon - : public QuantizerTemplate { - QuantizerTemplate_neon(size_t d, const std::vector& trained) - : QuantizerTemplate(d, trained) {} -}; - -template -struct QuantizerTemplate_neon - : public QuantizerTemplate { - QuantizerTemplate_neon(size_t d, const std::vector& trained) - : QuantizerTemplate(d, trained) {} - - FAISS_ALWAYS_INLINE float32x4x2_t - reconstruct_8_components(const uint8_t* code, int i) const { - float32x4x2_t xi = Codec::decode_8_components(code, i); - - float32x4x2_t vmin_8 = vld1q_f32_x2(this->vmin + i); - float32x4x2_t vdiff_8 = vld1q_f32_x2(this->vdiff + i); - - return { - vfmaq_f32(vmin_8.val[0], xi.val[0], vdiff_8.val[0]), - vfmaq_f32(vmin_8.val[1], xi.val[1], vdiff_8.val[1]) - }; - } -}; - -/******************************************************************* - * FP16 quantizer - *******************************************************************/ - -template -struct QuantizerFP16_neon {}; - -template <> -struct QuantizerFP16_neon<1> : public QuantizerFP16<1> { - 
QuantizerFP16_neon(size_t d, const std::vector& unused) - : QuantizerFP16<1>(d, unused) {} -}; - -template <> -struct QuantizerFP16_neon<8> : public QuantizerFP16<1> { - QuantizerFP16_neon(size_t d, const std::vector& trained) - : QuantizerFP16<1>(d, trained) {} - - FAISS_ALWAYS_INLINE float32x4x2_t - reconstruct_8_components(const uint8_t* code, int i) const { - uint16x4x2_t codei = vld1_u16_x2((const uint16_t*)(code + 2 * i)); - return {vcvt_f32_f16(vreinterpret_f16_u16(codei.val[0])), - vcvt_f32_f16(vreinterpret_f16_u16(codei.val[1]))}; - } -}; - -/******************************************************************* - * BF16 quantizer - *******************************************************************/ - -template -struct QuantizerBF16_neon {}; - -template <> -struct QuantizerBF16_neon<1> : public QuantizerBF16<1> { - QuantizerBF16_neon(size_t d, const std::vector& unused) - : QuantizerBF16<1>(d, unused) {} -}; - -template <> -struct QuantizerBF16_neon<8> : public QuantizerBF16<1> { - QuantizerBF16_neon(size_t d, const std::vector& trained) - : QuantizerBF16<1>(d, trained) {} - - FAISS_ALWAYS_INLINE float32x4x2_t - reconstruct_8_components(const uint8_t* code, int i) const { - uint16x4x2_t codei = vld1_u16_x2((const uint16_t*)(code + 2 * i)); - return {vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(codei.val[0]), 16)), - vreinterpretq_f32_u32( - vshlq_n_u32(vmovl_u16(codei.val[1]), 16))}; - } -}; - -/******************************************************************* - * 8bit_direct quantizer - *******************************************************************/ - -template -struct Quantizer8bitDirect_neon {}; - -template <> -struct Quantizer8bitDirect_neon<1> : public Quantizer8bitDirect<1> { - Quantizer8bitDirect_neon(size_t d, const std::vector& unused) - : Quantizer8bitDirect(d, unused) {} -}; - -template <> -struct Quantizer8bitDirect_neon<8> : public Quantizer8bitDirect<1> { - Quantizer8bitDirect_neon(size_t d, const std::vector& trained) - : 
Quantizer8bitDirect<1>(d, trained) {} - - FAISS_ALWAYS_INLINE float32x4x2_t - reconstruct_8_components(const uint8_t* code, int i) const { - uint8x8_t x8 = vld1_u8((const uint8_t*)(code + i)); - uint16x8_t y8 = vmovl_u8(x8); - uint16x4_t y8_0 = vget_low_u16(y8); - uint16x4_t y8_1 = vget_high_u16(y8); - - // convert uint16 -> uint32 -> fp32 - return {vcvtq_f32_u32(vmovl_u16(y8_0)), vcvtq_f32_u32(vmovl_u16(y8_1))}; - } -}; - -/******************************************************************* - * 8bit_direct_signed quantizer - *******************************************************************/ - -template -struct Quantizer8bitDirectSigned_neon {}; - -template <> -struct Quantizer8bitDirectSigned_neon<1> : public Quantizer8bitDirectSigned<1> { - Quantizer8bitDirectSigned_neon(size_t d, const std::vector& unused) - : Quantizer8bitDirectSigned(d, unused) {} -}; - -template <> -struct Quantizer8bitDirectSigned_neon<8> : public Quantizer8bitDirectSigned<1> { - Quantizer8bitDirectSigned_neon(size_t d, const std::vector& trained) - : Quantizer8bitDirectSigned<1>(d, trained) {} - - FAISS_ALWAYS_INLINE float32x4x2_t - reconstruct_8_components(const uint8_t* code, int i) const { - uint8x8_t x8 = vld1_u8((const uint8_t*)(code + i)); - uint16x8_t y8 = vmovl_u8(x8); // convert uint8 -> uint16 - uint16x4_t y8_0 = vget_low_u16(y8); - uint16x4_t y8_1 = vget_high_u16(y8); - - float32x4_t z8_0 = vcvtq_f32_u32( - vmovl_u16(y8_0)); // convert uint16 -> uint32 -> fp32 - float32x4_t z8_1 = vcvtq_f32_u32(vmovl_u16(y8_1)); - - // subtract 128 to convert into signed numbers - return {vsubq_f32(z8_0, vmovq_n_f32(128.0)), - vsubq_f32(z8_1, vmovq_n_f32(128.0))}; - } -}; - -template -SQuantizer* select_quantizer_1_neon( - QuantizerType qtype, - size_t d, - const std::vector& trained) { - switch (qtype) { - case QuantizerType::QT_8bit: - return new QuantizerTemplate_neon( - d, trained); - case QuantizerType::QT_6bit: - return new QuantizerTemplate_neon( - d, trained); - case 
QuantizerType::QT_4bit: - return new QuantizerTemplate_neon( - d, trained); - case QuantizerType::QT_8bit_uniform: - return new QuantizerTemplate_neon( - d, trained); - case QuantizerType::QT_4bit_uniform: - return new QuantizerTemplate_neon( - d, trained); - case QuantizerType::QT_fp16: - return new QuantizerFP16_neon(d, trained); - case QuantizerType::QT_bf16: - return new QuantizerBF16_neon(d, trained); - case QuantizerType::QT_8bit_direct: - return new Quantizer8bitDirect_neon(d, trained); - case QuantizerType::QT_8bit_direct_signed: - return new Quantizer8bitDirectSigned_neon(d, trained); - case QuantizerType::QT_1bit_direct: - // todo: add more SIMDWIDTH support for neon if needed - return new Quantizer1bitDirect(d, trained); - } - FAISS_THROW_MSG("unknown qtype"); -} - -/******************************************************************* - * Similarity: gets vector components and computes a similarity wrt. a - * query vector stored in the object. The data fields just encapsulate - * an accumulator. 
- */ - -template -struct SimilarityL2_neon {}; - -template <> -struct SimilarityL2_neon<1> : public SimilarityL2<1> { - static constexpr int simdwidth = 1; - static constexpr MetricType metric_type = METRIC_L2; - - explicit SimilarityL2_neon(const float* y) : SimilarityL2<1>(y) {} -}; - -template <> -struct SimilarityL2_neon<8> { - static constexpr int simdwidth = 8; - static constexpr MetricType metric_type = METRIC_L2; - - const float *y, *yi; - - explicit SimilarityL2_neon(const float* y) : y(y) {} - float32x4x2_t accu8; - - FAISS_ALWAYS_INLINE void begin_8() { - accu8 = { vdupq_n_f32(0.0f), vdupq_n_f32(0.0f) }; - yi = y; - } - - FAISS_ALWAYS_INLINE void add_8_components(float32x4x2_t x) { - float32x4x2_t yiv = vld1q_f32_x2(yi); - yi += 8; - - float32x4_t sub0 = vsubq_f32(yiv.val[0], x.val[0]); - float32x4_t sub1 = vsubq_f32(yiv.val[1], x.val[1]); - - float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], sub0, sub0); - float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], sub1, sub1); - - accu8 = {accu8_0, accu8_1}; - } - - FAISS_ALWAYS_INLINE void add_8_components_2( - float32x4x2_t x, - float32x4x2_t y) { - float32x4_t sub0 = vsubq_f32(y.val[0], x.val[0]); - float32x4_t sub1 = vsubq_f32(y.val[1], x.val[1]); - - float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], sub0, sub0); - float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], sub1, sub1); - - accu8 = {accu8_0, accu8_1}; - } - - FAISS_ALWAYS_INLINE float result_8() { - float32x4_t sum_0 = vpaddq_f32(accu8.val[0], accu8.val[0]); - float32x4_t sum_1 = vpaddq_f32(accu8.val[1], accu8.val[1]); - - float32x4_t sum2_0 = vpaddq_f32(sum_0, sum_0); - float32x4_t sum2_1 = vpaddq_f32(sum_1, sum_1); - return vgetq_lane_f32(sum2_0, 0) + vgetq_lane_f32(sum2_1, 0); - } -}; - -template -struct SimilarityIP_neon {}; - -template <> -struct SimilarityIP_neon<1> : public SimilarityIP<1> { - static constexpr int simdwidth = 1; - static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; - - explicit SimilarityIP_neon(const float* y) : SimilarityIP<1>(y) 
{} -}; - -template <> -struct SimilarityIP_neon<8> { - static constexpr int simdwidth = 8; - static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; - - const float *y, *yi; - - float accu; - - explicit SimilarityIP_neon(const float* y) : y(y) {} - - float32x4x2_t accu8; - - FAISS_ALWAYS_INLINE void begin_8() { - accu8 = { vdupq_n_f32(0.0f), vdupq_n_f32(0.0f) }; - yi = y; - } - - FAISS_ALWAYS_INLINE void add_8_components(float32x4x2_t x) { - float32x4x2_t yiv = vld1q_f32_x2(yi); - yi += 8; - - float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], yiv.val[0], x.val[0]); - float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], yiv.val[1], x.val[1]); - accu8 = {accu8_0, accu8_1}; - } - - FAISS_ALWAYS_INLINE void add_8_components_2(float32x4x2_t x1, float32x4x2_t x2) { - float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], x1.val[0], x2.val[0]); - float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], x1.val[1], x2.val[1]); - accu8 = {accu8_0, accu8_1}; - } - - FAISS_ALWAYS_INLINE float result_8() { - float32x4x2_t sum = { - vpaddq_f32(accu8.val[0], accu8.val[0]), - vpaddq_f32(accu8.val[1], accu8.val[1]) - }; - float32x4x2_t sum2 = { - vpaddq_f32(sum.val[0], sum.val[0]), - vpaddq_f32(sum.val[1], sum.val[1]) - }; - return vgetq_lane_f32(sum2.val[0], 0) + vgetq_lane_f32(sum2.val[1], 0); - } -}; - -/******************************************************************* - * DistanceComputer: combines a similarity and a quantizer to do - * code-to-vector or code-to-code comparisons - *******************************************************************/ - -template -struct DCTemplate_neon : SQDistanceComputer {}; - -template -struct DCTemplate_neon - : public DCTemplate { - DCTemplate_neon(size_t d, const std::vector& trained) - : DCTemplate(d, trained) {} -}; - -FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN -template -struct DCTemplate_neon : SQDistanceComputer { - using Sim = Similarity; - - Quantizer quant; - - DCTemplate_neon(size_t d, const std::vector& trained) - : quant(d, trained) {} - - float 
compute_distance(const float* x, const uint8_t* code) const { - Similarity sim(x); - sim.begin_8(); - for (size_t i = 0; i < quant.d; i += 8) { - float32x4x2_t xi = quant.reconstruct_8_components(code, i); - sim.add_8_components(xi); - } - return sim.result_8(); - } - - float compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - Similarity sim(nullptr); - sim.begin_8(); - for (size_t i = 0; i < quant.d; i += 8) { - float32x4x2_t x1 = quant.reconstruct_8_components(code1, i); - float32x4x2_t x2 = quant.reconstruct_8_components(code2, i); - sim.add_8_components_2(x1, x2); - } - return sim.result_8(); - } - - void set_query(const float* x) final { - q = x; - } - - /// compute distance of vector i to current query - float operator()(idx_t i) final { - return query_to_code(codes + i * code_size); - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const override final { - return compute_distance(q, code); - } - - void query_to_codes_batch_4( - const uint8_t* __restrict code_0, - const uint8_t* __restrict code_1, - const uint8_t* __restrict code_2, - const uint8_t* __restrict code_3, - float& dis0, - float& dis1, - float& dis2, - float& dis3) const override final { - Similarity sim0(q); - Similarity sim1(q); - Similarity sim2(q); - Similarity sim3(q); - - sim0.begin_8(); - sim1.begin_8(); - sim2.begin_8(); - sim3.begin_8(); - - for (size_t i = 0; i < quant.d; i += 8) { - float32x4x2_t xi0 = quant.reconstruct_8_components(code_0, i); - float32x4x2_t xi1 = quant.reconstruct_8_components(code_1, i); - float32x4x2_t xi2 = quant.reconstruct_8_components(code_2, i); - float32x4x2_t xi3 = quant.reconstruct_8_components(code_3, i); - sim0.add_8_components(xi0); - sim1.add_8_components(xi1); - sim2.add_8_components(xi2); - sim3.add_8_components(xi3); - } - - dis0 = sim0.result_8(); - dis1 = sim1.result_8(); - dis2 = 
sim2.result_8(); - dis3 = sim3.result_8(); - } -}; -FAISS_PRAGMA_IMPRECISE_FUNCTION_END - -/******************************************************************* - * DistanceComputerByte: computes distances in the integer domain - *******************************************************************/ - -template -struct DistanceComputerByte_neon : SQDistanceComputer {}; - -template -struct DistanceComputerByte_neon - : public DistanceComputerByte { - DistanceComputerByte_neon(int d, const std::vector& unused) - : DistanceComputerByte(d, unused) {} -}; - -template -struct DistanceComputerByte_neon : SQDistanceComputer { - using Sim = Similarity; - - int d; - std::vector tmp; - - DistanceComputerByte_neon(int d, const std::vector&) : d(d), tmp(d) {} - - int compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - int accu = 0; - for (int i = 0; i < d; i++) { - if (Sim::metric_type == METRIC_INNER_PRODUCT) { - accu += int(code1[i]) * code2[i]; - } else { - int diff = int(code1[i]) - code2[i]; - accu += diff * diff; - } - } - return accu; - } - - void set_query(const float* x) final { - for (int i = 0; i < d; i++) { - tmp[i] = int(x[i]); - } - } - - int compute_distance(const float* x, const uint8_t* code) { - set_query(x); - return compute_code_distance(tmp.data(), code); - } - - /// compute distance of vector i to current query - float operator()(idx_t i) final { - return query_to_code(codes + i * code_size); - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const override final { - return compute_code_distance(tmp.data(), code); - } -}; - -template -struct DistanceComputerSQ4UByte_neon : SQDistanceComputer { - using Quantizer = QuantizerTemplate_neon< - Codec4bit_neon, - QuantizerTemplateScaling::UNIFORM, - 8>; - using Similarity = Sim; - - Quantizer quant; - std::vector q_lo; - std::vector q_hi; - float 
final_scale_sq; - - DistanceComputerSQ4UByte_neon(size_t d, const std::vector& trained) - : quant(d, trained), - q_lo((d + 1) / 2 + 64, 0), - q_hi((d + 1) / 2 + 64, 0) { - final_scale_sq = quant.final_scale * quant.final_scale; - } - - void set_query(const float* x) final { - float inv_scale = 1.0f / quant.final_scale; - float offset = quant.vmin; - - for (size_t i = 0; i < quant.d; i++) { - float val = (x[i] - offset) * inv_scale; - int code = (int)std::floor(val); - if (code < 0) - code = 0; - if (code > 15) - code = 15; - - if (i % 2 == 0) { - q_lo[i / 2] = (uint8_t)code; - } else { - q_hi[i / 2] = (uint8_t)code; - } - } - } - - // Only computes L2 distance - float compute_distance(const float* x, const uint8_t* code) const { - return compute_distance_l2(code); - } - - float query_to_code(const uint8_t* code) const override final { - return compute_distance_l2(code); - } - - float compute_distance_l2(const uint8_t* code) const { - uint32x4_t acc = vdupq_n_u32(0); - const size_t d = quant.d; - const uint8x16_t mask_f = vdupq_n_u8(0xF); - const uint8_t* q_lo_ptr = q_lo.data(); - const uint8_t* q_hi_ptr = q_hi.data(); - - size_t i = 0; - for (; i + 32 <= d; i += 32) { - uint8x16_t c = vld1q_u8(code + i / 2); - - uint8x16_t nibbles_lo = vandq_u8(c, mask_f); - uint8x16_t nibbles_hi = vandq_u8(vshrq_n_u8(c, 4), mask_f); - - uint8x16_t q_lo_vec = vld1q_u8(q_lo_ptr + i / 2); - uint8x16_t q_hi_vec = vld1q_u8(q_hi_ptr + i / 2); - - uint8x16_t diff_lo = vabdq_u8(q_lo_vec, nibbles_lo); - uint8x16_t diff_hi = vabdq_u8(q_hi_vec, nibbles_hi); - - uint16x8_t sq_lo_1 = - vmull_u8(vget_low_u8(diff_lo), vget_low_u8(diff_lo)); - uint16x8_t sq_lo_2 = - vmull_u8(vget_high_u8(diff_lo), vget_high_u8(diff_lo)); - uint16x8_t sq_hi_1 = - vmull_u8(vget_low_u8(diff_hi), vget_low_u8(diff_hi)); - uint16x8_t sq_hi_2 = - vmull_u8(vget_high_u8(diff_hi), vget_high_u8(diff_hi)); - - acc = vpadalq_u16(acc, sq_lo_1); - acc = vpadalq_u16(acc, sq_lo_2); - acc = vpadalq_u16(acc, sq_hi_1); - acc = 
vpadalq_u16(acc, sq_hi_2); - } - - uint32_t result = vaddvq_u32(acc); - - if (i < d) { - size_t rem = d - i; - for (size_t j = 0; j < rem; j++) { - size_t idx = i + j; - uint8_t nibble_lo = q_lo[idx / 2]; - uint8_t nibble_hi = q_hi[idx / 2]; - - uint8_t c = code[idx / 2]; - uint8_t nibble; - if (idx % 2 == 0) { - nibble = c & 0xF; - } else { - nibble = (c >> 4) & 0xF; - } - int diff; - if (idx % 2 == 0) { - diff = (int)nibble_lo - (int)nibble; - } else { - diff = (int)nibble_hi - (int)nibble; - } - result += diff * diff; - } - } - - return result * final_scale_sq; - } - - float compute_code_distance_l2(const uint8_t* code1, const uint8_t* code2) - const { - uint32x4_t acc = vdupq_n_u32(0); - const size_t d = quant.d; - const uint8x16_t mask_f = vdupq_n_u8(0xF); - - size_t i = 0; - for (; i + 32 <= d; i += 32) { - uint8x16_t c1 = vld1q_u8(code1 + i / 2); - uint8x16_t c2 = vld1q_u8(code2 + i / 2); - - uint8x16_t n1_lo = vandq_u8(c1, mask_f); - uint8x16_t n1_hi = vandq_u8(vshrq_n_u8(c1, 4), mask_f); - - uint8x16_t n2_lo = vandq_u8(c2, mask_f); - uint8x16_t n2_hi = vandq_u8(vshrq_n_u8(c2, 4), mask_f); - - uint8x16_t diff_lo = vabdq_u8(n1_lo, n2_lo); - uint8x16_t diff_hi = vabdq_u8(n1_hi, n2_hi); - - uint16x8_t sq_lo_1 = - vmull_u8(vget_low_u8(diff_lo), vget_low_u8(diff_lo)); - uint16x8_t sq_lo_2 = - vmull_u8(vget_high_u8(diff_lo), vget_high_u8(diff_lo)); - - uint16x8_t sq_hi_1 = - vmull_u8(vget_low_u8(diff_hi), vget_low_u8(diff_hi)); - uint16x8_t sq_hi_2 = - vmull_u8(vget_high_u8(diff_hi), vget_high_u8(diff_hi)); - - acc = vpadalq_u16(acc, sq_lo_1); - acc = vpadalq_u16(acc, sq_lo_2); - acc = vpadalq_u16(acc, sq_hi_1); - acc = vpadalq_u16(acc, sq_hi_2); - } - - uint32_t result = vaddvq_u32(acc); - - if (i < d) { - size_t rem = d - i; - for (size_t j = 0; j < rem; j++) { - size_t idx = i + j; - uint8_t c1 = code1[idx / 2]; - uint8_t c2 = code2[idx / 2]; - uint8_t n1, n2; - if (idx % 2 == 0) { - n1 = c1 & 0xF; - n2 = c2 & 0xF; - } else { - n1 = (c1 >> 4) & 0xF; - n2 = (c2 
>> 4) & 0xF; - } - int diff = (int)n1 - (int)n2; - result += diff * diff; - } - } - - return result * final_scale_sq; - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance_l2( - codes + i * code_size, codes + j * code_size); - } - - void query_to_codes_batch_4( - const uint8_t* __restrict code_0, - const uint8_t* __restrict code_1, - const uint8_t* __restrict code_2, - const uint8_t* __restrict code_3, - float& dis0, - float& dis1, - float& dis2, - float& dis3) const override final { - uint32x4_t acc0 = vdupq_n_u32(0); - uint32x4_t acc1 = vdupq_n_u32(0); - uint32x4_t acc2 = vdupq_n_u32(0); - uint32x4_t acc3 = vdupq_n_u32(0); - - const size_t d = quant.d; - const uint8x16_t mask_f = vdupq_n_u8(0xF); - const uint8_t* q_lo_ptr = q_lo.data(); - const uint8_t* q_hi_ptr = q_hi.data(); - - size_t i = 0; - for (; i + 32 <= d; i += 32) { - uint8x16_t q_lo_vec = vld1q_u8(q_lo_ptr + i / 2); - uint8x16_t q_hi_vec = vld1q_u8(q_hi_ptr + i / 2); - - auto process = [&](const uint8_t* code, uint32x4_t& acc) { - uint8x16_t c = vld1q_u8(code + i / 2); - uint8x16_t nibbles_lo = vandq_u8(c, mask_f); - uint8x16_t nibbles_hi = vandq_u8(vshrq_n_u8(c, 4), mask_f); - - uint8x16_t diff_lo = vabdq_u8(q_lo_vec, nibbles_lo); - uint8x16_t diff_hi = vabdq_u8(q_hi_vec, nibbles_hi); - - uint16x8_t sq_lo_1 = - vmull_u8(vget_low_u8(diff_lo), vget_low_u8(diff_lo)); - uint16x8_t sq_lo_2 = - vmull_u8(vget_high_u8(diff_lo), vget_high_u8(diff_lo)); - uint16x8_t sq_hi_1 = - vmull_u8(vget_low_u8(diff_hi), vget_low_u8(diff_hi)); - uint16x8_t sq_hi_2 = - vmull_u8(vget_high_u8(diff_hi), vget_high_u8(diff_hi)); - - acc = vpadalq_u16(acc, sq_lo_1); - acc = vpadalq_u16(acc, sq_lo_2); - acc = vpadalq_u16(acc, sq_hi_1); - acc = vpadalq_u16(acc, sq_hi_2); - }; - - process(code_0, acc0); - process(code_1, acc1); - process(code_2, acc2); - process(code_3, acc3); - } - - dis0 = vaddvq_u32(acc0); - dis1 = vaddvq_u32(acc1); - dis2 = vaddvq_u32(acc2); - dis3 = vaddvq_u32(acc3); - - if (i 
< d) { - size_t rem = d - i; - for (size_t j = 0; j < rem; j++) { - size_t idx = i + j; - uint8_t nibble_lo = q_lo[idx / 2]; - uint8_t nibble_hi = q_hi[idx / 2]; - - auto process_scalar = [&](const uint8_t* code, float& dis) { - uint8_t c = code[idx / 2]; - uint8_t nibble; - if (idx % 2 == 0) { - nibble = c & 0xF; - } else { - nibble = (c >> 4) & 0xF; - } - int diff; - if (idx % 2 == 0) { - diff = (int)nibble_lo - (int)nibble; - } else { - diff = (int)nibble_hi - (int)nibble; - } - dis += diff * diff; - }; - - process_scalar(code_0, dis0); - process_scalar(code_1, dis1); - process_scalar(code_2, dis2); - process_scalar(code_3, dis3); - } - } - - dis0 *= final_scale_sq; - dis1 *= final_scale_sq; - dis2 *= final_scale_sq; - dis3 *= final_scale_sq; - } -}; - -/******************************************************************* - * select_distance_computer: runtime selection of template - * specialization - *******************************************************************/ - -template -SQDistanceComputer* select_distance_computer_neon( - QuantizerType qtype, - size_t d, - const std::vector& trained) { - constexpr int SIMDWIDTH = Sim::simdwidth; - switch (qtype) { - case QuantizerType::QT_8bit_uniform: - return new DCTemplate_neon< - QuantizerTemplate_neon, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_4bit_uniform: - return new DistanceComputerSQ4UByte_neon(d, trained); - - case QuantizerType::QT_8bit: - return new DCTemplate_neon< - QuantizerTemplate_neon, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_6bit: - return new DCTemplate_neon< - QuantizerTemplate_neon, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_4bit: - return new DCTemplate_neon< - QuantizerTemplate_neon, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_fp16: - return new DCTemplate_neon< - QuantizerFP16_neon, - Sim, - SIMDWIDTH>(d, trained); - - case QuantizerType::QT_bf16: - return new DCTemplate_neon< - QuantizerBF16_neon, - Sim, - SIMDWIDTH>(d, 
trained); - - case QuantizerType::QT_8bit_direct: - if (d % 16 == 0) { - return new DistanceComputerByte_neon(d, trained); - } else { - return new DCTemplate_neon< - Quantizer8bitDirect_neon, - Sim, - SIMDWIDTH>(d, trained); - } - - case ScalarQuantizer::QT_8bit_direct_signed: - return new DCTemplate_neon< - Quantizer8bitDirectSigned_neon, - Sim, - SIMDWIDTH>(d, trained); - } - FAISS_THROW_MSG("unknown qtype"); - return nullptr; -} - -template -InvertedListScanner* sel2_InvertedListScanner_neon( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - return sel2_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); -} - -template -InvertedListScanner* sel12_InvertedListScanner_neon( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - constexpr int SIMDWIDTH = Similarity::simdwidth; - using QuantizerClass = QuantizerTemplate_neon; - using DCClass = DCTemplate_neon; - return sel2_InvertedListScanner_neon( - sq, quantizer, store_pairs, sel, r); -} - -template -InvertedListScanner* sel1_InvertedListScanner_neon( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - constexpr int SIMDWIDTH = Similarity::simdwidth; - switch (sq->qtype) { - case QuantizerType::QT_8bit_uniform: - return sel12_InvertedListScanner_neon< - Similarity, - Codec8bit_neon, - QuantizerTemplateScaling::UNIFORM>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_4bit_uniform: - return sel12_InvertedListScanner_neon< - Similarity, - Codec4bit_neon, - QuantizerTemplateScaling::UNIFORM>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_8bit: - return sel12_InvertedListScanner_neon< - Similarity, - Codec8bit_neon, - QuantizerTemplateScaling::NON_UNIFORM>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_4bit: - return sel12_InvertedListScanner_neon< - Similarity, - Codec4bit_neon, - 
QuantizerTemplateScaling::NON_UNIFORM>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_6bit: - return sel12_InvertedListScanner_neon< - Similarity, - Codec6bit_neon, - QuantizerTemplateScaling::NON_UNIFORM>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_fp16: - return sel2_InvertedListScanner_neon, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_bf16: - return sel2_InvertedListScanner_neon, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_8bit_direct: - if (sq->d % 16 == 0) { - return sel2_InvertedListScanner_neon< - DistanceComputerByte_neon>( - sq, quantizer, store_pairs, sel, r); - } else { - return sel2_InvertedListScanner_neon, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - } - case ScalarQuantizer::QT_8bit_direct_signed: - return sel2_InvertedListScanner_neon, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - } - - FAISS_THROW_MSG("unknown qtype"); - return nullptr; -} - -template -InvertedListScanner* sel0_InvertedListScanner_neon( - MetricType mt, - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool by_residual) { - if (mt == METRIC_L2) { - return sel1_InvertedListScanner_neon>( - sq, quantizer, store_pairs, sel, by_residual); - } else if (mt == METRIC_INNER_PRODUCT) { - return sel1_InvertedListScanner_neon>( - sq, quantizer, store_pairs, sel, by_residual); - } else { - FAISS_THROW_MSG("unsupported metric type"); - } -} - -} -} -} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec_rvv.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec_rvv.h deleted file mode 100644 index a49da8ee8..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerCodec_rvv.h +++ /dev/null @@ -1,1354 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. 
- * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#if defined(__riscv_vector) - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -using QuantizerType = ScalarQuantizer::QuantizerType; -using RangeStat = ScalarQuantizer::RangeStat; -using SQDistanceComputer = ScalarQuantizer::SQDistanceComputer; -using SQuantizer = ScalarQuantizer::SQuantizer; - -inline size_t get_vlen_f32_m1() { - return __riscv_vsetvlmax_e32m1(); -} -inline size_t get_vlen_f32_m2() { - return __riscv_vsetvlmax_e32m2(); -} -inline size_t get_vlen_f32_m4() { - return __riscv_vsetvlmax_e32m4(); -} - -/******************************************************************* - * Codec: converts between values in [0, 1] and an index in a code - * array. The "i" parameter is the vector component index (not byte - * index). 
- */ - -constexpr size_t RVV_CODEC_STACK_THRESHOLD = 512; - -struct Codec8bit_rvv : public Codec8bit { - static FAISS_ALWAYS_INLINE vfloat32m4_t - decode_components(const uint8_t* code, int i, size_t vl) { - vuint8m1_t v_u8 = __riscv_vle8_v_u8m1(code + i, vl); - vuint16m2_t v_u16 = __riscv_vwcvtu_x_x_v_u16m2(v_u8, vl); - vuint32m4_t v_u32 = __riscv_vwcvtu_x_x_v_u32m4(v_u16, vl); - vfloat32m4_t v_f32 = __riscv_vfcvt_f_xu_v_f32m4(v_u32, vl); - vfloat32m4_t one_255 = __riscv_vfmv_v_f_f32m4(1.0f / 255.0f, vl); - vfloat32m4_t half_one_255 = __riscv_vfmv_v_f_f32m4(0.5f / 255.0f, vl); - return __riscv_vfmadd_vv_f32m4(v_f32, one_255, half_one_255, vl); - } -}; - -struct Codec4bit_rvv : public Codec4bit { - static FAISS_ALWAYS_INLINE vfloat32m4_t - decode_components(const uint8_t* code, int i, size_t vl) { - auto process = [&](uint32_t* unpacked_buf) -> vfloat32m4_t { - for (size_t j = 0; j < vl; ++j) { - size_t current_idx = static_cast(i) + j; - const uint8_t byte = code[current_idx / 2]; - unpacked_buf[j] = - (current_idx % 2 == 0) ? 
(byte & 0x0F) : (byte >> 4); - } - vuint32m4_t v_u32 = __riscv_vle32_v_u32m4(unpacked_buf, vl); - vfloat32m4_t v_f32 = __riscv_vfcvt_f_xu_v_f32m4(v_u32, vl); - vfloat32m4_t one_15 = __riscv_vfmv_v_f_f32m4(1.0f / 15.0f, vl); - vfloat32m4_t half = __riscv_vfmv_v_f_f32m4(0.5f, vl); - vfloat32m4_t temp_sum = __riscv_vfadd_vv_f32m4(v_f32, half, vl); - return __riscv_vfmul_vv_f32m4(temp_sum, one_15, vl); - }; - - if (vl <= RVV_CODEC_STACK_THRESHOLD) { - std::array stack_buf{}; - return process(stack_buf.data()); - } else { - std::vector heap_buf(vl); - return process(heap_buf.data()); - } - } -}; - -struct Codec6bit_rvv : public Codec6bit { - static FAISS_ALWAYS_INLINE void decode_components( - const uint8_t* code, - int i, - size_t vl, - float* out) { - const size_t max_chunk = __riscv_vsetvlmax_e32m4(); - - std::array unpacked_buf; - FAISS_THROW_IF_NOT_MSG( - max_chunk <= RVV_CODEC_STACK_THRESHOLD, - "RVV max_chunk exceeds stack buffer"); - - size_t offset = 0; - while (offset < vl) { - const size_t chunk_vl = std::min(vl - offset, max_chunk); - - for (size_t j = 0; j < chunk_vl; ++j) { - size_t abs_i = static_cast(i) + offset + j; - size_t tab = abs_i / 4; - size_t q = abs_i % 4; - const uint8_t* p_grp = code + tab * 3; - uint32_t x4 = 0; - if (q == 0) { - x4 = p_grp[0] & 0x3F; - } else if (q == 1) { - x4 = ((p_grp[0] >> 6) | (p_grp[1] << 2)) & 0x3F; - } else if (q == 2) { - x4 = ((p_grp[1] >> 4) | (p_grp[2] << 4)) & 0x3F; - } else { - x4 = (p_grp[2] >> 2) & 0x3F; - } - unpacked_buf[j] = x4; - } - - vuint32m4_t v_u32 = - __riscv_vle32_v_u32m4(unpacked_buf.data(), chunk_vl); - vfloat32m4_t v_f32 = __riscv_vfcvt_f_xu_v_f32m4(v_u32, chunk_vl); - - vfloat32m4_t one_63 = - __riscv_vfmv_v_f_f32m4(1.0f / 63.0f, chunk_vl); - vfloat32m4_t half_one_63 = - __riscv_vfmv_v_f_f32m4(0.5f / 63.0f, chunk_vl); - - vfloat32m4_t chunk_result = __riscv_vfmadd_vv_f32m4( - v_f32, one_63, half_one_63, chunk_vl); - - __riscv_vse32_v_f32m4(out + offset, chunk_result, chunk_vl); - - offset += 
chunk_vl; - } - } -}; - -/******************************************************************* - * Quantizer: normalizes scalar vector components, then passes them - * through a codec - *******************************************************************/ -template -struct QuantizerTemplate_rvv {}; - -template -struct QuantizerTemplate_rvv - : public QuantizerTemplate< - Codec, - QuantizerTemplateScaling::UNIFORM, - 1> { - QuantizerTemplate_rvv(size_t d, const std::vector& trained) - : QuantizerTemplate( - d, - trained) {} - - FAISS_ALWAYS_INLINE vfloat32m4_t - reconstruct_components(const uint8_t* code, int i, size_t vl) const { - vfloat32m4_t xi = Codec::decode_components(code, i, vl); - - vfloat32m4_t v_vmin = __riscv_vfmv_v_f_f32m4(this->vmin, vl); - vfloat32m4_t v_vdiff = __riscv_vfmv_v_f_f32m4(this->vdiff, vl); - - return __riscv_vfmadd_vv_f32m4(xi, v_vdiff, v_vmin, vl); - } -}; - -template -struct QuantizerTemplate_rvv - : public QuantizerTemplate< - Codec, - QuantizerTemplateScaling::NON_UNIFORM, - 1> { - QuantizerTemplate_rvv(size_t d, const std::vector& trained) - : QuantizerTemplate< - Codec, - QuantizerTemplateScaling::NON_UNIFORM, - 1>(d, trained) {} - FAISS_ALWAYS_INLINE vfloat32m4_t - reconstruct_components(const uint8_t* code, int i, size_t vl) const { - vfloat32m4_t xi = Codec::decode_components(code, i, vl); - - vfloat32m4_t v_vmin = __riscv_vle32_v_f32m4(this->vmin + i, vl); - vfloat32m4_t v_vdiff = __riscv_vle32_v_f32m4(this->vdiff + i, vl); - - return __riscv_vfmadd_vv_f32m4(xi, v_vdiff, v_vmin, vl); - } -}; - -template <> -struct QuantizerTemplate_rvv< - Codec6bit_rvv, - QuantizerTemplateScaling::NON_UNIFORM, - 0> - : public QuantizerTemplate< - Codec6bit_rvv, - QuantizerTemplateScaling::NON_UNIFORM, - 1> { - QuantizerTemplate_rvv(size_t d, const std::vector& trained) - : QuantizerTemplate< - Codec6bit_rvv, - QuantizerTemplateScaling::NON_UNIFORM, - 1>(d, trained) {} - - FAISS_ALWAYS_INLINE void reconstruct_components( - const uint8_t* code, - 
int i, - size_t vl, - float* out) const { - Codec6bit_rvv::decode_components(code, i, vl, out); - - const size_t max_chunk = __riscv_vsetvlmax_e32m4(); - size_t offset = 0; - - while (offset < vl) { - const size_t chunk_vl = std::min(vl - offset, max_chunk); - - vfloat32m4_t xi = __riscv_vle32_v_f32m4(out + offset, chunk_vl); - - vfloat32m4_t v_vmin = - __riscv_vle32_v_f32m4(this->vmin + i + offset, chunk_vl); - vfloat32m4_t v_vdiff = - __riscv_vle32_v_f32m4(this->vdiff + i + offset, chunk_vl); - - vfloat32m4_t result = - __riscv_vfmadd_vv_f32m4(xi, v_vdiff, v_vmin, chunk_vl); - - __riscv_vse32_v_f32m4(out + offset, result, chunk_vl); - - offset += chunk_vl; - } - } -}; - -/******************************************************************* - * FP16 quantizer - *******************************************************************/ - -template -struct QuantizerFP16_rvv {}; - -template <> -struct QuantizerFP16_rvv<1> : public QuantizerFP16<1> { - QuantizerFP16_rvv(size_t d, const std::vector& unused) - : QuantizerFP16<1>(d, unused) {} -}; - -template <> -struct QuantizerFP16_rvv<0> : public QuantizerFP16<1> { - QuantizerFP16_rvv(size_t d, const std::vector& trained) - : QuantizerFP16<1>(d, trained) {} - - FAISS_ALWAYS_INLINE vfloat32m2_t - reconstruct_components(const uint8_t* code, int i, size_t vl) const { - const _Float16* code_ptr = reinterpret_cast( - code + 2 * static_cast(i)); - vfloat16m1_t v_f16 = __riscv_vle16_v_f16m1(code_ptr, vl); - return __riscv_vfwcvt_f_f_v_f32m2(v_f16, vl); - } -}; - -/******************************************************************* - * BF16 quantizer - *******************************************************************/ - -template -struct QuantizerBF16_rvv {}; - -template <> -struct QuantizerBF16_rvv<1> : public QuantizerBF16<1> { - QuantizerBF16_rvv(size_t d, const std::vector& unused) - : QuantizerBF16<1>(d, unused) {} -}; - -template <> -struct QuantizerBF16_rvv<0> : public QuantizerBF16<1> { - QuantizerBF16_rvv(size_t d, const 
std::vector& trained) - : QuantizerBF16<1>(d, trained) {} - - FAISS_ALWAYS_INLINE vfloat32m2_t - reconstruct_components(const uint8_t* code, int i, size_t vl) const { - const uint16_t* code_ptr = reinterpret_cast( - code + 2 * static_cast(i)); - vuint16m1_t v_u16 = __riscv_vle16_v_u16m1(code_ptr, vl); - vuint32m2_t v_u32 = __riscv_vwaddu_vx_u32m2(v_u16, 0, vl); - vuint32m2_t v_shifted = __riscv_vsll_vx_u32m2(v_u32, 16, vl); - return __riscv_vreinterpret_v_u32m2_f32m2(v_shifted); - } -}; - -/******************************************************************* - * 8bit_direct quantizer - *******************************************************************/ - -template -struct Quantizer8bitDirect_rvv {}; -template <> -struct Quantizer8bitDirect_rvv<1> : public Quantizer8bitDirect<1> { - Quantizer8bitDirect_rvv(size_t d, const std::vector& u) - : Quantizer8bitDirect(d, u) {} -}; -template <> -struct Quantizer8bitDirect_rvv<0> : public Quantizer8bitDirect<1> { - Quantizer8bitDirect_rvv(size_t d, const std::vector& t) - : Quantizer8bitDirect<1>(d, t) {} - - FAISS_ALWAYS_INLINE vfloat32m4_t - reconstruct_components(const uint8_t* code, int i, size_t vl) const { - vuint8m1_t v_u8 = __riscv_vle8_v_u8m1(code + i, vl); - vuint16m2_t v_u16 = __riscv_vwcvtu_x_x_v_u16m2(v_u8, vl); - vuint32m4_t v_u32 = __riscv_vwcvtu_x_x_v_u32m4(v_u16, vl); - return __riscv_vfcvt_f_xu_v_f32m4(v_u32, vl); - } -}; - -/******************************************************************* - * 8bit_direct_signed quantizer - *******************************************************************/ -template -struct Quantizer8bitDirectSigned_rvv {}; - -template <> -struct Quantizer8bitDirectSigned_rvv<1> : public Quantizer8bitDirectSigned<1> { - Quantizer8bitDirectSigned_rvv(size_t d, const std::vector& unused) - : Quantizer8bitDirectSigned(d, unused) {} -}; - -template <> -struct Quantizer8bitDirectSigned_rvv<0> : public Quantizer8bitDirectSigned<1> { - Quantizer8bitDirectSigned_rvv(size_t d, const 
std::vector& trained) - : Quantizer8bitDirectSigned<1>(d, trained) {} - - FAISS_ALWAYS_INLINE vfloat32m4_t - reconstruct_components(const uint8_t* code, int i, size_t vl) const { - vuint8m1_t v_u8 = __riscv_vle8_v_u8m1(code + i, vl); - vuint16m2_t v_u16 = __riscv_vwcvtu_x_x_v_u16m2(v_u8, vl); - vuint32m4_t v_u32 = __riscv_vwcvtu_x_x_v_u32m4(v_u16, vl); - vfloat32m4_t v_f32 = __riscv_vfcvt_f_xu_v_f32m4(v_u32, vl); - vfloat32m4_t c128 = __riscv_vfmv_v_f_f32m4(128.0f, vl); - return __riscv_vfsub_vv_f32m4(v_f32, c128, vl); - } -}; - -template -ScalarQuantizer::SQuantizer* select_quantizer_1_rvv( - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained) { - switch (qtype) { - case ScalarQuantizer::QT_8bit: - return new QuantizerTemplate_rvv< - Codec8bit_rvv, - QuantizerTemplateScaling::NON_UNIFORM, - SIMDWIDTH>(dim, trained); - case ScalarQuantizer::QT_8bit_uniform: - return new QuantizerTemplate_rvv< - Codec8bit_rvv, - QuantizerTemplateScaling::UNIFORM, - SIMDWIDTH>(dim, trained); - case ScalarQuantizer::QT_4bit: - return new QuantizerTemplate_rvv< - Codec4bit_rvv, - QuantizerTemplateScaling::NON_UNIFORM, - SIMDWIDTH>(dim, trained); - case ScalarQuantizer::QT_4bit_uniform: - return new QuantizerTemplate_rvv< - Codec4bit_rvv, - QuantizerTemplateScaling::UNIFORM, - SIMDWIDTH>(dim, trained); - case ScalarQuantizer::QT_6bit: - return new QuantizerTemplate_rvv< - Codec6bit_rvv, - QuantizerTemplateScaling::NON_UNIFORM, - SIMDWIDTH>(dim, trained); - case ScalarQuantizer::QT_fp16: - return new QuantizerFP16_rvv(dim, trained); - case ScalarQuantizer::QT_bf16: - return new QuantizerBF16_rvv(dim, trained); - case ScalarQuantizer::QT_8bit_direct: - return new Quantizer8bitDirect_rvv(dim, trained); - case ScalarQuantizer::QT_8bit_direct_signed: - return new Quantizer8bitDirectSigned_rvv(dim, trained); - default: - FAISS_THROW_FMT("Quantizer type %d not supported", qtype); - } - return nullptr; -} - 
-/******************************************************************* - * Similarity "Tags": Used as template parameters to select metric. - * These are now stateless. - *******************************************************************/ - -template -struct SimilarityL2_rvv {}; -template <> -struct SimilarityL2_rvv<0> { - static constexpr MetricType metric_type = METRIC_L2; -}; - -template -struct SimilarityIP_rvv {}; -template <> -struct SimilarityIP_rvv<0> { - static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; -}; - -/******************************************************************* - * DistanceComputer: combines a similarity and a quantizer to do - * code-to-vector or code-to-code comparisons - *******************************************************************/ -template -struct DCTemplate_rvv : SQDistanceComputer {}; - -template -struct DCTemplate_rvv - : public DCTemplate { - DCTemplate_rvv(size_t d, const std::vector& trained) - : DCTemplate(d, trained) {} -}; - -FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN -template -struct DCTemplate_rvv : SQDistanceComputer { - using Sim = Similarity; - - Quantizer quant; - - using Quantizer6bitSpecialized = QuantizerTemplate_rvv< - Codec6bit_rvv, - QuantizerTemplateScaling::NON_UNIFORM, - 0>; - - DCTemplate_rvv(size_t d, const std::vector& trained) - : quant(d, trained) {} - - float compute_distance(const float* x, const uint8_t* code) const { - size_t d = quant.d; - size_t i = 0; - const size_t vlmax = __riscv_vsetvlmax_e32m2(); - - vfloat32m2_t vacc0 = __riscv_vfmv_v_f_f32m2(0.0f, vlmax); - vfloat32m2_t vacc1 = __riscv_vfmv_v_f_f32m2(0.0f, vlmax); - vfloat32m2_t vacc2 = __riscv_vfmv_v_f_f32m2(0.0f, vlmax); - vfloat32m2_t vacc3 = __riscv_vfmv_v_f_f32m2(0.0f, vlmax); - - for (; i + 4 * vlmax <= d; i += 4 * vlmax) { - if constexpr ( - std::is_same_v> || - std::is_same_v>) { - vfloat32m2_t x0 = quant.reconstruct_components(code, i, vlmax); - vfloat32m2_t x1 = - quant.reconstruct_components(code, i + vlmax, vlmax); 
- vfloat32m2_t x2 = quant.reconstruct_components( - code, i + 2 * vlmax, vlmax); - vfloat32m2_t x3 = quant.reconstruct_components( - code, i + 3 * vlmax, vlmax); - - const float* y_ptr = x + i; - vfloat32m2_t y0 = __riscv_vle32_v_f32m2(y_ptr, vlmax); - vfloat32m2_t y1 = __riscv_vle32_v_f32m2(y_ptr + vlmax, vlmax); - vfloat32m2_t y2 = - __riscv_vle32_v_f32m2(y_ptr + 2 * vlmax, vlmax); - vfloat32m2_t y3 = - __riscv_vle32_v_f32m2(y_ptr + 3 * vlmax, vlmax); - - if constexpr (Sim::metric_type == METRIC_L2) { - vfloat32m2_t d0 = __riscv_vfsub_vv_f32m2(y0, x0, vlmax); - vfloat32m2_t d1 = __riscv_vfsub_vv_f32m2(y1, x1, vlmax); - vfloat32m2_t d2 = __riscv_vfsub_vv_f32m2(y2, x2, vlmax); - vfloat32m2_t d3 = __riscv_vfsub_vv_f32m2(y3, x3, vlmax); - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, d0, d0, vlmax); - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, d1, d1, vlmax); - vacc2 = __riscv_vfmacc_vv_f32m2(vacc2, d2, d2, vlmax); - vacc3 = __riscv_vfmacc_vv_f32m2(vacc3, d3, d3, vlmax); - } else { - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, y0, x0, vlmax); - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, y1, x1, vlmax); - vacc2 = __riscv_vfmacc_vv_f32m2(vacc2, y2, x2, vlmax); - vacc3 = __riscv_vfmacc_vv_f32m2(vacc3, y3, x3, vlmax); - } - } else { - vfloat32m4_t x_m4_0, x_m4_1; - constexpr size_t buf_len = 4 * 128; - - if constexpr (std::is_same_v< - Quantizer, - Quantizer6bitSpecialized>) { - std::array temp_buf; - FAISS_THROW_IF_NOT_MSG( - 4 * vlmax <= buf_len, - "RVV vlmax too large for stack buffer in DCTemplate_rvv"); - - quant.reconstruct_components( - code, i, 2 * vlmax, temp_buf.data()); - quant.reconstruct_components( - code, - i + 2 * vlmax, - 2 * vlmax, - temp_buf.data() + 2 * vlmax); - - x_m4_0 = __riscv_vle32_v_f32m4(temp_buf.data(), 2 * vlmax); - x_m4_1 = __riscv_vle32_v_f32m4( - temp_buf.data() + 2 * vlmax, 2 * vlmax); - - } else { - (void)__riscv_vsetvl_e32m4(2 * vlmax); - x_m4_0 = quant.reconstruct_components(code, i, 2 * vlmax); - x_m4_1 = quant.reconstruct_components( - code, i + 2 * 
vlmax, 2 * vlmax); - (void)__riscv_vsetvl_e32m2(vlmax); - } - - vfloat32m2_t x0 = __riscv_vget_v_f32m4_f32m2(x_m4_0, 0); - vfloat32m2_t x1 = __riscv_vget_v_f32m4_f32m2(x_m4_0, 1); - vfloat32m2_t x2 = __riscv_vget_v_f32m4_f32m2(x_m4_1, 0); - vfloat32m2_t x3 = __riscv_vget_v_f32m4_f32m2(x_m4_1, 1); - - const float* y_ptr = x + i; - vfloat32m2_t y0 = __riscv_vle32_v_f32m2(y_ptr, vlmax); - vfloat32m2_t y1 = __riscv_vle32_v_f32m2(y_ptr + vlmax, vlmax); - vfloat32m2_t y2 = - __riscv_vle32_v_f32m2(y_ptr + 2 * vlmax, vlmax); - vfloat32m2_t y3 = - __riscv_vle32_v_f32m2(y_ptr + 3 * vlmax, vlmax); - - if constexpr (Sim::metric_type == METRIC_L2) { - vfloat32m2_t d0 = __riscv_vfsub_vv_f32m2(y0, x0, vlmax); - vfloat32m2_t d1 = __riscv_vfsub_vv_f32m2(y1, x1, vlmax); - vfloat32m2_t d2 = __riscv_vfsub_vv_f32m2(y2, x2, vlmax); - vfloat32m2_t d3 = __riscv_vfsub_vv_f32m2(y3, x3, vlmax); - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, d0, d0, vlmax); - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, d1, d1, vlmax); - vacc2 = __riscv_vfmacc_vv_f32m2(vacc2, d2, d2, vlmax); - vacc3 = __riscv_vfmacc_vv_f32m2(vacc3, d3, d3, vlmax); - } else { - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, y0, x0, vlmax); - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, y1, x1, vlmax); - vacc2 = __riscv_vfmacc_vv_f32m2(vacc2, y2, x2, vlmax); - vacc3 = __riscv_vfmacc_vv_f32m2(vacc3, y3, x3, vlmax); - } - } - } - - for (; i < d;) { - size_t vl; - if constexpr ( - std::is_same_v> || - std::is_same_v>) { - vl = __riscv_vsetvl_e32m2(d - i); - vfloat32m2_t xi = quant.reconstruct_components(code, i, vl); - const float* y_ptr = x + i; - vfloat32m2_t y_rem = __riscv_vle32_v_f32m2(y_ptr, vl); - if constexpr (Sim::metric_type == METRIC_L2) { - vfloat32m2_t diff = __riscv_vfsub_vv_f32m2(y_rem, xi, vl); - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, diff, diff, vl); - } else { - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, y_rem, xi, vl); - } - } else { - vl = __riscv_vsetvl_e32m4(d - i); - - vfloat32m4_t xi_m4; - - if constexpr (std::is_same_v< - Quantizer, - 
Quantizer6bitSpecialized>) { - if (vl <= RVV_CODEC_STACK_THRESHOLD) { - std::array temp_buf; - quant.reconstruct_components( - code, i, vl, temp_buf.data()); - xi_m4 = __riscv_vle32_v_f32m4(temp_buf.data(), vl); - } else { - std::vector temp_buf(vl); - quant.reconstruct_components( - code, i, vl, temp_buf.data()); - xi_m4 = __riscv_vle32_v_f32m4(temp_buf.data(), vl); - } - - } else { - xi_m4 = quant.reconstruct_components(code, i, vl); - } - - vfloat32m2_t p0 = __riscv_vget_v_f32m4_f32m2(xi_m4, 0); - vfloat32m2_t p1 = __riscv_vget_v_f32m4_f32m2(xi_m4, 1); - - const float* y_ptr = x + i; - - const size_t vlmax_m2 = __riscv_vsetvlmax_e32m2(); - size_t vl0 = (vl > vlmax_m2) ? vlmax_m2 : vl; - size_t vl1 = vl - vl0; - - if (vl0 > 0) { - vfloat32m2_t y0 = __riscv_vle32_v_f32m2(y_ptr, vl0); - if constexpr (Sim::metric_type == METRIC_L2) { - vfloat32m2_t d0 = __riscv_vfsub_vv_f32m2(y0, p0, vl0); - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, d0, d0, vl0); - } else { - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, y0, p0, vl0); - } - } - if (vl1 > 0) { - vfloat32m2_t y1 = __riscv_vle32_v_f32m2(y_ptr + vl0, vl1); - if constexpr (Sim::metric_type == METRIC_L2) { - vfloat32m2_t d1 = __riscv_vfsub_vv_f32m2(y1, p1, vl1); - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, d1, d1, vl1); - } else { - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, y1, p1, vl1); - } - } - } - i += vl; - } - - vfloat32m1_t sum_scalar = __riscv_vfmv_s_f_f32m1(0.0f, 1); - vfloat32m1_t s0 = - __riscv_vfredusum_vs_f32m2_f32m1(vacc0, sum_scalar, vlmax); - vfloat32m1_t s1 = - __riscv_vfredusum_vs_f32m2_f32m1(vacc1, sum_scalar, vlmax); - vfloat32m1_t s2 = - __riscv_vfredusum_vs_f32m2_f32m1(vacc2, sum_scalar, vlmax); - vfloat32m1_t s3 = - __riscv_vfredusum_vs_f32m2_f32m1(vacc3, sum_scalar, vlmax); - - float f0 = __riscv_vfmv_f_s_f32m1_f32(s0); - float f1 = __riscv_vfmv_f_s_f32m1_f32(s1); - float f2 = __riscv_vfmv_f_s_f32m1_f32(s2); - float f3 = __riscv_vfmv_f_s_f32m1_f32(s3); - - return f0 + f1 + f2 + f3; - } - - float 
compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - size_t d = quant.d; - size_t i = 0; - const size_t vlmax = __riscv_vsetvlmax_e32m2(); - - vfloat32m2_t vacc0 = __riscv_vfmv_v_f_f32m2(0.0f, vlmax); - vfloat32m2_t vacc1 = __riscv_vfmv_v_f_f32m2(0.0f, vlmax); - vfloat32m2_t vacc2 = __riscv_vfmv_v_f_f32m2(0.0f, vlmax); - vfloat32m2_t vacc3 = __riscv_vfmv_v_f_f32m2(0.0f, vlmax); - - for (; i + 4 * vlmax <= d; i += 4 * vlmax) { - if constexpr ( - std::is_same_v> || - std::is_same_v>) { - vfloat32m2_t x1_0 = - quant.reconstruct_components(code1, i, vlmax); - vfloat32m2_t x1_1 = - quant.reconstruct_components(code1, i + vlmax, vlmax); - vfloat32m2_t x1_2 = quant.reconstruct_components( - code1, i + 2 * vlmax, vlmax); - vfloat32m2_t x1_3 = quant.reconstruct_components( - code1, i + 3 * vlmax, vlmax); - vfloat32m2_t x2_0 = - quant.reconstruct_components(code2, i, vlmax); - vfloat32m2_t x2_1 = - quant.reconstruct_components(code2, i + vlmax, vlmax); - vfloat32m2_t x2_2 = quant.reconstruct_components( - code2, i + 2 * vlmax, vlmax); - vfloat32m2_t x2_3 = quant.reconstruct_components( - code2, i + 3 * vlmax, vlmax); - - if constexpr (Sim::metric_type == METRIC_L2) { - vfloat32m2_t d0 = __riscv_vfsub_vv_f32m2(x1_0, x2_0, vlmax); - vfloat32m2_t d1 = __riscv_vfsub_vv_f32m2(x1_1, x2_1, vlmax); - vfloat32m2_t d2 = __riscv_vfsub_vv_f32m2(x1_2, x2_2, vlmax); - vfloat32m2_t d3 = __riscv_vfsub_vv_f32m2(x1_3, x2_3, vlmax); - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, d0, d0, vlmax); - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, d1, d1, vlmax); - vacc2 = __riscv_vfmacc_vv_f32m2(vacc2, d2, d2, vlmax); - vacc3 = __riscv_vfmacc_vv_f32m2(vacc3, d3, d3, vlmax); - } else { - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, x1_0, x2_0, vlmax); - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, x1_1, x2_1, vlmax); - vacc2 = __riscv_vfmacc_vv_f32m2(vacc2, x1_2, x2_2, vlmax); - vacc3 = __riscv_vfmacc_vv_f32m2(vacc3, x1_3, x2_3, vlmax); - } - } else { - vfloat32m4_t x1_m4_0, x1_m4_1, x2_m4_0, 
x2_m4_1; - - if constexpr (std::is_same_v< - Quantizer, - Quantizer6bitSpecialized>) { - constexpr size_t buf_len = 8 * 128; - std::array temp_buf; - FAISS_THROW_IF_NOT_MSG( - 8 * vlmax <= buf_len, - "RVV vlmax too large for stack buffer in DCTemplate_rvv"); - - quant.reconstruct_components( - code1, i, 2 * vlmax, temp_buf.data()); - quant.reconstruct_components( - code1, - i + 2 * vlmax, - 2 * vlmax, - temp_buf.data() + 2 * vlmax); - quant.reconstruct_components( - code2, i, 2 * vlmax, temp_buf.data() + 4 * vlmax); - quant.reconstruct_components( - code2, - i + 2 * vlmax, - 2 * vlmax, - temp_buf.data() + 6 * vlmax); - - x1_m4_0 = __riscv_vle32_v_f32m4(temp_buf.data(), 2 * vlmax); - x1_m4_1 = __riscv_vle32_v_f32m4( - temp_buf.data() + 2 * vlmax, 2 * vlmax); - x2_m4_0 = __riscv_vle32_v_f32m4( - temp_buf.data() + 4 * vlmax, 2 * vlmax); - x2_m4_1 = __riscv_vle32_v_f32m4( - temp_buf.data() + 6 * vlmax, 2 * vlmax); - - } else { - x1_m4_0 = quant.reconstruct_components(code1, i, 2 * vlmax); - x1_m4_1 = quant.reconstruct_components( - code1, i + 2 * vlmax, 2 * vlmax); - x2_m4_0 = quant.reconstruct_components(code2, i, 2 * vlmax); - x2_m4_1 = quant.reconstruct_components( - code2, i + 2 * vlmax, 2 * vlmax); - } - vfloat32m2_t x1_0 = __riscv_vget_v_f32m4_f32m2(x1_m4_0, 0); - vfloat32m2_t x1_1 = __riscv_vget_v_f32m4_f32m2(x1_m4_0, 1); - vfloat32m2_t x1_2 = __riscv_vget_v_f32m4_f32m2(x1_m4_1, 0); - vfloat32m2_t x1_3 = __riscv_vget_v_f32m4_f32m2(x1_m4_1, 1); - vfloat32m2_t x2_0 = __riscv_vget_v_f32m4_f32m2(x2_m4_0, 0); - vfloat32m2_t x2_1 = __riscv_vget_v_f32m4_f32m2(x2_m4_0, 1); - vfloat32m2_t x2_2 = __riscv_vget_v_f32m4_f32m2(x2_m4_1, 0); - vfloat32m2_t x2_3 = __riscv_vget_v_f32m4_f32m2(x2_m4_1, 1); - - if constexpr (Sim::metric_type == METRIC_L2) { - vfloat32m2_t d0 = __riscv_vfsub_vv_f32m2(x1_0, x2_0, vlmax); - vfloat32m2_t d1 = __riscv_vfsub_vv_f32m2(x1_1, x2_1, vlmax); - vfloat32m2_t d2 = __riscv_vfsub_vv_f32m2(x1_2, x2_2, vlmax); - vfloat32m2_t d3 = 
__riscv_vfsub_vv_f32m2(x1_3, x2_3, vlmax); - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, d0, d0, vlmax); - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, d1, d1, vlmax); - vacc2 = __riscv_vfmacc_vv_f32m2(vacc2, d2, d2, vlmax); - vacc3 = __riscv_vfmacc_vv_f32m2(vacc3, d3, d3, vlmax); - } else { - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, x1_0, x2_0, vlmax); - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, x1_1, x2_1, vlmax); - vacc2 = __riscv_vfmacc_vv_f32m2(vacc2, x1_2, x2_2, vlmax); - vacc3 = __riscv_vfmacc_vv_f32m2(vacc3, x1_3, x2_3, vlmax); - } - } - } - - for (; i < d;) { - size_t vl; - if constexpr ( - std::is_same_v> || - std::is_same_v>) { - vl = __riscv_vsetvl_e32m2(d - i); - vfloat32m2_t x1i = quant.reconstruct_components(code1, i, vl); - vfloat32m2_t x2i = quant.reconstruct_components(code2, i, vl); - if constexpr (Sim::metric_type == METRIC_L2) { - vfloat32m2_t diff = __riscv_vfsub_vv_f32m2(x1i, x2i, vl); - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, diff, diff, vl); - } else { - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, x1i, x2i, vl); - } - } else { - vl = __riscv_vsetvl_e32m4(d - i); - - vfloat32m4_t x1i_m4, x2i_m4; - - if constexpr (std::is_same_v< - Quantizer, - Quantizer6bitSpecialized>) { - if (2 * vl <= RVV_CODEC_STACK_THRESHOLD * 2) { - std::array - temp_buf; - quant.reconstruct_components( - code1, i, vl, temp_buf.data()); - quant.reconstruct_components( - code2, i, vl, temp_buf.data() + vl); - x1i_m4 = __riscv_vle32_v_f32m4(temp_buf.data(), vl); - x2i_m4 = - __riscv_vle32_v_f32m4(temp_buf.data() + vl, vl); - } else { - std::vector temp_buf(2 * vl); - quant.reconstruct_components( - code1, i, vl, temp_buf.data()); - quant.reconstruct_components( - code2, i, vl, temp_buf.data() + vl); - x1i_m4 = __riscv_vle32_v_f32m4(temp_buf.data(), vl); - x2i_m4 = - __riscv_vle32_v_f32m4(temp_buf.data() + vl, vl); - } - } else { - x1i_m4 = quant.reconstruct_components(code1, i, vl); - x2i_m4 = quant.reconstruct_components(code2, i, vl); - } - - vfloat32m2_t p1_0 = 
__riscv_vget_v_f32m4_f32m2(x1i_m4, 0); - vfloat32m2_t p1_1 = __riscv_vget_v_f32m4_f32m2(x1i_m4, 1); - vfloat32m2_t p2_0 = __riscv_vget_v_f32m4_f32m2(x2i_m4, 0); - vfloat32m2_t p2_1 = __riscv_vget_v_f32m4_f32m2(x2i_m4, 1); - - const size_t vlmax_m2 = __riscv_vsetvlmax_e32m2(); - size_t vl0 = (vl > vlmax_m2) ? vlmax_m2 : vl; - size_t vl1 = vl - vl0; - - if (vl0 > 0) { - if constexpr (Sim::metric_type == METRIC_L2) { - vfloat32m2_t diff0 = - __riscv_vfsub_vv_f32m2(p1_0, p2_0, vl0); - vacc0 = __riscv_vfmacc_vv_f32m2( - vacc0, diff0, diff0, vl0); - } else { - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, p1_0, p2_0, vl0); - } - } - if (vl1 > 0) { - if constexpr (Sim::metric_type == METRIC_L2) { - vfloat32m2_t diff1 = - __riscv_vfsub_vv_f32m2(p1_1, p2_1, vl1); - vacc1 = __riscv_vfmacc_vv_f32m2( - vacc1, diff1, diff1, vl1); - } else { - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, p1_1, p2_1, vl1); - } - } - } - i += vl; - } - - vfloat32m1_t sum_scalar = __riscv_vfmv_s_f_f32m1(0.0f, 1); - vfloat32m1_t s0 = - __riscv_vfredusum_vs_f32m2_f32m1(vacc0, sum_scalar, vlmax); - vfloat32m1_t s1 = - __riscv_vfredusum_vs_f32m2_f32m1(vacc1, sum_scalar, vlmax); - vfloat32m1_t s2 = - __riscv_vfredusum_vs_f32m2_f32m1(vacc2, sum_scalar, vlmax); - vfloat32m1_t s3 = - __riscv_vfredusum_vs_f32m2_f32m1(vacc3, sum_scalar, vlmax); - - float f0 = __riscv_vfmv_f_s_f32m1_f32(s0); - float f1 = __riscv_vfmv_f_s_f32m1_f32(s1); - float f2 = __riscv_vfmv_f_s_f32m1_f32(s2); - float f3 = __riscv_vfmv_f_s_f32m1_f32(s3); - - return f0 + f1 + f2 + f3; - } - - void set_query(const float* x) final { - this->q = x; - } - - float operator()(idx_t i) final { - return this->query_to_code(this->codes + i * this->code_size); - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - this->codes + i * this->code_size, - this->codes + j * this->code_size); - } - - float query_to_code(const uint8_t* code) const override final { - return compute_distance(this->q, code); - } - - void 
query_to_codes_batch_4( - const uint8_t* __restrict code_0, - const uint8_t* __restrict code_1, - const uint8_t* __restrict code_2, - const uint8_t* __restrict code_3, - float& dis0, - float& dis1, - float& dis2, - float& dis3) const override final { - const size_t vlmax = __riscv_vsetvlmax_e32m2(); - - vfloat32m2_t vacc0 = __riscv_vfmv_v_f_f32m2(0.0f, vlmax); - vfloat32m2_t vacc1 = __riscv_vfmv_v_f_f32m2(0.0f, vlmax); - vfloat32m2_t vacc2 = __riscv_vfmv_v_f_f32m2(0.0f, vlmax); - vfloat32m2_t vacc3 = __riscv_vfmv_v_f_f32m2(0.0f, vlmax); - - size_t d = quant.d; - size_t i = 0; - - for (; i < d;) { - size_t vl; - if constexpr ( - std::is_same_v> || - std::is_same_v>) { - vl = __riscv_vsetvl_e32m2(d - i); - vfloat32m2_t x0 = quant.reconstruct_components(code_0, i, vl); - vfloat32m2_t x1 = quant.reconstruct_components(code_1, i, vl); - vfloat32m2_t x2 = quant.reconstruct_components(code_2, i, vl); - vfloat32m2_t x3 = quant.reconstruct_components(code_3, i, vl); - - vfloat32m2_t y = __riscv_vle32_v_f32m2(this->q + i, vl); - if constexpr (Sim::metric_type == METRIC_L2) { - vfloat32m2_t d0 = __riscv_vfsub_vv_f32m2(y, x0, vl); - vfloat32m2_t d1 = __riscv_vfsub_vv_f32m2(y, x1, vl); - vfloat32m2_t d2 = __riscv_vfsub_vv_f32m2(y, x2, vl); - vfloat32m2_t d3 = __riscv_vfsub_vv_f32m2(y, x3, vl); - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, d0, d0, vl); - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, d1, d1, vl); - vacc2 = __riscv_vfmacc_vv_f32m2(vacc2, d2, d2, vl); - vacc3 = __riscv_vfmacc_vv_f32m2(vacc3, d3, d3, vl); - } else { - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, y, x0, vl); - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, y, x1, vl); - vacc2 = __riscv_vfmacc_vv_f32m2(vacc2, y, x2, vl); - vacc3 = __riscv_vfmacc_vv_f32m2(vacc3, y, x3, vl); - } - } else { - vl = __riscv_vsetvl_e32m4(d - i); - - vfloat32m4_t x0_m4, x1_m4, x2_m4, x3_m4; - - if constexpr (std::is_same_v< - Quantizer, - Quantizer6bitSpecialized>) { - if (4 * vl <= RVV_CODEC_STACK_THRESHOLD * 4) { - std::array - temp_buf; - 
quant.reconstruct_components( - code_0, i, vl, temp_buf.data()); - quant.reconstruct_components( - code_1, i, vl, temp_buf.data() + vl); - quant.reconstruct_components( - code_2, i, vl, temp_buf.data() + 2 * vl); - quant.reconstruct_components( - code_3, i, vl, temp_buf.data() + 3 * vl); - x0_m4 = __riscv_vle32_v_f32m4(temp_buf.data(), vl); - x1_m4 = __riscv_vle32_v_f32m4(temp_buf.data() + vl, vl); - x2_m4 = __riscv_vle32_v_f32m4( - temp_buf.data() + 2 * vl, vl); - x3_m4 = __riscv_vle32_v_f32m4( - temp_buf.data() + 3 * vl, vl); - } else { - std::vector temp_buf(4 * vl); - quant.reconstruct_components( - code_0, i, vl, temp_buf.data()); - quant.reconstruct_components( - code_1, i, vl, temp_buf.data() + vl); - quant.reconstruct_components( - code_2, i, vl, temp_buf.data() + 2 * vl); - quant.reconstruct_components( - code_3, i, vl, temp_buf.data() + 3 * vl); - x0_m4 = __riscv_vle32_v_f32m4(temp_buf.data(), vl); - x1_m4 = __riscv_vle32_v_f32m4(temp_buf.data() + vl, vl); - x2_m4 = __riscv_vle32_v_f32m4( - temp_buf.data() + 2 * vl, vl); - x3_m4 = __riscv_vle32_v_f32m4( - temp_buf.data() + 3 * vl, vl); - } - - } else { - x0_m4 = quant.reconstruct_components(code_0, i, vl); - x1_m4 = quant.reconstruct_components(code_1, i, vl); - x2_m4 = quant.reconstruct_components(code_2, i, vl); - x3_m4 = quant.reconstruct_components(code_3, i, vl); - } - vfloat32m2_t x0_p0 = __riscv_vget_v_f32m4_f32m2(x0_m4, 0); - vfloat32m2_t x0_p1 = __riscv_vget_v_f32m4_f32m2(x0_m4, 1); - vfloat32m2_t x1_p0 = __riscv_vget_v_f32m4_f32m2(x1_m4, 0); - vfloat32m2_t x1_p1 = __riscv_vget_v_f32m4_f32m2(x1_m4, 1); - vfloat32m2_t x2_p0 = __riscv_vget_v_f32m4_f32m2(x2_m4, 0); - vfloat32m2_t x2_p1 = __riscv_vget_v_f32m4_f32m2(x2_m4, 1); - vfloat32m2_t x3_p0 = __riscv_vget_v_f32m4_f32m2(x3_m4, 0); - vfloat32m2_t x3_p1 = __riscv_vget_v_f32m4_f32m2(x3_m4, 1); - - const size_t vlmax_m2 = __riscv_vsetvlmax_e32m2(); - size_t vl0 = (vl > vlmax_m2) ? 
vlmax_m2 : vl; - size_t vl1 = vl - vl0; - - if (vl0 > 0) { - vfloat32m2_t y0 = __riscv_vle32_v_f32m2(this->q + i, vl0); - if constexpr (Sim::metric_type == METRIC_L2) { - vfloat32m2_t d0 = - __riscv_vfsub_vv_f32m2(y0, x0_p0, vl0); - vfloat32m2_t d1 = - __riscv_vfsub_vv_f32m2(y0, x1_p0, vl0); - vfloat32m2_t d2 = - __riscv_vfsub_vv_f32m2(y0, x2_p0, vl0); - vfloat32m2_t d3 = - __riscv_vfsub_vv_f32m2(y0, x3_p0, vl0); - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, d0, d0, vl0); - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, d1, d1, vl0); - vacc2 = __riscv_vfmacc_vv_f32m2(vacc2, d2, d2, vl0); - vacc3 = __riscv_vfmacc_vv_f32m2(vacc3, d3, d3, vl0); - } else { - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, y0, x0_p0, vl0); - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, y0, x1_p0, vl0); - vacc2 = __riscv_vfmacc_vv_f32m2(vacc2, y0, x2_p0, vl0); - vacc3 = __riscv_vfmacc_vv_f32m2(vacc3, y0, x3_p0, vl0); - } - } - if (vl1 > 0) { - size_t offset = i + vl0; - vfloat32m2_t y1 = - __riscv_vle32_v_f32m2(this->q + offset, vl1); - if constexpr (Sim::metric_type == METRIC_L2) { - vfloat32m2_t d0 = - __riscv_vfsub_vv_f32m2(y1, x0_p1, vl1); - vfloat32m2_t d1 = - __riscv_vfsub_vv_f32m2(y1, x1_p1, vl1); - vfloat32m2_t d2 = - __riscv_vfsub_vv_f32m2(y1, x2_p1, vl1); - vfloat32m2_t d3 = - __riscv_vfsub_vv_f32m2(y1, x3_p1, vl1); - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, d0, d0, vl1); - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, d1, d1, vl1); - vacc2 = __riscv_vfmacc_vv_f32m2(vacc2, d2, d2, vl1); - vacc3 = __riscv_vfmacc_vv_f32m2(vacc3, d3, d3, vl1); - } else { - vacc0 = __riscv_vfmacc_vv_f32m2(vacc0, y1, x0_p1, vl1); - vacc1 = __riscv_vfmacc_vv_f32m2(vacc1, y1, x1_p1, vl1); - vacc2 = __riscv_vfmacc_vv_f32m2(vacc2, y1, x2_p1, vl1); - vacc3 = __riscv_vfmacc_vv_f32m2(vacc3, y1, x3_p1, vl1); - } - } - } - i += vl; - } - - vfloat32m1_t sum_scalar = __riscv_vfmv_s_f_f32m1(0.0f, 1); - dis0 = __riscv_vfmv_f_s_f32m1_f32( - __riscv_vfredusum_vs_f32m2_f32m1(vacc0, sum_scalar, vlmax)); - dis1 = __riscv_vfmv_f_s_f32m1_f32( - 
__riscv_vfredusum_vs_f32m2_f32m1(vacc1, sum_scalar, vlmax)); - dis2 = __riscv_vfmv_f_s_f32m1_f32( - __riscv_vfredusum_vs_f32m2_f32m1(vacc2, sum_scalar, vlmax)); - dis3 = __riscv_vfmv_f_s_f32m1_f32( - __riscv_vfredusum_vs_f32m2_f32m1(vacc3, sum_scalar, vlmax)); - } -}; -FAISS_PRAGMA_IMPRECISE_FUNCTION_END - -/******************************************************************* - * DistanceComputerByte: computes distances in the integer domain - *******************************************************************/ -template -struct DistanceComputerByte_rvv : SQDistanceComputer {}; - -template -struct DistanceComputerByte_rvv - : public DistanceComputerByte { - DistanceComputerByte_rvv(int d, const std::vector& unused) - : DistanceComputerByte(d, unused) {} -}; - -template -struct DistanceComputerByte_rvv : SQDistanceComputer { - using Sim = Similarity; - int d; - std::vector tmp; - - DistanceComputerByte_rvv(int d, const std::vector&) : d(d), tmp(d) {} - - int compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - size_t remaining_d = static_cast(d); - size_t offset = 0; - uint64_t acc64 = 0; - - while (true) { - size_t vl = __riscv_vsetvl_e8m1(remaining_d); - if (vl == 0) - break; - - vuint8m1_t vx_u = __riscv_vle8_v_u8m1(code1 + offset, vl); - vuint8m1_t vy_u = __riscv_vle8_v_u8m1(code2 + offset, vl); - - if constexpr (Sim::metric_type == METRIC_L2) { - vuint16m2_t vx16 = __riscv_vzext_vf2_u16m2(vx_u, vl); - vuint16m2_t vy16 = __riscv_vzext_vf2_u16m2(vy_u, vl); - vuint32m4_t vx32 = __riscv_vzext_vf2_u32m4(vx16, vl); - vuint32m4_t vy32 = __riscv_vzext_vf2_u32m4(vy16, vl); - vint32m4_t sx32 = __riscv_vreinterpret_v_u32m4_i32m4(vx32); - vint32m4_t sy32 = __riscv_vreinterpret_v_u32m4_i32m4(vy32); - vint32m4_t sdiff = __riscv_vsub_vv_i32m4(sx32, sy32, vl); - vint32m4_t sqr = __riscv_vmul_vv_i32m4(sdiff, sdiff, vl); - vuint32m4_t sqr_u = __riscv_vreinterpret_v_i32m4_u32m4(sqr); - vuint32m1_t vsum = __riscv_vmv_s_x_u32m1(0, 1); - vsum = 
__riscv_vredsum_vs_u32m4_u32m1(sqr_u, vsum, vl); - acc64 += static_cast(__riscv_vmv_x_s_u32m1_u32(vsum)); - } else { - vuint16m2_t vprod = __riscv_vwmulu_vv_u16m2(vx_u, vy_u, vl); - vuint32m4_t vprod_w = __riscv_vwaddu_vx_u32m4(vprod, 0, vl); - vuint32m1_t vsum = __riscv_vmv_s_x_u32m1(0, 1); - vsum = __riscv_vredsum_vs_u32m4_u32m1(vprod_w, vsum, vl); - acc64 += static_cast(__riscv_vmv_x_s_u32m1_u32(vsum)); - } - - offset += vl; - remaining_d -= vl; - } - if (acc64 > static_cast(std::numeric_limits::max())) { - return std::numeric_limits::max(); - } - return static_cast(acc64); - } - - void set_query(const float* x) final { - for (int i = 0; i < d; i++) { - tmp[i] = static_cast(x[i]); - } - } - - float operator()(idx_t i) final { - return query_to_code(this->codes + i * this->code_size); - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - this->codes + i * this->code_size, - this->codes + j * this->code_size); - } - - float query_to_code(const uint8_t* code) const override final { - return compute_code_distance(tmp.data(), code); - } -}; - -/******************************************************************* - * select_distance_computer: runtime selection of template - * specialization - *******************************************************************/ - -template -ScalarQuantizer::SQDistanceComputer* select_distance_computer_rvv( - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained) { - switch (qtype) { - case ScalarQuantizer::QT_8bit: - return new DCTemplate_rvv< - QuantizerTemplate_rvv< - Codec8bit_rvv, - QuantizerTemplateScaling::NON_UNIFORM, - 0>, - Similarity, - 0>(dim, trained); - - case ScalarQuantizer::QT_8bit_uniform: - return new DCTemplate_rvv< - QuantizerTemplate_rvv< - Codec8bit_rvv, - QuantizerTemplateScaling::UNIFORM, - 0>, - Similarity, - 0>(dim, trained); - - case ScalarQuantizer::QT_4bit: - return new DCTemplate_rvv< - QuantizerTemplate_rvv< - Codec4bit_rvv, - 
QuantizerTemplateScaling::NON_UNIFORM, - 0>, - Similarity, - 0>(dim, trained); - - case ScalarQuantizer::QT_4bit_uniform: - // Fallback to base class for SQ4U to ensure correct COSINE metric - // handling The generic DCTemplate_rvv computes IP distance for - // INNER_PRODUCT metric, but SQ4U with COSINE metric requires L2 - // distance computation. - // TODO: Implement RVV-optimized DistanceComputerSQ4UByte_rvv - // similar to AVX2 version - // Use base scalar Similarity types for fallback to avoid - // instantiating DCTemplate with RVV tag types that lack the - // full scalar Similarity interface (constructor, begin, - // add_component, etc.) - if constexpr (Similarity::metric_type == METRIC_L2) { - return select_distance_computer>( - qtype, dim, trained); - } else { - return select_distance_computer>( - qtype, dim, trained); - } - - case ScalarQuantizer::QT_6bit: - return new DCTemplate_rvv< - QuantizerTemplate_rvv< - Codec6bit_rvv, - QuantizerTemplateScaling::NON_UNIFORM, - 0>, - Similarity, - 0>(dim, trained); - - case ScalarQuantizer::QT_fp16: - return new DCTemplate_rvv, Similarity, 0>( - dim, trained); - - case ScalarQuantizer::QT_bf16: - return new DCTemplate_rvv, Similarity, 0>( - dim, trained); - - case ScalarQuantizer::QT_8bit_direct: - return new DCTemplate_rvv< - Quantizer8bitDirect_rvv<0>, - Similarity, - 0>(dim, trained); - - case ScalarQuantizer::QT_8bit_direct_signed: - return new DCTemplate_rvv< - Quantizer8bitDirectSigned_rvv<0>, - Similarity, - 0>(dim, trained); - - default: - FAISS_THROW_FMT("Quantizer type %d not supported", qtype); - return nullptr; - } -} - -template -InvertedListScanner* sel2_InvertedListScanner( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool by_residual); - -template -InvertedListScanner* sel2_InvertedListScanner_rvv( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - return sel2_InvertedListScanner( - 
sq, quantizer, store_pairs, sel, r); -} - -template -InvertedListScanner* sel12_InvertedListScanner_rvv( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - constexpr int SIMDWIDTH = 0; - using QuantizerClass = QuantizerTemplate_rvv; - using DCClass = DCTemplate_rvv; - return sel2_InvertedListScanner_rvv( - sq, quantizer, store_pairs, sel, r); -} - -template -InvertedListScanner* sel1_InvertedListScanner_rvv( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - constexpr int SIMDWIDTH = 0; - switch (sq->qtype) { - case QuantizerType::QT_8bit: - return sel12_InvertedListScanner_rvv< - Similarity, - Codec8bit_rvv, - QuantizerTemplateScaling::NON_UNIFORM>( - sq, quantizer, store_pairs, sel, r); - - case QuantizerType::QT_8bit_uniform: - return sel12_InvertedListScanner_rvv< - Similarity, - Codec8bit_rvv, - QuantizerTemplateScaling::UNIFORM>( - sq, quantizer, store_pairs, sel, r); - - case QuantizerType::QT_4bit: - return sel12_InvertedListScanner_rvv< - Similarity, - Codec4bit_rvv, - QuantizerTemplateScaling::NON_UNIFORM>( - sq, quantizer, store_pairs, sel, r); - - case QuantizerType::QT_4bit_uniform: - return sel12_InvertedListScanner_rvv< - Similarity, - Codec4bit_rvv, - QuantizerTemplateScaling::UNIFORM>( - sq, quantizer, store_pairs, sel, r); - - case QuantizerType::QT_6bit: - return sel12_InvertedListScanner_rvv< - Similarity, - Codec6bit_rvv, - QuantizerTemplateScaling::NON_UNIFORM>( - sq, quantizer, store_pairs, sel, r); - case QuantizerType::QT_fp16: - return sel2_InvertedListScanner_rvv, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - - case QuantizerType::QT_bf16: - return sel2_InvertedListScanner_rvv, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - - case QuantizerType::QT_8bit_direct: - return sel2_InvertedListScanner_rvv, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - - case 
QuantizerType::QT_8bit_direct_signed: - return sel2_InvertedListScanner_rvv, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - - default: - FAISS_THROW_MSG("unknown qtype"); - return nullptr; - } -} - -template -InvertedListScanner* select_inverted_list_scanner_rvv( - MetricType mt, - const ScalarQuantizer* sq, - const Index* quantizer, - size_t /*dim*/, - bool store_pairs, - const IDSelector* sel, - bool by_residual) { - if (mt == METRIC_L2) { - return sel1_InvertedListScanner_rvv>( - sq, quantizer, store_pairs, sel, by_residual); - } else if (mt == METRIC_INNER_PRODUCT) { - return sel1_InvertedListScanner_rvv>( - sq, quantizer, store_pairs, sel, by_residual); - } else { - FAISS_THROW_MSG("unsupported metric type"); - return nullptr; - } -} - -} -} -} // namespace faiss - -#endif // __riscv_vector diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC.cpp deleted file mode 100644 index 6a2db28d9..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC.cpp +++ /dev/null @@ -1,117 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include -#include -#include -#include - - - -namespace faiss::cppcontrib::knowhere { - -/******************************************************************* - * ScalarQuantizer Distance Computer - ********************************************************************/ - -/* SSE */ -ScalarQuantizer::SQDistanceComputer* sq_get_distance_computer_ref( - MetricType metric, - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained) { - if (metric == METRIC_L2) { - return select_distance_computer>(qtype, dim, trained); - } else { - return select_distance_computer>(qtype, dim, trained); - } -} - -ScalarQuantizer::SQDistanceComputer* sq_get_hamming_distance_computer_ref( - MetricType metric, - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained) { - return select_hamming_distance_computer(dim, trained); -} - -SQDistanceComputer* select_hamming_distance_computer( - size_t d, - const std::vector& trained) { - size_t code_size = (d + 7) / 8; - switch (code_size) { - case 4: - return new BinarySQDistanceComputerWrapper(code_size, trained); - case 8: - return new BinarySQDistanceComputerWrapper(code_size, trained); - case 16: - return new BinarySQDistanceComputerWrapper(code_size, trained); - case 20: - return new BinarySQDistanceComputerWrapper(code_size, trained); - case 32: - return new BinarySQDistanceComputerWrapper(code_size, trained); - case 64: - return new BinarySQDistanceComputerWrapper(code_size, trained); - default: - return new BinarySQDistanceComputerWrapper(code_size, trained); - } -} - -ScalarQuantizer::SQDistanceComputer* sq_get_jaccard_distance_computer_ref( - MetricType metric, - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained) { - return select_jaccard_distance_computer(dim, trained); -} - -SQDistanceComputer* select_jaccard_distance_computer( - size_t d, - const std::vector& trained) { - size_t code_size = (d + 7) / 8; - switch (code_size) { - case 8: - return new 
BinarySQDistanceComputerWrapper(code_size, trained); - case 16: - return new BinarySQDistanceComputerWrapper(code_size, trained); - case 32: - return new BinarySQDistanceComputerWrapper(code_size, trained); - case 64: - return new BinarySQDistanceComputerWrapper(code_size, trained); - case 128: - return new BinarySQDistanceComputerWrapper(code_size, trained); - case 256: - return new BinarySQDistanceComputerWrapper(code_size, trained); - case 512: - return new BinarySQDistanceComputerWrapper(code_size, trained); - default: - return new BinarySQDistanceComputerWrapper(code_size, trained); - } -} - -ScalarQuantizer::SQuantizer* sq_select_quantizer_ref( - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained) { - return select_quantizer_1<1>(qtype, dim, trained); -} - -InvertedListScanner* sq_select_inverted_list_scanner_ref( - MetricType mt, - const ScalarQuantizer* sq, - const Index* quantizer, - size_t dim, - bool store_pairs, - const IDSelector* sel, - bool by_residual) { - return sel0_InvertedListScanner<1>( - mt, sq, quantizer, store_pairs, sel, by_residual); -} - -} - - diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC.h deleted file mode 100644 index de6008050..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC.h +++ /dev/null @@ -1,53 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -#include - -#include -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -ScalarQuantizer::SQDistanceComputer* sq_get_distance_computer_ref( - MetricType metric, - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained); - -ScalarQuantizer::SQDistanceComputer* sq_get_hamming_distance_computer_ref( - MetricType metric, - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained); - -ScalarQuantizer::SQDistanceComputer* sq_get_jaccard_distance_computer_ref( - MetricType metric, - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained); - -ScalarQuantizer::SQuantizer* sq_select_quantizer_ref( - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained); - -InvertedListScanner* sq_select_inverted_list_scanner_ref( - MetricType mt, - const ScalarQuantizer* sq, - const Index* quantizer, - size_t dim, - bool store_pairs, - const IDSelector* sel, - bool by_residual); - -} -} -} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_avx.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_avx.cpp deleted file mode 100644 index 8eade7715..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_avx.cpp +++ /dev/null @@ -1,73 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include -#include - - - -namespace faiss::cppcontrib::knowhere { - -/******************************************************************* - * ScalarQuantizer Distance Computer - ********************************************************************/ - -ScalarQuantizer::SQDistanceComputer* sq_get_distance_computer_avx( - MetricType metric, - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained) { - if (metric == METRIC_L2) { - if (dim % 8 == 0) { - return select_distance_computer_avx>( - qtype, dim, trained); - } else { - return select_distance_computer_avx>( - qtype, dim, trained); - } - } else { - if (dim % 8 == 0) { - return select_distance_computer_avx>( - qtype, dim, trained); - } else { - return select_distance_computer_avx>( - qtype, dim, trained); - } - } -} - -ScalarQuantizer::SQuantizer* sq_select_quantizer_avx( - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained) { - if (dim % 8 == 0) { - return select_quantizer_1_avx<8>(qtype, dim, trained); - } else { - return select_quantizer_1_avx<1>(qtype, dim, trained); - } -} - -InvertedListScanner* sq_select_inverted_list_scanner_avx( - MetricType mt, - const ScalarQuantizer *sq, - const Index *quantizer, - size_t dim, - bool store_pairs, - const IDSelector* sel, - bool by_residual) { - if (dim % 8 == 0) { - return sel0_InvertedListScanner_avx<8>( - mt, sq, quantizer, store_pairs, sel, by_residual); - } else { - return sel0_InvertedListScanner_avx<1>( - mt, sq, quantizer, store_pairs, sel, by_residual); - } -} - -} - - diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_avx.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_avx.h deleted file mode 100644 index af44d70ef..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_avx.h +++ /dev/null @@ -1,41 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. 
- * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -ScalarQuantizer::SQDistanceComputer* sq_get_distance_computer_avx( - MetricType metric, - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained); - -ScalarQuantizer::SQuantizer* sq_select_quantizer_avx( - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained); - -InvertedListScanner* sq_select_inverted_list_scanner_avx( - MetricType mt, - const ScalarQuantizer* sq, - const Index* quantizer, - size_t dim, - bool store_pairs, - const IDSelector* sel, - bool by_residual); - -} -} -} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_avx512.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_avx512.cpp deleted file mode 100644 index f291930f4..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_avx512.cpp +++ /dev/null @@ -1,84 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include -#include - - - -namespace faiss::cppcontrib::knowhere { - -/******************************************************************* - * ScalarQuantizer Distance Computer - ********************************************************************/ - -ScalarQuantizer::SQDistanceComputer* sq_get_distance_computer_avx512( - MetricType metric, - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained) { - if (metric == METRIC_L2) { - if (dim % 16 == 0) { - return select_distance_computer_avx512>( - qtype, dim, trained); - } else if (dim % 8 == 0) { - return select_distance_computer_avx512>( - qtype, dim, trained); - } else { - return select_distance_computer_avx512>( - qtype, dim, trained); - } - } else { - if (dim % 16 == 0) { - return select_distance_computer_avx512>( - qtype, dim, trained); - } else if (dim % 8 == 0) { - return select_distance_computer_avx512>( - qtype, dim, trained); - } else { - return select_distance_computer_avx512>( - qtype, dim, trained); - } - } -} - -ScalarQuantizer::SQuantizer* sq_select_quantizer_avx512( - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained) { - if (dim % 16 == 0) { - return select_quantizer_1_avx512<16>(qtype, dim, trained); - } else if (dim % 8 == 0) { - return select_quantizer_1_avx512<8>(qtype, dim, trained); - } else { - return select_quantizer_1_avx512<1>(qtype, dim, trained); - } -} - -InvertedListScanner* sq_select_inverted_list_scanner_avx512( - MetricType mt, - const ScalarQuantizer* sq, - const Index* quantizer, - size_t dim, - bool store_pairs, - const IDSelector* sel, - bool by_residual) { - if (dim % 16 == 0) { - return sel0_InvertedListScanner_avx512<16>( - mt, sq, quantizer, store_pairs, sel, by_residual); - } else if (dim % 8 == 0) { - return sel0_InvertedListScanner_avx512<8>( - mt, sq, quantizer, store_pairs, sel, by_residual); - } else { - return sel0_InvertedListScanner_avx512<1>( - mt, sq, quantizer, store_pairs, sel, by_residual); - } -} - 
-} - - diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_avx512.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_avx512.h deleted file mode 100644 index b0d719008..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_avx512.h +++ /dev/null @@ -1,41 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -ScalarQuantizer::SQDistanceComputer* sq_get_distance_computer_avx512( - MetricType metric, - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained); - -ScalarQuantizer::SQuantizer* sq_select_quantizer_avx512( - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained); - -InvertedListScanner* sq_select_inverted_list_scanner_avx512( - MetricType mt, - const ScalarQuantizer* sq, - const Index* quantizer, - size_t dim, - bool store_pairs, - const IDSelector* sel, - bool by_residual); - -} -} -} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_neon.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_neon.cpp deleted file mode 100644 index d41b470d6..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_neon.cpp +++ /dev/null @@ -1,73 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -/******************************************************************* - * ScalarQuantizer Distance Computer - ********************************************************************/ - -ScalarQuantizer::SQDistanceComputer* sq_get_distance_computer_neon( - MetricType metric, - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained) { - if (metric == METRIC_L2) { - if (dim % 8 == 0) { - return select_distance_computer_neon>( - qtype, dim, trained); - } else { - return select_distance_computer_neon>( - qtype, dim, trained); - } - } else { - if (dim % 8 == 0) { - return select_distance_computer_neon>( - qtype, dim, trained); - } else { - return select_distance_computer_neon>( - qtype, dim, trained); - } - } -} - -ScalarQuantizer::SQuantizer* sq_select_quantizer_neon( - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained) { - if (dim % 8 == 0) { - return select_quantizer_1_neon<8>(qtype, dim, trained); - } else { - return select_quantizer_1_neon<1>(qtype, dim, trained); - } -} - -InvertedListScanner* sq_select_inverted_list_scanner_neon( - MetricType mt, - const ScalarQuantizer *sq, - const Index *quantizer, - size_t dim, - bool store_pairs, - const IDSelector* sel, - bool by_residual) { - if (dim % 8 == 0) { - return sel0_InvertedListScanner_neon<8>( - mt, sq, quantizer, store_pairs, sel, by_residual); - } else { - return sel0_InvertedListScanner_neon<1>( - mt, sq, quantizer, store_pairs, sel, by_residual); - } -} - -} -} -} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_neon.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_neon.h deleted file mode 100644 index dcca6b284..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_neon.h +++ /dev/null @@ -1,41 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. 
- * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -ScalarQuantizer::SQDistanceComputer* sq_get_distance_computer_neon( - MetricType metric, - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained); - -ScalarQuantizer::SQuantizer* sq_select_quantizer_neon( - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained); - -InvertedListScanner* sq_select_inverted_list_scanner_neon( - MetricType mt, - const ScalarQuantizer* sq, - const Index* quantizer, - size_t dim, - bool store_pairs, - const IDSelector* sel, - bool by_residual); - -} -} -} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_rvv.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_rvv.cpp deleted file mode 100644 index 03bb55ea1..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_rvv.cpp +++ /dev/null @@ -1,78 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -/******************************************************************* - * ScalarQuantizer Distance Computer - ********************************************************************/ -ScalarQuantizer::SQDistanceComputer* sq_get_distance_computer_rvv( - MetricType metric, - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained) { -#if defined(__riscv_vector) - - if (metric == METRIC_L2) { - return select_distance_computer_rvv>( - qtype, dim, trained); - } else { - return select_distance_computer_rvv>( - qtype, dim, trained); - } -#else - - if (metric == METRIC_L2) { - return select_distance_computer_rvv>( - qtype, dim, trained); - } else { - return select_distance_computer_rvv>( - qtype, dim, trained); - } -#endif -} - -ScalarQuantizer::SQuantizer* sq_select_quantizer_rvv( - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained) { -#if defined(__riscv_vector) - - return select_quantizer_1_rvv<0>(qtype, dim, trained); -#else - - return select_quantizer_1_rvv<1>(qtype, dim, trained); -#endif -} - -InvertedListScanner* sq_select_inverted_list_scanner_rvv( - MetricType mt, - const ScalarQuantizer* sq, - const Index* quantizer, - size_t dim, - bool store_pairs, - const IDSelector* sel, - bool by_residual) { -#if defined(__riscv_vector) - - return select_inverted_list_scanner_rvv<0>( - mt, sq, quantizer, dim, store_pairs, sel, by_residual); -#else - - return select_inverted_list_scanner_rvv<1>( - mt, sq, quantizer, dim, store_pairs, sel, by_residual); -#endif -} - -} -} -} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_rvv.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_rvv.h deleted file mode 100644 index 75c798aa2..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerDC_rvv.h +++ /dev/null @@ -1,42 +0,0 @@ -/** - * Copyright (c) 
Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -// Forward declaration for RVV-specific implementations. -ScalarQuantizer::SQDistanceComputer* sq_get_distance_computer_rvv( - MetricType metric, - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained); - -ScalarQuantizer::SQuantizer* sq_select_quantizer_rvv( - ScalarQuantizer::QuantizerType qtype, - size_t dim, - const std::vector& trained); - -InvertedListScanner* sq_select_inverted_list_scanner_rvv( - MetricType mt, - const ScalarQuantizer* sq, - const Index* quantizer, - size_t dim, - bool store_pairs, - const IDSelector* sel, - bool by_residual); - -} -} -} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerOp.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerOp.cpp deleted file mode 100644 index 705cdd11b..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerOp.cpp +++ /dev/null @@ -1,193 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include -#include - -#include -#include -#include - - - -namespace faiss::cppcontrib::knowhere { - -using RangeStat = ScalarQuantizer::RangeStat; - -/******************************************************************* - * Quantizer range training - */ - -static float sqr(float x) { - return x * x; -} - -void train_Uniform( - RangeStat rs, - float rs_arg, - idx_t n, - int k, - const float* x, - std::vector& trained) { - trained.resize(2); - float& vmin = trained[0]; - float& vmax = trained[1]; - - if (rs == RangeStat::RS_minmax) { - vmin = HUGE_VAL; - vmax = -HUGE_VAL; - for (size_t i = 0; i < n; i++) { - if (x[i] < vmin) - vmin = x[i]; - if (x[i] > vmax) - vmax = x[i]; - } - float vexp = (vmax - vmin) * rs_arg; - vmin -= vexp; - vmax += vexp; - } else if (rs == RangeStat::RS_meanstd) { - double sum = 0, sum2 = 0; - for (size_t i = 0; i < n; i++) { - sum += x[i]; - sum2 += x[i] * x[i]; - } - float mean = sum / n; - float var = sum2 / n - mean * mean; - float std = var <= 0 ? 1.0 : sqrt(var); - - vmin = mean - std * rs_arg; - vmax = mean + std * rs_arg; - } else if (rs == RangeStat::RS_quantiles) { - std::vector x_copy(n); - memcpy(x_copy.data(), x, n * sizeof(*x)); - // TODO just do a quickselect - std::sort(x_copy.begin(), x_copy.end()); - int o = int(rs_arg * n); - if (o < 0) - o = 0; - if (o > n - o) - o = n / 2; - vmin = x_copy[o]; - vmax = x_copy[n - 1 - o]; - - } else if (rs == RangeStat::RS_optim) { - float a, b; - float sx = 0; - { - vmin = HUGE_VAL, vmax = -HUGE_VAL; - for (size_t i = 0; i < n; i++) { - if (x[i] < vmin) - vmin = x[i]; - if (x[i] > vmax) - vmax = x[i]; - sx += x[i]; - } - b = vmin; - a = (vmax - vmin) / (k - 1); - } - int verbose = false; - int niter = 2000; - float last_err = -1; - int iter_last_err = 0; - for (int it = 0; it < niter; it++) { - float sn = 0, sn2 = 0, sxn = 0, err1 = 0; - - for (idx_t i = 0; i < n; i++) { - float xi = x[i]; - float ni = floor((xi - b) / a + 0.5); - if (ni < 0) - ni = 0; - if (ni >= k) - ni = k - 1; 
- err1 += sqr(xi - (ni * a + b)); - sn += ni; - sn2 += ni * ni; - sxn += ni * xi; - } - - if (err1 == last_err) { - iter_last_err++; - if (iter_last_err == 16) - break; - } else { - last_err = err1; - iter_last_err = 0; - } - - float det = sqr(sn) - sn2 * n; - - b = (sn * sxn - sn2 * sx) / det; - a = (sn * sx - n * sxn) / det; - if (verbose) { - printf("it %d, err1=%g \r", it, err1); - fflush(stdout); - } - } - if (verbose) - printf("\n"); - - vmin = b; - vmax = b + a * (k - 1); - - } else { - FAISS_THROW_MSG("Invalid qtype"); - } - vmax -= vmin; -} - -void train_NonUniform( - RangeStat rs, - float rs_arg, - idx_t n, - int d, - int k, - const float* x, - std::vector& trained) { - trained.resize(2 * d); - float* vmin = trained.data(); - float* vmax = trained.data() + d; - if (rs == RangeStat::RS_minmax) { - memcpy(vmin, x, sizeof(*x) * d); - memcpy(vmax, x, sizeof(*x) * d); - for (size_t i = 1; i < n; i++) { - const float* xi = x + i * d; - for (size_t j = 0; j < d; j++) { - if (xi[j] < vmin[j]) - vmin[j] = xi[j]; - if (xi[j] > vmax[j]) - vmax[j] = xi[j]; - } - } - float* vdiff = vmax; - for (size_t j = 0; j < d; j++) { - float vexp = (vmax[j] - vmin[j]) * rs_arg; - vmin[j] -= vexp; - vmax[j] += vexp; - vdiff[j] = vmax[j] - vmin[j]; - } - } else { - // transpose - std::vector xt(n * d); - for (size_t i = 1; i < n; i++) { - const float* xi = x + i * d; - for (size_t j = 0; j < d; j++) { - xt[j * n + i] = xi[j]; - } - } - std::vector trained_d(2); -#pragma omp parallel for - for (int j = 0; j < d; j++) { - train_Uniform(rs, rs_arg, n, k, xt.data() + j * n, trained_d); - vmin[j] = trained_d[0]; - vmax[j] = trained_d[1]; - } - } -} - -} - - diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerOp.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerOp.h deleted file mode 100644 index 8204f5a1b..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerOp.h +++ /dev/null @@ -1,35 +0,0 @@ -/** - * Copyright (c) 
Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -void train_Uniform( - ScalarQuantizer::RangeStat rs, - float rs_arg, - idx_t n, - int k, - const float* x, - std::vector& trained); - -void train_NonUniform( - ScalarQuantizer::RangeStat rs, - float rs_arg, - idx_t n, - int d, - int k, - const float* x, - std::vector& trained); - -} -} -} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerScanner.h b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerScanner.h deleted file mode 100644 index c3b9c7b79..000000000 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/ScalarQuantizerScanner.h +++ /dev/null @@ -1,343 +0,0 @@ -#pragma once - -#include -//#include - -//struct InvertedListScanner; -//struct IDSelector; - -#include - -namespace faiss { -namespace cppcontrib { -namespace knowhere { - -/******************************************************************* - * IndexScalarQuantizer/IndexIVFScalarQuantizer scanner object - * - * It is an InvertedListScanner, but is designed to work with - * IndexScalarQuantizer as well. - ********************************************************************/ -template < - // A predicate for filtering elements. - // std::optional Pred(const size_t idx); - // * return true to accept an element. - // * return false to reject an element. - // * return std::nullopt to break the iteration loop. - typename Pred, - // Apply an element. 
- // void Apply(const float dis, const size_t idx); - typename Apply, - typename DCClass> -void fvec_distance_ny_scalar_if( - const DCClass& dc, - const uint8_t* __restrict codes, - const size_t code_size, - const size_t ny, - Pred pred, - Apply apply) { - // compute a distance from the query to 1 element - auto distance1 = [&dc, codes, code_size](const size_t idx) { - return dc.query_to_code(codes + idx * code_size); - }; - - // compute distances from the query to 4 elements - auto distance4 = [&dc, codes, code_size]( - const std::array indices, - std::array& dis) { - dc.query_to_codes_batch_4( - codes + indices[0] * code_size, - codes + indices[1] * code_size, - codes + indices[2] * code_size, - codes + indices[3] * code_size, - dis[0], - dis[1], - dis[2], - dis[3]); - }; - - NoRemapping remapper; - - fvec_distance_ny_if< - Pred, - decltype(distance1), - decltype(distance4), - decltype(remapper), - Apply, - 4, - DEFAULT_BUFFER_SIZE>( - ny, pred, distance1, distance4, remapper, apply); -} - -/* use_sel = 0: don't check selector - * = 1: check on ids[j] - * = 2: check in j directly (normally ids is nullptr and store_pairs) - */ - -template -struct IVFSQScannerIP : InvertedListScanner { - DCClass dc; - bool by_residual; - - float accu0; /// added to all distances - - IVFSQScannerIP( - int d, - const std::vector& trained, - size_t code_size, - bool store_pairs, - const IDSelector* sel, - bool by_residual) - : dc(d, trained), by_residual(by_residual), accu0(0) { - this->store_pairs = store_pairs; - this->sel = sel; - this->code_size = code_size; - this->keep_max = true; - } - - void set_query(const float* query) override { - dc.set_query(query); - } - - void set_list(idx_t list_no, float coarse_dis) override { - this->list_no = list_no; - accu0 = by_residual ? 
coarse_dis : 0; - } - - float distance_to_code(const uint8_t* code) const final { - return accu0 + dc.query_to_code(code); - } - - size_t scan_codes( - size_t list_size, - const uint8_t* codes, - const float* code_norms, - const idx_t* ids, - float* simi, - idx_t* idxi, - size_t k, - size_t& scan_cnt) const override { - size_t nup = 0; - // baseline - // for (size_t j = 0; j < list_size; j++, codes += code_size) { - // if (use_sel && !sel->is_member(use_sel == 1 ? ids[j] : j)) { - // continue; - // } - - // // todo aguzhva: upgrade - // float accu = accu0 + dc.query_to_code(codes); - - // if (accu > simi[0]) { - // int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; - // minheap_replace_top(k, simi, idxi, accu, id); - // nup++; - // } - // } - - // the lambda that filters acceptable elements. - auto filter = [&](const size_t j) { - return (!use_sel || sel->is_member(use_sel == 1 ? ids[j] : j)); - }; - - // the lambda that applies a filtered element. - auto apply = [&](const float dis_in, const size_t j) { - const float dis = accu0 + dis_in; - if (dis > simi[0]) { - int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; - minheap_replace_top(k, simi, idxi, dis, id); - nup++; - } - }; - - // compute distances - fvec_distance_ny_scalar_if( - dc, codes, code_size, list_size, filter, apply); - return nup; - } - - void scan_codes_and_return( - size_t list_size, - const uint8_t* codes, - const float* code_norms, - const idx_t* ids, - std::vector<::knowhere::DistId>& out) const override { - // the lambda that filters acceptable elements. - auto filter = [&](const size_t j) { - return (!use_sel || sel->is_member(use_sel == 1 ? ids[j] : j)); - }; - // the lambda that applies a valid element. 
- auto apply = [&](const float dis_in, const size_t j) { - const float dis = accu0 + dis_in; - out.emplace_back(ids[j], dis); - }; - fvec_distance_ny_scalar_if( - dc, codes, code_size, list_size, filter, apply); - } - - void scan_codes_range( - size_t list_size, - const uint8_t* codes, - const float* code_norms, - const idx_t* ids, - float radius, - RangeQueryResult& res) const override { - for (size_t j = 0; j < list_size; j++, codes += code_size) { - if (use_sel && !sel->is_member(use_sel == 1 ? ids[j] : j)) { - continue; - } - - // todo aguzhva: upgrade - float accu = accu0 + dc.query_to_code(codes); - if (accu > radius) { - int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; - res.add(accu, id); - } - } - } -}; - -/* use_sel = 0: don't check selector - * = 1: check on ids[j] - * = 2: check in j directly (normally ids is nullptr and store_pairs) - */ - -template -struct IVFSQScannerL2 : InvertedListScanner { - DCClass dc; - - bool by_residual; - const Index* quantizer; - const float* x; /// current query - - std::vector tmp; - - IVFSQScannerL2( - int d, - const std::vector& trained, - size_t code_size, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool by_residual) - : dc(d, trained), - by_residual(by_residual), - quantizer(quantizer), - x(nullptr), - tmp(d) { - this->store_pairs = store_pairs; - this->sel = sel; - this->code_size = code_size; - } - - void set_query(const float* query) override { - x = query; - if (!quantizer) { - dc.set_query(query); - } - } - - void set_list(idx_t list_no, float) override { - this->list_no = list_no; - if (by_residual) { - // shift of x_in wrt centroid - quantizer->compute_residual(x, tmp.data(), list_no); - dc.set_query(tmp.data()); - } else { - dc.set_query(x); - } - } - - float distance_to_code(const uint8_t* code) const final { - return dc.query_to_code(code); - } - - size_t scan_codes( - size_t list_size, - const uint8_t* codes, - const float* code_norms, - const idx_t* ids, - float* simi, 
- idx_t* idxi, - size_t k, - size_t& scan_cnt) const override { - size_t nup = 0; - - // // baseline - // for (size_t j = 0; j < list_size; j++, codes += code_size) { - // if (use_sel && !sel->is_member(use_sel == 1 ? ids[j] : j)) { - // continue; - // } - // - // float dis = dc.query_to_code(codes); - // - // if (dis < simi[0]) { - // int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; - // maxheap_replace_top(k, simi, idxi, dis, id); - // nup++; - // } - // } - - // the lambda that filters acceptable elements. - auto filter = - [&](const size_t j) { return (!use_sel || sel->is_member(use_sel == 1 ? ids[j] : j)); }; - - // the lambda that applies a filtered element. - auto apply = - [&](const float dis, const size_t j) { - if (dis < simi[0]) { - int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; - maxheap_replace_top(k, simi, idxi, dis, id); - nup++; - } - }; - - // compute distances - fvec_distance_ny_scalar_if( - dc, codes, code_size, list_size, filter, apply); - - return nup; - } - - void scan_codes_and_return( - size_t list_size, - const uint8_t* codes, - const float* code_norms, - const idx_t* ids, - std::vector<::knowhere::DistId>& out) const override { - // the lambda that filters acceptable elements. - auto filter = [&](const size_t j) { - return (!use_sel || sel->is_member(use_sel == 1 ? ids[j] : j)); - }; - // the lambda that applies a valid element. - auto apply = [&](const float dis_in, const size_t j) { - out.emplace_back(ids[j], dis_in); - }; - fvec_distance_ny_scalar_if( - dc, codes, code_size, list_size, filter, apply); - } - - void scan_codes_range( - size_t list_size, - const uint8_t* codes, - const float* code_norms, - const idx_t* ids, - float radius, - RangeQueryResult& res) const override { - for (size_t j = 0; j < list_size; j++, codes += code_size) { - if (use_sel && !sel->is_member(use_sel == 1 ? 
ids[j] : j)) { - continue; - } - - // todo aguzhva: upgrade - float dis = dc.query_to_code(codes); - if (dis < radius) { - int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; - res.add(dis, id); - } - } - } -}; - -} -} -} \ No newline at end of file diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/index_read.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/index_read.cpp index e2d0e27a5..ae0760929 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/index_read.cpp +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/index_read.cpp @@ -24,9 +24,11 @@ #include #include +#include #include #include #include +#include #include #include #include @@ -38,6 +40,7 @@ #include #include #include +#include #include #include @@ -574,7 +577,9 @@ static void read_ProductLocalSearchQuantizer( } } -static void read_ScalarQuantizer(ScalarQuantizer* ivsc, IOReader* f) { +static void read_ScalarQuantizer( + ::faiss::ScalarQuantizer* ivsc, + IOReader* f) { READ1(ivsc->qtype); READ1(ivsc->rangestat); READ1(ivsc->rangestat_arg); @@ -986,12 +991,30 @@ Index* read_index(IOReader* f, int io_flags) { READVECTOR(idxs->inverse_norms_storage.inverse_l2_norms); idx = idxs; } else if (h == fourcc("IxSQ")) { - IndexScalarQuantizer* idxs = new IndexScalarQuantizer(); + ::faiss::IndexScalarQuantizer* idxs = + new ::faiss::IndexScalarQuantizer(); read_index_header(idxs, f); read_ScalarQuantizer(&idxs->sq, f); read_vector(idxs->codes, f); idxs->code_size = idxs->sq.code_size; - idx = idxs; + // Legacy binary format: the fork used integer 9 as QT_1bit_direct + // for 1-bit HNSW storage; baseline maps the same integer to + // QT_0bit. Compare against the raw integer so we never depend on + // either enum name, and route legacy data to + // IndexBinaryScalarQuantizer. 
+ const int legacy_qt_1bit_direct_marker = 9; + if (static_cast(idxs->sq.qtype) == + legacy_qt_1bit_direct_marker) { + IndexBinaryScalarQuantizer* bsq = new IndexBinaryScalarQuantizer( + static_cast(idxs->d), idxs->metric_type); + bsq->ntotal = idxs->ntotal; + bsq->is_trained = idxs->is_trained; + bsq->codes = std::move(idxs->codes); + delete idxs; + idx = bsq; + } else { + idx = idxs; + } } else if (h == fourcc("IvSQ")) { // legacy IndexIVFScalarQuantizer* ivsc = new IndexIVFScalarQuantizer(); std::vector> ids; @@ -1110,6 +1133,27 @@ Index* read_index(IOReader* f, int io_flags) { if (h == fourcc("IHNp") && !(io_flags & IO_FLAG_PQ_SKIP_SDC_TABLE)) { dynamic_cast(idxhnsw->storage)->pq.compute_sdc_table(); } + // Legacy binary HNSW: IHNs fourcc with an + // IndexBinaryScalarQuantizer inner storage was how the fork used + // to serialize IndexHNSWSQ(QT_1bit_direct, ...). Swap the outer + // wrapper to IndexHNSWBinary so the runtime type reflects the + // data. The on-disk bytes are unchanged by this conversion. 
+ if (h == fourcc("IHNs") && + dynamic_cast(idxhnsw->storage) != + nullptr) { + IndexHNSWBinary* newh = new IndexHNSWBinary(); + newh->d = idxhnsw->d; + newh->ntotal = idxhnsw->ntotal; + newh->is_trained = idxhnsw->is_trained; + newh->metric_type = idxhnsw->metric_type; + newh->hnsw = std::move(idxhnsw->hnsw); + newh->storage = idxhnsw->storage; + newh->own_fields = idxhnsw->own_fields; + idxhnsw->storage = nullptr; + idxhnsw->own_fields = false; + delete idxhnsw; + idxhnsw = newh; + } idx = idxhnsw; } else if (h == fourcc("IwPf")) { IndexIVFPQFastScan* ivpq = new IndexIVFPQFastScan(); diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/index_write.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/index_write.cpp index b1ad3b188..4f0eddbb6 100644 --- a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/index_write.cpp +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/index_write.cpp @@ -23,9 +23,11 @@ #include #include +#include #include #include #include +#include #include #include #include @@ -37,6 +39,7 @@ #include #include #include +#include #include #include @@ -236,7 +239,9 @@ static void write_ProductLocalSearchQuantizer( } } -static void write_ScalarQuantizer(const ScalarQuantizer* ivsc, IOWriter* f) { +static void write_ScalarQuantizer( + const ::faiss::ScalarQuantizer* ivsc, + IOWriter* f) { WRITE1(ivsc->qtype); WRITE1(ivsc->rangestat); WRITE1(ivsc->rangestat_arg); @@ -586,8 +591,39 @@ void write_index(const Index* idx, IOWriter* f, int io_flags) { // inverse norms WRITEVECTOR(idxs->inverse_norms_storage.inverse_l2_norms); } else if ( - const IndexScalarQuantizer* idxs = - dynamic_cast(idx)) { + const IndexBinaryScalarQuantizer* bsq = + dynamic_cast(idx)) { + // Legacy binary serialization: emit the same IxSQ fourcc + SQ + // wire layout that IndexScalarQuantizer(QT_1bit_direct) used to + // produce, so old readers continue to parse it unchanged. The + // trained vector is empty (1-bit-direct has no training data). 
+ // QuantizerType enum integer 9 was fork's QT_1bit_direct; in + // baseline the same integer is QT_0bit. Emit the raw integer so + // the wire format is stable regardless of which enum is in scope. + uint32_t h = fourcc("IxSQ"); + WRITE1(h); + write_index_header(idx, f); + + const int legacy_qt_1bit_direct_marker = 9; + auto legacy_qtype = + static_cast<::faiss::ScalarQuantizer::QuantizerType>( + legacy_qt_1bit_direct_marker); + ::faiss::ScalarQuantizer::RangeStat legacy_rangestat = + ::faiss::ScalarQuantizer::RS_minmax; + float legacy_rangestat_arg = 0.0f; + size_t legacy_d = static_cast(bsq->d); + size_t legacy_code_size = bsq->code_size; + std::vector legacy_trained; + WRITE1(legacy_qtype); + WRITE1(legacy_rangestat); + WRITE1(legacy_rangestat_arg); + WRITE1(legacy_d); + WRITE1(legacy_code_size); + WRITEVECTOR(legacy_trained); + WRITEVECTOR(bsq->codes); + } else if ( + const ::faiss::IndexScalarQuantizer* idxs = + dynamic_cast(idx)) { uint32_t h = fourcc("IxSQ"); WRITE1(h); write_index_header(idx, f); @@ -665,6 +701,11 @@ void write_index(const Index* idx, IOWriter* f, int io_flags) { } else if (const IndexHNSW* idxhnsw = dynamic_cast(idx)) { uint32_t h = dynamic_cast(idx) ? fourcc("IHNf") : dynamic_cast(idx) ? fourcc("IHNp") + // IndexHNSWBinary reuses the legacy IHNs fourcc so + // on-disk bytes match what IndexHNSWSQ(QT_1bit_direct, + // metric) used to produce. Readers dispatch to + // IndexHNSWBinary based on the inner storage type. + : dynamic_cast(idx) ? fourcc("IHNs") : dynamic_cast(idx) ? fourcc("IHNs") : dynamic_cast(idx) ? fourcc("IHN2") : dynamic_cast(idx) ? fourcc("IHNc") diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/sq-avx2-fastpath.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/sq-avx2-fastpath.cpp new file mode 100644 index 000000000..eae1db694 --- /dev/null +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/sq-avx2-fastpath.cpp @@ -0,0 +1,328 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +/****************************************************************************** + * Knowhere-local prelude over baseline sq-avx2.cpp. + * + * See sq-avx512-fastpath.cpp for the full design note on how this pattern + * works (full DCTemplate specialization declared here; baseline .cpp + * included below; template lookup picks our specialization). + * + * This file ports the AVX2 variant of the fork's DistanceComputerSQ4UByte + * for QT_4bit_uniform + L2. + *****************************************************************************/ + +#ifdef COMPILE_SIMD_AVX2 + +#include +#include +#include +#include + +#include +#include +#include + +namespace faiss { + +namespace scalar_quantizer { + +template <> +struct DCTemplate< + QuantizerTemplate< + Codec4bit, + QuantizerTemplateScaling::UNIFORM, + SIMDLevel::AVX2>, + SimilarityL2, + SIMDLevel::AVX2> : SQDistanceComputer { + using Sim = SimilarityL2; + + size_t d; + float vmin; + float vdiff; + float final_scale_sq; + std::vector q_lo; + std::vector q_hi; + + DCTemplate(size_t d_in, const std::vector& trained) + : d(d_in), + vmin(trained[0]), + vdiff(trained[1]), + // Over-allocate by 32 bytes so full 256-bit loads past the + // logical length read a safe zero-filled tail. + q_lo((d_in + 1) / 2 + 32, 0), + q_hi((d_in + 1) / 2 + 32, 0) { + const float final_scale = vdiff / 15.0f; + final_scale_sq = final_scale * final_scale; + } + + void set_query(const float* x) final { + this->q = x; + const float inv_scale = (vdiff == 0.0f) ? 
0.0f : 15.0f / vdiff; + for (size_t i = 0; i < d; i++) { + float val = (x[i] - vmin) * inv_scale; + int code = static_cast(val); + if (code < 0) { + code = 0; + } + if (code > 15) { + code = 15; + } + if (i % 2 == 0) { + q_lo[i / 2] = static_cast(code); + } else { + q_hi[i / 2] = static_cast(code); + } + } + } + + float query_to_code(const uint8_t* code) const final { + const uint8_t* q_lo_ptr = q_lo.data(); + const uint8_t* q_hi_ptr = q_hi.data(); + + __m256i acc = _mm256_setzero_si256(); + const __m256i mask_f = _mm256_set1_epi8(0xF); + const __m256i one = _mm256_set1_epi16(1); + const __m256i zero = _mm256_setzero_si256(); + + size_t i = 0; + // 64 dims per iteration (32 bytes of packed 4-bit codes). + for (; i + 64 <= d; i += 64) { + __m256i c256 = _mm256_loadu_si256( + reinterpret_cast(code + i / 2)); + + __m256i nibbles_lo = _mm256_and_si256(c256, mask_f); + __m256i nibbles_hi = + _mm256_and_si256(_mm256_srli_epi16(c256, 4), mask_f); + + __m256i q_lo_vec = _mm256_loadu_si256( + reinterpret_cast(q_lo_ptr + i / 2)); + __m256i q_hi_vec = _mm256_loadu_si256( + reinterpret_cast(q_hi_ptr + i / 2)); + + __m256i diff_lo = _mm256_sub_epi8(q_lo_vec, nibbles_lo); + __m256i diff_hi = _mm256_sub_epi8(q_hi_vec, nibbles_hi); + + // AVX2 has no _mm256_abs_epi8; emulate via max(x, -x). + diff_lo = _mm256_max_epi8(diff_lo, _mm256_sub_epi8(zero, diff_lo)); + diff_hi = _mm256_max_epi8(diff_hi, _mm256_sub_epi8(zero, diff_hi)); + + __m256i sq_lo = _mm256_maddubs_epi16(diff_lo, diff_lo); + __m256i sq_hi = _mm256_maddubs_epi16(diff_hi, diff_hi); + + __m256i sum_lo = _mm256_madd_epi16(sq_lo, one); + __m256i sum_hi = _mm256_madd_epi16(sq_hi, one); + + acc = _mm256_add_epi32(acc, sum_lo); + acc = _mm256_add_epi32(acc, sum_hi); + } + + // Horizontal reduction. 
+ __m128i acc_lo = _mm256_castsi256_si128(acc); + __m128i acc_hi = _mm256_extracti128_si256(acc, 1); + acc_lo = _mm_add_epi32(acc_lo, acc_hi); + acc_lo = _mm_hadd_epi32(acc_lo, acc_lo); + acc_lo = _mm_hadd_epi32(acc_lo, acc_lo); + int32_t sum = _mm_cvtsi128_si32(acc_lo); + + // Scalar tail. + for (; i < d; i++) { + uint8_t c = code[i / 2]; + uint8_t nibble = (i % 2 == 0) + ? static_cast(c & 0x0F) + : static_cast(c >> 4); + int q_code = (i % 2 == 0) ? q_lo[i / 2] : q_hi[i / 2]; + int diff = q_code - int(nibble); + sum += diff * diff; + } + + return static_cast(sum) * final_scale_sq; + } + + float symmetric_dis(idx_t i, idx_t j) override { + const uint8_t* c1 = codes + i * code_size; + const uint8_t* c2 = codes + j * code_size; + int64_t acc = 0; + for (size_t k = 0; k < d; k++) { + uint8_t a = (k % 2 == 0) + ? static_cast(c1[k / 2] & 0x0F) + : static_cast(c1[k / 2] >> 4); + uint8_t b = (k % 2 == 0) + ? static_cast(c2[k / 2] & 0x0F) + : static_cast(c2[k / 2] >> 4); + int diff = int(a) - int(b); + acc += diff * diff; + } + return static_cast(acc) * final_scale_sq; + } + + /// Batch-4: 128 dims per outer iter, two 64-dim chunks sharing q_lo/q_hi + /// loads across four input codes. Ported verbatim from the fork's AVX2 + /// DistanceComputerSQ4UByte_avx. AVX2 has no abs_epi8 so |diff| is + /// emulated via max(x, -x). + void query_to_codes_batch_4( + const uint8_t* code_0, + const uint8_t* code_1, + const uint8_t* code_2, + const uint8_t* code_3, + float& dis0, + float& dis1, + float& dis2, + float& dis3) const final { + const uint8_t* q_lo_ptr = q_lo.data(); + const uint8_t* q_hi_ptr = q_hi.data(); + + __m256i acc0 = _mm256_setzero_si256(); + __m256i acc1 = _mm256_setzero_si256(); + __m256i acc2 = _mm256_setzero_si256(); + __m256i acc3 = _mm256_setzero_si256(); + + const __m256i mask_f = _mm256_set1_epi8(0x0F); + const __m256i one = _mm256_set1_epi16(1); + const __m256i zero = _mm256_setzero_si256(); + + size_t i = 0; + // 128 dims per outer iter. 
+ for (; i + 128 <= d; i += 128) { + __m256i q_lo_0 = _mm256_loadu_si256( + reinterpret_cast(q_lo_ptr + i / 2)); + __m256i q_hi_0 = _mm256_loadu_si256( + reinterpret_cast(q_hi_ptr + i / 2)); + + auto process_chunk_64 = [&](const uint8_t* code, + __m256i& acc, + __m256i q_lo_v, + __m256i q_hi_v, + int offset) { + __m256i c = _mm256_loadu_si256(reinterpret_cast( + code + i / 2 + offset)); + __m256i nibbles_lo = _mm256_and_si256(c, mask_f); + __m256i nibbles_hi = + _mm256_and_si256(_mm256_srli_epi16(c, 4), mask_f); + + __m256i diff_lo = _mm256_sub_epi8(q_lo_v, nibbles_lo); + __m256i diff_hi = _mm256_sub_epi8(q_hi_v, nibbles_hi); + + diff_lo = _mm256_max_epi8( + diff_lo, _mm256_sub_epi8(zero, diff_lo)); + diff_hi = _mm256_max_epi8( + diff_hi, _mm256_sub_epi8(zero, diff_hi)); + + __m256i sq_lo = _mm256_maddubs_epi16(diff_lo, diff_lo); + __m256i sq_hi = _mm256_maddubs_epi16(diff_hi, diff_hi); + + __m256i sum_lo = _mm256_madd_epi16(sq_lo, one); + __m256i sum_hi = _mm256_madd_epi16(sq_hi, one); + + acc = _mm256_add_epi32(acc, sum_lo); + acc = _mm256_add_epi32(acc, sum_hi); + }; + + process_chunk_64(code_0, acc0, q_lo_0, q_hi_0, 0); + process_chunk_64(code_1, acc1, q_lo_0, q_hi_0, 0); + process_chunk_64(code_2, acc2, q_lo_0, q_hi_0, 0); + process_chunk_64(code_3, acc3, q_lo_0, q_hi_0, 0); + + __m256i q_lo_1 = _mm256_loadu_si256( + reinterpret_cast(q_lo_ptr + i / 2 + 32)); + __m256i q_hi_1 = _mm256_loadu_si256( + reinterpret_cast(q_hi_ptr + i / 2 + 32)); + + process_chunk_64(code_0, acc0, q_lo_1, q_hi_1, 32); + process_chunk_64(code_1, acc1, q_lo_1, q_hi_1, 32); + process_chunk_64(code_2, acc2, q_lo_1, q_hi_1, 32); + process_chunk_64(code_3, acc3, q_lo_1, q_hi_1, 32); + } + + // 64-dim remainder chunk. 
+ if (i + 64 <= d) { + __m256i q_lo_0 = _mm256_loadu_si256( + reinterpret_cast(q_lo_ptr + i / 2)); + __m256i q_hi_0 = _mm256_loadu_si256( + reinterpret_cast(q_hi_ptr + i / 2)); + + auto process = [&](const uint8_t* code, __m256i& acc) { + __m256i c = _mm256_loadu_si256( + reinterpret_cast(code + i / 2)); + __m256i nibbles_lo = _mm256_and_si256(c, mask_f); + __m256i nibbles_hi = + _mm256_and_si256(_mm256_srli_epi16(c, 4), mask_f); + + __m256i diff_lo = _mm256_sub_epi8(q_lo_0, nibbles_lo); + __m256i diff_hi = _mm256_sub_epi8(q_hi_0, nibbles_hi); + + diff_lo = _mm256_max_epi8( + diff_lo, _mm256_sub_epi8(zero, diff_lo)); + diff_hi = _mm256_max_epi8( + diff_hi, _mm256_sub_epi8(zero, diff_hi)); + + __m256i sq_lo = _mm256_maddubs_epi16(diff_lo, diff_lo); + __m256i sq_hi = _mm256_maddubs_epi16(diff_hi, diff_hi); + + __m256i sum_lo = _mm256_madd_epi16(sq_lo, one); + __m256i sum_hi = _mm256_madd_epi16(sq_hi, one); + + acc = _mm256_add_epi32(acc, sum_lo); + acc = _mm256_add_epi32(acc, sum_hi); + }; + + process(code_0, acc0); + process(code_1, acc1); + process(code_2, acc2); + process(code_3, acc3); + + i += 64; + } + + auto reduce = [](const __m256i& acc) -> int32_t { + __m128i acc_lo = _mm256_castsi256_si128(acc); + __m128i acc_hi = _mm256_extracti128_si256(acc, 1); + acc_lo = _mm_add_epi32(acc_lo, acc_hi); + acc_lo = _mm_hadd_epi32(acc_lo, acc_lo); + acc_lo = _mm_hadd_epi32(acc_lo, acc_lo); + return _mm_cvtsi128_si32(acc_lo); + }; + + dis0 = static_cast(reduce(acc0)); + dis1 = static_cast(reduce(acc1)); + dis2 = static_cast(reduce(acc2)); + dis3 = static_cast(reduce(acc3)); + + // Scalar tail. + for (; i < d; i++) { + uint8_t nibble_lo = q_lo[i / 2]; + uint8_t nibble_hi = q_hi[i / 2]; + + auto process_scalar = [&](const uint8_t* code, float& dis) { + uint8_t c = code[i / 2]; + uint8_t nibble = (i % 2 == 0) + ? static_cast(c & 0x0F) + : static_cast(c >> 4); + int q_code = (i % 2 == 0) ? 
nibble_lo : nibble_hi; + int diff = q_code - int(nibble); + dis += static_cast(diff * diff); + }; + + process_scalar(code_0, dis0); + process_scalar(code_1, dis1); + process_scalar(code_2, dis2); + process_scalar(code_3, dis3); + } + + dis0 *= final_scale_sq; + dis1 *= final_scale_sq; + dis2 *= final_scale_sq; + dis3 *= final_scale_sq; + } +}; + +} // namespace scalar_quantizer +} // namespace faiss + +#include "../../../impl/scalar_quantizer/sq-avx2.cpp" + +#endif // COMPILE_SIMD_AVX2 diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/sq-avx512-fastpath.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/sq-avx512-fastpath.cpp new file mode 100644 index 000000000..99159e1b8 --- /dev/null +++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/sq-avx512-fastpath.cpp @@ -0,0 +1,547 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +/****************************************************************************** + * Knowhere-local prelude over baseline sq-avx512.cpp. + * + * What this file does, and why: + * - Declares a FULL template specialization of + * faiss::scalar_quantizer::DCTemplate, AVX512> + * for Q = QuantizerTemplate, UNIFORM, AVX512>. + * - Then textually `#include`s the baseline + * thirdparty/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp. + * - Knowhere's CMake swaps this file in place of that baseline .cpp when + * building the faiss_avx512 object library. + * + * Effect: baseline's sq-dispatch.h dispatcher (included at the bottom of the + * baseline .cpp we pull in below) instantiates DCTemplate<...> for the + * template args the dispatcher writes for QT_4bit_uniform. C++ template + * lookup picks our full specialization because it is strictly more + * specialized than baseline's partial specialization for the AVX512 level. 
+ * Non-matching combinations (other qtypes, IP metric) still resolve to + * baseline's partial specialization — nothing else changes. + * + * IMPORTANT constraint: the full specialization body must NOT contain a + * member of type Quantizer<...>. Inside baseline's sq-avx512.cpp, the AVX512 + * partial specializations of Codec4bit / QuantizerTemplate are declared + * BELOW the point at which we include that file here. At the point of our + * full specialization, those types are incomplete (primary template only), + * so we cannot have a member of that type. Workaround: read `trained[0]` + * and `trained[1]` directly in the constructor. + * + * For a detailed design note see the project plan's §1.2.A. + *****************************************************************************/ + +#ifdef COMPILE_SIMD_AVX512 + +#include +#include +#include +#include + +#include +#include +#include + +namespace faiss { + +namespace scalar_quantizer { + +/************************************************************************* + * QT_4bit_uniform + L2 fast path, AVX512. + * + * Math recap: for UNIFORM 4-bit scaling, + * recon(c) = vmin + vdiff * (c + 0.5) / 15 = final_scale * c + bias + * final_scale = vdiff / 15 + * L2(recon(q), recon(c)) = final_scale^2 * (q_c - c_c)^2 + * + * We pre-nibble the query floats into q_lo / q_hi (even / odd lanes) once + * at set_query time and then compute everything in the int domain, paying + * one float multiply at the end. 
+ ************************************************************************/ + +template <> +struct DCTemplate< + QuantizerTemplate< + Codec4bit, + QuantizerTemplateScaling::UNIFORM, + SIMDLevel::AVX512>, + SimilarityL2, + SIMDLevel::AVX512> : SQDistanceComputer { + using Sim = SimilarityL2; + + size_t d; + float vmin; + float vdiff; + float final_scale_sq; + std::vector q_lo; + std::vector q_hi; + bool has_vnni; + + DCTemplate(size_t d_in, const std::vector& trained) + : d(d_in), + vmin(trained[0]), + vdiff(trained[1]), + // Over-allocate by 64 bytes so full 512-bit loads past the + // logical length are safe (readers mask off unused lanes). + q_lo((d_in + 1) / 2 + 64, 0), + q_hi((d_in + 1) / 2 + 64, 0), + has_vnni(__builtin_cpu_supports("avx512vnni")) { + const float final_scale = vdiff / 15.0f; + final_scale_sq = final_scale * final_scale; + } + + void set_query(const float* x) final { + this->q = x; + const float inv_scale = (vdiff == 0.0f) ? 0.0f : 15.0f / vdiff; + for (size_t i = 0; i < d; i++) { + float val = (x[i] - vmin) * inv_scale; + int code = static_cast(val); + if (code < 0) { + code = 0; + } + if (code > 15) { + code = 15; + } + if (i % 2 == 0) { + q_lo[i / 2] = static_cast(code); + } else { + q_hi[i / 2] = static_cast(code); + } + } + } + + float query_to_code(const uint8_t* code) const final { + __m512i acc = _mm512_setzero_si512(); + const __m512i mask_f = _mm512_set1_epi8(0xF); + const __m512i one = _mm512_set1_epi16(1); + const uint8_t* q_lo_ptr = q_lo.data(); + const uint8_t* q_hi_ptr = q_hi.data(); + + size_t i = 0; + // 128 dims per iteration (64 bytes of packed 4-bit codes). 
+ for (; i + 128 <= d; i += 128) { + __m512i c512 = _mm512_loadu_si512( + reinterpret_cast(code + i / 2)); + + __m512i nibbles_lo = _mm512_and_si512(c512, mask_f); + __m512i nibbles_hi = + _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f); + + __m512i q_lo_vec = _mm512_loadu_si512(q_lo_ptr + i / 2); + __m512i q_hi_vec = _mm512_loadu_si512(q_hi_ptr + i / 2); + + __m512i diff_lo = _mm512_sub_epi8(q_lo_vec, nibbles_lo); + __m512i diff_hi = _mm512_sub_epi8(q_hi_vec, nibbles_hi); + + diff_lo = _mm512_abs_epi8(diff_lo); + diff_hi = _mm512_abs_epi8(diff_hi); + + __m512i sq_lo = _mm512_maddubs_epi16(diff_lo, diff_lo); + __m512i sq_hi = _mm512_maddubs_epi16(diff_hi, diff_hi); + + __m512i sq_sum = _mm512_add_epi16(sq_lo, sq_hi); + __m512i sum_32 = _mm512_madd_epi16(sq_sum, one); + + acc = _mm512_add_epi32(acc, sum_32); + } + + // Tail. q_lo / q_hi are over-allocated so masked loads past the + // logical length read zeros; code is also loaded with mask_even and + // nibbles_hi is masked separately so odd-lane overread is zeroed. + if (i < d) { + size_t rem = d - i; + uint64_t mask_even = (rem + 1) / 2 >= 64 + ? ~0ULL + : (1ULL << ((rem + 1) / 2)) - 1; + uint64_t mask_odd = + rem / 2 >= 64 ? 
~0ULL : (1ULL << (rem / 2)) - 1; + + __m512i c512 = _mm512_maskz_loadu_epi8(mask_even, code + i / 2); + + __m512i nibbles_lo = _mm512_and_si512(c512, mask_f); + __m512i nibbles_hi = + _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f); + + __m512i q_lo_vec = + _mm512_maskz_loadu_epi8(mask_even, q_lo_ptr + i / 2); + __m512i q_hi_vec = + _mm512_maskz_loadu_epi8(mask_odd, q_hi_ptr + i / 2); + + __m512i mask_odd_vec = _mm512_movm_epi8(mask_odd); + nibbles_hi = _mm512_and_si512(nibbles_hi, mask_odd_vec); + + __m512i diff_lo = _mm512_sub_epi8(q_lo_vec, nibbles_lo); + __m512i diff_hi = _mm512_sub_epi8(q_hi_vec, nibbles_hi); + + diff_lo = _mm512_abs_epi8(diff_lo); + diff_hi = _mm512_abs_epi8(diff_hi); + + __m512i sq_lo = _mm512_maddubs_epi16(diff_lo, diff_lo); + __m512i sq_hi = _mm512_maddubs_epi16(diff_hi, diff_hi); + + __m512i sq_sum = _mm512_add_epi16(sq_lo, sq_hi); + __m512i sum_32 = _mm512_madd_epi16(sq_sum, one); + + acc = _mm512_add_epi32(acc, sum_32); + } + + const int32_t sum = _mm512_reduce_add_epi32(acc); + return static_cast(sum) * final_scale_sq; + } + + float symmetric_dis(idx_t i, idx_t j) override { + // Not on the critical query path; scalar version suffices. + const uint8_t* c1 = codes + i * code_size; + const uint8_t* c2 = codes + j * code_size; + int64_t acc = 0; + for (size_t k = 0; k < d; k++) { + uint8_t a = (k % 2 == 0) + ? static_cast(c1[k / 2] & 0x0F) + : static_cast(c1[k / 2] >> 4); + uint8_t b = (k % 2 == 0) + ? static_cast(c2[k / 2] & 0x0F) + : static_cast(c2[k / 2] >> 4); + int diff = int(a) - int(b); + acc += diff * diff; + } + return static_cast(acc) * final_scale_sq; + } + + /// Batch-4 entry point: dispatches to VNNI or non-VNNI path based on + /// runtime CPU capability. Both paths process 256 dims per outer loop + /// iteration by amortizing two q_lo / q_hi chunks across the four + /// input codes. 
+ void query_to_codes_batch_4( + const uint8_t* code_0, + const uint8_t* code_1, + const uint8_t* code_2, + const uint8_t* code_3, + float& dis0, + float& dis1, + float& dis2, + float& dis3) const final { + if (has_vnni) { + query_to_codes_batch_4_vnni( + code_0, code_1, code_2, code_3, dis0, dis1, dis2, dis3); + } else { + query_to_codes_batch_4_avx512( + code_0, code_1, code_2, code_3, dis0, dis1, dis2, dis3); + } + } + + /// VNNI path: uses _mm512_dpbusd_epi32 to fuse square-and-accumulate. + /// Still valid because for 4-bit codes the differences are in [-15, 15] + /// and |diff|^2 fits in u8 × u8 → i32 without overflow. + __attribute__((target("avx512vnni"))) void query_to_codes_batch_4_vnni( + const uint8_t* __restrict code_0, + const uint8_t* __restrict code_1, + const uint8_t* __restrict code_2, + const uint8_t* __restrict code_3, + float& dis0, + float& dis1, + float& dis2, + float& dis3) const { + __m512i acc0 = _mm512_setzero_si512(); + __m512i acc1 = _mm512_setzero_si512(); + __m512i acc2 = _mm512_setzero_si512(); + __m512i acc3 = _mm512_setzero_si512(); + + const __m512i mask_f = _mm512_set1_epi8(0x0F); + const uint8_t* q_lo_ptr = q_lo.data(); + const uint8_t* q_hi_ptr = q_hi.data(); + + size_t i = 0; + // 256 dims per iteration — two 128-dim chunks sharing two q loads. 
+ for (; i + 256 <= d; i += 256) { + __m512i q_lo_0 = _mm512_loadu_si512(q_lo_ptr + i / 2); + __m512i q_hi_0 = _mm512_loadu_si512(q_hi_ptr + i / 2); + __m512i q_lo_1 = _mm512_loadu_si512(q_lo_ptr + i / 2 + 64); + __m512i q_hi_1 = _mm512_loadu_si512(q_hi_ptr + i / 2 + 64); + + auto process_chunk = [&](const uint8_t* code, + __m512i& acc, + __m512i q_lo_v, + __m512i q_hi_v, + int offset) + __attribute__((target("avx512vnni"))) { + __m512i c512 = _mm512_loadu_si512( + reinterpret_cast(code + i / 2 + offset)); + __m512i nibbles_lo = _mm512_and_si512(c512, mask_f); + __m512i nibbles_hi = + _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f); + + __m512i diff_lo = _mm512_sub_epi8(q_lo_v, nibbles_lo); + __m512i diff_hi = _mm512_sub_epi8(q_hi_v, nibbles_hi); + + diff_lo = _mm512_abs_epi8(diff_lo); + diff_hi = _mm512_abs_epi8(diff_hi); + + acc = _mm512_dpbusd_epi32(acc, diff_lo, diff_lo); + acc = _mm512_dpbusd_epi32(acc, diff_hi, diff_hi); + }; + + process_chunk(code_0, acc0, q_lo_0, q_hi_0, 0); + process_chunk(code_1, acc1, q_lo_0, q_hi_0, 0); + process_chunk(code_2, acc2, q_lo_0, q_hi_0, 0); + process_chunk(code_3, acc3, q_lo_0, q_hi_0, 0); + + process_chunk(code_0, acc0, q_lo_1, q_hi_1, 64); + process_chunk(code_1, acc1, q_lo_1, q_hi_1, 64); + process_chunk(code_2, acc2, q_lo_1, q_hi_1, 64); + process_chunk(code_3, acc3, q_lo_1, q_hi_1, 64); + } + + // 128-dim remainder (one q chunk). 
+ if (i + 128 <= d) { + __m512i q_lo_0 = _mm512_loadu_si512(q_lo_ptr + i / 2); + __m512i q_hi_0 = _mm512_loadu_si512(q_hi_ptr + i / 2); + + auto process_chunk = [&](const uint8_t* code, __m512i& acc) + __attribute__((target("avx512vnni"))) { + __m512i c512 = _mm512_loadu_si512( + reinterpret_cast(code + i / 2)); + __m512i nibbles_lo = _mm512_and_si512(c512, mask_f); + __m512i nibbles_hi = + _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f); + + __m512i diff_lo = _mm512_sub_epi8(q_lo_0, nibbles_lo); + __m512i diff_hi = _mm512_sub_epi8(q_hi_0, nibbles_hi); + + diff_lo = _mm512_abs_epi8(diff_lo); + diff_hi = _mm512_abs_epi8(diff_hi); + + acc = _mm512_dpbusd_epi32(acc, diff_lo, diff_lo); + acc = _mm512_dpbusd_epi32(acc, diff_hi, diff_hi); + }; + + process_chunk(code_0, acc0); + process_chunk(code_1, acc1); + process_chunk(code_2, acc2); + process_chunk(code_3, acc3); + + i += 128; + } + + // Sub-128-dim tail with masked loads. + if (i < d) { + size_t rem = d - i; + uint64_t mask_even = (rem + 1) / 2 >= 64 + ? ~0ULL + : (1ULL << ((rem + 1) / 2)) - 1; + uint64_t mask_odd = + rem / 2 >= 64 ? 
~0ULL : (1ULL << (rem / 2)) - 1; + + __m512i q_lo_vec = + _mm512_maskz_loadu_epi8(mask_even, q_lo_ptr + i / 2); + __m512i q_hi_vec = + _mm512_maskz_loadu_epi8(mask_odd, q_hi_ptr + i / 2); + __m512i mask_odd_vec = _mm512_movm_epi8(mask_odd); + + auto process = [&](const uint8_t* code, __m512i& acc) + __attribute__((target("avx512vnni"))) { + __m512i c512 = + _mm512_maskz_loadu_epi8(mask_even, code + i / 2); + __m512i nibbles_lo = _mm512_and_si512(c512, mask_f); + __m512i nibbles_hi = + _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f); + nibbles_hi = _mm512_and_si512(nibbles_hi, mask_odd_vec); + + __m512i diff_lo = _mm512_sub_epi8(q_lo_vec, nibbles_lo); + __m512i diff_hi = _mm512_sub_epi8(q_hi_vec, nibbles_hi); + + diff_lo = _mm512_abs_epi8(diff_lo); + diff_hi = _mm512_abs_epi8(diff_hi); + + acc = _mm512_dpbusd_epi32(acc, diff_lo, diff_lo); + acc = _mm512_dpbusd_epi32(acc, diff_hi, diff_hi); + }; + + process(code_0, acc0); + process(code_1, acc1); + process(code_2, acc2); + process(code_3, acc3); + } + + dis0 = static_cast(_mm512_reduce_add_epi32(acc0)) * + final_scale_sq; + dis1 = static_cast(_mm512_reduce_add_epi32(acc1)) * + final_scale_sq; + dis2 = static_cast(_mm512_reduce_add_epi32(acc2)) * + final_scale_sq; + dis3 = static_cast(_mm512_reduce_add_epi32(acc3)) * + final_scale_sq; + } + + /// Non-VNNI path: squares via _mm512_maddubs_epi16 (u8×u8 → i16) and + /// accumulates to i32 with _mm512_madd_epi16. Same 256-dim outer loop. 
+    /// Batched L2 distance from the pre-quantized query to four packed 4-bit
+    /// codes.
+    ///
+    /// The query nibbles (q_lo: even dimensions, q_hi: odd dimensions) are
+    /// loaded once per chunk and reused for all four codes. Work is split
+    /// into 256-dimension iterations (two 64-byte loads per code), at most
+    /// one 128-dimension step, and a masked tail for the remaining
+    /// (< 128) dimensions. Sums stay in the integer domain; final_scale_sq
+    /// is applied once at the end.
+    ///
+    /// @param code_0,code_1,code_2,code_3  packed codes, two dimensions per
+    ///        byte (even dimension in the low nibble)
+    /// @param dis0,dis1,dis2,dis3  receive the four scaled distances
+    void query_to_codes_batch_4_avx512(
+            const uint8_t* __restrict code_0,
+            const uint8_t* __restrict code_1,
+            const uint8_t* __restrict code_2,
+            const uint8_t* __restrict code_3,
+            float& dis0,
+            float& dis1,
+            float& dis2,
+            float& dis3) const {
+        __m512i acc0 = _mm512_setzero_si512();
+        __m512i acc1 = _mm512_setzero_si512();
+        __m512i acc2 = _mm512_setzero_si512();
+        __m512i acc3 = _mm512_setzero_si512();
+
+        const __m512i mask_f = _mm512_set1_epi8(0x0F);
+        const __m512i one = _mm512_set1_epi16(1);
+        const uint8_t* q_lo_ptr = q_lo.data();
+        const uint8_t* q_hi_ptr = q_hi.data();
+
+        // acc += (q_lo - nibbles_lo)^2 + (q_hi - nibbles_hi)^2, reduced to
+        // 32-bit lanes. Code nibbles are masked to [0, 15]; the query bytes
+        // are presumably quantized to the same range by set_query (not
+        // visible in this hunk), so |diff| <= 15 and maddubs cannot
+        // saturate.
+        auto accumulate = [&](__m512i& acc,
+                              __m512i q_lo_v,
+                              __m512i q_hi_v,
+                              __m512i nibbles_lo,
+                              __m512i nibbles_hi) {
+            const __m512i diff_lo =
+                    _mm512_abs_epi8(_mm512_sub_epi8(q_lo_v, nibbles_lo));
+            const __m512i diff_hi =
+                    _mm512_abs_epi8(_mm512_sub_epi8(q_hi_v, nibbles_hi));
+
+            const __m512i sq_lo = _mm512_maddubs_epi16(diff_lo, diff_lo);
+            const __m512i sq_hi = _mm512_maddubs_epi16(diff_hi, diff_hi);
+
+            acc = _mm512_add_epi32(acc, _mm512_madd_epi16(sq_lo, one));
+            acc = _mm512_add_epi32(acc, _mm512_madd_epi16(sq_hi, one));
+        };
+
+        size_t i = 0;
+
+        // One full 64-byte chunk (128 dimensions) of `code`, starting at
+        // byte i / 2 + offset. Shared by the 256- and 128-dimension stages.
+        auto process_chunk = [&](const uint8_t* code,
+                                 __m512i& acc,
+                                 __m512i q_lo_v,
+                                 __m512i q_hi_v,
+                                 size_t offset) {
+            const __m512i c512 = _mm512_loadu_si512(code + i / 2 + offset);
+            accumulate(
+                    acc,
+                    q_lo_v,
+                    q_hi_v,
+                    _mm512_and_si512(c512, mask_f),
+                    _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f));
+        };
+
+        for (; i + 256 <= d; i += 256) {
+            const __m512i q_lo_0 = _mm512_loadu_si512(q_lo_ptr + i / 2);
+            const __m512i q_hi_0 = _mm512_loadu_si512(q_hi_ptr + i / 2);
+            const __m512i q_lo_1 = _mm512_loadu_si512(q_lo_ptr + i / 2 + 64);
+            const __m512i q_hi_1 = _mm512_loadu_si512(q_hi_ptr + i / 2 + 64);
+
+            process_chunk(code_0, acc0, q_lo_0, q_hi_0, 0);
+            process_chunk(code_1, acc1, q_lo_0, q_hi_0, 0);
+            process_chunk(code_2, acc2, q_lo_0, q_hi_0, 0);
+            process_chunk(code_3, acc3, q_lo_0, q_hi_0, 0);
+
+            process_chunk(code_0, acc0, q_lo_1, q_hi_1, 64);
+            process_chunk(code_1, acc1, q_lo_1, q_hi_1, 64);
+            process_chunk(code_2, acc2, q_lo_1, q_hi_1, 64);
+            process_chunk(code_3, acc3, q_lo_1, q_hi_1, 64);
+        }
+
+        if (i + 128 <= d) {
+            const __m512i q_lo_0 = _mm512_loadu_si512(q_lo_ptr + i / 2);
+            const __m512i q_hi_0 = _mm512_loadu_si512(q_hi_ptr + i / 2);
+
+            process_chunk(code_0, acc0, q_lo_0, q_hi_0, 0);
+            process_chunk(code_1, acc1, q_lo_0, q_hi_0, 0);
+            process_chunk(code_2, acc2, q_lo_0, q_hi_0, 0);
+            process_chunk(code_3, acc3, q_lo_0, q_hi_0, 0);
+
+            i += 128;
+        }
+
+        if (i < d) {
+            // Fewer than 128 dimensions remain, i.e. at most 64 bytes.
+            const size_t rem = d - i;
+            // mask_even selects the bytes that hold an even (low-nibble)
+            // dimension, mask_odd those that also hold an odd one.
+            const uint64_t mask_even = (rem + 1) / 2 >= 64
+                    ? ~0ULL
+                    : (1ULL << ((rem + 1) / 2)) - 1;
+            const uint64_t mask_odd =
+                    rem / 2 >= 64 ? ~0ULL : (1ULL << (rem / 2)) - 1;
+
+            const __m512i q_lo_vec =
+                    _mm512_maskz_loadu_epi8(mask_even, q_lo_ptr + i / 2);
+            const __m512i q_hi_vec =
+                    _mm512_maskz_loadu_epi8(mask_odd, q_hi_ptr + i / 2);
+            const __m512i mask_odd_vec = _mm512_movm_epi8(mask_odd);
+
+            auto process_tail = [&](const uint8_t* code, __m512i& acc) {
+                const __m512i c512 =
+                        _mm512_maskz_loadu_epi8(mask_even, code + i / 2);
+                const __m512i nibbles_lo = _mm512_and_si512(c512, mask_f);
+                // Drop the high nibble of a trailing half-used byte.
+                const __m512i nibbles_hi = _mm512_and_si512(
+                        _mm512_and_si512(_mm512_srli_epi16(c512, 4), mask_f),
+                        mask_odd_vec);
+                accumulate(acc, q_lo_vec, q_hi_vec, nibbles_lo, nibbles_hi);
+            };
+
+            process_tail(code_0, acc0);
+            process_tail(code_1, acc1);
+            process_tail(code_2, acc2);
+            process_tail(code_3, acc3);
+        }
+
+        dis0 = static_cast<float>(_mm512_reduce_add_epi32(acc0)) *
+                final_scale_sq;
+        dis1 = static_cast<float>(_mm512_reduce_add_epi32(acc1)) *
+                final_scale_sq;
+        dis2 = static_cast<float>(_mm512_reduce_add_epi32(acc2)) *
+                final_scale_sq;
+        dis3 = static_cast<float>(_mm512_reduce_add_epi32(acc3)) *
+                final_scale_sq;
+    }
+};
+
+} // namespace scalar_quantizer
+} // namespace faiss
+
+// Pull in baseline's sq-avx512.cpp. Its AVX512 partial specializations of
+// Codec / QuantizerTemplate / Similarity / DCTemplate, its Similarity
+// structs, and its dispatcher instantiation all come online after this
+// point. Our full specialization above is already visible, so at the
+// instantiation moment inside sq-dispatch.h, C++ template lookup selects
+// it over the partial one.
+#include "../../../impl/scalar_quantizer/sq-avx512.cpp"
+
+#endif // COMPILE_SIMD_AVX512
diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/sq-neon-fastpath.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/sq-neon-fastpath.cpp
new file mode 100644
index 000000000..39bb7fb43
--- /dev/null
+++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/impl/sq-neon-fastpath.cpp
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/******************************************************************************
+ * Knowhere-local prelude over baseline sq-neon.cpp.
+ *
+ * See sq-avx512-fastpath.cpp for the full design note on how this pattern
+ * works. This file ports the NEON variant of the fork's
+ * DistanceComputerSQ4UByte for QT_4bit_uniform + L2.
+ *****************************************************************************/
+
+#ifdef COMPILE_SIMD_ARM_NEON
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+// NOTE(review): the three project header names below were lost in this
+// copy of the patch; restore them from the upstream commit.
+#include
+#include
+#include
+
+namespace faiss {
+
+namespace scalar_quantizer {
+
+/// Integer-domain L2 distance computer for QT_4bit_uniform codes (NEON).
+///
+/// The query is quantized once to 4-bit codes (even dimensions in q_lo, odd
+/// dimensions in q_hi); distances are sums of squared code differences
+/// scaled by (vdiff / 15)^2.
+template <>
+struct DCTemplate<
+        QuantizerTemplate<
+                Codec4bit<SIMDLevel::ARM_NEON>,
+                QuantizerTemplateScaling::UNIFORM,
+                SIMDLevel::ARM_NEON>,
+        SimilarityL2<SIMDLevel::ARM_NEON>,
+        SIMDLevel::ARM_NEON> : SQDistanceComputer {
+    using Sim = SimilarityL2<SIMDLevel::ARM_NEON>;
+
+    size_t d;
+    float vmin;
+    float vdiff;
+    float final_scale_sq;
+    std::vector<uint8_t> q_lo;
+    std::vector<uint8_t> q_hi;
+
+    DCTemplate(size_t d_in, const std::vector<float>& trained)
+            : d(d_in),
+              vmin(trained[0]),
+              vdiff(trained[1]),
+              // Over-allocate by 16 bytes for safe 128-bit vld1q_u8 past
+              // the logical length (readers ignore out-of-range lanes).
+              q_lo((d_in + 1) / 2 + 16, 0),
+              q_hi((d_in + 1) / 2 + 16, 0) {
+        const float final_scale = vdiff / 15.0f;
+        final_scale_sq = final_scale * final_scale;
+    }
+
+    /// Quantize the query to 4-bit codes, splitting even / odd dimensions
+    /// into q_lo / q_hi so they line up with the low / high code nibbles.
+    void set_query(const float* x) final {
+        this->q = x;
+        const float inv_scale = (vdiff == 0.0f) ? 0.0f : 15.0f / vdiff;
+        for (size_t i = 0; i < d; i++) {
+            const float val = (x[i] - vmin) * inv_scale;
+            // Clamp in the float domain before converting: casting an
+            // out-of-range or NaN float to int is undefined behaviour.
+            // In-range values truncate exactly as before.
+            int code = 0;
+            if (val >= 15.0f) {
+                code = 15;
+            } else if (val > 0.0f) {
+                code = static_cast<int>(val);
+            }
+            if (i % 2 == 0) {
+                q_lo[i / 2] = static_cast<uint8_t>(code);
+            } else {
+                q_hi[i / 2] = static_cast<uint8_t>(code);
+            }
+        }
+    }
+
+    /// Distance from the current query to one packed 4-bit code.
+    float query_to_code(const uint8_t* code) const final {
+        const uint8_t* q_lo_ptr = q_lo.data();
+        const uint8_t* q_hi_ptr = q_hi.data();
+
+        uint32x4_t acc = vdupq_n_u32(0);
+        const uint8x16_t mask_f = vdupq_n_u8(0x0F);
+
+        size_t i = 0;
+        // 32 dims per iteration (16 bytes of packed 4-bit codes).
+        for (; i + 32 <= d; i += 32) {
+            uint8x16_t c = vld1q_u8(code + i / 2);
+
+            uint8x16_t nibbles_lo = vandq_u8(c, mask_f);
+            uint8x16_t nibbles_hi = vandq_u8(vshrq_n_u8(c, 4), mask_f);
+
+            uint8x16_t q_lo_vec = vld1q_u8(q_lo_ptr + i / 2);
+            uint8x16_t q_hi_vec = vld1q_u8(q_hi_ptr + i / 2);
+
+            uint8x16_t diff_lo = vabdq_u8(q_lo_vec, nibbles_lo);
+            uint8x16_t diff_hi = vabdq_u8(q_hi_vec, nibbles_hi);
+
+            // Widen + square — each byte in [0, 15] so squared fits in u16.
+            uint16x8_t sq_lo_1 =
+                    vmull_u8(vget_low_u8(diff_lo), vget_low_u8(diff_lo));
+            uint16x8_t sq_lo_2 =
+                    vmull_u8(vget_high_u8(diff_lo), vget_high_u8(diff_lo));
+            uint16x8_t sq_hi_1 =
+                    vmull_u8(vget_low_u8(diff_hi), vget_low_u8(diff_hi));
+            uint16x8_t sq_hi_2 =
+                    vmull_u8(vget_high_u8(diff_hi), vget_high_u8(diff_hi));
+
+            acc = vpadalq_u16(acc, sq_lo_1);
+            acc = vpadalq_u16(acc, sq_lo_2);
+            acc = vpadalq_u16(acc, sq_hi_1);
+            acc = vpadalq_u16(acc, sq_hi_2);
+        }
+
+        uint32_t sum = vaddvq_u32(acc);
+
+        // Scalar tail.
+        for (; i < d; i++) {
+            uint8_t c = code[i / 2];
+            uint8_t nibble = (i % 2 == 0)
+                    ? static_cast<uint8_t>(c & 0x0F)
+                    : static_cast<uint8_t>(c >> 4);
+            int q_code = (i % 2 == 0) ? q_lo[i / 2] : q_hi[i / 2];
+            int diff = q_code - int(nibble);
+            sum += diff * diff;
+        }
+
+        return static_cast<float>(sum) * final_scale_sq;
+    }
+
+    /// Distance between stored codes i and j, computed nibble by nibble.
+    float symmetric_dis(idx_t i, idx_t j) override {
+        const uint8_t* c1 = codes + i * code_size;
+        const uint8_t* c2 = codes + j * code_size;
+        int64_t acc = 0;
+        for (size_t k = 0; k < d; k++) {
+            uint8_t a = (k % 2 == 0)
+                    ? static_cast<uint8_t>(c1[k / 2] & 0x0F)
+                    : static_cast<uint8_t>(c1[k / 2] >> 4);
+            uint8_t b = (k % 2 == 0)
+                    ? static_cast<uint8_t>(c2[k / 2] & 0x0F)
+                    : static_cast<uint8_t>(c2[k / 2] >> 4);
+            int diff = int(a) - int(b);
+            acc += diff * diff;
+        }
+        return static_cast<float>(acc) * final_scale_sq;
+    }
+
+    /// Batch-4: 32 dims per outer iter with four parallel u32 accumulators,
+    /// amortizing the q_lo / q_hi load across four input codes. Ported
+    /// verbatim from the fork's NEON DistanceComputerSQ4UByte_neon.
+    void query_to_codes_batch_4(
+            const uint8_t* code_0,
+            const uint8_t* code_1,
+            const uint8_t* code_2,
+            const uint8_t* code_3,
+            float& dis0,
+            float& dis1,
+            float& dis2,
+            float& dis3) const final {
+        uint32x4_t acc0 = vdupq_n_u32(0);
+        uint32x4_t acc1 = vdupq_n_u32(0);
+        uint32x4_t acc2 = vdupq_n_u32(0);
+        uint32x4_t acc3 = vdupq_n_u32(0);
+
+        const uint8x16_t mask_f = vdupq_n_u8(0x0F);
+        const uint8_t* q_lo_ptr = q_lo.data();
+        const uint8_t* q_hi_ptr = q_hi.data();
+
+        size_t i = 0;
+        for (; i + 32 <= d; i += 32) {
+            uint8x16_t q_lo_vec = vld1q_u8(q_lo_ptr + i / 2);
+            uint8x16_t q_hi_vec = vld1q_u8(q_hi_ptr + i / 2);
+
+            auto process = [&](const uint8_t* code, uint32x4_t& acc) {
+                uint8x16_t c = vld1q_u8(code + i / 2);
+                uint8x16_t nibbles_lo = vandq_u8(c, mask_f);
+                uint8x16_t nibbles_hi = vandq_u8(vshrq_n_u8(c, 4), mask_f);
+
+                uint8x16_t diff_lo = vabdq_u8(q_lo_vec, nibbles_lo);
+                uint8x16_t diff_hi = vabdq_u8(q_hi_vec, nibbles_hi);
+
+                uint16x8_t sq_lo_1 =
+                        vmull_u8(vget_low_u8(diff_lo), vget_low_u8(diff_lo));
+                uint16x8_t sq_lo_2 =
+                        vmull_u8(vget_high_u8(diff_lo), vget_high_u8(diff_lo));
+                uint16x8_t sq_hi_1 =
+                        vmull_u8(vget_low_u8(diff_hi), vget_low_u8(diff_hi));
+                uint16x8_t sq_hi_2 =
+                        vmull_u8(vget_high_u8(diff_hi), vget_high_u8(diff_hi));
+
+                acc = vpadalq_u16(acc, sq_lo_1);
+                acc = vpadalq_u16(acc, sq_lo_2);
+                acc = vpadalq_u16(acc, sq_hi_1);
+                acc = vpadalq_u16(acc, sq_hi_2);
+            };
+
+            process(code_0, acc0);
+            process(code_1, acc1);
+            process(code_2, acc2);
+            process(code_3, acc3);
+        }
+
+        dis0 = static_cast<float>(vaddvq_u32(acc0));
+        dis1 = static_cast<float>(vaddvq_u32(acc1));
+        dis2 = static_cast<float>(vaddvq_u32(acc2));
+        dis3 = static_cast<float>(vaddvq_u32(acc3));
+
+        // Scalar tail.
+        if (i < d) {
+            size_t rem = d - i;
+            for (size_t j = 0; j < rem; j++) {
+                size_t idx = i + j;
+                uint8_t nibble_lo = q_lo[idx / 2];
+                uint8_t nibble_hi = q_hi[idx / 2];
+
+                auto process_scalar = [&](const uint8_t* code, float& dis) {
+                    uint8_t c = code[idx / 2];
+                    uint8_t nibble = (idx % 2 == 0)
+                            ? static_cast<uint8_t>(c & 0x0F)
+                            : static_cast<uint8_t>(c >> 4);
+                    int q_code = (idx % 2 == 0) ? nibble_lo : nibble_hi;
+                    int diff = q_code - int(nibble);
+                    dis += static_cast<float>(diff * diff);
+                };
+
+                process_scalar(code_0, dis0);
+                process_scalar(code_1, dis1);
+                process_scalar(code_2, dis2);
+                process_scalar(code_3, dis3);
+            }
+        }
+
+        dis0 *= final_scale_sq;
+        dis1 *= final_scale_sq;
+        dis2 *= final_scale_sq;
+        dis3 *= final_scale_sq;
+    }
+};
+
+} // namespace scalar_quantizer
+} // namespace faiss
+
+#include "../../../impl/scalar_quantizer/sq-neon.cpp"
+
+#endif // COMPILE_SIMD_ARM_NEON
diff --git a/thirdparty/faiss/faiss/cppcontrib/knowhere/utils/distances.cpp b/thirdparty/faiss/faiss/cppcontrib/knowhere/utils/distances.cpp
index 23ac36f34..ef15f6366 100644
--- a/thirdparty/faiss/faiss/cppcontrib/knowhere/utils/distances.cpp
+++ b/thirdparty/faiss/faiss/cppcontrib/knowhere/utils/distances.cpp
@@ -22,8 +22,8 @@
 #include "knowhere/bitsetview_idselector.h"
 #include "knowhere/object.h"
 
-#include
 #include
+#include "simd/hook.h"
 #include
 #include
 #include
diff --git a/thirdparty/faiss/faiss/impl/ScalarQuantizer.h
b/thirdparty/faiss/faiss/impl/ScalarQuantizer.h
index 25bc3db3e..8fc44c805 100644
--- a/thirdparty/faiss/faiss/impl/ScalarQuantizer.h
+++ b/thirdparty/faiss/faiss/impl/ScalarQuantizer.h
@@ -102,6 +102,25 @@ struct ScalarQuantizer : Quantizer {
 
         virtual float query_to_code(const uint8_t* code) const = 0;
 
+        /// Compute four query-to-code distances in one call. Default loops
+        /// query_to_code four times; per-SIMD specializations may batch the
+        /// inner dim loop across the four codes to amortize query state and
+        /// expose ILP across independent accumulators.
+        virtual void query_to_codes_batch_4(
+                const uint8_t* code_0,
+                const uint8_t* code_1,
+                const uint8_t* code_2,
+                const uint8_t* code_3,
+                float& dis0,
+                float& dis1,
+                float& dis2,
+                float& dis3) const {
+            dis0 = query_to_code(code_0);
+            dis1 = query_to_code(code_1);
+            dis2 = query_to_code(code_2);
+            dis3 = query_to_code(code_3);
+        }
+
         float distance_to_code(const uint8_t* code) final {
             return query_to_code(code);
         }
diff --git a/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp b/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp
index ac31c99cc..708288bb3 100644
--- a/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp
+++ b/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp
@@ -387,6 +387,46 @@ struct DCTemplate : SQDistanceComputer {
     float query_to_code(const uint8_t* code) const final {
         return compute_distance(q, code);
     }
+
+    /// Four query-to-code distances in one pass: decodes 8 components of
+    /// each code per iteration and feeds four independent Similarity
+    /// accumulators that share the same query.
+    void query_to_codes_batch_4(
+            const uint8_t* code_0,
+            const uint8_t* code_1,
+            const uint8_t* code_2,
+            const uint8_t* code_3,
+            float& dis0,
+            float& dis1,
+            float& dis2,
+            float& dis3) const final {
+        Similarity sim0(q);
+        Similarity sim1(q);
+        Similarity sim2(q);
+        Similarity sim3(q);
+
+        sim0.begin_8();
+        sim1.begin_8();
+        sim2.begin_8();
+        sim3.begin_8();
+
+        for (size_t i = 0; i < quant.d; i += 8) {
+            const int ii = static_cast<int>(i);
+            simd8float32 xi0 = quant.reconstruct_8_components(code_0, ii);
+            simd8float32 xi1 = quant.reconstruct_8_components(code_1, ii);
+            simd8float32 xi2 = quant.reconstruct_8_components(code_2, ii);
+            simd8float32 xi3 = quant.reconstruct_8_components(code_3, ii);
+            sim0.add_8_components(xi0);
+            sim1.add_8_components(xi1);
+            sim2.add_8_components(xi2);
+            sim3.add_8_components(xi3);
+        }
+
+        dis0 = sim0.result_8();
+        dis1 = sim1.result_8();
+        dis2 = sim2.result_8();
+        dis3 = sim3.result_8();
+    }
 };
 
 template
diff --git a/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp b/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp
index f85f8bbb8..77abffb36 100644
--- a/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp
+++ b/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp
@@ -368,6 +368,45 @@ struct DCTemplate
     float query_to_code(const uint8_t* code) const final {
         return compute_distance(q, code);
     }
+
+    /// Four query-to-code distances in one pass: decodes 16 components of
+    /// each code per iteration and feeds four independent Similarity
+    /// accumulators that share the same query.
+    void query_to_codes_batch_4(
+            const uint8_t* code_0,
+            const uint8_t* code_1,
+            const uint8_t* code_2,
+            const uint8_t* code_3,
+            float& dis0,
+            float& dis1,
+            float& dis2,
+            float& dis3) const final {
+        Similarity sim0(q);
+        Similarity sim1(q);
+        Similarity sim2(q);
+        Similarity sim3(q);
+
+        sim0.begin_16();
+        sim1.begin_16();
+        sim2.begin_16();
+        sim3.begin_16();
+
+        for (size_t i = 0; i < quant.d; i += 16) {
+            simd16float32 xi0 = quant.reconstruct_16_components(code_0, i);
+            simd16float32 xi1 = quant.reconstruct_16_components(code_1, i);
+            simd16float32 xi2 = quant.reconstruct_16_components(code_2, i);
+            simd16float32 xi3 = quant.reconstruct_16_components(code_3, i);
+            sim0.add_16_components(xi0);
+            sim1.add_16_components(xi1);
+            sim2.add_16_components(xi2);
+            sim3.add_16_components(xi3);
+        }
+
+        dis0 = sim0.result_16();
+        dis1 = sim1.result_16();
+        dis2 = sim2.result_16();
+        dis3 = sim3.result_16();
+    }
 };
 
 template
diff --git a/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp b/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp
index 7d895941d..9be9f4a31 100644
---
a/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp
+++ b/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp
@@ -358,6 +358,45 @@ struct DCTemplate
     float query_to_code(const uint8_t* code) const final {
         return compute_distance(q, code);
     }
+
+    /// Four query-to-code distances in one pass: decodes 8 components of
+    /// each code per iteration and feeds four independent Similarity
+    /// accumulators that share the same query.
+    void query_to_codes_batch_4(
+            const uint8_t* code_0,
+            const uint8_t* code_1,
+            const uint8_t* code_2,
+            const uint8_t* code_3,
+            float& dis0,
+            float& dis1,
+            float& dis2,
+            float& dis3) const final {
+        Similarity sim0(q);
+        Similarity sim1(q);
+        Similarity sim2(q);
+        Similarity sim3(q);
+
+        sim0.begin_8();
+        sim1.begin_8();
+        sim2.begin_8();
+        sim3.begin_8();
+
+        for (size_t i = 0; i < quant.d; i += 8) {
+            simd8float32 xi0 = quant.reconstruct_8_components(code_0, i);
+            simd8float32 xi1 = quant.reconstruct_8_components(code_1, i);
+            simd8float32 xi2 = quant.reconstruct_8_components(code_2, i);
+            simd8float32 xi3 = quant.reconstruct_8_components(code_3, i);
+            sim0.add_8_components(xi0);
+            sim1.add_8_components(xi1);
+            sim2.add_8_components(xi2);
+            sim3.add_8_components(xi3);
+        }
+
+        dis0 = sim0.result_8();
+        dis1 = sim1.result_8();
+        dis2 = sim2.result_8();
+        dis3 = sim3.result_8();
+    }
 };
 
 template
diff --git a/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp b/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp
new file mode 100644
index 000000000..6fe99f8d1
--- /dev/null
+++ b/thirdparty/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp
@@ -0,0 +1,315 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifdef COMPILE_SIMD_RISCV_RVV
+
+#include <riscv_vector.h>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+// NOTE(review): the two project header names below were lost in this
+// copy of the patch; restore them from the upstream commit.
+#include
+#include
+
+namespace faiss {
+
+namespace scalar_quantizer {
+
+/*************************************************************************
+ * Marker specializations.
+ *
+ * Unlike x86/NEON sq-*.cpp files that expose a fixed 8-wide / 16-wide codec
+ * interface (reconstruct_8_components / reconstruct_16_components), RVV is
+ * variable-width: the native vector length is implementation-defined and
+ * queried at runtime via __riscv_vsetvl. Forcing RVV into a fixed-width
+ * codec would leave performance on the table on wider hardware.
+ *
+ * So the strategy here is: Codec / Quantizer / Similarity classes for
+ * RISCV_RVV act as opaque TAG TYPES — they only need to be complete types
+ * so that baseline's sq-dispatch.h can form template arguments like
+ * `DCTemplate<QuantizerTemplate<Codec4bit<RISCV_RVV>, UNIFORM, RISCV_RVV>,
+ *             SimilarityL2<RISCV_RVV>, RISCV_RVV>`.
+ *
+ * The real SIMD work lives in full DCTemplate specializations below.
+ * Unspecialized combinations fall through to scalar via the fallback
+ * `DCTemplate<Q, S, RISCV_RVV> : DCTemplate<Q, S, NONE>`.
+ ************************************************************************/
+
+template <>
+struct Codec8bit<SIMDLevel::RISCV_RVV> : Codec8bit<SIMDLevel::NONE> {};
+
+template <>
+struct Codec4bit<SIMDLevel::RISCV_RVV> : Codec4bit<SIMDLevel::NONE> {};
+
+template <>
+struct Codec6bit<SIMDLevel::RISCV_RVV> : Codec6bit<SIMDLevel::NONE> {};
+
+template <class Codec>
+struct QuantizerTemplate<
+        Codec,
+        QuantizerTemplateScaling::UNIFORM,
+        SIMDLevel::RISCV_RVV>
+        : QuantizerTemplate<
+                  Codec,
+                  QuantizerTemplateScaling::UNIFORM,
+                  SIMDLevel::NONE> {
+    QuantizerTemplate(size_t d, const std::vector<float>& trained)
+            : QuantizerTemplate<
+                      Codec,
+                      QuantizerTemplateScaling::UNIFORM,
+                      SIMDLevel::NONE>(d, trained) {}
+};
+
+template <class Codec>
+struct QuantizerTemplate<
+        Codec,
+        QuantizerTemplateScaling::NON_UNIFORM,
+        SIMDLevel::RISCV_RVV>
+        : QuantizerTemplate<
+                  Codec,
+                  QuantizerTemplateScaling::NON_UNIFORM,
+                  SIMDLevel::NONE> {
+    QuantizerTemplate(size_t d, const std::vector<float>& trained)
+            : QuantizerTemplate<
+                      Codec,
+                      QuantizerTemplateScaling::NON_UNIFORM,
+                      SIMDLevel::NONE>(d, trained) {}
+};
+
+template <>
+struct QuantizerFP16<SIMDLevel::RISCV_RVV> : QuantizerFP16<SIMDLevel::NONE> {
+    QuantizerFP16(size_t d, const std::vector<float>& trained)
+            : QuantizerFP16<SIMDLevel::NONE>(d, trained) {}
+};
+
+template <>
+struct QuantizerBF16<SIMDLevel::RISCV_RVV> : QuantizerBF16<SIMDLevel::NONE> {
+    QuantizerBF16(size_t d, const std::vector<float>& trained)
+            : QuantizerBF16<SIMDLevel::NONE>(d, trained) {}
+};
+
+template <>
+struct Quantizer8bitDirect<SIMDLevel::RISCV_RVV>
+        : Quantizer8bitDirect<SIMDLevel::NONE> {
+    Quantizer8bitDirect(size_t d, const std::vector<float>& trained)
+            : Quantizer8bitDirect<SIMDLevel::NONE>(d, trained) {}
+};
+
+template <>
+struct Quantizer8bitDirectSigned<SIMDLevel::RISCV_RVV>
+        : Quantizer8bitDirectSigned<SIMDLevel::NONE> {
+    Quantizer8bitDirectSigned(size_t d, const std::vector<float>& trained)
+            : Quantizer8bitDirectSigned<SIMDLevel::NONE>(d, trained) {}
+};
+
+template <>
+struct SimilarityL2<SIMDLevel::RISCV_RVV> : SimilarityL2<SIMDLevel::NONE> {
+    using SimilarityL2<SIMDLevel::NONE>::SimilarityL2;
+};
+
+template <>
+struct SimilarityIP<SIMDLevel::RISCV_RVV> : SimilarityIP<SIMDLevel::NONE> {
+    using SimilarityIP<SIMDLevel::NONE>::SimilarityIP;
+};
+
+/*************************************************************************
+ * Fallback DCTemplate / DistanceComputerByte for RISCV_RVV.
+ *
+ * Inheriting from the NONE specialization means every (Quantizer, Similarity)
+ * combination that does NOT have a hand-tuned RVV full specialization below
+ * falls through to scalar code. Callers and the dispatcher don't know or care.
+ ************************************************************************/
+
+template <class Quantizer, class Similarity>
+struct DCTemplate<Quantizer, Similarity, SIMDLevel::RISCV_RVV>
+        : DCTemplate<Quantizer, Similarity, SIMDLevel::NONE> {
+    using Base = DCTemplate<Quantizer, Similarity, SIMDLevel::NONE>;
+    using Base::Base;
+};
+
+template <class Similarity>
+struct DistanceComputerByte<Similarity, SIMDLevel::RISCV_RVV>
+        : DistanceComputerByte<Similarity, SIMDLevel::NONE> {
+    using Base = DistanceComputerByte<Similarity, SIMDLevel::NONE>;
+    using Base::Base;
+};
+
+/*************************************************************************
+ * Fast path — QT_4bit_uniform + L2
+ *
+ * 4-bit UNIFORM scaling: every component reconstructs as an affine function
+ * of the 4-bit code,
+ *     recon(c) = vmin + vdiff * (c + 0.5) / 15 = final_scale * c + bias
+ * where final_scale = vdiff / 15. L2 distance between two reconstructions
+ * therefore reduces to final_scale^2 * (q_c - c_c)^2 over integer codes,
+ * so we can stay in the int domain and pay one float multiply at the end.
+ *
+ * The RVV path pre-nibbles the query into q_lo / q_hi (even / odd lanes)
+ * once at set_query time and then processes native-VL-sized chunks of code
+ * without ever decoding to float.
+ ************************************************************************/
+
+template <>
+struct DCTemplate<
+        QuantizerTemplate<
+                Codec4bit<SIMDLevel::RISCV_RVV>,
+                QuantizerTemplateScaling::UNIFORM,
+                SIMDLevel::RISCV_RVV>,
+        SimilarityL2<SIMDLevel::RISCV_RVV>,
+        SIMDLevel::RISCV_RVV> : SQDistanceComputer {
+    using Sim = SimilarityL2<SIMDLevel::RISCV_RVV>;
+
+    size_t d;
+    float vmin;
+    float vdiff;
+    float final_scale_sq;
+    std::vector<uint8_t> q_lo;
+    std::vector<uint8_t> q_hi;
+
+    DCTemplate(size_t d_in, const std::vector<float>& trained)
+            : d(d_in),
+              vmin(trained[0]),
+              vdiff(trained[1]),
+              q_lo((d_in + 1) / 2, 0),
+              q_hi((d_in + 1) / 2, 0) {
+        const float final_scale = vdiff / 15.0f;
+        final_scale_sq = final_scale * final_scale;
+    }
+
+    void set_query(const float* x) final {
+        this->q = x;
+        const float inv_scale = (vdiff == 0.0f) ? 0.0f : 15.0f / vdiff;
+        for (size_t i = 0; i < d; i++) {
+            const float val = (x[i] - vmin) * inv_scale;
+            // Quantize with floor(), not round-to-nearest: codes
+            // reconstruct at the bin centre (c + 0.5), so the nearest
+            // integer to (val - 0.5) is floor(val). This is the same
+            // convention the NEON fast path (sq-neon-fastpath.cpp) uses.
+            // The clamp happens in the float domain because converting an
+            // out-of-range or NaN float to int is undefined behaviour.
+            int code = 0;
+            if (val >= 15.0f) {
+                code = 15;
+            } else if (val > 0.0f) {
+                code = static_cast<int>(std::floor(val));
+            }
+            if (i % 2 == 0) {
+                q_lo[i / 2] = static_cast<uint8_t>(code);
+            } else {
+                q_hi[i / 2] = static_cast<uint8_t>(code);
+            }
+        }
+    }
+
+    /// Squared integer-domain L2 between pre-nibbled q and packed 4-bit code.
+    /// Uses RVV's native VL; no fixed width assumptions. Returns the raw
+    /// integer sum — caller multiplies by final_scale_sq.
+    int64_t accumulate_int_l2(const uint8_t* code) const {
+        int64_t acc = 0;
+        size_t i = 0;
+        while (i < d) {
+            // Each code byte packs two dimensions (even in the low nibble,
+            // odd in the high nibble), so size the vector in whole bytes:
+            // byte_vl bytes cover 2 * byte_vl dimensions and keep the
+            // nibble split aligned with set_query. A trailing odd
+            // dimension is left to the scalar tail.
+            const size_t byte_vl = __riscv_vsetvl_e8m1((d - i) / 2);
+            if (byte_vl == 0) {
+                break;
+            }
+
+            vuint8m1_t packed = __riscv_vle8_v_u8m1(code + i / 2, byte_vl);
+            vuint8m1_t ql = __riscv_vle8_v_u8m1(q_lo.data() + i / 2, byte_vl);
+            vuint8m1_t qh = __riscv_vle8_v_u8m1(q_hi.data() + i / 2, byte_vl);
+
+            vuint8m1_t lo_nib = __riscv_vand_vx_u8m1(packed, 0x0F, byte_vl);
+            vuint8m1_t hi_nib = __riscv_vsrl_vx_u8m1(packed, 4, byte_vl);
+
+            // |ql - lo| and |qh - hi| fit in u8 (values are in [0, 15]).
+            vuint8m1_t d_lo = __riscv_vsub_vv_u8m1(
+                    __riscv_vmaxu_vv_u8m1(ql, lo_nib, byte_vl),
+                    __riscv_vminu_vv_u8m1(ql, lo_nib, byte_vl),
+                    byte_vl);
+            vuint8m1_t d_hi = __riscv_vsub_vv_u8m1(
+                    __riscv_vmaxu_vv_u8m1(qh, hi_nib, byte_vl),
+                    __riscv_vminu_vv_u8m1(qh, hi_nib, byte_vl),
+                    byte_vl);
+
+            // Square via widening multiply (each byte squared fits in u16,
+            // since max byte value is 15 -> 225).
+            vuint16m2_t sq_lo = __riscv_vwmulu_vv_u16m2(d_lo, d_lo, byte_vl);
+            vuint16m2_t sq_hi = __riscv_vwmulu_vv_u16m2(d_hi, d_hi, byte_vl);
+            vuint16m2_t sq_sum = __riscv_vadd_vv_u16m2(sq_lo, sq_hi, byte_vl);
+
+            // Reduce to a scalar u32 (safe: byte_vl * 450 fits in u32 for
+            // any realistic d).
+            vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0, 1);
+            vuint32m1_t red =
+                    __riscv_vwredsumu_vs_u16m2_u32m1(sq_sum, zero, byte_vl);
+            acc += __riscv_vmv_x_s_u32m1_u32(red);
+
+            i += 2 * byte_vl;
+        }
+        // Scalar tail: cover any leftover odd lane (at most one).
+        for (; i < d; i++) {
+            uint8_t c_code =
+                    (i % 2 == 0) ? (code[i / 2] & 0x0F) : (code[i / 2] >> 4);
+            uint8_t q_code = (i % 2 == 0) ? q_lo[i / 2] : q_hi[i / 2];
+            int diff = int(q_code) - int(c_code);
+            acc += diff * diff;
+        }
+        return acc;
+    }
+
+    float query_to_code(const uint8_t* code) const final {
+        return static_cast<float>(accumulate_int_l2(code)) * final_scale_sq;
+    }
+
+    float symmetric_dis(idx_t i, idx_t j) override {
+        // Not on the critical path for most workloads; reconstruct both
+        // codes into nibbles scalar-style and compute squared distance.
+        const uint8_t* c1 = codes + i * code_size;
+        const uint8_t* c2 = codes + j * code_size;
+        int64_t acc = 0;
+        for (size_t k = 0; k < d; k++) {
+            uint8_t a = (k % 2 == 0) ? (c1[k / 2] & 0x0F) : (c1[k / 2] >> 4);
+            uint8_t b = (k % 2 == 0) ? (c2[k / 2] & 0x0F) : (c2[k / 2] >> 4);
+            int diff = int(a) - int(b);
+            acc += diff * diff;
+        }
+        return static_cast<float>(acc) * final_scale_sq;
+    }
+
+    void query_to_codes_batch_4(
+            const uint8_t* code_0,
+            const uint8_t* code_1,
+            const uint8_t* code_2,
+            const uint8_t* code_3,
+            float& dis0,
+            float& dis1,
+            float& dis2,
+            float& dis3) const final {
+        // Four back-to-back single-code passes; the codes are not yet
+        // interleaved inside one loop. A fused kernel sharing the q_lo /
+        // q_hi loads (as in the NEON fast path) is a possible follow-up.
+        dis0 = static_cast<float>(accumulate_int_l2(code_0)) * final_scale_sq;
+        dis1 = static_cast<float>(accumulate_int_l2(code_1)) * final_scale_sq;
+        dis2 = static_cast<float>(accumulate_int_l2(code_2)) * final_scale_sq;
+        dis3 = static_cast<float>(accumulate_int_l2(code_3)) * final_scale_sq;
+    }
+};
+
+} // namespace scalar_quantizer
+} // namespace faiss
+
+#define THE_LEVEL_TO_DISPATCH SIMDLevel::RISCV_RVV
+#include <faiss/impl/scalar_quantizer/sq-dispatch.h>
+
+#endif // COMPILE_SIMD_RISCV_RVV
diff --git a/thirdparty/faiss/faiss/impl/simd_dispatch.h b/thirdparty/faiss/faiss/impl/simd_dispatch.h
index bfd27bc35..b18bc5b4a 100644
--- a/thirdparty/faiss/faiss/impl/simd_dispatch.h
+++ b/thirdparty/faiss/faiss/impl/simd_dispatch.h
@@ -101,6 +101,14 @@ inline auto with_selected_simd_levels(LambdaType&& action) {
             }
             [[fallthrough]];
 #endif
+
+#ifdef COMPILE_SIMD_RISCV_RVV
+        case SIMDLevel::RISCV_RVV:
+            if constexpr (available_levels & (1 << int(SIMDLevel::RISCV_RVV))) {
+                return action.template operator()<SIMDLevel::RISCV_RVV>();
+            }
+            [[fallthrough]];
+#endif
         default:
             return action.template operator()<SIMDLevel::NONE>();
     }
diff --git a/thirdparty/faiss/faiss/utils/simd_levels.cpp b/thirdparty/faiss/faiss/utils/simd_levels.cpp
index 1dc7e74ab..3402555bc 100644
--- a/thirdparty/faiss/faiss/utils/simd_levels.cpp
+++ b/thirdparty/faiss/faiss/utils/simd_levels.cpp
@@ -190,6 +190,12 @@ SIMDLevel SIMDConfig::auto_detect_simd_level() {
     }
 #endif
 
+#if defined(__riscv) && defined(COMPILE_SIMD_RISCV_RVV)
+    // RVV is always available on RISC-V builds compiled with rv64gcv.
+ supported_simd_levels |= (1 << static_cast(SIMDLevel::RISCV_RVV)); + detected_level = SIMDLevel::RISCV_RVV; +#endif + return detected_level; } @@ -259,6 +265,8 @@ SIMDLevel SIMDConfig::auto_detect_simd_level() { return SIMDLevel::ARM_SVE; #elif defined(COMPILE_SIMD_ARM_NEON) return SIMDLevel::ARM_NEON; +#elif defined(COMPILE_SIMD_RISCV_RVV) + return SIMDLevel::RISCV_RVV; #else return SIMDLevel::NONE; #endif @@ -289,6 +297,8 @@ std::string to_string(SIMDLevel level) { return "ARM_NEON"; case SIMDLevel::ARM_SVE: return "ARM_SVE"; + case SIMDLevel::RISCV_RVV: + return "RISCV_RVV"; case SIMDLevel::COUNT: default: throw FaissException("Invalid SIMDLevel"); @@ -314,6 +324,9 @@ SIMDLevel to_simd_level(const std::string& level_str) { if (level_str == "ARM_SVE") { return SIMDLevel::ARM_SVE; } + if (level_str == "RISCV_RVV") { + return SIMDLevel::RISCV_RVV; + } throw FaissException("Invalid SIMD level string: " + level_str); } diff --git a/thirdparty/faiss/faiss/utils/simd_levels.h b/thirdparty/faiss/faiss/utils/simd_levels.h index 61d84b55f..9aa367f6d 100644 --- a/thirdparty/faiss/faiss/utils/simd_levels.h +++ b/thirdparty/faiss/faiss/utils/simd_levels.h @@ -25,6 +25,8 @@ enum class SIMDLevel { // arm & aarch64 ARM_NEON, ARM_SVE, // Scalable Vector Extension (ARMv8.2+) + // riscv + RISCV_RVV, // RISC-V Vector Extension (rv64gcv) COUNT }; @@ -58,6 +60,8 @@ inline constexpr SIMDLevel SINGLE_SIMD_LEVEL = SIMDLevel::AVX2; inline constexpr SIMDLevel SINGLE_SIMD_LEVEL = SIMDLevel::ARM_SVE; #elif defined(COMPILE_SIMD_ARM_NEON) inline constexpr SIMDLevel SINGLE_SIMD_LEVEL = SIMDLevel::ARM_NEON; +#elif defined(COMPILE_SIMD_RISCV_RVV) +inline constexpr SIMDLevel SINGLE_SIMD_LEVEL = SIMDLevel::RISCV_RVV; #else inline constexpr SIMDLevel SINGLE_SIMD_LEVEL = SIMDLevel::NONE; #endif @@ -113,6 +117,9 @@ constexpr int simd_width() { static_assert( SL != SIMDLevel::ARM_SVE, "simd_width is not supported: SVE is variable-width"); + static_assert( + SL != SIMDLevel::RISCV_RVV, + "simd_width 
is not supported: RVV is variable-width"); if constexpr (SL == SIMDLevel::AVX512 || SL == SIMDLevel::AVX512_SPR) return 16; else if constexpr (SL == SIMDLevel::AVX2 || SL == SIMDLevel::ARM_NEON)