Skip to content

Commit 19f4fbc

Browse files
authored
sparse: add SINDI inverted index with SIMD-accelerated window-based search (#1524)
* sparse: add SINDI inverted index with SIMD-accelerated window-based search SINDI partitions posting lists into fixed-size windows (1024-65535 docs), storing only 16-bit local offsets instead of full 32-bit doc IDs. Search uses a window-level TAAT (Term-At-A-Time) strategy: for each window, scores are scatter-accumulated into a dense buffer via SIMD (AVX-512/AVX2/SVE), then batch-inserted into a top-k heap. Per-dimension window NNZ metadata enables O(1) window skipping. Supports both IP (fp16 quantized values) and BM25 (uint16 term frequencies) scoring. Based on the SINDI algorithm for sparse vector search. Reference: https://arxiv.org/abs/2509.08395 Signed-off-by: Shawn Wang <shawn.wang@zilliz.com> * fix ci Signed-off-by: Shawn Wang <shawn.wang@zilliz.com> * fix comment Signed-off-by: Shawn Wang <shawn.wang@zilliz.com> * fix: address comments Signed-off-by: Shawn Wang <shawn.wang@zilliz.com> --------- Signed-off-by: Shawn Wang <shawn.wang@zilliz.com>
1 parent 902ea36 commit 19f4fbc

12 files changed

Lines changed: 1924 additions & 68 deletions

CMakeLists.txt

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,12 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64|amd64)")
205205
src/index/sparse/codec/streamvbyte_0124_decode.c
206206
src/index/sparse/codec/streamvbyte_0124_encode.c
207207
PROPERTIES COMPILE_FLAGS "-msse4.1")
208-
208+
set_source_files_properties(
209+
src/index/sparse/sindi_simd_avx2.cc
210+
PROPERTIES COMPILE_FLAGS "-mavx2 -mfma -mf16c")
211+
set_source_files_properties(
212+
src/index/sparse/sindi_simd_avx512.cc
213+
PROPERTIES COMPILE_FLAGS "-mavx512f -mavx512bw -mavx512vl -mavx512dq -mavx512cd -mavx2 -mfma -mf16c")
209214
endif()
210215

211216
knowhere_file_glob(GLOB_RECURSE KNOWHERE_GPU_SRCS src/index/gpu/flat_gpu/*.cc

src/index/sparse/block_inverted_index.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -851,6 +851,7 @@ BlockInvertedIndex<DType, QType, MetricType>::serialize(MemoryIOWriter& writer)
851851
// - nr_sections (uint32_t): Number of sections
852852
// - section_headers[nr_sections]: Array of section headers, each containing:
853853
// - type (InvertedIndexSectionType): Type of the section
854+
// - padding (uint32_t): Padding to align the section header to 8 bytes
854855
// - offset (uint64_t): Offset of the section from the beginning of the file
855856
// - size (uint64_t): Size of the section in bytes
856857
//

src/index/sparse/flatten_inverted_index.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -565,6 +565,7 @@ FlattenInvertedIndex<DType, QType>::serialize(MemoryIOWriter& writer) const {
565565
// - nr_sections (uint32_t): Number of sections
566566
// - section_headers[nr_sections]: Array of section headers, each containing:
567567
// - type (InvertedIndexSectionType): Type of the section
568+
// - padding (uint32_t): Padding to align the section header to 8 bytes
568569
// - offset (uint64_t): Offset of the section from the beginning of the file
569570
// - size (uint64_t): Size of the section in bytes
570571
//

src/index/sparse/inverted_index.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ enum class InvertedIndexAlgo : uint32_t {
3131
DAAT_WAND = 2,
3232
BLOCK_MAX_MAXSCORE = 3,
3333
BLOCK_MAX_WAND = 4,
34+
SINDI = 5,
3435
};
3536

3637
enum class InvertedIndexEncoding : uint32_t {
@@ -247,6 +248,8 @@ class InvertedIndex {
247248
LOG_KNOWHERE_WARNING_ << "No block size provided, using default block size 128";
248249
meta_data_.block_max_data_.block_size_ = 128;
249250
}
251+
} else if (build_algo == "SINDI") {
252+
build_algo_ = InvertedIndexAlgo::SINDI;
250253
} else {
251254
build_algo_ = InvertedIndexAlgo::TAAT_NAIVE;
252255
}

src/index/sparse/sindi_inverted_index.h

Lines changed: 1002 additions & 0 deletions
Large diffs are not rendered by default.

src/index/sparse/sindi_simd.cc

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
#include "index/sparse/sindi_simd.h"
2+
3+
#include "simd/hook.h"
4+
5+
namespace knowhere::sparse::inverted::sindi {
6+
7+
void
8+
ip_scatter_scalar_fp16(float qval, const knowhere::fp16* vals, const uint16_t* ids, int32_t num, float* out) {
9+
for (int32_t i = 0; i < num; ++i) {
10+
out[ids[i]] += qval * static_cast<float>(vals[i]);
11+
}
12+
}
13+
14+
void
15+
bm25_scatter_scalar_u16(float qval, const uint16_t* vals, const uint16_t* ids, int32_t num, float* out, float k1,
16+
float b, float avgdl, const float* row_sums) {
17+
const float p1 = k1 + 1.0f;
18+
const float p2 = k1 * (1.0f - b);
19+
const float p3 = k1 * b / avgdl;
20+
21+
for (int32_t i = 0; i < num; ++i) {
22+
float tf = static_cast<float>(vals[i]);
23+
uint16_t docid = ids[i];
24+
float dl = row_sums[docid];
25+
float bm25_score = qval * p1 * tf / (tf + p2 + p3 * dl);
26+
out[docid] += bm25_score;
27+
}
28+
}
29+
30+
void
31+
batch_insert_scalar(const float* scores, size_t docid_start, size_t count,
32+
knowhere::ResultMinHeap<float, uint32_t>& topk_q, float& threshold, const BitsetView& bitset) {
33+
for (size_t i = 0; i < count; ++i) {
34+
float s = scores[i];
35+
if (s <= threshold) {
36+
continue;
37+
}
38+
if (!bitset.empty() && bitset.test(static_cast<int64_t>(docid_start + i))) {
39+
continue;
40+
}
41+
if (topk_q.Push(s, static_cast<uint32_t>(docid_start + i))) {
42+
if (topk_q.Full()) {
43+
threshold = topk_q.Threshold();
44+
}
45+
}
46+
}
47+
}
48+
49+
const IPKernels&
50+
get_ip_kernels() {
51+
static const IPKernels kernels = []() {
52+
IPKernels k{};
53+
#if defined(__x86_64__)
54+
if (faiss::cppcontrib::knowhere::cpu_support_avx512()) {
55+
k.accumulate = ip_scatter_avx512_fp16;
56+
k.batch_insert = batch_insert_avx512;
57+
return k;
58+
}
59+
if (faiss::cppcontrib::knowhere::cpu_support_avx2()) {
60+
k.accumulate = ip_scatter_avx2_fp16;
61+
k.batch_insert = batch_insert_avx2;
62+
return k;
63+
}
64+
#elif defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
65+
if (faiss::cppcontrib::knowhere::supports_sve()) {
66+
k.accumulate = ip_scatter_sve_fp16;
67+
k.batch_insert = batch_insert_sve;
68+
return k;
69+
}
70+
#endif
71+
k.accumulate = ip_scatter_scalar_fp16;
72+
k.batch_insert = batch_insert_scalar;
73+
return k;
74+
}();
75+
return kernels;
76+
}
77+
78+
const BM25Kernels&
79+
get_bm25_kernels() {
80+
static const BM25Kernels kernels = []() {
81+
BM25Kernels k{};
82+
#if defined(__x86_64__)
83+
if (faiss::cppcontrib::knowhere::cpu_support_avx512()) {
84+
k.accumulate = bm25_scatter_avx512_u16;
85+
k.batch_insert = batch_insert_avx512;
86+
return k;
87+
}
88+
if (faiss::cppcontrib::knowhere::cpu_support_avx2()) {
89+
k.accumulate = bm25_scatter_avx2_u16;
90+
k.batch_insert = batch_insert_avx2;
91+
return k;
92+
}
93+
#elif defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
94+
if (faiss::cppcontrib::knowhere::supports_sve()) {
95+
k.accumulate = bm25_scatter_sve_u16;
96+
k.batch_insert = batch_insert_sve;
97+
return k;
98+
}
99+
#endif
100+
k.accumulate = bm25_scatter_scalar_u16;
101+
k.batch_insert = batch_insert_scalar;
102+
return k;
103+
}();
104+
return kernels;
105+
}
106+
107+
} // namespace knowhere::sparse::inverted::sindi

src/index/sparse/sindi_simd.h

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
#pragma once
2+
3+
#include <cstdint>
4+
5+
#include "knowhere/bitsetview.h"
6+
#include "knowhere/heap.h"
7+
#include "knowhere/operands.h"
8+
9+
namespace knowhere::sparse::inverted::sindi {
10+
11+
using ip_accumulate_fn_t = void (*)(float qval, const knowhere::fp16* vals, const uint16_t* ids, int32_t num,
12+
float* out);
13+
14+
using bm25_accumulate_fn_t = void (*)(float qval, const uint16_t* tf_vals, const uint16_t* ids, int32_t num, float* out,
15+
float k1, float b, float avgdl, const float* row_sums);
16+
17+
using batch_insert_fn_t = void (*)(const float* scores, size_t docid_start, size_t count,
18+
knowhere::ResultMinHeap<float, uint32_t>& topk_q, float& threshold,
19+
const BitsetView& bitset);
20+
21+
struct IPKernels {
22+
ip_accumulate_fn_t accumulate;
23+
batch_insert_fn_t batch_insert;
24+
};
25+
26+
struct BM25Kernels {
27+
bm25_accumulate_fn_t accumulate;
28+
batch_insert_fn_t batch_insert;
29+
};
30+
31+
const IPKernels&
32+
get_ip_kernels();
33+
const BM25Kernels&
34+
get_bm25_kernels();
35+
36+
// Scalar implementations (always available)
37+
void
38+
ip_scatter_scalar_fp16(float qval, const knowhere::fp16* vals, const uint16_t* ids, int32_t num, float* out);
39+
void
40+
bm25_scatter_scalar_u16(float qval, const uint16_t* vals, const uint16_t* ids, int32_t num, float* out, float k1,
41+
float b, float avgdl, const float* row_sums);
42+
void
43+
batch_insert_scalar(const float* scores, size_t docid_start, size_t count,
44+
knowhere::ResultMinHeap<float, uint32_t>& topk_q, float& threshold, const BitsetView& bitset);
45+
46+
#if defined(__x86_64__)
47+
// AVX2 implementations (compiled separately with -mavx2)
48+
void
49+
ip_scatter_avx2_fp16(float qval, const knowhere::fp16* vals, const uint16_t* ids, int32_t num, float* out);
50+
void
51+
bm25_scatter_avx2_u16(float qval, const uint16_t* vals, const uint16_t* ids, int32_t num, float* out, float k1, float b,
52+
float avgdl, const float* row_sums);
53+
void
54+
batch_insert_avx2(const float* scores, size_t docid_start, size_t count,
55+
knowhere::ResultMinHeap<float, uint32_t>& topk_q, float& threshold, const BitsetView& bitset);
56+
57+
// AVX512 implementations (compiled separately with -mavx512f)
58+
void
59+
ip_scatter_avx512_fp16(float qval, const knowhere::fp16* vals, const uint16_t* ids, int32_t num, float* out);
60+
void
61+
bm25_scatter_avx512_u16(float qval, const uint16_t* vals, const uint16_t* ids, int32_t num, float* out, float k1,
62+
float b, float avgdl, const float* row_sums);
63+
void
64+
batch_insert_avx512(const float* scores, size_t docid_start, size_t count,
65+
knowhere::ResultMinHeap<float, uint32_t>& topk_q, float& threshold, const BitsetView& bitset);
66+
#endif
67+
68+
#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
69+
// SVE implementations (compiled with SVE support)
70+
void
71+
ip_scatter_sve_fp16(float qval, const knowhere::fp16* vals, const uint16_t* ids, int32_t num, float* out);
72+
void
73+
bm25_scatter_sve_u16(float qval, const uint16_t* vals, const uint16_t* ids, int32_t num, float* out, float k1, float b,
74+
float avgdl, const float* row_sums);
75+
void
76+
batch_insert_sve(const float* scores, size_t docid_start, size_t count,
77+
knowhere::ResultMinHeap<float, uint32_t>& topk_q, float& threshold, const BitsetView& bitset);
78+
#endif
79+
80+
} // namespace knowhere::sparse::inverted::sindi
Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
#include "index/sparse/sindi_simd.h"
2+
3+
#if defined(__x86_64__)
4+
#include <immintrin.h>
5+
6+
namespace knowhere::sparse::inverted::sindi {
7+
8+
void
9+
ip_scatter_avx2_fp16(float qval, const knowhere::fp16* vals, const uint16_t* ids, int32_t num, float* out) {
10+
int32_t i = 0;
11+
const __m256 vq = _mm256_set1_ps(qval);
12+
for (; i + 8 <= num; i += 8) {
13+
const uint16_t* hptr = reinterpret_cast<const uint16_t*>(vals + i);
14+
__m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i*>(hptr));
15+
__m256 v_vals = _mm256_cvtph_ps(h);
16+
__m256 v_mul = _mm256_mul_ps(v_vals, vq);
17+
18+
__m128i idx16 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ids + i));
19+
__m256i v_idx = _mm256_cvtepu16_epi32(idx16);
20+
__m256 v_old = _mm256_i32gather_ps(out, v_idx, 4);
21+
__m256 v_sum = _mm256_add_ps(v_old, v_mul);
22+
23+
alignas(32) uint32_t tmp_idx[8];
24+
alignas(32) float tmp_sum[8];
25+
_mm256_store_si256(reinterpret_cast<__m256i*>(tmp_idx), v_idx);
26+
_mm256_store_ps(tmp_sum, v_sum);
27+
out[tmp_idx[0]] = tmp_sum[0];
28+
out[tmp_idx[1]] = tmp_sum[1];
29+
out[tmp_idx[2]] = tmp_sum[2];
30+
out[tmp_idx[3]] = tmp_sum[3];
31+
out[tmp_idx[4]] = tmp_sum[4];
32+
out[tmp_idx[5]] = tmp_sum[5];
33+
out[tmp_idx[6]] = tmp_sum[6];
34+
out[tmp_idx[7]] = tmp_sum[7];
35+
}
36+
for (; i < num; ++i) {
37+
out[ids[i]] += qval * static_cast<float>(vals[i]);
38+
}
39+
}
40+
41+
void
42+
bm25_scatter_avx2_u16(float qval, const uint16_t* vals, const uint16_t* ids, int32_t num, float* out, float k1, float b,
43+
float avgdl, const float* row_sums) {
44+
const float p1 = k1 + 1.0f;
45+
const float p2 = k1 * (1.0f - b);
46+
const float p3 = k1 * b / avgdl;
47+
48+
int32_t i = 0;
49+
const __m256 vqval = _mm256_set1_ps(qval);
50+
const __m256 vp1 = _mm256_set1_ps(p1);
51+
const __m256 vp2 = _mm256_set1_ps(p2);
52+
const __m256 vp3 = _mm256_set1_ps(p3);
53+
54+
for (; i + 8 <= num; i += 8) {
55+
const uint16_t* hptr = vals + i;
56+
__m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i*>(hptr));
57+
__m256i w = _mm256_cvtepu16_epi32(h);
58+
__m256 tf_vec = _mm256_cvtepi32_ps(w);
59+
60+
__m128i idx16 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ids + i));
61+
__m256i v_idx = _mm256_cvtepu16_epi32(idx16);
62+
__m256 dl_vec = _mm256_i32gather_ps(row_sums, v_idx, 4);
63+
64+
__m256 numerator = _mm256_mul_ps(tf_vec, vp1);
65+
numerator = _mm256_mul_ps(numerator, vqval);
66+
67+
__m256 denominator = _mm256_fmadd_ps(dl_vec, vp3, vp2);
68+
denominator = _mm256_add_ps(tf_vec, denominator);
69+
70+
__m256 bm25_vec = _mm256_div_ps(numerator, denominator);
71+
72+
__m256 v_old = _mm256_i32gather_ps(out, v_idx, 4);
73+
__m256 v_sum = _mm256_add_ps(v_old, bm25_vec);
74+
75+
alignas(32) uint32_t tmp_idx[8];
76+
alignas(32) float tmp_sum[8];
77+
_mm256_store_si256(reinterpret_cast<__m256i*>(tmp_idx), v_idx);
78+
_mm256_store_ps(tmp_sum, v_sum);
79+
out[tmp_idx[0]] = tmp_sum[0];
80+
out[tmp_idx[1]] = tmp_sum[1];
81+
out[tmp_idx[2]] = tmp_sum[2];
82+
out[tmp_idx[3]] = tmp_sum[3];
83+
out[tmp_idx[4]] = tmp_sum[4];
84+
out[tmp_idx[5]] = tmp_sum[5];
85+
out[tmp_idx[6]] = tmp_sum[6];
86+
out[tmp_idx[7]] = tmp_sum[7];
87+
}
88+
89+
for (; i < num; ++i) {
90+
float tf = static_cast<float>(vals[i]);
91+
uint16_t docid = ids[i];
92+
float dl = row_sums[docid];
93+
float bm25_score = qval * p1 * tf / (tf + p2 + p3 * dl);
94+
out[docid] += bm25_score;
95+
}
96+
}
97+
98+
void
99+
batch_insert_avx2(const float* scores, size_t docid_start, size_t count,
100+
knowhere::ResultMinHeap<float, uint32_t>& topk_q, float& threshold, const BitsetView& bitset) {
101+
size_t i = 0;
102+
__m256 vthr = _mm256_set1_ps(threshold);
103+
for (; i + 8 <= count; i += 8) {
104+
_mm_prefetch(reinterpret_cast<const char*>(scores + i + 32), _MM_HINT_T0);
105+
__m256 v = _mm256_loadu_ps(scores + i);
106+
__m256 cmp = _mm256_cmp_ps(v, vthr, _CMP_GT_OQ);
107+
int mm = _mm256_movemask_ps(cmp);
108+
while (mm != 0) {
109+
unsigned bit = __builtin_ctz(static_cast<unsigned>(mm));
110+
mm &= (mm - 1);
111+
size_t idx = i + bit;
112+
if (!bitset.empty() && bitset.test(static_cast<int64_t>(docid_start + idx))) {
113+
continue;
114+
}
115+
float s = scores[idx];
116+
if (topk_q.Push(s, static_cast<uint32_t>(docid_start + idx))) {
117+
if (topk_q.Full()) {
118+
threshold = topk_q.Threshold();
119+
vthr = _mm256_set1_ps(threshold);
120+
}
121+
}
122+
}
123+
}
124+
for (; i < count; ++i) {
125+
float s = scores[i];
126+
if (s <= threshold) {
127+
continue;
128+
}
129+
if (!bitset.empty() && bitset.test(static_cast<int64_t>(docid_start + i))) {
130+
continue;
131+
}
132+
if (topk_q.Push(s, static_cast<uint32_t>(docid_start + i))) {
133+
if (topk_q.Full()) {
134+
threshold = topk_q.Threshold();
135+
}
136+
}
137+
}
138+
}
139+
140+
} // namespace knowhere::sparse::inverted::sindi
141+
142+
#endif // __x86_64__

0 commit comments

Comments
 (0)