Skip to content

Commit e2c38f3

Browse files
authored
Implement experimental new IVF ANN index for vec0 tables
Add new experimental IVF ANN index
2 parents 43982c1 + bb3ef78 commit e2c38f3

22 files changed

+4997
-28
lines changed

Makefile

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,21 @@ test-loadable-watch:
206206
test-unit:
207207
$(CC) -DSQLITE_CORE -DSQLITE_VEC_TEST -DSQLITE_VEC_ENABLE_RESCORE tests/test-unit.c sqlite-vec.c vendor/sqlite3.c -I./ -Ivendor -o $(prefix)/test-unit && $(prefix)/test-unit
208208

209+
# Standalone sqlite3 CLI with vec0 compiled in. Useful for benchmarking,
# profiling (has debug symbols), and scripting without .load_extension.
#   make cli
#   dist/sqlite3 :memory: "SELECT vec_version()"
#   dist/sqlite3 < script.sql
#
# `cli` names a command, not a file: the binary is written to
# $(prefix)/sqlite3. Declaring it phony keeps a stray file or directory
# called `cli` from making the target look up to date.
.PHONY: cli
cli: sqlite-vec.h $(prefix)
	$(CC) -O2 -g \
	-DSQLITE_CORE \
	-DSQLITE_EXTRA_INIT=core_init \
	-DSQLITE_THREADSAFE=0 \
	-Ivendor/ -I./ \
	$(CFLAGS) \
	vendor/sqlite3.c vendor/shell.c sqlite-vec.c examples/sqlite3-cli/core_init.c \
	-ldl -lm -o $(prefix)/sqlite3
223+
209224
fuzz-build:
210225
$(MAKE) -C tests/fuzz all
211226

benchmarks-ann/Makefile

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -8,27 +8,20 @@ BASELINES = \
88
"brute-int8:type=baseline,variant=int8" \
99
"brute-bit:type=baseline,variant=bit"
1010

11-
# --- Index-specific configs ---
12-
# Each index branch should add its own configs here. Example:
13-
#
14-
# DISKANN_CONFIGS = \
15-
# "diskann-R48-binary:type=diskann,R=48,L=128,quantizer=binary" \
16-
# "diskann-R72-int8:type=diskann,R=72,L=128,quantizer=int8"
17-
#
18-
# IVF_CONFIGS = \
19-
# "ivf-n128-p16:type=ivf,nlist=128,nprobe=16"
20-
#
21-
# ANNOY_CONFIGS = \
22-
# "annoy-t50:type=annoy,n_trees=50"
11+
# --- IVF configs ---
# One quoted "<name>:type=ivf,nlist=<n>,nprobe=<p>" word per configuration.
IVF_CONFIGS = "ivf-n32-p8:type=ivf,nlist=32,nprobe=8"
IVF_CONFIGS += "ivf-n128-p16:type=ivf,nlist=128,nprobe=16"
IVF_CONFIGS += "ivf-n512-p32:type=ivf,nlist=512,nprobe=32"
2316

2417
RESCORE_CONFIGS = \
2518
"rescore-bit-os8:type=rescore,quantizer=bit,oversample=8" \
2619
"rescore-bit-os16:type=rescore,quantizer=bit,oversample=16" \
2720
"rescore-int8-os8:type=rescore,quantizer=int8,oversample=8"
2821

29-
ALL_CONFIGS = $(BASELINES) $(RESCORE_CONFIGS)
22+
ALL_CONFIGS = $(BASELINES) $(RESCORE_CONFIGS) $(IVF_CONFIGS)
3023

31-
.PHONY: seed ground-truth bench-smoke bench-rescore bench-10k bench-50k bench-100k bench-all \
24+
.PHONY: seed ground-truth bench-smoke bench-rescore bench-ivf bench-10k bench-50k bench-100k bench-all \
3225
report clean
3326

3427
# --- Data preparation ---
@@ -43,7 +36,8 @@ ground-truth: seed
4336
# --- Quick smoke test ---
4437
bench-smoke: seed
4538
$(BENCH) --subset-size 5000 -k 10 -n 20 -o runs/smoke \
46-
$(BASELINES)
39+
"brute-float:type=baseline,variant=float" \
40+
"ivf-quick:type=ivf,nlist=16,nprobe=4"
4741

4842
bench-rescore: seed
4943
$(BENCH) --subset-size 10000 -k 10 -o runs/rescore \
@@ -62,6 +56,12 @@ bench-100k: seed
6256

6357
bench-all: bench-10k bench-50k bench-100k
6458

59+
# --- IVF across sizes ---
# Runs the baselines plus every IVF config at 10k, 50k and 100k subset sizes.
# NOTE(review): all three runs share -o runs/ivf; confirm $(BENCH) keeps the
# results of earlier runs instead of overwriting them.
BENCH_IVF_ARGS = -k 10 -o runs/ivf $(BASELINES) $(IVF_CONFIGS)

bench-ivf: seed
	$(BENCH) --subset-size 10000 $(BENCH_IVF_ARGS)
	$(BENCH) --subset-size 50000 $(BENCH_IVF_ARGS)
	$(BENCH) --subset-size 100000 $(BENCH_IVF_ARGS)
64+
6565
# --- Report ---
6666
report:
6767
@echo "Use: sqlite3 runs/<dir>/results.db 'SELECT * FROM bench_results ORDER BY recall DESC'"

benchmarks-ann/bench.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,48 @@ def _rescore_describe(params):
173173
}
174174

175175

176+
# ============================================================================
177+
# IVF implementation
178+
# ============================================================================
179+
180+
181+
def _ivf_create_table_sql(params):
182+
return (
183+
f"CREATE VIRTUAL TABLE vec_items USING vec0("
184+
f" id integer primary key,"
185+
f" embedding float[768] distance_metric=cosine"
186+
f" indexed by ivf("
187+
f" nlist={params['nlist']},"
188+
f" nprobe={params['nprobe']}"
189+
f" )"
190+
f")"
191+
)
192+
193+
194+
def _ivf_post_insert_hook(conn, params):
195+
print(" Training k-means centroids...", flush=True)
196+
t0 = time.perf_counter()
197+
conn.execute("INSERT INTO vec_items(id) VALUES ('compute-centroids')")
198+
conn.commit()
199+
elapsed = time.perf_counter() - t0
200+
print(f" Training done in {elapsed:.1f}s", flush=True)
201+
return elapsed
202+
203+
204+
def _ivf_describe(params):
205+
return f"ivf nlist={params['nlist']:<4} nprobe={params['nprobe']}"
206+
207+
208+
# Register the IVF index type with its default parameters and callbacks.
# "insert_sql" and "run_query" are None.
# NOTE(review): presumably None makes the harness use its default insert/query
# path; confirm against the code that reads INDEX_REGISTRY.
INDEX_REGISTRY["ivf"] = dict(
    defaults=dict(nlist=128, nprobe=16),
    create_table_sql=_ivf_create_table_sql,
    insert_sql=None,
    post_insert_hook=_ivf_post_insert_hook,
    run_query=None,
    describe=_ivf_describe,
)
216+
217+
176218
# ============================================================================
177219
# Config parsing
178220
# ============================================================================

sqlite-vec-ivf-kmeans.c

Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
/**
2+
* sqlite-vec-ivf-kmeans.c — Pure k-means clustering algorithm.
3+
*
4+
* No SQLite dependency. Operates on float arrays in memory.
5+
* #include'd into sqlite-vec.c after struct definitions.
6+
*/
7+
8+
#ifndef SQLITE_VEC_IVF_KMEANS_C
9+
#define SQLITE_VEC_IVF_KMEANS_C
10+
11+
// When opened standalone in an editor, pull in types so the LSP is happy.
12+
// When #include'd from sqlite-vec.c, SQLITE_VEC_H is already defined.
13+
#ifndef SQLITE_VEC_H
14+
#include "sqlite-vec.c" // IWYU pragma: keep
15+
#endif
16+
17+
#include <float.h>
18+
#include <string.h>
19+
20+
#define VEC0_IVF_KMEANS_MAX_ITER 25
21+
#define VEC0_IVF_KMEANS_DEFAULT_SEED 0
22+
23+
// xorshift32 PRNG step (shifts 13, 17, 5). Advances *state in place and
// returns the new state. A zero state maps to zero, so a zero seed only ever
// produces zeros.
static uint32_t ivf_xorshift32(uint32_t *state) {
  uint32_t s = *state;
  s = s ^ (s << 13);
  s = s ^ (s >> 17);
  s = s ^ (s << 5);
  return *state = s;
}
32+
33+
// Squared Euclidean (L2) distance between two D-dimensional float vectors.
// Accumulates in single precision in index order; returns 0 when D <= 0.
static float ivf_l2_dist(const float *a, const float *b, int D) {
  float acc = 0.0f;
  int d = 0;
  while (d < D) {
    float delta = a[d] - b[d];
    acc += delta * delta;
    d++;
  }
  return acc;
}
42+
43+
// Linear scan over k centroids (row-major, D floats each). Returns the index
// of the centroid with the smallest squared L2 distance to vec. Ties keep the
// lowest index; 0 is returned when no distance is below FLT_MAX.
static int ivf_nearest_centroid(const float *vec, const float *centroids,
                                int D, int k) {
  int winner = 0;
  float winner_dist = FLT_MAX;
  for (int c = 0; c < k; c++) {
    const float cand = ivf_l2_dist(vec, centroids + c * D, D);
    if (!(cand < winner_dist))
      continue;
    winner_dist = cand;
    winner = c;
  }
  return winner;
}
57+
58+
/**
59+
* K-means++ initialization.
60+
* Picks k initial centroids from the data with probability proportional
61+
* to squared distance from nearest existing centroid.
62+
*/
63+
static int ivf_kmeans_init_plusplus(const float *vectors, int N, int D,
64+
int k, uint32_t seed, float *centroids) {
65+
if (N <= 0 || k <= 0 || D <= 0)
66+
return -1;
67+
if (seed == 0)
68+
seed = 42;
69+
70+
// Pick first centroid randomly
71+
int first = ivf_xorshift32(&seed) % N;
72+
memcpy(centroids, &vectors[first * D], D * sizeof(float));
73+
74+
if (k == 1)
75+
return 0;
76+
77+
// Allocate distance array
78+
float *dists = sqlite3_malloc64((i64)N * sizeof(float));
79+
if (!dists)
80+
return -1;
81+
82+
for (int c = 1; c < k; c++) {
83+
// Compute D(x) = distance to nearest existing centroid
84+
double total = 0.0;
85+
for (int i = 0; i < N; i++) {
86+
float d = ivf_l2_dist(&vectors[i * D], &centroids[(c - 1) * D], D);
87+
if (c == 1 || d < dists[i]) {
88+
dists[i] = d;
89+
}
90+
total += dists[i];
91+
}
92+
93+
// Weighted random selection
94+
if (total <= 0.0) {
95+
// All distances zero — pick randomly
96+
int pick = ivf_xorshift32(&seed) % N;
97+
memcpy(&centroids[c * D], &vectors[pick * D], D * sizeof(float));
98+
} else {
99+
double threshold = ((double)ivf_xorshift32(&seed) / (double)0xFFFFFFFF) * total;
100+
double cumulative = 0.0;
101+
int pick = N - 1;
102+
for (int i = 0; i < N; i++) {
103+
cumulative += dists[i];
104+
if (cumulative >= threshold) {
105+
pick = i;
106+
break;
107+
}
108+
}
109+
memcpy(&centroids[c * D], &vectors[pick * D], D * sizeof(float));
110+
}
111+
}
112+
113+
sqlite3_free(dists);
114+
return 0;
115+
}
116+
117+
/**
118+
* Lloyd's k-means algorithm.
119+
*
120+
* @param vectors N*D float array (row-major)
121+
* @param N number of vectors
122+
* @param D dimensionality
123+
* @param k number of clusters
124+
* @param max_iter maximum iterations
125+
* @param seed PRNG seed for initialization
126+
* @param out_centroids output: k*D float array (caller-allocated)
127+
* @return 0 on success, -1 on error
128+
*/
129+
static int ivf_kmeans(const float *vectors, int N, int D, int k,
130+
int max_iter, uint32_t seed, float *out_centroids) {
131+
if (N <= 0 || D <= 0 || k <= 0)
132+
return -1;
133+
134+
// Clamp k to N
135+
if (k > N)
136+
k = N;
137+
138+
// Allocate working memory
139+
int *assignments = sqlite3_malloc64((i64)N * sizeof(int));
140+
float *new_centroids = sqlite3_malloc64((i64)k * D * sizeof(float));
141+
int *counts = sqlite3_malloc64((i64)k * sizeof(int));
142+
143+
if (!assignments || !new_centroids || !counts) {
144+
sqlite3_free(assignments);
145+
sqlite3_free(new_centroids);
146+
sqlite3_free(counts);
147+
return -1;
148+
}
149+
150+
memset(assignments, -1, N * sizeof(int));
151+
152+
// Initialize centroids via k-means++
153+
if (ivf_kmeans_init_plusplus(vectors, N, D, k, seed, out_centroids) != 0) {
154+
sqlite3_free(assignments);
155+
sqlite3_free(new_centroids);
156+
sqlite3_free(counts);
157+
return -1;
158+
}
159+
160+
for (int iter = 0; iter < max_iter; iter++) {
161+
// Assignment step
162+
int changed = 0;
163+
for (int i = 0; i < N; i++) {
164+
int nearest = ivf_nearest_centroid(&vectors[i * D], out_centroids, D, k);
165+
if (nearest != assignments[i]) {
166+
assignments[i] = nearest;
167+
changed++;
168+
}
169+
}
170+
if (changed == 0)
171+
break;
172+
173+
// Update step
174+
memset(new_centroids, 0, (size_t)k * D * sizeof(float));
175+
memset(counts, 0, k * sizeof(int));
176+
177+
for (int i = 0; i < N; i++) {
178+
int c = assignments[i];
179+
counts[c]++;
180+
for (int d = 0; d < D; d++) {
181+
new_centroids[c * D + d] += vectors[i * D + d];
182+
}
183+
}
184+
185+
for (int c = 0; c < k; c++) {
186+
if (counts[c] == 0) {
187+
// Empty cluster: reassign to farthest point from its nearest centroid
188+
float max_dist = -1.0f;
189+
int farthest = 0;
190+
for (int i = 0; i < N; i++) {
191+
float d = ivf_l2_dist(&vectors[i * D],
192+
&out_centroids[assignments[i] * D], D);
193+
if (d > max_dist) {
194+
max_dist = d;
195+
farthest = i;
196+
}
197+
}
198+
memcpy(&out_centroids[c * D], &vectors[farthest * D],
199+
D * sizeof(float));
200+
} else {
201+
for (int d = 0; d < D; d++) {
202+
out_centroids[c * D + d] = new_centroids[c * D + d] / counts[c];
203+
}
204+
}
205+
}
206+
}
207+
208+
sqlite3_free(assignments);
209+
sqlite3_free(new_centroids);
210+
sqlite3_free(counts);
211+
return 0;
212+
}
213+
214+
#endif /* SQLITE_VEC_IVF_KMEANS_C */

0 commit comments

Comments
 (0)