Skip to content

Commit b008654

Browse files
asg017 and claude committed
Filter deleted nodes from DiskANN search results and add delete tests
DiskANN's delete repair only fixes forward edges (the nodes that the deleted node pointed to). Stale reverse edges — edges from other nodes that still point at the deleted node — can cause deleted rowids to appear in search results.

Fix: track a 'confirmed' flag on each search candidate, set when the full-precision vector is successfully read during re-ranking. Only confirmed candidates are included in the output. This needs zero additional SQL queries — it piggybacks on the existing re-rank vector read.

Also adds delete hardening tests:
- Rescore: interleaved delete+KNN, rowid_in after deletes, full delete+reinsert cycle
- DiskANN: delete+reinsert cycles with KNN verification, interleaved delete+KNN

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 2f4c2e4 commit b008654

4 files changed

Lines changed: 190 additions & 8 deletions

File tree

sqlite-vec-diskann.c

Lines changed: 20 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -608,6 +608,7 @@ static int diskann_candidate_list_insert(
608608
list->items[lo].rowid = rowid;
609609
list->items[lo].distance = distance;
610610
list->items[lo].visited = 0;
611+
list->items[lo].confirmed = 0;
611612
list->count++;
612613
return 1;
613614
}
@@ -741,8 +742,9 @@ static int diskann_search(
741742
return rc;
742743
}
743744

744-
// Seed with medoid
745+
// Seed with medoid (confirmed — we already read its vector above)
745746
diskann_candidate_list_insert(&candidates, medoid, medoidDist);
747+
candidates.items[0].confirmed = 1;
746748

747749
// Pre-quantize query vector once for all quantized distance comparisons
748750
u8 *queryQuantized = NULL;
@@ -815,16 +817,27 @@ static int diskann_search(
815817
sqlite3_free(fullVec);
816818
// Update distance in candidate list and re-sort
817819
diskann_candidate_list_insert(&candidates, currentRowid, exactDist);
820+
// Mark as confirmed (vector exists, distance is exact)
821+
for (int ci = 0; ci < candidates.count; ci++) {
822+
if (candidates.items[ci].rowid == currentRowid) {
823+
candidates.items[ci].confirmed = 1;
824+
break;
825+
}
826+
}
818827
}
828+
// If vector read failed, candidate stays unconfirmed (stale edge to deleted node)
819829
}
820830

821-
// 5. Output results (candidates are already sorted by distance)
822-
int resultCount = (candidates.count < k) ? candidates.count : k;
823-
*outCount = resultCount;
824-
for (int i = 0; i < resultCount; i++) {
825-
outRowids[i] = candidates.items[i].rowid;
826-
outDistances[i] = candidates.items[i].distance;
831+
// 5. Output results — only include confirmed candidates (whose vectors exist)
832+
int resultCount = 0;
833+
for (int i = 0; i < candidates.count && resultCount < k; i++) {
834+
if (candidates.items[i].confirmed) {
835+
outRowids[resultCount] = candidates.items[i].rowid;
836+
outDistances[resultCount] = candidates.items[i].distance;
837+
resultCount++;
838+
}
827839
}
840+
*outCount = resultCount;
828841

829842
sqlite3_free(queryQuantized);
830843
diskann_candidate_list_free(&candidates);

sqlite-vec.c

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2586,7 +2586,8 @@ struct Vec0DiskannConfig {
25862586
struct Vec0DiskannCandidate {
25872587
i64 rowid;
25882588
f32 distance;
2589-
int visited; // 1 if this candidate's neighbors have been explored
2589+
int visited; // 1 if this candidate's neighbors have been explored
2590+
int confirmed; // 1 if full-precision vector was successfully read (node exists)
25902591
};
25912592

25922593
/**

tests/test-diskann.py

Lines changed: 70 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1176,3 +1176,73 @@ def test_corrupt_truncated_node_blob(db):
11761176
).fetchall()
11771177
except sqlite3.OperationalError:
11781178
pass # Error is acceptable — crash is not
1179+
1180+
1181+
def test_diskann_delete_reinsert_cycle_knn(db):
1182+
"""Repeatedly delete and reinsert rows, verify KNN stays correct."""
1183+
import random
1184+
random.seed(101)
1185+
db.execute("""
1186+
CREATE VIRTUAL TABLE t USING vec0(
1187+
emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8)
1188+
)
1189+
""")
1190+
N = 30
1191+
vecs = {}
1192+
for i in range(1, N + 1):
1193+
v = [random.gauss(0, 1) for _ in range(8)]
1194+
vecs[i] = v
1195+
db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(v)])
1196+
1197+
# 3 cycles: delete half, reinsert with new vectors, verify KNN
1198+
for cycle in range(3):
1199+
to_delete = random.sample(sorted(vecs.keys()), len(vecs) // 2)
1200+
for r in to_delete:
1201+
db.execute("DELETE FROM t WHERE rowid = ?", [r])
1202+
del vecs[r]
1203+
1204+
# Reinsert with new rowids
1205+
new_start = 100 + cycle * 50
1206+
for i in range(len(to_delete)):
1207+
rid = new_start + i
1208+
v = [random.gauss(0, 1) for _ in range(8)]
1209+
vecs[rid] = v
1210+
db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [rid, _f32(v)])
1211+
1212+
# KNN should return only alive rows
1213+
query = [0.0] * 8
1214+
rows = db.execute(
1215+
"SELECT rowid FROM t WHERE emb MATCH ? AND k=10",
1216+
[_f32(query)],
1217+
).fetchall()
1218+
returned = {r["rowid"] for r in rows}
1219+
assert returned.issubset(set(vecs.keys())), \
1220+
f"Cycle {cycle}: deleted rowid in KNN results"
1221+
assert len(rows) >= 1
1222+
1223+
1224+
def test_diskann_delete_interleaved_with_knn(db):
1225+
"""Delete one row at a time, querying KNN after each delete."""
1226+
db.execute("""
1227+
CREATE VIRTUAL TABLE t USING vec0(
1228+
emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8)
1229+
)
1230+
""")
1231+
N = 20
1232+
for i in range(1, N + 1):
1233+
vec = [0.0] * 8
1234+
vec[i % 8] = float(i)
1235+
db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(vec)])
1236+
1237+
alive = set(range(1, N + 1))
1238+
for to_del in [1, 5, 10, 15, 20]:
1239+
db.execute("DELETE FROM t WHERE rowid = ?", [to_del])
1240+
alive.discard(to_del)
1241+
1242+
rows = db.execute(
1243+
"SELECT rowid FROM t WHERE emb MATCH ? AND k=5",
1244+
[_f32([1, 0, 0, 0, 0, 0, 0, 0])],
1245+
).fetchall()
1246+
returned = {r["rowid"] for r in rows}
1247+
assert returned.issubset(alive), \
1248+
f"Deleted rowid {to_del} found in KNN results"

tests/test-rescore-mutations.py

Lines changed: 98 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -443,6 +443,104 @@ def test_insert_batch_recall(db):
443443
# ============================================================================
444444

445445

446+
def test_delete_interleaved_with_knn(db):
447+
"""Delete rows one at a time, running KNN after each delete to verify correctness."""
448+
db.execute(
449+
"CREATE VIRTUAL TABLE t USING vec0("
450+
" embedding float[8] indexed by rescore(quantizer=bit)"
451+
")"
452+
)
453+
N = 30
454+
random.seed(42)
455+
vecs = {i: [random.gauss(0, 1) for _ in range(8)] for i in range(1, N + 1)}
456+
for rowid, vec in vecs.items():
457+
db.execute(
458+
"INSERT INTO t(rowid, embedding) VALUES (?, ?)",
459+
[rowid, float_vec(vec)],
460+
)
461+
462+
alive = set(vecs.keys())
463+
query = [0.0] * 8
464+
465+
for to_del in [5, 10, 15, 20, 25]:
466+
db.execute("DELETE FROM t WHERE rowid = ?", [to_del])
467+
alive.discard(to_del)
468+
469+
rows = db.execute(
470+
"SELECT rowid FROM t WHERE embedding MATCH ? ORDER BY distance LIMIT 10",
471+
[float_vec(query)],
472+
).fetchall()
473+
returned = {r["rowid"] for r in rows}
474+
# All returned rows must be alive (not deleted)
475+
assert returned.issubset(alive), f"Deleted rowid found in KNN after deleting {to_del}"
476+
# Count should match alive set (up to k)
477+
assert len(rows) == min(10, len(alive))
478+
479+
480+
def test_delete_with_rowid_in_constraint(db):
481+
"""Delete rows and verify KNN with rowid_in filter excludes deleted rows."""
482+
db.execute(
483+
"CREATE VIRTUAL TABLE t USING vec0("
484+
" embedding float[8] indexed by rescore(quantizer=int8)"
485+
")"
486+
)
487+
for i in range(1, 11):
488+
db.execute(
489+
"INSERT INTO t(rowid, embedding) VALUES (?, ?)",
490+
[i, float_vec([float(i)] * 8)],
491+
)
492+
493+
# Delete rows 3, 5, 7
494+
for r in [3, 5, 7]:
495+
db.execute("DELETE FROM t WHERE rowid = ?", [r])
496+
497+
# KNN with rowid IN (1,2,3,4,5) — should only return 1, 2, 4 (3 and 5 deleted)
498+
rows = db.execute(
499+
"SELECT rowid FROM t WHERE embedding MATCH ? AND k = 5 AND rowid IN (1, 2, 3, 4, 5)",
500+
[float_vec([1.0] * 8)],
501+
).fetchall()
502+
returned = {r["rowid"] for r in rows}
503+
assert 3 not in returned
504+
assert 5 not in returned
505+
assert returned.issubset({1, 2, 4})
506+
507+
508+
def test_delete_all_then_reinsert_batch(db):
509+
"""Delete all rows, reinsert a new batch, verify KNN only returns new rows."""
510+
db.execute(
511+
"CREATE VIRTUAL TABLE t USING vec0("
512+
" embedding float[8] indexed by rescore(quantizer=bit)"
513+
")"
514+
)
515+
# First batch
516+
for i in range(1, 21):
517+
db.execute(
518+
"INSERT INTO t(rowid, embedding) VALUES (?, ?)",
519+
[i, float_vec([float(i)] * 8)],
520+
)
521+
522+
# Delete all
523+
for i in range(1, 21):
524+
db.execute("DELETE FROM t WHERE rowid = ?", [i])
525+
526+
assert db.execute("SELECT count(*) FROM t").fetchone()[0] == 0
527+
528+
# Second batch with different rowids and vectors
529+
for i in range(100, 110):
530+
db.execute(
531+
"INSERT INTO t(rowid, embedding) VALUES (?, ?)",
532+
[i, float_vec([float(i - 100)] * 8)],
533+
)
534+
535+
rows = db.execute(
536+
"SELECT rowid FROM t WHERE embedding MATCH ? ORDER BY distance LIMIT 5",
537+
[float_vec([0.0] * 8)],
538+
).fetchall()
539+
returned = {r["rowid"] for r in rows}
540+
# All returned rowids should be from the second batch
541+
assert returned.issubset(set(range(100, 110)))
542+
543+
446544
def test_knn_int8_cosine(db):
447545
"""Rescore with quantizer=int8 and distance_metric=cosine."""
448546
db.execute(

0 commit comments

Comments
 (0)