Skip to content

Commit 0bd5d6d

Browse files
Release v0.4.1 (#16)
* Bump version to 0.4.1 for development * fix: eliminate duplicate documents in overwrite operations and add comprehensive testing * Add: latest uv.lock file for reproducible Python dependency management * 📄 docs(changelog): update for v0.4.1 release * refactor: clean up commented code and whitespace in hnsw_index.rs * 📄 docs(changelog): update for v0.4.1 release
1 parent a1c4608 commit 0bd5d6d

8 files changed

Lines changed: 840 additions & 59 deletions

File tree

CHANGELOG.md

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,31 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
---
99

10+
## [0.4.1] - 2025-08-20
11+
12+
### Added
13+
- Comprehensive test suite for overwrite behavior verification (`test_overwrite_fix.py`)
14+
- Product Quantization (PQ) overwrite testing across all storage modes (`test_pq_overwrite_comprehensive.py`)
15+
- Enhanced logging and storage analysis for overwrite operations
16+
- Training state cleanup during document removal operations
17+
18+
### Changed
19+
- Overwrite operations now use two-phase process (remove then add) to prevent duplicates
20+
- `remove_point()` method now delegates to internal `remove_point_internal()` for better code reuse
21+
- Enhanced `add()` method with comprehensive PQ support and storage mode awareness
22+
- Improved error handling and logging throughout overwrite operations
23+
24+
### Fixed
25+
- Critical: Fixed duplicate document bug where `overwrite=True` created multiple entries instead of replacing existing ones
26+
- Memory leak from accumulated duplicate vectors in HNSW graph during overwrites
27+
- Product Quantization codes and training state not properly cleaned up during document removal
28+
- Vector count inconsistencies when removing documents during overwrite operations
29+
30+
### Removed
31+
- Legacy overwrite behavior that created duplicates instead of proper replacements
32+
33+
---
34+
1035
## [0.4.0] - 2025-08-13
1136

1237
### Added
Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
#!/usr/bin/env python3
2+
"""
3+
ZeusDB Overwrite Bug Fix – Verification Tests
4+
5+
This script covers:
6+
1) Core overwrite behavior (no duplicates, metadata and vector updated)
7+
2) Edge cases (overwrite non-existent doc, multiple overwrites)
8+
"""
9+
10+
import sys
import traceback

import zeusdb_vector_database as zdb
11+
12+
13+
def test_overwrite_bug_fix():
    """Verify that overwrite=True replaces a document instead of duplicating it.

    Walks through the full scenario: insert two documents, overwrite one of
    them with a new vector and metadata, then confirm via searches and a
    direct record lookup that exactly one copy of each ID survives and the
    overwritten document carries the updated content.

    Returns:
        True when every check passes (assertions raise otherwise).
    """
    print("🧪 Testing ZeusDB overwrite bug fix...")

    # Build a tiny cosine HNSW index just large enough for the scenario.
    database = zdb.VectorDatabase()
    index = database.create(
        index_type="hnsw",
        dim=3,
        space="cosine",
        m=16,
        ef_construction=200,
        expected_size=100,
    )

    # Orthogonal unit vectors keep the search results easy to reason about.
    original_doc1 = [1.0, 0.0, 0.0]
    original_doc2 = [0.0, 1.0, 0.0]
    replacement_doc1 = [0.0, 0.0, 1.0]  # Different vector, same ID ("doc1")

    print("\n📝 Step 1: Adding initial documents...")
    first_batch = index.add(
        {
            "vectors": [original_doc1, original_doc2],
            "ids": ["doc1", "doc2"],
            "metadatas": [
                {"text": "first document", "version": 1},
                {"text": "second document", "version": 1},
            ],
        },
        overwrite=True,
    )
    print(
        f" Initial add result: ✅ {first_batch.total_inserted} inserted, ❌ {first_batch.total_errors} errors"
    )
    assert first_batch.total_inserted == 2
    assert first_batch.total_errors == 0

    print("\n🔍 Step 2: Search after initial add (near vector1)...")
    initial_hits = index.search(original_doc1, top_k=5)
    print(f" Found {len(initial_hits)} results:")
    for rank, hit in enumerate(initial_hits, 1):
        print(f" {rank}. ID: {hit['id']}, Score: {hit['score']:.4f}")
    seen_ids = {hit["id"] for hit in initial_hits}
    print(f" Unique IDs found: {seen_ids}")
    assert "doc1" in seen_ids and "doc2" in seen_ids

    print("\n🔄 Step 3: Overwriting doc1 with new vector...")
    second_batch = index.add(
        {
            "vectors": [replacement_doc1],
            "ids": ["doc1"],
            "metadatas": [{"text": "first document UPDATED", "version": 2}],
        },
        overwrite=True,
    )
    print(
        f" Overwrite result: ✅ {second_batch.total_inserted} inserted, ❌ {second_batch.total_errors} errors"
    )
    assert second_batch.total_inserted == 1
    assert second_batch.total_errors == 0

    print("\n🔍 Step 4a: Search after overwrite (near OLD vector1)...")
    hits_near_old = index.search(original_doc1, top_k=5)
    print(f" Found {len(hits_near_old)} results:")
    for rank, hit in enumerate(hits_near_old, 1):
        print(f" {rank}. ID: {hit['id']}, Score: {hit['score']:.4f}")

    print("\n🔍 Step 4b: Search after overwrite (near UPDATED vector1_updated)...")
    hits_near_new = index.search(replacement_doc1, top_k=5)
    print(f" Found {len(hits_near_new)} results:")
    for rank, hit in enumerate(hits_near_new, 1):
        print(f" {rank}. ID: {hit['id']}, Score: {hit['score']:.4f}")

    # Combine both result sets to check for duplicates across queries.
    combined = hits_near_old + hits_near_new
    combined_ids = [hit["id"] for hit in combined]
    # Duplicate keys in the comprehension keep first-occurrence order, so the
    # printed dict matches a manual accumulation pass.
    id_counts = {doc_id: combined_ids.count(doc_id) for doc_id in combined_ids}
    print(f" ID occurrence counts across both searches: {id_counts}")

    # ✅ No duplicate IDs may appear within a single search result set.
    for hits in (hits_near_old, hits_near_new):
        ids = [hit["id"] for hit in hits]
        dups = [doc_id for doc_id in dict.fromkeys(ids) if ids.count(doc_id) > 1]
        assert not dups, f"Found duplicate IDs in a single result set: {dups}"

    # ✅ Each known ID should be present (and not duplicated overall).
    assert "doc1" in combined_ids, "doc1 should exist after overwrite"
    assert "doc2" in combined_ids, "doc2 should still exist after overwrite"

    print("\n🔍 Step 5: Verify updated document content (fail fast if missing)...")
    # ---- Option A: fail fast & keep scope tight ----
    doc1_records = index.get_records("doc1", return_vector=True)
    assert doc1_records and isinstance(doc1_records, list), "doc1 not found after overwrite"

    metadata = doc1_records[0]["metadata"]
    print(f" doc1 metadata: {metadata}")
    print(f" doc1 vector: {doc1_records[0].get('vector', 'Not returned')}")

    print("\n✅ Step 6: Verification...")
    # The overwrite must have landed the new metadata, not kept the old one.
    assert metadata["version"] == 2, "doc1 should have updated metadata"
    assert "UPDATED" in metadata["text"], "doc1 should have updated text"

    # Direct lookup must return exactly one record for the overwritten ID.
    assert len(doc1_records) == 1, f"Expected 1 record for doc1, got {len(doc1_records)}"

    print(" ✅ No duplicate IDs in searches")
    print(" ✅ All documents still accessible")
    print(" ✅ Metadata properly updated")
    print(" ✅ Vector count consistent")
    print("\n🎉 SUCCESS: Overwrite bug fix verified!")
    return True
131+
def test_edge_cases():
    """Exercise overwrite corner cases: missing target and repeated overwrites."""
    print("\n🧪 Testing edge cases...")

    database = zdb.VectorDatabase()
    index = database.create(dim=3, space="cosine")

    # Edge Case 1: overwriting an ID that does not exist yet must act as an insert.
    print("\n📝 Edge Case 1: Overwrite non-existent document")
    outcome = index.add(
        {
            "vectors": [[1.0, 0.0, 0.0]],
            "ids": ["new_doc"],
            "metadatas": [{"text": "brand new"}],
        },
        overwrite=True,
    )
    assert outcome.total_inserted == 1
    assert outcome.total_errors == 0
    print(" ✅ Successfully added non-existent document with overwrite=True")

    # Edge Case 2: overwriting the same ID repeatedly must keep exactly one copy.
    print("\n📝 Edge Case 2: Multiple overwrites of same document")
    for step in range(3):
        outcome = index.add(
            {
                "vectors": [[0.0, 1.0, float(step)]],
                "ids": ["multi_overwrite"],
                "metadatas": [{"text": f"version {step}", "iteration": step}],
            },
            overwrite=True,
        )
        assert outcome.total_inserted == 1
        assert outcome.total_errors == 0

    # Only a single copy should surface in search results.
    matches = [
        hit
        for hit in index.search([0.0, 1.0, 2.0], top_k=10)
        if hit["id"] == "multi_overwrite"
    ]
    assert len(matches) == 1, f"Expected 1 result for multi_overwrite, got {len(matches)}"

    final_metadata = matches[0]["metadata"]
    # The last overwrite of the loop (step == 2) must be the surviving record.
    assert final_metadata["iteration"] == 2, "Should have final iteration metadata"
    print(" ✅ Multiple overwrites work correctly")
    print("\n🎉 All edge cases passed!")
177+
if __name__ == "__main__":
    # Script entry point: run both verification suites and exit nonzero on any
    # failure so CI and shell callers can detect it.
    #
    # Fix: the original called the bare builtin `exit(1)`, which is injected by
    # the `site` module and is not guaranteed to exist in every run mode
    # (e.g. `python -S`, embedded or frozen interpreters). `sys.exit(1)` is
    # the documented, always-available way to set the process exit status.
    # `traceback` is imported at module level (top-of-file) instead of inside
    # the except branch.
    try:
        ok1 = test_overwrite_bug_fix()
        test_edge_cases()
        if ok1:
            print("\n🏆 ALL TESTS PASSED! The overwrite bug has been successfully fixed.")
    except Exception as e:
        # Broad catch is deliberate at this top-level boundary: report the
        # failure, dump the traceback, and signal failure via the exit code.
        print(f"\n❌ TEST FAILED: {e}")
        traceback.print_exc()
        sys.exit(1)

0 commit comments

Comments
 (0)