-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathreal_world_scenario.py
More file actions
127 lines (109 loc) · 4.19 KB
/
real_world_scenario.py
File metadata and controls
127 lines (109 loc) · 4.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
"""Real-world scenario: Document search system.
Demonstrates a practical use case:
- Building a searchable document database
- Semantic search with embeddings
- CRUD operations
- Metadata filtering
"""
import random
from sqlite_vec_client import SQLiteVecClient
def generate_mock_embedding(dim: int, seed: int) -> list[float]:
    """Generate a deterministic mock embedding for demonstration.

    Args:
        dim: Number of components in the returned vector.
        seed: Seed for the pseudo-random generator; the same ``(dim, seed)``
            pair always produces the same vector.

    Returns:
        A list of ``dim`` floats, each drawn from ``[0.0, 1.0)``.
    """
    # Use a private generator so the demo stays reproducible without
    # reseeding the module-level ``random`` state shared by the whole process.
    rng = random.Random(seed)
    return [rng.random() for _ in range(dim)]
def main():
    """Run the document-search demo from indexing through reporting.

    In order: create the ``knowledge_base`` table, index five sample
    documents with mock embeddings, run a top-3 similarity search, list the
    intermediate AI documents, update the first document's metadata, and
    print summary statistics. All results are written to stdout.
    """
    # One dimension is shared by the table schema and every mock vector.
    embedding_dim = 384

    # Sample corpus as (text, category, language, difficulty) tuples.
    corpus = [
        (
            "Python is a programming language known for its simplicity",
            "programming",
            "python",
            "beginner",
        ),
        (
            "Machine learning models require large datasets for training",
            "ai",
            "general",
            "intermediate",
        ),
        (
            "SQLite is a lightweight embedded database engine",
            "database",
            "sql",
            "beginner",
        ),
        (
            "Neural networks consist of layers of interconnected nodes",
            "ai",
            "general",
            "advanced",
        ),
        (
            "Vector databases enable semantic search capabilities",
            "database",
            "general",
            "intermediate",
        ),
    ]

    with SQLiteVecClient(table="knowledge_base", db_path=":memory:") as client:
        client.create_table(dim=embedding_dim, distance="cosine")

        # Index the corpus: parallel lists of texts, vectors and metadata.
        texts = [entry[0] for entry in corpus]
        metadata = [
            {"category": category, "language": language, "difficulty": level}
            for _, category, language, level in corpus
        ]
        embeddings = [
            generate_mock_embedding(embedding_dim, seed)
            for seed in range(len(corpus))
        ]
        rowids = client.add(texts=texts, embeddings=embeddings, metadata=metadata)
        print(f"Indexed {len(rowids)} documents in knowledge base\n")

        # Similarity search. The query vector is a mock (seed 42), so the
        # ranking illustrates the API rather than real semantic relevance.
        print("Search: 'database systems'")
        query_vector = generate_mock_embedding(embedding_dim, 42)
        hits = client.similarity_search(embedding=query_vector, top_k=3)
        print("Top 3 results:")
        for rank, (hit_id, hit_text, hit_distance) in enumerate(hits, 1):
            # Search results carry no metadata, so fetch the full record.
            record = client.get(hit_id)
            if not record:
                continue
            _, _, hit_meta, _ = record
            print(f" {rank}. [{hit_meta['category']}] {hit_text[:60]}...")
            print(
                f" Distance: {hit_distance:.4f}, "
                f"Difficulty: {hit_meta['difficulty']}\n"
            )

        # Metadata filtering done client-side over every stored row.
        print("All AI-related documents (intermediate):")
        for _, doc_text, doc_meta, _ in client.get_all():
            if (
                doc_meta.get("category") != "ai"
                or doc_meta.get("difficulty") != "intermediate"
            ):
                continue
            print(f" • {doc_text[:60]}...")

        # Mark the first indexed document as reviewed.
        if rowids:
            reviewed_metadata = {
                "category": "programming",
                "language": "python",
                "difficulty": "beginner",
                "reviewed": True,
            }
            client.update(rowids[0], metadata=reviewed_metadata)
            print(f"\nUpdated document {rowids[0]}")

        # Summary statistics.
        print("\nKnowledge base statistics:")
        print(f" Total documents: {client.count()}")

        # Preview only the first three rows.
        print(" First 3 documents:")
        for shown, (doc_id, doc_text, _, _) in enumerate(client.get_all()):
            if shown >= 3:
                break
            print(f" [{doc_id}] {doc_text[:50]}...")
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()