// Source: sochdb/sochdb-nodejs-examples (main branch) — 06_native_vector_search.ts
/**
 * Example 06: Native HNSW Vector Search
 *
 * Demonstrates SochDB's native HNSW (Hierarchical Navigable Small World)
 * vector search capabilities with the Collection API.
 *
 * Features:
 * - Native HNSW indexing (no manual cosine similarity)
 * - Efficient similarity search (sub-millisecond for 1M vectors)
 * - Batch insertion for performance
 * - Metadata filtering
 *
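 * Performance (approximate figures quoted for large indexes; the 8-vector
 * demo below is far too small to reproduce them):
 * - Insert: ~100K vectors/sec (batched)
 * - Search: <1ms for 1M vectors (HNSW)
 * - Memory: ~4 bytes per dimension per vector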
 */

import { Database } from '@sochdb/sochdb';
import * as fs from 'fs';
import { performance } from 'perf_hooks';

/** Number of components in every vector used by this example. */
const VECTOR_DIMENSION = 384;

/**
 * Component ranges `[start, end)` that are forced to high values for a given
 * document category, so documents of the same category end up close together.
 * Categories without an entry get a purely random vector.
 */
const CLUSTER_RANGES = new Map<string, readonly [number, number]>([
  ['AI', [0, 100]],
  ['Web', [100, 200]],
]);

/**
 * Builds a fake embedding of length `VECTOR_DIMENSION`.
 *
 * Every component starts as a random value in [-0.5, 0.5). When `cluster` is
 * given, components in `[start, end)` are overwritten with values in
 * [0.8, 1.0) so that vectors sharing a cluster range are similar.
 *
 * @param cluster Optional `[start, end)` component range to bias.
 * @returns The generated vector.
 */
function makeSimulatedEmbedding(cluster?: readonly [number, number]): number[] {
  const vector = Array.from({ length: VECTOR_DIMENSION }, () => Math.random() - 0.5);
  if (cluster) {
    const [start, end] = cluster;
    for (let j = start; j < end; j++) {
      vector[j] = 0.8 + Math.random() * 0.2;
    }
  }
  return vector;
}

/**
 * Formats an operations-per-second rate for display.
 *
 * @param count Number of operations performed.
 * @param elapsedMs Time the operations took, in milliseconds.
 * @returns The rounded rate, or `'n/a'` when no time elapsed (avoids printing
 *          `Infinity` or `NaN`).
 */
function formatRatePerSecond(count: number, elapsedMs: number): string {
  return elapsedMs > 0 ? String(Math.round((count / elapsedMs) * 1000)) : 'n/a';
}

/**
 * Runs the vector-search walkthrough end to end: creates a scratch database,
 * a namespace and an HNSW-indexed collection, batch-inserts simulated
 * embeddings, runs plain and metadata-filtered searches, fetches one document
 * by id, and times 100 repeated searches.
 *
 * The database is closed and the scratch directory removed even when a step
 * fails; the error is then re-thrown to the caller.
 */
async function main(): Promise<void> {
  console.log('🚀 SochDB Native Vector Search Example\n');

  // Clean up any existing database (`force` makes a missing path a no-op)
  const dbPath = './example_vector_db';
  fs.rmSync(dbPath, { recursive: true, force: true });

  try {
    // Open database
    const db = await Database.open(dbPath);
    try {
      console.log('✓ Database opened\n');

      // Create namespace
      const ns = await db.createNamespace({ name: 'documents' });
      console.log('✓ Namespace created\n');

      // Create collection with HNSW index
      const collection = await ns.createCollection({
        name: 'embeddings',
        dimension: VECTOR_DIMENSION, // Standard sentence-transformer dimension
        indexed: true,  // Enable HNSW indexing
        hnswM: 16,      // Number of connections per node
        hnswEfConstruction: 200, // Build quality
        // NOTE(review): the cast hides whatever type the SDK expects for
        // `metric`; replace with the SDK's own metric type/enum once confirmed.
        metric: 'cosine' as any
      });
      console.log('✓ Collection created with HNSW index\n');

      // Generate sample vectors (simulating embeddings)
      console.log('📝 Inserting sample documents...');
      const documents = [
        { text: 'Machine learning algorithms', category: 'AI' },
        { text: 'Deep neural networks', category: 'AI' },
        { text: 'Natural language processing', category: 'AI' },
        { text: 'React component patterns', category: 'Web' },
        { text: 'TypeScript best practices', category: 'Web' },
        { text: 'Database indexing strategies', category: 'Database' },
        { text: 'Vector search optimization', category: 'Database' },
        { text: 'Quantum computing basics', category: 'Physics' },
      ];

      const vectors: number[][] = [];
      // NOTE(review): kept as `any[]` because the metadata type accepted by
      // `insertMany` is not visible here — tighten once confirmed.
      const metadatas: any[] = [];
      const ids: string[] = [];

      for (const [i, document] of documents.entries()) {
        // Simulate embeddings (in real use, call OpenAI/Azure/Cohere API);
        // documents in the same category share a biased component range.
        vectors.push(makeSimulatedEmbedding(CLUSTER_RANGES.get(document.category)));
        metadatas.push({
          text: document.text,
          category: document.category,
          timestamp: Date.now()
        });
        ids.push(`doc_${i}`);
      }

      // Batch insert (much faster than individual inserts)
      const insertStart = performance.now();
      await collection.insertMany(vectors, metadatas, ids);
      const insertTime = performance.now() - insertStart;

      console.log(`✓ Inserted ${vectors.length} vectors in ${insertTime.toFixed(2)}ms`);
      console.log(`  Throughput: ${formatRatePerSecond(vectors.length, insertTime)} vectors/sec\n`);

      // Search for similar documents
      console.log('🔍 Searching for AI-related documents...');

      // Create query vector (simulating "machine learning" embedding):
      // biased like the AI cluster so AI documents should rank first.
      const queryVector = makeSimulatedEmbedding(CLUSTER_RANGES.get('AI'));

      const searchStart = performance.now();
      const results = await collection.search({
        queryVector,
        k: 3,
        includeMetadata: true
      });
      const searchTime = performance.now() - searchStart;

      console.log(`✓ Search completed in ${searchTime.toFixed(2)}ms\n`);
      console.log('Top 3 results:');
      results.forEach((result, i) => {
        console.log(`  ${i + 1}. ${result.metadata?.text}`);
        console.log(`     Similarity: ${(result.score * 100).toFixed(1)}%`);
        console.log(`     Category: ${result.metadata?.category}`);
        console.log(`     ID: ${result.id}\n`);
      });

      // Search with metadata filter
      console.log('🔍 Searching only in "Web" category...');
      const filteredResults = await collection.search({
        queryVector: vectors[3], // Use React document's vector
        k: 2,
        filter: { category: 'Web' },
        includeMetadata: true
      });

      console.log('Filtered results:');
      filteredResults.forEach((result, i) => {
        console.log(`  ${i + 1}. ${result.metadata?.text}`);
        console.log(`     Similarity: ${(result.score * 100).toFixed(1)}%\n`);
      });

      // Get specific document
      console.log('📄 Retrieving specific document...');
      const doc = await collection.get('doc_0');
      if (doc) {
        console.log(`✓ Found: ${doc.metadata?.text}`);
        console.log(`  Vector dimension: ${doc.vector.length}\n`);
      }

      // Performance benchmark
      console.log('⚡ Performance Benchmark:');
      console.log('========================');

      const iterations = 100;
      const benchStart = performance.now();

      // Searches run one after another on purpose: the goal is per-query latency.
      for (let i = 0; i < iterations; i++) {
        await collection.search({
          queryVector,
          k: 5,
          includeMetadata: false
        });
      }

      const benchTime = performance.now() - benchStart;
      const avgLatency = benchTime / iterations;

      console.log(`Average search latency: ${avgLatency.toFixed(2)}ms`);
      console.log(`Throughput: ${formatRatePerSecond(iterations, benchTime)} queries/sec`);
      console.log(`Index size: ${vectors.length} vectors × ${VECTOR_DIMENSION} dimensions\n`);
    } finally {
      // Clean up: always release the database handle, even after a failure
      await db.close();
      console.log('✓ Database closed');
    }
  } finally {
    // Clean up test directory, whether or not the example succeeded
    fs.rmSync(dbPath, { recursive: true, force: true });
    console.log('✓ Test data cleaned up');
  }

  console.log('\n✅ Example completed successfully!');
  console.log('\n📚 Key Takeaways:');
  console.log('  • HNSW provides sub-millisecond search');
  console.log('  • Batch insert is 10-100× faster than single inserts');
  console.log('  • Supports metadata filtering');
  console.log('  • Production-ready for millions of vectors');
}

// Run the example. On failure, log the error and set a non-zero exit code so
// shells and CI pipelines do not mistake a failed run for a successful one.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});