Skip to content

Commit 56ab0ae

Browse files
authored
Merge pull request #1 from diberry/move-docdb-vector-ts
Migrate TypeScript files from Azure-Samples/cosmos-db-vector-samples
2 parents 183f672 + f22ceb8 commit 56ab0ae

9 files changed

Lines changed: 793 additions & 0 deletions

File tree

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
DEBUG=true
2+
3+
# ========================================
4+
# Azure OpenAI Embedding Settings
5+
# ========================================
6+
AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-ada-002
7+
AZURE_OPENAI_EMBEDDING_API_VERSION=2023-05-15
8+
AZURE_OPENAI_EMBEDDING_KEY=
9+
AZURE_OPENAI_EMBEDDING_ENDPOINT=https://<RESOURCE-NAME>.openai.azure.com
10+
EMBEDDING_SIZE_BATCH=16
11+
12+
# ========================================
13+
# Data File Paths and Vector Configuration
14+
# ========================================
15+
DATA_FILE_WITHOUT_VECTORS=../data/HotelsData_toCosmosDB.JSON
16+
DATA_FILE_WITH_VECTORS=../data/HotelsData_toCosmosDB_Vector.json
17+
DATA_FILE_WITH_SIMILARITY=../data/HotelsData_toCosmosDB_Vector_Similarity.json
18+
QUERY_FILE_WITH_VECTORS=../data/HotelsData_Query_Vector.json
19+
DATA_FOLDER=../data/
20+
FIELD_TO_EMBED=Description
21+
EMBEDDED_FIELD=text_embedding_ada_002
22+
EMBEDDING_DIMENSIONS=1536
23+
LOAD_SIZE_BATCH=100
24+
25+
# ========================================
26+
# MongoDB/Cosmos DB Connection Settings
27+
# ========================================
28+
MONGO_CONNECTION_STRING=mongodb+srv://<USERNAME>:<PASSWORD>@<CLUSTER-NAME>.global.mongocluster.cosmos.azure.com/?tls=true&authMechanism=SCRAM-SHA-256&retrywrites=false&maxIdleTimeMS=120000
29+
MONGO_CLUSTER_NAME=<CLUSTER-NAME>
30+
31+
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
{
2+
"name": "ts-cosmos-nodejs-vector-samples",
3+
"version": "1.0.0",
4+
"description": "Samples for MongoDB vCore vector search with Cosmos DB",
5+
"main": "index.js",
6+
"type": "module",
7+
"scripts": {
8+
"build": "tsc",
9+
"start:one-insert": "node --env-file .env dist/insert-one-document.js",
10+
"start:embed": "node --env-file .env dist/create-embeddings.js",
11+
"start:show-indexes": "node --env-file .env dist/showIndexes.js",
12+
"start:ivf": "node --env-file .env dist/ivf.js",
13+
"start:hnsw": "node --env-file .env dist/hnsw.js",
14+
"start:diskann": "node --env-file .env dist/diskann.js"
15+
},
16+
"dependencies": {
17+
"@azure/identity": "^4.11.1",
18+
"mongodb": "^6.18.0",
19+
"openai": "^5.16.0"
20+
},
21+
"devDependencies": {
22+
"@types/node": "^24.3.0",
23+
"typescript": "^5.9.2"
24+
}
25+
}
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
/**
2+
* Module for creating embedding vectors using OpenAI API
3+
* Supports text embedding models for generating embeddings
4+
* that can be used with Cosmos DB MongoDB vCore vector search
5+
*/
6+
import * as path from "node:path";
7+
import { AzureOpenAI } from "openai";
8+
import { Embedding } from "openai/resources";
9+
import { readFileReturnJson, writeFileJson, JsonData } from "./utils.js";
10+
11+
// ESM specific features - create __dirname equivalent
12+
import { fileURLToPath } from "node:url";
13+
import { dirname } from "node:path";
14+
const __filename = fileURLToPath(import.meta.url);
15+
const __dirname = dirname(__filename);
16+
17+
// Azure OpenAI connection settings, read from environment variables.
const apiKey = process.env.AZURE_OPENAI_EMBEDDING_KEY;
const apiVersion = process.env.AZURE_OPENAI_EMBEDDING_API_VERSION;
const endpoint = process.env.AZURE_OPENAI_EMBEDDING_ENDPOINT;
console.log(`Using OpenAI endpoint: ${endpoint}`);
const deployment = process.env.AZURE_OPENAI_EMBEDDING_MODEL!;

// Input/output files and the names of the source and target fields.
const dataWithVectors = process.env.DATA_FILE_WITH_VECTORS!;
const dataWithoutVectors = process.env.DATA_FILE_WITHOUT_VECTORS!;
const fieldToEmbed = process.env.FIELD_TO_EMBED || "description";
const newEmbeddedField = process.env.EMBEDDED_FIELD || deployment;

/**
 * Resolves how many texts are sent per embedding request.
 *
 * The sample `.env` names this setting `EMBEDDING_SIZE_BATCH`; the older
 * `EMBEDDING_BATCH_SIZE` spelling is still honoured so existing setups keep working.
 *
 * @param env - Environment-style map of variable names to values.
 * @param fallback - Size used when neither variable holds a positive integer.
 * @returns A positive integer batch size.
 */
function readEmbeddingBatchSize(
  env: Record<string, string | undefined>,
  fallback: number = 16
): number {
  const raw = env.EMBEDDING_SIZE_BATCH || env.EMBEDDING_BATCH_SIZE || '';
  const parsed = parseInt(raw, 10);
  // Zero, negative and non-numeric values would stall or break the batch loop.
  return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback;
}

const batchSize = readEmbeddingBatchSize(process.env);
28+
29+
// Define a reusable delay function
30+
/**
 * Pauses for the given number of milliseconds.
 *
 * @param ms - How long to wait before the returned promise resolves; defaults to 200.
 * @returns A promise that resolves with no value once the timer fires.
 */
async function delay(ms: number = 200): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
33+
34+
/**
 * Requests embedding vectors for a list of input strings in a single call.
 *
 * @param client - Client whose `embeddings.create` method is invoked.
 * @param model - Model name passed through to the embeddings request.
 * @param inputItems - Texts to embed, sent together as the request `input`.
 * @returns The `data` array of the embeddings response.
 * @throws Error when the response has no `data` or its `data` array is empty.
 */
export async function createEmbeddings(client: AzureOpenAI, model: string, inputItems: string[]): Promise<Embedding[]> {
  const request = { model, input: inputItems };
  const { data: vectors } = await client.embeddings.create(request);

  if (vectors && vectors.length !== 0) {
    return vectors;
  }

  throw new Error(`No embedding data returned`);
}
45+
46+
/**
 * Adds an embedding vector to each item by embedding one of its fields, batch by batch.
 *
 * Items are sent to `createEmbeddings` in slices of the module-level `batchSize`, with a
 * `delay()` between slices. Each returned item is a shallow copy of the input item with the
 * vector stored under `newEmbeddedField`; the input items themselves are not modified.
 *
 * @param client - Client passed through to `createEmbeddings`.
 * @param model - Model name passed through to `createEmbeddings`.
 * @param fieldToEmbed - Name of the item property whose value is embedded.
 * @param newEmbeddedField - Name of the property that receives the embedding vector.
 * @param maxEmbeddings - Upper bound on how many items, counted from the start of `items`,
 *   are processed. A falsy value means "all items"; larger values are clamped to `items.length`.
 * @param items - Non-empty array of items to process.
 * @returns The processed items, in input order.
 * @throws Error if `items` is not a non-empty array, if `fieldToEmbed` is empty, or if a batch
 *   returns a different number of embeddings than texts sent. Errors raised while embedding a
 *   batch are logged and rethrown.
 */
export async function processEmbeddingBatch<T>(
  client: AzureOpenAI,
  model: string,
  fieldToEmbed: string,
  newEmbeddedField: string,
  maxEmbeddings: number,
  items: T[]
): Promise<T[]> {
  if (!Array.isArray(items) || items.length === 0) {
    throw new Error("Items must be a non-empty array");
  }

  if (!fieldToEmbed) {
    throw new Error("Field to embed must be specified");
  }

  // Never read past the end of `items`, and never embed more than the caller asked for.
  const limit = Math.min(maxEmbeddings || items.length, items.length);
  // A zero, negative or NaN step would stall or break the loop, so fall back to 16.
  const step = Number.isInteger(batchSize) && batchSize > 0 ? batchSize : 16;

  const itemsWithEmbeddings: T[] = [];

  // Process in batches to avoid rate limits and memory issues
  for (let i = 0; i < limit; i += step) {
    const batchEnd = Math.min(i + step, limit);
    console.log(`Processing batch: ${i} to ${batchEnd - 1} (of ${items.length} items)`);

    const batchItems = items.slice(i, batchEnd);
    const textsToEmbed = batchItems.map((item): string => {
      const value = (item as Record<string, unknown>)[fieldToEmbed];
      if (!value) {
        console.warn(`Item is missing the field to embed: ${fieldToEmbed}`);
        // Placeholder keeps texts and items aligned by position.
        // NOTE(review): the embeddings service may reject empty input — confirm.
        return "";
      }
      return typeof value === "string" ? value : String(value);
    });

    try {
      const embeddings = await createEmbeddings(client, model, textsToEmbed);

      // Results are matched to items by position, so a count mismatch would corrupt the output.
      if (embeddings.length !== batchItems.length) {
        throw new Error(
          `Expected ${batchItems.length} embeddings but received ${embeddings.length}`
        );
      }

      embeddings.forEach((embeddingData, index) => {
        itemsWithEmbeddings.push({
          ...batchItems[index],
          [newEmbeddedField]: embeddingData.embedding
        });
      });

      // Add a small delay between batches to avoid rate limiting
      if (batchEnd < limit) {
        await delay();
      }
    } catch (error) {
      console.error(`Error generating embeddings for batch ${i}:`, error);
      throw error;
    }
  }

  return itemsWithEmbeddings;
}
104+
105+
106+
// Script body: read the source documents, embed the configured field on each one, and write
// the enriched documents to the output file. Both file paths are resolved against the parent
// of this module's directory.
try {
  // Fail with a clear message instead of an opaque path.join TypeError.
  if (!dataWithoutVectors || !dataWithVectors) {
    throw new Error("DATA_FILE_WITHOUT_VECTORS and DATA_FILE_WITH_VECTORS must both be set");
  }

  const client = new AzureOpenAI({
    apiKey,
    apiVersion,
    endpoint,
    deployment
  });

  const data = await readFileReturnJson(path.join(__dirname, "..", dataWithoutVectors));
  const model = deployment;
  const maxEmbeddings = data.length;

  const embeddings = await processEmbeddingBatch<JsonData>(
    client,
    model,
    fieldToEmbed,
    newEmbeddedField,
    maxEmbeddings,
    data
  );

  await writeFileJson(path.join(__dirname, "..", dataWithVectors), embeddings);

} catch (error) {
  // Reading, embedding and writing can all fail here, so the message is not specific to saving.
  const reason = error instanceof Error ? error.message : String(error);
  console.error(`Failed to create embeddings: ${reason}`);
  // Let callers (npm scripts, CI) see that the run failed.
  process.exitCode = 1;
}
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
import path from 'path';
2+
import { readFileReturnJson, getClientsPasswordless, insertData, printSearchResults } from './utils.js';
3+
4+
// ESM specific features - create __dirname equivalent
5+
import { fileURLToPath } from "node:url";
6+
import { dirname } from "node:path";
7+
const __filename = fileURLToPath(import.meta.url);
8+
const __dirname = dirname(__filename);
9+
10+
// Settings for the DiskANN sample. The query text and the database, collection and index
// names are fixed; everything else is read from environment variables.
const sampleEnv = process.env;

const config = {
  query: "quintessential lodging near running trails, eateries, retail",
  dbName: "Hotels",
  collectionName: "hotels_diskann",
  indexName: "vectorIndex_diskann",
  dataFile: sampleEnv.DATA_FILE_WITH_VECTORS as string,
  // Falls back to 100 when LOAD_SIZE_BATCH is unset or empty.
  batchSize: Number.parseInt(sampleEnv.LOAD_SIZE_BATCH || '100', 10),
  embeddedField: sampleEnv.EMBEDDED_FIELD as string,
  // No fallback here: the result is NaN when EMBEDDING_DIMENSIONS is unset.
  embeddingDimensions: Number.parseInt(sampleEnv.EMBEDDING_DIMENSIONS as string, 10),
  deployment: sampleEnv.AZURE_OPENAI_EMBEDDING_MODEL as string,
};
21+
22+
/**
 * Runs the DiskANN vector-search sample end to end.
 *
 * Steps: obtain the AI and database clients, create the collection, load the documents from
 * the configured data file, create a `vector-diskann` index on the embedded field, embed the
 * query text, run a `$search` aggregation for the 5 nearest documents, and print the results.
 *
 * Failures are logged and set `process.exitCode` to 1. The database client is closed in all
 * cases.
 */
async function main(): Promise<void> {

  const { aiClient, dbClient } = getClientsPasswordless();

  try {

    if (!aiClient) {
      throw new Error('AI client is not configured. Please check your environment variables.');
    }
    if (!dbClient) {
      throw new Error('Database client is not configured. Please check your environment variables.');
    }
    // Validate before touching the database: otherwise a missing value only surfaces after
    // data has been inserted, as an index key named "undefined" or a NaN dimension count.
    if (!config.dataFile || !config.embeddedField || !config.deployment) {
      throw new Error('DATA_FILE_WITH_VECTORS, EMBEDDED_FIELD and AZURE_OPENAI_EMBEDDING_MODEL must all be set.');
    }
    if (!Number.isInteger(config.embeddingDimensions) || config.embeddingDimensions <= 0) {
      throw new Error('EMBEDDING_DIMENSIONS must be set to a positive integer.');
    }

    await dbClient.connect();
    const db = dbClient.db(config.dbName);
    const collection = await db.createCollection(config.collectionName);
    console.log('Created collection:', config.collectionName);
    const data = await readFileReturnJson(path.join(__dirname, "..", config.dataFile));
    const insertSummary = await insertData(config, collection, data);

    // Create the vector index
    const indexOptions = {
      createIndexes: config.collectionName,
      indexes: [
        {
          name: config.indexName,
          key: {
            [config.embeddedField]: 'cosmosSearch'
          },
          cosmosSearchOptions: {
            kind: 'vector-diskann',
            dimensions: config.embeddingDimensions,
            similarity: 'COS', // 'COS', 'L2', 'IP'
            maxDegree: 20, // 20 - 2048,  edges per node
            lBuild: 10 // 10 - 500, candidate neighbors evaluated
          }
        }
      ]
    };
    const vectorIndexSummary = await db.command(indexOptions);
    // Logged only after the command has completed, so the message is true.
    console.log('Created vector index:', config.indexName);

    // Create embedding for the query
    const createEmbeddedForQueryResponse = await aiClient.embeddings.create({
      model: config.deployment,
      input: [config.query]
    });
    const queryVector = createEmbeddedForQueryResponse.data[0]?.embedding;
    if (!queryVector) {
      throw new Error('No embedding was returned for the query text.');
    }

    // Perform the vector similarity search
    const searchResults = await collection.aggregate([
      {
        $search: {
          cosmosSearch: {
            vector: queryVector,
            path: config.embeddedField,
            k: 5
          }
        }
      },
      {
        $project: {
          score: {
            $meta: "searchScore"
          },
          document: "$$ROOT"
        }
      }
    ]).toArray();

    // Print the results
    printSearchResults(insertSummary, vectorIndexSummary, searchResults);

  } catch (error) {
    console.error('App failed:', error);
    process.exitCode = 1;
  } finally {
    console.log('Closing database connection...');
    if (dbClient) await dbClient.close();
    console.log('Database connection closed');
  }
}
103+
104+
// Start the sample. A rejection from main() is reported here and marks the process as failed.
main().catch((unhandled) => {
  process.exitCode = 1;
  console.error('Unhandled error:', unhandled);
});

0 commit comments

Comments
 (0)