1+ /**
2+ * Module for creating embedding vectors using OpenAI API
3+ * Supports text embedding models for generating embeddings
4+ * that can be used with Cosmos DB MongoDB vCore vector search
5+ */
6+ import * as path from "node:path" ;
7+ import { AzureOpenAI } from "openai" ;
8+ import { Embedding } from "openai/resources" ;
9+ import { readFileReturnJson , writeFileJson , JsonData } from "./utils.js" ;
10+
11+ // ESM specific features - create __dirname equivalent
12+ import { fileURLToPath } from "node:url" ;
13+ import { dirname } from "node:path" ;
// ES modules do not define __filename/__dirname, so derive both from this module's URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
16+
/**
 * Reads a positive integer from an environment-style string.
 *
 * @param raw - Raw value (typically from `process.env`); may be undefined.
 * @param fallback - Value returned when `raw` is missing, not numeric, zero or negative.
 * @returns The parsed integer when it is greater than zero, otherwise `fallback`.
 */
function parsePositiveInt(raw: string | undefined, fallback: number): number {
  const parsed = Number.parseInt(raw ?? "", 10);
  return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback;
}

// Azure OpenAI connection settings, taken from the environment.
const apiKey = process.env.AZURE_OPENAI_EMBEDDING_KEY;
const apiVersion = process.env.AZURE_OPENAI_EMBEDDING_API_VERSION;
const endpoint = process.env.AZURE_OPENAI_EMBEDDING_ENDPOINT;
console.log(`Using OpenAI endpoint: ${endpoint} `);
// The deployment name doubles as the model name and as the default output field name.
const deployment = process.env.AZURE_OPENAI_EMBEDDING_MODEL!;

// Input/output data files; they are joined onto the parent of this module's directory.
const dataWithVectors = process.env.DATA_FILE_WITH_VECTORS!;
const dataWithoutVectors = process.env.DATA_FILE_WITHOUT_VECTORS!;
// Property read from each item, and property the resulting vector is stored under.
const fieldToEmbed = process.env.FIELD_TO_EMBED || "description";
const newEmbeddedField = process.env.EMBEDDED_FIELD || deployment;
// Number of items per embeddings request. Invalid values (non-numeric, 0, negative)
// fall back to 16 so the batching loop always advances.
const batchSize = parsePositiveInt(process.env.EMBEDDING_BATCH_SIZE, 16);
28+
/**
 * Returns a promise that settles after a pause.
 *
 * @param ms - Length of the pause in milliseconds; defaults to 200.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function delay(ms: number = 200): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
33+
/**
 * Requests embedding vectors for a list of texts in a single API call.
 *
 * @param client - Client whose `embeddings.create` method is invoked.
 * @param model - Model (deployment) name sent with the request.
 * @param inputItems - Texts to embed.
 * @returns One embedding per input text, ordered to match `inputItems`.
 * @throws Error when the response contains no embeddings, or a different
 *         number of embeddings than inputs (which would otherwise make the
 *         caller pair vectors with the wrong items or drop items silently).
 */
export async function createEmbeddings(
  client: AzureOpenAI,
  model: string,
  inputItems: string[]
): Promise<Embedding[]> {
  const response = await client.embeddings.create({
    model,
    input: inputItems,
  });

  if (!response.data || response.data.length === 0) {
    throw new Error(`No embedding data returned`);
  }

  if (response.data.length !== inputItems.length) {
    throw new Error(
      `Expected ${inputItems.length} embeddings but received ${response.data.length}`
    );
  }

  // Each embedding carries the index of the input it belongs to; sort a copy by
  // that index so position i of the result always corresponds to inputItems[i].
  return [...response.data].sort((a, b) => a.index - b.index);
}
45+
/**
 * Adds an embedding vector to each item by embedding one of its fields, batch by batch.
 *
 * Items are sent to `createEmbeddings` in slices of the module-level `batchSize`,
 * with a `delay()` pause between slices. Every returned element is a shallow copy
 * of the corresponding input item with the vector stored under `newEmbeddedField`;
 * the input array and its items are not modified.
 *
 * @typeParam T - Type of the items; each is expected to expose `fieldToEmbed` as a property.
 * @param client - Client passed through to `createEmbeddings`.
 * @param model - Model (deployment) name passed through to `createEmbeddings`.
 * @param fieldToEmbed - Name of the property whose value is embedded. Items where it is
 *                       missing or falsy are embedded as `""` and a warning is logged.
 * @param newEmbeddedField - Name of the property that receives the embedding vector.
 * @param maxEmbeddings - Maximum number of items to process, counted from the start of
 *                        `items`. A falsy value means "all items"; values larger than
 *                        `items.length` are clamped to it.
 * @param items - Items to embed; must be a non-empty array.
 * @returns Copies of the processed items, each extended with its embedding.
 * @throws Error when `items` is empty or not an array, when `fieldToEmbed` is empty, or
 *         when the configured batch size is not a positive integer. Errors raised while
 *         embedding a batch are logged and rethrown.
 */
export async function processEmbeddingBatch<T>(
  client: AzureOpenAI,
  model: string,
  fieldToEmbed: string,
  newEmbeddedField: string,
  maxEmbeddings: number,
  items: T[]
): Promise<T[]> {
  if (!Array.isArray(items) || items.length === 0) {
    throw new Error("Items must be a non-empty array");
  }

  if (!fieldToEmbed) {
    throw new Error("Field to embed must be specified");
  }

  // A zero, negative or NaN step would never advance the loop correctly.
  if (!Number.isInteger(batchSize) || batchSize <= 0) {
    throw new Error(`Batch size must be a positive integer, got ${batchSize}`);
  }

  // Number of items actually processed: never more than requested and never more
  // than available. Every bound below uses this value so the last batch cannot
  // overrun `maxEmbeddings`, and no empty batch is sent when it exceeds the input.
  const limit = Math.min(Math.floor(maxEmbeddings || items.length), items.length);

  const itemsWithEmbeddings: T[] = [];

  // Process in batches to avoid rate limits and memory issues
  for (let i = 0; i < limit; i += batchSize) {
    const batchEnd = Math.min(i + batchSize, limit);
    console.log(`Processing batch: ${i} to ${batchEnd - 1} (of ${items.length} items)`);

    const batchItems = items.slice(i, batchEnd);
    const textsToEmbed = batchItems.map((item) => {
      const value = (item as Record<string, unknown>)[fieldToEmbed];
      if (!value) {
        console.warn(`Item is missing the field to embed: ${fieldToEmbed} `);
        // Keep one input per item so results stay aligned with batchItems.
        // NOTE(review): the OpenAI embeddings reference states that input cannot be
        // an empty string — confirm the service accepts this fallback.
        return "";
      }
      // The value is forwarded as-is; it is assumed to be text — TODO confirm against the data files.
      return value as string;
    });

    try {
      const embeddings = await createEmbeddings(client, model, textsToEmbed);

      embeddings.forEach((embeddingData, index) => {
        itemsWithEmbeddings.push({
          ...batchItems[index],
          [newEmbeddedField]: embeddingData.embedding,
        });
      });

      // Add a small delay between batches to avoid rate limiting
      if (batchEnd < limit) {
        await delay();
      }
    } catch (error) {
      console.error(`Error generating embeddings for batch ${i} :`, error);
      throw error;
    }
  }

  return itemsWithEmbeddings;
}
104+
105+
// Script body: runs when this module is loaded. Reads the source data file, adds an
// embedding to every item and writes the result to the output file. Both file names
// are resolved relative to the parent of this module's directory.
try {
  // Fail with a readable message instead of an opaque path.join() TypeError.
  if (!dataWithoutVectors || !dataWithVectors) {
    throw new Error(
      "DATA_FILE_WITHOUT_VECTORS and DATA_FILE_WITH_VECTORS must both be set"
    );
  }

  const client = new AzureOpenAI({
    apiKey,
    apiVersion,
    endpoint,
    deployment,
  });

  const data = await readFileReturnJson(path.join(__dirname, "..", dataWithoutVectors));
  const model = deployment;
  // Embed every item in the file.
  const maxEmbeddings = data.length;

  const embeddings = await processEmbeddingBatch<JsonData>(
    client,
    model,
    fieldToEmbed,
    newEmbeddedField,
    maxEmbeddings,
    data
  );

  await writeFileJson(path.join(__dirname, "..", dataWithVectors), embeddings);
} catch (error) {
  // Any step above can fail (reading, embedding, writing), so the message does not
  // single out the save step. The thrown value is not guaranteed to be an Error.
  const message = error instanceof Error ? error.message : String(error);
  console.error(`Failed to create embeddings file: ${message}`);
  // Report the failure through the exit status without terminating abruptly.
  process.exitCode = 1;
}
0 commit comments