Foundry-Local/sdk/js/src/openai/audioClient.ts at 6dbf211c0d64f20fecd19cd9bf15b129d1fcf6a3 · microsoft/Foundry-Local · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
import { CoreInterop } from '../detail/coreInterop.js';
import { LiveAudioTranscriptionSession } from './liveAudioSession.js';

/**
 * Optional per-client settings that are merged into every audio request.
 * Fields left `undefined` are omitted from the serialized output.
 */
export class AudioClientSettings {
    /** Emitted as `Language` and mirrored into `metadata.language` when set. */
    language?: string;
    /** Emitted as `Temperature` and mirrored (stringified) into `metadata.temperature` when set. */
    temperature?: number;

    /**
     * Builds the request fragment for these settings.
     *
     * Each defined setting is written twice: once as a PascalCase top-level
     * property (the OpenAI-style field) and once, as a string, inside a
     * Foundry-specific `metadata` map. The `metadata` key is only present
     * when at least one setting is defined.
     *
     * @returns A fresh plain object containing only the defined settings.
     * @internal
     */
    _serialize() {
        const payload: { [k: string]: any } = {};
        const foundryMetadata: Record<string, string> = {};

        // Insertion order (Language, Temperature, metadata) is kept stable
        // because the object is later passed through JSON.stringify.
        if (this.language !== undefined) {
            payload.Language = this.language;
            foundryMetadata["language"] = this.language;
        }

        if (this.temperature !== undefined) {
            payload.Temperature = this.temperature;
            foundryMetadata["temperature"] = this.temperature.toString();
        }

        const hasMetadata = Object.keys(foundryMetadata).length > 0;
        if (hasMetadata) {
            payload.metadata = foundryMetadata;
        }

        return payload;
    }
}

/**
 * Client for audio transcription against a loaded model: one-shot file
 * transcription, chunk-streamed file transcription, and live sessions.
 * Requests are sent to the native core as a serialized `OpenAICreateRequest`.
 */
export class AudioClient {
    private readonly modelId: string;
    private readonly coreInterop: CoreInterop;

    /**
     * Configuration settings for audio operations.
     * Read (via `_serialize()`) each time `transcribe` or `transcribeStreaming` is called.
     */
    public settings = new AudioClientSettings();

    /**
     * @internal
     * Restricted to internal use because CoreInterop is an internal implementation detail.
     * Users should create clients via the Model.createAudioClient() factory method.
     */
    constructor(modelId: string, coreInterop: CoreInterop) {
        this.modelId = modelId;
        this.coreInterop = coreInterop;
    }

    /**
     * Creates a LiveAudioTranscriptionSession for real-time audio streaming ASR.
     * @returns A new LiveAudioTranscriptionSession bound to this client's model and interop.
     */
    public createLiveTranscriptionSession(): LiveAudioTranscriptionSession {
        return new LiveAudioTranscriptionSession(this.modelId, this.coreInterop);
    }

    /**
     * Validates that the audio file path is a non-empty, non-whitespace string.
     * @throws Error - If the value is not a string or is blank after trimming.
     * @internal
     */
    private validateAudioFilePath(audioFilePath: string): void {
        if (typeof audioFilePath !== 'string' || audioFilePath.trim() === '') {
            throw new Error('Audio file path must be a non-empty string.');
        }
    }

    /**
     * Builds the request object shared by the one-shot and streaming paths.
     * Settings are snapshotted at call time.
     * @internal
     */
    private buildRequest(audioFilePath: string) {
        return {
            Model: this.modelId,
            FileName: audioFilePath,
            ...this.settings._serialize()
        };
    }

    /**
     * Transcribes audio into the input language.
     * @param audioFilePath - Path to the audio file to transcribe.
     * @returns The transcription result (the JSON-parsed response of the `audio_transcribe` command).
     * @throws Error - If audioFilePath is invalid, or if the command or response parsing fails
     *                 (the original error is attached as `cause`).
     */
    public async transcribe(audioFilePath: string): Promise<any> {
        this.validateAudioFilePath(audioFilePath);
        const request = this.buildRequest(audioFilePath);

        try {
            // executeCommand's return value is used directly (not awaited) and parsed as JSON.
            const response = this.coreInterop.executeCommand("audio_transcribe", { Params: { OpenAICreateRequest: JSON.stringify(request) } });
            return JSON.parse(response);
        } catch (error) {
            throw new Error(`Audio transcription failed for model '${this.modelId}': ${error instanceof Error ? error.message : String(error)}`, { cause: error });
        }
    }

    /**
     * Transcribes audio into the input language using streaming, returning an async iterable of chunks.
     *
     * The path is validated and the request is built immediately; the native
     * stream itself is started each time the returned object's
     * `[Symbol.asyncIterator]()` is invoked (i.e. when a `for await` loop begins).
     *
     * @param audioFilePath - Path to the audio file to transcribe.
     * @returns An async iterable that yields parsed streaming transcription chunks.
     * @throws Error - Synchronously if audioFilePath is invalid; from the iterator's
     *                 `next()` if a chunk cannot be parsed or the stream fails.
     *
     * @example
     * ```typescript
     * for await (const chunk of audioClient.transcribeStreaming('recording.wav')) {
     *     process.stdout.write(chunk.text);
     * }
     * ```
     */
    public transcribeStreaming(audioFilePath: string): AsyncIterable<any> {
        this.validateAudioFilePath(audioFilePath);

        const request = this.buildRequest(audioFilePath);

        // Capture instance properties to local variables because `this` is not
        // accessible inside the [Symbol.asyncIterator]() method below — it's a
        // regular method on the returned object literal, not on the AudioClient.
        const coreInterop = this.coreInterop;
        const modelId = this.modelId;

        return {
            [Symbol.asyncIterator](): AsyncIterator<any> {
                // Buffer for chunks received from the native callback, consumed
                // through a head index so dequeueing never shifts the array.
                // NOTE(review): assumes the callback is delivered on the JS
                // thread, so pushes and next() never interleave mid-statement.
                const chunks: any[] = [];
                let head = 0;
                let done = false;
                let cancelled = false;
                let error: Error | null = null;
                // Resolver of the promise a pending next() is parked on, if any.
                let resolve: (() => void) | null = null;
                let nextInFlight = false;

                // Releases a parked next() call (if there is one) so it can
                // re-evaluate the buffer / error / done / cancelled state.
                const wake = (): void => {
                    if (resolve) {
                        const r = resolve;
                        resolve = null;
                        r();
                    }
                };

                // The chain ends in catch(), so it never rejects; `void` marks
                // the promise as intentionally not awaited.
                void coreInterop.executeCommandStreaming(
                    "audio_transcribe",
                    { Params: { OpenAICreateRequest: JSON.stringify(request) } },
                    (chunkStr: string) => {
                        // Ignore anything arriving after cancellation or a failure.
                        if (cancelled || error) return;
                        if (chunkStr) {
                            try {
                                chunks.push(JSON.parse(chunkStr));
                            } catch (e) {
                                error = new Error(
                                    `Failed to parse streaming chunk: ${e instanceof Error ? e.message : String(e)}`,
                                    { cause: e }
                                );
                            }
                        }
                        wake();
                    }
                ).then(() => {
                    // Native stream completed: mark done and let a pending
                    // next() observe that iteration has ended.
                    done = true;
                    wake();
                }).catch((err) => {
                    // Keep the first error (e.g. an earlier parse failure) if one exists.
                    if (!error) {
                        const underlyingError = err instanceof Error ? err : new Error(String(err));
                        error = new Error(
                            `Streaming audio transcription failed for model '${modelId}': ${underlyingError.message}`,
                            { cause: underlyingError }
                        );
                    }
                    done = true;
                    wake();
                });

                // The AsyncIterator consumed by `for await`. next() yields
                // buffered chunks one at a time; return() is called
                // automatically when the consumer exits the loop early.
                return {
                    async next(): Promise<IteratorResult<any>> {
                        if (nextInFlight) {
                            throw new Error('next() called concurrently on streaming iterator; await each call before invoking next().');
                        }
                        nextInFlight = true;
                        try {
                            while (true) {
                                // A cancelled iterator stays finished, even if the
                                // native stream reports a failure afterwards.
                                if (cancelled) {
                                    return { value: undefined, done: true };
                                }
                                if (head < chunks.length) {
                                    const value = chunks[head];
                                    chunks[head] = undefined; // allow GC
                                    head++;
                                    // Compact the array when all buffered chunks have been consumed
                                    if (head === chunks.length) {
                                        chunks.length = 0;
                                        head = 0;
                                    }
                                    return { value, done: false };
                                }
                                // Buffered chunks are drained before an error is surfaced.
                                if (error) {
                                    throw error;
                                }
                                if (done) {
                                    return { value: undefined, done: true };
                                }
                                // Park until the callback, completion, failure or return() wakes us.
                                await new Promise<void>((r) => { resolve = r; });
                            }
                        } finally {
                            nextInFlight = false;
                        }
                    },
                    async return(): Promise<IteratorResult<any>> {
                        // Mark cancelled so the callback stops buffering.
                        // Note: the underlying native stream cannot be cancelled
                        // (CoreInterop.executeCommandStreaming has no abort support),
                        // so the native callback may still fire but will no-op due
                        // to the cancelled guard above.
                        cancelled = true;
                        chunks.length = 0;
                        head = 0;
                        wake();
                        return { value: undefined, done: true };
                    }
                };
            }
        };
    }
}