Skip to content

Commit 4787781

Browse files
committed
feat: record and replay Gemini audio responses
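Detects audio inlineData in both non-streaming JSON and streaming SSE Gemini responses and records them as AudioResponse fixtures with b64Json and contentType. Widens the recorder's EndpointType to include audio-gen and fal-audio. Adds a byte-length guard for base64 embeddings: a decoded buffer whose length is not a multiple of 4 (i.e. not a whole number of 32-bit floats) is treated as malformed and recorded as an empty embedding instead of being read through a Float32Array.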
1 parent a05e72e commit 4787781

1 file changed

Lines changed: 63 additions & 16 deletions

File tree

src/recorder.ts

Lines changed: 63 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -168,23 +168,37 @@ export async function proxyAndRecord(
168168
if (collapsed.droppedChunks && collapsed.droppedChunks > 0) {
169169
defaults.logger.warn(`${collapsed.droppedChunks} chunk(s) dropped during stream collapse`);
170170
}
171-
if (collapsed.content === "" && (!collapsed.toolCalls || collapsed.toolCalls.length === 0)) {
171+
// Audio from streamed inlineData (e.g. Gemini SSE with audio parts)
172+
if (collapsed.audioB64) {
173+
fixtureResponse = {
174+
audio: {
175+
b64Json: collapsed.audioB64,
176+
contentType: collapsed.audioMimeType ?? "audio/mpeg",
177+
},
178+
};
179+
} else if (
180+
collapsed.content === "" &&
181+
(!collapsed.toolCalls || collapsed.toolCalls.length === 0)
182+
) {
172183
defaults.logger.warn("Stream collapse produced empty content — fixture may be incomplete");
173-
}
174-
const reasoningSpread = collapsed.reasoning ? { reasoning: collapsed.reasoning } : {};
175-
if (collapsed.toolCalls && collapsed.toolCalls.length > 0) {
176-
if (collapsed.content) {
177-
// Both content and toolCalls present — save as ContentWithToolCallsResponse
178-
fixtureResponse = {
179-
content: collapsed.content,
180-
toolCalls: collapsed.toolCalls,
181-
...reasoningSpread,
182-
};
184+
const reasoningSpread = collapsed.reasoning ? { reasoning: collapsed.reasoning } : {};
185+
fixtureResponse = { content: collapsed.content ?? "", ...reasoningSpread };
186+
} else {
187+
const reasoningSpread = collapsed.reasoning ? { reasoning: collapsed.reasoning } : {};
188+
if (collapsed.toolCalls && collapsed.toolCalls.length > 0) {
189+
if (collapsed.content) {
190+
// Both content and toolCalls present — save as ContentWithToolCallsResponse
191+
fixtureResponse = {
192+
content: collapsed.content,
193+
toolCalls: collapsed.toolCalls,
194+
...reasoningSpread,
195+
};
196+
} else {
197+
fixtureResponse = { toolCalls: collapsed.toolCalls, ...reasoningSpread };
198+
}
183199
} else {
184-
fixtureResponse = { toolCalls: collapsed.toolCalls, ...reasoningSpread };
200+
fixtureResponse = { content: collapsed.content ?? "", ...reasoningSpread };
185201
}
186-
} else {
187-
fixtureResponse = { content: collapsed.content ?? "", ...reasoningSpread };
188202
}
189203
} else {
190204
// Non-streaming — try to parse as JSON
@@ -221,7 +235,10 @@ export async function proxyAndRecord(
221235
const fixtureMatch = buildFixtureMatch(matchRequest);
222236

223237
// Build and save the fixture
224-
const fixture: Fixture = { match: fixtureMatch, response: fixtureResponse };
238+
const fixture: Fixture = {
239+
match: fixtureMatch,
240+
response: fixtureResponse,
241+
};
225242

226243
// Check if the match is empty (all undefined values) — warn but still save to disk
227244
const matchValues = Object.values(fixtureMatch);
@@ -451,6 +468,10 @@ function buildFixtureResponse(
451468
}
452469
if (typeof first.embedding === "string" && encodingFormat === "base64") {
453470
const buf = Buffer.from(first.embedding, "base64");
471+
if (buf.byteLength % 4 !== 0) {
472+
// Malformed embedding — return a zero-dimension embedding fixture
473+
return { embedding: [] };
474+
}
454475
const aligned = new Uint8Array(buf).buffer; // Always offset 0
455476
const floats = new Float32Array(aligned, 0, buf.byteLength / 4);
456477
return { embedding: Array.from(floats) };
@@ -626,6 +647,24 @@ function buildFixtureResponse(
626647
const content = candidate.content as Record<string, unknown> | undefined;
627648
if (content && Array.isArray(content.parts)) {
628649
const parts = content.parts as Array<Record<string, unknown>>;
650+
651+
// Audio inlineData parts take priority over text
652+
const audioParts = parts.filter(
653+
(p: Record<string, unknown>) =>
654+
p.inlineData &&
655+
typeof (p.inlineData as Record<string, unknown>).mimeType === "string" &&
656+
((p.inlineData as Record<string, unknown>).mimeType as string).startsWith("audio/"),
657+
);
658+
if (audioParts.length > 0) {
659+
const inlineData = audioParts[0].inlineData as Record<string, unknown>;
660+
return {
661+
audio: {
662+
b64Json: String(inlineData.data ?? ""),
663+
contentType: String(inlineData.mimeType),
664+
},
665+
};
666+
}
667+
629668
const fnCallParts = parts.filter((p) => p.functionCall);
630669
const textParts = parts.filter((p) => typeof p.text === "string" && !p.thought);
631670
const thoughtParts = parts.filter((p) => p.thought === true && typeof p.text === "string");
@@ -833,7 +872,15 @@ function buildFixtureResponse(
833872
/**
834873
* Derive fixture match criteria from the original request.
835874
*/
836-
type EndpointType = "chat" | "image" | "speech" | "transcription" | "video" | "embedding";
875+
type EndpointType =
876+
| "chat"
877+
| "image"
878+
| "speech"
879+
| "transcription"
880+
| "video"
881+
| "embedding"
882+
| "audio-gen"
883+
| "fal-audio";
837884

838885
function buildFixtureMatch(request: ChatCompletionRequest): {
839886
userMessage?: string;

0 commit comments

Comments
 (0)