diff --git a/docs/index.html b/docs/index.html index 54bbad60..2e32da81 100644 --- a/docs/index.html +++ b/docs/index.html @@ -1637,7 +1637,7 @@

How aimock compares

Image generation Built-in ✓ - + @@ -1727,7 +1727,7 @@

How aimock compares

Drift detection - + @@ -1736,7 +1736,7 @@

How aimock compares

Sequential / stateful responses Built-in ✓ manual - + @@ -1745,7 +1745,7 @@

How aimock compares

Request journal manual - + @@ -1781,7 +1781,7 @@

How aimock compares

Structured output / JSON mode Built-in ✓ manual - + @@ -1810,8 +1810,8 @@

How aimock compares

- - + + Vitest / Jest plugins diff --git a/docs/mcp/index.html b/docs/mcp/index.html new file mode 100644 index 00000000..d3331ba5 --- /dev/null +++ b/docs/mcp/index.html @@ -0,0 +1,238 @@ + + + + + + MCP Server — aimock + + + + + + + + + + +
+ + +
+

MCP Server

+

+ aimock's documentation and source code are indexed by a + Pathfinder + instance, giving AI agents semantic search and file exploration via MCP. +

+ +

Available Tools

+

+ The MCP server exposes five tools that agents can use to find information about aimock: +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ToolDescription
search-docs + Semantic search over aimock documentation — guides, API references, provider configs +
search-codeSemantic search over aimock's TypeScript source — routers, handlers, builders
explore-docsBrowse documentation files with bash commands (find, grep, cat, ls, head)
explore-codeBrowse source code files with bash commands
submit-feedbackReport whether search results were helpful to improve quality
+ +

Connect Your Agent

+ +

Claude Code

+

Add via the CLI:

+
+
Terminal bash
+
claude mcp add aimock-docs --transport http https://mcp.aimock.copilotkit.dev/mcp
+
+

Or add to ~/.claude/mcp.json:

+
+
~/.claude/mcp.json json
+
{
+  "mcpServers": {
+    "aimock-docs": {
+      "type": "http",
+      "url": "https://mcp.aimock.copilotkit.dev/mcp"
+    }
+  }
+}
+
+ +

Claude Desktop

+
+
+ ~/Library/Application Support/Claude/claude_desktop_config.json + json +
+
{
+  "mcpServers": {
+    "aimock-docs": {
+      "type": "http",
+      "url": "https://mcp.aimock.copilotkit.dev/mcp"
+    }
+  }
+}
+
+ +

Cursor

+
+
.cursor/mcp.json json
+
{
+  "mcpServers": {
+    "aimock-docs": {
+      "type": "http",
+      "url": "https://mcp.aimock.copilotkit.dev/mcp"
+    }
+  }
+}
+
+ +

OpenAI Codex

+
+
codex-mcp.json json
+
{
+  "mcpServers": {
+    "aimock-docs": {
+      "type": "http",
+      "url": "https://mcp.aimock.copilotkit.dev/mcp"
+    }
+  }
+}
+
+

Then run: codex --mcp-config codex-mcp.json

+ +

VS Code (Continue)

+
+
+ ~/.continue/config.json json +
+
{
+  "mcpServers": [
+    {
+      "name": "aimock-docs",
+      "transport": {
+        "type": "http",
+        "url": "https://mcp.aimock.copilotkit.dev/mcp"
+      }
+    }
+  ]
+}
+
+ +

Generic (Streamable HTTP)

+

Any MCP-compatible client can connect via Streamable HTTP:

+
+
Endpoint text
+
URL:       https://mcp.aimock.copilotkit.dev/mcp
+Transport: Streamable HTTP
+Method:    POST
+
+ +

What Gets Indexed

+

+ The MCP server indexes two sources from the + aimock repository: +

+ +

+ The index auto-refreshes daily and on push via GitHub webhooks, so agents always search + against the latest content. +

+ +

Verifying the Connection

+

+ Once connected, your agent should see the aimock tools in its available tools list. Try + asking your agent: +

+ +
+ +
+ + + + + diff --git a/docs/sidebar.js b/docs/sidebar.js index 05e6c4ab..7c9901de 100644 --- a/docs/sidebar.js +++ b/docs/sidebar.js @@ -10,6 +10,7 @@ { label: "Quick Start: LLM", href: "/chat-completions" }, { label: "Quick Start: aimock", href: "/aimock-cli" }, { label: "Examples", href: "/examples" }, + { label: "MCP Server", href: "/mcp" }, ], }, { diff --git a/src/__tests__/elevenlabs-audio.test.ts b/src/__tests__/elevenlabs-audio.test.ts new file mode 100644 index 00000000..c76ae85a --- /dev/null +++ b/src/__tests__/elevenlabs-audio.test.ts @@ -0,0 +1,229 @@ +import { describe, test, expect, afterEach } from "vitest"; +import { LLMock } from "../llmock.js"; + +describe("ElevenLabs sound generation", () => { + let mock: LLMock; + + afterEach(async () => { + await mock?.stop(); + }); + + test("sound generation with string-form audio returns binary", async () => { + mock = new LLMock({ port: 0 }); + mock.addFixture({ + match: { userMessage: "castle door opening", endpoint: "audio-gen" }, + response: { audio: "SGVsbG8=", format: "mp3" }, + }); + await mock.start(); + + const res = await fetch(`${mock.url}/v1/sound-generation`, { + method: "POST", + headers: { "Content-Type": "application/json", Authorization: "Bearer test" }, + body: JSON.stringify({ text: "castle door opening" }), + }); + + expect(res.status).toBe(200); + expect(res.headers.get("content-type")).toBe("audio/mpeg"); + const buffer = await res.arrayBuffer(); + expect(buffer.byteLength).toBeGreaterThan(0); + // "SGVsbG8=" decodes to "Hello" (5 bytes) + expect(buffer.byteLength).toBe(5); + }); + + test("sound generation with object-form audio", async () => { + mock = new LLMock({ port: 0 }); + mock.addFixture({ + match: { userMessage: "explosion", endpoint: "audio-gen" }, + response: { audio: { b64Json: "SGVsbG8=", contentType: "audio/wav" } }, + }); + await mock.start(); + + const res = await fetch(`${mock.url}/v1/sound-generation`, { + method: "POST", + headers: { "Content-Type": 
"application/json", Authorization: "Bearer test" }, + body: JSON.stringify({ text: "explosion" }), + }); + + expect(res.status).toBe(200); + expect(res.headers.get("content-type")).toBe("audio/wav"); + const buffer = await res.arrayBuffer(); + expect(buffer.byteLength).toBe(5); + }); + + test("missing text field returns 400", async () => { + mock = new LLMock({ port: 0 }); + await mock.start(); + + const res = await fetch(`${mock.url}/v1/sound-generation`, { + method: "POST", + headers: { "Content-Type": "application/json", Authorization: "Bearer test" }, + body: JSON.stringify({}), + }); + + expect(res.status).toBe(400); + const data = await res.json(); + expect(data.error.message).toContain("text"); + }); + + test("no matching fixture returns 404", async () => { + mock = new LLMock({ port: 0 }); + mock.addFixture({ + match: { userMessage: "specific sound", endpoint: "audio-gen" }, + response: { audio: "SGVsbG8=" }, + }); + await mock.start(); + + const res = await fetch(`${mock.url}/v1/sound-generation`, { + method: "POST", + headers: { "Content-Type": "application/json", Authorization: "Bearer test" }, + body: JSON.stringify({ text: "completely different sound" }), + }); + + expect(res.status).toBe(404); + }); + + test("error fixture returns error status", async () => { + mock = new LLMock({ port: 0 }); + mock.addFixture({ + match: { userMessage: "rate limited", endpoint: "audio-gen" }, + response: { error: { message: "rate limit", type: "rate_limit_error" }, status: 429 }, + }); + await mock.start(); + + const res = await fetch(`${mock.url}/v1/sound-generation`, { + method: "POST", + headers: { "Content-Type": "application/json", Authorization: "Bearer test" }, + body: JSON.stringify({ text: "rate limited" }), + }); + + expect(res.status).toBe(429); + const data = await res.json(); + expect(data.error.message).toBe("rate limit"); + }); +}); + +describe("ElevenLabs music", () => { + let mock: LLMock; + + afterEach(async () => { + await mock?.stop(); + }); + + 
test("music compose returns binary audio with song-id header", async () => { + mock = new LLMock({ port: 0 }); + mock.addFixture({ + match: { userMessage: "upbeat piano", endpoint: "audio-gen" }, + response: { audio: "SGVsbG8=", format: "mp3" }, + }); + await mock.start(); + + const res = await fetch(`${mock.url}/v1/music`, { + method: "POST", + headers: { "Content-Type": "application/json", Authorization: "Bearer test" }, + body: JSON.stringify({ prompt: "upbeat piano" }), + }); + + expect(res.status).toBe(200); + expect(res.headers.get("content-type")).toBe("audio/mpeg"); + expect(res.headers.get("song-id")).toBeTruthy(); + expect(res.headers.get("song-id")).toMatch(/^mock-song-/); + const buffer = await res.arrayBuffer(); + expect(buffer.byteLength).toBe(5); + }); + + test("music stream returns binary audio", async () => { + mock = new LLMock({ port: 0 }); + mock.addFixture({ + match: { userMessage: "ambient drone", endpoint: "audio-gen" }, + response: { audio: "SGVsbG8=" }, + }); + await mock.start(); + + const res = await fetch(`${mock.url}/v1/music/stream`, { + method: "POST", + headers: { "Content-Type": "application/json", Authorization: "Bearer test" }, + body: JSON.stringify({ prompt: "ambient drone" }), + }); + + expect(res.status).toBe(200); + const buffer = await res.arrayBuffer(); + expect(buffer.byteLength).toBe(5); + }); + + test("music plan returns JSON text", async () => { + mock = new LLMock({ port: 0 }); + const compositionPlan = JSON.stringify({ sections: ["intro", "verse", "chorus"] }); + mock.addFixture({ + match: { userMessage: "jazz song", endpoint: "audio-gen" }, + response: { content: compositionPlan }, + }); + await mock.start(); + + const res = await fetch(`${mock.url}/v1/music/plan`, { + method: "POST", + headers: { "Content-Type": "application/json", Authorization: "Bearer test" }, + body: JSON.stringify({ prompt: "jazz song" }), + }); + + expect(res.status).toBe(200); + 
expect(res.headers.get("content-type")).toBe("application/json"); + const data = await res.json(); + expect(data.sections).toEqual(["intro", "verse", "chorus"]); + }); + + test("missing prompt returns 400 for music", async () => { + mock = new LLMock({ port: 0 }); + await mock.start(); + + const res = await fetch(`${mock.url}/v1/music`, { + method: "POST", + headers: { "Content-Type": "application/json", Authorization: "Bearer test" }, + body: JSON.stringify({}), + }); + + expect(res.status).toBe(400); + const data = await res.json(); + expect(data.error.message).toContain("prompt"); + }); +}); + +describe("ElevenLabs convenience methods", () => { + let mock: LLMock; + + afterEach(async () => { + await mock?.stop(); + }); + + test("onSoundEffect creates fixture with correct endpoint", async () => { + mock = new LLMock({ port: 0 }); + mock.onSoundEffect("door", { audio: "SGVsbG8=" }); + await mock.start(); + + const res = await fetch(`${mock.url}/v1/sound-generation`, { + method: "POST", + headers: { "Content-Type": "application/json", Authorization: "Bearer test" }, + body: JSON.stringify({ text: "door" }), + }); + + expect(res.status).toBe(200); + const buffer = await res.arrayBuffer(); + expect(buffer.byteLength).toBe(5); + }); + + test("onMusic creates fixture with correct endpoint", async () => { + mock = new LLMock({ port: 0 }); + mock.onMusic("piano", { audio: "SGVsbG8=" }); + await mock.start(); + + const res = await fetch(`${mock.url}/v1/music`, { + method: "POST", + headers: { "Content-Type": "application/json", Authorization: "Bearer test" }, + body: JSON.stringify({ prompt: "piano" }), + }); + + expect(res.status).toBe(200); + expect(res.headers.get("song-id")).toBeTruthy(); + const buffer = await res.arrayBuffer(); + expect(buffer.byteLength).toBe(5); + }); +}); diff --git a/src/__tests__/fal-audio.test.ts b/src/__tests__/fal-audio.test.ts new file mode 100644 index 00000000..cc33aeb5 --- /dev/null +++ b/src/__tests__/fal-audio.test.ts @@ -0,0 +1,272 @@ 
+import { describe, test, expect, afterEach } from "vitest"; +import { LLMock } from "../llmock.js"; + +describe("fal.ai audio queue", () => { + let mock: LLMock; + + afterEach(async () => { + await mock?.stop(); + }); + + test("queue submit returns queue envelope", async () => { + mock = new LLMock({ port: 0 }); + mock.onFalAudio("drum loop", { audio: "SGVsbG8=", format: "mp3" }); + await mock.start(); + + const res = await fetch(`${mock.url}/fal/queue/submit/fal-ai/stable-audio`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ prompt: "drum loop" }), + }); + expect(res.status).toBe(200); + const data = await res.json(); + expect(data.request_id).toBeDefined(); + expect(typeof data.request_id).toBe("string"); + expect(data.response_url).toContain(data.request_id); + expect(data.status_url).toContain(data.request_id); + expect(data.cancel_url).toContain(data.request_id); + expect(data.queue_position).toBe(0); + }); + + test("queue status returns COMPLETED", async () => { + mock = new LLMock({ port: 0 }); + mock.onFalAudio("drum loop", { audio: "SGVsbG8=", format: "mp3" }); + await mock.start(); + + // Submit first + const submit = await fetch(`${mock.url}/fal/queue/submit/fal-ai/stable-audio`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ prompt: "drum loop" }), + }); + const envelope = await submit.json(); + + // Check status + const status = await fetch(`${mock.url}/fal/queue/requests/${envelope.request_id}/status`); + expect(status.status).toBe(200); + const statusData = await status.json(); + expect(statusData.status).toBe("COMPLETED"); + expect(statusData.request_id).toBe(envelope.request_id); + expect(statusData.response_url).toBeDefined(); + }); + + test("queue result returns audio file object", async () => { + mock = new LLMock({ port: 0 }); + mock.onFalAudio("drum loop", { audio: "SGVsbG8=", format: "mp3" }); + await mock.start(); + + // Submit + const submit 
= await fetch(`${mock.url}/fal/queue/submit/fal-ai/stable-audio`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ prompt: "drum loop" }), + }); + const envelope = await submit.json(); + + // Get result + const result = await fetch(`${mock.url}/fal/queue/requests/${envelope.request_id}`); + expect(result.status).toBe(200); + const data = await result.json(); + expect(data.audio).toBeDefined(); + expect(data.audio.url).toContain("generated_audio.mp3"); + expect(data.audio.content_type).toBe("audio/mpeg"); + expect(data.audio.file_name).toBe("generated_audio.mp3"); + expect(typeof data.audio.file_size).toBe("number"); + expect(data.audio.file_size).toBeGreaterThan(0); + }); + + test("full queue lifecycle: submit -> status -> result", async () => { + mock = new LLMock({ port: 0 }); + mock.onFalAudio("synth pad", { audio: "AAAA", format: "wav" }); + await mock.start(); + + // Submit + const submit = await fetch(`${mock.url}/fal/queue/submit/fal-ai/stable-audio`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ prompt: "synth pad" }), + }); + expect(submit.status).toBe(200); + const envelope = await submit.json(); + const requestId = envelope.request_id; + + // Status + const status = await fetch(`${mock.url}/fal/queue/requests/${requestId}/status`); + expect(status.status).toBe(200); + const statusData = await status.json(); + expect(statusData.status).toBe("COMPLETED"); + + // Result + const result = await fetch(`${mock.url}/fal/queue/requests/${requestId}`); + expect(result.status).toBe(200); + const resultData = await result.json(); + expect(resultData.audio.url).toContain("generated_audio.wav"); + expect(resultData.audio.content_type).toBe("audio/wav"); + }); + + test("synchronous run returns result directly", async () => { + mock = new LLMock({ port: 0 }); + mock.onFalAudio("drum loop", { audio: "SGVsbG8=", format: "mp3" }); + await mock.start(); + + const res = await 
fetch(`${mock.url}/fal/run/fal-ai/stable-audio`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ prompt: "drum loop" }), + }); + expect(res.status).toBe(200); + const data = await res.json(); + // Synchronous run returns result directly, no queue envelope + expect(data.audio).toBeDefined(); + expect(data.audio.url).toContain("generated_audio.mp3"); + expect(data.audio.content_type).toBe("audio/mpeg"); + expect(data.request_id).toBeUndefined(); // no queue envelope + }); + + test("cancel returns ALREADY_COMPLETED", async () => { + mock = new LLMock({ port: 0 }); + mock.onFalAudio("drum loop", { audio: "SGVsbG8=" }); + await mock.start(); + + // Submit + const submit = await fetch(`${mock.url}/fal/queue/submit/fal-ai/stable-audio`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ prompt: "drum loop" }), + }); + const envelope = await submit.json(); + + // Cancel + const cancel = await fetch(`${mock.url}/fal/queue/requests/${envelope.request_id}/cancel`, { + method: "PUT", + }); + expect(cancel.status).toBe(400); + const cancelData = await cancel.json(); + expect(cancelData.status).toBe("ALREADY_COMPLETED"); + }); + + test("unknown request_id returns 404", async () => { + mock = new LLMock({ port: 0 }); + await mock.start(); + + const res = await fetch(`${mock.url}/fal/queue/requests/nonexistent/status`); + expect(res.status).toBe(404); + }); + + test("object-form audio response with contentType", async () => { + mock = new LLMock({ port: 0 }); + mock.onFalAudio("speech", { + audio: { b64Json: "SGVsbG8=", contentType: "audio/wav" }, + }); + await mock.start(); + + // Submit + const submit = await fetch(`${mock.url}/fal/queue/submit/fal-ai/stable-audio`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ prompt: "speech" }), + }); + const envelope = await submit.json(); + + // Get result + const result = await 
fetch(`${mock.url}/fal/queue/requests/${envelope.request_id}`); + const data = await result.json(); + expect(data.audio.content_type).toBe("audio/wav"); + }); + + test("onFalAudio convenience method registers fixture correctly", async () => { + mock = new LLMock({ port: 0 }); + mock.onFalAudio("loop", { audio: "SGVsbG8=" }); + await mock.start(); + + const submit = await fetch(`${mock.url}/fal/queue/submit/fal-ai/stable-audio`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ prompt: "loop" }), + }); + expect(submit.status).toBe(200); + const envelope = await submit.json(); + expect(envelope.request_id).toBeDefined(); + + // Verify result is retrievable + const result = await fetch(`${mock.url}/fal/queue/requests/${envelope.request_id}`); + expect(result.status).toBe(200); + const data = await result.json(); + expect(data.audio).toBeDefined(); + }); + + test("no matching fixture returns 404", async () => { + mock = new LLMock({ port: 0 }); + mock.onFalAudio("specific prompt", { audio: "SGVsbG8=" }); + await mock.start(); + + const res = await fetch(`${mock.url}/fal/queue/submit/fal-ai/stable-audio`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ prompt: "completely different" }), + }); + expect(res.status).toBe(404); + const data = await res.json(); + expect(data.error.message).toContain("No fixture matched"); + }); + + test("error fixture returns error status", async () => { + mock = new LLMock({ port: 0 }); + mock.addFixture({ + match: { userMessage: "quota", endpoint: "fal-audio" }, + response: { error: { message: "quota exceeded", type: "rate_limit" }, status: 429 }, + }); + await mock.start(); + + const res = await fetch(`${mock.url}/fal/queue/submit/fal-ai/stable-audio`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ prompt: "quota" }), + }); + expect(res.status).toBe(429); + const data = await res.json(); + 
expect(data.error.message).toBe("quota exceeded"); + }); + + test("X-Test-Id isolation for fal queue jobs", async () => { + mock = new LLMock({ port: 0 }); + mock.onFalAudio("drum", { audio: "SGVsbG8=" }); + await mock.start(); + + // Submit with test-id A + const submitA = await fetch(`${mock.url}/fal/queue/submit/fal-ai/stable-audio`, { + method: "POST", + headers: { "Content-Type": "application/json", "X-Test-Id": "testA" }, + body: JSON.stringify({ prompt: "drum" }), + }); + const envelopeA = await submitA.json(); + + // Submit with test-id B + const submitB = await fetch(`${mock.url}/fal/queue/submit/fal-ai/stable-audio`, { + method: "POST", + headers: { "Content-Type": "application/json", "X-Test-Id": "testB" }, + body: JSON.stringify({ prompt: "drum" }), + }); + const envelopeB = await submitB.json(); + + // A's request_id should not be visible to B + const crossLookup = await fetch( + `${mock.url}/fal/queue/requests/${envelopeA.request_id}/status`, + { headers: { "X-Test-Id": "testB" } }, + ); + expect(crossLookup.status).toBe(404); + + // A's request_id should be visible to A + const sameLookup = await fetch( + `${mock.url}/fal/queue/requests/${envelopeA.request_id}/status`, + { headers: { "X-Test-Id": "testA" } }, + ); + expect(sameLookup.status).toBe(200); + + // B's request_id should be visible to B + const bLookup = await fetch(`${mock.url}/fal/queue/requests/${envelopeB.request_id}/status`, { + headers: { "X-Test-Id": "testB" }, + }); + expect(bLookup.status).toBe(200); + }); +}); diff --git a/src/__tests__/gemini-audio-record.test.ts b/src/__tests__/gemini-audio-record.test.ts new file mode 100644 index 00000000..880a465d --- /dev/null +++ b/src/__tests__/gemini-audio-record.test.ts @@ -0,0 +1,391 @@ +import { describe, it, expect } from "vitest"; +import * as http from "node:http"; +import * as fs from "node:fs"; +import * as os from "node:os"; +import * as path from "node:path"; +import { proxyAndRecord } from "../recorder.js"; +import type { 
Fixture, RecordConfig, ChatCompletionRequest } from "../types.js"; +import { Logger } from "../logger.js"; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +function createUpstream( + handler: (req: http.IncomingMessage, res: http.ServerResponse) => void, +): Promise<{ server: http.Server; url: string }> { + return new Promise((resolve) => { + const server = http.createServer(handler); + server.listen(0, "127.0.0.1", () => { + const addr = server.address() as { port: number }; + resolve({ server, url: `http://127.0.0.1:${addr.port}` }); + }); + }); +} + +function closeServer(server: http.Server): Promise { + return new Promise((resolve) => server.close(() => resolve())); +} + +function createMockReqRes( + urlPath: string, + headers: Record = {}, +): { req: http.IncomingMessage; res: http.ServerResponse; getResponse: () => Promise } { + const chunks: Buffer[] = []; + let statusCode = 200; + + const req = { + method: "POST", + url: urlPath, + headers: { "content-type": "application/json", ...headers }, + } as unknown as http.IncomingMessage; + + const res = { + statusCode, + headersSent: false, + destroyed: false, + writableEnded: false, + // eslint-disable-next-line @typescript-eslint/no-unused-vars + writeHead(status: number, hdrs?: Record) { + statusCode = status; + res.statusCode = status; + (res as unknown as { headersSent: boolean }).headersSent = true; + }, + write(data: string | Buffer) { + chunks.push(Buffer.isBuffer(data) ? data : Buffer.from(data)); + return true; + }, + end(data?: string | Buffer) { + if (data) chunks.push(Buffer.isBuffer(data) ? 
data : Buffer.from(data)); + (res as unknown as { writableEnded: boolean }).writableEnded = true; + }, + setHeader() {}, + flushHeaders() {}, + on() { + return res; + }, + } as unknown as http.ServerResponse; + + return { + req, + res, + getResponse: async () => Buffer.concat(chunks).toString(), + }; +} + +function makeTmpDir(): string { + return fs.mkdtempSync(path.join(os.tmpdir(), "aimock-gemini-audio-")); +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +describe("Gemini audio recording: non-streaming JSON", () => { + it("records non-streaming Gemini audio response as AudioResponse", async () => { + const fixturePath = makeTmpDir(); + const { server, url } = await createUpstream((_req, res) => { + res.writeHead(200, { "Content-Type": "application/json" }); + res.end( + JSON.stringify({ + candidates: [ + { + content: { + role: "model", + parts: [{ inlineData: { mimeType: "audio/mp3", data: "SGVsbG8=" } }], + }, + finishReason: "STOP", + }, + ], + }), + ); + }); + + try { + const fixtures: Fixture[] = []; + const record: RecordConfig = { providers: { gemini: url }, fixturePath }; + const logger = new Logger("silent"); + const request: ChatCompletionRequest = { + model: "gemini-2.0-flash", + messages: [{ role: "user", content: "say hello" }], + }; + + const { req, res } = createMockReqRes("/v1beta/models/gemini-2.0-flash:generateContent"); + await proxyAndRecord( + req, + res, + request, + "gemini", + "/v1beta/models/gemini-2.0-flash:generateContent", + fixtures, + { record, logger }, + ); + + expect(fixtures).toHaveLength(1); + const response = fixtures[0].response as { + audio?: { b64Json: string; contentType?: string }; + }; + expect(response.audio).toBeDefined(); + expect(response.audio!.b64Json).toBe("SGVsbG8="); + expect(response.audio!.contentType).toBe("audio/mp3"); + } finally { + await closeServer(server); + 
fs.rmSync(fixturePath, { recursive: true, force: true }); + } + }); +}); + +describe("Gemini audio recording: streaming SSE", () => { + it("records streaming Gemini SSE audio response as AudioResponse", async () => { + const fixturePath = makeTmpDir(); + const chunk1 = JSON.stringify({ + candidates: [ + { + content: { + parts: [{ inlineData: { mimeType: "audio/mp3", data: "AAAA" } }], + }, + }, + ], + }); + const chunk2 = JSON.stringify({ + candidates: [ + { + content: { + parts: [{ inlineData: { mimeType: "audio/mp3", data: "BBBB" } }], + }, + }, + ], + }); + const sseBody = `data: ${chunk1}\n\ndata: ${chunk2}\n\n`; + + const { server, url } = await createUpstream((_req, res) => { + res.writeHead(200, { "Content-Type": "text/event-stream" }); + res.end(sseBody); + }); + + try { + const fixtures: Fixture[] = []; + const record: RecordConfig = { providers: { gemini: url }, fixturePath }; + const logger = new Logger("silent"); + const request: ChatCompletionRequest = { + model: "gemini-2.0-flash", + messages: [{ role: "user", content: "stream audio" }], + stream: true, + }; + + const { req, res } = createMockReqRes( + "/v1beta/models/gemini-2.0-flash:streamGenerateContent", + ); + await proxyAndRecord( + req, + res, + request, + "gemini", + "/v1beta/models/gemini-2.0-flash:streamGenerateContent", + fixtures, + { record, logger }, + ); + + expect(fixtures).toHaveLength(1); + const response = fixtures[0].response as { + audio?: { b64Json: string; contentType?: string }; + }; + expect(response.audio).toBeDefined(); + expect(response.audio!.b64Json).toBe("AAAABBBB"); + expect(response.audio!.contentType).toBe("audio/mp3"); + } finally { + await closeServer(server); + fs.rmSync(fixturePath, { recursive: true, force: true }); + } + }); +}); + +describe("Gemini audio recording: audio priority over text", () => { + it("audio parts take priority over text parts in non-streaming", async () => { + const fixturePath = makeTmpDir(); + const { server, url } = await 
createUpstream((_req, res) => { + res.writeHead(200, { "Content-Type": "application/json" }); + res.end( + JSON.stringify({ + candidates: [ + { + content: { + role: "model", + parts: [ + { text: "Here is the audio" }, + { inlineData: { mimeType: "audio/wav", data: "UklGRg==" } }, + ], + }, + finishReason: "STOP", + }, + ], + }), + ); + }); + + try { + const fixtures: Fixture[] = []; + const record: RecordConfig = { providers: { gemini: url }, fixturePath }; + const logger = new Logger("silent"); + const request: ChatCompletionRequest = { + model: "gemini-2.0-flash", + messages: [{ role: "user", content: "audio with text" }], + }; + + const { req, res } = createMockReqRes("/v1beta/models/gemini-2.0-flash:generateContent"); + await proxyAndRecord( + req, + res, + request, + "gemini", + "/v1beta/models/gemini-2.0-flash:generateContent", + fixtures, + { record, logger }, + ); + + expect(fixtures).toHaveLength(1); + const response = fixtures[0].response as { + audio?: { b64Json: string; contentType?: string }; + content?: string; + }; + // Audio should take priority — no content field + expect(response.audio).toBeDefined(); + expect(response.audio!.b64Json).toBe("UklGRg=="); + expect(response.audio!.contentType).toBe("audio/wav"); + expect(response.content).toBeUndefined(); + } finally { + await closeServer(server); + fs.rmSync(fixturePath, { recursive: true, force: true }); + } + }); +}); + +describe("Gemini audio recording: replay after record", () => { + it("recorded fixture matches on replay", async () => { + const fixturePath = makeTmpDir(); + const { server, url } = await createUpstream((_req, res) => { + res.writeHead(200, { "Content-Type": "application/json" }); + res.end( + JSON.stringify({ + candidates: [ + { + content: { + role: "model", + parts: [{ inlineData: { mimeType: "audio/mp3", data: "dGVzdA==" } }], + }, + finishReason: "STOP", + }, + ], + }), + ); + }); + + try { + const fixtures: Fixture[] = []; + const record: RecordConfig = { providers: { gemini: 
url }, fixturePath }; + const logger = new Logger("silent"); + const request: ChatCompletionRequest = { + model: "gemini-2.0-flash", + messages: [{ role: "user", content: "replay test" }], + }; + + // First call: record + const { req: req1, res: res1 } = createMockReqRes( + "/v1beta/models/gemini-2.0-flash:generateContent", + ); + await proxyAndRecord( + req1, + res1, + request, + "gemini", + "/v1beta/models/gemini-2.0-flash:generateContent", + fixtures, + { record, logger }, + ); + + expect(fixtures).toHaveLength(1); + const recorded = fixtures[0]; + expect(recorded.match.userMessage).toBe("replay test"); + + // The fixture is now in memory — verify its shape is correct for replay + const response = recorded.response as { + audio?: { b64Json: string; contentType?: string }; + }; + expect(response.audio).toBeDefined(); + expect(response.audio!.b64Json).toBe("dGVzdA=="); + expect(response.audio!.contentType).toBe("audio/mp3"); + + // Verify fixture was written to disk + const files = fs.readdirSync(fixturePath).filter((f) => f.endsWith(".json")); + expect(files.length).toBeGreaterThanOrEqual(1); + + const diskFixture = JSON.parse( + fs.readFileSync(path.join(fixturePath, files[0]), "utf-8"), + ) as { fixtures: Array<{ response: { audio?: { b64Json: string; contentType?: string } } }> }; + expect(diskFixture.fixtures[0].response.audio).toBeDefined(); + expect(diskFixture.fixtures[0].response.audio!.b64Json).toBe("dGVzdA=="); + } finally { + await closeServer(server); + fs.rmSync(fixturePath, { recursive: true, force: true }); + } + }); +}); + +describe("Gemini audio recording: non-audio inlineData is ignored", () => { + it("image inlineData does not produce AudioResponse", async () => { + const fixturePath = makeTmpDir(); + const { server, url } = await createUpstream((_req, res) => { + res.writeHead(200, { "Content-Type": "application/json" }); + res.end( + JSON.stringify({ + candidates: [ + { + content: { + role: "model", + parts: [ + { text: "Here is an image" }, 
+ { inlineData: { mimeType: "image/png", data: "iVBORw0KGgo=" } }, + ], + }, + finishReason: "STOP", + }, + ], + }), + ); + }); + + try { + const fixtures: Fixture[] = []; + const record: RecordConfig = { providers: { gemini: url }, fixturePath }; + const logger = new Logger("silent"); + const request: ChatCompletionRequest = { + model: "gemini-2.0-flash", + messages: [{ role: "user", content: "show image" }], + }; + + const { req, res } = createMockReqRes("/v1beta/models/gemini-2.0-flash:generateContent"); + await proxyAndRecord( + req, + res, + request, + "gemini", + "/v1beta/models/gemini-2.0-flash:generateContent", + fixtures, + { record, logger }, + ); + + expect(fixtures).toHaveLength(1); + const response = fixtures[0].response as { + audio?: unknown; + content?: string; + }; + // Should NOT be an AudioResponse — image/png is not audio/ + expect(response.audio).toBeUndefined(); + // Should fall through to text extraction + expect(response.content).toBe("Here is an image"); + } finally { + await closeServer(server); + fs.rmSync(fixturePath, { recursive: true, force: true }); + } + }); +}); diff --git a/src/__tests__/gemini-audio.test.ts b/src/__tests__/gemini-audio.test.ts new file mode 100644 index 00000000..8b5c03be --- /dev/null +++ b/src/__tests__/gemini-audio.test.ts @@ -0,0 +1,176 @@ +import { describe, test, expect, afterEach } from "vitest"; +import { LLMock } from "../llmock.js"; + +describe("Gemini audio responses", () => { + let mock: LLMock; + + afterEach(async () => { + await mock?.stop(); + }); + + test("non-streaming generateContent with string-form audio", async () => { + mock = new LLMock({ port: 0 }); + mock.addFixture({ + match: { userMessage: "piano loop" }, + response: { audio: "SGVsbG8=", format: "mp3" }, + }); + await mock.start(); + + const res = await fetch(`${mock.url}/v1beta/models/lyria-3:generateContent`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + contents: [{ role: "user", 
// Streaming variant: the same string-form audio fixture must be delivered
// over SSE (text/event-stream) as a single data chunk carrying inlineData.
test("streaming streamGenerateContent with string-form audio", async () => {
  mock = new LLMock({ port: 0 });
  mock.addFixture({
    match: { userMessage: "piano loop" },
    response: { audio: "SGVsbG8=", format: "mp3" },
  });
  await mock.start();

  const res = await fetch(`${mock.url}/v1beta/models/lyria-3:streamGenerateContent`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      contents: [{ role: "user", parts: [{ text: "piano loop" }] }],
    }),
  });
  expect(res.status).toBe(200);
  expect(res.headers.get("content-type")).toBe("text/event-stream");

  const text = await res.text();
  // Parse SSE data lines
  const chunks = text
    .split("\n\n")
    .filter((line) => line.startsWith("data: "))
    .map((line) => JSON.parse(line.replace("data: ", "")));

  expect(chunks.length).toBeGreaterThanOrEqual(1);
  const chunk = chunks[0];
  // First (and expected only) chunk carries the whole audio payload.
  expect(chunk.candidates[0].content.parts[0].inlineData).toEqual({
    mimeType: "audio/mpeg",
    data: "SGVsbG8=",
  });
  expect(chunk.candidates[0].finishReason).toBe("STOP");
});

// Object-form fixture ({ b64Json, contentType }) must surface its explicit
// contentType as the inlineData mimeType.
test("non-streaming with object-form audio", async () => {
  mock = new LLMock({ port: 0 });
  mock.addFixture({
    match: { userMessage: "wav audio" },
    response: { audio: { b64Json: "SGVsbG8=", contentType: "audio/wav" } },
  });
  await mock.start();

  const res = await fetch(`${mock.url}/v1beta/models/lyria-3:generateContent`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      contents: [{ role: "user", parts: [{ text: "wav audio" }] }],
    }),
  });
  const data = await res.json();
  expect(data.candidates[0].content.parts[0].inlineData).toEqual({
    mimeType: "audio/wav",
    data: "SGVsbG8=",
  });
});

// Object form with no contentType: the handler's documented default is
// audio/mpeg.
test("object-form audio without contentType defaults to audio/mpeg", async () => {
  mock = new LLMock({ port: 0 });
  mock.addFixture({
    match: { userMessage: "default format" },
    response: { audio: { b64Json: "SGVsbG8=" } },
  });
  await mock.start();

  const res = await fetch(`${mock.url}/v1beta/models/lyria-3:generateContent`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      contents: [{ role: "user", parts: [{ text: "default format" }] }],
    }),
  });
  const data = await res.json();
  expect(data.candidates[0].content.parts[0].inlineData.mimeType).toBe("audio/mpeg");
});

// String form with an explicit non-mp3 format: format → mime mapping
// ("opus" → "audio/opus") must be applied.
test("string-form audio with format opus", async () => {
  mock = new LLMock({ port: 0 });
  mock.addFixture({
    match: { userMessage: "opus audio" },
    response: { audio: "SGVsbG8=", format: "opus" },
  });
  await mock.start();

  const res = await fetch(`${mock.url}/v1beta/models/lyria-3:generateContent`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      contents: [{ role: "user", parts: [{ text: "opus audio" }] }],
    }),
  });
  const data = await res.json();
  expect(data.candidates[0].content.parts[0].inlineData.mimeType).toBe("audio/opus");
});

// Same fixture must also be reachable via the Vertex AI-style path
// (/v1/projects/.../publishers/google/models/...), not just /v1beta.
test("Vertex AI path works too", async () => {
  mock = new LLMock({ port: 0 });
  mock.addFixture({
    match: { userMessage: "piano loop" },
    response: { audio: "SGVsbG8=", format: "mp3" },
  });
  await mock.start();

  const res = await fetch(
    `${mock.url}/v1/projects/proj/locations/us-central1/publishers/google/models/lyria-3:generateContent`,
    {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        contents: [{ role: "user", parts: [{ text: "piano loop" }] }],
      }),
    },
  );
  expect(res.status).toBe(200);
  const data = await res.json();
  expect(data.candidates[0].content.parts[0].inlineData).toEqual({
    mimeType: "audio/mpeg",
    data: "SGVsbG8=",
  });
});

// The onAudio() sugar should register an equivalent fixture; with no format
// given it falls back to mp3 → audio/mpeg.
test("onAudio() convenience method works via Gemini", async () => {
  mock = new LLMock({ port: 0 });
  mock.onAudio("piano loop", { audio: "SGVsbG8=" });
  await mock.start();

  const res = await fetch(`${mock.url}/v1beta/models/lyria-3:generateContent`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      contents: [{ role: "user", parts: [{ text: "piano loop" }] }],
    }),
  });
  expect(res.status).toBe(200);
  const data = await res.json();
  // onAudio without format defaults to mp3
  expect(data.candidates[0].content.parts[0].inlineData).toEqual({
    mimeType: "audio/mpeg",
    data: "SGVsbG8=",
  });
});
= { audio: 123 } as unknown as FixtureResponse; + expect(isAudioResponse(r)).toBe(false); + }); + + test("isAudioResponse rejects object without b64Json", () => { + const r = { audio: { foo: "bar" } } as unknown as FixtureResponse; + expect(isAudioResponse(r)).toBe(false); + }); + test("isAudioResponse rejects text response", () => { const r: FixtureResponse = { content: "hello" }; expect(isAudioResponse(r)).toBe(false); diff --git a/src/__tests__/ws-gemini-live-audio.test.ts b/src/__tests__/ws-gemini-live-audio.test.ts new file mode 100644 index 00000000..669a198f --- /dev/null +++ b/src/__tests__/ws-gemini-live-audio.test.ts @@ -0,0 +1,188 @@ +import { describe, it, expect, afterEach } from "vitest"; +import { createServer, type ServerInstance } from "../server.js"; +import type { Fixture } from "../types.js"; +import { connectWebSocket } from "./ws-test-client.js"; + +// --- helpers --- + +const GEMINI_WS_PATH = + "/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent"; + +function setupMsg(model = "gemini-2.0-flash-exp"): string { + return JSON.stringify({ + setup: { model }, + }); +} + +function clientContentMsg(text: string): string { + return JSON.stringify({ + clientContent: { + turns: [{ role: "user", parts: [{ text }] }], + turnComplete: true, + }, + }); +} + +// --- tests --- + +let instance: ServerInstance | null = null; + +afterEach(async () => { + if (instance) { + await new Promise((resolve) => { + instance!.server.close(() => resolve()); + }); + instance = null; + } +}); + +describe("WebSocket Gemini Live — audio responses", () => { + it("returns audio inlineData for string-form AudioResponse", async () => { + const audioFixture: Fixture = { + match: { userMessage: "play-audio-string" }, + response: { audio: "SGVsbG8=", format: "mp3" }, + }; + instance = await createServer([audioFixture]); + const ws = await connectWebSocket(instance.url, GEMINI_WS_PATH); + + ws.send(setupMsg()); + await ws.waitForMessages(1); // 
// Object-form AudioResponse: the explicit contentType must become the
// inlineData mimeType on the Live WebSocket frame.
it("returns audio inlineData for object-form AudioResponse with contentType", async () => {
  const audioFixture: Fixture = {
    match: { userMessage: "play-audio-object" },
    response: { audio: { b64Json: "SGVsbG8=", contentType: "audio/wav" } },
  };
  instance = await createServer([audioFixture]);
  const ws = await connectWebSocket(instance.url, GEMINI_WS_PATH);

  ws.send(setupMsg());
  await ws.waitForMessages(1); // setupComplete

  ws.send(clientContentMsg("play-audio-object"));

  const raw = await ws.waitForMessages(2);
  const msg = JSON.parse(raw[1]);

  expect(msg.serverContent).toBeDefined();
  expect(msg.serverContent.modelTurn.parts[0].inlineData).toEqual({
    mimeType: "audio/wav",
    data: "SGVsbG8=",
  });
  expect(msg.serverContent.turnComplete).toBe(true);

  ws.close();
});

// Object form without contentType: the handler's fallback mime is
// audio/mpeg.
it("defaults to audio/mpeg when object-form AudioResponse omits contentType", async () => {
  const audioFixture: Fixture = {
    match: { userMessage: "play-audio-no-ct" },
    response: { audio: { b64Json: "SGVsbG8=" } },
  };
  instance = await createServer([audioFixture]);
  const ws = await connectWebSocket(instance.url, GEMINI_WS_PATH);

  ws.send(setupMsg());
  await ws.waitForMessages(1); // setupComplete

  ws.send(clientContentMsg("play-audio-no-ct"));

  const raw = await ws.waitForMessages(2);
  const msg = JSON.parse(raw[1]);

  expect(msg.serverContent.modelTurn.parts[0].inlineData.mimeType).toBe("audio/mpeg");
  expect(msg.serverContent.modelTurn.parts[0].inlineData.data).toBe("SGVsbG8=");
  expect(msg.serverContent.turnComplete).toBe(true);

  ws.close();
});

// Audio must arrive as exactly one frame that already carries
// turnComplete: true — the server must not chunk it across frames.
it("sends audio as a single frame with turnComplete: true (not chunked)", async () => {
  const audioFixture: Fixture = {
    match: { userMessage: "play-audio-single" },
    response: { audio: "QUJDREVGR0hJSktMTU5PUFFSU1RVVldYWVo=", format: "opus" },
  };
  instance = await createServer([audioFixture]);
  const ws = await connectWebSocket(instance.url, GEMINI_WS_PATH);

  ws.send(setupMsg());
  await ws.waitForMessages(1); // setupComplete

  ws.send(clientContentMsg("play-audio-single"));

  const raw = await ws.waitForMessages(2); // setupComplete + exactly 1 audio frame
  // Only 2 messages total — setupComplete and the single audio response
  expect(raw).toHaveLength(2);

  const msg = JSON.parse(raw[1]);
  expect(msg.serverContent).toBeDefined();
  expect(msg.serverContent.turnComplete).toBe(true);
  expect(msg.serverContent.modelTurn.parts[0].inlineData).toEqual({
    mimeType: "audio/opus",
    data: "QUJDREVGR0hJSktMTU5PUFFSU1RVVldYWVo=",
  });

  // Verify no additional messages arrive (wait briefly and confirm count stays at 2)
  // NOTE(review): no assertion is actually performed after this wait — if the
  // ws client exposes a received-message count, re-assert it here; otherwise a
  // late extra frame would go undetected. TODO confirm against ws-test-client.
  await new Promise((r) => setTimeout(r, 100));

  ws.close();
});
/**
 * Handle an ElevenLabs-style audio generation request against the fixture set.
 *
 * Parses the JSON body, extracts the prompt ("text" for sound-generation,
 * "prompt" or a composition_plan fallback for the music family), matches a
 * fixture via a synthetic chat request, and writes either an error payload,
 * a JSON plan, or raw audio bytes. Every branch records exactly one journal
 * entry whose status mirrors the HTTP response.
 *
 * @param req      Incoming request (URL/method/headers are read; body arrives pre-read).
 * @param res      Response to write; this function always ends it.
 * @param body     Raw request body string.
 * @param fixtures Fixture set to match against (and append to in record mode).
 * @param defaults Handler options: strict mode, record config, request transform.
 * @param journal  Request journal; also tracks per-test fixture match counts.
 * @param subType  Endpoint flavor: "sound-generation" | "music" | "stream" | "plan".
 */
export async function handleElevenLabsAudio(
  req: http.IncomingMessage,
  res: http.ServerResponse,
  body: string,
  fixtures: Fixture[],
  defaults: HandlerDefaults,
  journal: Journal,
  subType: string, // "sound-generation" | "music" | "stream" | "plan"
): Promise<void> {
  const path = req.url ?? "/v1/sound-generation";
  const method = req.method ?? "POST";

  // Parse JSON body
  let parsed: Record<string, unknown>;
  try {
    parsed = JSON.parse(body) as Record<string, unknown>;
  } catch {
    journal.add({
      method,
      path,
      headers: {},
      body: null,
      response: { status: 400, fixture: null },
    });
    writeErrorResponse(
      res,
      400,
      JSON.stringify({
        error: { message: "Malformed JSON", type: "invalid_request_error", code: "invalid_json" },
      }),
    );
    return;
  }

  // Extract prompt text based on subType
  let promptText: string | undefined;
  if (subType === "sound-generation") {
    // Sound generation uses "text"; empty strings are treated as missing.
    if (typeof parsed.text === "string" && parsed.text) {
      promptText = parsed.text;
    }
  } else {
    // music, music-stream, music-plan all use "prompt" (or composition_plan fallback)
    if (typeof parsed.prompt === "string" && parsed.prompt) {
      promptText = parsed.prompt;
    } else if (parsed.composition_plan != null) {
      // A structured plan is serialized so it can still drive fixture matching.
      promptText =
        typeof parsed.composition_plan === "string"
          ? parsed.composition_plan
          : JSON.stringify(parsed.composition_plan);
    }
  }

  // Build synthetic ChatCompletionRequest for fixture matching (needed for journal even on validation failure)
  const syntheticReq: ChatCompletionRequest = {
    model:
      (parsed.model_id as string) ??
      (subType === "sound-generation" ? "eleven_text_to_sound_v2" : "music_v1"),
    messages: [{ role: "user", content: promptText ?? "" }],
    _endpointType: "audio-gen",
  };

  // Validate required field
  if (!promptText) {
    const field = subType === "sound-generation" ? "text" : "prompt";
    journal.add({
      method,
      path,
      headers: {},
      body: syntheticReq,
      response: { status: 400, fixture: null },
    });
    writeErrorResponse(
      res,
      400,
      JSON.stringify({
        error: {
          message: `Missing required parameter: '${field}'`,
          type: "invalid_request_error",
        },
      }),
    );
    return;
  }

  // Match fixture — match counts are scoped per test so sequential fixtures work.
  const testId = getTestId(req);
  const matchCounts = journal.getFixtureMatchCountsForTest(testId);
  const fixture = matchFixture(fixtures, syntheticReq, matchCounts, defaults.requestTransform);

  if (fixture) {
    journal.incrementFixtureMatchCount(fixture, fixtures, testId);
  }

  // No fixture match
  if (!fixture) {
    // Record mode: proxy to the real upstream and capture a new fixture.
    if (defaults.record) {
      const proxied = await proxyAndRecord(
        req,
        res,
        syntheticReq,
        "elevenlabs",
        req.url ?? "/v1/sound-generation",
        fixtures,
        defaults,
        body,
      );
      if (proxied) {
        journal.add({
          method,
          path,
          headers: {},
          body: syntheticReq,
          response: { status: res.statusCode ?? 200, fixture: null },
        });
        return;
      }
    }

    // Strict mode signals a hard failure (503); lenient mode a plain 404.
    const strictStatus = defaults.strict ? 503 : 404;
    const strictMessage = defaults.strict
      ? "Strict mode: no fixture matched"
      : "No fixture matched";
    journal.add({
      method,
      path,
      headers: {},
      body: syntheticReq,
      response: { status: strictStatus, fixture: null },
    });
    writeErrorResponse(
      res,
      strictStatus,
      JSON.stringify({
        error: { message: strictMessage, type: "invalid_request_error", code: "no_fixture_match" },
      }),
    );
    return;
  }

  const response = fixture.response;

  // Error fixture
  if (isErrorResponse(response)) {
    const status = response.status ?? 500;
    journal.add({
      method,
      path,
      headers: {},
      body: syntheticReq,
      response: { status, fixture },
    });
    writeErrorResponse(res, status, JSON.stringify(response));
    return;
  }

  // plan returns JSON text, not audio
  if (subType === "plan") {
    if (!isTextResponse(response)) {
      journal.add({
        method,
        path,
        headers: {},
        body: syntheticReq,
        response: { status: 500, fixture },
      });
      writeErrorResponse(
        res,
        500,
        JSON.stringify({
          error: {
            message: "Fixture response is not a text type for plan endpoint",
            type: "server_error",
          },
        }),
      );
      return;
    }
    journal.add({
      method,
      path,
      headers: {},
      body: syntheticReq,
      response: { status: 200, fixture },
    });
    // The fixture content is assumed to already be a JSON document — it is
    // written verbatim as the response body.
    res.writeHead(200, { "Content-Type": "application/json" });
    res.end(response.content);
    return;
  }

  // All other subTypes expect audio
  if (!isAudioResponse(response)) {
    journal.add({
      method,
      path,
      headers: {},
      body: syntheticReq,
      response: { status: 500, fixture },
    });
    writeErrorResponse(
      res,
      500,
      JSON.stringify({
        error: { message: "Fixture response is not an audio type", type: "server_error" },
      }),
    );
    return;
  }

  // Decode audio bytes and determine content type
  let audioBytes: Buffer;
  let contentType: string;

  if (typeof response.audio === "string") {
    // String form: base64 payload plus an optional format (defaults to mp3).
    audioBytes = Buffer.from(response.audio, "base64");
    const format = response.format ?? "mp3";
    contentType = FORMAT_TO_CONTENT_TYPE[format] ?? "audio/mpeg";
  } else {
    // Object form: explicit b64Json with an optional contentType.
    audioBytes = Buffer.from(response.audio.b64Json, "base64");
    contentType = response.audio.contentType ?? "audio/mpeg";
  }

  // Music endpoints get a song-id header
  if (subType === "music" || subType === "stream") {
    res.setHeader("song-id", "mock-song-" + Date.now());
  }

  // Stream uses chunked transfer encoding
  if (subType === "stream") {
    journal.add({
      method,
      path,
      headers: {},
      body: syntheticReq,
      response: { status: 200, fixture },
    });
    res.writeHead(200, {
      "Content-Type": contentType,
      "Transfer-Encoding": "chunked",
    });
    res.end(audioBytes);
    return;
  }

  // Standard binary response for sound-generation and music
  journal.add({
    method,
    path,
    headers: {},
    body: syntheticReq,
    response: { status: 200, fixture },
  });
  res.writeHead(200, { "Content-Type": contentType });
  res.end(audioBytes);
}
+ */ +export class FalJobMap { + private readonly entries = new Map(); + + get(key: string): FalJob | undefined { + const entry = this.entries.get(key); + if (!entry) return undefined; + if (Date.now() - entry.createdAt > FAL_JOB_TTL_MS) { + this.entries.delete(key); + return undefined; + } + return entry.job; + } + + set(key: string, job: FalJob): void { + this.entries.set(key, { job, createdAt: Date.now() }); + // Evict oldest entries if over capacity + if (this.entries.size > FAL_JOB_MAX_ENTRIES) { + const excess = this.entries.size - FAL_JOB_MAX_ENTRIES; + const iter = this.entries.keys(); + for (let i = 0; i < excess; i++) { + const next = iter.next(); + if (!next.done) this.entries.delete(next.value); + } + } + } + + delete(key: string): boolean { + return this.entries.delete(key); + } + + clear(): void { + this.entries.clear(); + } + + get size(): number { + return this.entries.size; + } +} + +// Module-level singleton — exported so server.ts can clear it during reset +export const falJobs = new FalJobMap(); + +// ─── Audio response translation ────────────────────────────────────────── + +function audioToFalFile(response: AudioResponse): Record { + let contentType: string; + let data: string; + + if (typeof response.audio === "string") { + data = response.audio; + contentType = FORMAT_TO_CONTENT_TYPE[response.format ?? "mp3"] ?? "audio/mpeg"; + } else { + data = response.audio.b64Json; + contentType = response.audio.contentType ?? "audio/mpeg"; + } + + const ext = + response.format ?? + (contentType !== "audio/mpeg" + ? (Object.entries(FORMAT_TO_CONTENT_TYPE).find(([, v]) => v === contentType)?.[0] ?? "mp3") + : "mp3"); + + const fileSize = + Math.ceil((data.length * 3) / 4) - (data.endsWith("==") ? 2 : data.endsWith("=") ? 
1 : 0); + + return { + audio: { + url: `https://mock.fal.media/files/generated_audio.${ext}`, + content_type: contentType, + file_name: `generated_audio.${ext}`, + file_size: fileSize, + }, + }; +} + +// ─── Route patterns ────────────────────────────────────────────────────── + +const QUEUE_SUBMIT_RE = /^\/fal\/queue\/submit\/(.+)$/; +const QUEUE_STATUS_RE = /^\/fal\/queue\/requests\/([^/]+)\/status$/; +const QUEUE_RESULT_RE = /^\/fal\/queue\/requests\/([^/]+)$/; +const QUEUE_CANCEL_RE = /^\/fal\/queue\/requests\/([^/]+)\/cancel$/; +const SYNC_RUN_RE = /^\/fal\/run\/(.+)$/; + +// ─── Handler ───────────────────────────────────────────────────────────── + +export async function handleFalQueue( + req: http.IncomingMessage, + res: http.ServerResponse, + body: string, + pathname: string, + fixtures: Fixture[], + defaults: HandlerDefaults, + journal: Journal, +): Promise { + const testId = getTestId(req); + const matchCounts = journal.getFixtureMatchCountsForTest(testId); + + // ── Queue Submit ─────────────────────────────────────────────────── + const submitMatch = QUEUE_SUBMIT_RE.exec(pathname); + if (submitMatch && req.method === "POST") { + const modelId = submitMatch[1]; + return handleQueueSubmit( + req, + res, + body, + pathname, + modelId, + testId, + fixtures, + defaults, + matchCounts, + journal, + ); + } + + // ── Queue Status ─────────────────────────────────────────────────── + const statusMatch = QUEUE_STATUS_RE.exec(pathname); + if (statusMatch) { + const requestId = statusMatch[1]; + return handleQueueStatus(req, res, pathname, requestId, testId, journal); + } + + // ── Queue Cancel ─────────────────────────────────────────────────── + const cancelMatch = QUEUE_CANCEL_RE.exec(pathname); + if (cancelMatch) { + const requestId = cancelMatch[1]; + return handleQueueCancel(req, res, pathname, requestId, testId, journal); + } + + // ── Queue Result ─────────────────────────────────────────────────── + const resultMatch = QUEUE_RESULT_RE.exec(pathname); + if 
(resultMatch) { + const requestId = resultMatch[1]; + return handleQueueResult(req, res, pathname, requestId, testId, journal); + } + + // ── Synchronous Run ──────────────────────────────────────────────── + const runMatch = SYNC_RUN_RE.exec(pathname); + if (runMatch && req.method === "POST") { + const modelId = runMatch[1]; + return handleSyncRun( + req, + res, + body, + pathname, + modelId, + fixtures, + defaults, + matchCounts, + journal, + ); + } + + // Unknown fal path + const errorBody = { error: { message: "Unknown fal.ai endpoint", type: "not_found" } }; + journal.add({ + method: req.method ?? "GET", + path: pathname, + headers: {}, + body: null, + response: { status: 404, fixture: null }, + }); + res.writeHead(404, { "Content-Type": "application/json" }); + res.end(JSON.stringify(errorBody)); +} + +// ─── Sub-handlers ──────────────────────────────────────────────────────── + +async function handleQueueSubmit( + req: http.IncomingMessage, + res: http.ServerResponse, + body: string, + pathname: string, + modelId: string, + testId: string, + fixtures: Fixture[], + defaults: HandlerDefaults, + matchCounts: Map, + journal: Journal, +): Promise { + let parsed: Record = {}; + if (body.trim()) { + try { + parsed = JSON.parse(body) as Record; + } catch { + journal.add({ + method: req.method ?? "POST", + path: pathname, + headers: {}, + body: null, + response: { status: 400, fixture: null }, + }); + res.writeHead(400, { "Content-Type": "application/json" }); + res.end( + JSON.stringify({ + error: { message: "Malformed JSON", type: "invalid_request_error" }, + }), + ); + return; + } + } + + const prompt = + (typeof parsed.prompt === "string" ? parsed.prompt : null) ?? + (typeof parsed.text === "string" ? parsed.text : null) ?? 
+ ""; + + const syntheticReq: ChatCompletionRequest = { + model: modelId, + messages: [{ role: "user", content: prompt }], + _endpointType: "fal-audio", + }; + + const fixture = matchFixture(fixtures, syntheticReq, matchCounts, defaults.requestTransform); + + if (!fixture) { + if (defaults.record) { + const proxied = await proxyAndRecord( + req, + res, + syntheticReq, + "fal", + pathname, + fixtures, + defaults, + body, + ); + if (proxied) { + journal.add({ + method: req.method ?? "POST", + path: pathname, + headers: {}, + body: syntheticReq, + response: { status: res.statusCode ?? 200, fixture: null }, + }); + return; + } + } + + const strictStatus = defaults.strict ? 503 : 404; + const strictMessage = defaults.strict + ? "Strict mode: no fixture matched" + : "No fixture matched"; + journal.add({ + method: req.method ?? "POST", + path: pathname, + headers: {}, + body: syntheticReq, + response: { status: strictStatus, fixture: null }, + }); + res.writeHead(strictStatus, { "Content-Type": "application/json" }); + res.end( + JSON.stringify({ + error: { + message: strictMessage, + type: "invalid_request_error", + code: "no_fixture_match", + }, + }), + ); + return; + } + + journal.incrementFixtureMatchCount(fixture, fixtures, testId); + const response = fixture.response; + + if (isErrorResponse(response)) { + const status = response.status ?? 500; + journal.add({ + method: req.method ?? "POST", + path: pathname, + headers: {}, + body: syntheticReq, + response: { status, fixture }, + }); + res.writeHead(status, { "Content-Type": "application/json" }); + res.end(JSON.stringify(response)); + return; + } + + if (!isAudioResponse(response)) { + journal.add({ + method: req.method ?? 
"POST", + path: pathname, + headers: {}, + body: syntheticReq, + response: { status: 500, fixture }, + }); + res.writeHead(500, { "Content-Type": "application/json" }); + res.end( + JSON.stringify({ + error: { message: "Fixture response is not an audio type", type: "server_error" }, + }), + ); + return; + } + + const requestId = crypto.randomUUID(); + const result = audioToFalFile(response); + + const job: FalJob = { + requestId, + modelId, + status: "COMPLETED", + result, + createdAt: Date.now(), + }; + + const stateKey = `${testId}:${requestId}`; + falJobs.set(stateKey, job); + + journal.add({ + method: req.method ?? "POST", + path: pathname, + headers: {}, + body: syntheticReq, + response: { status: 200, fixture }, + }); + res.writeHead(200, { "Content-Type": "application/json" }); + res.end( + JSON.stringify({ + request_id: requestId, + response_url: `https://queue.fal.run/${modelId}/requests/${requestId}/response`, + status_url: `https://queue.fal.run/${modelId}/requests/${requestId}/status`, + cancel_url: `https://queue.fal.run/${modelId}/requests/${requestId}/cancel`, + queue_position: 0, + }), + ); +} + +function handleQueueStatus( + req: http.IncomingMessage, + res: http.ServerResponse, + pathname: string, + requestId: string, + testId: string, + journal: Journal, +): void { + const stateKey = `${testId}:${requestId}`; + const job = falJobs.get(stateKey); + + if (!job) { + journal.add({ + method: req.method ?? "GET", + path: pathname, + headers: {}, + body: null, + response: { status: 404, fixture: null }, + }); + res.writeHead(404, { "Content-Type": "application/json" }); + res.end( + JSON.stringify({ + error: { message: `Request ${requestId} not found`, type: "not_found" }, + }), + ); + return; + } + + journal.add({ + method: req.method ?? 
"GET", + path: pathname, + headers: {}, + body: null, + response: { status: 200, fixture: null }, + }); + res.writeHead(200, { "Content-Type": "application/json" }); + res.end( + JSON.stringify({ + status: job.status, + request_id: job.requestId, + response_url: `https://queue.fal.run/${job.modelId}/requests/${requestId}/response`, + }), + ); +} + +function handleQueueResult( + req: http.IncomingMessage, + res: http.ServerResponse, + pathname: string, + requestId: string, + testId: string, + journal: Journal, +): void { + const stateKey = `${testId}:${requestId}`; + const job = falJobs.get(stateKey); + + if (!job) { + journal.add({ + method: req.method ?? "GET", + path: pathname, + headers: {}, + body: null, + response: { status: 404, fixture: null }, + }); + res.writeHead(404, { "Content-Type": "application/json" }); + res.end( + JSON.stringify({ + error: { message: `Request ${requestId} not found`, type: "not_found" }, + }), + ); + return; + } + + journal.add({ + method: req.method ?? "GET", + path: pathname, + headers: {}, + body: null, + response: { status: 200, fixture: null }, + }); + res.writeHead(200, { "Content-Type": "application/json" }); + res.end(JSON.stringify(job.result)); +} + +function handleQueueCancel( + req: http.IncomingMessage, + res: http.ServerResponse, + pathname: string, + requestId: string, + testId: string, + journal: Journal, +): void { + const stateKey = `${testId}:${requestId}`; + const job = falJobs.get(stateKey); + + if (!job) { + journal.add({ + method: req.method ?? "DELETE", + path: pathname, + headers: {}, + body: null, + response: { status: 404, fixture: null }, + }); + res.writeHead(404, { "Content-Type": "application/json" }); + res.end(JSON.stringify({ status: "NOT_FOUND" })); + return; + } + + // Since we complete immediately, cancellation always returns ALREADY_COMPLETED + journal.add({ + method: req.method ?? 
"DELETE", + path: pathname, + headers: {}, + body: null, + response: { status: 400, fixture: null }, + }); + res.writeHead(400, { "Content-Type": "application/json" }); + res.end(JSON.stringify({ status: "ALREADY_COMPLETED" })); +} + +async function handleSyncRun( + req: http.IncomingMessage, + res: http.ServerResponse, + body: string, + pathname: string, + modelId: string, + fixtures: Fixture[], + defaults: HandlerDefaults, + matchCounts: Map, + journal: Journal, +): Promise { + let parsed: Record = {}; + if (body.trim()) { + try { + parsed = JSON.parse(body) as Record; + } catch { + journal.add({ + method: req.method ?? "POST", + path: pathname, + headers: {}, + body: null, + response: { status: 400, fixture: null }, + }); + res.writeHead(400, { "Content-Type": "application/json" }); + res.end( + JSON.stringify({ + error: { message: "Malformed JSON", type: "invalid_request_error" }, + }), + ); + return; + } + } + + const prompt = + (typeof parsed.prompt === "string" ? parsed.prompt : null) ?? + (typeof parsed.text === "string" ? parsed.text : null) ?? + ""; + + const syntheticReq: ChatCompletionRequest = { + model: modelId, + messages: [{ role: "user", content: prompt }], + _endpointType: "fal-audio", + }; + + const fixture = matchFixture(fixtures, syntheticReq, matchCounts, defaults.requestTransform); + + if (!fixture) { + if (defaults.record) { + const proxied = await proxyAndRecord( + req, + res, + syntheticReq, + "fal", + pathname, + fixtures, + defaults, + body, + ); + if (proxied) { + journal.add({ + method: req.method ?? "POST", + path: pathname, + headers: {}, + body: syntheticReq, + response: { status: res.statusCode ?? 200, fixture: null }, + }); + return; + } + } + + const strictStatus = defaults.strict ? 503 : 404; + const strictMessage = defaults.strict + ? "Strict mode: no fixture matched" + : "No fixture matched"; + journal.add({ + method: req.method ?? 
"POST", + path: pathname, + headers: {}, + body: syntheticReq, + response: { status: strictStatus, fixture: null }, + }); + res.writeHead(strictStatus, { "Content-Type": "application/json" }); + res.end( + JSON.stringify({ + error: { + message: strictMessage, + type: "invalid_request_error", + code: "no_fixture_match", + }, + }), + ); + return; + } + + journal.incrementFixtureMatchCount(fixture, fixtures, getTestId(req)); + const response = fixture.response; + + if (isErrorResponse(response)) { + const status = response.status ?? 500; + journal.add({ + method: req.method ?? "POST", + path: pathname, + headers: {}, + body: syntheticReq, + response: { status, fixture }, + }); + res.writeHead(status, { "Content-Type": "application/json" }); + res.end(JSON.stringify(response)); + return; + } + + if (!isAudioResponse(response)) { + journal.add({ + method: req.method ?? "POST", + path: pathname, + headers: {}, + body: syntheticReq, + response: { status: 500, fixture }, + }); + res.writeHead(500, { "Content-Type": "application/json" }); + res.end( + JSON.stringify({ + error: { message: "Fixture response is not an audio type", type: "server_error" }, + }), + ); + return; + } + + const result = audioToFalFile(response); + + journal.add({ + method: req.method ?? 
"POST", + path: pathname, + headers: {}, + body: syntheticReq, + response: { status: 200, fixture }, + }); + res.writeHead(200, { "Content-Type": "application/json" }); + res.end(JSON.stringify(result)); +} diff --git a/src/fixture-loader.ts b/src/fixture-loader.ts index 40d2ddbb..5e5f8df5 100644 --- a/src/fixture-loader.ts +++ b/src/fixture-loader.ts @@ -375,6 +375,25 @@ export function validateFixtures(fixtures: Fixture[]): ValidationResult[] { } } + // Audio response checks — validate object-form audio + if (isAudioResponse(response) && typeof response.audio === "object") { + const audioObj = response.audio; + if (typeof audioObj.b64Json !== "string" || audioObj.b64Json === "") { + results.push({ + severity: "error", + fixtureIndex: i, + message: "audio.b64Json must be a non-empty string", + }); + } + if (audioObj.contentType !== undefined && typeof audioObj.contentType !== "string") { + results.push({ + severity: "error", + fixtureIndex: i, + message: `audio.contentType must be a string, got ${typeof audioObj.contentType}`, + }); + } + } + // Validate ResponseOverrides fields if ( isTextResponse(response) || diff --git a/src/gemini.ts b/src/gemini.ts index 232ab49a..6eea8be2 100644 --- a/src/gemini.ts +++ b/src/gemini.ts @@ -8,6 +8,7 @@ import type * as http from "node:http"; import type { + AudioResponse, ChatCompletionRequest, ChatMessage, Fixture, @@ -23,7 +24,9 @@ import { isToolCallResponse, isContentWithToolCallsResponse, isErrorResponse, + isAudioResponse, extractOverrides, + formatToMime, generateToolCallId, flattenHeaders, getTestId, @@ -43,6 +46,7 @@ interface GeminiPart { thought?: boolean; functionCall?: { name: string; args: Record; id?: string }; functionResponse?: { name: string; response: unknown }; + inlineData?: { mimeType: string; data: string }; } interface GeminiContent { @@ -449,6 +453,48 @@ function buildGeminiContentWithToolCallsResponse( }; } +// ─── Audio response builders ──────────────────────────────────────────────── + +function 
resolveAudioInlineData(audio: AudioResponse): { mimeType: string; data: string } { + if (typeof audio.audio === "string") { + return { mimeType: formatToMime(audio.format ?? "mp3"), data: audio.audio }; + } + return { + mimeType: audio.audio.contentType ?? "audio/mpeg", + data: audio.audio.b64Json, + }; +} + +function buildGeminiAudioResponse(audio: AudioResponse): GeminiResponseChunk { + const inlineData = resolveAudioInlineData(audio); + return { + candidates: [ + { + content: { role: "model", parts: [{ inlineData }] }, + finishReason: "STOP", + index: 0, + }, + ], + usageMetadata: { promptTokenCount: 0, candidatesTokenCount: 0, totalTokenCount: 0 }, + }; +} + +function buildGeminiAudioStreamChunks(audio: AudioResponse): GeminiResponseChunk[] { + const inlineData = resolveAudioInlineData(audio); + return [ + { + candidates: [ + { + content: { role: "model", parts: [{ inlineData }] }, + finishReason: "STOP", + index: 0, + }, + ], + usageMetadata: { promptTokenCount: 0, candidatesTokenCount: 0, totalTokenCount: 0 }, + }, + ]; +} + // ─── SSE writer for Gemini streaming ──────────────────────────────────────── interface GeminiStreamOptions { @@ -649,6 +695,38 @@ export async function handleGemini( return; } + // Audio response + if (isAudioResponse(response)) { + const journalEntry = journal.add({ + method: req.method ?? 
"POST", + path, + headers: flattenHeaders(req.headers), + body: completionReq, + response: { status: 200, fixture }, + }); + if (!streaming) { + const body = buildGeminiAudioResponse(response); + res.writeHead(200, { "Content-Type": "application/json" }); + res.end(JSON.stringify(body)); + } else { + const chunks = buildGeminiAudioStreamChunks(response); + const interruption = createInterruptionSignal(fixture); + const completed = await writeGeminiSSEStream(res, chunks, { + latency, + streamingProfile: fixture.streamingProfile, + signal: interruption?.signal, + onChunkSent: interruption?.tick, + }); + if (!completed) { + if (!res.writableEnded) res.destroy(); + journalEntry.response.interrupted = true; + journalEntry.response.interruptReason = interruption?.reason(); + } + interruption?.cleanup(); + } + return; + } + // Content + tool calls response (must be checked before isTextResponse / isToolCallResponse) if (isContentWithToolCallsResponse(response)) { if (response.webSearches?.length) { diff --git a/src/helpers.ts b/src/helpers.ts index 2b654a61..38360261 100644 --- a/src/helpers.ts +++ b/src/helpers.ts @@ -91,7 +91,30 @@ export function isImageResponse(r: FixtureResponse): r is ImageResponse { } export function isAudioResponse(r: FixtureResponse): r is AudioResponse { - return "audio" in r && typeof (r as AudioResponse).audio === "string"; + if (!("audio" in r)) return false; + const a = (r as AudioResponse).audio; + return typeof a === "string" || (typeof a === "object" && a !== null && "b64Json" in a); +} + +/** + * Map audio format shorthand to MIME content types. + * Shared between speech, ElevenLabs, and fal audio handlers. + */ +export const FORMAT_TO_CONTENT_TYPE: Record = { + mp3: "audio/mpeg", + opus: "audio/opus", + aac: "audio/aac", + flac: "audio/flac", + wav: "audio/wav", + pcm: "audio/pcm", +}; + +/** + * Resolve a format string (e.g. "mp3", "opus") to its MIME content type. + * Falls back to "application/octet-stream" for unknown formats. 
+ */ +export function formatToMime(format: string): string { + return FORMAT_TO_CONTENT_TYPE[format] ?? "application/octet-stream"; } export function isTranscriptionResponse(r: FixtureResponse): r is TranscriptionResponse { diff --git a/src/index.ts b/src/index.ts index 046908ec..7bbad2f4 100644 --- a/src/index.ts +++ b/src/index.ts @@ -84,6 +84,8 @@ export { handleImages } from "./images.js"; export { handleSpeech } from "./speech.js"; export { handleTranscription } from "./transcription.js"; export { handleVideoCreate, handleVideoStatus, VideoStateMap } from "./video.js"; +export { handleElevenLabsAudio } from "./elevenlabs-audio.js"; +export { handleFalQueue } from "./fal-audio.js"; // Helpers export { @@ -110,6 +112,8 @@ export { isVideoResponse, generateDeterministicEmbedding, buildEmbeddingResponse, + FORMAT_TO_CONTENT_TYPE, + formatToMime, } from "./helpers.js"; export type { EmbeddingAPIResponse } from "./helpers.js"; diff --git a/src/llmock.ts b/src/llmock.ts index a5c8dac1..68b188a6 100644 --- a/src/llmock.ts +++ b/src/llmock.ts @@ -27,6 +27,7 @@ import { Journal } from "./journal.js"; import type { SearchFixture, SearchResult } from "./search.js"; import type { RerankFixture, RerankResult } from "./rerank.js"; import type { ModerationFixture, ModerationResult } from "./moderation.js"; +import { falJobs } from "./fal-audio.js"; export class LLMock { private fixtures: Fixture[] = []; @@ -158,6 +159,31 @@ export class LLMock { }); } + onAudio(input: string | RegExp, response: AudioResponse): this { + return this.addFixture({ match: { userMessage: input }, response }); + } + + onSoundEffect(text: string | RegExp, response: AudioResponse): this { + return this.addFixture({ + match: { userMessage: text, endpoint: "audio-gen" }, + response, + }); + } + + onMusic(prompt: string | RegExp, response: AudioResponse): this { + return this.addFixture({ + match: { userMessage: prompt, endpoint: "audio-gen" }, + response, + }); + } + + onFalAudio(prompt: string | 
RegExp, response: AudioResponse, model?: string): this { + return this.addFixture({ + match: { userMessage: prompt, endpoint: "fal-audio", ...(model ? { model } : {}) }, + response, + }); + } + // ---- Service mock convenience methods ---- onSearch(pattern: string | RegExp, results: SearchResult[]): this { @@ -204,11 +230,9 @@ export class LLMock { fixture.match.predicate = (req) => { const result = original(req); if (result) { - // Defer splice so it doesn't mutate the array while matchFixture iterates it - queueMicrotask(() => { - const idx = this.fixtures.indexOf(fixture); - if (idx !== -1) this.fixtures.splice(idx, 1); - }); + // Remove synchronously on first match to prevent race conditions + const idx = this.fixtures.indexOf(fixture); + if (idx !== -1) this.fixtures.splice(idx, 1); } return result; }; @@ -284,6 +308,7 @@ export class LLMock { this.searchFixtures.length = 0; this.rerankFixtures.length = 0; this.moderationFixtures.length = 0; + falJobs.clear(); if (this.serverInstance) { this.serverInstance.journal.clear(); this.serverInstance.videoStates.clear(); diff --git a/src/recorder.ts b/src/recorder.ts index 229a389a..22b0d904 100644 --- a/src/recorder.ts +++ b/src/recorder.ts @@ -168,23 +168,37 @@ export async function proxyAndRecord( if (collapsed.droppedChunks && collapsed.droppedChunks > 0) { defaults.logger.warn(`${collapsed.droppedChunks} chunk(s) dropped during stream collapse`); } - if (collapsed.content === "" && (!collapsed.toolCalls || collapsed.toolCalls.length === 0)) { + // Audio from streamed inlineData (e.g. Gemini SSE with audio parts) + if (collapsed.audioB64) { + fixtureResponse = { + audio: { + b64Json: collapsed.audioB64, + contentType: collapsed.audioMimeType ?? 
"audio/mpeg", + }, + }; + } else if ( + collapsed.content === "" && + (!collapsed.toolCalls || collapsed.toolCalls.length === 0) + ) { defaults.logger.warn("Stream collapse produced empty content — fixture may be incomplete"); - } - const reasoningSpread = collapsed.reasoning ? { reasoning: collapsed.reasoning } : {}; - if (collapsed.toolCalls && collapsed.toolCalls.length > 0) { - if (collapsed.content) { - // Both content and toolCalls present — save as ContentWithToolCallsResponse - fixtureResponse = { - content: collapsed.content, - toolCalls: collapsed.toolCalls, - ...reasoningSpread, - }; + const reasoningSpread = collapsed.reasoning ? { reasoning: collapsed.reasoning } : {}; + fixtureResponse = { content: collapsed.content ?? "", ...reasoningSpread }; + } else { + const reasoningSpread = collapsed.reasoning ? { reasoning: collapsed.reasoning } : {}; + if (collapsed.toolCalls && collapsed.toolCalls.length > 0) { + if (collapsed.content) { + // Both content and toolCalls present — save as ContentWithToolCallsResponse + fixtureResponse = { + content: collapsed.content, + toolCalls: collapsed.toolCalls, + ...reasoningSpread, + }; + } else { + fixtureResponse = { toolCalls: collapsed.toolCalls, ...reasoningSpread }; + } } else { - fixtureResponse = { toolCalls: collapsed.toolCalls, ...reasoningSpread }; + fixtureResponse = { content: collapsed.content ?? "", ...reasoningSpread }; } - } else { - fixtureResponse = { content: collapsed.content ?? 
"", ...reasoningSpread }; } } else { // Non-streaming — try to parse as JSON @@ -221,7 +235,10 @@ export async function proxyAndRecord( const fixtureMatch = buildFixtureMatch(matchRequest); // Build and save the fixture - const fixture: Fixture = { match: fixtureMatch, response: fixtureResponse }; + const fixture: Fixture = { + match: fixtureMatch, + response: fixtureResponse, + }; // Check if the match is empty (all undefined values) — warn but still save to disk const matchValues = Object.values(fixtureMatch); @@ -451,6 +468,10 @@ function buildFixtureResponse( } if (typeof first.embedding === "string" && encodingFormat === "base64") { const buf = Buffer.from(first.embedding, "base64"); + if (buf.byteLength % 4 !== 0) { + // Malformed embedding — return a zero-dimension embedding fixture + return { embedding: [] }; + } const aligned = new Uint8Array(buf).buffer; // Always offset 0 const floats = new Float32Array(aligned, 0, buf.byteLength / 4); return { embedding: Array.from(floats) }; @@ -626,6 +647,24 @@ function buildFixtureResponse( const content = candidate.content as Record | undefined; if (content && Array.isArray(content.parts)) { const parts = content.parts as Array>; + + // Audio inlineData parts take priority over text + const audioParts = parts.filter( + (p: Record) => + p.inlineData && + typeof (p.inlineData as Record).mimeType === "string" && + ((p.inlineData as Record).mimeType as string).startsWith("audio/"), + ); + if (audioParts.length > 0) { + const inlineData = audioParts[0].inlineData as Record; + return { + audio: { + b64Json: String(inlineData.data ?? 
""), + contentType: String(inlineData.mimeType), + }, + }; + } + const fnCallParts = parts.filter((p) => p.functionCall); const textParts = parts.filter((p) => typeof p.text === "string" && !p.thought); const thoughtParts = parts.filter((p) => p.thought === true && typeof p.text === "string"); @@ -833,7 +872,15 @@ function buildFixtureResponse( /** * Derive fixture match criteria from the original request. */ -type EndpointType = "chat" | "image" | "speech" | "transcription" | "video" | "embedding"; +type EndpointType = + | "chat" + | "image" + | "speech" + | "transcription" + | "video" + | "embedding" + | "audio-gen" + | "fal-audio"; function buildFixtureMatch(request: ChatCompletionRequest): { userMessage?: string; diff --git a/src/router.ts b/src/router.ts index 65f6be8d..28f11c90 100644 --- a/src/router.ts +++ b/src/router.ts @@ -62,6 +62,8 @@ export function matchFixture( const compatible = (reqEndpoint === "image" && isImageResponse(r)) || (reqEndpoint === "speech" && isAudioResponse(r)) || + (reqEndpoint === "audio-gen" && isAudioResponse(r)) || + (reqEndpoint === "fal-audio" && isAudioResponse(r)) || (reqEndpoint === "transcription" && isTranscriptionResponse(r)) || (reqEndpoint === "video" && isVideoResponse(r)); if (!compatible) continue; diff --git a/src/server.ts b/src/server.ts index 08144b5b..a5c0baa9 100644 --- a/src/server.ts +++ b/src/server.ts @@ -25,6 +25,7 @@ import { isToolCallResponse, isContentWithToolCallsResponse, isErrorResponse, + isAudioResponse, flattenHeaders, getTestId, } from "./helpers.js"; @@ -38,6 +39,8 @@ import { handleImages } from "./images.js"; import { handleSpeech } from "./speech.js"; import { handleTranscription } from "./transcription.js"; import { handleVideoCreate, handleVideoStatus, VideoStateMap } from "./video.js"; +import { handleElevenLabsAudio } from "./elevenlabs-audio.js"; +import { handleFalQueue, falJobs } from "./fal-audio.js"; import { handleOllama, handleOllamaGenerate } from "./ollama.js"; import { 
handleCohere } from "./cohere.js"; import { handleSearch, type SearchFixture } from "./search.js"; @@ -77,6 +80,11 @@ const TRANSCRIPTIONS_PATH = "/v1/audio/transcriptions"; const VIDEOS_PATH = "/v1/videos"; const VIDEOS_STATUS_RE = /^\/v1\/videos\/([^/]+)$/; const GEMINI_PREDICT_RE = /^\/v1beta\/models\/([^:]+):predict$/; +const ELEVENLABS_SOUND_GENERATION_PATH = "/v1/sound-generation"; +const ELEVENLABS_MUSIC_RE = /^\/v1\/music(?:\/(.+))?$/; +const FAL_QUEUE_SUBMIT_RE = /^\/fal\/queue\/submit\/(.+)$/; +const FAL_QUEUE_REQUESTS_RE = /^\/fal\/queue\/requests\/(.+)$/; +const FAL_RUN_RE = /^\/fal\/run\/(.+)$/; const DEFAULT_CHUNK_SIZE = 20; // OpenAI-compatible endpoint suffixes for path prefix normalization. @@ -292,6 +300,7 @@ async function handleControlAPI( fixtures.length = 0; journal.clear(); videoStates.clear(); + falJobs.clear(); if (defaults.registry) { defaults.registry.setGauge("aimock_fixtures_loaded", {}, fixtures.length); } @@ -562,6 +571,29 @@ async function handleCompletions( return; } + // Audio responses are not supported on the chat completions endpoint + if (isAudioResponse(response)) { + journal.add({ + method: req.method ?? "POST", + path: req.url ?? COMPLETIONS_PATH, + headers: flattenHeaders(req.headers), + body, + response: { status: 422, fixture }, + }); + writeErrorResponse( + res, + 422, + JSON.stringify({ + error: { + message: + "Audio responses are not supported on the chat completions endpoint. 
Use Gemini generateContent or a dedicated audio endpoint.", + type: "invalid_request_error", + }, + }), + ); + return; + } + // Content + tool calls response if (isContentWithToolCallsResponse(response)) { if (response.webSearches?.length) { @@ -1572,6 +1604,199 @@ export async function createServer( return; } + // POST /v1/sound-generation — ElevenLabs Sound Generation API + if (pathname === ELEVENLABS_SOUND_GENERATION_PATH && req.method === "POST") { + setCorsHeaders(res); + try { + const raw = await readBody(req); + const chaosResult = applyChaos( + res, + null, + defaults.chaos, + req.headers, + journal, + { + method: req.method ?? "POST", + path: pathname, + headers: flattenHeaders(req.headers), + body: { model: "", messages: [] }, + }, + defaults.registry, + defaults.logger, + ); + if (chaosResult) return; + await handleElevenLabsAudio(req, res, raw, fixtures, defaults, journal, "sound-generation"); + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : "Internal error"; + if (!res.headersSent) { + writeErrorResponse( + res, + 500, + JSON.stringify({ error: { message: msg, type: "server_error" } }), + ); + } else if (!res.writableEnded) { + res.destroy(); + } + } + return; + } + + // POST /v1/music/(generation|variation|remix|extend) — ElevenLabs Music API + const musicMatch = pathname.match(ELEVENLABS_MUSIC_RE); + if (musicMatch && req.method === "POST") { + setCorsHeaders(res); + const musicSubType = musicMatch[1] ?? "music"; + try { + const raw = await readBody(req); + const chaosResult = applyChaos( + res, + null, + defaults.chaos, + req.headers, + journal, + { + method: req.method ?? "POST", + path: pathname, + headers: flattenHeaders(req.headers), + body: { model: "", messages: [] }, + }, + defaults.registry, + defaults.logger, + ); + if (chaosResult) return; + await handleElevenLabsAudio(req, res, raw, fixtures, defaults, journal, musicSubType); + } catch (err: unknown) { + const msg = err instanceof Error ? 
err.message : "Internal error"; + if (!res.headersSent) { + writeErrorResponse( + res, + 500, + JSON.stringify({ error: { message: msg, type: "server_error" } }), + ); + } else if (!res.writableEnded) { + res.destroy(); + } + } + return; + } + + // POST /fal/queue/submit/{model} — fal.ai Queue Submit + const falQueueSubmitMatch = pathname.match(FAL_QUEUE_SUBMIT_RE); + if (falQueueSubmitMatch && req.method === "POST") { + setCorsHeaders(res); + try { + const raw = await readBody(req); + const chaosResult = applyChaos( + res, + null, + defaults.chaos, + req.headers, + journal, + { + method: req.method ?? "POST", + path: pathname, + headers: flattenHeaders(req.headers), + body: { model: "", messages: [] }, + }, + defaults.registry, + defaults.logger, + ); + if (chaosResult) return; + await handleFalQueue(req, res, raw, pathname, fixtures, defaults, journal); + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : "Internal error"; + if (!res.headersSent) { + writeErrorResponse( + res, + 500, + JSON.stringify({ error: { message: msg, type: "server_error" } }), + ); + } else if (!res.writableEnded) { + res.destroy(); + } + } + return; + } + + // GET /fal/queue/requests/{requestId} — fal.ai Queue Status/Result + const falQueueRequestsMatch = pathname.match(FAL_QUEUE_REQUESTS_RE); + if ( + falQueueRequestsMatch && + (req.method === "GET" || req.method === "POST" || req.method === "PUT") + ) { + setCorsHeaders(res); + try { + const raw = req.method === "POST" ? await readBody(req) : "{}"; + const chaosResult = applyChaos( + res, + null, + defaults.chaos, + req.headers, + journal, + { + method: req.method ?? "GET", + path: pathname, + headers: flattenHeaders(req.headers), + body: { model: "", messages: [] }, + }, + defaults.registry, + defaults.logger, + ); + if (chaosResult) return; + await handleFalQueue(req, res, raw, pathname, fixtures, defaults, journal); + } catch (err: unknown) { + const msg = err instanceof Error ? 
err.message : "Internal error"; + if (!res.headersSent) { + writeErrorResponse( + res, + 500, + JSON.stringify({ error: { message: msg, type: "server_error" } }), + ); + } else if (!res.writableEnded) { + res.destroy(); + } + } + return; + } + + // POST /fal/run/{model} — fal.ai Synchronous Run + const falRunMatch = pathname.match(FAL_RUN_RE); + if (falRunMatch && req.method === "POST") { + setCorsHeaders(res); + try { + const raw = await readBody(req); + const chaosResult = applyChaos( + res, + null, + defaults.chaos, + req.headers, + journal, + { + method: req.method ?? "POST", + path: pathname, + headers: flattenHeaders(req.headers), + body: { model: "", messages: [] }, + }, + defaults.registry, + defaults.logger, + ); + if (chaosResult) return; + await handleFalQueue(req, res, raw, pathname, fixtures, defaults, journal); + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : "Internal error"; + if (!res.headersSent) { + writeErrorResponse( + res, + 500, + JSON.stringify({ error: { message: msg, type: "server_error" } }), + ); + } else if (!res.writableEnded) { + res.destroy(); + } + } + return; + } + // POST /v1/chat/completions — Chat Completions API if (pathname !== COMPLETIONS_PATH) { handleNotFound(res, "Not found"); diff --git a/src/speech.ts b/src/speech.ts index dddc9027..623c1a9a 100644 --- a/src/speech.ts +++ b/src/speech.ts @@ -1,6 +1,12 @@ import type * as http from "node:http"; import type { ChatCompletionRequest, Fixture, HandlerDefaults } from "./types.js"; -import { isAudioResponse, isErrorResponse, flattenHeaders, getTestId } from "./helpers.js"; +import { + isAudioResponse, + isErrorResponse, + flattenHeaders, + getTestId, + FORMAT_TO_CONTENT_TYPE, +} from "./helpers.js"; import { matchFixture } from "./router.js"; import { writeErrorResponse } from "./sse-writer.js"; import type { Journal } from "./journal.js"; @@ -16,15 +22,6 @@ interface SpeechRequest { [key: string]: unknown; } -const FORMAT_TO_CONTENT_TYPE: Record = 
{ - mp3: "audio/mpeg", - opus: "audio/opus", - aac: "audio/aac", - flac: "audio/flac", - wav: "audio/wav", - pcm: "audio/pcm", -}; - export async function handleSpeech( req: http.IncomingMessage, res: http.ServerResponse, @@ -187,6 +184,29 @@ export async function handleSpeech( return; } + // Object-form audio is not supported for the speech endpoint — reject early + if (typeof response.audio !== "string") { + journal.add({ + method, + path, + headers: flattenHeaders(req.headers), + body: syntheticReq, + response: { status: 500, fixture }, + }); + writeErrorResponse( + res, + 500, + JSON.stringify({ + error: { + message: + "Object-form audio not supported for speech endpoint. Use string-form: { audio: '' }", + type: "server_error", + }, + }), + ); + return; + } + journal.add({ method, path, diff --git a/src/stream-collapse.ts b/src/stream-collapse.ts index 31bad190..0f814d4a 100644 --- a/src/stream-collapse.ts +++ b/src/stream-collapse.ts @@ -22,6 +22,8 @@ export interface CollapseResult { toolCalls?: ToolCall[]; droppedChunks?: number; truncated?: boolean; + audioB64?: string; + audioMimeType?: string; } // --------------------------------------------------------------------------- @@ -259,7 +261,10 @@ export function collapseAnthropicSSE(body: string): CollapseResult { export function collapseGeminiSSE(body: string): CollapseResult { const lines = body.split("\n\n").filter((l) => l.trim().length > 0); let content = ""; + let reasoning = ""; let droppedChunks = 0; + let audioB64 = ""; + let audioMimeType: string | undefined; const toolCalls: ToolCall[] = []; for (const line of lines) { @@ -292,21 +297,50 @@ export function collapseGeminiSSE(body: string): CollapseResult { name: String(fc.name ?? ""), arguments: typeof fc.args === "string" ? 
(fc.args as string) : JSON.stringify(fc.args), }); + } else if ( + part.inlineData && + typeof (part.inlineData as Record).mimeType === "string" && + ((part.inlineData as Record).mimeType as string).startsWith("audio/") + ) { + const inlineData = part.inlineData as Record; + if (!audioMimeType) { + audioMimeType = inlineData.mimeType as string; + } + if (typeof inlineData.data === "string") { + audioB64 += inlineData.data; + } } else if (typeof part.text === "string") { - content += part.text; + if (part.thought) { + reasoning += part.text; + } else { + content += part.text; + } } } } + if (audioB64) { + return { + audioB64, + audioMimeType, + ...(droppedChunks > 0 ? { droppedChunks } : {}), + }; + } + if (toolCalls.length > 0) { return { ...(content ? { content } : {}), toolCalls, + ...(reasoning ? { reasoning } : {}), ...(droppedChunks > 0 ? { droppedChunks } : {}), }; } - return { content, ...(droppedChunks > 0 ? { droppedChunks } : {}) }; + return { + content, + ...(reasoning ? { reasoning } : {}), + ...(droppedChunks > 0 ? { droppedChunks } : {}), + }; } // --------------------------------------------------------------------------- @@ -453,6 +487,7 @@ export function collapseCohereSSE(body: string): CollapseResult { if (toolCallMap.size > 0) { const sorted = Array.from(toolCallMap.entries()).sort(([a], [b]) => a - b); return { + ...(content ? { content } : {}), toolCalls: sorted.map(([, tc]) => ({ name: tc.name, arguments: tc.arguments, diff --git a/src/types.ts b/src/types.ts index e434f37d..8b8bcffb 100644 --- a/src/types.ts +++ b/src/types.ts @@ -72,7 +72,15 @@ export interface FixtureMatch { predicate?: (req: ChatCompletionRequest) => boolean; /** Which occurrence of this match to respond to (0-indexed). Undefined means match any. 
*/ sequenceIndex?: number; - endpoint?: "chat" | "image" | "speech" | "transcription" | "video" | "embedding"; + endpoint?: + | "chat" + | "image" + | "speech" + | "transcription" + | "video" + | "embedding" + | "audio-gen" + | "fal-audio"; } // Fixture response types @@ -155,7 +163,7 @@ export interface ImageResponse { } export interface AudioResponse { - audio: string; + audio: string | { b64Json: string; contentType?: string }; format?: string; } @@ -277,7 +285,15 @@ export interface FixtureFileEntry { model?: string; responseFormat?: string; sequenceIndex?: number; - endpoint?: "chat" | "image" | "speech" | "transcription" | "video" | "embedding"; + endpoint?: + | "chat" + | "image" + | "speech" + | "transcription" + | "video" + | "embedding" + | "audio-gen" + | "fal-audio"; // predicate not supported in JSON files }; response: FixtureFileResponse; @@ -376,7 +392,9 @@ export type RecordProviderKey = | "bedrock" | "azure" | "ollama" - | "cohere"; + | "cohere" + | "elevenlabs" + | "fal"; export interface RecordConfig { providers: Partial>; diff --git a/src/ws-gemini-live.ts b/src/ws-gemini-live.ts index 1880628d..0e9daad8 100644 --- a/src/ws-gemini-live.ts +++ b/src/ws-gemini-live.ts @@ -6,9 +6,23 @@ * messages in the Gemini Live streaming format. 
*/ -import type { Fixture, ChatMessage, ChatCompletionRequest, ToolDefinition } from "./types.js"; +import type { + Fixture, + ChatMessage, + ChatCompletionRequest, + ToolDefinition, + AudioResponse, +} from "./types.js"; import { matchFixture } from "./router.js"; -import { isTextResponse, isToolCallResponse, isErrorResponse } from "./helpers.js"; +import { + isTextResponse, + isToolCallResponse, + isContentWithToolCallsResponse, + isErrorResponse, + isAudioResponse, + formatToMime, + generateToolCallId, +} from "./helpers.js"; import { createInterruptionSignal } from "./interruption.js"; import { delay } from "./sse-writer.js"; import { DEFAULT_TEST_ID, type Journal } from "./journal.js"; @@ -19,8 +33,10 @@ import type { WebSocketConnection } from "./ws-framing.js"; interface GeminiLivePart { text?: string; + thought?: boolean; functionCall?: { name: string; args: Record }; functionResponse?: { name: string; response: unknown; id?: string }; + inlineData?: { mimeType: string; data: string }; } interface GeminiLiveTurn { @@ -89,7 +105,9 @@ function geminiTurnsToMessages(turns: GeminiLiveTurn[]): ChatMessage[] { if (role === "user") { const funcResponses = turn.parts.filter((p) => p.functionResponse); - const textParts = turn.parts.filter((p) => p.text !== undefined); + // inlineData parts (e.g. client audio input) are silently skipped — + // only text and functionResponse parts are relevant for fixture matching. 
+ const textParts = turn.parts.filter((p) => p.text !== undefined && !p.thought); if (funcResponses.length > 0) { for (let i = 0; i < funcResponses.length; i++) { @@ -113,7 +131,7 @@ function geminiTurnsToMessages(turns: GeminiLiveTurn[]): ChatMessage[] { } } else if (role === "model") { const funcCalls = turn.parts.filter((p) => p.functionCall); - const textParts = turn.parts.filter((p) => p.text !== undefined); + const textParts = turn.parts.filter((p) => p.text !== undefined && !p.thought); if (funcCalls.length > 0) { messages.push({ @@ -327,6 +345,13 @@ async function processMessage( if (!fixture) { if (defaults.strict) { defaults.logger.warn(`STRICT: No fixture matched for WebSocket message`); + journal.add({ + method: "WS", + path, + headers: {}, + body: completionReq, + response: { status: 404, fixture: null }, + }); ws.close(1008, "Strict mode: no fixture matched"); return; } @@ -370,6 +395,184 @@ async function processMessage( return; } + // Audio response — single frame with inlineData and turnComplete: true + if (isAudioResponse(response)) { + journal.add({ + method: "WS", + path, + headers: {}, + body: completionReq, + response: { status: 200, fixture }, + }); + + const audioResp = response as AudioResponse; + let mimeType: string; + let data: string; + + if (typeof audioResp.audio === "string") { + mimeType = formatToMime(audioResp.format ?? "mp3"); + data = audioResp.audio; + } else { + mimeType = audioResp.audio.contentType ?? 
"audio/mpeg"; + data = audioResp.audio.b64Json; + } + + ws.send( + JSON.stringify({ + serverContent: { + modelTurn: { + parts: [{ inlineData: { mimeType, data } }], + }, + turnComplete: true, + }, + }), + ); + + session.conversationHistory.push({ + role: "assistant", + content: "[audio]", + }); + return; + } + + // Content + tool calls response (must be checked before isTextResponse / isToolCallResponse) + if (isContentWithToolCallsResponse(response)) { + const journalEntry = journal.add({ + method: "WS", + path, + headers: {}, + body: completionReq, + response: { status: 200, fixture }, + }); + + const content = response.content; + const chunkList: string[] = []; + for (let i = 0; i < content.length; i += chunkSize) { + chunkList.push(content.slice(i, i + chunkSize)); + } + + const interruption = createInterruptionSignal(fixture); + let interrupted = false; + + // Stream text content chunks + if (content.length === 0) { + if (!ws.isClosed) { + ws.send( + JSON.stringify({ + serverContent: { + modelTurn: { parts: [{ text: "" }] }, + turnComplete: false, + }, + }), + ); + } + } else { + for (let i = 0; i < chunkList.length; i++) { + if (ws.isClosed) break; + if (latency > 0) await delay(latency, interruption?.signal); + if (interruption?.signal.aborted) { + interrupted = true; + break; + } + if (ws.isClosed) break; + + ws.send( + JSON.stringify({ + serverContent: { + modelTurn: { parts: [{ text: chunkList[i] }] }, + turnComplete: false, + }, + }), + ); + interruption?.tick(); + if (interruption?.signal.aborted) { + interrupted = true; + break; + } + } + } + + if (interrupted) { + ws.destroy(); + journalEntry.response.interrupted = true; + journalEntry.response.interruptReason = interruption?.reason(); + interruption?.cleanup(); + return; + } + + // Pre-compute tool calls with stable IDs so wire message and history match + const resolvedToolCalls = response.toolCalls.map((tc) => ({ + ...tc, + resolvedId: tc.id ?? 
generateToolCallId(), + })); + + // Send tool calls + if (!ws.isClosed) { + if (latency > 0) await delay(latency, interruption?.signal); + if (interruption?.signal.aborted) { + ws.destroy(); + journalEntry.response.interrupted = true; + journalEntry.response.interruptReason = interruption?.reason(); + interruption?.cleanup(); + return; + } + + const functionCalls = resolvedToolCalls.map((tc) => { + let argsObj: Record; + try { + argsObj = JSON.parse(tc.arguments || "{}") as Record; + } catch { + defaults.logger.warn( + `Malformed JSON in fixture tool call arguments for "${tc.name}": ${tc.arguments}`, + ); + argsObj = {}; + } + return { + name: tc.name, + args: argsObj, + id: tc.resolvedId, + }; + }); + + ws.send(JSON.stringify({ toolCall: { functionCalls } })); + interruption?.tick(); + } + + if (interruption?.signal.aborted) { + ws.destroy(); + journalEntry.response.interrupted = true; + journalEntry.response.interruptReason = interruption?.reason(); + interruption?.cleanup(); + return; + } + + interruption?.cleanup(); + + // Send turnComplete + if (!ws.isClosed) { + ws.send( + JSON.stringify({ + serverContent: { turnComplete: true }, + }), + ); + } + + // Add to conversation history using the same resolved IDs from the wire message + session.conversationHistory.push({ + role: "assistant", + content: content || null, + tool_calls: resolvedToolCalls.map((tc) => ({ + id: tc.resolvedId, + type: "function" as const, + function: { + name: tc.name, + arguments: tc.arguments, + }, + })), + }); + return; + } + // Text response — stream chunks with serverContent if (isTextResponse(response)) { const journalEntry = journal.add({