|
| 1 | +import { afterEach, beforeEach, describe, expect, it, jest } from "@jest/globals"; |
| 2 | +import { GeminiTTSClient } from "../engines/gemini"; |
| 3 | +import { createBrowserTTSClient } from "../factory-browser"; |
| 4 | +import { createTTSClient } from "../factory"; |
| 5 | + |
// Snapshot of the real global fetch, taken at module load before any test
// replaces it, so the suite can put it back after each test.
const originalFetch = globalThis.fetch;
| 7 | + |
/**
 * Builds a minimal stand-in for a fetch `Response`, exposing only the members
 * set below (`ok`, `status`, `statusText`, `headers`, `body`, `json`, `text`,
 * `arrayBuffer`).
 *
 * @param body - Payload handed back by `json()`; `text()` returns it verbatim
 *   when it is a string and JSON-serialised otherwise.
 * @param init - Optional overrides. `status` defaults to 200 and `statusText`
 *   to "OK". When `ok` is omitted it is derived from `status` (true for
 *   200-299), so passing only a failing status yields a failing response.
 * @returns A plain object shaped like the parts of `Response` listed above.
 */
function response(
  body: unknown,
  init: { ok?: boolean; status?: number; statusText?: string } = {}
) {
  const status = init.status ?? 200;

  return {
    // An explicit `ok` always wins; otherwise follow the 2xx rule.
    ok: init.ok ?? (status >= 200 && status <= 299),
    status,
    statusText: init.statusText ?? "OK",
    headers: {} as Headers,
    body: null,
    json: async () => body,
    // JSON.stringify(undefined) yields undefined, so fall back to "" to keep
    // text() resolving to a string.
    text: async () => (typeof body === "string" ? body : JSON.stringify(body) ?? ""),
    // Always an empty buffer, regardless of `body`.
    arrayBuffer: async () => new ArrayBuffer(0),
  };
}
| 20 | + |
/**
 * Builds a successful mock response whose JSON body carries one candidate with
 * a single content part holding the given audio as `inlineData.data`.
 *
 * @param base64Audio - Base64 text placed at
 *   `candidates[0].content.parts[0].inlineData.data`.
 * @returns The mock response produced by `response()` with default status.
 */
function audioResponse(base64Audio: string) {
  const audioPart = { inlineData: { data: base64Audio } };

  return response({
    candidates: [{ content: { parts: [audioPart] } }],
  });
}
| 38 | + |
/**
 * Encodes a list of byte values as a base64 string.
 *
 * @param bytes - Byte values to encode; an empty list yields an empty string.
 * @returns The base64 encoding of `bytes`.
 */
function b64(bytes: readonly number[]): string {
  // Buffer.from accepts the number array directly; no intermediate
  // Uint8Array copy is needed.
  return Buffer.from(bytes).toString("base64");
}
| 42 | + |
/**
 * Unit tests for GeminiTTSClient.
 *
 * No test reaches the network: wherever a request is expected, the global
 * fetch is replaced by a jest mock returning a canned response, and the
 * original fetch is put back after every test.
 */
describe("GeminiTTSClient", () => {
  let client: GeminiTTSClient;

  /** Replaces the global fetch with a jest mock that resolves to `reply` on every call. */
  const stubFetch = (reply: unknown): void => {
    globalThis.fetch = jest.fn(async () => reply) as any;
  };

  /**
   * Returns the arguments of the first call made to the stubbed fetch.
   * The tests below treat them as a URL string and an init object whose
   * `body` is JSON text.
   */
  const firstFetchCall = (): [string, { body: string }] =>
    (globalThis.fetch as any).mock.calls[0];

  beforeEach(() => {
    client = new GeminiTTSClient({ apiKey: "test-api-key" });
  });

  afterEach(() => {
    // Undo any fetch stub installed by the test that just ran.
    globalThis.fetch = originalFetch;
    jest.restoreAllMocks();
  });

  // --- Construction and properties ---

  it("initializes with default values", () => {
    expect(client.getProperty("model")).toBe("gemini-3.1-flash-tts-preview");
    expect(client.getProperty("voice")).toBe("Kore");
  });

  it("initializes with custom model and voice", () => {
    const c = new GeminiTTSClient({
      apiKey: "test",
      model: "gemini-2.5-flash-preview-tts",
      voice: "Puck",
    });

    expect(c.getProperty("model")).toBe("gemini-2.5-flash-preview-tts");
    expect(c.getProperty("voice")).toBe("Puck");
  });

  it("initializes with properties object", () => {
    const c = new GeminiTTSClient({
      apiKey: "test",
      properties: { model: "gemini-2.5-flash-preview-tts", voice: "Zephyr" },
    });

    expect(c.getProperty("model")).toBe("gemini-2.5-flash-preview-tts");
    expect(c.getProperty("voice")).toBe("Zephyr");
  });

  it("initializes with propertiesJson string", () => {
    const c = new GeminiTTSClient({
      apiKey: "test",
      propertiesJson: JSON.stringify({ voice: "Sulafat" }),
    });

    expect(c.getProperty("voice")).toBe("Sulafat");
  });

  it("sets and gets model, voice, and baseURL", () => {
    client.setProperty("model", "gemini-2.5-flash-preview-tts");
    client.setProperty("voice", "Puck");
    client.setProperty("baseURL", "https://example.test/v1beta");

    expect(client.getProperty("model")).toBe("gemini-2.5-flash-preview-tts");
    expect(client.getProperty("voice")).toBe("Puck");
    expect(client.getProperty("baseURL")).toBe("https://example.test/v1beta");
  });

  // --- Credentials ---

  it("requires apiKey credential", () => {
    // Cast to any: getRequiredCredentials is reached through the instance
    // without relying on its declared visibility.
    expect((client as any).getRequiredCredentials()).toEqual(["apiKey"]);
  });

  it("returns false for checkCredentials without api key", async () => {
    expect(await new GeminiTTSClient({}).checkCredentials()).toBe(false);
  });

  it("checks credentials against model list", async () => {
    stubFetch(
      response({
        models: [{ name: "models/gemini-3.1-flash-tts-preview" }],
      })
    );

    expect(await client.checkCredentials()).toBe(true);
  });

  // --- Voices and factories ---

  it("gets static voices", async () => {
    const voices = await client.getVoices();

    expect(voices).toHaveLength(30);
    expect(voices[0]).toHaveProperty("id", "Zephyr");
    expect(voices[0]).toHaveProperty("provider", "gemini");
  });

  it("filters voices by supported languages", async () => {
    expect((await client.getVoicesByLanguage("en")).length).toBeGreaterThan(0);
    expect((await client.getVoicesByLanguage("fr")).length).toBeGreaterThan(0);
  });

  it("creates via node and browser factories", () => {
    expect(createTTSClient("gemini", { apiKey: "test" })).toBeInstanceOf(GeminiTTSClient);
    expect(createBrowserTTSClient("gemini", { apiKey: "test" })).toBeInstanceOf(GeminiTTSClient);
  });

  // --- Text preparation ---

  it("strips SSML while preserving Gemini audio tags", async () => {
    const result = await (client as any).prepareText(
      "<speak>Hello <break time=\"500ms\"/> [laughs] world</speak>"
    );

    expect(result).not.toContain("<speak>");
    expect(result).not.toContain("<break");
    expect(result).toContain("[laughs]");
  });

  // --- Synthesis ---

  it("returns WAV bytes by default and sends the Gemini request shape", async () => {
    stubFetch(audioResponse(b64([0, 0, 1, 0])));

    const bytes = await client.synthToBytes("Say cheerfully: Hello", { voice: "Puck" });
    const [url, init] = firstFetchCall();
    const request = JSON.parse(init.body);

    // The first four bytes of the result must spell the "RIFF" magic.
    expect(String.fromCharCode(bytes[0], bytes[1], bytes[2], bytes[3])).toBe("RIFF");
    expect(url).toContain("/models/gemini-3.1-flash-tts-preview:generateContent");
    expect(request.generationConfig.responseModalities).toEqual(["AUDIO"]);
    expect(
      request.generationConfig.speechConfig.voiceConfig.prebuiltVoiceConfig.voiceName
    ).toBe("Puck");
  });

  it("returns raw PCM when requested", async () => {
    stubFetch(audioResponse(b64([0, 0, 1, 0])));

    const bytes = await client.synthToBytes("Hello", { format: "pcm" });

    // With format "pcm" the decoded payload comes back untouched.
    expect(Array.from(bytes)).toEqual([0, 0, 1, 0]);
  });

  it("uses selected model in request URL", async () => {
    stubFetch(audioResponse(b64([0, 0])));

    await client.synthToBytes("Hello", { model: "gemini-2.5-flash-preview-tts" });

    const [url] = firstFetchCall();
    expect(url).toContain("/models/gemini-2.5-flash-preview-tts:generateContent");
  });

  // --- Error reporting ---

  it("throws useful error for HTTP failures", async () => {
    stubFetch(response("bad request", { ok: false, status: 400, statusText: "Bad Request" }));

    await expect(client.synthToBytes("Hello")).rejects.toThrow(
      "Gemini TTS API error: 400 Bad Request"
    );
  });

  it("throws useful error for missing audio data", async () => {
    // A well-formed reply whose only part is text, with no inlineData.
    stubFetch(
      response({
        candidates: [
          {
            finishReason: "STOP",
            content: { parts: [{ text: "not audio" }] },
          },
        ],
      })
    );

    await expect(client.synthToBytes("Hello")).rejects.toThrow(
      "Gemini TTS response did not include audio data"
    );
  });

  // --- Streaming and status ---

  it("wraps synthesized bytes in a stream and returns estimated word boundaries", async () => {
    stubFetch(audioResponse(b64([0, 0, 1, 0])));

    const result = await client.synthToBytestream("Hello world", { useWordBoundary: true });
    const reader = result.audioStream.getReader();
    const chunk = await reader.read();

    expect(chunk.done).toBe(false);
    expect(chunk.value?.length).toBeGreaterThan(0);
    // "Hello world" is two words, so two boundaries are expected.
    expect(result.wordBoundaries).toHaveLength(2);
  });

  it("provides credential status", async () => {
    stubFetch(
      response({
        models: [{ name: "models/gemini-3.1-flash-tts-preview" }],
      })
    );

    const status = await client.getCredentialStatus();

    expect(status.engine).toBe("gemini");
    expect(status.requiresCredentials).toBe(true);
  });
});
0 commit comments