Skip to content

Commit 21d2417

Browse files
heavygee and HAPI committed
feat(voice): pluggable voice backend with Gemini Live & Qwen Realtime
Rebased from Overbaker/hapi#401 onto current main. Adds a pluggable voice backend architecture that extends the existing ElevenLabs integration:

- **Gemini 2.5 Live** (`gemini-live`): Google real-time audio via WebSocket with full function calling (messageCodingAgent, processPermissionRequest)
- **Qwen Realtime** (`qwen-realtime`): Alibaba DashScope via hub WebSocket proxy (browser cannot set Authorization header directly)
- **VoiceBackendSession**: dynamic backend selector with React.lazy loading, gates voice button until backend module is registered
- **Hub WS proxies**: JWT-authenticated `/api/voice/gemini-ws` and `/api/voice/qwen-ws` endpoints in Bun.serve, with message queueing during upstream connect to prevent dropped setup frames
- **AudioWorklet pipeline**: inline Blob URL recorder, 24 kHz PCM player, serial tool call execution, AudioContext created in user gesture for mobile
- Backend discovery: `GET /voice/backend` + `POST /voice/gemini-token` / `POST /voice/qwen-token` hub routes; frontend auto-detects active backend

Merge notes:

- Rebased 135 upstream commits cleanly; HappyComposer keeps upstream's configurable enter-behavior setting (supersedes hard-coded Ctrl+Enter)
- Converted gemini test files from bun:test to vitest (web package uses vitest)
- All 221 hub tests and 636 web tests pass; TypeScript clean

via [HAPI](https://hapi.run)

Co-Authored-By: HAPI <noreply@hapi.run>
1 parent 3258c52 commit 21d2417

21 files changed

Lines changed: 1971 additions & 17 deletions

hub/src/socket/server.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ export function createSocketServer(deps: SocketServerDeps): {
6666
const engine = new Engine({
6767
path: '/socket.io/',
6868
cors: corsOptions,
69+
maxHttpBufferSize: 55 * 1024 * 1024, // 55MB to match upload limit
6970
allowRequest: async (req) => {
7071
const origin = req.headers.get('origin')
7172
if (!origin || allowAllOrigins || corsOrigins.includes(origin)) {

hub/src/web/routes/voice.test.ts

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
import { describe, test, expect, afterEach } from 'bun:test'
2+
import { Hono } from 'hono'
3+
import type { WebAppEnv } from '../middleware/auth'
4+
import { createVoiceRoutes } from './voice'
5+
6+
function createApp() {
7+
const app = new Hono<WebAppEnv>()
8+
app.route('/api', createVoiceRoutes())
9+
return app
10+
}
11+
12+
describe('GET /api/voice/backend', () => {
13+
const originalEnv = process.env.VOICE_BACKEND
14+
15+
afterEach(() => {
16+
if (originalEnv === undefined) {
17+
delete process.env.VOICE_BACKEND
18+
} else {
19+
process.env.VOICE_BACKEND = originalEnv
20+
}
21+
})
22+
23+
test('returns elevenlabs by default', async () => {
24+
delete process.env.VOICE_BACKEND
25+
const app = createApp()
26+
const res = await app.request('/api/voice/backend')
27+
expect(res.status).toBe(200)
28+
const body = await res.json() as { backend: string }
29+
expect(body.backend).toBe('elevenlabs')
30+
})
31+
32+
test('returns gemini-live when configured', async () => {
33+
process.env.VOICE_BACKEND = 'gemini-live'
34+
const app = createApp()
35+
const res = await app.request('/api/voice/backend')
36+
expect(res.status).toBe(200)
37+
const body = await res.json() as { backend: string }
38+
expect(body.backend).toBe('gemini-live')
39+
})
40+
41+
test('returns qwen-realtime when configured', async () => {
42+
process.env.VOICE_BACKEND = 'qwen-realtime'
43+
const app = createApp()
44+
const res = await app.request('/api/voice/backend')
45+
expect(res.status).toBe(200)
46+
const body = await res.json() as { backend: string }
47+
expect(body.backend).toBe('qwen-realtime')
48+
})
49+
50+
test('falls back to elevenlabs for unknown values', async () => {
51+
process.env.VOICE_BACKEND = 'unknown-backend'
52+
const app = createApp()
53+
const res = await app.request('/api/voice/backend')
54+
expect(res.status).toBe(200)
55+
const body = await res.json() as { backend: string }
56+
expect(body.backend).toBe('elevenlabs')
57+
})
58+
})
59+
60+
describe('POST /api/voice/gemini-token', () => {
61+
const origGemini = process.env.GEMINI_API_KEY
62+
const origGoogle = process.env.GOOGLE_API_KEY
63+
64+
afterEach(() => {
65+
if (origGemini === undefined) delete process.env.GEMINI_API_KEY
66+
else process.env.GEMINI_API_KEY = origGemini
67+
if (origGoogle === undefined) delete process.env.GOOGLE_API_KEY
68+
else process.env.GOOGLE_API_KEY = origGoogle
69+
})
70+
71+
test('returns 400 when no API key configured', async () => {
72+
delete process.env.GEMINI_API_KEY
73+
delete process.env.GOOGLE_API_KEY
74+
const app = createApp()
75+
const res = await app.request('/api/voice/gemini-token', { method: 'POST' })
76+
expect(res.status).toBe(400)
77+
const body = await res.json() as { allowed: boolean; error: string }
78+
expect(body.allowed).toBe(false)
79+
expect(body.error).toContain('not configured')
80+
})
81+
82+
test('returns proxied wsUrl when GEMINI_API_KEY is set', async () => {
83+
process.env.GEMINI_API_KEY = 'test-gemini-key'
84+
delete process.env.GOOGLE_API_KEY
85+
const app = createApp()
86+
const res = await app.request('/api/voice/gemini-token', { method: 'POST' })
87+
expect(res.status).toBe(200)
88+
const body = await res.json() as { allowed: boolean; apiKey: string; wsUrl: string }
89+
expect(body.allowed).toBe(true)
90+
expect(body.apiKey).toBe('proxied')
91+
expect(body.wsUrl).toContain('/api/voice/gemini-ws')
92+
})
93+
94+
test('falls back to GOOGLE_API_KEY', async () => {
95+
delete process.env.GEMINI_API_KEY
96+
process.env.GOOGLE_API_KEY = 'test-google-key'
97+
const app = createApp()
98+
const res = await app.request('/api/voice/gemini-token', { method: 'POST' })
99+
expect(res.status).toBe(200)
100+
const body = await res.json() as { allowed: boolean; apiKey: string; wsUrl: string }
101+
expect(body.allowed).toBe(true)
102+
expect(body.apiKey).toBe('proxied')
103+
expect(body.wsUrl).toContain('/api/voice/gemini-ws')
104+
})
105+
})
106+
107+
describe('POST /api/voice/qwen-token', () => {
108+
const origDash = process.env.DASHSCOPE_API_KEY
109+
const origQwen = process.env.QWEN_API_KEY
110+
111+
afterEach(() => {
112+
if (origDash === undefined) delete process.env.DASHSCOPE_API_KEY
113+
else process.env.DASHSCOPE_API_KEY = origDash
114+
if (origQwen === undefined) delete process.env.QWEN_API_KEY
115+
else process.env.QWEN_API_KEY = origQwen
116+
})
117+
118+
test('returns 400 when no API key configured', async () => {
119+
delete process.env.DASHSCOPE_API_KEY
120+
delete process.env.QWEN_API_KEY
121+
const app = createApp()
122+
const res = await app.request('/api/voice/qwen-token', { method: 'POST' })
123+
expect(res.status).toBe(400)
124+
const body = await res.json() as { allowed: boolean; error: string }
125+
expect(body.allowed).toBe(false)
126+
expect(body.error).toContain('not configured')
127+
})
128+
129+
test('returns wsUrl when DASHSCOPE_API_KEY is set (no raw key exposed)', async () => {
130+
process.env.DASHSCOPE_API_KEY = 'test-dash-key'
131+
delete process.env.QWEN_API_KEY
132+
const app = createApp()
133+
const res = await app.request('/api/voice/qwen-token', { method: 'POST' })
134+
expect(res.status).toBe(200)
135+
const body = await res.json() as { allowed: boolean; wsUrl: string }
136+
expect(body.allowed).toBe(true)
137+
expect(body.wsUrl).toContain('/api/voice/qwen-ws')
138+
expect(body).not.toHaveProperty('apiKey')
139+
})
140+
141+
test('falls back to QWEN_API_KEY', async () => {
142+
delete process.env.DASHSCOPE_API_KEY
143+
process.env.QWEN_API_KEY = 'test-qwen-key'
144+
const app = createApp()
145+
const res = await app.request('/api/voice/qwen-token', { method: 'POST' })
146+
expect(res.status).toBe(200)
147+
const body = await res.json() as { allowed: boolean; wsUrl: string }
148+
expect(body.allowed).toBe(true)
149+
expect(body.wsUrl).toContain('/api/voice/qwen-ws')
150+
expect(body).not.toHaveProperty('apiKey')
151+
})
152+
})

hub/src/web/routes/voice.ts

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@ import type { WebAppEnv } from '../middleware/auth'
44
import {
55
ELEVENLABS_API_BASE,
66
VOICE_AGENT_NAME,
7-
buildVoiceAgentConfig
7+
buildVoiceAgentConfig,
8+
DEFAULT_VOICE_BACKEND
89
} from '@hapi/protocol/voice'
10+
import type { VoiceBackendType } from '@hapi/protocol/voice'
911

1012
const tokenRequestSchema = z.object({
1113
customAgentId: z.string().optional(),
@@ -116,6 +118,65 @@ async function getOrCreateAgentId(apiKey: string): Promise<string | null> {
116118
export function createVoiceRoutes(): Hono<WebAppEnv> {
117119
const app = new Hono<WebAppEnv>()
118120

121+
// Return the configured voice backend type
122+
app.get('/voice/backend', (c) => {
123+
const raw = process.env.VOICE_BACKEND
124+
const backend: VoiceBackendType =
125+
raw === 'gemini-live' ? 'gemini-live'
126+
: raw === 'qwen-realtime' ? 'qwen-realtime'
127+
: DEFAULT_VOICE_BACKEND
128+
return c.json({ backend })
129+
})
130+
131+
// Check Gemini availability and return the hub WS proxy URL for Gemini Live voice sessions
132+
// Gemini Live API does not support ephemeral tokens, so the hub proxies the WebSocket instead.
133+
// The actual API key is never sent to the browser — it stays server-side in the WS proxy.
134+
app.post('/voice/gemini-token', async (c) => {
135+
const apiKey = process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY
136+
if (!apiKey) {
137+
return c.json({
138+
allowed: false,
139+
error: 'Gemini API key not configured (set GEMINI_API_KEY or GOOGLE_API_KEY)'
140+
}, 400)
141+
}
142+
143+
// Use server-side WS proxy to avoid region restrictions.
144+
// The proxy at /api/voice/gemini-ws handles the API key server-side.
145+
// Derive wsUrl from the request origin so remote browsers connect back to the hub,
146+
// not to localhost. HAPI_PUBLIC_URL overrides when set (e.g. behind a reverse proxy).
147+
const requestOrigin = new URL(c.req.url).origin
148+
const publicUrl = process.env.HAPI_PUBLIC_URL || requestOrigin
149+
const wsProxyUrl = publicUrl.replace(/^http/, 'ws') + '/api/voice/gemini-ws'
150+
151+
return c.json({
152+
allowed: true,
153+
apiKey: 'proxied', // Dummy — key is handled server-side
154+
wsUrl: wsProxyUrl, // Always proxy — env WS URLs are upstream-only (server-side)
155+
baseUrl: process.env.GEMINI_API_BASE || undefined
156+
})
157+
})
158+
159+
// Check Qwen (DashScope) availability for Qwen Realtime voice sessions
160+
// The actual API key is never sent to the browser — it stays server-side in the WS proxy.
161+
app.post('/voice/qwen-token', async (c) => {
162+
const apiKey = process.env.DASHSCOPE_API_KEY || process.env.QWEN_API_KEY
163+
if (!apiKey) {
164+
return c.json({
165+
allowed: false,
166+
error: 'DashScope API key not configured (set DASHSCOPE_API_KEY or QWEN_API_KEY)'
167+
}, 400)
168+
}
169+
170+
const requestOrigin = new URL(c.req.url).origin
171+
const publicUrl = process.env.HAPI_PUBLIC_URL || requestOrigin
172+
const wsProxyUrl = publicUrl.replace(/^http/, 'ws') + '/api/voice/qwen-ws'
173+
174+
return c.json({
175+
allowed: true,
176+
wsUrl: wsProxyUrl // Always proxy — env WS URLs are upstream-only (server-side)
177+
})
178+
})
179+
119180
// Get ElevenLabs ConvAI conversation token
120181
app.post('/voice/token', async (c) => {
121182
const json = await c.req.json().catch(() => null)

0 commit comments

Comments
 (0)