Skip to content

Commit f5cbd0e

Browse files
committed
fix(voice): address PR review — add JWT auth to WS proxy, stop leaking API keys
- Add JWT token verification to the /api/voice/gemini-ws and /api/voice/qwen-ws WebSocket endpoints before upgrading, preventing unauthenticated access to server-side API credentials (BLOCKER)
- Revert DEFAULT_VOICE_BACKEND to 'elevenlabs' so existing installs that only configured ElevenLabs are not broken (MAJOR)
- Remove the raw DashScope API key from the /voice/qwen-token response; the hub proxy handles the key server-side, so the browser never needs it (MAJOR)
- Update the frontend to pass the JWT via the ?token= query param on WS connections
- Change the composer send shortcut from Enter to Ctrl/Cmd+Enter
1 parent bde20fa commit f5cbd0e

8 files changed

Lines changed: 138 additions & 32 deletions

File tree

hub/src/web/routes/voice.ts

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -140,16 +140,21 @@ export function createVoiceRoutes(): Hono<WebAppEnv> {
140140
}, 400)
141141
}
142142

143+
// Use server-side WS proxy to avoid region restrictions.
144+
// The proxy at /api/voice/gemini-ws handles the API key server-side.
145+
const publicUrl = process.env.HAPI_PUBLIC_URL || `http://localhost:${process.env.HAPI_LISTEN_PORT || '24888'}`
146+
const wsProxyUrl = publicUrl.replace(/^http/, 'ws') + '/api/voice/gemini-ws'
147+
143148
return c.json({
144149
allowed: true,
145-
apiKey,
146-
// Optional overrides for proxy/relay setups
147-
wsUrl: process.env.GEMINI_LIVE_WS_URL || undefined,
150+
apiKey: 'proxied', // Dummy — key is handled server-side
151+
wsUrl: process.env.GEMINI_LIVE_WS_URL || wsProxyUrl,
148152
baseUrl: process.env.GEMINI_API_BASE || undefined
149153
})
150154
})
151155

152-
// Get Qwen (DashScope) API key for Qwen Realtime voice sessions
156+
// Check Qwen (DashScope) availability for Qwen Realtime voice sessions
157+
// The actual API key is never sent to the browser — it stays server-side in the WS proxy.
153158
app.post('/voice/qwen-token', async (c) => {
154159
const apiKey = process.env.DASHSCOPE_API_KEY || process.env.QWEN_API_KEY
155160
if (!apiKey) {
@@ -159,10 +164,12 @@ export function createVoiceRoutes(): Hono<WebAppEnv> {
159164
}, 400)
160165
}
161166

167+
const publicUrl = process.env.HAPI_PUBLIC_URL || `http://localhost:${process.env.HAPI_LISTEN_PORT || '24888'}`
168+
const wsProxyUrl = publicUrl.replace(/^http/, 'ws') + '/api/voice/qwen-ws'
169+
162170
return c.json({
163171
allowed: true,
164-
apiKey,
165-
wsUrl: process.env.QWEN_REALTIME_WS_URL || undefined
172+
wsUrl: process.env.QWEN_REALTIME_WS_URL || wsProxyUrl
166173
})
167174
})
168175

hub/src/web/server.ts

Lines changed: 91 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,54 @@ import type { SSEManager } from '../sse/sseManager'
2323
import type { VisibilityTracker } from '../visibility/visibilityTracker'
2424
import type { Server as BunServer, ServerWebSocket } from 'bun'
2525
import type { Server as SocketEngine } from '@socket.io/bun-engine'
26+
import { jwtVerify } from 'jose'
27+
28+
// Gemini Live WebSocket proxy — relays browser WS to Google, bypassing region restrictions
29+
/**
 * Builds the `open` / `message` / `close` callbacks for a WebSocket proxy that
 * relays frames between a browser client socket and the Gemini Live upstream
 * endpoint.
 *
 * The API key is read from `clientWs.data.apiKey` and appended to the upstream
 * URL as `?key=`. `GEMINI_LIVE_WS_URL` overrides the default Google endpoint.
 *
 * @returns An object with `open`, `message` and `close` handlers, each taking
 *          the client `ServerWebSocket` as its first argument.
 */
function createGeminiProxyWebSocketHandler() {
    const GEMINI_WS_BASE = 'wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent'
    // Upper bound on client frames held while the upstream socket is still connecting.
    const MAX_PENDING_MESSAGES = 256

    type ProxyPayload = string | ArrayBuffer | Uint8Array
    type ProxySession = { upstream: WebSocket; pending: ProxyPayload[] }
    type Closable = { close(code?: number, reason?: string): void }

    const sessions = new WeakMap<ServerWebSocket<unknown>, ProxySession>()

    // Close with the requested code/reason. If the runtime rejects them (the
    // WebSocket API only accepts 1000 and 3000-4999 from callers), fall back to
    // a bare close() so the socket is never left open by a swallowed exception.
    const closeQuietly = (socket: Closable, code?: number, reason?: string): void => {
        try {
            socket.close(code, reason)
            return
        } catch { /* code or reason rejected, or socket already closed */ }
        try { socket.close() } catch { /* already closed */ }
    }

    return {
        open(clientWs: ServerWebSocket<unknown>) {
            const data = clientWs.data as { _geminiProxy: boolean; apiKey: string }
            const upstreamUrl = `${process.env.GEMINI_LIVE_WS_URL || GEMINI_WS_BASE}?key=${encodeURIComponent(data.apiKey)}`

            const upstream = new WebSocket(upstreamUrl)
            // Request ArrayBuffer payloads so binary frames are never delivered as Blobs.
            upstream.binaryType = 'arraybuffer'

            const session: ProxySession = { upstream, pending: [] }
            sessions.set(clientWs, session)

            upstream.onopen = () => {
                // Flush frames the client sent while we were still connecting
                // (typically the setup message sent right after the client socket opened).
                const queued = session.pending.splice(0)
                try {
                    for (const payload of queued) {
                        upstream.send(payload)
                    }
                } catch {
                    closeQuietly(clientWs, 1011, 'Upstream error')
                }
            }
            upstream.onmessage = (event) => {
                try {
                    if (clientWs.readyState !== 1) {
                        return
                    }
                    const payload: unknown = event.data
                    if (typeof payload === 'string') {
                        clientWs.send(payload)
                    } else if (ArrayBuffer.isView(payload)) {
                        clientWs.send(new Uint8Array(payload.buffer, payload.byteOffset, payload.byteLength))
                    } else {
                        clientWs.send(new Uint8Array(payload as ArrayBuffer))
                    }
                } catch { /* client gone */ }
            }
            upstream.onerror = () => {
                closeQuietly(clientWs, 1011, 'Upstream error')
            }
            upstream.onclose = (event) => {
                closeQuietly(clientWs, event.code, event.reason)
                sessions.delete(clientWs)
            }
        },
        message(clientWs: ServerWebSocket<unknown>, message: string | ArrayBuffer | Uint8Array) {
            const session = sessions.get(clientWs)
            if (!session) {
                return
            }
            const { upstream, pending } = session
            if (upstream.readyState === WebSocket.OPEN) {
                upstream.send(message)
                return
            }
            if (upstream.readyState !== WebSocket.CONNECTING) {
                // Upstream is closing or closed: drop the frame.
                return
            }
            if (pending.length >= MAX_PENDING_MESSAGES) {
                closeQuietly(clientWs, 1011, 'Upstream not ready')
                return
            }
            // Copy binary frames before queueing, in case the runtime reuses the buffer after this callback returns.
            if (typeof message === 'string') {
                pending.push(message)
            } else if (message instanceof ArrayBuffer) {
                pending.push(message.slice(0))
            } else {
                pending.push(message.slice())
            }
        },
        close(clientWs: ServerWebSocket<unknown>, code: number, reason: string) {
            const session = sessions.get(clientWs)
            if (!session) {
                return
            }
            sessions.delete(clientWs)
            session.pending.length = 0
            closeQuietly(session.upstream, code, reason)
        }
    }
}
2674

2775
// Qwen Realtime WebSocket proxy — bridges browser (no custom headers) to DashScope (requires Authorization header)
2876
function createQwenProxyWebSocketHandler() {
@@ -284,6 +332,7 @@ export async function startWebServer(options: {
284332

285333
// Wrap socket.io websocket handler to also support Qwen Realtime proxy
286334
const originalWsHandler = socketHandler.websocket
335+
const geminiProxyHandler = createGeminiProxyWebSocketHandler()
287336
const qwenProxyHandler = createQwenProxyWebSocketHandler()
288337

289338
// eslint-disable-next-line @typescript-eslint/no-explicit-any
@@ -295,35 +344,70 @@ export async function startWebServer(options: {
295344
websocket: {
296345
...originalWsHandler,
297346
open(ws: unknown) {
298-
const wsAny = ws as ServerWebSocket<{ _qwenProxy?: boolean }>
299-
if (wsAny.data?._qwenProxy) {
347+
const wsAny = ws as ServerWebSocket<{ _qwenProxy?: boolean; _geminiProxy?: boolean }>
348+
if (wsAny.data?._geminiProxy) {
349+
geminiProxyHandler.open(wsAny)
350+
} else if (wsAny.data?._qwenProxy) {
300351
qwenProxyHandler.open(wsAny)
301352
} else {
302353
originalWsHandler.open?.(ws as never)
303354
}
304355
},
305356
message(ws: unknown, message: unknown) {
306-
const wsAny = ws as ServerWebSocket<{ _qwenProxy?: boolean }>
307-
if (wsAny.data?._qwenProxy) {
357+
const wsAny = ws as ServerWebSocket<{ _qwenProxy?: boolean; _geminiProxy?: boolean }>
358+
if (wsAny.data?._geminiProxy) {
359+
geminiProxyHandler.message(wsAny, message as string)
360+
} else if (wsAny.data?._qwenProxy) {
308361
qwenProxyHandler.message(wsAny, message as string)
309362
} else {
310363
originalWsHandler.message?.(ws as never, message as never)
311364
}
312365
},
313366
close(ws: unknown, code: number, reason: string) {
314-
const wsAny = ws as ServerWebSocket<{ _qwenProxy?: boolean }>
315-
if (wsAny.data?._qwenProxy) {
367+
const wsAny = ws as ServerWebSocket<{ _qwenProxy?: boolean; _geminiProxy?: boolean }>
368+
if (wsAny.data?._geminiProxy) {
369+
geminiProxyHandler.close(wsAny, code, reason)
370+
} else if (wsAny.data?._qwenProxy) {
316371
qwenProxyHandler.close(wsAny, code, reason)
317372
} else {
318373
originalWsHandler.close?.(ws as never, code as never, reason as never)
319374
}
320375
}
321376
},
322-
fetch: (req: Request, server: { upgrade: (req: Request, opts?: unknown) => boolean }) => {
377+
fetch: async (req: Request, server: { upgrade: (req: Request, opts?: unknown) => boolean }) => {
323378
const url = new URL(req.url)
324379
if (url.pathname.startsWith('/socket.io/')) {
325380
return socketHandler.fetch(req, server as never)
326381
}
382+
383+
// Voice WebSocket proxies — require JWT auth via query param
384+
// (browser WebSocket API cannot set custom headers)
385+
if (url.pathname === '/api/voice/gemini-ws' || url.pathname === '/api/voice/qwen-ws') {
386+
const token = url.searchParams.get('token')
387+
if (!token) {
388+
return new Response('Missing authorization token', { status: 401 })
389+
}
390+
try {
391+
await jwtVerify(token, options.jwtSecret, { algorithms: ['HS256'] })
392+
} catch {
393+
return new Response('Invalid token', { status: 401 })
394+
}
395+
}
396+
397+
// Gemini Live WebSocket proxy
398+
if (url.pathname === '/api/voice/gemini-ws') {
399+
const apiKey = process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY
400+
if (!apiKey) {
401+
return new Response('Gemini API key not configured', { status: 400 })
402+
}
403+
const upgraded = (server as unknown as { upgrade: (req: Request, opts: unknown) => boolean }).upgrade(req, {
404+
data: { _geminiProxy: true, apiKey }
405+
})
406+
if (!upgraded) {
407+
return new Response('WebSocket upgrade failed', { status: 500 })
408+
}
409+
return undefined as unknown as Response
410+
}
327411
// Qwen Realtime WebSocket proxy
328412
if (url.pathname === '/api/voice/qwen-ws') {
329413
const apiKey = process.env.DASHSCOPE_API_KEY || process.env.QWEN_API_KEY

shared/src/voice.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -277,7 +277,7 @@ export type VoiceBackendType = 'elevenlabs' | 'gemini-live' | 'qwen-realtime'
277277
export const QWEN_REALTIME_MODEL = 'qwen3-omni-flash-realtime'
278278
export const QWEN_REALTIME_VOICE = 'Mia'
279279

280-
export const DEFAULT_VOICE_BACKEND: VoiceBackendType = 'gemini-live'
280+
export const DEFAULT_VOICE_BACKEND: VoiceBackendType = 'elevenlabs'
281281

282282
export const GEMINI_LIVE_MODEL = 'gemini-2.5-flash-native-audio-latest'
283283

web/src/api/client.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -444,13 +444,17 @@ export class ApiClient {
444444
})
445445
}
446446

447+
/** Return the current auth token (for WebSocket query-param auth). */
448+
getAuthToken(): string | null {
449+
return this.getToken ? this.getToken() : this.token
450+
}
451+
447452
async fetchVoiceBackend(): Promise<{ backend: string }> {
448453
return await this.request('/api/voice/backend')
449454
}
450455

451456
async fetchQwenToken(): Promise<{
452457
allowed: boolean
453-
apiKey?: string
454458
wsUrl?: string
455459
error?: string
456460
}> {

web/src/api/voice.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,6 @@ export async function createOrUpdateHapiAgent(apiKey: string): Promise<CreateAge
166166

167167
export interface QwenTokenResponse {
168168
allowed: boolean
169-
apiKey?: string
170169
wsUrl?: string
171170
error?: string
172171
}

web/src/components/AssistantChat/HappyComposer.tsx

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -303,29 +303,29 @@ export function HappyComposer(props: {
303303
return
304304
}
305305

306-
// Shift+Enter inserts a newline (standard behavior)
307-
if (key === 'Enter' && e.shiftKey) {
308-
return // let default textarea behavior handle newline
309-
}
310-
311306
// Enter with suggestions visible: select the suggestion
312-
if (key === 'Enter' && suggestions.length > 0) {
307+
if (key === 'Enter' && suggestions.length > 0 && !e.ctrlKey && !e.metaKey) {
313308
e.preventDefault()
314309
const indexToSelect = selectedIndex >= 0 ? selectedIndex : 0
315310
handleSuggestionSelect(indexToSelect)
316311
return
317312
}
318313

319-
// Only plain Enter (no modifiers) sends; other modifier combos are ignored
320-
if (key === 'Enter') {
314+
// Ctrl+Enter (Windows/Linux) or Cmd+Enter (Mac) sends the message
315+
if (key === 'Enter' && (e.ctrlKey || e.metaKey)) {
321316
e.preventDefault()
322-
if (!e.ctrlKey && !e.altKey && !e.metaKey && canSend) {
317+
if (canSend) {
323318
api.composer().send()
324319
setShowContinueHint(false)
325320
}
326321
return
327322
}
328323

324+
// Plain Enter inserts a newline (default textarea behavior)
325+
if (key === 'Enter') {
326+
return
327+
}
328+
329329
if (suggestions.length > 0) {
330330
if (key === 'ArrowUp') {
331331
e.preventDefault()

web/src/realtime/GeminiLiveVoiceSession.tsx

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ import type { ApiClient } from '@/api/client'
1111
import type { Session } from '@/types/api'
1212
import type { GeminiFunctionCall } from './gemini/toolAdapter'
1313

14-
const DEBUG = import.meta.env.DEV
14+
const DEBUG = true
1515

1616
// Default Gemini Live WebSocket API endpoint (Google direct)
1717
const DEFAULT_GEMINI_LIVE_WS_BASE = 'wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent'
@@ -65,29 +65,40 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession {
6565
state.statusCallback?.('connecting')
6666

6767
// Get API key from hub
68+
console.log('[GeminiLive] Fetching token...')
6869
const tokenResp = await fetchGeminiToken(this.api)
70+
console.log('[GeminiLive] Token response:', { allowed: tokenResp.allowed, hasKey: !!tokenResp.apiKey, error: tokenResp.error })
6971
if (!tokenResp.allowed || !tokenResp.apiKey) {
7072
const msg = tokenResp.error ?? 'Gemini API key not available'
73+
console.error('[GeminiLive] Token failed:', msg)
7174
state.statusCallback?.('error', msg)
7275
throw new Error(msg)
7376
}
7477
state.apiKey = tokenResp.apiKey
7578
state.wsBaseUrl = tokenResp.wsUrl || null
7679

7780
// Request microphone
81+
console.log('[GeminiLive] Requesting microphone...')
7882
let permissionStream: MediaStream | null = null
7983
try {
8084
permissionStream = await navigator.mediaDevices.getUserMedia({ audio: true })
85+
console.log('[GeminiLive] Microphone granted')
8186
} catch (error) {
87+
console.error('[GeminiLive] Microphone denied:', error)
8288
state.statusCallback?.('error', 'Microphone permission denied')
8389
throw error
8490
} finally {
8591
permissionStream?.getTracks().forEach((t) => t.stop())
8692
}
8793

88-
// Connect WebSocket
94+
// Connect WebSocket — use proxy URL if provided (avoids region restrictions)
8995
const wsBase = state.wsBaseUrl || DEFAULT_GEMINI_LIVE_WS_BASE
90-
const wsUrl = `${wsBase}?key=${encodeURIComponent(state.apiKey)}`
96+
const isProxy = !!state.wsBaseUrl
97+
const authToken = this.api.getAuthToken() || ''
98+
const wsUrl = isProxy
99+
? `${wsBase}${wsBase.includes('?') ? '&' : '?'}token=${encodeURIComponent(authToken)}`
100+
: `${wsBase}?key=${encodeURIComponent(state.apiKey)}`
101+
console.log('[GeminiLive] Connecting WebSocket to:', wsBase, isProxy ? '(proxied)' : '(direct)')
91102
const ws = new WebSocket(wsUrl)
92103
state.ws = ws
93104

web/src/realtime/QwenVoiceSession.tsx

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -79,14 +79,14 @@ class QwenVoiceSessionImpl implements VoiceSession {
7979
cleanup()
8080
state.statusCallback?.('connecting')
8181

82-
// Get API key from hub
82+
// Check Qwen availability (hub no longer sends the raw API key)
8383
const tokenResp = await fetchQwenToken(this.api)
84-
if (!tokenResp.allowed || !tokenResp.apiKey) {
84+
if (!tokenResp.allowed) {
8585
const msg = tokenResp.error ?? 'DashScope API key not available'
8686
state.statusCallback?.('error', msg)
8787
throw new Error(msg)
8888
}
89-
state.apiKey = tokenResp.apiKey
89+
state.apiKey = null // key stays server-side
9090
state.wsBaseUrl = tokenResp.wsUrl || null
9191

9292
// Request microphone
@@ -105,7 +105,8 @@ class QwenVoiceSessionImpl implements VoiceSession {
105105
const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'
106106
const proxyBase = state.wsBaseUrl || `${protocol}//${window.location.host}`
107107
const model = QWEN_REALTIME_MODEL
108-
const wsUrl = `${proxyBase}/api/voice/qwen-ws?model=${encodeURIComponent(model)}`
108+
const authToken = this.api.getAuthToken() || ''
109+
const wsUrl = `${proxyBase}/api/voice/qwen-ws?model=${encodeURIComponent(model)}&token=${encodeURIComponent(authToken)}`
109110
const ws = new WebSocket(wsUrl)
110111
state.ws = ws
111112

0 commit comments

Comments
 (0)