Skip to content

Commit 10ec0c0

Browse files
feat(voice): add realtime dictation modes
1 parent e08f518 commit 10ec0c0

11 files changed

Lines changed: 623 additions & 9 deletions

File tree

hub/src/web/routes/voice.ts

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,19 @@ const tokenRequestSchema = z.object({
1414
customApiKey: z.string().optional()
1515
})
1616

17+
/** Body accepted by POST /voice/scribe-token; `customApiKey` is an optional string. */
const scribeTokenRequestSchema = z.object({ customApiKey: z.string().optional() })

/** Model ids accepted in the `modelId` form field of POST /voice/transcribe. */
const transcriptionModelSchema = z.enum(['scribe_v1', 'scribe_v2'])

/**
 * Lower-case language codes that normalizeTranscriptionLanguageCode is allowed
 * to return. Anything outside this set is treated as "no language hint".
 */
const SUPPORTED_ELEVENLABS_LANGUAGE_CODES = new Set<string>([
    'en',
    'ja',
    'zh',
    'de',
    'hi',
    'fr',
    'ko',
    'pt',
    'pt-br',
    'it',
    'es',
    'id',
    'nl',
    'tr',
    'pl',
    'sv',
    'bg',
    'ro',
    'ar',
    'cs',
    'el',
    'fi',
    'ms',
    'da',
    'ta',
    'uk',
    'ru',
    'hu',
    'hr',
    'sk',
    'no',
    'vi',
    'tl'
])
29+
1730
// Cache for auto-created agent IDs (keyed by API key hash)
1831
const agentIdCache = new Map<string, string>()
1932

@@ -30,6 +43,28 @@ interface ElevenLabsTool {
3043
}
3144
}
3245

46+
/**
 * Map a client-supplied language tag onto one of the codes in
 * SUPPORTED_ELEVENLABS_LANGUAGE_CODES.
 *
 * Matching is case-insensitive and treats `_` the same as `-`, so `pt_BR`,
 * `PT-br` and `pt-BR-x-foo` all resolve to `pt-br`, while tags such as
 * `en_US` or `en-GB` fall back to their base language `en`.
 *
 * @param raw - Language tag taken from the request, or null when none was sent.
 * @returns The matching supported code, or undefined when the tag is empty,
 *          whitespace-only, or has no supported equivalent.
 */
function normalizeTranscriptionLanguageCode(raw: string | null): string | undefined {
    if (!raw) return undefined

    // Canonicalise separators up front so POSIX-style tags ("pt_BR") follow the
    // same path as hyphenated ones ("pt-BR") instead of losing their region.
    const normalized = raw.trim().toLowerCase().replace(/_/g, '-')
    if (!normalized) return undefined

    // Exact hit, including the bare 'pt-br' tag.
    if (SUPPORTED_ELEVENLABS_LANGUAGE_CODES.has(normalized)) {
        return normalized
    }

    // Longer Brazilian-Portuguese tags keep their regional code rather than
    // collapsing to plain 'pt' in the base-language fallback below.
    if (normalized.startsWith('pt-br-')) {
        return 'pt-br'
    }

    // Fall back to the primary language subtag ("en-us" -> "en").
    const [base] = normalized.split('-')
    if (base && SUPPORTED_ELEVENLABS_LANGUAGE_CODES.has(base)) {
        return base
    }

    return undefined
}
67+
3368
/**
3469
* Find an existing "Hapi Voice Assistant" agent
3570
*/
@@ -315,5 +350,113 @@ export function createVoiceRoutes(): Hono<WebAppEnv> {
315350
}
316351
})
317352

353+
// POST /voice/transcribe — forwards one uploaded audio file to the ElevenLabs
// speech-to-text endpoint and relays the transcript.
//
// Multipart form fields read from the request:
//   file         - required, non-empty audio upload
//   modelId      - optional; 'scribe_v1' | 'scribe_v2', defaults to 'scribe_v2'
//   languageCode - optional hint; normalized, and only forwarded for scribe_v2
//
// Responds with { text, languageCode } on success. Failures respond with
// { error }: 400 for bad input or a missing server-side API key, 500 for
// upstream or network errors.
app.post('/voice/transcribe', async (c) => {
    const formData = await c.req.formData().catch(() => null)
    if (!formData) {
        return c.json({ error: 'Invalid form data' }, 400)
    }

    const file = formData.get('file')
    const modelIdRaw = formData.get('modelId')
    const languageCodeRaw = formData.get('languageCode')

    // A zero-byte upload carries no audio; reject it here rather than spending
    // an upstream request on it.
    if (!(file instanceof File) || file.size === 0) {
        return c.json({ error: 'Missing audio file' }, 400)
    }

    // A non-string modelId (absent, or sent as a file part) falls back to the default model.
    const modelIdParsed = transcriptionModelSchema.safeParse(
        typeof modelIdRaw === 'string' ? modelIdRaw : 'scribe_v2'
    )
    if (!modelIdParsed.success) {
        return c.json({ error: 'Invalid modelId' }, 400)
    }

    const apiKey = process.env.ELEVENLABS_API_KEY
    if (!apiKey) {
        return c.json({ error: 'ElevenLabs API key not configured' }, 400)
    }

    const upstreamFormData = new FormData()
    upstreamFormData.set('model_id', modelIdParsed.data)
    upstreamFormData.set('file', file, file.name || 'speech.webm')

    const languageCode = typeof languageCodeRaw === 'string'
        ? normalizeTranscriptionLanguageCode(languageCodeRaw)
        : undefined
    if (languageCode && modelIdParsed.data === 'scribe_v2') {
        upstreamFormData.set('language_code', languageCode)
    }

    try {
        const response = await fetch(`${ELEVENLABS_API_BASE}/speech-to-text`, {
            method: 'POST',
            headers: {
                'xi-api-key': apiKey,
                'Accept': 'application/json'
            },
            body: upstreamFormData
        })

        if (!response.ok) {
            // `?? {}` also covers a literal JSON `null` body, which would otherwise
            // throw on the property reads below and mask the upstream status.
            const errorData = ((await response.json().catch(() => null)) ?? {}) as {
                detail?: { message?: string } | string
                error?: string
            }
            const errorMessage = typeof errorData.detail === 'string'
                ? errorData.detail
                : errorData.detail?.message || errorData.error || `ElevenLabs API error: ${response.status}`
            return c.json({ error: errorMessage }, 500)
        }

        const data = await response.json() as { text?: string; language_code?: string }
        return c.json({
            text: data.text ?? '',
            languageCode: data.language_code
        })
    } catch (error) {
        return c.json({
            error: error instanceof Error ? error.message : 'Network error'
        }, 500)
    }
})
418+
419+
// POST /voice/scribe-token — asks ElevenLabs for a single-use realtime_scribe
// token and returns it as { token }.
//
// The JSON body is optional; when present it may carry `customApiKey`, which
// takes precedence over the ELEVENLABS_API_KEY environment variable.
//
// Failures respond with { error }: 400 for an invalid body or when no API key
// is available, 500 for upstream or network errors and for a response that
// lacks a token.
app.post('/voice/scribe-token', async (c) => {
    // A missing or unparsable body is treated as an empty object.
    const json = await c.req.json().catch(() => null)
    const parsed = scribeTokenRequestSchema.safeParse(json ?? {})
    if (!parsed.success) {
        return c.json({ error: 'Invalid request body' }, 400)
    }

    const apiKey = parsed.data.customApiKey || process.env.ELEVENLABS_API_KEY
    if (!apiKey) {
        return c.json({ error: 'ElevenLabs API key not configured' }, 400)
    }

    try {
        const response = await fetch(`${ELEVENLABS_API_BASE}/single-use-token/realtime_scribe`, {
            method: 'POST',
            headers: {
                'xi-api-key': apiKey,
                'Accept': 'application/json'
            }
        })

        if (!response.ok) {
            // `?? {}` also covers a literal JSON `null` body, which would otherwise
            // throw on the property reads below and mask the upstream status.
            const errorData = ((await response.json().catch(() => null)) ?? {}) as {
                detail?: { message?: string } | string
                error?: string
            }
            const errorMessage = typeof errorData.detail === 'string'
                ? errorData.detail
                : errorData.detail?.message || errorData.error || `ElevenLabs API error: ${response.status}`
            return c.json({ error: errorMessage }, 500)
        }

        const data = await response.json() as { token?: string }
        if (!data.token) {
            return c.json({ error: 'No token in ElevenLabs response' }, 500)
        }

        return c.json({ token: data.token })
    } catch (error) {
        return c.json({
            error: error instanceof Error ? error.message : 'Network error'
        }, 500)
    }
})
460+
318461
return app
319462
}

web/src/api/client.ts

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ import type {
1919
SpawnResponse,
2020
UploadFileResponse,
2121
VisibilityPayload,
22+
VoiceScribeTokenResponse,
23+
VoiceTranscriptionResponse,
2224
SessionResponse,
2325
SessionsResponse
2426
} from '@/types/api'
@@ -94,7 +96,7 @@ export class ApiClient {
9496
if (authToken) {
9597
headers.set('authorization', `Bearer ${authToken}`)
9698
}
97-
if (init?.body !== undefined && !headers.has('content-type')) {
99+
if (init?.body !== undefined && !(init.body instanceof FormData) && !headers.has('content-type')) {
98100
headers.set('content-type', 'application/json')
99101
}
100102

@@ -443,4 +445,28 @@ export class ApiClient {
443445
body: JSON.stringify(options || {})
444446
})
445447
}
448+
449+
/**
 * Upload an audio file to POST /api/voice/transcribe as multipart form data.
 *
 * @param file - Audio to transcribe; sent as the `file` field.
 * @param options - Optional `modelId` (sent as 'scribe_v2' when omitted) and
 *                  `languageCode` (only sent when non-empty).
 * @returns Whatever `this.request` resolves with for that endpoint.
 */
async transcribeVoice(
    file: File,
    options?: { modelId?: 'scribe_v1' | 'scribe_v2'; languageCode?: string }
): Promise<VoiceTranscriptionResponse> {
    const languageHint = options?.languageCode

    const payload = new FormData()
    payload.set('file', file)
    payload.set('modelId', options?.modelId ?? 'scribe_v2')
    if (languageHint) {
        payload.set('languageCode', languageHint)
    }

    return await this.request('/api/voice/transcribe', { method: 'POST', body: payload })
}
465+
466+
/**
 * Call POST /api/voice/scribe-token with an empty JSON object as the body.
 *
 * @returns Whatever `this.request` resolves with for that endpoint.
 */
async fetchVoiceScribeToken(): Promise<VoiceScribeTokenResponse> {
    const emptyBody = JSON.stringify({})
    return await this.request('/api/voice/scribe-token', { method: 'POST', body: emptyBody })
}
446472
}

web/src/components/AssistantChat/HappyComposer.tsx

Lines changed: 41 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import {
1212
useRef,
1313
useState
1414
} from 'react'
15+
import type { ApiClient } from '@/api/client'
1516
import type { AgentState, CodexCollaborationMode, PermissionMode } from '@/types/api'
1617
import type { Suggestion } from '@/hooks/useActiveSuggestions'
1718
import type { ConversationStatus } from '@/realtime/types'
@@ -23,6 +24,9 @@ import { usePWAInstall } from '@/hooks/usePWAInstall'
2324
import { supportsEffort, supportsModelChange } from '@hapi/protocol'
2425
import { markSkillUsed } from '@/lib/recent-skills'
2526
import { useComposerDraft } from '@/hooks/useComposerDraft'
27+
import { useElevenLabsTranscription } from '@/hooks/useElevenLabsTranscription'
28+
import { useSpeechToText } from '@/hooks/useSpeechToText'
29+
import { useVoiceMode } from '@/hooks/useVoiceMode'
2630
import { FloatingOverlay } from '@/components/ChatInput/FloatingOverlay'
2731
import { Autocomplete } from '@/components/ChatInput/Autocomplete'
2832
import { StatusBar } from '@/components/AssistantChat/StatusBar'
@@ -66,6 +70,7 @@ export function HappyComposer(props: {
6670
terminalUnsupported?: boolean
6771
autocompletePrefixes?: string[]
6872
autocompleteSuggestions?: (query: string) => Promise<Suggestion[]>
73+
voiceTranscriptionApi?: ApiClient
6974
// Voice assistant props
7075
voiceStatus?: ConversationStatus
7176
voiceMicMuted?: boolean
@@ -99,6 +104,7 @@ export function HappyComposer(props: {
99104
terminalUnsupported = false,
100105
autocompletePrefixes = ['@', '/', '$'],
101106
autocompleteSuggestions = defaultSuggestionHandler,
107+
voiceTranscriptionApi,
102108
voiceStatus = 'disconnected',
103109
voiceMicMuted = false,
104110
onVoiceToggle,
@@ -170,6 +176,7 @@ export function HappyComposer(props: {
170176
}, [controlledByUser])
171177

172178
const { haptic: platformHaptic, isTouch } = usePlatform()
179+
const { voiceMode } = useVoiceMode()
173180
const { isStandalone, isIOS } = usePWAInstall()
174181
const isIOSPWA = isIOS && isStandalone
175182
const bottomPaddingClass = isIOSPWA ? 'pb-0' : 'pb-3'
@@ -190,6 +197,34 @@ export function HappyComposer(props: {
190197
}
191198
}, [platformHaptic])
192199

200+
const dictation = useSpeechToText({
201+
getCurrentText: () => composerText,
202+
onTextChange: (text) => api.composer().setText(text)
203+
})
204+
const elevenLabsDictation = useElevenLabsTranscription({
205+
api: voiceTranscriptionApi ?? null,
206+
getCurrentText: () => composerText,
207+
onTextChange: (text) => api.composer().setText(text)
208+
})
209+
210+
const effectiveVoiceStatus = voiceMode === 'dictation-local'
211+
? dictation.status
212+
: voiceMode === 'dictation-elevenlabs'
213+
? elevenLabsDictation.status
214+
: voiceStatus
215+
const effectiveVoiceEnabled = voiceMode === 'dictation-local'
216+
? dictation.supported
217+
: voiceMode === 'dictation-elevenlabs'
218+
? elevenLabsDictation.supported
219+
: Boolean(onVoiceToggle)
220+
const effectiveVoiceMicMuted = voiceMode === 'assistant' ? voiceMicMuted : false
221+
const effectiveOnVoiceToggle = voiceMode === 'dictation-local'
222+
? dictation.toggle
223+
: voiceMode === 'dictation-elevenlabs'
224+
? elevenLabsDictation.toggle
225+
: onVoiceToggle
226+
const effectiveOnVoiceMicToggle = voiceMode === 'assistant' ? onVoiceMicToggle : undefined
227+
193228
const handleSuggestionSelect = useCallback((index: number) => {
194229
const suggestion = suggestions[index]
195230
if (!suggestion || !textareaRef.current) return
@@ -488,7 +523,7 @@ export function HappyComposer(props: {
488523
|| showEffortSettings
489524
)
490525
const showAbortButton = true
491-
const voiceEnabled = Boolean(onVoiceToggle)
526+
const voiceEnabled = effectiveVoiceEnabled
492527

493528
const handleSend = useCallback(() => {
494529
api.composer().send()
@@ -764,7 +799,7 @@ export function HappyComposer(props: {
764799
permissionMode={permissionMode}
765800
collaborationMode={collaborationMode}
766801
agentFlavor={agentFlavor}
767-
voiceStatus={voiceStatus}
802+
voiceStatus={effectiveVoiceStatus}
768803
/>
769804

770805
<div className="overflow-hidden rounded-[20px] bg-[var(--app-secondary-bg)]">
@@ -809,10 +844,10 @@ export function HappyComposer(props: {
809844
isSwitching={isSwitching}
810845
onSwitch={handleSwitch}
811846
voiceEnabled={voiceEnabled}
812-
voiceStatus={voiceStatus}
813-
voiceMicMuted={voiceMicMuted}
814-
onVoiceToggle={onVoiceToggle ?? (() => {})}
815-
onVoiceMicToggle={onVoiceMicToggle}
847+
voiceStatus={effectiveVoiceStatus}
848+
voiceMicMuted={effectiveVoiceMicMuted}
849+
onVoiceToggle={effectiveOnVoiceToggle ?? (() => {})}
850+
onVoiceMicToggle={effectiveOnVoiceMicToggle}
816851
onSend={handleSend}
817852
/>
818853
</div>

web/src/components/SessionChat.tsx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -421,6 +421,7 @@ export function SessionChat(props: {
421421
onTerminal={props.session.active && terminalSupported ? handleViewTerminal : undefined}
422422
terminalUnsupported={props.session.active && !terminalSupported}
423423
autocompleteSuggestions={props.autocompleteSuggestions}
424+
voiceTranscriptionApi={props.api}
424425
voiceStatus={voice?.status}
425426
voiceMicMuted={voice?.micMuted}
426427
onVoiceToggle={voice ? handleVoiceToggle : undefined}

0 commit comments

Comments
 (0)