@@ -14,6 +14,19 @@ const tokenRequestSchema = z.object({
1414 customApiKey : z . string ( ) . optional ( )
1515} )
1616
/**
 * Shape of the optional JSON body accepted by `POST /voice/scribe-token`.
 * `customApiKey`, when present, is used in place of the server's
 * `ELEVENLABS_API_KEY` environment variable.
 */
const scribeTokenRequestSchema = z.object({ customApiKey: z.string().optional() })
20+
/**
 * Model identifiers accepted for the `modelId` form field of
 * `POST /voice/transcribe`.
 */
const transcriptionModelSchema = z.enum([
    'scribe_v1',
    'scribe_v2'
])
22+
/**
 * Lower-case language codes that `normalizeTranscriptionLanguageCode` is
 * allowed to return. Membership is tested against trimmed, lower-cased input.
 */
const SUPPORTED_ELEVENLABS_LANGUAGE_CODE_LIST = [
    'en', 'ja', 'zh', 'de', 'hi', 'fr', 'ko',
    'pt', 'pt-br', 'it', 'es', 'id', 'nl', 'tr', 'pl', 'sv', 'bg',
    'ro', 'ar', 'cs', 'el', 'fi', 'ms', 'da', 'ta', 'uk', 'ru',
    'hu', 'hr', 'sk', 'no', 'vi', 'tl'
]

const SUPPORTED_ELEVENLABS_LANGUAGE_CODES = new Set(SUPPORTED_ELEVENLABS_LANGUAGE_CODE_LIST)
29+
// Cache for auto-created agent IDs (keyed by API key hash).
// Starts empty; both keys and values are strings.
const agentIdCache: Map<string, string> = new Map()
1932
@@ -30,6 +43,28 @@ interface ElevenLabsTool {
3043 }
3144}
3245
/**
 * Map a caller-supplied language or locale tag onto one of the codes in
 * `SUPPORTED_ELEVENLABS_LANGUAGE_CODES`.
 *
 * Resolution order, after trimming, lower-casing and treating `_` as `-`:
 *   1. the whole tag, if it is a supported code (e.g. `en`, `pt-br`);
 *   2. `pt-br`, if the tag is `pt-br` followed by further subtags;
 *   3. the primary subtag (text before the first `-`), if supported.
 *
 * @param raw - Tag as received from the client; may be null or blank.
 * @returns The supported code, or `undefined` when nothing matches.
 */
function normalizeTranscriptionLanguageCode(raw: string | null): string | undefined {
    if (!raw) return undefined

    // Canonicalise the separator first so `pt_BR` and `pt-BR` take the same
    // path; otherwise the underscore form would skip the `pt-br` handling
    // below and collapse to plain `pt`.
    const normalized = raw.trim().toLowerCase().replace(/_/g, '-')
    if (!normalized) return undefined

    if (SUPPORTED_ELEVENLABS_LANGUAGE_CODES.has(normalized)) {
        return normalized
    }

    // `pt-br` is the only supported code that itself contains a separator,
    // so longer tags built on it must be matched before falling back to
    // the primary subtag.
    if (normalized.startsWith('pt-br-')) {
        return 'pt-br'
    }

    const base = normalized.split('-')[0]
    if (base && SUPPORTED_ELEVENLABS_LANGUAGE_CODES.has(base)) {
        return base
    }

    return undefined
}
67+
3368/**
3469 * Find an existing "Hapi Voice Assistant" agent
3570 */
@@ -315,5 +350,113 @@ export function createVoiceRoutes(): Hono<WebAppEnv> {
315350 }
316351 } )
317352
// POST /voice/transcribe
// Forwards an uploaded audio file to the ElevenLabs speech-to-text endpoint.
// Multipart fields read: `file` (required), `modelId` (optional) and
// `languageCode` (optional). Replies with `{ text, languageCode }` on success
// and `{ error }` with status 400 (bad input / missing key) or 500 (upstream
// or network failure) otherwise.
app.post('/voice/transcribe', async (c) => {
    type UpstreamError = { detail?: { message?: string } | string; error?: string }

    const form = await c.req.formData().catch(() => null)
    if (!form) {
        return c.json({ error: 'Invalid form data' }, 400)
    }

    const audio = form.get('file')
    const requestedModel = form.get('modelId')
    const requestedLanguage = form.get('languageCode')

    if (!(audio instanceof File)) {
        return c.json({ error: 'Missing audio file' }, 400)
    }

    // A missing or non-string modelId falls back to 'scribe_v2'.
    const model = transcriptionModelSchema.safeParse(
        typeof requestedModel === 'string' ? requestedModel : 'scribe_v2'
    )
    if (!model.success) {
        return c.json({ error: 'Invalid modelId' }, 400)
    }

    const apiKey = process.env.ELEVENLABS_API_KEY
    if (!apiKey) {
        return c.json({ error: 'ElevenLabs API key not configured' }, 400)
    }

    let languageCode: string | undefined
    if (typeof requestedLanguage === 'string') {
        languageCode = normalizeTranscriptionLanguageCode(requestedLanguage)
    }

    const outbound = new FormData()
    outbound.set('model_id', model.data)
    outbound.set('file', audio, audio.name || 'speech.webm')
    // The language hint is only forwarded when the scribe_v2 model is selected.
    if (model.data === 'scribe_v2' && languageCode) {
        outbound.set('language_code', languageCode)
    }

    try {
        const headers = {
            'xi-api-key': apiKey,
            'Accept': 'application/json'
        }
        const response = await fetch(`${ELEVENLABS_API_BASE}/speech-to-text`, {
            method: 'POST',
            headers,
            body: outbound
        })

        if (!response.ok) {
            // A non-JSON error body is treated as an empty object.
            const failure = await response.json().catch(() => ({})) as UpstreamError
            let message: string
            if (typeof failure.detail === 'string') {
                message = failure.detail
            } else {
                message = failure.detail?.message || failure.error || `ElevenLabs API error: ${response.status}`
            }
            return c.json({ error: message }, 500)
        }

        const result = await response.json() as { text?: string; language_code?: string }
        const text = result.text ?? ''
        return c.json({ text, languageCode: result.language_code })
    } catch (error) {
        let message = 'Network error'
        if (error instanceof Error) {
            message = error.message
        }
        return c.json({ error: message }, 500)
    }
})
418+
// POST /voice/scribe-token
// Requests a single-use `realtime_scribe` token from ElevenLabs and returns it
// as `{ token }`. The JSON body is optional and may carry `customApiKey`.
// Failures are reported as `{ error }` with status 400 (invalid body / no key)
// or 500 (upstream error, missing token, or network failure).
app.post('/voice/scribe-token', async (c) => {
    type UpstreamError = { detail?: { message?: string } | string; error?: string }

    // A missing or unparseable JSON body is validated as an empty object.
    const body = await c.req.json().catch(() => null)
    const request = scribeTokenRequestSchema.safeParse(body ?? {})
    if (!request.success) {
        return c.json({ error: 'Invalid request body' }, 400)
    }

    // A non-empty caller-supplied key takes precedence over the environment key.
    const apiKey = request.data.customApiKey || process.env.ELEVENLABS_API_KEY
    if (!apiKey) {
        return c.json({ error: 'ElevenLabs API key not configured' }, 400)
    }

    try {
        const headers = {
            'xi-api-key': apiKey,
            'Accept': 'application/json'
        }
        const response = await fetch(`${ELEVENLABS_API_BASE}/single-use-token/realtime_scribe`, {
            method: 'POST',
            headers
        })

        if (!response.ok) {
            // A non-JSON error body is treated as an empty object.
            const failure = await response.json().catch(() => ({})) as UpstreamError
            let message: string
            if (typeof failure.detail === 'string') {
                message = failure.detail
            } else {
                message = failure.detail?.message || failure.error || `ElevenLabs API error: ${response.status}`
            }
            return c.json({ error: message }, 500)
        }

        const { token } = await response.json() as { token?: string }
        if (token) {
            return c.json({ token })
        }
        return c.json({ error: 'No token in ElevenLabs response' }, 500)
    } catch (error) {
        let message = 'Network error'
        if (error instanceof Error) {
            message = error.message
        }
        return c.json({ error: message }, 500)
    }
})
460+
318461 return app
319462}
0 commit comments