1- import type { AdminUser , IAdminForth } from "adminforth" ;
1+ import type { AdminUser , AudioAdapter , IAdminForth } from "adminforth" ;
22import { logger } from "adminforth" ;
33import { randomUUID } from "crypto" ;
44import { HumanMessage , SystemMessage } from "langchain" ;
@@ -11,6 +11,7 @@ import type { AgentEventEmitter } from "./agentEvents.js";
1111import { buildAgentTurnSystemPrompt } from "./agent/systemPrompt.js" ;
1212import type { CurrentPageContext } from "./agent/tools/getUserLocation.js" ;
1313import { isAbortError , getErrorMessage } from "./errors.js" ;
14+ import { sanitizeSpeechText } from "./sanitizeSpeechText.js" ;
1415import type { AgentSessionStore } from "./sessionStore.js" ;
1516import type { PluginOptions } from "./types.js" ;
1617
@@ -54,6 +55,15 @@ export type HandleTurnInput = Omit<RunAndPersistAgentResponseInput, "failureLogM
5455 abortLogMessage ?: string ;
5556} ;
5657
/**
 * Input for a spoken agent turn.
 *
 * Same shape as `HandleTurnInput` minus `prompt`: the prompt is not supplied
 * by the caller but obtained by transcribing `audio` with `audioAdapter`.
 */
export type HandleSpeechTurnInput = Omit<HandleTurnInput, "prompt"> & {
  /** Recorded audio whose fields are handed to `audioAdapter.transcribe`. */
  audio: {
    buffer: Buffer;
    filename: string;
    mimeType: string;
  };
  /** Adapter used both to transcribe the audio and to synthesize the reply. */
  audioAdapter: AudioAdapter;
};
66+
5767type AgentTurnServiceOptions = {
5868 getAdminforth : ( ) => IAdminForth ;
5969 getPluginInstanceId : ( ) => string ;
@@ -342,6 +352,167 @@ export class AgentTurnService {
342352
343353 return agentResponse ;
344354 }
355+
/**
 * Handles one voice turn: transcribes the uploaded audio, runs the agent on
 * the transcript, then synthesizes the reply and streams it back as audio.
 *
 * Events sent through `input.emit`, in order: `transcript`, any `tool-call`
 * events produced by the agent run (all other agent events are suppressed),
 * `speech-response`, `audio-start`, zero or more `audio-delta`, `audio-done`.
 * Failures are reported as an `error` event. Every path ends with exactly one
 * `finish` event.
 *
 * @param input - Session/turn context plus the recorded audio and the adapter
 *   used for transcription and synthesis.
 * @returns `null` when no agent run happened (transcription failed, was
 *   aborted, or produced no text); otherwise the value returned by
 *   `runAndPersistAgentResponse`, including aborted or failed responses.
 */
async handleSpeechTurn(input: HandleSpeechTurnInput) {
  // Every exit path must close the event stream with a single "finish".
  const finish = () => input.emit({ type: "finish" });

  let transcription;

  try {
    transcription = await input.audioAdapter.transcribe({
      buffer: input.audio.buffer,
      filename: input.audio.filename,
      mimeType: input.audio.mimeType,
      language: "auto",
      abortSignal: input.abortSignal,
    });
  } catch (error) {
    if (input.abortSignal?.aborted || isAbortError(error)) {
      logger.info("Agent speech transcription aborted by the client");
      await finish();
      return null;
    }

    // The client only gets a generic message; details stay in the server log.
    logger.error(`Agent speech transcription failed:\n${getErrorMessage(error)}`);
    await input.emit({
      type: "error",
      error: "Speech transcription failed. Check server logs for details.",
    });
    await finish();
    return null;
  }

  // The adapter may resolve normally even though the client already left.
  if (input.abortSignal?.aborted) {
    await finish();
    return null;
  }

  const prompt = transcription.text;
  // Whitespace-only text counts as empty, so no agent run is started on a
  // blank prompt.
  if (!prompt?.trim()) {
    await input.emit({
      type: "error",
      error: "Speech transcription is empty",
    });
    await finish();
    return null;
  }

  await input.emit({
    type: "transcript",
    text: transcription.text,
    language: transcription.language,
  });

  const agentResponse = await this.runAndPersistAgentResponse({
    prompt,
    sessionId: input.sessionId,
    modeName: input.modeName,
    userTimeZone: input.userTimeZone,
    currentPage: input.currentPage,
    abortSignal: input.abortSignal,
    adminUser: input.adminUser,
    // Only tool calls are forwarded during the agent run; the reply text is
    // delivered afterwards in a single "speech-response" event.
    emit: async (event) => {
      if (event.type === "tool-call") {
        await input.emit(event);
      }
    },
    failureLogMessage: input.failureLogMessage ?? "Agent speech response failed",
    abortLogMessage: input.abortLogMessage ?? "Agent speech response aborted by the client",
  });

  if (agentResponse.aborted) {
    await finish();
    return agentResponse;
  }

  if (agentResponse.failed) {
    await input.emit({
      type: "error",
      error: agentResponse.text,
    });
    await finish();
    return agentResponse;
  }

  try {
    await input.emit({
      type: "speech-response",
      transcript: {
        text: transcription.text,
        language: transcription.language,
      },
      response: {
        text: agentResponse.text,
      },
      sessionId: input.sessionId,
      turnId: agentResponse.turnId,
    });

    const speech = await input.audioAdapter.synthesize({
      text: sanitizeSpeechText(agentResponse.text),
      stream: true,
      streamFormat: "audio",
      format: "pcm",
      abortSignal: input.abortSignal,
    });

    await input.emit({
      type: "audio-start",
      mimeType: speech.mimeType,
      format: speech.format,
      // NOTE(review): hard-coded PCM layout; assumes the adapter's "pcm"
      // output is 24 kHz, mono, 16-bit. TODO confirm against the adapter.
      sampleRate: 24000,
      channelCount: 1,
      bitsPerSample: 16,
    });

    const reader = speech.audioStream.getReader();
    // Cancelling the reader settles a pending read(), so an abort unblocks the
    // loop below even while it is waiting for the next chunk.
    const cancelAudioStream = () => {
      void reader.cancel().catch(() => undefined);
    };
    let streamDrained = false;

    try {
      input.abortSignal?.addEventListener("abort", cancelAudioStream, { once: true });

      while (true) {
        // Covers a signal that was already aborted before the listener was
        // attached, in which case the listener never fires.
        if (input.abortSignal?.aborted) {
          break;
        }

        const { value, done } = await reader.read();

        if (done) {
          streamDrained = true;
          break;
        }

        // Drop a chunk that arrived concurrently with the abort.
        if (input.abortSignal?.aborted) {
          break;
        }

        await input.emit({
          type: "audio-delta",
          value,
        });
      }
    } finally {
      input.abortSignal?.removeEventListener("abort", cancelAudioStream);
      // Leaving early (abort, read error, or a failing emit) must not leave
      // the synthesized stream running; cancelling twice is harmless.
      if (!streamDrained) {
        await reader.cancel().catch(() => undefined);
      }
      reader.releaseLock();
    }

    // An abort ends the loop without throwing (a cancelled read() reports
    // done), so it is detected here. Truncated audio is not reported as
    // "audio-done", matching the other abort paths.
    if (input.abortSignal?.aborted) {
      logger.info("Agent speech audio streaming aborted by the client");
      await finish();
      return agentResponse;
    }

    await input.emit({ type: "audio-done" });
    await finish();
    return agentResponse;
  } catch (error) {
    if (input.abortSignal?.aborted || isAbortError(error)) {
      logger.info("Agent speech audio streaming aborted by the client");
    } else {
      logger.error(`Agent speech audio streaming failed:\n${getErrorMessage(error)}`);
      // NOTE(review): unlike transcription failures, the raw error message is
      // forwarded to the client here. Confirm this is intended.
      await input.emit({
        type: "error",
        error: getErrorMessage(error),
      });
    }
    await finish();
    return agentResponse;
  }
}
345516}
346517
347518function getPartialVegaLiteFenceStartLength ( text : string ) : number {
0 commit comments