Skip to content

Commit f8f1132

Browse files
committed
refactor: add handleSpeechTurn method to process audio input and integrate with agent response
1 parent 6a5f6a5 commit f8f1132

4 files changed

Lines changed: 184 additions & 154 deletions

File tree

agentTurnService.ts

Lines changed: 172 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import type { AdminUser, IAdminForth } from "adminforth";
1+
import type { AdminUser, AudioAdapter, IAdminForth } from "adminforth";
22
import { logger } from "adminforth";
33
import { randomUUID } from "crypto";
44
import { HumanMessage, SystemMessage } from "langchain";
@@ -11,6 +11,7 @@ import type { AgentEventEmitter } from "./agentEvents.js";
1111
import { buildAgentTurnSystemPrompt } from "./agent/systemPrompt.js";
1212
import type { CurrentPageContext } from "./agent/tools/getUserLocation.js";
1313
import { isAbortError, getErrorMessage } from "./errors.js";
14+
import { sanitizeSpeechText } from "./sanitizeSpeechText.js";
1415
import type { AgentSessionStore } from "./sessionStore.js";
1516
import type { PluginOptions } from "./types.js";
1617

@@ -54,6 +55,15 @@ export type HandleTurnInput = Omit<RunAndPersistAgentResponseInput, "failureLogM
5455
abortLogMessage?: string;
5556
};
5657

58+
/**
 * Input accepted by `AgentTurnService.handleSpeechTurn`.
 *
 * Carries every field of `HandleTurnInput` except `prompt` (the prompt is
 * derived from the transcribed audio instead), plus the audio adapter and the
 * recorded audio payload.
 */
export type HandleSpeechTurnInput = Omit<HandleTurnInput, "prompt"> & {
  /** Adapter whose `transcribe` and `synthesize` methods are called for this turn. */
  audioAdapter: AudioAdapter;
  /** Recorded audio; the three fields are passed to `audioAdapter.transcribe` as-is. */
  audio: {
    buffer: Buffer;
    filename: string;
    mimeType: string;
  };
};
66+
5767
type AgentTurnServiceOptions = {
5868
getAdminforth: () => IAdminForth;
5969
getPluginInstanceId: () => string;
@@ -342,6 +352,167 @@ export class AgentTurnService {
342352

343353
return agentResponse;
344354
}
355+
356+
/**
 * Runs one voice turn end to end.
 *
 * Steps:
 *  1. Transcribe `input.audio` with `input.audioAdapter.transcribe`.
 *  2. Emit a `transcript` event and run the agent on the transcribed text via
 *     `runAndPersistAgentResponse` (only `tool-call` events from that run are
 *     forwarded to `input.emit`).
 *  3. Emit a `speech-response` event, synthesize the sanitized response text
 *     as streamed PCM audio and forward it as `audio-start`, `audio-delta`
 *     and `audio-done` events.
 *
 * Every exit path emits a final `finish` event.
 *
 * @param input Turn parameters, the audio adapter and the recorded audio.
 * @returns `null` when transcription failed, was aborted or produced no text;
 *   otherwise the result of `runAndPersistAgentResponse` (also returned when
 *   the agent run was aborted/failed or when audio streaming failed).
 */
async handleSpeechTurn(input: HandleSpeechTurnInput) {
  let transcription;

  try {
    transcription = await input.audioAdapter.transcribe({
      buffer: input.audio.buffer,
      filename: input.audio.filename,
      mimeType: input.audio.mimeType,
      language: "auto",
      abortSignal: input.abortSignal,
    });
  } catch (error) {
    if (input.abortSignal?.aborted || isAbortError(error)) {
      logger.info("Agent speech transcription aborted by the client");
      await input.emit({ type: "finish" });
      return null;
    }

    // Details go to the server log only; the client gets a generic message.
    logger.error(`Agent speech transcription failed:\n${getErrorMessage(error)}`);
    await input.emit({
      type: "error",
      error: "Speech transcription failed. Check server logs for details.",
    });
    await input.emit({ type: "finish" });
    return null;
  }

  // The adapter may resolve normally even though the client aborted meanwhile.
  if (input.abortSignal?.aborted) {
    await input.emit({ type: "finish" });
    return null;
  }

  const prompt = transcription.text;
  // A whitespace-only transcript carries no request, so treat it as empty
  // instead of spending an agent run on it.
  if (!prompt?.trim()) {
    await input.emit({
      type: "error",
      error: "Speech transcription is empty",
    });
    await input.emit({ type: "finish" });
    return null;
  }

  await input.emit({
    type: "transcript",
    text: transcription.text,
    language: transcription.language,
  });

  const agentResponse = await this.runAndPersistAgentResponse({
    prompt,
    sessionId: input.sessionId,
    modeName: input.modeName,
    userTimeZone: input.userTimeZone,
    currentPage: input.currentPage,
    abortSignal: input.abortSignal,
    adminUser: input.adminUser,
    // Only tool-call events are forwarded; every other event of the agent run
    // is dropped here and the final text is delivered via `speech-response`.
    emit: async (event) => {
      if (event.type === "tool-call") {
        await input.emit(event);
      }
    },
    failureLogMessage: input.failureLogMessage ?? "Agent speech response failed",
    abortLogMessage: input.abortLogMessage ?? "Agent speech response aborted by the client",
  });

  if (agentResponse.aborted) {
    await input.emit({ type: "finish" });
    return agentResponse;
  }

  if (agentResponse.failed) {
    await input.emit({
      type: "error",
      error: agentResponse.text,
    });
    await input.emit({ type: "finish" });
    return agentResponse;
  }

  try {
    await input.emit({
      type: "speech-response",
      transcript: {
        text: transcription.text,
        language: transcription.language,
      },
      response: {
        text: agentResponse.text,
      },
      sessionId: input.sessionId,
      turnId: agentResponse.turnId,
    });

    const speech = await input.audioAdapter.synthesize({
      text: sanitizeSpeechText(agentResponse.text),
      stream: true,
      streamFormat: "audio",
      format: "pcm",
      abortSignal: input.abortSignal,
    });

    // NOTE(review): sampleRate / channelCount / bitsPerSample are hard-coded
    // rather than taken from the adapter result — confirm they match the PCM
    // stream the adapter actually produces.
    await input.emit({
      type: "audio-start",
      mimeType: speech.mimeType,
      format: speech.format,
      sampleRate: 24000,
      channelCount: 1,
      bitsPerSample: 16,
    });

    const reader = speech.audioStream.getReader();
    const cancelAudioStream = () => {
      void reader.cancel().catch(() => undefined);
    };
    // True only when the stream ended on its own. Cancelling the reader also
    // resolves a pending read() with `done: true`, so `done` alone cannot
    // distinguish natural completion from an abort.
    let streamCompleted = false;

    try {
      // Cancel a pending read() as soon as the client aborts.
      input.abortSignal?.addEventListener("abort", cancelAudioStream, { once: true });

      while (true) {
        if (input.abortSignal?.aborted) {
          await reader.cancel().catch(() => undefined);
          break;
        }

        const { value, done } = await reader.read();

        if (done) {
          streamCompleted = !input.abortSignal?.aborted;
          break;
        }

        // Abort arrived while the chunk was being read: drop the chunk.
        if (input.abortSignal?.aborted) {
          break;
        }

        await input.emit({
          type: "audio-delta",
          value,
        });
      }
    } finally {
      input.abortSignal?.removeEventListener("abort", cancelAudioStream);
      reader.releaseLock();
    }

    if (streamCompleted) {
      await input.emit({ type: "audio-done" });
    } else {
      // Do not report `audio-done` for a stream that was cut short.
      logger.info("Agent speech audio streaming aborted by the client");
    }

    await input.emit({ type: "finish" });
    return agentResponse;
  } catch (error) {
    if (input.abortSignal?.aborted || isAbortError(error)) {
      logger.info("Agent speech audio streaming aborted by the client");
    } else {
      // Same policy as the transcription path: full details in the server
      // log, generic message to the client.
      logger.error(`Agent speech audio streaming failed:\n${getErrorMessage(error)}`);
      await input.emit({
        type: "error",
        error: "Speech audio streaming failed. Check server logs for details.",
      });
    }
    await input.emit({ type: "finish" });
    return agentResponse;
  }
}
345516
}
346517

347518
function getPartialVegaLiteFenceStartLength(text: string): number {

endpoints/context.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import type {
77
} from "adminforth";
88
import type { ZodType } from "zod";
99
import type {
10+
HandleSpeechTurnInput,
1011
HandleTurnInput,
1112
RunAndPersistAgentResponseInput,
1213
RunAndPersistAgentResponseResult,
@@ -30,6 +31,7 @@ export type AgentEndpointsContext = {
3031
options: PluginOptions;
3132
parseBody<T>(schema: ZodType<T>, body: unknown, response: EndpointResponse): T | null;
3233
handleTurn(input: HandleTurnInput): Promise<RunAndPersistAgentResponseResult>;
34+
handleSpeechTurn(input: HandleSpeechTurnInput): Promise<RunAndPersistAgentResponseResult | null>;
3335
runAndPersistAgentResponse(input: RunAndPersistAgentResponseInput): Promise<RunAndPersistAgentResponseResult>;
3436
getSessionTurns(sessionId: string): Promise<SessionTurn[]>;
3537
createNewTurn(sessionId: string, prompt: string, response?: string): Promise<string>;
@@ -45,7 +47,7 @@ export type AgentEndpointsContext = {
4547

4648
export type CoreEndpointsContext = Pick<
4749
AgentEndpointsContext,
48-
"options" | "parseBody" | "handleTurn" | "runAndPersistAgentResponse"
50+
"options" | "parseBody" | "handleTurn" | "handleSpeechTurn"
4951
>;
5052

5153
export type SessionEndpointsContext = Pick<

0 commit comments

Comments
 (0)