Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/model-voice-input.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@iqai/adk": patch
---

feat: add model-aware voice input with audio support detection
70 changes: 58 additions & 12 deletions apps/adk-web/components/chat-panel.tsx
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"use client";

import { Bot, MessageSquare, Paperclip, User as UserIcon } from "lucide-react";
import { useEffect, useRef, useState } from "react";
import { useEffect, useMemo, useRef, useState } from "react";
import { toast } from "sonner";
import type { Message as ChatMessage } from "@/app/(dashboard)/_schema";
import { ConversationAutoScroll } from "@/components/ai-elements/conversation-auto-scroll";
Expand All @@ -21,8 +21,17 @@ import {
} from "@/components/ai-elements/prompt-input";
import { Response } from "@/components/ai-elements/response";
import { Button } from "@/components/ui/button";
import {
Tooltip,
TooltipContent,
TooltipTrigger,
} from "@/components/ui/tooltip";
import { useChatAttachments } from "@/hooks/use-chat-attachments";
import useVoiceRecording from "@/hooks/use-voice-recording";
import {
getAudioUnsupportedMessage,
inferModelNameFromAgent,
} from "@/lib/model-capabilities";
import { cn } from "@/lib/utils";
import type { AgentListItemDto as Agent } from "../Api";

Expand Down Expand Up @@ -57,6 +66,11 @@ export function ChatPanel({
isDragOver,
} = useChatAttachments();

const inferredModelName = useMemo(
() => inferModelNameFromAgent(selectedAgent),
[selectedAgent],
);

const {
recording,
error,
Expand All @@ -65,7 +79,8 @@ export function ChatPanel({
startRecording,
stopRecording,
clearAudio,
} = useVoiceRecording();
audioSupported,
} = useVoiceRecording({ modelName: inferredModelName });

const handleSubmit = (e: React.FormEvent) => {
e.preventDefault();
Expand All @@ -84,13 +99,26 @@ export function ChatPanel({
const handleVoiceRecording = async () => {
if (recording) {
// Stop recording and get both the audio file and transcript
const { file, transcript } = await stopRecording();
const { file, transcript, hasValidTranscript } = await stopRecording();

if (file) {
// Check if we have valid transcription
if (!hasValidTranscript) {
toast.error(
"Transcription failed or is too short. Please try speaking more clearly or use text input.",
);
clearAudio();
return;
}

// Use the transcribed text as the message
// If transcription failed or is empty, use a fallback message
const messageText =
transcript?.trim() || "Voice message (transcription unavailable)";
const messageText = transcript?.trim() || "";

if (!messageText) {
toast.error("No transcription available. Please try again.");
clearAudio();
return;
}

// Send the transcribed text along with the audio file
// The agent receives the text message, and optionally the audio file as attachment
Expand Down Expand Up @@ -320,12 +348,30 @@ export function ChatPanel({
</PromptInputButton>
</PromptInputTools>
<div>
<PromptInputMicButton
variant={"secondary"}
status={{ recording }}
onClick={handleVoiceRecording}
disabled={isLoading || isSendingMessage}
/>
{audioSupported ? (
<PromptInputMicButton
variant={"secondary"}
status={{ recording }}
onClick={handleVoiceRecording}
disabled={isLoading || isSendingMessage}
/>
) : (
<Tooltip>
<TooltipTrigger asChild>
<div>
<PromptInputMicButton
variant={"secondary"}
disabled={true}
/>
Comment on lines +362 to +365
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The onClick handler on this disabled PromptInputMicButton will not be triggered by user clicks because the button's disabled prop is set to true. The toast.error will never be shown. The tooltip already provides sufficient information to the user about why the button is disabled. This onClick handler is effectively dead code and should be removed for clarity.

Suggested change
<PromptInputMicButton
variant={"secondary"}
onClick={() => {
// Show tooltip message
toast.error(
getAudioUnsupportedMessage(inferredModelName),
);
}}
disabled={true}
/>
<PromptInputMicButton
variant={"secondary"}
disabled={true}
/>

</div>
</TooltipTrigger>
<TooltipContent>
<p className="max-w-xs">
{getAudioUnsupportedMessage(inferredModelName)}
</p>
</TooltipContent>
</Tooltip>
)}
<PromptInputSubmit
status={isSendingMessage ? "streaming" : "ready"}
disabled={
Expand Down
67 changes: 60 additions & 7 deletions apps/adk-web/hooks/use-voice-recording.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,33 @@ import {
isSpeechRecognitionSupported,
startTranscription,
} from "@/lib/transcribe-audio";

const useVoiceRecording = () => {
import { supportsAudioInput } from "@/lib/model-capabilities";

/**
 * Options accepted by the useVoiceRecording hook.
 */
interface UseVoiceRecordingOptions {
	// Model identifier forwarded to supportsAudioInput() to decide whether
	// voice recording is allowed for the current agent's model.
	// Optional/null when the model could not be inferred.
	modelName?: string | null;
}

/**
 * Determines whether a transcription result contains meaningful speech.
 *
 * A transcript is considered valid when, after trimming, it is at least
 * three characters long and is not an exact (case-insensitive) match for
 * one of the known placeholder strings the UI may emit instead of speech.
 */
function isValidTranscript(text: string): boolean {
	const cleaned = text.trim();

	// Anything shorter than 3 characters is treated as noise.
	if (cleaned.length < 3) {
		return false;
	}

	// Reject exact placeholder strings that represent non-speech output.
	const placeholderSet = new Set([
		"voice message",
		"transcription unavailable",
		"listening",
		"recording",
	]);

	return !placeholderSet.has(cleaned.toLowerCase());
}

const useVoiceRecording = (options?: UseVoiceRecordingOptions) => {
const { modelName } = options || {};
const [recording, setRecording] = useState(false);
const [audioFile, setAudioFile] = useState<File | null>(null);
const [error, setError] = useState<string | null>(null);
Expand All @@ -16,7 +41,18 @@ const useVoiceRecording = () => {
const stopTranscriptionRef = useRef<(() => void) | null>(null);
const accumulatedTranscriptRef = useRef<string>("");

// Check if model supports audio
const audioSupported = supportsAudioInput(modelName);

const startRecording = useCallback(async () => {
// Check if model supports audio
if (!audioSupported) {
setError(
"Voice input is not supported for this model. Please use GPT-4o or Gemini models.",
);
return;
}

try {
setError(null);
setAudioFile(null);
Expand Down Expand Up @@ -96,11 +132,12 @@ const useVoiceRecording = () => {
setError(errorMessage);
console.error("Error starting recording:", err);
}
}, []);
}, [audioSupported]);

const stopRecording = useCallback(async (): Promise<{
file: File | null;
transcript: string;
hasValidTranscript: boolean;
}> => {
return new Promise((resolve) => {
// Step 1: Stop transcription first
Expand All @@ -113,10 +150,15 @@ const useVoiceRecording = () => {

// Step 2: Get the final transcribed text
const finalTranscript = accumulatedTranscriptRef.current.trim();
const hasValidTranscript = isValidTranscript(finalTranscript);

if (!mediaRecorderRef.current) {
setRecording(false);
resolve({ file: null, transcript: finalTranscript });
resolve({
file: null,
transcript: finalTranscript,
hasValidTranscript,
});
return;
}

Expand All @@ -143,20 +185,30 @@ const useVoiceRecording = () => {

// Clean up microphone stream
if (streamRef.current) {
streamRef.current.getTracks().forEach((track) => track.stop());
streamRef.current.getTracks().forEach((track) => {
track.stop();
});
streamRef.current = null;
}
mediaRecorderRef.current = null;

// Return both the file and the transcript
resolve({ file, transcript: finalTranscript });
resolve({
file,
transcript: finalTranscript,
hasValidTranscript,
});
Comment on lines +196 to +200
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The isValidTranscript function is being called again here, but its result has already been calculated and stored in the hasValidTranscript variable on line 153. To avoid this redundant computation, you should reuse the hasValidTranscript variable that is already in scope.

Suggested change
resolve({
file,
transcript: finalTranscript,
hasValidTranscript: isValidTranscript(finalTranscript),
});
resolve({
file,
transcript: finalTranscript,
hasValidTranscript,
});

};

if (mediaRecorderRef.current.state !== "inactive") {
mediaRecorderRef.current.stop();
} else {
setRecording(false);
resolve({ file: null, transcript: finalTranscript });
resolve({
file: null,
transcript: finalTranscript,
hasValidTranscript,
});
}
});
}, []);
Expand All @@ -177,6 +229,7 @@ const useVoiceRecording = () => {
startRecording,
stopRecording,
clearAudio,
audioSupported,
};
};

Expand Down
Loading
Loading