diff --git a/.changeset/model-voice-input.md b/.changeset/model-voice-input.md
new file mode 100644
index 000000000..3ec99d539
--- /dev/null
+++ b/.changeset/model-voice-input.md
@@ -0,0 +1,5 @@
+---
+"@iqai/adk": patch
+---
+
+feat: add model-aware voice input with audio support detection
diff --git a/apps/adk-web/components/chat-panel.tsx b/apps/adk-web/components/chat-panel.tsx
index f63c371d4..bffe1b2ce 100644
--- a/apps/adk-web/components/chat-panel.tsx
+++ b/apps/adk-web/components/chat-panel.tsx
@@ -1,7 +1,7 @@
 "use client";

 import { Bot, MessageSquare, Paperclip, User as UserIcon } from "lucide-react";
-import { useEffect, useRef, useState } from "react";
+import { useEffect, useMemo, useRef, useState } from "react";
 import { toast } from "sonner";
 import type { Message as ChatMessage } from "@/app/(dashboard)/_schema";
 import { ConversationAutoScroll } from "@/components/ai-elements/conversation-auto-scroll";
@@ -21,8 +21,17 @@ import {
 } from "@/components/ai-elements/prompt-input";
 import { Response } from "@/components/ai-elements/response";
 import { Button } from "@/components/ui/button";
+import {
+  Tooltip,
+  TooltipContent,
+  TooltipTrigger,
+} from "@/components/ui/tooltip";
 import { useChatAttachments } from "@/hooks/use-chat-attachments";
 import useVoiceRecording from "@/hooks/use-voice-recording";
+import {
+  getAudioUnsupportedMessage,
+  inferModelNameFromAgent,
+} from "@/lib/model-capabilities";
 import { cn } from "@/lib/utils";
 import type { AgentListItemDto as Agent } from "../Api";

@@ -57,6 +66,11 @@ export function ChatPanel({
     isDragOver,
   } = useChatAttachments();

+  const inferredModelName = useMemo(
+    () => inferModelNameFromAgent(selectedAgent),
+    [selectedAgent],
+  );
+
   const {
     recording,
     error,
@@ -65,7 +79,8 @@ export function ChatPanel({
     startRecording,
     stopRecording,
     clearAudio,
-  } = useVoiceRecording();
+    audioSupported,
+  } = useVoiceRecording({ modelName: inferredModelName });

   const handleSubmit = (e: React.FormEvent) => {
     e.preventDefault();
@@ -84,13 +99,26 @@ export function ChatPanel({
   const handleVoiceRecording = async () => {
     if (recording) {
       // Stop recording and get both the audio file and transcript
-      const { file, transcript } = await stopRecording();
+      const { file, transcript, hasValidTranscript } = await stopRecording();

       if (file) {
+        // Check if we have valid transcription
+        if (!hasValidTranscript) {
+          toast.error(
+            "Transcription failed or is too short. Please try speaking more clearly or use text input.",
+          );
+          clearAudio();
+          return;
+        }
+
         // Use the transcribed text as the message
-        // If transcription failed or is empty, use a fallback message
-        const messageText =
-          transcript?.trim() || "Voice message (transcription unavailable)";
+        const messageText = transcript?.trim() || "";
+
+        if (!messageText) {
+          toast.error("No transcription available. Please try again.");
+          clearAudio();
+          return;
+        }

         // Send the transcribed text along with the audio file
         // The agent receives the text message, and optionally the audio file as attachment
@@ -320,12 +348,30 @@ export function ChatPanel({
-            <Button type="button" variant="ghost" size="icon" onClick={handleVoiceRecording} />
+            {audioSupported ? (
+              <Button type="button" variant="ghost" size="icon" onClick={handleVoiceRecording} />
+            ) : (
+              <Tooltip>
+                <TooltipTrigger asChild>
+                  <span>
+                    <Button type="button" variant="ghost" size="icon" disabled />
+                  </span>
+                </TooltipTrigger>
+                <TooltipContent>
+                  {getAudioUnsupportedMessage(inferredModelName)}
+                </TooltipContent>
+              </Tooltip>
+            )}
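The JSX hunk above reduces to a capability gate around the record button. A minimal sketch of the pattern follows; the component names and props are illustrative stand-ins, not an exact copy of this PR's markup:

```tsx
import type { ReactNode } from "react";
import { Button } from "@/components/ui/button";
import {
  Tooltip,
  TooltipContent,
  TooltipTrigger,
} from "@/components/ui/tooltip";

interface VoiceButtonGateProps {
  audioSupported: boolean;
  unsupportedMessage: string; // e.g. getAudioUnsupportedMessage(inferredModelName)
  onToggleRecording: () => void; // e.g. handleVoiceRecording
  children: ReactNode; // the mic icon
}

// When the model supports audio, render the live record button.
// Otherwise render a disabled button that explains why via a tooltip.
// The <span> wrapper matters: disabled buttons do not emit pointer
// events, so the tooltip trigger must wrap an element that does.
export function VoiceButtonGate({
  audioSupported,
  unsupportedMessage,
  onToggleRecording,
  children,
}: VoiceButtonGateProps) {
  if (audioSupported) {
    return (
      <Button type="button" variant="ghost" size="icon" onClick={onToggleRecording}>
        {children}
      </Button>
    );
  }
  return (
    <Tooltip>
      <TooltipTrigger asChild>
        <span>
          <Button type="button" variant="ghost" size="icon" disabled>
            {children}
          </Button>
        </span>
      </TooltipTrigger>
      <TooltipContent>{unsupportedMessage}</TooltipContent>
    </Tooltip>
  );
}
```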
diff --git a/apps/adk-web/hooks/use-voice-recording.ts b/apps/adk-web/hooks/use-voice-recording.ts
--- a/apps/adk-web/hooks/use-voice-recording.ts
+++ b/apps/adk-web/hooks/use-voice-recording.ts
@@ -1,6 +1,31 @@
 import { useCallback, useRef, useState } from "react";
-
-const useVoiceRecording = () => {
+import { supportsAudioInput } from "@/lib/model-capabilities";
+
+interface UseVoiceRecordingOptions {
+  modelName?: string | null;
+}
+
+/**
+ * Validates if transcribed text has meaningful content
+ * Checks for minimum length and non-placeholder text
+ */
+function isValidTranscript(text: string): boolean {
+  const trimmed = text.trim();
+  // Minimum 3 characters to be considered valid
+  if (trimmed.length < 3) return false;
+  // Check if it's not just placeholder text
+  const placeholders = [
+    "voice message",
+    "transcription unavailable",
+    "listening",
+    "recording",
+  ];
+  const lower = trimmed.toLowerCase();
+  return !placeholders.some((placeholder) => lower === placeholder);
+}
+
+const useVoiceRecording = (options?: UseVoiceRecordingOptions) => {
+  const { modelName } = options || {};
   const [recording, setRecording] = useState(false);
   const [audioFile, setAudioFile] = useState<File | null>(null);
   const [error, setError] = useState<string | null>(null);
@@ -16,7 +41,18 @@ const useVoiceRecording = () => {
   const stopTranscriptionRef = useRef<(() => void) | null>(null);
   const accumulatedTranscriptRef = useRef("");

+  // Check if model supports audio
+  const audioSupported = supportsAudioInput(modelName);
+
   const startRecording = useCallback(async () => {
+    // Check if model supports audio
+    if (!audioSupported) {
+      setError(
+        "Voice input is not supported for this model. Please use GPT-4o or Gemini models.",
+      );
+      return;
+    }
+
     try {
       setError(null);
       setAudioFile(null);
@@ -96,11 +132,12 @@ const useVoiceRecording = () => {
       setError(errorMessage);
       console.error("Error starting recording:", err);
     }
-  }, []);
+  }, [audioSupported]);

   const stopRecording = useCallback(async (): Promise<{
     file: File | null;
     transcript: string;
+    hasValidTranscript: boolean;
   }> => {
     return new Promise((resolve) => {
       // Step 1: Stop transcription first
@@ -113,10 +150,15 @@ const useVoiceRecording = () => {
       // Step 2: Get the final transcribed text
       const finalTranscript = accumulatedTranscriptRef.current.trim();
+      const hasValidTranscript = isValidTranscript(finalTranscript);

       if (!mediaRecorderRef.current) {
         setRecording(false);
-        resolve({ file: null, transcript: finalTranscript });
+        resolve({
+          file: null,
+          transcript: finalTranscript,
+          hasValidTranscript,
+        });
         return;
       }
@@ -143,20 +185,30 @@ const useVoiceRecording = () => {
         // Clean up microphone stream
         if (streamRef.current) {
-          streamRef.current.getTracks().forEach((track) => track.stop());
+          streamRef.current.getTracks().forEach((track) => {
+            track.stop();
+          });
           streamRef.current = null;
         }

         mediaRecorderRef.current = null;

         // Return both the file and the transcript
-        resolve({ file, transcript: finalTranscript });
+        resolve({
+          file,
+          transcript: finalTranscript,
+          hasValidTranscript,
+        });
       };

       if (mediaRecorderRef.current.state !== "inactive") {
         mediaRecorderRef.current.stop();
       } else {
         setRecording(false);
-        resolve({ file: null, transcript: finalTranscript });
+        resolve({
+          file: null,
+          transcript: finalTranscript,
+          hasValidTranscript,
+        });
       }
     });
   }, []);
@@ -177,6 +229,7 @@ const useVoiceRecording = () => {
     startRecording,
     stopRecording,
     clearAudio,
+    audioSupported,
   };
 };
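The transcript gate in this hook is easiest to see with concrete inputs. `isValidTranscript` is module-private, so its rules are inlined here as a sketch; note that the placeholder check is an exact match after trimming and lowercasing, not a substring match:

```ts
// Inlined copy of the hook's validation rules (the real helper is not exported).
function isValidTranscript(text: string): boolean {
  const trimmed = text.trim();
  // Under 3 characters is treated as noise
  if (trimmed.length < 3) return false;
  const placeholders = [
    "voice message",
    "transcription unavailable",
    "listening",
    "recording",
  ];
  const lower = trimmed.toLowerCase();
  return !placeholders.some((placeholder) => lower === placeholder);
}

console.log(isValidTranscript("hi")); // false: under 3 characters
console.log(isValidTranscript("  Recording  ")); // false: exact placeholder after trim + lowercase
console.log(isValidTranscript("recording a note")); // true: placeholders only match whole strings
console.log(isValidTranscript("What's on my calendar?")); // true
```

Because the match is exact, a real transcript that merely starts with "recording" passes; only the bare placeholder strings are rejected.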
diff --git a/apps/adk-web/lib/model-capabilities.ts b/apps/adk-web/lib/model-capabilities.ts
new file mode 100644
index 000000000..0b3eeeeb4
--- /dev/null
+++ b/apps/adk-web/lib/model-capabilities.ts
@@ -0,0 +1,190 @@
+/**
+ * Utility functions to check model capabilities
+ */
+
+import type { AgentListItemDto as Agent } from "../Api";
+
+/**
+ * Extracts the base model name from various formats:
+ *
+ * Direct model names:
+ *   "gpt-4o" -> "gpt-4o"
+ *   "gemini-2.5-flash" -> "gemini-2.5-flash"
+ *   "claude-3-5-sonnet-20241022" -> "claude-3-5-sonnet-20241022"
+ *
+ * Provider-prefixed (OpenRouter / Vercel AI SDK):
+ *   "openai/gpt-4o" -> "gpt-4o"
+ *   "google/gemini-2.5-flash" -> "gemini-2.5-flash"
+ *   "anthropic/claude-3-opus" -> "claude-3-opus"
+ *
+ * Nested provider paths (OpenRouter via another provider):
+ *   "openrouter/google/gemini-2.5-flash" -> "gemini-2.5-flash"
+ *   "openrouter/openai/gpt-4o" -> "gpt-4o"
+ *
+ * Vertex AI format:
+ *   "projects/my-proj/locations/us-central1/publishers/google/models/gemini-2.5-flash"
+ *     -> "gemini-2.5-flash"
+ *
+ * @param modelName - The model name/identifier in any format
+ * @returns The base model name without provider prefix
+ */
+function extractBaseModelName(modelName: string): string {
+  const normalized = modelName.toLowerCase().trim();
+
+  // Handle Vertex AI format:
+  // "projects/.../publishers/google/models/gemini-2.5-flash"
+  const vertexMatch = normalized.match(/\/models\/(.+)$/);
+  if (vertexMatch) {
+    return vertexMatch[1];
+  }
+
+  // Handle provider-prefixed formats: "provider/model" or "provider/provider/model"
+  // Always take the last segment as the model name
+  if (normalized.includes("/")) {
+    const parts = normalized.split("/");
+    return parts[parts.length - 1];
+  }
+
+  // Return as-is for direct model names
+  return normalized;
+}
+
+/**
+ * Checks if a model supports audio input
+ *
+ * Supported models:
+ * - Google Gemini models (gemini-*)
+ * - OpenAI gpt-4o models (gpt-4o, gpt-4o-mini, gpt-4o-2024-*, etc.)
+ *
+ * Supports multiple formats:
+ * - Direct model names: "gpt-4o", "gemini-2.5-flash"
+ * - OpenRouter format: "openai/gpt-4o", "google/gemini-2.5-flash"
+ * - Vercel AI SDK format: "google/gemini-2.5-flash"
+ * - Vertex AI format: "projects/.../models/gemini-2.5-flash"
+ * - Agent names that may contain model info
+ *
+ * @param modelName - The model name/identifier in any format
+ * @returns true if the model supports audio input
+ */
+export function supportsAudioInput(
+  modelName: string | null | undefined,
+): boolean {
+  // When we can't determine the model, default to enabled
+  // (don't break voice for agents whose model we can't infer)
+  if (!modelName) return true;
+
+  // Extract base model name (handles OpenRouter, Vercel AI SDK, Vertex AI formats)
+  const baseModel = extractBaseModelName(modelName);
+
+  // Google Gemini models support audio
+  // Check for: gemini-*, google/gemini-*, or agent names containing "gemini"
+  if (
+    baseModel.includes("gemini") ||
+    modelName.toLowerCase().includes("gemini")
+  ) {
+    return true;
+  }
+
+  // OpenAI gpt-4o models support audio
+  // Check for: gpt-4o, gpt-4o-mini, gpt-4o-2024-*, etc.
+  if (baseModel.startsWith("gpt-4o")) {
+    return true;
+  }
+
+  // OpenAI gpt-4-turbo models may support audio (check specific versions)
+  if (baseModel.startsWith("gpt-4-turbo")) {
+    // Only newer versions with audio support
+    return baseModel.includes("2024-11") || baseModel.includes("2024-12");
+  }
+
+  // Check if model name contains gpt-4o (for agent names like "gpt-4o-agent")
+  if (modelName.toLowerCase().includes("gpt-4o")) {
+    return true;
+  }
+
+  return false;
+}
+
+/**
+ * Gets a user-friendly message explaining why audio isn't supported
+ *
+ * @param modelName - The model name/identifier (can be in any format)
+ * @returns A message explaining the limitation
+ */
+export function getAudioUnsupportedMessage(
+  modelName: string | null | undefined,
+): string {
+  if (!modelName) {
+    return "Voice input is not available. Please select an agent with a supported model (GPT-4o or Gemini).";
+  }
+
+  const baseModel = extractBaseModelName(modelName);
+  const normalized = modelName.toLowerCase();
+
+  // Check for specific unsupported model types
+  if (baseModel.startsWith("gpt-3.5") || normalized.includes("gpt-3.5")) {
+    return "Voice input is not supported for GPT-3.5 models. Please use GPT-4o or Gemini models.";
+  }
+
+  if (
+    (baseModel.startsWith("gpt-4") && !baseModel.startsWith("gpt-4o")) ||
+    (normalized.includes("gpt-4") && !normalized.includes("gpt-4o"))
+  ) {
+    return "Voice input is only supported for GPT-4o models. Please use GPT-4o or Gemini models.";
+  }
+
+  if (
+    baseModel.startsWith("o1") ||
+    baseModel.startsWith("o3") ||
+    normalized.includes("o1") ||
+    normalized.includes("o3")
+  ) {
+    return "Voice input is not supported for reasoning models (o1, o3). Please use GPT-4o or Gemini models.";
+  }
+
+  if (baseModel.startsWith("claude") || normalized.includes("claude")) {
+    return "Voice input is not supported for Claude models. Please use GPT-4o or Gemini models.";
+  }
+
+  // Generic message for unknown models
+  return "Voice input is not supported for this model. Please use GPT-4o or Gemini models for voice input.";
+}
+
+/**
+ * Infers a model name from an agent's name and path.
+ * Best-effort approach; returns null if no model pattern is found.
+ */
+export function inferModelNameFromAgent(agent: Agent | null): string | null {
+  if (!agent) return null;
+
+  const name = agent.name.toLowerCase();
+  const path = agent.relativePath?.toLowerCase() || "";
+  const combined = `${name} ${path}`;
+
+  // Check for OpenRouter format patterns (provider/model)
+  if (combined.includes("openai/gpt-4o") || combined.includes("openai/gpt4o")) {
+    return "openai/gpt-4o";
+  }
+  if (combined.includes("google/gemini")) {
+    return "google/gemini-2.5-flash";
+  }
+
+  // Check for direct model patterns in agent name
+  if (name.includes("gpt-4o") || name.includes("gpt4o")) return "gpt-4o";
+  if (name.includes("gemini")) {
+    const geminiMatch = name.match(/gemini[-\s]?([\d.]+)?/);
+    if (geminiMatch?.[1]) {
+      return `gemini-${geminiMatch[1]}`;
+    }
+    return "gemini-2.5-flash";
+  }
+  if (name.includes("gpt-4") || name.includes("gpt4")) return "gpt-4";
+  if (name.includes("gpt-3.5")) return "gpt-3.5-turbo";
+  if (name.includes("claude")) return "claude-3-5-sonnet";
+
+  // Check path for model indicators
+  if (path.includes("gpt-4o") || path.includes("gpt4o")) return "gpt-4o";
+  if (path.includes("gemini")) return "gemini-2.5-flash";
+
+  return null;
+}
diff --git a/packages/adk/src/models/openai-llm.ts b/packages/adk/src/models/openai-llm.ts
index 1c826912f..4de580013 100644
--- a/packages/adk/src/models/openai-llm.ts
+++ b/packages/adk/src/models/openai-llm.ts
@@ -448,12 +448,38 @@ export class OpenAiLlm extends BaseLlm {
     }

     if (part.inline_data?.mime_type && part.inline_data?.data) {
-      return {
-        type: "image_url",
-        image_url: {
-          url: `data:${part.inline_data.mime_type};base64,${part.inline_data.data}`,
-        },
-      };
+      const mimeType = part.inline_data.mime_type;
+
+      // Handle audio input
+      if (mimeType.startsWith("audio/")) {
+        const formatMap: Record<string, string> = {
+          "audio/wav": "wav",
+          "audio/mp3": "mp3",
+          "audio/mpeg": "mp3",
+          "audio/webm": "webm",
+          "audio/ogg": "ogg",
+          "audio/mp4": "mp4",
+        };
+        const format = formatMap[mimeType] || mimeType.split("/")[1];
+
+        return {
+          type: "input_audio" as const,
+          input_audio: {
+            data: part.inline_data.data,
+            format,
+          },
+        };
+      }
+
+      // Handle image input (existing behavior)
+      if (mimeType.startsWith("image/")) {
+        return {
+          type: "image_url",
+          image_url: {
+            url: `data:${part.inline_data.mime_type};base64,${part.inline_data.data}`,
+          },
+        };
+      }
     }

     throw new Error("Unsupported part type for OpenAI conversion");
@@ -591,7 +617,6 @@ export class OpenAiLlm extends BaseLlm {
       if (part.inline_data) {
         // Ensure inline data is in the correct format for OpenAI
         if (!part.inline_data.mime_type || !part.inline_data.data) {
-          // biome-ignore lint/performance/noDelete: Remove invalid inline data
           delete part.inline_data;
         }
       }
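The detection logic in `model-capabilities.ts` can be sanity-checked against the formats it documents. Expected results under the rules in this diff (a sketch, not an exhaustive test suite):

```ts
import {
  getAudioUnsupportedMessage,
  supportsAudioInput,
} from "@/lib/model-capabilities";

// Direct and provider-prefixed IDs reduce to the same base model.
supportsAudioInput("gpt-4o"); // true
supportsAudioInput("openai/gpt-4o-mini"); // true
supportsAudioInput("openrouter/google/gemini-2.5-flash"); // true: last path segment wins

// Vertex AI resource paths are matched on the trailing /models/ segment.
supportsAudioInput(
  "projects/my-proj/locations/us-central1/publishers/google/models/gemini-2.5-flash",
); // true

// Unsupported families are rejected with a targeted explanation.
supportsAudioInput("anthropic/claude-3-opus"); // false
getAudioUnsupportedMessage("claude-3-opus");
// -> "Voice input is not supported for Claude models. Please use GPT-4o or Gemini models."

// Unknown model: fail open so voice isn't disabled for agents whose model can't be inferred.
supportsAudioInput(undefined); // true
```

On the `openai-llm.ts` side, note the asymmetry the hunk above introduces: audio parts are sent as `input_audio` with raw base64 data plus a short format token ("wav", "mp3", ...), while images keep the existing data-URL shape; audio MIME types missing from `formatMap` fall back to `mimeType.split("/")[1]`.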