diff --git a/.changeset/model-voice-input.md b/.changeset/model-voice-input.md
new file mode 100644
index 000000000..3ec99d539
--- /dev/null
+++ b/.changeset/model-voice-input.md
@@ -0,0 +1,5 @@
+---
+"@iqai/adk": patch
+---
+
+feat: add model-aware voice input with audio support detection
diff --git a/apps/adk-web/components/chat-panel.tsx b/apps/adk-web/components/chat-panel.tsx
index f63c371d4..bffe1b2ce 100644
--- a/apps/adk-web/components/chat-panel.tsx
+++ b/apps/adk-web/components/chat-panel.tsx
@@ -1,7 +1,7 @@
"use client";
import { Bot, MessageSquare, Paperclip, User as UserIcon } from "lucide-react";
-import { useEffect, useRef, useState } from "react";
+import { useEffect, useMemo, useRef, useState } from "react";
import { toast } from "sonner";
import type { Message as ChatMessage } from "@/app/(dashboard)/_schema";
import { ConversationAutoScroll } from "@/components/ai-elements/conversation-auto-scroll";
@@ -21,8 +21,17 @@ import {
} from "@/components/ai-elements/prompt-input";
import { Response } from "@/components/ai-elements/response";
import { Button } from "@/components/ui/button";
+import {
+ Tooltip,
+ TooltipContent,
+ TooltipTrigger,
+} from "@/components/ui/tooltip";
import { useChatAttachments } from "@/hooks/use-chat-attachments";
import useVoiceRecording from "@/hooks/use-voice-recording";
+import {
+ getAudioUnsupportedMessage,
+ inferModelNameFromAgent,
+} from "@/lib/model-capabilities";
import { cn } from "@/lib/utils";
import type { AgentListItemDto as Agent } from "../Api";
@@ -57,6 +66,11 @@ export function ChatPanel({
isDragOver,
} = useChatAttachments();
+ const inferredModelName = useMemo(
+ () => inferModelNameFromAgent(selectedAgent),
+ [selectedAgent],
+ );
+
const {
recording,
error,
@@ -65,7 +79,8 @@ export function ChatPanel({
startRecording,
stopRecording,
clearAudio,
- } = useVoiceRecording();
+ audioSupported,
+ } = useVoiceRecording({ modelName: inferredModelName });
const handleSubmit = (e: React.FormEvent) => {
e.preventDefault();
@@ -84,13 +99,26 @@ export function ChatPanel({
const handleVoiceRecording = async () => {
if (recording) {
// Stop recording and get both the audio file and transcript
- const { file, transcript } = await stopRecording();
+ const { file, transcript, hasValidTranscript } = await stopRecording();
if (file) {
+ // Check if we have valid transcription
+ if (!hasValidTranscript) {
+ toast.error(
+ "Transcription failed or is too short. Please try speaking more clearly or use text input.",
+ );
+ clearAudio();
+ return;
+ }
+
// Use the transcribed text as the message
- // If transcription failed or is empty, use a fallback message
- const messageText =
- transcript?.trim() || "Voice message (transcription unavailable)";
+ const messageText = transcript?.trim() || "";
+
+ if (!messageText) {
+ toast.error("No transcription available. Please try again.");
+ clearAudio();
+ return;
+ }
// Send the transcribed text along with the audio file
// The agent receives the text message, and optionally the audio file as attachment
@@ -320,12 +348,30 @@ export function ChatPanel({
-            …
+            {audioSupported ? (
+              …
+            ) : (
+              <Tooltip>
+                <TooltipTrigger asChild>
+                  …
+                </TooltipTrigger>
+                <TooltipContent>
+                  <p>
+                    {getAudioUnsupportedMessage(inferredModelName)}
+                  </p>
+                </TooltipContent>
+              </Tooltip>
+            )}
{
diff --git a/apps/adk-web/hooks/use-voice-recording.ts b/apps/adk-web/hooks/use-voice-recording.ts
--- a/apps/adk-web/hooks/use-voice-recording.ts
+++ b/apps/adk-web/hooks/use-voice-recording.ts
+import { supportsAudioInput } from "@/lib/model-capabilities";
+
+interface UseVoiceRecordingOptions {
+ modelName?: string | null;
+}
+
+/**
+ * Validates if transcribed text has meaningful content
+ * Checks for minimum length and non-placeholder text
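+ *
+ * @example
+ * // Illustrative:
+ * isValidTranscript("turn on the lights"); // true
+ * isValidTranscript("hi"); // false (fewer than 3 characters)
+ * isValidTranscript("voice message"); // false (placeholder text)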
+ */
+function isValidTranscript(text: string): boolean {
+ const trimmed = text.trim();
+ // Minimum 3 characters to be considered valid
+ if (trimmed.length < 3) return false;
+ // Check if it's not just placeholder text
+ const placeholders = [
+ "voice message",
+ "transcription unavailable",
+ "listening",
+ "recording",
+ ];
+ const lower = trimmed.toLowerCase();
+ return !placeholders.some((placeholder) => lower === placeholder);
+}
+
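+/**
+ * Records microphone audio and transcribes it, gated on whether the
+ * selected model accepts audio input.
+ *
+ * @example
+ * // Illustrative usage:
+ * const { recording, startRecording, stopRecording, audioSupported } =
+ *   useVoiceRecording({ modelName: "gpt-4o" });
+ */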
-const useVoiceRecording = () => {
+const useVoiceRecording = (options?: UseVoiceRecordingOptions) => {
+ const { modelName } = options || {};
const [recording, setRecording] = useState(false);
const [audioFile, setAudioFile] = useState<File | null>(null);
const [error, setError] = useState<string | null>(null);
@@ -16,7 +41,18 @@ const useVoiceRecording = () => {
const stopTranscriptionRef = useRef<(() => void) | null>(null);
const accumulatedTranscriptRef = useRef("");
+ // Check if model supports audio
+ const audioSupported = supportsAudioInput(modelName);
+
const startRecording = useCallback(async () => {
+ // Check if model supports audio
+ if (!audioSupported) {
+ setError(
+ "Voice input is not supported for this model. Please use GPT-4o or Gemini models.",
+ );
+ return;
+ }
+
try {
setError(null);
setAudioFile(null);
@@ -96,11 +132,12 @@ const useVoiceRecording = () => {
setError(errorMessage);
console.error("Error starting recording:", err);
}
- }, []);
+ }, [audioSupported]);
const stopRecording = useCallback(async (): Promise<{
file: File | null;
transcript: string;
+ hasValidTranscript: boolean;
}> => {
return new Promise((resolve) => {
// Step 1: Stop transcription first
@@ -113,10 +150,15 @@ const useVoiceRecording = () => {
// Step 2: Get the final transcribed text
const finalTranscript = accumulatedTranscriptRef.current.trim();
+ const hasValidTranscript = isValidTranscript(finalTranscript);
if (!mediaRecorderRef.current) {
setRecording(false);
- resolve({ file: null, transcript: finalTranscript });
+ resolve({
+ file: null,
+ transcript: finalTranscript,
+ hasValidTranscript,
+ });
return;
}
@@ -143,20 +185,30 @@ const useVoiceRecording = () => {
// Clean up microphone stream
if (streamRef.current) {
- streamRef.current.getTracks().forEach((track) => track.stop());
+ streamRef.current.getTracks().forEach((track) => {
+ track.stop();
+ });
streamRef.current = null;
}
mediaRecorderRef.current = null;
// Return both the file and the transcript
- resolve({ file, transcript: finalTranscript });
+ resolve({
+ file,
+ transcript: finalTranscript,
+ hasValidTranscript,
+ });
};
if (mediaRecorderRef.current.state !== "inactive") {
mediaRecorderRef.current.stop();
} else {
setRecording(false);
- resolve({ file: null, transcript: finalTranscript });
+ resolve({
+ file: null,
+ transcript: finalTranscript,
+ hasValidTranscript,
+ });
}
});
}, []);
@@ -177,6 +229,7 @@ const useVoiceRecording = () => {
startRecording,
stopRecording,
clearAudio,
+ audioSupported,
};
};
diff --git a/apps/adk-web/lib/model-capabilities.ts b/apps/adk-web/lib/model-capabilities.ts
new file mode 100644
index 000000000..0b3eeeeb4
--- /dev/null
+++ b/apps/adk-web/lib/model-capabilities.ts
@@ -0,0 +1,190 @@
+/**
+ * Utility functions to check model capabilities
+ */
+
+import type { AgentListItemDto as Agent } from "../Api";
+
+/**
+ * Extracts the base model name from various formats:
+ *
+ * Direct model names:
+ * "gpt-4o" -> "gpt-4o"
+ * "gemini-2.5-flash" -> "gemini-2.5-flash"
+ * "claude-3-5-sonnet-20241022" -> "claude-3-5-sonnet-20241022"
+ *
+ * Provider-prefixed (OpenRouter / Vercel AI SDK):
+ * "openai/gpt-4o" -> "gpt-4o"
+ * "google/gemini-2.5-flash" -> "gemini-2.5-flash"
+ * "anthropic/claude-3-opus" -> "claude-3-opus"
+ *
+ * Nested provider paths (OpenRouter via another provider):
+ * "openrouter/google/gemini-2.5-flash" -> "gemini-2.5-flash"
+ * "openrouter/openai/gpt-4o" -> "gpt-4o"
+ *
+ * Vertex AI format:
+ * "projects/my-proj/locations/us-central1/publishers/google/models/gemini-2.5-flash"
+ * -> "gemini-2.5-flash"
+ *
+ * @param modelName - The model name/identifier in any format
+ * @returns The base model name without provider prefix
+ */
+function extractBaseModelName(modelName: string): string {
+ const normalized = modelName.toLowerCase().trim();
+
+ // Handle Vertex AI format:
+ // "projects/.../publishers/google/models/gemini-2.5-flash"
+ const vertexMatch = normalized.match(/\/models\/(.+)$/);
+ if (vertexMatch) {
+ return vertexMatch[1];
+ }
+
+ // Handle provider-prefixed formats: "provider/model" or "provider/provider/model"
+ // Always take the last segment as the model name
+ if (normalized.includes("/")) {
+ const parts = normalized.split("/");
+ return parts[parts.length - 1];
+ }
+
+ // Return as-is for direct model names
+ return normalized;
+}
+
+/**
+ * Checks if a model supports audio input
+ *
+ * Supported models:
+ * - Google Gemini models (gemini-*)
+ * - OpenAI gpt-4o models (gpt-4o, gpt-4o-mini, gpt-4o-2024-*, etc.)
+ *
+ * Supports multiple formats:
+ * - Direct model names: "gpt-4o", "gemini-2.5-flash"
+ * - OpenRouter format: "openai/gpt-4o", "google/gemini-2.5-flash"
+ * - Vercel AI SDK format: "google/gemini-2.5-flash"
+ * - Vertex AI format: "projects/.../models/gemini-2.5-flash"
+ * - Agent names that may contain model info
+ *
+ * @param modelName - The model name/identifier in any format
+ * @returns true if the model supports audio input
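+ *
+ * @example
+ * // Illustrative calls, per the rules above:
+ * supportsAudioInput("openai/gpt-4o"); // true
+ * supportsAudioInput("openrouter/google/gemini-2.5-flash"); // true
+ * supportsAudioInput("claude-3-opus"); // false
+ * supportsAudioInput(undefined); // true (unknown model, default to enabled)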
+ */
+export function supportsAudioInput(
+ modelName: string | null | undefined,
+): boolean {
+ // When we can't determine the model, default to enabled
+ // (don't break voice for agents whose model we can't infer)
+ if (!modelName) return true;
+
+ // Extract base model name (handles OpenRouter, Vercel AI SDK, Vertex AI formats)
+ const baseModel = extractBaseModelName(modelName);
+
+ // Google Gemini models support audio
+ // Check for: gemini-*, google/gemini-*, or agent names containing "gemini"
+ if (
+ baseModel.includes("gemini") ||
+ modelName.toLowerCase().includes("gemini")
+ ) {
+ return true;
+ }
+
+ // OpenAI gpt-4o models support audio
+ // Check for: gpt-4o, gpt-4o-mini, gpt-4o-2024-*, etc.
+ if (baseModel.startsWith("gpt-4o")) {
+ return true;
+ }
+
+ // OpenAI gpt-4-turbo models may support audio (check specific versions)
+ if (baseModel.startsWith("gpt-4-turbo")) {
+ // Only newer versions with audio support
+ return baseModel.includes("2024-11") || baseModel.includes("2024-12");
+ }
+
+ // Check if model name contains gpt-4o (for agent names like "gpt-4o-agent")
+ if (modelName.toLowerCase().includes("gpt-4o")) {
+ return true;
+ }
+
+ return false;
+}
+
+/**
+ * Gets a user-friendly message explaining why audio isn't supported
+ *
+ * @param modelName - The model name/identifier (can be in any format)
+ * @returns A message explaining the limitation
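+ *
+ * @example
+ * // Illustrative:
+ * getAudioUnsupportedMessage("claude-3-opus");
+ * // -> "Voice input is not supported for Claude models. Please use GPT-4o or Gemini models."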
+ */
+export function getAudioUnsupportedMessage(
+ modelName: string | null | undefined,
+): string {
+ if (!modelName) {
+ return "Voice input is not available. Please select an agent with a supported model (GPT-4o or Gemini).";
+ }
+
+ const baseModel = extractBaseModelName(modelName);
+ const normalized = modelName.toLowerCase();
+
+ // Check for specific unsupported model types
+ if (baseModel.startsWith("gpt-3.5") || normalized.includes("gpt-3.5")) {
+ return "Voice input is not supported for GPT-3.5 models. Please use GPT-4o or Gemini models.";
+ }
+
+ if (
+ (baseModel.startsWith("gpt-4") && !baseModel.startsWith("gpt-4o")) ||
+ (normalized.includes("gpt-4") && !normalized.includes("gpt-4o"))
+ ) {
+ return "Voice input is only supported for GPT-4o models. Please use GPT-4o or Gemini models.";
+ }
+
+ if (
+ baseModel.startsWith("o1") ||
+ baseModel.startsWith("o3") ||
+ normalized.includes("o1") ||
+ normalized.includes("o3")
+ ) {
+ return "Voice input is not supported for reasoning models (o1, o3). Please use GPT-4o or Gemini models.";
+ }
+
+ if (baseModel.startsWith("claude") || normalized.includes("claude")) {
+ return "Voice input is not supported for Claude models. Please use GPT-4o or Gemini models.";
+ }
+
+ // Generic message for unknown models
+ return "Voice input is not supported for this model. Please use GPT-4o or Gemini models for voice input.";
+}
+
+/**
+ * Infers a model name from an agent's name and path.
+ * Best-effort approach — returns null if no model pattern is found.
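+ *
+ * @example
+ * // Illustrative, for agents with these names:
+ * //   "gpt-4o-agent"      -> "gpt-4o"
+ * //   "gemini-2.5-helper" -> "gemini-2.5"
+ * //   "weather-bot"       -> null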
+ */
+export function inferModelNameFromAgent(agent: Agent | null): string | null {
+ if (!agent) return null;
+
+ const name = agent.name.toLowerCase();
+ const path = agent.relativePath?.toLowerCase() || "";
+ const combined = `${name} ${path}`;
+
+ // Check for OpenRouter format patterns (provider/model)
+ if (combined.includes("openai/gpt-4o") || combined.includes("openai/gpt4o")) {
+ return "openai/gpt-4o";
+ }
+ if (combined.includes("google/gemini")) {
+ return "google/gemini-2.5-flash";
+ }
+
+ // Check for direct model patterns in agent name
+ if (name.includes("gpt-4o") || name.includes("gpt4o")) return "gpt-4o";
+ if (name.includes("gemini")) {
+ const geminiMatch = name.match(/gemini[-\s]?([\d.]+)?/);
+ if (geminiMatch?.[1]) {
+ return `gemini-${geminiMatch[1]}`;
+ }
+ return "gemini-2.5-flash";
+ }
+ if (name.includes("gpt-4") || name.includes("gpt4")) return "gpt-4";
+ if (name.includes("gpt-3.5")) return "gpt-3.5-turbo";
+ if (name.includes("claude")) return "claude-3-5-sonnet";
+
+ // Check path for model indicators
+ if (path.includes("gpt-4o") || path.includes("gpt4o")) return "gpt-4o";
+ if (path.includes("gemini")) return "gemini-2.5-flash";
+
+ return null;
+}
diff --git a/packages/adk/src/models/openai-llm.ts b/packages/adk/src/models/openai-llm.ts
index 1c826912f..4de580013 100644
--- a/packages/adk/src/models/openai-llm.ts
+++ b/packages/adk/src/models/openai-llm.ts
@@ -448,12 +448,38 @@ export class OpenAiLlm extends BaseLlm {
}
if (part.inline_data?.mime_type && part.inline_data?.data) {
- return {
- type: "image_url",
- image_url: {
- url: `data:${part.inline_data.mime_type};base64,${part.inline_data.data}`,
- },
- };
+ const mimeType = part.inline_data.mime_type;
+
+ // Handle audio input
+ if (mimeType.startsWith("audio/")) {
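+          // e.g. { mime_type: "audio/webm", data: "<base64>" } becomes
+          // { type: "input_audio", input_audio: { data: "<base64>", format: "webm" } }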
+ const formatMap: Record<string, string> = {
+ "audio/wav": "wav",
+ "audio/mp3": "mp3",
+ "audio/mpeg": "mp3",
+ "audio/webm": "webm",
+ "audio/ogg": "ogg",
+ "audio/mp4": "mp4",
+ };
+ const format = formatMap[mimeType] || mimeType.split("/")[1];
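+ // Unmapped subtypes fall back to the mime subtype, e.g. "audio/flac" -> "flac"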
+
+ return {
+ type: "input_audio" as const,
+ input_audio: {
+ data: part.inline_data.data,
+ format,
+ },
+ };
+ }
+
+ // Handle image input (existing behavior)
+ if (mimeType.startsWith("image/")) {
+ return {
+ type: "image_url",
+ image_url: {
+ url: `data:${part.inline_data.mime_type};base64,${part.inline_data.data}`,
+ },
+ };
+ }
}
throw new Error("Unsupported part type for OpenAI conversion");
@@ -591,7 +617,6 @@ export class OpenAiLlm extends BaseLlm {
if (part.inline_data) {
// Ensure inline data is in the correct format for OpenAI
if (!part.inline_data.mime_type || !part.inline_data.data) {
- // biome-ignore lint/performance/noDelete: Remove invalid inline data
delete part.inline_data;
}
}