diff --git a/mlx_audio/ui/app/text-to-speech/page.tsx b/mlx_audio/ui/app/text-to-speech/page.tsx index c9b503e44..cddc7b884 100644 --- a/mlx_audio/ui/app/text-to-speech/page.tsx +++ b/mlx_audio/ui/app/text-to-speech/page.tsx @@ -6,6 +6,7 @@ import { useState, useRef } from "react" import { ChevronDown, Download, ThumbsUp, ThumbsDown, Play, Pause, RefreshCw } from "lucide-react" import { LayoutWrapper } from "@/components/layout-wrapper" import { VoiceSelection } from "@/components/voice-selection" +import { getVoiceDisplayName } from "@/components/voice-library" // Custom range input component with colored progress function RangeInput({ @@ -62,7 +63,7 @@ export default function SpeechSynthesis() { const [quantization, setQuantization] = useState("6bit") const [language, setLanguage] = useState("English-detected") const [liked, setLiked] = useState(null) - const [selectedVoice, setSelectedVoice] = useState("conversational_a") + const [selectedVoice, setSelectedVoice] = useState("af_heart") const audioRef = useRef(null) @@ -131,10 +132,10 @@ export default function SpeechSynthesis() { if (!audioRef.current) return setIsGenerating(true) - const API_BASE_URL = process.env.NEXT_PUBLIC_API_BASE_URL || 'http://localhost'; - const API_PORT = process.env.NEXT_PUBLIC_API_PORT || '8000'; + const API_BASE_URL = process.env.NEXT_PUBLIC_API_BASE_URL || "http://localhost" + const API_PORT = process.env.NEXT_PUBLIC_API_PORT || "8000" - const voice = (model.includes("marvis") ? "conversational_a" : "af_heart"); + const voice = isMarvisModel(model) ? "conversational_a" : selectedVoice; try { const response = await fetch(`${API_BASE_URL}:${API_PORT}/v1/audio/speech`, { @@ -333,7 +334,12 @@ export default function SpeechSynthesis() { - + {!isMarvisModel(baseModel) && !baseModel.includes("Spark") && ( + + )}
@@ -479,7 +485,7 @@ export default function SpeechSynthesis() {
- {selectedVoice}: {text.length > 20 ? text.substring(0, 20) + "..." : text} + {getVoiceDisplayName(selectedVoice)}: {text.length > 20 ? text.substring(0, 20) + "..." : text}
How did this sound?
diff --git a/mlx_audio/ui/components/voice-library.tsx b/mlx_audio/ui/components/voice-library.tsx index 937c0b615..f5067434a 100644 --- a/mlx_audio/ui/components/voice-library.tsx +++ b/mlx_audio/ui/components/voice-library.tsx @@ -1,9 +1,7 @@ "use client" -import type React from "react" - -import { useState, useEffect } from "react" -import { Bookmark, ChevronDown, Play } from "lucide-react" +import { useState, useEffect, type MouseEvent } from "react" +import { Play } from "lucide-react" type Voice = { id: string @@ -13,113 +11,120 @@ type Voice = { age: string accent: string region: string - isSelected?: boolean tags?: string[] } const voices: Voice[] = [ { - id: "trustworthy-man", - name: "Trustworthy Man", + id: "af_heart", + name: "Heart", language: "English", - gender: "Male", + gender: "Female", age: "Adult", - accent: "Resonate", - region: "EN-US (General)", - isSelected: true, + accent: "Warm", + region: "EN-US (American)", }, { - id: "expressive-narrator", - name: "Expressive Narrator", + id: "af_bella", + name: "Bella", language: "English", - gender: "Male", + gender: "Female", age: "Adult", - accent: "Audiobook", - region: "EN-British", + accent: "Bright", + region: "EN-US (American)", }, { - id: "radiant-girl", - name: "Radiant Girl", + id: "af_nova", + name: "Nova", language: "English", gender: "Female", - age: "Young Adult", - accent: "Lively", - region: "EN-US (General)", - }, - { - id: "magnetic-voiced-male", - name: "Magnetic-voiced Male", - language: "English", - gender: "Male", age: "Adult", - accent: "Ad", - region: "EN-US (General)", + accent: "Clear", + region: "EN-US (American)", }, { - id: "compelling-lady", - name: "Compelling Lady", + id: "af_sky", + name: "Sky", language: "English", gender: "Female", - age: "Adult", - accent: "Broadcast", - region: "EN-British", + age: "Young Adult", + accent: "Lively", + region: "EN-US (American)", }, { - id: "aussie-bloke", - name: "Aussie Bloke", + id: "am_adam", + name: "Adam", language: "English", gender: "Male", age: "Adult", - accent: "Bright", - region: "EN-Australian", + accent: "Deep", + region: "EN-US (American)", }, { - id: "captivating-female", - name: "Captivating Female", + id: "am_echo", + name: "Echo", language: "English", - gender: "Female", + gender: "Male", age: "Adult", - accent: "News Report", - region: "EN-US (General)", + accent: "Resonant", + region: "EN-US (American)", }, { - id: "upbeat-woman", - name: "Upbeat Woman", + id: "bf_alice", + name: "Alice", language: "English", gender: "Female", age: "Adult", - accent: "Bright", - region: "EN-US (General)", + accent: "Refined", + region: "EN-British", }, { - id: "calm-woman", - name: "Calm Woman", + id: "bf_emma", + name: "Emma", language: "English", gender: "Female", age: "Adult", - accent: "Audiobook", - region: "EN-US (General)", + accent: "Clear", + region: "EN-British", }, { - id: "upset-girl", - name: "Upset Girl", + id: "bm_daniel", + name: "Daniel", language: "English", - gender: "Female", - age: "Young Adult", - accent: "Sad", + gender: "Male", + age: "Adult", + accent: "Deep", region: "EN-British", }, { - id: "gentle-voiced-man", - name: "Gentle-voiced Man", + id: "bm_george", + name: "George", language: "English", gender: "Male", age: "Adult", - accent: "Resonate", - region: "EN-US (General)", + accent: "Warm", + region: "EN-British", }, ] +export function getVoiceDisplayName(voiceId: string): string { + const voice = voices.find(v => v.id === voiceId) + return voice?.name || voiceId +} + +export const VOICE_GRADIENT_COLORS: Record = { + af_heart: "from-pink-400 to-rose-500", + af_bella: "from-purple-400 to-pink-500", + af_nova: "from-sky-400 to-blue-500", + af_sky: "from-cyan-400 to-sky-500", + am_adam: "from-blue-400 to-indigo-600", + am_echo: "from-indigo-400 to-purple-500", + bf_alice: "from-rose-400 to-pink-500", + bf_emma: "from-amber-400 to-orange-500", + bm_daniel: "from-slate-400 to-gray-600", + bm_george: "from-teal-400 to-emerald-500", +} + interface VoiceLibraryProps { onClose?: () => void onSelectVoice?: (voice: string) => void @@ -135,57 +140,28 @@ export function VoiceLibrary({ }: VoiceLibraryProps) { const [activeTab, setActiveTab] = useState<"library" | "my-voices">("library") const [selectedVoice, setSelectedVoice] = useState( - initialSelectedVoice - ? voices.find((v) => v.name === initialSelectedVoice)?.id || "trustworthy-man" - : "trustworthy-man", + initialSelectedVoice || "af_heart", ) const [language, setLanguage] = useState("") const [accent, setAccent] = useState("") const [gender, setGender] = useState("") const [age, setAge] = useState("") - const [bookmarkedVoices, setBookmarkedVoices] = useState([]) const [isCloneModalOpen, setIsCloneModalOpen] = useState(false) useEffect(() => { if (initialSelectedVoice) { - const voiceId = voices.find((v) => v.name === initialSelectedVoice)?.id - if (voiceId) { - setSelectedVoice(voiceId) - } + setSelectedVoice(initialSelectedVoice) } }, [initialSelectedVoice]) - const getGradientForVoice = (voiceId: string) => { - // Map of voice IDs to gradient classes - const gradientMap: Record = { - "trustworthy-man": "bg-gradient-to-br from-blue-400 to-indigo-600", - "expressive-narrator": "bg-gradient-to-br from-purple-400 to-indigo-500", - "radiant-girl": "bg-gradient-to-br from-pink-400 to-orange-300", - "magnetic-voiced-male": "bg-gradient-to-br from-sky-400 to-blue-600", - "compelling-lady": "bg-gradient-to-br from-rose-400 to-red-500", - "aussie-bloke": "bg-gradient-to-br from-amber-400 to-orange-500", - "captivating-female": "bg-gradient-to-br from-teal-400 to-emerald-500", - "upbeat-woman": "bg-gradient-to-br from-green-400 to-emerald-500", - "calm-woman": "bg-gradient-to-br from-indigo-400 to-purple-500", - "upset-girl": "bg-gradient-to-br from-rose-300 to-pink-500", - "gentle-voiced-man": "bg-gradient-to-br from-cyan-400 to-blue-500", - } - - // Return the gradient class or a default gradient if not found - return gradientMap[voiceId] || "bg-gradient-to-br from-gray-400 to-gray-600" - } + const getGradientForVoice = (voiceId: string) => + `bg-gradient-to-br ${VOICE_GRADIENT_COLORS[voiceId] || "from-gray-400 to-gray-600"}` const handleSelectVoice = (voiceId: string) => { setSelectedVoice(voiceId) - // Get the voice name from the voices array - const selectedVoiceName = voices.find((v) => v.id === voiceId)?.name || "Trustworthy Man" - - // Call the onSelectVoice callback if provided if (onSelectVoice) { - onSelectVoice(selectedVoiceName) + onSelectVoice(voiceId) } - - // In a real app, this would update the selected voice in the parent component if (onClose) { setTimeout(() => { onClose() @@ -193,27 +169,12 @@ export function VoiceLibrary({ } } - const handleBookmark = (e: React.MouseEvent, voiceId: string) => { - e.stopPropagation() - setBookmarkedVoices((prev) => (prev.includes(voiceId) ? prev.filter((id) => id !== voiceId) : [...prev, voiceId])) - } - - const handleUseVoice = (e: React.MouseEvent, voiceId: string) => { + const handleUseVoice = (e: MouseEvent, voiceId: string) => { e.stopPropagation() - // Set the selected voice setSelectedVoice(voiceId) - - // Get the voice name from the voices array - const selectedVoiceName = voices.find((v) => v.id === voiceId)?.name || "Trustworthy Man" - - // Call the onSelectVoice callback if provided if (onSelectVoice) { - onSelectVoice(selectedVoiceName) + onSelectVoice(voiceId) } - - // Provide visual feedback - const voiceName = voices.find((v) => v.id === voiceId)?.name - console.log(`Voice selected: ${voiceName}`) } const handleCreateVoice = () => { @@ -223,11 +184,8 @@ export function VoiceLibrary({ return (
console.log("Current selected voice:", selectedVoice)} style={{ display: "grid", gridTemplateRows: "auto 1fr", height: "100%" }} > - -
{activeTab === "library" ? ( @@ -275,12 +233,6 @@ export function VoiceLibrary({ Use )} -
)) diff --git a/mlx_audio/ui/components/voice-selection.tsx b/mlx_audio/ui/components/voice-selection.tsx index d58f9b52a..88eece621 100644 --- a/mlx_audio/ui/components/voice-selection.tsx +++ b/mlx_audio/ui/components/voice-selection.tsx @@ -1,7 +1,7 @@ "use client" import { useState, useEffect } from "react" -import { VoiceLibrary } from "@/components/voice-library" +import { VoiceLibrary, getVoiceDisplayName, VOICE_GRADIENT_COLORS } from "@/components/voice-library" import { Settings } from "lucide-react" interface VoiceSelectionProps { @@ -12,17 +12,12 @@ interface VoiceSelectionProps { export function VoiceSelection({ onVoiceChange, - initialVoice = "Trustworthy Man", + initialVoice = "af_heart", className = "mb-6", }: VoiceSelectionProps) { const [isModalOpen, setIsModalOpen] = useState(false) const [selectedVoice, setSelectedVoice] = useState(initialVoice) - useEffect(() => { - // This ensures the UI updates when the selected voice changes - console.log("Selected voice updated:", selectedVoice) - }, [selectedVoice]) - useEffect(() => { // Update selected voice if initialVoice prop changes if (initialVoice) { @@ -30,23 +25,6 @@ export function VoiceSelection({ } }, [initialVoice]) - // Helper function to determine gradient based on voice name - const getGradientForVoice = (name: string) => { - if (name.includes("Man") || name.includes("Male")) { - return "from-blue-400 to-indigo-600" - } else if (name.includes("Girl") || name.includes("Female")) { - return "from-pink-400 to-orange-300" - } else if (name.includes("Narrator")) { - return "from-purple-400 to-indigo-500" - } else if (name.includes("Compelling")) { - return "from-rose-400 to-red-500" - } else if (name.includes("Magnetic")) { - return "from-sky-400 to-blue-600" - } else { - return "from-gray-400 to-gray-600" - } - } - const handleVoiceChange = (voice: string) => { setSelectedVoice(voice) if (onVoiceChange) { @@ -56,7 +34,7 @@ export function VoiceSelection({ } const handleResetVoice = () => { - const defaultVoice = "Trustworthy Man" + const defaultVoice = "af_heart" setSelectedVoice(defaultVoice) if (onVoiceChange) { onVoiceChange(defaultVoice) @@ -70,12 +48,12 @@ export function VoiceSelection({
- {selectedVoice} + {getVoiceDisplayName(selectedVoice)}
English diff --git a/pyproject.toml b/pyproject.toml index 99f4727c1..ebcc9577e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ dependencies = [ "numba>=0.60.0", # Required by librosa "librosa==0.11.0", "protobuf>=6.33.5", + "python-multipart>=0.0.22", ] [project.optional-dependencies] diff --git a/uv.lock b/uv.lock index bfc4e85ea..57e6ed165 100644 --- a/uv.lock +++ b/uv.lock @@ -1321,6 +1321,7 @@ dependencies = [ { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "protobuf" }, { name = "pyloudnorm" }, + { name = "python-multipart" }, { name = "sounddevice" }, { name = "tqdm" }, { name = "transformers" }, @@ -1419,6 +1420,7 @@ requires-dist = [ { name = "pyloudnorm", specifier = ">=0.2.0" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=7.0.0" }, { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=1.0.0" }, + { name = "python-multipart", specifier = ">=0.0.22" }, { name = "python-multipart", marker = "extra == 'server'", specifier = ">=0.0.22" }, { name = "sentencepiece", marker = "extra == 'all'", specifier = ">=0.2.0" }, { name = "sentencepiece", marker = "extra == 'sts'", specifier = ">=0.2.0" },