feat(_kokoro_tts): replace local Kokoro pipeline with remote API synthesis and add audio format config

Draco-Lunaris · Draco-Lunaris · commit 9cd5be8bf41c · 2026-06-03T16:04:15.000-05:00
diff --git a/plugins/_kokoro_tts/api/status.py b/plugins/_kokoro_tts/api/status.py
@@ -1,4 +1,6 @@
-import importlib.metadata
+from __future__ import annotations
+
+import aiohttp
 
 from helpers.api import ApiHandler, Request, Response
 from plugins._kokoro_tts.helpers import migration, runtime
@@ -8,24 +10,34 @@ class Status(ApiHandler):
     async def process(self, input: dict, request: Request) -> dict | Response:
         migration.ensure_migrated()
 
-        package_version = ""
-        package_error = ""
-        try:
-            package_version = importlib.metadata.version("kokoro")
-        except Exception as e:
-            package_error = str(e)
+        cfg = runtime.get_config()
+        remote_url = cfg.get("remote_url", "")
+
+        remote_healthy = False
+        remote_error = ""
+        if remote_url:
+            try:
+                async with aiohttp.ClientSession() as session:
+                    async with session.get(
+                        f"{remote_url}/health",
+                        timeout=aiohttp.ClientTimeout(total=5),
+                    ) as resp:
+                        remote_healthy = resp.status == 200
+            except Exception as e:
+                remote_error = str(e)
 
         return {
             "plugin": "_kokoro_tts",
             "enabled": runtime.is_globally_enabled(),
-            "config": runtime.get_config(),
+            "config": cfg,
             "model": {
-                "ready": await runtime.is_downloaded(),
-                "loading": await runtime.is_downloading(),
+                "ready": remote_healthy,
+                "loading": False,
             },
-            "package": {
-                "version": package_version,
-                "error": package_error,
+            "remote": {
+                "url": remote_url,
+                "healthy": remote_healthy,
+                "error": remote_error,
             },
             "fallback": "Browser-native speechSynthesis remains the fallback when Kokoro is disabled.",
         }
diff --git a/plugins/_kokoro_tts/api/synthesize.py b/plugins/_kokoro_tts/api/synthesize.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from helpers.api import ApiHandler, Request, Response
 from plugins._kokoro_tts.helpers import runtime
 
@@ -12,11 +14,11 @@ async def process(self, input: dict, request: Request) -> dict | Response:
             return Response(status=400, response="Missing text")
 
         try:
-            audio = await runtime.synthesize_sentences([text])
+            audio, mime_type = await runtime.synthesize_sentences([text])
             return {
                 "success": True,
                 "audio": audio,
-                "mime_type": "audio/wav",
+                "mime_type": mime_type,
             }
         except Exception as e:
             return {"success": False, "error": str(e)}
diff --git a/plugins/_kokoro_tts/default_config.yaml b/plugins/_kokoro_tts/default_config.yaml
@@ -1,2 +1,4 @@
-voice: am_puck,am_onyx
+voice: am_onyx+am_echo
 speed: 1.1
+remote_url: http://ares.moon-dragon.us:18890
+response_format: mp3
diff --git a/plugins/_kokoro_tts/helpers/runtime.py b/plugins/_kokoro_tts/helpers/runtime.py
@@ -2,34 +2,32 @@
 
 import asyncio
 import base64
-import io
-import warnings
 from typing import Any
 
-import soundfile as sf
+import aiohttp
 
 from helpers import plugins
-from helpers.notification import (
-    NotificationManager,
-    NotificationPriority,
-    NotificationType,
-)
 from helpers.print_style import PrintStyle
 from plugins._kokoro_tts.helpers import migration
 
 
-warnings.filterwarnings("ignore", category=FutureWarning)
-warnings.filterwarnings("ignore", category=UserWarning)
-
-
 PLUGIN_NAME = "_kokoro_tts"
 DEFAULT_CONFIG = {
-    "voice": "am_puck,am_onyx",
+    "voice": "am_onyx+am_echo",
     "speed": 1.1,
+    "remote_url": "http://ares.moon-dragon.us:18890",
+    "response_format": "mp3",
 }
 
-_pipeline = None
-is_updating_model = False
+VALID_FORMATS = {"wav", "mp3", "opus", "flac"}
+MIME_TYPES = {
+    "wav": "audio/wav",
+    "mp3": "audio/mpeg",
+    "opus": "audio/opus",
+    "flac": "audio/flac",
+}
+
+_remote_healthy: bool | None = None
 
 
 def normalize_config(config: dict[str, Any] | None) -> dict[str, Any]:
@@ -48,6 +46,14 @@ def normalize_config(config: dict[str, Any] | None) -> dict[str, Any]:
     except (TypeError, ValueError):
         pass
 
+    remote_url = str(config.get("remote_url", normalized["remote_url"]) or "").strip()
+    if remote_url:
+        normalized["remote_url"] = remote_url.rstrip("/")
+
+    response_format = str(config.get("response_format", normalized["response_format"]) or "").strip().lower()
+    if response_format in VALID_FORMATS:
+        normalized["response_format"] = response_format
+
     return normalized
 
 
@@ -68,79 +74,77 @@ async def preload(config: dict[str, Any] | None = None):
 
 
 async def _preload():
-    global _pipeline, is_updating_model
-
-    while is_updating_model:
-        await asyncio.sleep(0.1)
-
+    global _remote_healthy
     try:
-        is_updating_model = True
-        if not _pipeline:
-            NotificationManager.send_notification(
-                NotificationType.INFO,
-                NotificationPriority.NORMAL,
-                "Loading Kokoro TTS model...",
-                display_time=99,
-                group="kokoro-preload",
-            )
-            PrintStyle.standard("Loading Kokoro TTS model...")
-            from kokoro import KPipeline
-
-            _pipeline = KPipeline(lang_code="a", repo_id="hexgrad/Kokoro-82M")
-            NotificationManager.send_notification(
-                NotificationType.INFO,
-                NotificationPriority.NORMAL,
-                "Kokoro TTS model loaded.",
-                display_time=2,
-                group="kokoro-preload",
-            )
-    finally:
-        is_updating_model = False
+        cfg = get_config()
+        remote_url = cfg.get("remote_url", DEFAULT_CONFIG["remote_url"])
+        async with aiohttp.ClientSession() as session:
+            async with session.get(
+                f"{remote_url}/health",
+                timeout=aiohttp.ClientTimeout(total=5),
+            ) as resp:
+                _remote_healthy = resp.status == 200
+        if _remote_healthy:
+            PrintStyle.standard("Kokoro TTS remote API is healthy.")
+        else:
+            PrintStyle.error(f"Kokoro TTS remote API unhealthy: status {resp.status}")
+    except Exception as e:
+        _remote_healthy = False
+        PrintStyle.error(f"Kokoro TTS remote API check failed: {e}")
 
 
 async def is_downloading() -> bool:
-    return is_updating_model
+    return False
 
 
 async def is_downloaded() -> bool:
-    return _pipeline is not None
+    if _remote_healthy is None:
+        await _preload()
+    return _remote_healthy is True
 
 
 async def synthesize_sentences(
     sentences: list[str], config: dict[str, Any] | None = None
-) -> str:
+) -> tuple[str, str]:
     cfg = normalize_config(config or get_config())
     return await _synthesize_sentences(
         sentences,
         voice=str(cfg["voice"]),
         speed=float(cfg["speed"]),
+        remote_url=str(cfg["remote_url"]),
+        response_format=str(cfg["response_format"]),
     )
 
 
 async def _synthesize_sentences(
-    sentences: list[str], *, voice: str, speed: float
-) -> str:
-    await _preload()
-
-    combined_audio: list[float] = []
+    sentences: list[str],
+    *,
+    voice: str,
+    speed: float,
+    remote_url: str,
+    response_format: str,
+) -> tuple[str, str]:
+    text = " ".join(s.strip() for s in sentences if s.strip())
+    if not text:
+        return "", MIME_TYPES.get(response_format, "audio/mpeg")
 
     try:
-        for sentence in sentences:
-            if not sentence.strip():
-                continue
-
-            segments = _pipeline(sentence.strip(), voice=voice, speed=speed)  # type: ignore[misc]
-            for segment in list(segments):
-                audio_tensor = segment.audio
-                audio_numpy = audio_tensor.detach().cpu().numpy()  # type: ignore[union-attr]
-                combined_audio.extend(audio_numpy.tolist())
-
-        if not combined_audio:
-            return ""
-
-        buffer = io.BytesIO()
-        sf.write(buffer, combined_audio, 24000, format="WAV")
-        return base64.b64encode(buffer.getvalue()).decode("utf-8")
+        async with aiohttp.ClientSession() as session:
+            async with session.post(
+                f"{remote_url}/v1/audio/speech",
+                json={
+                    "model": "kokoro",
+                    "input": text,
+                    "voice": voice,
+                    "response_format": response_format,
+                    "speed": speed,
+                },
+                timeout=aiohttp.ClientTimeout(total=30),
+            ) as resp:
+                resp.raise_for_status()
+                audio_bytes = await resp.read()
+                mime_type = MIME_TYPES.get(response_format, "audio/mpeg")
+                return base64.b64encode(audio_bytes).decode("utf-8"), mime_type
     except Exception as e:
-        PrintStyle.error(f"Error in Kokoro TTS synthesis: {e}")
+        PrintStyle.error(f"Error in remote Kokoro TTS synthesis: {e}")
         raise
diff --git a/plugins/_kokoro_tts/webui/config.html b/plugins/_kokoro_tts/webui/config.html
@@ -9,20 +9,46 @@
       <div class="plugin-config-page">
         <div class="section-title">Kokoro TTS</div>
         <div class="section-description">
-          Configure the built-in Kokoro voice provider. When this plugin is disabled,
-          spoken output falls back to the browser speech API.
+          Configure the Kokoro voice provider. Synthesis is handled by a remote
+          Kokoro-FastAPI service. When this plugin is disabled, spoken output
+          falls back to the browser speech API.
+        </div>
+
+        <div class="field">
+          <div class="field-label">
+            <div class="field-title">Remote URL</div>
+            <div class="field-description">URL of the Kokoro-FastAPI service (e.g. http://ares.moon-dragon.us:18890).</div>
+          </div>
+          <div class="field-control">
+            <input type="text" x-model="config.remote_url" />
+          </div>
         </div>
 
         <div class="field">
           <div class="field-label">
             <div class="field-title">Voice</div>
-            <div class="field-description">Kokoro voice identifier passed to the backend pipeline.</div>
+            <div class="field-description">Kokoro voice identifier (e.g. af_bella or am_onyx). Join identifiers with + to blend voices (e.g. am_onyx+am_echo).</div>
           </div>
           <div class="field-control">
             <input type="text" x-model="config.voice" />
           </div>
         </div>
 
+        <div class="field">
+          <div class="field-label">
+            <div class="field-title">Audio Format</div>
+            <div class="field-description">Output format for synthesized audio. MP3 recommended for smaller file size.</div>
+          </div>
+          <div class="field-control">
+            <select x-model="config.response_format">
+              <option value="mp3">MP3 (recommended)</option>
+              <option value="wav">WAV (uncompressed)</option>
+              <option value="opus">Opus (low bitrate)</option>
+              <option value="flac">FLAC (lossless)</option>
+            </select>
+          </div>
+        </div>
+
         <div class="field">
           <div class="field-label">
             <div class="field-title">Speed</div>
diff --git a/plugins/_kokoro_tts/webui/kokoro-tts-store.js b/plugins/_kokoro_tts/webui/kokoro-tts-store.js
@@ -14,10 +14,12 @@ const model = {
   config: {
     voice: "",
     speed: 1.1,
+    remote_url: "",
+    response_format: "mp3",
   },
   modelReady: false,
   modelLoading: false,
-  packageVersion: "",
+  remoteHealthy: false,
   providerCleanup: null,
 
   async initRuntime() {
@@ -42,10 +44,12 @@ const model = {
       this.config = {
         voice: status?.config?.voice || "",
         speed: Number(status?.config?.speed || 1.1),
+        remote_url: status?.config?.remote_url || "",
+        response_format: status?.config?.response_format || "mp3",
       };
       this.modelReady = !!status?.model?.ready;
       this.modelLoading = !!status?.model?.loading;
-      this.packageVersion = status?.package?.version || "";
+      this.remoteHealthy = !!status?.remote?.healthy;
 
       if (this.enabled) {
         this.registerProvider();
@@ -77,7 +81,7 @@ const model = {
 
         return {
           audioBase64: result.audio || "",
-          mimeType: result.mime_type || "audio/wav",
+          mimeType: result.mime_type || "audio/mpeg",
         };
       },
     });
diff --git a/plugins/_kokoro_tts/webui/main.html b/plugins/_kokoro_tts/webui/main.html