Skip to content

Commit 9cd5be8

Browse files
committed
feat(_kokoro_tts): add audio format config with remote API support
1 parent 143ff8f commit 9cd5be8

7 files changed

Lines changed: 152 additions & 95 deletions

File tree

plugins/_kokoro_tts/api/status.py

Lines changed: 25 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,6 @@
1-
import importlib.metadata
1+
from __future__ import annotations
2+
3+
import aiohttp
24

35
from helpers.api import ApiHandler, Request, Response
46
from plugins._kokoro_tts.helpers import migration, runtime
@@ -8,24 +10,34 @@ class Status(ApiHandler):
810
async def process(self, input: dict, request: Request) -> dict | Response:
911
migration.ensure_migrated()
1012

11-
package_version = ""
12-
package_error = ""
13-
try:
14-
package_version = importlib.metadata.version("kokoro")
15-
except Exception as e:
16-
package_error = str(e)
13+
cfg = runtime.get_config()
14+
remote_url = cfg.get("remote_url", "")
15+
16+
remote_healthy = False
17+
remote_error = ""
18+
if remote_url:
19+
try:
20+
async with aiohttp.ClientSession() as session:
21+
async with session.get(
22+
f"{remote_url}/health",
23+
timeout=aiohttp.ClientTimeout(total=5),
24+
) as resp:
25+
remote_healthy = resp.status == 200
26+
except Exception as e:
27+
remote_error = str(e)
1728

1829
return {
1930
"plugin": "_kokoro_tts",
2031
"enabled": runtime.is_globally_enabled(),
21-
"config": runtime.get_config(),
32+
"config": cfg,
2233
"model": {
23-
"ready": await runtime.is_downloaded(),
24-
"loading": await runtime.is_downloading(),
34+
"ready": remote_healthy,
35+
"loading": False,
2536
},
26-
"package": {
27-
"version": package_version,
28-
"error": package_error,
37+
"remote": {
38+
"url": remote_url,
39+
"healthy": remote_healthy,
40+
"error": remote_error,
2941
},
3042
"fallback": "Browser-native speechSynthesis remains the fallback when Kokoro is disabled.",
3143
}

plugins/_kokoro_tts/api/synthesize.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,5 @@
1+
from __future__ import annotations
2+
13
from helpers.api import ApiHandler, Request, Response
24
from plugins._kokoro_tts.helpers import runtime
35

@@ -12,11 +14,11 @@ async def process(self, input: dict, request: Request) -> dict | Response:
1214
return Response(status=400, response="Missing text")
1315

1416
try:
15-
audio = await runtime.synthesize_sentences([text])
17+
audio, mime_type = await runtime.synthesize_sentences([text])
1618
return {
1719
"success": True,
1820
"audio": audio,
19-
"mime_type": "audio/wav",
21+
"mime_type": mime_type,
2022
}
2123
except Exception as e:
2224
return {"success": False, "error": str(e)}
Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,2 +1,4 @@
1-
voice: am_puck,am_onyx
1+
voice: am_onyx+am_echo
22
speed: 1.1
3+
remote_url: http://ares.moon-dragon.us:18890
4+
response_format: mp3

plugins/_kokoro_tts/helpers/runtime.py

Lines changed: 71 additions & 67 deletions
Original file line number | Diff line number | Diff line change
@@ -2,34 +2,32 @@
22

33
import asyncio
44
import base64
5-
import io
6-
import warnings
75
from typing import Any
86

9-
import soundfile as sf
7+
import aiohttp
108

119
from helpers import plugins
12-
from helpers.notification import (
13-
NotificationManager,
14-
NotificationPriority,
15-
NotificationType,
16-
)
1710
from helpers.print_style import PrintStyle
1811
from plugins._kokoro_tts.helpers import migration
1912

2013

21-
warnings.filterwarnings("ignore", category=FutureWarning)
22-
warnings.filterwarnings("ignore", category=UserWarning)
23-
24-
2514
PLUGIN_NAME = "_kokoro_tts"
2615
DEFAULT_CONFIG = {
27-
"voice": "am_puck,am_onyx",
16+
"voice": "am_onyx+am_echo",
2817
"speed": 1.1,
18+
"remote_url": "http://ares.moon-dragon.us:18890",
19+
"response_format": "mp3",
2920
}
3021

31-
_pipeline = None
32-
is_updating_model = False
22+
VALID_FORMATS = {"wav", "mp3", "opus", "flac"}
23+
MIME_TYPES = {
24+
"wav": "audio/wav",
25+
"mp3": "audio/mpeg",
26+
"opus": "audio/opus",
27+
"flac": "audio/flac",
28+
}
29+
30+
_remote_healthy: bool | None = None
3331

3432

3533
def normalize_config(config: dict[str, Any] | None) -> dict[str, Any]:
@@ -48,6 +46,14 @@ def normalize_config(config: dict[str, Any] | None) -> dict[str, Any]:
4846
except (TypeError, ValueError):
4947
pass
5048

49+
remote_url = str(config.get("remote_url", normalized["remote_url"]) or "").strip()
50+
if remote_url:
51+
normalized["remote_url"] = remote_url.rstrip("/")
52+
53+
response_format = str(config.get("response_format", normalized["response_format"]) or "").strip().lower()
54+
if response_format in VALID_FORMATS:
55+
normalized["response_format"] = response_format
56+
5157
return normalized
5258

5359

@@ -68,79 +74,77 @@ async def preload(config: dict[str, Any] | None = None):
6874

6975

7076
async def _preload():
71-
global _pipeline, is_updating_model
72-
73-
while is_updating_model:
74-
await asyncio.sleep(0.1)
75-
77+
global _remote_healthy
7678
try:
77-
is_updating_model = True
78-
if not _pipeline:
79-
NotificationManager.send_notification(
80-
NotificationType.INFO,
81-
NotificationPriority.NORMAL,
82-
"Loading Kokoro TTS model...",
83-
display_time=99,
84-
group="kokoro-preload",
85-
)
86-
PrintStyle.standard("Loading Kokoro TTS model...")
87-
from kokoro import KPipeline
88-
89-
_pipeline = KPipeline(lang_code="a", repo_id="hexgrad/Kokoro-82M")
90-
NotificationManager.send_notification(
91-
NotificationType.INFO,
92-
NotificationPriority.NORMAL,
93-
"Kokoro TTS model loaded.",
94-
display_time=2,
95-
group="kokoro-preload",
96-
)
97-
finally:
98-
is_updating_model = False
79+
cfg = get_config()
80+
remote_url = cfg.get("remote_url", DEFAULT_CONFIG["remote_url"])
81+
async with aiohttp.ClientSession() as session:
82+
async with session.get(
83+
f"{remote_url}/health",
84+
timeout=aiohttp.ClientTimeout(total=5),
85+
) as resp:
86+
_remote_healthy = resp.status == 200
87+
if _remote_healthy:
88+
PrintStyle.standard("Kokoro TTS remote API is healthy.")
89+
else:
90+
PrintStyle.error(f"Kokoro TTS remote API unhealthy: status {resp.status}")
91+
except Exception as e:
92+
_remote_healthy = False
93+
PrintStyle.error(f"Kokoro TTS remote API check failed: {e}")
9994

10095

10196
async def is_downloading() -> bool:
102-
return is_updating_model
97+
return False
10398

10499

105100
async def is_downloaded() -> bool:
106-
return _pipeline is not None
101+
if _remote_healthy is None:
102+
await _preload()
103+
return _remote_healthy is True
107104

108105

109106
async def synthesize_sentences(
110107
sentences: list[str], config: dict[str, Any] | None = None
111-
) -> str:
108+
) -> tuple[str, str]:
112109
cfg = normalize_config(config or get_config())
113110
return await _synthesize_sentences(
114111
sentences,
115112
voice=str(cfg["voice"]),
116113
speed=float(cfg["speed"]),
114+
remote_url=str(cfg["remote_url"]),
115+
response_format=str(cfg["response_format"]),
117116
)
118117

119118

120119
async def _synthesize_sentences(
121-
sentences: list[str], *, voice: str, speed: float
122-
) -> str:
123-
await _preload()
124-
125-
combined_audio: list[float] = []
120+
sentences: list[str],
121+
*,
122+
voice: str,
123+
speed: float,
124+
remote_url: str,
125+
response_format: str,
126+
) -> tuple[str, str]:
127+
text = " ".join(s.strip() for s in sentences if s.strip())
128+
if not text:
129+
return "", MIME_TYPES.get(response_format, "audio/mpeg")
126130

127131
try:
128-
for sentence in sentences:
129-
if not sentence.strip():
130-
continue
131-
132-
segments = _pipeline(sentence.strip(), voice=voice, speed=speed) # type: ignore[misc]
133-
for segment in list(segments):
134-
audio_tensor = segment.audio
135-
audio_numpy = audio_tensor.detach().cpu().numpy() # type: ignore[union-attr]
136-
combined_audio.extend(audio_numpy.tolist())
137-
138-
if not combined_audio:
139-
return ""
140-
141-
buffer = io.BytesIO()
142-
sf.write(buffer, combined_audio, 24000, format="WAV")
143-
return base64.b64encode(buffer.getvalue()).decode("utf-8")
132+
async with aiohttp.ClientSession() as session:
133+
async with session.post(
134+
f"{remote_url}/v1/audio/speech",
135+
json={
136+
"model": "kokoro",
137+
"input": text,
138+
"voice": voice,
139+
"response_format": response_format,
140+
"speed": speed,
141+
},
142+
timeout=aiohttp.ClientTimeout(total=30),
143+
) as resp:
144+
resp.raise_for_status()
145+
audio_bytes = await resp.read()
146+
mime_type = MIME_TYPES.get(response_format, "audio/mpeg")
147+
return base64.b64encode(audio_bytes).decode("utf-8"), mime_type
144148
except Exception as e:
145-
PrintStyle.error(f"Error in Kokoro TTS synthesis: {e}")
149+
PrintStyle.error(f"Error in remote Kokoro TTS synthesis: {e}")
146150
raise

plugins/_kokoro_tts/webui/config.html

Lines changed: 29 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -9,20 +9,46 @@
99
<div class="plugin-config-page">
1010
<div class="section-title">Kokoro TTS</div>
1111
<div class="section-description">
12-
Configure the built-in Kokoro voice provider. When this plugin is disabled,
13-
spoken output falls back to the browser speech API.
12+
Configure the Kokoro voice provider. Synthesis is handled by a remote
13+
Kokoro-FastAPI service. When disabled, spoken output falls back to the
14+
browser speech API.
15+
</div>
16+
17+
<div class="field">
18+
<div class="field-label">
19+
<div class="field-title">Remote URL</div>
20+
<div class="field-description">URL of the Kokoro-FastAPI service (e.g. http://ares.moon-dragon.us:18890).</div>
21+
</div>
22+
<div class="field-control">
23+
<input type="text" x-model="config.remote_url" />
24+
</div>
1425
</div>
1526

1627
<div class="field">
1728
<div class="field-label">
1829
<div class="field-title">Voice</div>
19-
<div class="field-description">Kokoro voice identifier passed to the backend pipeline.</div>
30+
<div class="field-description">Kokoro voice identifier (e.g. af_bella, am_onyx, am_onyx+am_echo for blending).</div>
2031
</div>
2132
<div class="field-control">
2233
<input type="text" x-model="config.voice" />
2334
</div>
2435
</div>
2536

37+
<div class="field">
38+
<div class="field-label">
39+
<div class="field-title">Audio Format</div>
40+
<div class="field-description">Output format for synthesized audio. MP3 recommended for smaller file size.</div>
41+
</div>
42+
<div class="field-control">
43+
<select x-model="config.response_format">
44+
<option value="mp3">MP3 (recommended)</option>
45+
<option value="wav">WAV (uncompressed)</option>
46+
<option value="opus">Opus (low bitrate)</option>
47+
<option value="flac">FLAC (lossless)</option>
48+
</select>
49+
</div>
50+
</div>
51+
2652
<div class="field">
2753
<div class="field-label">
2854
<div class="field-title">Speed</div>

plugins/_kokoro_tts/webui/kokoro-tts-store.js

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -14,10 +14,12 @@ const model = {
1414
config: {
1515
voice: "",
1616
speed: 1.1,
17+
remote_url: "",
18+
response_format: "mp3",
1719
},
1820
modelReady: false,
1921
modelLoading: false,
20-
packageVersion: "",
22+
remoteHealthy: false,
2123
providerCleanup: null,
2224

2325
async initRuntime() {
@@ -42,10 +44,12 @@ const model = {
4244
this.config = {
4345
voice: status?.config?.voice || "",
4446
speed: Number(status?.config?.speed || 1.1),
47+
remote_url: status?.config?.remote_url || "",
48+
response_format: status?.config?.response_format || "mp3",
4549
};
4650
this.modelReady = !!status?.model?.ready;
4751
this.modelLoading = !!status?.model?.loading;
48-
this.packageVersion = status?.package?.version || "";
52+
this.remoteHealthy = !!status?.remote?.healthy;
4953

5054
if (this.enabled) {
5155
this.registerProvider();
@@ -77,7 +81,7 @@ const model = {
7781

7882
return {
7983
audioBase64: result.audio || "",
80-
mimeType: result.mime_type || "audio/wav",
84+
mimeType: result.mime_type || "audio/mpeg",
8185
};
8286
},
8387
});

0 commit comments

Comments
 (0)