Skip to content

Commit 503ab62

Browse files
committed
Merge branch 'main-T11397-supertonic-tts-RUS' into 'main'
T11397: Add Supertonic TTS for Korean and Japanese. See merge request bizzappdev/ai/polytalkio/polytalk!5
2 parents 09e0833 + c50bb08 commit 503ab62

8 files changed

Lines changed: 317 additions & 25 deletions

File tree

.env.example

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,15 @@ TTS_MODEL=en_GB-jenny_dioco-medium
118118
# For external service, use: https://tts.your-domain.com
119119
TTS_BASE_URL=http://tts:5000
120120

121+
# Supertonic TTS is used for Japanese and Korean. First startup downloads
122+
# model assets into the supertonic_data Docker volume.
123+
SUPERTONIC_TTS_BASE_URL=http://supertonic-tts:7788
124+
SUPERTONIC_TTS_VOICE=F1
125+
SUPERTONIC_TTS_JA_VOICE=F1
126+
SUPERTONIC_TTS_KO_VOICE=F1
127+
SUPERTONIC_TTS_STEPS=8
128+
SUPERTONIC_TTS_SPEED=1.00
129+
121130
# ============================================================================
122131
# APPLICATION SETTINGS
123132
# ============================================================================

app/services/tts_service.py

Lines changed: 117 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ def __init__(self) -> None:
4444
self.mock_mode = self.config.get("mock_mode", True)
4545
self.provider = self.config.get("provider", "piper")
4646
self.base_url = self.config.get("base_url", "http://localhost:5000")
47+
self.providers = self.config.get("providers", {})
48+
self.language_providers = self.config.get("language_providers", {})
4749
self.voice = self.config.get("voice", "en_US-lessac-medium")
4850
self.timeout = self.config.get("timeout_seconds", 15)
4951
self.media_dir = get_config().media_output_dir
@@ -91,10 +93,12 @@ async def synthesize(
9193
return await self._mock_synthesize(text, language, output_path)
9294

9395
try:
94-
if self.provider == "piper":
96+
provider = self._get_provider_for_language(language)
97+
if provider == "piper":
9598
return await self._piper_synthesize(text, language, output_path)
96-
else:
97-
return await self._openai_synthesize(text, language, output_path)
99+
if provider == "supertonic":
100+
return await self._supertonic_synthesize(text, language, output_path)
101+
return await self._openai_synthesize(text, language, output_path)
98102
except Exception as e:
99103
logger.error(f"TTS synthesis failed: {e}")
100104
return TTSResult(success=False, error=str(e))
@@ -148,6 +152,46 @@ async def _mock_synthesize(
148152
logger.error(f"Mock TTS failed: {e}")
149153
return TTSResult(success=False, error=f"Mock TTS failed: {e}")
150154

155+
def _normalize_language(self, language: str) -> tuple[str, str]:
    """Return the normalized exact and base language codes.

    Args:
        language: Language tag such as ``"ja"``, ``"ja-JP"`` or ``"ja_JP"``.

    Returns:
        A ``(exact, base)`` tuple: ``exact`` is the tag with surrounding
        whitespace removed and hyphens replaced by underscores; ``base`` is
        the lower-cased part of ``exact`` before the first underscore.
    """
    # Strip stray whitespace first: a tag like " ja" would otherwise yield a
    # base code of " ja" and silently miss every per-language lookup.
    normalized_language = language.strip().replace("-", "_")
    lang_base = normalized_language.split("_")[0].lower()
    return normalized_language, lang_base
160+
161+
def _get_provider_for_language(self, language: str) -> str:
    """Resolve the TTS provider for a language, falling back to the default.

    Lookup order: exact normalized tag (e.g. ``ja_JP``), then base language
    (e.g. ``ja``), then ``self.provider``.

    Args:
        language: Language tag in hyphen or underscore form.

    Returns:
        The provider name configured for the language, or the default
        provider when no usable mapping exists.
    """
    normalized_language, lang_base = self._normalize_language(language)
    # An empty ``language_providers:`` key in YAML loads as None, and a
    # mapping value may itself be null/empty; treat both as "not configured"
    # instead of crashing or returning a non-provider value.
    language_providers = self.language_providers or {}
    return (
        language_providers.get(normalized_language)
        or language_providers.get(lang_base)
        or self.provider
    )
168+
169+
def _get_provider_config(self, provider: str) -> dict:
    """Return provider-specific config merged over the top-level TTS config.

    Args:
        provider: Provider name used as a key into ``self.providers``.

    Returns:
        A new dict containing every top-level key from ``self.config``, with
        keys from the provider's own section taking precedence.
    """
    # ``providers:`` or ``providers.<name>:`` left empty in YAML loads as
    # None; unpacking None with ``**`` raises TypeError, so coerce to {}.
    provider_config = (self.providers or {}).get(provider) or {}
    return {**self.config, **provider_config}
173+
174+
def _get_provider_base_url(self, provider: str) -> str:
    """Return the base URL for a provider, without a trailing slash.

    Args:
        provider: Provider name passed to ``_get_provider_config``.

    Returns:
        The provider's ``base_url`` (or ``self.base_url`` when the provider
        has none) with any trailing ``/`` removed.
    """
    provider_config = self._get_provider_config(provider)
    # ``.get(key, default)`` only covers a missing key; a key that is present
    # but null/empty would reach ``.rstrip`` as None (AttributeError) or
    # produce an empty URL, so fall back on any falsy value.
    base_url = provider_config.get("base_url") or self.base_url
    return base_url.rstrip("/")
178+
179+
@staticmethod
def _config_int(value: object, default: int) -> int:
    """Parse an integer config value with a safe fallback.

    Args:
        value: Raw config value (for example a string from an env-var
            substitution, a number, or None).
        default: Value returned when ``value`` cannot be converted.

    Returns:
        ``int(value)`` when the conversion succeeds, otherwise ``default``.
    """
    try:
        return int(value)
    # OverflowError: int() of an infinite float (e.g. YAML ``.inf``) raises
    # it rather than ValueError, which previously escaped the fallback.
    except (TypeError, ValueError, OverflowError):
        return default
186+
187+
@staticmethod
def _config_float(value: object, default: float) -> float:
    """Parse a finite float config value with a safe fallback.

    Args:
        value: Raw config value (for example a string from an env-var
            substitution, a number, or None).
        default: Value returned when ``value`` cannot be converted or is not
            a finite number.

    Returns:
        ``float(value)`` when it converts to a finite number, otherwise
        ``default``.
    """
    import math  # local import: the file's import block is not visible here

    try:
        parsed = float(value)
    except (TypeError, ValueError):
        return default
    # float() happily accepts "nan" / "inf"; neither is a meaningful setting
    # and neither is representable in strict JSON, so treat them as invalid.
    return parsed if math.isfinite(parsed) else default
194+
151195
async def _fetch_voices(self) -> dict:
152196
"""
153197
Fetch available voices from Piper TTS API with thread-safe caching.
@@ -182,7 +226,7 @@ async def _fetch_voices(self) -> dict:
182226

183227
try:
184228
response = await self._http_client.get(
185-
f"{self.base_url.rstrip('/')}/voices"
229+
f"{self._get_provider_base_url('piper')}/voices"
186230
)
187231
response.raise_for_status()
188232
self._voices_cache = response.json()
@@ -210,8 +254,7 @@ async def _get_voice_for_language(self, language: str) -> str:
210254
4. Base language match from API voices
211255
5. Default voice from config
212256
"""
213-
normalized_language = language.replace("-", "_")
214-
lang_base = normalized_language.split("_")[0].lower()
257+
normalized_language, lang_base = self._normalize_language(language)
215258
voices = await self._fetch_voices()
216259

217260
def voice_available(voice_name: str) -> bool:
@@ -276,8 +319,7 @@ def _get_length_scale_for_language(self, language: str, voice: str) -> float:
276319
lookup supports voice-specific, exact language, base language, then
277320
global default settings.
278321
"""
279-
normalized_language = language.replace("-", "_")
280-
lang_base = normalized_language.split("_")[0].lower()
322+
normalized_language, lang_base = self._normalize_language(language)
281323

282324
candidates = [
283325
voice,
@@ -306,7 +348,7 @@ async def _piper_synthesize(
306348
Returns:
307349
TTSResult with audio file path
308350
"""
309-
url = self.base_url.rstrip("/")
351+
url = self._get_provider_base_url("piper")
310352

311353
# Select voice dynamically from Piper TTS API
312354
voice = await self._get_voice_for_language(language)
@@ -341,17 +383,78 @@ async def _piper_synthesize(
341383
audio_url=audio_url,
342384
success=True,
343385
)
344-
except httpx.TimeoutException:
345-
logger.error(f"Piper TTS timeout after {self.timeout}s")
386+
except httpx.HTTPError as e:
387+
logger.error(f"Piper TTS HTTP error: {e}")
346388
return TTSResult(
347389
success=False,
348-
error=f"Piper TTS timeout after {self.timeout}s",
390+
error=f"Piper TTS HTTP error: {e}",
349391
)
392+
393+
def _get_supertonic_voice_for_language(self, language: str) -> str:
    """Return the Supertonic voice style for a language.

    Lookup order: ``voices[<exact tag>]``, ``voices[<base language>]``, the
    provider-level ``voice`` setting, then the built-in ``"M1"`` fallback.

    Args:
        language: Language tag in hyphen or underscore form.

    Returns:
        The first non-empty voice name found in the lookup order above.
    """
    provider_config = self._get_provider_config("supertonic")
    # ``voices:`` left empty in YAML loads as None; entries may also be empty
    # strings. Skip any falsy value so an empty voice name is never returned.
    voices = provider_config.get("voices") or {}
    normalized_language, lang_base = self._normalize_language(language)
    return (
        voices.get(normalized_language)
        or voices.get(lang_base)
        or provider_config.get("voice")
        or "M1"
    )
402+
403+
async def _supertonic_synthesize(
    self, text: str, language: str, output_path: Optional[Path] = None
) -> TTSResult:
    """Synthesize speech using a Supertonic TTS HTTP server.

    POSTs a JSON payload to ``<supertonic base url>/v1/tts`` and streams the
    response body to disk.

    Args:
        text: Text to synthesize.
        language: Language tag; its base code is sent as ``lang``.
        output_path: Destination file. When None, a unique
            ``tts_<language>_<id>.wav`` file is created under
            ``self.media_dir``.

    Returns:
        A successful ``TTSResult`` with the audio path, its
        ``/media/output/`` URL and the duration reported by the server's
        ``X-Audio-Duration`` header (None when absent or unparsable), or a
        failed ``TTSResult`` when an ``httpx.HTTPError`` occurs.
    """
    provider_config = self._get_provider_config("supertonic")
    normalized_language, lang_base = self._normalize_language(language)
    voice = self._get_supertonic_voice_for_language(language)
    url = self._get_provider_base_url("supertonic") + "/v1/tts"

    payload = {
        "text": text,
        "voice": voice,
        "lang": lang_base,
        "steps": self._config_int(provider_config.get("steps"), 8),
        "speed": self._config_float(provider_config.get("speed"), 1.05),
        "response_format": provider_config.get("response_format", "wav"),
    }

    try:
        async with self._http_client.stream("POST", url, json=payload) as response:
            response.raise_for_status()

            if output_path is None:
                unique_id = str(uuid.uuid4())[:8]
                # The language tag ends up in a filename; keep only
                # alphanumerics and underscores so it can never contain a
                # path separator or "..".
                safe_language = "".join(
                    ch for ch in normalized_language if ch.isalnum() or ch == "_"
                ) or "unknown"
                # NOTE(review): the suffix stays ".wav" even if
                # ``response_format`` is configured to something else.
                output_path = self.media_dir / f"tts_{safe_language}_{unique_id}.wav"

            output_path.parent.mkdir(parents=True, exist_ok=True)

            try:
                with open(output_path, "wb") as f:
                    async for chunk in response.aiter_bytes():
                        if chunk:
                            f.write(chunk)
            except BaseException:
                # Stream aborted (network error, cancellation, ...): remove
                # the truncated file, then let the original error propagate.
                output_path.unlink(missing_ok=True)
                raise

            audio_url = f"/media/output/{output_path.name}"
            duration_header = response.headers.get("X-Audio-Duration")
            try:
                duration = float(duration_header) if duration_header else None
            except ValueError:
                # The audio is already on disk; a malformed duration header
                # should not turn a successful synthesis into a failure.
                duration = None

            logger.info(
                f"Supertonic TTS generated: {output_path} "
                f"(lang: {lang_base}, voice: {voice})"
            )

            return TTSResult(
                audio_path=output_path,
                audio_url=audio_url,
                duration=duration,
                success=True,
            )
    except httpx.HTTPError as e:
        logger.error(f"Supertonic TTS HTTP error: {e}")
        return TTSResult(
            success=False,
            error=f"Supertonic TTS HTTP error: {e}",
        )
356459

357460
async def _openai_synthesize(
@@ -386,12 +489,6 @@ async def _openai_synthesize(
386489
response.raise_for_status()
387490

388491
audio_content = response.content
389-
except httpx.TimeoutException:
390-
logger.error(f"OpenAI TTS timeout after {self.timeout}s")
391-
return TTSResult(
392-
success=False,
393-
error=f"OpenAI TTS timeout after {self.timeout}s",
394-
)
395492
except httpx.HTTPError as e:
396493
logger.error(f"OpenAI TTS HTTP error: {e}")
397494
return TTSResult(

app/templates/index.html

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,9 @@
4646
<option value="es_MX">Spanish (Mexico)</option>
4747
<option value="tr">Turkish</option>
4848
<option value="bn" disabled>Bengali</option>
49-
<option value="ja" disabled>Japanese</option>
49+
<option value="ja">Japanese</option>
5050
<option value="kn" disabled>Kannada</option>
51-
<option value="ko" disabled>Korean</option>
51+
<option value="ko">Korean</option>
5252
<option value="mr" disabled>Marathi</option>
5353
<option value="pt" disabled>Portuguese</option>
5454
<option value="ta" disabled>Tamil</option>
@@ -82,9 +82,9 @@
8282
<option value="es_MX">Spanish (Mexico)</option>
8383
<option value="tr">Turkish</option>
8484
<option value="bn" disabled>Bengali</option>
85-
<option value="ja" disabled>Japanese</option>
85+
<option value="ja">Japanese</option>
8686
<option value="kn" disabled>Kannada</option>
87-
<option value="ko" disabled>Korean</option>
87+
<option value="ko">Korean</option>
8888
<option value="mr" disabled>Marathi</option>
8989
<option value="pt" disabled>Portuguese</option>
9090
<option value="ta" disabled>Tamil</option>

config/config.yaml.example

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,23 @@ tts:
2828
mock_mode: true
2929
provider: "piper"
3030
base_url: "${TTS_BASE_URL}"
31+
providers:
32+
piper:
33+
base_url: "${TTS_BASE_URL}"
34+
supertonic:
35+
base_url: "${SUPERTONIC_TTS_BASE_URL}"
36+
voice: "${SUPERTONIC_TTS_VOICE}"
37+
steps: "${SUPERTONIC_TTS_STEPS}"
38+
speed: "${SUPERTONIC_TTS_SPEED}"
39+
response_format: "wav"
40+
voices:
41+
ja: "${SUPERTONIC_TTS_JA_VOICE}"
42+
ko: "${SUPERTONIC_TTS_KO_VOICE}"
43+
language_providers:
44+
ja: "supertonic"
45+
ja_JP: "supertonic"
46+
ko: "supertonic"
47+
ko_KR: "supertonic"
3148
voice: "en_GB-jenny_dioco-medium"
3249
timeout_seconds: 10
3350
length_scales:

docker-compose.yml

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,22 @@ services:
7272
networks:
7373
- polytalk-network
7474

75+
supertonic-tts:
76+
build:
77+
context: ./supertonic_tts
78+
dockerfile: Dockerfile
79+
image: polytalk-supertonic-tts:latest
80+
container_name: polytalk-supertonic-tts
81+
restart: unless-stopped
82+
ports:
83+
- "127.0.0.1:7788:7788"
84+
environment:
85+
- XDG_CACHE_HOME=/data/cache
86+
volumes:
87+
- supertonic_data:/data
88+
networks:
89+
- polytalk-network
90+
7591
polytalk:
7692
build:
7793
context: .
@@ -89,8 +105,14 @@ services:
89105
- TRANSLATION_API_KEY=${TRANSLATION_API_KEY:-}
90106
- TRANSLATION_MODEL=${TRANSLATION_MODEL:-gpt-4o-mini}
91107
- TRANSLATION_MAX_TOKENS=${TRANSLATION_MAX_TOKENS:-160}
92-
# TTS service (local Piper)
108+
# TTS services
93109
- TTS_BASE_URL=${TTS_BASE_URL:-http://tts:5000}
110+
- SUPERTONIC_TTS_BASE_URL=${SUPERTONIC_TTS_BASE_URL:-http://supertonic-tts:7788}
111+
- SUPERTONIC_TTS_VOICE=${SUPERTONIC_TTS_VOICE:-M1}
112+
- SUPERTONIC_TTS_JA_VOICE=${SUPERTONIC_TTS_JA_VOICE:-M1}
113+
- SUPERTONIC_TTS_KO_VOICE=${SUPERTONIC_TTS_KO_VOICE:-M1}
114+
- SUPERTONIC_TTS_STEPS=${SUPERTONIC_TTS_STEPS:-8}
115+
- SUPERTONIC_TTS_SPEED=${SUPERTONIC_TTS_SPEED:-1.05}
94116
# Application
95117
- APP_HOST=0.0.0.0
96118
- APP_PORT=8000
@@ -114,11 +136,13 @@ services:
114136
depends_on:
115137
- stt
116138
- tts
139+
- supertonic-tts
117140
networks:
118141
- polytalk-network
119142

120143
volumes:
121144
stt_data:
145+
supertonic_data:
122146

123147
networks:
124148
polytalk-network:

supertonic_tts/Dockerfile

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
FROM python:3.12-slim
2+
3+
ENV PIP_BREAK_SYSTEM_PACKAGES=1
4+
ENV XDG_CACHE_HOME=/data/cache
5+
ENV HOME=/data/home
6+
7+
WORKDIR /app
8+
9+
RUN apt-get update && apt-get install -y --no-install-recommends \
10+
build-essential \
11+
gosu \
12+
libsndfile1 \
13+
&& rm -rf /var/lib/apt/lists/*
14+
15+
RUN pip install --no-cache-dir 'supertonic[serve]==1.3.1'
16+
17+
RUN addgroup --system supertonic \
18+
&& adduser --system --ingroup supertonic --home /data/home supertonic \
19+
&& mkdir -p /data/cache /data/home /app \
20+
&& chown -R supertonic:supertonic /data /app
21+
22+
COPY entrypoint.sh /app/entrypoint.sh
23+
RUN chmod +x /app/entrypoint.sh
24+
25+
EXPOSE 7788
26+
27+
ENTRYPOINT ["/app/entrypoint.sh"]
28+
CMD ["supertonic", "serve", "--host", "0.0.0.0", "--port", "7788"]

supertonic_tts/entrypoint.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
#!/bin/sh
2+
set -eu
3+
4+
mkdir -p /data/cache /data/home
5+
chown -R supertonic:supertonic /data
6+
7+
exec gosu supertonic "$@"

0 commit comments

Comments
 (0)