Skip to content

Commit 0b22349

Browse files
authored
feat: add ElevenLabs TTS API provider
1 parent 56d2b3f commit 0b22349

6 files changed

Lines changed: 268 additions & 0 deletions

File tree

astrbot/core/config/default.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1807,6 +1807,25 @@
18071807
"gemini_tts_voice_name": "Leda",
18081808
"proxy": "",
18091809
},
1810+
"ElevenLabs TTS(API)": {
1811+
"hint": "API Key 从 https://elevenlabs.io/app/settings/api-keys 获取。Voice ID 可在 https://elevenlabs.io/app/voice-library 浏览选择。",
1812+
"id": "elevenlabs_tts",
1813+
"type": "elevenlabs_tts_api",
1814+
"provider": "elevenlabs",
1815+
"provider_type": "text_to_speech",
1816+
"enable": False,
1817+
"api_key": "",
1818+
"api_base": "https://api.elevenlabs.io/v1",
1819+
"model": "eleven_multilingual_v2",
1820+
"elevenlabs-tts-voice-id": "JBFqnCBsd6RMkjVDRZzb",
1821+
"elevenlabs-tts-output-format": "mp3_44100_128",
1822+
"elevenlabs-tts-stability": "",
1823+
"elevenlabs-tts-similarity-boost": "",
1824+
"elevenlabs-tts-style": "",
1825+
"elevenlabs-tts-use-speaker-boost": True,
1826+
"timeout": "20",
1827+
"proxy": "",
1828+
},
18101829
"OpenAI Embedding": {
18111830
"id": "openai_embedding",
18121831
"type": "openai_embedding",

astrbot/core/provider/manager.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -467,6 +467,10 @@ def dynamic_import_provider(self, type: str) -> None:
467467
from .sources.gemini_tts_source import (
468468
ProviderGeminiTTSAPI as ProviderGeminiTTSAPI,
469469
)
470+
case "elevenlabs_tts_api":
471+
from .sources.elevenlabs_tts_source import (
472+
ProviderElevenLabsTTSAPI as ProviderElevenLabsTTSAPI,
473+
)
470474
case "openai_embedding":
471475
from .sources.openai_embedding_source import (
472476
OpenAIEmbeddingProvider as OpenAIEmbeddingProvider,
Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
import uuid
2+
from pathlib import Path
3+
4+
import httpx
5+
6+
from astrbot import logger
7+
from astrbot.core.utils.astrbot_path import get_astrbot_temp_path
8+
9+
from ..entities import ProviderType
10+
from ..provider import TTSProvider
11+
from ..register import register_provider_adapter
12+
13+
# Output-format prefixes this provider accepts; get_audio() writes the response
# body to a file whose extension is derived from one of these prefixes.
SUPPORTED_CONTAINER_OUTPUT_PREFIXES = ("mp3", "wav", "opus")
# Output-format prefixes that are rejected up front with a dedicated
# "raw audio" error message.
RAW_AUDIO_OUTPUT_PREFIXES = ("pcm", "ulaw", "alaw")
15+
16+
17+
def _parse_optional_float(
18+
provider_config: dict,
19+
cfg_name: str,
20+
) -> float | None:
21+
value = provider_config.get(cfg_name, "")
22+
if value in ("", None):
23+
return None
24+
try:
25+
parsed = float(value)
26+
except (TypeError, ValueError) as exc:
27+
raise ValueError(f"{cfg_name} must be a number between 0 and 1.") from exc
28+
if not 0 <= parsed <= 1:
29+
raise ValueError(f"{cfg_name} must be between 0 and 1.")
30+
return parsed
31+
32+
33+
def _parse_bool(provider_config: dict, cfg_name: str) -> bool:
34+
value = provider_config[cfg_name]
35+
if isinstance(value, bool):
36+
return value
37+
if isinstance(value, int):
38+
return bool(value)
39+
if isinstance(value, str):
40+
normalized = value.strip().lower()
41+
if normalized in {"true", "1", "yes", "y", "on"}:
42+
return True
43+
if normalized in {"false", "0", "no", "n", "off"}:
44+
return False
45+
raise ValueError(f"{cfg_name} must be a boolean value.")
46+
47+
48+
def _normalize_timeout(value: int | str | None) -> int:
49+
if value in ("", None):
50+
return 20
51+
try:
52+
timeout = int(value)
53+
except (TypeError, ValueError) as exc:
54+
raise ValueError("timeout must be a positive integer.") from exc
55+
if timeout <= 0:
56+
raise ValueError("timeout must be a positive integer.")
57+
return timeout
58+
59+
60+
def _validate_output_format(output_format: str) -> None:
    """Reject output formats this provider cannot save as an audio file.

    The check is a case-insensitive prefix match against
    ``RAW_AUDIO_OUTPUT_PREFIXES`` and ``SUPPORTED_CONTAINER_OUTPUT_PREFIXES``.

    Args:
        output_format: Configured ElevenLabs ``output_format`` value.

    Raises:
        ValueError: If the value is not a string, names a raw audio format,
            or does not start with a supported prefix.
    """
    # A non-string config value (e.g. null) is a configuration error, so it
    # is reported as ValueError like every other invalid setting.
    if not isinstance(output_format, str):
        raise ValueError(
            "Unsupported ElevenLabs output format. "
            "Use an mp3, wav, or opus output format."
        )
    fmt = output_format.lower()
    if fmt.startswith(RAW_AUDIO_OUTPUT_PREFIXES):
        raise ValueError(
            "ElevenLabs raw audio output formats are not supported by this provider. "
            "Use an mp3, wav, or opus output format instead."
        )
    if not fmt.startswith(SUPPORTED_CONTAINER_OUTPUT_PREFIXES):
        raise ValueError(
            "Unsupported ElevenLabs output format. "
            "Use an mp3, wav, or opus output format."
        )
72+
73+
74+
@register_provider_adapter(
    "elevenlabs_tts_api",
    "ElevenLabs TTS API",
    provider_type=ProviderType.TEXT_TO_SPEECH,
)
class ProviderElevenLabsTTSAPI(TTSProvider):
    """Text-to-speech provider that calls the ElevenLabs HTTP API.

    Each synthesis request is a POST to ``{api_base}/text-to-speech/{voice_id}``;
    the response body is written to a file in the AstrBot temp directory.
    """

    def __init__(
        self,
        provider_config: dict,
        provider_settings: dict,
    ) -> None:
        """Read the provider configuration and create the HTTP client.

        Args:
            provider_config: Provider configuration mapping (API key, base
                URL, model, voice and output settings, timeout, proxy).
            provider_settings: Passed through to ``TTSProvider.__init__``.

        Raises:
            ValueError: If the output format, a voice setting, the speaker
                boost flag, or the timeout is invalid.
        """
        super().__init__(provider_config, provider_settings)
        self.api_key = provider_config.get("api_key", "")
        # Drop one trailing slash so URL building does not produce "//".
        self.api_base = provider_config.get(
            "api_base", "https://api.elevenlabs.io/v1"
        ).removesuffix("/")
        self.voice_id = provider_config.get(
            "elevenlabs-tts-voice-id", "JBFqnCBsd6RMkjVDRZzb"
        )
        self.model_id = provider_config.get("model", "eleven_multilingual_v2")
        # NOTE(review): get_audio() reads self.model_name, presumably set by
        # set_model() in the base class -- confirm in TTSProvider.
        self.set_model(self.model_id)
        self.output_format = provider_config.get(
            "elevenlabs-tts-output-format", "mp3_44100_128"
        )
        _validate_output_format(self.output_format)

        # Only send explicitly configured voice settings so the API can apply defaults.
        self.voice_settings: dict = {}
        for key, cfg_name in (
            ("stability", "elevenlabs-tts-stability"),
            ("similarity_boost", "elevenlabs-tts-similarity-boost"),
            ("style", "elevenlabs-tts-style"),
        ):
            value = _parse_optional_float(provider_config, cfg_name)
            if value is not None:
                self.voice_settings[key] = value
        if "elevenlabs-tts-use-speaker-boost" in provider_config:
            self.voice_settings["use_speaker_boost"] = _parse_bool(
                provider_config,
                "elevenlabs-tts-use-speaker-boost",
            )

        timeout = _normalize_timeout(provider_config.get("timeout", 20))

        proxy = provider_config.get("proxy", "")
        if proxy:
            logger.info(f"[ElevenLabs TTS] 使用代理: {proxy}")
        # trust_env=False: only the explicitly configured proxy is used;
        # an empty proxy string is passed as None.
        self.client = httpx.AsyncClient(
            timeout=timeout,
            proxy=proxy or None,
            trust_env=False,
        )

    def _output_extension(self) -> str:
        """Infer the audio file extension from the configured output format."""
        fmt = self.output_format.lower()
        if fmt.startswith("mp3"):
            return "mp3"
        if fmt.startswith("opus"):
            return "opus"
        if fmt.startswith("wav"):
            return "wav"
        # Fallback for any format that matches none of the prefixes above.
        return "mp3"

    async def get_audio(self, text: str) -> str:
        """Synthesize ``text`` and save the audio to a temporary file.

        Args:
            text: Text to convert to speech.

        Returns:
            Path of the written audio file, as a string.

        Raises:
            Exception: If the API responds with a status code other than 200.
        """
        url = f"{self.api_base}/text-to-speech/{self.voice_id}"
        headers = {
            "xi-api-key": self.api_key,
            "Content-Type": "application/json",
        }
        payload: dict = {
            "text": text,
            "model_id": self.model_name,
        }
        # Omit voice_settings entirely when nothing was configured.
        if self.voice_settings:
            payload["voice_settings"] = self.voice_settings

        response = await self.client.post(
            url,
            headers=headers,
            params={"output_format": self.output_format},
            json=payload,
        )
        if response.status_code != 200:
            # Cap the echoed response body at 1024 characters.
            error_text = response.text[:1024]
            raise Exception(
                f"ElevenLabs TTS API 请求失败: {response.status_code}, {error_text}"
            )

        temp_dir = Path(get_astrbot_temp_path())
        temp_dir.mkdir(parents=True, exist_ok=True)
        # A fresh UUID per request gives every response its own file name.
        path = (
            temp_dir / f"elevenlabs_tts_api_{uuid.uuid4()}.{self._output_extension()}"
        )
        path.write_bytes(response.content)
        return str(path)

    async def terminate(self):
        """Close the underlying HTTP client."""
        if self.client:
            await self.client.aclose()

dashboard/src/i18n/locales/en-US/features/config-metadata.json

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1590,6 +1590,30 @@
15901590
"description": "voice",
15911591
"hint": "OpenAI TTS voice. OpenAI defaults: 'alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'."
15921592
},
1593+
"elevenlabs-tts-voice-id": {
1594+
"description": "Voice ID",
1595+
"hint": "ElevenLabs voice ID. Browse and copy voice IDs at https://elevenlabs.io/app/voice-library. Default 'JBFqnCBsd6RMkjVDRZzb' (George)."
1596+
},
1597+
"elevenlabs-tts-output-format": {
1598+
"description": "Output format",
1599+
"hint": "Audio output format, e.g. 'mp3_44100_128', 'mp3_22050_32', 'wav_44100', or 'opus_48000_128'. Raw PCM/u-law/a-law formats are not supported. Default 'mp3_44100_128'."
1600+
},
1601+
"elevenlabs-tts-stability": {
1602+
"description": "Stability",
1603+
"hint": "Voice stability, range [0, 1]. Higher is more consistent, lower is more expressive. Leave empty to use the server default."
1604+
},
1605+
"elevenlabs-tts-similarity-boost": {
1606+
"description": "Similarity boost",
1607+
"hint": "How closely the output matches the original voice, range [0, 1]. Leave empty to use the server default."
1608+
},
1609+
"elevenlabs-tts-style": {
1610+
"description": "Style exaggeration",
1611+
"hint": "Style exaggeration of the voice, range [0, 1]. Higher values increase latency. Leave empty to use the server default."
1612+
},
1613+
"elevenlabs-tts-use-speaker-boost": {
1614+
"description": "Speaker boost",
1615+
"hint": "Boost similarity to the original speaker. May slightly increase latency."
1616+
},
15931617
"mimo-tts-voice": {
15941618
"description": "Voice",
15951619
"hint": "MiMo TTS voice name. Supported values include 'mimo_default', 'default_en', and 'default_zh'."

dashboard/src/i18n/locales/ru-RU/features/config-metadata.json

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1587,6 +1587,30 @@
15871587
"description": "API Base URL",
15881588
"hint": "Голоса OpenAI TTS: alloy, echo и др."
15891589
},
1590+
"elevenlabs-tts-voice-id": {
1591+
"description": "ID голоса",
1592+
"hint": "ID голоса ElevenLabs. Просмотрите и скопируйте ID на https://elevenlabs.io/app/voice-library. По умолчанию 'JBFqnCBsd6RMkjVDRZzb' (George)."
1593+
},
1594+
"elevenlabs-tts-output-format": {
1595+
"description": "Формат вывода",
1596+
"hint": "Формат аудио, например 'mp3_44100_128', 'mp3_22050_32', 'wav_44100' или 'opus_48000_128'. Необработанные (raw) форматы PCM/u-law/a-law не поддерживаются. По умолчанию 'mp3_44100_128'."
1597+
},
1598+
"elevenlabs-tts-stability": {
1599+
"description": "Стабильность",
1600+
"hint": "Стабильность голоса, диапазон [0, 1]. Чем выше значение, тем стабильнее голос; чем ниже — тем он выразительнее. Оставьте пустым для значения по умолчанию."
1601+
},
1602+
"elevenlabs-tts-similarity-boost": {
1603+
"description": "Усиление сходства",
1604+
"hint": "Насколько вывод соответствует исходному голосу, диапазон [0, 1]. Оставьте пустым для значения по умолчанию."
1605+
},
1606+
"elevenlabs-tts-style": {
1607+
"description": "Выразительность стиля",
1608+
"hint": "Выразительность стиля голоса, диапазон [0, 1]. Высокие значения увеличивают задержку. Оставьте пустым для значения по умолчанию."
1609+
},
1610+
"elevenlabs-tts-use-speaker-boost": {
1611+
"description": "Усиление диктора",
1612+
"hint": "Усиливает сходство с исходным диктором. Может немного увеличить задержку."
1613+
},
15901614
"mimo-tts-voice": {
15911615
"description": "Голос",
15921616
"hint": "Имя голоса MiMo TTS. Поддерживаются значения 'mimo_default', 'default_en' и 'default_zh'."

dashboard/src/i18n/locales/zh-CN/features/config-metadata.json

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1592,6 +1592,30 @@
15921592
"description": "voice",
15931593
"hint": "OpenAI TTS 的声音。OpenAI 默认支持:'alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'"
15941594
},
1595+
"elevenlabs-tts-voice-id": {
1596+
"description": "音色 ID",
1597+
"hint": "ElevenLabs 音色 ID。可在 https://elevenlabs.io/app/voice-library 浏览并复制音色 ID。默认 'JBFqnCBsd6RMkjVDRZzb'(George)。"
1598+
},
1599+
"elevenlabs-tts-output-format": {
1600+
"description": "输出格式",
1601+
"hint": "音频输出格式,例如 'mp3_44100_128'、'mp3_22050_32'、'wav_44100'、'opus_48000_128'。不支持裸 PCM/u-law/a-law 格式。默认 'mp3_44100_128'。"
1602+
},
1603+
"elevenlabs-tts-stability": {
1604+
"description": "稳定性",
1605+
"hint": "音色稳定性,范围 [0, 1]。值越高越稳定,越低越富有表现力。留空则使用服务端默认值。"
1606+
},
1607+
"elevenlabs-tts-similarity-boost": {
1608+
"description": "相似度增强",
1609+
"hint": "输出与原始音色的接近程度,范围 [0, 1]。留空则使用服务端默认值。"
1610+
},
1611+
"elevenlabs-tts-style": {
1612+
"description": "风格夸张度",
1613+
"hint": "音色风格的夸张程度,范围 [0, 1]。值越高延迟越大。留空则使用服务端默认值。"
1614+
},
1615+
"elevenlabs-tts-use-speaker-boost": {
1616+
"description": "说话人增强",
1617+
"hint": "增强与原始说话人的相似度,可能略微增加延迟。"
1618+
},
15951619
"mimo-tts-voice": {
15961620
"description": "音色",
15971621
"hint": "MiMo TTS 的音色名称。可选值包括 'mimo_default'、'default_en'、'default_zh'。"

0 commit comments

Comments
 (0)