Skip to content

Commit 2cd7b5d

Browse files
committed
Fix for proper ElevenLabs support
1 parent ecf86c0 commit 2cd7b5d

7 files changed

Lines changed: 114 additions & 16 deletions

File tree

README.md

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@ New accounts get $300 in free credits. TTS pricing is ~$4 per 1M characters for
4343

4444
1. Sign up at [elevenlabs.io](https://elevenlabs.io/)
4545
2. Go to [Profile + API Key](https://elevenlabs.io/app/settings/api-keys)
46-
3. Copy your API key
46+
3. Create an API key with these permissions: **Text to Speech > Access**, **Voices > Read**, **Models > Access**
47+
4. Copy your API key
4748

4849
The free tier includes a limited number of characters per month. Paid plans start at $5/mo.
4950

@@ -98,8 +99,6 @@ prompt-tools transcribe --file recording.wav
9899

99100
400+ voices across multiple model families. Default provider.
100101

101-
| Model | Quality | Example Voice | Notes |
102-
|-------|---------|---------------|-------|
103102
| Model | Quality | Example Voice | API Used | Notes |
104103
|-------|---------|---------------|----------|-------|
105104
| Gemini | Highest | `Achernar`, `Kore`, `Puck` | Generative Language | Bare names, auto-selects best model |
@@ -117,10 +116,19 @@ prompt-tools speak "Hello" --voice Kore --model gemini-2.5-flash-preview-tts -o
117116

118117
### ElevenLabs
119118

120-
Premium natural voices. Output is converted to IVR-compatible formats (mu-law/A-law WAV) automatically.
119+
Premium natural voices. Output is converted to IVR-compatible formats (mu-law/A-law WAV) automatically. Voices can be specified by name (e.g., `Sarah`, `Roger`) or voice ID.
120+
121+
| Model | Quality | Notes |
122+
|-------|---------|-------|
123+
| `eleven_v3` | Highest | Latest model (default) |
124+
| `eleven_multilingual_v2` | High | Multilingual |
125+
| `eleven_flash_v2_5` | Good | Fast, low latency |
126+
| `eleven_turbo_v2_5` | Good | Low latency, multilingual |
121127

122128
```bash
123-
prompt-tools speak "Hello" --provider elevenlabs --voice <voice-id> -o hello.wav
129+
prompt-tools speak "Hello" --provider elevenlabs --voice Sarah -o hello.wav
130+
prompt-tools speak "Hello" --provider elevenlabs --voice Sarah --model eleven_multilingual_v2 -o hello.wav
131+
prompt-tools voices --provider elevenlabs --output table
124132
```
125133

126134
## STT Providers

cmd/config_cmd.go

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -193,9 +193,15 @@ var configSetEncodingCmd = &cobra.Command{
193193
}
194194

195195
var configSetAPIKeyCmd = &cobra.Command{
196-
Use: "set-api-key <provider>",
196+
Use: "set-api-key <google|elevenlabs|assemblyai>",
197197
Short: "Store API key in OS keyring (interactive)",
198198
Args: cobra.ExactArgs(1),
199+
ValidArgsFunction: func(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) {
200+
if len(args) != 0 {
201+
return nil, cobra.ShellCompDirectiveNoFileComp
202+
}
203+
return []string{"google", "elevenlabs", "assemblyai"}, cobra.ShellCompDirectiveNoFileComp
204+
},
199205
RunE: func(cmd *cobra.Command, args []string) error {
200206
provider := strings.ToLower(args[0])
201207
valid := map[string]bool{"google": true, "elevenlabs": true, "assemblyai": true}
@@ -225,9 +231,15 @@ var configSetAPIKeyCmd = &cobra.Command{
225231
}
226232

227233
var configClearAPIKeyCmd = &cobra.Command{
228-
Use: "clear-api-key <provider>",
234+
Use: "clear-api-key <google|elevenlabs|assemblyai>",
229235
Short: "Remove API key from keyring",
230236
Args: cobra.ExactArgs(1),
237+
ValidArgsFunction: func(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) {
238+
if len(args) != 0 {
239+
return nil, cobra.ShellCompDirectiveNoFileComp
240+
}
241+
return []string{"google", "elevenlabs", "assemblyai"}, cobra.ShellCompDirectiveNoFileComp
242+
},
231243
RunE: func(cmd *cobra.Command, args []string) error {
232244
provider := strings.ToLower(args[0])
233245
valid := map[string]bool{"google": true, "elevenlabs": true, "assemblyai": true}

cmd/speak.go

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,21 @@ Gemini voices automatically use the Generative Language API. Override model with
2929
gemini-2.5-pro-preview-tts Highest quality (default for Gemini voices)
3030
gemini-2.5-flash-preview-tts Fast, good quality
3131
32+
ElevenLabs voices can be specified by name (e.g., Sarah, Roger) or voice ID.
33+
ElevenLabs models (override with --model):
34+
eleven_v3 Latest, highest quality (default)
35+
eleven_multilingual_v2 High quality, multilingual
36+
eleven_flash_v2_5 Fast, low latency
37+
eleven_turbo_v2_5 Low latency, multilingual
38+
3239
Examples:
3340
prompt-tools speak "Hello world" -o hello.wav
3441
prompt-tools speak "Hello world" --voice Achernar -o hello.wav
3542
prompt-tools speak --ssml "<speak>Hello<break time='500ms'/>world</speak>" -o hello.wav
3643
prompt-tools speak --file script.txt --voice en-US-Studio-O -o prompt.wav
37-
prompt-tools speak "Hello" --voice Kore --model gemini-2.5-flash-preview-tts -o hello.wav`,
44+
prompt-tools speak "Hello" --voice Kore --model gemini-2.5-flash-preview-tts -o hello.wav
45+
prompt-tools speak --provider elevenlabs --voice Sarah -o hello.wav
46+
prompt-tools speak --provider elevenlabs --voice Sarah --model eleven_v3 -o hello.wav`,
3847
Args: cobra.MaximumNArgs(1),
3948
RunE: func(cmd *cobra.Command, args []string) error {
4049
text, _ := cmd.Flags().GetString("text")
@@ -200,7 +209,7 @@ func init() {
200209
speakCmd.Flags().Float64("speaking-rate", 0, "Speaking rate multiplier")
201210
speakCmd.Flags().Float64("pitch", 0, "Pitch in semitones")
202211
speakCmd.Flags().Float64("volume-gain-db", 0, "Volume gain in dB")
203-
speakCmd.Flags().String("model", "", "Gemini model (gemini-2.5-pro-preview-tts, gemini-2.5-flash-preview-tts)")
212+
speakCmd.Flags().String("model", "", "TTS model (Gemini: gemini-2.5-pro-preview-tts; ElevenLabs: eleven_v3, eleven_multilingual_v2, eleven_flash_v2_5)")
204213

205214
rootCmd.AddCommand(speakCmd)
206215
}

cmd/voices.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,14 @@ Google voice models (highest to lowest quality):
3333
Gemini voices use the same star/moon names as Chirp3-HD but are a distinct model
3434
requiring a model_name parameter (handled automatically by the speak command).
3535
36+
ElevenLabs voices show friendly names (e.g., Sarah, Roger) and voice IDs.
37+
Either can be used with --voice in the speak command.
38+
3639
Examples:
3740
prompt-tools voices --language en-US --output table
3841
prompt-tools voices --language en-US --gender FEMALE
39-
prompt-tools voices --provider elevenlabs`,
42+
prompt-tools voices --provider elevenlabs
43+
prompt-tools voices --provider elevenlabs --gender FEMALE --output table`,
4044
RunE: func(cmd *cobra.Command, args []string) error {
4145
providerName, _ := cmd.Flags().GetString("provider")
4246
language, _ := cmd.Flags().GetString("language")

internal/provider/provider.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package provider
33
// Voice represents a TTS voice.
44
type Voice struct {
55
Name string `json:"name"`
6+
VoiceID string `json:"voice_id,omitempty"`
67
Model string `json:"model"`
78
LanguageCodes []string `json:"language_codes"`
89
Gender string `json:"gender"`

internal/tts/elevenlabs.go

Lines changed: 50 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,11 @@ func init() {
3131
func (e *ElevenLabsTTS) Name() string { return "elevenlabs" }
3232

3333
func (e *ElevenLabsTTS) Synthesize(req *provider.TTSRequest) (*provider.TTSResult, error) {
34-
// ElevenLabs voice ID — the req.Voice field contains the voice ID
35-
voiceID := req.Voice
34+
// Resolve voice: accept either a voice ID or a friendly name
35+
voiceID, err := e.resolveVoiceID(req.Voice)
36+
if err != nil {
37+
return nil, err
38+
}
3639

3740
text := req.Text
3841
if req.SSML != "" {
@@ -46,9 +49,15 @@ func (e *ElevenLabsTTS) Synthesize(req *provider.TTSRequest) (*provider.TTSResul
4649
outputFormat = "mp3_44100_128"
4750
}
4851

52+
// Default to eleven_v3; allow override via --model
53+
modelID := "eleven_v3"
54+
if req.Model != "" {
55+
modelID = req.Model
56+
}
57+
4958
body := map[string]any{
5059
"text": text,
51-
"model_id": "eleven_multilingual_v2",
60+
"model_id": modelID,
5261
"voice_settings": map[string]any{
5362
"stability": 0.5,
5463
"similarity_boost": 0.75,
@@ -128,6 +137,42 @@ func (e *ElevenLabsTTS) Synthesize(req *provider.TTSRequest) (*provider.TTSResul
128137
}, nil
129138
}
130139

140+
// resolveVoiceID resolves a voice name or ID to an ElevenLabs voice ID.
141+
// If the input matches a voice ID directly, it's returned as-is.
142+
// Otherwise, it searches by name (case-insensitive).
143+
func (e *ElevenLabsTTS) resolveVoiceID(voice string) (string, error) {
144+
// ElevenLabs voice IDs are 20-char alphanumeric strings.
145+
// If it looks like one, use it directly.
146+
if len(voice) == 20 && isAlphanumeric(voice) {
147+
return voice, nil
148+
}
149+
150+
// Look up by friendly name (match full name or short name before " - ")
151+
voices, err := e.ListVoices("")
152+
if err != nil {
153+
return "", fmt.Errorf("resolving voice name %q: %w", voice, err)
154+
}
155+
for _, v := range voices {
156+
if strings.EqualFold(v.Name, voice) {
157+
return v.VoiceID, nil
158+
}
159+
// Match short name (e.g., "Sarah" matches "Sarah - Mature, Reassuring")
160+
if short, _, ok := strings.Cut(v.Name, " - "); ok && strings.EqualFold(short, voice) {
161+
return v.VoiceID, nil
162+
}
163+
}
164+
return "", fmt.Errorf("ElevenLabs voice %q not found — use 'prompt-tools voices --provider elevenlabs' to list available voices", voice)
165+
}
166+
167+
// isAlphanumeric reports whether s consists solely of ASCII letters and
// digits. An empty string contains no offending characters and so yields true.
func isAlphanumeric(s string) bool {
	for i := 0; i < len(s); i++ {
		switch c := s[i]; {
		case 'a' <= c && c <= 'z':
		case 'A' <= c && c <= 'Z':
		case '0' <= c && c <= '9':
		default:
			// Any other byte, including every byte of a multi-byte UTF-8
			// sequence, disqualifies the string.
			return false
		}
	}
	return true
}
175+
131176
func (e *ElevenLabsTTS) ListVoices(languageCode string) ([]provider.Voice, error) {
132177
url := fmt.Sprintf("%s/voices", elevenlabsEndpoint)
133178

@@ -175,7 +220,8 @@ func (e *ElevenLabsTTS) ListVoices(languageCode string) ([]provider.Voice, error
175220
}
176221

177222
voice := provider.Voice{
178-
Name: v.VoiceID,
223+
Name: v.Name,
224+
VoiceID: v.VoiceID,
179225
Model: "ElevenLabs",
180226
LanguageCodes: []string{"multilingual"},
181227
Gender: gender,

skill/SKILL.md

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,21 @@ Gemini voices auto-select the best available model. Override with `--model`:
7575
2. [Generative Language API](https://console.cloud.google.com/apis/library/generativelanguage.googleapis.com) (for Gemini voices)
7676
3. [Cloud Speech-to-Text API](https://console.cloud.google.com/apis/library/speech.googleapis.com) (for transcription)
7777

78+
## Voice Types (ElevenLabs)
79+
80+
ElevenLabs voices can be specified by friendly name (e.g., `Sarah`, `Roger`) or voice ID. Names are resolved automatically via the API.
81+
82+
| Model | Quality | Notes |
83+
|---|---|---|
84+
| **eleven_v3** | Highest | Latest model (default) |
85+
| **eleven_multilingual_v2** | High | Multilingual |
86+
| **eleven_flash_v2_5** | Good | Fast, low latency |
87+
| **eleven_turbo_v2_5** | Good | Low latency, multilingual |
88+
89+
Override model with `--model`. List voices with `prompt-tools voices --provider elevenlabs`.
90+
91+
**ElevenLabs API key requires these permissions:** Text to Speech > Access, Voices > Read, Models > Access.
92+
7893
## Speak Examples
7994

8095
```bash
@@ -99,8 +114,11 @@ prompt-tools speak "Hello" --sample-rate 16000 --encoding linear16 -o hello.wav
99114
# MP3 output
100115
prompt-tools speak "Hello" --format mp3 -o hello.mp3
101116

102-
# ElevenLabs provider
103-
prompt-tools speak "Hello" --provider elevenlabs --voice <voice-id> -o hello.wav
117+
# ElevenLabs provider (by voice name)
118+
prompt-tools speak "Hello" --provider elevenlabs --voice Sarah -o hello.wav
119+
120+
# ElevenLabs with specific model
121+
prompt-tools speak "Hello" --provider elevenlabs --voice Sarah --model eleven_multilingual_v2 -o hello.wav
104122
```
105123

106124
## Voice Listing Examples

0 commit comments

Comments
 (0)