Skip to content

Commit ad9734f

Browse files
committed
Pass the audio format correctly and improve typing
1 parent 72564a5 commit ad9734f

2 files changed

Lines changed: 31 additions & 15 deletions

File tree

src/elevenlabs/realtime/connection.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
import typing
55
from enum import Enum
66

7+
if typing.TYPE_CHECKING:
8+
from websockets.asyncio.client import ClientConnection
9+
710

811
class RealtimeEvents(str, Enum):
912
"""Events emitted by the RealtimeConnection"""
@@ -55,7 +58,7 @@ class RealtimeConnection:
5558
```
5659
"""
5760

58-
def __init__(self, websocket, current_sample_rate: int, ffmpeg_process: typing.Optional[subprocess.Popen] = None):
61+
def __init__(self, websocket: "ClientConnection", current_sample_rate: int, ffmpeg_process: typing.Optional[subprocess.Popen] = None):
5962
self.websocket = websocket
6063
self.current_sample_rate = current_sample_rate
6164
self.ffmpeg_process = ffmpeg_process

src/elevenlabs/realtime/scribe.py

Lines changed: 27 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,10 @@
33
import subprocess
44
import typing
55
from enum import Enum
6+
from typing import Required, overload
67

78
try:
8-
import websockets
9+
from websockets.asyncio.client import connect as websocket_connect
910
except ImportError:
1011
raise ImportError(
1112
"The websockets package is required for realtime speech-to-text. "
@@ -17,10 +18,13 @@
1718

1819
class AudioFormat(str, Enum):
1920
"""Audio format options for realtime transcription"""
21+
PCM_8000 = "pcm_8000"
2022
PCM_16000 = "pcm_16000"
2123
PCM_22050 = "pcm_22050"
2224
PCM_24000 = "pcm_24000"
2325
PCM_44100 = "pcm_44100"
26+
PCM_48000 = "pcm_48000"
27+
ULAW_8000 = "ulaw_8000"
2428

2529

2630
class CommitStrategy(str, Enum):
@@ -50,9 +54,9 @@ class RealtimeAudioOptions(typing.TypedDict, total=False):
5054
language_code: An ISO-639-1 or ISO-639-3 language code corresponding to the language of the audio file. Providing it, when known beforehand, can sometimes improve transcription performance.
5155
include_timestamps: Whether to receive the committed_transcript_with_timestamps event after committing the segment (optional, defaults to False)
5256
"""
53-
model_id: str
54-
audio_format: AudioFormat
55-
sample_rate: int
57+
model_id: Required[str]
58+
audio_format: Required[AudioFormat]
59+
sample_rate: Required[int]
5660
commit_strategy: CommitStrategy
5761
vad_silence_threshold_secs: float
5862
vad_threshold: float
@@ -77,8 +81,8 @@ class RealtimeUrlOptions(typing.TypedDict, total=False):
7781
language_code: An ISO-639-1 or ISO-639-3 language code corresponding to the language of the audio file. Providing it, when known beforehand, can sometimes improve transcription performance.
7882
include_timestamps: Whether to receive the committed_transcript_with_timestamps event after committing the segment (optional, defaults to False)
7983
"""
80-
model_id: str
81-
url: str
84+
model_id: Required[str]
85+
url: Required[str]
8286
commit_strategy: CommitStrategy
8387
vad_silence_threshold_secs: float
8488
vad_threshold: float
@@ -121,6 +125,18 @@ def __init__(self, api_key: str, base_url: str = "wss://api.elevenlabs.io"):
121125
self.api_key = api_key
122126
self.base_url = base_url
123127

128+
@overload
129+
async def connect(
130+
self,
131+
options: RealtimeAudioOptions
132+
) -> RealtimeConnection: ...
133+
134+
@overload
135+
async def connect(
136+
self,
137+
options: RealtimeUrlOptions
138+
) -> RealtimeConnection: ...
139+
124140
async def connect(
125141
self,
126142
options: typing.Union[RealtimeAudioOptions, RealtimeUrlOptions]
@@ -185,8 +201,7 @@ async def _connect_audio(self, options: RealtimeAudioOptions) -> RealtimeConnect
185201
# Build WebSocket URL with query parameters
186202
ws_url = self._build_websocket_url(
187203
model_id=model_id,
188-
encoding=audio_format.value,
189-
sample_rate=sample_rate,
204+
audio_format=audio_format.value,
190205
commit_strategy=commit_strategy.value,
191206
vad_silence_threshold_secs=vad_silence_threshold_secs,
192207
vad_threshold=vad_threshold,
@@ -197,7 +212,7 @@ async def _connect_audio(self, options: RealtimeAudioOptions) -> RealtimeConnect
197212
)
198213

199214
# Connect to WebSocket
200-
websocket = await websockets.connect(
215+
websocket = await websocket_connect(
201216
ws_url,
202217
additional_headers={"xi-api-key": self.api_key}
203218
)
@@ -249,7 +264,7 @@ async def _connect_url(self, options: RealtimeUrlOptions) -> RealtimeConnection:
249264
)
250265

251266
# Connect to WebSocket
252-
websocket = await websockets.connect(
267+
websocket = await websocket_connect(
253268
ws_url,
254269
additional_headers={"xi-api-key": self.api_key}
255270
)
@@ -341,8 +356,7 @@ async def _stream_ffmpeg_to_websocket(self, connection: RealtimeConnection) -> N
341356
def _build_websocket_url(
342357
self,
343358
model_id: str,
344-
encoding: str,
345-
sample_rate: int,
359+
audio_format: str,
346360
commit_strategy: str,
347361
vad_silence_threshold_secs: typing.Optional[float] = None,
348362
vad_threshold: typing.Optional[float] = None,
@@ -358,8 +372,7 @@ def _build_websocket_url(
358372
# Build query parameters
359373
params = [
360374
f"model_id={model_id}",
361-
f"encoding={encoding}",
362-
f"sample_rate={sample_rate}",
375+
f"audio_format={audio_format}",
363376
f"commit_strategy={commit_strategy}"
364377
]
365378

0 commit comments

Comments
 (0)