elevenlabs
diff --git a/‎src/elevenlabs/speech_engine/__init__.py‎
Lines changed: 2 additions & 1 deletion b/‎src/elevenlabs/speech_engine/__init__.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎src/elevenlabs/speech_engine/resource.py‎
Lines changed: 125 additions & 1 deletion b/‎src/elevenlabs/speech_engine/resource.py‎
Lines changed: 125 additions & 1 deletion
diff --git a/‎src/elevenlabs/speech_engine/server.py‎
Lines changed: 54 additions & 0 deletions b/‎src/elevenlabs/speech_engine/server.py‎
Lines changed: 54 additions & 0 deletions
diff --git a/‎src/elevenlabs/speech_engine/session.py‎
Lines changed: 36 additions & 9 deletions b/‎src/elevenlabs/speech_engine/session.py‎
Lines changed: 36 additions & 9 deletions
@@ -1,6 +1,6 @@
 """ElevenLabs Speech Engine SDK module."""
 
-from .resource import SpeechEngineResource
+from .resource import SpeechEngineResource, verify_speech_engine_jwt
 from .server import SpeechEngineServer
 from .session import SpeechEngineSession
 from .types import (
@@ -19,6 +19,7 @@
     "SpeechEngineServer",
     "SpeechEngineSession",
     "WebSocketLike",
+    "verify_speech_engine_jwt",
     "CLOSE",
     "DISCONNECTED",
     "ERROR",
 
@@ -1,11 +1,101 @@
 """SpeechEngineResource — client-facing handle for a speech engine instance."""
 
+import base64
+import hashlib
+import hmac
+import json
+import logging
+import time
 import typing
 
 from .server import SpeechEngineServer
 from .session import SpeechEngineSession
 from .types import WebSocketLike
 
+logger = logging.getLogger("elevenlabs.speech_engine")
+
+_ISSUER = "https://api.elevenlabs.io/convai/speech-engine"
+_SUBJECT = "convai_speech_engine_upstream"
+_LEEWAY_SECONDS = 60
+
+
+def _base64url_decode(data: str) -> bytes:
+    """Decode a base64url string, restoring any stripped ``=`` padding."""
+    # JWT segments omit their padding, so top the length back up to a
+    # multiple of four before handing the text to the decoder.
+    missing = -len(data) % 4
+    return base64.urlsafe_b64decode(data + "=" * missing)
+
+
+def verify_speech_engine_jwt(value: str, api_key: str) -> typing.Dict[str, typing.Any]:
+    """Verify an HS256 JWT from the ElevenLabs Speech Engine API.
+
+    The HMAC secret is the SHA-256 hash of the API key.
+
+    Args:
+        value: The raw token, optionally prefixed with ``Bearer ``
+            (matched case-insensitively).
+        api_key: The API key; surrounding whitespace is stripped before
+            the secret is derived.
+
+    Returns:
+        The decoded payload (a JSON object) on success.
+
+    Raises:
+        ValueError: If the token is malformed, the signature does not
+            match, or the ``iss``/``sub``/``exp``/``iat`` claims are
+            missing or do not validate.
+    """
+    token = value.strip()
+    if token.lower().startswith("bearer "):
+        token = token[7:].strip()
+
+    parts = token.split(".")
+    if len(parts) != 3:
+        raise ValueError("Invalid JWT: expected 3 parts")
+
+    header_b64, payload_b64, signature_b64 = parts
+
+    try:
+        payload = json.loads(_base64url_decode(payload_b64))
+    except Exception as exc:
+        raise ValueError("Invalid JWT: failed to decode payload") from exc
+    # The claim checks below use ``payload.get``; without this guard a
+    # payload such as ``[]`` would raise AttributeError, not ValueError.
+    if not isinstance(payload, dict):
+        raise ValueError("Invalid JWT: payload is not a JSON object")
+
+    try:
+        actual_sig = _base64url_decode(signature_b64)
+    except ValueError as exc:
+        raise ValueError("Invalid JWT: failed to decode signature") from exc
+
+    trimmed_key = api_key.strip()
+    secret = hashlib.sha256(trimmed_key.encode("utf-8")).digest()
+
+    # The signature covers the first two segments exactly as received.
+    expected_sig = hmac.new(
+        secret, f"{header_b64}.{payload_b64}".encode(), hashlib.sha256
+    ).digest()
+
+    # Constant-time comparison so timing does not reveal matching bytes.
+    if not hmac.compare_digest(expected_sig, actual_sig):
+        key_prefix = (
+            f"{trimmed_key[:4]}...{trimmed_key[-4:]}"
+            if len(trimmed_key) > 8
+            else "****"
+        )
+        whitespace_note = (
+            " — key had trailing whitespace that was trimmed"
+            if len(trimmed_key) != len(api_key)
+            else ""
+        )
+        raise ValueError(
+            f"Invalid JWT: signature mismatch "
+            f"(API key: {key_prefix}, {len(trimmed_key)} chars{whitespace_note})"
+        )
+
+    if payload.get("iss") != _ISSUER:
+        raise ValueError(
+            f'Invalid JWT: expected issuer "{_ISSUER}", got "{payload.get("iss")}"'
+        )
+    if payload.get("sub") != _SUBJECT:
+        raise ValueError(
+            f'Invalid JWT: expected subject "{_SUBJECT}", got "{payload.get("sub")}"'
+        )
+
+    now = int(time.time())
+
+    exp = payload.get("exp")
+    if not isinstance(exp, (int, float)):
+        raise ValueError("Invalid JWT: missing exp claim")
+    iat = payload.get("iat")
+    if not isinstance(iat, (int, float)):
+        raise ValueError("Invalid JWT: missing iat claim")
+    # Both time checks tolerate _LEEWAY_SECONDS of clock skew.
+    if exp + _LEEWAY_SECONDS < now:
+        raise ValueError("Invalid JWT: token has expired")
+    if iat - _LEEWAY_SECONDS > now:
+        raise ValueError("Invalid JWT: iat is in the future")
+
+    return payload
+
 
 class SpeechEngineResource:
     """Represents a speech engine instance.
@@ -39,15 +129,49 @@ def __init__(
         self.engine_id = engine_id
         self._options = client_options
 
+    def _get_api_key(self) -> typing.Optional[str]:
+        """Return ``_api_key`` from the client options, or ``None`` if unavailable."""
+        # getattr with a default covers both a missing options object
+        # (None) and options that carry no ``_api_key`` attribute.
+        return getattr(self._options, "_api_key", None)
+
+    def verify_request(
+        self, headers: typing.Dict[str, typing.Any]
+    ) -> bool:
+        """Verify that an incoming request is from the ElevenLabs API.
+
+        Checks the ``X-Elevenlabs-Speech-Engine-Authorization`` header
+        for a valid JWT signed with the SHA-256 hash of the API key.
+        The header name is matched case-insensitively, and a list or
+        tuple value contributes its first element.
+
+        Only needed when managing the WebSocket upgrade yourself.
+        When using :meth:`serve`, verification is handled automatically.
+
+        Returns:
+            ``True`` when the token verifies; ``False`` when no API key
+            is configured, the header is absent, empty or not a string,
+            or verification fails.
+        """
+        api_key = self._get_api_key()
+        if not api_key:
+            return False
+
+        header_name = "x-elevenlabs-speech-engine-authorization"
+        raw = headers.get(header_name)
+        if raw is None:
+            # Plain dicts are case-sensitive while HTTP header names are
+            # not, so fall back to a case-insensitive scan of the keys.
+            for key, candidate in headers.items():
+                if isinstance(key, str) and key.lower() == header_name:
+                    raw = candidate
+                    break
+        if isinstance(raw, (list, tuple)):
+            raw = raw[0] if raw else None
+        # Anything other than a non-empty string cannot be a token.
+        if not isinstance(raw, str) or not raw:
+            return False
+        try:
+            verify_speech_engine_jwt(raw, api_key)
+            return True
+        except ValueError:
+            return False
+
     async def serve(
         self,
         *,
         port: int = 3001,
+        path: typing.Optional[str] = None,
         debug: bool = False,
         **handlers: typing.Any,
     ) -> None:
         """Start a standalone WebSocket server.  Blocks until stopped."""
-        server = SpeechEngineServer(port=port, debug=debug, **handlers)
+        # The key is read from the client options and may be None here;
+        # SpeechEngineServer.serve() then falls back to the
+        # ELEVENLABS_API_KEY environment variable and raises RuntimeError
+        # when no key is found.
+        api_key = self._get_api_key()
+        server = SpeechEngineServer(
+            port=port, path=path, debug=debug, api_key=api_key, **handlers
+        )
         await server.serve()
 
     def create_session(
 
@@ -2,6 +2,7 @@
 
 import asyncio
 import logging
+import os
 import typing
 
 from .session import SpeechEngineSession, _wire_handlers
@@ -15,10 +16,14 @@ class SpeechEngineServer:
     instances for each incoming connection from the ElevenLabs Speech Engine
     API.
 
+    Every incoming connection is verified against the ElevenLabs API using
+    the configured API key before being accepted.
+
     Example::
 
         server = SpeechEngineServer(
             port=3001,
+            api_key="sk_...",
             debug=True,
             on_transcript=handle_transcript,
         )
@@ -29,15 +34,28 @@ def __init__(
         self,
         *,
         port: int = 3001,
+        path: typing.Optional[str] = None,
+        api_key: typing.Optional[str] = None,
         debug: bool = False,
         **handlers: typing.Any,
     ) -> None:
         self._port = port
+        self._path = path
+        self._api_key = api_key
         self._debug = debug
         self._handlers = handlers
         self._stop_event = asyncio.Event()
         self._server = None  # type: typing.Any
 
+        if debug:
+            logger.setLevel(logging.DEBUG)
+            if not logger.handlers:
+                handler = logging.StreamHandler()
+                handler.setFormatter(
+                    logging.Formatter("[SpeechEngine] %(message)s")
+                )
+                logger.addHandler(handler)
+
     def handle_connection(self, ws: WebSocketLike) -> SpeechEngineSession:
         """Wrap *ws* in a :class:`SpeechEngineSession` with the server's
         handlers wired up.
@@ -46,15 +64,51 @@ def handle_connection(self, ws: WebSocketLike) -> SpeechEngineSession:
         individual connections.  The returned session's :meth:`run` must
         still be awaited by the caller.
         """
+        logger.debug("creating new session")
         session = SpeechEngineSession(ws, debug=self._debug)
         _wire_handlers(session, self._handlers)
         return session
 
     async def serve(self) -> None:
         """Start the WebSocket server.  Blocks until :meth:`stop` is called."""
+        from .resource import verify_speech_engine_jwt  # noqa: E402
+
         import websockets  # noqa: E402 — keep import lazy
 
+        api_key = self._api_key or os.environ.get("ELEVENLABS_API_KEY")
+        if not api_key:
+            raise RuntimeError(
+                "SpeechEngineServer requires an API key to verify incoming "
+                "connections. Pass api_key= or set the ELEVENLABS_API_KEY "
+                "environment variable."
+            )
+
         async def _handler(websocket: typing.Any, *_args: typing.Any) -> None:
+            # Per-connection gate: check the optional path, then the
+            # authorization header, before handing off to a session.
+            if self._path is not None and websocket.request.path != self._path:
+                await websocket.close(4000, "not found")
+                return
+
+            header_value = websocket.request.headers.get(
+                "x-elevenlabs-speech-engine-authorization"
+            )
+            if not header_value:
+                logger.debug(
+                    "rejected connection — missing "
+                    "X-Elevenlabs-Speech-Engine-Authorization header"
+                )
+                await websocket.close(
+                    4001, "missing authorization header"
+                )
+                return
+
+            try:
+                verify_speech_engine_jwt(header_value, api_key)
+            except ValueError as e:
+                logger.debug("rejected connection — %s", e)
+                # Keep the detailed reason in the local log only: the
+                # signature-mismatch message embeds a fragment of the API
+                # key and its length, and the peer is unauthenticated at
+                # this point.  A fixed reason also stays well within the
+                # size limit for close-frame payloads.
+                await websocket.close(4001, "invalid authorization token")
+                return
+
+            logger.debug("verified connection, accepting WebSocket")
             session = self.handle_connection(websocket)
             await session.run()
 
 
@@ -5,7 +5,7 @@
 import logging
 import typing
 
-from .types import ConversationMessage, WebSocketLike
+from .types import ConversationMessage, WebSocketLike, wrap_websocket
 
 logger = logging.getLogger("elevenlabs.speech_engine")
 
@@ -159,11 +159,11 @@ async def handle(transcript):
 
     def __init__(
         self,
-        ws: WebSocketLike,
+        ws: typing.Any,
         *,
         debug: bool = False,
     ) -> None:
-        self._ws = ws
+        self._ws = wrap_websocket(ws)
         self._conversation_id = None  # type: typing.Optional[str]
         self._current_task = None  # type: typing.Optional[asyncio.Task]  # type: ignore[type-arg]
         self._current_event_id = None  # type: typing.Optional[int]
@@ -173,6 +173,12 @@ def __init__(
 
         if debug:
             logger.setLevel(logging.DEBUG)
+            if not logger.handlers:
+                handler = logging.StreamHandler()
+                handler.setFormatter(
+                    logging.Formatter("[SpeechEngine] %(message)s")
+                )
+                logger.addHandler(handler)
 
     # ------------------------------------------------------------------
     # Event emitter interface
@@ -232,8 +238,7 @@ async def run(self) -> None:
                 except asyncio.CancelledError:
                     raise
                 except Exception:
-                    # Connection closed or errored — exit the loop and
-                    # let the finally block emit "disconnected".
+                    logger.debug("WebSocket connection lost")
                     break
 
                 try:
@@ -275,10 +280,19 @@ async def send_response(
         if self._closed:
             raise RuntimeError("Cannot send response: session is closed")
 
+        if self._current_event_id is None:
+            logger.warning(
+                "sendResponse() called outside of an on_transcript handler. "
+                "Responses can only be sent in reply to a user transcript. "
+                "To have the agent speak first, set a first message in your "
+                "Speech Engine conversation config on the client."
+            )
+            return
+
         if isinstance(response, str):
             logger.debug(
-                "sending string response (%d chars), event_id=%s",
-                len(response),
+                'sending string response: "%s", event_id=%s',
+                response,
                 self._current_event_id,
             )
             await self._send_agent_response(response, False)
@@ -318,6 +332,19 @@ async def _handle_message(self, msg: typing.Dict[str, typing.Any]) -> None:
             await self._emit("init", self._conversation_id)
 
         elif msg_type == "user_transcript":
+            incoming_event_id = msg.get("event_id")
+
+            if (
+                incoming_event_id == self._current_event_id
+                and self._current_task is not None
+                and not self._current_task.done()
+            ):
+                logger.debug(
+                    "skipping duplicate transcript, event_id=%s",
+                    incoming_event_id,
+                )
+                return
+
             was_active = (
                 self._current_task is not None
                 and not self._current_task.done()
@@ -328,10 +355,10 @@ async def _handle_message(self, msg: typing.Dict[str, typing.Any]) -> None:
                     "interrupted: cancelling previous response "
                     "(event_id=%s) for new transcript (event_id=%s)",
                     self._current_event_id,
-                    msg.get("event_id"),
+                    incoming_event_id,
                 )
 
-            self._current_event_id = msg.get("event_id")
+            self._current_event_id = incoming_event_id
             transcript_data = msg.get("user_transcript", [])
             logger.debug(
                 "received transcript, event_id=%s, messages=%d",