fix(inworld-tts): don't poison receive stream with stale-context errors (#539)

aliev · web-flow · commit c24ea70e589b · 2026-05-08T09:35:31.000-06:00
Two related issues caused TTS calls to fail with `Context  not found
during sendText` after >60 seconds of inactivity, leaving the agent
unable to speak its proactive responses.

1. **`_keepalive_loop` sends `send_text` without a `contextId`.**
   When `_active_context_id` is `None`, the keepalive payload still
   contains `{"send_text": {"text": ""}}` with no `contextId`. The
   server cannot route this to a context and responds with an error
   message. The error then sits in the WebSocket receive buffer and
   surfaces on the next valid TTS call, breaking it. Fix: skip the
   keepalive iteration when no active context exists. The websockets
   library handles connection keepalive via WebSocket PING/PONG frames
   independently.

2. **`_receive_audio` did not filter errors by `contextId`.** Audio
   chunks were filtered by `msg_context_id != context_id`, but the
   error/status check ran first and raised regardless of which context
   the message was for. Fix: pull the `contextId` mismatch check above
   the status/error checks so messages addressed to a different (or
   stale) context are dropped early. Server-wide errors with no
   `contextId` (e.g. "max contexts limit reached") still pass through.
diff --git a/plugins/inworld/vision_agents/plugins/inworld/tts.py b/plugins/inworld/vision_agents/plugins/inworld/tts.py
@@ -208,8 +208,18 @@ async def _receive_audio(
                     continue
 
                 result = data.get("result", {})
+                msg_context_id = result.get("contextId") or result.get("context_id")
                 status = result.get("status", {})
-                if status.get("code", 0) != 0:
+                status_code = status.get("code", 0)
+
+                # Drop messages addressed to a different context: they belong
+                # to a stale or already-closed call (or a keepalive whose
+                # context the server doesn't know about). Server-wide errors
+                # with no contextId still pass through.
+                if msg_context_id and msg_context_id != context_id:
+                    continue
+
+                if status_code != 0:
                     error_message = status.get("message", "Unknown Inworld error")
                     if "max contexts limit reached" in error_message.lower():
                         logger.warning(
@@ -221,10 +231,6 @@ async def _receive_audio(
                 if "error" in data:
                     raise RuntimeError(f"Inworld TTS websocket error: {data['error']}")
 
-                msg_context_id = result.get("contextId") or result.get("context_id")
-                if msg_context_id and msg_context_id != context_id:
-                    continue
-
                 audio_chunk = result.get("audioChunk", {})
                 audio_b64 = audio_chunk.get("audioContent")
                 if audio_b64:
@@ -362,9 +368,17 @@ async def _keepalive_loop(self) -> None:
                 if self._websocket is websocket:
                     self._websocket = None
                 return
-            payload: dict[str, object] = {"send_text": {"text": ""}}
-            if self._active_context_id:
-                payload["contextId"] = self._active_context_id
+            # Without an active context the server has nothing to attach a
+            # `send_text` to and responds with "Context not found", which then
+            # corrupts the next valid TTS call's receive stream. The websockets
+            # library handles TCP-level keepalive via PING/PONG on its own, so
+            # skipping iterations here is safe.
+            if self._active_context_id is None:
+                continue
+            payload: dict[str, object] = {
+                "send_text": {"text": ""},
+                "contextId": self._active_context_id,
+            }
             try:
                 await websocket.send(json.dumps(payload))
             except (websockets.exceptions.WebSocketException, OSError):