
Commit a0042ea

fix: openai realtime participants and conversation sync (#258)
1 parent 6ab5238 commit a0042ea

2 files changed: 7 additions & 133 deletions


plugins/openai/tests/test_openai_realtime.py

Lines changed: 0 additions & 105 deletions
@@ -250,19 +250,6 @@ async def on_user_transcript(event: RealtimeUserSpeechTranscriptionEvent):
         )
         await realtime.simple_audio_response(pcm, test_participant)
 
-        # Simulate OpenAI creating the conversation item (this is when we map item_id -> participant)
-        item_created_event = {
-            "type": "conversation.item.created",
-            "event_id": "event_created_123",
-            "item": {
-                "id": "item_test_456",
-                "type": "message",
-                "role": "user",
-                "content": [],
-            },
-        }
-        await realtime._handle_openai_event(item_created_event)
-
         # Now simulate receiving the transcription event from OpenAI
         openai_event = {
             "content_index": 0,
@@ -286,95 +273,3 @@ async def on_user_transcript(event: RealtimeUserSpeechTranscriptionEvent):
 
         # Verify the user_id() helper method works
         assert user_transcripts[0].user_id() == "test_user_123"
-
-    async def test_multi_user_participant_tracking(self, realtime):
-        """Test that participant tracking works correctly when multiple users speak in succession"""
-        user_transcripts = []
-
-        @realtime.events.subscribe
-        async def on_user_transcript(event: RealtimeUserSpeechTranscriptionEvent):
-            user_transcripts.append(event)
-
-        from vision_agents.core.edge.types import Participant
-        from getstream.video.rtc.track_util import PcmData, AudioFormat
-        import numpy as np
-
-        # User A sends audio
-        participant_a = Participant(original=None, user_id="user_a")
-        pcm_a = PcmData(
-            samples=np.zeros(100, dtype=np.int16),
-            sample_rate=48000,
-            format=AudioFormat.S16,
-        )
-        await realtime.simple_audio_response(pcm_a, participant_a)
-
-        # OpenAI creates conversation item for User A
-        item_created_a = {
-            "type": "conversation.item.created",
-            "event_id": "event_created_a",
-            "item": {
-                "id": "item_a_123",
-                "type": "message",
-                "role": "user",
-                "content": [],
-            },
-        }
-        await realtime._handle_openai_event(item_created_a)
-
-        # User B sends audio (before A's transcription arrives)
-        participant_b = Participant(original=None, user_id="user_b")
-        pcm_b = PcmData(
-            samples=np.zeros(100, dtype=np.int16),
-            sample_rate=48000,
-            format=AudioFormat.S16,
-        )
-        await realtime.simple_audio_response(pcm_b, participant_b)
-
-        # OpenAI creates conversation item for User B
-        item_created_b = {
-            "type": "conversation.item.created",
-            "event_id": "event_created_b",
-            "item": {
-                "id": "item_b_456",
-                "type": "message",
-                "role": "user",
-                "content": [],
-            },
-        }
-        await realtime._handle_openai_event(item_created_b)
-
-        # Now transcriptions arrive (A's transcription arrives AFTER B started speaking)
-        transcription_a = {
-            "content_index": 0,
-            "event_id": "event_trans_a",
-            "item_id": "item_a_123",  # References User A's item
-            "transcript": "Hello from User A",
-            "type": "conversation.item.input_audio_transcription.completed",
-            "usage": {"seconds": 1, "type": "duration"},
-        }
-        await realtime._handle_openai_event(transcription_a)
-
-        transcription_b = {
-            "content_index": 0,
-            "event_id": "event_trans_b",
-            "item_id": "item_b_456",  # References User B's item
-            "transcript": "Hello from User B",
-            "type": "conversation.item.input_audio_transcription.completed",
-            "usage": {"seconds": 1, "type": "duration"},
-        }
-        await realtime._handle_openai_event(transcription_b)
-
-        await asyncio.sleep(0.1)
-
-        # Verify both transcriptions are attributed to the correct users
-        assert len(user_transcripts) == 2
-
-        # User A's transcription should be attributed to User A (not B, despite B speaking more recently)
-        assert user_transcripts[0].text == "Hello from User A"
-        assert user_transcripts[0].participant is not None
-        assert user_transcripts[0].participant.user_id == "user_a"
-
-        # User B's transcription should be attributed to User B
-        assert user_transcripts[1].text == "Hello from User B"
-        assert user_transcripts[1].participant is not None
-        assert user_transcripts[1].participant.user_id == "user_b"
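
With the conversation.item.created simulation and the multi-user test removed, attribution now follows whoever spoke most recently. A minimal sketch of what the surviving test still exercises, reusing the Participant, PcmData, np, realtime, and user_transcripts names from the test file above (the event payload mirrors the deleted ones):

    participant = Participant(original=None, user_id="test_user_123")
    pcm = PcmData(
        samples=np.zeros(100, dtype=np.int16),
        sample_rate=48000,
        format=AudioFormat.S16,
    )

    # simple_audio_response records the speaker in _current_participant
    await realtime.simple_audio_response(pcm, participant)

    # The transcription event no longer needs a prior conversation.item.created
    await realtime._handle_openai_event({
        "content_index": 0,
        "event_id": "event_trans_1",
        "item_id": "item_test_456",
        "transcript": "Hello",
        "type": "conversation.item.input_audio_transcription.completed",
        "usage": {"seconds": 1, "type": "duration"},
    })

    # The transcript is attributed to the most recent speaker
    assert user_transcripts[0].user_id() == "test_user_123"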

plugins/openai/vision_agents/plugins/openai/openai_realtime.py

Lines changed: 7 additions & 28 deletions
@@ -102,10 +102,6 @@ def __init__(
         self.realtime_session["audio"]["output"] = RealtimeAudioConfigOutputParam()
         self.realtime_session["audio"]["output"]["voice"] = self.voice
 
-        # Map conversation item_id to participant to handle multi-user scenarios
-        self._item_to_participant: Dict[str, Participant] = {}
-        self._pending_participant: Optional[Participant] = None
-
         # Track pending tool calls: item_id -> {call_id, name, argument_parts: []}
         # We accumulate argument deltas until response.output_item.done
         self._pending_tool_calls: Dict[str, Dict[str, Any]] = {}
@@ -196,8 +192,8 @@ async def simple_audio_response(
             audio: PCM audio frame to forward upstream.
             participant: Optional participant information for the audio source.
         """
-        # Track pending participant for the next conversation item
-        self._pending_participant = participant
+        # Track current participant for user speech transcription events
+        self._current_participant = participant
         await self.rtc.send_audio_pcm(audio)
 
     async def close(self):
@@ -246,15 +242,9 @@ async def _handle_openai_event(self, event: dict) -> None:
                 conversation_item_id=transcript_event.item_id,
             )
         elif et == "conversation.item.created":
-            # When OpenAI creates a conversation item, map it to the participant who sent the audio
-            item = event.get("item", {})
-            if item.get("type") == "message" and item.get("role") == "user":
-                item_id = item.get("id")
-                if item_id and self._pending_participant:
-                    self._item_to_participant[item_id] = self._pending_participant
-                    logger.debug(
-                        f"Mapped item {item_id} to participant {self._pending_participant.user_id if self._pending_participant else 'None'}"
-                    )
+            # Conversation item created - no action needed
+            # Participant tracking is handled via _current_participant in simple_audio_response
+            pass
         elif et == "conversation.item.added":
             # Conversation item was added to the conversation
             pass
@@ -266,22 +256,11 @@ async def _handle_openai_event(self, event: dict) -> None:
             user_transcript_event: ConversationItemInputAudioTranscriptionCompletedEvent = ConversationItemInputAudioTranscriptionCompletedEvent.model_validate(
                 event
            )
-            item_id = user_transcript_event.item_id
-
-            # Look up the correct participant for this transcription
-            participant = self._item_to_participant.get(item_id)
-
-            # Temporarily set the correct participant for this specific transcription
-            original_participant = self._current_participant
-            self._current_participant = participant
+            # _current_participant is kept up-to-date in simple_audio_response
+            # so it will be used by _emit_user_speech_transcription
             self._emit_user_speech_transcription(
                 text=user_transcript_event.transcript, original=event
             )
-            self._current_participant = original_participant
-
-            # Clean up the mapping to avoid memory leaks
-            if item_id:
-                self._item_to_participant.pop(item_id, None)
         elif et == "input_audio_buffer.speech_started":
             # Validate event but don't need to store it
             InputAudioBufferSpeechStartedEvent.model_validate(event)
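
Taken together, the change replaces the per-item bookkeeping with a single field: simple_audio_response records the current speaker, and the transcription handler reads it when emitting. A condensed, self-contained sketch of the resulting flow (names abbreviated; this is an illustration, not the actual class):

    from typing import Any, Optional

    class ParticipantFlowSketch:
        """Condensed model of the post-change flow (illustration only)."""

        def __init__(self) -> None:
            # One field instead of _item_to_participant / _pending_participant
            self._current_participant: Optional[Any] = None

        async def simple_audio_response(self, audio: Any, participant: Any = None) -> None:
            # Track current participant for user speech transcription events
            self._current_participant = participant
            # (forwarding the PCM upstream via self.rtc.send_audio_pcm is elided)

        async def handle_openai_event(self, event: dict) -> None:
            et = event.get("type")
            if et == "conversation.item.created":
                pass  # no item_id -> participant mapping to maintain any more
            elif et == "conversation.item.input_audio_transcription.completed":
                # The transcript is attributed to whoever spoke most recently
                print(event["transcript"], "->",
                      getattr(self._current_participant, "user_id", None))

Note the trade-off the deleted multi-user test made explicit: if a second user starts speaking before the first user's transcription arrives, the transcript appears to be attributed to the most recent speaker. The simpler single-field approach trades that edge case for less state to keep in sync.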
