Revert "fix(groupchat): detect participant loops via executor_completed events"

Prachig-Microsoft · Prachig-Microsoft · commit 03387c45df1f · 2026-06-13T15:52:36.000+05:30
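This reverts commit 7a0212f.

Summary of what is undone (from the hunks below): removes the
``executor_completed``-driven participant tracking in ``run_stream``
(``_track_participant_completion``, its four state fields in ``__init__``,
and the three unit tests covering it) together with the top-of-loop
termination-flag check, and restores the inline ``break`` checks on
``_forced_termination_requested`` / ``_termination_requested``. After this
revert the only ``max_rounds`` guard visible in this diff is the
``len(self.agent_responses) >= self.max_rounds`` check.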
diff --git a/src/processor/src/libs/agent_framework/groupchat_orchestrator.py b/src/processor/src/libs/agent_framework/groupchat_orchestrator.py
@@ -313,33 +313,6 @@ def __init__(
         # Snapshot of progress_counter at the time we last saw _last_coordinator_selection.
         self._last_coordinator_selection_progress: int = 0
 
-        # Per-participant turn tracking driven by ``WorkflowEvent.executor_completed``.
-        #
-        # In agent-framework 1.3.0 the GroupChat orchestrator agent (the
-        # Coordinator) is invoked directly inside the framework's internal
-        # ``_invoke_agent_helper`` (see
-        # ``agent_framework_orchestrations/_group_chat.py:484``). It is NOT
-        # wrapped in an ``AgentExecutor`` and therefore never surfaces as a
-        # workflow event - which makes the Coordinator-JSON-based loop
-        # detection in ``_complete_agent_response`` permanently dead in 1.3.0.
-        #
-        # The only observable "the conversation is moving" pulse we have is
-        # ``executor_completed`` events for the *participants* (which DO go
-        # through ``AgentExecutor``). We track:
-        #   - the most recently completed participant,
-        #   - the streak of consecutive completions of that participant,
-        #   - the total number of participant turns,
-        # and use these for two safety nets in the streaming loop:
-        #   * 3+ consecutive same-participant turns => hard_loop termination
-        #   * total turns >= ``max_rounds`` => hard_timeout termination
-        # (independent of ``len(self.agent_responses)`` which only grows on
-        # agent switch and so cannot reach ``max_rounds`` during a same-
-        # participant loop).
-        self._participant_completions_total: int = 0
-        self._last_completed_participant: str | None = None
-        self._participant_completion_streak: int = 0
-        self._participant_consecutive_loop_threshold: int = 3
-
     def _request_forced_termination(
         self, *, reason: str, termination_type: str
     ) -> None:
@@ -570,15 +543,6 @@ async def run_stream(
                             termination_type="hard_timeout",
                         )
 
-                # Honor any pending termination request at the *top* of each
-                # iteration so that branches which set the flags (timeout,
-                # participant loop detection, Coordinator finish=true) take
-                # effect immediately on the next event - rather than being
-                # gated on the next ``output`` event arriving (which during a
-                # slow loop can be many seconds away).
-                if self._forced_termination_requested or self._termination_requested:
-                    break
-
                 # In agent-framework 1.3.0, ``workflow.run(stream=True)`` yields
                 # only ``WorkflowEvent`` instances; ``AgentResponseUpdate`` is
                 # wrapped inside ``WorkflowEvent.data`` for ``type=="output"``
@@ -588,46 +552,7 @@ async def run_stream(
                 # ``WorkflowEvent.type`` and inspect ``event.data`` /
                 # ``event.executor_id`` to route per-participant streaming
                 # chunks vs the orchestrator's final output.
-                if not isinstance(event, WorkflowEvent):
-                    continue
-
-                # Participant turn completion. Used for loop / max_rounds
-                # safety nets that work even when the Coordinator is
-                # invisible to the streaming loop (which it is in 1.3.0 -
-                # the Coordinator runs inside the framework's internal
-                # ``_invoke_agent_helper`` and never surfaces as an executor
-                # event). See ``_track_participant_completion`` for details.
-                if event.type == "executor_completed":
-                    src_executor = self._normalize_executor_id(
-                        event.executor_id or ""
-                    )
-                    if (
-                        src_executor in self.agents
-                        and src_executor != self.coordinator_name
-                        and src_executor != self.get_result_generator_name()
-                    ):
-                        # Flush this participant's streaming buffer into a
-                        # discrete per-turn ``AgentResponse`` before we track
-                        # the completion. Without this, when the framework's
-                        # Coordinator picks the same participant back-to-back
-                        # (the loop pattern we're trying to detect),
-                        # ``_start_agent_if_needed`` sees no agent switch on
-                        # the NEXT turn's chunks and the buffer would grow
-                        # across turns - producing one merged response rather
-                        # than one response per turn.
-                        if (
-                            self._last_executor_id == src_executor
-                            and self._current_agent_response
-                        ):
-                            await self._complete_agent_response(
-                                src_executor, on_agent_response
-                            )
-                            self._current_agent_response = []
-                            self._last_executor_id = None
-                        self._track_participant_completion(src_executor)
-                    continue
-
-                if event.type != "output":
+                if not isinstance(event, WorkflowEvent) or event.type != "output":
                     continue
 
                 data = event.data
@@ -648,12 +573,7 @@ async def run_stream(
                         callback=on_agent_response,
                     )
 
-                    # Secondary max_rounds safety net based on agent switches.
-                    # The primary check lives in ``_track_participant_completion``
-                    # (driven by ``executor_completed`` events) and works even
-                    # when the same agent runs back-to-back. This switch-based
-                    # check is kept as defense-in-depth for sessions with
-                    # normal alternation.
+                    # Enforce max rounds as a safety guard.
                     if self.max_rounds and len(self.agent_responses) >= self.max_rounds:
                         self._request_forced_termination(
                             reason=(
@@ -662,9 +582,13 @@ async def run_stream(
                             termination_type="hard_timeout",
                         )
 
-                    # Termination flags are honored at the top of the next
-                    # iteration so any branch can request termination
-                    # uniformly without duplicating break logic here.
+                    if self._forced_termination_requested:
+                        break
+
+                    # If the Coordinator requested finish=true, stop immediately.
+                    if self._termination_requested:
+                        break
+
                     continue
 
                 # Final orchestrator output: complete any buffered agent
@@ -853,75 +777,6 @@ def _normalize_executor_id(self, executor_id: str) -> str:
         """
         return executor_id.split(":")[-1]
 
-    def _track_participant_completion(self, src_executor: str) -> None:
-        """Track a participant turn completion for loop / max_rounds detection.
-
-        Called from the streaming loop on every ``WorkflowEvent.type ==
-        "executor_completed"`` event whose ``executor_id`` matches one of our
-        registered non-Coordinator, non-ResultGenerator participants.
-
-        Why this exists (agent-framework 1.3.0 design constraint):
-            The framework's ``GroupChatBuilder.orchestrator_agent`` (our
-            Coordinator) is invoked directly via ``self._agent.run(...)``
-            inside ``agent_framework_orchestrations/_group_chat.py:484``. It
-            is NOT wrapped in an ``AgentExecutor`` and therefore never
-            surfaces as a workflow event. Our existing Coordinator-JSON-based
-            loop detector in ``_complete_agent_response`` (lines ~1118-1181)
-            is consequently permanently dead in 1.3.0. We need an independent
-            loop signal that does NOT rely on Coordinator visibility.
-
-        Two safety nets enforced here:
-
-        1. Same-participant streak (``_participant_consecutive_loop_threshold``,
-           default 3): if the Coordinator keeps selecting the same participant
-           (e.g., the Chief Architect latched on producing an Evidence Pack
-           that never satisfies the next reviewer), 3+ consecutive completions
-           of the same participant force-terminate with ``hard_loop``.
-
-        2. Total round budget: each participant turn counts as one round.
-           Once total completions reach ``self.max_rounds`` the workflow
-           force-terminates with ``hard_timeout``. This is independent of
-           ``len(self.agent_responses)`` (which only grows on agent switch
-           via ``_start_agent_if_needed`` and therefore cannot reach
-           ``max_rounds`` during a same-participant loop).
-        """
-        if src_executor == self._last_completed_participant:
-            self._participant_completion_streak += 1
-        else:
-            self._last_completed_participant = src_executor
-            self._participant_completion_streak = 1
-        self._participant_completions_total += 1
-
-        if (
-            self._participant_completion_streak
-            >= self._participant_consecutive_loop_threshold
-        ):
-            self._request_forced_termination(
-                reason=(
-                    f"Loop detected: participant '{src_executor}' completed "
-                    f"{self._participant_completion_streak} consecutive turns "
-                    "with no other participant in between (Coordinator is "
-                    "stuck on the same selection; in agent-framework 1.3.0 "
-                    "the Coordinator runs inside the framework and is "
-                    "invisible to the streaming loop, so we infer this from "
-                    "executor_completed events)"
-                ),
-                termination_type="hard_loop",
-            )
-            return
-
-        if (
-            self.max_rounds
-            and self._participant_completions_total >= self.max_rounds
-        ):
-            self._request_forced_termination(
-                reason=(
-                    f"Workflow exceeded max_rounds={self.max_rounds} "
-                    "participant turns; terminating to avoid infinite loop"
-                ),
-                termination_type="hard_timeout",
-            )
-
     async def _start_agent_if_needed(
         self,
         agent_name: str,
diff --git a/src/processor/src/tests/unit/libs/agent_framework/test_groupchat_orchestrator_termination.py b/src/processor/src/tests/unit/libs/agent_framework/test_groupchat_orchestrator_termination.py
@@ -293,113 +293,3 @@ async def _run():
         )
 
     asyncio.run(_run())
-
-
-def test_participant_completion_streak_triggers_forced_termination():
-    """In agent-framework 1.3.0 the GroupChat orchestrator agent (Coordinator)
-    is invoked directly inside the framework's ``_invoke_agent_helper`` and
-    is NOT wrapped in an ``AgentExecutor``, so it never surfaces as a
-    workflow event. The Coordinator-JSON loop detector in
-    ``_complete_agent_response`` is therefore permanently dead in 1.3.0.
-
-    The only observable loop signal we have is consecutive
-    ``executor_completed`` events for the same participant. After
-    ``_participant_consecutive_loop_threshold`` (default 3) same-participant
-    completions, the orchestrator must force-terminate with ``hard_loop``
-    so the workflow halts cleanly instead of running until the framework's
-    own max_rounds ceiling (which at default 100 is ~17 min).
-    """
-
-    async def _run():
-        orch = _make_orchestrator()
-        # Register a participant so the tracker recognizes it.
-        orch.agents = {"Coordinator": object(), "Chief Architect": object()}
-
-        for _ in range(3):
-            orch._track_participant_completion("Chief Architect")
-
-        assert orch._forced_termination_requested is True, (
-            "Three consecutive completions of the same participant must "
-            "trigger the participant-streak loop breaker; otherwise the "
-            "Chief-Architect-only loop observed in production (with the "
-            "Coordinator invisible to our streaming loop in 1.3.0) can "
-            "never be detected and the workflow runs until the framework's "
-            "own max_rounds ceiling fires."
-        )
-        assert orch._forced_termination_type == "hard_loop"
-        assert "Chief Architect" in (orch._forced_termination_reason or "")
-        assert "3 consecutive" in (orch._forced_termination_reason or "")
-
-    asyncio.run(_run())
-
-
-def test_participant_completion_streak_resets_on_different_participant():
-    """If a different participant runs in between, the same-participant
-    streak counter resets. This prevents false-positive loop detection
-    when participants alternate normally.
-    """
-
-    async def _run():
-        orch = _make_orchestrator()
-        orch.agents = {
-            "Coordinator": object(),
-            "Chief Architect": object(),
-            "AKS Expert": object(),
-        }
-
-        orch._track_participant_completion("Chief Architect")
-        orch._track_participant_completion("Chief Architect")
-        # A different participant runs -> streak resets.
-        orch._track_participant_completion("AKS Expert")
-        orch._track_participant_completion("Chief Architect")
-        orch._track_participant_completion("Chief Architect")  # streak=2 only
-
-        assert orch._forced_termination_requested is False, (
-            "Alternating participants must not trigger the loop breaker; "
-            "the streak should reset whenever a different participant runs."
-        )
-        assert orch._participant_completion_streak == 2
-        assert orch._last_completed_participant == "Chief Architect"
-
-    asyncio.run(_run())
-
-
-def test_participant_completions_total_enforces_max_rounds_under_alternation():
-    """``max_rounds`` must be enforced from the per-participant total count
-    (which grows on EVERY completion) - not from ``len(agent_responses)``
-    (which only grows on agent switch in ``_start_agent_if_needed`` and
-    therefore can never reach ``max_rounds`` during a same-agent loop).
-
-    This test exercises the alternation case where the streak detector
-    never fires, ensuring the round-budget guard still halts the workflow.
-    """
-
-    async def _run():
-        orch = GroupChatOrchestrator(
-            name="t",
-            process_id="p1",
-            participants={
-                "Coordinator": object(),
-                "A": object(),
-                "B": object(),
-            },
-            memory_client=None,
-            coordinator_name="Coordinator",
-            max_rounds=4,
-            result_output_format=None,
-        )
-
-        # Alternate A and B to keep the streak below threshold.
-        orch._track_participant_completion("A")
-        orch._track_participant_completion("B")
-        orch._track_participant_completion("A")
-        # Streak detector hasn't fired yet (max streak = 1 because of perfect
-        # alternation). The 4th turn must trip the max_rounds budget.
-        assert orch._forced_termination_requested is False
-        orch._track_participant_completion("B")
-
-        assert orch._forced_termination_requested is True
-        assert orch._forced_termination_type == "hard_timeout"
-        assert "max_rounds=4" in (orch._forced_termination_reason or "")
-
-    asyncio.run(_run())