fix(groupchat): detect participant loops via executor_completed events

Prachig-Microsoft · Copilot · Prachig-Microsoft · commit 7a0212fdaa65 · 2026-06-13T15:36:22.000+05:30
In agent-framework 1.3.0 the GroupChat orchestrator agent (Coordinator)
is invoked directly inside the framework's internal _invoke_agent_helper
(agent_framework_orchestrations/_group_chat.py:484) rather than through
an AgentExecutor. The Coordinator therefore never surfaces as a workflow
event, which makes our existing Coordinator-JSON-based loop detector in
_complete_agent_response permanently dead in 1.3.0.

Symptom in production: workflow loops with the Coordinator latched onto
the same participant (e.g., Chief Architect repeatedly asked to produce
an Evidence Pack that never satisfies the next reviewer). The loop runs
until the framework's max_rounds ceiling fires (~17 min at default 100)
instead of being caught early.

Fix:
* Track participant turn completions from WorkflowEvent.executor_completed,
  the one observable signal that does NOT depend on Coordinator visibility
  (participants ARE wrapped in AgentExecutor and so do emit these events).
* Force-terminate (hard_loop) after 3 consecutive completions of the same
  participant.
* Force-terminate (hard_timeout) when total participant completions reach
  max_rounds; independent of len(agent_responses) which only grows on
  agent switch and so can never reach max_rounds during a same-participant
  loop.
* Flush per-participant streaming buffer on each executor_completed so
  back-to-back same-agent turns produce one AgentResponse per turn instead
  of accumulating across turns.
* Move forced-termination break check to top of the streaming loop so any
  branch (timeout, participant loop, Coordinator finish=true) takes effect
  on the very next event rather than waiting for the next output event.

Adds 3 regression tests covering the streak trigger, the alternation
reset, and the round-budget enforcement. 836 tests pass (833 -> 836).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
diff --git a/src/processor/src/libs/agent_framework/groupchat_orchestrator.py b/src/processor/src/libs/agent_framework/groupchat_orchestrator.py
@@ -313,6 +313,33 @@ def __init__(
         # Snapshot of progress_counter at the time we last saw _last_coordinator_selection.
         self._last_coordinator_selection_progress: int = 0
 
+        # Per-participant turn tracking driven by ``WorkflowEvent.executor_completed``.
+        #
+        # In agent-framework 1.3.0 the GroupChat orchestrator agent (the
+        # Coordinator) is invoked directly inside the framework's internal
+        # ``_invoke_agent_helper`` (see
+        # ``agent_framework_orchestrations/_group_chat.py:484``). It is NOT
+        # wrapped in an ``AgentExecutor`` and therefore never surfaces as a
+        # workflow event - which makes the Coordinator-JSON-based loop
+        # detection in ``_complete_agent_response`` permanently dead in 1.3.0.
+        #
+        # The only observable "the conversation is moving" pulse we have is
+        # ``executor_completed`` events for the *participants* (which DO go
+        # through ``AgentExecutor``). We track:
+        #   - the most recently completed participant,
+        #   - the streak of consecutive completions of that participant,
+        #   - the total number of participant turns,
+        # and use these for two safety nets in the streaming loop:
+        #   * 3+ consecutive same-participant turns => hard_loop termination
+        #   * total turns >= ``max_rounds`` => hard_timeout termination
+        # (independent of ``len(self.agent_responses)`` which only grows on
+        # agent switch and so cannot reach ``max_rounds`` during a same-
+        # participant loop).
+        self._participant_completions_total: int = 0
+        self._last_completed_participant: str | None = None
+        self._participant_completion_streak: int = 0
+        self._participant_consecutive_loop_threshold: int = 3
+
     def _request_forced_termination(
         self, *, reason: str, termination_type: str
     ) -> None:
@@ -543,6 +570,15 @@ async def run_stream(
                             termination_type="hard_timeout",
                         )
 
+                # Honor any pending termination request at the *top* of each
+                # iteration so that branches which set the flags (timeout,
+                # participant loop detection, Coordinator finish=true) take
+                # effect immediately on the next event - rather than being
+                # gated on the next ``output`` event arriving (which during a
+                # slow loop can be many seconds away).
+                if self._forced_termination_requested or self._termination_requested:
+                    break
+
                 # In agent-framework 1.3.0, ``workflow.run(stream=True)`` yields
                 # only ``WorkflowEvent`` instances; ``AgentResponseUpdate`` is
                 # wrapped inside ``WorkflowEvent.data`` for ``type=="output"``
@@ -552,7 +588,46 @@ async def run_stream(
                 # ``WorkflowEvent.type`` and inspect ``event.data`` /
                 # ``event.executor_id`` to route per-participant streaming
                 # chunks vs the orchestrator's final output.
-                if not isinstance(event, WorkflowEvent) or event.type != "output":
+                if not isinstance(event, WorkflowEvent):
+                    continue
+
+                # Participant turn completion. Used for loop / max_rounds
+                # safety nets that work even when the Coordinator is
+                # invisible to the streaming loop (which it is in 1.3.0 -
+                # the Coordinator runs inside the framework's internal
+                # ``_invoke_agent_helper`` and never surfaces as an executor
+                # event). See ``_track_participant_completion`` for details.
+                if event.type == "executor_completed":
+                    src_executor = self._normalize_executor_id(
+                        event.executor_id or ""
+                    )
+                    if (
+                        src_executor in self.agents
+                        and src_executor != self.coordinator_name
+                        and src_executor != self.get_result_generator_name()
+                    ):
+                        # Flush this participant's streaming buffer into a
+                        # discrete per-turn ``AgentResponse`` before we track
+                        # the completion. Without this, when the framework's
+                        # Coordinator picks the same participant back-to-back
+                        # (the loop pattern we're trying to detect),
+                        # ``_start_agent_if_needed`` sees no agent switch on
+                        # the NEXT turn's chunks and the buffer would grow
+                        # across turns - producing one merged response rather
+                        # than one response per turn.
+                        if (
+                            self._last_executor_id == src_executor
+                            and self._current_agent_response
+                        ):
+                            await self._complete_agent_response(
+                                src_executor, on_agent_response
+                            )
+                            self._current_agent_response = []
+                            self._last_executor_id = None
+                        self._track_participant_completion(src_executor)
+                    continue
+
+                if event.type != "output":
                     continue
 
                 data = event.data
@@ -573,7 +648,12 @@ async def run_stream(
                         callback=on_agent_response,
                     )
 
-                    # Enforce max rounds as a safety guard.
+                    # Secondary max_rounds safety net based on agent switches.
+                    # The primary check lives in ``_track_participant_completion``
+                    # (driven by ``executor_completed`` events) and works even
+                    # when the same agent runs back-to-back. This switch-based
+                    # check is kept as defense-in-depth for sessions with
+                    # normal alternation.
                     if self.max_rounds and len(self.agent_responses) >= self.max_rounds:
                         self._request_forced_termination(
                             reason=(
@@ -582,13 +662,9 @@ async def run_stream(
                             termination_type="hard_timeout",
                         )
 
-                    if self._forced_termination_requested:
-                        break
-
-                    # If the Coordinator requested finish=true, stop immediately.
-                    if self._termination_requested:
-                        break
-
+                    # Termination flags are honored at the top of the next
+                    # iteration so any branch can request termination
+                    # uniformly without duplicating break logic here.
                     continue
 
                 # Final orchestrator output: complete any buffered agent
@@ -777,6 +853,75 @@ def _normalize_executor_id(self, executor_id: str) -> str:
         """
         return executor_id.split(":")[-1]
 
+    def _track_participant_completion(self, src_executor: str) -> None:
+        """Track a participant turn completion for loop / max_rounds detection.
+
+        Called from the streaming loop on every ``WorkflowEvent.type ==
+        "executor_completed"`` event whose normalized ``executor_id`` is one of
+        our registered non-Coordinator, non-ResultGenerator participants. The
+        caller does that filtering; ``src_executor`` is not validated here.
+
+        Why this exists (agent-framework 1.3.0 design constraint):
+            The framework's ``GroupChatBuilder.orchestrator_agent`` (our
+            Coordinator) is invoked directly via ``self._agent.run(...)``
+            inside ``agent_framework_orchestrations/_group_chat.py:484``, not
+            through an ``AgentExecutor``, so it never surfaces as a workflow
+            event and the Coordinator-JSON-based loop detector in
+            ``_complete_agent_response`` is dead in 1.3.0.
+
+        Counters are updated first, then two safety nets are checked in order:
+
+        1. Same-participant streak (``_participant_consecutive_loop_threshold``,
+           default 3): if the Coordinator keeps selecting the same participant
+           (e.g., the Chief Architect latched on producing an Evidence Pack
+           that never satisfies the next reviewer), reaching the threshold
+           force-terminates with ``hard_loop`` and skips the budget check.
+
+        2. Total round budget: each participant turn counts as one round.
+           Once total completions reach a truthy ``self.max_rounds`` the
+           workflow force-terminates with ``hard_timeout``. This is independent
+           of ``len(self.agent_responses)``, which only grows on agent switch.
+
+        Args:
+            src_executor: Normalized name of the participant that completed.
+        """
+        if src_executor == self._last_completed_participant:
+            self._participant_completion_streak += 1
+        else:
+            self._last_completed_participant = src_executor
+            self._participant_completion_streak = 1
+        self._participant_completions_total += 1  # every turn, repeats included
+
+        if (
+            self._participant_completion_streak
+            >= self._participant_consecutive_loop_threshold
+        ):
+            self._request_forced_termination(
+                reason=(
+                    f"Loop detected: participant '{src_executor}' completed "
+                    f"{self._participant_completion_streak} consecutive turns "
+                    "with no other participant in between (Coordinator is "
+                    "stuck on the same selection; in agent-framework 1.3.0 "
+                    "the Coordinator runs inside the framework and is "
+                    "invisible to the streaming loop, so we infer this from "
+                    "executor_completed events)"
+                ),
+                termination_type="hard_loop",
+            )
+            return  # hard_loop wins; the round budget is not evaluated
+
+        if (
+            self.max_rounds
+            and self._participant_completions_total >= self.max_rounds
+        ):
+            self._request_forced_termination(
+                reason=(
+                    f"Workflow exceeded max_rounds={self.max_rounds} "
+                    "participant turns; terminating to avoid infinite loop"
+                ),
+                termination_type="hard_timeout",
+            )
+
     async def _start_agent_if_needed(
         self,
         agent_name: str,
diff --git a/src/processor/src/tests/unit/libs/agent_framework/test_groupchat_orchestrator_termination.py b/src/processor/src/tests/unit/libs/agent_framework/test_groupchat_orchestrator_termination.py
@@ -293,3 +293,113 @@ async def _run():
         )
 
     asyncio.run(_run())
+
+
+def test_participant_completion_streak_triggers_forced_termination():
+    """Three consecutive completions by one participant force ``hard_loop``.
+
+    In agent-framework 1.3.0 the GroupChat orchestrator agent (Coordinator)
+    is invoked directly inside the framework's ``_invoke_agent_helper``, not
+    through an ``AgentExecutor``, so it never surfaces as a workflow event
+    and the Coordinator-JSON loop detector in ``_complete_agent_response``
+    is permanently dead in 1.3.0.
+
+    The only observable loop signal is consecutive ``executor_completed``
+    events for one participant. At the default streak threshold of 3 the
+    orchestrator must force-terminate with ``hard_loop`` instead of running
+    until the framework's own max_rounds ceiling (~17 min at default 100).
+    """
+
+    async def _run():
+        orch = _make_orchestrator()
+        # Realistic registry; ``_track_participant_completion`` never reads it.
+        orch.agents = {"Coordinator": object(), "Chief Architect": object()}
+
+        for _ in range(3):
+            orch._track_participant_completion("Chief Architect")
+
+        assert orch._forced_termination_requested is True, (
+            "Three consecutive completions of the same participant must "
+            "trigger the participant-streak loop breaker; otherwise the "
+            "Chief-Architect-only loop observed in production (with the "
+            "Coordinator invisible to our streaming loop in 1.3.0) can "
+            "never be detected and the workflow runs until the framework's "
+            "own max_rounds ceiling fires."
+        )
+        assert orch._forced_termination_type == "hard_loop"
+        assert "Chief Architect" in (orch._forced_termination_reason or "")
+        assert "3 consecutive" in (orch._forced_termination_reason or "")
+
+    asyncio.run(_run())
+
+
+def test_participant_completion_streak_resets_on_different_participant():
+    """A different participant completing in between resets the streak.
+
+    Guards against false-positive loop detection under normal alternation.
+    """
+
+    async def _run():
+        orch = _make_orchestrator()
+        orch.agents = {
+            "Coordinator": object(),
+            "Chief Architect": object(),
+            "AKS Expert": object(),
+        }
+
+        orch._track_participant_completion("Chief Architect")
+        orch._track_participant_completion("Chief Architect")
+        # A different participant runs -> streak restarts at 1 for that one.
+        orch._track_participant_completion("AKS Expert")
+        orch._track_participant_completion("Chief Architect")
+        orch._track_participant_completion("Chief Architect")  # streak=2 only
+
+        assert orch._forced_termination_requested is False, (
+            "Alternating participants must not trigger the loop breaker; "
+            "the streak should reset whenever a different participant runs."
+        )
+        assert orch._participant_completion_streak == 2
+        assert orch._last_completed_participant == "Chief Architect"
+
+    asyncio.run(_run())
+
+
+def test_participant_completions_total_enforces_max_rounds_under_alternation():
+    """``max_rounds`` is enforced from the total participant completion count.
+
+    The total grows on EVERY completion, unlike ``len(agent_responses)``
+    which only grows on agent switch in ``_start_agent_if_needed``.
+
+    This test alternates A and B so the streak detector never fires and
+    checks that the round-budget guard alone requests ``hard_timeout``.
+    """
+
+    async def _run():
+        orch = GroupChatOrchestrator(
+            name="t",
+            process_id="p1",
+            participants={
+                "Coordinator": object(),
+                "A": object(),
+                "B": object(),
+            },
+            memory_client=None,
+            coordinator_name="Coordinator",
+            max_rounds=4,
+            result_output_format=None,
+        )
+
+        # Alternate A and B to keep the streak below threshold.
+        orch._track_participant_completion("A")
+        orch._track_participant_completion("B")
+        orch._track_participant_completion("A")
+        # Streak detector hasn't fired yet (max streak = 1 because of perfect
+        # alternation). The 4th turn must trip the max_rounds budget.
+        assert orch._forced_termination_requested is False
+        orch._track_participant_completion("B")
+
+        assert orch._forced_termination_requested is True
+        assert orch._forced_termination_type == "hard_timeout"
+        assert "max_rounds=4" in (orch._forced_termination_reason or "")
+
+    asyncio.run(_run())