Skip to content

Commit f74629b

Browse files
fix(groupchat): resolve agent identity via author_name for af 1.3.0
Root cause of the "Runner did not converge after 100 iterations" production failure (and the Chief-Architect-only loop that preceded it): agent-framework 1.3.0 changed how AgentResponseUpdate is constructed. `map_chat_to_agent_update` (_types.py:2825-2837) now only sets `author_name` and leaves `agent_id` as None. Our orchestrator was reading `event.agent_id` exclusively, so every streaming update resolved to `agent_name=""`. That silently broke: * Loop detection (line 1080 `if agent_name == self.coordinator_name` never matched, so the streak counter never advanced and the 3x same-agent guard never fired). Production looped 100x on Chief Architect with zero detection. * Coordinator termination signal extraction (`finish=true`, `instruction=complete`, blocking instructions) - same gated block. * Manager-instruction parsing for the next participant. The [MEMORY] logs continued to show real agent names ("Chief Architect") because `SharedMemoryContextProvider` reads the name from the agent's own context, not from the workflow event - which is why the regression was invisible from logs alone. Fix: in `_handle_agent_update`, prefer `event.author_name` (which IS populated by 1.3.0's `map_chat_to_agent_update`) and fall back to `agent_id` only when author_name is missing, for backwards compat with older event shapes. Use `getattr` defensively so existing tests that construct SimpleNamespace events without author_name still work. Tests: * test_handle_agent_update_resolves_coordinator_via_author_name_when_agent_id_is_none - asserts the identity resolution itself * test_loop_detection_fires_on_3_consecutive_coordinator_selections_via_handle_agent_update - end-to-end through the production code path: 3 identical Coordinator selections via _handle_agent_update must trip _forced_termination * Both tests verified to FAIL without the fix (intentionally reverted to confirm) and PASS with the fix * Full suite: 831 passed (was 829, +2 regression tests) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 4de1e28 commit f74629b

2 files changed

Lines changed: 118 additions & 1 deletion

File tree

src/processor/src/libs/agent_framework/groupchat_orchestrator.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -727,7 +727,18 @@ async def _handle_agent_update(
727727
3. Trigger callback with complete response
728728
4. Handle tool calls separately from text streaming
729729
"""
730-
agent_name = self._normalize_executor_id(event.agent_id or "")
730+
# NOTE: In agent-framework 1.3.0, ``AgentResponseUpdate.agent_id`` is no
731+
# longer populated by ``map_chat_to_agent_update`` (only ``author_name``
732+
# is set, from the agent's name). Reading ``event.agent_id`` alone
733+
# silently yielded an empty string, which made every downstream identity
734+
# check (loop detection, coordinator termination signal extraction,
735+
# manager-instruction parsing) silently no-op. Prefer ``author_name``
736+
# and fall back to ``agent_id`` only for older shapes. Use ``getattr``
737+
# so older event types without ``author_name`` still work.
738+
author_name = getattr(event, "author_name", None)
739+
agent_name = author_name or self._normalize_executor_id(
740+
getattr(event, "agent_id", None) or ""
741+
)
731742
await self._start_agent_if_needed(agent_name, stream_callback, callback)
732743
self._append_text_chunk(event)
733744
await self._process_tool_calls(event, agent_name, stream_callback)

src/processor/src/tests/unit/libs/agent_framework/test_groupchat_orchestrator_termination.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,3 +127,109 @@ def _agent_reply(text: str = "ok"):
127127
assert orch._forced_termination_requested is False
128128

129129
asyncio.run(_run())
130+
131+
132+
@dataclass
133+
class _AgentResponseUpdateStub:
134+
"""Mimics the agent-framework 1.3.0 AgentResponseUpdate shape.
135+
136+
Only the fields actually read by ``_handle_agent_update`` /
137+
``_normalize_executor_id`` matter. In 1.3.0 ``agent_id`` is no longer
138+
populated by ``map_chat_to_agent_update`` - only ``author_name`` is set.
139+
This stub reproduces that shape.
140+
"""
141+
142+
author_name: str | None = None
143+
agent_id: str | None = None
144+
contents: list = None # type: ignore[assignment]
145+
146+
def __post_init__(self):
147+
if self.contents is None:
148+
self.contents = []
149+
150+
151+
def test_handle_agent_update_resolves_coordinator_via_author_name_when_agent_id_is_none():
152+
"""Regression guard for agent-framework 1.3.0.
153+
154+
In 1.3.0 ``AgentResponseUpdate.agent_id`` is ``None`` because
155+
``map_chat_to_agent_update`` only sets ``author_name``. Reading
156+
``event.agent_id`` alone silently produced an empty string, so
157+
``agent_name == self.coordinator_name`` never matched and loop
158+
detection / coordinator termination signal extraction silently
159+
no-opped. The orchestrator must treat ``author_name`` as the
160+
authoritative source.
161+
"""
162+
163+
async def _run():
164+
orch = _make_orchestrator()
165+
166+
event = _AgentResponseUpdateStub(
167+
author_name="Coordinator",
168+
agent_id=None,
169+
)
170+
171+
# No-op tool/text processing: we only care about agent identity.
172+
await orch._handle_agent_update(event, stream_callback=None, callback=None) # type: ignore[arg-type]
173+
174+
assert orch._last_executor_id == "Coordinator", (
175+
"author_name must be used to identify the agent; otherwise "
176+
"_last_executor_id stays empty and downstream coordinator "
177+
"checks silently fail."
178+
)
179+
180+
asyncio.run(_run())
181+
182+
183+
def test_loop_detection_fires_on_3_consecutive_coordinator_selections_via_handle_agent_update():
184+
"""End-to-end check: feeding 3 identical Coordinator selections through
185+
``_handle_agent_update`` (the path used in production) must trigger the
186+
loop-detection forced termination. This is the path that was silently
187+
broken in the 1.3.0 regression.
188+
"""
189+
190+
async def _run():
191+
orch = _make_orchestrator()
192+
orch._conversation = []
193+
194+
coordinator_json = json.dumps(
195+
{
196+
"selected_participant": "Chief Architect",
197+
"instruction": "re-list",
198+
"finish": False,
199+
"final_message": "",
200+
}
201+
)
202+
203+
# Simulate three consecutive Coordinator turns, each emitting the
204+
# same selection. Between each Coordinator turn we drive an update
205+
# from a non-Coordinator agent so the orchestrator's "agent switch"
206+
# logic completes the previous Coordinator response (which is what
207+
# actually runs loop-detection at line 1080).
208+
for _ in range(3):
209+
# Coordinator emits its selection as a streaming chunk.
210+
await orch._handle_agent_update(
211+
_AgentResponseUpdateStub(author_name="Coordinator"),
212+
stream_callback=None,
213+
callback=None,
214+
) # type: ignore[arg-type]
215+
orch._current_agent_response = [coordinator_json]
216+
217+
# Then Chief Architect emits a chunk: the agent switch closes
218+
# out the Coordinator response and runs loop detection.
219+
await orch._handle_agent_update(
220+
_AgentResponseUpdateStub(author_name="Chief Architect"),
221+
stream_callback=None,
222+
callback=None,
223+
) # type: ignore[arg-type]
224+
orch._current_agent_response = ["ack"]
225+
226+
# Closing the final Chief Architect response keeps state consistent.
227+
await orch._complete_agent_response("Chief Architect", callback=None)
228+
229+
assert orch._forced_termination_requested is True, (
230+
"Loop detection failed to fire after 3 identical Coordinator "
231+
"selections via _handle_agent_update; agent identity resolution "
232+
"is broken."
233+
)
234+
235+
asyncio.run(_run())

0 commit comments

Comments
 (0)