Skip to content

Commit 19ab09c

Browse files
feat: add new speaker handoff pattern that is more scalable and generalizable
1 parent dfc9068 commit 19ab09c

11 files changed

Lines changed: 355 additions & 358 deletions

File tree

src/askui/agent_base.py

Lines changed: 7 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -23,12 +23,10 @@
2323
LocateSettings,
2424
)
2525
from askui.models.shared.tools import Tool, ToolCollection
26-
from askui.prompts.act_prompts import create_default_prompt
27-
from askui.prompts.caching import CACHE_USE_PROMPT
26+
from askui.prompts.act_prompts import CACHE_USE_PROMPT, create_default_prompt
2827
from askui.tools.agent_os import AgentOs
2928
from askui.tools.android.agent_os import AndroidAgentOs
3029
from askui.tools.caching_tools import (
31-
ExecuteCachedTrajectory,
3230
InspectCacheMetadata,
3331
RetrieveCachedTestExecutions,
3432
VerifyCacheExecution,
@@ -231,7 +229,7 @@ def act(
231229

232230
_caching_settings: CachingSettings = caching_settings or self.caching_settings
233231

234-
tools, cached_execution_tool, cache_manager = self._patch_act_with_cache(
232+
tools, cache_manager = self._patch_act_with_cache(
235233
_caching_settings, _act_settings, tools, goal_str
236234
)
237235
_tools = self._build_tools(tools)
@@ -264,11 +262,7 @@ def _patch_act_with_cache(
264262
settings: ActSettings,
265263
tools: list[Tool] | ToolCollection | None,
266264
goal: str,
267-
) -> tuple[
268-
list[Tool] | ToolCollection,
269-
ExecuteCachedTrajectory | None,
270-
CacheManager | None,
271-
]:
265+
) -> tuple[list[Tool] | ToolCollection, CacheManager | None]:
272266
"""Patch act settings and tools with caching functionality.
273267
274268
Args:
@@ -278,10 +272,9 @@ def _patch_act_with_cache(
278272
goal: The goal string for cache recording
279273
280274
Returns:
281-
A tuple of (modified_tools, cached_execution_tool, cache_manager)
275+
A tuple of (modified_tools, cache_manager)
282276
"""
283277
caching_tools: list[Tool] = []
284-
cached_execution_tool: ExecuteCachedTrajectory | None = None
285278
cache_manager: CacheManager | None = None
286279

287280
# Setup execute mode: add caching tools and modify system prompt
@@ -290,12 +283,11 @@ def _patch_act_with_cache(
290283
cache_executor = CacheExecutor(caching_settings.execution_settings)
291284
self._conversation.speakers.add_speaker(cache_executor)
292285

293-
# Add caching tools
294-
cached_execution_tool = ExecuteCachedTrajectory()
286+
# Add caching tools (switch_speaker tool is added automatically
287+
# by Conversation._setup_speaker_handoff)
295288
caching_tools.extend(
296289
[
297290
RetrieveCachedTestExecutions(caching_settings.cache_dir),
298-
cached_execution_tool,
299291
VerifyCacheExecution(),
300292
InspectCacheMetadata(),
301293
]
@@ -328,7 +320,7 @@ def _patch_act_with_cache(
328320
vlm_provider=self._vlm_provider,
329321
)
330322

331-
return tools, cached_execution_tool, cache_manager
323+
return tools, cache_manager
332324

333325
@overload
334326
def get(

src/askui/models/shared/conversation.py

Lines changed: 68 additions & 76 deletions
Original file line number | Diff line number | Diff line change
@@ -10,8 +10,6 @@
1010
from askui.model_providers.vlm_provider import VlmProvider
1111
from askui.models.shared.agent_message_param import (
1212
MessageParam,
13-
ToolResultBlockParam,
14-
ToolUseBlockParam,
1513
UsageParam,
1614
)
1715
from askui.models.shared.settings import ActSettings
@@ -23,6 +21,7 @@
2321
)
2422
from askui.reporting import NULL_REPORTER, Reporter
2523
from askui.speaker.speaker import SpeakerResult, Speakers
24+
from askui.tools.switch_speaker_tool import SwitchSpeakerTool
2625

2726
if TYPE_CHECKING:
2827
from askui.models.shared.conversation_callback import ConversationCallback
@@ -103,9 +102,6 @@ def __init__(
103102
self._reporters: list[Reporter] = []
104103
self._step_index: int = 0
105104

106-
# Cache execution context (for communication between tools and CacheExecutor)
107-
self.cache_execution_context: dict[str, Any] = {}
108-
109105
# Track if cache execution was used (to prevent recording during playback)
110106
self._executed_from_cache: bool = False
111107

@@ -162,7 +158,6 @@ def _setup_control_loop(
162158
) -> None:
163159
# Reset state
164160
self.accumulated_usage = UsageParam()
165-
self.cache_execution_context = {}
166161
self._executed_from_cache = False
167162
self.speakers.reset_state()
168163

@@ -171,6 +166,9 @@ def _setup_control_loop(
171166
self.tools = tools or ToolCollection()
172167
self._reporters = reporters or []
173168

169+
# Auto-populate speaker descriptions and switch_speaker tool
170+
self._setup_speaker_handoff()
171+
174172
# Initialize truncation strategy
175173
self._truncation_strategy = (
176174
self._truncation_strategy_factory.create_truncation_strategy(
@@ -199,6 +197,51 @@ def _conclude_control_loop(self) -> None:
199197
# Report final usage
200198
self._reporter.add_usage_summary(self.accumulated_usage.model_dump())
201199

200+
def _setup_speaker_handoff(self) -> None:
201+
"""Set up speaker handoff infrastructure.
202+
203+
If there are speakers with descriptions (handoff targets), this method:
204+
1. Appends an ``<AVAILABLE_SPEAKERS>`` section to ``system_capabilities``
205+
2. Adds a ``SwitchSpeakerTool`` to the tool collection
206+
"""
207+
speaker_descriptions = self._build_speaker_descriptions()
208+
if not speaker_descriptions:
209+
return
210+
211+
# Append speaker descriptions to system_capabilities
212+
if self.settings.messages.system is not None:
213+
has_capabilities = self.settings.messages.system.system_capabilities
214+
separator = "\n\n" if has_capabilities else ""
215+
self.settings.messages.system.system_capabilities += (
216+
f"{separator}<AVAILABLE_SPEAKERS>\n"
217+
"The following specialized speakers are available in this "
218+
"conversation. Use the switch_speaker tool to hand off to "
219+
"them when appropriate.\n\n"
220+
f"{speaker_descriptions}\n"
221+
"</AVAILABLE_SPEAKERS>"
222+
)
223+
224+
# Create switch_speaker tool with valid speaker names
225+
handoff_speakers = [
226+
speaker.get_name() for speaker in self.speakers if speaker.get_description()
227+
]
228+
switch_tool = SwitchSpeakerTool(speaker_names=handoff_speakers)
229+
self.tools.append_tool(switch_tool)
230+
231+
def _build_speaker_descriptions(self) -> str:
232+
"""Build formatted speaker descriptions for the system prompt.
233+
234+
Returns:
235+
Formatted string with speaker names and descriptions,
236+
or empty string if no speakers have descriptions.
237+
"""
238+
descriptions: list[str] = []
239+
for speaker in self.speakers:
240+
description = speaker.get_description()
241+
if description:
242+
descriptions.append(f"### {speaker.get_name()}\n{description}")
243+
return "\n\n".join(descriptions)
244+
202245
@tracer.start_as_current_span("step")
203246
def _execute_step(self) -> bool:
204247
"""Execute one step of the conversation loop with speakers.
@@ -238,14 +281,13 @@ def _execute_step(self) -> bool:
238281
tool_result_message = self._execute_tools_if_present(last_message)
239282
if tool_result_message:
240283
self._add_message(tool_result_message)
241-
242-
# Handle side effects of tool execution (e.g., speaker switches)
243-
self._handle_tool_results(last_message, tool_result_message)
244-
245284
continue_loop = True # we always continue after a tool was called
246285

247286
# 4. Check if conversation should continue and switch speaker if necessary
248-
continue_loop = continue_loop or self._handle_result_status(result)
287+
# Note: _handle_result_status must always be called (not short-circuited)
288+
# because it has side effects (e.g., triggering speaker switches).
289+
status_continue = self._handle_result_status(result)
290+
continue_loop = continue_loop or status_continue
249291

250292
# 5. Collect Statistics
251293
if result.usage:
@@ -295,67 +337,6 @@ def _execute_tools_if_present(self, message: MessageParam) -> MessageParam | Non
295337
# Return tool results as a user message
296338
return MessageParam(content=tool_results, role="user")
297339

298-
@tracer.start_as_current_span("handle_tool_result")
299-
def _handle_tool_results(
300-
self,
301-
assistant_message: MessageParam,
302-
tool_result_message: MessageParam,
303-
) -> None:
304-
"""Handle side effects of tool execution.
305-
306-
Extracts tool use blocks and tool results from messages, then checks
307-
if specific tools require speaker switches or other actions.
308-
309-
Currently handles:
310-
- ExecuteCachedTrajectory: Switches to CacheExecutor if successful
311-
312-
Args:
313-
assistant_message: The assistant message containing tool use blocks
314-
tool_result_message: The user message containing tool results
315-
"""
316-
# Extract tool use blocks from assistant message
317-
if isinstance(assistant_message.content, str):
318-
return
319-
320-
tool_use_blocks: list[ToolUseBlockParam] = [
321-
block for block in assistant_message.content if block.type == "tool_use"
322-
]
323-
324-
if isinstance(tool_result_message.content, str):
325-
return
326-
327-
tool_results: list[ToolResultBlockParam] = tool_result_message.content # type: ignore[assignment]
328-
329-
# Handle side effects for each tool
330-
for tool_use_block, tool_result in zip(
331-
tool_use_blocks, tool_results, strict=False
332-
):
333-
# Check if ExecuteCachedTrajectory was called successfully
334-
if (
335-
tool_use_block.name.startswith("execute_cached_executions_tool")
336-
and not tool_result.is_error
337-
):
338-
# Extract parameters from tool call (input is dict at runtime)
339-
trajectory_file: str = tool_use_block.input["trajectory_file"] # type: ignore[index]
340-
start_from_step_index: int = tool_use_block.input.get( # type: ignore[attr-defined]
341-
"start_from_step_index", 0
342-
)
343-
parameter_values: dict[str, str] = tool_use_block.input.get( # type: ignore[attr-defined]
344-
"parameter_values", {}
345-
)
346-
347-
# Prepare cache execution context for CacheExecutor
348-
# CacheExecutor will validate and load the cache file
349-
self.cache_execution_context = {
350-
"trajectory_file": trajectory_file,
351-
"start_from_step_index": start_from_step_index,
352-
"parameter_values": parameter_values,
353-
"toolbox": self.tools,
354-
"reporter": self._reporter,
355-
}
356-
self._executed_from_cache = True
357-
self.switch_speaker("CacheExecutor")
358-
359340
def _add_message(self, message: MessageParam) -> None:
360341
"""Add message to conversation history.
361342
@@ -392,17 +373,26 @@ def _handle_result_status(self, result: SpeakerResult) -> bool:
392373
return False
393374
if result.status == "switch_speaker":
394375
if result.next_speaker:
395-
self.switch_speaker(result.next_speaker)
376+
self.switch_speaker(
377+
result.next_speaker,
378+
speaker_context=result.speaker_context,
379+
)
396380
return True
397381
# status == "continue"
398382
return True
399383

400384
@tracer.start_as_current_span("switch_speaker")
401-
def switch_speaker(self, speaker_name: str) -> None:
402-
"""Switch to a different speaker.
385+
def switch_speaker(
386+
self,
387+
speaker_name: str,
388+
speaker_context: dict[str, Any] | None = None,
389+
) -> None:
390+
"""Switch to a different speaker, optionally passing activation context.
403391
404392
Args:
405-
speaker_name: Name of the speaker to switch to
393+
speaker_name: Name of the speaker to switch to.
394+
speaker_context: Optional activation context to pass to the
395+
target speaker via ``on_activate()``.
406396
"""
407397
old_speaker = self.current_speaker
408398
self.current_speaker = self.speakers[speaker_name]
@@ -411,6 +401,8 @@ def switch_speaker(self, speaker_name: str) -> None:
411401
old_speaker.get_name(),
412402
self.current_speaker.get_name(),
413403
)
404+
if speaker_context is not None:
405+
self.current_speaker.on_activate(speaker_context)
414406

415407
def get_messages(self) -> list[MessageParam]:
416408
"""Get current message history from truncation strategy.

src/askui/prompts/act_prompts.py

Lines changed: 15 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -388,27 +388,29 @@
388388
"CRITICAL: Before taking ANY action, you MUST first call the"
389389
" retrieve_available_trajectories_tool to check for cached trajectories. If the"
390390
" name of an available cached trajectory matches the one specified by the user,"
391-
" you MUST execute it using the execute_cached_executions_tool before calling any"
392-
" other tools!\n"
391+
" you MUST switch to the CacheExecutor speaker using the switch_speaker tool"
392+
" before calling any other tools!\n"
393393
"\n"
394394
"WORKFLOW:\n"
395395
"1. ALWAYS start by calling retrieve_available_trajectories_tool\n"
396-
"2. If a matching cached trajectory exists, execute it immediately using"
397-
" the execute_cached_executions_tool"
396+
"2. If a matching cached trajectory exists, switch to CacheExecutor using"
397+
" the switch_speaker tool with speaker_context containing the trajectory details\n"
398398
"3. Only proceed with manual execution if no matching trajectory is available\n"
399399
"\n"
400400
"EXECUTING TRAJECTORIES:\n"
401-
"- Use execute_cached_executions_tool to run cached trajectories\n"
401+
"- Use switch_speaker(speaker_name='CacheExecutor', speaker_context={"
402+
"'trajectory_file': '<path>', 'parameter_values': {...}}) to start execution\n"
402403
"- Trajectories contain complete sequences of mouse movements, clicks, and typing"
403404
" from successful executions\n"
404405
"- You'll see all screenshots and results in message history\n"
405406
"- Verify results after execution completes\n"
406407
"\n"
407408
"DYNAMIC PARAMETERS:\n"
408409
"- Trajectories may require parameters like {{current_date}} or {{user_name}}\n"
409-
"- Provide values via parameter_values as a dictionary\n"
410-
"- Example: execute_cached_executions_tool(trajectory_file='test.json',"
411-
" parameter_values={'current_date': '2025-12-11'})\n"
410+
"- Provide values via parameter_values in the speaker_context\n"
411+
"- Example: switch_speaker(speaker_name='CacheExecutor', speaker_context={"
412+
"'trajectory_file': 'test.json', 'parameter_values': {"
413+
"'current_date': '2025-12-11'}})\n"
412414
"- Missing required parameters will cause execution failure with an error message."
413415
" In that case try again with providing the correct parameters\n"
414416
"\n"
@@ -417,13 +419,13 @@
417419
"- Trajectory pauses at non-cacheable steps, returning NEEDS_AGENT status with"
418420
" current step index\n"
419421
"- Execute the non-cacheable step manually\n"
420-
"- Resume using execute_cached_executions_tool with start_from_step_index"
421-
" parameter\n"
422+
"- Resume by switching to CacheExecutor again with start_from_step_index"
423+
" in the speaker_context\n"
422424
"\n"
423425
"CONTINUING TRAJECTORIES:\n"
424-
"- Resume after non-cacheable steps:"
425-
" execute_cached_executions_tool(trajectory_file='test.json',"
426-
" start_from_step_index=5, parameter_values={...})\n"
426+
"- Resume after non-cacheable steps: switch_speaker(speaker_name='CacheExecutor',"
427+
" speaker_context={'trajectory_file': 'test.json',"
428+
" 'start_from_step_index': 5, 'parameter_values': {...}})\n"
427429
"\n"
428430
"FAILURE HANDLING:\n"
429431
"- On failure, you'll see the error and failed step index\n"

src/askui/prompts/caching.py

Lines changed: 0 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -34,30 +34,3 @@
3434
}
3535
3636
If no parameters are found, return an empty parameters array."""
37-
38-
39-
CACHE_USE_PROMPT = (
40-
"<TRAJECTORY USE>\n"
41-
" You can use precomputed trajectories to make the execution of the "
42-
"task more robust and faster!\n"
43-
" To do so, first use the RetrieveCachedTestExecutions tool to check "
44-
"which trajectories are available for you.\n"
45-
" The details what each trajectory that is available for you does are "
46-
"at the end of this prompt.\n"
47-
" A trajectory contains all necessary mouse movements, clicks, and "
48-
"typing actions from a previously successful execution.\n"
49-
" If there is a trajectory available for a step you need to take, "
50-
"always use it!\n"
51-
" You can execute a trajectory with the ExecuteCachedExecution tool.\n"
52-
" After a trajectory was executed, make sure to verify the results! "
53-
"While it works most of the time, occasionally, the execution can be "
54-
"(partly) incorrect. So make sure to verify if everything is filled out "
55-
"as expected, and make corrections where necessary!\n"
56-
" </TRAJECTORY USE>\n"
57-
" <TRAJECTORY DETAILS>\n"
58-
" There are several trajectories available to you.\n"
59-
" Their filename is a unique testID.\n"
60-
" If executed using the ExecuteCachedExecution tool, a trajectory will "
61-
"automatically execute all necessary steps for the test with that id.\n"
62-
" </TRAJECTORY DETAILS>\n"
63-
)

0 commit comments

Comments
 (0)