sdk-python/src/strands/event_loop/event_loop.py at 1b7b6f244520cb593117d2a12e452c9ea07370ca · strands-agents/sdk-python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
"""This module implements the central event loop.

The event loop allows agents to:

1. Process conversation messages
2. Execute tools based on model requests
3. Handle errors and recovery strategies
4. Manage recursive execution cycles
"""

import logging
import uuid
from collections.abc import AsyncGenerator
from typing import TYPE_CHECKING, Any

from opentelemetry import trace as trace_api

from ..hooks import AfterModelCallEvent, BeforeModelCallEvent, MessageAddedEvent
from ..telemetry.metrics import Trace
from ..telemetry.tracer import Tracer, get_tracer
from ..tools._validator import validate_and_prepare_tools
from ..tools.structured_output._structured_output_context import StructuredOutputContext
from ..types._events import (
    EventLoopStopEvent,
    ForceStopEvent,
    ModelMessageEvent,
    ModelStopReason,
    StartEvent,
    StartEventLoopEvent,
    StructuredOutputEvent,
    ToolInterruptEvent,
    ToolResultMessageEvent,
    TypedEvent,
)
from ..types.content import Message, Messages
from ..types.exceptions import (
    ContextWindowOverflowException,
    EventLoopException,
    MaxTokensReachedException,
    StructuredOutputException,
)
from ..types.streaming import StopReason
from ..types.tools import ToolResult, ToolUse
from ._recover_message_on_max_tokens_reached import recover_message_on_max_tokens_reached
from ._retry import ModelRetryStrategy
from .streaming import stream_messages

if TYPE_CHECKING:
    from ..agent import Agent

# Module-level logger; used below for debug messages and for the "cycle failed" exception logs.
logger = logging.getLogger(__name__)

# NOTE(review): the three constants below are not referenced by any code visible in this module.
# They look like retry/backoff defaults (an attempt count plus delays in seconds, judging by the
# "4 minutes" note) that are kept for importers — confirm before changing or removing them.
MAX_ATTEMPTS = 6
INITIAL_DELAY = 4
MAX_DELAY = 240  # 4 minutes


def _has_tool_use_in_latest_message(messages: "Messages") -> bool:
    """Check if the latest message contains any ToolUse content blocks.

    Args:
        messages: List of messages in the conversation.

    Returns:
        True if the latest message contains at least one ToolUse content block, False otherwise.
    """
    if len(messages) > 0:
        latest_message = messages[-1]
        content_blocks = latest_message.get("content", [])

        for content_block in content_blocks:
            if "toolUse" in content_block:
                return True

    return False


async def event_loop_cycle(
    agent: "Agent",
    invocation_state: dict[str, Any],
    structured_output_context: StructuredOutputContext | None = None,
) -> AsyncGenerator[TypedEvent, None]:
    """Execute a single cycle of the event loop.

    This core function processes a single conversation turn, handling model inference, tool execution, and error
    recovery. It manages the entire lifecycle of a conversation turn, including:

    1. Initializing cycle state, metrics, and the tracing span
    2. Processing messages with the model (skipped when resuming from an interrupt or when the latest
       message already contains a tool use)
    3. Handling tool execution requests
    4. Managing recursive calls for multi-turn tool interactions
    5. Collecting and reporting metrics
    6. Error handling and span bookkeeping

    Args:
        agent: The agent for which the cycle is being executed.
        invocation_state: Additional arguments. This function writes the following keys:

            - request_state: State maintained across cycles (created as an empty dict if missing)
            - event_loop_cycle_id: Unique ID generated for this cycle
            - event_loop_cycle_trace: Metrics trace for this cycle
            - event_loop_cycle_span: Current tracing Span for this cycle
        structured_output_context: Optional context for structured output management. A default
            context is created when none is supplied.

    Yields:
        Model and tool stream events. When the cycle ends without tool use, the last event is an
        EventLoopStopEvent built from:

            - StopReason: Reason the model stopped generating (e.g., "end_turn")
            - Message: The generated message from the model
            - EventLoopMetrics: Updated metrics for the event loop
            - Any: Updated request state

        When the model requests tools, the remaining events come from the tool execution handler
        (which may recurse into another cycle).

    Raises:
        EventLoopException: If an unexpected error occurs after the model has responded
        ContextWindowOverflowException: If the input is too large for the model
        MaxTokensReachedException: If the model stopped because it hit its max_tokens limit
        StructuredOutputException: If the model does not call the structured output tool even when forced
        Exception: Errors raised while invoking the model propagate unchanged
    """
    structured_output_context = structured_output_context or StructuredOutputContext()

    # Initialize cycle state
    invocation_state["event_loop_cycle_id"] = uuid.uuid4()

    # Initialize state and get cycle trace
    if "request_state" not in invocation_state:
        invocation_state["request_state"] = {}
    attributes = {"event_loop_cycle_id": str(invocation_state.get("event_loop_cycle_id"))}
    cycle_start_time, cycle_trace = agent.event_loop_metrics.start_cycle(attributes=attributes)
    invocation_state["event_loop_cycle_trace"] = cycle_trace

    yield StartEvent()
    yield StartEventLoopEvent()

    # Create tracer span for this event loop cycle
    tracer = get_tracer()
    cycle_span = tracer.start_event_loop_cycle_span(
        invocation_state=invocation_state,
        messages=agent.messages,
        parent_span=agent.trace_span,
        custom_trace_attributes=agent.trace_attributes,
    )
    invocation_state["event_loop_cycle_span"] = cycle_span
    model_events: AsyncGenerator[TypedEvent, None] | None = None

    # end_on_exit=False: the span is ended explicitly on each exit path below (or by the tool handler).
    with trace_api.use_span(cycle_span, end_on_exit=False):
        try:
            # Skipping model invocation if in interrupt state as interrupts are currently only supported for tool calls.
            if agent._interrupt_state.activated:
                stop_reason: StopReason = "tool_use"
                message = agent._interrupt_state.context["tool_use_message"]
            # Skip model invocation if the latest message contains ToolUse
            elif _has_tool_use_in_latest_message(agent.messages):
                stop_reason = "tool_use"
                message = agent.messages[-1]
            else:
                model_events = _handle_model_execution(
                    agent, cycle_span, cycle_trace, invocation_state, tracer, structured_output_context
                )
                try:
                    async for model_event in model_events:
                        # The stop event is consumed here rather than forwarded to the caller.
                        if not isinstance(model_event, ModelStopReason):
                            yield model_event
                finally:
                    await model_events.aclose()

                # The last event produced by the model handler carries the final result under "stop".
                stop_reason, message, *_ = model_event["stop"]
                yield ModelMessageEvent(message=message)
        except BaseException as e:
            # One handler covers ordinary errors as well as cancellation / generator close: the span is
            # marked as failed and the original exception propagates unchanged (no wrapping here).
            tracer.end_span_with_error(cycle_span, str(e), e)
            raise

        try:
            if stop_reason == "max_tokens":
                # Handle max_tokens limit reached by the model.
                #
                # When the model reaches its maximum token limit, this represents a potentially unrecoverable
                # state where the model's response was truncated. By default, Strands fails hard with an
                # MaxTokensReachedException to maintain consistency with other failure types.
                raise MaxTokensReachedException(
                    message=(
                        "Agent has reached an unrecoverable state due to max_tokens limit. "
                        "For more information see: "
                        "https://strandsagents.com/latest/user-guide/concepts/agents/agent-loop/#maxtokensreachedexception"
                    )
                )

            if stop_reason == "tool_use":
                # Handle tool execution
                tool_events = _handle_tool_execution(
                    stop_reason,
                    message,
                    agent=agent,
                    cycle_trace=cycle_trace,
                    cycle_span=cycle_span,
                    cycle_start_time=cycle_start_time,
                    invocation_state=invocation_state,
                    tracer=tracer,
                    structured_output_context=structured_output_context,
                )
                async for tool_event in tool_events:
                    yield tool_event

                return

            # End the cycle and return results
            agent.event_loop_metrics.end_cycle(cycle_start_time, cycle_trace, attributes)

            # Force structured output tool call if LLM didn't use it automatically
            if structured_output_context.is_enabled and stop_reason == "end_turn":
                if structured_output_context.force_attempted:
                    raise StructuredOutputException(
                        "The model failed to invoke the structured output tool even after it was forced."
                    )
                structured_output_context.set_forced_mode()
                logger.debug("Forcing structured output tool")
                await agent._append_messages(
                    {"role": "user", "content": [{"text": structured_output_context.structured_output_prompt}]}
                )

                # Close this cycle's span before starting the next cycle.
                tracer.end_event_loop_cycle_span(cycle_span, message)
                events = recurse_event_loop(
                    agent=agent, invocation_state=invocation_state, structured_output_context=structured_output_context
                )
                async for typed_event in events:
                    yield typed_event
                return

            tracer.end_event_loop_cycle_span(cycle_span, message)
            yield EventLoopStopEvent(stop_reason, message, agent.event_loop_metrics, invocation_state["request_state"])
        except (
            StructuredOutputException,
            EventLoopException,
            ContextWindowOverflowException,
            MaxTokensReachedException,
        ) as e:
            # Special cased exceptions which we want to bubble up rather than get wrapped in an
            # EventLoopException. An EventLoopException is not yielded or logged again here - that was
            # already done where it was raised and we don't need the duplication.
            tracer.end_span_with_error(cycle_span, str(e), e)
            raise
        except Exception as e:
            tracer.end_span_with_error(cycle_span, str(e), e)
            # Handle any other exceptions
            yield ForceStopEvent(reason=e)
            logger.exception("cycle failed")
            raise EventLoopException(e, invocation_state["request_state"]) from e
        except BaseException as e:
            # Non-Exception signals (e.g. cancellation, generator close) are never wrapped.
            tracer.end_span_with_error(cycle_span, str(e), e)
            raise


async def recurse_event_loop(
    agent: "Agent",
    invocation_state: dict[str, Any],
    structured_output_context: StructuredOutputContext | None = None,
) -> AsyncGenerator[TypedEvent, None]:
    """Run another event_loop_cycle on the current state and relay its events.

    Used when the event loop has to keep going, e.g. after tool results were appended to the
    conversation. The nested cycle is recorded as a "Recursive call" child of the current cycle trace.

    Args:
        agent: Agent for which the recursive call is being made.
        invocation_state: Arguments to pass through event_loop_cycle. Must already contain
            "event_loop_cycle_trace" from the cycle that is recursing.
        structured_output_context: Optional context for structured output management.

    Yields:
        A StartEvent, followed by everything event_loop_cycle yields. The last result contains:

            - StopReason: Reason the model stopped generating
            - Message: The generated message from the model
            - EventLoopMetrics: Updated metrics for the event loop
            - Any: Updated request state
    """
    # Read the parent trace first: the nested cycle replaces this key with its own trace.
    parent_trace = invocation_state["event_loop_cycle_trace"]
    child_trace = Trace("Recursive call", parent_id=parent_trace.id)
    parent_trace.add_child(child_trace)

    yield StartEvent()

    async for typed_event in event_loop_cycle(
        agent=agent,
        invocation_state=invocation_state,
        structured_output_context=structured_output_context,
    ):
        yield typed_event

    # Only reached when the nested cycle ran to completion.
    child_trace.end()


async def _handle_model_execution(
    agent: "Agent",
    cycle_span: Any,
    cycle_trace: Trace,
    invocation_state: dict[str, Any],
    tracer: Tracer,
    structured_output_context: StructuredOutputContext,
) -> AsyncGenerator[TypedEvent, None]:
    """Invoke the model, retrying for as long as hooks request it, and record the response.

    Each attempt opens its own model-invoke span, fires BeforeModelCallEvent, streams the model
    response, and fires AfterModelCallEvent (with either the stop response or the exception).
    The attempt is repeated whenever a hook sets ``retry`` on the AfterModelCallEvent; this function
    itself has no attempt limit or delay. Once an attempt succeeds, the response message is appended
    to ``agent.messages``, MessageAddedEvent is fired, and usage/metrics are updated.

    Args:
        agent: The agent executing the model.
        cycle_span: Span object for tracing the cycle; used as parent of each model-invoke span.
        cycle_trace: Trace object for the current event loop cycle.
        invocation_state: State maintained across cycles.
        tracer: Tracer instance for span management.
        structured_output_context: Context for structured output management. In forced mode only
            the structured output tool spec (if any) is offered to the model.

    Yields:
        Model stream events from every attempt (including each attempt's final stop event), the
        retry strategy's backwards-compatible event when one is set, and a ForceStopEvent right
        before an error is raised.

    Raises:
        Exception: Any model execution error for which no hook requested a retry. NOTE(review):
            presumably this includes throttling errors once the retry strategy gives up — the
            strategy is defined outside this module, confirm there.
        EventLoopException: If recording the response (trace, message append, hooks, metrics) fails.
    """
    # Create a trace for the stream_messages call
    stream_trace = Trace("stream_messages", parent_id=cycle_trace.id)
    cycle_trace.add_child(stream_trace)

    # Retry loop - actual retry logic is handled by retry_strategy hook
    # Hooks control when to stop retrying via the event.retry flag
    while True:
        # Not every model object exposes `config`; fall back to no model id in that case.
        model_id = agent.model.config.get("model_id") if hasattr(agent.model, "config") else None
        # A fresh span per attempt, so each retry is traced separately.
        model_invoke_span = tracer.start_model_invoke_span(
            messages=agent.messages,
            parent_span=cycle_span,
            model_id=model_id,
            custom_trace_attributes=agent.trace_attributes,
        )
        streamed_events: AsyncGenerator[TypedEvent, None] | None = None
        # end_on_exit=False: the span is ended explicitly on every path below.
        with trace_api.use_span(model_invoke_span, end_on_exit=False):
            try:
                await agent.hooks.invoke_callbacks_async(
                    BeforeModelCallEvent(
                        agent=agent,
                        invocation_state=invocation_state,
                    )
                )

                # In forced mode the model is only offered the structured output tool.
                if structured_output_context.forced_mode:
                    tool_spec = structured_output_context.get_tool_spec()
                    tool_specs = [tool_spec] if tool_spec else []
                else:
                    tool_specs = agent.tool_registry.get_all_tool_specs()

                streamed_events = stream_messages(
                    agent.model,
                    agent.system_prompt,
                    agent.messages,
                    tool_specs,
                    system_prompt_content=agent._system_prompt_content,
                    tool_choice=structured_output_context.tool_choice,
                    invocation_state=invocation_state,
                    cancel_signal=agent._cancel_signal,
                )
                try:
                    async for event in streamed_events:
                        yield event
                finally:
                    # Always close the stream, including when iteration raises or this generator is closed early.
                    await streamed_events.aclose()

                # The last streamed event is expected to carry the final result under "stop".
                stop_reason, message, usage, metrics = event["stop"]
                invocation_state.setdefault("request_state", {})

                after_model_call_event = AfterModelCallEvent(
                    agent=agent,
                    invocation_state=invocation_state,
                    stop_response=AfterModelCallEvent.ModelStopResponse(
                        stop_reason=stop_reason,
                        message=message,
                    ),
                )

                await agent.hooks.invoke_callbacks_async(after_model_call_event)

                # Check if hooks want to retry the model call
                if after_model_call_event.retry:
                    logger.debug(
                        "stop_reason=<%s>, retry_requested=<True> | hook requested model retry",
                        stop_reason,
                    )
                    # Close this attempt's span normally before starting the next attempt.
                    tracer.end_model_invoke_span(model_invoke_span, message, usage, metrics, stop_reason)
                    continue  # Retry the model call

                # Hooks above saw the raw message; the recovered message is what gets traced and stored.
                if stop_reason == "max_tokens":
                    message = recover_message_on_max_tokens_reached(message)

                tracer.end_model_invoke_span(model_invoke_span, message, usage, metrics, stop_reason)
                break  # Success! Break out of retry loop

            except Exception as e:
                # Mark the attempt's span as failed first, then let hooks decide whether to retry.
                tracer.end_span_with_error(model_invoke_span, str(e), e)
                after_model_call_event = AfterModelCallEvent(
                    agent=agent,
                    invocation_state=invocation_state,
                    exception=e,
                )
                await agent.hooks.invoke_callbacks_async(after_model_call_event)

                # Emit backwards-compatible events if retry strategy supports it
                # (prior to making the retry strategy configurable, this is what we emitted)

                if (
                    isinstance(agent._retry_strategy, ModelRetryStrategy)
                    and agent._retry_strategy._backwards_compatible_event_to_yield
                ):
                    yield agent._retry_strategy._backwards_compatible_event_to_yield

                # Check if hooks want to retry the model call
                if after_model_call_event.retry:
                    logger.debug(
                        "exception=<%s>, retry_requested=<True> | hook requested model retry",
                        type(e).__name__,
                    )

                    continue  # Retry the model call

                # No retry requested, raise the exception
                yield ForceStopEvent(reason=e)
                raise e
            except BaseException as e:
                # Non-Exception signals (e.g. cancellation, generator close): end the span, skip hooks and retry.
                tracer.end_span_with_error(model_invoke_span, str(e), e)
                raise

    # Only reached after a successful attempt (the loop exits via `break`).
    try:
        # Add message in trace and mark the end of the stream messages trace
        stream_trace.add_message(message)
        stream_trace.end()

        # Add the response message to the conversation
        agent.messages.append(message)
        await agent.hooks.invoke_callbacks_async(MessageAddedEvent(agent=agent, message=message))

        # Update metrics
        agent.event_loop_metrics.update_usage(usage)
        agent.event_loop_metrics.update_metrics(metrics)

    except Exception as e:
        # Unlike model errors above, failures while recording the response are wrapped.
        yield ForceStopEvent(reason=e)
        logger.exception("cycle failed")
        raise EventLoopException(e, invocation_state["request_state"]) from e


async def _handle_tool_execution(
    stop_reason: StopReason,
    message: Message,
    agent: "Agent",
    cycle_trace: Trace,
    cycle_span: Any,
    cycle_start_time: float,
    invocation_state: dict[str, Any],
    tracer: Tracer,
    structured_output_context: StructuredOutputContext,
) -> AsyncGenerator[TypedEvent, None]:
    """Handles the execution of tools requested by the model during an event loop cycle.

    The function ends in one of four ways:

    1. Cancellation was signalled before execution: every pending tool use gets an error result,
       and the loop stops with reason "cancelled".
    2. A tool raised an interrupt: the tool use message and the results gathered so far are saved
       on the agent's interrupt state, and the loop stops with reason "interrupt".
    3. The request state asks to stop, or a structured output result was extracted: the loop stops
       with the incoming ``stop_reason``.
    4. Otherwise the tool results are appended to the conversation and the event loop recurses.

    Args:
        stop_reason: The reason the model stopped generating.
        message: The message from the model that may contain tool use requests.
        agent: Agent for which tools are being executed.
        cycle_trace: Trace object for the current event loop cycle.
        cycle_span: Span object for tracing the cycle (type may vary). Ended here when truthy.
        cycle_start_time: Start time of the current cycle.
        invocation_state: Additional keyword arguments, including request state.
        tracer: Tracer instance for span management.
        structured_output_context: Context for structured output management.

    Yields:
        Tool stream events along with events yielded from a recursive call to the event loop. When this
        function stops the loop itself, the last event is an EventLoopStopEvent built from:
            - The stop reason,
            - The updated message,
            - The updated event loop metrics,
            - The updated request state.
    """
    tool_uses: list[ToolUse] = []
    tool_results: list[ToolResult] = []
    invalid_tool_use_ids: list[str] = []

    # The three lists are passed in empty and read afterwards, so the validator is expected to fill them in place.
    validate_and_prepare_tools(message, tool_uses, tool_results, invalid_tool_use_ids)
    # Drop tool uses that were flagged as invalid; they are not executed.
    tool_uses = [tool_use for tool_use in tool_uses if tool_use.get("toolUseId") not in invalid_tool_use_ids]

    if agent._interrupt_state.activated:
        # Resuming: start from the results that were saved when the interrupt was raised.
        tool_results.extend(agent._interrupt_state.context["tool_results"])

        # Filter to only the interrupted tools when resuming from interrupt (tool uses without results)
        tool_use_ids = {tool_result["toolUseId"] for tool_result in tool_results}
        tool_uses = [tool_use for tool_use in tool_uses if tool_use["toolUseId"] not in tool_use_ids]

    interrupts = []

    # Check for cancellation before tool execution
    # Add tool_result for each tool_use to maintain valid conversation state
    if agent._cancel_signal.is_set():
        logger.debug("tool_count=<%d> | cancellation detected before tool execution", len(tool_uses))

        # Create cancellation tool_result for each tool_use to avoid invalid message state
        # (tool_use without tool_result would be rejected on next invocation)
        for tool_use in tool_uses:
            cancel_result: ToolResult = {
                "toolUseId": str(tool_use.get("toolUseId")),
                "status": "error",
                "content": [{"text": "Tool execution cancelled"}],
            }
            tool_results.append(cancel_result)

        # Add tool results message to conversation if there are any results
        # (cancelled tools plus any results already collected above)
        cancelled_tool_result_message: Message | None = None
        if tool_results:
            _cancelled_msg: Message = {
                "role": "user",
                "content": [{"toolResult": result} for result in tool_results],
            }
            cancelled_tool_result_message = _cancelled_msg
            agent.messages.append(_cancelled_msg)
            await agent.hooks.invoke_callbacks_async(MessageAddedEvent(agent=agent, message=_cancelled_msg))
            yield ToolResultMessageEvent(message=_cancelled_msg)

        agent.event_loop_metrics.end_cycle(cycle_start_time, cycle_trace)
        yield EventLoopStopEvent(
            "cancelled",
            message,
            agent.event_loop_metrics,
            invocation_state["request_state"],
        )
        # The cycle span is closed after the stop event has been emitted; no recursion follows.
        if cycle_span:
            tracer.end_event_loop_cycle_span(
                span=cycle_span, message=message, tool_result_message=cancelled_tool_result_message
            )
        return

    tool_events = agent.tool_executor._execute(
        agent, tool_uses, tool_results, cycle_trace, cycle_span, invocation_state, structured_output_context
    )
    async for tool_event in tool_events:
        # Collect interrupts while still forwarding every event to the caller.
        if isinstance(tool_event, ToolInterruptEvent):
            interrupts.extend(tool_event["tool_interrupt_event"]["interrupts"])

        yield tool_event

    structured_output_result = None
    if structured_output_context.is_enabled:
        # A truthy extracted result ends the loop after this cycle.
        if structured_output_result := structured_output_context.extract_result(tool_uses):
            yield StructuredOutputEvent(structured_output=structured_output_result)
            structured_output_context.stop_loop = True

    # Record this cycle as the parent of whatever cycle runs next.
    invocation_state["event_loop_parent_cycle_id"] = invocation_state["event_loop_cycle_id"]

    if interrupts:
        # Session state stored on AfterInvocationEvent.
        agent._interrupt_state.context = {"tool_use_message": message, "tool_results": tool_results}
        agent._interrupt_state.activate()

        agent.event_loop_metrics.end_cycle(cycle_start_time, cycle_trace)
        yield EventLoopStopEvent(
            "interrupt",
            message,
            agent.event_loop_metrics,
            invocation_state["request_state"],
            interrupts,
            structured_output=structured_output_result,
        )
        # No recursion on interrupt: close the cycle span now that the stop event has been emitted.
        # Note that no tool result message is appended to the conversation on this path.
        if cycle_span:
            tracer.end_event_loop_cycle_span(span=cycle_span, message=message)

        return

    # All tools completed without interrupting, so any previous interrupt state is cleared.
    agent._interrupt_state.deactivate()

    tool_result_message: Message = {
        "role": "user",
        "content": [{"toolResult": result} for result in tool_results],
    }

    agent.messages.append(tool_result_message)
    await agent.hooks.invoke_callbacks_async(MessageAddedEvent(agent=agent, message=tool_result_message))

    yield ToolResultMessageEvent(message=tool_result_message)

    # End the cycle span before either stopping the loop or yielding the recursive cycle.
    if cycle_span:
        tracer.end_event_loop_cycle_span(span=cycle_span, message=message, tool_result_message=tool_result_message)

    # Stop instead of recursing when a tool/hook requested it or structured output was captured.
    if invocation_state["request_state"].get("stop_event_loop", False) or structured_output_context.stop_loop:
        agent.event_loop_metrics.end_cycle(cycle_start_time, cycle_trace)
        yield EventLoopStopEvent(
            stop_reason,
            message,
            agent.event_loop_metrics,
            invocation_state["request_state"],
            structured_output=structured_output_result,
        )
        return

    # Feed the tool results back to the model in a new cycle.
    events = recurse_event_loop(
        agent=agent, invocation_state=invocation_state, structured_output_context=structured_output_context
    )
    async for event in events:
        yield event