Add StructuredOutputRetryLimitMiddleware and default retry limit

mateusz834 · mateusz834 · commit 2eeaeef4abe3 · 2026-04-30T10:48:46.000+02:00
diff --git a/splunklib/ai/base_agent.py b/splunklib/ai/base_agent.py
@@ -23,9 +23,11 @@
 from splunklib.ai.conversation_store import ConversationStore
 from splunklib.ai.hooks import (
     DEFAULT_STEP_LIMIT,
+    DEFAULT_STRUCTURED_OUTPUT_RETRY_LIMIT,
     DEFAULT_TIMEOUT_SECONDS,
     DEFAULT_TOKEN_LIMIT,
     StepLimitMiddleware,
+    StructuredOutputRetryLimitMiddleware,
     TimeoutLimitMiddleware,
     TokenLimitMiddleware,
 )
@@ -87,8 +89,24 @@ def __init__(
             TimeoutLimitMiddleware(DEFAULT_TIMEOUT_SECONDS),
         ]
         # Append predefined middlewares by default if not provided already.
-        default_middleware = [m for m in predefined if type(m) not in user_middleware_types]
-        self._middleware = (*user_middleware, *default_middleware)
+        default_middleware = [
+            m for m in predefined if type(m) not in user_middleware_types
+        ]
+
+        predefined_before: list[AgentMiddleware] = [
+            StructuredOutputRetryLimitMiddleware(DEFAULT_STRUCTURED_OUTPUT_RETRY_LIMIT),
+        ]
+
+        default_before_middleware = [
+            m for m in predefined_before if type(m) not in user_middleware_types
+        ]
+
+        self._middleware = (
+            *default_before_middleware,
+            *user_middleware,
+            *default_middleware,
+        )
+
         self._trace_id = secrets.token_hex(16)  # 32 Hex characters
         self._conversation_store = conversation_store
         self._thread_id = thread_id
diff --git a/splunklib/ai/hooks.py b/splunklib/ai/hooks.py
@@ -12,10 +12,18 @@
     ModelRequest,
     ModelResponse,
 )
+from splunklib.ai.structured_output import StructuredOutputGenerationException
 
 DEFAULT_TIMEOUT_SECONDS: float = 600.0
 DEFAULT_STEP_LIMIT: int = 100
 DEFAULT_TOKEN_LIMIT: int = 200_000
+DEFAULT_STRUCTURED_OUTPUT_RETRY_LIMIT: int = 3
+
+
+# TODO: should we include the messages in the exception? We have them
+# in AgentState.
+
+# TODO: what if we pass an AIMessage with tool calls to invoke?
 
 
 class AgentStopException(Exception):
@@ -25,6 +33,7 @@ class AgentStopException(Exception):
 class TokenLimitExceededException(AgentStopException):
     """Raised by `Agent.invoke`, when token limit exceeds"""
 
+    # TODO: should be an int
     def __init__(self, token_limit: float) -> None:
         super().__init__(f"Token limit of {token_limit} exceeded.")
 
@@ -43,6 +52,13 @@ def __init__(self, timeout_seconds: float) -> None:
         super().__init__(f"Timed out after {timeout_seconds} seconds.")
 
 
+class StructuredOutputRetryLimitExceededException(AgentStopException):
+    """Raised by `Agent.invoke` when the structured output retry limit is exceeded."""
+
+    def __init__(self, retry_count: int) -> None:  # value is reported as the limit
+        super().__init__(f"Structured output retry limit of {retry_count} exceeded")
+
+
 def before_model(
     func: Callable[[ModelRequest], None | Awaitable[None]],
 ) -> AgentMiddleware:
@@ -125,6 +141,10 @@ async def agent_middleware(
     return _Middleware()
 
 
+# TODO: we should have a token budget limit.
+
+
+# TODO: actually we could call this context window limit, right?
 class TokenLimitMiddleware(AgentMiddleware):
     """Stops agent execution when the token count of messages passed to the model exceeds the given limit."""
 
@@ -187,6 +207,7 @@ async def agent_middleware(
     ) -> AgentResponse[Any | None]:
         # WARN: this might not work with agents handling
         # different threads at the same time.
+        # TODO: now we have thread_id, thus we can solve this.
         self._deadline = monotonic() + self._seconds
         return await handler(request)
 
@@ -199,3 +220,44 @@ async def model_middleware(
         if self._deadline is not None and monotonic() >= self._deadline:
             raise TimeoutExceededException(timeout_seconds=self._seconds)
         return await handler(request)
+
+
+class StructuredOutputRetryLimitMiddleware(AgentMiddleware):
+    """Stops the agent when structured output generation is retried more than
+    `limit` times within one agent loop invocation (0 allows no retries)."""
+
+    _limit: int
+    _retries_per_thread_id: dict[str, int]
+
+    def __init__(self, limit: int) -> None:
+        self._limit = limit
+        self._retries_per_thread_id = {}
+
+    @override
+    async def agent_middleware(
+        self,
+        request: AgentRequest,
+        handler: AgentMiddlewareHandler,
+    ) -> AgentResponse[Any | None]:
+        try:
+            # Agent loop starting. Cleanup uses pop(): the entry may already be gone.
+            self._retries_per_thread_id[request.thread_id] = 0
+            return await handler(request)
+        finally:
+            self._retries_per_thread_id.pop(request.thread_id, None)  # don't leak
+
+    @override
+    async def model_middleware(
+        self,
+        request: ModelRequest,
+        handler: ModelMiddlewareHandler,
+    ) -> ModelResponse:
+        try:
+            return await handler(request)
+        except StructuredOutputGenerationException as err:
+            # get(): a missing counter must not replace `err` with a KeyError.
+            retries = self._retries_per_thread_id.get(request.state.thread_id, 0) + 1
+            self._retries_per_thread_id[request.state.thread_id] = retries
+            if retries > self._limit:
+                raise StructuredOutputRetryLimitExceededException(self._limit) from err
+            raise  # re-raise, to retry structured output generation
diff --git a/tests/integration/ai/test_structured_output.py b/tests/integration/ai/test_structured_output.py
@@ -21,6 +21,10 @@
 from pydantic.dataclasses import dataclass
 
 from splunklib.ai import Agent
+from splunklib.ai.hooks import (
+    StructuredOutputRetryLimitExceededException,
+    StructuredOutputRetryLimitMiddleware,
+)
 from splunklib.ai.messages import (
     AgentResponse,
     AIMessage,
@@ -930,5 +934,98 @@ async def _model_middleware(
             assert len(result.messages) == 3
             assert result.structured_output.name == "MIKE"
 
+    @pytest.mark.asyncio
+    @ai_snapshot_test()
+    async def test_default_retry_limit(self) -> None:
+        pytest.importorskip("langchain_openai")
+
+        class Person(BaseModel):
+            name: str = Field(description="The person's full name", min_length=1)
+
+        model_call_count = 0  # how many times the failing middleware below ran
+
+        @model_middleware
+        async def _model_middleware(
+            _request: ModelRequest,
+            _handler: ModelMiddlewareHandler,
+        ) -> ModelResponse:
+            nonlocal model_call_count
+            model_call_count += 1
+
+            # Never calls _handler: every attempt fails structured output generation.
+            raise StructuredOutputGenerationException(
+                message=AIMessage(content="", calls=[]),
+                error=StructuredOutputValidationError(
+                    validation_error="Invalid output"
+                ),
+            )
+
+        async with Agent(
+            model=(await self.model()),
+            system_prompt="Respond with structured data",
+            output_schema=Person,
+            service=self.service,
+            middleware=[_model_middleware],  # no retry-limit middleware passed here
+        ) as agent:
+            with pytest.raises(
+                StructuredOutputRetryLimitExceededException,
+                match="Structured output retry limit of 3 exceeded",
+            ):
+                await agent.invoke(
+                    [HumanMessage(content="My name is Mike, what is my name?")]
+                )
+
+        assert model_call_count == 4  # first call + 3 retries (the default limit)
+
+    @pytest.mark.asyncio
+    @ai_snapshot_test()
+    async def test_custom_retry_limit_retry(self) -> None:
+        pytest.importorskip("langchain_openai")
+
+        class Person(BaseModel):
+            name: str = Field(description="The person's full name", min_length=1)
+
+        limits = [0, 1, 20]  # with 0 the very first failure is already over the limit
+        for limit in limits:
+            with self.subTest(limit):
+                model_call_count = 0  # reset for every limit under test
+
+                @model_middleware
+                async def _model_middleware(
+                    _request: ModelRequest,
+                    _handler: ModelMiddlewareHandler,
+                ) -> ModelResponse:
+                    nonlocal model_call_count
+                    model_call_count += 1
+
+                    # Never calls _handler: every attempt fails.
+                    raise StructuredOutputGenerationException(
+                        message=AIMessage(content="", calls=[]),
+                        error=StructuredOutputValidationError(
+                            validation_error="Invalid output"
+                        ),
+                    )
+
+                async with Agent(
+                    model=(await self.model()),
+                    system_prompt="Respond with structured data",
+                    output_schema=Person,
+                    service=self.service,
+                    middleware=[
+                        StructuredOutputRetryLimitMiddleware(limit),
+                        _model_middleware,
+                    ],
+                ) as agent:
+                    with pytest.raises(
+                        StructuredOutputRetryLimitExceededException,
+                        match=f"Structured output retry limit of {limit} exceeded",
+                    ):
+                        await agent.invoke(
+                            [HumanMessage(content="My name is Mike, what is my name?")]
+                        )
+
+                # We expect limit + 1, since first LLM call is not a retry.
+                assert model_call_count == limit + 1
+
+    # TODO: add a test ensuring the retry count is scoped to a single agent loop invocation.
+
     # TODO: test what happens if model/agent middleware removes the structured_output.
     #       do we detect that? We should and raise in invoke, that output was removed.