potential fix

jrobertboos · jrobertboos · commit 949a8de551fe · 2026-06-22T13:58:14.000-04:00
diff --git a/src/utils/agents/query.py b/src/utils/agents/query.py
@@ -2,8 +2,9 @@
 
 from __future__ import annotations
 
+import json
 from enum import Enum
-from typing import Optional, TypeAlias, cast
+from typing import Any, Optional, TypeAlias, cast
 
 from fastapi import HTTPException
 from llama_stack_client import APIConnectionError, APIStatusError, AsyncLlamaStackClient
@@ -16,7 +17,12 @@
     UnexpectedModelBehavior,
     UsageLimitExceeded,
 )
-from pydantic_ai.messages import ModelRequest, ModelResponse, ToolReturnPart
+from pydantic_ai.messages import (
+    ModelRequest,
+    ModelResponse,
+    ToolCallPart,
+    ToolReturnPart,
+)
 from pydantic_ai.run import AgentRunResult
 from pydantic_ai.usage import RunUsage
 
@@ -277,6 +283,83 @@ def build_turn_summary_from_agent_run(
     return state.turn_summary
 
 
+async def persist_agent_run_to_conversation(
+    client: AsyncLlamaStackClient,
+    conversation_id: str,
+    user_input: ResponseInput,
+    run_result: AgentRunResult[str],
+) -> None:
+    """Persist a completed pydantic AI agent run to a Llama Stack conversation.
+
+    Since the pydantic AI agent path does not pass ``conversation`` to Llama Stack
+    (to avoid duplicate history loading on tool-call continuations), the turn must
+    be explicitly stored after the run completes.
+
+    Builds conversation items from the run's new messages: the user input first,
+    then per model response its function calls followed by its text, and per
+    model request its function call outputs. Persistence is best-effort: client
+    connection/status errors are logged as warnings and not re-raised.
+
+    Args:
+        client: Llama Stack client for conversation persistence.
+        conversation_id: Llama Stack conversation ID to store items in.
+        user_input: Original user input (string or structured items).
+        run_result: Completed pydantic AI agent run result.
+    """
+    items: list[dict[str, Any]] = []
+
+    if isinstance(user_input, str):
+        items.append({"type": "message", "role": "user", "content": user_input})
+    else:
+        items.extend(item.model_dump() for item in user_input)
+
+    for message in run_result.new_messages():
+        if isinstance(message, ModelResponse):
+            for part in message.parts:
+                if isinstance(part, ToolCallPart):
+                    items.append(
+                        {
+                            "type": "function_call",
+                            "call_id": part.tool_call_id or "",
+                            "name": part.tool_name,
+                            "arguments": part.args_as_json_str(),
+                            "status": "completed",
+                        }
+                    )
+            # Text is appended after this response's tool calls, whatever the part order.
+            if message.text:
+                items.append(
+                    {"type": "message", "role": "assistant", "content": message.text}
+                )
+        elif isinstance(message, ModelRequest):
+            for part in message.parts:
+                if isinstance(part, ToolReturnPart):
+                    content = part.content
+                    if not isinstance(content, str):
+                        # default=str: a non-JSON-native tool result must not
+                        # raise TypeError and fail an already-completed turn.
+                        content = json.dumps(content, default=str)
+                    items.append(
+                        {
+                            "type": "function_call_output",
+                            "call_id": part.tool_call_id or "",
+                            "output": content,
+                        }
+                    )
+
+    if not items:
+        return
+
+    try:
+        await client.conversations.items.create(conversation_id, items=items)  # type: ignore[arg-type]
+    except (APIConnectionError, APIStatusError) as exc:
+        logger.warning(
+            "Failed to persist agent turn to conversation %s: %s",
+            conversation_id,
+            exc,
+        )
+
+
 async def retrieve_agent_response(
     client: AsyncLlamaStackClient,
     responses_params: ResponsesApiParams,
@@ -320,6 +403,14 @@ async def retrieve_agent_response(
         response = map_agent_inference_error(exc, responses_params.model)
         raise HTTPException(**response.model_dump()) from exc
 
+    if not responses_params.omit_conversation:
+        await persist_agent_run_to_conversation(
+            client,
+            responses_params.conversation,
+            _original_input or responses_params.input,
+            run_result,
+        )
+
     vector_store_ids = extract_vector_store_ids_from_tools(responses_params.tools)
     rag_id_mapping = configuration.rag_id_mapping
     return build_turn_summary_from_agent_run(
diff --git a/src/utils/agents/streaming.py b/src/utils/agents/streaming.py
@@ -49,6 +49,7 @@
     get_agent_finish_reason,
     get_finish_reason_error,
     map_agent_inference_error,
+    persist_agent_run_to_conversation,
 )
 from utils.agents.tool_processor import (
     process_function_tool_call,
@@ -300,6 +301,15 @@ async def agent_response_generator(
         return
 
     run_result = dispatch_state.run_result
+
+    if not responses_params.omit_conversation:
+        await persist_agent_run_to_conversation(
+            context.client,
+            responses_params.conversation,
+            responses_params.input,
+            run_result,
+        )
+
     turn_summary.token_usage = extract_agent_token_usage(
         run_result.usage,
         responses_params.model,
diff --git a/src/utils/pydantic_ai.py b/src/utils/pydantic_ai.py
@@ -19,7 +19,6 @@
 
 _LLS_RESPONSES_EXTRA_FIELDS: Final[frozenset[str]] = frozenset(
     {
-        "conversation",
         "max_infer_iters",
         "tools",
         "tool_choice",
@@ -68,10 +67,9 @@ def _model_settings_from_responses_params(
     if responses_params.extra_headers:
         settings_dict["extra_headers"] = dict(responses_params.extra_headers)
     settings_dict["openai_store"] = responses_params.store
-    if responses_params.previous_response_id is not None:
-        settings_dict["openai_previous_response_id"] = (
-            responses_params.previous_response_id
-        )
+    settings_dict["openai_previous_response_id"] = (
+        responses_params.previous_response_id or "auto"
+    )
     return cast(OpenAIResponsesModelSettings, settings_dict)
 
 
diff --git a/src/utils/responses.py b/src/utils/responses.py
@@ -420,6 +420,10 @@ async def prepare_responses_params(  # pylint: disable=too-many-arguments,too-ma
     # Normalize Vertex AI model IDs to work around llama-stack 0.6.x bug
     normalized_model = normalize_vertex_ai_model_id(model)
 
+    previous_response_id = (
+        user_conversation.last_response_id if user_conversation else None
+    )
+
     return ResponsesApiParams(
         input=input_text,
         model=normalized_model,
@@ -429,6 +433,7 @@ async def prepare_responses_params(  # pylint: disable=too-many-arguments,too-ma
         stream=stream,
         store=store,
         extra_headers=extra_headers,
+        previous_response_id=previous_response_id,
     )