2626from pydantic_ai import UnexpectedModelBehavior
2727from pydantic_ai ._run_context import RunContext
2828from pydantic_ai ._utils import PeekableAsyncStream , Unset , number_to_datetime
29- from pydantic_ai .messages import ModelMessage
29+ from pydantic_ai .messages import ModelMessage , ModelResponse
3030from pydantic_ai .models import (
3131 ModelRequestParameters ,
3232 StreamedResponse ,
@@ -181,15 +181,79 @@ class LlamaStackResponsesModel(OpenAIResponsesModel):
181181 before the corresponding ``McpCall`` or ``ResponseFunctionToolCall`` item.
182182 """
183183
async def request(  # pylint: disable=unused-argument
    self,
    messages: list[ModelMessage],
    model_settings: ModelSettings | None,
    model_request_parameters: ModelRequestParameters,
    run_context: RunContext[Any] | None = None,
) -> Any:
    """Issue a non-streaming request, adjusted for Llama Stack conversations.

    Llama Stack refuses a request that carries both ``conversation`` and
    ``previous_response_id``. Before delegating to the parent implementation,
    the history and settings are passed through
    ``_prepare_conversation_continuation``: on a continuation turn this keeps
    only the new input and removes ``openai_previous_response_id`` from the
    settings, so ``conversation`` alone is sent and every response is
    persisted to that conversation.

    Args:
        messages: Model messages for the request.
        model_settings: Optional per-request settings.
        model_request_parameters: Request parameters, forwarded unchanged.
        run_context: Accepted in the signature but not forwarded to the
            parent call.

    Returns:
        Whatever the parent class's ``request`` returns.
    """
    continuation = self._prepare_conversation_continuation(messages, model_settings)
    new_input, effective_settings = continuation
    response = await super().request(
        new_input,
        effective_settings,
        model_request_parameters,
    )
    return response
203+
def _prepare_conversation_continuation(
    self,
    messages: list[ModelMessage],
    model_settings: ModelSettings | None,
) -> tuple[list[ModelMessage], ModelSettings | None]:
    """Reduce a continuation turn to its new input and drop the response-id setting.

    Llama Stack rejects requests that combine ``previous_response_id`` with
    ``conversation``. The adjustment is applied only when all of the
    following hold:

    * ``model_settings`` is a non-empty dict,
    * its ``extra_body`` entry is a dict containing ``conversation``,
    * the history contains a ``ModelResponse`` with a truthy
      ``provider_response_id``.

    In that case the result holds only the messages AFTER the most recent
    such response, paired with a copy of the settings that no longer has the
    ``openai_previous_response_id`` key. Llama Stack then receives
    ``conversation`` (for persistence) plus the new input items, and is
    expected to rebuild the earlier history from the conversation itself.

    In every other case both arguments are returned exactly as given.

    Args:
        messages: Full message history for the request.
        model_settings: Per-request settings, possibly ``None``.

    Returns:
        A ``(messages, model_settings)`` pair: either the untouched inputs or
        the trimmed message list with the adjusted settings copy.
    """
    # Nothing to adjust without a usable settings dict.
    if not (model_settings and isinstance(model_settings, dict)):
        return messages, model_settings

    extra_body = model_settings.get("extra_body")
    if not (isinstance(extra_body, dict) and "conversation" in extra_body):
        return messages, model_settings

    # Search backwards for the newest response that carries a provider id.
    cut = next(
        (
            pos
            for pos in reversed(range(len(messages)))
            if isinstance(messages[pos], ModelResponse)
            and messages[pos].provider_response_id
        ),
        None,
    )
    if cut is None:
        # First turn of the conversation: send everything as-is.
        return messages, model_settings

    # Copy the settings without the response-id key so the caller's dict
    # is left unmodified.
    adjusted = {
        key: value
        for key, value in model_settings.items()
        if key != "openai_previous_response_id"
    }
    return messages[cut + 1 :], cast(ModelSettings, adjusted)
244+
184245 @asynccontextmanager
185- async def request_stream (
246+ async def request_stream ( # pylint: disable=unused-argument
186247 self ,
187248 messages : list [ModelMessage ],
188249 model_settings : ModelSettings | None ,
189250 model_request_parameters : ModelRequestParameters ,
190251 run_context : RunContext [Any ] | None = None ,
191252 ) -> AsyncIterator [StreamedResponse ]:
192- """Request a streaming response, filtering Llama Stack-specific event quirks.
253+ """Request a streaming response with Llama Stack compatibility fixes.
254+
255+ Applies the same conversation continuation handling as :meth:`request`
256+ before calling the Responses API, then filters streaming tool-call events.
193257
194258 Args:
195259 messages: Model messages for the request.
@@ -201,10 +265,10 @@ async def request_stream(
201265 A StreamedResponse with the filtered event stream.
202266 """
203267 check_allow_model_requests ()
204- model_settings , model_request_parameters = self .prepare_request (
205- model_settings ,
206- model_request_parameters ,
268+ messages , model_settings = self ._prepare_conversation_continuation (
269+ messages , model_settings
207270 )
271+
208272 model_settings_cast = cast (OpenAIResponsesModelSettings , model_settings or {})
209273 response = await self ._responses_create (
210274 messages , True , model_settings_cast , model_request_parameters
0 commit comments