@@ -181,6 +181,70 @@ class LlamaStackResponsesModel(OpenAIResponsesModel):
181181 before the corresponding ``McpCall`` or ``ResponseFunctionToolCall`` item.
182182 """
183183
async def request(
    self,
    messages: list[ModelMessage],
    model_settings: ModelSettings | None,
    model_request_parameters: ModelRequestParameters,
    run_context: RunContext[Any] | None = None,
) -> Any:
    """Issue a non-streaming request, adapted for Llama Stack conversations.

    Llama Stack refuses a request that carries both ``conversation`` and
    ``previous_response_id``. Before delegating to the parent model, the
    history and settings are passed through
    ``_prepare_conversation_continuation``: on a continuation turn (an
    earlier ``ModelResponse`` is present) only the new input is kept and
    ``previous_response_id`` handling is switched off, so ``conversation``
    is the only continuation mechanism sent and every response ends up
    persisted to the conversation.

    Args:
        messages: Message history for this turn.
        model_settings: Per-request settings, or ``None``.
        model_request_parameters: Forwarded unchanged to the parent model.
        run_context: Accepted but not forwarded to the parent ``request``
            call. NOTE(review): confirm the parent signature does not take it.

    Returns:
        Whatever the parent ``request`` implementation returns.
    """
    prepared = self._prepare_conversation_continuation(messages, model_settings)
    turn_messages, turn_settings = prepared
    response = await super().request(
        turn_messages,
        turn_settings,
        model_request_parameters,
    )
    return response
205+
def _prepare_conversation_continuation(
    self,
    messages: list[ModelMessage],
    model_settings: ModelSettings | None,
) -> tuple[list[ModelMessage], ModelSettings | None]:
    """Reduce a continuation turn to its new input and drop previous_response_id.

    Llama Stack will not accept ``previous_response_id`` together with
    ``conversation``. The inputs are returned untouched unless all of the
    following hold:

    * ``model_settings`` is a non-empty ``dict``;
    * its ``extra_body`` entry is truthy and contains ``"conversation"``;
    * the history holds at least one ``ModelResponse`` with a truthy
      ``provider_response_id`` (i.e. this is a continuation turn).

    When they do, the result is:

    * only the messages that come AFTER the most recent such response, and
    * a copy of the settings without the ``openai_previous_response_id`` key,
      so pydantic-ai has no previous response id to resolve.

    Llama Stack then gets ``conversation`` (for persistence) plus the new
    input items only; it rebuilds the earlier history from the conversation
    and appends the new input.

    Args:
        messages: Full message history for the turn.
        model_settings: Per-request settings, or ``None``.

    Returns:
        A ``(messages, model_settings)`` pair: either the originals, or the
        trimmed history with the adjusted settings copy. The caller's
        ``model_settings`` object is never mutated.
    """
    from pydantic_ai.messages import ModelResponse  # noqa: PLC0415

    # Nothing to adjust without dict-shaped settings.
    if not (model_settings and isinstance(model_settings, dict)):
        return messages, model_settings

    # The fix only applies when a conversation is being targeted.
    body = model_settings.get("extra_body")
    if not body or "conversation" not in body:
        return messages, model_settings

    # Walk backwards to locate the newest response that carries a provider id.
    cutoff = next(
        (
            pos
            for pos in reversed(range(len(messages)))
            if isinstance(messages[pos], ModelResponse)
            and messages[pos].provider_response_id
        ),
        None,
    )
    if cutoff is None:
        # First turn: no prior response, so the full history is sent as-is.
        return messages, model_settings

    # Build a settings copy without the key instead of mutating the input.
    adjusted = {
        key: value
        for key, value in model_settings.items()
        if key != "openai_previous_response_id"
    }
    return messages[cutoff + 1 :], cast(ModelSettings, adjusted)
247+
184248 @asynccontextmanager
185249 async def request_stream (
186250 self ,
0 commit comments