Skip to content

Commit a95f3c6

Browse files
authored
Merge pull request #1972 from jrobertboos/fix-skills
LCORE-2311: Fix Skills
2 parents 2596e0a + a4c237e commit a95f3c6

5 files changed

Lines changed: 102 additions & 21 deletions

File tree

src/pydantic_ai_lightspeed/llamastack/_model.py

Lines changed: 70 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from pydantic_ai import UnexpectedModelBehavior
2727
from pydantic_ai._run_context import RunContext
2828
from pydantic_ai._utils import PeekableAsyncStream, Unset, number_to_datetime
29-
from pydantic_ai.messages import ModelMessage
29+
from pydantic_ai.messages import ModelMessage, ModelResponse
3030
from pydantic_ai.models import (
3131
ModelRequestParameters,
3232
StreamedResponse,
@@ -181,15 +181,79 @@ class LlamaStackResponsesModel(OpenAIResponsesModel):
181181
before the corresponding ``McpCall`` or ``ResponseFunctionToolCall`` item.
182182
"""
183183

184+
async def request(  # pylint: disable=unused-argument
    self,
    messages: list[ModelMessage],
    model_settings: ModelSettings | None,
    model_request_parameters: ModelRequestParameters,
    run_context: RunContext[Any] | None = None,
) -> Any:
    """Send a non-streaming request, adapting continuation turns for Llama Stack.

    Llama Stack rejects a request that carries both ``conversation`` and
    ``previous_response_id``. Before delegating to the parent implementation,
    the messages and settings are passed through
    ``_prepare_conversation_continuation``, which (on continuation turns)
    keeps only the new input and drops ``openai_previous_response_id`` so that
    only ``conversation`` is sent.

    Args:
        messages: Model messages for the request.
        model_settings: Optional model settings; may carry ``extra_body``.
        model_request_parameters: Request parameters passed through unchanged.
        run_context: Accepted for signature compatibility; not forwarded to
            the parent call.

    Returns:
        Whatever the parent ``request`` implementation returns.
    """
    continuation = self._prepare_conversation_continuation(messages, model_settings)
    outgoing_messages, outgoing_settings = continuation
    return await super().request(
        outgoing_messages, outgoing_settings, model_request_parameters
    )
203+
204+
def _prepare_conversation_continuation(
    self,
    messages: list[ModelMessage],
    model_settings: ModelSettings | None,
) -> tuple[list[ModelMessage], ModelSettings | None]:
    """Adapt messages and settings for a Llama Stack conversation continuation.

    Llama Stack rejects requests that carry both ``previous_response_id`` and
    ``conversation``. When ``extra_body`` contains ``conversation`` and the
    history already holds a ``ModelResponse`` with a provider response id
    (i.e. this is a continuation turn), this method:

    1. Keeps only the messages AFTER the last such ``ModelResponse``.
    2. Returns a copy of the settings without ``openai_previous_response_id``.

    In every other case the inputs are returned untouched. The caller's
    ``model_settings`` mapping is never mutated.

    Args:
        messages: Full message history for the request.
        model_settings: Optional model settings, possibly with ``extra_body``.

    Returns:
        A ``(messages, model_settings)`` pair to use for the outgoing request.
    """
    unchanged = (messages, model_settings)

    # Nothing to adapt without a non-empty dict of settings.
    if not (model_settings and isinstance(model_settings, dict)):
        return unchanged

    body = model_settings.get("extra_body")
    if not (isinstance(body, dict) and "conversation" in body):
        return unchanged

    # Scan from the newest message backwards for the most recent response
    # that carries a provider response id.
    cut = next(
        (
            pos
            for pos in reversed(range(len(messages)))
            if isinstance(messages[pos], ModelResponse)
            and messages[pos].provider_response_id
        ),
        None,
    )
    if cut is None:
        # First turn of the conversation: send everything as-is.
        return unchanged

    # Shallow copy of the settings minus the previous-response pointer.
    settings_copy = {
        key: value
        for key, value in model_settings.items()
        if key != "openai_previous_response_id"
    }
    return messages[cut + 1 :], cast(ModelSettings, settings_copy)
244+
184245
@asynccontextmanager
185-
async def request_stream(
246+
async def request_stream( # pylint: disable=unused-argument
186247
self,
187248
messages: list[ModelMessage],
188249
model_settings: ModelSettings | None,
189250
model_request_parameters: ModelRequestParameters,
190251
run_context: RunContext[Any] | None = None,
191252
) -> AsyncIterator[StreamedResponse]:
192-
"""Request a streaming response, filtering Llama Stack-specific event quirks.
253+
"""Request a streaming response with Llama Stack compatibility fixes.
254+
255+
Applies the same conversation continuation handling as :meth:`request`
256+
before calling the Responses API, then filters streaming tool-call events.
193257
194258
Args:
195259
messages: Model messages for the request.
@@ -201,10 +265,10 @@ async def request_stream(
201265
A StreamedResponse with the filtered event stream.
202266
"""
203267
check_allow_model_requests()
204-
model_settings, model_request_parameters = self.prepare_request(
205-
model_settings,
206-
model_request_parameters,
268+
messages, model_settings = self._prepare_conversation_continuation(
269+
messages, model_settings
207270
)
271+
208272
model_settings_cast = cast(OpenAIResponsesModelSettings, model_settings or {})
209273
response = await self._responses_create(
210274
messages, True, model_settings_cast, model_request_parameters

src/utils/agents/streaming.py

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -229,12 +229,29 @@ async def generate_agent_response(
229229
context.query_request.conversation_id is None
230230
and bool(context.query_request.generate_topic_summary)
231231
)
232-
topic_summary = await maybe_get_topic_summary(
233-
generate_topic_summary=should_generate_topic_summary,
234-
input_text=context.query_request.query,
235-
client=context.client,
236-
model_id=responses_params.model,
237-
)
232+
try:
233+
topic_summary = await maybe_get_topic_summary(
234+
generate_topic_summary=should_generate_topic_summary,
235+
input_text=context.query_request.query,
236+
client=context.client,
237+
model_id=responses_params.model,
238+
)
239+
except HTTPException as exc:
240+
logger.warning(
241+
"Topic summary failed for request %s: %s",
242+
context.request_id,
243+
exc.detail,
244+
)
245+
detail: dict[str, str] = exc.detail if isinstance(exc.detail, dict) else {}
246+
yield serialize_event(
247+
ErrorStreamPayload.create(
248+
status_code=exc.status_code,
249+
response=detail.get("response", "Internal server error"),
250+
cause=detail.get("cause", str(exc.detail)),
251+
),
252+
media_type,
253+
)
254+
return
238255
logger.info("Consuming tokens")
239256
consume_query_tokens(
240257
user_id=context.user_id,

src/utils/pydantic_ai.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
{
2222
"conversation",
2323
"max_infer_iters",
24-
"tools",
2524
"tool_choice",
2625
"include",
2726
"text",
@@ -68,6 +67,8 @@ def _model_settings_from_responses_params(
6867
if responses_params.extra_headers:
6968
settings_dict["extra_headers"] = dict(responses_params.extra_headers)
7069
settings_dict["openai_store"] = responses_params.store
70+
if responses_params.tools is not None:
71+
settings_dict["openai_native_tools"] = responses_params.tools
7172
if responses_params.previous_response_id is not None:
7273
settings_dict["openai_previous_response_id"] = (
7374
responses_params.previous_response_id

src/utils/query.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -574,10 +574,9 @@ def handle_known_apistatus_errors(
574574
Returns:
575575
AbstractErrorResponse: The error response model.
576576
"""
577-
if error.status_code == 400:
578-
error_message = getattr(error, "message", str(error))
579-
if is_context_length_error(error_message):
580-
return PromptTooLongResponse(model=model_id)
581-
elif error.status_code == 429:
577+
error_message = getattr(error, "message", str(error))
578+
if is_context_length_error(error_message):
579+
return PromptTooLongResponse(model=model_id)
580+
if error.status_code == 429:
582581
return QuotaExceededResponse.model(model_id)
583582
return InternalServerErrorResponse.generic()

tests/unit/utils/test_pydantic_ai.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ def minimal_params_fixture(self, mocker: MockerFixture) -> object:
8282
params.parallel_tool_calls = None
8383
params.extra_headers = None
8484
params.store = False
85+
params.tools = None
8586
params.previous_response_id = None
8687
return params
8788

@@ -138,7 +139,6 @@ def test_extra_body_from_lls_fields(self, mocker: MockerFixture) -> None:
138139
"model": "test/model",
139140
"conversation": "conv-123",
140141
"max_infer_iters": 5,
141-
"tools": [{"type": "function"}],
142142
"tool_choice": "auto",
143143
}
144144
params.max_output_tokens = None
@@ -147,14 +147,15 @@ def test_extra_body_from_lls_fields(self, mocker: MockerFixture) -> None:
147147
params.extra_headers = None
148148
params.store = False
149149
params.previous_response_id = None
150+
params.tools = [{"type": "function"}]
150151

151152
settings = _model_settings_from_responses_params(params)
152153

153154
assert "extra_body" in settings
154155
assert settings["extra_body"]["conversation"] == "conv-123"
155156
assert settings["extra_body"]["max_infer_iters"] == 5
156-
assert settings["extra_body"]["tools"] == [{"type": "function"}]
157157
assert settings["extra_body"]["tool_choice"] == "auto"
158+
assert settings["openai_native_tools"] == [{"type": "function"}]
158159

159160
def test_extra_body_only_includes_known_fields(self, mocker: MockerFixture) -> None:
160161
"""Test that extra_body only includes fields in _LLS_RESPONSES_EXTRA_FIELDS."""
@@ -189,7 +190,6 @@ def test_contains_expected_fields(self) -> None:
189190
expected = {
190191
"conversation",
191192
"max_infer_iters",
192-
"tools",
193193
"tool_choice",
194194
"include",
195195
"text",

0 commit comments

Comments
 (0)