|
4 | 4 |
|
5 | 5 | import asyncio |
6 | 6 | import json |
7 | | -from collections.abc import AsyncIterator |
| 7 | +from collections.abc import AsyncIterator, Sequence |
8 | 8 | from datetime import UTC, datetime |
9 | | -from typing import Annotated, Any, Final, Optional, cast |
| 9 | +from typing import Annotated, Any, Final, NoReturn, Optional, cast |
10 | 10 |
|
11 | 11 | from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Request |
12 | 12 | from fastapi.responses import StreamingResponse |
@@ -224,6 +224,235 @@ def _queue_responses_splunk_event( # pylint: disable=too-many-arguments,too-man |
224 | 224 | background_tasks.add_task(send_splunk_event, event, sourcetype) |
225 | 225 |
|
226 | 226 |
|
| 227 | +def _queue_responses_error_event( |
| 228 | + error: Exception, |
| 229 | + api_params: ResponsesApiParams, |
| 230 | + context: ResponsesContext, |
| 231 | +) -> None: |
| 232 | + """Queue fire-and-forget Splunk telemetry for a Responses API error. |
| 233 | +
|
| 234 | + Args: |
| 235 | + error: The backend exception being converted into an HTTP error. |
| 236 | + api_params: Responses API parameters for the failed request. |
| 237 | + context: Request-scoped Responses API context. |
| 238 | + """ |
| 239 | + _queue_responses_splunk_event( |
| 240 | + background_tasks=context.background_tasks, |
| 241 | + input_text=context.input_text, |
| 242 | + response_text=str(error), |
| 243 | + conversation_id=normalize_conversation_id(api_params.conversation), |
| 244 | + model=api_params.model, |
| 245 | + rh_identity_context=context.rh_identity_context, |
| 246 | + inference_time=(datetime.now(UTC) - context.started_at).total_seconds(), |
| 247 | + sourcetype="responses_error", |
| 248 | + fire_and_forget=True, |
| 249 | + user_agent=context.user_agent, |
| 250 | + ) |
| 251 | + |
| 252 | + |
| 253 | +def _http_exception_for_response_api_error( |
| 254 | + error: Exception, |
| 255 | + api_params: ResponsesApiParams, |
| 256 | +) -> Optional[HTTPException]: |
| 257 | + """Map known Responses API backend errors to HTTP exceptions. |
| 258 | +
|
| 259 | + Args: |
| 260 | + error: The backend exception raised while creating a response. |
| 261 | + api_params: Responses API parameters for the request. |
| 262 | +
|
| 263 | + Returns: |
| 264 | + HTTPException for known API failures, or None for unknown RuntimeError. |
| 265 | + """ |
| 266 | + if isinstance(error, RuntimeError): |
| 267 | + if not is_context_length_error(str(error)): |
| 268 | + return None |
| 269 | + error_response = PromptTooLongResponse(model=api_params.model) |
| 270 | + elif isinstance(error, APIConnectionError): |
| 271 | + error_response = ServiceUnavailableResponse( |
| 272 | + backend_name="Llama Stack", |
| 273 | + cause=str(error), |
| 274 | + ) |
| 275 | + elif isinstance(error, (LLSApiStatusError, OpenAIAPIStatusError)): |
| 276 | + error_response = handle_known_apistatus_errors(error, api_params.model) |
| 277 | + else: |
| 278 | + return None |
| 279 | + return HTTPException(**error_response.model_dump()) |
| 280 | + |
| 281 | + |
| 282 | +def _raise_response_api_http_exception( |
| 283 | + error: Exception, |
| 284 | + api_params: ResponsesApiParams, |
| 285 | + context: ResponsesContext, |
| 286 | +) -> NoReturn: |
| 287 | + """Queue error telemetry and raise the mapped Responses API HTTP error. |
| 288 | +
|
| 289 | + Args: |
| 290 | + error: The backend exception raised while creating a response. |
| 291 | + api_params: Responses API parameters for the request. |
| 292 | + context: Request-scoped Responses API context. |
| 293 | +
|
| 294 | + Raises: |
| 295 | + Exception: Re-raises unknown RuntimeError instances unchanged. |
| 296 | + HTTPException: Raised for known Responses API failures. |
| 297 | + """ |
| 298 | + http_exception = _http_exception_for_response_api_error(error, api_params) |
| 299 | + if http_exception is None: |
| 300 | + raise error |
| 301 | + _queue_responses_error_event(error, api_params, context) |
| 302 | + raise http_exception from error |
| 303 | + |
| 304 | + |
| 305 | +async def _persist_blocked_response_turn( |
| 306 | + api_params: ResponsesApiParams, |
| 307 | + context: ResponsesContext, |
| 308 | +) -> None: |
| 309 | + """Persist a shield-blocked refusal turn when response storage is enabled. |
| 310 | +
|
| 311 | + Args: |
| 312 | + api_params: Responses API parameters for the blocked request. |
| 313 | + context: Request-scoped Responses API context with moderation details. |
| 314 | + """ |
| 315 | + if api_params.store: |
| 316 | + moderation_result = cast(ShieldModerationBlocked, context.moderation_result) |
| 317 | + await append_turn_items_to_conversation( |
| 318 | + client=context.client, |
| 319 | + conversation_id=api_params.conversation, |
| 320 | + user_input=api_params.input, |
| 321 | + llm_output=[moderation_result.refusal_response], |
| 322 | + ) |
| 323 | + |
| 324 | + |
| 325 | +def _queue_blocked_response_event( |
| 326 | + api_params: ResponsesApiParams, |
| 327 | + context: ResponsesContext, |
| 328 | + response_text: str, |
| 329 | +) -> None: |
| 330 | + """Queue Splunk telemetry for a shield-blocked Responses API request. |
| 331 | +
|
| 332 | + Args: |
| 333 | + api_params: Responses API parameters for the blocked request. |
| 334 | + context: Request-scoped Responses API context. |
| 335 | + response_text: Refusal text sent to the client. |
| 336 | + """ |
| 337 | + _queue_responses_splunk_event( |
| 338 | + background_tasks=context.background_tasks, |
| 339 | + input_text=context.input_text, |
| 340 | + response_text=response_text, |
| 341 | + conversation_id=normalize_conversation_id(api_params.conversation), |
| 342 | + model=api_params.model, |
| 343 | + rh_identity_context=context.rh_identity_context, |
| 344 | + inference_time=(datetime.now(UTC) - context.started_at).total_seconds(), |
| 345 | + sourcetype="responses_shield_blocked", |
| 346 | + user_agent=context.user_agent, |
| 347 | + ) |
| 348 | + |
| 349 | + |
| 350 | +async def _append_previous_response_turn( |
| 351 | + api_params: ResponsesApiParams, |
| 352 | + context: ResponsesContext, |
| 353 | + output: Sequence[Any], |
| 354 | +) -> None: |
| 355 | + """Append response output when continuing from a previous response id. |
| 356 | +
|
| 357 | + Args: |
| 358 | + api_params: Responses API parameters containing conversation details. |
| 359 | + context: Request-scoped Responses API context. |
| 360 | + output: Final output items from the Responses API object. |
| 361 | + """ |
| 362 | + if api_params.store and api_params.previous_response_id: |
| 363 | + await append_turn_items_to_conversation( |
| 364 | + context.client, |
| 365 | + api_params.conversation, |
| 366 | + api_params.input, |
| 367 | + output, |
| 368 | + ) |
| 369 | + |
| 370 | + |
| 371 | +async def _maybe_get_topic_summary( |
| 372 | + api_params: ResponsesApiParams, |
| 373 | + context: ResponsesContext, |
| 374 | +) -> Optional[str]: |
| 375 | + """Generate a topic summary when requested for the current response. |
| 376 | +
|
| 377 | + Args: |
| 378 | + api_params: Responses API parameters containing the selected model. |
| 379 | + context: Request-scoped Responses API context. |
| 380 | +
|
| 381 | + Returns: |
| 382 | + Generated topic summary, or None when topic summaries are disabled. |
| 383 | + """ |
| 384 | + if not context.generate_topic_summary: |
| 385 | + return None |
| 386 | + logger.debug("Generating topic summary for new conversation") |
| 387 | + return await get_topic_summary(context.input_text, context.client, api_params.model) |
| 388 | + |
| 389 | + |
| 390 | +def _store_response_query_results( |
| 391 | + api_params: ResponsesApiParams, |
| 392 | + context: ResponsesContext, |
| 393 | + turn_summary: TurnSummary, |
| 394 | + completed_at: datetime, |
| 395 | + topic_summary: Optional[str], |
| 396 | +) -> None: |
| 397 | + """Persist Responses API query results when request storage is enabled. |
| 398 | +
|
| 399 | + Args: |
| 400 | + api_params: Responses API parameters containing conversation details. |
| 401 | + context: Request-scoped Responses API context. |
| 402 | + turn_summary: Summary of the completed model turn. |
| 403 | + completed_at: Time when response handling completed. |
| 404 | + topic_summary: Optional generated topic summary for the conversation. |
| 405 | + """ |
| 406 | + if not api_params.store: |
| 407 | + return |
| 408 | + user_id, _, skip_userid_check, _ = context.auth |
| 409 | + store_query_results( |
| 410 | + user_id=user_id, |
| 411 | + conversation_id=normalize_conversation_id(api_params.conversation), |
| 412 | + model=api_params.model, |
| 413 | + started_at=context.started_at.strftime("%Y-%m-%dT%H:%M:%SZ"), |
| 414 | + completed_at=completed_at.strftime("%Y-%m-%dT%H:%M:%SZ"), |
| 415 | + summary=turn_summary, |
| 416 | + query=context.input_text, |
| 417 | + attachments=[], |
| 418 | + skip_userid_check=skip_userid_check, |
| 419 | + topic_summary=topic_summary, |
| 420 | + ) |
| 421 | + |
| 422 | + |
| 423 | +def _queue_completed_response_event( |
| 424 | + api_params: ResponsesApiParams, |
| 425 | + context: ResponsesContext, |
| 426 | + turn_summary: TurnSummary, |
| 427 | + completed_at: datetime, |
| 428 | + response_text: str, |
| 429 | +) -> None: |
| 430 | + """Queue Splunk telemetry for a completed Responses API request. |
| 431 | +
|
| 432 | + Args: |
| 433 | + api_params: Responses API parameters for the completed request. |
| 434 | + context: Request-scoped Responses API context. |
| 435 | + turn_summary: Summary containing token usage for telemetry. |
| 436 | + completed_at: Time when response handling completed. |
| 437 | + response_text: Final text sent to the client. |
| 438 | + """ |
| 439 | + if context.moderation_result.decision != "passed": |
| 440 | + return |
| 441 | + _queue_responses_splunk_event( |
| 442 | + background_tasks=context.background_tasks, |
| 443 | + input_text=context.input_text, |
| 444 | + response_text=response_text, |
| 445 | + conversation_id=normalize_conversation_id(api_params.conversation), |
| 446 | + model=api_params.model, |
| 447 | + rh_identity_context=context.rh_identity_context, |
| 448 | + inference_time=(completed_at - context.started_at).total_seconds(), |
| 449 | + sourcetype="responses_completed", |
| 450 | + input_tokens=turn_summary.token_usage.input_tokens, |
| 451 | + output_tokens=turn_summary.token_usage.output_tokens, |
| 452 | + user_agent=context.user_agent, |
| 453 | + ) |
| 454 | + |
| 455 | + |
227 | 456 | @router.post( |
228 | 457 | "/responses", |
229 | 458 | responses=responses_response, |
|
0 commit comments