-
Notifications
You must be signed in to change notification settings - Fork 94
Expand file tree
/
Copy pathstreaming_query.py
More file actions
1033 lines (916 loc) · 37.1 KB
/
Copy pathstreaming_query.py
File metadata and controls
1033 lines (916 loc) · 37.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
"""Streaming query handler using Responses API."""
# pylint: disable=too-many-lines
import asyncio
import datetime
from collections.abc import AsyncIterator
from typing import Annotated, Any, Optional, cast
from fastapi import APIRouter, Depends, HTTPException, Request
from fastapi.responses import StreamingResponse
from llama_stack_api import (
OpenAIResponseObject,
OpenAIResponseObjectStream,
)
from llama_stack_api import (
OpenAIResponseObjectStreamResponseMcpCallArgumentsDone as MCPArgsDoneChunk,
)
from llama_stack_api import (
OpenAIResponseObjectStreamResponseOutputItemAdded as OutputItemAddedChunk,
)
from llama_stack_api import (
OpenAIResponseObjectStreamResponseOutputItemDone as OutputItemDoneChunk,
)
from llama_stack_api import (
OpenAIResponseObjectStreamResponseOutputTextDelta as TextDeltaChunk,
)
from llama_stack_api import (
OpenAIResponseObjectStreamResponseOutputTextDone as TextDoneChunk,
)
from llama_stack_api import (
OpenAIResponseOutputMessageMCPCall as MCPCall,
)
from llama_stack_client import (
APIConnectionError,
)
from llama_stack_client import (
APIStatusError as LLSApiStatusError,
)
from openai._exceptions import APIStatusError as OpenAIAPIStatusError
from authentication import get_auth_dependency
from authentication.interface import AuthTuple
from authorization.azure_token_manager import AzureEntraIDManager
from authorization.middleware import authorize
from client import AsyncLlamaStackClientHolder
from configuration import configuration
from constants import (
ENDPOINT_PATH_STREAMING_QUERY,
INTERRUPTED_RESPONSE_MESSAGE,
LLM_TOKEN_EVENT,
LLM_TOOL_CALL_EVENT,
LLM_TOOL_RESULT_EVENT,
LLM_TURN_COMPLETE_EVENT,
MEDIA_TYPE_EVENT_STREAM,
MEDIA_TYPE_JSON,
MEDIA_TYPE_TEXT,
TOPIC_SUMMARY_INTERRUPT_TIMEOUT_SECONDS,
)
from log import get_logger
from metrics import recording
from models.api.requests import QueryRequest
from models.api.responses.constants import UNAUTHORIZED_OPENAPI_EXAMPLES_WITH_MCP_OAUTH
from models.api.responses.error import (
AbstractErrorResponse,
ForbiddenResponse,
InternalServerErrorResponse,
NotFoundResponse,
PromptTooLongResponse,
QuotaExceededResponse,
ServiceUnavailableResponse,
UnauthorizedResponse,
UnprocessableEntityResponse,
)
from models.api.responses.successful import StreamingQueryResponse
from models.common.responses.contexts import ResponseGeneratorContext
from models.common.responses.responses_api_params import ResponsesApiParams
from models.common.turn_summary import ReferencedDocument, TurnSummary
from models.config import Action
from utils.agents.streaming import (
generate_agent_response,
retrieve_agent_response_generator,
)
from utils.conversations import append_turn_items_to_conversation
from utils.endpoints import (
check_configuration_loaded,
validate_and_retrieve_conversation,
)
from utils.mcp_headers import McpHeaders, mcp_headers_dependency
from utils.mcp_oauth_probe import check_mcp_auth
from utils.query import (
consume_query_tokens,
extract_provider_and_model_from_model_id,
handle_known_apistatus_errors,
is_context_length_error,
prepare_input,
store_query_results,
update_conversation_topic_summary,
validate_attachments_metadata,
validate_model_provider_override,
)
from utils.quota import check_tokens_available, get_available_quotas
from utils.responses import (
build_mcp_tool_call_from_arguments_done,
build_tool_call_summary,
build_tool_result_from_mcp_output_item_done,
deduplicate_referenced_documents,
extract_token_usage,
extract_vector_store_ids_from_tools,
get_topic_summary,
parse_rag_chunks,
parse_referenced_documents,
prepare_responses_params,
)
from utils.shields import (
append_turn_to_conversation,
run_shield_moderation,
validate_shield_ids_override,
)
from utils.stream_interrupts import get_stream_interrupt_registry
from utils.streaming_sse import (
format_stream_data,
shield_violation_generator,
stream_event,
)
from utils.suid import get_suid, normalize_conversation_id
from utils.token_counter import TokenCounter
from utils.vector_search import build_rag_context
logger = get_logger(__name__)
router = APIRouter(tags=["streaming_query"])
# Tracks background topic summary tasks for graceful shutdown.
_background_topic_summary_tasks: list[asyncio.Task[None]] = []
streaming_query_responses: dict[int | str, dict[str, Any]] = {
200: StreamingQueryResponse.openapi_response(),
401: UnauthorizedResponse.openapi_response(
examples=UNAUTHORIZED_OPENAPI_EXAMPLES_WITH_MCP_OAUTH
),
403: ForbiddenResponse.openapi_response(
examples=["conversation read", "endpoint", "model override"]
),
404: NotFoundResponse.openapi_response(
examples=["conversation", "model", "provider"]
),
413: PromptTooLongResponse.openapi_response(examples=["context window exceeded"]),
422: UnprocessableEntityResponse.openapi_response(),
429: QuotaExceededResponse.openapi_response(),
500: InternalServerErrorResponse.openapi_response(examples=["configuration"]),
503: ServiceUnavailableResponse.openapi_response(
examples=["llama stack", "kubernetes api"]
),
}
@router.post(
"/streaming_query",
response_class=StreamingResponse,
responses=streaming_query_responses,
summary="Streaming Query Endpoint Handler",
)
@authorize(Action.STREAMING_QUERY)
async def streaming_query_endpoint_handler( # pylint: disable=too-many-locals
request: Request,
query_request: QueryRequest,
auth: Annotated[AuthTuple, Depends(get_auth_dependency())],
mcp_headers: McpHeaders = Depends(mcp_headers_dependency),
) -> StreamingResponse:
"""
Handle request to the /streaming_query endpoint using Responses API.
Returns a streaming response using Server-Sent Events (SSE) format with
content type text/event-stream.
### Parameters:
- request: The incoming HTTP request (used by middleware).
- query_request: Request to the LLM.
- auth: Auth context tuple resolved from the authentication dependency.
- mcp_headers: Headers that should be passed to MCP servers.
### Returns:
- SSE-formatted events for the query lifecycle.
### Raises:
- HTTPException:
- 401: Unauthorized - Missing or invalid credentials
- 403: Forbidden - Insufficient permissions or model override not allowed
- 404: Not Found - Conversation, model, or provider not found
- 413: Prompt too long - Prompt exceeded model's context window size
- 422: Unprocessable Entity - Request validation failed
- 429: Quota limit exceeded - The token quota for model or user has been exceeded
- 500: Internal Server Error - Configuration not loaded or other server errors
- 503: Service Unavailable - Unable to connect to Llama Stack backend
"""
check_configuration_loaded(configuration)
user_id, _user_name, _skip_userid_check, token = auth
started_at = datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
# Check MCP Auth
await check_mcp_auth(configuration, mcp_headers, token, request.headers)
# Check token availability
check_tokens_available(configuration.quota_limiters, user_id)
# Enforce RBAC: optionally disallow overriding model/provider in requests
validate_model_provider_override(
query_request.model, query_request.provider, request.state.authorized_actions
)
# Validate shield_ids override if provided
validate_shield_ids_override(query_request, configuration)
# Validate attachments if provided
if query_request.attachments:
validate_attachments_metadata(query_request.attachments)
# Retrieve conversation if conversation_id is provided
user_conversation = None
if query_request.conversation_id:
logger.debug(
"Conversation ID specified in query: %s", query_request.conversation_id
)
normalized_conv_id = normalize_conversation_id(query_request.conversation_id)
user_conversation = validate_and_retrieve_conversation(
normalized_conv_id=normalized_conv_id,
user_id=user_id,
others_allowed=Action.READ_OTHERS_CONVERSATIONS
in request.state.authorized_actions,
)
client = AsyncLlamaStackClientHolder().get_client()
# Moderation input is the raw user content (query + attachments) without injected RAG
# context, to avoid false positives from retrieved document content.
moderation_input = prepare_input(query_request)
endpoint_path = ENDPOINT_PATH_STREAMING_QUERY
moderation_result = await run_shield_moderation(
client, moderation_input, endpoint_path, query_request.shield_ids
)
# Build RAG context from Inline RAG sources
inline_rag_context = await build_rag_context(
client,
moderation_result.decision,
query_request.query,
query_request.vector_store_ids,
query_request.solr,
)
# Prepare API request parameters
responses_params = await prepare_responses_params(
client=client,
query_request=query_request,
user_conversation=user_conversation,
token=token,
mcp_headers=mcp_headers,
stream=True,
store=True,
request_headers=request.headers,
inline_rag_context=inline_rag_context.context_text,
)
# Handle Azure token refresh if needed
if (
responses_params.model.startswith("azure")
and AzureEntraIDManager().is_entra_id_configured
and AzureEntraIDManager().is_token_expired
and AzureEntraIDManager().refresh_token()
):
client = await AsyncLlamaStackClientHolder().update_azure_token()
request_id = get_suid()
# Create context with index identification mapping for RAG source resolution
context = ResponseGeneratorContext(
conversation_id=normalize_conversation_id(responses_params.conversation),
request_id=request_id,
model_id=responses_params.model,
user_id=user_id,
skip_userid_check=_skip_userid_check,
query_request=query_request,
started_at=started_at,
client=client,
moderation_result=moderation_result,
vector_store_ids=extract_vector_store_ids_from_tools(responses_params.tools),
rag_id_mapping=configuration.rag_id_mapping,
inline_rag_context=inline_rag_context,
)
# Update metrics for the LLM call
provider_id, model_id = extract_provider_and_model_from_model_id(
responses_params.model
)
recording.record_llm_call(provider_id, model_id, endpoint_path)
generator, turn_summary = await retrieve_agent_response_generator(
responses_params=responses_params,
context=context,
endpoint_path=endpoint_path,
)
# Combine inline RAG results (BYOK + Solr) with tool-based results
if context.moderation_result.decision == "passed":
turn_summary.referenced_documents = deduplicate_referenced_documents(
inline_rag_context.referenced_documents + turn_summary.referenced_documents
)
response_media_type = (
MEDIA_TYPE_TEXT
if query_request.media_type == MEDIA_TYPE_TEXT
else MEDIA_TYPE_EVENT_STREAM
)
return StreamingResponse(
generate_agent_response(
generator=generator,
context=context,
responses_params=responses_params,
turn_summary=turn_summary,
),
media_type=response_media_type,
)
async def retrieve_response_generator(
responses_params: ResponsesApiParams,
context: ResponseGeneratorContext,
endpoint_path: str,
) -> tuple[AsyncIterator[str], TurnSummary]:
"""
Retrieve the appropriate response generator.
Handles shield moderation check and retrieves response.
Returns the generator (shield violation or response generator) and turn_summary.
Fills turn_summary attributes for token usage, referenced documents, and tool calls.
Args:
responses_params: The Responses API parameters
context: The response generator context
endpoint_path: API endpoint path used for metric labeling.
Returns:
tuple[AsyncIterator[str], TurnSummary]: The response generator and turn summary
"""
turn_summary = TurnSummary()
try:
if context.moderation_result.decision == "blocked":
turn_summary.llm_response = context.moderation_result.message
turn_summary.id = context.moderation_result.moderation_id
await append_turn_items_to_conversation(
context.client,
responses_params.conversation,
responses_params.input,
[context.moderation_result.refusal_response],
)
media_type = context.query_request.media_type or MEDIA_TYPE_JSON
return (
shield_violation_generator(
context.moderation_result.message,
media_type,
),
turn_summary,
)
# Retrieve response stream (may raise exceptions)
response = await context.client.responses.create(
**responses_params.model_dump(exclude_none=True)
)
# Store pre-RAG documents for later merging with tool-based RAG
return (
response_generator(
response,
context,
turn_summary,
endpoint_path,
),
turn_summary,
)
# Handle know LLS client errors only at stream creation time and shield execution
except RuntimeError as e: # library mode wraps 413 into runtime error
if is_context_length_error(str(e)):
error_response = PromptTooLongResponse(model=responses_params.model)
raise HTTPException(**error_response.model_dump()) from e
raise e
except APIConnectionError as e:
error_response = ServiceUnavailableResponse(
backend_name="Llama Stack",
cause=str(e),
)
raise HTTPException(**error_response.model_dump()) from e
except (LLSApiStatusError, OpenAIAPIStatusError) as e:
error_response = handle_known_apistatus_errors(e, responses_params.model)
raise HTTPException(**error_response.model_dump()) from e
async def _background_update_topic_summary(
context: ResponseGeneratorContext,
model: str,
) -> None:
"""Generate topic summary and update DB/cache in the background.
Runs as a fire-and-forget task after an interrupted turn is persisted.
All errors are caught and logged.
"""
try:
topic_summary = await asyncio.wait_for(
get_topic_summary(
context.query_request.query,
context.client,
model,
),
timeout=TOPIC_SUMMARY_INTERRUPT_TIMEOUT_SECONDS,
)
if topic_summary:
update_conversation_topic_summary(
context.conversation_id,
topic_summary,
user_id=context.user_id,
skip_userid_check=context.skip_userid_check,
)
except asyncio.TimeoutError:
logger.warning(
"Topic summary timed out for interrupted turn, request %s",
context.request_id,
)
except Exception: # pylint: disable=broad-except
logger.exception(
"Failed to generate topic summary for interrupted turn, request %s",
context.request_id,
)
async def shutdown_background_topic_summary_tasks() -> None:
"""Cancel and await outstanding background topic summary tasks on shutdown.
Ensures graceful shutdown so in-flight topic summary generation can be
cleaned up. Called from the application lifespan shutdown phase.
"""
tasks = list(_background_topic_summary_tasks)
if not tasks:
return
logger.debug(
"Shutting down %d outstanding background topic summary task(s)",
len(tasks),
)
for task in tasks:
task.cancel()
await asyncio.gather(*tasks, return_exceptions=True)
async def _persist_interrupted_turn(
context: ResponseGeneratorContext,
responses_params: ResponsesApiParams,
turn_summary: TurnSummary,
) -> None:
"""Persist the user query and an interrupted response into the conversation.
Called when a streaming request is cancelled so the exchange is not lost.
Persists immediately with topic_summary=None so the conversation exists
when the client fetches. Topic summary is generated in a background task
and updated when ready.
Parameters:
----------
context: The response generator context.
responses_params: The Responses API parameters.
turn_summary: TurnSummary with llm_response already set to the
interrupted message.
"""
try:
await append_turn_to_conversation(
context.client,
responses_params.conversation,
cast(str, responses_params.input),
INTERRUPTED_RESPONSE_MESSAGE,
)
except Exception: # pylint: disable=broad-except
logger.exception(
"Failed to append interrupted turn to conversation for request %s",
context.request_id,
)
try:
completed_at = datetime.datetime.now(datetime.UTC).strftime(
"%Y-%m-%dT%H:%M:%SZ"
)
store_query_results(
user_id=context.user_id,
conversation_id=context.conversation_id,
model=responses_params.model,
completed_at=completed_at,
started_at=context.started_at,
summary=turn_summary,
query=context.query_request.query,
skip_userid_check=context.skip_userid_check,
topic_summary=None,
)
if (
not context.query_request.conversation_id
and context.query_request.generate_topic_summary
):
task = asyncio.create_task(
_background_update_topic_summary(
context=context,
model=responses_params.model,
)
)
_background_topic_summary_tasks.append(task)
task.add_done_callback(_background_topic_summary_tasks.remove)
except Exception: # pylint: disable=broad-except
logger.exception(
"Failed to store interrupted query results for request %s",
context.request_id,
)
def _register_interrupt_callback(
context: ResponseGeneratorContext,
responses_params: ResponsesApiParams,
turn_summary: TurnSummary,
) -> list[bool]:
"""Build an interrupt callback and register the stream for cancellation.
The callback is invoked by ``cancel_stream`` when the client
interrupts, so persistence runs regardless of where the
``CancelledError`` is raised in the ASGI stack.
A mutable one-element list is used as a shared guard so the
callback and the in-generator ``CancelledError`` handler never
both persist the same turn.
Parameters:
----------
context: The response generator context.
responses_params: The Responses API parameters.
turn_summary: TurnSummary populated during streaming.
Returns:
-------
A mutable list ``[False]`` used as a persist-done guard; the
caller should check ``guard[0]`` before persisting and set
it to ``True`` afterwards.
"""
guard: list[bool] = [False]
async def _on_interrupt() -> None:
if guard[0]:
return
guard[0] = True
turn_summary.llm_response = INTERRUPTED_RESPONSE_MESSAGE
await _persist_interrupted_turn(context, responses_params, turn_summary)
current_task = asyncio.current_task()
if current_task is not None:
get_stream_interrupt_registry().register_stream(
request_id=context.request_id,
user_id=context.user_id,
task=current_task,
on_interrupt=_on_interrupt,
)
else:
logger.warning(
"No current asyncio task for request %s; "
"stream interruption will not be available",
context.request_id,
)
return guard
async def generate_response(
generator: AsyncIterator[str],
context: ResponseGeneratorContext,
responses_params: ResponsesApiParams,
turn_summary: TurnSummary,
) -> AsyncIterator[str]:
"""Wrap a generator with cleanup logic.
Re-yields events from the generator, handles errors, and ensures
persistence and token consumption after completion. When the
stream is interrupted via ``CancelledError``, the user query and
an interrupted response are persisted to the conversation, but
token consumption is skipped (no usage data is available).
Args:
generator: The base generator to wrap
context: The response generator context
responses_params: The Responses API parameters
turn_summary: TurnSummary populated during streaming
Yields:
SSE-formatted strings from the wrapped generator
"""
persist_guard = _register_interrupt_callback(
context, responses_params, turn_summary
)
stream_completed = False
try:
yield stream_start_event(
conversation_id=context.conversation_id,
request_id=context.request_id,
)
# Re-yield all events from the generator
async for event in generator:
yield event
stream_completed = True
# Handle known LLS client errors during response generation time
except RuntimeError as e: # library mode wraps 413 into runtime error
error_response = (
PromptTooLongResponse(model=responses_params.model)
if is_context_length_error(str(e))
else InternalServerErrorResponse.generic()
)
yield stream_http_error_event(error_response, context.query_request.media_type)
except APIConnectionError as e:
error_response = ServiceUnavailableResponse(
backend_name="Llama Stack",
cause=str(e),
)
yield stream_http_error_event(error_response, context.query_request.media_type)
except (LLSApiStatusError, OpenAIAPIStatusError) as e:
error_response = handle_known_apistatus_errors(e, responses_params.model)
yield stream_http_error_event(error_response, context.query_request.media_type)
except asyncio.CancelledError:
logger.info("Streaming request %s interrupted by user", context.request_id)
current_task = asyncio.current_task()
if current_task is not None:
current_task.uncancel()
if not persist_guard[0]:
persist_guard[0] = True
turn_summary.llm_response = INTERRUPTED_RESPONSE_MESSAGE
await _persist_interrupted_turn(context, responses_params, turn_summary)
yield stream_interrupted_event(context.request_id)
finally:
get_stream_interrupt_registry().deregister_stream(context.request_id)
if not stream_completed:
return
# Post-stream side effects: only run when streaming finished successfully
# Get topic summary for new conversations if needed
topic_summary = None
if not context.query_request.conversation_id:
should_generate = context.query_request.generate_topic_summary
if should_generate:
logger.debug("Generating topic summary for new conversation")
topic_summary = await get_topic_summary(
context.query_request.query,
context.client,
responses_params.model,
)
# Consume tokens
logger.info("Consuming tokens")
consume_query_tokens(
user_id=context.user_id,
model_id=responses_params.model,
token_usage=turn_summary.token_usage,
)
# Get available quotas
logger.info("Getting available quotas")
available_quotas = get_available_quotas(
quota_limiters=configuration.quota_limiters, user_id=context.user_id
)
yield stream_end_event(
turn_summary.token_usage,
available_quotas,
turn_summary.referenced_documents,
context.query_request.media_type or MEDIA_TYPE_JSON,
)
completed_at = datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
# Store query results (transcript, conversation details, cache)
logger.info("Storing query results")
store_query_results(
user_id=context.user_id,
conversation_id=context.conversation_id,
model=responses_params.model,
completed_at=completed_at,
started_at=context.started_at,
summary=turn_summary,
query=context.query_request.query,
attachments=context.query_request.attachments,
skip_userid_check=context.skip_userid_check,
topic_summary=topic_summary,
)
async def response_generator( # pylint: disable=too-many-branches,too-many-statements,too-many-locals
turn_response: AsyncIterator[OpenAIResponseObjectStream],
context: ResponseGeneratorContext,
turn_summary: TurnSummary,
endpoint_path: str,
) -> AsyncIterator[str]:
"""Generate SSE formatted streaming response.
Processes streaming chunks from Llama Stack and converts them to
Server-Sent Events (SSE) format. Uses handler functions to process
different event types and populate turn_summary during streaming.
Args:
turn_response: The streaming response from Llama Stack
context: The response generator context
turn_summary: TurnSummary to populate during streaming
endpoint_path: API endpoint path used for metric labeling.
Yields:
SSE-formatted strings for tokens, tool calls, tool results,
turn completion, and error events.
"""
chunk_id = 0
media_type = context.query_request.media_type or MEDIA_TYPE_JSON
text_parts: list[str] = []
mcp_calls: dict[int, tuple[str, str]] = (
{}
) # output_index -> (mcp_call_id, mcp_call_name)
latest_response_object: Optional[OpenAIResponseObject] = None
logger.debug("Starting streaming response (Responses API) processing")
async for chunk in turn_response:
event_type = getattr(chunk, "type", None)
logger.debug("Processing chunk %d, type: %s", chunk_id, event_type)
# Content part started - emit an empty token to kick off UI streaming
if event_type == "response.content_part.added":
yield stream_event(
{
"id": chunk_id,
"token": "",
},
LLM_TOKEN_EVENT,
media_type,
)
chunk_id += 1
# Store MCP call item info for later lookup when arguments.done event occurs
elif event_type == "response.output_item.added":
item_added_chunk = cast(OutputItemAddedChunk, chunk)
if item_added_chunk.item.type == "mcp_call":
mcp_call_item = cast(MCPCall, item_added_chunk.item)
mcp_calls[item_added_chunk.output_index] = (
mcp_call_item.id,
mcp_call_item.name,
)
# Text streaming - emit token delta
elif event_type == "response.output_text.delta":
delta_chunk = cast(TextDeltaChunk, chunk)
text_parts.append(delta_chunk.delta)
yield stream_event(
{
"id": chunk_id,
"token": delta_chunk.delta,
},
LLM_TOKEN_EVENT,
media_type,
)
chunk_id += 1
# Final text of the output (capture, but emit at response.completed)
elif event_type == "response.output_text.done":
text_done_chunk = cast(TextDoneChunk, chunk)
turn_summary.llm_response = text_done_chunk.text
# Emit tool call when MCP call arguments are done
elif event_type == "response.mcp_call.arguments.done":
mcp_arguments_done_chunk = cast(MCPArgsDoneChunk, chunk)
tool_call = build_mcp_tool_call_from_arguments_done(
mcp_arguments_done_chunk.output_index,
mcp_arguments_done_chunk.arguments,
mcp_calls,
)
if tool_call:
turn_summary.tool_calls.append(tool_call)
yield stream_event(
tool_call.model_dump(),
LLM_TOOL_CALL_EVENT,
media_type,
)
# Process tool calls and results when output items are done
# For mcp_call, only emit result (call was already emitted when arguments.done)
# For other types, emit both call and result
elif event_type == "response.output_item.done":
output_item_done_chunk = cast(OutputItemDoneChunk, chunk)
item_type = output_item_done_chunk.item.type
# Skip message items as they are parsed separately
if item_type == "message":
continue
output_index = output_item_done_chunk.output_index
# For mcp_call, only emit result if call was already emitted when arguments.done
# (indicated by output_index not being in mcp_calls dict)
# If output_index is in dict, process in else branch (emit both call and result)
if item_type == "mcp_call" and output_index not in mcp_calls:
# Call was already emitted during arguments.done, only emit result
mcp_call_item = cast(MCPCall, output_item_done_chunk.item)
tool_result = build_tool_result_from_mcp_output_item_done(mcp_call_item)
turn_summary.tool_results.append(tool_result)
yield stream_event(
tool_result.model_dump(),
LLM_TOOL_RESULT_EVENT,
media_type,
)
else:
# For all other types (and mcp_call when arguments.done didn't happen),
# emit both call and result together
tool_call, tool_result = build_tool_call_summary(
output_item_done_chunk.item
)
if tool_call:
turn_summary.tool_calls.append(tool_call)
yield stream_event(
tool_call.model_dump(),
LLM_TOOL_CALL_EVENT,
media_type,
)
if tool_result:
turn_summary.tool_results.append(tool_result)
yield stream_event(
tool_result.model_dump(),
LLM_TOOL_RESULT_EVENT,
media_type,
)
# Completed response - capture final text and response object
elif event_type == "response.completed":
latest_response_object = cast(
OpenAIResponseObject,
getattr(chunk, "response"), # noqa: B009
)
turn_summary.llm_response = turn_summary.llm_response or "".join(text_parts)
yield stream_event(
{
"id": chunk_id,
"token": turn_summary.llm_response,
},
LLM_TURN_COMPLETE_EVENT,
media_type,
)
chunk_id += 1
# Incomplete or failed response - emit error
elif event_type in ("response.incomplete", "response.failed"):
latest_response_object = cast(
OpenAIResponseObject,
getattr(chunk, "response"), # noqa: B009
)
error_message = (
latest_response_object.error.message
if latest_response_object.error
else "An unexpected error occurred while processing the request."
)
error_response = (
PromptTooLongResponse(model=context.model_id)
if is_context_length_error(error_message)
else InternalServerErrorResponse.query_failed(error_message)
)
yield stream_http_error_event(error_response, media_type)
logger.debug(
"Streaming complete - Tool calls: %d, Response chars: %d",
len(turn_summary.tool_calls),
len(turn_summary.llm_response),
)
# Extract token usage and referenced documents from the final response object
if not latest_response_object:
return
turn_summary.token_usage = extract_token_usage(
latest_response_object.usage, context.model_id, endpoint_path
)
# Parse tool-based referenced documents from the final response object
tool_rag_docs = parse_referenced_documents(
latest_response_object,
vector_store_ids=context.vector_store_ids,
rag_id_mapping=context.rag_id_mapping,
)
# Combine inline RAG results (BYOK + Solr) with tool-based results
turn_summary.referenced_documents = deduplicate_referenced_documents(
context.inline_rag_context.referenced_documents + tool_rag_docs
)
tool_rag_chunks = parse_rag_chunks(
latest_response_object,
vector_store_ids=context.vector_store_ids,
rag_id_mapping=context.rag_id_mapping,
)
turn_summary.rag_chunks = context.inline_rag_context.rag_chunks + tool_rag_chunks
def stream_http_error_event(
error: AbstractErrorResponse, media_type: Optional[str] = MEDIA_TYPE_JSON
) -> str:
"""
Create an SSE-formatted error response for generic LLM or API errors.
Args:
error: An AbstractErrorResponse instance representing the error.
media_type: The media type for the response format. Defaults to MEDIA_TYPE_JSON if None.
Returns:
str: A Server-Sent Events (SSE) formatted error message containing
the serialized error details.
"""
logger.error("Error while obtaining answer for user question")
media_type = media_type or MEDIA_TYPE_JSON
if media_type == MEDIA_TYPE_TEXT:
return f"Status: {error.status_code} - {error.detail.response} - {error.detail.cause}"
return format_stream_data(
{
"event": "error",
"data": {
"status_code": error.status_code,
"response": error.detail.response,
"cause": error.detail.cause,
},
}
)
def stream_start_event(conversation_id: str, request_id: str) -> str:
"""Format an SSE start event for a streaming response.
The payload contains both the conversation ID and the request ID
so the client can correlate the stream with a conversation and
use the request ID to issue an interrupt if needed.
Parameters:
----------
conversation_id (str): Unique identifier for the conversation.
request_id (str): Unique SUID for this streaming request,
returned to the client for interrupt support.
Returns:
-------
str: SSE-formatted string representing the start event.
"""
return format_stream_data(
{
"event": "start",
"data": {
"conversation_id": conversation_id,
"request_id": request_id,
},
}
)
def stream_interrupted_event(request_id: str) -> str:
"""Format an SSE event indicating the stream was interrupted.
Emitted to the client just before the generator closes so the
frontend can distinguish an intentional user-initiated interruption
from an unexpected connection drop.
Parameters:
----------
request_id (str): Unique identifier for the interrupted request.
Returns:
-------
str: SSE-formatted string representing the interrupted event.
"""
return format_stream_data(
{
"event": "interrupted",
"data": {
"request_id": request_id,
},
}
)
def stream_end_event(
token_usage: TokenCounter,
available_quotas: dict[str, int],
referenced_documents: list[ReferencedDocument],
media_type: str = MEDIA_TYPE_JSON,
) -> str:
"""
Yield the end of the data stream.
Format and return the end event for a streaming response,
including referenced document metadata and token usage information.
Parameters: