lightspeed-stack/src/app/endpoints/streaming_query.py at 9bbc2cbbc9fceec47f14d546a6d4156da63d7c2d · jrobertboos/lightspeed-stack · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
"""Streaming query handler using Responses API."""

import asyncio
import datetime
import json

from typing import Annotated, Any, AsyncIterator, Optional, cast

from fastapi import APIRouter, Depends, HTTPException, Request
from fastapi.responses import StreamingResponse
from llama_stack_api.openai_responses import (
    OpenAIResponseObject,
    OpenAIResponseObjectStream,
    OpenAIResponseObjectStreamResponseMcpCallArgumentsDone as MCPArgsDoneChunk,
    OpenAIResponseObjectStreamResponseOutputItemAdded as OutputItemAddedChunk,
    OpenAIResponseObjectStreamResponseOutputItemDone as OutputItemDoneChunk,
    OpenAIResponseObjectStreamResponseOutputTextDelta as TextDeltaChunk,
    OpenAIResponseObjectStreamResponseOutputTextDone as TextDoneChunk,
    OpenAIResponseOutputMessageMCPCall as MCPCall,
)
from llama_stack_client import (
    APIConnectionError,
    APIStatusError as LLSApiStatusError,
)
from openai._exceptions import APIStatusError as OpenAIAPIStatusError
import metrics
from authentication import get_auth_dependency
from authentication.interface import AuthTuple
from authorization.azure_token_manager import AzureEntraIDManager
from authorization.middleware import authorize
from client import AsyncLlamaStackClientHolder
from configuration import configuration
from constants import (
    INTERRUPTED_RESPONSE_MESSAGE,
    LLM_TOKEN_EVENT,
    LLM_TOOL_CALL_EVENT,
    LLM_TOOL_RESULT_EVENT,
    LLM_TURN_COMPLETE_EVENT,
    MEDIA_TYPE_EVENT_STREAM,
    MEDIA_TYPE_JSON,
    MEDIA_TYPE_TEXT,
)
from models.config import Action
from models.context import ResponseGeneratorContext
from models.requests import QueryRequest
from models.responses import (
    AbstractErrorResponse,
    ForbiddenResponse,
    InternalServerErrorResponse,
    NotFoundResponse,
    PromptTooLongResponse,
    QuotaExceededResponse,
    ServiceUnavailableResponse,
    StreamingQueryResponse,
    UnauthorizedResponse,
    UnprocessableEntityResponse,
)
from utils.types import ReferencedDocument
from utils.endpoints import (
    check_configuration_loaded,
    validate_and_retrieve_conversation,
)
from utils.mcp_headers import mcp_headers_dependency, McpHeaders
from utils.query import (
    consume_query_tokens,
    extract_provider_and_model_from_model_id,
    handle_known_apistatus_errors,
    store_query_results,
    update_azure_token,
    validate_attachments_metadata,
    validate_model_provider_override,
)
from utils.quota import check_tokens_available, get_available_quotas
from utils.responses import (
    build_mcp_tool_call_from_arguments_done,
    responses_params_to_request_body,
    build_tool_call_summary,
    build_tool_result_from_mcp_output_item_done,
    deduplicate_referenced_documents,
    extract_token_usage,
    extract_vector_store_ids_from_tools,
    get_topic_summary,
    parse_referenced_documents,
    prepare_responses_params,
)
from utils.shields import (
    append_turn_to_conversation,
    run_shield_moderation,
    validate_shield_ids_override,
)
from utils.stream_interrupts import get_stream_interrupt_registry
from utils.suid import get_suid, normalize_conversation_id
from utils.token_counter import TokenCounter
from utils.types import ResponsesApiParams, TurnSummary
from utils.vector_search import format_rag_context_for_injection, perform_vector_search
from log import get_logger

logger = get_logger(__name__)
router = APIRouter(tags=["streaming_query"])

streaming_query_responses: dict[int | str, dict[str, Any]] = {
    200: StreamingQueryResponse.openapi_response(),
    401: UnauthorizedResponse.openapi_response(
        examples=["missing header", "missing token"]
    ),
    403: ForbiddenResponse.openapi_response(
        examples=["conversation read", "endpoint", "model override"]
    ),
    404: NotFoundResponse.openapi_response(
        examples=["conversation", "model", "provider"]
    ),
    # 413: PromptTooLongResponse.openapi_response(),
    422: UnprocessableEntityResponse.openapi_response(),
    429: QuotaExceededResponse.openapi_response(),
    500: InternalServerErrorResponse.openapi_response(examples=["configuration"]),
    503: ServiceUnavailableResponse.openapi_response(),
}


@router.post(
    "/streaming_query",
    response_class=StreamingResponse,
    responses=streaming_query_responses,
    summary="Streaming Query Endpoint Handler",
)
@authorize(Action.STREAMING_QUERY)
async def streaming_query_endpoint_handler(  # pylint: disable=too-many-locals
    request: Request,
    query_request: QueryRequest,
    auth: Annotated[AuthTuple, Depends(get_auth_dependency())],
    mcp_headers: McpHeaders = Depends(mcp_headers_dependency),
) -> StreamingResponse:
    """
    Handle request to the /streaming_query endpoint using Responses API.

    Returns a streaming response using Server-Sent Events (SSE) format with
    content type text/event-stream.

    Returns:
        SSE-formatted events for the query lifecycle.

    Raises:
        HTTPException:
            - 401: Unauthorized - Missing or invalid credentials
            - 403: Forbidden - Insufficient permissions or model override not allowed
            - 404: Not Found - Conversation, model, or provider not found
            - 413: Prompt too long - Prompt exceeded model's context window size
            - 422: Unprocessable Entity - Request validation failed
            - 429: Quota limit exceeded - The token quota for model or user has been exceeded
            - 500: Internal Server Error - Configuration not loaded or other server errors
            - 503: Service Unavailable - Unable to connect to Llama Stack backend
    """
    check_configuration_loaded(configuration)

    user_id, _user_name, _skip_userid_check, token = auth
    started_at = datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Check token availability
    check_tokens_available(configuration.quota_limiters, user_id)

    # Enforce RBAC: optionally disallow overriding model/provider in requests
    validate_model_provider_override(
        query_request.model, query_request.provider, request.state.authorized_actions
    )

    # Validate shield_ids override if provided
    validate_shield_ids_override(query_request, configuration)

    # Validate attachments if provided
    if query_request.attachments:
        validate_attachments_metadata(query_request.attachments)

    # Retrieve conversation if conversation_id is provided
    user_conversation = None
    if query_request.conversation_id:
        logger.debug(
            "Conversation ID specified in query: %s", query_request.conversation_id
        )
        normalized_conv_id = normalize_conversation_id(query_request.conversation_id)
        user_conversation = validate_and_retrieve_conversation(
            normalized_conv_id=normalized_conv_id,
            user_id=user_id,
            others_allowed=Action.READ_OTHERS_CONVERSATIONS
            in request.state.authorized_actions,
        )

    client = AsyncLlamaStackClientHolder().get_client()

    _, _, doc_ids_from_chunks, pre_rag_chunks = await perform_vector_search(
        client, query_request.query, query_request.solr
    )

    rag_context = format_rag_context_for_injection(pre_rag_chunks)
    if rag_context:
        query_request = query_request.model_copy(deep=True)
        query_request.query = query_request.query + rag_context

    # Prepare API request parameters
    responses_params = await prepare_responses_params(
        client=client,
        query_request=query_request,
        user_conversation=user_conversation,
        token=token,
        mcp_headers=mcp_headers,
        stream=True,
        store=True,
        request_headers=request.headers,
    )

    # Handle Azure token refresh if needed
    if (
        responses_params.model.startswith("azure")
        and AzureEntraIDManager().is_entra_id_configured
        and AzureEntraIDManager().is_token_expired
        and AzureEntraIDManager().refresh_token()
    ):
        client = await update_azure_token(client)

    request_id = get_suid()

    # Create context with index identification mapping for RAG source resolution
    context = ResponseGeneratorContext(
        conversation_id=normalize_conversation_id(responses_params.conversation),
        request_id=request_id,
        model_id=responses_params.model,
        user_id=user_id,
        skip_userid_check=_skip_userid_check,
        query_request=query_request,
        started_at=started_at,
        client=client,
        vector_store_ids=extract_vector_store_ids_from_tools(responses_params.tools),
        rag_id_mapping=configuration.rag_id_mapping,
    )

    # Update metrics for the LLM call
    provider_id, model_id = extract_provider_and_model_from_model_id(
        responses_params.model
    )
    metrics.llm_calls_total.labels(provider_id, model_id).inc()

    generator, turn_summary = await retrieve_response_generator(
        responses_params=responses_params,
        context=context,
        doc_ids_from_chunks=doc_ids_from_chunks,
    )

    response_media_type = (
        MEDIA_TYPE_TEXT
        if query_request.media_type == MEDIA_TYPE_TEXT
        else MEDIA_TYPE_EVENT_STREAM
    )

    return StreamingResponse(
        generate_response(
            generator=generator,
            context=context,
            responses_params=responses_params,
            turn_summary=turn_summary,
        ),
        media_type=response_media_type,
    )


async def retrieve_response_generator(
    responses_params: ResponsesApiParams,
    context: ResponseGeneratorContext,
    doc_ids_from_chunks: list[ReferencedDocument],
) -> tuple[AsyncIterator[str], TurnSummary]:
    """
    Retrieve the appropriate response generator.

    Handles shield moderation check and retrieves response.
    Returns the generator (shield violation or response generator) and turn_summary.
    Fills turn_summary attributes for token usage, referenced documents, and tool calls.

    Args:
        responses_params: The Responses API parameters
        context: The response generator context
        doc_ids_from_chunks: List of ReferencedDocument objects extracted from static RAG

    Returns:
        tuple[AsyncIterator[str], TurnSummary]: The response generator and turn summary

    """
    turn_summary = TurnSummary()
    try:
        moderation_result = await run_shield_moderation(
            context.client,
            cast(str, responses_params.input),
            context.query_request.shield_ids,
        )
        if moderation_result.decision == "blocked":
            turn_summary.llm_response = moderation_result.message
            await append_turn_to_conversation(
                context.client,
                responses_params.conversation,
                cast(str, responses_params.input),
                moderation_result.message,
            )
            media_type = context.query_request.media_type or MEDIA_TYPE_JSON
            return (
                shield_violation_generator(moderation_result.message, media_type),
                turn_summary,
            )
        # Retrieve response stream (may raise exceptions)
        response = await context.client.responses.create(
            **responses_params_to_request_body(responses_params),
        )
        # Store pre-RAG documents for later merging
        turn_summary.pre_rag_documents = doc_ids_from_chunks
        return response_generator(response, context, turn_summary), turn_summary

    # Handle know LLS client errors only at stream creation time and shield execution
    except RuntimeError as e:  # library mode wraps 413 into runtime error
        if "context_length" in str(e).lower():
            error_response = PromptTooLongResponse(model=responses_params.model)
            raise HTTPException(**error_response.model_dump()) from e
        raise e
    except APIConnectionError as e:
        error_response = ServiceUnavailableResponse(
            backend_name="Llama Stack",
            cause=str(e),
        )
        raise HTTPException(**error_response.model_dump()) from e

    except (LLSApiStatusError, OpenAIAPIStatusError) as e:
        error_response = handle_known_apistatus_errors(e, responses_params.model)
        raise HTTPException(**error_response.model_dump()) from e


async def _persist_interrupted_turn(
    context: ResponseGeneratorContext,
    responses_params: ResponsesApiParams,
    turn_summary: TurnSummary,
) -> None:
    """Persist the user query and an interrupted response into the conversation.

    Called when a streaming request is cancelled so the exchange is not lost.
    All errors are caught and logged to avoid masking the original
    cancellation.

    Parameters:
        context: The response generator context.
        responses_params: The Responses API parameters.
        turn_summary: TurnSummary with llm_response already set to the
            interrupted message.
    """
    try:
        await append_turn_to_conversation(
            context.client,
            responses_params.conversation,
            cast(str, responses_params.input),
            INTERRUPTED_RESPONSE_MESSAGE,
        )
    except Exception:  # pylint: disable=broad-except
        logger.exception(
            "Failed to append interrupted turn to conversation for request %s",
            context.request_id,
        )

    try:
        completed_at = datetime.datetime.now(datetime.UTC).strftime(
            "%Y-%m-%dT%H:%M:%SZ"
        )
        store_query_results(
            user_id=context.user_id,
            conversation_id=context.conversation_id,
            model=responses_params.model,
            completed_at=completed_at,
            started_at=context.started_at,
            summary=turn_summary,
            query=context.query_request.query,
            skip_userid_check=context.skip_userid_check,
            topic_summary=None,
        )
    except Exception:  # pylint: disable=broad-except
        logger.exception(
            "Failed to store interrupted query results for request %s",
            context.request_id,
        )


def _register_interrupt_callback(
    context: ResponseGeneratorContext,
    responses_params: ResponsesApiParams,
    turn_summary: TurnSummary,
) -> list[bool]:
    """Build an interrupt callback and register the stream for cancellation.

    The callback is scheduled as a **separate** asyncio task by
    ``cancel_stream`` so it executes regardless of where the
    ``CancelledError`` is raised in the ASGI stack.

    A mutable one-element list is used as a shared guard so the
    callback and the in-generator ``CancelledError`` handler never
    both persist the same turn.

    Parameters:
        context: The response generator context.
        responses_params: The Responses API parameters.
        turn_summary: TurnSummary populated during streaming.

    Returns:
        A mutable list ``[False]`` used as a persist-done guard; the
        caller should check ``guard[0]`` before persisting and set
        it to ``True`` afterwards.
    """
    guard: list[bool] = [False]

    async def _on_interrupt() -> None:
        if guard[0]:
            return
        guard[0] = True
        turn_summary.llm_response = INTERRUPTED_RESPONSE_MESSAGE
        await _persist_interrupted_turn(context, responses_params, turn_summary)

    current_task = asyncio.current_task()
    if current_task is not None:
        get_stream_interrupt_registry().register_stream(
            request_id=context.request_id,
            user_id=context.user_id,
            task=current_task,
            on_interrupt=_on_interrupt,
        )
    else:
        logger.warning(
            "No current asyncio task for request %s; "
            "stream interruption will not be available",
            context.request_id,
        )

    return guard


async def generate_response(  # pylint: disable=too-many-statements
    generator: AsyncIterator[str],
    context: ResponseGeneratorContext,
    responses_params: ResponsesApiParams,
    turn_summary: TurnSummary,
) -> AsyncIterator[str]:
    """Wrap a generator with cleanup logic.

    Re-yields events from the generator, handles errors, and ensures
    persistence and token consumption after completion.  When the
    stream is interrupted via ``CancelledError``, the user query and
    an interrupted response are persisted to the conversation, but
    token consumption is skipped (no usage data is available).

    Args:
        generator: The base generator to wrap
        context: The response generator context
        responses_params: The Responses API parameters
        turn_summary: TurnSummary populated during streaming

    Yields:
        SSE-formatted strings from the wrapped generator
    """
    persist_guard = _register_interrupt_callback(
        context, responses_params, turn_summary
    )

    stream_completed = False
    try:
        yield stream_start_event(
            conversation_id=context.conversation_id,
            request_id=context.request_id,
        )

        # Re-yield all events from the generator
        async for event in generator:
            yield event

        stream_completed = True

    # Handle known LLS client errors during response generation time
    except RuntimeError as e:  # library mode wraps 413 into runtime error
        error_response = (
            PromptTooLongResponse(model=responses_params.model)
            if "context_length" in str(e).lower()
            else InternalServerErrorResponse.generic()
        )
        yield stream_http_error_event(error_response, context.query_request.media_type)
    except APIConnectionError as e:
        error_response = ServiceUnavailableResponse(
            backend_name="Llama Stack",
            cause=str(e),
        )
        yield stream_http_error_event(error_response, context.query_request.media_type)
    except (LLSApiStatusError, OpenAIAPIStatusError) as e:
        error_response = handle_known_apistatus_errors(e, responses_params.model)
        yield stream_http_error_event(error_response, context.query_request.media_type)
    except asyncio.CancelledError:
        logger.info("Streaming request %s interrupted by user", context.request_id)
        current_task = asyncio.current_task()
        if current_task is not None:
            current_task.uncancel()
        if not persist_guard[0]:
            persist_guard[0] = True
            turn_summary.llm_response = INTERRUPTED_RESPONSE_MESSAGE
            await _persist_interrupted_turn(context, responses_params, turn_summary)
        yield stream_interrupted_event(context.request_id)
    finally:
        get_stream_interrupt_registry().deregister_stream(context.request_id)

    if not stream_completed:
        return

    # Post-stream side effects: only run when streaming finished successfully

    # Get topic summary for new conversations if needed. Guard against
    # CancelledError from MCP session cleanup (MCPSessionManager.close_all)
    # so we still yield stream_end_event and complete the ASGI response.
    topic_summary = None
    if not context.query_request.conversation_id:
        should_generate = context.query_request.generate_topic_summary
        if should_generate:
            try:
                logger.debug("Generating topic summary for new conversation")
                topic_summary = await get_topic_summary(
                    context.query_request.query,
                    context.client,
                    responses_params.model,
                )
            except asyncio.CancelledError:
                logger.debug(
                    "Topic summary cancelled (e.g. MCP cleanup); completing without it"
                )
                topic_summary = None

    # Consume tokens
    logger.info("Consuming tokens")
    consume_query_tokens(
        user_id=context.user_id,
        model_id=responses_params.model,
        token_usage=turn_summary.token_usage,
    )
    # Get available quotas
    logger.info("Getting available quotas")
    available_quotas = get_available_quotas(
        quota_limiters=configuration.quota_limiters, user_id=context.user_id
    )

    yield stream_end_event(
        turn_summary.token_usage,
        available_quotas,
        turn_summary.referenced_documents,
        context.query_request.media_type or MEDIA_TYPE_JSON,
    )
    completed_at = datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Store query results (transcript, conversation details, cache)
    logger.info("Storing query results")
    store_query_results(
        user_id=context.user_id,
        conversation_id=context.conversation_id,
        model=responses_params.model,
        completed_at=completed_at,
        started_at=context.started_at,
        summary=turn_summary,
        query=context.query_request.query,
        attachments=context.query_request.attachments,
        skip_userid_check=context.skip_userid_check,
        topic_summary=topic_summary,
    )


async def response_generator(  # pylint: disable=too-many-branches,too-many-statements,too-many-locals
    turn_response: AsyncIterator[OpenAIResponseObjectStream],
    context: ResponseGeneratorContext,
    turn_summary: TurnSummary,
) -> AsyncIterator[str]:
    """Generate SSE formatted streaming response.

    Processes streaming chunks from Llama Stack and converts them to
    Server-Sent Events (SSE) format. Uses handler functions to process
    different event types and populate turn_summary during streaming.

    Args:
        turn_response: The streaming response from Llama Stack
        context: The response generator context
        turn_summary: TurnSummary to populate during streaming

    Yields:
        SSE-formatted strings for tokens, tool calls, tool results,
        turn completion, and error events.
    """
    chunk_id = 0
    media_type = context.query_request.media_type or MEDIA_TYPE_JSON
    text_parts: list[str] = []
    mcp_calls: dict[int, tuple[str, str]] = (
        {}
    )  # output_index -> (mcp_call_id, mcp_call_name)
    latest_response_object: Optional[OpenAIResponseObject] = None

    logger.debug("Starting streaming response (Responses API) processing")

    async for chunk in turn_response:
        event_type = getattr(chunk, "type", None)
        logger.debug("Processing chunk %d, type: %s", chunk_id, event_type)

        # Content part started - emit an empty token to kick off UI streaming
        if event_type == "response.content_part.added":
            yield stream_event(
                {
                    "id": chunk_id,
                    "token": "",
                },
                LLM_TOKEN_EVENT,
                media_type,
            )
            chunk_id += 1

        # Store MCP call item info for later lookup when arguments.done event occurs
        elif event_type == "response.output_item.added":
            item_added_chunk = cast(OutputItemAddedChunk, chunk)
            if item_added_chunk.item.type == "mcp_call":
                mcp_call_item = cast(MCPCall, item_added_chunk.item)
                mcp_calls[item_added_chunk.output_index] = (
                    mcp_call_item.id,
                    mcp_call_item.name,
                )

        # Text streaming - emit token delta
        elif event_type == "response.output_text.delta":
            delta_chunk = cast(TextDeltaChunk, chunk)
            text_parts.append(delta_chunk.delta)
            yield stream_event(
                {
                    "id": chunk_id,
                    "token": delta_chunk.delta,
                },
                LLM_TOKEN_EVENT,
                media_type,
            )
            chunk_id += 1

        # Final text of the output (capture, but emit at response.completed)
        elif event_type == "response.output_text.done":
            text_done_chunk = cast(TextDoneChunk, chunk)
            turn_summary.llm_response = text_done_chunk.text

        # Emit tool call when MCP call arguments are done
        elif event_type == "response.mcp_call.arguments.done":
            mcp_arguments_done_chunk = cast(MCPArgsDoneChunk, chunk)
            tool_call = build_mcp_tool_call_from_arguments_done(
                mcp_arguments_done_chunk.output_index,
                mcp_arguments_done_chunk.arguments,
                mcp_calls,
            )
            if tool_call:
                turn_summary.tool_calls.append(tool_call)
                yield stream_event(
                    tool_call.model_dump(),
                    LLM_TOOL_CALL_EVENT,
                    media_type,
                )

        # Process tool calls and results when output items are done
        # For mcp_call, only emit result (call was already emitted when arguments.done)
        # For other types, emit both call and result
        elif event_type == "response.output_item.done":
            output_item_done_chunk = cast(OutputItemDoneChunk, chunk)
            item_type = output_item_done_chunk.item.type
            # Skip message items as they are parsed separately
            if item_type == "message":
                continue

            output_index = output_item_done_chunk.output_index

            # For mcp_call, only emit result if call was already emitted when arguments.done
            # (indicated by output_index not being in mcp_calls dict)
            # If output_index is in dict, process in else branch (emit both call and result)
            if item_type == "mcp_call" and output_index not in mcp_calls:
                # Call was already emitted during arguments.done, only emit result
                mcp_call_item = cast(MCPCall, output_item_done_chunk.item)
                tool_result = build_tool_result_from_mcp_output_item_done(mcp_call_item)
                turn_summary.tool_results.append(tool_result)
                yield stream_event(
                    tool_result.model_dump(),
                    LLM_TOOL_RESULT_EVENT,
                    media_type,
                )
            else:
                # For all other types (and mcp_call when arguments.done didn't happen),
                # emit both call and result together
                tool_call, tool_result = build_tool_call_summary(
                    output_item_done_chunk.item,
                    turn_summary.rag_chunks,
                    vector_store_ids=context.vector_store_ids,
                    rag_id_mapping=context.rag_id_mapping,
                )
                if tool_call:
                    turn_summary.tool_calls.append(tool_call)
                    yield stream_event(
                        tool_call.model_dump(),
                        LLM_TOOL_CALL_EVENT,
                        media_type,
                    )
                if tool_result:
                    turn_summary.tool_results.append(tool_result)
                    yield stream_event(
                        tool_result.model_dump(),
                        LLM_TOOL_RESULT_EVENT,
                        media_type,
                    )

        # Completed response - capture final text and response object
        elif event_type == "response.completed":
            latest_response_object = cast(
                OpenAIResponseObject, getattr(chunk, "response")
            )
            turn_summary.llm_response = turn_summary.llm_response or "".join(text_parts)
            yield stream_event(
                {
                    "id": chunk_id,
                    "token": turn_summary.llm_response,
                },
                LLM_TURN_COMPLETE_EVENT,
                media_type,
            )
            chunk_id += 1

        # Incomplete or failed response - emit error
        elif event_type in ("response.incomplete", "response.failed"):
            latest_response_object = cast(
                OpenAIResponseObject, getattr(chunk, "response")
            )
            error_message = (
                latest_response_object.error.message
                if latest_response_object.error
                else "An unexpected error occurred while processing the request."
            )
            error_response = (
                PromptTooLongResponse(model=context.model_id)
                if "context_length" in error_message.lower()
                else InternalServerErrorResponse.query_failed(error_message)
            )
            yield stream_http_error_event(error_response, media_type)

    logger.debug(
        "Streaming complete - Tool calls: %d, Response chars: %d",
        len(turn_summary.tool_calls),
        len(turn_summary.llm_response),
    )

    # Extract token usage and referenced documents from the final response object
    if not latest_response_object:
        return

    turn_summary.token_usage = extract_token_usage(
        latest_response_object.usage, context.model_id
    )
    tool_based_documents = parse_referenced_documents(
        latest_response_object,
        vector_store_ids=context.vector_store_ids,
        rag_id_mapping=context.rag_id_mapping,
    )

    turn_summary.referenced_documents = deduplicate_referenced_documents(
        tool_based_documents + turn_summary.pre_rag_documents
    )


def stream_http_error_event(
    error: AbstractErrorResponse, media_type: str | None = MEDIA_TYPE_JSON
) -> str:
    """
    Create an SSE-formatted error response for generic LLM or API errors.

    Args:
        error: An AbstractErrorResponse instance representing the error.
        media_type: The media type for the response format. Defaults to MEDIA_TYPE_JSON if None.
    Returns:
        str: A Server-Sent Events (SSE) formatted error message containing
            the serialized error details.
    """
    logger.error("Error while obtaining answer for user question")
    media_type = media_type or MEDIA_TYPE_JSON
    if media_type == MEDIA_TYPE_TEXT:
        return f"Status: {error.status_code} - {error.detail.response} - {error.detail.cause}"

    return format_stream_data(
        {
            "event": "error",
            "data": {
                "status_code": error.status_code,
                "response": error.detail.response,
                "cause": error.detail.cause,
            },
        }
    )


def format_stream_data(d: dict) -> str:
    """
    Create a response generator function for Responses API streaming.

    Parameters:
        d (dict): The data to be formatted as an SSE event.

    Returns:
        str: The formatted SSE data string.
    """
    data = json.dumps(d)
    return f"data: {data}\n\n"


def stream_start_event(conversation_id: str, request_id: str) -> str:
    """Format an SSE start event for a streaming response.

    The payload contains both the conversation ID and the request ID
    so the client can correlate the stream with a conversation and
    use the request ID to issue an interrupt if needed.

    Parameters:
        conversation_id (str): Unique identifier for the conversation.
        request_id (str): Unique SUID for this streaming request,
            returned to the client for interrupt support.

    Returns:
        str: SSE-formatted string representing the start event.
    """
    return format_stream_data(
        {
            "event": "start",
            "data": {
                "conversation_id": conversation_id,
                "request_id": request_id,
            },
        }
    )


def stream_interrupted_event(request_id: str) -> str:
    """Format an SSE event indicating the stream was interrupted.

    Emitted to the client just before the generator closes so the
    frontend can distinguish an intentional user-initiated interruption
    from an unexpected connection drop.

    Parameters:
        request_id (str): Unique identifier for the interrupted request.

    Returns:
        str: SSE-formatted string representing the interrupted event.
    """
    return format_stream_data(
        {
            "event": "interrupted",
            "data": {
                "request_id": request_id,
            },
        }
    )


def stream_end_event(
    token_usage: TokenCounter,
    available_quotas: dict[str, int],
    referenced_documents: list[ReferencedDocument],
    media_type: str = MEDIA_TYPE_JSON,
) -> str:
    """
    Yield the end of the data stream.

    Format and return the end event for a streaming response,
    including referenced document metadata and token usage information.

    Parameters:
        token_usage (TokenCounter): Token usage information.
        available_quotas (dict[str, int]): Available quotas for the user.
        referenced_documents (list[ReferencedDocument]): List of referenced documents.
        media_type (str): The media type for the response format.

    Returns:
        str: A Server-Sent Events (SSE) formatted string
        representing the end of the data stream.
    """
    if media_type == MEDIA_TYPE_TEXT:
        ref_docs_string = "\n".join(
            f"{doc.doc_title}: {doc.doc_url}"
            for doc in referenced_documents
            if doc.doc_url and doc.doc_title
        )
        return f"\n\n---\n\n{ref_docs_string}" if ref_docs_string else ""

    referenced_docs_dict = [doc.model_dump(mode="json") for doc in referenced_documents]

    return format_stream_data(
        {
            "event": "end",
            "data": {
                "referenced_documents": referenced_docs_dict,
                "truncated": None,
                "input_tokens": token_usage.input_tokens,
                "output_tokens": token_usage.output_tokens,
            },
            "available_quotas": available_quotas,
        }
    )


def stream_event(data: dict, event_type: str, media_type: str) -> str:
    """Build an item to yield based on media type.

    Args:
        data: Dictionary containing the event data
        event_type: Type of event (token, tool call, etc.)
        media_type: The media type for the response format

    Returns:
        SSE-formatted string representing the event
    """
    if media_type == MEDIA_TYPE_TEXT:
        if event_type == LLM_TOKEN_EVENT:
            return data.get("token", "")
        if event_type == LLM_TOOL_CALL_EVENT:
            return f"[Tool Call: {data.get('function_name', 'unknown')}]\n"
        if event_type == LLM_TOOL_RESULT_EVENT:
            return "[Tool Result]\n"
        if event_type == LLM_TURN_COMPLETE_EVENT:
            return ""
        return ""

    return format_stream_data(
        {
            "event": event_type,
            "data": data,
        }
    )


async def shield_violation_generator(
    violation_message: str,
    media_type: str = MEDIA_TYPE_TEXT,
) -> AsyncIterator[str]:
    """
    Create an SSE stream for shield violation responses.

    Yields start, token, and end events immediately for shield violations.
    This function creates a minimal streaming response without going through
    the Llama Stack response format.

    Args:
        violation_message: The violation message to display.
        media_type: The media type for the response format.

    Yields:
        str: SSE-formatted strings for start, token, and end events.
    """
    yield stream_event(
        {
            "id": 0,
            "token": violation_message,
        },
        LLM_TOKEN_EVENT,
        media_type,
    )