5757 UnauthorizedResponse ,
5858 UnprocessableEntityResponse ,
5959)
60+ from utils .conversations import append_turn_items_to_conversation
6061from utils .endpoints import (
6162 check_configuration_loaded ,
6263 validate_and_retrieve_conversation ,
@@ -189,10 +190,22 @@ async def streaming_query_endpoint_handler( # pylint: disable=too-many-locals
189190
190191 client = AsyncLlamaStackClientHolder ().get_client ()
191192
193+ # Moderation input is the raw user content (query + attachments) without injected RAG
194+ # context, to avoid false positives from retrieved document content.
195+ moderation_input = prepare_input (query_request )
196+ moderation_result = await run_shield_moderation (
197+ client , moderation_input , query_request .shield_ids
198+ )
199+
192200 # Build RAG context from Inline RAG sources
193201 inline_rag_context = await build_rag_context (
194- client , query_request .query , query_request .vector_store_ids , query_request .solr
202+ client ,
203+ moderation_result .decision ,
204+ query_request .query ,
205+ query_request .vector_store_ids ,
206+ query_request .solr ,
195207 )
208+
196209 # Prepare API request parameters
197210 responses_params = await prepare_responses_params (
198211 client = client ,
@@ -203,7 +216,7 @@ async def streaming_query_endpoint_handler( # pylint: disable=too-many-locals
203216 stream = True ,
204217 store = True ,
205218 request_headers = request .headers ,
206- inline_rag_context = inline_rag_context .context_text or None ,
219+ inline_rag_context = inline_rag_context .context_text ,
207220 )
208221
209222 # Handle Azure token refresh if needed
@@ -227,6 +240,7 @@ async def streaming_query_endpoint_handler( # pylint: disable=too-many-locals
227240 query_request = query_request ,
228241 started_at = started_at ,
229242 client = client ,
243+ moderation_result = moderation_result ,
230244 vector_store_ids = extract_vector_store_ids_from_tools (responses_params .tools ),
231245 rag_id_mapping = configuration .rag_id_mapping ,
232246 )
@@ -240,9 +254,15 @@ async def streaming_query_endpoint_handler( # pylint: disable=too-many-locals
240254 generator , turn_summary = await retrieve_response_generator (
241255 responses_params = responses_params ,
242256 context = context ,
243- inline_rag_documents = inline_rag_context .referenced_documents ,
257+ inline_rag_docs = inline_rag_context .referenced_documents ,
244258 )
245259
260+ # Combine inline RAG results (BYOK + Solr) with tool-based results
261+ if context .moderation_result .decision == "passed" :
262+ turn_summary .referenced_documents = deduplicate_referenced_documents (
263+ inline_rag_context .referenced_documents + turn_summary .referenced_documents
264+ )
265+
246266 response_media_type = (
247267 MEDIA_TYPE_TEXT
248268 if query_request .media_type == MEDIA_TYPE_TEXT
@@ -263,7 +283,7 @@ async def streaming_query_endpoint_handler( # pylint: disable=too-many-locals
263283async def retrieve_response_generator (
264284 responses_params : ResponsesApiParams ,
265285 context : ResponseGeneratorContext ,
266- inline_rag_documents : list [ReferencedDocument ],
286+ inline_rag_docs : list [ReferencedDocument ],
267287) -> tuple [AsyncIterator [str ], TurnSummary ]:
268288 """
269289 Retrieve the appropriate response generator.
@@ -275,40 +295,43 @@ async def retrieve_response_generator(
275295 Args:
276296 responses_params: The Responses API parameters
277297 context: The response generator context
278- inline_rag_documents: Referenced documents from inline RAG (BYOK + Solr)
279-
298+ inline_rag_docs: Inline RAG (BYOK + Solr) documents
280299 Returns:
281300 tuple[AsyncIterator[str], TurnSummary]: The response generator and turn summary
282301
283302 """
284303 turn_summary = TurnSummary ()
285304 try :
286- moderation_result = await run_shield_moderation (
287- context .client ,
288- prepare_input (context .query_request ),
289- context .query_request .shield_ids ,
290- )
291- if moderation_result .decision == "blocked" :
292- turn_summary .llm_response = moderation_result .message
293- await append_turn_to_conversation (
305+ if context .moderation_result .decision == "blocked" :
306+ turn_summary .llm_response = context .moderation_result .message
307+ await append_turn_items_to_conversation (
294308 context .client ,
295309 responses_params .conversation ,
296- cast ( str , responses_params .input ) ,
297- moderation_result .message ,
310+ responses_params .input ,
311+ [ context . moderation_result .refusal_response ] ,
298312 )
299313 media_type = context .query_request .media_type or MEDIA_TYPE_JSON
300314 return (
301- shield_violation_generator (moderation_result .message , media_type ),
315+ shield_violation_generator (
316+ context .moderation_result .message ,
317+ media_type ,
318+ ),
302319 turn_summary ,
303320 )
304321 # Retrieve response stream (may raise exceptions)
305322 response = await context .client .responses .create (
306323 ** responses_params .model_dump (exclude_none = True )
307324 )
308325 # Store pre-RAG documents for later merging with tool-based RAG
309- turn_summary .inline_rag_documents = inline_rag_documents
310- return response_generator (response , context , turn_summary ), turn_summary
311-
326+ return (
327+ response_generator (
328+ response ,
329+ context ,
330+ turn_summary ,
331+ inline_rag_docs ,
332+ ),
333+ turn_summary ,
334+ )
312335 # Handle known LLS client errors only at stream creation time and shield execution
313336 except RuntimeError as e : # library mode wraps 413 into runtime error
314337 if "context_length" in str (e ).lower ():
@@ -559,6 +582,7 @@ async def response_generator( # pylint: disable=too-many-branches,too-many-stat
559582 turn_response : AsyncIterator [OpenAIResponseObjectStream ],
560583 context : ResponseGeneratorContext ,
561584 turn_summary : TurnSummary ,
585+ inline_rag_docs : list [ReferencedDocument ],
562586) -> AsyncIterator [str ]:
563587 """Generate SSE formatted streaming response.
564588
@@ -570,7 +594,7 @@ async def response_generator( # pylint: disable=too-many-branches,too-many-stat
570594 turn_response: The streaming response from Llama Stack
571595 context: The response generator context
572596 turn_summary: TurnSummary to populate during streaming
573-
597+ inline_rag_docs: Inline RAG (BYOK + Solr) documents
574598 Yields:
575599 SSE-formatted strings for tokens, tool calls, tool results,
576600 turn completion, and error events.
@@ -741,15 +765,15 @@ async def response_generator( # pylint: disable=too-many-branches,too-many-stat
741765 turn_summary .token_usage = extract_token_usage (
742766 latest_response_object .usage , context .model_id
743767 )
744- tool_based_documents = parse_referenced_documents (
768+ # Parse tool-based referenced documents from the final response object
769+ tool_rag_docs = parse_referenced_documents (
745770 latest_response_object ,
746771 vector_store_ids = context .vector_store_ids ,
747772 rag_id_mapping = context .rag_id_mapping ,
748773 )
749-
750- # Merge pre-RAG documents with tool-based documents and deduplicate
774+ # Combine inline RAG results (BYOK + Solr) with tool-based results
751775 turn_summary .referenced_documents = deduplicate_referenced_documents (
752- turn_summary . inline_rag_documents + tool_based_documents
776+ inline_rag_docs + tool_rag_docs
753777 )
754778
755779
0 commit comments