Re-apply lost parts of implementation after rebase

max-svistunov · max-svistunov · commit 3ed3cacda3a8 · 2026-02-16T15:21:28.000+01:00
diff --git a/src/app/endpoints/query.py b/src/app/endpoints/query.py
@@ -4,7 +4,7 @@
 
 import datetime
 import logging
-from typing import Annotated, Any, cast
+from typing import Annotated, Any, Optional, cast
 
 from fastapi import APIRouter, Depends, HTTPException, Request
 from llama_stack_api.openai_responses import OpenAIResponseObject
@@ -56,6 +56,7 @@
     build_tool_call_summary,
     extract_text_from_response_output_item,
     extract_token_usage,
+    extract_vector_store_ids_from_tools,
     get_topic_summary,
     parse_referenced_documents,
     prepare_responses_params,
@@ -184,8 +185,14 @@ async def query_endpoint_handler(
     ):
         client = await update_azure_token(client)
 
+    # Gather vector store IDs and the rag_id mapping for RAG source resolution
+    vector_store_ids = extract_vector_store_ids_from_tools(responses_params.tools)
+    rag_id_mapping = configuration.rag_id_mapping
+
     # Retrieve response using Responses API
-    turn_summary = await retrieve_response(client, responses_params)
+    turn_summary = await retrieve_response(
+        client, responses_params, vector_store_ids, rag_id_mapping
+    )
 
     if pre_rag_chunks:
         turn_summary.rag_chunks = pre_rag_chunks + (turn_summary.rag_chunks or [])
@@ -269,6 +276,8 @@ def parse_referenced_docs(
 async def retrieve_response(  # pylint: disable=too-many-locals
     client: AsyncLlamaStackClient,
     responses_params: ResponsesApiParams,
+    vector_store_ids: Optional[list[str]] = None,
+    rag_id_mapping: Optional[dict[str, str]] = None,
 ) -> TurnSummary:
     """
     Retrieve response from LLMs and agents.
@@ -279,6 +288,8 @@ async def retrieve_response(  # pylint: disable=too-many-locals
     Parameters:
         client: The AsyncLlamaStackClient to use for the request.
         responses_params: The Responses API parameters.
+        vector_store_ids: Vector store IDs used in the query for source resolution.
+        rag_id_mapping: Mapping from vector_db_id to user-facing rag_id.
 
     Returns:
         TurnSummary: Summary of the LLM response content
@@ -323,7 +334,7 @@ async def retrieve_response(  # pylint: disable=too-many-locals
             summary.llm_response += message_text
 
         tool_call, tool_result = build_tool_call_summary(
-            output_item, summary.rag_chunks
+            output_item, summary.rag_chunks, vector_store_ids, rag_id_mapping
         )
         if tool_call:
             summary.tool_calls.append(tool_call)
@@ -337,7 +348,9 @@ async def retrieve_response(  # pylint: disable=too-many-locals
     )
 
     # Extract referenced documents and token usage from Responses API response
-    summary.referenced_documents = parse_referenced_documents(response)
+    summary.referenced_documents = parse_referenced_documents(
+        response, vector_store_ids, rag_id_mapping
+    )
     summary.token_usage = extract_token_usage(response, responses_params.model)
 
     return summary
diff --git a/src/app/endpoints/streaming_query.py b/src/app/endpoints/streaming_query.py
@@ -74,6 +74,7 @@
     build_tool_call_summary,
     build_tool_result_from_mcp_output_item_done,
     extract_token_usage,
+    extract_vector_store_ids_from_tools,
     get_topic_summary,
     parse_referenced_documents,
     prepare_responses_params,
@@ -204,7 +205,7 @@ async def streaming_query_endpoint_handler(  # pylint: disable=too-many-locals
     ):
         client = await update_azure_token(client)
 
-    # Create context
+    # Create context with index identification mapping for RAG source resolution
     context = ResponseGeneratorContext(
         conversation_id=normalize_conversation_id(responses_params.conversation),
         model_id=responses_params.model,
@@ -213,6 +214,8 @@ async def streaming_query_endpoint_handler(  # pylint: disable=too-many-locals
         query_request=query_request,
         started_at=started_at,
         client=client,
+        vector_store_ids=extract_vector_store_ids_from_tools(responses_params.tools),
+        rag_id_mapping=configuration.rag_id_mapping,
     )
 
     # Update metrics for the LLM call
@@ -527,7 +530,10 @@ async def response_generator(  # pylint: disable=too-many-branches,too-many-stat
                 # For all other types (and mcp_call when arguments.done didn't happen),
                 # emit both call and result together
                 tool_call, tool_result = build_tool_call_summary(
-                    output_item_done_chunk.item, turn_summary.rag_chunks
+                    output_item_done_chunk.item,
+                    turn_summary.rag_chunks,
+                    context.vector_store_ids,
+                    context.rag_id_mapping,
                 )
                 if tool_call:
                     turn_summary.tool_calls.append(tool_call)
@@ -587,7 +593,11 @@ async def response_generator(  # pylint: disable=too-many-branches,too-many-stat
     turn_summary.token_usage = extract_token_usage(
         latest_response_object, context.model_id
     )
-    tool_based_documents = parse_referenced_documents(latest_response_object)
+    tool_based_documents = parse_referenced_documents(
+        latest_response_object,
+        vector_store_ids=context.vector_store_ids,
+        rag_id_mapping=context.rag_id_mapping,
+    )
 
     # Merge pre-RAG documents with tool-based documents (similar to query.py)
     if turn_summary.pre_rag_documents:
diff --git a/src/utils/responses.py b/src/utils/responses.py
@@ -292,6 +292,25 @@ async def prepare_responses_params(  # pylint: disable=too-many-arguments,too-ma
     )
 
 
+def extract_vector_store_ids_from_tools(
+    tools: Optional[list[dict[str, Any]]],
+) -> list[str]:
+    """Extract vector store IDs from prepared tool configurations.
+
+    Parameters:
+        tools: The prepared tools list from ResponsesApiParams.
+
+    Returns:
+        Vector store IDs of the first file_search tool found, or an empty list.
+    """
+    if not tools:
+        return []
+    for tool in tools:
+        if tool.get("type") == "file_search":
+            return tool.get("vector_store_ids", [])
+    return []
+
+
 def get_rag_tools(vector_store_ids: list[str]) -> Optional[list[dict[str, Any]]]:
     """Convert vector store IDs to tools format for Responses API.
 
@@ -390,14 +409,18 @@ def _get_token_value(original: str, header: str) -> str | None:
 
 def parse_referenced_documents(
     response: Optional[OpenAIResponseObject],
+    vector_store_ids: Optional[list[str]] = None,
+    rag_id_mapping: Optional[dict[str, str]] = None,
 ) -> list[ReferencedDocument]:
     """Parse referenced documents from Responses API response.
 
     Args:
         response: The OpenAI Response API response object
+        vector_store_ids: Vector store IDs used in the query for source resolution.
+        rag_id_mapping: Mapping from vector_db_id to user-facing rag_id.
 
     Returns:
-        List of referenced documents with doc_url and doc_title
+        List of referenced documents with doc_url, doc_title, and source
     """
     documents: list[ReferencedDocument] = []
     # Use a set to track unique documents by (doc_url, doc_title) tuple
@@ -407,6 +430,10 @@ def parse_referenced_documents(
     if response is None or not response.output:
         return documents
 
+    resolved_source = _resolve_single_store_source(
+        vector_store_ids or [], rag_id_mapping or {}
+    )
+
     for output_item in response.output:
         item_type = getattr(output_item, "type", None)
 
@@ -434,13 +461,36 @@ def parse_referenced_documents(
                     final_url = doc_url if doc_url else None
                     if (final_url, doc_title) not in seen_docs:
                         documents.append(
-                            ReferencedDocument(doc_url=final_url, doc_title=doc_title)
+                            ReferencedDocument(
+                                doc_url=final_url,
+                                doc_title=doc_title,
+                                source=resolved_source,
+                            )
                         )
                         seen_docs.add((final_url, doc_title))
 
     return documents
 
 
+def _resolve_single_store_source(
+    vector_store_ids: list[str],
+    rag_id_mapping: dict[str, str],
+) -> Optional[str]:
+    """Resolve source name when there is exactly one vector store.
+
+    Parameters:
+        vector_store_ids: The vector store IDs used in the query.
+        rag_id_mapping: Mapping from vector_db_id to user-facing rag_id.
+
+    Returns:
+        The mapped rag_id when exactly one store is used and mapped, else None.
+    """
+    if len(vector_store_ids) == 1:
+        store_id = vector_store_ids[0]
+        return rag_id_mapping.get(store_id)
+    return None
+
+
 def extract_token_usage(
     response: Optional[OpenAIResponseObject], model_id: str
 ) -> TokenCounter:
@@ -522,15 +572,19 @@ def extract_token_usage(
     return token_counter
 
 
-def build_tool_call_summary(  # pylint: disable=too-many-return-statements,too-many-branches
+def build_tool_call_summary(  # pylint: disable=too-many-return-statements,too-many-branches,too-many-locals
     output_item: OpenAIResponseOutput,
     rag_chunks: list[RAGChunk],
+    vector_store_ids: Optional[list[str]] = None,
+    rag_id_mapping: Optional[dict[str, str]] = None,
 ) -> tuple[Optional[ToolCallSummary], Optional[ToolResultSummary]]:
     """Translate Responses API tool outputs into ToolCallSummary and ToolResultSummary.
 
     Args:
         output_item: An OpenAIResponseOutput item from the response.output array
         rag_chunks: List to append extracted RAG chunks to (from file_search_call items)
+        vector_store_ids: Vector store IDs used in the query for source resolution.
+        rag_id_mapping: Mapping from vector_db_id to user-facing rag_id.
 
     Returns:
         Tuple of (ToolCallSummary, ToolResultSummary), one may be None
@@ -551,7 +605,9 @@ def build_tool_call_summary(  # pylint: disable=too-many-return-statements,too-m
 
     if item_type == "file_search_call":
         file_search_item = cast(FileSearchCall, output_item)
-        extract_rag_chunks_from_file_search_item(file_search_item, rag_chunks)
+        extract_rag_chunks_from_file_search_item(
+            file_search_item, rag_chunks, vector_store_ids, rag_id_mapping
+        )
         response_payload: Optional[dict[str, Any]] = None
         if file_search_item.results is not None:
             response_payload = {
@@ -731,20 +787,79 @@ def build_tool_result_from_mcp_output_item_done(
     )
 
 
+def _resolve_source_for_result(
+    result: Any,
+    vector_store_ids: list[str],
+    rag_id_mapping: dict[str, str],
+) -> Optional[str]:
+    """Resolve the human-friendly index name for a file search result.
+
+    Uses the vector store mapping to convert internal llama-stack IDs
+    to user-facing rag_ids from configuration.
+
+    Parameters:
+        result: A file search result object with optional attributes.
+        vector_store_ids: The vector store IDs used in this query.
+        rag_id_mapping: Mapping from vector_db_id to user-facing rag_id.
+
+    Returns:
+        The resolved index name, or the result's filename if no mapping applies.
+    """
+    if len(vector_store_ids) == 1:
+        store_id = vector_store_ids[0]
+        return rag_id_mapping.get(store_id, result.filename)
+
+    if len(vector_store_ids) > 1:
+        attributes = getattr(result, "attributes", {}) or {}
+        attr_store_id: Optional[str] = attributes.get("vector_store_id")
+        if attr_store_id and attr_store_id in rag_id_mapping:
+            return rag_id_mapping[attr_store_id]
+
+    return result.filename
+
+
+def _build_chunk_attributes(result: Any) -> Optional[dict[str, Any]]:
+    """Extract document metadata attributes from a file search result.
+
+    Parameters:
+        result: A file search result object with optional attributes.
+
+    Returns:
+        The attributes dict, or None if it is missing, empty, or not a dict.
+    """
+    attributes = getattr(result, "attributes", None)
+    if not attributes:
+        return None
+    if isinstance(attributes, dict):
+        return attributes if attributes else None
+    return None
+
+
 def extract_rag_chunks_from_file_search_item(
     item: FileSearchCall,
     rag_chunks: list[RAGChunk],
+    vector_store_ids: Optional[list[str]] = None,
+    rag_id_mapping: Optional[dict[str, str]] = None,
 ) -> None:
     """Extract RAG chunks from a file search tool call item.
 
     Args:
         item: The file search tool call item
         rag_chunks: List to append extracted RAG chunks to
+        vector_store_ids: Vector store IDs used in the query for source resolution.
+        rag_id_mapping: Mapping from vector_db_id to user-facing rag_id.
     """
     if item.results is not None:
         for result in item.results:
+            source = _resolve_source_for_result(
+                result, vector_store_ids or [], rag_id_mapping or {}
+            )
+            attributes = _build_chunk_attributes(result)
             rag_chunk = RAGChunk(
-                content=result.text, source=result.filename, score=result.score
+                content=result.text,
+                source=source,
+                score=result.score,
+                attributes=attributes,
             )
             rag_chunks.append(rag_chunk)
 
diff --git a/src/utils/types.py b/src/utils/types.py
@@ -181,6 +181,11 @@ class ReferencedDocument(BaseModel):
         None, description="Title of the referenced document"
     )
 
+    source: Optional[str] = Field(
+        default=None,
+        description="Index name identifying the knowledge source from configuration",
+    )
+
 
 class TurnSummary(BaseModel):
     """Summary of a turn in llama stack."""
diff --git a/tests/unit/app/endpoints/test_query.py b/tests/unit/app/endpoints/test_query.py
@@ -125,6 +125,7 @@ async def test_successful_query_no_conversation(
         mock_responses_params = mocker.Mock(spec=ResponsesApiParams)
         mock_responses_params.model = "provider1/model1"
         mock_responses_params.conversation = "conv_123"
+        mock_responses_params.tools = None
         mock_responses_params.model_dump.return_value = {
             "input": "test",
             "model": "provider1/model1",
@@ -200,6 +201,7 @@ async def test_successful_query_with_conversation(
         mock_responses_params = mocker.Mock(spec=ResponsesApiParams)
         mock_responses_params.model = "provider1/model1"
         mock_responses_params.conversation = "conv_123"
+        mock_responses_params.tools = None
         mock_responses_params.model_dump.return_value = {
             "input": "test",
             "model": "provider1/model1",
@@ -273,6 +275,7 @@ async def test_query_with_attachments(
         mock_responses_params = mocker.Mock(spec=ResponsesApiParams)
         mock_responses_params.model = "provider1/model1"
         mock_responses_params.conversation = "conv_123"
+        mock_responses_params.tools = None
         mock_responses_params.model_dump.return_value = {
             "input": "test",
             "model": "provider1/model1",
@@ -332,6 +335,7 @@ async def test_query_with_topic_summary(
         mock_responses_params = mocker.Mock(spec=ResponsesApiParams)
         mock_responses_params.model = "provider1/model1"
         mock_responses_params.conversation = "conv_123"
+        mock_responses_params.tools = None
         mock_responses_params.model_dump.return_value = {
             "input": "test",
             "model": "provider1/model1",
@@ -401,6 +405,7 @@ async def test_query_azure_token_refresh(
         mock_responses_params = mocker.Mock(spec=ResponsesApiParams)
         mock_responses_params.model = "azure/model1"
         mock_responses_params.conversation = "conv_123"
+        mock_responses_params.tools = None
         mock_responses_params.model_dump.return_value = {
             "input": "test",
             "model": "azure/model1",
diff --git a/tests/unit/app/endpoints/test_streaming_query.py b/tests/unit/app/endpoints/test_streaming_query.py
diff --git a/tests/unit/utils/test_responses.py b/tests/unit/utils/test_responses.py