Restored rag_chunks attribute in query response

asimurka · asimurka · commit c8cb85a59e14 · 2026-01-21T13:43:32.000+01:00
diff --git a/docs/openapi.json b/docs/openapi.json
@@ -7648,6 +7648,14 @@
                             "Kubernetes is an open-source container orchestration system for automating ..."
                         ]
                     },
+                    "rag_chunks": {
+                        "items": {
+                            "$ref": "#/components/schemas/RAGChunk"
+                        },
+                        "type": "array",
+                        "title": "Rag Chunks",
+                        "description": "Deprecated: List of RAG chunks used to generate the response."
+                    },
                     "referenced_documents": {
                         "items": {
                             "$ref": "#/components/schemas/ReferencedDocument"
@@ -7711,32 +7719,18 @@
                         ]
                     },
                     "tool_calls": {
-                        "anyOf": [
-                            {
-                                "items": {
-                                    "$ref": "#/components/schemas/ToolCallSummary"
-                                },
-                                "type": "array"
-                            },
-                            {
-                                "type": "null"
-                            }
-                        ],
+                        "items": {
+                            "$ref": "#/components/schemas/ToolCallSummary"
+                        },
+                        "type": "array",
                         "title": "Tool Calls",
                         "description": "List of tool calls made during response generation"
                     },
                     "tool_results": {
-                        "anyOf": [
-                            {
-                                "items": {
-                                    "$ref": "#/components/schemas/ToolResultSummary"
-                                },
-                                "type": "array"
-                            },
-                            {
-                                "type": "null"
-                            }
-                        ],
+                        "items": {
+                            "$ref": "#/components/schemas/ToolResultSummary"
+                        },
+                        "type": "array",
                         "title": "Tool Results",
                         "description": "List of tool results"
                     }
@@ -7746,7 +7740,7 @@
                     "response"
                 ],
                 "title": "QueryResponse",
-                "description": "Model representing LLM response to a query.\n\nAttributes:\n    conversation_id: The optional conversation ID (UUID).\n    response: The response.\n    rag_chunks: List of RAG chunks used to generate the response.\n    referenced_documents: The URLs and titles for the documents used to generate the response.\n    tool_calls: List of tool calls made during response generation.\n    truncated: Whether conversation history was truncated.\n    input_tokens: Number of tokens sent to LLM.\n    output_tokens: Number of tokens received from LLM.\n    available_quotas: Quota available as measured by all configured quota limiters.",
+                "description": "Model representing LLM response to a query.\n\nAttributes:\n    conversation_id: The optional conversation ID (UUID).\n    response: The response.\n    rag_chunks: Deprecated. List of RAG chunks used to generate the response.\n        This information is now available in tool_results under file_search_call type.\n    referenced_documents: The URLs and titles for the documents used to generate the response.\n    tool_calls: List of tool calls made during response generation.\n    tool_results: List of tool results.\n    truncated: Whether conversation history was truncated.\n    input_tokens: Number of tokens sent to LLM.\n    output_tokens: Number of tokens received from LLM.\n    available_quotas: Quota available as measured by all configured quota limiters.",
                 "examples": [
                     {
                         "available_quotas": {
@@ -7979,6 +7973,45 @@
                 "title": "QuotaSchedulerConfiguration",
                 "description": "Quota scheduler configuration."
             },
+            "RAGChunk": {
+                "properties": {
+                    "content": {
+                        "type": "string",
+                        "title": "Content",
+                        "description": "The content of the chunk"
+                    },
+                    "source": {
+                        "anyOf": [
+                            {
+                                "type": "string"
+                            },
+                            {
+                                "type": "null"
+                            }
+                        ],
+                        "title": "Source",
+                        "description": "Source document or URL"
+                    },
+                    "score": {
+                        "anyOf": [
+                            {
+                                "type": "number"
+                            },
+                            {
+                                "type": "null"
+                            }
+                        ],
+                        "title": "Score",
+                        "description": "Relevance score"
+                    }
+                },
+                "type": "object",
+                "required": [
+                    "content"
+                ],
+                "title": "RAGChunk",
+                "description": "Model representing a RAG chunk used in the response."
+            },
             "RAGInfoResponse": {
                 "properties": {
                     "id": {
diff --git a/src/app/endpoints/query.py b/src/app/endpoints/query.py
@@ -441,6 +441,7 @@ async def query_endpoint_handler_base(  # pylint: disable=R0914
             response=summary.llm_response,
             tool_calls=summary.tool_calls,
             tool_results=summary.tool_results,
+            rag_chunks=summary.rag_chunks,
             referenced_documents=referenced_documents,
             truncated=False,  # TODO: implement truncation detection
             input_tokens=token_usage.input_tokens,
diff --git a/src/app/endpoints/query_v2.py b/src/app/endpoints/query_v2.py
@@ -492,7 +492,7 @@ def extract_rag_chunks_from_file_search_item(
     if item.results is not None:
         for result in item.results:
             rag_chunk = RAGChunk(
-                content=result.text, source="file_search", score=result.score
+                content=result.text, source=result.filename, score=result.score
             )
             rag_chunks.append(rag_chunk)
 
diff --git a/src/models/responses.py b/src/models/responses.py
@@ -10,7 +10,7 @@
 
 from quota.quota_exceed_error import QuotaExceedError
 from models.config import Action, Configuration
-from utils.types import ToolCallSummary, ToolResultSummary
+from utils.types import RAGChunk, ToolCallSummary, ToolResultSummary
 
 SUCCESSFUL_RESPONSE_DESCRIPTION = "Successful response"
 BAD_REQUEST_DESCRIPTION = "Invalid request format"
@@ -348,9 +348,11 @@ class QueryResponse(AbstractSuccessfulResponse):
     Attributes:
         conversation_id: The optional conversation ID (UUID).
         response: The response.
-        rag_chunks: List of RAG chunks used to generate the response.
+        rag_chunks: Deprecated. List of RAG chunks used to generate the response.
+            This information is now available in tool_results under file_search_call type.
         referenced_documents: The URLs and titles for the documents used to generate the response.
         tool_calls: List of tool calls made during response generation.
+        tool_results: List of tool results.
         truncated: Whether conversation history was truncated.
         input_tokens: Number of tokens sent to LLM.
         output_tokens: Number of tokens received from LLM.
@@ -370,6 +372,11 @@ class QueryResponse(AbstractSuccessfulResponse):
         ],
     )
 
+    rag_chunks: list[RAGChunk] = Field(
+        default_factory=list,
+        description="Deprecated: List of RAG chunks used to generate the response.",
+    )
+
     referenced_documents: list[ReferencedDocument] = Field(
         default_factory=list,
         description="List of documents referenced in generating the response",
diff --git a/tests/unit/app/endpoints/test_query_v2.py b/tests/unit/app/endpoints/test_query_v2.py
@@ -998,8 +998,8 @@ async def test_retrieve_response_parses_referenced_documents(
     # Verify RAG chunks were extracted from file_search_call results
     assert len(_summary.rag_chunks) == 2
     assert _summary.rag_chunks[0].content == "Sample text from file2.pdf"
-    assert _summary.rag_chunks[0].source == "file_search"
+    assert _summary.rag_chunks[0].source == "file2.pdf"
     assert _summary.rag_chunks[0].score == 0.95
     assert _summary.rag_chunks[1].content == "Sample text from file3.docx"
-    assert _summary.rag_chunks[1].source == "file_search"
+    assert _summary.rag_chunks[1].source == "file3.docx"
     assert _summary.rag_chunks[1].score == 0.85

Original file line number	Diff line number	Diff line change
`@@ -492,7 +492,7 @@ def extract_rag_chunks_from_file_search_item(`
`492`	`492`	`if item.results is not None:`
`493`	`493`	`for result in item.results:`
`494`	`494`	`rag_chunk = RAGChunk(`
`495`		`- content=result.text, source="file_search", score=result.score`
	`495`	`+ content=result.text, source=result.filename, score=result.score`
`496`	`496`	`)`
`497`	`497`	`rag_chunks.append(rag_chunk)`
`498`	`498`