@@ -1066,25 +1066,25 @@ async def _side_effect(**kwargs: Any) -> Any:
10661066
10671067
10681068# ==============================================================================
1069- # BYOK_RAG_MAX_CHUNKS Capping Tests
1069+ # INLINE_RAG_MAX_CHUNKS Capping Tests
10701070# ==============================================================================
10711071
10721072
10731073@pytest .mark .asyncio
1074- async def test_query_byok_max_chunks_caps_retrieved_results ( # pylint: disable=too-many-locals
1074+ async def test_query_rag_content_limit_caps_retrieved_results ( # pylint: disable=too-many-locals
10751075 test_config : AppConfig ,
10761076 mocker : MockerFixture ,
10771077 test_request : Request ,
10781078 test_auth : AuthTuple ,
10791079) -> None :
1080- """Test that BYOK_RAG_MAX_CHUNKS caps the number of returned chunks.
1080+ """Test that INLINE_RAG_MAX_CHUNKS caps the number of returned chunks.
10811081
1082- A single source returns more chunks than BYOK_RAG_MAX_CHUNKS allows.
1083- The response should contain at most BYOK_RAG_MAX_CHUNKS chunks and
1082+ A single source returns more chunks than INLINE_RAG_MAX_CHUNKS allows.
1083+ The response should contain at most INLINE_RAG_MAX_CHUNKS chunks and
10841084 they should be the highest-scored ones.
10851085
10861086 Verifies:
1087- - Number of RAG chunks does not exceed BYOK_RAG_MAX_CHUNKS
1087+ - Number of RAG chunks does not exceed INLINE_RAG_MAX_CHUNKS
10881088 - Returned chunks are the top-scoring ones
10891089 """
10901090 entry = mocker .MagicMock ()
@@ -1101,8 +1101,8 @@ async def test_query_byok_max_chunks_caps_retrieved_results( # pylint: disable=
11011101 mock_holder_class = mocker .patch ("app.endpoints.query.AsyncLlamaStackClientHolder" )
11021102 mock_client = _build_base_mock_client (mocker )
11031103
1104- # Generate more chunks than BYOK_RAG_MAX_CHUNKS
1105- num_chunks = constants .BYOK_RAG_MAX_CHUNKS + 1
1104+ # Generate more chunks than INLINE_RAG_MAX_CHUNKS
1105+ num_chunks = constants .INLINE_RAG_MAX_CHUNKS + 1
11061106 chunks_data = [
11071107 (f"Chunk content { i } " , f"chunk-{ i } " , round (0.50 + i * 0.03 , 2 ))
11081108 for i in range (num_chunks )
@@ -1141,7 +1141,7 @@ async def test_query_byok_max_chunks_caps_retrieved_results( # pylint: disable=
11411141 )
11421142
11431143 assert response .rag_chunks is not None
1144- assert len (response .rag_chunks ) == constants .BYOK_RAG_MAX_CHUNKS
1144+ assert len (response .rag_chunks ) == constants .INLINE_RAG_MAX_CHUNKS
11451145
11461146 # Check that the score is computed properly
11471147 for chunk in response .rag_chunks :
@@ -1161,20 +1161,20 @@ async def test_query_byok_max_chunks_caps_retrieved_results( # pylint: disable=
11611161
11621162
11631163@pytest .mark .asyncio
1164- async def test_query_byok_max_chunks_caps_across_multiple_sources ( # pylint: disable=too-many-locals
1164+ async def test_query_rag_content_limit_caps_across_multiple_sources ( # pylint: disable=too-many-locals
11651165 test_config : AppConfig ,
11661166 mocker : MockerFixture ,
11671167 test_request : Request ,
11681168 test_auth : AuthTuple ,
11691169) -> None :
1170- """Test that BYOK_RAG_MAX_CHUNKS caps chunks across multiple sources.
1170+ """Test that INLINE_RAG_MAX_CHUNKS caps chunks across multiple sources.
11711171
1172- Two sources each return several chunks. The combined result should
1173- not exceed BYOK_RAG_MAX_CHUNKS and should contain the globally
1174- highest-scored chunks regardless of source.
1172+ Two sources each return several chunks. The combined result should not
1173+ exceed INLINE_RAG_MAX_CHUNKS and should contain the globally highest-scored
1174+ chunks regardless of source.
11751175
11761176 Verifies:
1177- - Total chunks across sources are capped at BYOK_RAG_MAX_CHUNKS
1177+ - Total chunks across sources are capped at INLINE_RAG_MAX_CHUNKS
11781178 - Top-scoring chunks from both sources are included
11791179 """
11801180 entry_a = mocker .MagicMock ()
@@ -1194,7 +1194,7 @@ async def test_query_byok_max_chunks_caps_across_multiple_sources( # pylint: di
11941194 mock_client = _build_base_mock_client (mocker )
11951195
11961196 # Overlapping score bands so top-k must pick from both sources
1197- n = constants .BYOK_RAG_MAX_CHUNKS
1197+ n = constants .INLINE_RAG_MAX_CHUNKS
11981198 resp_a = _make_vector_io_response (
11991199 mocker ,
12001200 [
@@ -1246,7 +1246,7 @@ async def _side_effect(**kwargs: Any) -> Any:
12461246 )
12471247
12481248 assert response .rag_chunks is not None
1249- assert len (response .rag_chunks ) == constants .BYOK_RAG_MAX_CHUNKS
1249+ assert len (response .rag_chunks ) == constants .INLINE_RAG_MAX_CHUNKS
12501250
12511251 # Check that the score is computed properly
12521252 for chunk in response .rag_chunks :
@@ -1266,3 +1266,79 @@ async def _side_effect(**kwargs: Any) -> Any:
12661266 chunk_contents = {chunk .content for chunk in response .rag_chunks }
12671267 assert "Source A chunk 0" not in chunk_contents
12681268 assert "Source B chunk 0" not in chunk_contents
1269+
1270+
@pytest.mark.asyncio
async def test_query_rag_content_limit_caps_inline_rag(  # pylint: disable=too-many-locals
    test_config: AppConfig,
    mocker: MockerFixture,
    test_request: Request,
    test_auth: AuthTuple,
) -> None:
    """Test that a lowered INLINE_RAG_MAX_CHUNKS caps inline RAG results.

    Patches INLINE_RAG_MAX_CHUNKS down to a small value and makes a single
    inline source return BYOK_RAG_MAX_CHUNKS chunks whose scores strictly
    increase with their index. The response should contain exactly the
    lowered number of chunks.

    Verifies:
    - Number of inline RAG chunks equals the lowered INLINE_RAG_MAX_CHUNKS
    - Returned chunks are ordered by descending score
    - The lowest-scored chunks are not part of the result
    """
    # Single source of truth for the lowered cap used by the patch and the
    # assertions below.
    inline_cap = 3
    mocker.patch("utils.vector_search.constants.INLINE_RAG_MAX_CHUNKS", inline_cap)

    entry = mocker.MagicMock()
    entry.rag_id = "big-source"
    entry.vector_db_id = "vs-big-source"
    entry.score_multiplier = 1.0

    test_config.configuration.byok_rag = [entry]
    test_config.configuration.rag.inline = ["big-source"]
    test_config.configuration.reranker.enabled = False

    mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder")
    mock_client = _build_base_mock_client(mocker)

    num_chunks = constants.BYOK_RAG_MAX_CHUNKS
    # Precondition: the source must return more chunks than the lowered cap,
    # otherwise this test would pass without exercising the capping at all.
    assert num_chunks > inline_cap

    # Scores strictly increase with the index, so the first entries are the
    # lowest-scored ones.
    chunks_data = [
        (f"Chunk content {i}", f"chunk-{i}", round(0.50 + i * 0.03, 2))
        for i in range(num_chunks)
    ]
    mock_client.vector_io.query = mocker.AsyncMock(
        return_value=_make_vector_io_response(mocker, chunks_data)
    )

    mock_vs_resp = mocker.MagicMock()
    mock_vs_resp.data = []
    mock_client.vector_stores.list.return_value = mock_vs_resp

    mock_holder_class.return_value.get_client.return_value = mock_client

    query_request = QueryRequest(
        query="test query",
        conversation_id=None,
        provider=None,
        model=None,
        system_prompt=None,
        attachments=None,
        no_tools=False,
        generate_topic_summary=None,
        media_type=None,
        vector_store_ids=None,
        shield_ids=None,
        solr=None,
    )

    response = await query_endpoint_handler(
        request=test_request,
        query_request=query_request,
        auth=test_auth,
        mcp_headers={},
    )

    assert response.rag_chunks is not None
    assert len(response.rag_chunks) == inline_cap

    scores: list[float] = [
        chunk.score for chunk in response.rag_chunks if chunk.score is not None
    ]
    assert scores == sorted(scores, reverse=True)

    # Descending order alone does not show that the *top* chunks were kept:
    # also check that every chunk below the cut-off was dropped.
    dropped_contents = {
        content for content, _, _ in chunks_data[: num_chunks - inline_cap]
    }
    returned_contents = {chunk.content for chunk in response.rag_chunks}
    assert returned_contents.isdisjoint(dropped_contents)
0 commit comments