LCORE-1333: Define max chunks to retrieve for RAG (#1765)

are-ces · claude · web-flow · commit 892f376b08cb · 2026-06-01T15:57:11.000+02:00
*LCORE-1333: Add INLINE_RAG_MAX_CHUNKS to cap inline RAG chunks*

Add INLINE_RAG_MAX_CHUNKS constant (default: 10) that caps the final
merged BYOK + OKP output from build_rag_context. Per-source constants
(BYOK_RAG_MAX_CHUNKS, OKP_RAG_MAX_CHUNKS) remain as fetch hints for
the reranking pool. Tool RAG is unaffected.

Also adds test_responses_byok_integration.py covering inline RAG, tool
RAG, combined RAG, score multiplier, chunk capping, and
INLINE_RAG_MAX_CHUNKS enforcement for the /responses endpoint.


Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/docs/byok_guide.md b/docs/byok_guide.md
@@ -85,6 +85,9 @@ Inline RAG additionally supports:
 > `score_multiplier` does not apply to OKP results. To control the amount of retrieved
 > context, set the `BYOK_RAG_MAX_CHUNKS` and `OKP_RAG_MAX_CHUNKS` constants in `src/constants.py`
 > (defaults: 10 and 5 respectively). For Tool RAG, use `TOOL_RAG_MAX_CHUNKS` (default: 10).
+> The `INLINE_RAG_MAX_CHUNKS` constant (value: 10) caps the final merged inline RAG
+> chunks (BYOK + OKP) delivered to the LLM. Tool RAG is controlled independently
+> by `TOOL_RAG_MAX_CHUNKS`.
 
 ---
 
diff --git a/docs/rag_guide.md b/docs/rag_guide.md
@@ -382,11 +382,12 @@ OKP and BYOK scores are not directly comparable (different scoring systems), so
 `score_multiplier` (a BYOK-only concept) does not apply to OKP results. To control
 the number of retrieved chunks, set the constants in `src/constants.py`:
 
-| Constant | Default | Description |
-|----------|---------|-------------|
-| `OKP_RAG_MAX_CHUNKS` | 5 | Max chunks retrieved from OKP (Inline RAG) |
-| `BYOK_RAG_MAX_CHUNKS` | 10 | Max chunks retrieved from BYOK stores (Inline RAG) |
-| `TOOL_RAG_MAX_CHUNKS` | 10 | Max chunks retrieved via Tool RAG (`file_search`) |
+| Constant | Value | Description |
+|----------|-------|-------------|
+| `INLINE_RAG_MAX_CHUNKS` | 10 | Hard upper bound on the final merged inline RAG chunks (BYOK + OKP) delivered to the LLM |
+| `OKP_RAG_MAX_CHUNKS` | 5 | Fetch hint for OKP (Inline RAG); controls how many chunks enter the reranking pool |
+| `BYOK_RAG_MAX_CHUNKS` | 10 | Fetch hint for BYOK stores (Inline RAG); controls how many chunks enter the reranking pool |
+| `TOOL_RAG_MAX_CHUNKS` | 10 | Max chunks retrieved via Tool RAG (`file_search`); independent from `INLINE_RAG_MAX_CHUNKS` |
 
 **Limitations:**
 
diff --git a/src/constants.py b/src/constants.py
@@ -188,6 +188,9 @@
 USER_QUOTA_LIMITER: Final[str] = "user_limiter"
 CLUSTER_QUOTA_LIMITER: Final[str] = "cluster_limiter"
 
+# Hard cap on merged inline RAG chunks (BYOK + OKP) sent to the LLM; Tool RAG is capped separately
+INLINE_RAG_MAX_CHUNKS: Final[int] = 10
+
 # RAG as a tool constants
 DEFAULT_RAG_TOOL: Final[str] = "file_search"
 TOOL_RAG_MAX_CHUNKS: Final[int] = 10  # retrieved from RAG as a tool
diff --git a/src/utils/vector_search.py b/src/utils/vector_search.py
@@ -638,9 +638,9 @@ async def build_rag_context(  # pylint: disable=too-many-locals,too-many-branche
 ) -> RAGContext:
     """Build RAG context by fetching and merging chunks from all enabled sources.
 
-    Fetches 2 * BYOK_RAG_MAX_CHUNKS from each of BYOK and Solr, merges and keeps
-    top 2 * BYOK_RAG_MAX_CHUNKS by score, reranks with a cross-encoder, then
-    keeps the top BYOK_RAG_MAX_CHUNKS for context. Enabled sources can be BYOK
+    Each source fetches using its per-source limit to build the reranking pool.
+    Results are merged, sorted by score, reranked with a cross-encoder if
+    enabled, then capped at INLINE_RAG_MAX_CHUNKS. Enabled sources can be BYOK
     and/or Solr OKP.
 
     Args:
@@ -655,34 +655,32 @@ async def build_rag_context(  # pylint: disable=too-many-locals,too-many-branche
     if moderation_decision == "blocked":
         return RAGContext()
 
-    pool_size = 2 * constants.BYOK_RAG_MAX_CHUNKS
-    top_k = constants.BYOK_RAG_MAX_CHUNKS
+    top_k = constants.INLINE_RAG_MAX_CHUNKS
 
-    # Fetch 2*BYOK_RAG_MAX_CHUNKS from each source in parallel
+    # Fetch from each source using per-source limits for the reranking pool
     byok_chunks_task = _fetch_byok_rag(
-        client, query, vector_store_ids, max_chunks=pool_size
+        client, query, vector_store_ids, max_chunks=constants.BYOK_RAG_MAX_CHUNKS
     )
     solr_chunks_task = _fetch_solr_rag(client, query, solr)
 
     (byok_chunks, byok_documents), (solr_chunks, solr_documents) = await asyncio.gather(
         byok_chunks_task, solr_chunks_task
     )
 
-    # Merge: combine and sort by score, keep top 2*BYOK_RAG_MAX_CHUNKS
+    # Merge: combine and sort by score
     merged = byok_chunks + solr_chunks
     merged.sort(
         key=lambda c: c.score if c.score is not None else float("-inf"), reverse=True
     )
-    merged = merged[:pool_size]
 
-    # Rerank full pool with cross-encoder if enabled; boost BYOK then take top_k
+    # Rerank full pool with cross-encoder if enabled; boost BYOK, then take top_k
     if configuration.reranker.enabled:
         logger.info(
             "Reranker enabled: processing %d chunks with model '%s'",
             len(merged),
             configuration.reranker.model,
         )
-        reranked = await rerank_chunks_with_cross_encoder(query, merged, pool_size)
+        reranked = await rerank_chunks_with_cross_encoder(query, merged, len(merged))
         context_chunks = apply_byok_rerank_boost(reranked)[:top_k]
         logger.info(
             "Reranker completed: returned %d top chunks after BYOK boost",
diff --git a/tests/integration/endpoints/test_query_byok_integration.py b/tests/integration/endpoints/test_query_byok_integration.py
@@ -1066,25 +1066,25 @@ async def _side_effect(**kwargs: Any) -> Any:
 
 
 # ==============================================================================
-# BYOK_RAG_MAX_CHUNKS Capping Tests
+# INLINE_RAG_MAX_CHUNKS Capping Tests
 # ==============================================================================
 
 
 @pytest.mark.asyncio
-async def test_query_byok_max_chunks_caps_retrieved_results(  # pylint: disable=too-many-locals
+async def test_query_rag_content_limit_caps_retrieved_results(  # pylint: disable=too-many-locals
     test_config: AppConfig,
     mocker: MockerFixture,
     test_request: Request,
     test_auth: AuthTuple,
 ) -> None:
-    """Test that BYOK_RAG_MAX_CHUNKS caps the number of returned chunks.
+    """Test that INLINE_RAG_MAX_CHUNKS caps the number of returned chunks.
 
-    A single source returns more chunks than BYOK_RAG_MAX_CHUNKS allows.
-    The response should contain at most BYOK_RAG_MAX_CHUNKS chunks and
+    A single source returns more chunks than INLINE_RAG_MAX_CHUNKS allows.
+    The response should contain at most INLINE_RAG_MAX_CHUNKS chunks and
     they should be the highest-scored ones.
 
     Verifies:
-    - Number of RAG chunks does not exceed BYOK_RAG_MAX_CHUNKS
+    - Number of RAG chunks does not exceed INLINE_RAG_MAX_CHUNKS
     - Returned chunks are the top-scoring ones
     """
     entry = mocker.MagicMock()
@@ -1101,8 +1101,8 @@ async def test_query_byok_max_chunks_caps_retrieved_results(  # pylint: disable=
     mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder")
     mock_client = _build_base_mock_client(mocker)
 
-    # Generate more chunks than BYOK_RAG_MAX_CHUNKS
-    num_chunks = constants.BYOK_RAG_MAX_CHUNKS + 1
+    # Generate more chunks than INLINE_RAG_MAX_CHUNKS
+    num_chunks = constants.INLINE_RAG_MAX_CHUNKS + 1
     chunks_data = [
         (f"Chunk content {i}", f"chunk-{i}", round(0.50 + i * 0.03, 2))
         for i in range(num_chunks)
@@ -1141,7 +1141,7 @@ async def test_query_byok_max_chunks_caps_retrieved_results(  # pylint: disable=
     )
 
     assert response.rag_chunks is not None
-    assert len(response.rag_chunks) == constants.BYOK_RAG_MAX_CHUNKS
+    assert len(response.rag_chunks) == constants.INLINE_RAG_MAX_CHUNKS
 
     # Check that the score is computed properly
     for chunk in response.rag_chunks:
@@ -1161,20 +1161,20 @@ async def test_query_byok_max_chunks_caps_retrieved_results(  # pylint: disable=
 
 
 @pytest.mark.asyncio
-async def test_query_byok_max_chunks_caps_across_multiple_sources(  # pylint: disable=too-many-locals
+async def test_query_rag_content_limit_caps_across_multiple_sources(  # pylint: disable=too-many-locals
     test_config: AppConfig,
     mocker: MockerFixture,
     test_request: Request,
     test_auth: AuthTuple,
 ) -> None:
-    """Test that BYOK_RAG_MAX_CHUNKS caps chunks across multiple sources.
+    """Test that INLINE_RAG_MAX_CHUNKS caps chunks across multiple sources.
 
-    Two sources each return several chunks.  The combined result should
-    not exceed BYOK_RAG_MAX_CHUNKS and should contain the globally
-    highest-scored chunks regardless of source.
+    Two sources each return several chunks. The combined result should not
+    exceed INLINE_RAG_MAX_CHUNKS and should contain the globally highest-scored
+    chunks regardless of source.
 
     Verifies:
-    - Total chunks across sources are capped at BYOK_RAG_MAX_CHUNKS
+    - Total chunks across sources are capped at INLINE_RAG_MAX_CHUNKS
     - Top-scoring chunks from both sources are included
     """
     entry_a = mocker.MagicMock()
@@ -1194,7 +1194,7 @@ async def test_query_byok_max_chunks_caps_across_multiple_sources(  # pylint: di
     mock_client = _build_base_mock_client(mocker)
 
     # Overlapping score bands so top-k must pick from both sources
-    n = constants.BYOK_RAG_MAX_CHUNKS
+    n = constants.INLINE_RAG_MAX_CHUNKS
     resp_a = _make_vector_io_response(
         mocker,
         [
@@ -1246,7 +1246,7 @@ async def _side_effect(**kwargs: Any) -> Any:
     )
 
     assert response.rag_chunks is not None
-    assert len(response.rag_chunks) == constants.BYOK_RAG_MAX_CHUNKS
+    assert len(response.rag_chunks) == constants.INLINE_RAG_MAX_CHUNKS
 
     # Check that the score is computed properly
     for chunk in response.rag_chunks:
@@ -1266,3 +1266,79 @@ async def _side_effect(**kwargs: Any) -> Any:
     chunk_contents = {chunk.content for chunk in response.rag_chunks}
     assert "Source A chunk 0" not in chunk_contents
     assert "Source B chunk 0" not in chunk_contents
+
+
+@pytest.mark.asyncio
+async def test_query_rag_content_limit_caps_inline_rag(  # pylint: disable=too-many-locals
+    test_config: AppConfig,
+    mocker: MockerFixture,
+    test_request: Request,
+    test_auth: AuthTuple,
+) -> None:
+    """Test that INLINE_RAG_MAX_CHUNKS caps inline RAG below BYOK_RAG_MAX_CHUNKS.
+
+    Sets INLINE_RAG_MAX_CHUNKS to 3 (below BYOK_RAG_MAX_CHUNKS=10) and feeds
+    10 chunks. The result should be capped at 3.
+
+    Verifies:
+    - Number of inline RAG chunks equals the lowered INLINE_RAG_MAX_CHUNKS
+    - Returned chunks are the top-scoring ones
+    """
+    mocker.patch("utils.vector_search.constants.INLINE_RAG_MAX_CHUNKS", 3)
+
+    entry = mocker.MagicMock()
+    entry.rag_id = "big-source"
+    entry.vector_db_id = "vs-big-source"
+    entry.score_multiplier = 1.0
+
+    test_config.configuration.byok_rag = [entry]
+    test_config.configuration.rag.inline = ["big-source"]
+    test_config.configuration.reranker.enabled = False
+
+    mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder")
+    mock_client = _build_base_mock_client(mocker)
+
+    num_chunks = constants.BYOK_RAG_MAX_CHUNKS
+    chunks_data = [
+        (f"Chunk content {i}", f"chunk-{i}", round(0.50 + i * 0.03, 2))
+        for i in range(num_chunks)
+    ]
+    mock_client.vector_io.query = mocker.AsyncMock(
+        return_value=_make_vector_io_response(mocker, chunks_data)
+    )
+
+    mock_vs_resp = mocker.MagicMock()
+    mock_vs_resp.data = []
+    mock_client.vector_stores.list.return_value = mock_vs_resp
+
+    mock_holder_class.return_value.get_client.return_value = mock_client
+
+    query_request = QueryRequest(
+        query="test query",
+        conversation_id=None,
+        provider=None,
+        model=None,
+        system_prompt=None,
+        attachments=None,
+        no_tools=False,
+        generate_topic_summary=None,
+        media_type=None,
+        vector_store_ids=None,
+        shield_ids=None,
+        solr=None,
+    )
+
+    response = await query_endpoint_handler(
+        request=test_request,
+        query_request=query_request,
+        auth=test_auth,
+        mcp_headers={},
+    )
+
+    assert response.rag_chunks is not None
+    assert len(response.rag_chunks) == 3
+
+    scores: list[float] = [
+        chunk.score for chunk in response.rag_chunks if chunk.score is not None
+    ]
+    assert scores == sorted(scores, reverse=True)
diff --git a/tests/integration/endpoints/test_responses_byok_integration.py b/tests/integration/endpoints/test_responses_byok_integration.py
diff --git a/tests/integration/endpoints/test_streaming_query_byok_integration.py b/tests/integration/endpoints/test_streaming_query_byok_integration.py