Skip to content

Commit 892f376

Browse files
are-ces and claude
authored
LCORE-1333: Define max chunks to retrieve for RAG (#1765)
*LCORE-1333: Add INLINE_RAG_MAX_CHUNKS to cap inline RAG chunks* Add INLINE_RAG_MAX_CHUNKS constant (default: 10) that caps the final merged BYOK + OKP output from build_rag_context. Per-source constants (BYOK_RAG_MAX_CHUNKS, OKP_RAG_MAX_CHUNKS) remain as fetch hints for the reranking pool. Tool RAG is unaffected. Also adds test_responses_byok_integration.py covering inline RAG, tool RAG, combined RAG, score multiplier, chunk capping, and INLINE_RAG_MAX_CHUNKS enforcement for the /responses endpoint. Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 0dd4ca4 commit 892f376

7 files changed

Lines changed: 916 additions & 50 deletions

File tree

docs/byok_guide.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,9 @@ Inline RAG additionally supports:
8585
> `score_multiplier` does not apply to OKP results. To control the amount of retrieved
8686
> context, set the `BYOK_RAG_MAX_CHUNKS` and `OKP_RAG_MAX_CHUNKS` constants in `src/constants.py`
8787
> (defaults: 10 and 5 respectively). For Tool RAG, use `TOOL_RAG_MAX_CHUNKS` (default: 10).
88+
> The `INLINE_RAG_MAX_CHUNKS` constant (value: 10) caps the final merged inline RAG
89+
> chunks (BYOK + OKP) delivered to the LLM. Tool RAG is controlled independently
90+
> by `TOOL_RAG_MAX_CHUNKS`.
8891
8992
---
9093

docs/rag_guide.md

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -382,11 +382,12 @@ OKP and BYOK scores are not directly comparable (different scoring systems), so
382382
`score_multiplier` (a BYOK-only concept) does not apply to OKP results. To control
383383
the number of retrieved chunks, set the constants in `src/constants.py`:
384384

385-
| Constant | Default | Description |
386-
|----------|---------|-------------|
387-
| `OKP_RAG_MAX_CHUNKS` | 5 | Max chunks retrieved from OKP (Inline RAG) |
388-
| `BYOK_RAG_MAX_CHUNKS` | 10 | Max chunks retrieved from BYOK stores (Inline RAG) |
389-
| `TOOL_RAG_MAX_CHUNKS` | 10 | Max chunks retrieved via Tool RAG (`file_search`) |
385+
| Constant | Value | Description |
386+
|----------|-------|-------------|
387+
| `INLINE_RAG_MAX_CHUNKS` | 10 | Hard upper bound on the final merged inline RAG chunks (BYOK + OKP) delivered to the LLM |
388+
| `OKP_RAG_MAX_CHUNKS` | 5 | Fetch hint for OKP (Inline RAG); controls how many chunks enter the reranking pool |
389+
| `BYOK_RAG_MAX_CHUNKS` | 10 | Fetch hint for BYOK stores (Inline RAG); controls how many chunks enter the reranking pool |
390+
| `TOOL_RAG_MAX_CHUNKS` | 10 | Max chunks retrieved via Tool RAG (`file_search`); independent from `INLINE_RAG_MAX_CHUNKS` |
390391

391392
**Limitations:**
392393

src/constants.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,9 @@
188188
USER_QUOTA_LIMITER: Final[str] = "user_limiter"
189189
CLUSTER_QUOTA_LIMITER: Final[str] = "cluster_limiter"
190190

191+
# Hard cap on merged inline RAG chunks (BYOK + OKP) sent to the LLM; excludes Tool RAG
192+
INLINE_RAG_MAX_CHUNKS: Final[int] = 10
193+
191194
# RAG as a tool constants
192195
DEFAULT_RAG_TOOL: Final[str] = "file_search"
193196
TOOL_RAG_MAX_CHUNKS: Final[int] = 10 # retrieved from RAG as a tool

src/utils/vector_search.py

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -638,9 +638,9 @@ async def build_rag_context( # pylint: disable=too-many-locals,too-many-branche
638638
) -> RAGContext:
639639
"""Build RAG context by fetching and merging chunks from all enabled sources.
640640
641-
Fetches 2 * BYOK_RAG_MAX_CHUNKS from each of BYOK and Solr, merges and keeps
642-
top 2 * BYOK_RAG_MAX_CHUNKS by score, reranks with a cross-encoder, then
643-
keeps the top BYOK_RAG_MAX_CHUNKS for context. Enabled sources can be BYOK
641+
Each source fetches using its per-source limit to build the reranking pool.
642+
Results are merged, sorted by score, reranked with a cross-encoder if
643+
enabled, then capped at INLINE_RAG_MAX_CHUNKS. Enabled sources can be BYOK
644644
and/or Solr OKP.
645645
646646
Args:
@@ -655,34 +655,32 @@ async def build_rag_context( # pylint: disable=too-many-locals,too-many-branche
655655
if moderation_decision == "blocked":
656656
return RAGContext()
657657

658-
pool_size = 2 * constants.BYOK_RAG_MAX_CHUNKS
659-
top_k = constants.BYOK_RAG_MAX_CHUNKS
658+
top_k = constants.INLINE_RAG_MAX_CHUNKS
660659

661-
# Fetch 2*BYOK_RAG_MAX_CHUNKS from each source in parallel
660+
# Fetch from each source using per-source limits for the reranking pool
662661
byok_chunks_task = _fetch_byok_rag(
663-
client, query, vector_store_ids, max_chunks=pool_size
662+
client, query, vector_store_ids, max_chunks=constants.BYOK_RAG_MAX_CHUNKS
664663
)
665664
solr_chunks_task = _fetch_solr_rag(client, query, solr)
666665

667666
(byok_chunks, byok_documents), (solr_chunks, solr_documents) = await asyncio.gather(
668667
byok_chunks_task, solr_chunks_task
669668
)
670669

671-
# Merge: combine and sort by score, keep top 2*BYOK_RAG_MAX_CHUNKS
670+
# Merge: combine and sort by score
672671
merged = byok_chunks + solr_chunks
673672
merged.sort(
674673
key=lambda c: c.score if c.score is not None else float("-inf"), reverse=True
675674
)
676-
merged = merged[:pool_size]
677675

678-
# Rerank full pool with cross-encoder if enabled; boost BYOK then take top_k
676+
# Rerank full pool with cross-encoder if enabled; apply BYOK boost, then take top_k
679677
if configuration.reranker.enabled:
680678
logger.info(
681679
"Reranker enabled: processing %d chunks with model '%s'",
682680
len(merged),
683681
configuration.reranker.model,
684682
)
685-
reranked = await rerank_chunks_with_cross_encoder(query, merged, pool_size)
683+
reranked = await rerank_chunks_with_cross_encoder(query, merged, len(merged))
686684
context_chunks = apply_byok_rerank_boost(reranked)[:top_k]
687685
logger.info(
688686
"Reranker completed: returned %d top chunks after BYOK boost",

tests/integration/endpoints/test_query_byok_integration.py

Lines changed: 93 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1066,25 +1066,25 @@ async def _side_effect(**kwargs: Any) -> Any:
10661066

10671067

10681068
# ==============================================================================
1069-
# BYOK_RAG_MAX_CHUNKS Capping Tests
1069+
# INLINE_RAG_MAX_CHUNKS Capping Tests
10701070
# ==============================================================================
10711071

10721072

10731073
@pytest.mark.asyncio
1074-
async def test_query_byok_max_chunks_caps_retrieved_results( # pylint: disable=too-many-locals
1074+
async def test_query_rag_content_limit_caps_retrieved_results( # pylint: disable=too-many-locals
10751075
test_config: AppConfig,
10761076
mocker: MockerFixture,
10771077
test_request: Request,
10781078
test_auth: AuthTuple,
10791079
) -> None:
1080-
"""Test that BYOK_RAG_MAX_CHUNKS caps the number of returned chunks.
1080+
"""Test that INLINE_RAG_MAX_CHUNKS caps the number of returned chunks.
10811081
1082-
A single source returns more chunks than BYOK_RAG_MAX_CHUNKS allows.
1083-
The response should contain at most BYOK_RAG_MAX_CHUNKS chunks and
1082+
A single source returns more chunks than INLINE_RAG_MAX_CHUNKS allows.
1083+
The response should contain at most INLINE_RAG_MAX_CHUNKS chunks and
10841084
they should be the highest-scored ones.
10851085
10861086
Verifies:
1087-
- Number of RAG chunks does not exceed BYOK_RAG_MAX_CHUNKS
1087+
- Number of RAG chunks does not exceed INLINE_RAG_MAX_CHUNKS
10881088
- Returned chunks are the top-scoring ones
10891089
"""
10901090
entry = mocker.MagicMock()
@@ -1101,8 +1101,8 @@ async def test_query_byok_max_chunks_caps_retrieved_results( # pylint: disable=
11011101
mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder")
11021102
mock_client = _build_base_mock_client(mocker)
11031103

1104-
# Generate more chunks than BYOK_RAG_MAX_CHUNKS
1105-
num_chunks = constants.BYOK_RAG_MAX_CHUNKS + 1
1104+
# Generate more chunks than INLINE_RAG_MAX_CHUNKS
1105+
num_chunks = constants.INLINE_RAG_MAX_CHUNKS + 1
11061106
chunks_data = [
11071107
(f"Chunk content {i}", f"chunk-{i}", round(0.50 + i * 0.03, 2))
11081108
for i in range(num_chunks)
@@ -1141,7 +1141,7 @@ async def test_query_byok_max_chunks_caps_retrieved_results( # pylint: disable=
11411141
)
11421142

11431143
assert response.rag_chunks is not None
1144-
assert len(response.rag_chunks) == constants.BYOK_RAG_MAX_CHUNKS
1144+
assert len(response.rag_chunks) == constants.INLINE_RAG_MAX_CHUNKS
11451145

11461146
# Check that the score is computed properly
11471147
for chunk in response.rag_chunks:
@@ -1161,20 +1161,20 @@ async def test_query_byok_max_chunks_caps_retrieved_results( # pylint: disable=
11611161

11621162

11631163
@pytest.mark.asyncio
1164-
async def test_query_byok_max_chunks_caps_across_multiple_sources( # pylint: disable=too-many-locals
1164+
async def test_query_rag_content_limit_caps_across_multiple_sources( # pylint: disable=too-many-locals
11651165
test_config: AppConfig,
11661166
mocker: MockerFixture,
11671167
test_request: Request,
11681168
test_auth: AuthTuple,
11691169
) -> None:
1170-
"""Test that BYOK_RAG_MAX_CHUNKS caps chunks across multiple sources.
1170+
"""Test that INLINE_RAG_MAX_CHUNKS caps chunks across multiple sources.
11711171
1172-
Two sources each return several chunks. The combined result should
1173-
not exceed BYOK_RAG_MAX_CHUNKS and should contain the globally
1174-
highest-scored chunks regardless of source.
1172+
Two sources each return several chunks. The combined result should not
1173+
exceed INLINE_RAG_MAX_CHUNKS and should contain the globally highest-scored
1174+
chunks regardless of source.
11751175
11761176
Verifies:
1177-
- Total chunks across sources are capped at BYOK_RAG_MAX_CHUNKS
1177+
- Total chunks across sources are capped at INLINE_RAG_MAX_CHUNKS
11781178
- Top-scoring chunks from both sources are included
11791179
"""
11801180
entry_a = mocker.MagicMock()
@@ -1194,7 +1194,7 @@ async def test_query_byok_max_chunks_caps_across_multiple_sources( # pylint: di
11941194
mock_client = _build_base_mock_client(mocker)
11951195

11961196
# Overlapping score bands so top-k must pick from both sources
1197-
n = constants.BYOK_RAG_MAX_CHUNKS
1197+
n = constants.INLINE_RAG_MAX_CHUNKS
11981198
resp_a = _make_vector_io_response(
11991199
mocker,
12001200
[
@@ -1246,7 +1246,7 @@ async def _side_effect(**kwargs: Any) -> Any:
12461246
)
12471247

12481248
assert response.rag_chunks is not None
1249-
assert len(response.rag_chunks) == constants.BYOK_RAG_MAX_CHUNKS
1249+
assert len(response.rag_chunks) == constants.INLINE_RAG_MAX_CHUNKS
12501250

12511251
# Check that the score is computed properly
12521252
for chunk in response.rag_chunks:
@@ -1266,3 +1266,79 @@ async def _side_effect(**kwargs: Any) -> Any:
12661266
chunk_contents = {chunk.content for chunk in response.rag_chunks}
12671267
assert "Source A chunk 0" not in chunk_contents
12681268
assert "Source B chunk 0" not in chunk_contents
1269+
1270+
1271+
@pytest.mark.asyncio
1272+
async def test_query_rag_content_limit_caps_inline_rag( # pylint: disable=too-many-locals
1273+
test_config: AppConfig,
1274+
mocker: MockerFixture,
1275+
test_request: Request,
1276+
test_auth: AuthTuple,
1277+
) -> None:
1278+
"""Test that INLINE_RAG_MAX_CHUNKS caps inline RAG below BYOK_RAG_MAX_CHUNKS.
1279+
1280+
Sets INLINE_RAG_MAX_CHUNKS to 3 (below BYOK_RAG_MAX_CHUNKS=10) and feeds
1281+
10 chunks. The result should be capped at 3.
1282+
1283+
Verifies:
1284+
- Number of inline RAG chunks equals the lowered INLINE_RAG_MAX_CHUNKS
1285+
- Returned chunks are ordered by descending score
1286+
"""
1287+
mocker.patch("utils.vector_search.constants.INLINE_RAG_MAX_CHUNKS", 3)
1288+
1289+
entry = mocker.MagicMock()
1290+
entry.rag_id = "big-source"
1291+
entry.vector_db_id = "vs-big-source"
1292+
entry.score_multiplier = 1.0
1293+
1294+
test_config.configuration.byok_rag = [entry]
1295+
test_config.configuration.rag.inline = ["big-source"]
1296+
test_config.configuration.reranker.enabled = False
1297+
1298+
mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder")
1299+
mock_client = _build_base_mock_client(mocker)
1300+
1301+
num_chunks = constants.BYOK_RAG_MAX_CHUNKS
1302+
chunks_data = [
1303+
(f"Chunk content {i}", f"chunk-{i}", round(0.50 + i * 0.03, 2))
1304+
for i in range(num_chunks)
1305+
]
1306+
mock_client.vector_io.query = mocker.AsyncMock(
1307+
return_value=_make_vector_io_response(mocker, chunks_data)
1308+
)
1309+
1310+
mock_vs_resp = mocker.MagicMock()
1311+
mock_vs_resp.data = []
1312+
mock_client.vector_stores.list.return_value = mock_vs_resp
1313+
1314+
mock_holder_class.return_value.get_client.return_value = mock_client
1315+
1316+
query_request = QueryRequest(
1317+
query="test query",
1318+
conversation_id=None,
1319+
provider=None,
1320+
model=None,
1321+
system_prompt=None,
1322+
attachments=None,
1323+
no_tools=False,
1324+
generate_topic_summary=None,
1325+
media_type=None,
1326+
vector_store_ids=None,
1327+
shield_ids=None,
1328+
solr=None,
1329+
)
1330+
1331+
response = await query_endpoint_handler(
1332+
request=test_request,
1333+
query_request=query_request,
1334+
auth=test_auth,
1335+
mcp_headers={},
1336+
)
1337+
1338+
assert response.rag_chunks is not None
1339+
assert len(response.rag_chunks) == 3
1340+
1341+
scores: list[float] = [
1342+
chunk.score for chunk in response.rag_chunks if chunk.score is not None
1343+
]
1344+
assert scores == sorted(scores, reverse=True)

0 commit comments

Comments
 (0)