Skip to content

Commit 116621d

Browse files
sciapanCAclaude
andauthored
Add datasource relevance filter (#17)
* Add datasource relevance filter * Improve parsing of total header --------- Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 4ad35fb commit 116621d

2 files changed

Lines changed: 270 additions & 8 deletions

File tree

src/tests/test_datasources.py

Lines changed: 184 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,4 +170,187 @@ async def test_get_data_sources_handles_missing_repository_ids(mock_get_api_key)
170170
workspace = data_sources[0]
171171
assert workspace["id"] == "workspace-1"
172172
assert workspace["name"] == "Test Workspace"
173-
assert "repositoryIds" not in workspace
173+
assert "repositoryIds" not in workspace
174+
175+
176+
def _ctx_with_response(json_return, headers=None):
    """Create a mocked Context wired to a stub HTTP client.

    The stub client's awaited ``get`` yields a response whose ``json()`` returns
    ``json_return`` and whose ``headers`` is ``headers`` (an empty dict when falsy).

    Returns:
        A ``(context, client)`` pair so tests can inspect the recorded ``get`` call.
    """
    # Canned backend response; raise_for_status is a plain mock, so it never raises.
    response = MagicMock()
    response.json.return_value = json_return
    response.headers = headers if headers else {}
    response.raise_for_status = MagicMock()

    # Async HTTP client whose GET always resolves to the canned response.
    client = AsyncMock()
    client.get = AsyncMock(return_value=response)

    # Lifespan context exposing the client and base URL to the tool under test.
    lifespan = MagicMock()
    lifespan.client = client
    lifespan.base_url = "https://api.example.com"

    ctx = MagicMock(spec=Context)
    for channel in ("info", "warning", "error"):
        setattr(ctx, channel, AsyncMock())
    ctx.request_context.lifespan_context = lifespan
    return ctx, client
196+
197+
198+
@pytest.mark.asyncio
@patch('tools.datasources.get_api_key_from_context')
async def test_get_data_sources_with_query_passes_query_param(mock_get_api_key):
    """A supplied query must reach the listing endpoint as the `query` request param."""
    mock_get_api_key.return_value = "test-key"
    intent = "add OAuth to checkout"
    backend_items = [
        {"id": "repo-1", "name": "Repo", "type": "Repository", "relevanceReason": "handles OAuth"},
    ]
    ctx, client = _ctx_with_response(backend_items)

    await get_data_sources(ctx, alive_only=True, query=intent)

    # Inspect the single recorded GET: endpoint is positional, params is a keyword.
    recorded = client.get.call_args
    assert recorded.args[0] == "/api/datasources/ready"
    assert recorded.kwargs["params"] == {"query": intent}
212+
213+
214+
@pytest.mark.asyncio
@patch('tools.datasources.get_api_key_from_context')
async def test_get_data_sources_without_query_sends_no_query_param(mock_get_api_key):
    """Legacy call shape: with no query given, the request carries no `query` param."""
    mock_get_api_key.return_value = "test-key"
    backend_items = [{"id": "repo-1", "name": "Repo", "type": "Repository"}]
    ctx, client = _ctx_with_response(backend_items)

    await get_data_sources(ctx, alive_only=True)

    sent_kwargs = client.get.call_args.kwargs
    assert sent_kwargs.get("params") is None
227+
228+
229+
@pytest.mark.asyncio
@patch('tools.datasources.get_api_key_from_context')
async def test_get_data_sources_surfaces_relevance_reason(mock_get_api_key):
    """Each returned item keeps the backend's relevanceReason when a query is supplied."""
    mock_get_api_key.return_value = "test-key"
    reason = "implements the checkout flow"
    ctx, _ = _ctx_with_response([
        {"id": "repo-1", "name": "Repo", "type": "Repository", "relevanceReason": reason},
    ])

    result = await get_data_sources(ctx, alive_only=True, query="checkout")

    first_source = result["dataSources"][0]
    assert first_source["relevanceReason"] == reason
242+
243+
244+
@pytest.mark.asyncio
@patch('tools.datasources.get_api_key_from_context')
async def test_get_data_sources_filtered_hint_reports_total_and_omitted(mock_get_api_key):
    """With a usable total header, the message reports shown-of-total and how to get the rest."""
    mock_get_api_key.return_value = "test-key"
    relevant = [{"id": "repo-1", "name": "Repo", "type": "Repository", "relevanceReason": "checkout flow"}]
    total_header = {"X-CodeAlive-Total-Data-Sources": "25"}
    ctx, _ = _ctx_with_response(relevant, headers=total_header)

    result = await get_data_sources(ctx, alive_only=True, query="checkout")

    assert len(result["dataSources"]) == 1
    message = result["message"]
    assert "1 of 25" in message
    lowered = message.lower()
    assert "omitted" in lowered
    assert "without a query" in lowered
261+
262+
263+
@pytest.mark.asyncio
@patch('tools.datasources.get_api_key_from_context')
async def test_get_data_sources_filtered_hint_without_total_header(mock_get_api_key):
    """No total header: the message still says sources were omitted and how to list them all."""
    mock_get_api_key.return_value = "test-key"
    relevant = [{"id": "repo-1", "name": "Repo", "type": "Repository", "relevanceReason": "checkout flow"}]
    ctx, _ = _ctx_with_response(relevant)

    result = await get_data_sources(ctx, alive_only=True, query="checkout")

    lowered = result["message"].lower()
    assert "omitted" in lowered
    assert "without a query" in lowered
277+
278+
279+
@pytest.mark.asyncio
@patch('tools.datasources.get_api_key_from_context')
async def test_get_data_sources_filtered_hint_with_malformed_total_header(mock_get_api_key):
    """A non-numeric total header must not raise; the generic omission hint is used instead."""
    mock_get_api_key.return_value = "test-key"
    relevant = [{"id": "repo-1", "name": "Repo", "type": "Repository", "relevanceReason": "checkout flow"}]
    bad_header = {"X-CodeAlive-Total-Data-Sources": "not-a-number"}
    ctx, _ = _ctx_with_response(relevant, headers=bad_header)

    result = await get_data_sources(ctx, alive_only=True, query="checkout")

    lowered = result["message"].lower()
    assert "omitted" in lowered
    assert "without a query" in lowered
294+
295+
296+
@pytest.mark.asyncio
@patch('tools.datasources.get_api_key_from_context')
async def test_get_data_sources_all_relevant_hint_reports_no_omission(mock_get_api_key):
    """Total equals the number shown: the message says all are relevant and claims no omission."""
    mock_get_api_key.return_value = "test-key"
    relevant = [{"id": "repo-1", "name": "Repo", "type": "Repository", "relevanceReason": "checkout flow"}]
    total_header = {"X-CodeAlive-Total-Data-Sources": "1"}
    ctx, _ = _ctx_with_response(relevant, headers=total_header)

    result = await get_data_sources(ctx, alive_only=True, query="checkout")

    lowered = result["message"].lower()
    assert "all 1" in lowered
    assert "omitted" not in lowered
311+
312+
313+
@pytest.mark.asyncio
@patch('tools.datasources.get_api_key_from_context')
async def test_get_data_sources_failopen_hint_when_no_reasons_present(mock_get_api_key):
    """A query'd response in which no item has a relevanceReason means the filter did not run.

    The message must then state that filtering was unavailable and the FULL list is returned.
    """
    mock_get_api_key.return_value = "test-key"
    unfiltered = [
        {"id": "repo-1", "name": "Repo", "type": "Repository"},
        {"id": "repo-2", "name": "Other", "type": "Repository"},
    ]
    ctx, _ = _ctx_with_response(unfiltered)

    result = await get_data_sources(ctx, alive_only=True, query="checkout")

    assert len(result["dataSources"]) == 2
    lowered = result["message"].lower()
    assert "unavailable" in lowered
    assert "full" in lowered
330+
331+
332+
@pytest.mark.asyncio
@patch('tools.datasources.get_api_key_from_context')
async def test_get_data_sources_empty_with_query_returns_no_relevant_hint(mock_get_api_key):
    """An empty list for a query'd call yields the 'no relevant sources' hint."""
    mock_get_api_key.return_value = "test-key"
    ctx, _ = _ctx_with_response([])

    result = await get_data_sources(ctx, alive_only=True, query="something unrelated")

    assert result["dataSources"] == []
    hint_lowered = result["hint"].lower()
    assert "relevant" in hint_lowered
    # The 'add a repository' advice belongs only to the no-query empty case.
    assert "add a repository" not in hint_lowered
344+
345+
346+
@pytest.mark.asyncio
@patch('tools.datasources.get_api_key_from_context')
async def test_get_data_sources_empty_without_query_keeps_add_repository_hint(mock_get_api_key):
    """An empty list for a call without a query still yields the 'add a repository' hint."""
    mock_get_api_key.return_value = "test-key"
    ctx, _ = _ctx_with_response([])

    result = await get_data_sources(ctx, alive_only=True)

    assert result["dataSources"] == []
    hint_lowered = result["hint"].lower()
    assert "add a repository" in hint_lowered

src/tools/datasources.py

Lines changed: 86 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,55 @@
66
import httpx
77
from fastmcp import Context
88

9-
from core import CodeAliveContext, get_api_key_from_context, log_api_request, log_api_response
9+
from core import (
10+
CodeAliveContext,
11+
get_api_key_from_context,
12+
log_api_request,
13+
log_api_response,
14+
)
1015
from utils import handle_api_error
1116

1217
# MCP tool/method name surfaced in every error/log message from this module.
1318
_TOOL_NAME = "get_data_sources"
1419

20+
# Pre-filter scoped candidate count, emitted by the backend only on relevance-filtered requests.
_TOTAL_HEADER = "X-CodeAlive-Total-Data-Sources"


def _relevance_message(data_sources: list, response) -> str:
    """Build the message accompanying a query'd (relevance-filtered) result.

    The backend guarantees every relevance-selected item carries a non-empty
    `relevanceReason`, so a query'd response where NO item has one means the filter did
    not run (fail-open on error, disabled by config, or an older backend ignoring `query`)
    and the FULL list was returned. The model must be told, instead of mistaking the full
    dump for a relevant shortlist.

    Args:
        data_sources: Non-empty list of data source dicts returned by the backend.
        response: HTTP response object; only `response.headers.get(...)` is used, to read
            the optional total-count header.

    Returns:
        One of four messages: filtering unavailable, "N of M" with the omitted count,
        "All M" when nothing was omitted, or a generic omission hint when the total is
        unknown or unusable.
    """
    if not any(ds.get("relevanceReason") for ds in data_sources):
        return (
            "Relevance filtering was unavailable for this request (it may have failed or be "
            "disabled), so the FULL unfiltered list of data sources is returned."
        )

    shown = len(data_sources)
    try:
        total = int(response.headers.get(_TOTAL_HEADER))
    except (TypeError, ValueError):
        # Header absent (TypeError on int(None)) or malformed (ValueError).
        total = None

    # A total smaller than the number of items actually shown (including negative values)
    # contradicts the payload. Treat it like a malformed header rather than reporting a
    # wrong "All {total}" count.
    if total is not None and total < shown:
        total = None

    if total is None:
        return (
            "Only the data sources relevant to this query are shown; non-relevant sources were "
            "omitted. Call get_data_sources without a query to get the full list."
        )
    if total > shown:
        return (
            f"{shown} of {total} available data sources are relevant to this query; the other "
            f"{total - shown} were omitted. Call get_data_sources without a query to get the full list."
        )
    return f"All {total} available data sources are relevant to this query."
56+
57+
1558
# Hint embedded in every successful response. Mirrors the convention used by
1659
# the search tools (see _SEARCH_HINT in utils/response_transformer.py): the
1760
# response is always in front of the model when it picks the next step, so we
@@ -31,9 +74,19 @@
3174
"being indexed."
3275
)
3376

77+
# Hint for an empty result on a query'd call: sources DO exist, but none matched the
# stated intent. Kept separate from the no-sources-at-all hint so the model does not
# advise the user to add a repository.
_DATASOURCES_EMPTY_QUERY_HINT = (
    "No data sources are relevant to this query. "
    "Try a broader query, or call get_data_sources without a query "
    "to see the full list."
)
84+
3485

3586
# alive_only actually means ready_only; the name is left as is for backward compatibility.
36-
async def get_data_sources(ctx: Context, alive_only: bool = True) -> Dict[str, Any]:
87+
async def get_data_sources(
88+
ctx: Context, alive_only: bool = True, query: str | None = None
89+
) -> Dict[str, Any]:
3790
"""
3891
**CALL THIS FIRST**: Gets all available data sources (repositories and workspaces) for the user's account.
3992
@@ -47,10 +100,20 @@ async def get_data_sources(ctx: Context, alive_only: bool = True) -> Dict[str, A
47100
Args:
48101
alive_only: If True (default), returns only data sources that are fully processed and ready for use.
49102
If False, returns all data sources regardless of processing state.
103+
query: Optional. The user's initial intent/task in natural language (e.g. "add OAuth to
104+
checkout"). When provided, the backend runs an agentic relevance filter and returns
105+
ONLY the data sources relevant to that intent, each with a `relevanceReason`
106+
explaining why. This is the user's GOAL — distinct from `searchTerm` (a substring
107+
name filter). Omit it to get the full list. Pass it whenever you
108+
know what the user is trying to accomplish, to keep the returned list focused.
50109
51110
Returns:
52111
{"dataSources": [...], "hint": "..."}
53112
113+
With `query`, the object also carries a `message` field telling you whether sources
114+
were omitted as non-relevant (and how many of the total), that every available source
115+
was relevant, or that relevance filtering was unavailable and the FULL list is returned.
116+
54117
Each entry in `dataSources` carries:
55118
- id: Unique identifier for the data source
56119
- name: Human-readable name - CRITICAL for matching with current working directory name
@@ -59,6 +122,7 @@ async def get_data_sources(ctx: Context, alive_only: bool = True) -> Dict[str, A
59122
- type: The type of data source ("Repository" or "Workspace")
60123
- url: Repository URL (for Repository type only) - useful for matching with git remote
61124
- state: The processing state of the data source (if alive_only=false)
125+
- relevanceReason: Why this source is relevant to `query` (present ONLY when `query` was supplied)
62126
63127
The `hint` field reminds you how to use the result and how to distinguish
64128
the CURRENT repository from EXTERNAL ones.
@@ -117,12 +181,17 @@ async def get_data_sources(ctx: Context, alive_only: bool = True) -> Dict[str, A
117181
"X-CodeAlive-Client": "fastmcp",
118182
}
119183

184+
# Thread the user's intent as the `query` param when present so the backend relevance
185+
# filter runs. Omitted entirely otherwise, so the request is unchanged for legacy callers
186+
# (and an older backend that ignores `query` simply returns the full list).
187+
params = {"query": query} if query else None
188+
120189
# Log the request
121190
full_url = urljoin(context.base_url, endpoint)
122191
request_id = log_api_request("GET", full_url, headers)
123192

124193
# Make API request
125-
response = await context.client.get(endpoint, headers=headers)
194+
response = await context.client.get(endpoint, headers=headers, params=params)
126195

127196
# Log the response
128197
log_api_response(response, request_id)
@@ -133,19 +202,29 @@ async def get_data_sources(ctx: Context, alive_only: bool = True) -> Dict[str, A
133202
data_sources = response.json()
134203

135204
if not data_sources or len(data_sources) == 0:
136-
return {"dataSources": [], "hint": _DATASOURCES_EMPTY_HINT}
205+
hint = _DATASOURCES_EMPTY_QUERY_HINT if query else _DATASOURCES_EMPTY_HINT
206+
return {"dataSources": [], "hint": hint}
137207

138208
# Remove repositoryIds from workspace data sources
139209
for data_source in data_sources:
140-
if data_source.get("type") == "Workspace" and "repositoryIds" in data_source:
210+
if (
211+
data_source.get("type") == "Workspace"
212+
and "repositoryIds" in data_source
213+
):
141214
del data_source["repositoryIds"]
142215

143216
# FastMCP serializes via pydantic_core.to_json, which preserves UTF-8.
144-
return {"dataSources": data_sources, "hint": _DATASOURCES_HINT}
217+
result: Dict[str, Any] = {"dataSources": data_sources, "hint": _DATASOURCES_HINT}
218+
if query:
219+
result["message"] = _relevance_message(data_sources, response)
220+
return result
145221

146222
except (httpx.HTTPStatusError, Exception) as e:
147223
await handle_api_error(
148-
ctx, e, "retrieving data sources", method=_TOOL_NAME,
224+
ctx,
225+
e,
226+
"retrieving data sources",
227+
method=_TOOL_NAME,
149228
recovery_hints={
150229
# 422 means *some* sources are still indexing — surface alive_only=false as the next step
151230
422: (

0 commit comments

Comments
 (0)