Skip to content

Commit 41efd19

Browse files
rodion-m and claude committed
Wrap get_data_sources in {dataSources, hint} envelope; bump to 2.0.4
get_data_sources used to switch wire shapes between empty and non-empty responses — a bare list when sources existed, a dict carrying a recovery message when none did. Always returning {"dataSources": [...], "hint": "..."} matches the convention search tools settled on, gives the model guidance even when results are present, and removes the dual-shape edge case for clients.

Drop the naive "no `, ` in text" compact-JSON assertion in the get_data_sources e2e test — the new hint is plain English and trips it. The round-trip equality against `json.dumps(..., ensure_ascii=False)` already verifies both compactness and UTF-8.

Cover the two remaining serialization paths so future regressions get caught: an XML-content Cyrillic test for fetch_artifacts and an SSE-stream Cyrillic test for chat.

Bump version triplet (pyproject / manifest / server) to 2.0.4 — wire-format change for get_data_sources plus the Unicode fixes from 2.0.3+1.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent dbc81e5 commit 41efd19

6 files changed

Lines changed: 97 additions & 25 deletions

File tree

manifest.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"manifest_version": "0.4",
33
"name": "codealive-mcp",
44
"display_name": "CodeAlive",
5-
"version": "2.0.3",
5+
"version": "2.0.4",
66
"description": "Semantic code search and codebase Q&A for Claude Desktop using your CodeAlive account or self-hosted deployment.",
77
"long_description": "CodeAlive gives Claude Desktop access to semantic code search, artifact fetch, repository discovery, and architecture-aware codebase Q&A. This extension runs locally via MCP and supports both CodeAlive Cloud and self-hosted deployments.",
88
"author": {

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ packages = ["src"]
3737
package-dir = {"" = "."}
3838

3939
[tool.setuptools_scm]
40-
fallback_version = "2.0.3"
40+
fallback_version = "2.0.4"
4141

4242
[tool.uv]
4343
# Relative dates in exclude-newer (e.g. "7 days") require uv ≥ 0.11.

server.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"$schema": "https://static.modelcontextprotocol.io/schemas/2025-12-11/server.schema.json",
33
"name": "io.github.CodeAlive-AI/codealive-mcp",
4-
"version": "2.0.3",
4+
"version": "2.0.4",
55
"description": "Semantic code search and analysis from CodeAlive for AI assistants and agents.",
66
"keywords": [
77
"context-engineering",

src/tests/test_datasources.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,10 @@ async def test_get_data_sources_removes_repository_ids_from_workspaces(mock_get_
4949

5050
mock_ctx.request_context.lifespan_context = mock_lifespan_context
5151

52-
# Tool returns the parsed list directly; FastMCP serializes it.
53-
data_sources = await get_data_sources(mock_ctx, alive_only=True)
52+
# Tool returns a dict {"dataSources":[...], "hint":"..."}.
53+
result = await get_data_sources(mock_ctx, alive_only=True)
54+
data_sources = result["dataSources"]
55+
assert "hint" in result
5456

5557
# Verify repository still has all fields
5658
repo = next(ds for ds in data_sources if ds["type"] == "Repository")
@@ -112,7 +114,8 @@ async def test_get_data_sources_preserves_other_workspace_fields(mock_get_api_ke
112114

113115
mock_ctx.request_context.lifespan_context = mock_lifespan_context
114116

115-
data_sources = await get_data_sources(mock_ctx, alive_only=True)
117+
result = await get_data_sources(mock_ctx, alive_only=True)
118+
data_sources = result["dataSources"]
116119

117120
workspace = data_sources[0]
118121

@@ -160,7 +163,8 @@ async def test_get_data_sources_handles_missing_repository_ids(mock_get_api_key)
160163
mock_ctx.request_context.lifespan_context = mock_lifespan_context
161164

162165
# Should not raise an error
163-
data_sources = await get_data_sources(mock_ctx, alive_only=True)
166+
result = await get_data_sources(mock_ctx, alive_only=True)
167+
data_sources = result["dataSources"]
164168

165169
# Verify workspace is intact
166170
workspace = data_sources[0]

src/tests/test_e2e_tools.py

Lines changed: 56 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -112,26 +112,28 @@ def handler(req):
112112
result = await client.call_tool("get_data_sources", {})
113113

114114
text = _text(result)
115-
# Compact JSON: no spaces after separators
116-
assert ", " not in text and ": " not in text
117115
data = json.loads(text)
118-
names = [ds["name"] for ds in data]
116+
# Compact JSON, UTF-8 preserved (FastMCP uses pydantic_core.to_json).
117+
assert text == json.dumps(data, separators=(",", ":"), ensure_ascii=False)
118+
names = [ds["name"] for ds in data["dataSources"]]
119119
assert "backend" in names
120120
assert "platform" in names
121121
# repositoryIds must be stripped from workspaces
122-
for ds in data:
122+
for ds in data["dataSources"]:
123123
assert "repositoryIds" not in ds
124+
# Always emit a follow-up hint pointing at search/chat tools.
125+
assert "semantic_search" in data["hint"]
124126

125127
@pytest.mark.asyncio
126-
async def test_empty_list_returns_message(self):
128+
async def test_empty_list_returns_recovery_hint(self):
127129
mcp = _server({"/api/datasources/ready": lambda r: httpx.Response(200, json=[])})
128130
async with Client(mcp) as client:
129131
result = await client.call_tool("get_data_sources", {})
130132

131133
text = _text(result)
132134
data = json.loads(text)
133135
assert data["dataSources"] == []
134-
assert "No data sources found" in data["message"]
136+
assert "No data sources found" in data["hint"]
135137

136138
@pytest.mark.asyncio
137139
async def test_unicode_preserved_in_response(self):
@@ -148,8 +150,9 @@ async def test_unicode_preserved_in_response(self):
148150
text = _text(result)
149151
# Round-trip via ensure_ascii=False — ASCII-escaped output would mismatch.
150152
assert text == json.dumps(json.loads(text), separators=(",", ":"), ensure_ascii=False)
151-
assert "кирилл-репо" in text
152-
assert "Описание про принтеры HPRT" in text
153+
data = json.loads(text)
154+
assert data["dataSources"][0]["name"] == "кирилл-репо"
155+
assert data["dataSources"][0]["description"] == "Описание про принтеры HPRT"
153156
assert "\\u04" not in text
154157

155158
@pytest.mark.asyncio
@@ -904,6 +907,31 @@ def handler(req):
904907
xml = _text(result)
905908
assert "<artifacts>" in xml
906909

910+
@pytest.mark.asyncio
911+
async def test_unicode_preserved_in_xml(self):
912+
"""Cyrillic in identifier and content must survive into the XML output."""
913+
payload = {
914+
"artifacts": [
915+
{
916+
"identifier": "org/repo::файл.cs::Класс.Метод",
917+
"content": "класс Привет {\n метод() => 42\n}\n",
918+
"contentByteSize": 100,
919+
"startLine": 1,
920+
}
921+
]
922+
}
923+
mcp = _server({"/api/search/artifacts": lambda r: httpx.Response(200, json=payload)})
924+
async with Client(mcp) as client:
925+
result = await client.call_tool(
926+
"fetch_artifacts",
927+
{"identifiers": ["org/repo::файл.cs::Класс.Метод"]},
928+
)
929+
930+
xml = _text(result)
931+
assert "Класс.Метод" in xml
932+
assert "класс Привет" in xml
933+
assert "\\u04" not in xml
934+
907935

908936
# ---------------------------------------------------------------------------
909937
# Stringified parameter coercion for search tools
@@ -1041,6 +1069,26 @@ async def test_backend_error_handled(self):
10411069
text = _text(result)
10421070
assert "401" in text or "auth" in text.lower()
10431071

1072+
@pytest.mark.asyncio
1073+
async def test_unicode_preserved_in_streamed_response(self):
1074+
"""Cyrillic chunks streamed via SSE must survive as UTF-8 in the final text."""
1075+
body = self._sse_body(["Привет, ", "мир!"])
1076+
1077+
mcp = _server({
1078+
"/api/chat/completions": lambda r: httpx.Response(
1079+
200, text=body, headers={"content-type": "text/event-stream"},
1080+
),
1081+
})
1082+
async with Client(mcp) as client:
1083+
result = await client.call_tool(
1084+
"chat",
1085+
{"question": "Как работает аутентификация?", "data_sources": ["backend"]},
1086+
)
1087+
1088+
text = _text(result)
1089+
assert "Привет, мир!" in text
1090+
assert "\\u04" not in text
1091+
10441092
@pytest.mark.asyncio
10451093
async def test_legacy_alias_still_works(self):
10461094
body = self._sse_body(["Legacy alias"])

src/tools/datasources.py

Lines changed: 30 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""Data sources tool implementation."""
22

3-
from typing import Any
3+
from typing import Any, Dict
44
from urllib.parse import urljoin
55

66
import httpx
@@ -12,8 +12,28 @@
1212
# MCP tool/method name surfaced in every error/log message from this module.
1313
_TOOL_NAME = "get_data_sources"
1414

15+
# Hint embedded in every successful non-empty response. Mirrors the convention used by
16+
# the search tools (see _SEARCH_HINT in utils/response_transformer.py): the
17+
# response is always in front of the model when it picks the next step, so we
18+
# repeat the most load-bearing usage rule here instead of relying on the
19+
# tool's docstring being re-read mid-conversation.
20+
_DATASOURCES_HINT = (
21+
"Use the `name` field as the `data_sources` parameter for `semantic_search`, "
22+
"`grep_search`, or `chat`. To identify the CURRENT repository (vs external), "
23+
"compare `name`/`description`/`url` against your working directory and the "
24+
"code you've already observed."
25+
)
26+
27+
_DATASOURCES_EMPTY_HINT = (
28+
"No data sources found. Add a repository or workspace to CodeAlive at "
29+
"https://app.codealive.ai before calling search or chat tools. If you "
30+
"expected sources here, retry with alive_only=false to surface ones still "
31+
"being indexed."
32+
)
33+
34+
1535
# alive_only actually means ready_only; the name is left as is for backward compatibility.
16-
async def get_data_sources(ctx: Context, alive_only: bool = True) -> Any:
36+
async def get_data_sources(ctx: Context, alive_only: bool = True) -> Dict[str, Any]:
1737
"""
1838
**CALL THIS FIRST**: Gets all available data sources (repositories and workspaces) for the user's account.
1939
@@ -29,7 +49,9 @@ async def get_data_sources(ctx: Context, alive_only: bool = True) -> Any:
2949
If False, returns all data sources regardless of processing state.
3050
3151
Returns:
32-
A compact JSON array of available data sources with the following fields for each:
52+
{"dataSources": [...], "hint": "..."}
53+
54+
Each entry in `dataSources` carries:
3355
- id: Unique identifier for the data source
3456
- name: Human-readable name - CRITICAL for matching with current working directory name
3557
- description: Summary of codebase contents - CRITICAL for identifying if this matches your
@@ -38,6 +60,9 @@ async def get_data_sources(ctx: Context, alive_only: bool = True) -> Any:
3860
- url: Repository URL (for Repository type only) - useful for matching with git remote
3961
- state: The processing state of the data source (if alive_only=false)
4062
63+
The `hint` field reminds you how to use the result and how to distinguish
64+
the CURRENT repository from EXTERNAL ones.
65+
4166
Use name + description + url together to determine if a repository is the CURRENT one
4267
you're working in versus an EXTERNAL repository.
4368
@@ -107,21 +132,16 @@ async def get_data_sources(ctx: Context, alive_only: bool = True) -> Any:
107132
# Parse and format the response
108133
data_sources = response.json()
109134

110-
# Empty result: return a dict carrying a recovery message instead of a bare
111-
# list, so the LLM has guidance when nothing is indexed yet.
112135
if not data_sources or len(data_sources) == 0:
113-
return {
114-
"dataSources": [],
115-
"message": "No data sources found. Please add a repository or workspace to CodeAlive before using this API.",
116-
}
136+
return {"dataSources": [], "hint": _DATASOURCES_EMPTY_HINT}
117137

118138
# Remove repositoryIds from workspace data sources
119139
for data_source in data_sources:
120140
if data_source.get("type") == "Workspace" and "repositoryIds" in data_source:
121141
del data_source["repositoryIds"]
122142

123143
# FastMCP serializes via pydantic_core.to_json, which preserves UTF-8.
124-
return data_sources
144+
return {"dataSources": data_sources, "hint": _DATASOURCES_HINT}
125145

126146
except (httpx.HTTPStatusError, Exception) as e:
127147
await handle_api_error(

0 commit comments

Comments
 (0)