Skip to content

Commit dbc81e5

Browse files
rodion-mclaude
andcommitted
Return dict from get_data_sources and get_artifact_relationships
The April refactor that switched search tools to dict (commit 28f4681) missed these two — they kept calling json.dumps(...) without ensure_ascii=False, so Cyrillic and other non-ASCII content reached clients escaped as \uXXXX sequences. Returning the dict (or list) directly lets FastMCP serialize through pydantic_core.to_json, which preserves UTF-8. Also tighten the convention in CLAUDE.md: only fetch_artifacts returns XML; every other tool returns dict/list. Add an explicit "never call json.dumps from a tool's return path" rule with the rationale, plus e2e UTF-8 round-trip tests for the four metadata tools so this regression is caught next time. Drop the unused transform_*_response_to_json aliases. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent e808d9e commit dbc81e5

8 files changed

Lines changed: 167 additions & 85 deletions

CLAUDE.md

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ This is a Model Context Protocol (MCP) server that provides AI clients with acce
118118
2. Client calls tools (`get_data_sources``semantic_search` / `grep_search``fetch_artifacts` / `get_artifact_relationships``chat` only if synthesis is still needed)
119119
3. Middleware chain runs: N8N cleanup → ObservabilityMiddleware (OTel span + log correlation)
120120
4. Tool translates MCP call to CodeAlive API request (with `X-CodeAlive-*` headers)
121-
5. Response parsed, formatted as XML or text, returned to AI client
121+
5. Response parsed and returned to the AI client — as a `dict` for metadata/discovery tools, as an XML string for `fetch_artifacts`, or as plain text for `chat`
122122

123123
### Environment Variables
124124

@@ -205,18 +205,26 @@ When adding a new tool, ensure:
205205

206206
## Tool Response Conventions
207207

208-
### Response format: dict for metadata, XML for content
209-
210-
Tools that return **search metadata** (identifiers, match counts, line numbers)
211-
return a `dict`. FastMCP serializes it automatically via `pydantic_core.to_json`,
212-
which preserves Unicode — no manual `json.dumps()` needed. Examples:
213-
`semantic_search`, `grep_search`, `codebase_search`.
214-
215-
Tools that return **source code content** return an **XML string**. XML tags give
216-
the LLM clear structural boundaries between artifacts, content blocks, and
217-
relationships — this is critical for accurate reasoning over multi-artifact
218-
responses. **Do not convert `fetch_artifacts` or `get_artifact_relationships`
219-
to dict/JSON** — the XML structure is intentional.
208+
### Response format: dict for metadata/discovery, XML only for source code
209+
210+
Tools that return **structured metadata** (identifiers, match counts, line
211+
numbers, relationship groups, data source listings) return a `dict` (or list of
212+
dicts). FastMCP serializes it automatically via `pydantic_core.to_json`, which
213+
preserves Unicode — no manual `json.dumps()` needed. Examples:
214+
`semantic_search`, `grep_search`, `codebase_search`, `get_data_sources`,
215+
`get_artifact_relationships`.
216+
217+
**Never call `json.dumps(...)` from a tool's return path.** Python's `json.dumps`
218+
defaults to `ensure_ascii=True` and escapes Cyrillic/CJK/etc. to `\uXXXX`.
219+
Returning a `dict` lets FastMCP route through `pydantic_core.to_json`, which
220+
emits UTF-8. If you must serialize manually for some reason, pass
221+
`ensure_ascii=False` explicitly.
222+
223+
Only `fetch_artifacts` returns an **XML string**. XML tags give the LLM clear
224+
structural boundaries between artifacts, content blocks, and inline
225+
relationships when streaming source code — this is critical for accurate
226+
reasoning over multi-artifact responses. **Do not convert `fetch_artifacts` to
227+
dict/JSON** — the XML structure is intentional.
220228

221229
### Hint other MCP tools when the response implies a follow-up call
222230

src/tests/test_artifact_relationships.py

Lines changed: 13 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
"""Tests for the get_artifact_relationships tool."""
22

3-
import json
4-
53
import pytest
64
from unittest.mock import AsyncMock, MagicMock, patch
75

@@ -10,7 +8,7 @@
108

119
from tools.artifact_relationships import (
1210
PROFILE_MAP,
13-
_build_relationships_json,
11+
_build_relationships_dict,
1412
get_artifact_relationships,
1513
)
1614

@@ -32,8 +30,8 @@ def test_references_only_maps_correctly(self):
3230
assert PROFILE_MAP["referencesOnly"] == "ReferencesOnly"
3331

3432

35-
class TestBuildRelationshipsJson:
36-
"""Test compact JSON rendering of relationship responses."""
33+
class TestBuildRelationshipsDict:
34+
"""Test dict shape of relationship responses (FastMCP handles serialization)."""
3735

3836
def test_found_with_grouped_relationships(self):
3937
data = {
@@ -71,11 +69,7 @@ def test_found_with_grouped_relationships(self):
7169
],
7270
}
7371

74-
result = _build_relationships_json(data)
75-
# Compact JSON
76-
assert ", " not in result and ": " not in result
77-
78-
parsed = json.loads(result)
72+
parsed = _build_relationships_dict(data)
7973
assert parsed["sourceIdentifier"] == "org/repo::path::Symbol"
8074
assert parsed["profile"] == "callsOnly"
8175
assert parsed["found"] is True
@@ -103,7 +97,7 @@ def test_not_found_omits_relationships(self):
10397
"relationships": [],
10498
}
10599

106-
parsed = json.loads(_build_relationships_json(data))
100+
parsed = _build_relationships_dict(data)
107101
assert parsed["found"] is False
108102
assert "relationships" not in parsed
109103

@@ -130,7 +124,7 @@ def test_empty_groups_still_rendered(self):
130124
],
131125
}
132126

133-
parsed = json.loads(_build_relationships_json(data))
127+
parsed = _build_relationships_dict(data)
134128
types = [g["type"] for g in parsed["relationships"]]
135129
assert types == ["ancestors", "descendants"]
136130
for g in parsed["relationships"]:
@@ -158,14 +152,15 @@ def test_optional_fields_omitted_when_null(self):
158152
],
159153
}
160154

161-
parsed = json.loads(_build_relationships_json(data))
155+
parsed = _build_relationships_dict(data)
162156
item = parsed["relationships"][0]["items"][0]
163157
assert item["identifier"] == "org/repo::path::Target"
164158
assert "filePath" not in item
165159
assert "startLine" not in item
166160
assert "shortSummary" not in item
167161

168-
def test_quotes_and_specials_use_json_escaping(self):
162+
def test_quotes_and_specials_pass_through_unchanged(self):
163+
"""Special chars (<, >, &, ") are preserved as-is in the dict — no HTML encoding."""
169164
data = {
170165
"sourceIdentifier": "org/repo::path::Class<T>",
171166
"profile": "CallsOnly",
@@ -186,13 +181,7 @@ def test_quotes_and_specials_use_json_escaping(self):
186181
],
187182
}
188183

189-
result = _build_relationships_json(data)
190-
# No HTML entity encoding in JSON output
191-
assert "&quot;" not in result
192-
assert "&amp;" not in result
193-
assert "&lt;" not in result
194-
195-
parsed = json.loads(result)
184+
parsed = _build_relationships_dict(data)
196185
assert parsed["sourceIdentifier"] == "org/repo::path::Class<T>"
197186
assert parsed["relationships"][0]["items"][0]["identifier"] == "org/repo::path::Method<T>"
198187
assert parsed["relationships"][0]["items"][0]["shortSummary"] == 'Returns "value" & more'
@@ -206,7 +195,7 @@ def test_profile_mapped_back_to_mcp_name(self):
206195
"found": False,
207196
"relationships": [],
208197
}
209-
parsed = json.loads(_build_relationships_json(data))
198+
parsed = _build_relationships_dict(data)
210199
assert parsed["profile"] == mcp_name
211200

212201

@@ -247,7 +236,7 @@ async def test_default_profile_sends_calls_only(self, mock_get_api_key):
247236
# Verify the API was called with CallsOnly profile
248237
call_args = mock_client.post.call_args
249238
assert call_args[1]["json"]["profile"] == "CallsOnly"
250-
assert json.loads(result)["found"] is True
239+
assert result["found"] is True
251240

252241
@pytest.mark.asyncio
253242
@patch("tools.artifact_relationships.get_api_key_from_context")
@@ -353,8 +342,7 @@ async def test_not_found_response_renders_correctly(self, mock_get_api_key):
353342
mock_context.base_url = "https://app.codealive.ai"
354343
ctx.request_context.lifespan_context = mock_context
355344

356-
result = await get_artifact_relationships(ctx=ctx, identifier="org/repo::path::Missing")
345+
data = await get_artifact_relationships(ctx=ctx, identifier="org/repo::path::Missing")
357346

358-
data = json.loads(result)
359347
assert data["found"] is False
360348
assert "relationships" not in data

src/tests/test_datasources.py

Lines changed: 5 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
"""Tests for data sources tool."""
22

3-
import json
43
from unittest.mock import AsyncMock, MagicMock, patch
54

65
import pytest
@@ -50,11 +49,8 @@ async def test_get_data_sources_removes_repository_ids_from_workspaces(mock_get_
5049

5150
mock_ctx.request_context.lifespan_context = mock_lifespan_context
5251

53-
# Call the function
54-
result = await get_data_sources(mock_ctx, alive_only=True)
55-
56-
# Result is a compact JSON array
57-
data_sources = json.loads(result)
52+
# Tool returns the parsed list directly; FastMCP serializes it.
53+
data_sources = await get_data_sources(mock_ctx, alive_only=True)
5854

5955
# Verify repository still has all fields
6056
repo = next(ds for ds in data_sources if ds["type"] == "Repository")
@@ -116,11 +112,7 @@ async def test_get_data_sources_preserves_other_workspace_fields(mock_get_api_ke
116112

117113
mock_ctx.request_context.lifespan_context = mock_lifespan_context
118114

119-
# Call the function
120-
result = await get_data_sources(mock_ctx, alive_only=True)
121-
122-
# Result is compact JSON array
123-
data_sources = json.loads(result)
115+
data_sources = await get_data_sources(mock_ctx, alive_only=True)
124116

125117
workspace = data_sources[0]
126118

@@ -167,11 +159,8 @@ async def test_get_data_sources_handles_missing_repository_ids(mock_get_api_key)
167159

168160
mock_ctx.request_context.lifespan_context = mock_lifespan_context
169161

170-
# Call the function - should not raise an error
171-
result = await get_data_sources(mock_ctx, alive_only=True)
172-
173-
# Result is compact JSON array
174-
data_sources = json.loads(result)
162+
# Should not raise an error
163+
data_sources = await get_data_sources(mock_ctx, alive_only=True)
175164

176165
# Verify workspace is intact
177166
workspace = data_sources[0]

src/tests/test_e2e_tools.py

Lines changed: 107 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,25 @@ async def test_empty_list_returns_message(self):
133133
assert data["dataSources"] == []
134134
assert "No data sources found" in data["message"]
135135

136+
@pytest.mark.asyncio
137+
async def test_unicode_preserved_in_response(self):
138+
"""Cyrillic in name/description must survive as UTF-8, not \\uXXXX."""
139+
payload = [
140+
{"id": "r1", "name": "кирилл-репо", "type": "Repository",
141+
"description": "Описание про принтеры HPRT"},
142+
]
143+
144+
mcp = _server({"/api/datasources/ready": lambda r: httpx.Response(200, json=payload)})
145+
async with Client(mcp) as client:
146+
result = await client.call_tool("get_data_sources", {})
147+
148+
text = _text(result)
149+
# Round-trip via ensure_ascii=False — ASCII-escaped output would mismatch.
150+
assert text == json.dumps(json.loads(text), separators=(",", ":"), ensure_ascii=False)
151+
assert "кирилл-репо" in text
152+
assert "Описание про принтеры HPRT" in text
153+
assert "\\u04" not in text
154+
136155
@pytest.mark.asyncio
137156
async def test_alive_only_false_hits_all_endpoint(self):
138157
hit = []
@@ -497,6 +516,32 @@ async def test_404_includes_recovery_hint(self):
497516
assert result.is_error
498517
assert "get_data_sources" in text
499518

519+
@pytest.mark.asyncio
520+
async def test_unicode_preserved_in_response(self):
521+
"""Cyrillic in path/description must survive as UTF-8, not \\uXXXX."""
522+
payload = {
523+
"results": [
524+
{
525+
"kind": "File",
526+
"identifier": "org/repo::база/file.md::",
527+
"description": "Описание про принтеры HPRT",
528+
"location": {"path": "база/file.md"},
529+
"contentByteSize": 100,
530+
}
531+
]
532+
}
533+
534+
mcp = _server({"/api/search/semantic": lambda r: httpx.Response(200, json=payload)})
535+
async with Client(mcp) as client:
536+
result = await client.call_tool(
537+
"semantic_search", {"query": "кир", "data_sources": ["repo"]},
538+
)
539+
540+
text = _text(result)
541+
assert text == json.dumps(json.loads(text), separators=(",", ":"), ensure_ascii=False)
542+
assert "база/file.md" in text
543+
assert "\\u04" not in text
544+
500545

501546
# ---------------------------------------------------------------------------
502547
# grep_search
@@ -699,6 +744,32 @@ async def test_404_includes_recovery_hint(self):
699744
assert result.is_error
700745
assert "get_data_sources" in _text(result)
701746

747+
@pytest.mark.asyncio
748+
async def test_unicode_preserved_in_response(self):
749+
"""Cyrillic in path/lineText must survive as UTF-8, not \\uXXXX."""
750+
payload = {
751+
"results": [
752+
{
753+
"kind": "File",
754+
"identifier": "org/repo::база/file.md::",
755+
"location": {"path": "база/file.md"},
756+
"matchCount": 1,
757+
"matches": [{"lineNumber": 3, "startColumn": 1, "endColumn": 5,
758+
"lineText": "тест кириллица"}],
759+
}
760+
]
761+
}
762+
mcp = _server({"/api/search/grep": lambda r: httpx.Response(200, json=payload)})
763+
async with Client(mcp) as client:
764+
result = await client.call_tool(
765+
"grep_search", {"query": "кир", "data_sources": ["repo"]},
766+
)
767+
768+
text = _text(result)
769+
assert text == json.dumps(json.loads(text), separators=(",", ":"), ensure_ascii=False)
770+
assert "тест кириллица" in text
771+
assert "\\u04" not in text
772+
702773

703774
# ---------------------------------------------------------------------------
704775
# fetch_artifacts
@@ -1037,8 +1108,9 @@ def handler(req):
10371108
)
10381109

10391110
text = _text(result)
1040-
assert ", " not in text and ": " not in text
10411111
data = json.loads(text)
1112+
# FastMCP serializes via pydantic_core.to_json — compact, UTF-8.
1113+
assert text == json.dumps(data, separators=(",", ":"), ensure_ascii=False)
10421114
assert data["found"] is True
10431115
types = [g["type"] for g in data["relationships"]]
10441116
assert "outgoing_calls" in types
@@ -1221,6 +1293,40 @@ def handler(req):
12211293
assert data["relationships"][0]["type"] == "references"
12221294
assert data["relationships"][0]["totalCount"] == 5
12231295

1296+
@pytest.mark.asyncio
1297+
async def test_unicode_preserved_in_response(self):
1298+
"""Cyrillic in identifiers/summaries must survive as UTF-8, not \\uXXXX."""
1299+
response_data = {
1300+
"sourceIdentifier": "org/repo::файл.cs::Класс.Метод",
1301+
"profile": "CallsOnly",
1302+
"found": True,
1303+
"relationships": [
1304+
{
1305+
"relationType": "OutgoingCalls",
1306+
"totalCount": 1,
1307+
"returnedCount": 1,
1308+
"truncated": False,
1309+
"items": [{"identifier": "org/repo::другой.cs::Метод2",
1310+
"filePath": "другой.cs",
1311+
"shortSummary": "Описание метода"}],
1312+
}
1313+
],
1314+
}
1315+
mcp = _server({
1316+
"/api/search/artifact-relationships": lambda r: httpx.Response(200, json=response_data),
1317+
})
1318+
async with Client(mcp) as client:
1319+
result = await client.call_tool(
1320+
"get_artifact_relationships",
1321+
{"identifier": "org/repo::файл.cs::Класс.Метод"},
1322+
)
1323+
1324+
text = _text(result)
1325+
assert text == json.dumps(json.loads(text), separators=(",", ":"), ensure_ascii=False)
1326+
assert "Класс.Метод" in text
1327+
assert "Описание метода" in text
1328+
assert "\\u04" not in text
1329+
12241330
@pytest.mark.asyncio
12251331
async def test_inheritance_profile_maps_correctly(self):
12261332
response_data = {

0 commit comments

Comments
 (0)