Skip to content

Commit 9be7ad3

Browse files
rodion-mclaude
andcommitted
fix(fetch_artifacts): surface not-found identifiers instead of dropping them
_build_artifacts_xml skipped artifacts with no content (`if content is None: continue`), so when the backend could not resolve some requested identifiers the tool returned only the found ones with no signal — the agent never noticed the gap and answered without the missing article. Now collect the misses (graceful degradation: prefer the backend `found` flag, fall back to `content is None`) plus a backstop over the requested identifiers the backend never echoed, and emit a `<not_found count=N>` block listing each concrete identifier with a hint telling the agent to re-check those exact ids and retry fetch for the problematic ones. Found-but-empty renders as a normal artifact. XML attributes (identifier, summary, data_source) are now escaped to neutralize injection via attacker-influenced values. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 7a3bde3 commit 9be7ad3

5 files changed

Lines changed: 260 additions & 29 deletions

File tree

CLAUDE.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -297,6 +297,11 @@ Examples in this repo:
297297
whenever an artifact has call relationships, telling the agent it can drill
298298
down further. Implementation: `_build_artifacts_xml` in
299299
`src/tools/fetch_artifacts.py`.
300+
- `fetch_artifacts` also emits a `<not_found count="N">` block plus a `<hint>`
301+
(`_NOT_FOUND_HINT`) whenever a requested identifier is missing or inaccessible —
302+
it relies on the backend `found` flag, falling back to `content is null` and a
303+
requested-vs-returned diff so it never silently drops a requested artifact.
304+
Implementation: `_build_artifacts_xml` in `src/tools/fetch_artifacts.py`.
300305

301306
When you add or change a tool whose output is structurally a "pointer" to data
302307
held by another tool (identifiers, IDs, references), add or update the hint in

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ Once connected, you'll have access to these powerful tools:
2828
1. **`get_data_sources`** - List your indexed repositories and workspaces
2929
2. **`semantic_search`** - Canonical semantic search across indexed artifacts
3030
3. **`grep_search`** - Exact literal or regex text search inside file content, plus literal file-name/path matching (returns files like `Form.xml` even when their content never mentions the name), with line-level previews for content matches
31-
4. **`fetch_artifacts`** - Load the full source for relevant search hits
31+
4. **`fetch_artifacts`** - Load the full source for relevant search hits (missing or inaccessible identifiers are reported back in a `<not_found>` block, not silently dropped)
3232
5. **`get_artifact_relationships`** - Expand call graph, inheritance, and reference relationships for one artifact
3333
6. **`chat`** - Slower synthesized codebase Q&A, typically only after search
3434
7. **`codebase_search`** - Deprecated legacy semantic search alias kept for backward compatibility

src/tests/test_e2e_tools.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -814,6 +814,47 @@ def handler(req):
814814
assert "<content>\n" in xml
815815
assert "\n </content>" in xml
816816

817+
@pytest.mark.asyncio
818+
async def test_not_found_surfaced_with_found_flag(self):
819+
payload = {
820+
"artifacts": [
821+
{"identifier": "org/repo::src/auth.py::AuthService", "found": True,
822+
"content": "class AuthService:\n pass\n", "contentByteSize": 28, "startLine": 10},
823+
{"identifier": "org/repo::src/missing.py::Gone", "found": False, "content": None},
824+
]
825+
}
826+
mcp = _server({"/api/search/artifacts": lambda r: httpx.Response(200, json=payload)})
827+
async with Client(mcp) as client:
828+
result = await client.call_tool(
829+
"fetch_artifacts",
830+
{"identifiers": ["org/repo::src/auth.py::AuthService", "org/repo::src/missing.py::Gone"]},
831+
)
832+
833+
xml = _text(result)
834+
assert "<not_found" in xml
835+
assert 'identifier="org/repo::src/missing.py::Gone"' in xml
836+
assert "could not be fetched" in xml.lower()
837+
# the found one is still rendered
838+
assert "class AuthService" in xml
839+
840+
@pytest.mark.asyncio
841+
async def test_not_found_surfaced_legacy_null_content(self):
842+
# Older backend without the `found` flag: null content still surfaces as not-found.
843+
payload = {
844+
"artifacts": [
845+
{"identifier": "org/repo::src/missing.py::Gone", "content": None, "contentByteSize": None},
846+
]
847+
}
848+
mcp = _server({"/api/search/artifacts": lambda r: httpx.Response(200, json=payload)})
849+
async with Client(mcp) as client:
850+
result = await client.call_tool(
851+
"fetch_artifacts", {"identifiers": ["org/repo::src/missing.py::Gone"]},
852+
)
853+
854+
xml = _text(result)
855+
assert "<not_found" in xml
856+
assert 'identifier="org/repo::src/missing.py::Gone"' in xml
857+
817858
@pytest.mark.asyncio
818859
async def test_empty_identifiers_returns_error(self):
819860
mcp = _server({})

src/tests/test_fetch_artifacts.py

Lines changed: 124 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,11 @@ def test_relationships_call_without_summary_omits_summary_attr(self):
204204
assert 'identifier="repo::src/b.ts::FuncB"/>' in result
205205
assert 'summary' not in result.split('FuncB')[1].split('/>')[0]
206206

207-
def test_relationships_summary_kept_raw(self):
207+
def test_relationships_summary_xml_escaped(self):
208+
# Summaries are AI-generated text placed in an XML *attribute*; like identifiers they must
209+
# be escaped so a crafted summary cannot break out of the attribute and inject pseudo-XML
210+
# into the model context. (Source-code *content* stays raw — that is the <content> body,
211+
# not an attribute; see test_content_is_not_html_escaped.)
208212
data = {"artifacts": [{
209213
"identifier": "repo::src/a.ts::FuncA",
210214
"content": "code",
@@ -219,10 +223,29 @@ def test_relationships_summary_kept_raw(self):
219223
}
220224
}]}
221225
result = _build_artifacts_xml(data)
222-
# Raw special chars preserved (no HTML escaping)
223-
assert "Checks if x < 10 & y > 5" in result
224-
assert "&lt;" not in result
225-
assert "&amp;" not in result
226+
# Special chars in the attribute are escaped, and the raw unescaped form is gone.
227+
assert "&lt;" in result
228+
assert "&amp;" in result
229+
assert "&gt;" in result
230+
assert "x < 10 & y > 5" not in result
231+
232+
def test_relationships_summary_injection_is_neutralized(self):
233+
# A crafted summary must not break out of the attribute and inject pseudo-XML.
234+
data = {"artifacts": [{
235+
"identifier": "repo::src/a.ts::FuncA",
236+
"content": "code",
237+
"contentByteSize": 4,
238+
"relationships": {
239+
"outgoingCallsCount": 1,
240+
"outgoingCalls": [
241+
{"identifier": "repo::src/b.ts::FuncB", "summary": '"/><injected>x</injected><call summary="'},
242+
],
243+
"incomingCallsCount": None,
244+
"incomingCalls": None,
245+
}
246+
}]}
247+
result = _build_artifacts_xml(data)
248+
assert "<injected>" not in result
226249

227250

228251
class TestHasAnyCalls:
@@ -357,12 +380,16 @@ def test_hint_appears_once_with_multiple_artifacts(self):
357380
assert result.count("<hint>") == 1
358381
assert result.count(self.HINT_MARKER) == 1
359382

360-
def test_hint_absent_when_no_artifacts_have_content(self):
383+
def test_relationships_hint_absent_but_missing_surfaced_when_no_content(self):
384+
# A not-found artifact produces no relationships preview hint (nothing to drill
385+
# into), but it must still be surfaced — not silently dropped.
361386
data = {"artifacts": [
362387
{"identifier": "repo::missing.ts::Func", "content": None, "contentByteSize": None},
363388
]}
364389
result = _build_artifacts_xml(data)
365-
assert "<hint>" not in result
390+
assert self.HINT_MARKER not in result
391+
assert "<not_found" in result
392+
assert "repo::missing.ts::Func" in result
366393

367394

368395
class TestBuildArtifactsXmlDataSourceMissHint:
@@ -390,12 +417,96 @@ def test_no_miss_hint_when_data_source_resolved_content(self):
390417
result = _build_artifacts_xml(data, data_source="backend")
391418
assert "omit data_source" not in result
392419

393-
def test_no_miss_hint_without_data_source(self):
420+
def test_no_data_source_miss_hint_without_data_source(self):
421+
# Without a data_source selector there is no data-source-specific recovery hint,
422+
# but the missing artifact is still surfaced in a <not_found> block.
394423
data = {"artifacts": [
395424
{"identifier": "repo::a.ts::F", "content": None, "contentByteSize": None},
396425
]}
397426
result = _build_artifacts_xml(data)
398-
assert "<hint>" not in result
427+
assert "omit data_source" not in result
428+
assert "<not_found" in result
429+
430+
431+
class TestBuildArtifactsXmlNotFound:
432+
"""Requested identifiers the backend could not resolve must be surfaced, not dropped."""
433+
434+
HINT = "could not be fetched"
435+
436+
def test_explicit_found_false_goes_to_not_found_block(self):
437+
data = {"artifacts": [
438+
{"identifier": "repo::a.ts::F", "found": True, "content": "code", "contentByteSize": 4},
439+
{"identifier": "repo::missing.ts::G", "found": False, "content": None},
440+
]}
441+
result = _build_artifacts_xml(data)
442+
assert '<not_found count="1">' in result
443+
assert 'identifier="repo::missing.ts::G"' in result
444+
# the found sibling is still rendered as a normal artifact
445+
assert '<artifact identifier="repo::a.ts::F"' in result
446+
assert self.HINT in result.lower()
447+
448+
def test_legacy_null_content_without_found_flag_is_surfaced(self):
449+
data = {"artifacts": [
450+
{"identifier": "repo::missing.ts::G", "content": None},
451+
]}
452+
result = _build_artifacts_xml(data)
453+
assert "<not_found" in result
454+
assert 'identifier="repo::missing.ts::G"' in result
455+
456+
def test_found_true_empty_content_renders_as_artifact_not_missing(self):
457+
data = {"artifacts": [
458+
{"identifier": "repo::a.ts::F", "found": True, "content": ""},
459+
]}
460+
result = _build_artifacts_xml(data)
461+
assert '<artifact identifier="repo::a.ts::F"' in result
462+
assert "<not_found" not in result
463+
464+
def test_found_true_null_content_renders_as_artifact_not_missing(self):
465+
data = {"artifacts": [
466+
{"identifier": "repo::a.ts::F", "found": True, "content": None},
467+
]}
468+
result = _build_artifacts_xml(data)
469+
assert '<artifact identifier="repo::a.ts::F"' in result
470+
assert "<not_found" not in result
471+
472+
def test_all_found_emits_no_not_found_block(self):
473+
data = {"artifacts": [
474+
{"identifier": "repo::a.ts::F", "found": True, "content": "code", "contentByteSize": 4},
475+
]}
476+
result = _build_artifacts_xml(data)
477+
assert "<not_found" not in result
478+
479+
def test_requested_backstop_surfaces_id_backend_never_echoed(self):
480+
data = {"artifacts": [
481+
{"identifier": "repo::a.ts::F", "found": True, "content": "code", "contentByteSize": 4},
482+
]}
483+
result = _build_artifacts_xml(data, requested=["repo::a.ts::F", "repo::ghost.ts::Z"])
484+
assert '<not_found count="1">' in result
485+
assert 'identifier="repo::ghost.ts::Z"' in result
486+
487+
def test_not_found_count_matches_rows(self):
488+
data = {"artifacts": [
489+
{"identifier": "repo::m1::A", "found": False, "content": None},
490+
{"identifier": "repo::m2::B", "found": False, "content": None},
491+
]}
492+
result = _build_artifacts_xml(data)
493+
assert '<not_found count="2">' in result
494+
assert 'identifier="repo::m1::A"' in result
495+
assert 'identifier="repo::m2::B"' in result
496+
497+
def test_identifiers_are_xml_escaped(self):
498+
# Crafted identifiers (caller/LLM-supplied, and any unmatched requested string lands in
499+
# <not_found> via the backstop) must not break out of the XML attribute and inject
500+
# pseudo-XML into the model context — neither in <artifact> nor <not_found>.
501+
data = {"artifacts": [
502+
{"identifier": 'r::ok"><x>::F', "found": True, "content": "code", "contentByteSize": 4},
503+
{"identifier": 'r::bad"><injected>::G', "found": False, "content": None},
504+
]}
505+
result = _build_artifacts_xml(data)
506+
assert "<injected>" not in result
507+
assert "<x>" not in result
508+
assert "&quot;&gt;&lt;injected&gt;" in result
509+
assert "&quot;&gt;&lt;x&gt;" in result
399510

400511

401512
@pytest.mark.asyncio
@@ -450,8 +561,10 @@ async def test_fetch_artifacts_returns_xml(mock_get_api_key):
450561
assert "2 | return True" in result
451562
assert 'contentByteSize="38"' in result
452563
assert 'identifier="owner/repo::src/auth.py::login"' in result
453-
# Not-found artifact is skipped (not in output)
454-
assert "missing.py" not in result
564+
# Not-found artifact is surfaced in a <not_found> block, not silently dropped.
565+
assert "<not_found" in result
566+
assert 'identifier="owner/repo::src/missing.py::func"' in result
567+
assert "could not be fetched" in result.lower()
455568

456569

457570
@pytest.mark.asyncio

0 commit comments

Comments
 (0)