Skip to content

Commit fb8abec

Browse files
illeatmyhat and claude committed
test(platform-integrations): end-to-end chain test + run provenance/e2e guards in CI
Add test_end_to_end_claude.py: drives the REAL rendered Claude scripts as subprocesses in sequence (save->adapt->audit->provenance->record) and asserts the entity id stays identical across adapt_memory, audit_recall, and provenance, that the native-transcript locator resolves, and that record+dedup closes the loop. A second test asserts gaps are surfaced (missing entity/trajectory) not dropped. No production code needed — the chain closes as built. Also drop the e2e marker from test_provenance.py and the new chain test: CI runs pytest with the default '-m not llm and not e2e' filter, so e2e-marked tests never execute in CI. These are sandboxed and fast (no real CLI/network), so they belong in the default suite as guards — matching test_doctor/test_entity_io_core. Default suite: 242 -> 252. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent faf2021 commit fb8abec

2 files changed

Lines changed: 290 additions & 1 deletion

File tree

Lines changed: 289 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,289 @@
1+
"""End-to-end data-flow test for the rendered Claude evolve-lite scripts.
2+
3+
This is the ONE integration test that proves the correlation ids line up across
4+
the whole chain on Claude — the integration that was broken in the pre-redesign
5+
world (native transcript path vs. entity id) and the reason the hookless redesign
6+
exists. It drives the REAL rendered Claude scripts as subprocesses, in sequence,
7+
with nothing mocked in the data flow:
8+
9+
adapt_memory.py -> mirrors a native memory into the evolve store, emitting
10+
the entity id ``feedback/prefer-ripgrep``.
11+
audit_recall.py -> records a ``recall`` row keyed by that exact entity id
12+
and the host session id.
13+
provenance.py -> reads the recall row, resolves the mirrored entity AND
14+
the NATIVE Claude transcript, and emits exactly one
15+
candidate whose ids line up end to end.
16+
provenance.py -> records a ``followed`` verdict, then dedups the pair.
17+
18+
Lib resolution (``lib/evolve-lite/entity_io.py``) only works in the rendered
19+
tree, so we point at the rendered Claude copies under ``platform-integrations/``.
20+
21+
The scripts are driven as real subprocesses (closest to actual agent usage);
22+
nothing in the data flow is mocked.
23+
"""
24+
25+
import json
26+
import os
27+
import re
28+
import subprocess
29+
import sys
30+
from pathlib import Path
31+
32+
import pytest
33+
34+
# Module-wide marker. No ``e2e`` marker is applied, so CI's default
# '-m not llm and not e2e' filter does not deselect these tests.
pytestmark = [pytest.mark.platform_integrations]
35+
36+
# Three directory levels up from this test module. Resolved to an absolute
# path because the scripts below are launched with a different cwd, so a
# relative ``__file__`` would otherwise yield script paths that do not exist
# from the child's point of view.
_REPO_ROOT = Path(__file__).resolve().parent.parent.parent
# Rendered Claude plugin tree that holds the scripts under test.
_PLUGIN = _REPO_ROOT / "platform-integrations/claude/plugins/evolve-lite"
ADAPT_SCRIPT = _PLUGIN / "skills/evolve-lite/adapt-memory/scripts/adapt_memory.py"
AUDIT_SCRIPT = _PLUGIN / "scripts/audit_recall.py"
PROVENANCE_SCRIPT = _PLUGIN / "skills/evolve-lite/provenance/scripts/provenance.py"

# Session id used for the audit row and for the native transcript file name.
SID = "claude-e2e-session-0001"
43+
44+
# Native Claude memory fixture: YAML front matter followed by the memory body.
# NOTE(review): ``type`` is assumed to nest under ``metadata`` (two-space
# indent) — confirm against the memory format adapt_memory.py expects.
NATIVE_MEMORY = """\
---
name: prefer-ripgrep
description: use ripgrep over grep
metadata:
  type: feedback
---
Always reach for ripgrep (rg) instead of grep.
"""
53+
54+
55+
def _claude_slug(root: Path) -> str:
    """Return the Claude project slug for *root*.

    Mirrors provenance.py / doctor.py slugging: every character of
    ``str(root)`` that is not an ASCII letter or digit becomes ``-``.
    """
    return re.sub(r"[^A-Za-z0-9]", "-", str(root))
58+
59+
60+
def _run(script: Path, *args, evolve_dir: Path, home: Path, cwd: Path, stdin=None, sid=None, timeout=60.0):
    """Run a rendered Claude script as a real subprocess in the sandbox.

    Every host path is sandboxed: ``$EVOLVE_DIR`` points at the temp store,
    ``$HOME``/``$USERPROFILE`` at a sandboxed home, cwd at the temp project root,
    and ``$CLAUDE_CODE_SESSION_ID`` at a known SID when supplied.

    Args:
        script: Script to execute with the current interpreter.
        *args: Extra command-line arguments; each is converted with ``str``.
        evolve_dir: Value exported as ``$EVOLVE_DIR``.
        home: Value exported as ``$HOME`` and ``$USERPROFILE``.
        cwd: Working directory of the child process.
        stdin: Optional text piped to the child's standard input.
        sid: Session id exported as ``$CLAUDE_CODE_SESSION_ID``; when ``None``
            the variable is removed from the child's environment.
        timeout: Seconds to wait before ``subprocess.TimeoutExpired`` is
            raised, or ``None`` to wait indefinitely. Guards the suite against
            a script that never exits.

    Returns:
        The ``subprocess.CompletedProcess`` with ``stdout``/``stderr`` captured
        as text. A non-zero exit status does not raise (``check=False``).
    """
    env = dict(os.environ)
    env.update(
        EVOLVE_DIR=str(evolve_dir),
        HOME=str(home),
        USERPROFILE=str(home),
    )
    # Presumably removed so Windows home lookups cannot fall back to the real
    # profile via HOMEDRIVE/HOMEPATH — verify against the scripts' home logic.
    for name in ("HOMEDRIVE", "HOMEPATH"):
        env.pop(name, None)

    if sid is None:
        # Never leak a session id inherited from the developer's own shell.
        env.pop("CLAUDE_CODE_SESSION_ID", None)
    else:
        env["CLAUDE_CODE_SESSION_ID"] = sid

    command = [sys.executable, str(script), *(str(arg) for arg in args)]
    return subprocess.run(
        command,
        input=stdin,
        capture_output=True,
        text=True,
        cwd=str(cwd),
        env=env,
        check=False,
        timeout=timeout,
    )
86+
87+
88+
def _parse_jsonl(text: str) -> list:
    """Parse JSON Lines *text* into a list of decoded values.

    Blank and whitespace-only lines are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message names the 1-based line number and echoes the offending
            line, so stray non-JSON script output is easy to spot.
    """
    rows = []
    for lineno, line in enumerate(text.splitlines(), start=1):
        if not line.strip():
            continue
        try:
            rows.append(json.loads(line))
        except json.JSONDecodeError as err:
            # Same exception type as before; only the message gains context.
            raise json.JSONDecodeError(
                f"{err.msg} (JSONL line {lineno}: {line!r})", err.doc, err.pos
            ) from err
    return rows
90+
91+
92+
def _read_audit(evolve_dir: Path) -> list:
    """Return the rows of ``<evolve_dir>/audit.log`` parsed as JSON Lines.

    Returns an empty list when ``audit.log`` is not a regular file, e.g. when
    nothing has been written yet.
    """
    audit_log = evolve_dir / "audit.log"
    if not audit_log.is_file():
        return []
    return _parse_jsonl(audit_log.read_text(encoding="utf-8"))
97+
98+
99+
@pytest.fixture
def sandbox(tmp_path, sandbox_home):
    """Build the sandbox dirs the chain needs and return the salient paths.

    ``sandbox_home`` (autouse) already redirects ``$HOME``; we reuse it as the
    home that holds the native Claude transcript tree. The project root lives
    under tmp_path with its own ``.evolve`` store, kept separate from HOME so
    the native-transcript slug (derived from the project root) is exercised for
    real.

    Returns:
        A dict with the keys ``home``, ``project_root`` and ``evolve_dir``.
    """
    project_root = tmp_path / "proj"
    evolve_dir = project_root / ".evolve"
    # One call creates both the project root and its store.
    evolve_dir.mkdir(parents=True)
    return {
        "home": sandbox_home,
        "project_root": project_root,
        "evolve_dir": evolve_dir,
    }
118+
119+
120+
def test_chain_closes_ids_line_up(sandbox):
    """The whole chain closes: the entity adapt() creates is the entity audit()
    records is the entity provenance() resolves against the native transcript.

    Steps (each runs the real rendered script as a subprocess):
      1. save   — write the native Claude memory file.
      2. adapt  — mirror it; assert entities/feedback/prefer-ripgrep.md exists and
                  the printed entity id is ``feedback/prefer-ripgrep``.
      3. audit  — record a recall row for that exact entity id under the SID.
      4. native transcript — drop ~/.claude/projects/<slug>/<SID>.jsonl.
      5. candidates — assert EXACTLY ONE candidate whose entity_id ==
                  ``feedback/prefer-ripgrep``, whose excerpt holds the mirrored
                  content, whose trajectory_path is the native transcript, with
                  NO ``missing`` field (entity + trajectory both resolved). This
                  is the id-alignment assertion.
      6. record + dedup — pipe a ``followed`` verdict; assert an influence row is
                  appended; re-run candidates and assert it's now empty.
    """
    home = sandbox["home"]
    project_root = sandbox["project_root"]
    evolve_dir = sandbox["evolve_dir"]

    # --- 1. save: native memory file (Claude format) ------------------------
    native_file = project_root / "native_memory.md"
    native_file.write_text(NATIVE_MEMORY, encoding="utf-8")

    # --- 2. adapt: mirror native memory into the evolve store ---------------
    adapt = _run(
        ADAPT_SCRIPT,
        str(native_file),
        "--type",
        "feedback",
        "--trigger",
        "when searching code, prefer ripgrep",
        evolve_dir=evolve_dir,
        home=home,
        cwd=project_root,
    )
    assert adapt.returncode == 0, adapt.stderr

    mirrored = evolve_dir / "entities" / "feedback" / "prefer-ripgrep.md"
    assert mirrored.is_file(), f"adapt did not mirror the entity: {adapt.stdout}\n{adapt.stderr}"

    # Capture the entity id from adapt's stdout ("Entity id: <id>").
    id_lines = [ln for ln in adapt.stdout.splitlines() if ln.startswith("Entity id:")]
    assert id_lines, f"adapt did not print an entity id:\n{adapt.stdout}"
    adapted_entity_id = id_lines[0].split("Entity id:", 1)[1].strip()
    assert adapted_entity_id == "feedback/prefer-ripgrep"

    # --- 3. audit: record a recall row for that exact entity id -------------
    audit = _run(
        AUDIT_SCRIPT,
        adapted_entity_id,  # exactly as EVOLVE.md instructs the agent to pass it
        evolve_dir=evolve_dir,
        home=home,
        cwd=project_root,
        sid=SID,
    )
    assert audit.returncode == 0, audit.stderr

    recall_rows = [r for r in _read_audit(evolve_dir) if r.get("event") == "recall"]
    assert len(recall_rows) == 1, _read_audit(evolve_dir)
    assert recall_rows[0]["session_id"] == SID
    assert recall_rows[0]["entities"] == ["feedback/prefer-ripgrep"]

    # --- 4. native transcript fixture ---------------------------------------
    slug = _claude_slug(project_root)
    native_transcript = home / ".claude" / "projects" / slug / f"{SID}.jsonl"
    # exist_ok: tolerate the directory having been created by an earlier step,
    # so the fixture setup cannot fail with FileExistsError.
    native_transcript.parent.mkdir(parents=True, exist_ok=True)
    native_transcript.write_text(
        '{"type":"user","message":{"role":"user","content":"search the repo for TODOs"}}\n'
        '{"type":"assistant","message":{"role":"assistant","content":"Using rg to search."}}\n',
        encoding="utf-8",
    )

    # --- 5. candidates: the id-alignment assertion --------------------------
    cand_result = _run(
        PROVENANCE_SCRIPT,
        "candidates",
        evolve_dir=evolve_dir,
        home=home,
        cwd=project_root,
    )
    assert cand_result.returncode == 0, cand_result.stderr
    candidates = _parse_jsonl(cand_result.stdout)
    assert len(candidates) == 1, f"expected exactly one candidate, got: {candidates}"
    cand = candidates[0]

    # KEY ASSERTION: the entity adapt() created == the entity audit() recorded
    # == the entity provenance() resolved, and the native transcript located by
    # the resolved project-root slug lines up with the audited session id.
    assert cand["session_id"] == SID
    assert cand["entity_id"] == adapted_entity_id == "feedback/prefer-ripgrep"
    assert "Always reach for ripgrep (rg) instead of grep." in cand["entity_excerpt"]
    assert cand["trajectory_path"] == str(native_transcript)
    assert "rg to search" in cand["trajectory_excerpt"]
    assert "missing" not in cand, f"chain did not fully resolve: {cand}"

    # --- 6. record a verdict, then assert dedup -----------------------------
    verdict = {
        "session_id": SID,
        "entity": adapted_entity_id,
        "verdict": "followed",
        "evidence": "Assistant used rg (ripgrep) to search the repo.",
    }
    record = _run(
        PROVENANCE_SCRIPT,
        "record",
        evolve_dir=evolve_dir,
        home=home,
        cwd=project_root,
        stdin=json.dumps(verdict),
    )
    assert record.returncode == 0, record.stderr

    influence_rows = [r for r in _read_audit(evolve_dir) if r.get("event") == "influence"]
    assert len(influence_rows) == 1, _read_audit(evolve_dir)
    assert influence_rows[0]["session_id"] == SID
    assert influence_rows[0]["entity"] == "feedback/prefer-ripgrep"
    assert influence_rows[0]["verdict"] == "followed"

    # Re-run candidates: the judged pair is deduped -> nothing left.
    cand_again = _run(
        PROVENANCE_SCRIPT,
        "candidates",
        evolve_dir=evolve_dir,
        home=home,
        cwd=project_root,
    )
    assert cand_again.returncode == 0, cand_again.stderr
    assert _parse_jsonl(cand_again.stdout) == [], cand_again.stdout
251+
252+
253+
def test_candidates_surface_gaps_when_nothing_lines_up(sandbox):
    """Negative/robustness: when the audited entity id was NEVER mirrored AND no
    transcript exists, the candidate is still emitted with ``missing`` listing
    BOTH ``entity`` and ``trajectory`` — the chain surfaces gaps instead of
    silently dropping them.
    """
    home = sandbox["home"]
    project_root = sandbox["project_root"]
    evolve_dir = sandbox["evolve_dir"]

    # Record a recall for an entity id that was never adapted/mirrored, with no
    # native transcript on disk for the session.
    audit = _run(
        AUDIT_SCRIPT,
        "feedback/does-not-exist",
        evolve_dir=evolve_dir,
        home=home,
        cwd=project_root,
        sid="ghost-session-0002",
    )
    assert audit.returncode == 0, audit.stderr

    cand_result = _run(
        PROVENANCE_SCRIPT,
        "candidates",
        evolve_dir=evolve_dir,
        home=home,
        cwd=project_root,
    )
    assert cand_result.returncode == 0, cand_result.stderr
    candidates = _parse_jsonl(cand_result.stdout)
    assert len(candidates) == 1, candidates
    cand = candidates[0]

    # Each assertion carries the candidate so a failure shows what was emitted.
    assert cand["entity_id"] == "feedback/does-not-exist", cand
    assert cand["entity_excerpt"] is None, cand
    assert cand["trajectory_path"] is None, cand
    # .get(): an absent ``missing`` field fails as an assertion with context
    # instead of a bare KeyError.
    assert set(cand.get("missing", ())) == {"entity", "trajectory"}, cand

tests/platform_integrations/test_provenance.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
import pytest
1919

20-
pytestmark = [pytest.mark.platform_integrations, pytest.mark.e2e]
20+
pytestmark = [pytest.mark.platform_integrations]
2121

2222
_REPO_ROOT = Path(__file__).parent.parent.parent
2323
PROVENANCE_SCRIPT = _REPO_ROOT / "platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/provenance.py"

0 commit comments

Comments
 (0)