|
| 1 | +"""End-to-end data-flow test for the rendered Claude evolve-lite scripts. |
| 2 | +
|
| 3 | +This is the ONE integration test that proves the correlation ids line up across |
| 4 | +the whole chain on Claude — the integration that was broken in the pre-redesign |
| 5 | +world (native transcript path vs. entity id) and the reason the hookless redesign |
| 6 | +exists. It drives the REAL rendered Claude scripts as subprocesses, in sequence, |
| 7 | +with nothing mocked in the data flow: |
| 8 | +
|
| 9 | + adapt_memory.py -> mirrors a native memory into the evolve store, emitting |
| 10 | + the entity id ``feedback/prefer-ripgrep``. |
| 11 | + audit_recall.py -> records a ``recall`` row keyed by that exact entity id |
| 12 | + and the host session id. |
| 13 | + provenance.py -> reads the recall row, resolves the mirrored entity AND |
| 14 | + the NATIVE Claude transcript, and emits exactly one |
| 15 | + candidate whose ids line up end to end. |
| 16 | + provenance.py -> records a ``followed`` verdict, then dedups the pair. |
| 17 | +
|
| 18 | +Lib resolution (``lib/evolve-lite/entity_io.py``) only works in the rendered |
| 19 | +tree, so we point at the rendered Claude copies under ``platform-integrations/``. |
| 20 | +
|
| 21 | +The scripts are driven as real subprocesses (closest to actual agent usage); |
| 22 | +nothing in the data flow is mocked. |
| 23 | +""" |
| 24 | + |
| 25 | +import json |
| 26 | +import os |
| 27 | +import re |
| 28 | +import subprocess |
| 29 | +import sys |
| 30 | +from pathlib import Path |
| 31 | + |
| 32 | +import pytest |
| 33 | + |
# Module-wide pytest marker: tags every test in this file as ``platform_integrations``.
pytestmark = [pytest.mark.platform_integrations]

# Three directory levels above this file.
# NOTE(review): assumed to be the repository root -- confirm if this file moves.
_REPO_ROOT = Path(__file__).parent.parent.parent
# Rendered Claude evolve-lite plugin tree; every script under test lives beneath it.
_PLUGIN = _REPO_ROOT / "platform-integrations/claude/plugins/evolve-lite"
# The three rendered scripts the tests below drive as subprocesses.
ADAPT_SCRIPT = _PLUGIN / "skills/evolve-lite/adapt-memory/scripts/adapt_memory.py"
AUDIT_SCRIPT = _PLUGIN / "scripts/audit_recall.py"
PROVENANCE_SCRIPT = _PLUGIN / "skills/evolve-lite/provenance/scripts/provenance.py"

# Session id handed to the audit step (exported as CLAUDE_CODE_SESSION_ID by ``_run``)
# and used as the file stem of the native transcript fixture.
SID = "claude-e2e-session-0001"
| 43 | + |
# Native memory fixture: a front-matter header followed by a one-line body. The
# end-to-end test writes this to disk and feeds it to the adapt script.
# NOTE(review): the nested ``type`` key under ``metadata:`` relies on YAML
# indentation -- confirm the leading whitespace matches what adapt_memory.py expects.
NATIVE_MEMORY = """\
---
name: prefer-ripgrep
description: use ripgrep over grep
metadata:
 type: feedback
---
Always reach for ripgrep (rg) instead of grep.
"""
| 53 | + |
| 54 | + |
def _claude_slug(root: Path) -> str:
    """Turn a project-root path into a slug by replacing each character outside
    ``A-Z``, ``a-z`` and ``0-9`` with ``-``.

    Meant to match the slugging done by provenance.py / doctor.py (those scripts
    are not part of this file), so the test can predict the transcript directory.
    """
    path_text = str(root)
    non_alnum = re.compile(r"[^A-Za-z0-9]")
    return non_alnum.sub("-", path_text)
| 58 | + |
| 59 | + |
def _run(script: Path, *args, evolve_dir: Path, home: Path, cwd: Path, stdin=None, sid=None):
    """Execute ``script`` with the current interpreter inside a sandboxed environment.

    Args:
        script: Path of the Python script to launch.
        *args: Extra command-line arguments forwarded to the script.
        evolve_dir: Exported to the child as ``$EVOLVE_DIR``.
        home: Exported to the child as both ``$HOME`` and ``$USERPROFILE``.
        cwd: Working directory for the child process.
        stdin: Optional text piped to the child's standard input.
        sid: When given, exported as ``$CLAUDE_CODE_SESSION_ID``; when ``None``
            any inherited value of that variable is removed instead.

    Returns:
        The ``subprocess.CompletedProcess`` with stdout/stderr captured as text.
        A non-zero exit status does not raise (``check=False``).
    """
    sandbox_env = os.environ.copy()
    sandbox_env.update(
        EVOLVE_DIR=str(evolve_dir),
        HOME=str(home),
        USERPROFILE=str(home),
    )
    # Drop the other Windows home variables so only the sandboxed home is visible.
    for inherited in ("HOMEDRIVE", "HOMEPATH"):
        sandbox_env.pop(inherited, None)

    if sid is None:
        # Never leak a session id inherited from the developer's own shell.
        sandbox_env.pop("CLAUDE_CODE_SESSION_ID", None)
    else:
        sandbox_env["CLAUDE_CODE_SESSION_ID"] = sid

    command = [sys.executable, str(script)]
    command.extend(args)
    return subprocess.run(
        command,
        input=stdin,
        capture_output=True,
        text=True,
        cwd=str(cwd),
        env=sandbox_env,
        check=False,
    )
| 86 | + |
| 87 | + |
def _parse_jsonl(text: str):
    """Decode JSON-lines text into a list, one decoded value per non-blank line.

    Lines that are empty or whitespace-only are skipped; any other line that is
    not valid JSON makes ``json.loads`` raise.
    """
    records = []
    for raw_line in text.splitlines():
        if not raw_line.strip():
            continue
        records.append(json.loads(raw_line))
    return records
| 90 | + |
| 91 | + |
def _read_audit(evolve_dir: Path):
    """Return the rows of ``<evolve_dir>/audit.log`` parsed as JSON lines.

    Yields an empty list when the log file does not exist.
    """
    audit_log = evolve_dir / "audit.log"
    if audit_log.is_file():
        return _parse_jsonl(audit_log.read_text(encoding="utf-8"))
    return []
| 97 | + |
| 98 | + |
@pytest.fixture
def sandbox(tmp_path, sandbox_home):
    """Create the project root and its ``.evolve`` store, and hand back the paths.

    The ``sandbox_home`` fixture (defined outside this file) supplies the home
    directory; it is reused as the place where the native Claude transcript tree
    is written. The project root is created under ``tmp_path`` so that the
    transcript slug, which is derived from the project root, is computed from a
    real directory.

    Returns a dict with the keys ``home``, ``project_root`` and ``evolve_dir``.
    """
    project_root = tmp_path / "proj"
    evolve_dir = project_root / ".evolve"
    for directory in (project_root, evolve_dir):
        directory.mkdir()
    return dict(home=sandbox_home, project_root=project_root, evolve_dir=evolve_dir)
| 118 | + |
| 119 | + |
def test_chain_closes_ids_line_up(sandbox):
    """Drive adapt -> audit -> provenance end to end and check the ids agree.

    Each step launches the real rendered script as a subprocess:
      1. save       -- write the native Claude memory file.
      2. adapt      -- mirror it; expect entities/feedback/prefer-ripgrep.md and
                       a printed entity id of ``feedback/prefer-ripgrep``.
      3. audit      -- record a recall row for that entity id under the SID.
      4. transcript -- create ~/.claude/projects/<slug>/<SID>.jsonl.
      5. candidates -- expect exactly one candidate whose entity id, entity
                       excerpt and trajectory path all resolve, with no
                       ``missing`` field. This is the id-alignment check.
      6. record     -- pipe a ``followed`` verdict, expect one influence row,
                       then expect a second ``candidates`` run to be empty.
    """
    home, project_root, evolve_dir = (
        sandbox[key] for key in ("home", "project_root", "evolve_dir")
    )
    # Sandbox arguments shared by every subprocess launched below.
    sandbox_kwargs = {"evolve_dir": evolve_dir, "home": home, "cwd": project_root}

    # --- 1. save: native memory file (Claude format) ------------------------
    native_file = project_root / "native_memory.md"
    native_file.write_text(NATIVE_MEMORY, encoding="utf-8")

    # --- 2. adapt: mirror native memory into the evolve store ---------------
    adapt = _run(
        ADAPT_SCRIPT,
        str(native_file),
        "--type",
        "feedback",
        "--trigger",
        "when searching code, prefer ripgrep",
        **sandbox_kwargs,
    )
    assert adapt.returncode == 0, adapt.stderr

    mirrored = evolve_dir / "entities" / "feedback" / "prefer-ripgrep.md"
    assert mirrored.is_file(), f"adapt did not mirror the entity: {adapt.stdout}\n{adapt.stderr}"

    # Pull the entity id out of the first "Entity id: <id>" line on stdout.
    entity_line = next(
        (ln for ln in adapt.stdout.splitlines() if ln.startswith("Entity id:")),
        None,
    )
    assert entity_line is not None, f"adapt did not print an entity id:\n{adapt.stdout}"
    adapted_entity_id = entity_line.partition("Entity id:")[2].strip()
    assert adapted_entity_id == "feedback/prefer-ripgrep"

    # --- 3. audit: record a recall row for that exact entity id -------------
    audit = _run(
        AUDIT_SCRIPT,
        adapted_entity_id,  # exactly as EVOLVE.md instructs the agent to pass it
        sid=SID,
        **sandbox_kwargs,
    )
    assert audit.returncode == 0, audit.stderr

    rows_after_audit = _read_audit(evolve_dir)
    recall_rows = [row for row in rows_after_audit if row.get("event") == "recall"]
    assert len(recall_rows) == 1, rows_after_audit
    recall = recall_rows[0]
    assert recall["session_id"] == SID
    assert recall["entities"] == ["feedback/prefer-ripgrep"]

    # --- 4. native transcript fixture ---------------------------------------
    transcript_dir = home / ".claude" / "projects" / _claude_slug(project_root)
    transcript_dir.mkdir(parents=True)
    native_transcript = transcript_dir / f"{SID}.jsonl"
    transcript_lines = (
        '{"type":"user","message":{"role":"user","content":"search the repo for TODOs"}}\n',
        '{"type":"assistant","message":{"role":"assistant","content":"Using rg to search."}}\n',
    )
    native_transcript.write_text("".join(transcript_lines), encoding="utf-8")

    # --- 5. candidates: the id-alignment assertion --------------------------
    cand_result = _run(PROVENANCE_SCRIPT, "candidates", **sandbox_kwargs)
    assert cand_result.returncode == 0, cand_result.stderr
    candidates = _parse_jsonl(cand_result.stdout)
    assert len(candidates) == 1, f"expected exactly one candidate, got: {candidates}"
    (cand,) = candidates

    # KEY ASSERTION: the entity adapt created, the entity audit recorded and the
    # entity provenance resolved are one and the same, and the transcript found
    # through the project-root slug belongs to the audited session id.
    assert cand["session_id"] == SID
    assert cand["entity_id"] == adapted_entity_id == "feedback/prefer-ripgrep"
    assert "Always reach for ripgrep (rg) instead of grep." in cand["entity_excerpt"]
    assert cand["trajectory_path"] == str(native_transcript)
    assert "rg to search" in cand["trajectory_excerpt"]
    assert "missing" not in cand, f"chain did not fully resolve: {cand}"

    # --- 6. record a verdict, then assert dedup -----------------------------
    verdict = dict(
        session_id=SID,
        entity=adapted_entity_id,
        verdict="followed",
        evidence="Assistant used rg (ripgrep) to search the repo.",
    )
    record = _run(
        PROVENANCE_SCRIPT,
        "record",
        stdin=json.dumps(verdict),
        **sandbox_kwargs,
    )
    assert record.returncode == 0, record.stderr

    rows_after_record = _read_audit(evolve_dir)
    influence_rows = [row for row in rows_after_record if row.get("event") == "influence"]
    assert len(influence_rows) == 1, rows_after_record
    influence = influence_rows[0]
    assert influence["session_id"] == SID
    assert influence["entity"] == "feedback/prefer-ripgrep"
    assert influence["verdict"] == "followed"

    # Second candidates run: the judged pair is deduped, so nothing is left.
    cand_again = _run(PROVENANCE_SCRIPT, "candidates", **sandbox_kwargs)
    assert cand_again.returncode == 0, cand_again.stderr
    assert _parse_jsonl(cand_again.stdout) == [], cand_again.stdout
| 251 | + |
| 252 | + |
def test_candidates_surface_gaps_when_nothing_lines_up(sandbox):
    """Negative case: a recall for an entity that was never mirrored, in a
    session with no transcript on disk, must still produce a candidate.

    That candidate is expected to carry a ``missing`` list naming BOTH
    ``entity`` and ``trajectory``, so gaps are reported rather than dropped.
    """
    # Sandbox arguments shared by both subprocesses launched below.
    sandbox_kwargs = {
        "evolve_dir": sandbox["evolve_dir"],
        "home": sandbox["home"],
        "cwd": sandbox["project_root"],
    }

    # Audit an entity id that was never adapted; no native transcript is
    # written for this session either.
    audit = _run(
        AUDIT_SCRIPT,
        "feedback/does-not-exist",
        sid="ghost-session-0002",
        **sandbox_kwargs,
    )
    assert audit.returncode == 0, audit.stderr

    cand_result = _run(PROVENANCE_SCRIPT, "candidates", **sandbox_kwargs)
    assert cand_result.returncode == 0, cand_result.stderr

    candidates = _parse_jsonl(cand_result.stdout)
    assert len(candidates) == 1, candidates
    (cand,) = candidates
    assert cand["entity_id"] == "feedback/does-not-exist"
    assert cand["entity_excerpt"] is None
    assert cand["trajectory_path"] is None
    assert set(cand["missing"]) == {"entity", "trajectory"}
0 commit comments