diff --git a/platform-integrations/claude/plugins/evolve-lite/skills/learn/SKILL.md b/platform-integrations/claude/plugins/evolve-lite/skills/learn/SKILL.md index 21e70613..0d90a265 100644 --- a/platform-integrations/claude/plugins/evolve-lite/skills/learn/SKILL.md +++ b/platform-integrations/claude/plugins/evolve-lite/skills/learn/SKILL.md @@ -116,6 +116,38 @@ The script will: - Deduplicate against existing entities - Display confirmation with the total count +### Step 5: Assess Influence of Recalled Entities + +Regardless of whether Step 4 saved new entities, judge whether the guidelines the recall hook served to *this* session were actually followed, contradicted, or simply irrelevant. This closes the provenance loop: the recall hook records *what* was served; this step records *what effect* it had. + +1. Derive this session's `session_id` from the `saved_trajectory_path` extracted in Step 0: strip the directory prefix and the `claude-transcript_` / `.jsonl` affixes. For `.evolve/trajectories/claude-transcript_abc-123.jsonl` the `session_id` is `abc-123`. + +2. Read `.evolve/audit.log` (JSONL, one object per line). Find every line where `event == "recall"` and `session_id` matches. Take the union of their `entities` arrays — that is the set of guideline identifiers served to this session. Each identifier is a relative path from `.evolve/entities/` without the `.md` suffix (e.g. `guideline/foo` for a local entity, or `subscribed/alice/guideline/foo` for a subscribed one), so it unambiguously names one file. If the set is empty, skip this step. + +3. For each identifier, open `.evolve/entities/<identifier>.md` with the Read tool. Read its content + trigger — that is the guideline's intent. Skip the identifier (log it as an assessment-less entry) if the file is not found. + +4. Compare against the transcript loaded in Step 0. For each identifier, pick one verdict: + - `followed` — the agent's actual actions are consistent with the guideline's recommendation. 
+ - `contradicted` — the guideline's trigger matched the task but the agent did the opposite, or hit the dead end the guideline would have prevented. + - `not_applicable` — the guideline's trigger didn't match what this session was about. + + Keep `evidence` to one short sentence citing a specific action or tool call from the transcript. + +5. Emit one JSON payload and pipe it to the helper: + +```bash +echo '{ + "session_id": "<session_id>", + "assessments": [ + {"entity": "guideline/<slug>", "verdict": "followed", "evidence": "Agent imported struct and parsed APP1 directly"} + ] +}' | python3 ${CLAUDE_PLUGIN_ROOT}/skills/learn/scripts/log_influence.py +``` + +The `entity` value must match exactly what appeared in the recall event — include the `subscribed/<name>/` prefix if the entity came from a subscribed repo. + +Emit zero assessments (empty `assessments` list) when no recall events exist for this session. + ## Quality Gate Before saving, review each entity against this checklist: diff --git a/platform-integrations/claude/plugins/evolve-lite/skills/learn/scripts/log_influence.py b/platform-integrations/claude/plugins/evolve-lite/skills/learn/scripts/log_influence.py new file mode 100644 index 00000000..9a00ddee --- /dev/null +++ b/platform-integrations/claude/plugins/evolve-lite/skills/learn/scripts/log_influence.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 +"""Append post-hoc influence assessments to .evolve/audit.log. + +Reads JSON from stdin of the form: + { + "session_id": "<session_id>", + "assessments": [ + {"entity": "<entity_id>", "verdict": "followed|contradicted|not_applicable", + "evidence": "<one short sentence>"}, + ... 
+ ] + } +""" + +import json +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent.parent / "lib")) +from entity_io import get_evolve_dir, log as _log # noqa: E402 +import audit # noqa: E402 + + +_ALLOWED_VERDICTS = {"followed", "contradicted", "not_applicable"} + + +def log(message): + _log("influence", message) + + +def main(): + try: + payload = json.load(sys.stdin) + except json.JSONDecodeError as exc: + log(f"Invalid JSON input: {exc}") + print(f"Error: invalid JSON input - {exc}", file=sys.stderr) + sys.exit(1) + + if not isinstance(payload, dict): + log(f"Bad payload type: {type(payload).__name__}") + print("Error: payload must be a JSON object.", file=sys.stderr) + sys.exit(1) + + session_id = payload.get("session_id") + assessments = payload.get("assessments", []) + if not session_id or not isinstance(assessments, list): + log(f"Bad payload shape: session_id={session_id!r} assessments_type={type(assessments).__name__}") + print("Error: payload must include `session_id` and a list `assessments`.", file=sys.stderr) + sys.exit(1) + + project_root = str(get_evolve_dir().resolve().parent) + + written = 0 + for a in assessments: + if not isinstance(a, dict): + log(f"Skipping non-dict assessment item: {a!r}") + continue + entity = a.get("entity") + verdict = a.get("verdict") + evidence = a.get("evidence", "") + if not entity or verdict not in _ALLOWED_VERDICTS: + log(f"Skipping invalid assessment: {a}") + continue + audit.append( + project_root=project_root, + event="influence", + session_id=session_id, + entity=entity, + verdict=verdict, + evidence=evidence, + ) + written += 1 + + log(f"Wrote {written} influence record(s) for session {session_id}") + print(f"Recorded {written} influence assessment(s).") + + +if __name__ == "__main__": + main() diff --git a/platform-integrations/claude/plugins/evolve-lite/skills/recall/scripts/retrieve_entities.py 
b/platform-integrations/claude/plugins/evolve-lite/skills/recall/scripts/retrieve_entities.py index 6043c345..1cdae7fa 100644 --- a/platform-integrations/claude/plugins/evolve-lite/skills/recall/scripts/retrieve_entities.py +++ b/platform-integrations/claude/plugins/evolve-lite/skills/recall/scripts/retrieve_entities.py @@ -8,7 +8,8 @@ # Add lib to path so we can import entity_io sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent.parent / "lib")) -from entity_io import find_recall_entity_dirs, markdown_to_entity, log as _log +from entity_io import find_recall_entity_dirs, get_evolve_dir, markdown_to_entity, log as _log +import audit def log(message): @@ -82,6 +83,13 @@ def load_entities_with_source(entities_dir): entity = markdown_to_entity(md) if not entity.get("content"): continue + # Record the on-disk path relative to entities_dir (without the + # .md suffix) as a qualified identifier. This distinguishes + # same-named entities in different trees — e.g. + # "guideline/foo" (local) vs "subscribed/alice/guideline/foo" + # (from a subscribed repo) — so downstream auditing doesn't + # collapse them into one. + entity["_id"] = str(md.relative_to(entities_dir).with_suffix("")) # Detect subscribed entities by path: .../entities/subscribed/{name}/... parts = md.parts try: @@ -129,6 +137,24 @@ def main(): print(output) log(f"Output {len(output)} chars to stdout") + # Audit: record which entities were served to which session. Must not + # fail the hook if logging errors — recall is the user-visible path. 
+ try: + transcript_path = input_data.get("transcript_path", "") + session_id = Path(transcript_path).stem if transcript_path else None + entity_ids = sorted({e["_id"] for e in entities if e.get("_id")}) + if session_id and entity_ids: + project_root = get_evolve_dir().resolve().parent + audit.append( + project_root=str(project_root), + event="recall", + session_id=session_id, + entities=entity_ids, + ) + log(f"Audit: recall session_id={session_id} entities={len(entity_ids)}") + except Exception as exc: + log(f"Audit append failed (non-fatal): {exc}") + if __name__ == "__main__": main() diff --git a/tests/e2e/test_sandbox_learn_recall.py b/tests/e2e/test_sandbox_learn_recall.py index d39293a0..0731730a 100644 --- a/tests/e2e/test_sandbox_learn_recall.py +++ b/tests/e2e/test_sandbox_learn_recall.py @@ -164,3 +164,36 @@ def test_learn_then_recall_flow(sandbox_ready, sandbox_workspace): # pip-installed). Other libraries (PIL, piexif, exifread) may appear in a # valid guideline as "install via pip and use", so we don't ban them. assert not re.search(r"\bexiftool\b", joined), "session 2 invoked exiftool despite recall guideline:\n" + "\n".join(commands) + + # --- Usage provenance: audit.log should record recall + influence --- + audit_log = sandbox_workspace / ".evolve" / "audit.log" + assert audit_log.is_file(), f"{audit_log} was not created — recall did not append audit events" + + events = [] + for line in audit_log.read_text().splitlines(): + line = line.strip() + if not line: + continue + events.append(json.loads(line)) + + session2_id = session2_transcript.stem.removeprefix("claude-transcript_") + # Recall audit records qualified ids — path relative to .evolve/entities/ + # without the .md suffix — so we match session 1's entities the same way. 
+ session1_ids = {str(p.relative_to(entities_dir).with_suffix("")) for p in entity_files} + + recall_events = [e for e in events if e.get("event") == "recall" and e.get("session_id") == session2_id] + assert recall_events, f"no recall audit event for session 2 ({session2_id}). all events: {events}" + recalled_ids = {eid for e in recall_events for eid in e.get("entities", [])} + assert recalled_ids & session1_ids, f"recall event entities {recalled_ids} did not include any id from session 1 ({session1_ids})" + log.info(f"session 2: audit recorded recall of {recalled_ids}") + + influence_events = [e for e in events if e.get("event") == "influence" and e.get("session_id") == session2_id] + assert influence_events, ( + f"no influence audit event for session 2 ({session2_id}). recall events exist but learn did not emit assessments." + ) + for ie in influence_events: + assert ie.get("verdict") in {"followed", "contradicted", "not_applicable"}, f"influence event has invalid verdict: {ie}" + log.info( + f"session 2: audit recorded {len(influence_events)} influence assessment(s): " + f"{[(e['entity'], e['verdict']) for e in influence_events]}" + ) diff --git a/tests/platform_integrations/test_log_influence.py b/tests/platform_integrations/test_log_influence.py new file mode 100644 index 00000000..09745054 --- /dev/null +++ b/tests/platform_integrations/test_log_influence.py @@ -0,0 +1,199 @@ +"""Tests for the Claude plugin's skills/learn/scripts/log_influence.py.""" + +import json +import os +import subprocess +import sys +from pathlib import Path + +import pytest + +pytestmark = [pytest.mark.platform_integrations, pytest.mark.e2e] + +_PLUGIN_ROOT = Path(__file__).parent.parent.parent / "platform-integrations/claude/plugins/evolve-lite" +LOG_INFLUENCE_SCRIPT = _PLUGIN_ROOT / "skills/learn/scripts/log_influence.py" + + +def run_log_influence(project_dir, payload, *, raw_input=None, evolve_dir=None): + """Invoke log_influence.py with the given payload (dict) or raw_input 
(str).""" + env = {**os.environ} + if evolve_dir: + env["EVOLVE_DIR"] = str(evolve_dir) + stdin = raw_input if raw_input is not None else json.dumps(payload) + return subprocess.run( + [sys.executable, str(LOG_INFLUENCE_SCRIPT)], + input=stdin, + capture_output=True, + text=True, + cwd=str(project_dir), + env=env, + check=False, + ) + + +def read_audit(evolve_dir): + path = evolve_dir / "audit.log" + if not path.is_file(): + return [] + return [json.loads(line) for line in path.read_text().splitlines() if line.strip()] + + +class TestLogInfluence: + def test_writes_single_assessment(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence( + temp_project_dir, + { + "session_id": "abc-123", + "assessments": [ + {"entity": "slug-a", "verdict": "followed", "evidence": "because"}, + ], + }, + evolve_dir=evolve_dir, + ) + assert result.returncode == 0, result.stderr + events = read_audit(evolve_dir) + assert len(events) == 1 + assert events[0] == { + "event": "influence", + "session_id": "abc-123", + "entity": "slug-a", + "verdict": "followed", + "evidence": "because", + "ts": events[0]["ts"], + } + + def test_writes_multiple_assessments(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence( + temp_project_dir, + { + "session_id": "sess-1", + "assessments": [ + {"entity": "slug-a", "verdict": "followed", "evidence": "e1"}, + {"entity": "slug-b", "verdict": "not_applicable", "evidence": "e2"}, + {"entity": "slug-c", "verdict": "contradicted", "evidence": "e3"}, + ], + }, + evolve_dir=evolve_dir, + ) + assert result.returncode == 0, result.stderr + events = read_audit(evolve_dir) + assert len(events) == 3 + verdicts = {e["entity"]: e["verdict"] for e in events} + assert verdicts == {"slug-a": "followed", "slug-b": "not_applicable", "slug-c": "contradicted"} + + def test_skips_assessments_with_invalid_verdict(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = 
run_log_influence( + temp_project_dir, + { + "session_id": "sess-1", + "assessments": [ + {"entity": "slug-a", "verdict": "bogus", "evidence": "no"}, + {"entity": "slug-b", "verdict": "followed", "evidence": "yes"}, + ], + }, + evolve_dir=evolve_dir, + ) + assert result.returncode == 0, result.stderr + events = read_audit(evolve_dir) + assert len(events) == 1 + assert events[0]["entity"] == "slug-b" + + def test_skips_assessments_missing_entity(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence( + temp_project_dir, + { + "session_id": "sess-1", + "assessments": [ + {"verdict": "followed", "evidence": "no entity"}, + {"entity": "slug-b", "verdict": "followed", "evidence": "ok"}, + ], + }, + evolve_dir=evolve_dir, + ) + assert result.returncode == 0, result.stderr + events = read_audit(evolve_dir) + assert len(events) == 1 + assert events[0]["entity"] == "slug-b" + + def test_skips_non_dict_assessment_items(self, temp_project_dir): + """Non-dict items in the assessments list must not raise AttributeError.""" + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence( + temp_project_dir, + { + "session_id": "sess-1", + "assessments": [ + "not-a-dict", + 42, + None, + {"entity": "slug-ok", "verdict": "followed", "evidence": "yes"}, + ], + }, + evolve_dir=evolve_dir, + ) + assert result.returncode == 0, result.stderr + events = read_audit(evolve_dir) + assert len(events) == 1 + assert events[0]["entity"] == "slug-ok" + + def test_empty_assessments_list_is_ok(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence( + temp_project_dir, + {"session_id": "sess-1", "assessments": []}, + evolve_dir=evolve_dir, + ) + assert result.returncode == 0, result.stderr + assert read_audit(evolve_dir) == [] + + def test_evidence_defaults_to_empty_string(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence( + temp_project_dir, + { + 
"session_id": "sess-1", + "assessments": [{"entity": "slug-a", "verdict": "followed"}], + }, + evolve_dir=evolve_dir, + ) + assert result.returncode == 0, result.stderr + events = read_audit(evolve_dir) + assert events[0]["evidence"] == "" + + def test_rejects_non_dict_payload(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence(temp_project_dir, ["not", "a", "dict"], evolve_dir=evolve_dir) + assert result.returncode == 1 + assert "payload" in result.stderr.lower() + assert read_audit(evolve_dir) == [] + + def test_rejects_missing_session_id(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence( + temp_project_dir, + {"assessments": [{"entity": "a", "verdict": "followed"}]}, + evolve_dir=evolve_dir, + ) + assert result.returncode == 1 + assert read_audit(evolve_dir) == [] + + def test_rejects_non_list_assessments(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence( + temp_project_dir, + {"session_id": "sess-1", "assessments": "oops"}, + evolve_dir=evolve_dir, + ) + assert result.returncode == 1 + assert read_audit(evolve_dir) == [] + + def test_rejects_invalid_json(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence(temp_project_dir, None, raw_input="{not valid json", evolve_dir=evolve_dir) + assert result.returncode == 1 + assert "json" in result.stderr.lower() + assert read_audit(evolve_dir) == []