From 48d0aad401ee89fb4a55622806bfcf6bff306f70 Mon Sep 17 00:00:00 2001 From: Vinod Muthusamy Date: Wed, 29 Apr 2026 10:56:44 -0500 Subject: [PATCH 1/7] test(e2e): add sandbox learn+recall test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add tests/e2e/test_sandbox_learn_recall.py which runs two Claude Code sessions in the Docker sandbox back-to-back: the first hits dead ends on a photo-EXIF task (exiftool/PIL unavailable), so the Stop hook extracts a guideline; the second asks a similar question, and recall should inject the guideline and steer Claude away from the failed tools. Uses pytest live logging for real-time progress. Also simplify the claude-prompt justfile target — drop the obsolete trace/learn options now that the Stop hooks handle transcript saving and guideline learning automatically — and document how to run the test in sandbox/README.md. --- justfile | 20 +-- sandbox/README.md | 60 +++++++++ tests/e2e/test_sandbox_learn_recall.py | 180 +++++++++++++++++++++++++ 3 files changed, 241 insertions(+), 19 deletions(-) create mode 100644 tests/e2e/test_sandbox_learn_recall.py diff --git a/justfile b/justfile index cbbe8107..915da28f 100644 --- a/justfile +++ b/justfile @@ -13,8 +13,6 @@ codex_image := "evolve-codex-sandbox" env_file := "sandbox/myenv" sandbox_dir := "sandbox" workspace := "demo/workspace" -trace := "false" -learn := "false" # Build sandbox Docker image(s). Use target=claude or target=codex to build only one. 
sandbox-build target="all": @@ -58,31 +56,15 @@ sandbox-clean target="all": claude-run: docker run --rm -it --env-file {{env_file}} -v "$(cd {{workspace}} && pwd)":/workspace -v "$(pwd)/platform-integrations/claude/plugins":/plugins {{claude_image}} -# Run a one-shot prompt in the sandbox (trace=true to summarize session, learn=true to run /evolve-lite:learn) +# Run a one-shot prompt in the sandbox claude-prompt prompt: #!/usr/bin/env sh export SANDBOX_PROMPT="$(cat <<'PROMPT_EOF' {{prompt}} PROMPT_EOF )" - TRACE_CMD="" - LEARN_CMD="" - if [ "{{trace}}" = "true" ]; then - TRACE_CMD=" - echo; echo; echo Summarizing the session...; echo - claude --plugin-dir /plugins/evolve-lite/ --dangerously-skip-permissions --no-session-persistence -p 'tell me what happened in the newest json file in /home/sandbox/.claude/projects/-workspace/' - " - fi - if [ "{{learn}}" = "true" ]; then - LEARN_CMD=" - echo; echo; echo Learning...; echo - claude --plugin-dir /plugins/evolve-lite/ --dangerously-skip-permissions --continue -p '/evolve-lite:learn' - " - fi docker run --rm -it --env SANDBOX_PROMPT --env-file {{env_file}} -v "$(cd {{workspace}} && pwd)":/workspace -v "$(pwd)/platform-integrations/claude/plugins":/plugins {{claude_image}} sh -c " claude --plugin-dir /plugins/evolve-lite/ --dangerously-skip-permissions -p \"\$SANDBOX_PROMPT\" - $TRACE_CMD - $LEARN_CMD " # Smoke-test that Claude Code is installed and working diff --git a/sandbox/README.md b/sandbox/README.md index fed9d8d4..1065618d 100644 --- a/sandbox/README.md +++ b/sandbox/README.md @@ -30,3 +30,63 @@ docker run --rm -it --env-file sandbox/myenv -v "$(pwd)":/workspace claude-sandb docker run --rm --env-file sandbox/myenv claude-sandbox claude -p "who are you" ``` +## Automated E2E Test + +`tests/e2e/test_sandbox_learn_recall.py` exercises the full evolve-lite +learn + recall loop end-to-end inside this sandbox. It runs two Claude +sessions: + +1. **Session 1** asks Claude to extract EXIF metadata from a sample photo. 
+ The sandbox lacks `exiftool` and `PIL`, so Claude hits dead ends and + recovers using stdlib. The Stop hook runs `learn`, which reads the + saved transcript and extracts a guideline. +2. **Session 2** asks a similar metadata question. Recall injects the + guideline from session 1, so Claude should skip the failing tools and + go straight to stdlib. + +The test asserts a guideline file was produced in session 1 and that +session 2's bash commands do not invoke `exiftool` / `PIL` / `piexif` / +`exifread`. + +### Prerequisites + +- Build the sandbox image: `just sandbox-build claude` +- Credentials in the environment — either export `ANTHROPIC_API_KEY` + directly, or source an env file (e.g. with + [`dotenv`](https://github.com/bkeepers/dotenv)). The test forwards + these vars into the container when set: `ANTHROPIC_API_KEY`, + `ANTHROPIC_AUTH_TOKEN`, `ANTHROPIC_BASE_URL`, `CLAUDE_MODEL`, + `CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS`, `CLAUDE_CODE_SKIP_BEDROCK_AUTH`. + +Example env file (only `ANTHROPIC_API_KEY` is required; others are +optional and used when routing through a proxy or picking a specific +model): + +```bash +# Direct Anthropic API +ANTHROPIC_API_KEY=sk-ant-... + +# Or, via a proxy / gateway +ANTHROPIC_AUTH_TOKEN=your-token +ANTHROPIC_BASE_URL=https://your-gateway.example.com +CLAUDE_MODEL=claude-sonnet-4-6 +CLAUDE_CODE_SKIP_BEDROCK_AUTH=1 +``` + +### Run + +```bash +# If creds live in an env file: +dotenv -e path/to/your.env -- \ + uv run pytest tests/e2e/test_sandbox_learn_recall.py \ + --run-e2e -m e2e -v --log-cli-level=INFO + +# Or, with vars already exported: +uv run pytest tests/e2e/test_sandbox_learn_recall.py \ + --run-e2e -m e2e -v --log-cli-level=INFO +``` + +The `--log-cli-level=INFO` flag streams per-session progress lines live +(~4 minutes total). The test skips if Docker, the sandbox image, or +credentials are missing. 
+ diff --git a/tests/e2e/test_sandbox_learn_recall.py b/tests/e2e/test_sandbox_learn_recall.py new file mode 100644 index 00000000..2487a97b --- /dev/null +++ b/tests/e2e/test_sandbox_learn_recall.py @@ -0,0 +1,180 @@ +"""End-to-end test of the evolve-lite learn + recall flow in the sandbox. + +Runs two sequential Claude Code sessions against the Dockerized sandbox: + 1. Ask about photo location — sandbox lacks exiftool/PIL, so Claude hits + dead ends and recovers. Stop hook fires learn, which reads the saved + transcript and extracts a guideline. + 2. Ask about focal length — UserPromptSubmit recall hook injects the + guideline from session 1, so Claude should skip the dead ends. + +Assertions: + - Session 1 produces a guideline file under .evolve/entities/. + - Session 2 does NOT invoke exiftool/PIL (recall shortcut worked). + +Requires Docker, the `claude-sandbox` image built, and ANTHROPIC_API_KEY +set in the environment (forwarded into the container). +""" + +import json +import logging +import os +import shutil +import subprocess +import time +from pathlib import Path + +import pytest + + +log = logging.getLogger(__name__) + + +SANDBOX_IMAGE = "claude-sandbox" +REPO_ROOT = Path(__file__).resolve().parents[2] +SESSION_TIMEOUT_SECONDS = 600 +FORWARDED_ENV_VARS = ( + "ANTHROPIC_API_KEY", + "ANTHROPIC_AUTH_TOKEN", + "ANTHROPIC_BASE_URL", + "CLAUDE_MODEL", + "CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS", + "CLAUDE_CODE_SKIP_BEDROCK_AUTH", +) + + +@pytest.fixture(scope="session") +def sandbox_ready(): + """Skip if Docker, the sandbox image, or credentials aren't available.""" + if shutil.which("docker") is None: + pytest.skip("docker not installed") + + if subprocess.run(["docker", "info"], capture_output=True).returncode != 0: + pytest.skip("docker daemon not running") + + image_check = subprocess.run( + ["docker", "image", "inspect", SANDBOX_IMAGE], + capture_output=True, + ) + if image_check.returncode != 0: + pytest.skip(f"sandbox image {SANDBOX_IMAGE!r} not built — 
run `just sandbox-build claude`") + + if not (os.environ.get("ANTHROPIC_API_KEY") or os.environ.get("ANTHROPIC_AUTH_TOKEN")): + pytest.skip("ANTHROPIC_API_KEY (or ANTHROPIC_AUTH_TOKEN) not set in environment") + + return True + + +@pytest.fixture +def sandbox_workspace(tmp_path): + """Copy demo/workspace to tmp_path so each test gets a clean state.""" + src = REPO_ROOT / "demo" / "workspace" + dst = tmp_path / "workspace" + shutil.copytree(src, dst, ignore=shutil.ignore_patterns(".evolve", "backup", "sandbox-backup")) + return dst + + +def _run_sandbox_prompt(workspace: Path, prompt: str) -> subprocess.CompletedProcess: + plugins = REPO_ROOT / "platform-integrations" / "claude" / "plugins" + cmd = ["docker", "run", "--rm"] + for var in FORWARDED_ENV_VARS: + if os.environ.get(var): + cmd += ["-e", var] + cmd += [ + "-e", "EVOLVE_DEBUG=1", + "-v", f"{workspace}:/workspace", + "-v", f"{plugins}:/plugins", + SANDBOX_IMAGE, + "bash", "-c", + f'claude --plugin-dir /plugins/evolve-lite/ --dangerously-skip-permissions -p "{prompt}"', + ] + return subprocess.run(cmd, capture_output=True, text=True, timeout=SESSION_TIMEOUT_SECONDS) + + +def _bash_commands(transcript_path: Path) -> list[str]: + commands = [] + for line in transcript_path.read_text().splitlines(): + if not line.strip(): + continue + try: + record = json.loads(line) + except json.JSONDecodeError: + continue + content = record.get("message", {}).get("content", []) + if not isinstance(content, list): + continue + for block in content: + if isinstance(block, dict) and block.get("type") == "tool_use" and block.get("name") == "Bash": + cmd = block.get("input", {}).get("command", "") + if cmd: + commands.append(cmd) + return commands + + +@pytest.mark.e2e +def test_learn_then_recall_flow(sandbox_ready, sandbox_workspace): + """Session 1 extracts a guideline; session 2 benefits from recall.""" + del sandbox_ready # only used for its skip side effect + + # --- Session 1: location query — expected dead ends then 
recovery --- + log.info("session 1: running location query in sandbox...") + t0 = time.time() + result1 = _run_sandbox_prompt( + sandbox_workspace, + "where was the photo @sample.jpg taken. use exif metadata", + ) + log.info(f"session 1: exited {result1.returncode} after {time.time() - t0:.0f}s") + assert result1.returncode == 0, ( + f"session 1 exited {result1.returncode}\nstderr:\n{result1.stderr[-2000:]}" + ) + + entities_dir = sandbox_workspace / ".evolve" / "entities" + trajectories_dir = sandbox_workspace / ".evolve" / "trajectories" + + assert entities_dir.is_dir(), ( + f"{entities_dir} was not created — learn did not save guidelines.\n" + f"stdout:\n{result1.stdout[-2000:]}" + ) + entity_files = list(entities_dir.rglob("*.md")) + assert entity_files, f"no guideline files found in {entities_dir}" + log.info(f"session 1: learn saved {len(entity_files)} guideline(s): {[p.name for p in entity_files]}") + + transcripts = list(trajectories_dir.glob("*.jsonl")) + assert transcripts, f"no transcript saved in {trajectories_dir}" + + # --- Session 2: focal length query — recall should inject the guideline --- + log.info("session 2: running focal length query in sandbox...") + t1 = time.time() + result2 = _run_sandbox_prompt( + sandbox_workspace, + "what focal length was used to take the photo @sample.jpg. 
use exif metadata", + ) + log.info(f"session 2: exited {result2.returncode} after {time.time() - t1:.0f}s") + assert result2.returncode == 0, ( + f"session 2 exited {result2.returncode}\nstderr:\n{result2.stderr[-2000:]}" + ) + + session2_transcripts = [ + p for p in trajectories_dir.glob("*.jsonl") + if p not in transcripts + ] + assert session2_transcripts, "no new transcript saved for session 2" + session2_transcript = session2_transcripts[0] + + commands = _bash_commands(session2_transcript) + log.info(f"session 2: checking {len(commands)} bash commands for forbidden tools") + joined = "\n".join(commands).lower() + + # Recall should steer Claude away from the tools that failed in session 1. + # The guideline text itself may name these tools, but we're checking actual + # bash invocations, not string mentions — so a command like + # `python3 -c "import PIL"` would fail this check, while the guideline's + # prose mentioning PIL as unavailable does not. + assert "exiftool " not in joined and "exiftool$" not in joined, ( + f"session 2 invoked exiftool despite recall guideline:\n" + + "\n".join(commands) + ) + for banned in ("from pil", "import pil", "import piexif", "import exifread"): + assert banned not in joined, ( + f"session 2 tried {banned!r} despite recall guideline:\n" + + "\n".join(commands) + ) From 3c6bd3e4c097d30431949d419edd7e2a2d23d6db Mon Sep 17 00:00:00 2001 From: Vinod Muthusamy Date: Wed, 29 Apr 2026 11:10:28 -0500 Subject: [PATCH 2/7] fix(tests): ruff format and remove extraneous f-string prefix Fixes failing CI checks: check-formatting, check-linting --- tests/e2e/test_sandbox_learn_recall.py | 40 ++++++++++---------------- 1 file changed, 15 insertions(+), 25 deletions(-) diff --git a/tests/e2e/test_sandbox_learn_recall.py b/tests/e2e/test_sandbox_learn_recall.py index 2487a97b..03767041 100644 --- a/tests/e2e/test_sandbox_learn_recall.py +++ b/tests/e2e/test_sandbox_learn_recall.py @@ -80,11 +80,15 @@ def _run_sandbox_prompt(workspace: Path, 
prompt: str) -> subprocess.CompletedPro if os.environ.get(var): cmd += ["-e", var] cmd += [ - "-e", "EVOLVE_DEBUG=1", - "-v", f"{workspace}:/workspace", - "-v", f"{plugins}:/plugins", + "-e", + "EVOLVE_DEBUG=1", + "-v", + f"{workspace}:/workspace", + "-v", + f"{plugins}:/plugins", SANDBOX_IMAGE, - "bash", "-c", + "bash", + "-c", f'claude --plugin-dir /plugins/evolve-lite/ --dangerously-skip-permissions -p "{prompt}"', ] return subprocess.run(cmd, capture_output=True, text=True, timeout=SESSION_TIMEOUT_SECONDS) @@ -123,17 +127,12 @@ def test_learn_then_recall_flow(sandbox_ready, sandbox_workspace): "where was the photo @sample.jpg taken. use exif metadata", ) log.info(f"session 1: exited {result1.returncode} after {time.time() - t0:.0f}s") - assert result1.returncode == 0, ( - f"session 1 exited {result1.returncode}\nstderr:\n{result1.stderr[-2000:]}" - ) + assert result1.returncode == 0, f"session 1 exited {result1.returncode}\nstderr:\n{result1.stderr[-2000:]}" entities_dir = sandbox_workspace / ".evolve" / "entities" trajectories_dir = sandbox_workspace / ".evolve" / "trajectories" - assert entities_dir.is_dir(), ( - f"{entities_dir} was not created — learn did not save guidelines.\n" - f"stdout:\n{result1.stdout[-2000:]}" - ) + assert entities_dir.is_dir(), f"{entities_dir} was not created — learn did not save guidelines.\nstdout:\n{result1.stdout[-2000:]}" entity_files = list(entities_dir.rglob("*.md")) assert entity_files, f"no guideline files found in {entities_dir}" log.info(f"session 1: learn saved {len(entity_files)} guideline(s): {[p.name for p in entity_files]}") @@ -149,14 +148,9 @@ def test_learn_then_recall_flow(sandbox_ready, sandbox_workspace): "what focal length was used to take the photo @sample.jpg. 
use exif metadata", ) log.info(f"session 2: exited {result2.returncode} after {time.time() - t1:.0f}s") - assert result2.returncode == 0, ( - f"session 2 exited {result2.returncode}\nstderr:\n{result2.stderr[-2000:]}" - ) + assert result2.returncode == 0, f"session 2 exited {result2.returncode}\nstderr:\n{result2.stderr[-2000:]}" - session2_transcripts = [ - p for p in trajectories_dir.glob("*.jsonl") - if p not in transcripts - ] + session2_transcripts = [p for p in trajectories_dir.glob("*.jsonl") if p not in transcripts] assert session2_transcripts, "no new transcript saved for session 2" session2_transcript = session2_transcripts[0] @@ -169,12 +163,8 @@ def test_learn_then_recall_flow(sandbox_ready, sandbox_workspace): # bash invocations, not string mentions — so a command like # `python3 -c "import PIL"` would fail this check, while the guideline's # prose mentioning PIL as unavailable does not. - assert "exiftool " not in joined and "exiftool$" not in joined, ( - f"session 2 invoked exiftool despite recall guideline:\n" - + "\n".join(commands) + assert "exiftool " not in joined and "exiftool$" not in joined, "session 2 invoked exiftool despite recall guideline:\n" + "\n".join( + commands ) for banned in ("from pil", "import pil", "import piexif", "import exifread"): - assert banned not in joined, ( - f"session 2 tried {banned!r} despite recall guideline:\n" - + "\n".join(commands) - ) + assert banned not in joined, f"session 2 tried {banned!r} despite recall guideline:\n" + "\n".join(commands) From 2fef74b5a6b413d0bc1243db251c68e68ff8c9f7 Mon Sep 17 00:00:00 2001 From: Vinod Muthusamy Date: Wed, 29 Apr 2026 11:15:24 -0500 Subject: [PATCH 3/7] fix(sandbox): allowlist README placeholder in .secrets.baseline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes failing CI check: tekton/code-detect-secrets The example env file snippet in sandbox/README.md tripped IBM's detect-secrets scanner. 
Add the finding to .secrets.baseline — the repo's existing convention (14 entries already present) — rather than polluting the doc with inline pragmas. Change the placeholder to the canonical sk-ant-xxxx form used elsewhere. --- .secrets.baseline | 13 +++++++++++-- sandbox/README.md | 2 +- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/.secrets.baseline b/.secrets.baseline index a265cdbd..77d1a251 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -3,7 +3,7 @@ "files": "^.secrets.baseline$|package-lock\\.json$", "lines": null }, - "generated_at": "2026-04-20T15:55:53Z", + "generated_at": "2026-04-29T16:14:59Z", "plugins_used": [ { "name": "AWSKeyDetector" @@ -153,6 +153,15 @@ "verified_result": null } ], + "sandbox/README.md": [ + { + "hashed_secret": "b792a28a35da9b44fa0ee8a53002e9c238afb1bd", + "is_verified": false, + "line_number": 67, + "type": "Secret Keyword", + "verified_result": null + } + ], "sandbox/sample.env": [ { "hashed_secret": "b792a28a35da9b44fa0ee8a53002e9c238afb1bd", @@ -213,4 +222,4 @@ "file": null, "hash": null } -} \ No newline at end of file +} diff --git a/sandbox/README.md b/sandbox/README.md index 1065618d..8ca8151d 100644 --- a/sandbox/README.md +++ b/sandbox/README.md @@ -64,7 +64,7 @@ model): ```bash # Direct Anthropic API -ANTHROPIC_API_KEY=sk-ant-... +ANTHROPIC_API_KEY=sk-ant-xxxx # Or, via a proxy / gateway ANTHROPIC_AUTH_TOKEN=your-token From 967acaa24d43375c195725546879934106e4d6ad Mon Sep 17 00:00:00 2001 From: Vinod Muthusamy Date: Wed, 29 Apr 2026 11:34:09 -0500 Subject: [PATCH 4/7] fix(sandbox): audit README placeholder as non-secret Fixes failing CI check: tekton/pr-code-checks/code-detect-secrets The baseline entry for sandbox/README.md needed `is_secret: false` to pass tekton's `detect-secrets audit --fail-on-unaudited` check. 
--- .secrets.baseline | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.secrets.baseline b/.secrets.baseline index 77d1a251..14ee6fa1 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -159,7 +159,8 @@ "is_verified": false, "line_number": 67, "type": "Secret Keyword", - "verified_result": null + "verified_result": null, + "is_secret": false } ], "sandbox/sample.env": [ From 5b4271c5a007a75259ee85ced6e0c47b8ead9d37 Mon Sep 17 00:00:00 2001 From: Vinod Muthusamy Date: Wed, 29 Apr 2026 12:42:12 -0500 Subject: [PATCH 5/7] fix(tests): select newest session-2 transcript deterministically Addresses CodeRabbit review finding: Session 2 transcript selection is nondeterministic and can yield false passes `glob()` order isn't guaranteed; if multiple new `*.jsonl` files ever appear between session 1 and session 2 (e.g. a future change adds a second writer), `[0]` could pick the wrong one. Use `max(key=mtime)` to pick the newest deterministically. --- tests/e2e/test_sandbox_learn_recall.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/e2e/test_sandbox_learn_recall.py b/tests/e2e/test_sandbox_learn_recall.py index 03767041..a967f05b 100644 --- a/tests/e2e/test_sandbox_learn_recall.py +++ b/tests/e2e/test_sandbox_learn_recall.py @@ -152,7 +152,7 @@ def test_learn_then_recall_flow(sandbox_ready, sandbox_workspace): session2_transcripts = [p for p in trajectories_dir.glob("*.jsonl") if p not in transcripts] assert session2_transcripts, "no new transcript saved for session 2" - session2_transcript = session2_transcripts[0] + session2_transcript = max(session2_transcripts, key=lambda p: p.stat().st_mtime) commands = _bash_commands(session2_transcript) log.info(f"session 2: checking {len(commands)} bash commands for forbidden tools") From 8ed92ebad4d50558227bbc265d853fbbdf82d45a Mon Sep 17 00:00:00 2001 From: Vinod Muthusamy Date: Wed, 29 Apr 2026 12:42:38 -0500 Subject: [PATCH 6/7] fix(tests): use regex word boundary for banned tool 
detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses CodeRabbit review finding: `exiftool$` check is ineffective as written The `"exiftool$" not in joined` check treated `$` as a literal character — never matched anything. Replace with `re.search(r"\bexiftool\b", joined)` which catches every bash invocation of exiftool regardless of trailing character. --- tests/e2e/test_sandbox_learn_recall.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/e2e/test_sandbox_learn_recall.py b/tests/e2e/test_sandbox_learn_recall.py index a967f05b..10ffe39e 100644 --- a/tests/e2e/test_sandbox_learn_recall.py +++ b/tests/e2e/test_sandbox_learn_recall.py @@ -18,6 +18,7 @@ import json import logging import os +import re import shutil import subprocess import time @@ -163,8 +164,6 @@ def test_learn_then_recall_flow(sandbox_ready, sandbox_workspace): # bash invocations, not string mentions — so a command like # `python3 -c "import PIL"` would fail this check, while the guideline's # prose mentioning PIL as unavailable does not. - assert "exiftool " not in joined and "exiftool$" not in joined, "session 2 invoked exiftool despite recall guideline:\n" + "\n".join( - commands - ) + assert not re.search(r"\bexiftool\b", joined), "session 2 invoked exiftool despite recall guideline:\n" + "\n".join(commands) for banned in ("from pil", "import pil", "import piexif", "import exifread"): assert banned not in joined, f"session 2 tried {banned!r} despite recall guideline:\n" + "\n".join(commands) From e7b5dfc4ce92760b6a2a93ffbd4fc7376bd9866e Mon Sep 17 00:00:00 2001 From: Vinod Muthusamy Date: Wed, 29 Apr 2026 13:12:49 -0500 Subject: [PATCH 7/7] fix(tests): narrow recall assertion to exiftool only The banned list previously included PIL/piexif/exifread, which are all pip-installable. A valid recall guideline can legitimately recommend "pip install PACKAGE" for any of them, causing the assertion to fail on correct behavior. 
Only exiftool is definitively absent in the sandbox (not installed, not pip-installable), so it's the only reliable marker that recall did NOT steer Claude away from the unavailable tool. --- tests/e2e/test_sandbox_learn_recall.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/e2e/test_sandbox_learn_recall.py b/tests/e2e/test_sandbox_learn_recall.py index 10ffe39e..d39293a0 100644 --- a/tests/e2e/test_sandbox_learn_recall.py +++ b/tests/e2e/test_sandbox_learn_recall.py @@ -159,11 +159,8 @@ def test_learn_then_recall_flow(sandbox_ready, sandbox_workspace): log.info(f"session 2: checking {len(commands)} bash commands for forbidden tools") joined = "\n".join(commands).lower() - # Recall should steer Claude away from the tools that failed in session 1. - # The guideline text itself may name these tools, but we're checking actual - # bash invocations, not string mentions — so a command like - # `python3 -c "import PIL"` would fail this check, while the guideline's - # prose mentioning PIL as unavailable does not. + # Recall should steer Claude away from tools guaranteed-unavailable in the + # sandbox. Only `exiftool` is definitively absent (not installed, can't be + # pip-installed). Other libraries (PIL, piexif, exifread) may appear in a + # valid guideline as "install via pip and use", so we don't ban them. assert not re.search(r"\bexiftool\b", joined), "session 2 invoked exiftool despite recall guideline:\n" + "\n".join(commands) - for banned in ("from pil", "import pil", "import piexif", "import exifread"): - assert banned not in joined, f"session 2 tried {banned!r} despite recall guideline:\n" + "\n".join(commands)