From 48d0aad401ee89fb4a55622806bfcf6bff306f70 Mon Sep 17 00:00:00 2001
From: Vinod Muthusamy <vinodmut@users.noreply.github.com>
Date: Wed, 29 Apr 2026 10:56:44 -0500
Subject: [PATCH 1/7] test(e2e): add sandbox learn+recall test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add tests/e2e/test_sandbox_learn_recall.py which runs two Claude Code
sessions in the Docker sandbox back-to-back: the first hits dead ends
on a photo-EXIF task (exiftool/PIL unavailable), so the Stop hook
extracts a guideline; the second asks a similar question, and recall
should inject the guideline and steer Claude away from the failed
tools. Uses pytest live logging for real-time progress.

Also simplify the claude-prompt justfile target — drop the obsolete
trace/learn options now that the Stop hooks handle transcript saving
and guideline learning automatically — and document how to run the
test in sandbox/README.md.
---
 justfile                               |  20 +--
 sandbox/README.md                      |  60 +++++++++
 tests/e2e/test_sandbox_learn_recall.py | 180 +++++++++++++++++++++++++
 3 files changed, 241 insertions(+), 19 deletions(-)
 create mode 100644 tests/e2e/test_sandbox_learn_recall.py

diff --git a/justfile b/justfile
index cbbe8107..915da28f 100644
--- a/justfile
+++ b/justfile
@@ -13,8 +13,6 @@ codex_image := "evolve-codex-sandbox"
 env_file := "sandbox/myenv"
 sandbox_dir := "sandbox"
 workspace := "demo/workspace"
-trace := "false"
-learn := "false"
 
 # Build sandbox Docker image(s). Use target=claude or target=codex to build only one.
 sandbox-build target="all":
@@ -58,31 +56,15 @@ sandbox-clean target="all":
 claude-run:
     docker run --rm -it --env-file {{env_file}} -v "$(cd {{workspace}} && pwd)":/workspace -v "$(pwd)/platform-integrations/claude/plugins":/plugins {{claude_image}}
 
-# Run a one-shot prompt in the sandbox (trace=true to summarize session, learn=true to run /evolve-lite:learn)
+# Run a one-shot prompt in the sandbox
 claude-prompt prompt:
     #!/usr/bin/env sh
     export SANDBOX_PROMPT="$(cat <<'PROMPT_EOF'
     {{prompt}}
     PROMPT_EOF
     )"
-    TRACE_CMD=""
-    LEARN_CMD=""
-    if [ "{{trace}}" = "true" ]; then
-        TRACE_CMD="
-            echo; echo; echo Summarizing the session...; echo
-            claude --plugin-dir /plugins/evolve-lite/ --dangerously-skip-permissions --no-session-persistence -p 'tell me what happened in the newest json file in /home/sandbox/.claude/projects/-workspace/'
-        "
-    fi
-    if [ "{{learn}}" = "true" ]; then
-        LEARN_CMD="
-            echo; echo; echo Learning...; echo
-            claude --plugin-dir /plugins/evolve-lite/ --dangerously-skip-permissions --continue -p '/evolve-lite:learn'
-        "
-    fi
     docker run --rm -it --env SANDBOX_PROMPT --env-file {{env_file}} -v "$(cd {{workspace}} && pwd)":/workspace -v "$(pwd)/platform-integrations/claude/plugins":/plugins {{claude_image}} sh -c "
         claude --plugin-dir /plugins/evolve-lite/ --dangerously-skip-permissions -p \"\$SANDBOX_PROMPT\"
-        $TRACE_CMD
-        $LEARN_CMD
     "
 
 # Smoke-test that Claude Code is installed and working
diff --git a/sandbox/README.md b/sandbox/README.md
index fed9d8d4..1065618d 100644
--- a/sandbox/README.md
+++ b/sandbox/README.md
@@ -30,3 +30,63 @@ docker run --rm -it --env-file sandbox/myenv -v "$(pwd)":/workspace claude-sandb
 docker run --rm --env-file sandbox/myenv claude-sandbox claude -p "who are you"
 ```
 
+## Automated E2E Test
+
+`tests/e2e/test_sandbox_learn_recall.py` exercises the full evolve-lite
+learn + recall loop end-to-end inside this sandbox. It runs two Claude
+sessions:
+
+1. **Session 1** asks Claude to extract EXIF metadata from a sample photo.
+   The sandbox lacks `exiftool` and `PIL`, so Claude hits dead ends and
+   recovers using stdlib. The Stop hook runs `learn`, which reads the
+   saved transcript and extracts a guideline.
+2. **Session 2** asks a similar metadata question. Recall injects the
+   guideline from session 1, so Claude should skip the failing tools and
+   go straight to stdlib.
+
+The test asserts a guideline file was produced in session 1 and that
+session 2's bash commands do not invoke `exiftool` / `PIL` / `piexif` /
+`exifread`.
+
+### Prerequisites
+
+- Build the sandbox image: `just sandbox-build claude`
+- Credentials in the environment — either export `ANTHROPIC_API_KEY`
+  directly, or source an env file (e.g. with
+  [`dotenv`](https://github.com/bkeepers/dotenv)). The test forwards
+  these vars into the container when set: `ANTHROPIC_API_KEY`,
+  `ANTHROPIC_AUTH_TOKEN`, `ANTHROPIC_BASE_URL`, `CLAUDE_MODEL`,
+  `CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS`, `CLAUDE_CODE_SKIP_BEDROCK_AUTH`.
+
+Example env file (one of `ANTHROPIC_API_KEY` / `ANTHROPIC_AUTH_TOKEN` is
+required; the rest are optional, for routing through a proxy or picking a
+model):
+
+```bash
+# Direct Anthropic API
+ANTHROPIC_API_KEY=sk-ant-...
+
+# Or, via a proxy / gateway
+ANTHROPIC_AUTH_TOKEN=your-token
+ANTHROPIC_BASE_URL=https://your-gateway.example.com
+CLAUDE_MODEL=claude-sonnet-4-6
+CLAUDE_CODE_SKIP_BEDROCK_AUTH=1
+```
+
+### Run
+
+```bash
+# If creds live in an env file:
+dotenv -e path/to/your.env -- \
+  uv run pytest tests/e2e/test_sandbox_learn_recall.py \
+    --run-e2e -m e2e -v --log-cli-level=INFO
+
+# Or, with vars already exported:
+uv run pytest tests/e2e/test_sandbox_learn_recall.py \
+  --run-e2e -m e2e -v --log-cli-level=INFO
+```
+
+The `--log-cli-level=INFO` flag streams per-session progress lines live;
+the full run takes about 4 minutes. The test skips if Docker, the sandbox
+image, or credentials are missing.
+
diff --git a/tests/e2e/test_sandbox_learn_recall.py b/tests/e2e/test_sandbox_learn_recall.py
new file mode 100644
index 00000000..2487a97b
--- /dev/null
+++ b/tests/e2e/test_sandbox_learn_recall.py
@@ -0,0 +1,180 @@
+"""End-to-end test of the evolve-lite learn + recall flow in the sandbox.
+
+Runs two sequential Claude Code sessions against the Dockerized sandbox:
+  1. Ask about photo location — sandbox lacks exiftool/PIL, so Claude hits
+     dead ends and recovers. Stop hook fires learn, which reads the saved
+     transcript and extracts a guideline.
+  2. Ask about focal length — UserPromptSubmit recall hook injects the
+     guideline from session 1, so Claude should skip the dead ends.
+
+Assertions:
+  - Session 1 produces a guideline file under .evolve/entities/.
+  - Session 2 does NOT invoke exiftool/PIL (recall shortcut worked).
+
+Requires Docker, the `claude-sandbox` image built, and ANTHROPIC_API_KEY
+set in the environment (forwarded into the container).
+"""
+
+import json
+import logging
+import os
+import shutil
+import subprocess
+import time
+from pathlib import Path
+
+import pytest
+
+
+log = logging.getLogger(__name__)
+
+
+SANDBOX_IMAGE = "claude-sandbox"
+REPO_ROOT = Path(__file__).resolve().parents[2]
+SESSION_TIMEOUT_SECONDS = 600
+FORWARDED_ENV_VARS = (
+    "ANTHROPIC_API_KEY",
+    "ANTHROPIC_AUTH_TOKEN",
+    "ANTHROPIC_BASE_URL",
+    "CLAUDE_MODEL",
+    "CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS",
+    "CLAUDE_CODE_SKIP_BEDROCK_AUTH",
+)
+
+
+@pytest.fixture(scope="session")
+def sandbox_ready():
+    """Skip if Docker, the sandbox image, or credentials aren't available."""
+    if shutil.which("docker") is None:
+        pytest.skip("docker not installed")
+
+    if subprocess.run(["docker", "info"], capture_output=True).returncode != 0:
+        pytest.skip("docker daemon not running")
+
+    image_check = subprocess.run(
+        ["docker", "image", "inspect", SANDBOX_IMAGE],
+        capture_output=True,
+    )
+    if image_check.returncode != 0:
+        pytest.skip(f"sandbox image {SANDBOX_IMAGE!r} not built — run `just sandbox-build claude`")
+
+    if not (os.environ.get("ANTHROPIC_API_KEY") or os.environ.get("ANTHROPIC_AUTH_TOKEN")):
+        pytest.skip("ANTHROPIC_API_KEY (or ANTHROPIC_AUTH_TOKEN) not set in environment")
+
+    return True
+
+
+@pytest.fixture
+def sandbox_workspace(tmp_path):
+    """Copy demo/workspace to tmp_path so each test gets a clean state."""
+    src = REPO_ROOT / "demo" / "workspace"
+    dst = tmp_path / "workspace"
+    shutil.copytree(src, dst, ignore=shutil.ignore_patterns(".evolve", "backup", "sandbox-backup"))
+    return dst
+
+
+def _run_sandbox_prompt(workspace: Path, prompt: str) -> subprocess.CompletedProcess:
+    plugins = REPO_ROOT / "platform-integrations" / "claude" / "plugins"
+    cmd = ["docker", "run", "--rm"]
+    for var in FORWARDED_ENV_VARS:
+        if os.environ.get(var):
+            cmd += ["-e", var]
+    cmd += [
+        "-e", "EVOLVE_DEBUG=1",
+        "-v", f"{workspace}:/workspace",
+        "-v", f"{plugins}:/plugins",
+        SANDBOX_IMAGE,
+        "bash", "-c",
+        f'claude --plugin-dir /plugins/evolve-lite/ --dangerously-skip-permissions -p "{prompt}"',
+    ]
+    return subprocess.run(cmd, capture_output=True, text=True, timeout=SESSION_TIMEOUT_SECONDS)
+
+
+def _bash_commands(transcript_path: Path) -> list[str]:
+    commands = []
+    for line in transcript_path.read_text().splitlines():
+        if not line.strip():
+            continue
+        try:
+            record = json.loads(line)
+        except json.JSONDecodeError:
+            continue
+        content = record.get("message", {}).get("content", [])
+        if not isinstance(content, list):
+            continue
+        for block in content:
+            if isinstance(block, dict) and block.get("type") == "tool_use" and block.get("name") == "Bash":
+                cmd = block.get("input", {}).get("command", "")
+                if cmd:
+                    commands.append(cmd)
+    return commands
+
+
+@pytest.mark.e2e
+def test_learn_then_recall_flow(sandbox_ready, sandbox_workspace):
+    """Session 1 extracts a guideline; session 2 benefits from recall."""
+    del sandbox_ready  # only used for its skip side effect
+
+    # --- Session 1: location query — expected dead ends then recovery ---
+    log.info("session 1: running location query in sandbox...")
+    t0 = time.time()
+    result1 = _run_sandbox_prompt(
+        sandbox_workspace,
+        "where was the photo @sample.jpg taken. use exif metadata",
+    )
+    log.info(f"session 1: exited {result1.returncode} after {time.time() - t0:.0f}s")
+    assert result1.returncode == 0, (
+        f"session 1 exited {result1.returncode}\nstderr:\n{result1.stderr[-2000:]}"
+    )
+
+    entities_dir = sandbox_workspace / ".evolve" / "entities"
+    trajectories_dir = sandbox_workspace / ".evolve" / "trajectories"
+
+    assert entities_dir.is_dir(), (
+        f"{entities_dir} was not created — learn did not save guidelines.\n"
+        f"stdout:\n{result1.stdout[-2000:]}"
+    )
+    entity_files = list(entities_dir.rglob("*.md"))
+    assert entity_files, f"no guideline files found in {entities_dir}"
+    log.info(f"session 1: learn saved {len(entity_files)} guideline(s): {[p.name for p in entity_files]}")
+
+    transcripts = list(trajectories_dir.glob("*.jsonl"))
+    assert transcripts, f"no transcript saved in {trajectories_dir}"
+
+    # --- Session 2: focal length query — recall should inject the guideline ---
+    log.info("session 2: running focal length query in sandbox...")
+    t1 = time.time()
+    result2 = _run_sandbox_prompt(
+        sandbox_workspace,
+        "what focal length was used to take the photo @sample.jpg. use exif metadata",
+    )
+    log.info(f"session 2: exited {result2.returncode} after {time.time() - t1:.0f}s")
+    assert result2.returncode == 0, (
+        f"session 2 exited {result2.returncode}\nstderr:\n{result2.stderr[-2000:]}"
+    )
+
+    session2_transcripts = [
+        p for p in trajectories_dir.glob("*.jsonl")
+        if p not in transcripts
+    ]
+    assert session2_transcripts, "no new transcript saved for session 2"
+    session2_transcript = session2_transcripts[0]
+
+    commands = _bash_commands(session2_transcript)
+    log.info(f"session 2: checking {len(commands)} bash commands for forbidden tools")
+    joined = "\n".join(commands).lower()
+
+    # Recall should steer Claude away from the tools that failed in session 1.
+    # The guideline text itself may name these tools, but we're checking actual
+    # bash invocations, not string mentions — so a command like
+    # `python3 -c "import PIL"` would fail this check, while the guideline's
+    # prose mentioning PIL as unavailable does not.
+    assert "exiftool " not in joined and "exiftool$" not in joined, (
+        f"session 2 invoked exiftool despite recall guideline:\n"
+        + "\n".join(commands)
+    )
+    for banned in ("from pil", "import pil", "import piexif", "import exifread"):
+        assert banned not in joined, (
+            f"session 2 tried {banned!r} despite recall guideline:\n"
+            + "\n".join(commands)
+        )

From 3c6bd3e4c097d30431949d419edd7e2a2d23d6db Mon Sep 17 00:00:00 2001
From: Vinod Muthusamy <vinodmut@users.noreply.github.com>
Date: Wed, 29 Apr 2026 11:10:28 -0500
Subject: [PATCH 2/7] fix(tests): ruff format and remove extraneous f-string
 prefix

Fixes failing CI checks: check-formatting, check-linting
---
 tests/e2e/test_sandbox_learn_recall.py | 40 ++++++++++----------------
 1 file changed, 15 insertions(+), 25 deletions(-)

diff --git a/tests/e2e/test_sandbox_learn_recall.py b/tests/e2e/test_sandbox_learn_recall.py
index 2487a97b..03767041 100644
--- a/tests/e2e/test_sandbox_learn_recall.py
+++ b/tests/e2e/test_sandbox_learn_recall.py
@@ -80,11 +80,15 @@ def _run_sandbox_prompt(workspace: Path, prompt: str) -> subprocess.CompletedPro
         if os.environ.get(var):
             cmd += ["-e", var]
     cmd += [
-        "-e", "EVOLVE_DEBUG=1",
-        "-v", f"{workspace}:/workspace",
-        "-v", f"{plugins}:/plugins",
+        "-e",
+        "EVOLVE_DEBUG=1",
+        "-v",
+        f"{workspace}:/workspace",
+        "-v",
+        f"{plugins}:/plugins",
         SANDBOX_IMAGE,
-        "bash", "-c",
+        "bash",
+        "-c",
         f'claude --plugin-dir /plugins/evolve-lite/ --dangerously-skip-permissions -p "{prompt}"',
     ]
     return subprocess.run(cmd, capture_output=True, text=True, timeout=SESSION_TIMEOUT_SECONDS)
@@ -123,17 +127,12 @@ def test_learn_then_recall_flow(sandbox_ready, sandbox_workspace):
         "where was the photo @sample.jpg taken. use exif metadata",
     )
     log.info(f"session 1: exited {result1.returncode} after {time.time() - t0:.0f}s")
-    assert result1.returncode == 0, (
-        f"session 1 exited {result1.returncode}\nstderr:\n{result1.stderr[-2000:]}"
-    )
+    assert result1.returncode == 0, f"session 1 exited {result1.returncode}\nstderr:\n{result1.stderr[-2000:]}"
 
     entities_dir = sandbox_workspace / ".evolve" / "entities"
     trajectories_dir = sandbox_workspace / ".evolve" / "trajectories"
 
-    assert entities_dir.is_dir(), (
-        f"{entities_dir} was not created — learn did not save guidelines.\n"
-        f"stdout:\n{result1.stdout[-2000:]}"
-    )
+    assert entities_dir.is_dir(), f"{entities_dir} was not created — learn did not save guidelines.\nstdout:\n{result1.stdout[-2000:]}"
     entity_files = list(entities_dir.rglob("*.md"))
     assert entity_files, f"no guideline files found in {entities_dir}"
     log.info(f"session 1: learn saved {len(entity_files)} guideline(s): {[p.name for p in entity_files]}")
@@ -149,14 +148,9 @@ def test_learn_then_recall_flow(sandbox_ready, sandbox_workspace):
         "what focal length was used to take the photo @sample.jpg. use exif metadata",
     )
     log.info(f"session 2: exited {result2.returncode} after {time.time() - t1:.0f}s")
-    assert result2.returncode == 0, (
-        f"session 2 exited {result2.returncode}\nstderr:\n{result2.stderr[-2000:]}"
-    )
+    assert result2.returncode == 0, f"session 2 exited {result2.returncode}\nstderr:\n{result2.stderr[-2000:]}"
 
-    session2_transcripts = [
-        p for p in trajectories_dir.glob("*.jsonl")
-        if p not in transcripts
-    ]
+    session2_transcripts = [p for p in trajectories_dir.glob("*.jsonl") if p not in transcripts]
     assert session2_transcripts, "no new transcript saved for session 2"
     session2_transcript = session2_transcripts[0]
 
@@ -169,12 +163,8 @@ def test_learn_then_recall_flow(sandbox_ready, sandbox_workspace):
     # bash invocations, not string mentions — so a command like
     # `python3 -c "import PIL"` would fail this check, while the guideline's
     # prose mentioning PIL as unavailable does not.
-    assert "exiftool " not in joined and "exiftool$" not in joined, (
-        f"session 2 invoked exiftool despite recall guideline:\n"
-        + "\n".join(commands)
+    assert "exiftool " not in joined and "exiftool$" not in joined, "session 2 invoked exiftool despite recall guideline:\n" + "\n".join(
+        commands
     )
     for banned in ("from pil", "import pil", "import piexif", "import exifread"):
-        assert banned not in joined, (
-            f"session 2 tried {banned!r} despite recall guideline:\n"
-            + "\n".join(commands)
-        )
+        assert banned not in joined, f"session 2 tried {banned!r} despite recall guideline:\n" + "\n".join(commands)

From 2fef74b5a6b413d0bc1243db251c68e68ff8c9f7 Mon Sep 17 00:00:00 2001
From: Vinod Muthusamy <vinodmut@users.noreply.github.com>
Date: Wed, 29 Apr 2026 11:15:24 -0500
Subject: [PATCH 3/7] fix(sandbox): allowlist README placeholder in
 .secrets.baseline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes failing CI check: tekton/code-detect-secrets

The example env file snippet in sandbox/README.md tripped IBM's
detect-secrets scanner. Add the finding to .secrets.baseline — the
repo's existing convention (14 entries already present) — rather than
polluting the doc with inline pragmas. Change the placeholder to the
canonical sk-ant-xxxx form used elsewhere.
---
 .secrets.baseline | 13 +++++++++++--
 sandbox/README.md |  2 +-
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/.secrets.baseline b/.secrets.baseline
index a265cdbd..77d1a251 100644
--- a/.secrets.baseline
+++ b/.secrets.baseline
@@ -3,7 +3,7 @@
     "files": "^.secrets.baseline$|package-lock\\.json$",
     "lines": null
   },
-  "generated_at": "2026-04-20T15:55:53Z",
+  "generated_at": "2026-04-29T16:14:59Z",
   "plugins_used": [
     {
       "name": "AWSKeyDetector"
@@ -153,6 +153,15 @@
         "verified_result": null
       }
     ],
+    "sandbox/README.md": [
+      {
+        "hashed_secret": "b792a28a35da9b44fa0ee8a53002e9c238afb1bd",
+        "is_verified": false,
+        "line_number": 67,
+        "type": "Secret Keyword",
+        "verified_result": null
+      }
+    ],
     "sandbox/sample.env": [
       {
         "hashed_secret": "b792a28a35da9b44fa0ee8a53002e9c238afb1bd",
@@ -213,4 +222,4 @@
     "file": null,
     "hash": null
   }
-}
\ No newline at end of file
+}
diff --git a/sandbox/README.md b/sandbox/README.md
index 1065618d..8ca8151d 100644
--- a/sandbox/README.md
+++ b/sandbox/README.md
@@ -64,7 +64,7 @@ model):
 
 ```bash
 # Direct Anthropic API
-ANTHROPIC_API_KEY=sk-ant-...
+ANTHROPIC_API_KEY=sk-ant-xxxx
 
 # Or, via a proxy / gateway
 ANTHROPIC_AUTH_TOKEN=your-token

From 967acaa24d43375c195725546879934106e4d6ad Mon Sep 17 00:00:00 2001
From: Vinod Muthusamy <vinodmut@users.noreply.github.com>
Date: Wed, 29 Apr 2026 11:34:09 -0500
Subject: [PATCH 4/7] fix(sandbox): audit README placeholder as non-secret

Fixes failing CI check: tekton/pr-code-checks/code-detect-secrets

The baseline entry for sandbox/README.md needed `is_secret: false` to
pass tekton's `detect-secrets audit --fail-on-unaudited` check.
---
 .secrets.baseline | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.secrets.baseline b/.secrets.baseline
index 77d1a251..14ee6fa1 100644
--- a/.secrets.baseline
+++ b/.secrets.baseline
@@ -159,7 +159,8 @@
         "is_verified": false,
         "line_number": 67,
         "type": "Secret Keyword",
-        "verified_result": null
+        "verified_result": null,
+        "is_secret": false
       }
     ],
     "sandbox/sample.env": [

From 5b4271c5a007a75259ee85ced6e0c47b8ead9d37 Mon Sep 17 00:00:00 2001
From: Vinod Muthusamy <vinodmut@users.noreply.github.com>
Date: Wed, 29 Apr 2026 12:42:12 -0500
Subject: [PATCH 5/7] fix(tests): select newest session-2 transcript
 deterministically

Addresses CodeRabbit review finding: Session 2 transcript selection is
nondeterministic and can yield false passes

`glob()` order isn't guaranteed; if multiple new `*.jsonl` files ever
appear between session 1 and session 2 (e.g. a future change adds a
second writer), `[0]` could pick the wrong one. Use
`max(key=mtime)` to pick the newest deterministically.
---
 tests/e2e/test_sandbox_learn_recall.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/e2e/test_sandbox_learn_recall.py b/tests/e2e/test_sandbox_learn_recall.py
index 03767041..a967f05b 100644
--- a/tests/e2e/test_sandbox_learn_recall.py
+++ b/tests/e2e/test_sandbox_learn_recall.py
@@ -152,7 +152,7 @@ def test_learn_then_recall_flow(sandbox_ready, sandbox_workspace):
 
     session2_transcripts = [p for p in trajectories_dir.glob("*.jsonl") if p not in transcripts]
     assert session2_transcripts, "no new transcript saved for session 2"
-    session2_transcript = session2_transcripts[0]
+    session2_transcript = max(session2_transcripts, key=lambda p: p.stat().st_mtime)
 
     commands = _bash_commands(session2_transcript)
     log.info(f"session 2: checking {len(commands)} bash commands for forbidden tools")

From 8ed92ebad4d50558227bbc265d853fbbdf82d45a Mon Sep 17 00:00:00 2001
From: Vinod Muthusamy <vinodmut@users.noreply.github.com>
Date: Wed, 29 Apr 2026 12:42:38 -0500
Subject: [PATCH 6/7] fix(tests): use regex word boundary for banned tool
 detection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses CodeRabbit review finding: `exiftool$` check is ineffective
as written

The `"exiftool$" not in joined` check is a plain substring test, so `$`
was a literal character rather than an end-of-line anchor and the check
never matched a bare `exiftool` invocation. Replace it with
`re.search(r"\bexiftool\b", joined)`, which matches exiftool as a whole
word regardless of what follows it.
---
 tests/e2e/test_sandbox_learn_recall.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/e2e/test_sandbox_learn_recall.py b/tests/e2e/test_sandbox_learn_recall.py
index a967f05b..10ffe39e 100644
--- a/tests/e2e/test_sandbox_learn_recall.py
+++ b/tests/e2e/test_sandbox_learn_recall.py
@@ -18,6 +18,7 @@
 import json
 import logging
 import os
+import re
 import shutil
 import subprocess
 import time
@@ -163,8 +164,6 @@ def test_learn_then_recall_flow(sandbox_ready, sandbox_workspace):
     # bash invocations, not string mentions — so a command like
     # `python3 -c "import PIL"` would fail this check, while the guideline's
     # prose mentioning PIL as unavailable does not.
-    assert "exiftool " not in joined and "exiftool$" not in joined, "session 2 invoked exiftool despite recall guideline:\n" + "\n".join(
-        commands
-    )
+    assert not re.search(r"\bexiftool\b", joined), "session 2 invoked exiftool despite recall guideline:\n" + "\n".join(commands)
     for banned in ("from pil", "import pil", "import piexif", "import exifread"):
         assert banned not in joined, f"session 2 tried {banned!r} despite recall guideline:\n" + "\n".join(commands)

From e7b5dfc4ce92760b6a2a93ffbd4fc7376bd9866e Mon Sep 17 00:00:00 2001
From: Vinod Muthusamy <vinodmut@users.noreply.github.com>
Date: Wed, 29 Apr 2026 13:12:49 -0500
Subject: [PATCH 7/7] fix(tests): narrow recall assertion to exiftool only

The banned list previously included PIL/piexif/exifread, which are all
pip-installable. A valid recall guideline can legitimately recommend
"pip install <pkg>" for any of them, causing the assertion to fail on
correct behavior. Only exiftool is definitively absent in the sandbox
(not installed, not pip-installable), so an exiftool invocation is the
only reliable signal that recall failed to steer Claude away from an
unavailable tool.
---
 tests/e2e/test_sandbox_learn_recall.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/tests/e2e/test_sandbox_learn_recall.py b/tests/e2e/test_sandbox_learn_recall.py
index 10ffe39e..d39293a0 100644
--- a/tests/e2e/test_sandbox_learn_recall.py
+++ b/tests/e2e/test_sandbox_learn_recall.py
@@ -159,11 +159,8 @@ def test_learn_then_recall_flow(sandbox_ready, sandbox_workspace):
     log.info(f"session 2: checking {len(commands)} bash commands for forbidden tools")
     joined = "\n".join(commands).lower()
 
-    # Recall should steer Claude away from the tools that failed in session 1.
-    # The guideline text itself may name these tools, but we're checking actual
-    # bash invocations, not string mentions — so a command like
-    # `python3 -c "import PIL"` would fail this check, while the guideline's
-    # prose mentioning PIL as unavailable does not.
+    # Recall should steer Claude away from tools guaranteed-unavailable in the
+    # sandbox. Only `exiftool` is definitively absent (not installed, can't be
+    # pip-installed). Other libraries (PIL, piexif, exifread) may appear in a
+    # valid guideline as "install via pip and use", so we don't ban them.
     assert not re.search(r"\bexiftool\b", joined), "session 2 invoked exiftool despite recall guideline:\n" + "\n".join(commands)
-    for banned in ("from pil", "import pil", "import piexif", "import exifread"):
-        assert banned not in joined, f"session 2 tried {banned!r} despite recall guideline:\n" + "\n".join(commands)