AgentToolkit · visahak · Apr 29, 2026 · Apr 29, 2026 · Apr 29, 2026 · Apr 29, 2026
diff --git a/.secrets.baseline b/.secrets.baseline
@@ -3,7 +3,7 @@
     "files": "^.secrets.baseline$|package-lock\\.json$",
     "lines": null
   },
-  "generated_at": "2026-04-20T15:55:53Z",
+  "generated_at": "2026-04-29T16:14:59Z",
   "plugins_used": [
     {
       "name": "AWSKeyDetector"
@@ -153,6 +153,16 @@
         "verified_result": null
       }
     ],
+    "sandbox/README.md": [
+      {
+        "hashed_secret": "b792a28a35da9b44fa0ee8a53002e9c238afb1bd",
+        "is_verified": false,
+        "line_number": 67,
+        "type": "Secret Keyword",
+        "verified_result": null,
+        "is_secret": false
+      }
+    ],
     "sandbox/sample.env": [
       {
         "hashed_secret": "b792a28a35da9b44fa0ee8a53002e9c238afb1bd",
@@ -213,4 +223,4 @@
     "file": null,
     "hash": null
   }
-}
+}
diff --git a/justfile b/justfile
@@ -13,8 +13,6 @@ codex_image := "evolve-codex-sandbox"
 env_file := "sandbox/myenv"
 sandbox_dir := "sandbox"
 workspace := "demo/workspace"
-trace := "false"
-learn := "false"
 
 # Build sandbox Docker image(s). Use target=claude or target=codex to build only one.
 sandbox-build target="all":
@@ -58,31 +56,15 @@ sandbox-clean target="all":
 claude-run:
     docker run --rm -it --env-file {{env_file}} -v "$(cd {{workspace}} && pwd)":/workspace -v "$(pwd)/platform-integrations/claude/plugins":/plugins {{claude_image}}
 
-# Run a one-shot prompt in the sandbox (trace=true to summarize session, learn=true to run /evolve-lite:learn)
+# Run a one-shot prompt in the sandbox
 claude-prompt prompt:
     #!/usr/bin/env sh
     export SANDBOX_PROMPT="$(cat <<'PROMPT_EOF'
     {{prompt}}
     PROMPT_EOF
     )"
-    TRACE_CMD=""
-    LEARN_CMD=""
-    if [ "{{trace}}" = "true" ]; then
-        TRACE_CMD="
-            echo; echo; echo Summarizing the session...; echo
-            claude --plugin-dir /plugins/evolve-lite/ --dangerously-skip-permissions --no-session-persistence -p 'tell me what happened in the newest json file in /home/sandbox/.claude/projects/-workspace/'
-        "
-    fi
-    if [ "{{learn}}" = "true" ]; then
-        LEARN_CMD="
-            echo; echo; echo Learning...; echo
-            claude --plugin-dir /plugins/evolve-lite/ --dangerously-skip-permissions --continue -p '/evolve-lite:learn'
-        "
-    fi
     docker run --rm -it --env SANDBOX_PROMPT --env-file {{env_file}} -v "$(cd {{workspace}} && pwd)":/workspace -v "$(pwd)/platform-integrations/claude/plugins":/plugins {{claude_image}} sh -c "
         claude --plugin-dir /plugins/evolve-lite/ --dangerously-skip-permissions -p \"\$SANDBOX_PROMPT\"
-        $TRACE_CMD
-        $LEARN_CMD
     "
 
 # Smoke-test that Claude Code is installed and working

diff --git a/sandbox/README.md b/sandbox/README.md
@@ -30,3 +30,63 @@ docker run --rm -it --env-file sandbox/myenv -v "$(pwd)":/workspace claude-sandb
 docker run --rm --env-file sandbox/myenv claude-sandbox claude -p "who are you"
 ```
 
+## Automated E2E Test
+
+`tests/e2e/test_sandbox_learn_recall.py` exercises the full evolve-lite
+learn + recall loop end-to-end inside this sandbox. It runs two Claude
+sessions:
+
+1. **Session 1** asks Claude to extract EXIF metadata from a sample photo.
+   The sandbox lacks `exiftool` and `PIL`, so Claude hits dead ends and
+   recovers using stdlib. The Stop hook runs `learn`, which reads the
+   saved transcript and extracts a guideline.
+2. **Session 2** asks a similar metadata question. Recall injects the
+   guideline from session 1, so Claude should skip the failing tools and
+   go straight to stdlib.
+
+The test asserts a guideline file was produced in session 1 and that
+session 2's bash commands do not invoke `exiftool`. `PIL` / `piexif` /
+`exifread` are not banned, since a valid guideline may pip-install them.
+
+### Prerequisites
+
+- Build the sandbox image: `just sandbox-build claude`
+- Credentials in the environment — either export `ANTHROPIC_API_KEY`
+  directly, or source an env file (e.g. with
+  [`dotenv-cli`](https://github.com/entropitor/dotenv-cli)). The test forwards
+  these vars into the container when set: `ANTHROPIC_API_KEY`,
+  `ANTHROPIC_AUTH_TOKEN`, `ANTHROPIC_BASE_URL`, `CLAUDE_MODEL`,
+  `CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS`, `CLAUDE_CODE_SKIP_BEDROCK_AUTH`.
+
+Example env file (either `ANTHROPIC_API_KEY` or `ANTHROPIC_AUTH_TOKEN`
+is required; the rest are optional and used when routing through a
+proxy or picking a specific model):
+
+```bash
+# Direct Anthropic API
+ANTHROPIC_API_KEY=sk-ant-xxxx
+
+# Or, via a proxy / gateway
+ANTHROPIC_AUTH_TOKEN=your-token
+ANTHROPIC_BASE_URL=https://your-gateway.example.com
+CLAUDE_MODEL=claude-sonnet-4-6
+CLAUDE_CODE_SKIP_BEDROCK_AUTH=1
+```
+
+### Run
+
+```bash
+# If creds live in an env file:
+dotenv -e path/to/your.env -- \
+  uv run pytest tests/e2e/test_sandbox_learn_recall.py \
+    --run-e2e -m e2e -v --log-cli-level=INFO
+
+# Or, with vars already exported:
+uv run pytest tests/e2e/test_sandbox_learn_recall.py \
+  --run-e2e -m e2e -v --log-cli-level=INFO
+```
+
+The `--log-cli-level=INFO` flag streams per-session progress lines live
+(~4 minutes total). The test skips if Docker, the sandbox image, or
+credentials are missing.
+
diff --git a/tests/e2e/test_sandbox_learn_recall.py b/tests/e2e/test_sandbox_learn_recall.py
@@ -0,0 +1,166 @@
+"""End-to-end test of the evolve-lite learn + recall flow in the sandbox.
+
+Runs two sequential Claude Code sessions against the Dockerized sandbox:
+  1. Ask about photo location — sandbox lacks exiftool/PIL, so Claude hits
+     dead ends and recovers. Stop hook fires learn, which reads the saved
+     transcript and extracts a guideline.
+  2. Ask about focal length — UserPromptSubmit recall hook injects the
+     guideline from session 1, so Claude should skip the dead ends.
+
+Assertions:
+  - Session 1 produces a guideline file under .evolve/entities/.
+  - Session 2 does NOT invoke exiftool (recall shortcut worked).
+
+Requires Docker, the `claude-sandbox` image built, and ANTHROPIC_API_KEY
+(or ANTHROPIC_AUTH_TOKEN) set in the environment (forwarded into the container).
+"""
+
+import json
+import logging
+import os
+import re
+import shutil
+import subprocess
+import time
+from pathlib import Path
+
+import pytest
+
+
+log = logging.getLogger(__name__)
+
+
+SANDBOX_IMAGE = "claude-sandbox"
+REPO_ROOT = Path(__file__).resolve().parents[2]
+SESSION_TIMEOUT_SECONDS = 600
+FORWARDED_ENV_VARS = (
+    "ANTHROPIC_API_KEY",
+    "ANTHROPIC_AUTH_TOKEN",
+    "ANTHROPIC_BASE_URL",
+    "CLAUDE_MODEL",
+    "CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS",
+    "CLAUDE_CODE_SKIP_BEDROCK_AUTH",
+)
+
+
+@pytest.fixture(scope="session")
+def sandbox_ready():
+    """Skip if Docker, the sandbox image, or credentials aren't available."""
+    if shutil.which("docker") is None:
+        pytest.skip("docker not installed")
+
+    if subprocess.run(["docker", "info"], capture_output=True).returncode != 0:
+        pytest.skip("docker daemon not running")
+
+    image_check = subprocess.run(
+        ["docker", "image", "inspect", SANDBOX_IMAGE],
+        capture_output=True,
+    )
+    if image_check.returncode != 0:
+        pytest.skip(f"sandbox image {SANDBOX_IMAGE!r} not built — run `just sandbox-build claude`")
+
+    if not (os.environ.get("ANTHROPIC_API_KEY") or os.environ.get("ANTHROPIC_AUTH_TOKEN")):
+        pytest.skip("ANTHROPIC_API_KEY (or ANTHROPIC_AUTH_TOKEN) not set in environment")
+
+    return True
+
+
+@pytest.fixture
+def sandbox_workspace(tmp_path):
+    """Copy demo/workspace to tmp_path so each test gets a clean state."""
+    src = REPO_ROOT / "demo" / "workspace"
+    dst = tmp_path / "workspace"
+    shutil.copytree(src, dst, ignore=shutil.ignore_patterns(".evolve", "backup", "sandbox-backup"))
+    return dst
+
+
+def _run_sandbox_prompt(workspace: Path, prompt: str) -> subprocess.CompletedProcess:
+    plugins = REPO_ROOT / "platform-integrations" / "claude" / "plugins"
+    cmd = ["docker", "run", "--rm"]
+    for var in FORWARDED_ENV_VARS:
+        if os.environ.get(var):
+            cmd += ["-e", var]
+    cmd += [
+        "-e",
+        "EVOLVE_DEBUG=1",
+        "-v",
+        f"{workspace}:/workspace",
+        "-v",
+        f"{plugins}:/plugins",
+        SANDBOX_IMAGE,
+        "bash",
+        "-c",
+        f'claude --plugin-dir /plugins/evolve-lite/ --dangerously-skip-permissions -p "{prompt}"',
+    ]
+    return subprocess.run(cmd, capture_output=True, text=True, timeout=SESSION_TIMEOUT_SECONDS)
+
+
+def _bash_commands(transcript_path: Path) -> list[str]:
+    commands = []
+    for line in transcript_path.read_text().splitlines():
+        if not line.strip():
+            continue
+        try:
+            record = json.loads(line)
+        except json.JSONDecodeError:
+            continue
+        content = record.get("message", {}).get("content", [])
+        if not isinstance(content, list):
+            continue
+        for block in content:
+            if isinstance(block, dict) and block.get("type") == "tool_use" and block.get("name") == "Bash":
+                cmd = block.get("input", {}).get("command", "")
+                if cmd:
+                    commands.append(cmd)
+    return commands
+
+
+@pytest.mark.e2e
+def test_learn_then_recall_flow(sandbox_ready, sandbox_workspace):
+    """Session 1 extracts a guideline; session 2 benefits from recall.
+
+    Runs two prompts in sequence against the same copied workspace. After the
+    first, asserts that guideline ``.md`` files exist under ``.evolve/entities``
+    and that a ``.jsonl`` transcript exists under ``.evolve/trajectories``.
+    After the second, asserts that a new transcript appeared and that none of
+    its Bash commands mention ``exiftool``.
+    """
+    del sandbox_ready  # only used for its skip side effect
+
+    # --- Session 1: location query — expected dead ends then recovery ---
+    log.info("session 1: running location query in sandbox...")
+    t0 = time.time()
+    result1 = _run_sandbox_prompt(
+        sandbox_workspace,
+        "where was the photo @sample.jpg taken. use exif metadata",
+    )
+    log.info(f"session 1: exited {result1.returncode} after {time.time() - t0:.0f}s")
+    assert result1.returncode == 0, f"session 1 exited {result1.returncode}\nstderr:\n{result1.stderr[-2000:]}"
+
+    entities_dir = sandbox_workspace / ".evolve" / "entities"
+    trajectories_dir = sandbox_workspace / ".evolve" / "trajectories"
+
+    assert entities_dir.is_dir(), f"{entities_dir} was not created — learn did not save guidelines.\nstdout:\n{result1.stdout[-2000:]}"
+    entity_files = list(entities_dir.rglob("*.md"))
+    assert entity_files, f"no guideline files found in {entities_dir}"
+    log.info(f"session 1: learn saved {len(entity_files)} guideline(s): {[p.name for p in entity_files]}")
+
+    # Snapshot the transcripts that exist before session 2 starts, so the
+    # transcript written by session 2 can be told apart from them below.
+    transcripts = list(trajectories_dir.glob("*.jsonl"))
+    assert transcripts, f"no transcript saved in {trajectories_dir}"
+
+    # --- Session 2: focal length query — recall should inject the guideline ---
+    log.info("session 2: running focal length query in sandbox...")
+    t1 = time.time()
+    result2 = _run_sandbox_prompt(
+        sandbox_workspace,
+        "what focal length was used to take the photo @sample.jpg. use exif metadata",
+    )
+    log.info(f"session 2: exited {result2.returncode} after {time.time() - t1:.0f}s")
+    assert result2.returncode == 0, f"session 2 exited {result2.returncode}\nstderr:\n{result2.stderr[-2000:]}"
+
+    # Only transcripts absent from the pre-session-2 snapshot are considered;
+    # if several appeared, the most recently modified one is inspected.
+    session2_transcripts = [p for p in trajectories_dir.glob("*.jsonl") if p not in transcripts]
+    assert session2_transcripts, "no new transcript saved for session 2"
+    session2_transcript = max(session2_transcripts, key=lambda p: p.stat().st_mtime)
+
+    commands = _bash_commands(session2_transcript)
+    log.info(f"session 2: checking {len(commands)} bash commands for forbidden tools")
+    # Lower-cased so the word-boundary match below ignores case.
+    joined = "\n".join(commands).lower()
+
+    # Recall should steer Claude away from tools guaranteed-unavailable in the
+    # sandbox. Only `exiftool` is definitively absent (not installed, can't be
+    # pip-installed). Other libraries (PIL, piexif, exifread) may appear in a
+    # valid guideline as "install via pip and use", so we don't ban them.
+    assert not re.search(r"\bexiftool\b", joined), "session 2 invoked exiftool despite recall guideline:\n" + "\n".join(commands)