Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions .secrets.baseline
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"files": "^.secrets.baseline$|package-lock\\.json$",
"lines": null
},
"generated_at": "2026-04-20T15:55:53Z",
"generated_at": "2026-04-29T16:14:59Z",
"plugins_used": [
{
"name": "AWSKeyDetector"
Expand Down Expand Up @@ -153,6 +153,16 @@
"verified_result": null
}
],
"sandbox/README.md": [
{
"hashed_secret": "b792a28a35da9b44fa0ee8a53002e9c238afb1bd",
"is_verified": false,
"line_number": 67,
"type": "Secret Keyword",
"verified_result": null,
"is_secret": false
}
],
"sandbox/sample.env": [
{
"hashed_secret": "b792a28a35da9b44fa0ee8a53002e9c238afb1bd",
Expand Down Expand Up @@ -213,4 +223,4 @@
"file": null,
"hash": null
}
}
}
20 changes: 1 addition & 19 deletions justfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@ codex_image := "evolve-codex-sandbox"
env_file := "sandbox/myenv"
sandbox_dir := "sandbox"
workspace := "demo/workspace"
trace := "false"
learn := "false"

# Build sandbox Docker image(s). Use target=claude or target=codex to build only one.
sandbox-build target="all":
Expand Down Expand Up @@ -58,31 +56,15 @@ sandbox-clean target="all":
claude-run:
docker run --rm -it --env-file {{env_file}} -v "$(cd {{workspace}} && pwd)":/workspace -v "$(pwd)/platform-integrations/claude/plugins":/plugins {{claude_image}}

# Run a one-shot prompt in the sandbox (trace=true to summarize session, learn=true to run /evolve-lite:learn)
# Run a one-shot prompt in the sandbox
claude-prompt prompt:
#!/usr/bin/env sh
export SANDBOX_PROMPT="$(cat <<'PROMPT_EOF'
{{prompt}}
PROMPT_EOF
)"
TRACE_CMD=""
LEARN_CMD=""
if [ "{{trace}}" = "true" ]; then
TRACE_CMD="
echo; echo; echo Summarizing the session...; echo
claude --plugin-dir /plugins/evolve-lite/ --dangerously-skip-permissions --no-session-persistence -p 'tell me what happened in the newest json file in /home/sandbox/.claude/projects/-workspace/'
"
fi
if [ "{{learn}}" = "true" ]; then
LEARN_CMD="
echo; echo; echo Learning...; echo
claude --plugin-dir /plugins/evolve-lite/ --dangerously-skip-permissions --continue -p '/evolve-lite:learn'
"
fi
docker run --rm -it --env SANDBOX_PROMPT --env-file {{env_file}} -v "$(cd {{workspace}} && pwd)":/workspace -v "$(pwd)/platform-integrations/claude/plugins":/plugins {{claude_image}} sh -c "
claude --plugin-dir /plugins/evolve-lite/ --dangerously-skip-permissions -p \"\$SANDBOX_PROMPT\"
$TRACE_CMD
$LEARN_CMD
"

# Smoke-test that Claude Code is installed and working
Expand Down
60 changes: 60 additions & 0 deletions sandbox/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,63 @@ docker run --rm -it --env-file sandbox/myenv -v "$(pwd)":/workspace claude-sandb
docker run --rm --env-file sandbox/myenv claude-sandbox claude -p "who are you"
```

## Automated E2E Test

`tests/e2e/test_sandbox_learn_recall.py` exercises the full evolve-lite
learn + recall loop end-to-end inside this sandbox. It runs two Claude
sessions:

1. **Session 1** asks Claude to extract EXIF metadata from a sample photo.
The sandbox lacks `exiftool` and `PIL`, so Claude hits dead ends and
recovers using stdlib. The Stop hook runs `learn`, which reads the
saved transcript and extracts a guideline.
2. **Session 2** asks a similar metadata question. Recall injects the
guideline from session 1, so Claude should skip the failing tools and
go straight to stdlib.

The test asserts a guideline file was produced in session 1 and that
session 2's bash commands do not invoke `exiftool`. `PIL` / `piexif` /
`exifread` are not banned, because a valid guideline may tell Claude to
install them via pip and use them.

### Prerequisites

- Build the sandbox image: `just sandbox-build claude`
- Credentials in the environment — either export `ANTHROPIC_API_KEY`
directly, or source an env file (e.g. with
[`dotenv`](https://github.com/bkeepers/dotenv)). The test forwards
these vars into the container when set: `ANTHROPIC_API_KEY`,
`ANTHROPIC_AUTH_TOKEN`, `ANTHROPIC_BASE_URL`, `CLAUDE_MODEL`,
`CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS`, `CLAUDE_CODE_SKIP_BEDROCK_AUTH`.

Example env file (one of `ANTHROPIC_API_KEY` or `ANTHROPIC_AUTH_TOKEN` is
required; the other variables are optional and used when routing through a
proxy or picking a specific model):

```bash
# Direct Anthropic API
ANTHROPIC_API_KEY=sk-ant-xxxx

# Or, via a proxy / gateway
ANTHROPIC_AUTH_TOKEN=your-token
ANTHROPIC_BASE_URL=https://your-gateway.example.com
CLAUDE_MODEL=claude-sonnet-4-6
CLAUDE_CODE_SKIP_BEDROCK_AUTH=1
```

### Run

```bash
# If creds live in an env file:
dotenv -e path/to/your.env -- \
uv run pytest tests/e2e/test_sandbox_learn_recall.py \
--run-e2e -m e2e -v --log-cli-level=INFO

# Or, with vars already exported:
uv run pytest tests/e2e/test_sandbox_learn_recall.py \
--run-e2e -m e2e -v --log-cli-level=INFO
```

The `--log-cli-level=INFO` flag streams per-session progress lines live
(~4 minutes total). The test skips if Docker, the sandbox image, or
credentials are missing.

166 changes: 166 additions & 0 deletions tests/e2e/test_sandbox_learn_recall.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
"""End-to-end test of the evolve-lite learn + recall flow in the sandbox.

Runs two sequential Claude Code sessions against the Dockerized sandbox:
1. Ask about photo location — sandbox lacks exiftool/PIL, so Claude hits
dead ends and recovers. Stop hook fires learn, which reads the saved
transcript and extracts a guideline.
2. Ask about focal length — UserPromptSubmit recall hook injects the
guideline from session 1, so Claude should skip the dead ends.

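Assertions:
- Session 1 produces a guideline file under .evolve/entities/ and a
transcript under .evolve/trajectories/.
- Session 2 does NOT invoke exiftool (recall shortcut worked). PIL, piexif
and exifread are not banned: a valid guideline may say to pip-install them.

Requires Docker, the `claude-sandbox` image built, and ANTHROPIC_API_KEY
(or ANTHROPIC_AUTH_TOKEN) set in the environment (forwarded into the container).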
"""

import json
import logging
import os
import re
import shutil
import subprocess
import time
from pathlib import Path

import pytest


# Module-level logger used for the per-session progress lines in the test.
log = logging.getLogger(__name__)


# Docker image the sessions run in; the readiness fixture skips if it is absent.
SANDBOX_IMAGE = "claude-sandbox"
# Third parent of this file's resolved path (file -> e2e dir -> tests dir -> here).
REPO_ROOT = Path(__file__).resolve().parents[2]
# Timeout, in seconds, applied to each `docker run` session.
SESSION_TIMEOUT_SECONDS = 600
# Host environment variables passed through to the container when they are set
# to a non-empty value.
FORWARDED_ENV_VARS = (
    "ANTHROPIC_API_KEY",
    "ANTHROPIC_AUTH_TOKEN",
    "ANTHROPIC_BASE_URL",
    "CLAUDE_MODEL",
    "CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS",
    "CLAUDE_CODE_SKIP_BEDROCK_AUTH",
)


@pytest.fixture(scope="session")
def sandbox_ready():
    """Gate the e2e test on its external prerequisites.

    Skips (rather than fails) when the docker CLI is missing, `docker info`
    exits non-zero, the sandbox image cannot be inspected, or neither
    Anthropic credential variable is set in the environment.
    """
    docker_cli = shutil.which("docker")
    if docker_cli is None:
        pytest.skip("docker not installed")

    daemon_probe = subprocess.run(["docker", "info"], capture_output=True)
    if daemon_probe.returncode != 0:
        pytest.skip("docker daemon not running")

    inspect_cmd = ["docker", "image", "inspect", SANDBOX_IMAGE]
    if subprocess.run(inspect_cmd, capture_output=True).returncode != 0:
        pytest.skip(f"sandbox image {SANDBOX_IMAGE!r} not built — run `just sandbox-build claude`")

    # Either credential is enough; an empty value counts as unset.
    credential_vars = ("ANTHROPIC_API_KEY", "ANTHROPIC_AUTH_TOKEN")
    if not any(os.environ.get(name) for name in credential_vars):
        pytest.skip("ANTHROPIC_API_KEY (or ANTHROPIC_AUTH_TOKEN) not set in environment")

    return True


@pytest.fixture
def sandbox_workspace(tmp_path):
    """Give each test a private copy of demo/workspace under tmp_path.

    Entries named .evolve, backup or sandbox-backup are left out of the copy
    so every run starts from a clean state.
    """
    skip_stale_state = shutil.ignore_patterns(".evolve", "backup", "sandbox-backup")
    workspace_copy = tmp_path / "workspace"
    shutil.copytree(REPO_ROOT / "demo" / "workspace", workspace_copy, ignore=skip_stale_state)
    return workspace_copy


def _run_sandbox_prompt(workspace: Path, prompt: str) -> subprocess.CompletedProcess:
    """Run one non-interactive Claude session inside the sandbox container.

    Args:
        workspace: Host directory bind-mounted at /workspace in the container.
        prompt: Prompt text handed to ``claude -p``. It is passed verbatim;
            quotes, ``$`` and backticks are not interpreted by the shell.

    Returns:
        The completed ``docker run`` process, with stdout and stderr captured
        as text.

    Raises:
        subprocess.TimeoutExpired: If the session runs longer than
            SESSION_TIMEOUT_SECONDS.
    """
    plugins = REPO_ROOT / "platform-integrations" / "claude" / "plugins"
    cmd = ["docker", "run", "--rm"]
    # `-e NAME` with no value forwards the host's value; unset or empty
    # variables are skipped.
    for var in FORWARDED_ENV_VARS:
        if os.environ.get(var):
            cmd += ["-e", var]
    cmd += [
        "-e",
        "EVOLVE_DEBUG=1",
        "-v",
        f"{workspace}:/workspace",
        "-v",
        f"{plugins}:/plugins",
        SANDBOX_IMAGE,
        "bash",
        "-c",
        # The prompt travels as positional parameter $1 rather than being
        # spliced into the script text, so it cannot break the quoting or
        # inject shell syntax.
        'claude --plugin-dir /plugins/evolve-lite/ --dangerously-skip-permissions -p "$1"',
        "bash",  # becomes $0 of the -c script
        prompt,  # becomes $1
    ]
    return subprocess.run(cmd, capture_output=True, text=True, timeout=SESSION_TIMEOUT_SECONDS)


def _bash_commands(transcript_path: Path) -> list[str]:
    """Collect the Bash tool commands recorded in a JSONL transcript.

    Each line is parsed as JSON. For records whose ``message.content`` is a
    list, every ``tool_use`` block named ``Bash`` contributes its
    ``input.command`` string. Blank lines, lines that are not valid JSON, and
    records or blocks with an unexpected shape are skipped instead of raising.

    Args:
        transcript_path: Path to the transcript file, read as UTF-8.

    Returns:
        The non-empty command strings, in file order.
    """
    commands = []
    for line in transcript_path.read_text(encoding="utf-8").splitlines():
        if not line.strip():
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue
        # A line may hold valid JSON that is not an object (list, string, null).
        if not isinstance(record, dict):
            continue
        message = record.get("message")
        if not isinstance(message, dict):
            continue
        content = message.get("content")
        if not isinstance(content, list):
            continue
        for block in content:
            if not isinstance(block, dict):
                continue
            if block.get("type") != "tool_use" or block.get("name") != "Bash":
                continue
            tool_input = block.get("input")
            cmd = tool_input.get("command") if isinstance(tool_input, dict) else None
            # Only strings are kept so callers can safely join the result.
            if isinstance(cmd, str) and cmd:
                commands.append(cmd)
    return commands


@pytest.mark.e2e
def test_learn_then_recall_flow(sandbox_ready, sandbox_workspace):
    """Session 1 extracts a guideline; session 2 benefits from recall.

    Runs two sandbox sessions against the same workspace copy. After the
    first, at least one guideline (*.md under .evolve/entities) and one
    transcript (*.jsonl under .evolve/trajectories) must exist. After the
    second, the newest new transcript must contain no Bash command that
    mentions `exiftool`.
    """
    del sandbox_ready  # only used for its skip side effect

    # --- Session 1: location query — expected dead ends then recovery ---
    log.info("session 1: running location query in sandbox...")
    # Monotonic clock: elapsed time must not be skewed by wall-clock changes.
    t0 = time.monotonic()
    result1 = _run_sandbox_prompt(
        sandbox_workspace,
        "where was the photo @sample.jpg taken. use exif metadata",
    )
    log.info("session 1: exited %s after %.0fs", result1.returncode, time.monotonic() - t0)
    assert result1.returncode == 0, f"session 1 exited {result1.returncode}\nstderr:\n{result1.stderr[-2000:]}"

    entities_dir = sandbox_workspace / ".evolve" / "entities"
    trajectories_dir = sandbox_workspace / ".evolve" / "trajectories"

    assert entities_dir.is_dir(), f"{entities_dir} was not created — learn did not save guidelines.\nstdout:\n{result1.stdout[-2000:]}"
    entity_files = list(entities_dir.rglob("*.md"))
    assert entity_files, f"no guideline files found in {entities_dir}"
    log.info("session 1: learn saved %s guideline(s): %s", len(entity_files), [p.name for p in entity_files])

    transcripts = list(trajectories_dir.glob("*.jsonl"))
    assert transcripts, f"no transcript saved in {trajectories_dir}"

    # --- Session 2: focal length query — recall should inject the guideline ---
    log.info("session 2: running focal length query in sandbox...")
    t1 = time.monotonic()
    result2 = _run_sandbox_prompt(
        sandbox_workspace,
        "what focal length was used to take the photo @sample.jpg. use exif metadata",
    )
    log.info("session 2: exited %s after %.0fs", result2.returncode, time.monotonic() - t1)
    assert result2.returncode == 0, f"session 2 exited {result2.returncode}\nstderr:\n{result2.stderr[-2000:]}"

    # Anything that was not present after session 1 belongs to session 2;
    # if several appeared, inspect the most recently modified one.
    seen_before = set(transcripts)
    session2_transcripts = [p for p in trajectories_dir.glob("*.jsonl") if p not in seen_before]
    assert session2_transcripts, "no new transcript saved for session 2"
    session2_transcript = max(session2_transcripts, key=lambda p: p.stat().st_mtime)

    commands = _bash_commands(session2_transcript)
    log.info("session 2: checking %s bash commands for forbidden tools", len(commands))
    joined = "\n".join(commands).lower()

    # Recall should steer Claude away from tools guaranteed-unavailable in the
    # sandbox. Only `exiftool` is definitively absent (not installed, can't be
    # pip-installed). Other libraries (PIL, piexif, exifread) may appear in a
    # valid guideline as "install via pip and use", so we don't ban them.
    assert not re.search(r"\bexiftool\b", joined), "session 2 invoked exiftool despite recall guideline:\n" + "\n".join(commands)
Loading