diff --git a/.secrets.baseline b/.secrets.baseline index a265cdbd..14ee6fa1 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -3,7 +3,7 @@ "files": "^.secrets.baseline$|package-lock\\.json$", "lines": null }, - "generated_at": "2026-04-20T15:55:53Z", + "generated_at": "2026-04-29T16:14:59Z", "plugins_used": [ { "name": "AWSKeyDetector" @@ -153,6 +153,16 @@ "verified_result": null } ], + "sandbox/README.md": [ + { + "hashed_secret": "b792a28a35da9b44fa0ee8a53002e9c238afb1bd", + "is_verified": false, + "line_number": 67, + "type": "Secret Keyword", + "verified_result": null, + "is_secret": false + } + ], "sandbox/sample.env": [ { "hashed_secret": "b792a28a35da9b44fa0ee8a53002e9c238afb1bd", @@ -213,4 +223,4 @@ "file": null, "hash": null } -} \ No newline at end of file +} diff --git a/justfile b/justfile index cbbe8107..915da28f 100644 --- a/justfile +++ b/justfile @@ -13,8 +13,6 @@ codex_image := "evolve-codex-sandbox" env_file := "sandbox/myenv" sandbox_dir := "sandbox" workspace := "demo/workspace" -trace := "false" -learn := "false" # Build sandbox Docker image(s). Use target=claude or target=codex to build only one. 
sandbox-build target="all": @@ -58,31 +56,15 @@ sandbox-clean target="all": claude-run: docker run --rm -it --env-file {{env_file}} -v "$(cd {{workspace}} && pwd)":/workspace -v "$(pwd)/platform-integrations/claude/plugins":/plugins {{claude_image}} -# Run a one-shot prompt in the sandbox (trace=true to summarize session, learn=true to run /evolve-lite:learn) +# Run a one-shot prompt in the sandbox claude-prompt prompt: #!/usr/bin/env sh export SANDBOX_PROMPT="$(cat <<'PROMPT_EOF' {{prompt}} PROMPT_EOF )" - TRACE_CMD="" - LEARN_CMD="" - if [ "{{trace}}" = "true" ]; then - TRACE_CMD=" - echo; echo; echo Summarizing the session...; echo - claude --plugin-dir /plugins/evolve-lite/ --dangerously-skip-permissions --no-session-persistence -p 'tell me what happened in the newest json file in /home/sandbox/.claude/projects/-workspace/' - " - fi - if [ "{{learn}}" = "true" ]; then - LEARN_CMD=" - echo; echo; echo Learning...; echo - claude --plugin-dir /plugins/evolve-lite/ --dangerously-skip-permissions --continue -p '/evolve-lite:learn' - " - fi docker run --rm -it --env SANDBOX_PROMPT --env-file {{env_file}} -v "$(cd {{workspace}} && pwd)":/workspace -v "$(pwd)/platform-integrations/claude/plugins":/plugins {{claude_image}} sh -c " claude --plugin-dir /plugins/evolve-lite/ --dangerously-skip-permissions -p \"\$SANDBOX_PROMPT\" - $TRACE_CMD - $LEARN_CMD " # Smoke-test that Claude Code is installed and working diff --git a/sandbox/README.md b/sandbox/README.md index fed9d8d4..8ca8151d 100644 --- a/sandbox/README.md +++ b/sandbox/README.md @@ -30,3 +30,63 @@ docker run --rm -it --env-file sandbox/myenv -v "$(pwd)":/workspace claude-sandb docker run --rm --env-file sandbox/myenv claude-sandbox claude -p "who are you" ``` +## Automated E2E Test + +`tests/e2e/test_sandbox_learn_recall.py` exercises the full evolve-lite +learn + recall loop end-to-end inside this sandbox. It runs two Claude +sessions: + +1. **Session 1** asks Claude to extract EXIF metadata from a sample photo. 
+ The sandbox lacks `exiftool` and `PIL`, so Claude hits dead ends and + recovers using stdlib. The Stop hook runs `learn`, which reads the + saved transcript and extracts a guideline. +2. **Session 2** asks a similar metadata question. Recall injects the + guideline from session 1, so Claude should skip the failing tools and + go straight to stdlib. + +The test asserts a guideline file was produced in session 1 and that +session 2's bash commands do not invoke `exiftool`. `PIL` / `piexif` / +`exifread` are not banned, since a guideline may install them via pip. + +### Prerequisites + +- Build the sandbox image: `just sandbox-build claude` +- Credentials in the environment — either export `ANTHROPIC_API_KEY` + directly, or source an env file (e.g. with + [`dotenv`](https://github.com/bkeepers/dotenv)). The test forwards + these vars into the container when set: `ANTHROPIC_API_KEY`, + `ANTHROPIC_AUTH_TOKEN`, `ANTHROPIC_BASE_URL`, `CLAUDE_MODEL`, + `CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS`, `CLAUDE_CODE_SKIP_BEDROCK_AUTH`. + +Example env file (one of `ANTHROPIC_API_KEY` or `ANTHROPIC_AUTH_TOKEN` is +required; the others are optional and used when routing through a proxy or +picking a specific model): + +```bash +# Direct Anthropic API +ANTHROPIC_API_KEY=sk-ant-xxxx + +# Or, via a proxy / gateway +ANTHROPIC_AUTH_TOKEN=your-token +ANTHROPIC_BASE_URL=https://your-gateway.example.com +CLAUDE_MODEL=claude-sonnet-4-6 +CLAUDE_CODE_SKIP_BEDROCK_AUTH=1 +``` + +### Run + +```bash +# If creds live in an env file: +dotenv -e path/to/your.env -- \ + uv run pytest tests/e2e/test_sandbox_learn_recall.py \ + --run-e2e -m e2e -v --log-cli-level=INFO + +# Or, with vars already exported: +uv run pytest tests/e2e/test_sandbox_learn_recall.py \ + --run-e2e -m e2e -v --log-cli-level=INFO +``` + +The `--log-cli-level=INFO` flag streams per-session progress lines live +(~4 minutes total). The test skips if Docker, the sandbox image, or +credentials are missing. 
+
diff --git a/tests/e2e/test_sandbox_learn_recall.py b/tests/e2e/test_sandbox_learn_recall.py
new file mode 100644
index 00000000..d39293a0
--- /dev/null
+++ b/tests/e2e/test_sandbox_learn_recall.py
@@ -0,0 +1,202 @@
+"""End-to-end test of the evolve-lite learn + recall flow in the sandbox.
+
+Runs two sequential Claude Code sessions against the Dockerized sandbox:
+  1. Ask about photo location — sandbox lacks exiftool/PIL, so Claude hits
+     dead ends and recovers. Stop hook fires learn, which reads the saved
+     transcript and extracts a guideline.
+  2. Ask about focal length — UserPromptSubmit recall hook injects the
+     guideline from session 1, so Claude should skip the dead ends.
+
+Assertions:
+  - Session 1 produces a guideline file under .evolve/entities/ and a
+    transcript under .evolve/trajectories/.
+  - Session 2 does NOT invoke exiftool (recall shortcut worked). PIL,
+    piexif and exifread are not banned: a valid guideline may say
+    "install via pip and use".
+
+Requires Docker, the `claude-sandbox` image built, and ANTHROPIC_API_KEY
+(or ANTHROPIC_AUTH_TOKEN) set in the environment (forwarded into the
+container).
+"""
+
+import json
+import logging
+import os
+import re
+import shlex
+import shutil
+import subprocess
+import time
+from pathlib import Path
+
+import pytest
+
+
+log = logging.getLogger(__name__)
+
+
+SANDBOX_IMAGE = "claude-sandbox"
+REPO_ROOT = Path(__file__).resolve().parents[2]
+SESSION_TIMEOUT_SECONDS = 600
+# Host environment variables forwarded into the container when set.
+FORWARDED_ENV_VARS = (
+    "ANTHROPIC_API_KEY",
+    "ANTHROPIC_AUTH_TOKEN",
+    "ANTHROPIC_BASE_URL",
+    "CLAUDE_MODEL",
+    "CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS",
+    "CLAUDE_CODE_SKIP_BEDROCK_AUTH",
+)
+
+
+@pytest.fixture(scope="session")
+def sandbox_ready():
+    """Skip if Docker, the sandbox image, or credentials aren't available."""
+    if shutil.which("docker") is None:
+        pytest.skip("docker not installed")
+
+    if subprocess.run(["docker", "info"], capture_output=True).returncode != 0:
+        pytest.skip("docker daemon not running")
+
+    image_check = subprocess.run(
+        ["docker", "image", "inspect", SANDBOX_IMAGE],
+        capture_output=True,
+    )
+    if image_check.returncode != 0:
+        pytest.skip(f"sandbox image {SANDBOX_IMAGE!r} not built — run `just sandbox-build claude`")
+
+    if not (os.environ.get("ANTHROPIC_API_KEY") or os.environ.get("ANTHROPIC_AUTH_TOKEN")):
+        pytest.skip("ANTHROPIC_API_KEY (or ANTHROPIC_AUTH_TOKEN) not set in environment")
+
+    return True
+
+
+@pytest.fixture
+def sandbox_workspace(tmp_path):
+    """Copy demo/workspace to tmp_path so each test gets a clean state."""
+    src = REPO_ROOT / "demo" / "workspace"
+    dst = tmp_path / "workspace"
+    shutil.copytree(src, dst, ignore=shutil.ignore_patterns(".evolve", "backup", "sandbox-backup"))
+    return dst
+
+
+def _run_sandbox_prompt(workspace: Path, prompt: str) -> subprocess.CompletedProcess:
+    """Run one non-interactive `claude -p` session inside the sandbox container.
+
+    Args:
+        workspace: Host directory bind-mounted at /workspace.
+        prompt: Prompt text handed to `claude -p`.
+
+    Returns:
+        The finished `docker run` process, with stdout/stderr captured as text.
+
+    Raises:
+        subprocess.TimeoutExpired: If the run exceeds SESSION_TIMEOUT_SECONDS.
+    """
+    plugins = REPO_ROOT / "platform-integrations" / "claude" / "plugins"
+    # Quote the prompt for the container's bash so quotes, `$` or backticks
+    # in it reach claude verbatim instead of being interpreted by the shell.
+    claude_cmd = f"claude --plugin-dir /plugins/evolve-lite/ --dangerously-skip-permissions -p {shlex.quote(prompt)}"
+    cmd = ["docker", "run", "--rm"]
+    for var in FORWARDED_ENV_VARS:
+        if os.environ.get(var):
+            # Name only: docker copies the value from this process's
+            # environment, keeping secrets off the command line.
+            cmd += ["-e", var]
+    cmd += [
+        "-e",
+        "EVOLVE_DEBUG=1",
+        "-v",
+        f"{workspace}:/workspace",
+        "-v",
+        f"{plugins}:/plugins",
+        SANDBOX_IMAGE,
+        "bash",
+        "-c",
+        claude_cmd,
+    ]
+    return subprocess.run(cmd, capture_output=True, text=True, timeout=SESSION_TIMEOUT_SECONDS)
+
+
+def _bash_commands(transcript_path: Path) -> list[str]:
+    """Return the Bash tool commands recorded in a JSONL transcript, in order.
+
+    Blank lines, lines that are not valid JSON, and records that do not carry
+    a list of content blocks under message.content are skipped.
+    """
+    commands = []
+    for line in transcript_path.read_text(encoding="utf-8").splitlines():
+        if not line.strip():
+            continue
+        try:
+            record = json.loads(line)
+        except json.JSONDecodeError:
+            continue
+        # A line may decode to a non-object, or carry a null message/input;
+        # treat those as "no commands" rather than raising AttributeError.
+        message = record.get("message") if isinstance(record, dict) else None
+        content = message.get("content") if isinstance(message, dict) else None
+        if not isinstance(content, list):
+            continue
+        for block in content:
+            if not isinstance(block, dict):
+                continue
+            if block.get("type") != "tool_use" or block.get("name") != "Bash":
+                continue
+            tool_input = block.get("input")
+            cmd = tool_input.get("command") if isinstance(tool_input, dict) else None
+            if isinstance(cmd, str) and cmd:
+                commands.append(cmd)
+    return commands
+
+
+@pytest.mark.e2e
+def test_learn_then_recall_flow(sandbox_ready, sandbox_workspace):
+    """Session 1 extracts a guideline; session 2 benefits from recall."""
+    del sandbox_ready  # only used for its skip side effect
+
+    # --- Session 1: location query — expected dead ends then recovery ---
+    log.info("session 1: running location query in sandbox...")
+    t0 = time.monotonic()
+    result1 = _run_sandbox_prompt(
+        sandbox_workspace,
+        "where was the photo @sample.jpg taken. use exif metadata",
+    )
+    log.info("session 1: exited %s after %.0fs", result1.returncode, time.monotonic() - t0)
+    assert result1.returncode == 0, f"session 1 exited {result1.returncode}\nstderr:\n{result1.stderr[-2000:]}"
+
+    entities_dir = sandbox_workspace / ".evolve" / "entities"
+    trajectories_dir = sandbox_workspace / ".evolve" / "trajectories"
+
+    assert entities_dir.is_dir(), f"{entities_dir} was not created — learn did not save guidelines.\nstdout:\n{result1.stdout[-2000:]}"
+    entity_files = list(entities_dir.rglob("*.md"))
+    assert entity_files, f"no guideline files found in {entities_dir}"
+    log.info("session 1: learn saved %d guideline(s): %s", len(entity_files), [p.name for p in entity_files])
+
+    # Snapshot session 1's transcripts so session 2's can be told apart.
+    transcripts = set(trajectories_dir.glob("*.jsonl"))
+    assert transcripts, f"no transcript saved in {trajectories_dir}"
+
+    # --- Session 2: focal length query — recall should inject the guideline ---
+    log.info("session 2: running focal length query in sandbox...")
+    t1 = time.monotonic()
+    result2 = _run_sandbox_prompt(
+        sandbox_workspace,
+        "what focal length was used to take the photo @sample.jpg. use exif metadata",
+    )
+    log.info("session 2: exited %s after %.0fs", result2.returncode, time.monotonic() - t1)
+    assert result2.returncode == 0, f"session 2 exited {result2.returncode}\nstderr:\n{result2.stderr[-2000:]}"
+
+    session2_transcripts = [p for p in trajectories_dir.glob("*.jsonl") if p not in transcripts]
+    assert session2_transcripts, "no new transcript saved for session 2"
+    session2_transcript = max(session2_transcripts, key=lambda p: p.stat().st_mtime)
+
+    commands = _bash_commands(session2_transcript)
+    log.info("session 2: checking %d bash commands for forbidden tools", len(commands))
+    joined = "\n".join(commands).lower()
+
+    # Recall should steer Claude away from tools guaranteed-unavailable in the
+    # sandbox. Only `exiftool` is definitively absent (not installed, can't be
+    # pip-installed). Other libraries (PIL, piexif, exifread) may appear in a
+    # valid guideline as "install via pip and use", so we don't ban them.
+    assert not re.search(r"\bexiftool\b", joined), "session 2 invoked exiftool despite recall guideline:\n" + "\n".join(commands)