Add a Bob (IBM bobshell) sandbox target: image build/clean, just recipes,
README instructions, and a learn + recall end-to-end test.

BOB_HOME handling: the value may be absolute or relative to the repo root.
The just recipes resolve it with `$(cd … && pwd)` (same idiom as the
workspace mount) and the e2e test anchors it at REPO_ROOT, so `docker run -v`
always receives an absolute host path and mounts the same directory that
`mkdir -p` / `_bob-force-sso` wrote to.

NOTE(review): line structure, indentation and blank context lines in this
patch were reconstructed from the hunk counts — confirm against the tree.
NOTE(review): the `index` lines for justfile and the test module still carry
the original post-image blob ids; regenerate with `git diff` before relying
on a 3-way apply.

diff --git a/.gitignore b/.gitignore
index 5c925ec2..bd26a3ad 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,7 @@ __pycache__
 evolve_data
 demo/workdir/.claude/
 .bob
+.bob-sandbox-home/
 .claude
 dist
 .coverage
diff --git a/justfile b/justfile
index fc1aaeaf..95b48d2c 100644
--- a/justfile
+++ b/justfile
@@ -10,16 +10,21 @@ commit message:
 
 claude_image := "claude-sandbox"
 codex_image := "evolve-codex-sandbox"
+bob_image := "evolve-bob-sandbox"
 env_file := "sandbox/myenv"
+# BOB_HOME may be absolute or relative to this justfile's directory.
+bob_home := env_var_or_default("BOB_HOME", ".bob-sandbox-home")
+bob_hostname := env_var_or_default("BOB_HOSTNAME", "evolve-bob-sandbox")
+bob_sso_port := env_var_or_default("BOB_SSO_PORT", "47687")
 sandbox_dir := "sandbox"
 workspace := "demo/workspace"
 
-# Build sandbox Docker image(s). Use target=claude or target=codex to build only one.
+# Build sandbox Docker image(s). Use target=claude, codex, or bob to build only one.
 sandbox-build target="all":
     #!/usr/bin/env sh
     set -e
-    if [ "{{target}}" != "all" ] && [ "{{target}}" != "claude" ] && [ "{{target}}" != "codex" ]; then
-        echo "Error: target must be one of: all, claude, codex" >&2
+    if [ "{{target}}" != "all" ] && [ "{{target}}" != "claude" ] && [ "{{target}}" != "codex" ] && [ "{{target}}" != "bob" ]; then
+        echo "Error: target must be one of: all, claude, codex, bob" >&2
         exit 1
     fi
     if [ "{{target}}" = "all" ] || [ "{{target}}" = "claude" ]; then
@@ -28,6 +33,9 @@ sandbox-build target="all":
     if [ "{{target}}" = "all" ] || [ "{{target}}" = "codex" ]; then
         docker build --target codex -t {{codex_image}} {{sandbox_dir}}
     fi
+    if [ "{{target}}" = "all" ] || [ "{{target}}" = "bob" ]; then
+        docker build --target bob -t {{bob_image}} {{sandbox_dir}}
+    fi
 
 # Copy sample.env to myenv if it doesn't already exist
 sandbox-setup:
@@ -38,11 +46,11 @@ sandbox-setup:
         echo "{{env_file}} already exists, skipping"; \
     fi
 
-# Remove sandbox Docker image(s). Use target=claude or target=codex to remove only one.
+# Remove sandbox Docker image(s). Use target=claude, codex, or bob to remove only one.
 sandbox-clean target="all":
     #!/usr/bin/env sh
-    if [ "{{target}}" != "all" ] && [ "{{target}}" != "claude" ] && [ "{{target}}" != "codex" ]; then
-        echo "Error: target must be one of: all, claude, codex" >&2
+    if [ "{{target}}" != "all" ] && [ "{{target}}" != "claude" ] && [ "{{target}}" != "codex" ] && [ "{{target}}" != "bob" ]; then
+        echo "Error: target must be one of: all, claude, codex, bob" >&2
         exit 1
     fi
     if [ "{{target}}" = "all" ] || [ "{{target}}" = "claude" ]; then
@@ -51,6 +59,9 @@ sandbox-clean target="all":
     if [ "{{target}}" = "all" ] || [ "{{target}}" = "codex" ]; then
         docker rmi {{codex_image}} || true
     fi
+    if [ "{{target}}" = "all" ] || [ "{{target}}" = "bob" ]; then
+        docker rmi {{bob_image}} || true
+    fi
 
 # Run an interactive Claude Code shell in the sandbox
 claude-run:
@@ -79,6 +90,38 @@ codex-run:
 codex-test:
     docker run --rm --env-file {{env_file}} {{codex_image}} codex exec --skip-git-repo-check "who are you"
 
+# Run an interactive Bob shell in the sandbox
+bob-run: _bob-force-sso
+    mkdir -p {{bob_home}}
+    docker run --rm -it --hostname {{bob_hostname}} --env BOB_SHELL_FORCE_FILE_STORAGE=true --env SSO_PORT={{bob_sso_port}} --publish 127.0.0.1:{{bob_sso_port}}:{{bob_sso_port}} -v "$(cd {{workspace}} && pwd)":/workspace -v "$(cd {{bob_home}} && pwd)":/home/sandbox/.bob {{bob_image}}
+
+# Authenticate Bob in the sandbox with browser SSO. Open the printed URL on the host.
+bob-auth: _bob-force-sso # pragma: allowlist secret
+    mkdir -p {{bob_home}}
+    docker run --rm -it --hostname {{bob_hostname}} --env BOB_SHELL_FORCE_FILE_STORAGE=true --env SSO_PORT={{bob_sso_port}} --publish 127.0.0.1:{{bob_sso_port}}:{{bob_sso_port}} -v "$(cd {{bob_home}} && pwd)":/home/sandbox/.bob {{bob_image}} bob --accept-license --auth-method sso
+    just _bob-force-sso
+
+# Run a one-shot prompt in the sandbox
+bob-prompt prompt: _bob-force-sso
+    #!/usr/bin/env sh
+    export SANDBOX_PROMPT="$(cat <<'PROMPT_EOF'
+    {{prompt}}
+    PROMPT_EOF
+    )"
+    mkdir -p {{bob_home}}
+    docker run --rm -it --hostname {{bob_hostname}} --env BOB_SHELL_FORCE_FILE_STORAGE=true --env SANDBOX_PROMPT --env SSO_PORT={{bob_sso_port}} --publish 127.0.0.1:{{bob_sso_port}}:{{bob_sso_port}} -v "$(cd {{workspace}} && pwd)":/workspace -v "$(cd {{bob_home}} && pwd)":/home/sandbox/.bob {{bob_image}} sh -c '
+        bob -C /workspace --accept-license --auth-method sso --yolo -p "$SANDBOX_PROMPT"
+    '
+
+# Smoke-test that Bob is installed and working
+bob-test: _bob-force-sso
+    mkdir -p {{bob_home}}
+    docker run --rm --hostname {{bob_hostname}} --env BOB_SHELL_FORCE_FILE_STORAGE=true --env SSO_PORT={{bob_sso_port}} --publish 127.0.0.1:{{bob_sso_port}}:{{bob_sso_port}} -v "$(cd {{bob_home}} && pwd)":/home/sandbox/.bob {{bob_image}} bob --accept-license --auth-method sso -p "who are you"
+
+_bob-force-sso:
+    mkdir -p {{bob_home}}
+    node -e 'const fs = require("fs"); const path = "{{bob_home}}/settings.json"; const data = fs.existsSync(path) ? JSON.parse(fs.readFileSync(path, "utf8")) : {}; data.security ??= {}; data.security.auth ??= {}; data.security.auth.selectedType = "sso"; fs.writeFileSync(path, JSON.stringify(data, null, 2) + "\n");'
+
 # Render plugin-source/ into platform-integrations/. Edit plugin-source/, then run this.
 compile-plugins:
     uv run python plugin-source/build_plugins.py render
diff --git a/sandbox/Dockerfile b/sandbox/Dockerfile
index 278dda33..ccc126d9 100644
--- a/sandbox/Dockerfile
+++ b/sandbox/Dockerfile
@@ -108,3 +108,27 @@ ENV PYTHONUNBUFFERED=1
 
 ENTRYPOINT ["codex-container-entrypoint"]
 CMD ["bash"]
+
+
+# Bob (IBM bobshell) target
+FROM base AS bob
+
+COPY --from=node:22-bookworm-slim /usr/local/bin/node /usr/local/bin/node
+COPY --from=node:22-bookworm-slim /usr/local/lib/node_modules /usr/local/lib/node_modules
+
+RUN ln -sf /usr/local/lib/node_modules/npm/bin/npm-cli.js /usr/local/bin/npm \
+    && ln -sf /usr/local/lib/node_modules/npm/bin/npx-cli.js /usr/local/bin/npx
+
+# Install bobshell globally as root (the IBM installer runs `npm install -g`)
+RUN curl -fsSL https://bob.ibm.com/download/bobshell.sh | bash \
+    && command -v bob \
+    && bob --version
+
+WORKDIR /workspace
+
+USER sandbox
+
+ENV HOME="/home/sandbox"
+ENV PYTHONUNBUFFERED=1
+
+CMD ["bash"]
diff --git a/sandbox/README.md b/sandbox/README.md
index 877cebc9..2ee38b13 100644
--- a/sandbox/README.md
+++ b/sandbox/README.md
@@ -110,3 +110,51 @@ as environment variables, for example `CODEX_MODEL_PROVIDER`,
 `CODEX_MODEL_PROVIDER_BASE_URL`, `CODEX_MODEL_PROVIDER_ENV_KEY`, and
 `CODEX_MODEL_PROVIDER_WIRE_API`. The test forwards only environment variable
 values into Docker; it does not mount host credential or Codex config files.
+
+## Bob Sandbox Auth
+
+Build the Bob image:
+
+```bash
+just sandbox-build bob
+```
+
+Authenticate once with browser SSO:
+
+```bash
+just bob-auth
+```
+
+Open the URL printed by Bob on the host machine. The recipe fixes
+`SSO_PORT` and publishes it to `127.0.0.1`, so the browser callback can reach
+the Bob process inside Docker. Auth state is stored in the ignored
+`.bob-sandbox-home/` directory and reused by `just bob-run`, `just bob-test`,
+and `just bob-prompt "..."`. The recipes also set a stable Docker hostname
+because Bob's encrypted file storage derives its key from the hostname; without
+that, credentials written in one `docker run --rm` session cannot be decrypted
+in the next.
+
+API-key auth with older `sk-` / `pk-` Bob keys can route Bob Shell 1.0.4 to
+`prod.ibm-bob-staging.cloud.ibm.com`, which may fail with a Cloudflare 403
+from inside Docker. Prefer SSO auth for this sandbox unless you have a current
+Bob API key known to work with the production Bob auth backend.
+
+## Bob Automated E2E Test
+
+`tests/e2e/test_bob_sandbox_learn_recall.py` runs the same learn + recall flow
+against the Dockerized Bob sandbox. Build the image, authenticate once, then
+run pytest:
+
+```bash
+just sandbox-build bob
+just bob-auth  # one-time browser SSO
+uv run pytest tests/e2e/test_bob_sandbox_learn_recall.py \
+    --run-e2e -m e2e -v --log-cli-level=INFO
+```
+
+The test mounts `.bob-sandbox-home/` (created by `bob-auth`) as the
+container's `~/.bob`, pins the Docker hostname so Bob's encrypted file storage
+decrypts across runs, and publishes `SSO_PORT` to `127.0.0.1`. It skips with a
+clear message if the image isn't built or if the auth state directory is
+missing. Set `BOB_HOME` / `BOB_HOSTNAME` / `BOB_SSO_PORT` env vars to override
+defaults.
diff --git a/tests/e2e/test_bob_sandbox_learn_recall.py b/tests/e2e/test_bob_sandbox_learn_recall.py
new file mode 100644
index 00000000..c8ca56f0
--- /dev/null
+++ b/tests/e2e/test_bob_sandbox_learn_recall.py
@@ -0,0 +1,190 @@
+"""End-to-end test of the evolve-lite learn + recall flow in the Bob sandbox.
+
+Runs two sequential Bob CLI sessions against the Dockerized Bob sandbox:
+  1. Session 1 performs an EXIF task, then explicitly invokes the evolve-lite
+     save-trajectory and learn skills so a trajectory and guideline are saved.
+  2. Session 2 asks a related EXIF question. The recall skill should surface
+     the guideline from session 1 before substantive work begins.
+
+Bob 1.0.4 has no ``UserPromptSubmit`` hook, so the recall script never gets a
+session id from the runtime and cannot emit a ``recall`` audit event the way
+the Claude/Codex tests rely on. Without a recall audit event there is nothing
+for ``evolve-lite:provenance`` to assess, so this test does not run a third
+provenance session — it validates only the learn + recall evidence that Bob
+can actually produce: guideline files in ``.evolve/entities/`` and references
+to those guidelines in session 2's saved trajectory.
+
+Requires Docker, the ``evolve-bob-sandbox`` image built, and a persisted Bob
+SSO auth state on the host (created by ``just bob-auth``). The test mounts
+that auth state read-write into the container alongside a stable hostname so
+Bob's encrypted file storage decrypts across runs.
+"""
+
+import logging
+import os
+import shutil
+import subprocess
+import time
+from pathlib import Path
+
+import pytest
+
+
+log = logging.getLogger(__name__)
+
+
+SANDBOX_IMAGE = "evolve-bob-sandbox"
+REPO_ROOT = Path(__file__).resolve().parents[2]
+SESSION_TIMEOUT_SECONDS = 600
+BOB_HOSTNAME = os.environ.get("BOB_HOSTNAME", "evolve-bob-sandbox")
+BOB_SSO_PORT = os.environ.get("BOB_SSO_PORT", "47687")
+BOB_HOME_DEFAULT = REPO_ROOT / ".bob-sandbox-home"
+
+
+@pytest.fixture(scope="session")
+def bob_sandbox_ready():
+    """Skip if Docker, the Bob sandbox image, or persisted auth aren't available."""
+    if shutil.which("docker") is None:
+        pytest.skip("docker not installed")
+
+    if subprocess.run(["docker", "info"], capture_output=True).returncode != 0:
+        pytest.skip("docker daemon not running")
+
+    image_check = subprocess.run(
+        ["docker", "image", "inspect", SANDBOX_IMAGE],
+        capture_output=True,
+    )
+    if image_check.returncode != 0:
+        pytest.skip(f"sandbox image {SANDBOX_IMAGE!r} not built — run `just sandbox-build bob`")
+
+    # A relative BOB_HOME (the justfile's own convention) is anchored at the
+    # repo root so ``docker run -v`` always receives an absolute host path;
+    # Docker treats a non-absolute source as a named volume, not a bind mount.
+    bob_home = REPO_ROOT / os.environ.get("BOB_HOME", str(BOB_HOME_DEFAULT))
+    if not bob_home.is_dir() or not (bob_home / "settings.json").is_file():
+        pytest.skip(f"bob auth state missing at {bob_home} — run `just bob-auth` first")
+
+    return bob_home
+
+
+@pytest.fixture
+def bob_workspace(tmp_path):
+    """Copy demo/workspace and install the Bob plugin into it."""
+    src = REPO_ROOT / "demo" / "workspace"
+    workspace = tmp_path / "workspace"
+    shutil.copytree(src, workspace, ignore=shutil.ignore_patterns(".evolve", "backup", "sandbox-backup"))
+
+    install_script = REPO_ROOT / "platform-integrations" / "install.sh"
+    result = subprocess.run(
+        ["bash", str(install_script), "install", "--platform", "bob", "--dir", str(workspace)],
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+    assert result.returncode == 0, f"bob install failed\nstdout:\n{result.stdout}\nstderr:\n{result.stderr}"
+
+    return workspace
+
+
+def _run_bob_prompt(
+    workspace: Path,
+    bob_home: Path,
+    prompt: str,
+) -> subprocess.CompletedProcess:
+    """Run one ``bob -p`` session in the sandbox container and return the completed process."""
+    cmd = [
+        "docker",
+        "run",
+        "--rm",
+        "--hostname",
+        BOB_HOSTNAME,
+        "--env",
+        "BOB_SHELL_FORCE_FILE_STORAGE=true",
+        "--env",
+        f"SSO_PORT={BOB_SSO_PORT}",
+        "--env",
+        "EVOLVE_DEBUG=1",
+        "--env",
+        "TMPDIR=/workspace/.evolve/tmp",
+        "--publish",
+        f"127.0.0.1:{BOB_SSO_PORT}:{BOB_SSO_PORT}",
+        "-v",
+        f"{workspace}:/workspace",
+        "-v",
+        f"{bob_home}:/home/sandbox/.bob",
+        SANDBOX_IMAGE,
+        "bob",
+        "--accept-license",
+        "--auth-method",
+        "sso",
+        "--yolo",
+        "-p",
+        prompt,
+    ]
+    return subprocess.run(cmd, capture_output=True, text=True, timeout=SESSION_TIMEOUT_SECONDS)
+
+
+@pytest.mark.e2e
+def test_bob_learn_then_recall_flow(bob_sandbox_ready, bob_workspace):
+    """Session 1 learns, session 2 recalls."""
+    bob_home = bob_sandbox_ready
+    evolve_dir = bob_workspace / ".evolve"
+
+    log.info("bob session 1: running seed task with save-trajectory + learn...")
+    t0 = time.time()
+    result1 = _run_bob_prompt(
+        bob_workspace,
+        bob_home,
+        (
+            "Where was the photo @sample.jpg taken? Use EXIF metadata. "
+            "When done, invoke the evolve-lite save-trajectory skill, then invoke the evolve-lite learn skill. "
+            "Do not skip either evolve-lite skill."
+        ),
+    )
+    log.info(f"bob session 1: exited {result1.returncode} after {time.time() - t0:.0f}s")
+    assert result1.returncode == 0, (
+        f"session 1 exited {result1.returncode}\nstdout:\n{result1.stdout[-2000:]}\nstderr:\n{result1.stderr[-2000:]}"
+    )
+
+    trajectories_dir = evolve_dir / "trajectories"
+    entities_dir = evolve_dir / "entities"
+    assert trajectories_dir.is_dir(), f"{trajectories_dir} was not created"
+    trajectories = list(trajectories_dir.glob("*.json")) + list(trajectories_dir.glob("*.jsonl"))
+    assert trajectories, f"no Bob trajectory files found in {trajectories_dir}"
+    assert entities_dir.is_dir(), f"{entities_dir} was not created"
+    entity_files = list(entities_dir.rglob("*.md"))
+    assert entity_files, f"no guideline files found in {entities_dir}"
+
+    log.info("bob session 2: running related task to exercise recall...")
+    t1 = time.time()
+    result2 = _run_bob_prompt(
+        bob_workspace,
+        bob_home,
+        (
+            "STEP 1 (mandatory, do this first before anything else): invoke the evolve-lite recall skill "
+            "to retrieve relevant stored guidelines. Do not run any other tool until recall is complete. "
+            "STEP 2: answer this question using EXIF metadata, applying any guideline returned by recall: "
+            "What focal length was used to take the photo @sample.jpg? "
+            "STEP 3 (mandatory): invoke the evolve-lite save-trajectory skill. "
+            "Do not invoke the learn skill."
+        ),
+    )
+    log.info(f"bob session 2: exited {result2.returncode} after {time.time() - t1:.0f}s")
+    assert result2.returncode == 0, (
+        f"session 2 exited {result2.returncode}\nstdout:\n{result2.stdout[-2000:]}\nstderr:\n{result2.stderr[-2000:]}"
+    )
+
+    session2_trajectories = (set(trajectories_dir.glob("*.json")) | set(trajectories_dir.glob("*.jsonl"))) - set(trajectories)
+    assert session2_trajectories, f"no Bob trajectory saved for session 2 in {trajectories_dir}"
+
+    # Bob 1.0.4 has no UserPromptSubmit hook, so the recall script never
+    # gets a session id from the runtime and cannot emit a recall audit
+    # event. Without a recall audit there is nothing for evolve-lite:provenance
+    # to assess, so this test stops at the indirect evidence Bob can produce:
+    # at least one session-2 trajectory should reference a guideline file
+    # learned in session 1.
+    learned_stems = {path.stem for path in entity_files}
+    session2_texts = [path.read_text(encoding="utf-8") for path in session2_trajectories]
+    assert any(stem in text for stem in learned_stems for text in session2_texts), (
+        f"no session 2 trajectory referenced any guideline filename from {learned_stems}"
+    )