Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ __pycache__
evolve_data
demo/workdir/.claude/
.bob
.bob-sandbox-home/
.claude
dist
.coverage
Expand Down
54 changes: 48 additions & 6 deletions justfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,20 @@ commit message:

claude_image := "claude-sandbox"
codex_image := "evolve-codex-sandbox"
bob_image := "evolve-bob-sandbox"
env_file := "sandbox/myenv"
bob_home := env_var_or_default("BOB_HOME", ".bob-sandbox-home")
bob_hostname := env_var_or_default("BOB_HOSTNAME", "evolve-bob-sandbox")
bob_sso_port := env_var_or_default("BOB_SSO_PORT", "47687")
sandbox_dir := "sandbox"
workspace := "demo/workspace"

# Build sandbox Docker image(s). Use target=claude or target=codex to build only one.
# Build sandbox Docker image(s). Use target=claude, codex, or bob to build only one.
sandbox-build target="all":
#!/usr/bin/env sh
set -e
if [ "{{target}}" != "all" ] && [ "{{target}}" != "claude" ] && [ "{{target}}" != "codex" ]; then
echo "Error: target must be one of: all, claude, codex" >&2
if [ "{{target}}" != "all" ] && [ "{{target}}" != "claude" ] && [ "{{target}}" != "codex" ] && [ "{{target}}" != "bob" ]; then
echo "Error: target must be one of: all, claude, codex, bob" >&2
exit 1
fi
if [ "{{target}}" = "all" ] || [ "{{target}}" = "claude" ]; then
Expand All @@ -28,6 +32,9 @@ sandbox-build target="all":
if [ "{{target}}" = "all" ] || [ "{{target}}" = "codex" ]; then
docker build --target codex -t {{codex_image}} {{sandbox_dir}}
fi
if [ "{{target}}" = "all" ] || [ "{{target}}" = "bob" ]; then
docker build --target bob -t {{bob_image}} {{sandbox_dir}}
fi

# Copy sample.env to myenv if it doesn't already exist
sandbox-setup:
Expand All @@ -38,11 +45,11 @@ sandbox-setup:
echo "{{env_file}} already exists, skipping"; \
fi

# Remove sandbox Docker image(s). Use target=claude or target=codex to remove only one.
# Remove sandbox Docker image(s). Use target=claude, codex, or bob to remove only one.
sandbox-clean target="all":
#!/usr/bin/env sh
if [ "{{target}}" != "all" ] && [ "{{target}}" != "claude" ] && [ "{{target}}" != "codex" ]; then
echo "Error: target must be one of: all, claude, codex" >&2
if [ "{{target}}" != "all" ] && [ "{{target}}" != "claude" ] && [ "{{target}}" != "codex" ] && [ "{{target}}" != "bob" ]; then
echo "Error: target must be one of: all, claude, codex, bob" >&2
exit 1
fi
if [ "{{target}}" = "all" ] || [ "{{target}}" = "claude" ]; then
Expand All @@ -51,6 +58,9 @@ sandbox-clean target="all":
if [ "{{target}}" = "all" ] || [ "{{target}}" = "codex" ]; then
docker rmi {{codex_image}} || true
fi
if [ "{{target}}" = "all" ] || [ "{{target}}" = "bob" ]; then
docker rmi {{bob_image}} || true
fi

# Run an interactive Claude Code shell in the sandbox
claude-run:
Expand Down Expand Up @@ -79,6 +89,38 @@ codex-run:
codex-test:
docker run --rm --env-file {{env_file}} {{codex_image}} codex exec --skip-git-repo-check "who are you"

# The auth-state mount is resolved with `cd ... && pwd` (like the workspace mount) so that
# BOB_HOME may be either a path relative to the justfile or an absolute path.
# Run an interactive Bob shell in the sandbox
bob-run: _bob-force-sso
    mkdir -p "{{bob_home}}"
    docker run --rm -it --hostname {{bob_hostname}} \
        --env BOB_SHELL_FORCE_FILE_STORAGE=true \
        --env SSO_PORT={{bob_sso_port}} \
        --publish 127.0.0.1:{{bob_sso_port}}:{{bob_sso_port}} \
        -v "$(cd {{workspace}} && pwd)":/workspace \
        -v "$(cd "{{bob_home}}" && pwd)":/home/sandbox/.bob \
        {{bob_image}}

# The auth-state mount is resolved with `cd ... && pwd` so that BOB_HOME may be either a
# path relative to the justfile or an absolute path. `_bob-force-sso` runs both before the
# login (as a dependency) and again afterwards, so settings.json ends up selecting SSO
# (presumably in case the login flow rewrote it -- confirm).
# Authenticate Bob in the sandbox with browser SSO. Open the printed URL on the host.
bob-auth: _bob-force-sso # pragma: allowlist secret
    mkdir -p "{{bob_home}}"
    docker run --rm -it --hostname {{bob_hostname}} \
        --env BOB_SHELL_FORCE_FILE_STORAGE=true \
        --env SSO_PORT={{bob_sso_port}} \
        --publish 127.0.0.1:{{bob_sso_port}}:{{bob_sso_port}} \
        -v "$(cd "{{bob_home}}" && pwd)":/home/sandbox/.bob \
        {{bob_image}} bob --accept-license --auth-method sso
    just _bob-force-sso

# The prompt is handed to the container through the SANDBOX_PROMPT environment variable
# (filled from a quoted heredoc) rather than being spliced into the `sh -c` command text.
# The auth-state mount is resolved with `cd ... && pwd` so that BOB_HOME may be either a
# path relative to the justfile or an absolute path.
# Run a one-shot prompt in the sandbox
bob-prompt prompt: _bob-force-sso
    #!/usr/bin/env sh
    set -e
    export SANDBOX_PROMPT="$(cat <<'PROMPT_EOF'
    {{prompt}}
    PROMPT_EOF
    )"
    mkdir -p "{{bob_home}}"
    docker run --rm -it --hostname {{bob_hostname}} \
        --env BOB_SHELL_FORCE_FILE_STORAGE=true \
        --env SANDBOX_PROMPT \
        --env SSO_PORT={{bob_sso_port}} \
        --publish 127.0.0.1:{{bob_sso_port}}:{{bob_sso_port}} \
        -v "$(cd {{workspace}} && pwd)":/workspace \
        -v "$(cd "{{bob_home}}" && pwd)":/home/sandbox/.bob \
        {{bob_image}} sh -c '
    bob -C /workspace --accept-license --auth-method sso --yolo -p "$SANDBOX_PROMPT"
    '

# Runs without a TTY (no -it) and without the workspace mount; only the auth state is mounted.
# The auth-state mount is resolved with `cd ... && pwd` so that BOB_HOME may be either a
# path relative to the justfile or an absolute path.
# Smoke-test that Bob is installed and working
bob-test: _bob-force-sso
    mkdir -p "{{bob_home}}"
    docker run --rm --hostname {{bob_hostname}} \
        --env BOB_SHELL_FORCE_FILE_STORAGE=true \
        --env SSO_PORT={{bob_sso_port}} \
        --publish 127.0.0.1:{{bob_sso_port}}:{{bob_sso_port}} \
        -v "$(cd "{{bob_home}}" && pwd)":/home/sandbox/.bob \
        {{bob_image}} bob --accept-license --auth-method sso -p "who are you"

# Helper used by the bob-* recipes: make sure {{bob_home}}/settings.json exists and has
# security.auth.selectedType set to "sso". An existing settings.json is parsed and its
# other keys are written back unchanged; a missing one is created from an empty object.
# Invokes `node` directly, so Node.js must be available wherever this recipe runs.
_bob-force-sso:
    mkdir -p {{bob_home}}
    node -e 'const fs = require("fs"); const path = "{{bob_home}}/settings.json"; const data = fs.existsSync(path) ? JSON.parse(fs.readFileSync(path, "utf8")) : {}; data.security ??= {}; data.security.auth ??= {}; data.security.auth.selectedType = "sso"; fs.writeFileSync(path, JSON.stringify(data, null, 2) + "\n");'

# Render plugin-source/ into platform-integrations/. Edit plugin-source/, then run this.
compile-plugins:
uv run python plugin-source/build_plugins.py render
Expand Down
24 changes: 24 additions & 0 deletions sandbox/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,27 @@ ENV PYTHONUNBUFFERED=1

ENTRYPOINT ["codex-container-entrypoint"]
CMD ["bash"]


# Bob (IBM bobshell) target
# Build stage named `bob`, derived from the shared `base` stage; selected with
# `docker build --target bob`.
FROM base AS bob

# Copy the Node.js binary and its global node_modules tree out of the
# node:22-bookworm-slim image instead of installing Node through a package manager.
COPY --from=node:22-bookworm-slim /usr/local/bin/node /usr/local/bin/node
COPY --from=node:22-bookworm-slim /usr/local/lib/node_modules /usr/local/lib/node_modules

# Only the node binary and node_modules were copied, so recreate the npm and npx
# launchers as symlinks to their CLI entry scripts inside the copied npm package.
RUN ln -sf /usr/local/lib/node_modules/npm/bin/npm-cli.js /usr/local/bin/npm \
&& ln -sf /usr/local/lib/node_modules/npm/bin/npx-cli.js /usr/local/bin/npx

# Install bobshell globally as root (the IBM installer runs `npm install -g`)
# The trailing `command -v bob` and `bob --version` make the build fail if the
# installer did not leave a runnable `bob` on PATH.
RUN curl -fsSL https://bob.ibm.com/download/bobshell.sh | bash \
&& command -v bob \
&& bob --version

# Default working directory; the just recipes bind-mount the host workspace here.
WORKDIR /workspace

# Switch to the `sandbox` user for the running container
# (NOTE(review): the user is presumably created in the `base` stage -- confirm).
USER sandbox

# HOME is /home/sandbox; the just recipes mount the host auth state at /home/sandbox/.bob.
ENV HOME="/home/sandbox"
ENV PYTHONUNBUFFERED=1

# No ENTRYPOINT is set in this stage; with no command given the container starts bash.
CMD ["bash"]
48 changes: 48 additions & 0 deletions sandbox/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,51 @@ as environment variables, for example `CODEX_MODEL_PROVIDER`,
`CODEX_MODEL_PROVIDER_BASE_URL`, `CODEX_MODEL_PROVIDER_ENV_KEY`, and
`CODEX_MODEL_PROVIDER_WIRE_API`. The test forwards only environment variable
values into Docker; it does not mount host credential or Codex config files.

## Bob Sandbox Auth

Build the Bob image:

```bash
just sandbox-build bob
```

Authenticate once with browser SSO:

```bash
just bob-auth
```

Open the URL printed by Bob on the host machine. The recipe fixes
`SSO_PORT` and publishes it to `127.0.0.1`, so the browser callback can reach
the Bob process inside Docker. Auth state is stored in the ignored
`.bob-sandbox-home/` directory and reused by `just bob-run`, `just bob-test`,
and `just bob-prompt "..."`. The recipes also set a stable Docker hostname
because Bob's encrypted file storage derives its key from the hostname; without
that, credentials written in one `docker run --rm` session cannot be decrypted
in the next.

API-key auth with older `sk-` / `pk-` Bob keys can route Bob Shell 1.0.4 to
`prod.ibm-bob-staging.cloud.ibm.com`, which may fail with a Cloudflare 403
from inside Docker. Prefer SSO auth for this sandbox unless you have a current
Bob API key known to work with the production Bob auth backend.

## Bob Automated E2E Test

`tests/e2e/test_bob_sandbox_learn_recall.py` runs the same learn + recall flow
against the Dockerized Bob sandbox. Build the image, authenticate once, then
run pytest:

```bash
just sandbox-build bob
just bob-auth # one-time browser SSO
uv run pytest tests/e2e/test_bob_sandbox_learn_recall.py \
--run-e2e -m e2e -v --log-cli-level=INFO
```

The test mounts `.bob-sandbox-home/` (created by `bob-auth`) as the
container's `~/.bob`, pins the Docker hostname so Bob's encrypted file storage
decrypts across runs, and publishes `SSO_PORT` to `127.0.0.1`. It skips with a
clear message if Docker is not installed or not running, if the image isn't
built, or if the auth state directory (or the `settings.json` inside it) is
missing. Set the `BOB_HOME`, `BOB_HOSTNAME`, or `BOB_SSO_PORT` environment
variables to override the defaults.
186 changes: 186 additions & 0 deletions tests/e2e/test_bob_sandbox_learn_recall.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
"""End-to-end test of the evolve-lite learn + recall flow in the Bob sandbox.

Runs two sequential Bob CLI sessions against the Dockerized Bob sandbox:
1. Session 1 performs an EXIF task, then explicitly invokes the evolve-lite
save-trajectory and learn skills so a trajectory and guideline are saved.
2. Session 2 asks a related EXIF question. The recall skill should surface
the guideline from session 1 before substantive work begins.

Bob 1.0.4 has no ``UserPromptSubmit`` hook, so the recall script never gets a
session id from the runtime and cannot emit a ``recall`` audit event the way
the Claude/Codex tests rely on. Without a recall audit event there is nothing
for ``evolve-lite:provenance`` to assess, so this test does not run a third
provenance session — it validates only the learn + recall evidence that Bob
can actually produce: guideline files in ``.evolve/entities/`` and references
to those guidelines in session 2's saved trajectory.

Requires Docker, the ``evolve-bob-sandbox`` image built, and a persisted Bob
SSO auth state on the host (created by ``just bob-auth``). The test mounts
that auth state read-write into the container alongside a stable hostname so
Bob's encrypted file storage decrypts across runs.
"""

import logging
import os
import shutil
import subprocess
import time
from pathlib import Path

import pytest


# Module logger; the test reports session start/exit and timing through it.
log = logging.getLogger(__name__)


# Docker image tag the sessions run in; the readiness fixture skips if it is not built.
SANDBOX_IMAGE = "evolve-bob-sandbox"
# Third ancestor of this file's resolved path (parents[2]), used as the repository root.
REPO_ROOT = Path(__file__).resolve().parents[2]
# Timeout, in seconds, applied to each `docker run` Bob session.
SESSION_TIMEOUT_SECONDS = 600
# Container hostname passed to `docker run --hostname`; override with BOB_HOSTNAME.
BOB_HOSTNAME = os.environ.get("BOB_HOSTNAME", "evolve-bob-sandbox")
# Port exported as SSO_PORT and published on 127.0.0.1; override with BOB_SSO_PORT.
BOB_SSO_PORT = os.environ.get("BOB_SSO_PORT", "47687")
# Default host directory holding Bob's auth state when BOB_HOME is not set.
BOB_HOME_DEFAULT = REPO_ROOT / ".bob-sandbox-home"


@pytest.fixture(scope="session")
def bob_sandbox_ready():
    """Skip unless the Bob sandbox can run; return the host auth-state directory.

    The checks run in order and the first failing one skips the test:
    the ``docker`` executable is on PATH, ``docker info`` succeeds, the
    sandbox image exists locally, and the auth-state directory contains a
    ``settings.json`` file.

    Returns:
        Path: absolute path of the auth-state directory (``BOB_HOME`` if set,
        otherwise ``BOB_HOME_DEFAULT``), suitable for use as the host side of
        a ``docker run -v`` bind mount.
    """
    if shutil.which("docker") is None:
        pytest.skip("docker not installed")

    if subprocess.run(["docker", "info"], capture_output=True).returncode != 0:
        pytest.skip("docker daemon not running")

    image_check = subprocess.run(
        ["docker", "image", "inspect", SANDBOX_IMAGE],
        capture_output=True,
    )
    if image_check.returncode != 0:
        pytest.skip(f"sandbox image {SANDBOX_IMAGE!r} not built — run `just sandbox-build bob`")

    # Resolve to an absolute path: the value ends up as the source of a
    # `docker run -v` mount, where a non-absolute source is not treated as a
    # host directory. A relative BOB_HOME is resolved against the current
    # working directory, the same directory the checks below look in.
    bob_home = Path(os.environ.get("BOB_HOME", str(BOB_HOME_DEFAULT))).resolve()
    if not bob_home.is_dir() or not (bob_home / "settings.json").is_file():
        pytest.skip(f"bob auth state missing at {bob_home} — run `just bob-auth` first")

    return bob_home


@pytest.fixture
def bob_workspace(tmp_path):
    """Provide a throwaway copy of demo/workspace with the Bob plugin installed.

    The copy lives under pytest's ``tmp_path`` and leaves out any ``.evolve``,
    ``backup`` and ``sandbox-backup`` entries from the source tree. The plugin
    is installed by running ``platform-integrations/install.sh``; a non-zero
    exit fails the test with the installer's captured output.

    Returns:
        Path: the prepared workspace directory.
    """
    target = tmp_path / "workspace"
    skip_entries = shutil.ignore_patterns(".evolve", "backup", "sandbox-backup")
    shutil.copytree(REPO_ROOT / "demo" / "workspace", target, ignore=skip_entries)

    installer = REPO_ROOT / "platform-integrations" / "install.sh"
    install_cmd = ["bash", str(installer), "install", "--platform", "bob", "--dir", str(target)]
    completed = subprocess.run(install_cmd, capture_output=True, text=True, check=False)
    assert completed.returncode == 0, f"bob install failed\nstdout:\n{completed.stdout}\nstderr:\n{completed.stderr}"

    return target


def _run_bob_prompt(
    workspace: Path,
    bob_home: Path,
    prompt: str,
) -> subprocess.CompletedProcess:
    """Run one non-interactive Bob session in the sandbox container.

    Args:
        workspace: host directory bind-mounted at ``/workspace``.
        bob_home: host directory bind-mounted at ``/home/sandbox/.bob``.
        prompt: text passed to ``bob -p``.

    Returns:
        The completed ``docker run`` process with stdout/stderr captured as text.
        The exit code is not checked here; callers assert on it.

    Raises:
        subprocess.TimeoutExpired: if the session exceeds
            ``SESSION_TIMEOUT_SECONDS``. The container is force-removed first.
    """
    # A unique, known name lets us remove the container if the session times out.
    container_name = f"evolve-bob-e2e-{os.getpid()}-{time.time_ns()}"
    cmd = [
        "docker",
        "run",
        "--rm",
        "--name",
        container_name,
        "--hostname",
        BOB_HOSTNAME,
        "--env",
        "BOB_SHELL_FORCE_FILE_STORAGE=true",
        "--env",
        f"SSO_PORT={BOB_SSO_PORT}",
        "--env",
        "EVOLVE_DEBUG=1",
        "--env",
        "TMPDIR=/workspace/.evolve/tmp",
        "--publish",
        f"127.0.0.1:{BOB_SSO_PORT}:{BOB_SSO_PORT}",
        "-v",
        f"{workspace}:/workspace",
        "-v",
        f"{bob_home}:/home/sandbox/.bob",
        SANDBOX_IMAGE,
        "bob",
        "--accept-license",
        "--auth-method",
        "sso",
        "--yolo",
        "-p",
        prompt,
    ]
    try:
        return subprocess.run(cmd, capture_output=True, text=True, timeout=SESSION_TIMEOUT_SECONDS)
    except subprocess.TimeoutExpired:
        # On timeout subprocess.run kills only the local docker CLI process.
        # The container would otherwise keep running and keep the published
        # SSO port bound, breaking any later session. Best-effort cleanup:
        # the result of the removal is deliberately ignored.
        subprocess.run(["docker", "rm", "-f", container_name], capture_output=True, check=False)
        raise


@pytest.mark.e2e
def test_bob_learn_then_recall_flow(bob_sandbox_ready, bob_workspace):
    """Learn a guideline in one Bob session, then check a second session recalls it.

    Session 1 must exit 0 and leave at least one trajectory file
    (``*.json`` / ``*.jsonl``) under ``.evolve/trajectories`` and at least one
    ``*.md`` guideline under ``.evolve/entities``. Session 2 must exit 0, add
    at least one new trajectory file, and at least one of those new files must
    contain the filename stem of a guideline learned in session 1.
    """
    auth_dir = bob_sandbox_ready
    evolve_root = bob_workspace / ".evolve"
    trajectories_dir = evolve_root / "trajectories"
    entities_dir = evolve_root / "entities"

    def saved_trajectories():
        # Every trajectory file currently on disk, in either supported format.
        found = set()
        for pattern in ("*.json", "*.jsonl"):
            found.update(trajectories_dir.glob(pattern))
        return found

    seed_prompt = (
        "Where was the photo @sample.jpg taken? Use EXIF metadata. "
        "When done, invoke the evolve-lite save-trajectory skill, then invoke the evolve-lite learn skill. "
        "Do not skip either evolve-lite skill."
    )
    recall_prompt = (
        "STEP 1 (mandatory, do this first before anything else): invoke the evolve-lite recall skill "
        "to retrieve relevant stored guidelines. Do not run any other tool until recall is complete. "
        "STEP 2: answer this question using EXIF metadata, applying any guideline returned by recall: "
        "What focal length was used to take the photo @sample.jpg? "
        "STEP 3 (mandatory): invoke the evolve-lite save-trajectory skill. "
        "Do not invoke the learn skill."
    )

    # --- Session 1: seed task, then save-trajectory + learn -----------------
    log.info("bob session 1: running seed task with save-trajectory + learn...")
    started = time.time()
    first = _run_bob_prompt(bob_workspace, auth_dir, seed_prompt)
    log.info(f"bob session 1: exited {first.returncode} after {time.time() - started:.0f}s")
    assert first.returncode == 0, (
        f"session 1 exited {first.returncode}\nstdout:\n{first.stdout[-2000:]}\nstderr:\n{first.stderr[-2000:]}"
    )

    assert trajectories_dir.is_dir(), f"{trajectories_dir} was not created"
    session1_trajectories = saved_trajectories()
    assert session1_trajectories, f"no Bob trajectory files found in {trajectories_dir}"
    assert entities_dir.is_dir(), f"{entities_dir} was not created"
    guideline_files = list(entities_dir.rglob("*.md"))
    assert guideline_files, f"no guideline files found in {entities_dir}"

    # --- Session 2: related task that should trigger recall ------------------
    log.info("bob session 2: running related task to exercise recall...")
    started = time.time()
    second = _run_bob_prompt(bob_workspace, auth_dir, recall_prompt)
    log.info(f"bob session 2: exited {second.returncode} after {time.time() - started:.0f}s")
    assert second.returncode == 0, (
        f"session 2 exited {second.returncode}\nstdout:\n{second.stdout[-2000:]}\nstderr:\n{second.stderr[-2000:]}"
    )

    # Only files that appeared after session 1 count as session 2 output.
    session2_trajectories = saved_trajectories() - session1_trajectories
    assert session2_trajectories, f"no Bob trajectory saved for session 2 in {trajectories_dir}"

    # Bob 1.0.4 has no UserPromptSubmit hook, so the recall script never gets
    # a session id from the runtime and cannot emit a recall audit event.
    # With no recall audit there is nothing for evolve-lite:provenance to
    # assess, so the test settles for indirect evidence: some session-2
    # trajectory has to mention a guideline file learned in session 1.
    learned_stems = {path.stem for path in guideline_files}
    session2_texts = [path.read_text(encoding="utf-8") for path in session2_trajectories]
    recalled = False
    for stem in learned_stems:
        if any(stem in body for body in session2_texts):
            recalled = True
            break
    assert recalled, (
        f"no session 2 trajectory referenced any guideline filename from {learned_stems}"
    )
Loading