AgentToolkit · visahak · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026
diff --git a/.gitignore b/.gitignore
@@ -9,6 +9,7 @@ __pycache__
 evolve_data
 demo/workdir/.claude/
 .bob
+.bob-sandbox-home/
 .claude
 dist
 .coverage

diff --git a/justfile b/justfile
@@ -10,16 +10,20 @@ commit message:
 
 claude_image := "claude-sandbox"
 codex_image := "evolve-codex-sandbox"
+bob_image := "evolve-bob-sandbox"
 env_file := "sandbox/myenv"
+bob_home := env_var_or_default("BOB_HOME", ".bob-sandbox-home")
+bob_hostname := env_var_or_default("BOB_HOSTNAME", "evolve-bob-sandbox")
+bob_sso_port := env_var_or_default("BOB_SSO_PORT", "47687")
 sandbox_dir := "sandbox"
 workspace := "demo/workspace"
 
-# Build sandbox Docker image(s). Use target=claude or target=codex to build only one.
+# Build sandbox Docker image(s). Use target=claude, codex, or bob to build only one.
 sandbox-build target="all":
     #!/usr/bin/env sh
     set -e
-    if [ "{{target}}" != "all" ] && [ "{{target}}" != "claude" ] && [ "{{target}}" != "codex" ]; then
-        echo "Error: target must be one of: all, claude, codex" >&2
+    if [ "{{target}}" != "all" ] && [ "{{target}}" != "claude" ] && [ "{{target}}" != "codex" ] && [ "{{target}}" != "bob" ]; then
+        echo "Error: target must be one of: all, claude, codex, bob" >&2
         exit 1
     fi
     if [ "{{target}}" = "all" ] || [ "{{target}}" = "claude" ]; then
@@ -28,6 +32,9 @@ sandbox-build target="all":
     if [ "{{target}}" = "all" ] || [ "{{target}}" = "codex" ]; then
         docker build --target codex -t {{codex_image}} {{sandbox_dir}}
     fi
+    if [ "{{target}}" = "all" ] || [ "{{target}}" = "bob" ]; then
+        docker build --target bob -t {{bob_image}} {{sandbox_dir}}
+    fi
 
 # Copy sample.env to myenv if it doesn't already exist
 sandbox-setup:
@@ -38,11 +45,11 @@ sandbox-setup:
         echo "{{env_file}} already exists, skipping"; \
     fi
 
-# Remove sandbox Docker image(s). Use target=claude or target=codex to remove only one.
+# Remove sandbox Docker image(s). Use target=claude, codex, or bob to remove only one.
 sandbox-clean target="all":
     #!/usr/bin/env sh
-    if [ "{{target}}" != "all" ] && [ "{{target}}" != "claude" ] && [ "{{target}}" != "codex" ]; then
-        echo "Error: target must be one of: all, claude, codex" >&2
+    if [ "{{target}}" != "all" ] && [ "{{target}}" != "claude" ] && [ "{{target}}" != "codex" ] && [ "{{target}}" != "bob" ]; then
+        echo "Error: target must be one of: all, claude, codex, bob" >&2
         exit 1
     fi
     if [ "{{target}}" = "all" ] || [ "{{target}}" = "claude" ]; then
@@ -51,6 +58,9 @@ sandbox-clean target="all":
     if [ "{{target}}" = "all" ] || [ "{{target}}" = "codex" ]; then
         docker rmi {{codex_image}} || true
     fi
+    if [ "{{target}}" = "all" ] || [ "{{target}}" = "bob" ]; then
+        docker rmi {{bob_image}} || true
+    fi
 
 # Run an interactive Claude Code shell in the sandbox
 claude-run:
@@ -79,6 +89,38 @@ codex-run:
 codex-test:
     docker run --rm --env-file {{env_file}} {{codex_image}} codex exec --skip-git-repo-check "who are you"
 
+# Run an interactive Bob shell in the sandbox (BOB_HOME may be a relative or absolute path)
+bob-run: _bob-force-sso
+    mkdir -p {{bob_home}}
+    docker run --rm -it --hostname {{bob_hostname}} --env BOB_SHELL_FORCE_FILE_STORAGE=true --env SSO_PORT={{bob_sso_port}} --publish 127.0.0.1:{{bob_sso_port}}:{{bob_sso_port}} -v "$(cd {{workspace}} && pwd)":/workspace -v "$(cd {{bob_home}} && pwd)":/home/sandbox/.bob {{bob_image}}
+
+# Authenticate Bob in the sandbox with browser SSO. Open the printed URL in a browser on the host.
+bob-auth: _bob-force-sso  # pragma: allowlist secret
+    mkdir -p {{bob_home}}
+    docker run --rm -it --hostname {{bob_hostname}} --env BOB_SHELL_FORCE_FILE_STORAGE=true --env SSO_PORT={{bob_sso_port}} --publish 127.0.0.1:{{bob_sso_port}}:{{bob_sso_port}} -v "$(cd {{bob_home}} && pwd)":/home/sandbox/.bob {{bob_image}} bob --accept-license --auth-method sso
+    just _bob-force-sso
+
+# Run a one-shot prompt in the sandbox (the prompt reaches Bob through the SANDBOX_PROMPT env var)
+bob-prompt prompt: _bob-force-sso
+    #!/usr/bin/env sh
+    export SANDBOX_PROMPT="$(cat <<'PROMPT_EOF'
+    {{prompt}}
+    PROMPT_EOF
+    )"
+    mkdir -p {{bob_home}}
+    docker run --rm -it --hostname {{bob_hostname}} --env BOB_SHELL_FORCE_FILE_STORAGE=true --env SANDBOX_PROMPT --env SSO_PORT={{bob_sso_port}} --publish 127.0.0.1:{{bob_sso_port}}:{{bob_sso_port}} -v "$(cd {{workspace}} && pwd)":/workspace -v "$(cd {{bob_home}} && pwd)":/home/sandbox/.bob {{bob_image}} sh -c '
+        bob -C /workspace --accept-license --auth-method sso --yolo -p "$SANDBOX_PROMPT"
+    '
+
+# Smoke-test that Bob is installed and working (BOB_HOME may be a relative or absolute path)
+bob-test: _bob-force-sso
+    mkdir -p {{bob_home}}
+    docker run --rm --hostname {{bob_hostname}} --env BOB_SHELL_FORCE_FILE_STORAGE=true --env SSO_PORT={{bob_sso_port}} --publish 127.0.0.1:{{bob_sso_port}}:{{bob_sso_port}} -v "$(cd {{bob_home}} && pwd)":/home/sandbox/.bob {{bob_image}} bob --accept-license --auth-method sso -p "who are you"
+
+_bob-force-sso:  # Set security.auth.selectedType to "sso" in BOB_HOME's settings.json, keeping other keys (runs node on the host)
+    mkdir -p {{bob_home}}
+    node -e 'const fs = require("fs"); const path = "{{bob_home}}/settings.json"; const data = fs.existsSync(path) ? JSON.parse(fs.readFileSync(path, "utf8")) : {}; data.security ??= {}; data.security.auth ??= {}; data.security.auth.selectedType = "sso"; fs.writeFileSync(path, JSON.stringify(data, null, 2) + "\n");'
+
 # Render plugin-source/ into platform-integrations/. Edit plugin-source/, then run this.
 compile-plugins:
     uv run python plugin-source/build_plugins.py render

diff --git a/sandbox/Dockerfile b/sandbox/Dockerfile
@@ -108,3 +108,27 @@ ENV PYTHONUNBUFFERED=1
 
 ENTRYPOINT ["codex-container-entrypoint"]
 CMD ["bash"]
+
+
+# Bob (IBM bobshell) target: extends the base stage with Node.js/npm copied from node:22-bookworm-slim
+FROM base AS bob
+
+COPY --from=node:22-bookworm-slim /usr/local/bin/node /usr/local/bin/node
+COPY --from=node:22-bookworm-slim /usr/local/lib/node_modules /usr/local/lib/node_modules
+
+RUN ln -sf /usr/local/lib/node_modules/npm/bin/npm-cli.js /usr/local/bin/npm \
+    && ln -sf /usr/local/lib/node_modules/npm/bin/npx-cli.js /usr/local/bin/npx
+
+# Install bobshell as root (the IBM installer runs `npm install -g`); the build fails unless `bob` is on PATH and runs
+RUN curl -fsSL https://bob.ibm.com/download/bobshell.sh | bash \
+    && command -v bob \
+    && bob --version
+
+WORKDIR /workspace
+
+USER sandbox
+
+ENV HOME="/home/sandbox"
+ENV PYTHONUNBUFFERED=1
+
+CMD ["bash"]
diff --git a/sandbox/README.md b/sandbox/README.md
@@ -110,3 +110,51 @@ as environment variables, for example `CODEX_MODEL_PROVIDER`,
 `CODEX_MODEL_PROVIDER_BASE_URL`, `CODEX_MODEL_PROVIDER_ENV_KEY`, and
 `CODEX_MODEL_PROVIDER_WIRE_API`. The test forwards only environment variable
 values into Docker; it does not mount host credential or Codex config files.
+
+## Bob Sandbox Auth
+
+Build the Bob image:
+
+```bash
+just sandbox-build bob
+```
+
+Authenticate once with browser SSO:
+
+```bash
+just bob-auth
+```
+
+Open the URL that Bob prints in a browser on the host machine. The recipe pins
+`SSO_PORT` and publishes it to `127.0.0.1`, so the browser callback can reach
+the Bob process inside Docker. Auth state is stored in the ignored
+`.bob-sandbox-home/` directory and reused by `just bob-run`, `just bob-test`,
+and `just bob-prompt "..."`. The recipes also set a stable Docker hostname
+because Bob's encrypted file storage derives its key from the hostname; without
+that, credentials written in one `docker run --rm` session cannot be decrypted
+in the next.
+
+API-key auth with older `sk-` / `pk-` Bob keys can route Bob Shell 1.0.4 to
+`prod.ibm-bob-staging.cloud.ibm.com`, which may fail with a Cloudflare 403
+from inside Docker. Prefer SSO auth for this sandbox unless you have a current
+Bob API key known to work with the production Bob auth backend.
+
+## Bob Automated E2E Test
+
+`tests/e2e/test_bob_sandbox_learn_recall.py` runs the same learn + recall flow
+against the Dockerized Bob sandbox. Build the image, authenticate once, then
+run pytest:
+
+```bash
+just sandbox-build bob
+just bob-auth          # one-time browser SSO
+uv run pytest tests/e2e/test_bob_sandbox_learn_recall.py \
+  --run-e2e -m e2e -v --log-cli-level=INFO
+```
+
+The test mounts `.bob-sandbox-home/` (created by `bob-auth`) as the
+container's `~/.bob`, pins the Docker hostname so Bob's encrypted file storage
+decrypts across runs, and publishes `SSO_PORT` to `127.0.0.1`. It skips with a
+clear message if Docker is unavailable, the image isn't built, or the auth
+state (`settings.json` in the auth directory) is missing. Set `BOB_HOME` /
+`BOB_HOSTNAME` / `BOB_SSO_PORT` env vars to override defaults.
diff --git a/tests/e2e/test_bob_sandbox_learn_recall.py b/tests/e2e/test_bob_sandbox_learn_recall.py
@@ -0,0 +1,186 @@
+"""End-to-end test of the evolve-lite learn + recall flow in the Bob sandbox.
+
+Runs two sequential Bob CLI sessions against the Dockerized Bob sandbox:
+  1. Session 1 performs an EXIF task, then explicitly invokes the evolve-lite
+     save-trajectory and learn skills so a trajectory and guideline are saved.
+  2. Session 2 asks a related EXIF question. The recall skill should surface
+     the guideline from session 1 before substantive work begins.
+
+Bob 1.0.4 has no ``UserPromptSubmit`` hook, so the recall script never gets a
+session id from the runtime and cannot emit a ``recall`` audit event the way
+the Claude/Codex tests rely on. Without a recall audit event there is nothing
+for ``evolve-lite:provenance`` to assess, so this test does not run a third
+provenance session — it validates only the learn + recall evidence that Bob
+can actually produce: guideline files in ``.evolve/entities/`` and references
+to those guidelines in session 2's saved trajectory.
+
+Requires Docker, the ``evolve-bob-sandbox`` image built, and a persisted Bob
+SSO auth state on the host (created by ``just bob-auth``). The test mounts
+that auth state read-write into the container alongside a stable hostname so
+Bob's encrypted file storage decrypts across runs.
+"""
+
+import logging
+import os
+import shutil
+import subprocess
+import time
+from pathlib import Path
+
+import pytest
+
+
+log = logging.getLogger(__name__)
+
+
+SANDBOX_IMAGE = "evolve-bob-sandbox"
+REPO_ROOT = Path(__file__).resolve().parents[2]
+SESSION_TIMEOUT_SECONDS = 600
+BOB_HOSTNAME = os.environ.get("BOB_HOSTNAME", "evolve-bob-sandbox")
+BOB_SSO_PORT = os.environ.get("BOB_SSO_PORT", "47687")
+BOB_HOME_DEFAULT = REPO_ROOT / ".bob-sandbox-home"
+
+
+@pytest.fixture(scope="session")
+def bob_sandbox_ready():
+    """Skip unless Docker, the Bob image and persisted auth are available; return the absolute auth directory."""
+    if shutil.which("docker") is None:
+        pytest.skip("docker not installed")
+
+    if subprocess.run(["docker", "info"], capture_output=True).returncode != 0:
+        pytest.skip("docker daemon not running")
+
+    image_check = subprocess.run(
+        ["docker", "image", "inspect", SANDBOX_IMAGE],
+        capture_output=True,
+    )
+    if image_check.returncode != 0:
+        pytest.skip(f"sandbox image {SANDBOX_IMAGE!r} not built — run `just sandbox-build bob`")
+
+    bob_home = REPO_ROOT / os.environ.get("BOB_HOME", str(BOB_HOME_DEFAULT))  # anchor a relative BOB_HOME at the repo root
+    if not bob_home.is_dir() or not (bob_home / "settings.json").is_file():
+        pytest.skip(f"bob auth state missing at {bob_home} — run `just bob-auth` first")
+
+    return bob_home
+
+
+@pytest.fixture
+def bob_workspace(tmp_path):
+    """Provide a scratch copy of demo/workspace with the Bob integration installed.
+
+    Existing ``.evolve``, ``backup`` and ``sandbox-backup`` entries are not copied.
+    """
+    workspace = tmp_path / "workspace"
+    skip_dirs = shutil.ignore_patterns(".evolve", "backup", "sandbox-backup")
+    shutil.copytree(REPO_ROOT / "demo" / "workspace", workspace, ignore=skip_dirs)
+
+    installer = REPO_ROOT / "platform-integrations" / "install.sh"
+    install_cmd = ["bash", str(installer), "install", "--platform", "bob", "--dir", str(workspace)]
+    # check=False so the assertion below can report the installer's stdout and stderr.
+    proc = subprocess.run(install_cmd, capture_output=True, text=True, check=False)
+    assert proc.returncode == 0, f"bob install failed\nstdout:\n{proc.stdout}\nstderr:\n{proc.stderr}"
+
+    return workspace
+
+
+def _run_bob_prompt(
+    workspace: Path,
+    bob_home: Path,
+    prompt: str,
+) -> subprocess.CompletedProcess:
+    """Run one non-interactive ``bob -p`` session in the sandbox container.
+
+    Args:
+        workspace: Host directory bind-mounted at ``/workspace``.
+        bob_home: Host directory bind-mounted at ``/home/sandbox/.bob``.
+        prompt: Prompt text passed to ``bob -p``.
+
+    Returns:
+        The finished ``docker run`` process with stdout/stderr captured as text.
+
+    Raises:
+        subprocess.TimeoutExpired: If the run exceeds ``SESSION_TIMEOUT_SECONDS``.
+    """
+    # Name the container so a timed-out session can be stopped: on timeout
+    # subprocess.run kills only the docker CLI, which can leave the container
+    # running and still holding the published SSO port.
+    name = f"evolve-bob-e2e-{os.getpid()}-{time.time_ns()}"
+    env = ["BOB_SHELL_FORCE_FILE_STORAGE=true", f"SSO_PORT={BOB_SSO_PORT}", "EVOLVE_DEBUG=1", "TMPDIR=/workspace/.evolve/tmp"]
+    cmd = ["docker", "run", "--rm", "--name", name, "--hostname", BOB_HOSTNAME]
+    cmd += [arg for item in env for arg in ("--env", item)]
+    cmd += ["--publish", f"127.0.0.1:{BOB_SSO_PORT}:{BOB_SSO_PORT}"]
+    cmd += ["-v", f"{workspace}:/workspace", "-v", f"{bob_home}:/home/sandbox/.bob"]
+    cmd += [SANDBOX_IMAGE, "bob", "--accept-license", "--auth-method", "sso", "--yolo", "-p", prompt]
+
+    try:
+        return subprocess.run(cmd, capture_output=True, text=True, timeout=SESSION_TIMEOUT_SECONDS)
+    except subprocess.TimeoutExpired:
+        # Best-effort cleanup; --rm removes the container once it has been killed.
+        subprocess.run(["docker", "kill", name], capture_output=True, check=False)
+        raise
+
+
+@pytest.mark.e2e
+def test_bob_learn_then_recall_flow(bob_sandbox_ready, bob_workspace):
+    """Run two Bob sessions; session 2's new trajectory must mention a guideline file learned in session 1."""
+    bob_home = bob_sandbox_ready
+    evolve_dir = bob_workspace / ".evolve"
+
+    log.info("bob session 1: running seed task with save-trajectory + learn...")
+    t0 = time.time()
+    result1 = _run_bob_prompt(
+        bob_workspace,
+        bob_home,
+        (
+            "Where was the photo @sample.jpg taken? Use EXIF metadata. "
+            "When done, invoke the evolve-lite save-trajectory skill, then invoke the evolve-lite learn skill. "
+            "Do not skip either evolve-lite skill."
+        ),
+    )
+    log.info(f"bob session 1: exited {result1.returncode} after {time.time() - t0:.0f}s")
+    assert result1.returncode == 0, (
+        f"session 1 exited {result1.returncode}\nstdout:\n{result1.stdout[-2000:]}\nstderr:\n{result1.stderr[-2000:]}"
+    )
+
+    trajectories_dir = evolve_dir / "trajectories"
+    entities_dir = evolve_dir / "entities"
+    assert trajectories_dir.is_dir(), f"{trajectories_dir} was not created"
+    trajectories = list(trajectories_dir.glob("*.json")) + list(trajectories_dir.glob("*.jsonl"))
+    assert trajectories, f"no Bob trajectory files found in {trajectories_dir}"
+    assert entities_dir.is_dir(), f"{entities_dir} was not created"
+    entity_files = list(entities_dir.rglob("*.md"))
+    assert entity_files, f"no guideline files found in {entities_dir}"
+
+    log.info("bob session 2: running related task to exercise recall...")
+    t1 = time.time()
+    result2 = _run_bob_prompt(
+        bob_workspace,
+        bob_home,
+        (
+            "STEP 1 (mandatory, do this first before anything else): invoke the evolve-lite recall skill "
+            "to retrieve relevant stored guidelines. Do not run any other tool until recall is complete. "
+            "STEP 2: answer this question using EXIF metadata, applying any guideline returned by recall: "
+            "What focal length was used to take the photo @sample.jpg? "
+            "STEP 3 (mandatory): invoke the evolve-lite save-trajectory skill. "
+            "Do not invoke the learn skill."
+        ),
+    )
+    log.info(f"bob session 2: exited {result2.returncode} after {time.time() - t1:.0f}s")
+    assert result2.returncode == 0, (
+        f"session 2 exited {result2.returncode}\nstdout:\n{result2.stdout[-2000:]}\nstderr:\n{result2.stderr[-2000:]}"
+    )
+
+    session2_trajectories = (set(trajectories_dir.glob("*.json")) | set(trajectories_dir.glob("*.jsonl"))) - set(trajectories)
+    assert session2_trajectories, f"no Bob trajectory saved for session 2 in {trajectories_dir}"
+
+    # Bob 1.0.4 has no UserPromptSubmit hook, so the recall script never
+    # gets a session id from the runtime and cannot emit a recall audit
+    # event. Without a recall audit there is nothing for evolve-lite:provenance
+    # to assess, so this test stops at the indirect evidence Bob can produce:
+    # at least one session-2 trajectory should reference a guideline file
+    # learned in session 1.
+    learned_stems = {path.stem for path in entity_files}
+    session2_texts = [path.read_text(encoding="utf-8") for path in session2_trajectories]
+    assert any(stem in text for stem in learned_stems for text in session2_texts), (
+        f"no session 2 trajectory referenced any guideline filename from {learned_stems}"
+    )