From 731dc0b1f5f03d540cb9219eb27921a65dac66df Mon Sep 17 00:00:00 2001 From: Vinod Muthusamy Date: Wed, 10 Jun 2026 09:31:21 -0500 Subject: [PATCH 1/5] feat(sandbox): add bob CLI sandbox image and e2e test Add a third Docker target for the IBM Bob CLI alongside the existing Claude and Codex sandboxes. Reuses the shared base image, installs bobshell via the IBM-published installer over node 22, and pins a stable hostname plus mounted .bob-sandbox-home so Bob's encrypted file storage decrypts across docker run --rm sessions and SSO auth state persists. The e2e test mirrors the codex flow over three sessions (seed task with save-trajectory + learn, focal-length task with recall, offline provenance). Bob 1.0.4 has no UserPromptSubmit hook, so recall provenance is verified indirectly by checking session 2's trajectory references a learned guideline file rather than asserting on a recall audit event. --- .gitignore | 1 + justfile | 54 ++++- sandbox/Dockerfile | 24 +++ sandbox/README.md | 48 +++++ tests/e2e/test_bob_sandbox_learn_recall.py | 222 +++++++++++++++++++++ 5 files changed, 343 insertions(+), 6 deletions(-) create mode 100644 tests/e2e/test_bob_sandbox_learn_recall.py diff --git a/.gitignore b/.gitignore index 5c925ec2..bd26a3ad 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ __pycache__ evolve_data demo/workdir/.claude/ .bob +.bob-sandbox-home/ .claude dist .coverage diff --git a/justfile b/justfile index fc1aaeaf..94b2fb12 100644 --- a/justfile +++ b/justfile @@ -10,16 +10,20 @@ commit message: claude_image := "claude-sandbox" codex_image := "evolve-codex-sandbox" +bob_image := "evolve-bob-sandbox" env_file := "sandbox/myenv" +bob_home := env_var_or_default("BOB_HOME", ".bob-sandbox-home") +bob_hostname := env_var_or_default("BOB_HOSTNAME", "evolve-bob-sandbox") +bob_sso_port := env_var_or_default("BOB_SSO_PORT", "47687") sandbox_dir := "sandbox" workspace := "demo/workspace" -# Build sandbox Docker image(s). 
Use target=claude or target=codex to build only one. +# Build sandbox Docker image(s). Use target=claude, codex, or bob to build only one. sandbox-build target="all": #!/usr/bin/env sh set -e - if [ "{{target}}" != "all" ] && [ "{{target}}" != "claude" ] && [ "{{target}}" != "codex" ]; then - echo "Error: target must be one of: all, claude, codex" >&2 + if [ "{{target}}" != "all" ] && [ "{{target}}" != "claude" ] && [ "{{target}}" != "codex" ] && [ "{{target}}" != "bob" ]; then + echo "Error: target must be one of: all, claude, codex, bob" >&2 exit 1 fi if [ "{{target}}" = "all" ] || [ "{{target}}" = "claude" ]; then @@ -28,6 +32,9 @@ sandbox-build target="all": if [ "{{target}}" = "all" ] || [ "{{target}}" = "codex" ]; then docker build --target codex -t {{codex_image}} {{sandbox_dir}} fi + if [ "{{target}}" = "all" ] || [ "{{target}}" = "bob" ]; then + docker build --target bob -t {{bob_image}} {{sandbox_dir}} + fi # Copy sample.env to myenv if it doesn't already exist sandbox-setup: @@ -38,11 +45,11 @@ sandbox-setup: echo "{{env_file}} already exists, skipping"; \ fi -# Remove sandbox Docker image(s). Use target=claude or target=codex to remove only one. +# Remove sandbox Docker image(s). Use target=claude, codex, or bob to remove only one. 
sandbox-clean target="all": #!/usr/bin/env sh - if [ "{{target}}" != "all" ] && [ "{{target}}" != "claude" ] && [ "{{target}}" != "codex" ]; then - echo "Error: target must be one of: all, claude, codex" >&2 + if [ "{{target}}" != "all" ] && [ "{{target}}" != "claude" ] && [ "{{target}}" != "codex" ] && [ "{{target}}" != "bob" ]; then + echo "Error: target must be one of: all, claude, codex, bob" >&2 exit 1 fi if [ "{{target}}" = "all" ] || [ "{{target}}" = "claude" ]; then @@ -51,6 +58,9 @@ sandbox-clean target="all": if [ "{{target}}" = "all" ] || [ "{{target}}" = "codex" ]; then docker rmi {{codex_image}} || true fi + if [ "{{target}}" = "all" ] || [ "{{target}}" = "bob" ]; then + docker rmi {{bob_image}} || true + fi # Run an interactive Claude Code shell in the sandbox claude-run: @@ -79,6 +89,38 @@ codex-run: codex-test: docker run --rm --env-file {{env_file}} {{codex_image}} codex exec --skip-git-repo-check "who are you" +# Run an interactive Bob shell in the sandbox +bob-run: _bob-force-sso + mkdir -p {{bob_home}} + docker run --rm -it --hostname {{bob_hostname}} --env BOB_SHELL_FORCE_FILE_STORAGE=true --env SSO_PORT={{bob_sso_port}} --publish 127.0.0.1:{{bob_sso_port}}:{{bob_sso_port}} -v "$(cd {{workspace}} && pwd)":/workspace -v "$(pwd)/{{bob_home}}":/home/sandbox/.bob {{bob_image}} + +# Authenticate Bob in the sandbox with browser SSO. Open the printed URL on the host. 
+bob-auth: _bob-force-sso + mkdir -p {{bob_home}} + docker run --rm -it --hostname {{bob_hostname}} --env BOB_SHELL_FORCE_FILE_STORAGE=true --env SSO_PORT={{bob_sso_port}} --publish 127.0.0.1:{{bob_sso_port}}:{{bob_sso_port}} -v "$(pwd)/{{bob_home}}":/home/sandbox/.bob {{bob_image}} bob --accept-license --auth-method sso + just _bob-force-sso + +# Run a one-shot prompt in the sandbox +bob-prompt prompt: _bob-force-sso + #!/usr/bin/env sh + export SANDBOX_PROMPT="$(cat <<'PROMPT_EOF' + {{prompt}} + PROMPT_EOF + )" + mkdir -p {{bob_home}} + docker run --rm -it --hostname {{bob_hostname}} --env BOB_SHELL_FORCE_FILE_STORAGE=true --env SANDBOX_PROMPT --env SSO_PORT={{bob_sso_port}} --publish 127.0.0.1:{{bob_sso_port}}:{{bob_sso_port}} -v "$(cd {{workspace}} && pwd)":/workspace -v "$(pwd)/{{bob_home}}":/home/sandbox/.bob {{bob_image}} sh -c ' + bob -C /workspace --accept-license --auth-method sso --yolo -p "$SANDBOX_PROMPT" + ' + +# Smoke-test that Bob is installed and working +bob-test: _bob-force-sso + mkdir -p {{bob_home}} + docker run --rm --hostname {{bob_hostname}} --env BOB_SHELL_FORCE_FILE_STORAGE=true --env SSO_PORT={{bob_sso_port}} --publish 127.0.0.1:{{bob_sso_port}}:{{bob_sso_port}} -v "$(pwd)/{{bob_home}}":/home/sandbox/.bob {{bob_image}} bob --accept-license --auth-method sso -p "who are you" + +_bob-force-sso: + mkdir -p {{bob_home}} + node -e 'const fs = require("fs"); const path = "{{bob_home}}/settings.json"; const data = fs.existsSync(path) ? JSON.parse(fs.readFileSync(path, "utf8")) : {}; data.security ??= {}; data.security.auth ??= {}; data.security.auth.selectedType = "sso"; fs.writeFileSync(path, JSON.stringify(data, null, 2) + "\n");' + # Render plugin-source/ into platform-integrations/. Edit plugin-source/, then run this. 
compile-plugins: uv run python plugin-source/build_plugins.py render diff --git a/sandbox/Dockerfile b/sandbox/Dockerfile index 278dda33..ccc126d9 100644 --- a/sandbox/Dockerfile +++ b/sandbox/Dockerfile @@ -108,3 +108,27 @@ ENV PYTHONUNBUFFERED=1 ENTRYPOINT ["codex-container-entrypoint"] CMD ["bash"] + + +# Bob (IBM bobshell) target +FROM base AS bob + +COPY --from=node:22-bookworm-slim /usr/local/bin/node /usr/local/bin/node +COPY --from=node:22-bookworm-slim /usr/local/lib/node_modules /usr/local/lib/node_modules + +RUN ln -sf /usr/local/lib/node_modules/npm/bin/npm-cli.js /usr/local/bin/npm \ + && ln -sf /usr/local/lib/node_modules/npm/bin/npx-cli.js /usr/local/bin/npx + +# Install bobshell globally as root (the IBM installer runs `npm install -g`) +RUN curl -fsSL https://bob.ibm.com/download/bobshell.sh | bash \ + && command -v bob \ + && bob --version + +WORKDIR /workspace + +USER sandbox + +ENV HOME="/home/sandbox" +ENV PYTHONUNBUFFERED=1 + +CMD ["bash"] diff --git a/sandbox/README.md b/sandbox/README.md index 877cebc9..2ee38b13 100644 --- a/sandbox/README.md +++ b/sandbox/README.md @@ -110,3 +110,51 @@ as environment variables, for example `CODEX_MODEL_PROVIDER`, `CODEX_MODEL_PROVIDER_BASE_URL`, `CODEX_MODEL_PROVIDER_ENV_KEY`, and `CODEX_MODEL_PROVIDER_WIRE_API`. The test forwards only environment variable values into Docker; it does not mount host credential or Codex config files. + +## Bob Sandbox Auth + +Build the Bob image: + +```bash +just sandbox-build bob +``` + +Authenticate once with browser SSO: + +```bash +just bob-auth +``` + +Open the URL printed by Bob on the host machine. The recipe fixes +`SSO_PORT` and publishes it to `127.0.0.1`, so the browser callback can reach +the Bob process inside Docker. Auth state is stored in the ignored +`.bob-sandbox-home/` directory and reused by `just bob-run`, `just bob-test`, +and `just bob-prompt "..."`. 
The recipes also set a stable Docker hostname +because Bob's encrypted file storage derives its key from the hostname; without +that, credentials written in one `docker run --rm` session cannot be decrypted +in the next. + +API-key auth with older `sk-` / `pk-` Bob keys can route Bob Shell 1.0.4 to +`prod.ibm-bob-staging.cloud.ibm.com`, which may fail with a Cloudflare 403 +from inside Docker. Prefer SSO auth for this sandbox unless you have a current +Bob API key known to work with the production Bob auth backend. + +## Bob Automated E2E Test + +`tests/e2e/test_bob_sandbox_learn_recall.py` runs the same learn + recall flow +against the Dockerized Bob sandbox. Build the image, authenticate once, then +run pytest: + +```bash +just sandbox-build bob +just bob-auth # one-time browser SSO +uv run pytest tests/e2e/test_bob_sandbox_learn_recall.py \ + --run-e2e -m e2e -v --log-cli-level=INFO +``` + +The test mounts `.bob-sandbox-home/` (created by `bob-auth`) as the +container's `~/.bob`, pins the Docker hostname so Bob's encrypted file storage +decrypts across runs, and publishes `SSO_PORT` to `127.0.0.1`. It skips with a +clear message if the image isn't built or if the auth state directory is +missing. Set `BOB_HOME` / `BOB_HOSTNAME` / `BOB_SSO_PORT` env vars to override +defaults. diff --git a/tests/e2e/test_bob_sandbox_learn_recall.py b/tests/e2e/test_bob_sandbox_learn_recall.py new file mode 100644 index 00000000..6b27b9d8 --- /dev/null +++ b/tests/e2e/test_bob_sandbox_learn_recall.py @@ -0,0 +1,222 @@ +"""End-to-end test of the evolve-lite learn + recall flow in the Bob sandbox. + +Runs three sequential Bob CLI sessions against the Dockerized Bob sandbox: + 1. Session 1 performs an EXIF task, then explicitly invokes the evolve-lite + save-trajectory and learn skills so a trajectory and guideline are saved. + 2. Session 2 asks a related EXIF question. The recall skill should surface + the guideline from session 1 before substantive work begins. + 3. 
Session 3 runs the offline provenance skill so the recall audit gets + follow-up influence verdicts. + +Requires Docker, the ``evolve-bob-sandbox`` image built, and a persisted Bob +SSO auth state on the host (created by ``just bob-auth``). The test mounts +that auth state read-write into the container alongside a stable hostname so +Bob's encrypted file storage decrypts across runs. +""" + +import json +import logging +import os +import shutil +import subprocess +import time +from pathlib import Path + +import pytest + + +log = logging.getLogger(__name__) + + +SANDBOX_IMAGE = "evolve-bob-sandbox" +REPO_ROOT = Path(__file__).resolve().parents[2] +SESSION_TIMEOUT_SECONDS = 600 +BOB_HOSTNAME = os.environ.get("BOB_HOSTNAME", "evolve-bob-sandbox") +BOB_SSO_PORT = os.environ.get("BOB_SSO_PORT", "47687") +BOB_HOME_DEFAULT = REPO_ROOT / ".bob-sandbox-home" + + +@pytest.fixture(scope="session") +def bob_sandbox_ready(): + """Skip if Docker, the Bob sandbox image, or persisted auth aren't available.""" + if shutil.which("docker") is None: + pytest.skip("docker not installed") + + if subprocess.run(["docker", "info"], capture_output=True).returncode != 0: + pytest.skip("docker daemon not running") + + image_check = subprocess.run( + ["docker", "image", "inspect", SANDBOX_IMAGE], + capture_output=True, + ) + if image_check.returncode != 0: + pytest.skip(f"sandbox image {SANDBOX_IMAGE!r} not built — run `just sandbox-build bob`") + + bob_home = Path(os.environ.get("BOB_HOME", str(BOB_HOME_DEFAULT))) + if not bob_home.is_dir() or not (bob_home / "settings.json").is_file(): + pytest.skip(f"bob auth state missing at {bob_home} — run `just bob-auth` first") + + return bob_home + + +@pytest.fixture +def bob_workspace(tmp_path): + """Copy demo/workspace and install the Bob plugin into it.""" + src = REPO_ROOT / "demo" / "workspace" + workspace = tmp_path / "workspace" + shutil.copytree(src, workspace, ignore=shutil.ignore_patterns(".evolve", "backup", "sandbox-backup")) + + 
install_script = REPO_ROOT / "platform-integrations" / "install.sh" + result = subprocess.run( + ["bash", str(install_script), "install", "--platform", "bob", "--dir", str(workspace)], + capture_output=True, + text=True, + check=False, + ) + assert result.returncode == 0, f"bob install failed\nstdout:\n{result.stdout}\nstderr:\n{result.stderr}" + + return workspace + + +def _run_bob_prompt( + workspace: Path, + bob_home: Path, + prompt: str, +) -> subprocess.CompletedProcess: + cmd = [ + "docker", + "run", + "--rm", + "--hostname", + BOB_HOSTNAME, + "--env", + "BOB_SHELL_FORCE_FILE_STORAGE=true", + "--env", + f"SSO_PORT={BOB_SSO_PORT}", + "--env", + "EVOLVE_DEBUG=1", + "--env", + "TMPDIR=/workspace/.evolve/tmp", + "--publish", + f"127.0.0.1:{BOB_SSO_PORT}:{BOB_SSO_PORT}", + "-v", + f"{workspace}:/workspace", + "-v", + f"{bob_home}:/home/sandbox/.bob", + SANDBOX_IMAGE, + "bob", + "--accept-license", + "--auth-method", + "sso", + "--yolo", + "-p", + prompt, + ] + return subprocess.run(cmd, capture_output=True, text=True, timeout=SESSION_TIMEOUT_SECONDS) + + +def _audit_events(evolve_dir: Path) -> list[dict]: + audit_log = evolve_dir / "audit.log" + if not audit_log.is_file(): + return [] + return [json.loads(line) for line in audit_log.read_text().splitlines() if line.strip()] + + +@pytest.mark.e2e +def test_bob_learn_then_recall_flow(bob_sandbox_ready, bob_workspace): + """Session 1 learns, session 2 recalls, session 3 records influence.""" + bob_home = bob_sandbox_ready + evolve_dir = bob_workspace / ".evolve" + + log.info("bob session 1: running seed task with save-trajectory + learn...") + t0 = time.time() + result1 = _run_bob_prompt( + bob_workspace, + bob_home, + ( + "Where was the photo @sample.jpg taken? Use EXIF metadata. " + "When done, invoke the evolve-lite save-trajectory skill, then invoke the evolve-lite learn skill. " + "Do not skip either evolve-lite skill." 
+ ), + ) + log.info(f"bob session 1: exited {result1.returncode} after {time.time() - t0:.0f}s") + assert result1.returncode == 0, ( + f"session 1 exited {result1.returncode}\nstdout:\n{result1.stdout[-2000:]}\nstderr:\n{result1.stderr[-2000:]}" + ) + + trajectories_dir = evolve_dir / "trajectories" + entities_dir = evolve_dir / "entities" + assert trajectories_dir.is_dir(), f"{trajectories_dir} was not created" + trajectories = list(trajectories_dir.glob("*.json")) + list(trajectories_dir.glob("*.jsonl")) + assert trajectories, f"no Bob trajectory files found in {trajectories_dir}" + assert entities_dir.is_dir(), f"{entities_dir} was not created" + entity_files = list(entities_dir.rglob("*.md")) + assert entity_files, f"no guideline files found in {entities_dir}" + + log.info("bob session 2: running related task to exercise recall...") + t1 = time.time() + result2 = _run_bob_prompt( + bob_workspace, + bob_home, + ( + "STEP 1 (mandatory, do this first before anything else): invoke the evolve-lite recall skill " + "to retrieve relevant stored guidelines. Do not run any other tool until recall is complete. " + "STEP 2: answer this question using EXIF metadata, applying any guideline returned by recall: " + "What focal length was used to take the photo @sample.jpg? " + "STEP 3 (mandatory): invoke the evolve-lite save-trajectory skill. " + "Do not invoke the learn skill." 
+ ), + ) + log.info(f"bob session 2: exited {result2.returncode} after {time.time() - t1:.0f}s") + assert result2.returncode == 0, ( + f"session 2 exited {result2.returncode}\nstdout:\n{result2.stdout[-2000:]}\nstderr:\n{result2.stderr[-2000:]}" + ) + + session2_trajectories = ( + set(trajectories_dir.glob("*.json")) | set(trajectories_dir.glob("*.jsonl")) + ) - set(trajectories) + assert session2_trajectories, f"no Bob trajectory saved for session 2 in {trajectories_dir}" + session2_trajectory = max(session2_trajectories, key=lambda p: p.stat().st_mtime) + + # Bob has no UserPromptSubmit hook, so the recall skill cannot emit a + # recall audit event the way the codex/claude tests do. Verify recall + # influence indirectly: session 2's saved trajectory should reference + # one of the guideline files (or its key content) from session 1. + learned_ids = {str(path.relative_to(entities_dir).with_suffix("")) for path in entity_files} + session2_text = session2_trajectory.read_text(encoding="utf-8") + assert any(eid.split("/")[-1] in session2_text for eid in learned_ids), ( + f"session 2 trajectory did not reference any guideline filename from {learned_ids}" + ) + + log.info("bob session 3: running offline provenance analysis...") + t2 = time.time() + session2_id = session2_trajectory.stem + result3 = _run_bob_prompt( + bob_workspace, + bob_home, + ( + "Run the evolve-lite provenance skill now. Analyze the saved trajectories in " + ".evolve/trajectories/. Record influence verdicts for the guidelines under " + f".evolve/entities/guideline/ as applied (or not) in trajectory {session2_id}. " + "Do not modify source files." 
+ ), + ) + log.info(f"bob session 3: exited {result3.returncode} after {time.time() - t2:.0f}s") + assert result3.returncode == 0, ( + f"session 3 exited {result3.returncode}\nstdout:\n{result3.stdout[-2000:]}\nstderr:\n{result3.stderr[-2000:]}" + ) + + events = _audit_events(evolve_dir) + influence_events = [event for event in events if event.get("event") == "influence"] + assert influence_events, f"no influence audit event recorded. all events: {events}" + influenced_ids = {event.get("entity") for event in influence_events} + assert influenced_ids & learned_ids, ( + f"influence events {influence_events} did not assess any learned ids {learned_ids}" + ) + allowed_verdicts = {"followed", "contradicted", "not_applicable"} + assert any(event.get("verdict") in allowed_verdicts for event in influence_events), ( + f"no learned guideline was assessed with an allowed verdict. influence events: {influence_events}" + ) + for event in influence_events: + assert event.get("verdict") in allowed_verdicts + assert event.get("evidence"), f"influence event missing evidence: {event}" From 9821a21d1915b554281746d9314b869e0a710967 Mon Sep 17 00:00:00 2001 From: Vinod Muthusamy Date: Wed, 10 Jun 2026 09:35:56 -0500 Subject: [PATCH 2/5] style: apply ruff format to bob e2e test Fixes failing CI check: check-formatting (3.12) --- tests/e2e/test_bob_sandbox_learn_recall.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/e2e/test_bob_sandbox_learn_recall.py b/tests/e2e/test_bob_sandbox_learn_recall.py index 6b27b9d8..ad0e8f54 100644 --- a/tests/e2e/test_bob_sandbox_learn_recall.py +++ b/tests/e2e/test_bob_sandbox_learn_recall.py @@ -172,9 +172,7 @@ def test_bob_learn_then_recall_flow(bob_sandbox_ready, bob_workspace): f"session 2 exited {result2.returncode}\nstdout:\n{result2.stdout[-2000:]}\nstderr:\n{result2.stderr[-2000:]}" ) - session2_trajectories = ( - set(trajectories_dir.glob("*.json")) | set(trajectories_dir.glob("*.jsonl")) - ) - set(trajectories) 
+ session2_trajectories = (set(trajectories_dir.glob("*.json")) | set(trajectories_dir.glob("*.jsonl"))) - set(trajectories) assert session2_trajectories, f"no Bob trajectory saved for session 2 in {trajectories_dir}" session2_trajectory = max(session2_trajectories, key=lambda p: p.stat().st_mtime) @@ -210,9 +208,7 @@ def test_bob_learn_then_recall_flow(bob_sandbox_ready, bob_workspace): influence_events = [event for event in events if event.get("event") == "influence"] assert influence_events, f"no influence audit event recorded. all events: {events}" influenced_ids = {event.get("entity") for event in influence_events} - assert influenced_ids & learned_ids, ( - f"influence events {influence_events} did not assess any learned ids {learned_ids}" - ) + assert influenced_ids & learned_ids, f"influence events {influence_events} did not assess any learned ids {learned_ids}" allowed_verdicts = {"followed", "contradicted", "not_applicable"} assert any(event.get("verdict") in allowed_verdicts for event in influence_events), ( f"no learned guideline was assessed with an allowed verdict. influence events: {influence_events}" From 680d4eb6e7c9bc632d7588e19ccbbc11ab4fcba7 Mon Sep 17 00:00:00 2001 From: Vinod Muthusamy Date: Wed, 10 Jun 2026 09:42:30 -0500 Subject: [PATCH 3/5] fix(sandbox): suppress detect-secrets false positive in justfile Fixes failing CI check: tekton/pr-code-checks/code-detect-secrets The bob-auth recipe target name contains the word "auth" which the detect-secrets Secret Keyword plugin flags as a possible credential. Annotate the line with `# pragma: allowlist secret` per project convention. 
--- justfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/justfile b/justfile index 94b2fb12..95b48d2c 100644 --- a/justfile +++ b/justfile @@ -95,7 +95,7 @@ bob-run: _bob-force-sso docker run --rm -it --hostname {{bob_hostname}} --env BOB_SHELL_FORCE_FILE_STORAGE=true --env SSO_PORT={{bob_sso_port}} --publish 127.0.0.1:{{bob_sso_port}}:{{bob_sso_port}} -v "$(cd {{workspace}} && pwd)":/workspace -v "$(pwd)/{{bob_home}}":/home/sandbox/.bob {{bob_image}} # Authenticate Bob in the sandbox with browser SSO. Open the printed URL on the host. -bob-auth: _bob-force-sso +bob-auth: _bob-force-sso # pragma: allowlist secret mkdir -p {{bob_home}} docker run --rm -it --hostname {{bob_hostname}} --env BOB_SHELL_FORCE_FILE_STORAGE=true --env SSO_PORT={{bob_sso_port}} --publish 127.0.0.1:{{bob_sso_port}}:{{bob_sso_port}} -v "$(pwd)/{{bob_home}}":/home/sandbox/.bob {{bob_image}} bob --accept-license --auth-method sso just _bob-force-sso From a40dbad325c044a1907d4c5f346b5fde42ab419d Mon Sep 17 00:00:00 2001 From: Vinod Muthusamy Date: Wed, 10 Jun 2026 14:08:17 -0500 Subject: [PATCH 4/5] fix(e2e): drop bob provenance assertion that can never produce influence events Addresses review feedback from visahak Bob 1.0.4 has no UserPromptSubmit hook, so the recall script never gets a session id from the runtime and cannot emit a recall audit event. The evolve-lite:provenance skill assesses entities from prior recall audit events; with none in scope, a compliant provenance run is required to produce zero influence events. The previous test asserted the opposite, so it could pass only when the provenance run was non-compliant. Drop session 3 (provenance) and the associated influence-event assertions. The test now validates only the learn + recall evidence Bob can actually produce: guideline files in .evolve/entities/ and references to those guidelines in session 2's saved trajectory. 
--- tests/e2e/test_bob_sandbox_learn_recall.py | 63 ++++++---------------- 1 file changed, 16 insertions(+), 47 deletions(-) diff --git a/tests/e2e/test_bob_sandbox_learn_recall.py b/tests/e2e/test_bob_sandbox_learn_recall.py index ad0e8f54..033d469c 100644 --- a/tests/e2e/test_bob_sandbox_learn_recall.py +++ b/tests/e2e/test_bob_sandbox_learn_recall.py @@ -1,12 +1,18 @@ """End-to-end test of the evolve-lite learn + recall flow in the Bob sandbox. -Runs three sequential Bob CLI sessions against the Dockerized Bob sandbox: +Runs two sequential Bob CLI sessions against the Dockerized Bob sandbox: 1. Session 1 performs an EXIF task, then explicitly invokes the evolve-lite save-trajectory and learn skills so a trajectory and guideline are saved. 2. Session 2 asks a related EXIF question. The recall skill should surface the guideline from session 1 before substantive work begins. - 3. Session 3 runs the offline provenance skill so the recall audit gets - follow-up influence verdicts. + +Bob 1.0.4 has no ``UserPromptSubmit`` hook, so the recall script never gets a +session id from the runtime and cannot emit a ``recall`` audit event the way +the Claude/Codex tests rely on. Without a recall audit event there is nothing +for ``evolve-lite:provenance`` to assess, so this test does not run a third +provenance session — it validates only the learn + recall evidence that Bob +can actually produce: guideline files in ``.evolve/entities/`` and references +to those guidelines in session 2's saved trajectory. Requires Docker, the ``evolve-bob-sandbox`` image built, and a persisted Bob SSO auth state on the host (created by ``just bob-auth``). The test mounts @@ -14,7 +20,6 @@ Bob's encrypted file storage decrypts across runs. 
""" -import json import logging import os import shutil @@ -115,16 +120,9 @@ def _run_bob_prompt( return subprocess.run(cmd, capture_output=True, text=True, timeout=SESSION_TIMEOUT_SECONDS) -def _audit_events(evolve_dir: Path) -> list[dict]: - audit_log = evolve_dir / "audit.log" - if not audit_log.is_file(): - return [] - return [json.loads(line) for line in audit_log.read_text().splitlines() if line.strip()] - - @pytest.mark.e2e def test_bob_learn_then_recall_flow(bob_sandbox_ready, bob_workspace): - """Session 1 learns, session 2 recalls, session 3 records influence.""" + """Session 1 learns, session 2 recalls.""" bob_home = bob_sandbox_ready evolve_dir = bob_workspace / ".evolve" @@ -176,43 +174,14 @@ def test_bob_learn_then_recall_flow(bob_sandbox_ready, bob_workspace): assert session2_trajectories, f"no Bob trajectory saved for session 2 in {trajectories_dir}" session2_trajectory = max(session2_trajectories, key=lambda p: p.stat().st_mtime) - # Bob has no UserPromptSubmit hook, so the recall skill cannot emit a - # recall audit event the way the codex/claude tests do. Verify recall - # influence indirectly: session 2's saved trajectory should reference - # one of the guideline files (or its key content) from session 1. + # Bob 1.0.4 has no UserPromptSubmit hook, so the recall script never + # gets a session id from the runtime and cannot emit a recall audit + # event. Without a recall audit there is nothing for evolve-lite:provenance + # to assess, so this test stops at the indirect evidence Bob can produce: + # session 2's saved trajectory should reference one of the guideline + # files learned in session 1. 
learned_ids = {str(path.relative_to(entities_dir).with_suffix("")) for path in entity_files} session2_text = session2_trajectory.read_text(encoding="utf-8") assert any(eid.split("/")[-1] in session2_text for eid in learned_ids), ( f"session 2 trajectory did not reference any guideline filename from {learned_ids}" ) - - log.info("bob session 3: running offline provenance analysis...") - t2 = time.time() - session2_id = session2_trajectory.stem - result3 = _run_bob_prompt( - bob_workspace, - bob_home, - ( - "Run the evolve-lite provenance skill now. Analyze the saved trajectories in " - ".evolve/trajectories/. Record influence verdicts for the guidelines under " - f".evolve/entities/guideline/ as applied (or not) in trajectory {session2_id}. " - "Do not modify source files." - ), - ) - log.info(f"bob session 3: exited {result3.returncode} after {time.time() - t2:.0f}s") - assert result3.returncode == 0, ( - f"session 3 exited {result3.returncode}\nstdout:\n{result3.stdout[-2000:]}\nstderr:\n{result3.stderr[-2000:]}" - ) - - events = _audit_events(evolve_dir) - influence_events = [event for event in events if event.get("event") == "influence"] - assert influence_events, f"no influence audit event recorded. all events: {events}" - influenced_ids = {event.get("entity") for event in influence_events} - assert influenced_ids & learned_ids, f"influence events {influence_events} did not assess any learned ids {learned_ids}" - allowed_verdicts = {"followed", "contradicted", "not_applicable"} - assert any(event.get("verdict") in allowed_verdicts for event in influence_events), ( - f"no learned guideline was assessed with an allowed verdict. 
influence events: {influence_events}" - ) - for event in influence_events: - assert event.get("verdict") in allowed_verdicts - assert event.get("evidence"), f"influence event missing evidence: {event}" From b77107d565a9dd953aea0bf3d9fddcba60460590 Mon Sep 17 00:00:00 2001 From: Vinod Muthusamy Date: Wed, 10 Jun 2026 14:18:58 -0500 Subject: [PATCH 5/5] fix(e2e): scan all session-2 trajectories with Path.stem matching Addresses CodeRabbit review finding: Make the recall-evidence assertion robust across all session-2 trajectories and path separators Iterate over the full session2_trajectories set instead of only the newest, and use Path.stem from entity_files for matching instead of splitting the entity id string on "/". Both changes make the assertion robust to path separator differences and to bob writing more than one trajectory in a single session. --- tests/e2e/test_bob_sandbox_learn_recall.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/tests/e2e/test_bob_sandbox_learn_recall.py b/tests/e2e/test_bob_sandbox_learn_recall.py index 033d469c..c8ca56f0 100644 --- a/tests/e2e/test_bob_sandbox_learn_recall.py +++ b/tests/e2e/test_bob_sandbox_learn_recall.py @@ -172,16 +172,15 @@ def test_bob_learn_then_recall_flow(bob_sandbox_ready, bob_workspace): session2_trajectories = (set(trajectories_dir.glob("*.json")) | set(trajectories_dir.glob("*.jsonl"))) - set(trajectories) assert session2_trajectories, f"no Bob trajectory saved for session 2 in {trajectories_dir}" - session2_trajectory = max(session2_trajectories, key=lambda p: p.stat().st_mtime) # Bob 1.0.4 has no UserPromptSubmit hook, so the recall script never # gets a session id from the runtime and cannot emit a recall audit # event. Without a recall audit there is nothing for evolve-lite:provenance # to assess, so this test stops at the indirect evidence Bob can produce: - # session 2's saved trajectory should reference one of the guideline - # files learned in session 1. 
- learned_ids = {str(path.relative_to(entities_dir).with_suffix("")) for path in entity_files} - session2_text = session2_trajectory.read_text(encoding="utf-8") - assert any(eid.split("/")[-1] in session2_text for eid in learned_ids), ( - f"session 2 trajectory did not reference any guideline filename from {learned_ids}" + # at least one session-2 trajectory should reference a guideline file + # learned in session 1. + learned_stems = {path.stem for path in entity_files} + session2_texts = [path.read_text(encoding="utf-8") for path in session2_trajectories] + assert any(stem in text for stem in learned_stems for text in session2_texts), ( + f"no session 2 trajectory referenced any guideline filename from {learned_stems}" )