From 731dc0b1f5f03d540cb9219eb27921a65dac66df Mon Sep 17 00:00:00 2001
From: Vinod Muthusamy <vinodmut@users.noreply.github.com>
Date: Wed, 10 Jun 2026 09:31:21 -0500
Subject: [PATCH 1/5] feat(sandbox): add bob CLI sandbox image and e2e test

Add a third Docker target for the IBM Bob CLI alongside the existing
Claude and Codex sandboxes. Reuses the shared base image, installs
bobshell via the IBM-published installer over node 22, and pins a
stable hostname plus mounted .bob-sandbox-home so Bob's encrypted
file storage decrypts across docker run --rm sessions and SSO auth
state persists.

The e2e test mirrors the codex flow over three sessions (seed task
with save-trajectory + learn, focal-length task with recall, offline
provenance). Bob 1.0.4 has no UserPromptSubmit hook, so recall
provenance is verified indirectly by checking that session 2's
trajectory references a learned guideline file, rather than by
asserting on a recall audit event.
---
 .gitignore                                 |   1 +
 justfile                                   |  54 ++++-
 sandbox/Dockerfile                         |  24 +++
 sandbox/README.md                          |  48 +++++
 tests/e2e/test_bob_sandbox_learn_recall.py | 222 +++++++++++++++++++++
 5 files changed, 343 insertions(+), 6 deletions(-)
 create mode 100644 tests/e2e/test_bob_sandbox_learn_recall.py

diff --git a/.gitignore b/.gitignore
index 5c925ec2..bd26a3ad 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,7 @@ __pycache__
 evolve_data
 demo/workdir/.claude/
 .bob
+.bob-sandbox-home/
 .claude
 dist
 .coverage
diff --git a/justfile b/justfile
index fc1aaeaf..94b2fb12 100644
--- a/justfile
+++ b/justfile
@@ -10,16 +10,20 @@ commit message:
 
 claude_image := "claude-sandbox"
 codex_image := "evolve-codex-sandbox"
+bob_image := "evolve-bob-sandbox"
 env_file := "sandbox/myenv"
+bob_home := env_var_or_default("BOB_HOME", ".bob-sandbox-home")
+bob_hostname := env_var_or_default("BOB_HOSTNAME", "evolve-bob-sandbox")
+bob_sso_port := env_var_or_default("BOB_SSO_PORT", "47687")
 sandbox_dir := "sandbox"
 workspace := "demo/workspace"
 
-# Build sandbox Docker image(s). Use target=claude or target=codex to build only one.
+# Build sandbox Docker image(s). Use target=claude, codex, or bob to build only one.
 sandbox-build target="all":
     #!/usr/bin/env sh
     set -e
-    if [ "{{target}}" != "all" ] && [ "{{target}}" != "claude" ] && [ "{{target}}" != "codex" ]; then
-        echo "Error: target must be one of: all, claude, codex" >&2
+    if [ "{{target}}" != "all" ] && [ "{{target}}" != "claude" ] && [ "{{target}}" != "codex" ] && [ "{{target}}" != "bob" ]; then
+        echo "Error: target must be one of: all, claude, codex, bob" >&2
         exit 1
     fi
     if [ "{{target}}" = "all" ] || [ "{{target}}" = "claude" ]; then
@@ -28,6 +32,9 @@ sandbox-build target="all":
     if [ "{{target}}" = "all" ] || [ "{{target}}" = "codex" ]; then
         docker build --target codex -t {{codex_image}} {{sandbox_dir}}
     fi
+    if [ "{{target}}" = "all" ] || [ "{{target}}" = "bob" ]; then
+        docker build --target bob -t {{bob_image}} {{sandbox_dir}}
+    fi
 
 # Copy sample.env to myenv if it doesn't already exist
 sandbox-setup:
@@ -38,11 +45,11 @@ sandbox-setup:
         echo "{{env_file}} already exists, skipping"; \
     fi
 
-# Remove sandbox Docker image(s). Use target=claude or target=codex to remove only one.
+# Remove sandbox Docker image(s). Use target=claude, codex, or bob to remove only one.
 sandbox-clean target="all":
     #!/usr/bin/env sh
-    if [ "{{target}}" != "all" ] && [ "{{target}}" != "claude" ] && [ "{{target}}" != "codex" ]; then
-        echo "Error: target must be one of: all, claude, codex" >&2
+    if [ "{{target}}" != "all" ] && [ "{{target}}" != "claude" ] && [ "{{target}}" != "codex" ] && [ "{{target}}" != "bob" ]; then
+        echo "Error: target must be one of: all, claude, codex, bob" >&2
         exit 1
     fi
     if [ "{{target}}" = "all" ] || [ "{{target}}" = "claude" ]; then
@@ -51,6 +58,9 @@ sandbox-clean target="all":
     if [ "{{target}}" = "all" ] || [ "{{target}}" = "codex" ]; then
         docker rmi {{codex_image}} || true
     fi
+    if [ "{{target}}" = "all" ] || [ "{{target}}" = "bob" ]; then
+        docker rmi {{bob_image}} || true
+    fi
 
 # Run an interactive Claude Code shell in the sandbox
 claude-run:
@@ -79,6 +89,38 @@ codex-run:
 codex-test:
     docker run --rm --env-file {{env_file}} {{codex_image}} codex exec --skip-git-repo-check "who are you"
 
+# Run an interactive Bob shell in the sandbox
+bob-run: _bob-force-sso
+    mkdir -p {{bob_home}}
+    docker run --rm -it --hostname {{bob_hostname}} --env BOB_SHELL_FORCE_FILE_STORAGE=true --env SSO_PORT={{bob_sso_port}} --publish 127.0.0.1:{{bob_sso_port}}:{{bob_sso_port}} -v "$(cd {{workspace}} && pwd)":/workspace -v "$(pwd)/{{bob_home}}":/home/sandbox/.bob {{bob_image}}
+
+# Authenticate Bob in the sandbox with browser SSO. Open the printed URL on the host.
+bob-auth: _bob-force-sso
+    mkdir -p {{bob_home}}
+    docker run --rm -it --hostname {{bob_hostname}} --env BOB_SHELL_FORCE_FILE_STORAGE=true --env SSO_PORT={{bob_sso_port}} --publish 127.0.0.1:{{bob_sso_port}}:{{bob_sso_port}} -v "$(pwd)/{{bob_home}}":/home/sandbox/.bob {{bob_image}} bob --accept-license --auth-method sso
+    just _bob-force-sso
+
+# Run a one-shot prompt in the sandbox
+bob-prompt prompt: _bob-force-sso
+    #!/usr/bin/env sh
+    export SANDBOX_PROMPT="$(cat <<'PROMPT_EOF'
+    {{prompt}}
+    PROMPT_EOF
+    )"
+    mkdir -p {{bob_home}}
+    docker run --rm -it --hostname {{bob_hostname}} --env BOB_SHELL_FORCE_FILE_STORAGE=true --env SANDBOX_PROMPT --env SSO_PORT={{bob_sso_port}} --publish 127.0.0.1:{{bob_sso_port}}:{{bob_sso_port}} -v "$(cd {{workspace}} && pwd)":/workspace -v "$(pwd)/{{bob_home}}":/home/sandbox/.bob {{bob_image}} sh -c '
+        bob -C /workspace --accept-license --auth-method sso --yolo -p "$SANDBOX_PROMPT"
+    '
+
+# Smoke-test that Bob is installed and working
+bob-test: _bob-force-sso
+    mkdir -p {{bob_home}}
+    docker run --rm --hostname {{bob_hostname}} --env BOB_SHELL_FORCE_FILE_STORAGE=true --env SSO_PORT={{bob_sso_port}} --publish 127.0.0.1:{{bob_sso_port}}:{{bob_sso_port}} -v "$(pwd)/{{bob_home}}":/home/sandbox/.bob {{bob_image}} bob --accept-license --auth-method sso -p "who are you"
+
+_bob-force-sso:
+    mkdir -p {{bob_home}}
+    node -e 'const fs = require("fs"); const path = "{{bob_home}}/settings.json"; const data = fs.existsSync(path) ? JSON.parse(fs.readFileSync(path, "utf8")) : {}; data.security ??= {}; data.security.auth ??= {}; data.security.auth.selectedType = "sso"; fs.writeFileSync(path, JSON.stringify(data, null, 2) + "\n");'
+
 # Render plugin-source/ into platform-integrations/. Edit plugin-source/, then run this.
 compile-plugins:
     uv run python plugin-source/build_plugins.py render
diff --git a/sandbox/Dockerfile b/sandbox/Dockerfile
index 278dda33..ccc126d9 100644
--- a/sandbox/Dockerfile
+++ b/sandbox/Dockerfile
@@ -108,3 +108,27 @@ ENV PYTHONUNBUFFERED=1
 
 ENTRYPOINT ["codex-container-entrypoint"]
 CMD ["bash"]
+
+
+# Bob (IBM bobshell) target
+FROM base AS bob
+
+COPY --from=node:22-bookworm-slim /usr/local/bin/node /usr/local/bin/node
+COPY --from=node:22-bookworm-slim /usr/local/lib/node_modules /usr/local/lib/node_modules
+
+RUN ln -sf /usr/local/lib/node_modules/npm/bin/npm-cli.js /usr/local/bin/npm \
+    && ln -sf /usr/local/lib/node_modules/npm/bin/npx-cli.js /usr/local/bin/npx
+
+# Install bobshell globally as root (the IBM installer runs `npm install -g`)
+RUN curl -fsSL https://bob.ibm.com/download/bobshell.sh | bash \
+    && command -v bob \
+    && bob --version
+
+WORKDIR /workspace
+
+USER sandbox
+
+ENV HOME="/home/sandbox"
+ENV PYTHONUNBUFFERED=1
+
+CMD ["bash"]
diff --git a/sandbox/README.md b/sandbox/README.md
index 877cebc9..2ee38b13 100644
--- a/sandbox/README.md
+++ b/sandbox/README.md
@@ -110,3 +110,51 @@ as environment variables, for example `CODEX_MODEL_PROVIDER`,
 `CODEX_MODEL_PROVIDER_BASE_URL`, `CODEX_MODEL_PROVIDER_ENV_KEY`, and
 `CODEX_MODEL_PROVIDER_WIRE_API`. The test forwards only environment variable
 values into Docker; it does not mount host credential or Codex config files.
+
+## Bob Sandbox Auth
+
+Build the Bob image:
+
+```bash
+just sandbox-build bob
+```
+
+Authenticate once with browser SSO:
+
+```bash
+just bob-auth
+```
+
+Open the URL printed by Bob on the host machine. The recipe fixes
+`SSO_PORT` and publishes it to `127.0.0.1`, so the browser callback can reach
+the Bob process inside Docker. Auth state is stored in the ignored
+`.bob-sandbox-home/` directory and reused by `just bob-run`, `just bob-test`,
+and `just bob-prompt "..."`. The recipes also set a stable Docker hostname
+because Bob's encrypted file storage derives its key from the hostname; without
+that, credentials written in one `docker run --rm` session cannot be decrypted
+in the next.
+
+API-key auth with older `sk-` / `pk-` Bob keys can route Bob Shell 1.0.4 to
+`prod.ibm-bob-staging.cloud.ibm.com`, which may fail with a Cloudflare 403
+from inside Docker. Prefer SSO auth for this sandbox unless you have a current
+Bob API key known to work with the production Bob auth backend.
+
+## Bob Automated E2E Test
+
+`tests/e2e/test_bob_sandbox_learn_recall.py` runs the same learn + recall flow
+against the Dockerized Bob sandbox. Build the image, authenticate once, then
+run pytest:
+
+```bash
+just sandbox-build bob
+just bob-auth          # one-time browser SSO
+uv run pytest tests/e2e/test_bob_sandbox_learn_recall.py \
+  --run-e2e -m e2e -v --log-cli-level=INFO
+```
+
+The test mounts `.bob-sandbox-home/` (created by `bob-auth`) as the
+container's `~/.bob`, pins the Docker hostname so Bob's encrypted file storage
+decrypts across runs, and publishes `SSO_PORT` to `127.0.0.1`. It skips with a
+clear message if the image isn't built or if the auth state directory is
+missing. Set `BOB_HOME` / `BOB_HOSTNAME` / `BOB_SSO_PORT` env vars to override
+defaults.
diff --git a/tests/e2e/test_bob_sandbox_learn_recall.py b/tests/e2e/test_bob_sandbox_learn_recall.py
new file mode 100644
index 00000000..6b27b9d8
--- /dev/null
+++ b/tests/e2e/test_bob_sandbox_learn_recall.py
@@ -0,0 +1,222 @@
+"""End-to-end test of the evolve-lite learn + recall flow in the Bob sandbox.
+
+Runs three sequential Bob CLI sessions against the Dockerized Bob sandbox:
+  1. Session 1 performs an EXIF task, then explicitly invokes the evolve-lite
+     save-trajectory and learn skills so a trajectory and guideline are saved.
+  2. Session 2 asks a related EXIF question. The recall skill should surface
+     the guideline from session 1 before substantive work begins.
+  3. Session 3 runs the offline provenance skill so the recall audit gets
+     follow-up influence verdicts.
+
+Requires Docker, the ``evolve-bob-sandbox`` image built, and a persisted Bob
+SSO auth state on the host (created by ``just bob-auth``). The test mounts
+that auth state read-write into the container alongside a stable hostname so
+Bob's encrypted file storage decrypts across runs.
+"""
+
+import json
+import logging
+import os
+import shutil
+import subprocess
+import time
+from pathlib import Path
+
+import pytest
+
+
+log = logging.getLogger(__name__)
+
+
+SANDBOX_IMAGE = "evolve-bob-sandbox"
+REPO_ROOT = Path(__file__).resolve().parents[2]
+SESSION_TIMEOUT_SECONDS = 600
+BOB_HOSTNAME = os.environ.get("BOB_HOSTNAME", "evolve-bob-sandbox")
+BOB_SSO_PORT = os.environ.get("BOB_SSO_PORT", "47687")
+BOB_HOME_DEFAULT = REPO_ROOT / ".bob-sandbox-home"
+
+
+@pytest.fixture(scope="session")
+def bob_sandbox_ready():
+    """Skip if Docker, the Bob sandbox image, or persisted auth aren't available."""
+    if shutil.which("docker") is None:
+        pytest.skip("docker not installed")
+
+    if subprocess.run(["docker", "info"], capture_output=True).returncode != 0:
+        pytest.skip("docker daemon not running")
+
+    image_check = subprocess.run(
+        ["docker", "image", "inspect", SANDBOX_IMAGE],
+        capture_output=True,
+    )
+    if image_check.returncode != 0:
+        pytest.skip(f"sandbox image {SANDBOX_IMAGE!r} not built — run `just sandbox-build bob`")
+
+    bob_home = Path(os.environ.get("BOB_HOME", str(BOB_HOME_DEFAULT)))
+    if not bob_home.is_dir() or not (bob_home / "settings.json").is_file():
+        pytest.skip(f"bob auth state missing at {bob_home} — run `just bob-auth` first")
+
+    return bob_home
+
+
+@pytest.fixture
+def bob_workspace(tmp_path):
+    """Copy demo/workspace and install the Bob plugin into it."""
+    src = REPO_ROOT / "demo" / "workspace"
+    workspace = tmp_path / "workspace"
+    shutil.copytree(src, workspace, ignore=shutil.ignore_patterns(".evolve", "backup", "sandbox-backup"))
+
+    install_script = REPO_ROOT / "platform-integrations" / "install.sh"
+    result = subprocess.run(
+        ["bash", str(install_script), "install", "--platform", "bob", "--dir", str(workspace)],
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+    assert result.returncode == 0, f"bob install failed\nstdout:\n{result.stdout}\nstderr:\n{result.stderr}"
+
+    return workspace
+
+
+def _run_bob_prompt(
+    workspace: Path,
+    bob_home: Path,
+    prompt: str,
+) -> subprocess.CompletedProcess:
+    cmd = [
+        "docker",
+        "run",
+        "--rm",
+        "--hostname",
+        BOB_HOSTNAME,
+        "--env",
+        "BOB_SHELL_FORCE_FILE_STORAGE=true",
+        "--env",
+        f"SSO_PORT={BOB_SSO_PORT}",
+        "--env",
+        "EVOLVE_DEBUG=1",
+        "--env",
+        "TMPDIR=/workspace/.evolve/tmp",
+        "--publish",
+        f"127.0.0.1:{BOB_SSO_PORT}:{BOB_SSO_PORT}",
+        "-v",
+        f"{workspace}:/workspace",
+        "-v",
+        f"{bob_home}:/home/sandbox/.bob",
+        SANDBOX_IMAGE,
+        "bob",
+        "--accept-license",
+        "--auth-method",
+        "sso",
+        "--yolo",
+        "-p",
+        prompt,
+    ]
+    return subprocess.run(cmd, capture_output=True, text=True, timeout=SESSION_TIMEOUT_SECONDS)
+
+
+def _audit_events(evolve_dir: Path) -> list[dict]:
+    audit_log = evolve_dir / "audit.log"
+    if not audit_log.is_file():
+        return []
+    return [json.loads(line) for line in audit_log.read_text().splitlines() if line.strip()]
+
+
+@pytest.mark.e2e
+def test_bob_learn_then_recall_flow(bob_sandbox_ready, bob_workspace):
+    """Session 1 learns, session 2 recalls, session 3 records influence."""
+    bob_home = bob_sandbox_ready
+    evolve_dir = bob_workspace / ".evolve"
+
+    log.info("bob session 1: running seed task with save-trajectory + learn...")
+    t0 = time.time()
+    result1 = _run_bob_prompt(
+        bob_workspace,
+        bob_home,
+        (
+            "Where was the photo @sample.jpg taken? Use EXIF metadata. "
+            "When done, invoke the evolve-lite save-trajectory skill, then invoke the evolve-lite learn skill. "
+            "Do not skip either evolve-lite skill."
+        ),
+    )
+    log.info(f"bob session 1: exited {result1.returncode} after {time.time() - t0:.0f}s")
+    assert result1.returncode == 0, (
+        f"session 1 exited {result1.returncode}\nstdout:\n{result1.stdout[-2000:]}\nstderr:\n{result1.stderr[-2000:]}"
+    )
+
+    trajectories_dir = evolve_dir / "trajectories"
+    entities_dir = evolve_dir / "entities"
+    assert trajectories_dir.is_dir(), f"{trajectories_dir} was not created"
+    trajectories = list(trajectories_dir.glob("*.json")) + list(trajectories_dir.glob("*.jsonl"))
+    assert trajectories, f"no Bob trajectory files found in {trajectories_dir}"
+    assert entities_dir.is_dir(), f"{entities_dir} was not created"
+    entity_files = list(entities_dir.rglob("*.md"))
+    assert entity_files, f"no guideline files found in {entities_dir}"
+
+    log.info("bob session 2: running related task to exercise recall...")
+    t1 = time.time()
+    result2 = _run_bob_prompt(
+        bob_workspace,
+        bob_home,
+        (
+            "STEP 1 (mandatory, do this first before anything else): invoke the evolve-lite recall skill "
+            "to retrieve relevant stored guidelines. Do not run any other tool until recall is complete. "
+            "STEP 2: answer this question using EXIF metadata, applying any guideline returned by recall: "
+            "What focal length was used to take the photo @sample.jpg? "
+            "STEP 3 (mandatory): invoke the evolve-lite save-trajectory skill. "
+            "Do not invoke the learn skill."
+        ),
+    )
+    log.info(f"bob session 2: exited {result2.returncode} after {time.time() - t1:.0f}s")
+    assert result2.returncode == 0, (
+        f"session 2 exited {result2.returncode}\nstdout:\n{result2.stdout[-2000:]}\nstderr:\n{result2.stderr[-2000:]}"
+    )
+
+    session2_trajectories = (
+        set(trajectories_dir.glob("*.json")) | set(trajectories_dir.glob("*.jsonl"))
+    ) - set(trajectories)
+    assert session2_trajectories, f"no Bob trajectory saved for session 2 in {trajectories_dir}"
+    session2_trajectory = max(session2_trajectories, key=lambda p: p.stat().st_mtime)
+
+    # Bob has no UserPromptSubmit hook, so the recall skill cannot emit a
+    # recall audit event the way the codex/claude tests do. Verify recall
+    # influence indirectly: session 2's saved trajectory should reference
+    # one of the guideline files (or its key content) from session 1.
+    learned_ids = {str(path.relative_to(entities_dir).with_suffix("")) for path in entity_files}
+    session2_text = session2_trajectory.read_text(encoding="utf-8")
+    assert any(eid.split("/")[-1] in session2_text for eid in learned_ids), (
+        f"session 2 trajectory did not reference any guideline filename from {learned_ids}"
+    )
+
+    log.info("bob session 3: running offline provenance analysis...")
+    t2 = time.time()
+    session2_id = session2_trajectory.stem
+    result3 = _run_bob_prompt(
+        bob_workspace,
+        bob_home,
+        (
+            "Run the evolve-lite provenance skill now. Analyze the saved trajectories in "
+            ".evolve/trajectories/. Record influence verdicts for the guidelines under "
+            f".evolve/entities/guideline/ as applied (or not) in trajectory {session2_id}. "
+            "Do not modify source files."
+        ),
+    )
+    log.info(f"bob session 3: exited {result3.returncode} after {time.time() - t2:.0f}s")
+    assert result3.returncode == 0, (
+        f"session 3 exited {result3.returncode}\nstdout:\n{result3.stdout[-2000:]}\nstderr:\n{result3.stderr[-2000:]}"
+    )
+
+    events = _audit_events(evolve_dir)
+    influence_events = [event for event in events if event.get("event") == "influence"]
+    assert influence_events, f"no influence audit event recorded. all events: {events}"
+    influenced_ids = {event.get("entity") for event in influence_events}
+    assert influenced_ids & learned_ids, (
+        f"influence events {influence_events} did not assess any learned ids {learned_ids}"
+    )
+    allowed_verdicts = {"followed", "contradicted", "not_applicable"}
+    assert any(event.get("verdict") in allowed_verdicts for event in influence_events), (
+        f"no learned guideline was assessed with an allowed verdict. influence events: {influence_events}"
+    )
+    for event in influence_events:
+        assert event.get("verdict") in allowed_verdicts
+        assert event.get("evidence"), f"influence event missing evidence: {event}"

From 9821a21d1915b554281746d9314b869e0a710967 Mon Sep 17 00:00:00 2001
From: Vinod Muthusamy <vinodmut@users.noreply.github.com>
Date: Wed, 10 Jun 2026 09:35:56 -0500
Subject: [PATCH 2/5] style: apply ruff format to bob e2e test

Fixes failing CI check: check-formatting (3.12)
---
 tests/e2e/test_bob_sandbox_learn_recall.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/tests/e2e/test_bob_sandbox_learn_recall.py b/tests/e2e/test_bob_sandbox_learn_recall.py
index 6b27b9d8..ad0e8f54 100644
--- a/tests/e2e/test_bob_sandbox_learn_recall.py
+++ b/tests/e2e/test_bob_sandbox_learn_recall.py
@@ -172,9 +172,7 @@ def test_bob_learn_then_recall_flow(bob_sandbox_ready, bob_workspace):
         f"session 2 exited {result2.returncode}\nstdout:\n{result2.stdout[-2000:]}\nstderr:\n{result2.stderr[-2000:]}"
     )
 
-    session2_trajectories = (
-        set(trajectories_dir.glob("*.json")) | set(trajectories_dir.glob("*.jsonl"))
-    ) - set(trajectories)
+    session2_trajectories = (set(trajectories_dir.glob("*.json")) | set(trajectories_dir.glob("*.jsonl"))) - set(trajectories)
     assert session2_trajectories, f"no Bob trajectory saved for session 2 in {trajectories_dir}"
     session2_trajectory = max(session2_trajectories, key=lambda p: p.stat().st_mtime)
 
@@ -210,9 +208,7 @@ def test_bob_learn_then_recall_flow(bob_sandbox_ready, bob_workspace):
     influence_events = [event for event in events if event.get("event") == "influence"]
     assert influence_events, f"no influence audit event recorded. all events: {events}"
     influenced_ids = {event.get("entity") for event in influence_events}
-    assert influenced_ids & learned_ids, (
-        f"influence events {influence_events} did not assess any learned ids {learned_ids}"
-    )
+    assert influenced_ids & learned_ids, f"influence events {influence_events} did not assess any learned ids {learned_ids}"
     allowed_verdicts = {"followed", "contradicted", "not_applicable"}
     assert any(event.get("verdict") in allowed_verdicts for event in influence_events), (
         f"no learned guideline was assessed with an allowed verdict. influence events: {influence_events}"

From 680d4eb6e7c9bc632d7588e19ccbbc11ab4fcba7 Mon Sep 17 00:00:00 2001
From: Vinod Muthusamy <vinodmut@users.noreply.github.com>
Date: Wed, 10 Jun 2026 09:42:30 -0500
Subject: [PATCH 3/5] fix(sandbox): suppress detect-secrets false positive in
 justfile

Fixes failing CI check: tekton/pr-code-checks/code-detect-secrets

The bob-auth recipe target name contains the word "auth", which the
detect-secrets Secret Keyword plugin flags as a possible credential.
Annotate the line with `# pragma: allowlist secret` per project
convention.
---
 justfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/justfile b/justfile
index 94b2fb12..95b48d2c 100644
--- a/justfile
+++ b/justfile
@@ -95,7 +95,7 @@ bob-run: _bob-force-sso
     docker run --rm -it --hostname {{bob_hostname}} --env BOB_SHELL_FORCE_FILE_STORAGE=true --env SSO_PORT={{bob_sso_port}} --publish 127.0.0.1:{{bob_sso_port}}:{{bob_sso_port}} -v "$(cd {{workspace}} && pwd)":/workspace -v "$(pwd)/{{bob_home}}":/home/sandbox/.bob {{bob_image}}
 
 # Authenticate Bob in the sandbox with browser SSO. Open the printed URL on the host.
-bob-auth: _bob-force-sso
+bob-auth: _bob-force-sso  # pragma: allowlist secret
     mkdir -p {{bob_home}}
     docker run --rm -it --hostname {{bob_hostname}} --env BOB_SHELL_FORCE_FILE_STORAGE=true --env SSO_PORT={{bob_sso_port}} --publish 127.0.0.1:{{bob_sso_port}}:{{bob_sso_port}} -v "$(pwd)/{{bob_home}}":/home/sandbox/.bob {{bob_image}} bob --accept-license --auth-method sso
     just _bob-force-sso

From a40dbad325c044a1907d4c5f346b5fde42ab419d Mon Sep 17 00:00:00 2001
From: Vinod Muthusamy <vinodmut@users.noreply.github.com>
Date: Wed, 10 Jun 2026 14:08:17 -0500
Subject: [PATCH 4/5] fix(e2e): drop bob provenance assertion that can never
 produce influence events

Addresses review feedback from visahak

Bob 1.0.4 has no UserPromptSubmit hook, so the recall script never gets
a session id from the runtime and cannot emit a recall audit event. The
evolve-lite:provenance skill assesses entities from prior recall audit
events; with none in scope, a compliant provenance run is required to
produce zero influence events. The previous test asserted the opposite,
so it could pass only when the provenance run deviated from that
contract, which made the test highly flaky.

Drop session 3 (provenance) and the associated influence-event
assertions. The test now validates only the learn + recall evidence
Bob can actually produce: guideline files in .evolve/entities/ and
references to those guidelines in session 2's saved trajectory.
---
 tests/e2e/test_bob_sandbox_learn_recall.py | 63 ++++++----------------
 1 file changed, 16 insertions(+), 47 deletions(-)

diff --git a/tests/e2e/test_bob_sandbox_learn_recall.py b/tests/e2e/test_bob_sandbox_learn_recall.py
index ad0e8f54..033d469c 100644
--- a/tests/e2e/test_bob_sandbox_learn_recall.py
+++ b/tests/e2e/test_bob_sandbox_learn_recall.py
@@ -1,12 +1,18 @@
 """End-to-end test of the evolve-lite learn + recall flow in the Bob sandbox.
 
-Runs three sequential Bob CLI sessions against the Dockerized Bob sandbox:
+Runs two sequential Bob CLI sessions against the Dockerized Bob sandbox:
   1. Session 1 performs an EXIF task, then explicitly invokes the evolve-lite
      save-trajectory and learn skills so a trajectory and guideline are saved.
   2. Session 2 asks a related EXIF question. The recall skill should surface
      the guideline from session 1 before substantive work begins.
-  3. Session 3 runs the offline provenance skill so the recall audit gets
-     follow-up influence verdicts.
+
+Bob 1.0.4 has no ``UserPromptSubmit`` hook, so the recall script never gets a
+session id from the runtime and cannot emit a ``recall`` audit event the way
+the Claude/Codex tests rely on. Without a recall audit event there is nothing
+for ``evolve-lite:provenance`` to assess, so this test does not run a third
+provenance session — it validates only the learn + recall evidence that Bob
+can actually produce: guideline files in ``.evolve/entities/`` and references
+to those guidelines in session 2's saved trajectory.
 
 Requires Docker, the ``evolve-bob-sandbox`` image built, and a persisted Bob
 SSO auth state on the host (created by ``just bob-auth``). The test mounts
@@ -14,7 +20,6 @@
 Bob's encrypted file storage decrypts across runs.
 """
 
-import json
 import logging
 import os
 import shutil
@@ -115,16 +120,9 @@ def _run_bob_prompt(
     return subprocess.run(cmd, capture_output=True, text=True, timeout=SESSION_TIMEOUT_SECONDS)
 
 
-def _audit_events(evolve_dir: Path) -> list[dict]:
-    audit_log = evolve_dir / "audit.log"
-    if not audit_log.is_file():
-        return []
-    return [json.loads(line) for line in audit_log.read_text().splitlines() if line.strip()]
-
-
 @pytest.mark.e2e
 def test_bob_learn_then_recall_flow(bob_sandbox_ready, bob_workspace):
-    """Session 1 learns, session 2 recalls, session 3 records influence."""
+    """Session 1 learns, session 2 recalls."""
     bob_home = bob_sandbox_ready
     evolve_dir = bob_workspace / ".evolve"
 
@@ -176,43 +174,14 @@ def test_bob_learn_then_recall_flow(bob_sandbox_ready, bob_workspace):
     assert session2_trajectories, f"no Bob trajectory saved for session 2 in {trajectories_dir}"
     session2_trajectory = max(session2_trajectories, key=lambda p: p.stat().st_mtime)
 
-    # Bob has no UserPromptSubmit hook, so the recall skill cannot emit a
-    # recall audit event the way the codex/claude tests do. Verify recall
-    # influence indirectly: session 2's saved trajectory should reference
-    # one of the guideline files (or its key content) from session 1.
+    # Bob 1.0.4 has no UserPromptSubmit hook, so the recall script never
+    # gets a session id from the runtime and cannot emit a recall audit
+    # event. Without a recall audit there is nothing for evolve-lite:provenance
+    # to assess, so this test stops at the indirect evidence Bob can produce:
+    # session 2's saved trajectory should reference one of the guideline
+    # files learned in session 1.
     learned_ids = {str(path.relative_to(entities_dir).with_suffix("")) for path in entity_files}
     session2_text = session2_trajectory.read_text(encoding="utf-8")
     assert any(eid.split("/")[-1] in session2_text for eid in learned_ids), (
         f"session 2 trajectory did not reference any guideline filename from {learned_ids}"
     )
-
-    log.info("bob session 3: running offline provenance analysis...")
-    t2 = time.time()
-    session2_id = session2_trajectory.stem
-    result3 = _run_bob_prompt(
-        bob_workspace,
-        bob_home,
-        (
-            "Run the evolve-lite provenance skill now. Analyze the saved trajectories in "
-            ".evolve/trajectories/. Record influence verdicts for the guidelines under "
-            f".evolve/entities/guideline/ as applied (or not) in trajectory {session2_id}. "
-            "Do not modify source files."
-        ),
-    )
-    log.info(f"bob session 3: exited {result3.returncode} after {time.time() - t2:.0f}s")
-    assert result3.returncode == 0, (
-        f"session 3 exited {result3.returncode}\nstdout:\n{result3.stdout[-2000:]}\nstderr:\n{result3.stderr[-2000:]}"
-    )
-
-    events = _audit_events(evolve_dir)
-    influence_events = [event for event in events if event.get("event") == "influence"]
-    assert influence_events, f"no influence audit event recorded. all events: {events}"
-    influenced_ids = {event.get("entity") for event in influence_events}
-    assert influenced_ids & learned_ids, f"influence events {influence_events} did not assess any learned ids {learned_ids}"
-    allowed_verdicts = {"followed", "contradicted", "not_applicable"}
-    assert any(event.get("verdict") in allowed_verdicts for event in influence_events), (
-        f"no learned guideline was assessed with an allowed verdict. influence events: {influence_events}"
-    )
-    for event in influence_events:
-        assert event.get("verdict") in allowed_verdicts
-        assert event.get("evidence"), f"influence event missing evidence: {event}"

From b77107d565a9dd953aea0bf3d9fddcba60460590 Mon Sep 17 00:00:00 2001
From: Vinod Muthusamy <vinodmut@users.noreply.github.com>
Date: Wed, 10 Jun 2026 14:18:58 -0500
Subject: [PATCH 5/5] fix(e2e): scan all session-2 trajectories with Path.stem
 matching

Addresses CodeRabbit review finding: Make the recall-evidence
assertion robust across all session-2 trajectories and path separators

Iterate over the full session2_trajectories set instead of only the
newest, and use Path.stem from entity_files for matching instead of
splitting the entity id string on "/". Both changes make the assertion
robust to path separator differences and to bob writing more than one
trajectory in a single session.
---
 tests/e2e/test_bob_sandbox_learn_recall.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/tests/e2e/test_bob_sandbox_learn_recall.py b/tests/e2e/test_bob_sandbox_learn_recall.py
index 033d469c..c8ca56f0 100644
--- a/tests/e2e/test_bob_sandbox_learn_recall.py
+++ b/tests/e2e/test_bob_sandbox_learn_recall.py
@@ -172,16 +172,15 @@ def test_bob_learn_then_recall_flow(bob_sandbox_ready, bob_workspace):
 
     session2_trajectories = (set(trajectories_dir.glob("*.json")) | set(trajectories_dir.glob("*.jsonl"))) - set(trajectories)
     assert session2_trajectories, f"no Bob trajectory saved for session 2 in {trajectories_dir}"
-    session2_trajectory = max(session2_trajectories, key=lambda p: p.stat().st_mtime)
 
     # Bob 1.0.4 has no UserPromptSubmit hook, so the recall script never
     # gets a session id from the runtime and cannot emit a recall audit
     # event. Without a recall audit there is nothing for evolve-lite:provenance
     # to assess, so this test stops at the indirect evidence Bob can produce:
-    # session 2's saved trajectory should reference one of the guideline
-    # files learned in session 1.
-    learned_ids = {str(path.relative_to(entities_dir).with_suffix("")) for path in entity_files}
-    session2_text = session2_trajectory.read_text(encoding="utf-8")
-    assert any(eid.split("/")[-1] in session2_text for eid in learned_ids), (
-        f"session 2 trajectory did not reference any guideline filename from {learned_ids}"
+    # at least one session-2 trajectory should reference a guideline file
+    # learned in session 1.
+    learned_stems = {path.stem for path in entity_files}
+    session2_texts = [path.read_text(encoding="utf-8") for path in session2_trajectories]
+    assert any(stem in text for stem in learned_stems for text in session2_texts), (
+        f"no session 2 trajectory referenced any guideline filename from {learned_stems}"
     )