posit-dev
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
Lines changed: 16 additions & 11 deletions
diff --git a/AGENTS.md b/AGENTS.md
Lines changed: 15 additions & 5 deletions
diff --git a/docs/architecture.md b/docs/architecture.md
Lines changed: 9 additions & 1 deletion
diff --git a/docs/debugging.md b/docs/debugging.md
Lines changed: 1 addition & 1 deletion
diff --git a/docs/plans/active/public-api-runner.md b/docs/plans/active/public-api-runner.md
Lines changed: 67 additions & 0 deletions
diff --git a/docs/testing.md b/docs/testing.md
Lines changed: 104 additions & 7 deletions
diff --git a/src/server.rs b/src/server.rs
Lines changed: 0 additions & 1 deletion
@@ -78,16 +78,17 @@ jobs:
       - name: cargo build
         run: cargo build
 
-      - name: cargo clippy
-        run: cargo clippy --all-targets --all-features -- -D warnings
-
-      - name: cargo test
+      - name: Python public API suite
         if: matrix.os != 'windows-2022'
-        run: cargo test
+        run: python3 tests/run_integration_tests.py --binary target/debug/mcp-repl
 
-      - name: cargo test (windows serial)
+      - name: Python public API suite (windows)
         if: matrix.os == 'windows-2022'
-        run: cargo test -j 1 -- --test-threads=1
+        shell: pwsh
+        run: python tests/run_integration_tests.py --binary target/debug/mcp-repl.exe
+
+      - name: cargo clippy
+        run: cargo clippy --all-targets --all-features -- -D warnings
 
       - name: Install Codex CLI
         if: matrix.os != 'windows-2022'
@@ -110,13 +111,17 @@ jobs:
           $env:PATH = "$npmPrefix;$env:PATH"
           & (Join-Path $npmPrefix "codex.cmd") --version
 
-      - name: cargo test (real codex integrations)
+      - name: cargo test
         if: matrix.os != 'windows-2022'
-        run: cargo test -j 1 --test codex_approvals_tui -- --test-threads=1
+        env:
+          MCP_REPL_CODEX_BACKEND: mock
+        run: cargo test --quiet
 
-      - name: cargo test (real codex integrations, windows serial)
+      - name: cargo test (windows serial)
         if: matrix.os == 'windows-2022'
-        run: cargo test -j 1 --test codex_approvals_tui -- --test-threads=1
+        env:
+          MCP_REPL_CODEX_BACKEND: mock
+        run: cargo test -j 1 --quiet -- --test-threads=1
 
       - name: cargo +nightly fmt
         run: cargo +nightly fmt --all -- --check
 
@@ -7,19 +7,26 @@ Keep this file short. It is a table of contents, not the full manual.
 - If you modified code, run all required checks before replying:
   - `cargo check`
   - `cargo build`
+  - `python3 tests/run_integration_tests.py --binary target/debug/mcp-repl`
   - `cargo clippy --all-targets --all-features -- -D warnings`
-  - `cargo test`
+  - `cargo test --quiet`
   - `cargo +nightly fmt`
+- For docs-only changes, run the narrow docs validation that covers the edited
+  files, usually `cargo test --test docs_contracts`.
+- When changing Codex backend selection or CI real-client wiring, also run:
+  - `MCP_REPL_CODEX_BACKEND=mock cargo test -j 1 --test codex_integration codex_exec_auto_backend_smoke -- --test-threads=1`
 - Treat all clippy warnings as failures. Do not leave warning cleanup for later.
 - Never pass `--vanilla` to `R` or `Rscript` unless the user explicitly asks for it.
 
 ## Start Here
 
 - `docs/index.md`: source-of-truth map for repository docs.
-- `docs/architecture.md`: subsystem map for the binary, worker, sandbox, and eval surfaces.
+- `docs/architecture.md`: subsystem map for the CLI, server, worker, sandbox, output, and validation surfaces.
 - `docs/testing.md`: public verification surface and snapshot workflow.
 - `docs/debugging.md`: debug logs, `--debug-repl`, and stdio tracing.
 - `docs/sandbox.md`: sandbox modes and writable-root policy.
+- `docs/output_timeline.md`: visible output ordering across sideband and raw streams.
+- `docs/worker_sideband_protocol.md`: current server/worker IPC contract.
 - `docs/plans/AGENTS.md`: when to create checked-in execution plans.
 
 ## Glossary
@@ -41,8 +48,8 @@ Keep this file short. It is a table of contents, not the full manual.
 - Sandbox metadata: Codex per-tool-call `_meta["codex/sandbox-state-meta"]` used by `--sandbox inherit` to choose the effective worker sandbox for that call.
 - Writable root: An absolute path that a `workspace-write` worker may write, subject to forced read-only subpaths like `.git`, `.codex`, and `.agents`.
 - Session temp directory: The server-allocated per-session temp path exposed to the worker as `TMPDIR` and `MCP_REPL_R_SESSION_TMPDIR`.
-- Sideband IPC: The JSON-lines server/worker pipe for structural facts such as `readline_start`, `readline_result`, `plot_image`, `request_end`, and `session_end`.
-- stdout/stderr pipes: The normal process output streams captured by the server. They are the authoritative visible text source; sideband only helps interpret them.
+- Sideband IPC: The JSON-lines server/worker pipe for structural facts such as `readline_start`, `readline_input`, `readline_discard`, `output_text`, `plot_image`, and `session_end`.
+- Raw output capture: The stdout/stderr pipes or PTY stream captured by the server for unowned visible text. Sideband carries worker-owned text and structural facts.
 - Output timeline: The server-side reconstruction of visible output order from captured stdout/stderr plus sideband facts.
 - Server-owned: State, files, or notices created and retained by the main server process, not by the runtime or the worker. Use this for output bundles, response finalization, debug logs, and server temp roots.
 - Worker-originated text: Text that came from the worker REPL or worker child processes and can be written to `transcript.txt`.
@@ -60,7 +67,10 @@ Keep this file short. It is a table of contents, not the full manual.
   - `cargo insta test`
   - `cargo insta pending-snapshots`
   - `cargo insta review` or `cargo insta accept` / `cargo insta reject`
-- CI-style validation: `cargo insta test --check --unreferenced=reject`
+- CI-style validation: `cargo insta test --check`
+- Do not add `--unreferenced=reject` to the general snapshot check; this
+  repository keeps valid platform-specific snapshots that are unreferenced on
+  other platforms.
 - For broad intentional snapshot migrations: `cargo insta test --force-update-snapshots --accept`
 - Do not delete `tests/snapshots/*.snap.new` manually. Use `cargo insta reject`.
 
 
@@ -63,7 +63,15 @@ The repository is organized around a few concrete subsystems rather than deep pa
 
 ### Validation harnesses
 
-- `tests/` is the primary public validation surface. The tests exercise tool behavior, snapshots, sandboxing, and client integrations through the exposed MCP interface.
+- `tests/run_integration_tests.py` starts an already-built `mcp-repl` binary and
+  exercises public MCP tools over stdio. It covers representative real-binary
+  behavior that should not depend on Rust internals.
+- `tests/` contains the Rust public API, snapshot, sandbox, backend, install,
+  protocol-worker, and client-integration suites. Most tests exercise behavior
+  through the exposed MCP interface using the shared harness in `tests/common/`.
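+- CI uses Cargo's standard Rust test runner after installing the real Codex CLI,
+  with the Codex backend forced to the mocked provider
+  (`MCP_REPL_CODEX_BACKEND=mock`). The tests should not depend on special local
+  scheduling; the Windows CI job still runs them serially with `-j 1` and
+  `--test-threads=1`.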
 
 ## Design Constraints
 
 
@@ -84,7 +84,7 @@ Useful environment variables:
 
 ## External wire trace proxy
 
-The built-in event log only sees what reaches `mcp-repl` after startup. If you need the exact stdio traffic between an MCP client and the server, use the external proxy in [scripts/mcp-stdio-trace.py](/Users/tomasz/github/t-kalinowski/mcp-repl/scripts/mcp-stdio-trace.py).
+The built-in event log only sees what reaches `mcp-repl` after startup. If you need the exact stdio traffic between an MCP client and the server, use the external proxy in [scripts/mcp-stdio-trace.py](../scripts/mcp-stdio-trace.py).
 
 What it does:
 
 
@@ -0,0 +1,67 @@
+# External Public API Runner
+
+## Summary
+
+- Move public MCP behavior checks, including sandbox-visible real-binary behavior, toward an external Python runner that starts a built `mcp-repl` binary over stdio.
+- Keep Rust tests for unit contracts, snapshot normalization, protocol-worker conformance, platform-specific mechanics, and behavior that is not yet covered externally.
+
+## Status
+
+- State: active
+- Last updated: 2026-05-18
+- Current phase: implementation
+
+## Current Direction
+
+- Grow the minimal Python runner with small, real-client scenarios that speak MCP directly with newline-delimited JSON-RPC.
+- Treat sandboxing as product behavior for the external suite. The test runner process is outside the sandbox, but each case starts a built `mcp-repl` binary with an explicit sandbox state and verifies the worker is launched inside that policy through public MCP calls.
+- Reintroduce sandbox coverage in the Python runner now, starting with the default `workspace-write` behavior and then adding read-only or full-access contrasts where they prove public behavior.
+- Keep each migrated case focused enough that matching Rust integration coverage can be removed or reduced in the same change.
+- Use `danger-full-access` only for individual external cases whose purpose is unrelated to sandboxing and where disabling sandbox enforcement does not hide the product behavior under test.
+- Keep existing Rust tests discoverable by `cargo test` until their scenario is migrated or removed in the same change that adds equivalent Python coverage.
+
+## Long-Term Direction
+
+- Migrate representative public API integration scenarios out of Rust when the Python runner covers the same real-binary behavior, including sandbox behavior that is observable through public MCP tool calls.
+- Keep protocol-worker conformance tests, Rust-only contract tests, and deeply platform-specific sandbox launch mechanics in Rust unless there is a clearer public external scenario for the same contract.
+
+## Phase Status
+
+- Phase 0: completed - add the runner shell and first R console smoke case.
+- Phase 1: completed - migrate another small real-client scenario with timeout or busy-worker behavior.
+- Phase 2: completed - run the external suite in CI after the debug binary is built.
+- Phase 3: pending - reintroduce sandbox scenarios in the Python runner and continue migrating duplicate real-binary Rust integration coverage case by case.
+
+## Locked Decisions
+
+- The external suite must accept a prebuilt binary path instead of building the binary itself.
+- The runner should call MCP tools over stdio and avoid internal Rust helpers.
+- CI runs the external suite as its own step after `cargo build` on each matrix target.
+- Do not opt Rust test targets out of Cargo discovery in anticipation of future migration work.
+
+## Open Questions
+
+- Which sandbox scenarios have public external equivalents and which should remain Rust-only launch or platform-mechanics coverage.
+- Which additional public scenarios should migrate into the external suite before the parent migration is complete.
+
+## Next Safe Slice
+
+- Add a Python-runner sandbox case that starts the binary under `workspace-write`, proves an in-workspace write succeeds, and proves an out-of-policy write is blocked through the public `repl` tool.
+- In the same or next small slice, migrate another representative real-binary Rust integration scenario to the Python runner and remove or reduce only the matching Rust coverage.
+
+## Stop Conditions
+
+- Stop if a migrated scenario requires internal server state inspection instead of public MCP requests.
+- Stop if runner behavior needs platform-specific process supervision beyond the simple stdio client.
+
+## Decision Log
+
+- 2026-05-17: Chose a narrow first slice with one R `repl` smoke case to prove the runner can initialize the real binary and call public tools before moving more complex scenarios.
+- 2026-05-17: Added an R timeout/busy/recovery case to the external runner and removed the matching Rust snapshot smoke test.
+- 2026-05-17: Added an R `repl_reset` state-clearing case to the external runner and removed the duplicate Rust public surface test.
+- 2026-05-17: Added the external public API suite to the cross-platform CI workflow as a separate post-build step.
+- 2026-05-17: Added an R interrupt/restart-prefix scenario with explicit interrupt readiness polling and removed duplicate Rust prefix tests.
+- 2026-05-17: Added files-mode output-bundle scenarios for text bundles, pruning, timeout backfill, and size-cap omission, then removed duplicate broad Rust integration coverage.
+- 2026-05-17: Removed obsolete serial scheduling after verifying the remaining Rust REPL binaries pass under normal Cargo test scheduling.
+- 2026-05-18: Reaffirmed that unmigrated Rust scenarios must remain discoverable by `cargo test`; migrations should replace Rust coverage with equivalent Python coverage in the same change, not disable tests ahead of time.
+- 2026-05-18: Clarified that the external runner itself is not sandboxed, but the spawned `mcp-repl` binary still owns the sandbox contract; the next slice should restore sandbox coverage in the Python runner starting with `workspace-write`.
@@ -5,14 +5,14 @@ This file is the entrypoint for deciding how to verify a change.
 
 ## Core Test Surface
 
-- `tests/repl_surface.rs`: basic `repl` and `repl_reset` behavior.
-- `tests/repl_surface.rs` and `tests/python_backend.rs`: IPC ownership coverage. Only the main worker may own sideband fds; user-spawned children must not. `tests/python_backend.rs` also covers detached-idle oversized-output behavior, Unix Python PTY-backed C stdio, CPython `input()` through the readline path, and the absence of direct-fd stdin shims through the public `repl` API.
-- `tests/server_smoke.rs`: end-to-end MCP session smoke coverage.
-- `tests/write_stdin_behavior.rs`: timeout polling, oversized text replies, and transcript-file behavior through the public `repl` API.
+- `tests/run_integration_tests.py`: external real-binary checks over MCP stdio, including basic R `repl`, pager command handling, files-mode output bundles, timeout/busy recovery, interrupt/restart prefixes, and `repl_reset` state clearing.
+- `tests/common/`: shared Rust MCP harness for public tool calls, transcript snapshots, sandbox assertions, and client-install fixtures.
+- `tests/repl_surface.rs`, `tests/server_smoke.rs`, `tests/mcp_transcripts.rs`, and `tests/write_stdin_*.rs`: core `repl`/`repl_reset` behavior, timeout polling, oversized text replies, transcript-file behavior, and snapshot coverage through the public tool API.
+- `tests/pager*.rs` and `tests/oversized_output_cli.rs`: pager mode, files mode, and oversized-output CLI behavior.
+- `tests/python_*.rs`, `tests/r_*.rs`, `tests/plot_images.rs`, and `tests/python_plot_images.rs`: backend-specific public behavior, help/manual surfaces, PTY-backed Python readline behavior, and image output.
 - `tests/zod_protocol.rs`: protocol-worker conformance, including PTY launch with sideband IPC kept separate from visible PTY output.
 - `tests/sandbox.rs` and `tests/sandbox_state_updates.rs`: sandbox policy behavior and Codex per-tool-call sandbox metadata.
-- `tests/plot_images.rs` and `tests/python_plot_images.rs`: plot/image behavior through the public tool surface.
-- `tests/codex_approvals_tui.rs` and `tests/claude_integration.rs`: client integration coverage.
+- `tests/install_*.rs`, `tests/codex_integration.rs`, and `tests/claude_integration.rs`: install-path and real client integration coverage.
 - `tests/docs_contracts.rs`: docs map and snapshot-facing documentation contracts.
 
 ## Snapshot Workflow
@@ -22,18 +22,115 @@ This file is the entrypoint for deciding how to verify a change.
   - `cargo insta test`
   - `cargo insta pending-snapshots`
   - `cargo insta review` or `cargo insta accept` / `cargo insta reject`
+- CI-style validation: `cargo insta test --check`
+- Do not add `--unreferenced=reject` to the general snapshot check; this
+  repository keeps valid platform-specific snapshots that are unreferenced on
+  other platforms.
 - Do not delete `tests/snapshots/*.snap.new` manually. Use `cargo insta reject`.
 
+## External Public API Suite
+
+Build the binary first, then run the Python suite (on Windows, CI invokes
+`python` with `target/debug/mcp-repl.exe` instead):
+
+```sh
+cargo build
+python3 tests/run_integration_tests.py --binary target/debug/mcp-repl
+```
+
+The runner starts the real server over MCP stdio and calls public tools only. It
+uses `--sandbox danger-full-access` by default so the suite stays focused on
+client protocol behavior rather than sandbox policy.
+
+Use `--case <name>` to run one public API case while iterating.
+
+CI runs this suite after `cargo build` in the main cross-platform workflow,
+using the debug binary built for each matrix target.
+
+## Rust Suite
+
+Use Cargo's standard Rust test runner:
+
+```sh
+cargo test
+```
+
+Plain `cargo test` is the Rust suite's single runner and remains the full Cargo
+compatibility path: it must continue to discover the binary unit tests and the
+Rust integration targets. CI passes Cargo's `--quiet` flag to keep successful
+logs compact:
+
+```sh
+cargo test --quiet
+```
+
+CI installs Codex before `cargo test` and sets `MCP_REPL_CODEX_BACKEND=mock`,
+so the Codex integration target runs through the mocked provider as part of the
+ordinary Rust suite. Windows keeps the Rust suite fully serial with `-j 1` and
+`--test-threads=1`.
+
+Do not opt Rust test targets out of Cargo discovery in anticipation of a future
+Python migration; migrate a scenario only when the Rust coverage is deleted or
+reduced in the same change that adds equivalent external coverage.
+
+## Real Client Integrations
+
+CI installs Codex before the Rust suite. The Codex CI integration does not
+require OpenAI authentication because the test config points Codex at a local
+mock provider.
+
+By default, the Codex integration uses `MCP_REPL_CODEX_BACKEND=auto`: it checks
+whether Codex is logged in, checks whether `gpt-5.3-codex-spark` is available,
+and uses that live backend when both checks pass. Otherwise it uses the mocked
+provider. Set `MCP_REPL_CODEX_BACKEND=live` or `MCP_REPL_CODEX_BACKEND=mock`
+to force one path.
+
+When changing Codex backend selection or CI real-client wiring, run the forced
+mock path explicitly:
+
+```sh
+MCP_REPL_CODEX_BACKEND=mock cargo test -j 1 --test codex_integration codex_exec_auto_backend_smoke -- --test-threads=1
+```
+
+To validate the authenticated live path directly on a machine with Spark access:
+
+```sh
+MCP_REPL_CODEX_BACKEND=live cargo test -j 1 --test codex_integration codex_exec_auto_backend_smoke -- --test-threads=1
+```
+
+Local full verification includes the Codex and Claude integration binaries when
+those clients are installed. Codex uses the Spark model
+(`gpt-5.3-codex-spark`) in its isolated test config. Claude uses `haiku`.
+If a required client binary is unavailable, the matching integration test prints
+a skip banner with the reason. Codex backend selection prints a `CODEX` banner
+showing whether the test selected live Spark or the mocked provider.
+
+To run only those integrations:
+
+```sh
+cargo test --quiet --test codex_integration --test claude_integration
+```
+
+CI runs the Codex integration target as part of `cargo test`; Claude integration
+remains local because provider authentication is unavailable in CI.
+
 ## Full Verification Before Replying
 
 If you modify code, run:
 
 - `cargo check`
 - `cargo build`
+- `python3 tests/run_integration_tests.py --binary target/debug/mcp-repl`
 - `cargo clippy --all-targets --all-features -- -D warnings`
-- `cargo test`
+- `cargo test --quiet`
 - `cargo +nightly fmt`
 
+For docs-only changes, run the narrow validation that covers the edited docs.
+For agent-facing docs, that is usually:
+
+```sh
+cargo test --test docs_contracts
+```
+
 ## Debug-Then-Validate Loop
 
 When behavior is unclear:
 
@@ -785,7 +785,6 @@ pub async fn run(
     sandbox_plan: SandboxCliPlan,
     oversized_output: OversizedOutputMode,
 ) -> Result<(), Box<dyn std::error::Error>> {
-    eprintln!("starting mcp-repl server");
     let backend = worker_launch.builtin_backend().unwrap_or(Backend::R);
     crate::event_log::log(
         "server_run_begin",