|
| 1 | +#!/usr/bin/env bash |
| 2 | +# shellcheck shell=bash |
| 3 | +# |
| 4 | +# run-env-propagation.sh — end-to-end smoke test for the SI-11 -> SI-13 |
| 5 | +# regression observed on 2026-05-18. |
| 6 | +# |
| 7 | +# Background: |
| 8 | +# The SI-11 plan-routing-filter (scripts/codex-fleet/lib/plan-routing-filter.sh) |
| 9 | +# computes a non-empty FLEET_DEFAULT_SPECIALTY at bringup time when the |
| 10 | +# priority plan's metadata.writable_roots are foreign (outside the |
| 11 | +# codex-fleet repo family). Bringup logs the value, and full-bringup.sh |
| 12 | +# spawns each codex worker via: |
| 13 | +# env ... CODEX_FLEET_SPECIALTY="$effective_specialty" \ |
| 14 | +# CODEX_FLEET_TIER=... CODEX_FLEET_AGENT_NAME=... \ |
| 15 | +# codex --dangerously-bypass-approvals-and-sandbox ... |
| 16 | +# |
| 17 | +# On 2026-05-18 ~07:36 UTC the host-Claude supervisor observed that |
| 18 | +# `printenv CODEX_FLEET_SPECIALTY` inside the spawned codex CLI returned |
| 19 | +# the empty string even though bringup's "auto-routing:" log line proved |
| 20 | +# the value had been set on the env when the codex process was started. |
| 21 | +# Without specialty, Colony's matchmaker routed workers into stale plans |
| 22 | +# whose writable-roots failed preflight — the very pathology SI-11 was |
| 23 | +# supposed to prevent. |
| 24 | +# |
| 25 | +# Role of this test: |
| 26 | +# This script reproduces the exact propagation path end-to-end. It (a) |
| 27 | +# stages a minimal fixture plan whose writable_roots are deliberately |
| 28 | +# foreign (forcing SI-11 to set FLEET_DEFAULT_SPECIALTY), (b) spins up a |
| 29 | +# single-pane codex-fleet against that fixture on an isolated tmux socket |
| 30 | +# (codex-fleet-test), (c) sends `printenv` to the spawned codex CLI, (d) |
| 31 | +# captures the pane, and (e) asserts the four CODEX_FLEET_* env vars all |
| 32 | +# print non-empty values. Any future regression where codex CLI scrubs |
| 33 | +# these vars between spawn and prompt-execution will fail this test at PR |
| 34 | +# time — before it lands and silently breaks the live supervisor. |
| 35 | +# |
| 36 | +# Required env vars asserted non-empty on the spawned codex CLI side: |
| 37 | +# CODEX_FLEET_SPECIALTY — set by SI-11 routing-filter (foreign |
| 38 | +# writable_roots -> plan_slug) |
| 39 | +# CODEX_FLEET_TIER — set by full-bringup.sh from accounts.yml |
| 40 | +# lookup (default "high") |
| 41 | +# CODEX_FLEET_AGENT_NAME — set by full-bringup.sh as "codex-$id" |
| 42 | +# CODEX_FLEET_WORKER_CWD — set by the worker-prompt boot step (after |
| 43 | +# SI-17 lands, sourced from the staged env |
| 44 | +# file under /tmp/codex-fleet/<agent>/env) |
| 45 | +# |
| 46 | +# CI safety: |
| 47 | +# If cap-probe / account staging cannot find any healthy codex account |
| 48 | +# (CI runner without a logged-in codex CLI, or all accounts capped) the |
| 49 | +# test prints "[SKIP] no healthy codex accounts" and exits 0. Credential |
| 50 | +# issues must not red-CI the env-propagation lane. |
| 51 | +# |
| 52 | +# Usage: |
| 53 | +# bash scripts/codex-fleet/test/run-env-propagation.sh |
| 54 | + |
# Shell options: referencing an unset variable is an error (-u) and a
# pipeline fails when any of its stages fails (pipefail). errexit (-e) is
# not enabled here.
set -u
set -o pipefail

# Directory layout, derived from the location of this script itself.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
FLEET_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
REPO_ROOT="$(cd "${FLEET_DIR}/../.." && pwd)"
BRINGUP="${FLEET_DIR}/full-bringup.sh"

# Fixture plan: where it is stored, the slug it is staged under, the
# writable root it uses, and where it gets copied for the test run.
FIXTURE_SRC="${SCRIPT_DIR}/env-prop-fixture/plan.json"
FIXTURE_SLUG="env-prop-fixture-test"
FIXTURE_ROOT="/tmp/env-prop-test"
PLAN_DIR_DEST="${REPO_ROOT}/openspec/plans/${FIXTURE_SLUG}"

# tmux socket and session names used by this test.
TEST_SOCKET="codex-fleet-test"
TEST_SESSION="codex-fleet-test"
| 68 | + |
# Shared emitter for the tagged, colorized status lines below.
#   $1 - ANSI color code (36 cyan, 33 yellow, 31 red)
#   $2 - text placed right after the "[env-prop-test]" tag (may be empty)
#   $3… - message words, joined the same way "$*" joins them
_env_prop_emit() {
  local color=$1 tag_suffix=$2
  shift 2
  printf '\033[%sm[env-prop-test]%s\033[0m %s\n' "$color" "$tag_suffix" "$*"
}

# Informational message (cyan tag) on stdout.
log() {
  _env_prop_emit 36 '' "$@"
}

# Warning message (yellow tag) on stdout.
warn() {
  _env_prop_emit 33 '' "$@"
}

# Failure message (red "FAIL:" tag) on stderr. Does not exit; callers decide.
fail() {
  _env_prop_emit 31 ' FAIL:' "$@" >&2
}

# Skip message (yellow "SKIP:" tag) on stdout, then exit 0.
skip() {
  _env_prop_emit 33 ' SKIP:' "$@"
  exit 0
}
| 73 | + |
# ---- Preflight ----------------------------------------------------------
# Hard requirements: the bringup script and the fixture plan must both exist.
if [ ! -f "$BRINGUP" ]; then
  fail "missing $BRINGUP"
  exit 1
fi
if [ ! -f "$FIXTURE_SRC" ]; then
  fail "missing fixture plan at $FIXTURE_SRC"
  exit 1
fi
# Soft requirement: without tmux there is nothing to drive, so skip (exit 0)
# rather than fail.
if ! command -v tmux >/dev/null 2>&1; then
  skip "tmux not on PATH"
fi
| 78 | + |
# ---- Teardown -----------------------------------------------------------
# Remember whether a plan dir with the fixture's name existed before this
# test touched anything, so cleanup only removes a directory we staged.
plan_dir_was_present=0
[ -d "$PLAN_DIR_DEST" ] && plan_dir_was_present=1

# Daemons launched by full-bringup.sh (fleet-tick / cap-swap / supervisor /
# plan-watcher / auto-reviewer) that may have spawned outside the tmux server.
fleet_daemon_pattern='fleet-tick-daemon\.sh|plan-watcher\.sh|cap-swap-daemon\.sh|claude-supervisor\.sh|auto-reviewer\.sh'

# Snapshot the matching PIDs that are ALREADY running, one per line. A bare
# `pkill -f <script-name>` would also take down the daemons of any other
# fleet running on this host; cleanup only kills PIDs absent from this list.
preexisting_daemon_pids="$(pgrep -f "$fleet_daemon_pattern" 2>/dev/null || true)"

#######################################
# Tear down everything this test may have started, then exit with the status
# the script was exiting with when the handler fired.
# Globals:   TEST_SOCKET, PLAN_DIR_DEST, FIXTURE_ROOT, plan_dir_was_present,
#            FIXTURE_ROOT_CREATED, fleet_daemon_pattern,
#            preexisting_daemon_pids (all read)
# Arguments: none
# Returns:   does not return; exits with the status seen on entry
#######################################
cleanup() {
  local rc=$?   # must be the first statement: captures the pending exit status
  local pid
  set +e
  # Kill the isolated tmux server (covers both the fleet session and the
  # sibling ticker session). Safe to run even if no server was ever started.
  tmux -L "$TEST_SOCKET" kill-server 2>/dev/null
  # Best-effort kill of leftover daemons, skipping any that were already
  # running before this test started.
  while IFS= read -r pid; do
    [ -n "$pid" ] || continue
    case $'\n'"$preexisting_daemon_pids"$'\n' in
      *$'\n'"$pid"$'\n'*) continue ;;
    esac
    kill "$pid" 2>/dev/null
  done < <(pgrep -f "$fleet_daemon_pattern" 2>/dev/null)
  # Remove the fixture plan if we placed it (don't nuke an operator's
  # pre-existing plan dir of the same name).
  if [ "$plan_dir_was_present" = "0" ] && [ -d "$PLAN_DIR_DEST" ]; then
    rm -rf -- "$PLAN_DIR_DEST"
  fi
  # Remove the fixture writable root (only if we created it).
  if [ "${FIXTURE_ROOT_CREATED:-0}" = "1" ] && [ -d "$FIXTURE_ROOT" ]; then
    rm -rf -- "$FIXTURE_ROOT"
  fi
  exit "$rc"
}

# Run cleanup exactly once, from the EXIT trap. Signals are turned into a
# plain exit with the conventional 128+signal status so an interrupted run
# neither cleans up twice nor reports success.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM
| 111 | + |
# ---- Refuse to clobber a live fleet -------------------------------------
# If the test session already exists on the test socket, something else owns
# it and we must leave it untouched.
if tmux -L "$TEST_SOCKET" has-session -t "$TEST_SESSION" 2>/dev/null; then
  fail "tmux session '$TEST_SESSION' already exists on socket '$TEST_SOCKET'; aborting to avoid clobbering a live fleet"
  # Disarm the exit handlers before bailing out. cleanup() kills the tmux
  # server on this socket, which would destroy the very session this guard
  # exists to protect. Nothing has been staged yet, so there is nothing of
  # ours to clean up.
  trap - EXIT INT TERM
  exit 1
fi
| 117 | + |
# ---- Stage the fixture writable root ------------------------------------
# Create the directory when it is absent and record that we did, so cleanup
# only deletes a directory this script made.
if [ -d "$FIXTURE_ROOT" ]; then
  FIXTURE_ROOT_CREATED=0
else
  FIXTURE_ROOT_CREATED=0
  mkdir -p "$FIXTURE_ROOT"
  FIXTURE_ROOT_CREATED=1
fi

# Whether pre-existing or freshly created, the root has to be writable.
if [ ! -w "$FIXTURE_ROOT" ]; then
  fail "fixture writable root not writable: $FIXTURE_ROOT"
  exit 1
fi
log "fixture writable root: $FIXTURE_ROOT (created=$FIXTURE_ROOT_CREATED)"
| 126 | + |
# ---- Stage the fixture plan into openspec/plans/ ------------------------
# full-bringup.sh resolves the priority plan by openspec/plans/<slug>/plan.json
# in the repo root. We copy the fixture in for the duration of the test and
# remove it on cleanup (unless an identically named dir was already there).
if [ "$plan_dir_was_present" = "0" ]; then
  # Check both steps explicitly: errexit is not enabled at this point, so an
  # unchecked failure would fall through and log "staged" for a plan that
  # was never copied.
  if ! mkdir -p "$PLAN_DIR_DEST" || ! cp "$FIXTURE_SRC" "$PLAN_DIR_DEST/plan.json"; then
    fail "could not stage fixture plan at $PLAN_DIR_DEST/plan.json"
    exit 1
  fi
  log "staged fixture plan at $PLAN_DIR_DEST/plan.json"
else
  warn "plan dir already exists at $PLAN_DIR_DEST; leaving operator copy in place"
fi
| 138 | + |
# ---- Bring up a single-pane fleet on the isolated socket ---------------
log "running full-bringup.sh --plan-slug $FIXTURE_SLUG --n 1 --no-attach (socket=$TEST_SOCKET)"
bringup_log="$(mktemp)" || { fail "mktemp failed; cannot capture bringup output"; exit 1; }

#######################################
# EXIT handler used from here on: delete the bringup log, then hand over to
# cleanup() with the script's real exit status.
# cleanup() reads $? on entry, so running `rm` directly before it would make
# it see rm's status (0) and turn every later `exit 1` into a pass.
# Globals:   bringup_log (read)
# Returns:   does not return; cleanup() exits
#######################################
cleanup_with_bringup_log() {
  local rc=$?
  rm -f -- "$bringup_log"
  (exit "$rc")   # restore the original status into $? for cleanup()
  cleanup
}
trap cleanup_with_bringup_log EXIT
# Signals become a plain exit (128+signal) so the EXIT handler runs once.
trap 'exit 130' INT
trap 'exit 143' TERM

# Capture the bringup status without toggling errexit: the script starts
# without -e, and a `set +e … set -e` pair would leave it switched on for
# everything that follows.
bringup_rc=0
CODEX_FLEET_TMUX_SOCKET="$TEST_SOCKET" \
  SESSION="$TEST_SESSION" \
  TICKER_SESSION="fleet-ticker-test" \
  FLEET_STATE_DIR="/tmp/claude-viz/fleet-env-prop-test" \
  bash "$BRINGUP" --plan-slug "$FIXTURE_SLUG" --n 1 --no-attach \
  > "$bringup_log" 2>&1 || bringup_rc=$?

# Surface bringup output for CI debugging.
sed 's/^/ [bringup] /' "$bringup_log"

if [ "$bringup_rc" -ne 0 ]; then
  # Distinguish credential failure (skip) from real failure.
  if grep -qE "no candidate accounts found|no healthy accounts" "$bringup_log"; then
    skip "no healthy codex accounts (cap-probe/agent-auth could not stage an account); skipping env-propagation assertion"
  fi
  fail "full-bringup.sh exited rc=$bringup_rc; see bringup output above"
  exit 1
fi
| 166 | + |
# ---- Locate the worker pane --------------------------------------------
# full-bringup.sh creates an `overview` window with N worker panes (plus a
# header pane marked '[codex-fleet-tab-strip]'). For N=1 there is exactly
# one worker pane carrying '@panel = [codex-<id>]'.
log "locating worker pane (overview window)"
worker_pane=""
# Poll up to 20 times, 0.5s apart (10s total).
for ((attempt = 1; attempt <= 20; attempt++)); do
  # `|| true` keeps the retry loop alive: with pipefail, a tmux call that
  # fails because the window is not there yet makes the whole assignment
  # fail, which would abort the script on the first attempt under errexit.
  worker_pane=$(tmux -L "$TEST_SOCKET" list-panes -t "$TEST_SESSION:overview" \
      -F '#{@panel}|#{pane_id}' 2>/dev/null \
    | awk -F'|' '$1 != "[codex-fleet-tab-strip]" && $1 != "" { print $2; exit }') || true
  [ -n "$worker_pane" ] && break
  sleep 0.5
done

if [ -z "$worker_pane" ]; then
  fail "could not find a worker pane on $TEST_SESSION:overview after 10s"
  # Best-effort diagnostic listing; its own failure must not mask the exit.
  tmux -L "$TEST_SOCKET" list-panes -t "$TEST_SESSION:overview" \
      -F 'pane=#{pane_id} panel=#{@panel}' 2>&1 | sed 's/^/ /' || true
  exit 1
fi
log "worker pane id = $worker_pane"
| 188 | + |
# ---- Wait up to 60s for the codex CLI prompt ('›') to appear ----------
log "waiting up to 60s for codex CLI prompt (looking for '›' marker)"
prompt_seen=0
for ((i = 1; i <= 60; i++)); do
  pane_dump=$(tmux -L "$TEST_SOCKET" capture-pane -t "$worker_pane" -p 2>/dev/null || true)
  # codex CLI's interactive prompt uses the U+203A SINGLE RIGHT-POINTING
  # ANGLE QUOTATION MARK ('›'). Only that character is matched, anywhere in
  # the pane; there is no ASCII '>' fallback.
  # The here-string avoids a `printf | grep -q` pipeline, which under
  # pipefail can report failure when grep exits before the writer finishes.
  if grep -q '›' <<<"$pane_dump"; then
    prompt_seen=1
    log "codex prompt visible after ${i}s"
    break
  fi
  sleep 1
done

if [ "$prompt_seen" -ne 1 ]; then
  fail "codex CLI prompt never appeared in pane $worker_pane within 60s"
  fail "pane capture follows:"
  # Best-effort diagnostic dump; its own failure must not mask the exit.
  tmux -L "$TEST_SOCKET" capture-pane -t "$worker_pane" -p 2>&1 | sed 's/^/ /' || true
  exit 1
fi
| 211 | + |
# ---- Send printenv into the codex CLI ----------------------------------
# One printenv invocation naming all four variables under test.
PRINTENV_CMD='printenv CODEX_FLEET_SPECIALTY CODEX_FLEET_TIER CODEX_FLEET_AGENT_NAME CODEX_FLEET_WORKER_CWD'
log "send-keys: $PRINTENV_CMD"
tmux -L "$TEST_SOCKET" send-keys -t "$worker_pane" "$PRINTENV_CMD" Enter

# ---- Wait for output, then capture --------------------------------------
# Fixed pause so the command's output is on screen before the snapshot.
sleep 5
captured="$(tmux -L "$TEST_SOCKET" capture-pane -t "$worker_pane" -p)"

# Echo the tail of the snapshot into the test log for debugging.
log "captured pane snapshot (last 40 lines):"
tail -n 40 <<<"$captured" | sed 's/^/ /'
| 222 | + |
# ---- Assert each var is non-empty in the captured output ----------------
# `printenv VAR` prints either the value followed by a newline, or nothing
# at all (and exits non-zero) when the var is unset. The multi-arg form
# prints the values without labels, so individual variables cannot be told
# apart. The check is therefore a heuristic: find the echoed command line
# in the capture, then require at least four non-empty, non-prompt lines
# after it.
fail_count=0
# Names of the variables under test; used in the failure message only.
assertions=(
  "CODEX_FLEET_SPECIALTY"
  "CODEX_FLEET_TIER"
  "CODEX_FLEET_AGENT_NAME"
  "CODEX_FLEET_WORKER_CWD"
)

# Pull only the non-empty lines that follow the last printenv echo in the
# capture. This trims older noise (codex banner, wake-prompt, etc).
# If the echo is not found at all, emit nothing: treating the whole pane as
# "output" would let unrelated banner lines satisfy the count below.
post_cmd=$(printf '%s\n' "$captured" \
  | awk -v cmd="$PRINTENV_CMD" '
      index($0, cmd) { last = NR; next }
      { lines[NR] = $0 }
      END {
        if (!last) exit
        for (i = last + 1; i <= NR; i++) {
          if (lines[i] != "") print lines[i]
        }
      }') || true

if [ -z "$post_cmd" ]; then
  fail "no output captured after sending printenv; codex CLI may not have executed the command"
  fail "full pane capture follows:"
  printf '%s\n' "$captured" | sed 's/^/ /'
  exit 1
fi

# Drop bare prompt lines and blank lines; keep at most 20 candidates.
# `|| true`: grep exits 1 when it selects nothing, which with pipefail fails
# the assignment and would abort under errexit before the diagnostic below
# is printed.
value_lines=$(printf '%s\n' "$post_cmd" \
  | grep -v -E '^[[:space:]]*[›>][[:space:]]*$' \
  | grep -vE '^[[:space:]]*$' \
  | head -n 20) || true
log "value-line region:"
printf '%s\n' "$value_lines" | sed 's/^/ >> /'

# printenv writes nothing for an unset var, so a missing var shows up as
# fewer than four lines of output.
nonempty_count=$(printf '%s\n' "$value_lines" | grep -cE '^.+$' || true)
if [ "$nonempty_count" -lt 4 ]; then
  fail "expected at least 4 non-empty value lines from printenv (one per var), got $nonempty_count"
  fail "one or more of ${assertions[*]} is unset in the spawned codex CLI"
  fail "this is the SI-11 -> SI-13 regression: bringup-time env did not propagate to the codex process"
  fail_count=$((fail_count + 1))
fi
| 275 | + |
# Cross-check against the bringup log: it must contain an "auto-routing:"
# line whose quoted CODEX_FLEET_SPECIALTY default is non-empty. Without it
# the SI-11 routing-filter may not have fired for the fixture plan.
grep -qE "auto-routing: CODEX_FLEET_SPECIALTY default = '[^']+'" "$bringup_log" || {
  fail "bringup did not log a non-empty CODEX_FLEET_SPECIALTY default; SI-11 routing-filter may not have fired for the fixture plan"
  fail_count=$((fail_count + 1))
}

# Verdict: any recorded failure fails the run.
if (( fail_count > 0 )); then
  fail "env-propagation assertions failed (count=$fail_count); see SI-11 -> SI-13 regression notes in this script's header"
  exit 1
fi

log "OK: all four CODEX_FLEET_* env vars propagated to the spawned codex CLI"
exit 0
0 commit comments