From 20d128b4bef9037dd6f2e12f83a8b88f3c2d5bd2 Mon Sep 17 00:00:00 2001 From: Aaron Erickson Date: Thu, 30 Apr 2026 12:30:15 -0700 Subject: [PATCH 01/14] fix(recovery): add connect probe recovery path Signed-off-by: Aaron Erickson --- src/lib/agent-runtime.test.ts | 72 +++- src/lib/agent-runtime.ts | 98 ++++- src/lib/command-registry.ts | 2 +- src/nemoclaw.ts | 221 +++++++---- test/cli.test.ts | 360 +++++++++++++++++- .../test-issue-2478-crash-loop-recovery.sh | 56 ++- 6 files changed, 717 insertions(+), 92 deletions(-) diff --git a/src/lib/agent-runtime.test.ts b/src/lib/agent-runtime.test.ts index dedbee930f..a1f96df2b2 100644 --- a/src/lib/agent-runtime.test.ts +++ b/src/lib/agent-runtime.test.ts @@ -3,7 +3,7 @@ import { describe, it, expect } from "vitest"; // Import from compiled dist/ so coverage is attributed correctly. -import { buildRecoveryScript } from "../../dist/lib/agent-runtime"; +import { buildOpenClawRecoveryScript, buildRecoveryScript } from "../../dist/lib/agent-runtime"; import type { AgentDefinition } from "./agent-defs"; function makeAgent(overrides: Partial = {}): AgentDefinition { @@ -60,13 +60,13 @@ describe("buildRecoveryScript", () => { it("launches the default gateway command through the validated agent binary", () => { const script = buildRecoveryScript(minimalAgent, 19000); expect(script).toContain("command -v 'test-agent'"); - expect(script).toContain('nohup "$AGENT_BIN" gateway run --port 19000'); + expect(script).toContain('"$AGENT_BIN" gateway run --port 19000'); }); it("falls back to openclaw gateway run when gateway_command is absent", () => { const agent = makeAgent({ gateway_command: undefined }); const script = buildRecoveryScript(agent, 19000); - expect(script).toContain('nohup "$AGENT_BIN" gateway run --port 19000'); + expect(script).toContain('"$AGENT_BIN" gateway run --port 19000'); }); it("validates and launches custom gateway commands explicitly", () => { @@ -127,11 +127,13 @@ describe("buildRecoveryScript", () => { // anyone debugging a crash-loop. (#2478) expect(script).toContain('echo "$_W" >> /tmp/gateway.log'); // And the warning must be deferred until AFTER gateway.log is - // freshly touched/chmod'd, otherwise the redirect targets a stale - // file that gets removed seconds later. - const touchIdx = script!.indexOf("touch /tmp/gateway.log"); + // safely opened with O_NOFOLLOW, otherwise the redirect targets a + // stale or attacker-controlled file. + const prepareIdx = script!.indexOf("os.open(path, flags, 0o644)"); const warnIdx = script!.indexOf('echo "$_W" >> /tmp/gateway.log'); - expect(touchIdx).toBeLessThan(warnIdx); + expect(prepareIdx).toBeGreaterThanOrEqual(0); + expect(warnIdx).toBeGreaterThanOrEqual(0); + expect(prepareIdx).toBeLessThan(warnIdx); }); it("appends (not truncates) gateway.log on launch so warnings survive", () => { @@ -139,8 +141,62 @@ describe("buildRecoveryScript", () => { // Truncating with `>` wipes the [gateway-recovery] WARNING that the // recovery script wrote moments earlier — meaning a sysadmin tailing // gateway.log would see the eventual crash without the explanation. - expect(script).toContain(">> /tmp/gateway.log 2>&1 &"); + expect(script).toContain('>> "$_GATEWAY_LOG" 2>&1 &'); expect(script).not.toMatch(/[^>]> \/tmp\/gateway\.log 2>&1 &/); }); + + it("preserves an existing gateway.log and has a writable fallback log", () => { + const script = buildOpenClawRecoveryScript(18789); + expect(script).not.toContain("rm -f /tmp/gateway.log"); + expect(script).toContain("_GATEWAY_LOG=/tmp/gateway.log"); + expect(script).toContain("_GATEWAY_LOG=/tmp/gateway-recovery.log"); + }); + + it("rejects a symlinked gateway.log before preparing the log", () => { + const script = buildOpenClawRecoveryScript(18789); + const noFollowIdx = script.indexOf("O_NOFOLLOW"); + const openIdx = script.indexOf("os.open(path, flags, 0o644)"); + const fchownIdx = script.indexOf("os.fchown(fd"); + expect(script).toContain("refusing to prepare symlinked /tmp/gateway.log"); + expect(script).toContain("sys.exit(1)"); + expect(script).not.toContain(": > /tmp/gateway.log"); + expect(script).not.toContain("chown 'gateway:gateway' /tmp/gateway.log"); + expect(noFollowIdx).toBeGreaterThanOrEqual(0); + expect(openIdx).toBeGreaterThanOrEqual(0); + expect(fchownIdx).toBeGreaterThanOrEqual(0); + expect(noFollowIdx).toBeLessThan(openIdx); + expect(openIdx).toBeLessThan(fchownIdx); + }); + + it("prepares gateway.log for the real gateway-owned sandbox log", () => { + const script = buildOpenClawRecoveryScript(18789); + expect(script).toContain("os.fchown(fd"); + expect(script).toContain("pw.pw_gid"); + expect(script).not.toContain("grp.getgrnam"); + expect(script).toContain("owner_mode = 0o644"); + expect(script).toContain("os.fchmod(fd, owner_mode)"); + expect(script).toContain("/tmp/gateway.log 'gateway'"); + expect(script).toContain("gosu 'gateway'"); + }); + + it("prepares auto-pair.log without unlinking or following symlinks", () => { + const script = buildOpenClawRecoveryScript(18789); + expect(script).toContain("refusing to prepare symlinked /tmp/auto-pair.log"); + expect(script).toContain("/tmp/auto-pair.log 'sandbox'"); + expect(script).toContain("owner_mode = 0o600"); + expect(script).not.toContain("rm -f /tmp/auto-pair.log"); + expect(script).not.toContain(": > /tmp/auto-pair.log"); + expect(script).not.toContain("touch /tmp/auto-pair.log"); + expect(script).not.toContain("chown sandbox:sandbox /tmp/auto-pair.log"); + expect(script).not.toContain("chmod 600 /tmp/auto-pair.log"); + }); + + it("does not force non-OpenClaw agents to run as the gateway user", () => { + const script = buildRecoveryScript(minimalAgent, 19000); + expect(script).not.toContain("chown gateway:gateway /tmp/gateway.log"); + expect(script).not.toContain("chown 'gateway:gateway' /tmp/gateway.log"); + expect(script).not.toContain("gosu gateway"); + expect(script).not.toContain("gosu 'gateway'"); + }); }); }); diff --git a/src/lib/agent-runtime.ts b/src/lib/agent-runtime.ts index 495f86efa1..6995cc7d45 100644 --- a/src/lib/agent-runtime.ts +++ b/src/lib/agent-runtime.ts @@ -48,6 +48,93 @@ export function getHealthProbeUrl(agent: AgentDefinition | null): string { return agent.healthProbe?.url || `http://127.0.0.1:${DASHBOARD_PORT}/`; } +function buildNoFollowLogSetupCommand( + path: string, + logOwnerUser?: string, + ownerMode = "0o644", +): string { + const displayPath = path.replace(/\\/g, "\\\\").replace(/'/g, "\\'"); + const prepareLog = [ + "import errno, os, pwd, stat, sys", + "path = sys.argv[1]", + "owner = sys.argv[2] if len(sys.argv) > 2 else ''", + `owner_mode = ${ownerMode}`, + "fallback_mode = 0o600", + "flags = os.O_WRONLY | os.O_CREAT | os.O_TRUNC | getattr(os, 'O_NOFOLLOW', 0)", + "try:", + " fd = os.open(path, flags, 0o644)", + "except OSError as exc:", + " if exc.errno == errno.ELOOP:", + ` print('[gateway-recovery] ERROR: refusing to prepare symlinked ${displayPath}', file=sys.stderr)`, + " sys.exit(1)", + " if exc.errno in (errno.EACCES, errno.EPERM):", + ` print('[gateway-recovery] ERROR: ${displayPath} is not writable by recovery user', file=sys.stderr)`, + " sys.exit(0)", + ` print(f'[gateway-recovery] ERROR: cannot prepare ${displayPath}: {exc}', file=sys.stderr)`, + " sys.exit(1)", + "try:", + " if not stat.S_ISREG(os.fstat(fd).st_mode):", + ` print('[gateway-recovery] ERROR: ${displayPath} is not a regular file', file=sys.stderr)`, + " sys.exit(1)", + " if owner and os.geteuid() == 0:", + " try:", + " pw = pwd.getpwnam(owner)", + " except KeyError:", + " os.fchmod(fd, fallback_mode)", + " else:", + " os.fchown(fd, pw.pw_uid, pw.pw_gid)", + " os.fchmod(fd, owner_mode)", + " else:", + " os.fchmod(fd, fallback_mode)", + "finally:", + " os.close(fd)", + ].join("\n"); + return [ + "python3", + "-c", + shellQuote(prepareLog), + path, + ...(logOwnerUser ? [shellQuote(logOwnerUser)] : []), + ].join(" "); +} + +function buildGatewayLogSetup(includeAutoPairLog = false, logOwnerUser?: string): string[] { + const lines = [`${buildNoFollowLogSetupCommand("/tmp/gateway.log", logOwnerUser)};`]; + if (includeAutoPairLog) { + lines.push(`${buildNoFollowLogSetupCommand("/tmp/auto-pair.log", "sandbox", "0o600")};`); + } + return lines; +} + +function gatewayLaunchCommand(command: string, runAsUser?: string): string { + const userLaunch = `_GATEWAY_LOG=/tmp/gateway.log; if ! : >> "$_GATEWAY_LOG" 2>/dev/null; then _GATEWAY_LOG=/tmp/gateway-recovery.log; : > "$_GATEWAY_LOG" 2>/dev/null || true; fi; nohup ${command} >> "$_GATEWAY_LOG" 2>&1 &`; + if (!runAsUser) { + return userLaunch; + } + return `if [ "$(id -u)" = "0" ] && command -v gosu >/dev/null 2>&1 && id ${shellQuote(runAsUser)} >/dev/null 2>&1; then nohup gosu ${shellQuote(runAsUser)} ${command} >> /tmp/gateway.log 2>&1 & else ${userLaunch} fi`; +} + +/** + * Build the OpenClaw recovery shell script used by the default sandbox. + */ +export function buildOpenClawRecoveryScript(port: number): string { + return [ + "if [ -r /tmp/nemoclaw-proxy-env.sh ]; then . /tmp/nemoclaw-proxy-env.sh; _PE_MISSING=0; else _PE_MISSING=1; fi;", + "[ -f ~/.bashrc ] && . ~/.bashrc;", + 'case "${NODE_OPTIONS:-}" in *nemoclaw-sandbox-safety-net*) _GUARDS_MISSING=0 ;; *) _GUARDS_MISSING=1 ;; esac;', + `if curl -sf --max-time 3 http://127.0.0.1:${port}/ > /dev/null 2>&1; then echo ALREADY_RUNNING; exit 0; fi;`, + "rm -rf /tmp/openclaw-*/gateway.*.lock 2>/dev/null;", + ...buildGatewayLogSetup(true, "gateway"), + '[ "$_PE_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing - gateway launching without library guards (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };', + '[ "$_GUARDS_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: NODE_OPTIONS missing safety-net preload - gateway may crash on unhandled library errors (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };', + 'OPENCLAW="$(command -v openclaw)";', + 'if [ -z "$OPENCLAW" ]; then echo OPENCLAW_MISSING; exit 1; fi;', + gatewayLaunchCommand('"$OPENCLAW" gateway run --port ' + port, "gateway"), + "GPID=$!; sleep 2;", + 'if kill -0 "$GPID" 2>/dev/null; then echo "GATEWAY_PID=$GPID"; else echo GATEWAY_FAILED; cat /tmp/gateway.log 2>/dev/null | tail -5; fi', + ].join(" "); +} + /** * Build the recovery shell script for a non-OpenClaw agent. * Returns the script string, or null if agent is null (use existing inline @@ -78,8 +165,8 @@ export function buildRecoveryScript(agent: AgentDefinition | null, port: number) // *why* the gateway is about to crash gets wiped by the same launch // that's about to crash on a missing guard. (#2478) const launchCommand = usesValidatedBinary - ? `nohup "$AGENT_BIN" gateway run --port ${port} >> /tmp/gateway.log 2>&1 &` - : `nohup ${configuredGatewayCommand} --port ${port} >> /tmp/gateway.log 2>&1 &`; + ? gatewayLaunchCommand(`"$AGENT_BIN" gateway run --port ${port}`) + : gatewayLaunchCommand(`${configuredGatewayCommand} --port ${port}`); const isHermes = agent.name === "hermes"; const hermesHome = isHermes ? "export HERMES_HOME=/sandbox/.hermes; " : ""; @@ -106,10 +193,9 @@ export function buildRecoveryScript(agent: AgentDefinition | null, port: number) 'case "${NODE_OPTIONS:-}" in *nemoclaw-sandbox-safety-net*) _GUARDS_MISSING=0 ;; *) _GUARDS_MISSING=1 ;; esac;', hermesHome, `if curl -sf --max-time 3 ${shellQuote(probeUrl)} > /dev/null 2>&1; then echo ALREADY_RUNNING; exit 0; fi;`, - "rm -f /tmp/gateway.log;", - "touch /tmp/gateway.log; chmod 600 /tmp/gateway.log;", - '[ "$_PE_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing — gateway launching without library guards (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };', - '[ "$_GUARDS_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: NODE_OPTIONS missing safety-net preload — gateway may crash on unhandled library errors (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };', + ...buildGatewayLogSetup(false), + '[ "$_PE_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing - gateway launching without library guards (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };', + '[ "$_GUARDS_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: NODE_OPTIONS missing safety-net preload - gateway may crash on unhandled library errors (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };', ...validationSteps, launchCommand, "GPID=$!; sleep 2;", diff --git a/src/lib/command-registry.ts b/src/lib/command-registry.ts index 85d842c4d5..7b841c52e9 100644 --- a/src/lib/command-registry.ts +++ b/src/lib/command-registry.ts @@ -97,6 +97,7 @@ export const COMMANDS: readonly CommandDef[] = [ { usage: "nemoclaw connect", description: "Shell into a running sandbox", + flags: "[--probe-only]", group: "Sandbox Management", scope: "sandbox", }, @@ -398,7 +399,6 @@ export const COMMANDS: readonly CommandDef[] = [ scope: "global", hidden: true, }, - ] as const; /** All global-scope commands. */ diff --git a/src/nemoclaw.ts b/src/nemoclaw.ts index d99bdbc527..99dc5c2de4 100644 --- a/src/nemoclaw.ts +++ b/src/nemoclaw.ts @@ -133,6 +133,8 @@ type SandboxCommandResult = { stderr: string; }; +const SANDBOX_EXEC_STARTED_MARKER = "__NEMOCLAW_SANDBOX_EXEC_STARTED__"; + type RecoveredSandboxMetadata = Partial< Pick > & { @@ -234,6 +236,7 @@ function executeSandboxCommand(sandboxName: string, command: string): SandboxCom timeout: OPENSHELL_PROBE_TIMEOUT_MS, }); if (sshConfigResult.status !== 0) return null; + if (!sshConfigResult.output.trim()) return null; const tmpFile = path.join(os.tmpdir(), `nemoclaw-ssh-${process.pid}-${Date.now()}.conf`); fs.writeFileSync(tmpFile, sshConfigResult.output, { mode: 0o600 }); @@ -272,6 +275,46 @@ function executeSandboxCommand(sandboxName: string, command: string): SandboxCom } } +function executeSandboxExecCommand( + sandboxName: string, + command: string, + timeout = 15000, +): SandboxCommandResult | null { + const markedCommand = `printf '%s\\n' '${SANDBOX_EXEC_STARTED_MARKER}'; ${command}`; + try { + const result = spawnSync( + getOpenshellBinary(), + ["sandbox", "exec", "--name", sandboxName, "--", "sh", "-c", markedCommand], + { + cwd: ROOT, + encoding: "utf-8", + env: process.env, + stdio: ["ignore", "pipe", "pipe"], + timeout, + }, + ); + const stdout = (result.stdout || "").trim(); + const stdoutLines = stdout.split(/\r?\n/); + const markerIndex = stdoutLines.indexOf(SANDBOX_EXEC_STARTED_MARKER); + if (markerIndex === -1) return null; + const commandStdoutLines = stdoutLines.slice(markerIndex + 1); + return { + status: result.error ? 1 : (result.status ?? 1), + stdout: commandStdoutLines.join("\n").trim(), + stderr: (result.stderr || "").trim(), + }; + } catch { + return null; + } +} + +function parseSandboxGatewayProbe(result: SandboxCommandResult | null): boolean | null { + if (!result) return null; + if (result.stdout === "RUNNING") return true; + if (result.stdout === "STOPPED") return false; + return null; +} + /** * Check whether the OpenClaw gateway process is running inside the sandbox. * Uses the gateway's HTTP endpoint (dashboard port) as the source of truth, @@ -281,62 +324,40 @@ function executeSandboxCommand(sandboxName: string, command: string): SandboxCom function isSandboxGatewayRunning(sandboxName: string): boolean | null { const agent = agentRuntime.getSessionAgent(sandboxName); const probeUrl = agentRuntime.getHealthProbeUrl(agent); - const result = executeSandboxCommand( - sandboxName, - `curl -sf --max-time 3 ${shellQuote(probeUrl)} > /dev/null 2>&1 && echo RUNNING || echo STOPPED`, - ); - if (!result) return null; - if (result.stdout === "RUNNING") return true; - if (result.stdout === "STOPPED") return false; - return null; + const command = `curl -sf --max-time 3 ${shellQuote(probeUrl)} > /dev/null 2>&1 && echo RUNNING || echo STOPPED`; + const execProbe = parseSandboxGatewayProbe(executeSandboxExecCommand(sandboxName, command)); + if (execProbe !== null) return execProbe; + return parseSandboxGatewayProbe(executeSandboxCommand(sandboxName, command)); } /** - * Restart the OpenClaw gateway process inside the sandbox after a pod restart. + * Restart the gateway process inside the sandbox after a pod restart. * Cleans stale lock/temp files, sources proxy config, and launches the gateway * in the background. Returns true on success. */ function recoverSandboxProcesses(sandboxName: string): boolean { const agent = agentRuntime.getSessionAgent(sandboxName); const agentScript = agentRuntime.buildRecoveryScript(agent, agent?.forwardPort ?? DASHBOARD_PORT); - const script = - agentScript || - [ - // Source /tmp/nemoclaw-proxy-env.sh explicitly so NODE_OPTIONS preload - // guards (safety-net, ciao, slack, …) survive gateway respawn. Without - // this, library errors crash-loop the gateway because the original - // .bashrc-only path silently failed when the env file was unreadable - // or the shell did not source ~/.bashrc. See #2478. Mirrors the - // hardened block in src/lib/agent-runtime.ts:buildRecoveryScript. - // Defer warning emission until AFTER touch+chmod gateway.log so - // warnings land in the persistent log a sysadmin would tail. Stderr - // alone hides them because executeSandboxCommand captures stderr - // without surfacing it. Mirrors src/lib/agent-runtime.ts. - "if [ -r /tmp/nemoclaw-proxy-env.sh ]; then . /tmp/nemoclaw-proxy-env.sh; _PE_MISSING=0; else _PE_MISSING=1; fi;", - "[ -f ~/.bashrc ] && . ~/.bashrc;", - 'case "${NODE_OPTIONS:-}" in *nemoclaw-sandbox-safety-net*) _GUARDS_MISSING=0 ;; *) _GUARDS_MISSING=1 ;; esac;', - `if curl -sf --max-time 3 http://127.0.0.1:${DASHBOARD_PORT}/ > /dev/null 2>&1; then echo ALREADY_RUNNING; exit 0; fi;`, - "rm -rf /tmp/openclaw-*/gateway.*.lock 2>/dev/null;", - "rm -f /tmp/gateway.log /tmp/auto-pair.log;", - "touch /tmp/gateway.log; chmod 600 /tmp/gateway.log;", - "touch /tmp/auto-pair.log; chmod 600 /tmp/auto-pair.log;", - '[ "$_PE_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing — gateway launching without library guards (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };', - '[ "$_GUARDS_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: NODE_OPTIONS missing safety-net preload — gateway may crash on unhandled library errors (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };', - 'OPENCLAW="$(command -v openclaw)";', - 'if [ -z "$OPENCLAW" ]; then echo OPENCLAW_MISSING; exit 1; fi;', - // Append rather than truncate so [gateway-recovery] WARNING lines - // written above survive past the launch. (#2478) - `nohup "$OPENCLAW" gateway run --port ${DASHBOARD_PORT} >> /tmp/gateway.log 2>&1 &`, - "GPID=$!; sleep 2;", - 'if kill -0 "$GPID" 2>/dev/null; then echo "GATEWAY_PID=$GPID"; else echo GATEWAY_FAILED; cat /tmp/gateway.log 2>/dev/null | tail -5; fi', - ].join(" "); - - const result = executeSandboxCommand(sandboxName, script); - if (!result) return false; - return ( - result.status === 0 && - (result.stdout.includes("GATEWAY_PID=") || result.stdout.includes("ALREADY_RUNNING")) - ); + const hasRecoveryMarker = (result: SandboxCommandResult | null) => + !!( + result && + (result.stdout.includes("GATEWAY_PID=") || result.stdout.includes("ALREADY_RUNNING")) + ); + const recoveredSsh = (result: SandboxCommandResult | null) => + !!(result && result.status === 0 && hasRecoveryMarker(result)); + + if (agentScript) { + // Non-OpenClaw manifests do not yet declare a runtime user for root + // sandbox exec. Recover them over SSH so the launch inherits the sandbox + // login user instead of creating root-owned agent state under /sandbox. + return recoveredSsh(executeSandboxCommand(sandboxName, agentScript)); + } + + const script = agentRuntime.buildOpenClawRecoveryScript(DASHBOARD_PORT); + const execResult = executeSandboxExecCommand(sandboxName, script, 30000); + if (hasRecoveryMarker(execResult)) return true; + if (execResult !== null) return false; + return recoveredSsh(executeSandboxCommand(sandboxName, script)); } /** @@ -1531,10 +1552,79 @@ async function listSandboxes(args: string[] = []): Promise { // ── Sandbox-scoped actions ─────────────────────────────────────── -async function sandboxConnect(sandboxName: string) { +type SandboxConnectOptions = { + probeOnly?: boolean; +}; + +function printSandboxConnectHelp(sandboxName = "") { + console.log(""); + console.log(` Usage: nemoclaw ${sandboxName} connect [--probe-only]`); + console.log(""); + console.log(" Options:"); + console.log( + " --probe-only Run recovery checks and exit without opening SSH", + ); + console.log(" -h, --help Show this help"); + console.log(""); +} + +function parseSandboxConnectArgs(sandboxName: string, actionArgs: string[]): SandboxConnectOptions { + const options: SandboxConnectOptions = {}; + for (const arg of actionArgs) { + switch (arg) { + case "--dangerously-skip-permissions": + console.error(" --dangerously-skip-permissions was removed; use shields commands instead."); + printSandboxConnectHelp(sandboxName); + process.exit(1); + case "--probe-only": + options.probeOnly = true; + break; + case "--help": + case "-h": + printSandboxConnectHelp(sandboxName); + process.exit(0); + break; + default: + console.error(` Unknown flag for connect: ${arg}`); + printSandboxConnectHelp(sandboxName); + process.exit(1); + } + } + return options; +} + +async function sandboxConnect( + sandboxName: string, + { probeOnly = false }: SandboxConnectOptions = {}, +) { const { isSandboxReady, parseSandboxStatus } = require("./lib/onboard"); await ensureLiveSandboxOrExit(sandboxName, { allowNonReadyPhase: true }); + if (probeOnly) { + const processCheck = checkAndRecoverSandboxProcesses(sandboxName, { quiet: true }); + const agent = agentRuntime.getSessionAgent(sandboxName); + const agentName = agentRuntime.getAgentDisplayName(agent); + if (!processCheck.checked) { + console.error( + ` Probe failed: could not inspect the ${agentName} gateway inside sandbox '${sandboxName}'.`, + ); + process.exit(1); + } + if (processCheck.wasRunning) { + console.log(` Probe complete: ${agentName} gateway is running in '${sandboxName}'.`); + return; + } + if (processCheck.recovered) { + console.log(` Probe complete: recovered ${agentName} gateway in '${sandboxName}'.`); + return; + } + console.error( + ` Probe failed: ${agentName} gateway is not running in '${sandboxName}' and automatic recovery failed.`, + ); + console.error(" Check /tmp/gateway.log inside the sandbox for details."); + process.exit(1); + } + // Version staleness check — warn but don't block try { const versionCheck = sandboxVersion.checkAgentVersion(sandboxName); @@ -4340,11 +4430,26 @@ const [cmd, ...args] = process.argv.slice(2); } // Sandbox-scoped commands: nemoclaw + const firstSandboxArg = args[0]; + const implicitConnectArg = + firstSandboxArg === "--help" || firstSandboxArg === "-h" || firstSandboxArg === "--probe-only"; + const requestedSandboxAction = + !firstSandboxArg || implicitConnectArg ? "connect" : firstSandboxArg; + const requestedSandboxActionArgs = !firstSandboxArg || implicitConnectArg ? args : args.slice(1); + if ( + requestedSandboxAction === "connect" && + requestedSandboxActionArgs.some((arg) => arg === "--help" || arg === "-h") + ) { + validateName(cmd, "sandbox name"); + printSandboxConnectHelp(cmd); + return; + } + // If the registry doesn't know this name but the action is a sandbox-scoped // command, attempt recovery — the sandbox may still be live with a stale registry. // Derived from command registry — single source of truth const sandboxActions = sandboxActionTokens(); - if (!registry.getSandbox(cmd) && sandboxActions.includes(args[0] || "")) { + if (!registry.getSandbox(cmd) && sandboxActions.includes(requestedSandboxAction)) { validateName(cmd, "sandbox name"); await recoverRegistryEntries({ requestedSandboxName: cmd }); if (!registry.getSandbox(cmd)) { @@ -4363,24 +4468,12 @@ const [cmd, ...args] = process.argv.slice(2); const sandbox = registry.getSandbox(cmd); if (sandbox) { validateName(cmd, "sandbox name"); - const action = args[0] || "connect"; - const actionArgs = args.slice(1); + const action = requestedSandboxAction; + const actionArgs = requestedSandboxActionArgs; switch (action) { case "connect": - if (actionArgs.length > 0) { - console.error( - ` Unknown connect argument${actionArgs.length === 1 ? "" : "s"}: ${actionArgs.join(" ")}`, - ); - if (actionArgs.includes("--dangerously-skip-permissions")) { - console.error( - " --dangerously-skip-permissions was removed; use shields commands instead.", - ); - } - console.error(` Usage: ${CLI_NAME} connect`); - process.exit(1); - } - await sandboxConnect(cmd); + await sandboxConnect(cmd, parseSandboxConnectArgs(cmd, actionArgs)); break; case "status": await sandboxStatus(cmd); diff --git a/test/cli.test.ts b/test/cli.test.ts index 34adc85690..6cb4b99a92 100644 --- a/test/cli.test.ts +++ b/test/cli.test.ts @@ -90,7 +90,7 @@ function readRecordedArgs(markerFile: string): string[] { return fs.readFileSync(markerFile, "utf8").trim().split(/\s+/); } -function writeSandboxRegistry(home: string): void { +function writeSandboxRegistry(home: string, sandboxOverrides: Record = {}): void { const registryDir = path.join(home, ".nemoclaw"); fs.mkdirSync(registryDir, { recursive: true }); fs.writeFileSync( @@ -103,6 +103,7 @@ function writeSandboxRegistry(home: string): void { provider: "nvidia-prod", gpuEnabled: false, policies: [], + ...sandboxOverrides, }, }, defaultSandbox: "alpha", @@ -1415,6 +1416,363 @@ describe("CLI dispatch", () => { expect(calls.some((call) => call.startsWith("forward start --background 18789"))).toBe(false); }); + it("shows connect help without opening an interactive session", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-help-")); + const localBin = path.join(home, "bin"); + const markerFile = path.join(home, "openshell-calls"); + fs.mkdirSync(localBin, { recursive: true }); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + `printf '%s\\n' "$*" >> ${JSON.stringify(markerFile)}`, + "exit 99", + ].join("\n"), + { mode: 0o755 }, + ); + + const r = runWithEnv("alpha connect --help", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + const implicit = runWithEnv("alpha --help", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(0); + expect(r.out).toContain("Usage: nemoclaw alpha connect"); + expect(r.out).toContain("--probe-only"); + expect(implicit.code).toBe(0); + expect(implicit.out).toContain("Usage: nemoclaw alpha connect"); + expect(fs.existsSync(markerFile)).toBe(false); + }); + + it("rejects the removed skip-permissions connect flag", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-flags-")); + writeSandboxRegistry(home); + + const r = runWithEnv("alpha connect --dangerously-skip-permissions", { + HOME: home, + }); + + expect(r.code).toBe(1); + expect(r.out).toContain("--dangerously-skip-permissions was removed"); + }); + + it("connect --probe-only recovers the gateway without opening SSH", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-")); + const localBin = path.join(home, "bin"); + const markerFile = path.join(home, "openshell-calls"); + const stateFile = path.join(home, "probe-state"); + fs.mkdirSync(localBin, { recursive: true }); + writeSandboxRegistry(home); + fs.writeFileSync(stateFile, "stopped"); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + `marker_file=${JSON.stringify(markerFile)}`, + `state_file=${JSON.stringify(stateFile)}`, + 'printf \'%s\\n\' "$*" >> "$marker_file"', + 'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then', + " echo 'Sandbox:'", + " echo", + " echo ' Id: abc'", + " echo ' Name: alpha'", + " echo ' Namespace: openshell'", + " echo ' Phase: Ready'", + " exit 0", + "fi", + 'if [ "$1" = "sandbox" ] && [ "$2" = "exec" ] && [ "$3" = "--name" ] && [ "$4" = "alpha" ]; then', + ' cmd="$8"', + ' case "$cmd" in', + ' *"OPENCLAW="*)', + ' echo recovered > "$state_file"', + " echo '__NEMOCLAW_SANDBOX_EXEC_STARTED__'", + " echo 'GATEWAY_PID=123'", + " exit 42", + " ;;", + " *'curl -sf'*)", + " echo '__NEMOCLAW_SANDBOX_EXEC_STARTED__'", + ' if [ "$(cat "$state_file")" = recovered ]; then echo RUNNING; else echo STOPPED; fi', + " exit 0", + " ;;", + " esac", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + + const r = runWithEnv("alpha connect --probe-only", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(0); + expect(r.out).toContain("Probe complete: recovered OpenClaw gateway"); + const calls = fs.readFileSync(markerFile, "utf8").trim().split("\n").filter(Boolean); + expect(calls).toContain("sandbox get alpha"); + expect(calls.some((call) => call.startsWith("sandbox exec --name alpha -- sh -c"))).toBe(true); + expect(calls).not.toContain("sandbox ssh-config alpha"); + expect(calls).not.toContain("sandbox connect alpha"); + }); + + it("treats leading --probe-only as an implicit connect probe", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-leading-")); + const localBin = path.join(home, "bin"); + const markerFile = path.join(home, "openshell-calls"); + fs.mkdirSync(localBin, { recursive: true }); + writeSandboxRegistry(home); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + `marker_file=${JSON.stringify(markerFile)}`, + 'printf \'%s\\n\' "$*" >> "$marker_file"', + 'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then', + " echo 'Sandbox:'", + " echo", + " echo ' Id: abc'", + " echo ' Name: alpha'", + " echo ' Namespace: openshell'", + " echo ' Phase: Ready'", + " exit 0", + "fi", + 'if [ "$1" = "sandbox" ] && [ "$2" = "exec" ] && [ "$3" = "--name" ] && [ "$4" = "alpha" ]; then', + ' cmd="$8"', + ' if [[ "$cmd" == *"curl -sf"* ]]; then echo "__NEMOCLAW_SANDBOX_EXEC_STARTED__"; echo RUNNING; exit 0; fi', + ' if [[ "$cmd" == *"OPENCLAW="* ]]; then echo "__NEMOCLAW_SANDBOX_EXEC_STARTED__"; echo UNEXPECTED_RECOVERY; exit 1; fi', + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + + const r = runWithEnv("alpha --probe-only", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(0); + expect(r.out).toContain("Probe complete: OpenClaw gateway is running"); + const calls = fs.readFileSync(markerFile, "utf8").trim().split("\n").filter(Boolean); + expect(calls).toContain("sandbox get alpha"); + expect(calls.some((call) => call.startsWith("sandbox exec --name alpha -- sh -c"))).toBe(true); + expect(calls).not.toContain("sandbox ssh-config alpha"); + expect(calls).not.toContain("sandbox connect alpha"); + }); + + it("connect --probe-only does not retry a failed sandbox exec recovery over SSH", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-no-ssh-")); + const localBin = path.join(home, "bin"); + const markerFile = path.join(home, "openshell-calls"); + fs.mkdirSync(localBin, { recursive: true }); + writeSandboxRegistry(home); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + `marker_file=${JSON.stringify(markerFile)}`, + 'printf \'%s\\n\' "$*" >> "$marker_file"', + 'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then', + " echo 'Sandbox:'", + " echo", + " echo ' Id: abc'", + " echo ' Name: alpha'", + " echo ' Namespace: openshell'", + " echo ' Phase: Ready'", + " exit 0", + "fi", + 'if [ "$1" = "sandbox" ] && [ "$2" = "exec" ] && [ "$3" = "--name" ] && [ "$4" = "alpha" ]; then', + ' cmd="$8"', + ' if [[ "$cmd" == *"OPENCLAW="* ]]; then echo "__NEMOCLAW_SANDBOX_EXEC_STARTED__"; echo RECOVERY_FAILED >&2; exit 42; fi', + ' if [[ "$cmd" == *"curl -sf"* ]]; then echo "__NEMOCLAW_SANDBOX_EXEC_STARTED__"; echo STOPPED; exit 0; fi', + "fi", + 'if [ "$1" = "sandbox" ] && [ "$2" = "ssh-config" ]; then', + " echo 'Host openshell-alpha'", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + + const r = runWithEnv("alpha connect --probe-only", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(1); + const calls = fs.readFileSync(markerFile, "utf8").trim().split("\n").filter(Boolean); + expect(calls).toContain("sandbox get alpha"); + expect(calls.some((call) => call.startsWith("sandbox exec --name alpha -- sh -c"))).toBe(true); + expect(calls).not.toContain("sandbox ssh-config alpha"); + }); + + it("connect --probe-only falls back to SSH when sandbox exec never starts", () => { + const home = fs.mkdtempSync( + path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-exec-fallback-"), + ); + const localBin = path.join(home, "bin"); + const openshellCalls = path.join(home, "openshell-calls"); + const sshCalls = path.join(home, "ssh-calls"); + const stateFile = path.join(home, "probe-state"); + fs.mkdirSync(localBin, { recursive: true }); + writeSandboxRegistry(home); + fs.writeFileSync(stateFile, "stopped"); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + `calls=${JSON.stringify(openshellCalls)}`, + 'printf \'%s\\n\' "$*" >> "$calls"', + 'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then', + " echo 'Sandbox:'", + " echo", + " echo ' Id: abc'", + " echo ' Name: alpha'", + " echo ' Namespace: openshell'", + " echo ' Phase: Ready'", + " exit 0", + "fi", + 'if [ "$1" = "sandbox" ] && [ "$2" = "exec" ]; then', + " echo 'error: sandbox exec transport failed before command start' >&2", + " exit 2", + "fi", + 'if [ "$1" = "sandbox" ] && [ "$2" = "ssh-config" ] && [ "$3" = "alpha" ]; then', + " echo 'Host openshell-alpha'", + " echo ' HostName 127.0.0.1'", + " echo ' User sandbox'", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + fs.writeFileSync( + path.join(localBin, "ssh"), + [ + "#!/usr/bin/env bash", + `calls=${JSON.stringify(sshCalls)}`, + `state_file=${JSON.stringify(stateFile)}`, + 'cmd="${@: -1}"', + 'printf \'%s\\n\' "$cmd" >> "$calls"', + 'if [[ "$cmd" == *"OPENCLAW="* ]]; then', + ' echo recovered > "$state_file"', + " echo 'GATEWAY_PID=456'", + " exit 0", + "fi", + 'if [[ "$cmd" == *"curl -sf"* ]]; then', + ' if [ "$(cat "$state_file")" = recovered ]; then echo RUNNING; else echo STOPPED; fi', + " exit 0", + "fi", + "exit 1", + ].join("\n"), + { mode: 0o755 }, + ); + + const r = runWithEnv("alpha connect --probe-only", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(0); + expect(r.out).toContain("Probe complete: recovered OpenClaw gateway"); + const openshellLog = fs.readFileSync(openshellCalls, "utf8"); + const sshLog = fs.readFileSync(sshCalls, "utf8"); + expect(openshellLog).toContain("sandbox exec --name alpha -- sh -c"); + expect(openshellLog).toContain("sandbox ssh-config alpha"); + expect(sshLog).toContain('OPENCLAW="$(command -v openclaw)"'); + }); + + it("recovers non-OpenClaw agents over SSH instead of root sandbox exec", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-agent-")); + const localBin = path.join(home, "bin"); + const openshellCalls = path.join(home, "openshell-calls"); + const sshCalls = path.join(home, "ssh-calls"); + const stateFile = path.join(home, "probe-state"); + fs.mkdirSync(localBin, { recursive: true }); + writeSandboxRegistry(home, { agent: "hermes" }); + fs.writeFileSync(stateFile, "stopped"); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + `calls=${JSON.stringify(openshellCalls)}`, + `state_file=${JSON.stringify(stateFile)}`, + 'printf \'%s\\n\' "$*" >> "$calls"', + 'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then', + " echo 'Sandbox:'", + " echo", + " echo ' Id: abc'", + " echo ' Name: alpha'", + " echo ' Namespace: openshell'", + " echo ' Phase: Ready'", + " exit 0", + "fi", + 'if [ "$1" = "sandbox" ] && [ "$2" = "exec" ] && [ "$3" = "--name" ] && [ "$4" = "alpha" ]; then', + ' cmd="$8"', + ' if [[ "$cmd" == *"curl -sf"* ]]; then', + " echo '__NEMOCLAW_SANDBOX_EXEC_STARTED__'", + ' if [ "$(cat "$state_file")" = recovered ]; then echo RUNNING; else echo STOPPED; fi', + " exit 0", + " fi", + ' if [[ "$cmd" == *"HERMES_HOME=/sandbox/.hermes"* || "$cmd" == *"AGENT_BIN="* ]]; then', + " echo '__NEMOCLAW_SANDBOX_EXEC_STARTED__'", + " echo UNEXPECTED_ROOT_EXEC_RECOVERY", + " exit 1", + " fi", + "fi", + 'if [ "$1" = "sandbox" ] && [ "$2" = "ssh-config" ] && [ "$3" = "alpha" ]; then', + " echo 'Host openshell-alpha'", + " echo ' HostName 127.0.0.1'", + " echo ' User sandbox'", + " exit 0", + "fi", + 'if [ "$1" = "forward" ]; then', + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + fs.writeFileSync( + path.join(localBin, "ssh"), + [ + "#!/usr/bin/env bash", + `calls=${JSON.stringify(sshCalls)}`, + `state_file=${JSON.stringify(stateFile)}`, + 'cmd="${@: -1}"', + 'printf \'%s\\n\' "$cmd" >> "$calls"', + 'if [[ "$cmd" == *"AGENT_BIN=\'/usr/local/bin/hermes\'"* ]]; then', + ' echo recovered > "$state_file"', + " echo 'GATEWAY_PID=789'", + " exit 0", + "fi", + "exit 1", + ].join("\n"), + { mode: 0o755 }, + ); + + const r = runWithEnv("alpha connect --probe-only", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(0); + expect(r.out).toContain("Probe complete: recovered Hermes Agent gateway"); + const openshellLog = fs.readFileSync(openshellCalls, "utf8"); + const sshLog = fs.readFileSync(sshCalls, "utf8"); + expect(openshellLog).toContain("sandbox exec --name alpha -- sh -c"); + expect(openshellLog).toContain("sandbox ssh-config alpha"); + expect(openshellLog).not.toContain("HERMES_HOME=/sandbox/.hermes"); + expect(openshellLog).not.toContain("AGENT_BIN="); + expect(sshLog).toContain("HERMES_HOME=/sandbox/.hermes"); + expect(sshLog).toContain("AGENT_BIN='/usr/local/bin/hermes'"); + }); + it("waits for sandbox readiness before connecting", () => { const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-wait-")); const localBin = path.join(home, "bin"); diff --git a/test/e2e/test-issue-2478-crash-loop-recovery.sh b/test/e2e/test-issue-2478-crash-loop-recovery.sh index 0bdbdb8aec..88d76bba67 100755 --- a/test/e2e/test-issue-2478-crash-loop-recovery.sh +++ b/test/e2e/test-issue-2478-crash-loop-recovery.sh @@ -30,11 +30,12 @@ # 2. Verifies the *initial* gateway has the safety-net + ciao guard active # (via /proc//environ on the gateway PID). # 3. Crash-recovery loop (NORMAL): kill the gateway 5x, each time triggers -# `nemoclaw connect` (which calls recoverSandboxProcesses), and -# checks the respawned gateway still has guards in NODE_OPTIONS. +# `nemoclaw connect --probe-only` (which calls +# recoverSandboxProcesses), and checks the respawned gateway still has +# guards in NODE_OPTIONS. # 4. Negative case: removes /tmp/nemoclaw-proxy-env.sh, kills the gateway, # triggers recovery — expects the new "[gateway-recovery] WARNING" -# line in stderr (visible via gateway.log) instead of silent guard loss. +# line in gateway.log instead of silent guard loss. # 5. Soak: leaves the sandbox idle for $NEMOCLAW_E2E_SOAK_SECONDS # (default 300) so the health-monitor restart cadence (~4 min in prod) # gets at least one chance to fire, then asserts the gateway has not @@ -236,6 +237,20 @@ gateway_diagnostics() { echo " ---------------------------" } +run_probe_only_or_fail() { + local context="$1" + local probe_out + probe_out="$(mktemp)" + if ! timeout 60 nemoclaw "$SANDBOX_NAME" connect --probe-only >"$probe_out" 2>&1; then + fail "${context}: connect --probe-only exited nonzero" + sed 's/^/ /' "$probe_out" + rm -f "$probe_out" + gateway_diagnostics "" + exit 1 + fi + rm -f "$probe_out" +} + # Wait until gateway PID is non-empty (or timeout). Echoes pid, returns 0/1. wait_for_gateway_up() { local timeout="${1:-30}" @@ -362,11 +377,18 @@ for cycle in $(seq 1 "$CRASH_CYCLES"); do info "Cycle $cycle/$CRASH_CYCLES — killing gateway pid=$prev_pid" sandbox_exec sh -c "kill -9 $prev_pid 2>/dev/null; sleep 1; pgrep -fo '[o]penclaw[ -]gateway' || echo DEAD" >/dev/null - # Trigger recovery via the actual code path: `nemoclaw status` - # calls checkAndRecoverSandboxProcesses() → recoverSandboxProcesses(), - # which is the hardened path #2478 changes. Bound it with `timeout` - # so a hang in CLI internals cannot eat the whole 30-min job budget. - timeout 60 nemoclaw "$SANDBOX_NAME" status >/dev/null 2>&1 || true + # Trigger recovery via the actual operator probe path: + # `nemoclaw connect --probe-only` calls + # checkAndRecoverSandboxProcesses() -> recoverSandboxProcesses() without + # opening an interactive SSH session. Bound it with `timeout` so a hang in + # CLI internals cannot eat the whole 30-min job budget. + run_probe_only_or_fail "Cycle $cycle after gateway kill" + + if ! sandbox_exec sh -c 'test -s /tmp/gateway.log'; then + fail "Cycle $cycle: connect --probe-only did not leave /tmp/gateway.log evidence" + gateway_diagnostics "" + exit 1 + fi new_pid="$(wait_for_gateway_up 45)" if [ -z "$new_pid" ]; then @@ -426,9 +448,10 @@ info "Snapshotted proxy-env.sh ($SNAPSHOT_SIZE bytes, ${#SNAPSHOT_B64}-char base # recovery script (which is the only path that emits the warning). sandbox_exec sh -c 'rm -f /tmp/nemoclaw-proxy-env.sh' >/dev/null sandbox_exec sh -c "pkill -9 -f '[o]penclaw' 2>/dev/null; sleep 2; pgrep -af '[o]penclaw' || echo ALL_DEAD" >/dev/null -timeout 60 nemoclaw "$SANDBOX_NAME" status >/dev/null 2>&1 || true +run_probe_only_or_fail "Negative case after proxy-env removal" -# The new gateway.log should contain the [gateway-recovery] WARNING line. +# The new gateway.log should contain the [gateway-recovery] WARNING line and +# recovery should have attempted a real gateway respawn. warn_seen=false for _ in 1 2 3 4 5; do if gateway_log_tail 100 | grep -q '\[gateway-recovery\] WARNING'; then @@ -443,6 +466,13 @@ else fail "Recovery silently launched without warning (regression of #2478 fix)" gateway_log_tail 100 fi +NEGATIVE_PID="$(wait_for_gateway_up 45)" +if [ -z "$NEGATIVE_PID" ]; then + fail "Recovery warning was logged, but gateway did not respawn within 45s" + gateway_diagnostics "" + exit 1 +fi +info "Negative-case recovery respawned gateway pid=$NEGATIVE_PID" # Restore proxy-env.sh by base64-injecting the snapshot via argv. `openshell # sandbox exec` does not pipe stdin from the caller through to the subshell, @@ -458,8 +488,10 @@ if [ "$restored_size" != "$SNAPSHOT_SIZE" ]; then fi info "proxy-env.sh restored (${restored_size} bytes verified)" -# Trigger recovery to bring the gateway back with guards intact. -timeout 60 nemoclaw "$SANDBOX_NAME" status >/dev/null 2>&1 || true +# Kill the guardless negative-case gateway, then trigger recovery to bring the +# gateway back with guards intact from the restored env file. +sandbox_exec sh -c "pkill -9 -f '[o]penclaw' 2>/dev/null; sleep 2; pgrep -af '[o]penclaw' || echo ALL_DEAD" >/dev/null +run_probe_only_or_fail "Guard restore recovery" SOAK_START_PID="$(wait_for_gateway_up 30)" if [ -z "$SOAK_START_PID" ]; then fail "Gateway not up entering soak phase" From 05bfe893da515956ba72b75cae1f9df932e98b84 Mon Sep 17 00:00:00 2001 From: Aaron Erickson Date: Thu, 30 Apr 2026 14:36:10 -0700 Subject: [PATCH 02/14] test(recovery): address CodeRabbit cleanup Signed-off-by: Aaron Erickson --- src/lib/agent-runtime.test.ts | 6 +++--- src/nemoclaw.ts | 2 +- test/cli.test.ts | 14 ++++++++++++++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/lib/agent-runtime.test.ts b/src/lib/agent-runtime.test.ts index a1f96df2b2..1453b06639 100644 --- a/src/lib/agent-runtime.test.ts +++ b/src/lib/agent-runtime.test.ts @@ -129,11 +129,11 @@ describe("buildRecoveryScript", () => { // And the warning must be deferred until AFTER gateway.log is // safely opened with O_NOFOLLOW, otherwise the redirect targets a // stale or attacker-controlled file. - const prepareIdx = script!.indexOf("os.open(path, flags, 0o644)"); + const gatewayPrepIdx = script!.indexOf(" /tmp/gateway.log;"); const warnIdx = script!.indexOf('echo "$_W" >> /tmp/gateway.log'); - expect(prepareIdx).toBeGreaterThanOrEqual(0); + expect(gatewayPrepIdx).toBeGreaterThanOrEqual(0); expect(warnIdx).toBeGreaterThanOrEqual(0); - expect(prepareIdx).toBeLessThan(warnIdx); + expect(gatewayPrepIdx).toBeLessThan(warnIdx); }); it("appends (not truncates) gateway.log on launch so warnings survive", () => { diff --git a/src/nemoclaw.ts b/src/nemoclaw.ts index 99dc5c2de4..7ffafc6b4e 100644 --- a/src/nemoclaw.ts +++ b/src/nemoclaw.ts @@ -1558,7 +1558,7 @@ type SandboxConnectOptions = { function printSandboxConnectHelp(sandboxName = "") { console.log(""); - console.log(` Usage: nemoclaw ${sandboxName} connect [--probe-only]`); + console.log(` Usage: ${CLI_NAME} ${sandboxName} connect [--probe-only]`); console.log(""); console.log(" Options:"); console.log( diff --git a/test/cli.test.ts b/test/cli.test.ts index 6cb4b99a92..3e5b1974be 100644 --- a/test/cli.test.ts +++ b/test/cli.test.ts @@ -1450,14 +1450,28 @@ describe("CLI dispatch", () => { it("rejects the removed skip-permissions connect flag", () => { const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-flags-")); + const localBin = path.join(home, "bin"); + const markerFile = path.join(home, "openshell-calls"); + fs.mkdirSync(localBin, { recursive: true }); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + `printf '%s\\n' "$*" >> ${JSON.stringify(markerFile)}`, + "exit 99", + ].join("\n"), + { mode: 0o755 }, + ); writeSandboxRegistry(home); const r = runWithEnv("alpha connect --dangerously-skip-permissions", { HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, }); expect(r.code).toBe(1); expect(r.out).toContain("--dangerously-skip-permissions was removed"); + expect(fs.existsSync(markerFile)).toBe(false); }); it("connect --probe-only recovers the gateway without opening SSH", () => { From e6eb1afc28b9f0e2da0af968354b301b4860ffcf Mon Sep 17 00:00:00 2001 From: Aaron Erickson Date: Thu, 30 Apr 2026 15:15:44 -0700 Subject: [PATCH 03/14] test(recovery): assert probe fallback stays non-interactive Signed-off-by: Aaron Erickson --- test/cli.test.ts | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/test/cli.test.ts b/test/cli.test.ts index 08e6fc5f30..f63ee7dea0 100644 --- a/test/cli.test.ts +++ b/test/cli.test.ts @@ -1686,7 +1686,8 @@ describe("CLI dispatch", () => { `calls=${JSON.stringify(sshCalls)}`, `state_file=${JSON.stringify(stateFile)}`, 'cmd="${@: -1}"', - 'printf \'%s\\n\' "$cmd" >> "$calls"', + 'printf \'ARGS %s\\n\' "$*" >> "$calls"', + 'printf \'CMD %s\\n\' "$cmd" >> "$calls"', 'if [[ "$cmd" == *"OPENCLAW="* ]]; then', ' echo recovered > "$state_file"', " echo 'GATEWAY_PID=456'", @@ -1712,7 +1713,9 @@ describe("CLI dispatch", () => { const sshLog = fs.readFileSync(sshCalls, "utf8"); expect(openshellLog).toContain("sandbox exec --name alpha -- sh -c"); expect(openshellLog).toContain("sandbox ssh-config alpha"); + expect(openshellLog).not.toContain("sandbox connect"); expect(sshLog).toContain('OPENCLAW="$(command -v openclaw)"'); + expect(sshLog).not.toMatch(/(^|\s)-tt?(\s|$)/); }); it("recovers non-OpenClaw agents over SSH instead of root sandbox exec", () => { @@ -1773,7 +1776,8 @@ describe("CLI dispatch", () => { `calls=${JSON.stringify(sshCalls)}`, `state_file=${JSON.stringify(stateFile)}`, 'cmd="${@: -1}"', - 'printf \'%s\\n\' "$cmd" >> "$calls"', + 'printf \'ARGS %s\\n\' "$*" >> "$calls"', + 'printf \'CMD %s\\n\' "$cmd" >> "$calls"', 'if [[ "$cmd" == *"AGENT_BIN=\'/usr/local/bin/hermes\'"* ]]; then', ' echo recovered > "$state_file"', " echo 'GATEWAY_PID=789'", @@ -1797,8 +1801,10 @@ describe("CLI dispatch", () => { expect(openshellLog).toContain("sandbox ssh-config alpha"); expect(openshellLog).not.toContain("HERMES_HOME=/sandbox/.hermes"); expect(openshellLog).not.toContain("AGENT_BIN="); + expect(openshellLog).not.toContain("sandbox connect"); expect(sshLog).toContain("HERMES_HOME=/sandbox/.hermes"); expect(sshLog).toContain("AGENT_BIN='/usr/local/bin/hermes'"); + expect(sshLog).not.toMatch(/(^|\s)-tt?(\s|$)/); }); it("waits for sandbox readiness before connecting", () => { From d1b9145364aa116da7d6dac04cbc9c0ca7d18329 Mon Sep 17 00:00:00 2001 From: Aaron Erickson Date: Fri, 1 May 2026 15:29:14 -0700 Subject: [PATCH 04/14] fix(recovery): stop on log hardening setup failure Signed-off-by: Aaron Erickson --- src/lib/agent-runtime.test.ts | 8 +++++++- src/lib/agent-runtime.ts | 6 ++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/lib/agent-runtime.test.ts b/src/lib/agent-runtime.test.ts index a888a2a6e1..c5a2bd09db 100644 --- a/src/lib/agent-runtime.test.ts +++ b/src/lib/agent-runtime.test.ts @@ -131,13 +131,19 @@ describe("buildRecoveryScript", () => { // And the warning must be deferred until AFTER gateway.log is // safely opened with O_NOFOLLOW, otherwise the redirect targets a // stale or attacker-controlled file. - const gatewayPrepIdx = script!.indexOf(" /tmp/gateway.log;"); + const gatewayPrepIdx = script!.indexOf(" /tmp/gateway.log || exit 1;"); const warnIdx = script!.indexOf('echo "$_W" >> /tmp/gateway.log'); expect(gatewayPrepIdx).toBeGreaterThanOrEqual(0); expect(warnIdx).toBeGreaterThanOrEqual(0); expect(gatewayPrepIdx).toBeLessThan(warnIdx); }); + it("stops recovery when hardened log setup fails", () => { + const script = buildOpenClawRecoveryScript(18789); + expect(script).toContain(" /tmp/gateway.log 'gateway' || exit 1;"); + expect(script).toContain(" /tmp/auto-pair.log 'sandbox' || exit 1;"); + }); + it("appends (not truncates) gateway.log on launch so warnings survive", () => { const script = buildRecoveryScript(minimalAgent, 19000); // Truncating with `>` wipes the [gateway-recovery] WARNING that the diff --git a/src/lib/agent-runtime.ts b/src/lib/agent-runtime.ts index 25371fda62..71dc29b9a0 100644 --- a/src/lib/agent-runtime.ts +++ b/src/lib/agent-runtime.ts @@ -99,9 +99,11 @@ function buildNoFollowLogSetupCommand( } function buildGatewayLogSetup(includeAutoPairLog = false, logOwnerUser?: string): string[] { - const lines = [`${buildNoFollowLogSetupCommand("/tmp/gateway.log", logOwnerUser)};`]; + const lines = [`${buildNoFollowLogSetupCommand("/tmp/gateway.log", logOwnerUser)} || exit 1;`]; if (includeAutoPairLog) { - lines.push(`${buildNoFollowLogSetupCommand("/tmp/auto-pair.log", "sandbox", "0o600")};`); + lines.push( + `${buildNoFollowLogSetupCommand("/tmp/auto-pair.log", "sandbox", "0o600")} || exit 1;`, + ); } return lines; } From c4d0f637deeda293e9c0b4a2e0ad85a0d133f0d1 Mon Sep 17 00:00:00 2001 From: Aaron Erickson Date: Fri, 1 May 2026 16:02:32 -0700 Subject: [PATCH 05/14] fix(ci): keep PR checks green after main merge Signed-off-by: Aaron Erickson --- nemoclaw/eslint.config.mjs | 1 + scripts/nemoclaw-start.sh | 2 +- test/fetch-guard-patch-regression.test.ts | 7 +- test/nemoclaw-start.test.ts | 9 +- test/repro-2681-group-writable.test.ts | 305 +++++++++++++++------- 5 files changed, 222 insertions(+), 102 deletions(-) diff --git a/nemoclaw/eslint.config.mjs b/nemoclaw/eslint.config.mjs index aaca103e86..a3c88c8a40 100644 --- a/nemoclaw/eslint.config.mjs +++ b/nemoclaw/eslint.config.mjs @@ -10,6 +10,7 @@ export default [ parserOptions: { projectService: { allowDefaultProject: ["src/*.test.ts", "src/*/*.test.ts"], + maximumDefaultProjectFileMatchCount_THIS_WILL_SLOW_DOWN_LINTING: 32, }, tsconfigRootDir: import.meta.dirname, }, diff --git a/scripts/nemoclaw-start.sh b/scripts/nemoclaw-start.sh index 2754710292..93d9fd47d5 100755 --- a/scripts/nemoclaw-start.sh +++ b/scripts/nemoclaw-start.sh @@ -230,7 +230,7 @@ normalize_mutable_config_perms() { # Detect shields-up. Config dir owned by root means shields are # currently locked; normalizing would weaken the contract. local config_dir_owner - config_dir_owner="$(stat -c '%U' "$config_dir" 2>/dev/null || echo unknown)" + config_dir_owner="$(stat -c '%U' "$config_dir" 2>/dev/null || stat -f '%Su' "$config_dir" 2>/dev/null || echo unknown)" if [ "$config_dir_owner" = "root" ]; then return 0 fi diff --git a/test/fetch-guard-patch-regression.test.ts b/test/fetch-guard-patch-regression.test.ts index 7838058ca4..ba4c74508b 100644 --- a/test/fetch-guard-patch-regression.test.ts +++ b/test/fetch-guard-patch-regression.test.ts @@ -23,7 +23,12 @@ function dockerRunCommandBetween(startMarker: string, endMarker: string): string return dockerfile .slice(runIndex, end) .trim() - .replace(/^RUN\s+/, ""); + .replace(/^RUN\s+/, "") + .split("\n") + .filter((line) => !line.trimStart().startsWith("#")) + .join("\n") + .replace(/\\\n/g, " ") + .replace(/\\\s*$/, ""); } function runOpenClawUpgradeBlock(currentVersion: string) { diff --git a/test/nemoclaw-start.test.ts b/test/nemoclaw-start.test.ts index f390a33680..513194500d 100644 --- a/test/nemoclaw-start.test.ts +++ b/test/nemoclaw-start.test.ts @@ -1030,7 +1030,9 @@ describe("nemoclaw-start gateway launch signal handling", () => { "start_auto_pair() { sleep 30 & AUTO_PAIR_PID=$!; }", "cleanup_on_signal() { :; }", launchBlock(kind, gatewayLog), - "sleep 0.5", + kind === "root" + ? `for _ in {1..20}; do [ -s ${JSON.stringify(gosuLog)} ] && break; sleep 0.1; done` + : `for _ in {1..20}; do [ -s ${JSON.stringify(openclawLog)} ] && break; sleep 0.1; done`, 'printf "GATEWAY_PID=%s\\n" "$GATEWAY_PID"', 'printf "AUTO_PAIR_PID=%s\\n" "${AUTO_PAIR_PID:-}"', 'printf "TAIL_PID=%s\\n" "${GATEWAY_LOG_TAIL_PID:-}"', @@ -1070,10 +1072,10 @@ describe("nemoclaw-start gateway launch signal handling", () => { }); it("launches the root gateway through gosu with the configured port and tracks child PIDs", () => { - const { result, openclaw, gosu } = runLaunchBlock("root"); + const { result, gosu } = runLaunchBlock("root"); expect(result.status).toBe(0); expect(gosu).toContain("user=gateway"); - expect(openclaw).toContain("gateway run --port 19000"); + expect(gosu).toContain("gateway run --port 19000"); const gatewayPid = result.stdout.match(/GATEWAY_PID=(\d+)/)?.[1]; expect(gatewayPid).toBeTruthy(); expect(result.stdout).toContain(`WAIT_PID=${gatewayPid}`); @@ -1442,6 +1444,7 @@ describe("Telegram diagnostics (#2766)", () => { 'verify_no_slack_secrets_on_disk() { :; }', 'write_auth_profile() { :; }', 'harden_auth_profiles() { :; }', + 'normalize_mutable_config_perms() { :; }', 'chown() { :; }', 'chown_tree_no_symlink_follow() { :; }', 'gosu() { shift; "$@"; }', diff --git a/test/repro-2681-group-writable.test.ts b/test/repro-2681-group-writable.test.ts index c23a21810b..304524e4bb 100644 --- a/test/repro-2681-group-writable.test.ts +++ b/test/repro-2681-group-writable.test.ts @@ -1,120 +1,231 @@ // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -/** - * Regression guards for the group-writable mutable-default contract (#2681). - * - * Before this PR, control-UI config mutations in the OpenClaw sandbox - * (Enable Dreaming, account toggles, etc.) wrote through mutateConfigFile, - * which targeted /sandbox/.openclaw/openclaw.json — owned sandbox:sandbox - * mode 600. The gateway runs as a separate UID, so every mutation EACCES'd. - * - * The previous proposal (#2693) wrapped mutateConfigFile in a try/catch - * that swallowed EACCES, making the mutation a silent no-op. That made - * toggles non-functional in the sandbox. - * - * This PR replaces that approach with proper Unix group permissions: - * 1. `gateway` is a member of the `sandbox` group (Dockerfile.base usermod). - * 2. /sandbox/.openclaw is group-writable + setgid (chmod g+w + g+s). - * 3. nemoclaw-start.sh normalizes those perms before gateway launch when - * shields are not UP. - * 4. `shields down` restores 660/2770 instead of the old 600/700. - * - * Result: writes succeed in default mode without an EACCES swallow. - * - * These tests lock the structural invariants so a future change can't - * silently regress to the swallow approach. - */ - +import { spawnSync } from "node:child_process"; import fs from "node:fs"; +import os from "node:os"; import path from "node:path"; import { describe, expect, it } from "vitest"; const ROOT = path.join(import.meta.dirname, ".."); -const DOCKERFILE_BASE = fs.readFileSync(path.join(ROOT, "Dockerfile.base"), "utf-8"); -const DOCKERFILE = fs.readFileSync(path.join(ROOT, "Dockerfile"), "utf-8"); -const NEMOCLAW_START = fs.readFileSync( - path.join(ROOT, "scripts", "nemoclaw-start.sh"), - "utf-8", -); -const SHIELDS_TS = fs.readFileSync(path.join(ROOT, "src", "lib", "shields.ts"), "utf-8"); - -describe("Issue #2681 — group-writable mutable-default contract", () => { - it("Dockerfile.base adds gateway to the sandbox group", () => { - expect(DOCKERFILE_BASE).toMatch(/usermod\s+-aG\s+sandbox\s+gateway/); - }); - it("Dockerfile.base makes /sandbox/.openclaw group-writable + setgid", () => { - expect(DOCKERFILE_BASE).toMatch(/chmod\s+-R\s+g\+w\s+\/sandbox\/\.openclaw/); - expect(DOCKERFILE_BASE).toMatch( - /find\s+\/sandbox\/\.openclaw\s+-type\s+d\s+-exec\s+chmod\s+g\+s/, - ); - }); +function readRepoFile(...parts: string[]): string { + return fs.readFileSync(path.join(ROOT, ...parts), "utf-8"); +} - it("Dockerfile has stale-base fallback that idempotently adds gateway to sandbox group", () => { - // Older base images won't have the usermod yet; the derived image must - // add it at build time so PR images work even before sandbox-base is - // rebuilt. - expect(DOCKERFILE).toMatch(/id\s+gateway/); - expect(DOCKERFILE).toMatch(/usermod\s+-aG\s+sandbox\s+gateway/); - }); +function dockerRunCommandBetween( + fileParts: string[], + startMarker: string, + endMarker: string, +): string { + const dockerfile = readRepoFile(...fileParts); + const start = dockerfile.indexOf(startMarker); + const end = dockerfile.indexOf(endMarker, start); + if (start === -1 || end === -1 || end <= start) { + throw new Error(`Expected Dockerfile block between ${startMarker} and ${endMarker}`); + } + const runIndex = dockerfile.indexOf("RUN ", start); + if (runIndex === -1 || runIndex > end) { + throw new Error(`Expected RUN instruction after ${startMarker}`); + } + return dockerfile + .slice(runIndex, end) + .trim() + .replace(/^RUN\s+/, "") + .replace(/\\\n/g, " "); +} - it("Dockerfile applies group-writable + setgid in the production image too", () => { - expect(DOCKERFILE).toMatch(/chmod\s+-R\s+g\+w\s+\/sandbox\/\.openclaw/); - expect(DOCKERFILE).toMatch( - /find\s+\/sandbox\/\.openclaw\s+-type\s+d\s+-exec\s+chmod\s+g\+s/, - ); - }); +function shellFunctionFromFile( + fileParts: string[], + functionName: string, + replaceSandboxWith?: string, +): string { + const source = readRepoFile(...fileParts); + const start = source.indexOf(`${functionName}()`); + const nextSection = source.indexOf("\n# ──", start + functionName.length); + if (start === -1 || nextSection === -1 || nextSection <= start) { + throw new Error(`Expected shell function ${functionName}`); + } + const body = source.slice(start, nextSection).trim(); + return replaceSandboxWith ? body.replaceAll("/sandbox", replaceSandboxWith) : body; +} - it("Dockerfile creates .config-hash group-writable (664), not read-only-for-group (644)", () => { - // Aaron's spec item 3 explicitly calls out "group-writable config/hash - // files". The .config-hash sha256 is created AFTER the recursive chmod - // g+w pass, so it gets its own explicit chmod. Lock it to 664 so a - // future change can't silently revert to 644 and break gateway writes. - expect(DOCKERFILE).toMatch(/chmod\s+664\s+\/sandbox\/\.openclaw\/\.config-hash/); - expect(DOCKERFILE).not.toMatch(/chmod\s+644\s+\/sandbox\/\.openclaw\/\.config-hash/); +function runScript( + tmpDir: string, + lines: string[], + env: NodeJS.ProcessEnv = {}, +): { result: ReturnType; log: string } { + const logPath = path.join(tmpDir, "calls.log"); + const scriptPath = path.join(tmpDir, "run.sh"); + fs.writeFileSync( + scriptPath, + ["#!/usr/bin/env bash", "set -euo pipefail", `CALL_LOG=${JSON.stringify(logPath)}`, ...lines] + .join("\n"), + { mode: 0o700 }, + ); + const result = spawnSync("bash", [scriptPath], { + encoding: "utf-8", + env: { ...process.env, ...env }, + timeout: 5000, }); + const log = fs.existsSync(logPath) ? fs.readFileSync(logPath, "utf-8") : ""; + return { result, log }; +} - it("Dockerfile does NOT introduce a mutateConfigFile EACCES swallow patch", () => { - // The PR explicitly replaces #2693's approach. If a future PR adds - // Patch 4b back, this test fails and forces re-evaluation. - expect(DOCKERFILE).not.toMatch(/Patch\s+4b/); - expect(DOCKERFILE).not.toMatch(/mutateConfigFile.*EACCES/); - expect(DOCKERFILE).not.toMatch(/mutation not persisted/); - }); +function rewriteSandbox(command: string, sandboxRoot: string): string { + return command.replaceAll("/sandbox", sandboxRoot); +} - it("nemoclaw-start.sh defines normalize_mutable_config_perms", () => { - expect(NEMOCLAW_START).toMatch(/normalize_mutable_config_perms\s*\(\)/); - }); +function mode(pathname: string): number { + return fs.statSync(pathname).mode & 0o7777; +} - it("normalize_mutable_config_perms skips when shields are UP (root-owned config dir)", () => { - // The function must check ownership before chmod-ing; if shields are - // up the dir is root-owned and normalizing would weaken the lock. - const fnIdx = NEMOCLAW_START.indexOf("normalize_mutable_config_perms()"); - expect(fnIdx).toBeGreaterThan(0); - const fnBody = NEMOCLAW_START.slice(fnIdx, fnIdx + 1500); - expect(fnBody).toMatch(/stat\s+-c\s+'%U'/); - expect(fnBody).toMatch(/= "root"/); - }); +describe("Issue #2681 group-writable mutable-default contract", () => { + it("executes base-image user and layout setup with shared sandbox-group access", () => { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-2681-base-")); + const sandboxRoot = path.join(tmpDir, "sandbox"); - it("normalize_mutable_config_perms is called in the root-mode startup path", () => { - expect(NEMOCLAW_START).toMatch(/normalize_mutable_config_perms\b[^(]/); + try { + fs.mkdirSync(sandboxRoot, { recursive: true }); + const users = runScript(tmpDir, [ + 'groupadd() { printf "groupadd %s\\n" "$*" >> "$CALL_LOG"; }', + 'useradd() { printf "useradd %s\\n" "$*" >> "$CALL_LOG"; }', + 'usermod() { printf "usermod %s\\n" "$*" >> "$CALL_LOG"; }', + 'chown() { printf "chown %s\\n" "$*" >> "$CALL_LOG"; }', + rewriteSandbox( + dockerRunCommandBetween( + ["Dockerfile.base"], + "# Create sandbox user", + "# Create .openclaw", + ), + sandboxRoot, + ), + ]); + expect(users.result.status).toBe(0); + expect(users.log).toContain("usermod -aG sandbox gateway"); + + const layout = runScript(tmpDir, [ + 'chown() { printf "chown %s\\n" "$*" >> "$CALL_LOG"; }', + rewriteSandbox( + dockerRunCommandBetween( + ["Dockerfile.base"], + "# Create .openclaw with all state subdirs directly", + "# Pre-create shell init files", + ), + sandboxRoot, + ), + ]); + const openclawDir = path.join(sandboxRoot, ".openclaw"); + expect(layout.result.status).toBe(0); + expect(mode(openclawDir) & 0o020).not.toBe(0); + expect(mode(openclawDir) & 0o2000).not.toBe(0); + } finally { + fs.rmSync(tmpDir, { recursive: true, force: true }); + } }); - it("shields.ts unlock path uses group-writable file mode (660) + setgid dir (2770) for openclaw", () => { - // Pre-#2681 openclaw unlock used 600/700 which stripped group-write. - // After this PR openclaw uses 660/2770 so the gateway UID (member of - // sandbox group) can write OpenClaw config. Hermes is left unchanged - // (no separate gateway UID, so the shared-group contract doesn't apply). - expect(SHIELDS_TS).toMatch(/agentName === "hermes" \? "640" : "660"/); - expect(SHIELDS_TS).toMatch(/agentName === "hermes" \? "750" : "2770"/); + it("executes production-image fallback setup without losing group-writable config files", () => { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-2681-prod-")); + const sandboxRoot = path.join(tmpDir, "sandbox"); + const openclawDir = path.join(sandboxRoot, ".openclaw"); + + try { + fs.mkdirSync(openclawDir, { recursive: true }); + fs.writeFileSync(path.join(openclawDir, "openclaw.json"), "{}\n"); + + const fallback = runScript(tmpDir, [ + 'id() { if [ "${1:-}" = "-nG" ]; then printf "gateway\\n"; return 0; fi; return 0; }', + 'usermod() { printf "usermod %s\\n" "$*" >> "$CALL_LOG"; }', + dockerRunCommandBetween( + ["Dockerfile"], + "# Stale-base fallback for the gateway-in-sandbox-group setup", + "# Keep the image readable", + ), + ]); + expect(fallback.result.status).toBe(0); + expect(fallback.log).toContain("usermod -aG sandbox gateway"); + + const layout = runScript(tmpDir, [ + 'chown() { printf "chown %s\\n" "$*" >> "$CALL_LOG"; }', + rewriteSandbox( + dockerRunCommandBetween( + ["Dockerfile"], + "# `chmod g+w` + setgid", + "# Pin config hash", + ), + sandboxRoot, + ), + ]); + expect(layout.result.status).toBe(0); + expect(mode(openclawDir) & 0o020).not.toBe(0); + expect(mode(openclawDir) & 0o2000).not.toBe(0); + + const hash = runScript(tmpDir, [ + 'chown() { printf "chown %s\\n" "$*" >> "$CALL_LOG"; }', + 'sha256sum() { printf "hash %s\\n" "$1"; }', + rewriteSandbox( + dockerRunCommandBetween( + ["Dockerfile"], + "# Pin config hash", + "# DAC-protect .nemoclaw directory", + ), + sandboxRoot, + ), + ]); + expect(hash.result.status).toBe(0); + expect((mode(path.join(openclawDir, ".config-hash")) & 0o777).toString(8)).toBe("664"); + } finally { + fs.rmSync(tmpDir, { recursive: true, force: true }); + } }); - it("applyStateDirLockMode re-adds group-write when unlocking (shields down)", () => { - // The chmod in the unlock path must explicitly RE-ADD group-write, - // not just preserve it. A prior `chmod -R go-w` from shields-up - // already stripped g+w from descendants, so unlock must use `g+w,o-w` - // to restore the group-writable contract on the whole tree. - expect(SHIELDS_TS).toMatch(/isLocking\s*\?\s*"go-w"\s*:\s*"g\+w,o-w"/); + it("normalizes mutable config permissions only when the config dir is not locked", () => { + const runCase = (owner: "root" | "sandbox") => { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), `nemoclaw-2681-normalize-${owner}-`)); + const sandboxRoot = path.join(tmpDir, "sandbox"); + const openclawDir = path.join(sandboxRoot, ".openclaw"); + try { + fs.mkdirSync(path.join(openclawDir, "workspace"), { recursive: true }); + fs.chmodSync(openclawDir, 0o755); + fs.chmodSync(path.join(openclawDir, "workspace"), 0o755); + const script = shellFunctionFromFile( + ["scripts", "nemoclaw-start.sh"], + "normalize_mutable_config_perms", + sandboxRoot, + ); + const { result } = runScript( + tmpDir, + [ + 'id() { if [ "${1:-}" = "-u" ]; then printf "0"; return 0; fi; command id "$@"; }', + 'stat() { if [ "${1:-}" = "-c" ] || [ "${1:-}" = "-f" ]; then printf "%s\\n" "$CONFIG_OWNER"; return 0; fi; command stat "$@"; }', + script, + "normalize_mutable_config_perms", + ], + { CONFIG_OWNER: owner }, + ); + return { result, openclawDir, tmpDir }; + } catch (err) { + fs.rmSync(tmpDir, { recursive: true, force: true }); + throw err; + } + }; + + const locked = runCase("root"); + try { + expect(locked.result.status).toBe(0); + expect(mode(locked.openclawDir) & 0o020).toBe(0); + expect(mode(locked.openclawDir) & 0o2000).toBe(0); + } finally { + fs.rmSync(locked.tmpDir, { recursive: true, force: true }); + } + + const unlocked = runCase("sandbox"); + try { + expect(unlocked.result.status).toBe(0); + expect(mode(unlocked.openclawDir) & 0o020).not.toBe(0); + expect(mode(unlocked.openclawDir) & 0o2000).not.toBe(0); + } finally { + fs.rmSync(unlocked.tmpDir, { recursive: true, force: true }); + } }); }); From cd5be3032b9d2bdbb16b06998364e100893701d6 Mon Sep 17 00:00:00 2001 From: Aaron Erickson Date: Fri, 1 May 2026 16:43:32 -0700 Subject: [PATCH 06/14] test(wsl): simulate mutable OpenClaw config owner Signed-off-by: Aaron Erickson --- test/repro-2681-group-writable.test.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/test/repro-2681-group-writable.test.ts b/test/repro-2681-group-writable.test.ts index 9d6701fadc..01c7c344b8 100644 --- a/test/repro-2681-group-writable.test.ts +++ b/test/repro-2681-group-writable.test.ts @@ -61,6 +61,7 @@ describe("Issue #2681 — mutable OpenClaw config permissions", () => { [ "set -euo pipefail", 'id() { if [ "${1:-}" = "-u" ]; then printf "0"; else command id "$@"; fi; }', + 'stat() { if [ "${1:-}" = "-c" ] && [ "${2:-}" = "%U" ]; then printf "sandbox\\n"; else command stat "$@"; fi; }', normalizeMutableConfigPermsFor(configDir), "normalize_mutable_config_perms", ].join("\n"), From c15a003254a67d05a140b6cdf59a1d8f9f9aeac1 Mon Sep 17 00:00:00 2001 From: Aaron Erickson Date: Fri, 1 May 2026 17:23:15 -0700 Subject: [PATCH 07/14] test(wsl): avoid slow shields subprocess Signed-off-by: Aaron Erickson --- test/repro-2681-group-writable.test.ts | 99 ++++++++++++++------------ 1 file changed, 55 insertions(+), 44 deletions(-) diff --git a/test/repro-2681-group-writable.test.ts b/test/repro-2681-group-writable.test.ts index 01c7c344b8..0538ddcdd9 100644 --- a/test/repro-2681-group-writable.test.ts +++ b/test/repro-2681-group-writable.test.ts @@ -16,7 +16,7 @@ import fs from "node:fs"; import os from "node:os"; import path from "node:path"; import { spawnSync } from "node:child_process"; -import { describe, expect, it } from "vitest"; +import { describe, expect, it, vi } from "vitest"; const START_SCRIPT = path.join(import.meta.dirname, "..", "scripts", "nemoclaw-start.sh"); @@ -40,6 +40,38 @@ function modeBits(filePath: string): number { return fs.statSync(filePath).mode; } +function withMockedDockerExecFileSync(calls: string[][], run: () => T): T { + // eslint-disable-next-line @typescript-eslint/no-require-imports + const dockerExecModule = require("../dist/lib/docker/exec.js") as { + dockerExecFileSync: (args: readonly string[]) => string; + }; + const originalDockerExecFileSync = dockerExecModule.dockerExecFileSync; + const shieldsModulePath = require.resolve("../dist/lib/shields.js"); + delete require.cache[shieldsModulePath]; + + dockerExecModule.dockerExecFileSync = vi.fn((args: readonly string[]) => { + const separator = args.indexOf("--"); + const command = separator >= 0 ? args.slice(separator + 1) : [...args]; + calls.push(command); + if (command[0] === "stat" && command[1] === "-c") { + return command.at(-1) === "/sandbox/.openclaw" + ? "2770 sandbox:sandbox\n" + : "660 sandbox:sandbox\n"; + } + if (command[0] === "lsattr") { + return `---------------------- ${command.at(-1)}\n`; + } + return ""; + }); + + try { + return run(); + } finally { + dockerExecModule.dockerExecFileSync = originalDockerExecFileSync; + delete require.cache[shieldsModulePath]; + } +} + describe("Issue #2681 — mutable OpenClaw config permissions", () => { it("restores group-write and setgid on mutable config trees during root startup", () => { const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-2681-perms-")); @@ -80,50 +112,29 @@ describe("Issue #2681 — mutable OpenClaw config permissions", () => { }); it("shields-down restores OpenClaw group-writable file modes and setgid dirs", () => { - const probe = spawnSync( - process.execPath, - [ - "-e", - String.raw` -const Module = require("node:module"); -const originalLoad = Module._load; -const calls = []; -Module._load = function patchedLoad(request, parent, isMain) { - if (request === "./docker/exec") { - return { - dockerExecFileSync(args) { - const separator = args.indexOf("--"); - const command = separator >= 0 ? args.slice(separator + 1) : args; - calls.push(command); - if (command[0] === "stat" && command[1] === "-c") { - return command.at(-1) === "/sandbox/.openclaw" - ? "2770 sandbox:sandbox\n" - : "660 sandbox:sandbox\n"; - } - if (command[0] === "lsattr") { - return "---------------------- " + command.at(-1) + "\n"; - } - return ""; - }, - }; - } - return originalLoad.call(this, request, parent, isMain); -}; -const { unlockAgentConfig } = require("./dist/lib/shields.js"); -unlockAgentConfig("sandbox-pod", { - agentName: "openclaw", - configPath: "/sandbox/.openclaw/openclaw.json", - configDir: "/sandbox/.openclaw", - sensitiveFiles: ["/sandbox/.openclaw/.config-hash"], -}); -process.stdout.write(JSON.stringify(calls)); -`, - ], - { encoding: "utf-8", timeout: 5000 }, - ); + const commands: string[][] = []; + withMockedDockerExecFileSync(commands, () => { + // eslint-disable-next-line @typescript-eslint/no-require-imports + const { unlockAgentConfig } = require("../dist/lib/shields.js") as { + unlockAgentConfig: ( + sandboxName: string, + target: { + agentName?: string; + configPath: string; + configDir: string; + sensitiveFiles?: string[]; + }, + ) => void; + }; + + unlockAgentConfig("sandbox-pod", { + agentName: "openclaw", + configPath: "/sandbox/.openclaw/openclaw.json", + configDir: "/sandbox/.openclaw", + sensitiveFiles: ["/sandbox/.openclaw/.config-hash"], + }); + }); - expect(probe.status).toBe(0); - const commands = JSON.parse(probe.stdout) as string[][]; expect(commands).toContainEqual(["chmod", "660", "/sandbox/.openclaw/openclaw.json"]); expect(commands).toContainEqual(["chmod", "660", "/sandbox/.openclaw/.config-hash"]); expect(commands).toContainEqual(["chmod", "2770", "/sandbox/.openclaw"]); From 1d137c6cf2024766a530d0d963c957e76038cb97 Mon Sep 17 00:00:00 2001 From: Aaron Erickson Date: Fri, 1 May 2026 17:54:34 -0700 Subject: [PATCH 08/14] fix(recovery): address coderabbit follow-ups Signed-off-by: Aaron Erickson --- src/lib/agent-runtime.test.ts | 15 +++++--- src/lib/agent-runtime.ts | 25 ++++++++----- src/nemoclaw.ts | 66 ++++++++++++++++++++--------------- test/cli.test.ts | 43 ++++++++++++++++++++++- 4 files changed, 107 insertions(+), 42 deletions(-) diff --git a/src/lib/agent-runtime.test.ts b/src/lib/agent-runtime.test.ts index c5a2bd09db..b11126d1ef 100644 --- a/src/lib/agent-runtime.test.ts +++ b/src/lib/agent-runtime.test.ts @@ -123,19 +123,22 @@ describe("buildRecoveryScript", () => { it("writes the warning to gateway.log so it persists for sysadmin tail", () => { const script = buildRecoveryScript(minimalAgent, 19000); - // Both warnings must end up in /tmp/gateway.log, not just stderr — + // Both warnings must end up in the selected gateway log, not just stderr — // executeSandboxCommand silently discards stderr from the recovery // script, so a warning that only goes to stderr is invisible to // anyone debugging a crash-loop. (#2478) - expect(script).toContain('echo "$_W" >> /tmp/gateway.log'); + expect(script).toContain('echo "$_W" >> "$_GATEWAY_LOG"'); // And the warning must be deferred until AFTER gateway.log is // safely opened with O_NOFOLLOW, otherwise the redirect targets a // stale or attacker-controlled file. const gatewayPrepIdx = script!.indexOf(" /tmp/gateway.log || exit 1;"); - const warnIdx = script!.indexOf('echo "$_W" >> /tmp/gateway.log'); + const logSelectionIdx = script!.indexOf("_GATEWAY_LOG=/tmp/gateway.log"); + const warnIdx = script!.indexOf('echo "$_W" >> "$_GATEWAY_LOG"'); expect(gatewayPrepIdx).toBeGreaterThanOrEqual(0); + expect(logSelectionIdx).toBeGreaterThanOrEqual(0); expect(warnIdx).toBeGreaterThanOrEqual(0); - expect(gatewayPrepIdx).toBeLessThan(warnIdx); + expect(gatewayPrepIdx).toBeLessThan(logSelectionIdx); + expect(logSelectionIdx).toBeLessThan(warnIdx); }); it("stops recovery when hardened log setup fails", () => { @@ -158,6 +161,10 @@ describe("buildRecoveryScript", () => { expect(script).not.toContain("rm -f /tmp/gateway.log"); expect(script).toContain("_GATEWAY_LOG=/tmp/gateway.log"); expect(script).toContain("_GATEWAY_LOG=/tmp/gateway-recovery.log"); + expect(script).toContain('echo "$_W" >> "$_GATEWAY_LOG"'); + expect(script).toContain('tail -5 "$_GATEWAY_LOG"'); + expect(script).not.toContain('echo "$_W" >> /tmp/gateway.log'); + expect(script).not.toContain("cat /tmp/gateway.log"); }); it("rejects a symlinked gateway.log before preparing the log", () => { diff --git a/src/lib/agent-runtime.ts b/src/lib/agent-runtime.ts index 71dc29b9a0..a4ae7e7dc1 100644 --- a/src/lib/agent-runtime.ts +++ b/src/lib/agent-runtime.ts @@ -108,12 +108,17 @@ function buildGatewayLogSetup(includeAutoPairLog = false, logOwnerUser?: string) return lines; } +function buildGatewayLogSelection(): string { + return '_GATEWAY_LOG=/tmp/gateway.log; if ! : >> "$_GATEWAY_LOG" 2>/dev/null; then _GATEWAY_LOG=/tmp/gateway-recovery.log; : >> "$_GATEWAY_LOG" 2>/dev/null || true; fi;'; +} + function gatewayLaunchCommand(command: string, runAsUser?: string): string { - const userLaunch = `_GATEWAY_LOG=/tmp/gateway.log; if ! : >> "$_GATEWAY_LOG" 2>/dev/null; then _GATEWAY_LOG=/tmp/gateway-recovery.log; : > "$_GATEWAY_LOG" 2>/dev/null || true; fi; nohup ${command} >> "$_GATEWAY_LOG" 2>&1 &`; + const logSelection = buildGatewayLogSelection(); + const userLaunch = `nohup ${command} >> "$_GATEWAY_LOG" 2>&1 &`; if (!runAsUser) { - return userLaunch; + return `${logSelection} ${userLaunch}`; } - return `if [ "$(id -u)" = "0" ] && command -v gosu >/dev/null 2>&1 && id ${shellQuote(runAsUser)} >/dev/null 2>&1; then nohup gosu ${shellQuote(runAsUser)} ${command} >> /tmp/gateway.log 2>&1 & else ${userLaunch} fi`; + return `${logSelection} if [ "$(id -u)" = "0" ] && command -v gosu >/dev/null 2>&1 && id ${shellQuote(runAsUser)} >/dev/null 2>&1; then nohup gosu ${shellQuote(runAsUser)} ${command} >> "$_GATEWAY_LOG" 2>&1 & else ${userLaunch} fi`; } /** @@ -127,13 +132,14 @@ export function buildOpenClawRecoveryScript(port: number): string { `if curl -sf --max-time 3 http://127.0.0.1:${port}/ > /dev/null 2>&1; then echo ALREADY_RUNNING; exit 0; fi;`, "rm -rf /tmp/openclaw-*/gateway.*.lock 2>/dev/null;", ...buildGatewayLogSetup(true, "gateway"), - '[ "$_PE_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing - gateway launching without library guards (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };', - '[ "$_GUARDS_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: NODE_OPTIONS missing safety-net preload or ciao preload - gateway may crash on unhandled library errors (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };', + buildGatewayLogSelection(), + '[ "$_PE_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing - gateway launching without library guards (#2478)"; echo "$_W" >&2; echo "$_W" >> "$_GATEWAY_LOG"; };', + '[ "$_GUARDS_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: NODE_OPTIONS missing safety-net preload or ciao preload - gateway may crash on unhandled library errors (#2478)"; echo "$_W" >&2; echo "$_W" >> "$_GATEWAY_LOG"; };', 'OPENCLAW="$(command -v openclaw)";', 'if [ -z "$OPENCLAW" ]; then echo OPENCLAW_MISSING; exit 1; fi;', gatewayLaunchCommand('"$OPENCLAW" gateway run --port ' + port, "gateway"), "GPID=$!; sleep 2;", - 'if kill -0 "$GPID" 2>/dev/null; then echo "GATEWAY_PID=$GPID"; else echo GATEWAY_FAILED; cat /tmp/gateway.log 2>/dev/null | tail -5; fi', + 'if kill -0 "$GPID" 2>/dev/null; then echo "GATEWAY_PID=$GPID"; else echo GATEWAY_FAILED; tail -5 "$_GATEWAY_LOG" 2>/dev/null; fi', ].join(" "); } @@ -196,12 +202,13 @@ export function buildRecoveryScript(agent: AgentDefinition | null, port: number) hermesHome, `if curl -sf --max-time 3 ${shellQuote(probeUrl)} > /dev/null 2>&1; then echo ALREADY_RUNNING; exit 0; fi;`, ...buildGatewayLogSetup(false), - '[ "$_PE_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing - gateway launching without library guards (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };', - '[ "$_GUARDS_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: NODE_OPTIONS missing safety-net preload or ciao preload - gateway may crash on unhandled library errors (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };', + buildGatewayLogSelection(), + '[ "$_PE_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing - gateway launching without library guards (#2478)"; echo "$_W" >&2; echo "$_W" >> "$_GATEWAY_LOG"; };', + '[ "$_GUARDS_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: NODE_OPTIONS missing safety-net preload or ciao preload - gateway may crash on unhandled library errors (#2478)"; echo "$_W" >&2; echo "$_W" >> "$_GATEWAY_LOG"; };', ...validationSteps, launchCommand, "GPID=$!; sleep 2;", - 'if kill -0 "$GPID" 2>/dev/null; then echo "GATEWAY_PID=$GPID"; else echo GATEWAY_FAILED; cat /tmp/gateway.log 2>/dev/null | tail -5; fi', + 'if kill -0 "$GPID" 2>/dev/null; then echo "GATEWAY_PID=$GPID"; else echo GATEWAY_FAILED; tail -5 "$_GATEWAY_LOG" 2>/dev/null; fi', ].join(" "); } diff --git a/src/nemoclaw.ts b/src/nemoclaw.ts index 435492758e..91ef88d161 100644 --- a/src/nemoclaw.ts +++ b/src/nemoclaw.ts @@ -1223,6 +1223,12 @@ type SandboxConnectOptions = { probeOnly?: boolean; }; +const SANDBOX_CONNECT_FLAGS = new Set(["--dangerously-skip-permissions", "--probe-only", "--help", "-h"]); + +function isSandboxConnectFlag(arg: string | undefined): boolean { + return typeof arg === "string" && SANDBOX_CONNECT_FLAGS.has(arg); +} + function printSandboxConnectHelp(sandboxName = "") { console.log(""); console.log(` Usage: ${CLI_NAME} ${sandboxName} connect [--probe-only]`); @@ -1238,6 +1244,11 @@ function printSandboxConnectHelp(sandboxName = "") { function parseSandboxConnectArgs(sandboxName: string, actionArgs: string[]): SandboxConnectOptions { const options: SandboxConnectOptions = {}; for (const arg of actionArgs) { + if (!isSandboxConnectFlag(arg)) { + console.error(` Unknown flag for connect: ${arg}`); + printSandboxConnectHelp(sandboxName); + process.exit(1); + } switch (arg) { case "--dangerously-skip-permissions": console.error(" --dangerously-skip-permissions was removed; use shields commands instead."); @@ -1251,15 +1262,36 @@ function parseSandboxConnectArgs(sandboxName: string, actionArgs: string[]): San printSandboxConnectHelp(sandboxName); process.exit(0); break; - default: - console.error(` Unknown flag for connect: ${arg}`); - printSandboxConnectHelp(sandboxName); - process.exit(1); } } return options; } +function runSandboxConnectProbe(sandboxName: string): void { + const processCheck = checkAndRecoverSandboxProcesses(sandboxName, { quiet: true }); + const agent = agentRuntime.getSessionAgent(sandboxName); + const agentName = agentRuntime.getAgentDisplayName(agent); + if (!processCheck.checked) { + console.error( + ` Probe failed: could not inspect the ${agentName} gateway inside sandbox '${sandboxName}'.`, + ); + process.exit(1); + } + if (processCheck.wasRunning) { + console.log(` Probe complete: ${agentName} gateway is running in '${sandboxName}'.`); + return; + } + if (processCheck.recovered) { + console.log(` Probe complete: recovered ${agentName} gateway in '${sandboxName}'.`); + return; + } + console.error( + ` Probe failed: ${agentName} gateway is not running in '${sandboxName}' and automatic recovery failed.`, + ); + console.error(" Check /tmp/gateway.log inside the sandbox for details."); + process.exit(1); +} + async function sandboxConnect( sandboxName: string, { probeOnly = false }: SandboxConnectOptions = {}, @@ -1268,28 +1300,7 @@ async function sandboxConnect( await ensureLiveSandboxOrExit(sandboxName, { allowNonReadyPhase: true }); if (probeOnly) { - const processCheck = checkAndRecoverSandboxProcesses(sandboxName, { quiet: true }); - const agent = agentRuntime.getSessionAgent(sandboxName); - const agentName = agentRuntime.getAgentDisplayName(agent); - if (!processCheck.checked) { - console.error( - ` Probe failed: could not inspect the ${agentName} gateway inside sandbox '${sandboxName}'.`, - ); - process.exit(1); - } - if (processCheck.wasRunning) { - console.log(` Probe complete: ${agentName} gateway is running in '${sandboxName}'.`); - return; - } - if (processCheck.recovered) { - console.log(` Probe complete: recovered ${agentName} gateway in '${sandboxName}'.`); - return; - } - console.error( - ` Probe failed: ${agentName} gateway is not running in '${sandboxName}' and automatic recovery failed.`, - ); - console.error(" Check /tmp/gateway.log inside the sandbox for details."); - process.exit(1); + return runSandboxConnectProbe(sandboxName); } // Version staleness check — warn but don't block @@ -4151,8 +4162,7 @@ const [cmd, ...args] = process.argv.slice(2); // Sandbox-scoped commands: nemoclaw const firstSandboxArg = args[0]; - const implicitConnectArg = - firstSandboxArg === "--help" || firstSandboxArg === "-h" || firstSandboxArg === "--probe-only"; + const implicitConnectArg = isSandboxConnectFlag(firstSandboxArg); const requestedSandboxAction = !firstSandboxArg || implicitConnectArg ? "connect" : firstSandboxArg; const requestedSandboxActionArgs = !firstSandboxArg || implicitConnectArg ? args : args.slice(1); diff --git a/test/cli.test.ts b/test/cli.test.ts index f3ca38fed8..bed704109f 100644 --- a/test/cli.test.ts +++ b/test/cli.test.ts @@ -94,7 +94,33 @@ function readRecordedArgs(markerFile: string): string[] { return fs.readFileSync(markerFile, "utf8").trim().split(/\s+/); } -function writeSandboxRegistry(home: string, sandboxOverrides: Record = {}): void { +type SandboxEntry = { + name: string; + model: string; + provider: string; + gpuEnabled: boolean; + policies: string[]; + agent?: string; +}; + +function writeRecordingCommand( + binDir: string, + command: string, + markerFile: string, + exitCode: number, +): void { + fs.writeFileSync( + path.join(binDir, command), + [ + "#!/usr/bin/env bash", + `printf '%s\\n' "$*" >> ${JSON.stringify(markerFile)}`, + `exit ${exitCode}`, + ].join("\n"), + { mode: 0o755 }, + ); +} + +function writeSandboxRegistry(home: string, sandboxOverrides: Partial = {}): void { const registryDir = path.join(home, ".nemoclaw"); fs.mkdirSync(registryDir, { recursive: true }); fs.writeFileSync( @@ -1609,6 +1635,7 @@ describe("CLI dispatch", () => { const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-help-")); const localBin = path.join(home, "bin"); const markerFile = path.join(home, "openshell-calls"); + const sshMarkerFile = path.join(home, "ssh-calls"); fs.mkdirSync(localBin, { recursive: true }); fs.writeFileSync( path.join(localBin, "openshell"), @@ -1619,6 +1646,7 @@ describe("CLI dispatch", () => { ].join("\n"), { mode: 0o755 }, ); + writeRecordingCommand(localBin, "ssh", sshMarkerFile, 98); const r = runWithEnv("alpha connect --help", { HOME: home, @@ -1635,12 +1663,14 @@ describe("CLI dispatch", () => { expect(implicit.code).toBe(0); expect(implicit.out).toContain("Usage: nemoclaw alpha connect"); expect(fs.existsSync(markerFile)).toBe(false); + expect(fs.existsSync(sshMarkerFile)).toBe(false); }); it("rejects the removed skip-permissions connect flag", () => { const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-flags-")); const localBin = path.join(home, "bin"); const markerFile = path.join(home, "openshell-calls"); + const sshMarkerFile = path.join(home, "ssh-calls"); fs.mkdirSync(localBin, { recursive: true }); fs.writeFileSync( path.join(localBin, "openshell"), @@ -1651,6 +1681,7 @@ describe("CLI dispatch", () => { ].join("\n"), { mode: 0o755 }, ); + writeRecordingCommand(localBin, "ssh", sshMarkerFile, 98); writeSandboxRegistry(home); const r = runWithEnv("alpha connect --dangerously-skip-permissions", { @@ -1661,12 +1692,14 @@ describe("CLI dispatch", () => { expect(r.code).toBe(1); expect(r.out).toContain("--dangerously-skip-permissions was removed"); expect(fs.existsSync(markerFile)).toBe(false); + expect(fs.existsSync(sshMarkerFile)).toBe(false); }); it("connect --probe-only recovers the gateway without opening SSH", () => { const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-")); const localBin = path.join(home, "bin"); const markerFile = path.join(home, "openshell-calls"); + const sshMarkerFile = path.join(home, "ssh-calls"); const stateFile = path.join(home, "probe-state"); fs.mkdirSync(localBin, { recursive: true }); writeSandboxRegistry(home); @@ -1707,6 +1740,7 @@ describe("CLI dispatch", () => { ].join("\n"), { mode: 0o755 }, ); + writeRecordingCommand(localBin, "ssh", sshMarkerFile, 98); const r = runWithEnv("alpha connect --probe-only", { HOME: home, @@ -1720,12 +1754,14 @@ describe("CLI dispatch", () => { expect(calls.some((call) => call.startsWith("sandbox exec --name alpha -- sh -c"))).toBe(true); expect(calls).not.toContain("sandbox ssh-config alpha"); expect(calls).not.toContain("sandbox connect alpha"); + expect(fs.existsSync(sshMarkerFile)).toBe(false); }); it("treats leading --probe-only as an implicit connect probe", () => { const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-leading-")); const localBin = path.join(home, "bin"); const markerFile = path.join(home, "openshell-calls"); + const sshMarkerFile = path.join(home, "ssh-calls"); fs.mkdirSync(localBin, { recursive: true }); writeSandboxRegistry(home); fs.writeFileSync( @@ -1752,6 +1788,7 @@ describe("CLI dispatch", () => { ].join("\n"), { mode: 0o755 }, ); + writeRecordingCommand(localBin, "ssh", sshMarkerFile, 98); const r = runWithEnv("alpha --probe-only", { HOME: home, @@ -1765,12 +1802,14 @@ describe("CLI dispatch", () => { expect(calls.some((call) => call.startsWith("sandbox exec --name alpha -- sh -c"))).toBe(true); expect(calls).not.toContain("sandbox ssh-config alpha"); expect(calls).not.toContain("sandbox connect alpha"); + expect(fs.existsSync(sshMarkerFile)).toBe(false); }); it("connect --probe-only does not retry a failed sandbox exec recovery over SSH", () => { const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-no-ssh-")); const localBin = path.join(home, "bin"); const markerFile = path.join(home, "openshell-calls"); + const sshMarkerFile = path.join(home, "ssh-calls"); fs.mkdirSync(localBin, { recursive: true }); writeSandboxRegistry(home); fs.writeFileSync( @@ -1801,6 +1840,7 @@ describe("CLI dispatch", () => { ].join("\n"), { mode: 0o755 }, ); + writeRecordingCommand(localBin, "ssh", sshMarkerFile, 98); const r = runWithEnv("alpha connect --probe-only", { HOME: home, @@ -1812,6 +1852,7 @@ describe("CLI dispatch", () => { expect(calls).toContain("sandbox get alpha"); expect(calls.some((call) => call.startsWith("sandbox exec --name alpha -- sh -c"))).toBe(true); expect(calls).not.toContain("sandbox ssh-config alpha"); + expect(fs.existsSync(sshMarkerFile)).toBe(false); }); it("connect --probe-only falls back to SSH when sandbox exec never starts", () => { From 61bb8458dfbbd7378f17ef40b95896d1243dfaba Mon Sep 17 00:00:00 2001 From: Aaron Erickson Date: Fri, 1 May 2026 18:17:06 -0700 Subject: [PATCH 09/14] fix(recovery): stabilize e2e follow-ups Signed-off-by: Aaron Erickson --- src/lib/agent-onboard.test.ts | 24 +++++++++++++++-- src/lib/agent-onboard.ts | 9 ++++--- src/lib/agent-runtime.test.ts | 6 +++++ src/lib/agent-runtime.ts | 2 +- src/lib/shields.ts | 9 +++++++ test/e2e/test-shields-config.sh | 48 ++++++++++++++++++++++----------- test/shields.test.ts | 3 +++ 7 files changed, 78 insertions(+), 23 deletions(-) diff --git a/src/lib/agent-onboard.test.ts b/src/lib/agent-onboard.test.ts index 6e00cf830a..5b1bb0a110 100644 --- a/src/lib/agent-onboard.test.ts +++ b/src/lib/agent-onboard.test.ts @@ -5,7 +5,7 @@ import { describe, it, expect, beforeEach, afterEach, afterAll, vi } from "vites import fs from "node:fs"; import path from "node:path"; // Import from compiled dist/ so coverage is attributed correctly. -import { printDashboardUi } from "../../dist/lib/agent-onboard"; +import { printDashboardUi, verifyAgentBinaryAvailable } from "../../dist/lib/agent-onboard"; import type { AgentDefinition } from "./agent-defs"; function makeAgent(overrides: Partial = {}): AgentDefinition { @@ -128,10 +128,13 @@ describe("handleAgentSetup guards", () => { const source = fs.readFileSync(path.join(import.meta.dirname, "agent-onboard.ts"), "utf-8"); expect(source).toContain("verifyAgentBinaryAvailable"); + expect(source).toContain("[ -e ${shellQuote(binaryPath)} ]"); + expect(source).toContain("[ -x ${shellQuote(binaryPath)} ]"); expect(source).toContain( 'resolved="$(command -v ${shellQuote(executable)} 2>/dev/null || true)"', ); - expect(source).toContain('[ "$resolved" = ${shellQuote(binaryPath)} ]'); + expect(source).toContain('[ -z "$resolved" ] || [ "$resolved" = ${shellQuote(binaryPath)} ]'); + expect(source).not.toContain('[ -n "$resolved" ] || { echo not_found'); expect(source).toMatch( /"sandbox",\s*"exec",\s*"-n",\s*sandboxName,\s*"--",\s*"sh",\s*"-lc",\s*script/, ); @@ -150,4 +153,21 @@ describe("handleAgentSetup guards", () => { expect(source).toContain('parsed.status === "ok"'); expect(source).not.toContain('.includes("ok")'); }); + + it("accepts an executable configured binary path when PATH lookup is empty", () => { + let script = ""; + const result = verifyAgentBinaryAvailable( + "alpha", + makeAgent({ name: "hermes", binary_path: "/usr/local/bin/hermes" }), + (args) => { + script = String(args[7] || ""); + return "ok"; + }, + ); + + expect(result).toEqual({ available: true }); + expect(script).toContain("[ -e '/usr/local/bin/hermes' ]"); + expect(script).toContain("[ -x '/usr/local/bin/hermes' ]"); + expect(script).toContain('[ -z "$resolved" ] || [ "$resolved" = \'/usr/local/bin/hermes\' ]'); + }); }); diff --git a/src/lib/agent-onboard.ts b/src/lib/agent-onboard.ts index c5bad2dff1..d67af0801f 100644 --- a/src/lib/agent-onboard.ts +++ b/src/lib/agent-onboard.ts @@ -129,7 +129,8 @@ type AgentBinaryAvailability = resolvedPath?: string; }; -function verifyAgentBinaryAvailable( +// Exported for unit coverage of the sandbox-side guard without running onboarding. +export function verifyAgentBinaryAvailable( sandboxName: string, agent: AgentDefinition, runCaptureOpenshell: OnboardContext["runCaptureOpenshell"], @@ -138,10 +139,10 @@ function verifyAgentBinaryAvailable( const binaryPath = typeof agent.binary_path === "string" ? agent.binary_path.trim() : ""; const script = binaryPath ? [ - `resolved="$(command -v ${shellQuote(executable)} 2>/dev/null || true)"`, - `[ -n "$resolved" ] || { echo not_found; exit 1; }`, + `[ -e ${shellQuote(binaryPath)} ] || { echo not_found; exit 1; }`, `[ -x ${shellQuote(binaryPath)} ] || { echo not_executable; exit 1; }`, - `[ "$resolved" = ${shellQuote(binaryPath)} ] || { printf 'path_mismatch:%s\\n' "$resolved"; exit 1; }`, + `resolved="$(command -v ${shellQuote(executable)} 2>/dev/null || true)"`, + `[ -z "$resolved" ] || [ "$resolved" = ${shellQuote(binaryPath)} ] || { printf 'path_mismatch:%s\\n' "$resolved"; exit 1; }`, "echo ok", ].join(" && ") : `command -v ${shellQuote(executable)} >/dev/null 2>&1 && echo ok || echo not_found`; diff --git a/src/lib/agent-runtime.test.ts b/src/lib/agent-runtime.test.ts index b11126d1ef..965729842f 100644 --- a/src/lib/agent-runtime.test.ts +++ b/src/lib/agent-runtime.test.ts @@ -194,6 +194,12 @@ describe("buildRecoveryScript", () => { expect(script).toContain("gosu 'gateway'"); }); + it("terminates the conditional launch branch before capturing the gateway pid", () => { + const script = buildOpenClawRecoveryScript(18789); + expect(script).toContain(" fi; GPID=$!"); + expect(script).not.toContain(" fi GPID=$!"); + }); + it("prepares auto-pair.log without unlinking or following symlinks", () => { const script = buildOpenClawRecoveryScript(18789); expect(script).toContain("refusing to prepare symlinked /tmp/auto-pair.log"); diff --git a/src/lib/agent-runtime.ts b/src/lib/agent-runtime.ts index a4ae7e7dc1..7d01f3e05a 100644 --- a/src/lib/agent-runtime.ts +++ b/src/lib/agent-runtime.ts @@ -118,7 +118,7 @@ function gatewayLaunchCommand(command: string, runAsUser?: string): string { if (!runAsUser) { return `${logSelection} ${userLaunch}`; } - return `${logSelection} if [ "$(id -u)" = "0" ] && command -v gosu >/dev/null 2>&1 && id ${shellQuote(runAsUser)} >/dev/null 2>&1; then nohup gosu ${shellQuote(runAsUser)} ${command} >> "$_GATEWAY_LOG" 2>&1 & else ${userLaunch} fi`; + return `${logSelection} if [ "$(id -u)" = "0" ] && command -v gosu >/dev/null 2>&1 && id ${shellQuote(runAsUser)} >/dev/null 2>&1; then nohup gosu ${shellQuote(runAsUser)} ${command} >> "$_GATEWAY_LOG" 2>&1 & else ${userLaunch} fi;`; } /** diff --git a/src/lib/shields.ts b/src/lib/shields.ts index f53d281cbe..554bb87e30 100644 --- a/src/lib/shields.ts +++ b/src/lib/shields.ts @@ -279,6 +279,9 @@ function applyStateDirLockMode(sandboxName: string, configDir: string, owner: st } try { kubectlExec(sandboxName, ["chmod", dirMode, dirPath]); + if (isLocking) { + kubectlExec(sandboxName, ["chmod", "g-s", dirPath]); + } kubectlExec(sandboxName, ["chmod", "-R", writeStrip, dirPath]); } catch { // Silently skip @@ -301,6 +304,7 @@ for dir in "$config_dir"/workspace-*; do [ -d "$dir" ] || continue chown -R "$owner" "$dir" 2>/dev/null || true chmod "$dir_mode" "$dir" 2>/dev/null || true + if [ "$dir_mode" = "755" ]; then chmod g-s "$dir" 2>/dev/null || true; fi chmod -R "$write_strip" "$dir" 2>/dev/null || true done `, @@ -483,6 +487,11 @@ function lockAgentConfig( } catch { errors.push("chmod 755 config dir"); } + try { + kubectlExec(sandboxName, ["chmod", "g-s", target.configDir]); + } catch { + errors.push("chmod g-s config dir"); + } try { kubectlExec(sandboxName, ["chown", "root:root", target.configDir]); diff --git a/test/e2e/test-shields-config.sh b/test/e2e/test-shields-config.sh index 061741b430..499584c6c7 100755 --- a/test/e2e/test-shields-config.sh +++ b/test/e2e/test-shields-config.sh @@ -156,16 +156,22 @@ pass "NemoClaw installed (sandbox: $SANDBOX_NAME)" # ══════════════════════════════════════════════════════════════════ section "Phase 2: Config is writable (mutable default)" -# Verify file permissions — should start as 600 sandbox:sandbox +# Verify file permissions — fresh OpenClaw sandboxes start sandbox-owned. +# Depending on whether the image entrypoint ran as root, the mutable-default +# tree may already include the #2681 group-write/setgid contract or may only +# gain it after the first host-side shields-down. PERMS=$(openshell sandbox exec --name "${SANDBOX_NAME}" -- \ stat -c '%a %U:%G' "${CONFIG_PATH}" 2>/dev/null || true) info "Config perms (default): ${PERMS}" -if [ "$(echo "$PERMS" | awk '{print $1}')" = "600" ]; then - pass "Config file mode is 600 (mutable default)" -else - fail "Config file should start as mode 600: ${PERMS}" -fi +case "$(echo "$PERMS" | awk '{print $1}')" in + 600 | 660) + pass "Config file mode is mutable default (${PERMS})" + ;; + *) + fail "Config file should start as mutable mode 600 or 660: ${PERMS}" + ;; +esac if [ "$(echo "$PERMS" | awk '{print $2}')" = "sandbox:sandbox" ]; then pass "Config file owned by sandbox:sandbox (mutable default)" @@ -177,10 +183,19 @@ DIR_PERMS=$(openshell sandbox exec --name "${SANDBOX_NAME}" -- \ stat -c '%a %U:%G' "$(dirname "${CONFIG_PATH}")" 2>/dev/null || true) info "Config dir perms (default): ${DIR_PERMS}" -if [ "$(echo "$DIR_PERMS" | awk '{print $1}')" = "700" ]; then - pass "Config directory mode is 700 (mutable default)" +case "$(echo "$DIR_PERMS" | awk '{print $1}')" in + 700 | 2700 | 2770) + pass "Config directory mode is mutable default (${DIR_PERMS})" + ;; + *) + fail "Config directory should be mutable mode 700, 2700, or 2770: ${DIR_PERMS}" + ;; +esac + +if [ "$(echo "$DIR_PERMS" | awk '{print $2}')" = "sandbox:sandbox" ]; then + pass "Config directory owned by sandbox:sandbox (mutable default)" else - fail "Config directory should be mode 700: ${DIR_PERMS}" + fail "Config directory should be owned by sandbox:sandbox: ${DIR_PERMS}" fi STATUS_DEFAULT=$(nemoclaw "${SANDBOX_NAME}" shields status 2>&1) @@ -321,15 +336,16 @@ else fail "shields down did not report success: ${SHIELDS_DOWN_OUTPUT}" fi -# Check permissions changed — should be sandbox:sandbox 600/700 (doctor-aligned) +# Check permissions changed — OpenClaw uses group-writable + setgid mutable mode +# so the gateway UID and sandbox UID can both update config/state (#2681). PERMS_DOWN=$(openshell sandbox exec --name "${SANDBOX_NAME}" -- \ stat -c '%a %U:%G' "${CONFIG_PATH}" 2>/dev/null || true) info "Config perms (shields DOWN): ${PERMS_DOWN}" -if [ "$(echo "$PERMS_DOWN" | awk '{print $1}')" = "600" ]; then - pass "Config file mode is 600 (restored to mutable default)" +if [ "$(echo "$PERMS_DOWN" | awk '{print $1}')" = "660" ]; then + pass "Config file mode is 660 (restored to mutable group-writable default)" else - fail "Config file should be mode 600 after shields down: ${PERMS_DOWN}" + fail "Config file should be mode 660 after shields down: ${PERMS_DOWN}" fi if [ "$(echo "$PERMS_DOWN" | awk '{print $2}')" = "sandbox:sandbox" ]; then @@ -342,10 +358,10 @@ DIR_PERMS_DOWN=$(openshell sandbox exec --name "${SANDBOX_NAME}" -- \ stat -c '%a %U:%G' "$(dirname "${CONFIG_PATH}")" 2>/dev/null || true) info "Config dir perms (shields DOWN): ${DIR_PERMS_DOWN}" -if [ "$(echo "$DIR_PERMS_DOWN" | awk '{print $1}')" = "700" ]; then - pass "Config directory mode is 700 (restored to mutable default)" +if [ "$(echo "$DIR_PERMS_DOWN" | awk '{print $1}')" = "2770" ]; then + pass "Config directory mode is 2770 (restored to mutable setgid default)" else - fail "Config directory should be mode 700 after shields down: ${DIR_PERMS_DOWN}" + fail "Config directory should be mode 2770 after shields down: ${DIR_PERMS_DOWN}" fi if [ "$(echo "$DIR_PERMS_DOWN" | awk '{print $2}')" = "sandbox:sandbox" ]; then diff --git a/test/shields.test.ts b/test/shields.test.ts index 6d3d31102d..fa80e0d627 100644 --- a/test/shields.test.ts +++ b/test/shields.test.ts @@ -418,6 +418,9 @@ describe("NC-2227-05: shields.ts locks state directories", () => { expect(src).toContain("function applyStateDirLockMode"); expect(src).toContain("workspace-*"); expect(fnBody).toContain("applyStateDirLockMode"); + expect(fnBody).toContain('["chmod", "g-s", target.configDir]'); + expect(src).toContain('["chmod", "g-s", dirPath]'); + expect(src).toContain('if [ "$dir_mode" = "755" ]; then chmod g-s "$dir"'); expect(fnBody).toContain("chown"); expect(fnBody).toContain("root:root"); }); From b8e27d5a8d3fac11d22fc1a2dde094f09c773f06 Mon Sep 17 00:00:00 2001 From: Aaron Erickson Date: Fri, 1 May 2026 18:26:37 -0700 Subject: [PATCH 10/14] fix(shields): keep lock-down write stripping mandatory Signed-off-by: Aaron Erickson --- src/lib/shields.ts | 10 +++++++++- test/shields.test.ts | 1 + 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/lib/shields.ts b/src/lib/shields.ts index 554bb87e30..2bac74c4d5 100644 --- a/src/lib/shields.ts +++ b/src/lib/shields.ts @@ -279,9 +279,17 @@ function applyStateDirLockMode(sandboxName: string, configDir: string, owner: st } try { kubectlExec(sandboxName, ["chmod", dirMode, dirPath]); - if (isLocking) { + } catch { + // Silently skip + } + if (isLocking) { + try { kubectlExec(sandboxName, ["chmod", "g-s", dirPath]); + } catch { + // Best effort; do not skip recursive write stripping. } + } + try { kubectlExec(sandboxName, ["chmod", "-R", writeStrip, dirPath]); } catch { // Silently skip diff --git a/test/shields.test.ts b/test/shields.test.ts index fa80e0d627..49f5d73a64 100644 --- a/test/shields.test.ts +++ b/test/shields.test.ts @@ -420,6 +420,7 @@ describe("NC-2227-05: shields.ts locks state directories", () => { expect(fnBody).toContain("applyStateDirLockMode"); expect(fnBody).toContain('["chmod", "g-s", target.configDir]'); expect(src).toContain('["chmod", "g-s", dirPath]'); + expect(src).toContain("Best effort; do not skip recursive write stripping."); expect(src).toContain('if [ "$dir_mode" = "755" ]; then chmod g-s "$dir"'); expect(fnBody).toContain("chown"); expect(fnBody).toContain("root:root"); From 69a2656ba21ba0957924e183357d9dbe5abcc9a1 Mon Sep 17 00:00:00 2001 From: Aaron Erickson Date: Fri, 1 May 2026 18:38:37 -0700 Subject: [PATCH 11/14] fix(onboard): accept configured agent binary path --- src/lib/agent-onboard.test.ts | 22 +++++++++++++++++----- src/lib/agent-onboard.ts | 17 +---------------- 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/src/lib/agent-onboard.test.ts b/src/lib/agent-onboard.test.ts index 5b1bb0a110..e9abb6e521 100644 --- a/src/lib/agent-onboard.test.ts +++ b/src/lib/agent-onboard.test.ts @@ -130,11 +130,8 @@ describe("handleAgentSetup guards", () => { expect(source).toContain("verifyAgentBinaryAvailable"); expect(source).toContain("[ -e ${shellQuote(binaryPath)} ]"); expect(source).toContain("[ -x ${shellQuote(binaryPath)} ]"); - expect(source).toContain( - 'resolved="$(command -v ${shellQuote(executable)} 2>/dev/null || true)"', - ); - expect(source).toContain('[ -z "$resolved" ] || [ "$resolved" = ${shellQuote(binaryPath)} ]'); expect(source).not.toContain('[ -n "$resolved" ] || { echo not_found'); + expect(source).not.toContain("path_mismatch"); expect(source).toMatch( /"sandbox",\s*"exec",\s*"-n",\s*sandboxName,\s*"--",\s*"sh",\s*"-lc",\s*script/, ); @@ -168,6 +165,21 @@ describe("handleAgentSetup guards", () => { expect(result).toEqual({ available: true }); expect(script).toContain("[ -e '/usr/local/bin/hermes' ]"); expect(script).toContain("[ -x '/usr/local/bin/hermes' ]"); - expect(script).toContain('[ -z "$resolved" ] || [ "$resolved" = \'/usr/local/bin/hermes\' ]'); + expect(script).not.toContain("command -v 'hermes'"); + }); + + it("does not reject a configured binary when PATH resolves the symlink target", () => { + let script = ""; + const result = verifyAgentBinaryAvailable( + "alpha", + makeAgent({ name: "hermes", binary_path: "/usr/local/bin/hermes" }), + (args) => { + script = String(args[7] || ""); + return "ok"; + }, + ); + + expect(result).toEqual({ available: true }); + expect(script).not.toContain("path_mismatch"); }); }); diff --git a/src/lib/agent-onboard.ts b/src/lib/agent-onboard.ts index d67af0801f..aa536dccda 100644 --- a/src/lib/agent-onboard.ts +++ b/src/lib/agent-onboard.ts @@ -124,9 +124,8 @@ type AgentBinaryAvailability = | { available: true } | { available: false; - reason: "not_found" | "not_executable" | "path_mismatch"; + reason: "not_found" | "not_executable"; binaryPath?: string; - resolvedPath?: string; }; // Exported for unit coverage of the sandbox-side guard without running onboarding. @@ -141,8 +140,6 @@ export function verifyAgentBinaryAvailable( ? [ `[ -e ${shellQuote(binaryPath)} ] || { echo not_found; exit 1; }`, `[ -x ${shellQuote(binaryPath)} ] || { echo not_executable; exit 1; }`, - `resolved="$(command -v ${shellQuote(executable)} 2>/dev/null || true)"`, - `[ -z "$resolved" ] || [ "$resolved" = ${shellQuote(binaryPath)} ] || { printf 'path_mismatch:%s\\n' "$resolved"; exit 1; }`, "echo ok", ].join(" && ") : `command -v ${shellQuote(executable)} >/dev/null 2>&1 && echo ok || echo not_found`; @@ -157,15 +154,6 @@ export function verifyAgentBinaryAvailable( return { available: true }; } if (binaryPath && result) { - const mismatch = result.match(/path_mismatch:([^\n]+)/); - if (mismatch) { - return { - available: false, - reason: "path_mismatch", - binaryPath, - resolvedPath: mismatch[1].trim(), - }; - } if (result.includes("not_executable")) { return { available: false, reason: "not_executable", binaryPath }; } @@ -179,9 +167,6 @@ function describeAgentBinaryFailure( result: Exclude, ): string { const executable = agentExecutableName(agent); - if (result.reason === "path_mismatch") { - return `${agent.displayName} binary '${executable}' resolves to '${result.resolvedPath}', expected '${result.binaryPath}' inside sandbox '${sandboxName}'`; - } if (result.reason === "not_executable") { return `${agent.displayName} configured binary '${result.binaryPath}' is not executable inside sandbox '${sandboxName}'`; } From 92cebda54fe186176c186355591f6dc9daebfc60 Mon Sep 17 00:00:00 2001 From: Aaron Erickson Date: Fri, 1 May 2026 18:45:48 -0700 Subject: [PATCH 12/14] fix(connect): wait for recovered gateway readiness --- src/nemoclaw.ts | 38 +++++++++++++++++++++++++--- test/cli.test.ts | 66 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 3 deletions(-) diff --git a/src/nemoclaw.ts b/src/nemoclaw.ts index bcb4de3d59..d8fe3574ff 100644 --- a/src/nemoclaw.ts +++ b/src/nemoclaw.ts @@ -352,6 +352,38 @@ function recoverSandboxProcesses(sandboxName: string): boolean { return recoveredSsh(executeSandboxCommand(sandboxName, script)); } +function readNonNegativeNumberEnv(name: string, fallback: number): number { + const raw = process.env[name]; + if (raw === undefined || raw.trim() === "") return fallback; + const parsed = Number(raw); + return Number.isFinite(parsed) && parsed >= 0 ? parsed : fallback; +} + +function waitForRecoveredSandboxGateway(sandboxName: string): boolean { + const timeoutSeconds = readNonNegativeNumberEnv( + "NEMOCLAW_GATEWAY_RECOVERY_WAIT_SECONDS", + 30, + ); + const intervalSeconds = readNonNegativeNumberEnv( + "NEMOCLAW_GATEWAY_RECOVERY_POLL_INTERVAL_SECONDS", + 3, + ); + const attempts = + intervalSeconds > 0 + ? Math.max(1, Math.floor(timeoutSeconds / intervalSeconds) + 1) + : Math.max(1, Math.floor(timeoutSeconds) + 1); + + for (let attempt = 0; attempt < attempts; attempt += 1) { + if (isSandboxGatewayRunning(sandboxName) === true) { + return true; + } + if (attempt < attempts - 1) { + sleepSeconds(intervalSeconds); + } + } + return false; +} + /** * Re-establish the dashboard port forward to the sandbox. * Uses the agent's forward port when a non-OpenClaw agent is active. @@ -394,9 +426,9 @@ function checkAndRecoverSandboxProcesses( const recovered = recoverSandboxProcesses(sandboxName); if (recovered) { - // Wait for gateway to bind its HTTP port before declaring success - sleepSeconds(3); - if (isSandboxGatewayRunning(sandboxName) !== true) { + // Wait for gateway to bind its HTTP port before declaring success. The + // recovered process can be alive before the OpenAI-compatible API is ready. + if (!waitForRecoveredSandboxGateway(sandboxName)) { if (!quiet) { console.error(" Gateway process started but is not responding."); console.error(" Check /tmp/gateway.log inside the sandbox for details."); diff --git a/test/cli.test.ts b/test/cli.test.ts index 4aec7039fa..eccef65ecd 100644 --- a/test/cli.test.ts +++ b/test/cli.test.ts @@ -1822,6 +1822,72 @@ describe("CLI dispatch", () => { expect(fs.existsSync(sshMarkerFile)).toBe(false); }); + it("waits for recovered gateway health before failing probe-only", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-wait-")); + const localBin = path.join(home, "bin"); + const markerFile = path.join(home, "openshell-calls"); + const stateFile = path.join(home, "probe-state"); + const readyCountFile = path.join(home, "ready-count"); + fs.mkdirSync(localBin, { recursive: true }); + writeSandboxRegistry(home); + fs.writeFileSync(stateFile, "stopped"); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + `marker_file=${JSON.stringify(markerFile)}`, + `state_file=${JSON.stringify(stateFile)}`, + `ready_count_file=${JSON.stringify(readyCountFile)}`, + 'printf \'%s\\n\' "$*" >> "$marker_file"', + 'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then', + " echo 'Sandbox:'", + " echo", + " echo ' Id: abc'", + " echo ' Name: alpha'", + " echo ' Namespace: openshell'", + " echo ' Phase: Ready'", + " exit 0", + "fi", + 'if [ "$1" = "sandbox" ] && [ "$2" = "exec" ] && [ "$3" = "--name" ] && [ "$4" = "alpha" ]; then', + ' cmd="$8"', + ' case "$cmd" in', + ' *"OPENCLAW="*)', + ' echo recovered > "$state_file"', + " echo '__NEMOCLAW_SANDBOX_EXEC_STARTED__'", + " echo 'GATEWAY_PID=123'", + " exit 0", + " ;;", + " *'curl -sf'*)", + " echo '__NEMOCLAW_SANDBOX_EXEC_STARTED__'", + ' if [ "$(cat "$state_file")" != recovered ]; then echo STOPPED; exit 0; fi', + ' count=$(cat "$ready_count_file" 2>/dev/null || echo 0)', + " count=$((count + 1))", + ' echo "$count" > "$ready_count_file"', + ' if [ "$count" -ge 3 ]; then echo RUNNING; else echo STOPPED; fi', + " exit 0", + " ;;", + " esac", + "fi", + 'if [ "$1" = "forward" ]; then', + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + + const r = runWithEnv("alpha connect --probe-only", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + NEMOCLAW_GATEWAY_RECOVERY_WAIT_SECONDS: "3", + NEMOCLAW_GATEWAY_RECOVERY_POLL_INTERVAL_SECONDS: "0", + }); + + expect(r.code).toBe(0); + expect(r.out).toContain("Probe complete: recovered OpenClaw gateway"); + expect(fs.readFileSync(readyCountFile, "utf8").trim()).toBe("3"); + }); + it("treats leading --probe-only as an implicit connect probe", () => { const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-leading-")); const localBin = path.join(home, "bin"); From 46584a76decd18635c485e0aadedab01b536fd97 Mon Sep 17 00:00:00 2001 From: Aaron Erickson Date: Fri, 1 May 2026 19:44:43 -0700 Subject: [PATCH 13/14] fix(status): bound sandbox version probe --- src/lib/sandbox-version.test.ts | 6 ++++++ src/lib/sandbox-version.ts | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/lib/sandbox-version.test.ts b/src/lib/sandbox-version.test.ts index 9b79eb8008..cf844b5288 100644 --- a/src/lib/sandbox-version.test.ts +++ b/src/lib/sandbox-version.test.ts @@ -50,6 +50,7 @@ vi.mock("child_process", async (importOriginal) => { import { checkAgentVersion, formatStalenessWarning } from "./sandbox-version.js"; import * as registry from "./registry.js"; import { captureOpenshellCommand } from "./openshell.js"; +import { OPENSHELL_PROBE_TIMEOUT_MS } from "./openshell-timeouts.js"; import { spawnSync } from "child_process"; describe("checkAgentVersion", () => { @@ -130,6 +131,11 @@ describe("checkAgentVersion", () => { expect(result.detectionMethod).toBe("ssh-exec"); expect(result.sandboxVersion).toBe("2026.4.24"); expect(result.isStale).toBe(false); + expect(captureOpenshellCommand).toHaveBeenCalledWith( + "/usr/local/bin/openshell", + ["sandbox", "ssh-config", "test-sb"], + { ignoreError: true, timeout: OPENSHELL_PROBE_TIMEOUT_MS }, + ); // Should have cached the version in registry const updated = registry.getSandbox("test-sb"); diff --git a/src/lib/sandbox-version.ts b/src/lib/sandbox-version.ts index 6d24c8c6cd..add8c0d223 100644 --- a/src/lib/sandbox-version.ts +++ b/src/lib/sandbox-version.ts @@ -18,6 +18,7 @@ import * as registry from "./registry.js"; import { loadAgent } from "./agent-defs.js"; import { resolveOpenshell } from "./resolve-openshell.js"; import { captureOpenshellCommand } from "./openshell.js"; +import { OPENSHELL_PROBE_TIMEOUT_MS } from "./openshell-timeouts.js"; export interface VersionCheckResult { sandboxVersion: string | null; @@ -49,7 +50,7 @@ export function probeAgentVersion(sandboxName: string): string | null { const sshConfigResult = captureOpenshellCommand( openshellBinary, ["sandbox", "ssh-config", sandboxName], - { ignoreError: true }, + { ignoreError: true, timeout: OPENSHELL_PROBE_TIMEOUT_MS }, ); if (sshConfigResult.status !== 0) return null; From 71bfe6dcb5336d698ed09f9bd04deda6941af826 Mon Sep 17 00:00:00 2001 From: Aaron Erickson Date: Fri, 1 May 2026 20:53:46 -0700 Subject: [PATCH 14/14] fix(status): bound nim status probes --- src/lib/nim.test.ts | 41 +++++++++++++++++++++++++++++++++++++++++ src/lib/nim.ts | 13 ++++++++++--- test/cli.test.ts | 23 +++++++++++++++-------- 3 files changed, 66 insertions(+), 11 deletions(-) diff --git a/src/lib/nim.test.ts b/src/lib/nim.test.ts index 4501c0f6c0..d1eff93d54 100644 --- a/src/lib/nim.test.ts +++ b/src/lib/nim.test.ts @@ -50,6 +50,17 @@ function hasCurlTimeoutArgs(cmd: string | string[]): boolean { return cmd[0] === "curl" && cmd[connectTimeout + 1] === "5" && cmd[maxTime + 1] === "5"; } +function timeoutForCommand( + runCapture: Mock, + predicate: (cmd: string | string[]) => boolean, +): number | undefined { + const call = runCapture.mock.calls.find((mockCall) => { + const cmd = mockCall[0] as string | string[]; + return predicate(cmd); + }); + return (call?.[1] as { timeout?: number } | undefined)?.timeout; +} + describe("nim", () => { describe("listModels", () => { it("returns 5 models", () => { @@ -353,6 +364,18 @@ describe("nim", () => { true, ); expect(commands.some((c) => c[0] === "curl" && hasCurlTimeoutArgs(c))).toBe(true); + expect( + timeoutForCommand( + runCapture, + (c) => Array.isArray(c) && c[0] === "docker" && c.includes("inspect"), + ), + ).toBe(5000); + expect( + timeoutForCommand( + runCapture, + (c) => Array.isArray(c) && c[0] === "curl" && c.includes("http://127.0.0.1:9000/v1/models"), + ), + ).toBe(6000); } finally { restore(); } @@ -375,6 +398,18 @@ describe("nim", () => { expect(st).toMatchObject({ running: true, healthy: true, container: "foo", state: "running" }); expect(commands.some((c) => c[0] === "docker" && c.includes("port"))).toBe(true); + expect( + timeoutForCommand( + runCapture, + (c) => Array.isArray(c) && c[0] === "docker" && c.includes("inspect"), + ), + ).toBe(5000); + expect( + timeoutForCommand( + runCapture, + (c) => Array.isArray(c) && c[0] === "docker" && c.includes("port"), + ), + ).toBe(5000); } finally { restore(); } @@ -410,6 +445,12 @@ describe("nim", () => { try { const st = nimModule.nimStatusByName("foo"); expect(st).toMatchObject({ running: false, healthy: false, container: "foo", state: "exited" }); + expect( + timeoutForCommand( + runCapture, + (c) => Array.isArray(c) && c[0] === "docker" && c.includes("inspect"), + ), + ).toBe(5000); expect(runCapture.mock.calls).toHaveLength(1); } finally { restore(); diff --git a/src/lib/nim.ts b/src/lib/nim.ts index 068139a02a..e9a0265429 100644 --- a/src/lib/nim.ts +++ b/src/lib/nim.ts @@ -20,6 +20,7 @@ const nimImages = require("../../bin/lib/nim-images.json"); import { VLLM_PORT } from "./ports"; const UNIFIED_MEMORY_GPU_TAGS = ["GB10", "Thor", "Orin", "Xavier"]; +const NIM_STATUS_PROBE_TIMEOUT_MS = 5000; export interface NimModel { name: string; @@ -338,14 +339,20 @@ export function nimStatus(sandboxName: string, port?: number): NimStatus { export function nimStatusByName(name: string, port?: number): NimStatus { try { - const state = dockerContainerInspectFormat("{{.State.Status}}", name, { ignoreError: true }); + const state = dockerContainerInspectFormat("{{.State.Status}}", name, { + ignoreError: true, + timeout: NIM_STATUS_PROBE_TIMEOUT_MS, + }); if (!state) return { running: false, container: name }; let healthy = false; if (state === "running") { let resolvedHostPort = port != null ? Number(port) : 0; if (!resolvedHostPort) { - const mapping = dockerPort(name, "8000", { ignoreError: true }); + const mapping = dockerPort(name, "8000", { + ignoreError: true, + timeout: NIM_STATUS_PROBE_TIMEOUT_MS, + }); const m = mapping && mapping.match(/:(\d+)\s*$/); resolvedHostPort = m ? Number(m[1]) : VLLM_PORT; } @@ -359,7 +366,7 @@ export function nimStatusByName(name: string, port?: number): NimStatus { "5", `http://127.0.0.1:${resolvedHostPort}/v1/models`, ], - { ignoreError: true }, + { ignoreError: true, timeout: NIM_STATUS_PROBE_TIMEOUT_MS + 1000 }, ); healthy = !!health; } diff --git a/test/cli.test.ts b/test/cli.test.ts index 323b76a41e..88611a055a 100644 --- a/test/cli.test.ts +++ b/test/cli.test.ts @@ -247,6 +247,19 @@ function createDoctorTestSetup(prefix: string, openshellLines: string[], sandbox }; } +function createCloudflaredServiceDir(prefix: string): { sandboxName: string; serviceDir: string } { + const suffix = [ + process.pid.toString(36), + Date.now().toString(36), + Math.random().toString(36).slice(2, 10), + ].join("-"); + const sandboxName = `${prefix}${suffix}`; + const serviceDir = path.join("/tmp", `nemoclaw-services-${sandboxName}`); + fs.rmSync(serviceDir, { recursive: true, force: true }); + fs.mkdirSync(serviceDir, { recursive: true }); + return { sandboxName, serviceDir }; +} + function createDebugCommandTestEnv(prefix: string): Record { const home = fs.mkdtempSync(path.join(os.tmpdir(), prefix)); const localBin = path.join(home, "bin"); @@ -954,7 +967,7 @@ describe("CLI dispatch", () => { }); it("doctor treats a live non-cloudflared PID as stale", () => { - const sandboxName = `doctorpid-${process.pid}`; + const { sandboxName, serviceDir } = createCloudflaredServiceDir("doctorpid-"); const setup = createDoctorTestSetup( "nemoclaw-cli-doctor-wrong-cloudflared-pid-", [ @@ -967,9 +980,6 @@ describe("CLI dispatch", () => { ], sandboxName, ); - const serviceDir = path.join("/tmp", `nemoclaw-services-${sandboxName}`); - fs.rmSync(serviceDir, { recursive: true, force: true }); - fs.mkdirSync(serviceDir, { recursive: true }); const sleeper = spawn(process.execPath, ["-e", "setTimeout(() => {}, 30000)"], { stdio: "ignore", }); @@ -999,7 +1009,7 @@ describe("CLI dispatch", () => { }); it("doctor accepts a live cloudflared PID", () => { - const sandboxName = `doctorcloudflared-${process.pid}`; + const { sandboxName, serviceDir } = createCloudflaredServiceDir("doctorcloudflared-"); const setup = createDoctorTestSetup( "nemoclaw-cli-doctor-cloudflared-pid-", [ @@ -1012,11 +1022,8 @@ describe("CLI dispatch", () => { ], sandboxName, ); - const serviceDir = path.join("/tmp", `nemoclaw-services-${sandboxName}`); const shimDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cloudflared-shim-")); const cloudflaredBin = path.join(shimDir, "cloudflared"); - fs.rmSync(serviceDir, { recursive: true, force: true }); - fs.mkdirSync(serviceDir, { recursive: true }); fs.symlinkSync(process.execPath, cloudflaredBin); const sleeper = spawn(cloudflaredBin, ["-e", "setTimeout(() => {}, 30000)"], { stdio: "ignore",