diff --git a/scripts/nemoclaw-start.sh b/scripts/nemoclaw-start.sh index 8e7f84eaed..daddd5351d 100755 --- a/scripts/nemoclaw-start.sh +++ b/scripts/nemoclaw-start.sh @@ -230,7 +230,7 @@ normalize_mutable_config_perms() { # Detect shields-up. Config dir owned by root means shields are # currently locked; normalizing would weaken the contract. local config_dir_owner - config_dir_owner="$(stat -c '%U' "$config_dir" 2>/dev/null || echo unknown)" + config_dir_owner="$(stat -c '%U' "$config_dir" 2>/dev/null || stat -f '%Su' "$config_dir" 2>/dev/null || echo unknown)" if [ "$config_dir_owner" = "root" ]; then return 0 fi diff --git a/src/lib/agent-onboard.test.ts b/src/lib/agent-onboard.test.ts index 6e00cf830a..e9abb6e521 100644 --- a/src/lib/agent-onboard.test.ts +++ b/src/lib/agent-onboard.test.ts @@ -5,7 +5,7 @@ import { describe, it, expect, beforeEach, afterEach, afterAll, vi } from "vites import fs from "node:fs"; import path from "node:path"; // Import from compiled dist/ so coverage is attributed correctly. -import { printDashboardUi } from "../../dist/lib/agent-onboard"; +import { printDashboardUi, verifyAgentBinaryAvailable } from "../../dist/lib/agent-onboard"; import type { AgentDefinition } from "./agent-defs"; function makeAgent(overrides: Partial = {}): AgentDefinition { @@ -128,10 +128,10 @@ describe("handleAgentSetup guards", () => { const source = fs.readFileSync(path.join(import.meta.dirname, "agent-onboard.ts"), "utf-8"); expect(source).toContain("verifyAgentBinaryAvailable"); - expect(source).toContain( - 'resolved="$(command -v ${shellQuote(executable)} 2>/dev/null || true)"', - ); - expect(source).toContain('[ "$resolved" = ${shellQuote(binaryPath)} ]'); + expect(source).toContain("[ -e ${shellQuote(binaryPath)} ]"); + expect(source).toContain("[ -x ${shellQuote(binaryPath)} ]"); + expect(source).not.toContain('[ -n "$resolved" ] || { echo not_found'); + expect(source).not.toContain("path_mismatch"); expect(source).toMatch( /"sandbox",\s*"exec",\s*"-n",\s*sandboxName,\s*"--",\s*"sh",\s*"-lc",\s*script/, ); @@ -150,4 +150,36 @@ describe("handleAgentSetup guards", () => { expect(source).toContain('parsed.status === "ok"'); expect(source).not.toContain('.includes("ok")'); }); + + it("accepts an executable configured binary path when PATH lookup is empty", () => { + let script = ""; + const result = verifyAgentBinaryAvailable( + "alpha", + makeAgent({ name: "hermes", binary_path: "/usr/local/bin/hermes" }), + (args) => { + script = String(args[7] || ""); + return "ok"; + }, + ); + + expect(result).toEqual({ available: true }); + expect(script).toContain("[ -e '/usr/local/bin/hermes' ]"); + expect(script).toContain("[ -x '/usr/local/bin/hermes' ]"); + expect(script).not.toContain("command -v 'hermes'"); + }); + + it("does not reject a configured binary when PATH resolves the symlink target", () => { + let script = ""; + const result = verifyAgentBinaryAvailable( + "alpha", + makeAgent({ name: "hermes", binary_path: "/usr/local/bin/hermes" }), + (args) => { + script = String(args[7] || ""); + return "ok"; + }, + ); + + expect(result).toEqual({ available: true }); + expect(script).not.toContain("path_mismatch"); + }); }); diff --git a/src/lib/agent-onboard.ts b/src/lib/agent-onboard.ts index c5bad2dff1..aa536dccda 100644 --- a/src/lib/agent-onboard.ts +++ b/src/lib/agent-onboard.ts @@ -124,12 +124,12 @@ type AgentBinaryAvailability = | { available: true } | { available: false; - reason: "not_found" | "not_executable" | "path_mismatch"; + reason: "not_found" | "not_executable"; binaryPath?: string; - resolvedPath?: string; }; -function verifyAgentBinaryAvailable( +// Exported for unit coverage of the sandbox-side guard without running onboarding. +export function verifyAgentBinaryAvailable( sandboxName: string, agent: AgentDefinition, runCaptureOpenshell: OnboardContext["runCaptureOpenshell"], @@ -138,10 +138,8 @@ function verifyAgentBinaryAvailable( const binaryPath = typeof agent.binary_path === "string" ? agent.binary_path.trim() : ""; const script = binaryPath ? [ - `resolved="$(command -v ${shellQuote(executable)} 2>/dev/null || true)"`, - `[ -n "$resolved" ] || { echo not_found; exit 1; }`, + `[ -e ${shellQuote(binaryPath)} ] || { echo not_found; exit 1; }`, `[ -x ${shellQuote(binaryPath)} ] || { echo not_executable; exit 1; }`, - `[ "$resolved" = ${shellQuote(binaryPath)} ] || { printf 'path_mismatch:%s\\n' "$resolved"; exit 1; }`, "echo ok", ].join(" && ") : `command -v ${shellQuote(executable)} >/dev/null 2>&1 && echo ok || echo not_found`; @@ -156,15 +154,6 @@ function verifyAgentBinaryAvailable( return { available: true }; } if (binaryPath && result) { - const mismatch = result.match(/path_mismatch:([^\n]+)/); - if (mismatch) { - return { - available: false, - reason: "path_mismatch", - binaryPath, - resolvedPath: mismatch[1].trim(), - }; - } if (result.includes("not_executable")) { return { available: false, reason: "not_executable", binaryPath }; } @@ -178,9 +167,6 @@ function describeAgentBinaryFailure( result: Exclude, ): string { const executable = agentExecutableName(agent); - if (result.reason === "path_mismatch") { - return `${agent.displayName} binary '${executable}' resolves to '${result.resolvedPath}', expected '${result.binaryPath}' inside sandbox '${sandboxName}'`; - } if (result.reason === "not_executable") { return `${agent.displayName} configured binary '${result.binaryPath}' is not executable inside sandbox '${sandboxName}'`; } diff --git a/src/lib/agent-runtime.test.ts b/src/lib/agent-runtime.test.ts index 9ae9e7024f..965729842f 100644 --- a/src/lib/agent-runtime.test.ts +++ b/src/lib/agent-runtime.test.ts @@ -3,7 +3,7 @@ import { describe, it, expect } from "vitest"; // Import from compiled dist/ so coverage is attributed correctly. -import { buildRecoveryScript } from "../../dist/lib/agent-runtime"; +import { buildOpenClawRecoveryScript, buildRecoveryScript } from "../../dist/lib/agent-runtime"; import type { AgentDefinition } from "./agent-defs"; function makeAgent(overrides: Partial = {}): AgentDefinition { @@ -60,13 +60,13 @@ describe("buildRecoveryScript", () => { it("launches the default gateway command through the validated agent binary", () => { const script = buildRecoveryScript(minimalAgent, 19000); expect(script).toContain("command -v 'test-agent'"); - expect(script).toContain('nohup "$AGENT_BIN" gateway run --port 19000'); + expect(script).toContain('"$AGENT_BIN" gateway run --port 19000'); }); it("falls back to openclaw gateway run when gateway_command is absent", () => { const agent = makeAgent({ gateway_command: undefined }); const script = buildRecoveryScript(agent, 19000); - expect(script).toContain('nohup "$AGENT_BIN" gateway run --port 19000'); + expect(script).toContain('"$AGENT_BIN" gateway run --port 19000'); }); it("validates and launches custom gateway commands explicitly", () => { @@ -123,17 +123,28 @@ describe("buildRecoveryScript", () => { it("writes the warning to gateway.log so it persists for sysadmin tail", () => { const script = buildRecoveryScript(minimalAgent, 19000); - // Both warnings must end up in /tmp/gateway.log, not just stderr — + // Both warnings must end up in the selected gateway log, not just stderr — // executeSandboxCommand silently discards stderr from the recovery // script, so a warning that only goes to stderr is invisible to // anyone debugging a crash-loop. (#2478) - expect(script).toContain('echo "$_W" >> /tmp/gateway.log'); + expect(script).toContain('echo "$_W" >> "$_GATEWAY_LOG"'); // And the warning must be deferred until AFTER gateway.log is - // freshly touched/chmod'd, otherwise the redirect targets a stale - // file that gets removed seconds later. - const touchIdx = script!.indexOf("touch /tmp/gateway.log"); - const warnIdx = script!.indexOf('echo "$_W" >> /tmp/gateway.log'); - expect(touchIdx).toBeLessThan(warnIdx); + // safely opened with O_NOFOLLOW, otherwise the redirect targets a + // stale or attacker-controlled file. + const gatewayPrepIdx = script!.indexOf(" /tmp/gateway.log || exit 1;"); + const logSelectionIdx = script!.indexOf("_GATEWAY_LOG=/tmp/gateway.log"); + const warnIdx = script!.indexOf('echo "$_W" >> "$_GATEWAY_LOG"'); + expect(gatewayPrepIdx).toBeGreaterThanOrEqual(0); + expect(logSelectionIdx).toBeGreaterThanOrEqual(0); + expect(warnIdx).toBeGreaterThanOrEqual(0); + expect(gatewayPrepIdx).toBeLessThan(logSelectionIdx); + expect(logSelectionIdx).toBeLessThan(warnIdx); + }); + + it("stops recovery when hardened log setup fails", () => { + const script = buildOpenClawRecoveryScript(18789); + expect(script).toContain(" /tmp/gateway.log 'gateway' || exit 1;"); + expect(script).toContain(" /tmp/auto-pair.log 'sandbox' || exit 1;"); }); it("appends (not truncates) gateway.log on launch so warnings survive", () => { @@ -141,8 +152,72 @@ describe("buildRecoveryScript", () => { // Truncating with `>` wipes the [gateway-recovery] WARNING that the // recovery script wrote moments earlier — meaning a sysadmin tailing // gateway.log would see the eventual crash without the explanation. - expect(script).toContain(">> /tmp/gateway.log 2>&1 &"); + expect(script).toContain('>> "$_GATEWAY_LOG" 2>&1 &'); expect(script).not.toMatch(/[^>]> \/tmp\/gateway\.log 2>&1 &/); }); + + it("preserves an existing gateway.log and has a writable fallback log", () => { + const script = buildOpenClawRecoveryScript(18789); + expect(script).not.toContain("rm -f /tmp/gateway.log"); + expect(script).toContain("_GATEWAY_LOG=/tmp/gateway.log"); + expect(script).toContain("_GATEWAY_LOG=/tmp/gateway-recovery.log"); + expect(script).toContain('echo "$_W" >> "$_GATEWAY_LOG"'); + expect(script).toContain('tail -5 "$_GATEWAY_LOG"'); + expect(script).not.toContain('echo "$_W" >> /tmp/gateway.log'); + expect(script).not.toContain("cat /tmp/gateway.log"); + }); + + it("rejects a symlinked gateway.log before preparing the log", () => { + const script = buildOpenClawRecoveryScript(18789); + const noFollowIdx = script.indexOf("O_NOFOLLOW"); + const openIdx = script.indexOf("os.open(path, flags, 0o644)"); + const fchownIdx = script.indexOf("os.fchown(fd"); + expect(script).toContain("refusing to prepare symlinked /tmp/gateway.log"); + expect(script).toContain("sys.exit(1)"); + expect(script).not.toContain(": > /tmp/gateway.log"); + expect(script).not.toContain("chown 'gateway:gateway' /tmp/gateway.log"); + expect(noFollowIdx).toBeGreaterThanOrEqual(0); + expect(openIdx).toBeGreaterThanOrEqual(0); + expect(fchownIdx).toBeGreaterThanOrEqual(0); + expect(noFollowIdx).toBeLessThan(openIdx); + expect(openIdx).toBeLessThan(fchownIdx); + }); + + it("prepares gateway.log for the real gateway-owned sandbox log", () => { + const script = buildOpenClawRecoveryScript(18789); + expect(script).toContain("os.fchown(fd"); + expect(script).toContain("pw.pw_gid"); + expect(script).not.toContain("grp.getgrnam"); + expect(script).toContain("owner_mode = 0o644"); + expect(script).toContain("os.fchmod(fd, owner_mode)"); + expect(script).toContain("/tmp/gateway.log 'gateway'"); + expect(script).toContain("gosu 'gateway'"); + }); + + it("terminates the conditional launch branch before capturing the gateway pid", () => { + const script = buildOpenClawRecoveryScript(18789); + expect(script).toContain(" fi; GPID=$!"); + expect(script).not.toContain(" fi GPID=$!"); + }); + + it("prepares auto-pair.log without unlinking or following symlinks", () => { + const script = buildOpenClawRecoveryScript(18789); + expect(script).toContain("refusing to prepare symlinked /tmp/auto-pair.log"); + expect(script).toContain("/tmp/auto-pair.log 'sandbox'"); + expect(script).toContain("owner_mode = 0o600"); + expect(script).not.toContain("rm -f /tmp/auto-pair.log"); + expect(script).not.toContain(": > /tmp/auto-pair.log"); + expect(script).not.toContain("touch /tmp/auto-pair.log"); + expect(script).not.toContain("chown sandbox:sandbox /tmp/auto-pair.log"); + expect(script).not.toContain("chmod 600 /tmp/auto-pair.log"); + }); + + it("does not force non-OpenClaw agents to run as the gateway user", () => { + const script = buildRecoveryScript(minimalAgent, 19000); + expect(script).not.toContain("chown gateway:gateway /tmp/gateway.log"); + expect(script).not.toContain("chown 'gateway:gateway' /tmp/gateway.log"); + expect(script).not.toContain("gosu gateway"); + expect(script).not.toContain("gosu 'gateway'"); + }); }); }); diff --git a/src/lib/agent-runtime.ts b/src/lib/agent-runtime.ts index 8a22e557de..7d01f3e05a 100644 --- a/src/lib/agent-runtime.ts +++ b/src/lib/agent-runtime.ts @@ -48,6 +48,101 @@ export function getHealthProbeUrl(agent: AgentDefinition | null): string { return agent.healthProbe?.url || `http://127.0.0.1:${DASHBOARD_PORT}/`; } +function buildNoFollowLogSetupCommand( + path: string, + logOwnerUser?: string, + ownerMode = "0o644", +): string { + const displayPath = path.replace(/\\/g, "\\\\").replace(/'/g, "\\'"); + const prepareLog = [ + "import errno, os, pwd, stat, sys", + "path = sys.argv[1]", + "owner = sys.argv[2] if len(sys.argv) > 2 else ''", + `owner_mode = ${ownerMode}`, + "fallback_mode = 0o600", + "flags = os.O_WRONLY | os.O_CREAT | os.O_TRUNC | getattr(os, 'O_NOFOLLOW', 0)", + "try:", + " fd = os.open(path, flags, 0o644)", + "except OSError as exc:", + " if exc.errno == errno.ELOOP:", + ` print('[gateway-recovery] ERROR: refusing to prepare symlinked ${displayPath}', file=sys.stderr)`, + " sys.exit(1)", + " if exc.errno in (errno.EACCES, errno.EPERM):", + ` print('[gateway-recovery] ERROR: ${displayPath} is not writable by recovery user', file=sys.stderr)`, + " sys.exit(0)", + ` print(f'[gateway-recovery] ERROR: cannot prepare ${displayPath}: {exc}', file=sys.stderr)`, + " sys.exit(1)", + "try:", + " if not stat.S_ISREG(os.fstat(fd).st_mode):", + ` print('[gateway-recovery] ERROR: ${displayPath} is not a regular file', file=sys.stderr)`, + " sys.exit(1)", + " if owner and os.geteuid() == 0:", + " try:", + " pw = pwd.getpwnam(owner)", + " except KeyError:", + " os.fchmod(fd, fallback_mode)", + " else:", + " os.fchown(fd, pw.pw_uid, pw.pw_gid)", + " os.fchmod(fd, owner_mode)", + " else:", + " os.fchmod(fd, fallback_mode)", + "finally:", + " os.close(fd)", + ].join("\n"); + return [ + "python3", + "-c", + shellQuote(prepareLog), + path, + ...(logOwnerUser ? [shellQuote(logOwnerUser)] : []), + ].join(" "); +} + +function buildGatewayLogSetup(includeAutoPairLog = false, logOwnerUser?: string): string[] { + const lines = [`${buildNoFollowLogSetupCommand("/tmp/gateway.log", logOwnerUser)} || exit 1;`]; + if (includeAutoPairLog) { + lines.push( + `${buildNoFollowLogSetupCommand("/tmp/auto-pair.log", "sandbox", "0o600")} || exit 1;`, + ); + } + return lines; +} + +function buildGatewayLogSelection(): string { + return '_GATEWAY_LOG=/tmp/gateway.log; if ! : >> "$_GATEWAY_LOG" 2>/dev/null; then _GATEWAY_LOG=/tmp/gateway-recovery.log; : >> "$_GATEWAY_LOG" 2>/dev/null || true; fi;'; +} + +function gatewayLaunchCommand(command: string, runAsUser?: string): string { + const logSelection = buildGatewayLogSelection(); + const userLaunch = `nohup ${command} >> "$_GATEWAY_LOG" 2>&1 &`; + if (!runAsUser) { + return `${logSelection} ${userLaunch}`; + } + return `${logSelection} if [ "$(id -u)" = "0" ] && command -v gosu >/dev/null 2>&1 && id ${shellQuote(runAsUser)} >/dev/null 2>&1; then nohup gosu ${shellQuote(runAsUser)} ${command} >> "$_GATEWAY_LOG" 2>&1 & else ${userLaunch} fi;`; +} + +/** + * Build the OpenClaw recovery shell script used by the default sandbox. + */ +export function buildOpenClawRecoveryScript(port: number): string { + return [ + "if [ -r /tmp/nemoclaw-proxy-env.sh ]; then . /tmp/nemoclaw-proxy-env.sh; _PE_MISSING=0; else _PE_MISSING=1; fi;", + "[ -f ~/.bashrc ] && . ~/.bashrc;", + 'case "${NODE_OPTIONS:-}" in *nemoclaw-sandbox-safety-net*) _SN_MISSING=0 ;; *) _SN_MISSING=1 ;; esac; case "${NODE_OPTIONS:-}" in *nemoclaw-ciao-network-guard*) _CIAO_MISSING=0 ;; *) _CIAO_MISSING=1 ;; esac; if [ "$_SN_MISSING" = "0" ] && [ "$_CIAO_MISSING" = "0" ]; then _GUARDS_MISSING=0; else _GUARDS_MISSING=1; fi;', + `if curl -sf --max-time 3 http://127.0.0.1:${port}/ > /dev/null 2>&1; then echo ALREADY_RUNNING; exit 0; fi;`, + "rm -rf /tmp/openclaw-*/gateway.*.lock 2>/dev/null;", + ...buildGatewayLogSetup(true, "gateway"), + buildGatewayLogSelection(), + '[ "$_PE_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing - gateway launching without library guards (#2478)"; echo "$_W" >&2; echo "$_W" >> "$_GATEWAY_LOG"; };', + '[ "$_GUARDS_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: NODE_OPTIONS missing safety-net preload or ciao preload - gateway may crash on unhandled library errors (#2478)"; echo "$_W" >&2; echo "$_W" >> "$_GATEWAY_LOG"; };', + 'OPENCLAW="$(command -v openclaw)";', + 'if [ -z "$OPENCLAW" ]; then echo OPENCLAW_MISSING; exit 1; fi;', + gatewayLaunchCommand('"$OPENCLAW" gateway run --port ' + port, "gateway"), + "GPID=$!; sleep 2;", + 'if kill -0 "$GPID" 2>/dev/null; then echo "GATEWAY_PID=$GPID"; else echo GATEWAY_FAILED; tail -5 "$_GATEWAY_LOG" 2>/dev/null; fi', + ].join(" "); +} + /** * Build the recovery shell script for a non-OpenClaw agent. * Returns the script string, or null if agent is null (use existing inline @@ -78,8 +173,8 @@ export function buildRecoveryScript(agent: AgentDefinition | null, port: number) // *why* the gateway is about to crash gets wiped by the same launch // that's about to crash on a missing guard. (#2478) const launchCommand = usesValidatedBinary - ? `nohup "$AGENT_BIN" gateway run --port ${port} >> /tmp/gateway.log 2>&1 &` - : `nohup ${configuredGatewayCommand} --port ${port} >> /tmp/gateway.log 2>&1 &`; + ? gatewayLaunchCommand(`"$AGENT_BIN" gateway run --port ${port}`) + : gatewayLaunchCommand(`${configuredGatewayCommand} --port ${port}`); const isHermes = agent.name === "hermes"; const hermesHome = isHermes ? "export HERMES_HOME=/sandbox/.hermes; " : ""; @@ -106,14 +201,14 @@ export function buildRecoveryScript(agent: AgentDefinition | null, port: number) 'case "${NODE_OPTIONS:-}" in *nemoclaw-sandbox-safety-net*) _SN_MISSING=0 ;; *) _SN_MISSING=1 ;; esac; case "${NODE_OPTIONS:-}" in *nemoclaw-ciao-network-guard*) _CIAO_MISSING=0 ;; *) _CIAO_MISSING=1 ;; esac; if [ "$_SN_MISSING" = "0" ] && [ "$_CIAO_MISSING" = "0" ]; then _GUARDS_MISSING=0; else _GUARDS_MISSING=1; fi;', hermesHome, `if curl -sf --max-time 3 ${shellQuote(probeUrl)} > /dev/null 2>&1; then echo ALREADY_RUNNING; exit 0; fi;`, - "rm -f /tmp/gateway.log;", - "touch /tmp/gateway.log; chmod 600 /tmp/gateway.log;", - '[ "$_PE_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing — gateway launching without library guards (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };', - '[ "$_GUARDS_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: NODE_OPTIONS missing safety-net preload or ciao preload — gateway may crash on unhandled library errors (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };', + ...buildGatewayLogSetup(false), + buildGatewayLogSelection(), + '[ "$_PE_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing - gateway launching without library guards (#2478)"; echo "$_W" >&2; echo "$_W" >> "$_GATEWAY_LOG"; };', + '[ "$_GUARDS_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: NODE_OPTIONS missing safety-net preload or ciao preload - gateway may crash on unhandled library errors (#2478)"; echo "$_W" >&2; echo "$_W" >> "$_GATEWAY_LOG"; };', ...validationSteps, launchCommand, "GPID=$!; sleep 2;", - 'if kill -0 "$GPID" 2>/dev/null; then echo "GATEWAY_PID=$GPID"; else echo GATEWAY_FAILED; cat /tmp/gateway.log 2>/dev/null | tail -5; fi', + 'if kill -0 "$GPID" 2>/dev/null; then echo "GATEWAY_PID=$GPID"; else echo GATEWAY_FAILED; tail -5 "$_GATEWAY_LOG" 2>/dev/null; fi', ].join(" "); } diff --git a/src/lib/command-registry.ts b/src/lib/command-registry.ts index df809c67b7..31014192dc 100644 --- a/src/lib/command-registry.ts +++ b/src/lib/command-registry.ts @@ -98,6 +98,7 @@ export const COMMANDS: readonly CommandDef[] = [ { usage: "nemoclaw connect", description: "Shell into a running sandbox", + flags: "[--probe-only]", group: "Sandbox Management", scope: "sandbox", }, @@ -427,7 +428,6 @@ export const COMMANDS: readonly CommandDef[] = [ scope: "global", hidden: true, }, - ] as const; /** All global-scope commands. */ diff --git a/src/lib/nim.test.ts b/src/lib/nim.test.ts index 4501c0f6c0..d1eff93d54 100644 --- a/src/lib/nim.test.ts +++ b/src/lib/nim.test.ts @@ -50,6 +50,17 @@ function hasCurlTimeoutArgs(cmd: string | string[]): boolean { return cmd[0] === "curl" && cmd[connectTimeout + 1] === "5" && cmd[maxTime + 1] === "5"; } +function timeoutForCommand( + runCapture: Mock, + predicate: (cmd: string | string[]) => boolean, +): number | undefined { + const call = runCapture.mock.calls.find((mockCall) => { + const cmd = mockCall[0] as string | string[]; + return predicate(cmd); + }); + return (call?.[1] as { timeout?: number } | undefined)?.timeout; +} + describe("nim", () => { describe("listModels", () => { it("returns 5 models", () => { @@ -353,6 +364,18 @@ describe("nim", () => { true, ); expect(commands.some((c) => c[0] === "curl" && hasCurlTimeoutArgs(c))).toBe(true); + expect( + timeoutForCommand( + runCapture, + (c) => Array.isArray(c) && c[0] === "docker" && c.includes("inspect"), + ), + ).toBe(5000); + expect( + timeoutForCommand( + runCapture, + (c) => Array.isArray(c) && c[0] === "curl" && c.includes("http://127.0.0.1:9000/v1/models"), + ), + ).toBe(6000); } finally { restore(); } @@ -375,6 +398,18 @@ describe("nim", () => { expect(st).toMatchObject({ running: true, healthy: true, container: "foo", state: "running" }); expect(commands.some((c) => c[0] === "docker" && c.includes("port"))).toBe(true); + expect( + timeoutForCommand( + runCapture, + (c) => Array.isArray(c) && c[0] === "docker" && c.includes("inspect"), + ), + ).toBe(5000); + expect( + timeoutForCommand( + runCapture, + (c) => Array.isArray(c) && c[0] === "docker" && c.includes("port"), + ), + ).toBe(5000); } finally { restore(); } @@ -410,6 +445,12 @@ describe("nim", () => { try { const st = nimModule.nimStatusByName("foo"); expect(st).toMatchObject({ running: false, healthy: false, container: "foo", state: "exited" }); + expect( + timeoutForCommand( + runCapture, + (c) => Array.isArray(c) && c[0] === "docker" && c.includes("inspect"), + ), + ).toBe(5000); expect(runCapture.mock.calls).toHaveLength(1); } finally { restore(); diff --git a/src/lib/nim.ts b/src/lib/nim.ts index 068139a02a..e9a0265429 100644 --- a/src/lib/nim.ts +++ b/src/lib/nim.ts @@ -20,6 +20,7 @@ const nimImages = require("../../bin/lib/nim-images.json"); import { VLLM_PORT } from "./ports"; const UNIFIED_MEMORY_GPU_TAGS = ["GB10", "Thor", "Orin", "Xavier"]; +const NIM_STATUS_PROBE_TIMEOUT_MS = 5000; export interface NimModel { name: string; @@ -338,14 +339,20 @@ export function nimStatus(sandboxName: string, port?: number): NimStatus { export function nimStatusByName(name: string, port?: number): NimStatus { try { - const state = dockerContainerInspectFormat("{{.State.Status}}", name, { ignoreError: true }); + const state = dockerContainerInspectFormat("{{.State.Status}}", name, { + ignoreError: true, + timeout: NIM_STATUS_PROBE_TIMEOUT_MS, + }); if (!state) return { running: false, container: name }; let healthy = false; if (state === "running") { let resolvedHostPort = port != null ? Number(port) : 0; if (!resolvedHostPort) { - const mapping = dockerPort(name, "8000", { ignoreError: true }); + const mapping = dockerPort(name, "8000", { + ignoreError: true, + timeout: NIM_STATUS_PROBE_TIMEOUT_MS, + }); const m = mapping && mapping.match(/:(\d+)\s*$/); resolvedHostPort = m ? Number(m[1]) : VLLM_PORT; } @@ -359,7 +366,7 @@ export function nimStatusByName(name: string, port?: number): NimStatus { "5", `http://127.0.0.1:${resolvedHostPort}/v1/models`, ], - { ignoreError: true }, + { ignoreError: true, timeout: NIM_STATUS_PROBE_TIMEOUT_MS + 1000 }, ); healthy = !!health; } diff --git a/src/lib/sandbox-version.test.ts b/src/lib/sandbox-version.test.ts index 9b79eb8008..cf844b5288 100644 --- a/src/lib/sandbox-version.test.ts +++ b/src/lib/sandbox-version.test.ts @@ -50,6 +50,7 @@ vi.mock("child_process", async (importOriginal) => { import { checkAgentVersion, formatStalenessWarning } from "./sandbox-version.js"; import * as registry from "./registry.js"; import { captureOpenshellCommand } from "./openshell.js"; +import { OPENSHELL_PROBE_TIMEOUT_MS } from "./openshell-timeouts.js"; import { spawnSync } from "child_process"; describe("checkAgentVersion", () => { @@ -130,6 +131,11 @@ describe("checkAgentVersion", () => { expect(result.detectionMethod).toBe("ssh-exec"); expect(result.sandboxVersion).toBe("2026.4.24"); expect(result.isStale).toBe(false); + expect(captureOpenshellCommand).toHaveBeenCalledWith( + "/usr/local/bin/openshell", + ["sandbox", "ssh-config", "test-sb"], + { ignoreError: true, timeout: OPENSHELL_PROBE_TIMEOUT_MS }, + ); // Should have cached the version in registry const updated = registry.getSandbox("test-sb"); diff --git a/src/lib/sandbox-version.ts b/src/lib/sandbox-version.ts index 6d24c8c6cd..add8c0d223 100644 --- a/src/lib/sandbox-version.ts +++ b/src/lib/sandbox-version.ts @@ -18,6 +18,7 @@ import * as registry from "./registry.js"; import { loadAgent } from "./agent-defs.js"; import { resolveOpenshell } from "./resolve-openshell.js"; import { captureOpenshellCommand } from "./openshell.js"; +import { OPENSHELL_PROBE_TIMEOUT_MS } from "./openshell-timeouts.js"; export interface VersionCheckResult { sandboxVersion: string | null; @@ -49,7 +50,7 @@ export function probeAgentVersion(sandboxName: string): string | null { const sshConfigResult = captureOpenshellCommand( openshellBinary, ["sandbox", "ssh-config", sandboxName], - { ignoreError: true }, + { ignoreError: true, timeout: OPENSHELL_PROBE_TIMEOUT_MS }, ); if (sshConfigResult.status !== 0) return null; diff --git a/src/lib/shields.ts b/src/lib/shields.ts index f53d281cbe..2bac74c4d5 100644 --- a/src/lib/shields.ts +++ b/src/lib/shields.ts @@ -279,6 +279,17 @@ function applyStateDirLockMode(sandboxName: string, configDir: string, owner: st } try { kubectlExec(sandboxName, ["chmod", dirMode, dirPath]); + } catch { + // Silently skip + } + if (isLocking) { + try { + kubectlExec(sandboxName, ["chmod", "g-s", dirPath]); + } catch { + // Best effort; do not skip recursive write stripping. + } + } + try { kubectlExec(sandboxName, ["chmod", "-R", writeStrip, dirPath]); } catch { // Silently skip @@ -301,6 +312,7 @@ for dir in "$config_dir"/workspace-*; do [ -d "$dir" ] || continue chown -R "$owner" "$dir" 2>/dev/null || true chmod "$dir_mode" "$dir" 2>/dev/null || true + if [ "$dir_mode" = "755" ]; then chmod g-s "$dir" 2>/dev/null || true; fi chmod -R "$write_strip" "$dir" 2>/dev/null || true done `, @@ -483,6 +495,11 @@ function lockAgentConfig( } catch { errors.push("chmod 755 config dir"); } + try { + kubectlExec(sandboxName, ["chmod", "g-s", target.configDir]); + } catch { + errors.push("chmod g-s config dir"); + } try { kubectlExec(sandboxName, ["chown", "root:root", target.configDir]); diff --git a/src/nemoclaw.ts b/src/nemoclaw.ts index 82689a4790..fd375797bf 100644 --- a/src/nemoclaw.ts +++ b/src/nemoclaw.ts @@ -128,6 +128,8 @@ type SandboxCommandResult = { stderr: string; }; +const SANDBOX_EXEC_STARTED_MARKER = "__NEMOCLAW_SANDBOX_EXEC_STARTED__"; + type RecoveredSandboxMetadata = Partial< Pick > & { @@ -245,6 +247,7 @@ function executeSandboxCommand(sandboxName: string, command: string): SandboxCom timeout: OPENSHELL_PROBE_TIMEOUT_MS, }); if (sshConfigResult.status !== 0) return null; + if (!sshConfigResult.output.trim()) return null; const tmpFile = path.join(os.tmpdir(), `nemoclaw-ssh-${process.pid}-${Date.now()}.conf`); fs.writeFileSync(tmpFile, sshConfigResult.output, { mode: 0o600 }); @@ -283,6 +286,46 @@ function executeSandboxCommand(sandboxName: string, command: string): SandboxCom } } +function executeSandboxExecCommand( + sandboxName: string, + command: string, + timeout = 15000, +): SandboxCommandResult | null { + const markedCommand = `printf '%s\\n' '${SANDBOX_EXEC_STARTED_MARKER}'; ${command}`; + try { + const result = spawnSync( + getOpenshellBinary(), + ["sandbox", "exec", "--name", sandboxName, "--", "sh", "-c", markedCommand], + { + cwd: ROOT, + encoding: "utf-8", + env: process.env, + stdio: ["ignore", "pipe", "pipe"], + timeout, + }, + ); + const stdout = (result.stdout || "").trim(); + const stdoutLines = stdout.split(/\r?\n/); + const markerIndex = stdoutLines.indexOf(SANDBOX_EXEC_STARTED_MARKER); + if (markerIndex === -1) return null; + const commandStdoutLines = stdoutLines.slice(markerIndex + 1); + return { + status: result.error ? 1 : (result.status ?? 1), + stdout: commandStdoutLines.join("\n").trim(), + stderr: (result.stderr || "").trim(), + }; + } catch { + return null; + } +} + +function parseSandboxGatewayProbe(result: SandboxCommandResult | null): boolean | null { + if (!result) return null; + if (result.stdout === "RUNNING") return true; + if (result.stdout === "STOPPED") return false; + return null; +} + /** * Check whether the OpenClaw gateway process is running inside the sandbox. * Uses the gateway's HTTP endpoint (dashboard port) as the source of truth, @@ -292,62 +335,72 @@ function executeSandboxCommand(sandboxName: string, command: string): SandboxCom function isSandboxGatewayRunning(sandboxName: string): boolean | null { const agent = agentRuntime.getSessionAgent(sandboxName); const probeUrl = agentRuntime.getHealthProbeUrl(agent); - const result = executeSandboxCommand( - sandboxName, - `curl -sf --max-time 3 ${shellQuote(probeUrl)} > /dev/null 2>&1 && echo RUNNING || echo STOPPED`, - ); - if (!result) return null; - if (result.stdout === "RUNNING") return true; - if (result.stdout === "STOPPED") return false; - return null; + const command = `curl -sf --max-time 3 ${shellQuote(probeUrl)} > /dev/null 2>&1 && echo RUNNING || echo STOPPED`; + const execProbe = parseSandboxGatewayProbe(executeSandboxExecCommand(sandboxName, command)); + if (execProbe !== null) return execProbe; + return parseSandboxGatewayProbe(executeSandboxCommand(sandboxName, command)); } /** - * Restart the OpenClaw gateway process inside the sandbox after a pod restart. + * Restart the gateway process inside the sandbox after a pod restart. * Cleans stale lock/temp files, sources proxy config, and launches the gateway * in the background. Returns true on success. */ function recoverSandboxProcesses(sandboxName: string): boolean { const agent = agentRuntime.getSessionAgent(sandboxName); const agentScript = agentRuntime.buildRecoveryScript(agent, agent?.forwardPort ?? DASHBOARD_PORT); - const script = - agentScript || - [ - // Source /tmp/nemoclaw-proxy-env.sh explicitly so NODE_OPTIONS preload - // guards (safety-net, ciao, slack, …) survive gateway respawn. Without - // this, library errors crash-loop the gateway because the original - // .bashrc-only path silently failed when the env file was unreadable - // or the shell did not source ~/.bashrc. See #2478. Mirrors the - // hardened block in src/lib/agent-runtime.ts:buildRecoveryScript. - // Defer warning emission until AFTER touch+chmod gateway.log so - // warnings land in the persistent log a sysadmin would tail. Stderr - // alone hides them because executeSandboxCommand captures stderr - // without surfacing it. Mirrors src/lib/agent-runtime.ts. - "if [ -r /tmp/nemoclaw-proxy-env.sh ]; then . /tmp/nemoclaw-proxy-env.sh; _PE_MISSING=0; else _PE_MISSING=1; fi;", - "[ -f ~/.bashrc ] && . ~/.bashrc;", - 'case "${NODE_OPTIONS:-}" in *nemoclaw-sandbox-safety-net*) _SN_MISSING=0 ;; *) _SN_MISSING=1 ;; esac; case "${NODE_OPTIONS:-}" in *nemoclaw-ciao-network-guard*) _CIAO_MISSING=0 ;; *) _CIAO_MISSING=1 ;; esac; if [ "$_SN_MISSING" = "0" ] && [ "$_CIAO_MISSING" = "0" ]; then _GUARDS_MISSING=0; else _GUARDS_MISSING=1; fi;', - `if curl -sf --max-time 3 http://127.0.0.1:${DASHBOARD_PORT}/ > /dev/null 2>&1; then echo ALREADY_RUNNING; exit 0; fi;`, - "rm -rf /tmp/openclaw-*/gateway.*.lock 2>/dev/null;", - "rm -f /tmp/gateway.log /tmp/auto-pair.log;", - "touch /tmp/gateway.log; chmod 600 /tmp/gateway.log;", - "touch /tmp/auto-pair.log; chmod 600 /tmp/auto-pair.log;", - '[ "$_PE_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing — gateway launching without library guards (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };', - '[ "$_GUARDS_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: NODE_OPTIONS missing safety-net preload or ciao preload — gateway may crash on unhandled library errors (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };', - 'OPENCLAW="$(command -v openclaw)";', - 'if [ -z "$OPENCLAW" ]; then echo OPENCLAW_MISSING; exit 1; fi;', - // Append rather than truncate so [gateway-recovery] WARNING lines - // written above survive past the launch. (#2478) - `nohup "$OPENCLAW" gateway run --port ${DASHBOARD_PORT} >> /tmp/gateway.log 2>&1 &`, - "GPID=$!; sleep 2;", - 'if kill -0 "$GPID" 2>/dev/null; then echo "GATEWAY_PID=$GPID"; else echo GATEWAY_FAILED; cat /tmp/gateway.log 2>/dev/null | tail -5; fi', - ].join(" "); - - const result = executeSandboxCommand(sandboxName, script); - if (!result) return false; - return ( - result.status === 0 && - (result.stdout.includes("GATEWAY_PID=") || result.stdout.includes("ALREADY_RUNNING")) + const hasRecoveryMarker = (result: SandboxCommandResult | null) => + !!( + result && + (result.stdout.includes("GATEWAY_PID=") || result.stdout.includes("ALREADY_RUNNING")) + ); + const recoveredSsh = (result: SandboxCommandResult | null) => + !!(result && result.status === 0 && hasRecoveryMarker(result)); + + if (agentScript) { + // Non-OpenClaw manifests do not yet declare a runtime user for root + // sandbox exec. Recover them over SSH so the launch inherits the sandbox + // login user instead of creating root-owned agent state under /sandbox. + return recoveredSsh(executeSandboxCommand(sandboxName, agentScript)); + } + + const script = agentRuntime.buildOpenClawRecoveryScript(DASHBOARD_PORT); + const execResult = executeSandboxExecCommand(sandboxName, script, 30000); + if (hasRecoveryMarker(execResult)) return true; + if (execResult !== null) return false; + return recoveredSsh(executeSandboxCommand(sandboxName, script)); +} + +function readNonNegativeNumberEnv(name: string, fallback: number): number { + const raw = process.env[name]; + if (raw === undefined || raw.trim() === "") return fallback; + const parsed = Number(raw); + return Number.isFinite(parsed) && parsed >= 0 ? parsed : fallback; +} + +function waitForRecoveredSandboxGateway(sandboxName: string): boolean { + const timeoutSeconds = readNonNegativeNumberEnv( + "NEMOCLAW_GATEWAY_RECOVERY_WAIT_SECONDS", + 30, ); + const intervalSeconds = readNonNegativeNumberEnv( + "NEMOCLAW_GATEWAY_RECOVERY_POLL_INTERVAL_SECONDS", + 3, + ); + const attempts = + intervalSeconds > 0 + ? Math.max(1, Math.floor(timeoutSeconds / intervalSeconds) + 1) + : Math.max(1, Math.floor(timeoutSeconds) + 1); + + for (let attempt = 0; attempt < attempts; attempt += 1) { + if (isSandboxGatewayRunning(sandboxName) === true) { + return true; + } + if (attempt < attempts - 1) { + sleepSeconds(intervalSeconds); + } + } + return false; } /** @@ -392,9 +445,9 @@ function checkAndRecoverSandboxProcesses( const recovered = recoverSandboxProcesses(sandboxName); if (recovered) { - // Wait for gateway to bind its HTTP port before declaring success - sleepSeconds(3); - if (isSandboxGatewayRunning(sandboxName) !== true) { + // Wait for gateway to bind its HTTP port before declaring success. The + // recovered process can be alive before the OpenAI-compatible API is ready. + if (!waitForRecoveredSandboxGateway(sandboxName)) { if (!quiet) { console.error(" Gateway process started but is not responding."); console.error(" Check /tmp/gateway.log inside the sandbox for details."); @@ -1233,10 +1286,90 @@ function printSandboxActionUsage(action: string): void { // ── Sandbox-scoped actions ─────────────────────────────────────── -async function sandboxConnect(sandboxName: string) { +type SandboxConnectOptions = { + probeOnly?: boolean; +}; + +const SANDBOX_CONNECT_FLAGS = new Set(["--dangerously-skip-permissions", "--probe-only", "--help", "-h"]); + +function isSandboxConnectFlag(arg: string | undefined): boolean { + return typeof arg === "string" && SANDBOX_CONNECT_FLAGS.has(arg); +} + +function printSandboxConnectHelp(sandboxName = "") { + console.log(""); + console.log(` Usage: ${CLI_NAME} ${sandboxName} connect [--probe-only]`); + console.log(""); + console.log(" Options:"); + console.log( + " --probe-only Run recovery checks and exit without opening SSH", + ); + console.log(" -h, --help Show this help"); + console.log(""); +} + +function parseSandboxConnectArgs(sandboxName: string, actionArgs: string[]): SandboxConnectOptions { + const options: SandboxConnectOptions = {}; + for (const arg of actionArgs) { + if (!isSandboxConnectFlag(arg)) { + console.error(` Unknown flag for connect: ${arg}`); + printSandboxConnectHelp(sandboxName); + process.exit(1); + } + switch (arg) { + case "--dangerously-skip-permissions": + console.error(" --dangerously-skip-permissions was removed; use shields commands instead."); + printSandboxConnectHelp(sandboxName); + process.exit(1); + case "--probe-only": + options.probeOnly = true; + break; + case "--help": + case "-h": + printSandboxConnectHelp(sandboxName); + process.exit(0); + break; + } + } + return options; +} + +function runSandboxConnectProbe(sandboxName: string): void { + const processCheck = checkAndRecoverSandboxProcesses(sandboxName, { quiet: true }); + const agent = agentRuntime.getSessionAgent(sandboxName); + const agentName = agentRuntime.getAgentDisplayName(agent); + if (!processCheck.checked) { + console.error( + ` Probe failed: could not inspect the ${agentName} gateway inside sandbox '${sandboxName}'.`, + ); + process.exit(1); + } + if (processCheck.wasRunning) { + console.log(` Probe complete: ${agentName} gateway is running in '${sandboxName}'.`); + return; + } + if (processCheck.recovered) { + console.log(` Probe complete: recovered ${agentName} gateway in '${sandboxName}'.`); + return; + } + console.error( + ` Probe failed: ${agentName} gateway is not running in '${sandboxName}' and automatic recovery failed.`, + ); + console.error(" Check /tmp/gateway.log inside the sandbox for details."); + process.exit(1); +} + +async function sandboxConnect( + sandboxName: string, + { probeOnly = false }: SandboxConnectOptions = {}, +) { const { isSandboxReady, parseSandboxStatus } = require("./lib/onboard"); await ensureLiveSandboxOrExit(sandboxName, { allowNonReadyPhase: true }); + if (probeOnly) { + return runSandboxConnectProbe(sandboxName); + } + // Version staleness check — warn but don't block try { const versionCheck = sandboxVersion.checkAgentVersion(sandboxName); @@ -4682,11 +4815,24 @@ const [cmd, ...args] = process.argv.slice(2); } // Sandbox-scoped commands: nemoclaw + const firstSandboxArg = args[0]; + const implicitConnectArg = isSandboxConnectFlag(firstSandboxArg); + const requestedSandboxAction = + !firstSandboxArg || implicitConnectArg ? "connect" : firstSandboxArg; + const requestedSandboxActionArgs = !firstSandboxArg || implicitConnectArg ? args : args.slice(1); + if ( + requestedSandboxAction === "connect" && + requestedSandboxActionArgs.some((arg) => arg === "--help" || arg === "-h") + ) { + validateName(cmd, "sandbox name"); + printSandboxConnectHelp(cmd); + return; + } + // If the registry doesn't know this name but the action is a sandbox-scoped // command, attempt recovery — the sandbox may still be live with a stale registry. // Derived from command registry — single source of truth const sandboxActions = sandboxActionTokens(); - const requestedSandboxAction = args[0] || "connect"; if (!registry.getSandbox(cmd) && sandboxActions.includes(requestedSandboxAction)) { validateName(cmd, "sandbox name"); await recoverRegistryEntries({ requestedSandboxName: cmd }); @@ -4730,28 +4876,12 @@ const [cmd, ...args] = process.argv.slice(2); const sandbox = registry.getSandbox(cmd); if (sandbox) { validateName(cmd, "sandbox name"); - const action = args[0] || "connect"; - const actionArgs = args.slice(1); + const action = requestedSandboxAction; + const actionArgs = requestedSandboxActionArgs; switch (action) { case "connect": - if (actionArgs.length > 0) { - console.error( - ` Unknown connect argument${actionArgs.length === 1 ? "" : "s"}: ${actionArgs.join(" ")}`, - ); - if (actionArgs.includes("--dangerously-skip-permissions")) { - console.error( - " --dangerously-skip-permissions was removed; use shields commands instead.", - ); - } - const reorderedCandidate = findRegisteredSandboxName(actionArgs); - if (reorderedCandidate) { - printConnectOrderHint(reorderedCandidate); - } - console.error(` Usage: ${CLI_NAME} connect`); - process.exit(1); - } - await sandboxConnect(cmd); + await sandboxConnect(cmd, parseSandboxConnectArgs(cmd, actionArgs)); break; case "status": if (hasHelpFlag(actionArgs)) { diff --git a/test/cli.test.ts b/test/cli.test.ts index 448d5ed936..88611a055a 100644 --- a/test/cli.test.ts +++ b/test/cli.test.ts @@ -94,7 +94,41 @@ function readRecordedArgs(markerFile: string): string[] { return fs.readFileSync(markerFile, "utf8").trim().split(/\s+/); } -function writeSandboxRegistry(home: string, sandboxName = "alpha"): void { +type SandboxEntry = { + name: string; + model: string; + provider: string; + gpuEnabled: boolean; + policies: string[]; + agent?: string; +}; + +function writeRecordingCommand( + binDir: string, + command: string, + markerFile: string, + exitCode: number, +): void { + fs.writeFileSync( + path.join(binDir, command), + [ + "#!/usr/bin/env bash", + `printf '%s\\n' "$*" >> ${JSON.stringify(markerFile)}`, + `exit ${exitCode}`, + ].join("\n"), + { mode: 0o755 }, + ); +} + +function writeSandboxRegistry( + home: string, + sandboxNameOrOverrides: string | Partial = "alpha", + sandboxOverridesArg: Partial = {}, +): void { + const sandboxName = + typeof sandboxNameOrOverrides === "string" ? sandboxNameOrOverrides : "alpha"; + const sandboxOverrides = + typeof sandboxNameOrOverrides === "string" ? sandboxOverridesArg : sandboxNameOrOverrides; const registryDir = path.join(home, ".nemoclaw"); fs.mkdirSync(registryDir, { recursive: true }); fs.writeFileSync( @@ -107,6 +141,7 @@ function writeSandboxRegistry(home: string, sandboxName = "alpha"): void { provider: "nvidia-prod", gpuEnabled: false, policies: [], + ...sandboxOverrides, }, }, defaultSandbox: sandboxName, @@ -212,6 +247,19 @@ function createDoctorTestSetup(prefix: string, openshellLines: string[], sandbox }; } +function createCloudflaredServiceDir(prefix: string): { sandboxName: string; serviceDir: string } { + const suffix = [ + process.pid.toString(36), + Date.now().toString(36), + Math.random().toString(36).slice(2, 10), + ].join("-"); + const sandboxName = `${prefix}${suffix}`; + const serviceDir = path.join("/tmp", `nemoclaw-services-${sandboxName}`); + fs.rmSync(serviceDir, { recursive: true, force: true }); + fs.mkdirSync(serviceDir, { recursive: true }); + return { sandboxName, serviceDir }; +} + function createDebugCommandTestEnv(prefix: string): Record { const home = fs.mkdtempSync(path.join(os.tmpdir(), prefix)); const localBin = path.join(home, "bin"); @@ -919,7 +967,7 @@ describe("CLI dispatch", () => { }); it("doctor treats a live non-cloudflared PID as stale", () => { - const sandboxName = `doctorpid-${process.pid}`; + const { sandboxName, serviceDir } = createCloudflaredServiceDir("doctorpid-"); const setup = createDoctorTestSetup( "nemoclaw-cli-doctor-wrong-cloudflared-pid-", [ @@ -932,9 +980,6 @@ describe("CLI dispatch", () => { ], sandboxName, ); - const serviceDir = path.join("/tmp", `nemoclaw-services-${sandboxName}`); - fs.rmSync(serviceDir, { recursive: true, force: true }); - fs.mkdirSync(serviceDir, { recursive: true }); const sleeper = spawn(process.execPath, ["-e", "setTimeout(() => {}, 30000)"], { stdio: "ignore", }); @@ -964,7 +1009,7 @@ describe("CLI dispatch", () => { }); it("doctor accepts a live cloudflared PID", () => { - const sandboxName = `doctorcloudflared-${process.pid}`; + const { sandboxName, serviceDir } = createCloudflaredServiceDir("doctorcloudflared-"); const setup = createDoctorTestSetup( "nemoclaw-cli-doctor-cloudflared-pid-", [ @@ -977,11 +1022,8 @@ describe("CLI dispatch", () => { ], sandboxName, ); - const serviceDir = path.join("/tmp", `nemoclaw-services-${sandboxName}`); const shimDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cloudflared-shim-")); const cloudflaredBin = path.join(shimDir, "cloudflared"); - fs.rmSync(serviceDir, { recursive: true, force: true }); - fs.mkdirSync(serviceDir, { recursive: true }); fs.symlinkSync(process.execPath, cloudflaredBin); const sleeper = spawn(cloudflaredBin, ["-e", "setTimeout(() => {}, 30000)"], { stdio: "ignore", @@ -1887,6 +1929,464 @@ describe("CLI dispatch", () => { expect(calls.some((call) => call.startsWith("forward start --background 18789"))).toBe(false); }); + it("shows connect help without opening an interactive session", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-help-")); + const localBin = path.join(home, "bin"); + const markerFile = path.join(home, "openshell-calls"); + const sshMarkerFile = path.join(home, "ssh-calls"); + fs.mkdirSync(localBin, { recursive: true }); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + `printf '%s\\n' "$*" >> ${JSON.stringify(markerFile)}`, + "exit 99", + ].join("\n"), + { mode: 0o755 }, + ); + writeRecordingCommand(localBin, "ssh", sshMarkerFile, 98); + + const r = runWithEnv("alpha connect --help", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + const implicit = runWithEnv("alpha --help", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(0); + expect(r.out).toContain("Usage: nemoclaw alpha connect"); + expect(r.out).toContain("--probe-only"); + expect(implicit.code).toBe(0); + expect(implicit.out).toContain("Usage: nemoclaw alpha connect"); + expect(fs.existsSync(markerFile)).toBe(false); + expect(fs.existsSync(sshMarkerFile)).toBe(false); + }); + + it("rejects the removed skip-permissions connect flag", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-flags-")); + const localBin = path.join(home, "bin"); + const markerFile = path.join(home, "openshell-calls"); + const sshMarkerFile = path.join(home, "ssh-calls"); + fs.mkdirSync(localBin, { recursive: true }); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + `printf '%s\\n' "$*" >> ${JSON.stringify(markerFile)}`, + "exit 99", + ].join("\n"), + { mode: 0o755 }, + ); + writeRecordingCommand(localBin, "ssh", sshMarkerFile, 98); + writeSandboxRegistry(home); + + const r = runWithEnv("alpha connect --dangerously-skip-permissions", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(1); + expect(r.out).toContain("--dangerously-skip-permissions was removed"); + expect(fs.existsSync(markerFile)).toBe(false); + expect(fs.existsSync(sshMarkerFile)).toBe(false); + }); + + it("connect --probe-only recovers the gateway without opening SSH", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-")); + const localBin = path.join(home, "bin"); + const markerFile = path.join(home, "openshell-calls"); + const sshMarkerFile = path.join(home, "ssh-calls"); + const stateFile = path.join(home, "probe-state"); + fs.mkdirSync(localBin, { recursive: true }); + writeSandboxRegistry(home); + fs.writeFileSync(stateFile, "stopped"); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + `marker_file=${JSON.stringify(markerFile)}`, + `state_file=${JSON.stringify(stateFile)}`, + 'printf \'%s\\n\' "$*" >> "$marker_file"', + 'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then', + " echo 'Sandbox:'", + " echo", + " echo ' Id: abc'", + " echo ' Name: alpha'", + " echo ' Namespace: openshell'", + " echo ' Phase: Ready'", + " exit 0", + "fi", + 'if [ "$1" = "sandbox" ] && [ "$2" = "exec" ] && [ "$3" = "--name" ] && [ "$4" = "alpha" ]; then', + ' cmd="$8"', + ' case "$cmd" in', + ' *"OPENCLAW="*)', + ' echo recovered > "$state_file"', + " echo '__NEMOCLAW_SANDBOX_EXEC_STARTED__'", + " echo 'GATEWAY_PID=123'", + " exit 42", + " ;;", + " *'curl -sf'*)", + " echo '__NEMOCLAW_SANDBOX_EXEC_STARTED__'", + ' if [ "$(cat "$state_file")" = recovered ]; then echo RUNNING; else echo STOPPED; fi', + " exit 0", + " ;;", + " esac", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + writeRecordingCommand(localBin, "ssh", sshMarkerFile, 98); + + const r = runWithEnv("alpha connect --probe-only", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(0); + expect(r.out).toContain("Probe complete: recovered OpenClaw gateway"); + const calls = fs.readFileSync(markerFile, "utf8").trim().split("\n").filter(Boolean); + expect(calls).toContain("sandbox get alpha"); + expect(calls.some((call) => call.startsWith("sandbox exec --name alpha -- sh -c"))).toBe(true); + expect(calls).not.toContain("sandbox ssh-config alpha"); + expect(calls).not.toContain("sandbox connect alpha"); + expect(fs.existsSync(sshMarkerFile)).toBe(false); + }); + + it("waits for recovered gateway health before failing probe-only", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-wait-")); + const localBin = path.join(home, "bin"); + const markerFile = path.join(home, "openshell-calls"); + const stateFile = path.join(home, "probe-state"); + const readyCountFile = path.join(home, "ready-count"); + fs.mkdirSync(localBin, { recursive: true }); + writeSandboxRegistry(home); + fs.writeFileSync(stateFile, "stopped"); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + `marker_file=${JSON.stringify(markerFile)}`, + `state_file=${JSON.stringify(stateFile)}`, + `ready_count_file=${JSON.stringify(readyCountFile)}`, + 'printf \'%s\\n\' "$*" >> "$marker_file"', + 'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then', + " echo 'Sandbox:'", + " echo", + " echo ' Id: abc'", + " echo ' Name: alpha'", + " echo ' Namespace: openshell'", + " echo ' Phase: Ready'", + " exit 0", + "fi", + 'if [ "$1" = "sandbox" ] && [ "$2" = "exec" ] && [ "$3" = "--name" ] && [ "$4" = "alpha" ]; then', + ' cmd="$8"', + ' case "$cmd" in', + ' *"OPENCLAW="*)', + ' echo recovered > "$state_file"', + " echo '__NEMOCLAW_SANDBOX_EXEC_STARTED__'", + " echo 'GATEWAY_PID=123'", + " exit 0", + " ;;", + " *'curl -sf'*)", + " echo '__NEMOCLAW_SANDBOX_EXEC_STARTED__'", + ' if [ "$(cat "$state_file")" != recovered ]; then echo STOPPED; exit 0; fi', + ' count=$(cat "$ready_count_file" 2>/dev/null || echo 0)', + " count=$((count + 1))", + ' echo "$count" > "$ready_count_file"', + ' if [ "$count" -ge 3 ]; then echo RUNNING; else echo STOPPED; fi', + " exit 0", + " ;;", + " esac", + "fi", + 'if [ "$1" = "forward" ]; then', + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + + const r = runWithEnv("alpha connect --probe-only", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + NEMOCLAW_GATEWAY_RECOVERY_WAIT_SECONDS: "3", + NEMOCLAW_GATEWAY_RECOVERY_POLL_INTERVAL_SECONDS: "0", + }); + + expect(r.code).toBe(0); + expect(r.out).toContain("Probe complete: recovered OpenClaw gateway"); + expect(fs.readFileSync(readyCountFile, "utf8").trim()).toBe("3"); + }); + + it("treats leading --probe-only as an implicit connect probe", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-leading-")); + const localBin = path.join(home, "bin"); + const markerFile = path.join(home, "openshell-calls"); + const sshMarkerFile = path.join(home, "ssh-calls"); + fs.mkdirSync(localBin, { recursive: true }); + writeSandboxRegistry(home); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + `marker_file=${JSON.stringify(markerFile)}`, + 'printf \'%s\\n\' "$*" >> "$marker_file"', + 'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then', + " echo 'Sandbox:'", + " echo", + " echo ' Id: abc'", + " echo ' Name: alpha'", + " echo ' Namespace: openshell'", + " echo ' Phase: Ready'", + " exit 0", + "fi", + 'if [ "$1" = "sandbox" ] && [ "$2" = "exec" ] && [ "$3" = "--name" ] && [ "$4" = "alpha" ]; then', + ' cmd="$8"', + ' if [[ "$cmd" == *"curl -sf"* ]]; then echo "__NEMOCLAW_SANDBOX_EXEC_STARTED__"; echo RUNNING; exit 0; fi', + ' if [[ "$cmd" == *"OPENCLAW="* ]]; then echo "__NEMOCLAW_SANDBOX_EXEC_STARTED__"; echo UNEXPECTED_RECOVERY; exit 1; fi', + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + writeRecordingCommand(localBin, "ssh", sshMarkerFile, 98); + + const r = runWithEnv("alpha --probe-only", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(0); + expect(r.out).toContain("Probe complete: OpenClaw gateway is running"); + const calls = fs.readFileSync(markerFile, "utf8").trim().split("\n").filter(Boolean); + expect(calls).toContain("sandbox get alpha"); + expect(calls.some((call) => call.startsWith("sandbox exec --name alpha -- sh -c"))).toBe(true); + expect(calls).not.toContain("sandbox ssh-config alpha"); + expect(calls).not.toContain("sandbox connect alpha"); + expect(fs.existsSync(sshMarkerFile)).toBe(false); + }); + + it("connect --probe-only does not retry a failed sandbox exec recovery over SSH", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-no-ssh-")); + const localBin = path.join(home, "bin"); + const markerFile = path.join(home, "openshell-calls"); + const sshMarkerFile = path.join(home, "ssh-calls"); + fs.mkdirSync(localBin, { recursive: true }); + writeSandboxRegistry(home); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + `marker_file=${JSON.stringify(markerFile)}`, + 'printf \'%s\\n\' "$*" >> "$marker_file"', + 'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then', + " echo 'Sandbox:'", + " echo", + " echo ' Id: abc'", + " echo ' Name: alpha'", + " echo ' Namespace: openshell'", + " echo ' Phase: Ready'", + " exit 0", + "fi", + 'if [ "$1" = "sandbox" ] && [ "$2" = "exec" ] && [ "$3" = "--name" ] && [ "$4" = "alpha" ]; then', + ' cmd="$8"', + ' if [[ "$cmd" == *"OPENCLAW="* ]]; then echo "__NEMOCLAW_SANDBOX_EXEC_STARTED__"; echo RECOVERY_FAILED >&2; exit 42; fi', + ' if [[ "$cmd" == *"curl -sf"* ]]; then echo "__NEMOCLAW_SANDBOX_EXEC_STARTED__"; echo STOPPED; exit 0; fi', + "fi", + 'if [ "$1" = "sandbox" ] && [ "$2" = "ssh-config" ]; then', + " echo 'Host openshell-alpha'", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + writeRecordingCommand(localBin, "ssh", sshMarkerFile, 98); + + const r = runWithEnv("alpha connect --probe-only", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(1); + const calls = fs.readFileSync(markerFile, "utf8").trim().split("\n").filter(Boolean); + expect(calls).toContain("sandbox get alpha"); + expect(calls.some((call) => call.startsWith("sandbox exec --name alpha -- sh -c"))).toBe(true); + expect(calls).not.toContain("sandbox ssh-config alpha"); + expect(fs.existsSync(sshMarkerFile)).toBe(false); + }); + + it("connect --probe-only falls back to SSH when sandbox exec never starts", () => { + const home = fs.mkdtempSync( + path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-exec-fallback-"), + ); + const localBin = path.join(home, "bin"); + const openshellCalls = path.join(home, "openshell-calls"); + const sshCalls = path.join(home, "ssh-calls"); + const stateFile = path.join(home, "probe-state"); + fs.mkdirSync(localBin, { recursive: true }); + writeSandboxRegistry(home); + fs.writeFileSync(stateFile, "stopped"); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + `calls=${JSON.stringify(openshellCalls)}`, + 'printf \'%s\\n\' "$*" >> "$calls"', + 'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then', + " echo 'Sandbox:'", + " echo", + " echo ' Id: abc'", + " echo ' Name: alpha'", + " echo ' Namespace: openshell'", + " echo ' Phase: Ready'", + " exit 0", + "fi", + 'if [ "$1" = "sandbox" ] && [ "$2" = "exec" ]; then', + " echo 'error: sandbox exec transport failed before command start' >&2", + " exit 2", + "fi", + 'if [ "$1" = "sandbox" ] && [ "$2" = "ssh-config" ] && [ "$3" = "alpha" ]; then', + " echo 'Host openshell-alpha'", + " echo ' HostName 127.0.0.1'", + " echo ' User sandbox'", + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + fs.writeFileSync( + path.join(localBin, "ssh"), + [ + "#!/usr/bin/env bash", + `calls=${JSON.stringify(sshCalls)}`, + `state_file=${JSON.stringify(stateFile)}`, + 'cmd="${@: -1}"', + 'printf \'ARGS %s\\n\' "$*" >> "$calls"', + 'printf \'CMD %s\\n\' "$cmd" >> "$calls"', + 'if [[ "$cmd" == *"OPENCLAW="* ]]; then', + ' echo recovered > "$state_file"', + " echo 'GATEWAY_PID=456'", + " exit 0", + "fi", + 'if [[ "$cmd" == *"curl -sf"* ]]; then', + ' if [ "$(cat "$state_file")" = recovered ]; then echo RUNNING; else echo STOPPED; fi', + " exit 0", + "fi", + "exit 1", + ].join("\n"), + { mode: 0o755 }, + ); + + const r = runWithEnv("alpha connect --probe-only", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(0); + expect(r.out).toContain("Probe complete: recovered OpenClaw gateway"); + const openshellLog = fs.readFileSync(openshellCalls, "utf8"); + const sshLog = fs.readFileSync(sshCalls, "utf8"); + expect(openshellLog).toContain("sandbox exec --name alpha -- sh -c"); + expect(openshellLog).toContain("sandbox ssh-config alpha"); + expect(openshellLog).not.toContain("sandbox connect"); + expect(sshLog).toContain('OPENCLAW="$(command -v openclaw)"'); + expect(sshLog).not.toMatch(/(^|\s)-tt?(\s|$)/); + }); + + it("recovers non-OpenClaw agents over SSH instead of root sandbox exec", () => { + const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-agent-")); + const localBin = path.join(home, "bin"); + const openshellCalls = path.join(home, "openshell-calls"); + const sshCalls = path.join(home, "ssh-calls"); + const stateFile = path.join(home, "probe-state"); + fs.mkdirSync(localBin, { recursive: true }); + writeSandboxRegistry(home, { agent: "hermes" }); + fs.writeFileSync(stateFile, "stopped"); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/usr/bin/env bash", + `calls=${JSON.stringify(openshellCalls)}`, + `state_file=${JSON.stringify(stateFile)}`, + 'printf \'%s\\n\' "$*" >> "$calls"', + 'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then', + " echo 'Sandbox:'", + " echo", + " echo ' Id: abc'", + " echo ' Name: alpha'", + " echo ' Namespace: openshell'", + " echo ' Phase: Ready'", + " exit 0", + "fi", + 'if [ "$1" = "sandbox" ] && [ "$2" = "exec" ] && [ "$3" = "--name" ] && [ "$4" = "alpha" ]; then', + ' cmd="$8"', + ' if [[ "$cmd" == *"curl -sf"* ]]; then', + " echo '__NEMOCLAW_SANDBOX_EXEC_STARTED__'", + ' if [ "$(cat "$state_file")" = recovered ]; then echo RUNNING; else echo STOPPED; fi', + " exit 0", + " fi", + ' if [[ "$cmd" == *"HERMES_HOME=/sandbox/.hermes"* || "$cmd" == *"AGENT_BIN="* ]]; then', + " echo '__NEMOCLAW_SANDBOX_EXEC_STARTED__'", + " echo UNEXPECTED_ROOT_EXEC_RECOVERY", + " exit 1", + " fi", + "fi", + 'if [ "$1" = "sandbox" ] && [ "$2" = "ssh-config" ] && [ "$3" = "alpha" ]; then', + " echo 'Host openshell-alpha'", + " echo ' HostName 127.0.0.1'", + " echo ' User sandbox'", + " exit 0", + "fi", + 'if [ "$1" = "forward" ]; then', + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); + fs.writeFileSync( + path.join(localBin, "ssh"), + [ + "#!/usr/bin/env bash", + `calls=${JSON.stringify(sshCalls)}`, + `state_file=${JSON.stringify(stateFile)}`, + 'cmd="${@: -1}"', + 'printf \'ARGS %s\\n\' "$*" >> "$calls"', + 'printf \'CMD %s\\n\' "$cmd" >> "$calls"', + 'if [[ "$cmd" == *"AGENT_BIN=\'/usr/local/bin/hermes\'"* ]]; then', + ' echo recovered > "$state_file"', + " echo 'GATEWAY_PID=789'", + " exit 0", + "fi", + "exit 1", + ].join("\n"), + { mode: 0o755 }, + ); + + const r = runWithEnv("alpha connect --probe-only", { + HOME: home, + PATH: `${localBin}:${process.env.PATH || ""}`, + }); + + expect(r.code).toBe(0); + expect(r.out).toContain("Probe complete: recovered Hermes Agent gateway"); + const openshellLog = fs.readFileSync(openshellCalls, "utf8"); + const sshLog = fs.readFileSync(sshCalls, "utf8"); + expect(openshellLog).toContain("sandbox exec --name alpha -- sh -c"); + expect(openshellLog).toContain("sandbox ssh-config alpha"); + expect(openshellLog).not.toContain("HERMES_HOME=/sandbox/.hermes"); + expect(openshellLog).not.toContain("AGENT_BIN="); + expect(openshellLog).not.toContain("sandbox connect"); + expect(sshLog).toContain("HERMES_HOME=/sandbox/.hermes"); + expect(sshLog).toContain("AGENT_BIN='/usr/local/bin/hermes'"); + expect(sshLog).not.toMatch(/(^|\s)-tt?(\s|$)/); + }); + it("waits for sandbox readiness before connecting", () => { const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-wait-")); const localBin = path.join(home, "bin"); diff --git a/test/e2e/test-issue-2478-crash-loop-recovery.sh b/test/e2e/test-issue-2478-crash-loop-recovery.sh index c0878a03ba..966df30bf7 100755 --- a/test/e2e/test-issue-2478-crash-loop-recovery.sh +++ b/test/e2e/test-issue-2478-crash-loop-recovery.sh @@ -30,11 +30,12 @@ # 2. Verifies the *initial* gateway has the safety-net + ciao guard active # (via /proc//environ on the gateway PID). # 3. Crash-recovery loop (NORMAL): kill the gateway 5x, each time triggers -# `nemoclaw connect` (which calls recoverSandboxProcesses), and -# checks the respawned gateway still has guards in NODE_OPTIONS. +# `nemoclaw connect --probe-only` (which calls +# recoverSandboxProcesses), and checks the respawned gateway still has +# guards in NODE_OPTIONS. # 4. Negative case: removes /tmp/nemoclaw-proxy-env.sh, kills the gateway, # triggers recovery — expects the new "[gateway-recovery] WARNING" -# line in stderr (visible via gateway.log) instead of silent guard loss. +# line in gateway.log instead of silent guard loss. # 5. Soak: leaves the sandbox idle for $NEMOCLAW_E2E_SOAK_SECONDS # (default 300) so the health-monitor restart cadence (~4 min in prod) # gets at least one chance to fire, then asserts the gateway has not @@ -245,6 +246,20 @@ gateway_diagnostics() { echo " ---------------------------" } +run_probe_only_or_fail() { + local context="$1" + local probe_out + probe_out="$(mktemp)" + if ! timeout 60 nemoclaw "$SANDBOX_NAME" connect --probe-only >"$probe_out" 2>&1; then + fail "${context}: connect --probe-only exited nonzero" + sed 's/^/ /' "$probe_out" + rm -f "$probe_out" + gateway_diagnostics "" + exit 1 + fi + rm -f "$probe_out" +} + # Wait until gateway PID is non-empty (or timeout). Echoes pid, returns 0/1. wait_for_gateway_up() { local timeout="${1:-30}" @@ -371,11 +386,18 @@ for cycle in $(seq 1 "$CRASH_CYCLES"); do info "Cycle $cycle/$CRASH_CYCLES — killing gateway pid=$prev_pid" sandbox_exec sh -c "kill -9 $prev_pid 2>/dev/null; sleep 1; pgrep -fo '[o]penclaw[ -]gateway' || echo DEAD" >/dev/null - # Trigger recovery via the actual code path: `nemoclaw status` - # calls checkAndRecoverSandboxProcesses() → recoverSandboxProcesses(), - # which is the hardened path #2478 changes. Bound it with `timeout` - # so a hang in CLI internals cannot eat the whole 30-min job budget. - timeout 60 nemoclaw "$SANDBOX_NAME" status >/dev/null 2>&1 || true + # Trigger recovery via the actual operator probe path: + # `nemoclaw connect --probe-only` calls + # checkAndRecoverSandboxProcesses() -> recoverSandboxProcesses() without + # opening an interactive SSH session. Bound it with `timeout` so a hang in + # CLI internals cannot eat the whole 30-min job budget. + run_probe_only_or_fail "Cycle $cycle after gateway kill" + + if ! sandbox_exec sh -c 'test -s /tmp/gateway.log'; then + fail "Cycle $cycle: connect --probe-only did not leave /tmp/gateway.log evidence" + gateway_diagnostics "" + exit 1 + fi new_pid="$(wait_for_gateway_up 45)" if [ -z "$new_pid" ]; then @@ -435,9 +457,10 @@ info "Snapshotted proxy-env.sh ($SNAPSHOT_SIZE bytes, ${#SNAPSHOT_B64}-char base # recovery script (which is the only path that emits the warning). sandbox_exec sh -c 'rm -f /tmp/nemoclaw-proxy-env.sh' >/dev/null sandbox_exec sh -c "pkill -9 -f '[o]penclaw' 2>/dev/null; sleep 2; pgrep -af '[o]penclaw' || echo ALL_DEAD" >/dev/null -timeout 60 nemoclaw "$SANDBOX_NAME" status >/dev/null 2>&1 || true +run_probe_only_or_fail "Negative case after proxy-env removal" -# The new gateway.log should contain the [gateway-recovery] WARNING line. +# The new gateway.log should contain the [gateway-recovery] WARNING line and +# recovery should have attempted a real gateway respawn. warn_seen=false for _ in 1 2 3 4 5; do if gateway_log_tail 100 | grep -q '\[gateway-recovery\] WARNING'; then @@ -452,6 +475,13 @@ else fail "Recovery silently launched without warning (regression of #2478 fix)" gateway_log_tail 100 fi +NEGATIVE_PID="$(wait_for_gateway_up 45)" +if [ -z "$NEGATIVE_PID" ]; then + fail "Recovery warning was logged, but gateway did not respawn within 45s" + gateway_diagnostics "" + exit 1 +fi +info "Negative-case recovery respawned gateway pid=$NEGATIVE_PID" # Restore proxy-env.sh by base64-injecting the snapshot via argv. `openshell # sandbox exec` does not pipe stdin from the caller through to the subshell, @@ -467,10 +497,10 @@ if [ "$restored_size" != "$SNAPSHOT_SIZE" ]; then fi info "proxy-env.sh restored (${restored_size} bytes verified)" -# Stop the no-guard gateway started by the negative case, then trigger -# recovery to bring the gateway back with guards intact. +# Kill the guardless negative-case gateway, then trigger recovery to bring the +# gateway back with guards intact from the restored env file. sandbox_exec sh -c "pkill -9 -f '[o]penclaw' 2>/dev/null; sleep 2; pgrep -af '[o]penclaw' || echo ALL_DEAD" >/dev/null -timeout 60 nemoclaw "$SANDBOX_NAME" status >/dev/null 2>&1 || true +run_probe_only_or_fail "Guard restore recovery" SOAK_START_PID="$(wait_for_gateway_up 30)" if [ -z "$SOAK_START_PID" ]; then fail "Gateway not up entering soak phase" diff --git a/test/e2e/test-shields-config.sh b/test/e2e/test-shields-config.sh index 061741b430..499584c6c7 100755 --- a/test/e2e/test-shields-config.sh +++ b/test/e2e/test-shields-config.sh @@ -156,16 +156,22 @@ pass "NemoClaw installed (sandbox: $SANDBOX_NAME)" # ══════════════════════════════════════════════════════════════════ section "Phase 2: Config is writable (mutable default)" -# Verify file permissions — should start as 600 sandbox:sandbox +# Verify file permissions — fresh OpenClaw sandboxes start sandbox-owned. +# Depending on whether the image entrypoint ran as root, the mutable-default +# tree may already include the #2681 group-write/setgid contract or may only +# gain it after the first host-side shields-down. PERMS=$(openshell sandbox exec --name "${SANDBOX_NAME}" -- \ stat -c '%a %U:%G' "${CONFIG_PATH}" 2>/dev/null || true) info "Config perms (default): ${PERMS}" -if [ "$(echo "$PERMS" | awk '{print $1}')" = "600" ]; then - pass "Config file mode is 600 (mutable default)" -else - fail "Config file should start as mode 600: ${PERMS}" -fi +case "$(echo "$PERMS" | awk '{print $1}')" in + 600 | 660) + pass "Config file mode is mutable default (${PERMS})" + ;; + *) + fail "Config file should start as mutable mode 600 or 660: ${PERMS}" + ;; +esac if [ "$(echo "$PERMS" | awk '{print $2}')" = "sandbox:sandbox" ]; then pass "Config file owned by sandbox:sandbox (mutable default)" @@ -177,10 +183,19 @@ DIR_PERMS=$(openshell sandbox exec --name "${SANDBOX_NAME}" -- \ stat -c '%a %U:%G' "$(dirname "${CONFIG_PATH}")" 2>/dev/null || true) info "Config dir perms (default): ${DIR_PERMS}" -if [ "$(echo "$DIR_PERMS" | awk '{print $1}')" = "700" ]; then - pass "Config directory mode is 700 (mutable default)" +case "$(echo "$DIR_PERMS" | awk '{print $1}')" in + 700 | 2700 | 2770) + pass "Config directory mode is mutable default (${DIR_PERMS})" + ;; + *) + fail "Config directory should be mutable mode 700, 2700, or 2770: ${DIR_PERMS}" + ;; +esac + +if [ "$(echo "$DIR_PERMS" | awk '{print $2}')" = "sandbox:sandbox" ]; then + pass "Config directory owned by sandbox:sandbox (mutable default)" else - fail "Config directory should be mode 700: ${DIR_PERMS}" + fail "Config directory should be owned by sandbox:sandbox: ${DIR_PERMS}" fi STATUS_DEFAULT=$(nemoclaw "${SANDBOX_NAME}" shields status 2>&1) @@ -321,15 +336,16 @@ else fail "shields down did not report success: ${SHIELDS_DOWN_OUTPUT}" fi -# Check permissions changed — should be sandbox:sandbox 600/700 (doctor-aligned) +# Check permissions changed — OpenClaw uses group-writable + setgid mutable mode +# so the gateway UID and sandbox UID can both update config/state (#2681). PERMS_DOWN=$(openshell sandbox exec --name "${SANDBOX_NAME}" -- \ stat -c '%a %U:%G' "${CONFIG_PATH}" 2>/dev/null || true) info "Config perms (shields DOWN): ${PERMS_DOWN}" -if [ "$(echo "$PERMS_DOWN" | awk '{print $1}')" = "600" ]; then - pass "Config file mode is 600 (restored to mutable default)" +if [ "$(echo "$PERMS_DOWN" | awk '{print $1}')" = "660" ]; then + pass "Config file mode is 660 (restored to mutable group-writable default)" else - fail "Config file should be mode 600 after shields down: ${PERMS_DOWN}" + fail "Config file should be mode 660 after shields down: ${PERMS_DOWN}" fi if [ "$(echo "$PERMS_DOWN" | awk '{print $2}')" = "sandbox:sandbox" ]; then @@ -342,10 +358,10 @@ DIR_PERMS_DOWN=$(openshell sandbox exec --name "${SANDBOX_NAME}" -- \ stat -c '%a %U:%G' "$(dirname "${CONFIG_PATH}")" 2>/dev/null || true) info "Config dir perms (shields DOWN): ${DIR_PERMS_DOWN}" -if [ "$(echo "$DIR_PERMS_DOWN" | awk '{print $1}')" = "700" ]; then - pass "Config directory mode is 700 (restored to mutable default)" +if [ "$(echo "$DIR_PERMS_DOWN" | awk '{print $1}')" = "2770" ]; then + pass "Config directory mode is 2770 (restored to mutable setgid default)" else - fail "Config directory should be mode 700 after shields down: ${DIR_PERMS_DOWN}" + fail "Config directory should be mode 2770 after shields down: ${DIR_PERMS_DOWN}" fi if [ "$(echo "$DIR_PERMS_DOWN" | awk '{print $2}')" = "sandbox:sandbox" ]; then diff --git a/test/fetch-guard-patch-regression.test.ts b/test/fetch-guard-patch-regression.test.ts index 8c42701f21..a12a859494 100644 --- a/test/fetch-guard-patch-regression.test.ts +++ b/test/fetch-guard-patch-regression.test.ts @@ -26,8 +26,10 @@ function dockerRunCommandBetween(startMarker: string, endMarker: string): string .replace(/^RUN\s+/, "") .split("\n") .filter((line) => !line.trimStart().startsWith("#")) - .join("\n"); - return command.replace(/\\\s*$/, ""); + .join("\n") + .replace(/\\\n/g, " ") + .replace(/\\\s*$/, ""); + return command; } function runOpenClawUpgradeBlock(currentVersion: string) { diff --git a/test/nemoclaw-start.test.ts b/test/nemoclaw-start.test.ts index 9752c6314e..1f8c4001bb 100644 --- a/test/nemoclaw-start.test.ts +++ b/test/nemoclaw-start.test.ts @@ -1451,7 +1451,6 @@ describe("Telegram diagnostics (#2766)", () => { 'harden_auth_profiles() { :; }', 'chown() { :; }', 'chown_tree_no_symlink_follow() { :; }', - 'normalize_mutable_config_perms() { :; }', 'start_persistent_gateway_log_mirror() { :; }', 'gosu() { shift; "$@"; }', 'validate_tmp_permissions() { printf "VALIDATE:%s\\n" "$*"; }', diff --git a/test/repro-2681-group-writable.test.ts b/test/repro-2681-group-writable.test.ts index a389584373..5d2f6c102c 100644 --- a/test/repro-2681-group-writable.test.ts +++ b/test/repro-2681-group-writable.test.ts @@ -16,7 +16,7 @@ import fs from "node:fs"; import os from "node:os"; import path from "node:path"; import { spawnSync } from "node:child_process"; -import { describe, expect, it } from "vitest"; +import { describe, expect, it, vi } from "vitest"; const START_SCRIPT = path.join(import.meta.dirname, "..", "scripts", "nemoclaw-start.sh"); @@ -40,6 +40,38 @@ function modeBits(filePath: string): number { return fs.statSync(filePath).mode; } +function withMockedDockerExecFileSync(calls: string[][], run: () => T): T { + // eslint-disable-next-line @typescript-eslint/no-require-imports + const dockerExecModule = require("../dist/lib/docker/exec.js") as { + dockerExecFileSync: (args: readonly string[]) => string; + }; + const originalDockerExecFileSync = dockerExecModule.dockerExecFileSync; + const shieldsModulePath = require.resolve("../dist/lib/shields.js"); + delete require.cache[shieldsModulePath]; + + dockerExecModule.dockerExecFileSync = vi.fn((args: readonly string[]) => { + const separator = args.indexOf("--"); + const command = separator >= 0 ? args.slice(separator + 1) : [...args]; + calls.push(command); + if (command[0] === "stat" && command[1] === "-c") { + return command.at(-1) === "/sandbox/.openclaw" + ? "2770 sandbox:sandbox\n" + : "660 sandbox:sandbox\n"; + } + if (command[0] === "lsattr") { + return `---------------------- ${command.at(-1)}\n`; + } + return ""; + }); + + try { + return run(); + } finally { + dockerExecModule.dockerExecFileSync = originalDockerExecFileSync; + delete require.cache[shieldsModulePath]; + } +} + function mkdtempOnPosixFs(prefix: string): string { const roots = process.platform === "linux" ? ["/tmp", os.tmpdir()] : [os.tmpdir()]; let lastError: unknown = null; @@ -93,50 +125,29 @@ describe("Issue #2681 — mutable OpenClaw config permissions", () => { }); it("shields-down restores OpenClaw group-writable file modes and setgid dirs", () => { - const probe = spawnSync( - process.execPath, - [ - "-e", - String.raw` -const Module = require("node:module"); -const originalLoad = Module._load; -const calls = []; -Module._load = function patchedLoad(request, parent, isMain) { - if (request === "./docker/exec") { - return { - dockerExecFileSync(args) { - const separator = args.indexOf("--"); - const command = separator >= 0 ? args.slice(separator + 1) : args; - calls.push(command); - if (command[0] === "stat" && command[1] === "-c") { - return command.at(-1) === "/sandbox/.openclaw" - ? "2770 sandbox:sandbox\n" - : "660 sandbox:sandbox\n"; - } - if (command[0] === "lsattr") { - return "---------------------- " + command.at(-1) + "\n"; - } - return ""; - }, - }; - } - return originalLoad.call(this, request, parent, isMain); -}; -const { unlockAgentConfig } = require("./dist/lib/shields.js"); -unlockAgentConfig("sandbox-pod", { - agentName: "openclaw", - configPath: "/sandbox/.openclaw/openclaw.json", - configDir: "/sandbox/.openclaw", - sensitiveFiles: ["/sandbox/.openclaw/.config-hash"], -}); -process.stdout.write(JSON.stringify(calls)); -`, - ], - { encoding: "utf-8", timeout: 5000 }, - ); + const commands: string[][] = []; + withMockedDockerExecFileSync(commands, () => { + // eslint-disable-next-line @typescript-eslint/no-require-imports + const { unlockAgentConfig } = require("../dist/lib/shields.js") as { + unlockAgentConfig: ( + sandboxName: string, + target: { + agentName?: string; + configPath: string; + configDir: string; + sensitiveFiles?: string[]; + }, + ) => void; + }; + + unlockAgentConfig("sandbox-pod", { + agentName: "openclaw", + configPath: "/sandbox/.openclaw/openclaw.json", + configDir: "/sandbox/.openclaw", + sensitiveFiles: ["/sandbox/.openclaw/.config-hash"], + }); + }); - expect(probe.status).toBe(0); - const commands = JSON.parse(probe.stdout) as string[][]; expect(commands).toContainEqual(["chmod", "660", "/sandbox/.openclaw/openclaw.json"]); expect(commands).toContainEqual(["chmod", "660", "/sandbox/.openclaw/.config-hash"]); expect(commands).toContainEqual(["chmod", "2770", "/sandbox/.openclaw"]); diff --git a/test/shields.test.ts b/test/shields.test.ts index 6d3d31102d..49f5d73a64 100644 --- a/test/shields.test.ts +++ b/test/shields.test.ts @@ -418,6 +418,10 @@ describe("NC-2227-05: shields.ts locks state directories", () => { expect(src).toContain("function applyStateDirLockMode"); expect(src).toContain("workspace-*"); expect(fnBody).toContain("applyStateDirLockMode"); + expect(fnBody).toContain('["chmod", "g-s", target.configDir]'); + expect(src).toContain('["chmod", "g-s", dirPath]'); + expect(src).toContain("Best effort; do not skip recursive write stripping."); + expect(src).toContain('if [ "$dir_mode" = "755" ]; then chmod g-s "$dir"'); expect(fnBody).toContain("chown"); expect(fnBody).toContain("root:root"); });