From 20d128b4bef9037dd6f2e12f83a8b88f3c2d5bd2 Mon Sep 17 00:00:00 2001
From: Aaron Erickson <aerickson@nvidia.com>
Date: Thu, 30 Apr 2026 12:30:15 -0700
Subject: [PATCH 01/14] fix(recovery): add connect probe recovery path

Signed-off-by: Aaron Erickson <aerickson@nvidia.com>
---
 src/lib/agent-runtime.test.ts                 |  72 +++-
 src/lib/agent-runtime.ts                      |  98 ++++-
 src/lib/command-registry.ts                   |   2 +-
 src/nemoclaw.ts                               | 221 +++++++----
 test/cli.test.ts                              | 360 +++++++++++++++++-
 .../test-issue-2478-crash-loop-recovery.sh    |  56 ++-
 6 files changed, 717 insertions(+), 92 deletions(-)
diff --git a/src/lib/agent-runtime.test.ts b/src/lib/agent-runtime.test.ts
index dedbee930f..a1f96df2b2 100644
--- a/src/lib/agent-runtime.test.ts
+++ b/src/lib/agent-runtime.test.ts
@@ -3,7 +3,7 @@
 
 import { describe, it, expect } from "vitest";
 // Import from compiled dist/ so coverage is attributed correctly.
-import { buildRecoveryScript } from "../../dist/lib/agent-runtime";
+import { buildOpenClawRecoveryScript, buildRecoveryScript } from "../../dist/lib/agent-runtime";
 import type { AgentDefinition } from "./agent-defs";
 
 function makeAgent(overrides: Partial<AgentDefinition> = {}): AgentDefinition {
@@ -60,13 +60,13 @@ describe("buildRecoveryScript", () => {
   it("launches the default gateway command through the validated agent binary", () => {
     const script = buildRecoveryScript(minimalAgent, 19000);
     expect(script).toContain("command -v 'test-agent'");
-    expect(script).toContain('nohup "$AGENT_BIN" gateway run --port 19000');
+    expect(script).toContain('"$AGENT_BIN" gateway run --port 19000');
   });
 
   it("falls back to openclaw gateway run when gateway_command is absent", () => {
     const agent = makeAgent({ gateway_command: undefined });
     const script = buildRecoveryScript(agent, 19000);
-    expect(script).toContain('nohup "$AGENT_BIN" gateway run --port 19000');
+    expect(script).toContain('"$AGENT_BIN" gateway run --port 19000');
   });
 
   it("validates and launches custom gateway commands explicitly", () => {
@@ -127,11 +127,13 @@ describe("buildRecoveryScript", () => {
       // anyone debugging a crash-loop. (#2478)
       expect(script).toContain('echo "$_W" >> /tmp/gateway.log');
       // And the warning must be deferred until AFTER gateway.log is
-      // freshly touched/chmod'd, otherwise the redirect targets a stale
-      // file that gets removed seconds later.
-      const touchIdx = script!.indexOf("touch /tmp/gateway.log");
+      // safely opened with O_NOFOLLOW, otherwise the redirect targets a
+      // stale or attacker-controlled file.
+      const prepareIdx = script!.indexOf("os.open(path, flags, 0o644)");
       const warnIdx = script!.indexOf('echo "$_W" >> /tmp/gateway.log');
-      expect(touchIdx).toBeLessThan(warnIdx);
+      expect(prepareIdx).toBeGreaterThanOrEqual(0);
+      expect(warnIdx).toBeGreaterThanOrEqual(0);
+      expect(prepareIdx).toBeLessThan(warnIdx);
     });
 
     it("appends (not truncates) gateway.log on launch so warnings survive", () => {
@@ -139,8 +141,62 @@ describe("buildRecoveryScript", () => {
       // Truncating with `>` wipes the [gateway-recovery] WARNING that the
       // recovery script wrote moments earlier — meaning a sysadmin tailing
       // gateway.log would see the eventual crash without the explanation.
-      expect(script).toContain(">> /tmp/gateway.log 2>&1 &");
+      expect(script).toContain('>> "$_GATEWAY_LOG" 2>&1 &');
       expect(script).not.toMatch(/[^>]> \/tmp\/gateway\.log 2>&1 &/);
     });
+
+    it("preserves an existing gateway.log and has a writable fallback log", () => {
+      const script = buildOpenClawRecoveryScript(18789);
+      expect(script).not.toContain("rm -f /tmp/gateway.log");
+      expect(script).toContain("_GATEWAY_LOG=/tmp/gateway.log");
+      expect(script).toContain("_GATEWAY_LOG=/tmp/gateway-recovery.log");
+    });
+
+    it("rejects a symlinked gateway.log before preparing the log", () => {
+      const script = buildOpenClawRecoveryScript(18789);
+      const noFollowIdx = script.indexOf("O_NOFOLLOW");
+      const openIdx = script.indexOf("os.open(path, flags, 0o644)");
+      const fchownIdx = script.indexOf("os.fchown(fd");
+      expect(script).toContain("refusing to prepare symlinked /tmp/gateway.log");
+      expect(script).toContain("sys.exit(1)");
+      expect(script).not.toContain(": > /tmp/gateway.log");
+      expect(script).not.toContain("chown 'gateway:gateway' /tmp/gateway.log");
+      expect(noFollowIdx).toBeGreaterThanOrEqual(0);
+      expect(openIdx).toBeGreaterThanOrEqual(0);
+      expect(fchownIdx).toBeGreaterThanOrEqual(0);
+      expect(noFollowIdx).toBeLessThan(openIdx);
+      expect(openIdx).toBeLessThan(fchownIdx);
+    });
+
+    it("prepares gateway.log for the real gateway-owned sandbox log", () => {
+      const script = buildOpenClawRecoveryScript(18789);
+      expect(script).toContain("os.fchown(fd");
+      expect(script).toContain("pw.pw_gid");
+      expect(script).not.toContain("grp.getgrnam");
+      expect(script).toContain("owner_mode = 0o644");
+      expect(script).toContain("os.fchmod(fd, owner_mode)");
+      expect(script).toContain("/tmp/gateway.log 'gateway'");
+      expect(script).toContain("gosu 'gateway'");
+    });
+
+    it("prepares auto-pair.log without unlinking or following symlinks", () => {
+      const script = buildOpenClawRecoveryScript(18789);
+      expect(script).toContain("refusing to prepare symlinked /tmp/auto-pair.log");
+      expect(script).toContain("/tmp/auto-pair.log 'sandbox'");
+      expect(script).toContain("owner_mode = 0o600");
+      expect(script).not.toContain("rm -f /tmp/auto-pair.log");
+      expect(script).not.toContain(": > /tmp/auto-pair.log");
+      expect(script).not.toContain("touch /tmp/auto-pair.log");
+      expect(script).not.toContain("chown sandbox:sandbox /tmp/auto-pair.log");
+      expect(script).not.toContain("chmod 600 /tmp/auto-pair.log");
+    });
+
+    it("does not force non-OpenClaw agents to run as the gateway user", () => {
+      const script = buildRecoveryScript(minimalAgent, 19000);
+      expect(script).not.toContain("chown gateway:gateway /tmp/gateway.log");
+      expect(script).not.toContain("chown 'gateway:gateway' /tmp/gateway.log");
+      expect(script).not.toContain("gosu gateway");
+      expect(script).not.toContain("gosu 'gateway'");
+    });
   });
 });
diff --git a/src/lib/agent-runtime.ts b/src/lib/agent-runtime.ts
index 495f86efa1..6995cc7d45 100644
--- a/src/lib/agent-runtime.ts
+++ b/src/lib/agent-runtime.ts
@@ -48,6 +48,93 @@ export function getHealthProbeUrl(agent: AgentDefinition | null): string {
   return agent.healthProbe?.url || `http://127.0.0.1:${DASHBOARD_PORT}/`;
 }
 
+function buildNoFollowLogSetupCommand(
+  path: string,
+  logOwnerUser?: string,
+  ownerMode = "0o644",
+): string {
+  const displayPath = path.replace(/\\/g, "\\\\").replace(/'/g, "\\'");
+  const prepareLog = [
+    "import errno, os, pwd, stat, sys",
+    "path = sys.argv[1]",
+    "owner = sys.argv[2] if len(sys.argv) > 2 else ''",
+    `owner_mode = ${ownerMode}`,
+    "fallback_mode = 0o600",
+    "flags = os.O_WRONLY | os.O_CREAT | os.O_TRUNC | getattr(os, 'O_NOFOLLOW', 0)",
+    "try:",
+    "    fd = os.open(path, flags, 0o644)",
+    "except OSError as exc:",
+    "    if exc.errno == errno.ELOOP:",
+    `        print('[gateway-recovery] ERROR: refusing to prepare symlinked ${displayPath}', file=sys.stderr)`,
+    "        sys.exit(1)",
+    "    if exc.errno in (errno.EACCES, errno.EPERM):",
+    `        print('[gateway-recovery] ERROR: ${displayPath} is not writable by recovery user', file=sys.stderr)`,
+    "        sys.exit(0)",
+    `    print(f'[gateway-recovery] ERROR: cannot prepare ${displayPath}: {exc}', file=sys.stderr)`,
+    "    sys.exit(1)",
+    "try:",
+    "    if not stat.S_ISREG(os.fstat(fd).st_mode):",
+    `        print('[gateway-recovery] ERROR: ${displayPath} is not a regular file', file=sys.stderr)`,
+    "        sys.exit(1)",
+    "    if owner and os.geteuid() == 0:",
+    "        try:",
+    "            pw = pwd.getpwnam(owner)",
+    "        except KeyError:",
+    "            os.fchmod(fd, fallback_mode)",
+    "        else:",
+    "            os.fchown(fd, pw.pw_uid, pw.pw_gid)",
+    "            os.fchmod(fd, owner_mode)",
+    "    else:",
+    "        os.fchmod(fd, fallback_mode)",
+    "finally:",
+    "    os.close(fd)",
+  ].join("\n");
+  return [
+    "python3",
+    "-c",
+    shellQuote(prepareLog),
+    path,
+    ...(logOwnerUser ? [shellQuote(logOwnerUser)] : []),
+  ].join(" ");
+}
+
+function buildGatewayLogSetup(includeAutoPairLog = false, logOwnerUser?: string): string[] {
+  const lines = [`${buildNoFollowLogSetupCommand("/tmp/gateway.log", logOwnerUser)};`];
+  if (includeAutoPairLog) {
+    lines.push(`${buildNoFollowLogSetupCommand("/tmp/auto-pair.log", "sandbox", "0o600")};`);
+  }
+  return lines;
+}
+
+function gatewayLaunchCommand(command: string, runAsUser?: string): string {
+  const userLaunch = `_GATEWAY_LOG=/tmp/gateway.log; if ! : >> "$_GATEWAY_LOG" 2>/dev/null; then _GATEWAY_LOG=/tmp/gateway-recovery.log; : > "$_GATEWAY_LOG" 2>/dev/null || true; fi; nohup ${command} >> "$_GATEWAY_LOG" 2>&1 &`;
+  if (!runAsUser) {
+    return userLaunch;
+  }
+  return `if [ "$(id -u)" = "0" ] && command -v gosu >/dev/null 2>&1 && id ${shellQuote(runAsUser)} >/dev/null 2>&1; then nohup gosu ${shellQuote(runAsUser)} ${command} >> /tmp/gateway.log 2>&1 & else ${userLaunch} fi`;
+}
+
+/**
+ * Build the OpenClaw recovery shell script used by the default sandbox.
+ */
+export function buildOpenClawRecoveryScript(port: number): string {
+  return [
+    "if [ -r /tmp/nemoclaw-proxy-env.sh ]; then . /tmp/nemoclaw-proxy-env.sh; _PE_MISSING=0; else _PE_MISSING=1; fi;",
+    "[ -f ~/.bashrc ] && . ~/.bashrc;",
+    'case "${NODE_OPTIONS:-}" in *nemoclaw-sandbox-safety-net*) _GUARDS_MISSING=0 ;; *) _GUARDS_MISSING=1 ;; esac;',
+    `if curl -sf --max-time 3 http://127.0.0.1:${port}/ > /dev/null 2>&1; then echo ALREADY_RUNNING; exit 0; fi;`,
+    "rm -rf /tmp/openclaw-*/gateway.*.lock 2>/dev/null;",
+    ...buildGatewayLogSetup(true, "gateway"),
+    '[ "$_PE_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing - gateway launching without library guards (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };',
+    '[ "$_GUARDS_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: NODE_OPTIONS missing safety-net preload - gateway may crash on unhandled library errors (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };',
+    'OPENCLAW="$(command -v openclaw)";',
+    'if [ -z "$OPENCLAW" ]; then echo OPENCLAW_MISSING; exit 1; fi;',
+    gatewayLaunchCommand('"$OPENCLAW" gateway run --port ' + port, "gateway"),
+    "GPID=$!; sleep 2;",
+    'if kill -0 "$GPID" 2>/dev/null; then echo "GATEWAY_PID=$GPID"; else echo GATEWAY_FAILED; cat /tmp/gateway.log 2>/dev/null | tail -5; fi',
+  ].join(" ");
+}
+
 /**
  * Build the recovery shell script for a non-OpenClaw agent.
  * Returns the script string, or null if agent is null (use existing inline
@@ -78,8 +165,8 @@ export function buildRecoveryScript(agent: AgentDefinition | null, port: number)
   // *why* the gateway is about to crash gets wiped by the same launch
   // that's about to crash on a missing guard. (#2478)
   const launchCommand = usesValidatedBinary
-    ? `nohup "$AGENT_BIN" gateway run --port ${port} >> /tmp/gateway.log 2>&1 &`
-    : `nohup ${configuredGatewayCommand} --port ${port} >> /tmp/gateway.log 2>&1 &`;
+    ? gatewayLaunchCommand(`"$AGENT_BIN" gateway run --port ${port}`)
+    : gatewayLaunchCommand(`${configuredGatewayCommand} --port ${port}`);
   const isHermes = agent.name === "hermes";
   const hermesHome = isHermes ? "export HERMES_HOME=/sandbox/.hermes; " : "";
 
@@ -106,10 +193,9 @@ export function buildRecoveryScript(agent: AgentDefinition | null, port: number)
     'case "${NODE_OPTIONS:-}" in *nemoclaw-sandbox-safety-net*) _GUARDS_MISSING=0 ;; *) _GUARDS_MISSING=1 ;; esac;',
     hermesHome,
     `if curl -sf --max-time 3 ${shellQuote(probeUrl)} > /dev/null 2>&1; then echo ALREADY_RUNNING; exit 0; fi;`,
-    "rm -f /tmp/gateway.log;",
-    "touch /tmp/gateway.log; chmod 600 /tmp/gateway.log;",
-    '[ "$_PE_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing — gateway launching without library guards (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };',
-    '[ "$_GUARDS_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: NODE_OPTIONS missing safety-net preload — gateway may crash on unhandled library errors (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };',
+    ...buildGatewayLogSetup(false),
+    '[ "$_PE_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing - gateway launching without library guards (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };',
+    '[ "$_GUARDS_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: NODE_OPTIONS missing safety-net preload - gateway may crash on unhandled library errors (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };',
     ...validationSteps,
     launchCommand,
     "GPID=$!; sleep 2;",
diff --git a/src/lib/command-registry.ts b/src/lib/command-registry.ts
index 85d842c4d5..7b841c52e9 100644
--- a/src/lib/command-registry.ts
+++ b/src/lib/command-registry.ts
@@ -97,6 +97,7 @@ export const COMMANDS: readonly CommandDef[] = [
   {
     usage: "nemoclaw <name> connect",
     description: "Shell into a running sandbox",
+    flags: "[--probe-only]",
     group: "Sandbox Management",
     scope: "sandbox",
   },
@@ -398,7 +399,6 @@ export const COMMANDS: readonly CommandDef[] = [
     scope: "global",
     hidden: true,
   },
-
 ] as const;
 
 /** All global-scope commands. */
diff --git a/src/nemoclaw.ts b/src/nemoclaw.ts
index d99bdbc527..99dc5c2de4 100644
--- a/src/nemoclaw.ts
+++ b/src/nemoclaw.ts
@@ -133,6 +133,8 @@ type SandboxCommandResult = {
   stderr: string;
 };
 
+const SANDBOX_EXEC_STARTED_MARKER = "__NEMOCLAW_SANDBOX_EXEC_STARTED__";
+
 type RecoveredSandboxMetadata = Partial<
   Pick<SandboxEntry, "model" | "provider" | "gpuEnabled" | "policies" | "nimContainer" | "agent">
 > & {
@@ -234,6 +236,7 @@ function executeSandboxCommand(sandboxName: string, command: string): SandboxCom
     timeout: OPENSHELL_PROBE_TIMEOUT_MS,
   });
   if (sshConfigResult.status !== 0) return null;
+  if (!sshConfigResult.output.trim()) return null;
 
   const tmpFile = path.join(os.tmpdir(), `nemoclaw-ssh-${process.pid}-${Date.now()}.conf`);
   fs.writeFileSync(tmpFile, sshConfigResult.output, { mode: 0o600 });
@@ -272,6 +275,46 @@ function executeSandboxCommand(sandboxName: string, command: string): SandboxCom
   }
 }
 
+function executeSandboxExecCommand(
+  sandboxName: string,
+  command: string,
+  timeout = 15000,
+): SandboxCommandResult | null {
+  const markedCommand = `printf '%s\\n' '${SANDBOX_EXEC_STARTED_MARKER}'; ${command}`;
+  try {
+    const result = spawnSync(
+      getOpenshellBinary(),
+      ["sandbox", "exec", "--name", sandboxName, "--", "sh", "-c", markedCommand],
+      {
+        cwd: ROOT,
+        encoding: "utf-8",
+        env: process.env,
+        stdio: ["ignore", "pipe", "pipe"],
+        timeout,
+      },
+    );
+    const stdout = (result.stdout || "").trim();
+    const stdoutLines = stdout.split(/\r?\n/);
+    const markerIndex = stdoutLines.indexOf(SANDBOX_EXEC_STARTED_MARKER);
+    if (markerIndex === -1) return null;
+    const commandStdoutLines = stdoutLines.slice(markerIndex + 1);
+    return {
+      status: result.error ? 1 : (result.status ?? 1),
+      stdout: commandStdoutLines.join("\n").trim(),
+      stderr: (result.stderr || "").trim(),
+    };
+  } catch {
+    return null;
+  }
+}
+
+function parseSandboxGatewayProbe(result: SandboxCommandResult | null): boolean | null {
+  if (!result) return null;
+  if (result.stdout === "RUNNING") return true;
+  if (result.stdout === "STOPPED") return false;
+  return null;
+}
+
 /**
  * Check whether the OpenClaw gateway process is running inside the sandbox.
  * Uses the gateway's HTTP endpoint (dashboard port) as the source of truth,
@@ -281,62 +324,40 @@ function executeSandboxCommand(sandboxName: string, command: string): SandboxCom
 function isSandboxGatewayRunning(sandboxName: string): boolean | null {
   const agent = agentRuntime.getSessionAgent(sandboxName);
   const probeUrl = agentRuntime.getHealthProbeUrl(agent);
-  const result = executeSandboxCommand(
-    sandboxName,
-    `curl -sf --max-time 3 ${shellQuote(probeUrl)} > /dev/null 2>&1 && echo RUNNING || echo STOPPED`,
-  );
-  if (!result) return null;
-  if (result.stdout === "RUNNING") return true;
-  if (result.stdout === "STOPPED") return false;
-  return null;
+  const command = `curl -sf --max-time 3 ${shellQuote(probeUrl)} > /dev/null 2>&1 && echo RUNNING || echo STOPPED`;
+  const execProbe = parseSandboxGatewayProbe(executeSandboxExecCommand(sandboxName, command));
+  if (execProbe !== null) return execProbe;
+  return parseSandboxGatewayProbe(executeSandboxCommand(sandboxName, command));
 }
 
 /**
- * Restart the OpenClaw gateway process inside the sandbox after a pod restart.
+ * Restart the gateway process inside the sandbox after a pod restart.
  * Cleans stale lock/temp files, sources proxy config, and launches the gateway
  * in the background. Returns true on success.
  */
 function recoverSandboxProcesses(sandboxName: string): boolean {
   const agent = agentRuntime.getSessionAgent(sandboxName);
   const agentScript = agentRuntime.buildRecoveryScript(agent, agent?.forwardPort ?? DASHBOARD_PORT);
-  const script =
-    agentScript ||
-    [
-      // Source /tmp/nemoclaw-proxy-env.sh explicitly so NODE_OPTIONS preload
-      // guards (safety-net, ciao, slack, …) survive gateway respawn. Without
-      // this, library errors crash-loop the gateway because the original
-      // .bashrc-only path silently failed when the env file was unreadable
-      // or the shell did not source ~/.bashrc. See #2478. Mirrors the
-      // hardened block in src/lib/agent-runtime.ts:buildRecoveryScript.
-      // Defer warning emission until AFTER touch+chmod gateway.log so
-      // warnings land in the persistent log a sysadmin would tail. Stderr
-      // alone hides them because executeSandboxCommand captures stderr
-      // without surfacing it. Mirrors src/lib/agent-runtime.ts.
-      "if [ -r /tmp/nemoclaw-proxy-env.sh ]; then . /tmp/nemoclaw-proxy-env.sh; _PE_MISSING=0; else _PE_MISSING=1; fi;",
-      "[ -f ~/.bashrc ] && . ~/.bashrc;",
-      'case "${NODE_OPTIONS:-}" in *nemoclaw-sandbox-safety-net*) _GUARDS_MISSING=0 ;; *) _GUARDS_MISSING=1 ;; esac;',
-      `if curl -sf --max-time 3 http://127.0.0.1:${DASHBOARD_PORT}/ > /dev/null 2>&1; then echo ALREADY_RUNNING; exit 0; fi;`,
-      "rm -rf /tmp/openclaw-*/gateway.*.lock 2>/dev/null;",
-      "rm -f /tmp/gateway.log /tmp/auto-pair.log;",
-      "touch /tmp/gateway.log; chmod 600 /tmp/gateway.log;",
-      "touch /tmp/auto-pair.log; chmod 600 /tmp/auto-pair.log;",
-      '[ "$_PE_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing — gateway launching without library guards (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };',
-      '[ "$_GUARDS_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: NODE_OPTIONS missing safety-net preload — gateway may crash on unhandled library errors (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };',
-      'OPENCLAW="$(command -v openclaw)";',
-      'if [ -z "$OPENCLAW" ]; then echo OPENCLAW_MISSING; exit 1; fi;',
-      // Append rather than truncate so [gateway-recovery] WARNING lines
-      // written above survive past the launch. (#2478)
-      `nohup "$OPENCLAW" gateway run --port ${DASHBOARD_PORT} >> /tmp/gateway.log 2>&1 &`,
-      "GPID=$!; sleep 2;",
-      'if kill -0 "$GPID" 2>/dev/null; then echo "GATEWAY_PID=$GPID"; else echo GATEWAY_FAILED; cat /tmp/gateway.log 2>/dev/null | tail -5; fi',
-    ].join(" ");
-
-  const result = executeSandboxCommand(sandboxName, script);
-  if (!result) return false;
-  return (
-    result.status === 0 &&
-    (result.stdout.includes("GATEWAY_PID=") || result.stdout.includes("ALREADY_RUNNING"))
-  );
+  const hasRecoveryMarker = (result: SandboxCommandResult | null) =>
+    !!(
+      result &&
+      (result.stdout.includes("GATEWAY_PID=") || result.stdout.includes("ALREADY_RUNNING"))
+    );
+  const recoveredSsh = (result: SandboxCommandResult | null) =>
+    !!(result && result.status === 0 && hasRecoveryMarker(result));
+
+  if (agentScript) {
+    // Non-OpenClaw manifests do not yet declare a runtime user for root
+    // sandbox exec. Recover them over SSH so the launch inherits the sandbox
+    // login user instead of creating root-owned agent state under /sandbox.
+    return recoveredSsh(executeSandboxCommand(sandboxName, agentScript));
+  }
+
+  const script = agentRuntime.buildOpenClawRecoveryScript(DASHBOARD_PORT);
+  const execResult = executeSandboxExecCommand(sandboxName, script, 30000);
+  if (hasRecoveryMarker(execResult)) return true;
+  if (execResult !== null) return false;
+  return recoveredSsh(executeSandboxCommand(sandboxName, script));
 }
 
 /**
@@ -1531,10 +1552,79 @@ async function listSandboxes(args: string[] = []): Promise<void> {
 
 // ── Sandbox-scoped actions ───────────────────────────────────────
 
-async function sandboxConnect(sandboxName: string) {
+type SandboxConnectOptions = {
+  probeOnly?: boolean;
+};
+
+function printSandboxConnectHelp(sandboxName = "<name>") {
+  console.log("");
+  console.log(`  Usage: nemoclaw ${sandboxName} connect [--probe-only]`);
+  console.log("");
+  console.log("  Options:");
+  console.log(
+    "    --probe-only                    Run recovery checks and exit without opening SSH",
+  );
+  console.log("    -h, --help                      Show this help");
+  console.log("");
+}
+
+function parseSandboxConnectArgs(sandboxName: string, actionArgs: string[]): SandboxConnectOptions {
+  const options: SandboxConnectOptions = {};
+  for (const arg of actionArgs) {
+    switch (arg) {
+      case "--dangerously-skip-permissions":
+        console.error("  --dangerously-skip-permissions was removed; use shields commands instead.");
+        printSandboxConnectHelp(sandboxName);
+        process.exit(1);
+      case "--probe-only":
+        options.probeOnly = true;
+        break;
+      case "--help":
+      case "-h":
+        printSandboxConnectHelp(sandboxName);
+        process.exit(0);
+        break;
+      default:
+        console.error(`  Unknown flag for connect: ${arg}`);
+        printSandboxConnectHelp(sandboxName);
+        process.exit(1);
+    }
+  }
+  return options;
+}
+
+async function sandboxConnect(
+  sandboxName: string,
+  { probeOnly = false }: SandboxConnectOptions = {},
+) {
   const { isSandboxReady, parseSandboxStatus } = require("./lib/onboard");
   await ensureLiveSandboxOrExit(sandboxName, { allowNonReadyPhase: true });
 
+  if (probeOnly) {
+    const processCheck = checkAndRecoverSandboxProcesses(sandboxName, { quiet: true });
+    const agent = agentRuntime.getSessionAgent(sandboxName);
+    const agentName = agentRuntime.getAgentDisplayName(agent);
+    if (!processCheck.checked) {
+      console.error(
+        `  Probe failed: could not inspect the ${agentName} gateway inside sandbox '${sandboxName}'.`,
+      );
+      process.exit(1);
+    }
+    if (processCheck.wasRunning) {
+      console.log(`  Probe complete: ${agentName} gateway is running in '${sandboxName}'.`);
+      return;
+    }
+    if (processCheck.recovered) {
+      console.log(`  Probe complete: recovered ${agentName} gateway in '${sandboxName}'.`);
+      return;
+    }
+    console.error(
+      `  Probe failed: ${agentName} gateway is not running in '${sandboxName}' and automatic recovery failed.`,
+    );
+    console.error("  Check /tmp/gateway.log inside the sandbox for details.");
+    process.exit(1);
+  }
+
   // Version staleness check — warn but don't block
   try {
     const versionCheck = sandboxVersion.checkAgentVersion(sandboxName);
@@ -4340,11 +4430,26 @@ const [cmd, ...args] = process.argv.slice(2);
   }
 
   // Sandbox-scoped commands: nemoclaw <name> <action>
+  const firstSandboxArg = args[0];
+  const implicitConnectArg =
+    firstSandboxArg === "--help" || firstSandboxArg === "-h" || firstSandboxArg === "--probe-only";
+  const requestedSandboxAction =
+    !firstSandboxArg || implicitConnectArg ? "connect" : firstSandboxArg;
+  const requestedSandboxActionArgs = !firstSandboxArg || implicitConnectArg ? args : args.slice(1);
+  if (
+    requestedSandboxAction === "connect" &&
+    requestedSandboxActionArgs.some((arg) => arg === "--help" || arg === "-h")
+  ) {
+    validateName(cmd, "sandbox name");
+    printSandboxConnectHelp(cmd);
+    return;
+  }
+
   // If the registry doesn't know this name but the action is a sandbox-scoped
   // command, attempt recovery — the sandbox may still be live with a stale registry.
   // Derived from command registry — single source of truth
   const sandboxActions = sandboxActionTokens();
-  if (!registry.getSandbox(cmd) && sandboxActions.includes(args[0] || "")) {
+  if (!registry.getSandbox(cmd) && sandboxActions.includes(requestedSandboxAction)) {
     validateName(cmd, "sandbox name");
     await recoverRegistryEntries({ requestedSandboxName: cmd });
     if (!registry.getSandbox(cmd)) {
@@ -4363,24 +4468,12 @@ const [cmd, ...args] = process.argv.slice(2);
   const sandbox = registry.getSandbox(cmd);
   if (sandbox) {
     validateName(cmd, "sandbox name");
-    const action = args[0] || "connect";
-    const actionArgs = args.slice(1);
+    const action = requestedSandboxAction;
+    const actionArgs = requestedSandboxActionArgs;
 
     switch (action) {
       case "connect":
-        if (actionArgs.length > 0) {
-          console.error(
-            `  Unknown connect argument${actionArgs.length === 1 ? "" : "s"}: ${actionArgs.join(" ")}`,
-          );
-          if (actionArgs.includes("--dangerously-skip-permissions")) {
-            console.error(
-              "  --dangerously-skip-permissions was removed; use shields commands instead.",
-            );
-          }
-          console.error(`  Usage: ${CLI_NAME} <name> connect`);
-          process.exit(1);
-        }
-        await sandboxConnect(cmd);
+        await sandboxConnect(cmd, parseSandboxConnectArgs(cmd, actionArgs));
         break;
       case "status":
         await sandboxStatus(cmd);
diff --git a/test/cli.test.ts b/test/cli.test.ts
index 34adc85690..6cb4b99a92 100644
--- a/test/cli.test.ts
+++ b/test/cli.test.ts
@@ -90,7 +90,7 @@ function readRecordedArgs(markerFile: string): string[] {
   return fs.readFileSync(markerFile, "utf8").trim().split(/\s+/);
 }
 
-function writeSandboxRegistry(home: string): void {
+function writeSandboxRegistry(home: string, sandboxOverrides: Record<string, unknown> = {}): void {
   const registryDir = path.join(home, ".nemoclaw");
   fs.mkdirSync(registryDir, { recursive: true });
   fs.writeFileSync(
@@ -103,6 +103,7 @@ function writeSandboxRegistry(home: string): void {
           provider: "nvidia-prod",
           gpuEnabled: false,
           policies: [],
+          ...sandboxOverrides,
         },
       },
       defaultSandbox: "alpha",
@@ -1415,6 +1416,363 @@ describe("CLI dispatch", () => {
     expect(calls.some((call) => call.startsWith("forward start --background 18789"))).toBe(false);
   });
 
+  it("shows connect help without opening an interactive session", () => {
+    const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-help-"));
+    const localBin = path.join(home, "bin");
+    const markerFile = path.join(home, "openshell-calls");
+    fs.mkdirSync(localBin, { recursive: true });
+    fs.writeFileSync(
+      path.join(localBin, "openshell"),
+      [
+        "#!/usr/bin/env bash",
+        `printf '%s\\n' "$*" >> ${JSON.stringify(markerFile)}`,
+        "exit 99",
+      ].join("\n"),
+      { mode: 0o755 },
+    );
+
+    const r = runWithEnv("alpha connect --help", {
+      HOME: home,
+      PATH: `${localBin}:${process.env.PATH || ""}`,
+    });
+    const implicit = runWithEnv("alpha --help", {
+      HOME: home,
+      PATH: `${localBin}:${process.env.PATH || ""}`,
+    });
+
+    expect(r.code).toBe(0);
+    expect(r.out).toContain("Usage: nemoclaw alpha connect");
+    expect(r.out).toContain("--probe-only");
+    expect(implicit.code).toBe(0);
+    expect(implicit.out).toContain("Usage: nemoclaw alpha connect");
+    expect(fs.existsSync(markerFile)).toBe(false);
+  });
+
+  it("rejects the removed skip-permissions connect flag", () => {
+    const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-flags-"));
+    writeSandboxRegistry(home);
+
+    const r = runWithEnv("alpha connect --dangerously-skip-permissions", {
+      HOME: home,
+    });
+
+    expect(r.code).toBe(1);
+    expect(r.out).toContain("--dangerously-skip-permissions was removed");
+  });
+
+  it("connect --probe-only recovers the gateway without opening SSH", () => {
+    const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-"));
+    const localBin = path.join(home, "bin");
+    const markerFile = path.join(home, "openshell-calls");
+    const stateFile = path.join(home, "probe-state");
+    fs.mkdirSync(localBin, { recursive: true });
+    writeSandboxRegistry(home);
+    fs.writeFileSync(stateFile, "stopped");
+    fs.writeFileSync(
+      path.join(localBin, "openshell"),
+      [
+        "#!/usr/bin/env bash",
+        `marker_file=${JSON.stringify(markerFile)}`,
+        `state_file=${JSON.stringify(stateFile)}`,
+        'printf \'%s\\n\' "$*" >> "$marker_file"',
+        'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then',
+        "  echo 'Sandbox:'",
+        "  echo",
+        "  echo '  Id: abc'",
+        "  echo '  Name: alpha'",
+        "  echo '  Namespace: openshell'",
+        "  echo '  Phase: Ready'",
+        "  exit 0",
+        "fi",
+        'if [ "$1" = "sandbox" ] && [ "$2" = "exec" ] && [ "$3" = "--name" ] && [ "$4" = "alpha" ]; then',
+        '  cmd="$8"',
+        '  case "$cmd" in',
+        '    *"OPENCLAW="*)',
+        '      echo recovered > "$state_file"',
+        "      echo '__NEMOCLAW_SANDBOX_EXEC_STARTED__'",
+        "      echo 'GATEWAY_PID=123'",
+        "      exit 42",
+        "      ;;",
+        "    *'curl -sf'*)",
+        "      echo '__NEMOCLAW_SANDBOX_EXEC_STARTED__'",
+        '      if [ "$(cat "$state_file")" = recovered ]; then echo RUNNING; else echo STOPPED; fi',
+        "      exit 0",
+        "      ;;",
+        "  esac",
+        "fi",
+        "exit 0",
+      ].join("\n"),
+      { mode: 0o755 },
+    );
+
+    const r = runWithEnv("alpha connect --probe-only", {
+      HOME: home,
+      PATH: `${localBin}:${process.env.PATH || ""}`,
+    });
+
+    expect(r.code).toBe(0);
+    expect(r.out).toContain("Probe complete: recovered OpenClaw gateway");
+    const calls = fs.readFileSync(markerFile, "utf8").trim().split("\n").filter(Boolean);
+    expect(calls).toContain("sandbox get alpha");
+    expect(calls.some((call) => call.startsWith("sandbox exec --name alpha -- sh -c"))).toBe(true);
+    expect(calls).not.toContain("sandbox ssh-config alpha");
+    expect(calls).not.toContain("sandbox connect alpha");
+  });
+
+  it("treats leading --probe-only as an implicit connect probe", () => {
+    const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-leading-"));
+    const localBin = path.join(home, "bin");
+    const markerFile = path.join(home, "openshell-calls");
+    fs.mkdirSync(localBin, { recursive: true });
+    writeSandboxRegistry(home);
+    fs.writeFileSync(
+      path.join(localBin, "openshell"),
+      [
+        "#!/usr/bin/env bash",
+        `marker_file=${JSON.stringify(markerFile)}`,
+        'printf \'%s\\n\' "$*" >> "$marker_file"',
+        'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then',
+        "  echo 'Sandbox:'",
+        "  echo",
+        "  echo '  Id: abc'",
+        "  echo '  Name: alpha'",
+        "  echo '  Namespace: openshell'",
+        "  echo '  Phase: Ready'",
+        "  exit 0",
+        "fi",
+        'if [ "$1" = "sandbox" ] && [ "$2" = "exec" ] && [ "$3" = "--name" ] && [ "$4" = "alpha" ]; then',
+        '  cmd="$8"',
+        '  if [[ "$cmd" == *"curl -sf"* ]]; then echo "__NEMOCLAW_SANDBOX_EXEC_STARTED__"; echo RUNNING; exit 0; fi',
+        '  if [[ "$cmd" == *"OPENCLAW="* ]]; then echo "__NEMOCLAW_SANDBOX_EXEC_STARTED__"; echo UNEXPECTED_RECOVERY; exit 1; fi',
+        "fi",
+        "exit 0",
+      ].join("\n"),
+      { mode: 0o755 },
+    );
+
+    const r = runWithEnv("alpha --probe-only", {
+      HOME: home,
+      PATH: `${localBin}:${process.env.PATH || ""}`,
+    });
+
+    expect(r.code).toBe(0);
+    expect(r.out).toContain("Probe complete: OpenClaw gateway is running");
+    const calls = fs.readFileSync(markerFile, "utf8").trim().split("\n").filter(Boolean);
+    expect(calls).toContain("sandbox get alpha");
+    expect(calls.some((call) => call.startsWith("sandbox exec --name alpha -- sh -c"))).toBe(true);
+    expect(calls).not.toContain("sandbox ssh-config alpha");
+    expect(calls).not.toContain("sandbox connect alpha");
+  });
+
+  it("connect --probe-only does not retry a failed sandbox exec recovery over SSH", () => {
+    const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-no-ssh-"));
+    const localBin = path.join(home, "bin");
+    const markerFile = path.join(home, "openshell-calls");
+    fs.mkdirSync(localBin, { recursive: true });
+    writeSandboxRegistry(home);
+    fs.writeFileSync(
+      path.join(localBin, "openshell"),
+      [
+        "#!/usr/bin/env bash",
+        `marker_file=${JSON.stringify(markerFile)}`,
+        'printf \'%s\\n\' "$*" >> "$marker_file"',
+        'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then',
+        "  echo 'Sandbox:'",
+        "  echo",
+        "  echo '  Id: abc'",
+        "  echo '  Name: alpha'",
+        "  echo '  Namespace: openshell'",
+        "  echo '  Phase: Ready'",
+        "  exit 0",
+        "fi",
+        'if [ "$1" = "sandbox" ] && [ "$2" = "exec" ] && [ "$3" = "--name" ] && [ "$4" = "alpha" ]; then',
+        '  cmd="$8"',
+        '  if [[ "$cmd" == *"OPENCLAW="* ]]; then echo "__NEMOCLAW_SANDBOX_EXEC_STARTED__"; echo RECOVERY_FAILED >&2; exit 42; fi',
+        '  if [[ "$cmd" == *"curl -sf"* ]]; then echo "__NEMOCLAW_SANDBOX_EXEC_STARTED__"; echo STOPPED; exit 0; fi',
+        "fi",
+        'if [ "$1" = "sandbox" ] && [ "$2" = "ssh-config" ]; then',
+        "  echo 'Host openshell-alpha'",
+        "  exit 0",
+        "fi",
+        "exit 0",
+      ].join("\n"),
+      { mode: 0o755 },
+    );
+
+    const r = runWithEnv("alpha connect --probe-only", {
+      HOME: home,
+      PATH: `${localBin}:${process.env.PATH || ""}`,
+    });
+
+    expect(r.code).toBe(1);
+    const calls = fs.readFileSync(markerFile, "utf8").trim().split("\n").filter(Boolean);
+    expect(calls).toContain("sandbox get alpha");
+    expect(calls.some((call) => call.startsWith("sandbox exec --name alpha -- sh -c"))).toBe(true);
+    expect(calls).not.toContain("sandbox ssh-config alpha");
+  });
+
+  it("connect --probe-only falls back to SSH when sandbox exec never starts", () => {
+    const home = fs.mkdtempSync(
+      path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-exec-fallback-"),
+    );
+    const localBin = path.join(home, "bin");
+    const openshellCalls = path.join(home, "openshell-calls");
+    const sshCalls = path.join(home, "ssh-calls");
+    const stateFile = path.join(home, "probe-state");
+    fs.mkdirSync(localBin, { recursive: true });
+    writeSandboxRegistry(home);
+    fs.writeFileSync(stateFile, "stopped");
+    fs.writeFileSync(
+      path.join(localBin, "openshell"),
+      [
+        "#!/usr/bin/env bash",
+        `calls=${JSON.stringify(openshellCalls)}`,
+        'printf \'%s\\n\' "$*" >> "$calls"',
+        'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then',
+        "  echo 'Sandbox:'",
+        "  echo",
+        "  echo '  Id: abc'",
+        "  echo '  Name: alpha'",
+        "  echo '  Namespace: openshell'",
+        "  echo '  Phase: Ready'",
+        "  exit 0",
+        "fi",
+        'if [ "$1" = "sandbox" ] && [ "$2" = "exec" ]; then',
+        "  echo 'error: sandbox exec transport failed before command start' >&2",
+        "  exit 2",
+        "fi",
+        'if [ "$1" = "sandbox" ] && [ "$2" = "ssh-config" ] && [ "$3" = "alpha" ]; then',
+        "  echo 'Host openshell-alpha'",
+        "  echo '  HostName 127.0.0.1'",
+        "  echo '  User sandbox'",
+        "  exit 0",
+        "fi",
+        "exit 0",
+      ].join("\n"),
+      { mode: 0o755 },
+    );
+    fs.writeFileSync(
+      path.join(localBin, "ssh"),
+      [
+        "#!/usr/bin/env bash",
+        `calls=${JSON.stringify(sshCalls)}`,
+        `state_file=${JSON.stringify(stateFile)}`,
+        'cmd="${@: -1}"',
+        'printf \'%s\\n\' "$cmd" >> "$calls"',
+        'if [[ "$cmd" == *"OPENCLAW="* ]]; then',
+        '  echo recovered > "$state_file"',
+        "  echo 'GATEWAY_PID=456'",
+        "  exit 0",
+        "fi",
+        'if [[ "$cmd" == *"curl -sf"* ]]; then',
+        '  if [ "$(cat "$state_file")" = recovered ]; then echo RUNNING; else echo STOPPED; fi',
+        "  exit 0",
+        "fi",
+        "exit 1",
+      ].join("\n"),
+      { mode: 0o755 },
+    );
+
+    const r = runWithEnv("alpha connect --probe-only", {
+      HOME: home,
+      PATH: `${localBin}:${process.env.PATH || ""}`,
+    });
+
+    expect(r.code).toBe(0);
+    expect(r.out).toContain("Probe complete: recovered OpenClaw gateway");
+    const openshellLog = fs.readFileSync(openshellCalls, "utf8");
+    const sshLog = fs.readFileSync(sshCalls, "utf8");
+    expect(openshellLog).toContain("sandbox exec --name alpha -- sh -c");
+    expect(openshellLog).toContain("sandbox ssh-config alpha");
+    expect(sshLog).toContain('OPENCLAW="$(command -v openclaw)"');
+  });
+
+  it("recovers non-OpenClaw agents over SSH instead of root sandbox exec", () => {
+    const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-agent-"));
+    const localBin = path.join(home, "bin");
+    const openshellCalls = path.join(home, "openshell-calls");
+    const sshCalls = path.join(home, "ssh-calls");
+    const stateFile = path.join(home, "probe-state");
+    fs.mkdirSync(localBin, { recursive: true });
+    writeSandboxRegistry(home, { agent: "hermes" });
+    fs.writeFileSync(stateFile, "stopped");
+    fs.writeFileSync(
+      path.join(localBin, "openshell"),
+      [
+        "#!/usr/bin/env bash",
+        `calls=${JSON.stringify(openshellCalls)}`,
+        `state_file=${JSON.stringify(stateFile)}`,
+        'printf \'%s\\n\' "$*" >> "$calls"',
+        'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then',
+        "  echo 'Sandbox:'",
+        "  echo",
+        "  echo '  Id: abc'",
+        "  echo '  Name: alpha'",
+        "  echo '  Namespace: openshell'",
+        "  echo '  Phase: Ready'",
+        "  exit 0",
+        "fi",
+        'if [ "$1" = "sandbox" ] && [ "$2" = "exec" ] && [ "$3" = "--name" ] && [ "$4" = "alpha" ]; then',
+        '  cmd="$8"',
+        '  if [[ "$cmd" == *"curl -sf"* ]]; then',
+        "    echo '__NEMOCLAW_SANDBOX_EXEC_STARTED__'",
+        '    if [ "$(cat "$state_file")" = recovered ]; then echo RUNNING; else echo STOPPED; fi',
+        "    exit 0",
+        "  fi",
+        '  if [[ "$cmd" == *"HERMES_HOME=/sandbox/.hermes"* || "$cmd" == *"AGENT_BIN="* ]]; then',
+        "    echo '__NEMOCLAW_SANDBOX_EXEC_STARTED__'",
+        "    echo UNEXPECTED_ROOT_EXEC_RECOVERY",
+        "    exit 1",
+        "  fi",
+        "fi",
+        'if [ "$1" = "sandbox" ] && [ "$2" = "ssh-config" ] && [ "$3" = "alpha" ]; then',
+        "  echo 'Host openshell-alpha'",
+        "  echo '  HostName 127.0.0.1'",
+        "  echo '  User sandbox'",
+        "  exit 0",
+        "fi",
+        'if [ "$1" = "forward" ]; then',
+        "  exit 0",
+        "fi",
+        "exit 0",
+      ].join("\n"),
+      { mode: 0o755 },
+    );
+    fs.writeFileSync(
+      path.join(localBin, "ssh"),
+      [
+        "#!/usr/bin/env bash",
+        `calls=${JSON.stringify(sshCalls)}`,
+        `state_file=${JSON.stringify(stateFile)}`,
+        'cmd="${@: -1}"',
+        'printf \'%s\\n\' "$cmd" >> "$calls"',
+        'if [[ "$cmd" == *"AGENT_BIN=\'/usr/local/bin/hermes\'"* ]]; then',
+        '  echo recovered > "$state_file"',
+        "  echo 'GATEWAY_PID=789'",
+        "  exit 0",
+        "fi",
+        "exit 1",
+      ].join("\n"),
+      { mode: 0o755 },
+    );
+
+    const r = runWithEnv("alpha connect --probe-only", {
+      HOME: home,
+      PATH: `${localBin}:${process.env.PATH || ""}`,
+    });
+
+    expect(r.code).toBe(0);
+    expect(r.out).toContain("Probe complete: recovered Hermes Agent gateway");
+    const openshellLog = fs.readFileSync(openshellCalls, "utf8");
+    const sshLog = fs.readFileSync(sshCalls, "utf8");
+    expect(openshellLog).toContain("sandbox exec --name alpha -- sh -c");
+    expect(openshellLog).toContain("sandbox ssh-config alpha");
+    expect(openshellLog).not.toContain("HERMES_HOME=/sandbox/.hermes");
+    expect(openshellLog).not.toContain("AGENT_BIN=");
+    expect(sshLog).toContain("HERMES_HOME=/sandbox/.hermes");
+    expect(sshLog).toContain("AGENT_BIN='/usr/local/bin/hermes'");
+  });
+
   it("waits for sandbox readiness before connecting", () => {
     const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-wait-"));
     const localBin = path.join(home, "bin");
diff --git a/test/e2e/test-issue-2478-crash-loop-recovery.sh b/test/e2e/test-issue-2478-crash-loop-recovery.sh
index 0bdbdb8aec..88d76bba67 100755
--- a/test/e2e/test-issue-2478-crash-loop-recovery.sh
+++ b/test/e2e/test-issue-2478-crash-loop-recovery.sh
@@ -30,11 +30,12 @@
 #   2. Verifies the *initial* gateway has the safety-net + ciao guard active
 #      (via /proc/<pid>/environ on the gateway PID).
 #   3. Crash-recovery loop (NORMAL): kill the gateway 5x, each time triggers
-#      `nemoclaw <name> connect` (which calls recoverSandboxProcesses), and
-#      checks the respawned gateway still has guards in NODE_OPTIONS.
+#      `nemoclaw <name> connect --probe-only` (which calls
+#      recoverSandboxProcesses), and checks the respawned gateway still has
+#      guards in NODE_OPTIONS.
 #   4. Negative case: removes /tmp/nemoclaw-proxy-env.sh, kills the gateway,
 #      triggers recovery — expects the new "[gateway-recovery] WARNING"
-#      line in stderr (visible via gateway.log) instead of silent guard loss.
+#      line in gateway.log instead of silent guard loss.
 #   5. Soak: leaves the sandbox idle for $NEMOCLAW_E2E_SOAK_SECONDS
 #      (default 300) so the health-monitor restart cadence (~4 min in prod)
 #      gets at least one chance to fire, then asserts the gateway has not
@@ -236,6 +237,20 @@ gateway_diagnostics() {
   echo "  ---------------------------"
 }
 
+run_probe_only_or_fail() {
+  local context="$1"
+  local probe_out
+  probe_out="$(mktemp)"
+  if ! timeout 60 nemoclaw "$SANDBOX_NAME" connect --probe-only >"$probe_out" 2>&1; then
+    fail "${context}: connect --probe-only exited nonzero"
+    sed 's/^/    /' "$probe_out"
+    rm -f "$probe_out"
+    gateway_diagnostics ""
+    exit 1
+  fi
+  rm -f "$probe_out"
+}
+
 # Wait until gateway PID is non-empty (or timeout). Echoes pid, returns 0/1.
 wait_for_gateway_up() {
   local timeout="${1:-30}"
@@ -362,11 +377,18 @@ for cycle in $(seq 1 "$CRASH_CYCLES"); do
   info "Cycle $cycle/$CRASH_CYCLES — killing gateway pid=$prev_pid"
   sandbox_exec sh -c "kill -9 $prev_pid 2>/dev/null; sleep 1; pgrep -fo '[o]penclaw[ -]gateway' || echo DEAD" >/dev/null
 
-  # Trigger recovery via the actual code path: `nemoclaw <name> status`
-  # calls checkAndRecoverSandboxProcesses() → recoverSandboxProcesses(),
-  # which is the hardened path #2478 changes. Bound it with `timeout`
-  # so a hang in CLI internals cannot eat the whole 30-min job budget.
-  timeout 60 nemoclaw "$SANDBOX_NAME" status >/dev/null 2>&1 || true
+  # Trigger recovery via the actual operator probe path:
+  # `nemoclaw <name> connect --probe-only` calls
+  # checkAndRecoverSandboxProcesses() -> recoverSandboxProcesses() without
+  # opening an interactive SSH session. Bound it with `timeout` so a hang in
+  # CLI internals cannot eat the whole 30-min job budget.
+  run_probe_only_or_fail "Cycle $cycle after gateway kill"
+
+  if ! sandbox_exec sh -c 'test -s /tmp/gateway.log'; then
+    fail "Cycle $cycle: connect --probe-only did not leave /tmp/gateway.log evidence"
+    gateway_diagnostics ""
+    exit 1
+  fi
 
   new_pid="$(wait_for_gateway_up 45)"
   if [ -z "$new_pid" ]; then
@@ -426,9 +448,10 @@ info "Snapshotted proxy-env.sh ($SNAPSHOT_SIZE bytes, ${#SNAPSHOT_B64}-char base
 # recovery script (which is the only path that emits the warning).
 sandbox_exec sh -c 'rm -f /tmp/nemoclaw-proxy-env.sh' >/dev/null
 sandbox_exec sh -c "pkill -9 -f '[o]penclaw' 2>/dev/null; sleep 2; pgrep -af '[o]penclaw' || echo ALL_DEAD" >/dev/null
-timeout 60 nemoclaw "$SANDBOX_NAME" status >/dev/null 2>&1 || true
+run_probe_only_or_fail "Negative case after proxy-env removal"
 
-# The new gateway.log should contain the [gateway-recovery] WARNING line.
+# The new gateway.log should contain the [gateway-recovery] WARNING line and
+# recovery should have attempted a real gateway respawn.
 warn_seen=false
 for _ in 1 2 3 4 5; do
   if gateway_log_tail 100 | grep -q '\[gateway-recovery\] WARNING'; then
@@ -443,6 +466,13 @@ else
   fail "Recovery silently launched without warning (regression of #2478 fix)"
   gateway_log_tail 100
 fi
+NEGATIVE_PID="$(wait_for_gateway_up 45)"
+if [ -z "$NEGATIVE_PID" ]; then
+  fail "Recovery warning was logged, but gateway did not respawn within 45s"
+  gateway_diagnostics ""
+  exit 1
+fi
+info "Negative-case recovery respawned gateway pid=$NEGATIVE_PID"
 
 # Restore proxy-env.sh by base64-injecting the snapshot via argv. `openshell
 # sandbox exec` does not pipe stdin from the caller through to the subshell,
@@ -458,8 +488,10 @@ if [ "$restored_size" != "$SNAPSHOT_SIZE" ]; then
 fi
 info "proxy-env.sh restored (${restored_size} bytes verified)"
 
-# Trigger recovery to bring the gateway back with guards intact.
-timeout 60 nemoclaw "$SANDBOX_NAME" status >/dev/null 2>&1 || true
+# Kill the guardless negative-case gateway, then trigger recovery to bring the
+# gateway back with guards intact from the restored env file.
+sandbox_exec sh -c "pkill -9 -f '[o]penclaw' 2>/dev/null; sleep 2; pgrep -af '[o]penclaw' || echo ALL_DEAD" >/dev/null
+run_probe_only_or_fail "Guard restore recovery"
 SOAK_START_PID="$(wait_for_gateway_up 30)"
 if [ -z "$SOAK_START_PID" ]; then
   fail "Gateway not up entering soak phase"

From 05bfe893da515956ba72b75cae1f9df932e98b84 Mon Sep 17 00:00:00 2001
From: Aaron Erickson <aerickson@nvidia.com>
Date: Thu, 30 Apr 2026 14:36:10 -0700
Subject: [PATCH 02/14] test(recovery): address CodeRabbit cleanup

Signed-off-by: Aaron Erickson <aerickson@nvidia.com>
---
 src/lib/agent-runtime.test.ts |  6 +++---
 src/nemoclaw.ts               |  2 +-
 test/cli.test.ts              | 14 ++++++++++++++
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/src/lib/agent-runtime.test.ts b/src/lib/agent-runtime.test.ts
index a1f96df2b2..1453b06639 100644
--- a/src/lib/agent-runtime.test.ts
+++ b/src/lib/agent-runtime.test.ts
@@ -129,11 +129,11 @@ describe("buildRecoveryScript", () => {
       // And the warning must be deferred until AFTER gateway.log is
       // safely opened with O_NOFOLLOW, otherwise the redirect targets a
       // stale or attacker-controlled file.
-      const prepareIdx = script!.indexOf("os.open(path, flags, 0o644)");
+      const gatewayPrepIdx = script!.indexOf(" /tmp/gateway.log;");
       const warnIdx = script!.indexOf('echo "$_W" >> /tmp/gateway.log');
-      expect(prepareIdx).toBeGreaterThanOrEqual(0);
+      expect(gatewayPrepIdx).toBeGreaterThanOrEqual(0);
       expect(warnIdx).toBeGreaterThanOrEqual(0);
-      expect(prepareIdx).toBeLessThan(warnIdx);
+      expect(gatewayPrepIdx).toBeLessThan(warnIdx);
     });
 
     it("appends (not truncates) gateway.log on launch so warnings survive", () => {
diff --git a/src/nemoclaw.ts b/src/nemoclaw.ts
index 99dc5c2de4..7ffafc6b4e 100644
--- a/src/nemoclaw.ts
+++ b/src/nemoclaw.ts
@@ -1558,7 +1558,7 @@ type SandboxConnectOptions = {
 
 function printSandboxConnectHelp(sandboxName = "<name>") {
   console.log("");
-  console.log(`  Usage: nemoclaw ${sandboxName} connect [--probe-only]`);
+  console.log(`  Usage: ${CLI_NAME} ${sandboxName} connect [--probe-only]`);
   console.log("");
   console.log("  Options:");
   console.log(
diff --git a/test/cli.test.ts b/test/cli.test.ts
index 6cb4b99a92..3e5b1974be 100644
--- a/test/cli.test.ts
+++ b/test/cli.test.ts
@@ -1450,14 +1450,28 @@ describe("CLI dispatch", () => {
 
   it("rejects the removed skip-permissions connect flag", () => {
     const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-flags-"));
+    const localBin = path.join(home, "bin");
+    const markerFile = path.join(home, "openshell-calls");
+    fs.mkdirSync(localBin, { recursive: true });
+    fs.writeFileSync(
+      path.join(localBin, "openshell"),
+      [
+        "#!/usr/bin/env bash",
+        `printf '%s\\n' "$*" >> ${JSON.stringify(markerFile)}`,
+        "exit 99",
+      ].join("\n"),
+      { mode: 0o755 },
+    );
     writeSandboxRegistry(home);
 
     const r = runWithEnv("alpha connect --dangerously-skip-permissions", {
       HOME: home,
+      PATH: `${localBin}:${process.env.PATH || ""}`,
     });
 
     expect(r.code).toBe(1);
     expect(r.out).toContain("--dangerously-skip-permissions was removed");
+    expect(fs.existsSync(markerFile)).toBe(false);
   });
 
   it("connect --probe-only recovers the gateway without opening SSH", () => {

From e6eb1afc28b9f0e2da0af968354b301b4860ffcf Mon Sep 17 00:00:00 2001
From: Aaron Erickson <aerickson@nvidia.com>
Date: Thu, 30 Apr 2026 15:15:44 -0700
Subject: [PATCH 03/14] test(recovery): assert probe fallback stays
 non-interactive

Signed-off-by: Aaron Erickson <aerickson@nvidia.com>
---
 test/cli.test.ts | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/test/cli.test.ts b/test/cli.test.ts
index 08e6fc5f30..f63ee7dea0 100644
--- a/test/cli.test.ts
+++ b/test/cli.test.ts
@@ -1686,7 +1686,8 @@ describe("CLI dispatch", () => {
         `calls=${JSON.stringify(sshCalls)}`,
         `state_file=${JSON.stringify(stateFile)}`,
         'cmd="${@: -1}"',
-        'printf \'%s\\n\' "$cmd" >> "$calls"',
+        'printf \'ARGS %s\\n\' "$*" >> "$calls"',
+        'printf \'CMD %s\\n\' "$cmd" >> "$calls"',
         'if [[ "$cmd" == *"OPENCLAW="* ]]; then',
         '  echo recovered > "$state_file"',
         "  echo 'GATEWAY_PID=456'",
@@ -1712,7 +1713,9 @@ describe("CLI dispatch", () => {
     const sshLog = fs.readFileSync(sshCalls, "utf8");
     expect(openshellLog).toContain("sandbox exec --name alpha -- sh -c");
     expect(openshellLog).toContain("sandbox ssh-config alpha");
+    expect(openshellLog).not.toContain("sandbox connect");
     expect(sshLog).toContain('OPENCLAW="$(command -v openclaw)"');
+    expect(sshLog).not.toMatch(/(^|\s)-tt?(\s|$)/);
   });
 
   it("recovers non-OpenClaw agents over SSH instead of root sandbox exec", () => {
@@ -1773,7 +1776,8 @@ describe("CLI dispatch", () => {
         `calls=${JSON.stringify(sshCalls)}`,
         `state_file=${JSON.stringify(stateFile)}`,
         'cmd="${@: -1}"',
-        'printf \'%s\\n\' "$cmd" >> "$calls"',
+        'printf \'ARGS %s\\n\' "$*" >> "$calls"',
+        'printf \'CMD %s\\n\' "$cmd" >> "$calls"',
         'if [[ "$cmd" == *"AGENT_BIN=\'/usr/local/bin/hermes\'"* ]]; then',
         '  echo recovered > "$state_file"',
         "  echo 'GATEWAY_PID=789'",
@@ -1797,8 +1801,10 @@ describe("CLI dispatch", () => {
     expect(openshellLog).toContain("sandbox ssh-config alpha");
     expect(openshellLog).not.toContain("HERMES_HOME=/sandbox/.hermes");
     expect(openshellLog).not.toContain("AGENT_BIN=");
+    expect(openshellLog).not.toContain("sandbox connect");
     expect(sshLog).toContain("HERMES_HOME=/sandbox/.hermes");
     expect(sshLog).toContain("AGENT_BIN='/usr/local/bin/hermes'");
+    expect(sshLog).not.toMatch(/(^|\s)-tt?(\s|$)/);
   });
 
   it("waits for sandbox readiness before connecting", () => {

From d1b9145364aa116da7d6dac04cbc9c0ca7d18329 Mon Sep 17 00:00:00 2001
From: Aaron Erickson <aerickson@nvidia.com>
Date: Fri, 1 May 2026 15:29:14 -0700
Subject: [PATCH 04/14] fix(recovery): stop on log hardening setup failure

Signed-off-by: Aaron Erickson <aerickson@nvidia.com>
---
 src/lib/agent-runtime.test.ts | 8 +++++++-
 src/lib/agent-runtime.ts      | 6 ++++--
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/lib/agent-runtime.test.ts b/src/lib/agent-runtime.test.ts
index a888a2a6e1..c5a2bd09db 100644
--- a/src/lib/agent-runtime.test.ts
+++ b/src/lib/agent-runtime.test.ts
@@ -131,13 +131,19 @@ describe("buildRecoveryScript", () => {
       // And the warning must be deferred until AFTER gateway.log is
       // safely opened with O_NOFOLLOW, otherwise the redirect targets a
       // stale or attacker-controlled file.
-      const gatewayPrepIdx = script!.indexOf(" /tmp/gateway.log;");
+      const gatewayPrepIdx = script!.indexOf(" /tmp/gateway.log || exit 1;");
       const warnIdx = script!.indexOf('echo "$_W" >> /tmp/gateway.log');
       expect(gatewayPrepIdx).toBeGreaterThanOrEqual(0);
       expect(warnIdx).toBeGreaterThanOrEqual(0);
       expect(gatewayPrepIdx).toBeLessThan(warnIdx);
     });
 
+    it("stops recovery when hardened log setup fails", () => {
+      const script = buildOpenClawRecoveryScript(18789);
+      expect(script).toContain(" /tmp/gateway.log 'gateway' || exit 1;");
+      expect(script).toContain(" /tmp/auto-pair.log 'sandbox' || exit 1;");
+    });
+
     it("appends (not truncates) gateway.log on launch so warnings survive", () => {
       const script = buildRecoveryScript(minimalAgent, 19000);
       // Truncating with `>` wipes the [gateway-recovery] WARNING that the
diff --git a/src/lib/agent-runtime.ts b/src/lib/agent-runtime.ts
index 25371fda62..71dc29b9a0 100644
--- a/src/lib/agent-runtime.ts
+++ b/src/lib/agent-runtime.ts
@@ -99,9 +99,11 @@ function buildNoFollowLogSetupCommand(
 }
 
 function buildGatewayLogSetup(includeAutoPairLog = false, logOwnerUser?: string): string[] {
-  const lines = [`${buildNoFollowLogSetupCommand("/tmp/gateway.log", logOwnerUser)};`];
+  const lines = [`${buildNoFollowLogSetupCommand("/tmp/gateway.log", logOwnerUser)} || exit 1;`];
   if (includeAutoPairLog) {
-    lines.push(`${buildNoFollowLogSetupCommand("/tmp/auto-pair.log", "sandbox", "0o600")};`);
+    lines.push(
+      `${buildNoFollowLogSetupCommand("/tmp/auto-pair.log", "sandbox", "0o600")} || exit 1;`,
+    );
   }
   return lines;
 }

From c4d0f637deeda293e9c0b4a2e0ad85a0d133f0d1 Mon Sep 17 00:00:00 2001
From: Aaron Erickson <aerickson@nvidia.com>
Date: Fri, 1 May 2026 16:02:32 -0700
Subject: [PATCH 05/14] fix(ci): keep PR checks green after main merge

Signed-off-by: Aaron Erickson <aerickson@nvidia.com>
---
 nemoclaw/eslint.config.mjs                |   1 +
 scripts/nemoclaw-start.sh                 |   2 +-
 test/fetch-guard-patch-regression.test.ts |   7 +-
 test/nemoclaw-start.test.ts               |   9 +-
 test/repro-2681-group-writable.test.ts    | 305 +++++++++++++++-------
 5 files changed, 222 insertions(+), 102 deletions(-)

diff --git a/nemoclaw/eslint.config.mjs b/nemoclaw/eslint.config.mjs
index aaca103e86..a3c88c8a40 100644
--- a/nemoclaw/eslint.config.mjs
+++ b/nemoclaw/eslint.config.mjs
@@ -10,6 +10,7 @@ export default [
       parserOptions: {
         projectService: {
           allowDefaultProject: ["src/*.test.ts", "src/*/*.test.ts"],
+          maximumDefaultProjectFileMatchCount_THIS_WILL_SLOW_DOWN_LINTING: 32,
         },
         tsconfigRootDir: import.meta.dirname,
       },
diff --git a/scripts/nemoclaw-start.sh b/scripts/nemoclaw-start.sh
index 2754710292..93d9fd47d5 100755
--- a/scripts/nemoclaw-start.sh
+++ b/scripts/nemoclaw-start.sh
@@ -230,7 +230,7 @@ normalize_mutable_config_perms() {
   # Detect shields-up. Config dir owned by root means shields are
   # currently locked; normalizing would weaken the contract.
   local config_dir_owner
-  config_dir_owner="$(stat -c '%U' "$config_dir" 2>/dev/null || echo unknown)"
+  config_dir_owner="$(stat -c '%U' "$config_dir" 2>/dev/null || stat -f '%Su' "$config_dir" 2>/dev/null || echo unknown)"
   if [ "$config_dir_owner" = "root" ]; then
     return 0
   fi
diff --git a/test/fetch-guard-patch-regression.test.ts b/test/fetch-guard-patch-regression.test.ts
index 7838058ca4..ba4c74508b 100644
--- a/test/fetch-guard-patch-regression.test.ts
+++ b/test/fetch-guard-patch-regression.test.ts
@@ -23,7 +23,12 @@ function dockerRunCommandBetween(startMarker: string, endMarker: string): string
   return dockerfile
     .slice(runIndex, end)
     .trim()
-    .replace(/^RUN\s+/, "");
+    .replace(/^RUN\s+/, "")
+    .split("\n")
+    .filter((line) => !line.trimStart().startsWith("#"))
+    .join("\n")
+    .replace(/\\\n/g, " ")
+    .replace(/\\\s*$/, "");
 }
 
 function runOpenClawUpgradeBlock(currentVersion: string) {
diff --git a/test/nemoclaw-start.test.ts b/test/nemoclaw-start.test.ts
index f390a33680..513194500d 100644
--- a/test/nemoclaw-start.test.ts
+++ b/test/nemoclaw-start.test.ts
@@ -1030,7 +1030,9 @@ describe("nemoclaw-start gateway launch signal handling", () => {
         "start_auto_pair() { sleep 30 & AUTO_PAIR_PID=$!; }",
         "cleanup_on_signal() { :; }",
         launchBlock(kind, gatewayLog),
-        "sleep 0.5",
+        kind === "root"
+          ? `for _ in {1..20}; do [ -s ${JSON.stringify(gosuLog)} ] && break; sleep 0.1; done`
+          : `for _ in {1..20}; do [ -s ${JSON.stringify(openclawLog)} ] && break; sleep 0.1; done`,
         'printf "GATEWAY_PID=%s\\n" "$GATEWAY_PID"',
         'printf "AUTO_PAIR_PID=%s\\n" "${AUTO_PAIR_PID:-}"',
         'printf "TAIL_PID=%s\\n" "${GATEWAY_LOG_TAIL_PID:-}"',
@@ -1070,10 +1072,10 @@ describe("nemoclaw-start gateway launch signal handling", () => {
   });
 
   it("launches the root gateway through gosu with the configured port and tracks child PIDs", () => {
-    const { result, openclaw, gosu } = runLaunchBlock("root");
+    const { result, gosu } = runLaunchBlock("root");
     expect(result.status).toBe(0);
     expect(gosu).toContain("user=gateway");
-    expect(openclaw).toContain("gateway run --port 19000");
+    expect(gosu).toContain("gateway run --port 19000");
     const gatewayPid = result.stdout.match(/GATEWAY_PID=(\d+)/)?.[1];
     expect(gatewayPid).toBeTruthy();
     expect(result.stdout).toContain(`WAIT_PID=${gatewayPid}`);
@@ -1442,6 +1444,7 @@ describe("Telegram diagnostics (#2766)", () => {
         'verify_no_slack_secrets_on_disk() { :; }',
         'write_auth_profile() { :; }',
         'harden_auth_profiles() { :; }',
+        'normalize_mutable_config_perms() { :; }',
         'chown() { :; }',
         'chown_tree_no_symlink_follow() { :; }',
         'gosu() { shift; "$@"; }',
diff --git a/test/repro-2681-group-writable.test.ts b/test/repro-2681-group-writable.test.ts
index c23a21810b..304524e4bb 100644
--- a/test/repro-2681-group-writable.test.ts
+++ b/test/repro-2681-group-writable.test.ts
@@ -1,120 +1,231 @@
 // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
-/**
- * Regression guards for the group-writable mutable-default contract (#2681).
- *
- * Before this PR, control-UI config mutations in the OpenClaw sandbox
- * (Enable Dreaming, account toggles, etc.) wrote through mutateConfigFile,
- * which targeted /sandbox/.openclaw/openclaw.json — owned sandbox:sandbox
- * mode 600. The gateway runs as a separate UID, so every mutation EACCES'd.
- *
- * The previous proposal (#2693) wrapped mutateConfigFile in a try/catch
- * that swallowed EACCES, making the mutation a silent no-op. That made
- * toggles non-functional in the sandbox.
- *
- * This PR replaces that approach with proper Unix group permissions:
- *   1. `gateway` is a member of the `sandbox` group (Dockerfile.base usermod).
- *   2. /sandbox/.openclaw is group-writable + setgid (chmod g+w + g+s).
- *   3. nemoclaw-start.sh normalizes those perms before gateway launch when
- *      shields are not UP.
- *   4. `shields down` restores 660/2770 instead of the old 600/700.
- *
- * Result: writes succeed in default mode without an EACCES swallow.
- *
- * These tests lock the structural invariants so a future change can't
- * silently regress to the swallow approach.
- */
-
+import { spawnSync } from "node:child_process";
 import fs from "node:fs";
+import os from "node:os";
 import path from "node:path";
 import { describe, expect, it } from "vitest";
 
 const ROOT = path.join(import.meta.dirname, "..");
-const DOCKERFILE_BASE = fs.readFileSync(path.join(ROOT, "Dockerfile.base"), "utf-8");
-const DOCKERFILE = fs.readFileSync(path.join(ROOT, "Dockerfile"), "utf-8");
-const NEMOCLAW_START = fs.readFileSync(
-  path.join(ROOT, "scripts", "nemoclaw-start.sh"),
-  "utf-8",
-);
-const SHIELDS_TS = fs.readFileSync(path.join(ROOT, "src", "lib", "shields.ts"), "utf-8");
-
-describe("Issue #2681 — group-writable mutable-default contract", () => {
-  it("Dockerfile.base adds gateway to the sandbox group", () => {
-    expect(DOCKERFILE_BASE).toMatch(/usermod\s+-aG\s+sandbox\s+gateway/);
-  });
 
-  it("Dockerfile.base makes /sandbox/.openclaw group-writable + setgid", () => {
-    expect(DOCKERFILE_BASE).toMatch(/chmod\s+-R\s+g\+w\s+\/sandbox\/\.openclaw/);
-    expect(DOCKERFILE_BASE).toMatch(
-      /find\s+\/sandbox\/\.openclaw\s+-type\s+d\s+-exec\s+chmod\s+g\+s/,
-    );
-  });
+function readRepoFile(...parts: string[]): string {
+  return fs.readFileSync(path.join(ROOT, ...parts), "utf-8");
+}
 
-  it("Dockerfile has stale-base fallback that idempotently adds gateway to sandbox group", () => {
-    // Older base images won't have the usermod yet; the derived image must
-    // add it at build time so PR images work even before sandbox-base is
-    // rebuilt.
-    expect(DOCKERFILE).toMatch(/id\s+gateway/);
-    expect(DOCKERFILE).toMatch(/usermod\s+-aG\s+sandbox\s+gateway/);
-  });
+function dockerRunCommandBetween(
+  fileParts: string[],
+  startMarker: string,
+  endMarker: string,
+): string {
+  const dockerfile = readRepoFile(...fileParts);
+  const start = dockerfile.indexOf(startMarker);
+  const end = dockerfile.indexOf(endMarker, start);
+  if (start === -1 || end === -1 || end <= start) {
+    throw new Error(`Expected Dockerfile block between ${startMarker} and ${endMarker}`);
+  }
+  const runIndex = dockerfile.indexOf("RUN ", start);
+  if (runIndex === -1 || runIndex > end) {
+    throw new Error(`Expected RUN instruction after ${startMarker}`);
+  }
+  return dockerfile
+    .slice(runIndex, end)
+    .trim()
+    .replace(/^RUN\s+/, "")
+    .replace(/\\\n/g, " ");
+}
 
-  it("Dockerfile applies group-writable + setgid in the production image too", () => {
-    expect(DOCKERFILE).toMatch(/chmod\s+-R\s+g\+w\s+\/sandbox\/\.openclaw/);
-    expect(DOCKERFILE).toMatch(
-      /find\s+\/sandbox\/\.openclaw\s+-type\s+d\s+-exec\s+chmod\s+g\+s/,
-    );
-  });
+function shellFunctionFromFile(
+  fileParts: string[],
+  functionName: string,
+  replaceSandboxWith?: string,
+): string {
+  const source = readRepoFile(...fileParts);
+  const start = source.indexOf(`${functionName}()`);
+  const nextSection = source.indexOf("\n# ──", start + functionName.length);
+  if (start === -1 || nextSection === -1 || nextSection <= start) {
+    throw new Error(`Expected shell function ${functionName}`);
+  }
+  const body = source.slice(start, nextSection).trim();
+  return replaceSandboxWith ? body.replaceAll("/sandbox", replaceSandboxWith) : body;
+}
 
-  it("Dockerfile creates .config-hash group-writable (664), not read-only-for-group (644)", () => {
-    // Aaron's spec item 3 explicitly calls out "group-writable config/hash
-    // files". The .config-hash sha256 is created AFTER the recursive chmod
-    // g+w pass, so it gets its own explicit chmod. Lock it to 664 so a
-    // future change can't silently revert to 644 and break gateway writes.
-    expect(DOCKERFILE).toMatch(/chmod\s+664\s+\/sandbox\/\.openclaw\/\.config-hash/);
-    expect(DOCKERFILE).not.toMatch(/chmod\s+644\s+\/sandbox\/\.openclaw\/\.config-hash/);
+function runScript(
+  tmpDir: string,
+  lines: string[],
+  env: NodeJS.ProcessEnv = {},
+): { result: ReturnType<typeof spawnSync>; log: string } {
+  const logPath = path.join(tmpDir, "calls.log");
+  const scriptPath = path.join(tmpDir, "run.sh");
+  fs.writeFileSync(
+    scriptPath,
+    ["#!/usr/bin/env bash", "set -euo pipefail", `CALL_LOG=${JSON.stringify(logPath)}`, ...lines]
+      .join("\n"),
+    { mode: 0o700 },
+  );
+  const result = spawnSync("bash", [scriptPath], {
+    encoding: "utf-8",
+    env: { ...process.env, ...env },
+    timeout: 5000,
   });
+  const log = fs.existsSync(logPath) ? fs.readFileSync(logPath, "utf-8") : "";
+  return { result, log };
+}
 
-  it("Dockerfile does NOT introduce a mutateConfigFile EACCES swallow patch", () => {
-    // The PR explicitly replaces #2693's approach. If a future PR adds
-    // Patch 4b back, this test fails and forces re-evaluation.
-    expect(DOCKERFILE).not.toMatch(/Patch\s+4b/);
-    expect(DOCKERFILE).not.toMatch(/mutateConfigFile.*EACCES/);
-    expect(DOCKERFILE).not.toMatch(/mutation not persisted/);
-  });
+function rewriteSandbox(command: string, sandboxRoot: string): string {
+  return command.replaceAll("/sandbox", sandboxRoot);
+}
 
-  it("nemoclaw-start.sh defines normalize_mutable_config_perms", () => {
-    expect(NEMOCLAW_START).toMatch(/normalize_mutable_config_perms\s*\(\)/);
-  });
+function mode(pathname: string): number {
+  return fs.statSync(pathname).mode & 0o7777;
+}
 
-  it("normalize_mutable_config_perms skips when shields are UP (root-owned config dir)", () => {
-    // The function must check ownership before chmod-ing; if shields are
-    // up the dir is root-owned and normalizing would weaken the lock.
-    const fnIdx = NEMOCLAW_START.indexOf("normalize_mutable_config_perms()");
-    expect(fnIdx).toBeGreaterThan(0);
-    const fnBody = NEMOCLAW_START.slice(fnIdx, fnIdx + 1500);
-    expect(fnBody).toMatch(/stat\s+-c\s+'%U'/);
-    expect(fnBody).toMatch(/= "root"/);
-  });
+describe("Issue #2681 group-writable mutable-default contract", () => {
+  it("executes base-image user and layout setup with shared sandbox-group access", () => {
+    const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-2681-base-"));
+    const sandboxRoot = path.join(tmpDir, "sandbox");
 
-  it("normalize_mutable_config_perms is called in the root-mode startup path", () => {
-    expect(NEMOCLAW_START).toMatch(/normalize_mutable_config_perms\b[^(]/);
+    try {
+      fs.mkdirSync(sandboxRoot, { recursive: true });
+      const users = runScript(tmpDir, [
+        'groupadd() { printf "groupadd %s\\n" "$*" >> "$CALL_LOG"; }',
+        'useradd() { printf "useradd %s\\n" "$*" >> "$CALL_LOG"; }',
+        'usermod() { printf "usermod %s\\n" "$*" >> "$CALL_LOG"; }',
+        'chown() { printf "chown %s\\n" "$*" >> "$CALL_LOG"; }',
+        rewriteSandbox(
+          dockerRunCommandBetween(
+            ["Dockerfile.base"],
+            "# Create sandbox user",
+            "# Create .openclaw",
+          ),
+          sandboxRoot,
+        ),
+      ]);
+      expect(users.result.status).toBe(0);
+      expect(users.log).toContain("usermod -aG sandbox gateway");
+
+      const layout = runScript(tmpDir, [
+        'chown() { printf "chown %s\\n" "$*" >> "$CALL_LOG"; }',
+        rewriteSandbox(
+          dockerRunCommandBetween(
+            ["Dockerfile.base"],
+            "# Create .openclaw with all state subdirs directly",
+            "# Pre-create shell init files",
+          ),
+          sandboxRoot,
+        ),
+      ]);
+      const openclawDir = path.join(sandboxRoot, ".openclaw");
+      expect(layout.result.status).toBe(0);
+      expect(mode(openclawDir) & 0o020).not.toBe(0);
+      expect(mode(openclawDir) & 0o2000).not.toBe(0);
+    } finally {
+      fs.rmSync(tmpDir, { recursive: true, force: true });
+    }
   });
 
-  it("shields.ts unlock path uses group-writable file mode (660) + setgid dir (2770) for openclaw", () => {
-    // Pre-#2681 openclaw unlock used 600/700 which stripped group-write.
-    // After this PR openclaw uses 660/2770 so the gateway UID (member of
-    // sandbox group) can write OpenClaw config. Hermes is left unchanged
-    // (no separate gateway UID, so the shared-group contract doesn't apply).
-    expect(SHIELDS_TS).toMatch(/agentName === "hermes" \? "640" : "660"/);
-    expect(SHIELDS_TS).toMatch(/agentName === "hermes" \? "750" : "2770"/);
+  it("executes production-image fallback setup without losing group-writable config files", () => {
+    const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-2681-prod-"));
+    const sandboxRoot = path.join(tmpDir, "sandbox");
+    const openclawDir = path.join(sandboxRoot, ".openclaw");
+
+    try {
+      fs.mkdirSync(openclawDir, { recursive: true });
+      fs.writeFileSync(path.join(openclawDir, "openclaw.json"), "{}\n");
+
+      const fallback = runScript(tmpDir, [
+        'id() { if [ "${1:-}" = "-nG" ]; then printf "gateway\\n"; return 0; fi; return 0; }',
+        'usermod() { printf "usermod %s\\n" "$*" >> "$CALL_LOG"; }',
+        dockerRunCommandBetween(
+          ["Dockerfile"],
+          "# Stale-base fallback for the gateway-in-sandbox-group setup",
+          "# Keep the image readable",
+        ),
+      ]);
+      expect(fallback.result.status).toBe(0);
+      expect(fallback.log).toContain("usermod -aG sandbox gateway");
+
+      const layout = runScript(tmpDir, [
+        'chown() { printf "chown %s\\n" "$*" >> "$CALL_LOG"; }',
+        rewriteSandbox(
+          dockerRunCommandBetween(
+            ["Dockerfile"],
+            "# `chmod g+w` + setgid",
+            "# Pin config hash",
+          ),
+          sandboxRoot,
+        ),
+      ]);
+      expect(layout.result.status).toBe(0);
+      expect(mode(openclawDir) & 0o020).not.toBe(0);
+      expect(mode(openclawDir) & 0o2000).not.toBe(0);
+
+      const hash = runScript(tmpDir, [
+        'chown() { printf "chown %s\\n" "$*" >> "$CALL_LOG"; }',
+        'sha256sum() { printf "hash  %s\\n" "$1"; }',
+        rewriteSandbox(
+          dockerRunCommandBetween(
+            ["Dockerfile"],
+            "# Pin config hash",
+            "# DAC-protect .nemoclaw directory",
+          ),
+          sandboxRoot,
+        ),
+      ]);
+      expect(hash.result.status).toBe(0);
+      expect((mode(path.join(openclawDir, ".config-hash")) & 0o777).toString(8)).toBe("664");
+    } finally {
+      fs.rmSync(tmpDir, { recursive: true, force: true });
+    }
   });
 
-  it("applyStateDirLockMode re-adds group-write when unlocking (shields down)", () => {
-    // The chmod in the unlock path must explicitly RE-ADD group-write,
-    // not just preserve it. A prior `chmod -R go-w` from shields-up
-    // already stripped g+w from descendants, so unlock must use `g+w,o-w`
-    // to restore the group-writable contract on the whole tree.
-    expect(SHIELDS_TS).toMatch(/isLocking\s*\?\s*"go-w"\s*:\s*"g\+w,o-w"/);
+  it("normalizes mutable config permissions only when the config dir is not locked", () => {
+    const runCase = (owner: "root" | "sandbox") => {
+      const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), `nemoclaw-2681-normalize-${owner}-`));
+      const sandboxRoot = path.join(tmpDir, "sandbox");
+      const openclawDir = path.join(sandboxRoot, ".openclaw");
+      try {
+        fs.mkdirSync(path.join(openclawDir, "workspace"), { recursive: true });
+        fs.chmodSync(openclawDir, 0o755);
+        fs.chmodSync(path.join(openclawDir, "workspace"), 0o755);
+        const script = shellFunctionFromFile(
+          ["scripts", "nemoclaw-start.sh"],
+          "normalize_mutable_config_perms",
+          sandboxRoot,
+        );
+        const { result } = runScript(
+          tmpDir,
+          [
+            'id() { if [ "${1:-}" = "-u" ]; then printf "0"; return 0; fi; command id "$@"; }',
+            'stat() { if [ "${1:-}" = "-c" ] || [ "${1:-}" = "-f" ]; then printf "%s\\n" "$CONFIG_OWNER"; return 0; fi; command stat "$@"; }',
+            script,
+            "normalize_mutable_config_perms",
+          ],
+          { CONFIG_OWNER: owner },
+        );
+        return { result, openclawDir, tmpDir };
+      } catch (err) {
+        fs.rmSync(tmpDir, { recursive: true, force: true });
+        throw err;
+      }
+    };
+
+    const locked = runCase("root");
+    try {
+      expect(locked.result.status).toBe(0);
+      expect(mode(locked.openclawDir) & 0o020).toBe(0);
+      expect(mode(locked.openclawDir) & 0o2000).toBe(0);
+    } finally {
+      fs.rmSync(locked.tmpDir, { recursive: true, force: true });
+    }
+
+    const unlocked = runCase("sandbox");
+    try {
+      expect(unlocked.result.status).toBe(0);
+      expect(mode(unlocked.openclawDir) & 0o020).not.toBe(0);
+      expect(mode(unlocked.openclawDir) & 0o2000).not.toBe(0);
+    } finally {
+      fs.rmSync(unlocked.tmpDir, { recursive: true, force: true });
+    }
   });
 });

From cd5be3032b9d2bdbb16b06998364e100893701d6 Mon Sep 17 00:00:00 2001
From: Aaron Erickson <aerickson@nvidia.com>
Date: Fri, 1 May 2026 16:43:32 -0700
Subject: [PATCH 06/14] test(wsl): simulate mutable OpenClaw config owner

Signed-off-by: Aaron Erickson <aerickson@nvidia.com>
---
 test/repro-2681-group-writable.test.ts | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/repro-2681-group-writable.test.ts b/test/repro-2681-group-writable.test.ts
index 9d6701fadc..01c7c344b8 100644
--- a/test/repro-2681-group-writable.test.ts
+++ b/test/repro-2681-group-writable.test.ts
@@ -61,6 +61,7 @@ describe("Issue #2681 — mutable OpenClaw config permissions", () => {
           [
             "set -euo pipefail",
             'id() { if [ "${1:-}" = "-u" ]; then printf "0"; else command id "$@"; fi; }',
+            'stat() { if [ "${1:-}" = "-c" ] && [ "${2:-}" = "%U" ]; then printf "sandbox\\n"; else command stat "$@"; fi; }',
             normalizeMutableConfigPermsFor(configDir),
             "normalize_mutable_config_perms",
           ].join("\n"),

From c15a003254a67d05a140b6cdf59a1d8f9f9aeac1 Mon Sep 17 00:00:00 2001
From: Aaron Erickson <aerickson@nvidia.com>
Date: Fri, 1 May 2026 17:23:15 -0700
Subject: [PATCH 07/14] test(wsl): avoid slow shields subprocess

Signed-off-by: Aaron Erickson <aerickson@nvidia.com>
---
 test/repro-2681-group-writable.test.ts | 99 ++++++++++++++------------
 1 file changed, 55 insertions(+), 44 deletions(-)

diff --git a/test/repro-2681-group-writable.test.ts b/test/repro-2681-group-writable.test.ts
index 01c7c344b8..0538ddcdd9 100644
--- a/test/repro-2681-group-writable.test.ts
+++ b/test/repro-2681-group-writable.test.ts
@@ -16,7 +16,7 @@ import fs from "node:fs";
 import os from "node:os";
 import path from "node:path";
 import { spawnSync } from "node:child_process";
-import { describe, expect, it } from "vitest";
+import { describe, expect, it, vi } from "vitest";
 
 const START_SCRIPT = path.join(import.meta.dirname, "..", "scripts", "nemoclaw-start.sh");
 
@@ -40,6 +40,38 @@ function modeBits(filePath: string): number {
   return fs.statSync(filePath).mode;
 }
 
+function withMockedDockerExecFileSync<T>(calls: string[][], run: () => T): T {
+  // eslint-disable-next-line @typescript-eslint/no-require-imports
+  const dockerExecModule = require("../dist/lib/docker/exec.js") as {
+    dockerExecFileSync: (args: readonly string[]) => string;
+  };
+  const originalDockerExecFileSync = dockerExecModule.dockerExecFileSync;
+  const shieldsModulePath = require.resolve("../dist/lib/shields.js");
+  delete require.cache[shieldsModulePath];
+
+  dockerExecModule.dockerExecFileSync = vi.fn((args: readonly string[]) => {
+    const separator = args.indexOf("--");
+    const command = separator >= 0 ? args.slice(separator + 1) : [...args];
+    calls.push(command);
+    if (command[0] === "stat" && command[1] === "-c") {
+      return command.at(-1) === "/sandbox/.openclaw"
+        ? "2770 sandbox:sandbox\n"
+        : "660 sandbox:sandbox\n";
+    }
+    if (command[0] === "lsattr") {
+      return `---------------------- ${command.at(-1)}\n`;
+    }
+    return "";
+  });
+
+  try {
+    return run();
+  } finally {
+    dockerExecModule.dockerExecFileSync = originalDockerExecFileSync;
+    delete require.cache[shieldsModulePath];
+  }
+}
+
 describe("Issue #2681 — mutable OpenClaw config permissions", () => {
   it("restores group-write and setgid on mutable config trees during root startup", () => {
     const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-2681-perms-"));
@@ -80,50 +112,29 @@ describe("Issue #2681 — mutable OpenClaw config permissions", () => {
   });
 
   it("shields-down restores OpenClaw group-writable file modes and setgid dirs", () => {
-    const probe = spawnSync(
-      process.execPath,
-      [
-        "-e",
-        String.raw`
-const Module = require("node:module");
-const originalLoad = Module._load;
-const calls = [];
-Module._load = function patchedLoad(request, parent, isMain) {
-  if (request === "./docker/exec") {
-    return {
-      dockerExecFileSync(args) {
-        const separator = args.indexOf("--");
-        const command = separator >= 0 ? args.slice(separator + 1) : args;
-        calls.push(command);
-        if (command[0] === "stat" && command[1] === "-c") {
-          return command.at(-1) === "/sandbox/.openclaw"
-            ? "2770 sandbox:sandbox\n"
-            : "660 sandbox:sandbox\n";
-        }
-        if (command[0] === "lsattr") {
-          return "---------------------- " + command.at(-1) + "\n";
-        }
-        return "";
-      },
-    };
-  }
-  return originalLoad.call(this, request, parent, isMain);
-};
-const { unlockAgentConfig } = require("./dist/lib/shields.js");
-unlockAgentConfig("sandbox-pod", {
-  agentName: "openclaw",
-  configPath: "/sandbox/.openclaw/openclaw.json",
-  configDir: "/sandbox/.openclaw",
-  sensitiveFiles: ["/sandbox/.openclaw/.config-hash"],
-});
-process.stdout.write(JSON.stringify(calls));
-`,
-      ],
-      { encoding: "utf-8", timeout: 5000 },
-    );
+    const commands: string[][] = [];
+    withMockedDockerExecFileSync(commands, () => {
+      // eslint-disable-next-line @typescript-eslint/no-require-imports
+      const { unlockAgentConfig } = require("../dist/lib/shields.js") as {
+        unlockAgentConfig: (
+          sandboxName: string,
+          target: {
+            agentName?: string;
+            configPath: string;
+            configDir: string;
+            sensitiveFiles?: string[];
+          },
+        ) => void;
+      };
+
+      unlockAgentConfig("sandbox-pod", {
+        agentName: "openclaw",
+        configPath: "/sandbox/.openclaw/openclaw.json",
+        configDir: "/sandbox/.openclaw",
+        sensitiveFiles: ["/sandbox/.openclaw/.config-hash"],
+      });
+    });
 
-    expect(probe.status).toBe(0);
-    const commands = JSON.parse(probe.stdout) as string[][];
     expect(commands).toContainEqual(["chmod", "660", "/sandbox/.openclaw/openclaw.json"]);
     expect(commands).toContainEqual(["chmod", "660", "/sandbox/.openclaw/.config-hash"]);
     expect(commands).toContainEqual(["chmod", "2770", "/sandbox/.openclaw"]);

From 1d137c6cf2024766a530d0d963c957e76038cb97 Mon Sep 17 00:00:00 2001
From: Aaron Erickson <aerickson@nvidia.com>
Date: Fri, 1 May 2026 17:54:34 -0700
Subject: [PATCH 08/14] fix(recovery): address coderabbit follow-ups

Signed-off-by: Aaron Erickson <aerickson@nvidia.com>
---
 src/lib/agent-runtime.test.ts | 15 +++++---
 src/lib/agent-runtime.ts      | 25 ++++++++-----
 src/nemoclaw.ts               | 66 ++++++++++++++++++++---------------
 test/cli.test.ts              | 43 ++++++++++++++++++++++-
 4 files changed, 107 insertions(+), 42 deletions(-)

diff --git a/src/lib/agent-runtime.test.ts b/src/lib/agent-runtime.test.ts
index c5a2bd09db..b11126d1ef 100644
--- a/src/lib/agent-runtime.test.ts
+++ b/src/lib/agent-runtime.test.ts
@@ -123,19 +123,22 @@ describe("buildRecoveryScript", () => {
 
     it("writes the warning to gateway.log so it persists for sysadmin tail", () => {
       const script = buildRecoveryScript(minimalAgent, 19000);
-      // Both warnings must end up in /tmp/gateway.log, not just stderr —
+      // Both warnings must end up in the selected gateway log, not just stderr —
       // executeSandboxCommand silently discards stderr from the recovery
       // script, so a warning that only goes to stderr is invisible to
       // anyone debugging a crash-loop. (#2478)
-      expect(script).toContain('echo "$_W" >> /tmp/gateway.log');
+      expect(script).toContain('echo "$_W" >> "$_GATEWAY_LOG"');
       // And the warning must be deferred until AFTER gateway.log is
       // safely opened with O_NOFOLLOW, otherwise the redirect targets a
       // stale or attacker-controlled file.
       const gatewayPrepIdx = script!.indexOf(" /tmp/gateway.log || exit 1;");
-      const warnIdx = script!.indexOf('echo "$_W" >> /tmp/gateway.log');
+      const logSelectionIdx = script!.indexOf("_GATEWAY_LOG=/tmp/gateway.log");
+      const warnIdx = script!.indexOf('echo "$_W" >> "$_GATEWAY_LOG"');
       expect(gatewayPrepIdx).toBeGreaterThanOrEqual(0);
+      expect(logSelectionIdx).toBeGreaterThanOrEqual(0);
       expect(warnIdx).toBeGreaterThanOrEqual(0);
-      expect(gatewayPrepIdx).toBeLessThan(warnIdx);
+      expect(gatewayPrepIdx).toBeLessThan(logSelectionIdx);
+      expect(logSelectionIdx).toBeLessThan(warnIdx);
     });
 
     it("stops recovery when hardened log setup fails", () => {
@@ -158,6 +161,10 @@ describe("buildRecoveryScript", () => {
       expect(script).not.toContain("rm -f /tmp/gateway.log");
       expect(script).toContain("_GATEWAY_LOG=/tmp/gateway.log");
       expect(script).toContain("_GATEWAY_LOG=/tmp/gateway-recovery.log");
+      expect(script).toContain('echo "$_W" >> "$_GATEWAY_LOG"');
+      expect(script).toContain('tail -5 "$_GATEWAY_LOG"');
+      expect(script).not.toContain('echo "$_W" >> /tmp/gateway.log');
+      expect(script).not.toContain("cat /tmp/gateway.log");
     });
 
     it("rejects a symlinked gateway.log before preparing the log", () => {
diff --git a/src/lib/agent-runtime.ts b/src/lib/agent-runtime.ts
index 71dc29b9a0..a4ae7e7dc1 100644
--- a/src/lib/agent-runtime.ts
+++ b/src/lib/agent-runtime.ts
@@ -108,12 +108,17 @@ function buildGatewayLogSetup(includeAutoPairLog = false, logOwnerUser?: string)
   return lines;
 }
 
+function buildGatewayLogSelection(): string {
+  return '_GATEWAY_LOG=/tmp/gateway.log; if ! : >> "$_GATEWAY_LOG" 2>/dev/null; then _GATEWAY_LOG=/tmp/gateway-recovery.log; : >> "$_GATEWAY_LOG" 2>/dev/null || true; fi;';
+}
+
 function gatewayLaunchCommand(command: string, runAsUser?: string): string {
-  const userLaunch = `_GATEWAY_LOG=/tmp/gateway.log; if ! : >> "$_GATEWAY_LOG" 2>/dev/null; then _GATEWAY_LOG=/tmp/gateway-recovery.log; : > "$_GATEWAY_LOG" 2>/dev/null || true; fi; nohup ${command} >> "$_GATEWAY_LOG" 2>&1 &`;
+  const logSelection = buildGatewayLogSelection();
+  const userLaunch = `nohup ${command} >> "$_GATEWAY_LOG" 2>&1 &`;
   if (!runAsUser) {
-    return userLaunch;
+    return `${logSelection} ${userLaunch}`;
   }
-  return `if [ "$(id -u)" = "0" ] && command -v gosu >/dev/null 2>&1 && id ${shellQuote(runAsUser)} >/dev/null 2>&1; then nohup gosu ${shellQuote(runAsUser)} ${command} >> /tmp/gateway.log 2>&1 & else ${userLaunch} fi`;
+  return `${logSelection} if [ "$(id -u)" = "0" ] && command -v gosu >/dev/null 2>&1 && id ${shellQuote(runAsUser)} >/dev/null 2>&1; then nohup gosu ${shellQuote(runAsUser)} ${command} >> "$_GATEWAY_LOG" 2>&1 & else ${userLaunch} fi`;
 }
 
 /**
@@ -127,13 +132,14 @@ export function buildOpenClawRecoveryScript(port: number): string {
     `if curl -sf --max-time 3 http://127.0.0.1:${port}/ > /dev/null 2>&1; then echo ALREADY_RUNNING; exit 0; fi;`,
     "rm -rf /tmp/openclaw-*/gateway.*.lock 2>/dev/null;",
     ...buildGatewayLogSetup(true, "gateway"),
-    '[ "$_PE_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing - gateway launching without library guards (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };',
-    '[ "$_GUARDS_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: NODE_OPTIONS missing safety-net preload or ciao preload - gateway may crash on unhandled library errors (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };',
+    buildGatewayLogSelection(),
+    '[ "$_PE_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing - gateway launching without library guards (#2478)"; echo "$_W" >&2; echo "$_W" >> "$_GATEWAY_LOG"; };',
+    '[ "$_GUARDS_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: NODE_OPTIONS missing safety-net preload or ciao preload - gateway may crash on unhandled library errors (#2478)"; echo "$_W" >&2; echo "$_W" >> "$_GATEWAY_LOG"; };',
     'OPENCLAW="$(command -v openclaw)";',
     'if [ -z "$OPENCLAW" ]; then echo OPENCLAW_MISSING; exit 1; fi;',
     gatewayLaunchCommand('"$OPENCLAW" gateway run --port ' + port, "gateway"),
     "GPID=$!; sleep 2;",
-    'if kill -0 "$GPID" 2>/dev/null; then echo "GATEWAY_PID=$GPID"; else echo GATEWAY_FAILED; cat /tmp/gateway.log 2>/dev/null | tail -5; fi',
+    'if kill -0 "$GPID" 2>/dev/null; then echo "GATEWAY_PID=$GPID"; else echo GATEWAY_FAILED; tail -5 "$_GATEWAY_LOG" 2>/dev/null; fi',
   ].join(" ");
 }
 
@@ -196,12 +202,13 @@ export function buildRecoveryScript(agent: AgentDefinition | null, port: number)
     hermesHome,
     `if curl -sf --max-time 3 ${shellQuote(probeUrl)} > /dev/null 2>&1; then echo ALREADY_RUNNING; exit 0; fi;`,
     ...buildGatewayLogSetup(false),
-    '[ "$_PE_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing - gateway launching without library guards (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };',
-    '[ "$_GUARDS_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: NODE_OPTIONS missing safety-net preload or ciao preload - gateway may crash on unhandled library errors (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };',
+    buildGatewayLogSelection(),
+    '[ "$_PE_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing - gateway launching without library guards (#2478)"; echo "$_W" >&2; echo "$_W" >> "$_GATEWAY_LOG"; };',
+    '[ "$_GUARDS_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: NODE_OPTIONS missing safety-net preload or ciao preload - gateway may crash on unhandled library errors (#2478)"; echo "$_W" >&2; echo "$_W" >> "$_GATEWAY_LOG"; };',
     ...validationSteps,
     launchCommand,
     "GPID=$!; sleep 2;",
-    'if kill -0 "$GPID" 2>/dev/null; then echo "GATEWAY_PID=$GPID"; else echo GATEWAY_FAILED; cat /tmp/gateway.log 2>/dev/null | tail -5; fi',
+    'if kill -0 "$GPID" 2>/dev/null; then echo "GATEWAY_PID=$GPID"; else echo GATEWAY_FAILED; tail -5 "$_GATEWAY_LOG" 2>/dev/null; fi',
   ].join(" ");
 }
 
diff --git a/src/nemoclaw.ts b/src/nemoclaw.ts
index 435492758e..91ef88d161 100644
--- a/src/nemoclaw.ts
+++ b/src/nemoclaw.ts
@@ -1223,6 +1223,12 @@ type SandboxConnectOptions = {
   probeOnly?: boolean;
 };
 
+const SANDBOX_CONNECT_FLAGS = new Set(["--dangerously-skip-permissions", "--probe-only", "--help", "-h"]);
+
+function isSandboxConnectFlag(arg: string | undefined): boolean {
+  return typeof arg === "string" && SANDBOX_CONNECT_FLAGS.has(arg);
+}
+
 function printSandboxConnectHelp(sandboxName = "<name>") {
   console.log("");
   console.log(`  Usage: ${CLI_NAME} ${sandboxName} connect [--probe-only]`);
@@ -1238,6 +1244,11 @@ function printSandboxConnectHelp(sandboxName = "<name>") {
 function parseSandboxConnectArgs(sandboxName: string, actionArgs: string[]): SandboxConnectOptions {
   const options: SandboxConnectOptions = {};
   for (const arg of actionArgs) {
+    if (!isSandboxConnectFlag(arg)) {
+      console.error(`  Unknown flag for connect: ${arg}`);
+      printSandboxConnectHelp(sandboxName);
+      process.exit(1);
+    }
     switch (arg) {
       case "--dangerously-skip-permissions":
         console.error("  --dangerously-skip-permissions was removed; use shields commands instead.");
@@ -1251,15 +1262,36 @@ function parseSandboxConnectArgs(sandboxName: string, actionArgs: string[]): San
         printSandboxConnectHelp(sandboxName);
         process.exit(0);
         break;
-      default:
-        console.error(`  Unknown flag for connect: ${arg}`);
-        printSandboxConnectHelp(sandboxName);
-        process.exit(1);
     }
   }
   return options;
 }
 
+function runSandboxConnectProbe(sandboxName: string): void {
+  const processCheck = checkAndRecoverSandboxProcesses(sandboxName, { quiet: true });
+  const agent = agentRuntime.getSessionAgent(sandboxName);
+  const agentName = agentRuntime.getAgentDisplayName(agent);
+  if (!processCheck.checked) {
+    console.error(
+      `  Probe failed: could not inspect the ${agentName} gateway inside sandbox '${sandboxName}'.`,
+    );
+    process.exit(1);
+  }
+  if (processCheck.wasRunning) {
+    console.log(`  Probe complete: ${agentName} gateway is running in '${sandboxName}'.`);
+    return;
+  }
+  if (processCheck.recovered) {
+    console.log(`  Probe complete: recovered ${agentName} gateway in '${sandboxName}'.`);
+    return;
+  }
+  console.error(
+    `  Probe failed: ${agentName} gateway is not running in '${sandboxName}' and automatic recovery failed.`,
+  );
+  console.error("  Check /tmp/gateway.log inside the sandbox for details.");
+  process.exit(1);
+}
+
 async function sandboxConnect(
   sandboxName: string,
   { probeOnly = false }: SandboxConnectOptions = {},
@@ -1268,28 +1300,7 @@ async function sandboxConnect(
   await ensureLiveSandboxOrExit(sandboxName, { allowNonReadyPhase: true });
 
   if (probeOnly) {
-    const processCheck = checkAndRecoverSandboxProcesses(sandboxName, { quiet: true });
-    const agent = agentRuntime.getSessionAgent(sandboxName);
-    const agentName = agentRuntime.getAgentDisplayName(agent);
-    if (!processCheck.checked) {
-      console.error(
-        `  Probe failed: could not inspect the ${agentName} gateway inside sandbox '${sandboxName}'.`,
-      );
-      process.exit(1);
-    }
-    if (processCheck.wasRunning) {
-      console.log(`  Probe complete: ${agentName} gateway is running in '${sandboxName}'.`);
-      return;
-    }
-    if (processCheck.recovered) {
-      console.log(`  Probe complete: recovered ${agentName} gateway in '${sandboxName}'.`);
-      return;
-    }
-    console.error(
-      `  Probe failed: ${agentName} gateway is not running in '${sandboxName}' and automatic recovery failed.`,
-    );
-    console.error("  Check /tmp/gateway.log inside the sandbox for details.");
-    process.exit(1);
+    return runSandboxConnectProbe(sandboxName);
   }
 
   // Version staleness check — warn but don't block
@@ -4151,8 +4162,7 @@ const [cmd, ...args] = process.argv.slice(2);
 
   // Sandbox-scoped commands: nemoclaw <name> <action>
   const firstSandboxArg = args[0];
-  const implicitConnectArg =
-    firstSandboxArg === "--help" || firstSandboxArg === "-h" || firstSandboxArg === "--probe-only";
+  const implicitConnectArg = isSandboxConnectFlag(firstSandboxArg);
   const requestedSandboxAction =
     !firstSandboxArg || implicitConnectArg ? "connect" : firstSandboxArg;
   const requestedSandboxActionArgs = !firstSandboxArg || implicitConnectArg ? args : args.slice(1);
diff --git a/test/cli.test.ts b/test/cli.test.ts
index f3ca38fed8..bed704109f 100644
--- a/test/cli.test.ts
+++ b/test/cli.test.ts
@@ -94,7 +94,33 @@ function readRecordedArgs(markerFile: string): string[] {
   return fs.readFileSync(markerFile, "utf8").trim().split(/\s+/);
 }
 
-function writeSandboxRegistry(home: string, sandboxOverrides: Record<string, unknown> = {}): void {
+type SandboxEntry = {
+  name: string;
+  model: string;
+  provider: string;
+  gpuEnabled: boolean;
+  policies: string[];
+  agent?: string;
+};
+
+function writeRecordingCommand(
+  binDir: string,
+  command: string,
+  markerFile: string,
+  exitCode: number,
+): void {
+  fs.writeFileSync(
+    path.join(binDir, command),
+    [
+      "#!/usr/bin/env bash",
+      `printf '%s\\n' "$*" >> ${JSON.stringify(markerFile)}`,
+      `exit ${exitCode}`,
+    ].join("\n"),
+    { mode: 0o755 },
+  );
+}
+
+function writeSandboxRegistry(home: string, sandboxOverrides: Partial<SandboxEntry> = {}): void {
   const registryDir = path.join(home, ".nemoclaw");
   fs.mkdirSync(registryDir, { recursive: true });
   fs.writeFileSync(
@@ -1609,6 +1635,7 @@ describe("CLI dispatch", () => {
     const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-help-"));
     const localBin = path.join(home, "bin");
     const markerFile = path.join(home, "openshell-calls");
+    const sshMarkerFile = path.join(home, "ssh-calls");
     fs.mkdirSync(localBin, { recursive: true });
     fs.writeFileSync(
       path.join(localBin, "openshell"),
@@ -1619,6 +1646,7 @@ describe("CLI dispatch", () => {
       ].join("\n"),
       { mode: 0o755 },
     );
+    writeRecordingCommand(localBin, "ssh", sshMarkerFile, 98);
 
     const r = runWithEnv("alpha connect --help", {
       HOME: home,
@@ -1635,12 +1663,14 @@ describe("CLI dispatch", () => {
     expect(implicit.code).toBe(0);
     expect(implicit.out).toContain("Usage: nemoclaw alpha connect");
     expect(fs.existsSync(markerFile)).toBe(false);
+    expect(fs.existsSync(sshMarkerFile)).toBe(false);
   });
 
   it("rejects the removed skip-permissions connect flag", () => {
     const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-flags-"));
     const localBin = path.join(home, "bin");
     const markerFile = path.join(home, "openshell-calls");
+    const sshMarkerFile = path.join(home, "ssh-calls");
     fs.mkdirSync(localBin, { recursive: true });
     fs.writeFileSync(
       path.join(localBin, "openshell"),
@@ -1651,6 +1681,7 @@ describe("CLI dispatch", () => {
       ].join("\n"),
       { mode: 0o755 },
     );
+    writeRecordingCommand(localBin, "ssh", sshMarkerFile, 98);
     writeSandboxRegistry(home);
 
     const r = runWithEnv("alpha connect --dangerously-skip-permissions", {
@@ -1661,12 +1692,14 @@ describe("CLI dispatch", () => {
     expect(r.code).toBe(1);
     expect(r.out).toContain("--dangerously-skip-permissions was removed");
     expect(fs.existsSync(markerFile)).toBe(false);
+    expect(fs.existsSync(sshMarkerFile)).toBe(false);
   });
 
   it("connect --probe-only recovers the gateway without opening SSH", () => {
     const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-"));
     const localBin = path.join(home, "bin");
     const markerFile = path.join(home, "openshell-calls");
+    const sshMarkerFile = path.join(home, "ssh-calls");
     const stateFile = path.join(home, "probe-state");
     fs.mkdirSync(localBin, { recursive: true });
     writeSandboxRegistry(home);
@@ -1707,6 +1740,7 @@ describe("CLI dispatch", () => {
       ].join("\n"),
       { mode: 0o755 },
     );
+    writeRecordingCommand(localBin, "ssh", sshMarkerFile, 98);
 
     const r = runWithEnv("alpha connect --probe-only", {
       HOME: home,
@@ -1720,12 +1754,14 @@ describe("CLI dispatch", () => {
     expect(calls.some((call) => call.startsWith("sandbox exec --name alpha -- sh -c"))).toBe(true);
     expect(calls).not.toContain("sandbox ssh-config alpha");
     expect(calls).not.toContain("sandbox connect alpha");
+    expect(fs.existsSync(sshMarkerFile)).toBe(false);
   });
 
   it("treats leading --probe-only as an implicit connect probe", () => {
     const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-leading-"));
     const localBin = path.join(home, "bin");
     const markerFile = path.join(home, "openshell-calls");
+    const sshMarkerFile = path.join(home, "ssh-calls");
     fs.mkdirSync(localBin, { recursive: true });
     writeSandboxRegistry(home);
     fs.writeFileSync(
@@ -1752,6 +1788,7 @@ describe("CLI dispatch", () => {
       ].join("\n"),
       { mode: 0o755 },
     );
+    writeRecordingCommand(localBin, "ssh", sshMarkerFile, 98);
 
     const r = runWithEnv("alpha --probe-only", {
       HOME: home,
@@ -1765,12 +1802,14 @@ describe("CLI dispatch", () => {
     expect(calls.some((call) => call.startsWith("sandbox exec --name alpha -- sh -c"))).toBe(true);
     expect(calls).not.toContain("sandbox ssh-config alpha");
     expect(calls).not.toContain("sandbox connect alpha");
+    expect(fs.existsSync(sshMarkerFile)).toBe(false);
   });
 
   it("connect --probe-only does not retry a failed sandbox exec recovery over SSH", () => {
     const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-no-ssh-"));
     const localBin = path.join(home, "bin");
     const markerFile = path.join(home, "openshell-calls");
+    const sshMarkerFile = path.join(home, "ssh-calls");
     fs.mkdirSync(localBin, { recursive: true });
     writeSandboxRegistry(home);
     fs.writeFileSync(
@@ -1801,6 +1840,7 @@ describe("CLI dispatch", () => {
       ].join("\n"),
       { mode: 0o755 },
     );
+    writeRecordingCommand(localBin, "ssh", sshMarkerFile, 98);
 
     const r = runWithEnv("alpha connect --probe-only", {
       HOME: home,
@@ -1812,6 +1852,7 @@ describe("CLI dispatch", () => {
     expect(calls).toContain("sandbox get alpha");
     expect(calls.some((call) => call.startsWith("sandbox exec --name alpha -- sh -c"))).toBe(true);
     expect(calls).not.toContain("sandbox ssh-config alpha");
+    expect(fs.existsSync(sshMarkerFile)).toBe(false);
   });
 
   it("connect --probe-only falls back to SSH when sandbox exec never starts", () => {

From 61bb8458dfbbd7378f17ef40b95896d1243dfaba Mon Sep 17 00:00:00 2001
From: Aaron Erickson <aerickson@nvidia.com>
Date: Fri, 1 May 2026 18:17:06 -0700
Subject: [PATCH 09/14] fix(recovery): stabilize e2e follow-ups

Signed-off-by: Aaron Erickson <aerickson@nvidia.com>
---
 src/lib/agent-onboard.test.ts   | 24 +++++++++++++++--
 src/lib/agent-onboard.ts        |  9 ++++---
 src/lib/agent-runtime.test.ts   |  6 +++++
 src/lib/agent-runtime.ts        |  2 +-
 src/lib/shields.ts              |  9 +++++++
 test/e2e/test-shields-config.sh | 48 ++++++++++++++++++++++-----------
 test/shields.test.ts            |  3 +++
 7 files changed, 78 insertions(+), 23 deletions(-)

diff --git a/src/lib/agent-onboard.test.ts b/src/lib/agent-onboard.test.ts
index 6e00cf830a..5b1bb0a110 100644
--- a/src/lib/agent-onboard.test.ts
+++ b/src/lib/agent-onboard.test.ts
@@ -5,7 +5,7 @@ import { describe, it, expect, beforeEach, afterEach, afterAll, vi } from "vites
 import fs from "node:fs";
 import path from "node:path";
 // Import from compiled dist/ so coverage is attributed correctly.
-import { printDashboardUi } from "../../dist/lib/agent-onboard";
+import { printDashboardUi, verifyAgentBinaryAvailable } from "../../dist/lib/agent-onboard";
 import type { AgentDefinition } from "./agent-defs";
 
 function makeAgent(overrides: Partial<AgentDefinition> = {}): AgentDefinition {
@@ -128,10 +128,13 @@ describe("handleAgentSetup guards", () => {
     const source = fs.readFileSync(path.join(import.meta.dirname, "agent-onboard.ts"), "utf-8");
 
     expect(source).toContain("verifyAgentBinaryAvailable");
+    expect(source).toContain("[ -e ${shellQuote(binaryPath)} ]");
+    expect(source).toContain("[ -x ${shellQuote(binaryPath)} ]");
     expect(source).toContain(
       'resolved="$(command -v ${shellQuote(executable)} 2>/dev/null || true)"',
     );
-    expect(source).toContain('[ "$resolved" = ${shellQuote(binaryPath)} ]');
+    expect(source).toContain('[ -z "$resolved" ] || [ "$resolved" = ${shellQuote(binaryPath)} ]');
+    expect(source).not.toContain('[ -n "$resolved" ] || { echo not_found');
     expect(source).toMatch(
       /"sandbox",\s*"exec",\s*"-n",\s*sandboxName,\s*"--",\s*"sh",\s*"-lc",\s*script/,
     );
@@ -150,4 +153,21 @@ describe("handleAgentSetup guards", () => {
     expect(source).toContain('parsed.status === "ok"');
     expect(source).not.toContain('.includes("ok")');
   });
+
+  it("accepts an executable configured binary path when PATH lookup is empty", () => {
+    let script = "";
+    const result = verifyAgentBinaryAvailable(
+      "alpha",
+      makeAgent({ name: "hermes", binary_path: "/usr/local/bin/hermes" }),
+      (args) => {
+        script = String(args[7] || "");
+        return "ok";
+      },
+    );
+
+    expect(result).toEqual({ available: true });
+    expect(script).toContain("[ -e '/usr/local/bin/hermes' ]");
+    expect(script).toContain("[ -x '/usr/local/bin/hermes' ]");
+    expect(script).toContain('[ -z "$resolved" ] || [ "$resolved" = \'/usr/local/bin/hermes\' ]');
+  });
 });
diff --git a/src/lib/agent-onboard.ts b/src/lib/agent-onboard.ts
index c5bad2dff1..d67af0801f 100644
--- a/src/lib/agent-onboard.ts
+++ b/src/lib/agent-onboard.ts
@@ -129,7 +129,8 @@ type AgentBinaryAvailability =
       resolvedPath?: string;
     };
 
-function verifyAgentBinaryAvailable(
+// Exported for unit coverage of the sandbox-side guard without running onboarding.
+export function verifyAgentBinaryAvailable(
   sandboxName: string,
   agent: AgentDefinition,
   runCaptureOpenshell: OnboardContext["runCaptureOpenshell"],
@@ -138,10 +139,10 @@ function verifyAgentBinaryAvailable(
   const binaryPath = typeof agent.binary_path === "string" ? agent.binary_path.trim() : "";
   const script = binaryPath
     ? [
-        `resolved="$(command -v ${shellQuote(executable)} 2>/dev/null || true)"`,
-        `[ -n "$resolved" ] || { echo not_found; exit 1; }`,
+        `[ -e ${shellQuote(binaryPath)} ] || { echo not_found; exit 1; }`,
         `[ -x ${shellQuote(binaryPath)} ] || { echo not_executable; exit 1; }`,
-        `[ "$resolved" = ${shellQuote(binaryPath)} ] || { printf 'path_mismatch:%s\\n' "$resolved"; exit 1; }`,
+        `resolved="$(command -v ${shellQuote(executable)} 2>/dev/null || true)"`,
+        `[ -z "$resolved" ] || [ "$resolved" = ${shellQuote(binaryPath)} ] || { printf 'path_mismatch:%s\\n' "$resolved"; exit 1; }`,
         "echo ok",
       ].join(" && ")
     : `command -v ${shellQuote(executable)} >/dev/null 2>&1 && echo ok || echo not_found`;
diff --git a/src/lib/agent-runtime.test.ts b/src/lib/agent-runtime.test.ts
index b11126d1ef..965729842f 100644
--- a/src/lib/agent-runtime.test.ts
+++ b/src/lib/agent-runtime.test.ts
@@ -194,6 +194,12 @@ describe("buildRecoveryScript", () => {
       expect(script).toContain("gosu 'gateway'");
     });
 
+    it("terminates the conditional launch branch before capturing the gateway pid", () => {
+      const script = buildOpenClawRecoveryScript(18789);
+      expect(script).toContain(" fi; GPID=$!");
+      expect(script).not.toContain(" fi GPID=$!");
+    });
+
     it("prepares auto-pair.log without unlinking or following symlinks", () => {
       const script = buildOpenClawRecoveryScript(18789);
       expect(script).toContain("refusing to prepare symlinked /tmp/auto-pair.log");
diff --git a/src/lib/agent-runtime.ts b/src/lib/agent-runtime.ts
index a4ae7e7dc1..7d01f3e05a 100644
--- a/src/lib/agent-runtime.ts
+++ b/src/lib/agent-runtime.ts
@@ -118,7 +118,7 @@ function gatewayLaunchCommand(command: string, runAsUser?: string): string {
   if (!runAsUser) {
     return `${logSelection} ${userLaunch}`;
   }
-  return `${logSelection} if [ "$(id -u)" = "0" ] && command -v gosu >/dev/null 2>&1 && id ${shellQuote(runAsUser)} >/dev/null 2>&1; then nohup gosu ${shellQuote(runAsUser)} ${command} >> "$_GATEWAY_LOG" 2>&1 & else ${userLaunch} fi`;
+  return `${logSelection} if [ "$(id -u)" = "0" ] && command -v gosu >/dev/null 2>&1 && id ${shellQuote(runAsUser)} >/dev/null 2>&1; then nohup gosu ${shellQuote(runAsUser)} ${command} >> "$_GATEWAY_LOG" 2>&1 & else ${userLaunch} fi;`;
 }
 
 /**
diff --git a/src/lib/shields.ts b/src/lib/shields.ts
index f53d281cbe..554bb87e30 100644
--- a/src/lib/shields.ts
+++ b/src/lib/shields.ts
@@ -279,6 +279,9 @@ function applyStateDirLockMode(sandboxName: string, configDir: string, owner: st
     }
     try {
       kubectlExec(sandboxName, ["chmod", dirMode, dirPath]);
+      if (isLocking) {
+        kubectlExec(sandboxName, ["chmod", "g-s", dirPath]);
+      }
       kubectlExec(sandboxName, ["chmod", "-R", writeStrip, dirPath]);
     } catch {
       // Silently skip
@@ -301,6 +304,7 @@ for dir in "$config_dir"/workspace-*; do
   [ -d "$dir" ] || continue
   chown -R "$owner" "$dir" 2>/dev/null || true
   chmod "$dir_mode" "$dir" 2>/dev/null || true
+  if [ "$dir_mode" = "755" ]; then chmod g-s "$dir" 2>/dev/null || true; fi
   chmod -R "$write_strip" "$dir" 2>/dev/null || true
 done
 `,
@@ -483,6 +487,11 @@ function lockAgentConfig(
   } catch {
     errors.push("chmod 755 config dir");
   }
+  try {
+    kubectlExec(sandboxName, ["chmod", "g-s", target.configDir]);
+  } catch {
+    errors.push("chmod g-s config dir");
+  }
 
   try {
     kubectlExec(sandboxName, ["chown", "root:root", target.configDir]);
diff --git a/test/e2e/test-shields-config.sh b/test/e2e/test-shields-config.sh
index 061741b430..499584c6c7 100755
--- a/test/e2e/test-shields-config.sh
+++ b/test/e2e/test-shields-config.sh
@@ -156,16 +156,22 @@ pass "NemoClaw installed (sandbox: $SANDBOX_NAME)"
 # ══════════════════════════════════════════════════════════════════
 section "Phase 2: Config is writable (mutable default)"
 
-# Verify file permissions — should start as 600 sandbox:sandbox
+# Verify file permissions — fresh OpenClaw sandboxes start sandbox-owned.
+# Depending on whether the image entrypoint ran as root, the mutable-default
+# tree may already include the #2681 group-write/setgid contract or may only
+# gain it after the first host-side shields-down.
 PERMS=$(openshell sandbox exec --name "${SANDBOX_NAME}" -- \
   stat -c '%a %U:%G' "${CONFIG_PATH}" 2>/dev/null || true)
 info "Config perms (default): ${PERMS}"
 
-if [ "$(echo "$PERMS" | awk '{print $1}')" = "600" ]; then
-  pass "Config file mode is 600 (mutable default)"
-else
-  fail "Config file should start as mode 600: ${PERMS}"
-fi
+case "$(echo "$PERMS" | awk '{print $1}')" in
+  600 | 660)
+    pass "Config file mode is mutable default (${PERMS})"
+    ;;
+  *)
+    fail "Config file should start as mutable mode 600 or 660: ${PERMS}"
+    ;;
+esac
 
 if [ "$(echo "$PERMS" | awk '{print $2}')" = "sandbox:sandbox" ]; then
   pass "Config file owned by sandbox:sandbox (mutable default)"
@@ -177,10 +183,19 @@ DIR_PERMS=$(openshell sandbox exec --name "${SANDBOX_NAME}" -- \
   stat -c '%a %U:%G' "$(dirname "${CONFIG_PATH}")" 2>/dev/null || true)
 info "Config dir perms (default): ${DIR_PERMS}"
 
-if [ "$(echo "$DIR_PERMS" | awk '{print $1}')" = "700" ]; then
-  pass "Config directory mode is 700 (mutable default)"
+case "$(echo "$DIR_PERMS" | awk '{print $1}')" in
+  700 | 2700 | 2770)
+    pass "Config directory mode is mutable default (${DIR_PERMS})"
+    ;;
+  *)
+    fail "Config directory should be mutable mode 700, 2700, or 2770: ${DIR_PERMS}"
+    ;;
+esac
+
+if [ "$(echo "$DIR_PERMS" | awk '{print $2}')" = "sandbox:sandbox" ]; then
+  pass "Config directory owned by sandbox:sandbox (mutable default)"
 else
-  fail "Config directory should be mode 700: ${DIR_PERMS}"
+  fail "Config directory should be owned by sandbox:sandbox: ${DIR_PERMS}"
 fi
 
 STATUS_DEFAULT=$(nemoclaw "${SANDBOX_NAME}" shields status 2>&1)
@@ -321,15 +336,16 @@ else
   fail "shields down did not report success: ${SHIELDS_DOWN_OUTPUT}"
 fi
 
-# Check permissions changed — should be sandbox:sandbox 600/700 (doctor-aligned)
+# Check permissions changed — OpenClaw uses group-writable + setgid mutable mode
+# so the gateway UID and sandbox UID can both update config/state (#2681).
 PERMS_DOWN=$(openshell sandbox exec --name "${SANDBOX_NAME}" -- \
   stat -c '%a %U:%G' "${CONFIG_PATH}" 2>/dev/null || true)
 info "Config perms (shields DOWN): ${PERMS_DOWN}"
 
-if [ "$(echo "$PERMS_DOWN" | awk '{print $1}')" = "600" ]; then
-  pass "Config file mode is 600 (restored to mutable default)"
+if [ "$(echo "$PERMS_DOWN" | awk '{print $1}')" = "660" ]; then
+  pass "Config file mode is 660 (restored to mutable group-writable default)"
 else
-  fail "Config file should be mode 600 after shields down: ${PERMS_DOWN}"
+  fail "Config file should be mode 660 after shields down: ${PERMS_DOWN}"
 fi
 
 if [ "$(echo "$PERMS_DOWN" | awk '{print $2}')" = "sandbox:sandbox" ]; then
@@ -342,10 +358,10 @@ DIR_PERMS_DOWN=$(openshell sandbox exec --name "${SANDBOX_NAME}" -- \
   stat -c '%a %U:%G' "$(dirname "${CONFIG_PATH}")" 2>/dev/null || true)
 info "Config dir perms (shields DOWN): ${DIR_PERMS_DOWN}"
 
-if [ "$(echo "$DIR_PERMS_DOWN" | awk '{print $1}')" = "700" ]; then
-  pass "Config directory mode is 700 (restored to mutable default)"
+if [ "$(echo "$DIR_PERMS_DOWN" | awk '{print $1}')" = "2770" ]; then
+  pass "Config directory mode is 2770 (restored to mutable setgid default)"
 else
-  fail "Config directory should be mode 700 after shields down: ${DIR_PERMS_DOWN}"
+  fail "Config directory should be mode 2770 after shields down: ${DIR_PERMS_DOWN}"
 fi
 
 if [ "$(echo "$DIR_PERMS_DOWN" | awk '{print $2}')" = "sandbox:sandbox" ]; then
diff --git a/test/shields.test.ts b/test/shields.test.ts
index 6d3d31102d..fa80e0d627 100644
--- a/test/shields.test.ts
+++ b/test/shields.test.ts
@@ -418,6 +418,9 @@ describe("NC-2227-05: shields.ts locks state directories", () => {
     expect(src).toContain("function applyStateDirLockMode");
     expect(src).toContain("workspace-*");
     expect(fnBody).toContain("applyStateDirLockMode");
+    expect(fnBody).toContain('["chmod", "g-s", target.configDir]');
+    expect(src).toContain('["chmod", "g-s", dirPath]');
+    expect(src).toContain('if [ "$dir_mode" = "755" ]; then chmod g-s "$dir"');
     expect(fnBody).toContain("chown");
     expect(fnBody).toContain("root:root");
   });

From b8e27d5a8d3fac11d22fc1a2dde094f09c773f06 Mon Sep 17 00:00:00 2001
From: Aaron Erickson <aerickson@nvidia.com>
Date: Fri, 1 May 2026 18:26:37 -0700
Subject: [PATCH 10/14] fix(shields): keep lock-down write stripping mandatory

Signed-off-by: Aaron Erickson <aerickson@nvidia.com>
---
 src/lib/shields.ts   | 10 +++++++++-
 test/shields.test.ts |  1 +
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/lib/shields.ts b/src/lib/shields.ts
index 554bb87e30..2bac74c4d5 100644
--- a/src/lib/shields.ts
+++ b/src/lib/shields.ts
@@ -279,9 +279,17 @@ function applyStateDirLockMode(sandboxName: string, configDir: string, owner: st
     }
     try {
       kubectlExec(sandboxName, ["chmod", dirMode, dirPath]);
-      if (isLocking) {
+    } catch {
+      // Silently skip
+    }
+    if (isLocking) {
+      try {
         kubectlExec(sandboxName, ["chmod", "g-s", dirPath]);
+      } catch {
+        // Best effort; do not skip recursive write stripping.
       }
+    }
+    try {
       kubectlExec(sandboxName, ["chmod", "-R", writeStrip, dirPath]);
     } catch {
       // Silently skip
diff --git a/test/shields.test.ts b/test/shields.test.ts
index fa80e0d627..49f5d73a64 100644
--- a/test/shields.test.ts
+++ b/test/shields.test.ts
@@ -420,6 +420,7 @@ describe("NC-2227-05: shields.ts locks state directories", () => {
     expect(fnBody).toContain("applyStateDirLockMode");
     expect(fnBody).toContain('["chmod", "g-s", target.configDir]');
     expect(src).toContain('["chmod", "g-s", dirPath]');
+    expect(src).toContain("Best effort; do not skip recursive write stripping.");
     expect(src).toContain('if [ "$dir_mode" = "755" ]; then chmod g-s "$dir"');
     expect(fnBody).toContain("chown");
     expect(fnBody).toContain("root:root");

From 69a2656ba21ba0957924e183357d9dbe5abcc9a1 Mon Sep 17 00:00:00 2001
From: Aaron Erickson <aerickson@nvidia.com>
Date: Fri, 1 May 2026 18:38:37 -0700
Subject: [PATCH 11/14] fix(onboard): accept configured agent binary path

---
 src/lib/agent-onboard.test.ts | 22 +++++++++++++++++-----
 src/lib/agent-onboard.ts      | 17 +----------------
 2 files changed, 18 insertions(+), 21 deletions(-)

diff --git a/src/lib/agent-onboard.test.ts b/src/lib/agent-onboard.test.ts
index 5b1bb0a110..e9abb6e521 100644
--- a/src/lib/agent-onboard.test.ts
+++ b/src/lib/agent-onboard.test.ts
@@ -130,11 +130,8 @@ describe("handleAgentSetup guards", () => {
     expect(source).toContain("verifyAgentBinaryAvailable");
     expect(source).toContain("[ -e ${shellQuote(binaryPath)} ]");
     expect(source).toContain("[ -x ${shellQuote(binaryPath)} ]");
-    expect(source).toContain(
-      'resolved="$(command -v ${shellQuote(executable)} 2>/dev/null || true)"',
-    );
-    expect(source).toContain('[ -z "$resolved" ] || [ "$resolved" = ${shellQuote(binaryPath)} ]');
     expect(source).not.toContain('[ -n "$resolved" ] || { echo not_found');
+    expect(source).not.toContain("path_mismatch");
     expect(source).toMatch(
       /"sandbox",\s*"exec",\s*"-n",\s*sandboxName,\s*"--",\s*"sh",\s*"-lc",\s*script/,
     );
@@ -168,6 +165,21 @@ describe("handleAgentSetup guards", () => {
     expect(result).toEqual({ available: true });
     expect(script).toContain("[ -e '/usr/local/bin/hermes' ]");
     expect(script).toContain("[ -x '/usr/local/bin/hermes' ]");
-    expect(script).toContain('[ -z "$resolved" ] || [ "$resolved" = \'/usr/local/bin/hermes\' ]');
+    expect(script).not.toContain("command -v 'hermes'");
+  });
+
+  it("does not reject a configured binary when PATH resolves the symlink target", () => {
+    let script = "";
+    const result = verifyAgentBinaryAvailable(
+      "alpha",
+      makeAgent({ name: "hermes", binary_path: "/usr/local/bin/hermes" }),
+      (args) => {
+        script = String(args[7] || "");
+        return "ok";
+      },
+    );
+
+    expect(result).toEqual({ available: true });
+    expect(script).not.toContain("path_mismatch");
   });
 });
diff --git a/src/lib/agent-onboard.ts b/src/lib/agent-onboard.ts
index d67af0801f..aa536dccda 100644
--- a/src/lib/agent-onboard.ts
+++ b/src/lib/agent-onboard.ts
@@ -124,9 +124,8 @@ type AgentBinaryAvailability =
   | { available: true }
   | {
       available: false;
-      reason: "not_found" | "not_executable" | "path_mismatch";
+      reason: "not_found" | "not_executable";
       binaryPath?: string;
-      resolvedPath?: string;
     };
 
 // Exported for unit coverage of the sandbox-side guard without running onboarding.
@@ -141,8 +140,6 @@ export function verifyAgentBinaryAvailable(
     ? [
         `[ -e ${shellQuote(binaryPath)} ] || { echo not_found; exit 1; }`,
         `[ -x ${shellQuote(binaryPath)} ] || { echo not_executable; exit 1; }`,
-        `resolved="$(command -v ${shellQuote(executable)} 2>/dev/null || true)"`,
-        `[ -z "$resolved" ] || [ "$resolved" = ${shellQuote(binaryPath)} ] || { printf 'path_mismatch:%s\\n' "$resolved"; exit 1; }`,
         "echo ok",
       ].join(" && ")
     : `command -v ${shellQuote(executable)} >/dev/null 2>&1 && echo ok || echo not_found`;
@@ -157,15 +154,6 @@ export function verifyAgentBinaryAvailable(
     return { available: true };
   }
   if (binaryPath && result) {
-    const mismatch = result.match(/path_mismatch:([^\n]+)/);
-    if (mismatch) {
-      return {
-        available: false,
-        reason: "path_mismatch",
-        binaryPath,
-        resolvedPath: mismatch[1].trim(),
-      };
-    }
     if (result.includes("not_executable")) {
       return { available: false, reason: "not_executable", binaryPath };
     }
@@ -179,9 +167,6 @@ function describeAgentBinaryFailure(
   result: Exclude<AgentBinaryAvailability, { available: true }>,
 ): string {
   const executable = agentExecutableName(agent);
-  if (result.reason === "path_mismatch") {
-    return `${agent.displayName} binary '${executable}' resolves to '${result.resolvedPath}', expected '${result.binaryPath}' inside sandbox '${sandboxName}'`;
-  }
   if (result.reason === "not_executable") {
     return `${agent.displayName} configured binary '${result.binaryPath}' is not executable inside sandbox '${sandboxName}'`;
   }

From 92cebda54fe186176c186355591f6dc9daebfc60 Mon Sep 17 00:00:00 2001
From: Aaron Erickson <aerickson@nvidia.com>
Date: Fri, 1 May 2026 18:45:48 -0700
Subject: [PATCH 12/14] fix(connect): wait for recovered gateway readiness

---
 src/nemoclaw.ts  | 38 +++++++++++++++++++++++++---
 test/cli.test.ts | 66 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 101 insertions(+), 3 deletions(-)

diff --git a/src/nemoclaw.ts b/src/nemoclaw.ts
index bcb4de3d59..d8fe3574ff 100644
--- a/src/nemoclaw.ts
+++ b/src/nemoclaw.ts
@@ -352,6 +352,38 @@ function recoverSandboxProcesses(sandboxName: string): boolean {
   return recoveredSsh(executeSandboxCommand(sandboxName, script));
 }
 
+function readNonNegativeNumberEnv(name: string, fallback: number): number {
+  const raw = process.env[name];
+  if (raw === undefined || raw.trim() === "") return fallback;
+  const parsed = Number(raw);
+  return Number.isFinite(parsed) && parsed >= 0 ? parsed : fallback;
+}
+
+function waitForRecoveredSandboxGateway(sandboxName: string): boolean {
+  const timeoutSeconds = readNonNegativeNumberEnv(
+    "NEMOCLAW_GATEWAY_RECOVERY_WAIT_SECONDS",
+    30,
+  );
+  const intervalSeconds = readNonNegativeNumberEnv(
+    "NEMOCLAW_GATEWAY_RECOVERY_POLL_INTERVAL_SECONDS",
+    3,
+  );
+  const attempts =
+    intervalSeconds > 0
+      ? Math.max(1, Math.floor(timeoutSeconds / intervalSeconds) + 1)
+      : Math.max(1, Math.floor(timeoutSeconds) + 1);
+
+  for (let attempt = 0; attempt < attempts; attempt += 1) {
+    if (isSandboxGatewayRunning(sandboxName) === true) {
+      return true;
+    }
+    if (attempt < attempts - 1) {
+      sleepSeconds(intervalSeconds);
+    }
+  }
+  return false;
+}
+
 /**
  * Re-establish the dashboard port forward to the sandbox.
  * Uses the agent's forward port when a non-OpenClaw agent is active.
@@ -394,9 +426,9 @@ function checkAndRecoverSandboxProcesses(
 
   const recovered = recoverSandboxProcesses(sandboxName);
   if (recovered) {
-    // Wait for gateway to bind its HTTP port before declaring success
-    sleepSeconds(3);
-    if (isSandboxGatewayRunning(sandboxName) !== true) {
+    // Wait for gateway to bind its HTTP port before declaring success. The
+    // recovered process can be alive before the OpenAI-compatible API is ready.
+    if (!waitForRecoveredSandboxGateway(sandboxName)) {
       if (!quiet) {
         console.error("  Gateway process started but is not responding.");
         console.error("  Check /tmp/gateway.log inside the sandbox for details.");
diff --git a/test/cli.test.ts b/test/cli.test.ts
index 4aec7039fa..eccef65ecd 100644
--- a/test/cli.test.ts
+++ b/test/cli.test.ts
@@ -1822,6 +1822,72 @@ describe("CLI dispatch", () => {
     expect(fs.existsSync(sshMarkerFile)).toBe(false);
   });
 
+  it("waits for recovered gateway health before failing probe-only", () => {
+    const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-wait-"));
+    const localBin = path.join(home, "bin");
+    const markerFile = path.join(home, "openshell-calls");
+    const stateFile = path.join(home, "probe-state");
+    const readyCountFile = path.join(home, "ready-count");
+    fs.mkdirSync(localBin, { recursive: true });
+    writeSandboxRegistry(home);
+    fs.writeFileSync(stateFile, "stopped");
+    fs.writeFileSync(
+      path.join(localBin, "openshell"),
+      [
+        "#!/usr/bin/env bash",
+        `marker_file=${JSON.stringify(markerFile)}`,
+        `state_file=${JSON.stringify(stateFile)}`,
+        `ready_count_file=${JSON.stringify(readyCountFile)}`,
+        'printf \'%s\\n\' "$*" >> "$marker_file"',
+        'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then',
+        "  echo 'Sandbox:'",
+        "  echo",
+        "  echo '  Id: abc'",
+        "  echo '  Name: alpha'",
+        "  echo '  Namespace: openshell'",
+        "  echo '  Phase: Ready'",
+        "  exit 0",
+        "fi",
+        'if [ "$1" = "sandbox" ] && [ "$2" = "exec" ] && [ "$3" = "--name" ] && [ "$4" = "alpha" ]; then',
+        '  cmd="$8"',
+        '  case "$cmd" in',
+        '    *"OPENCLAW="*)',
+        '      echo recovered > "$state_file"',
+        "      echo '__NEMOCLAW_SANDBOX_EXEC_STARTED__'",
+        "      echo 'GATEWAY_PID=123'",
+        "      exit 0",
+        "      ;;",
+        "    *'curl -sf'*)",
+        "      echo '__NEMOCLAW_SANDBOX_EXEC_STARTED__'",
+        '      if [ "$(cat "$state_file")" != recovered ]; then echo STOPPED; exit 0; fi',
+        '      count=$(cat "$ready_count_file" 2>/dev/null || echo 0)',
+        "      count=$((count + 1))",
+        '      echo "$count" > "$ready_count_file"',
+        '      if [ "$count" -ge 3 ]; then echo RUNNING; else echo STOPPED; fi',
+        "      exit 0",
+        "      ;;",
+        "  esac",
+        "fi",
+        'if [ "$1" = "forward" ]; then',
+        "  exit 0",
+        "fi",
+        "exit 0",
+      ].join("\n"),
+      { mode: 0o755 },
+    );
+
+    const r = runWithEnv("alpha connect --probe-only", {
+      HOME: home,
+      PATH: `${localBin}:${process.env.PATH || ""}`,
+      NEMOCLAW_GATEWAY_RECOVERY_WAIT_SECONDS: "3",
+      NEMOCLAW_GATEWAY_RECOVERY_POLL_INTERVAL_SECONDS: "0",
+    });
+
+    expect(r.code).toBe(0);
+    expect(r.out).toContain("Probe complete: recovered OpenClaw gateway");
+    expect(fs.readFileSync(readyCountFile, "utf8").trim()).toBe("3");
+  });
+
   it("treats leading --probe-only as an implicit connect probe", () => {
     const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-leading-"));
     const localBin = path.join(home, "bin");

From 46584a76decd18635c485e0aadedab01b536fd97 Mon Sep 17 00:00:00 2001
From: Aaron Erickson <aerickson@nvidia.com>
Date: Fri, 1 May 2026 19:44:43 -0700
Subject: [PATCH 13/14] fix(status): bound sandbox version probe

---
 src/lib/sandbox-version.test.ts | 6 ++++++
 src/lib/sandbox-version.ts      | 3 ++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/lib/sandbox-version.test.ts b/src/lib/sandbox-version.test.ts
index 9b79eb8008..cf844b5288 100644
--- a/src/lib/sandbox-version.test.ts
+++ b/src/lib/sandbox-version.test.ts
@@ -50,6 +50,7 @@ vi.mock("child_process", async (importOriginal) => {
 import { checkAgentVersion, formatStalenessWarning } from "./sandbox-version.js";
 import * as registry from "./registry.js";
 import { captureOpenshellCommand } from "./openshell.js";
+import { OPENSHELL_PROBE_TIMEOUT_MS } from "./openshell-timeouts.js";
 import { spawnSync } from "child_process";
 
 describe("checkAgentVersion", () => {
@@ -130,6 +131,11 @@ describe("checkAgentVersion", () => {
     expect(result.detectionMethod).toBe("ssh-exec");
     expect(result.sandboxVersion).toBe("2026.4.24");
     expect(result.isStale).toBe(false);
+    expect(captureOpenshellCommand).toHaveBeenCalledWith(
+      "/usr/local/bin/openshell",
+      ["sandbox", "ssh-config", "test-sb"],
+      { ignoreError: true, timeout: OPENSHELL_PROBE_TIMEOUT_MS },
+    );
 
     // Should have cached the version in registry
     const updated = registry.getSandbox("test-sb");
diff --git a/src/lib/sandbox-version.ts b/src/lib/sandbox-version.ts
index 6d24c8c6cd..add8c0d223 100644
--- a/src/lib/sandbox-version.ts
+++ b/src/lib/sandbox-version.ts
@@ -18,6 +18,7 @@ import * as registry from "./registry.js";
 import { loadAgent } from "./agent-defs.js";
 import { resolveOpenshell } from "./resolve-openshell.js";
 import { captureOpenshellCommand } from "./openshell.js";
+import { OPENSHELL_PROBE_TIMEOUT_MS } from "./openshell-timeouts.js";
 
 export interface VersionCheckResult {
   sandboxVersion: string | null;
@@ -49,7 +50,7 @@ export function probeAgentVersion(sandboxName: string): string | null {
   const sshConfigResult = captureOpenshellCommand(
     openshellBinary,
     ["sandbox", "ssh-config", sandboxName],
-    { ignoreError: true },
+    { ignoreError: true, timeout: OPENSHELL_PROBE_TIMEOUT_MS },
   );
   if (sshConfigResult.status !== 0) return null;
 

From 71bfe6dcb5336d698ed09f9bd04deda6941af826 Mon Sep 17 00:00:00 2001
From: Aaron Erickson <aerickson@nvidia.com>
Date: Fri, 1 May 2026 20:53:46 -0700
Subject: [PATCH 14/14] fix(status): bound nim status probes

---
 src/lib/nim.test.ts | 41 +++++++++++++++++++++++++++++++++++++++++
 src/lib/nim.ts      | 13 ++++++++++---
 test/cli.test.ts    | 23 +++++++++++++++--------
 3 files changed, 66 insertions(+), 11 deletions(-)

diff --git a/src/lib/nim.test.ts b/src/lib/nim.test.ts
index 4501c0f6c0..d1eff93d54 100644
--- a/src/lib/nim.test.ts
+++ b/src/lib/nim.test.ts
@@ -50,6 +50,17 @@ function hasCurlTimeoutArgs(cmd: string | string[]): boolean {
   return cmd[0] === "curl" && cmd[connectTimeout + 1] === "5" && cmd[maxTime + 1] === "5";
 }
 
+function timeoutForCommand(
+  runCapture: Mock,
+  predicate: (cmd: string | string[]) => boolean,
+): number | undefined {
+  const call = runCapture.mock.calls.find((mockCall) => {
+    const cmd = mockCall[0] as string | string[];
+    return predicate(cmd);
+  });
+  return (call?.[1] as { timeout?: number } | undefined)?.timeout;
+}
+
 describe("nim", () => {
   describe("listModels", () => {
     it("returns 5 models", () => {
@@ -353,6 +364,18 @@ describe("nim", () => {
           true,
         );
         expect(commands.some((c) => c[0] === "curl" && hasCurlTimeoutArgs(c))).toBe(true);
+        expect(
+          timeoutForCommand(
+            runCapture,
+            (c) => Array.isArray(c) && c[0] === "docker" && c.includes("inspect"),
+          ),
+        ).toBe(5000);
+        expect(
+          timeoutForCommand(
+            runCapture,
+            (c) => Array.isArray(c) && c[0] === "curl" && c.includes("http://127.0.0.1:9000/v1/models"),
+          ),
+        ).toBe(6000);
       } finally {
         restore();
       }
@@ -375,6 +398,18 @@ describe("nim", () => {
 
           expect(st).toMatchObject({ running: true, healthy: true, container: "foo", state: "running" });
           expect(commands.some((c) => c[0] === "docker" && c.includes("port"))).toBe(true);
+          expect(
+            timeoutForCommand(
+              runCapture,
+              (c) => Array.isArray(c) && c[0] === "docker" && c.includes("inspect"),
+            ),
+          ).toBe(5000);
+          expect(
+            timeoutForCommand(
+              runCapture,
+              (c) => Array.isArray(c) && c[0] === "docker" && c.includes("port"),
+            ),
+          ).toBe(5000);
         } finally {
           restore();
         }
@@ -410,6 +445,12 @@ describe("nim", () => {
       try {
         const st = nimModule.nimStatusByName("foo");
         expect(st).toMatchObject({ running: false, healthy: false, container: "foo", state: "exited" });
+        expect(
+          timeoutForCommand(
+            runCapture,
+            (c) => Array.isArray(c) && c[0] === "docker" && c.includes("inspect"),
+          ),
+        ).toBe(5000);
         expect(runCapture.mock.calls).toHaveLength(1);
       } finally {
         restore();
diff --git a/src/lib/nim.ts b/src/lib/nim.ts
index 068139a02a..e9a0265429 100644
--- a/src/lib/nim.ts
+++ b/src/lib/nim.ts
@@ -20,6 +20,7 @@ const nimImages = require("../../bin/lib/nim-images.json");
 import { VLLM_PORT } from "./ports";
 
 const UNIFIED_MEMORY_GPU_TAGS = ["GB10", "Thor", "Orin", "Xavier"];
+const NIM_STATUS_PROBE_TIMEOUT_MS = 5000;
 
 export interface NimModel {
   name: string;
@@ -338,14 +339,20 @@ export function nimStatus(sandboxName: string, port?: number): NimStatus {
 
 export function nimStatusByName(name: string, port?: number): NimStatus {
   try {
-    const state = dockerContainerInspectFormat("{{.State.Status}}", name, { ignoreError: true });
+    const state = dockerContainerInspectFormat("{{.State.Status}}", name, {
+      ignoreError: true,
+      timeout: NIM_STATUS_PROBE_TIMEOUT_MS,
+    });
     if (!state) return { running: false, container: name };
 
     let healthy = false;
     if (state === "running") {
       let resolvedHostPort = port != null ? Number(port) : 0;
       if (!resolvedHostPort) {
-        const mapping = dockerPort(name, "8000", { ignoreError: true });
+        const mapping = dockerPort(name, "8000", {
+          ignoreError: true,
+          timeout: NIM_STATUS_PROBE_TIMEOUT_MS,
+        });
         const m = mapping && mapping.match(/:(\d+)\s*$/);
         resolvedHostPort = m ? Number(m[1]) : VLLM_PORT;
       }
@@ -359,7 +366,7 @@ export function nimStatusByName(name: string, port?: number): NimStatus {
           "5",
           `http://127.0.0.1:${resolvedHostPort}/v1/models`,
         ],
-        { ignoreError: true },
+        { ignoreError: true, timeout: NIM_STATUS_PROBE_TIMEOUT_MS + 1000 },
       );
       healthy = !!health;
     }
diff --git a/test/cli.test.ts b/test/cli.test.ts
index 323b76a41e..88611a055a 100644
--- a/test/cli.test.ts
+++ b/test/cli.test.ts
@@ -247,6 +247,19 @@ function createDoctorTestSetup(prefix: string, openshellLines: string[], sandbox
   };
 }
 
+function createCloudflaredServiceDir(prefix: string): { sandboxName: string; serviceDir: string } {
+  const suffix = [
+    process.pid.toString(36),
+    Date.now().toString(36),
+    Math.random().toString(36).slice(2, 10),
+  ].join("-");
+  const sandboxName = `${prefix}${suffix}`;
+  const serviceDir = path.join("/tmp", `nemoclaw-services-${sandboxName}`);
+  fs.rmSync(serviceDir, { recursive: true, force: true });
+  fs.mkdirSync(serviceDir, { recursive: true });
+  return { sandboxName, serviceDir };
+}
+
 function createDebugCommandTestEnv(prefix: string): Record<string, string> {
   const home = fs.mkdtempSync(path.join(os.tmpdir(), prefix));
   const localBin = path.join(home, "bin");
@@ -954,7 +967,7 @@ describe("CLI dispatch", () => {
   });
 
   it("doctor treats a live non-cloudflared PID as stale", () => {
-    const sandboxName = `doctorpid-${process.pid}`;
+    const { sandboxName, serviceDir } = createCloudflaredServiceDir("doctorpid-");
     const setup = createDoctorTestSetup(
       "nemoclaw-cli-doctor-wrong-cloudflared-pid-",
       [
@@ -967,9 +980,6 @@ describe("CLI dispatch", () => {
       ],
       sandboxName,
     );
-    const serviceDir = path.join("/tmp", `nemoclaw-services-${sandboxName}`);
-    fs.rmSync(serviceDir, { recursive: true, force: true });
-    fs.mkdirSync(serviceDir, { recursive: true });
     const sleeper = spawn(process.execPath, ["-e", "setTimeout(() => {}, 30000)"], {
       stdio: "ignore",
     });
@@ -999,7 +1009,7 @@ describe("CLI dispatch", () => {
   });
 
   it("doctor accepts a live cloudflared PID", () => {
-    const sandboxName = `doctorcloudflared-${process.pid}`;
+    const { sandboxName, serviceDir } = createCloudflaredServiceDir("doctorcloudflared-");
     const setup = createDoctorTestSetup(
       "nemoclaw-cli-doctor-cloudflared-pid-",
       [
@@ -1012,11 +1022,8 @@ describe("CLI dispatch", () => {
       ],
       sandboxName,
     );
-    const serviceDir = path.join("/tmp", `nemoclaw-services-${sandboxName}`);
     const shimDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cloudflared-shim-"));
     const cloudflaredBin = path.join(shimDir, "cloudflared");
-    fs.rmSync(serviceDir, { recursive: true, force: true });
-    fs.mkdirSync(serviceDir, { recursive: true });
     fs.symlinkSync(process.execPath, cloudflaredBin);
     const sleeper = spawn(cloudflaredBin, ["-e", "setTimeout(() => {}, 30000)"], {
       stdio: "ignore",