diff --git a/scripts/nemoclaw-start.sh b/scripts/nemoclaw-start.sh
index 8e7f84eaed..daddd5351d 100755
--- a/scripts/nemoclaw-start.sh
+++ b/scripts/nemoclaw-start.sh
@@ -230,7 +230,7 @@ normalize_mutable_config_perms() {
   # Detect shields-up. Config dir owned by root means shields are
   # currently locked; normalizing would weaken the contract.
   local config_dir_owner
-  config_dir_owner="$(stat -c '%U' "$config_dir" 2>/dev/null || echo unknown)"
+  config_dir_owner="$(stat -c '%U' "$config_dir" 2>/dev/null || stat -f '%Su' "$config_dir" 2>/dev/null || echo unknown)"
   if [ "$config_dir_owner" = "root" ]; then
     return 0
   fi
diff --git a/src/lib/agent-onboard.test.ts b/src/lib/agent-onboard.test.ts
index 6e00cf830a..e9abb6e521 100644
--- a/src/lib/agent-onboard.test.ts
+++ b/src/lib/agent-onboard.test.ts
@@ -5,7 +5,7 @@ import { describe, it, expect, beforeEach, afterEach, afterAll, vi } from "vites
 import fs from "node:fs";
 import path from "node:path";
 // Import from compiled dist/ so coverage is attributed correctly.
-import { printDashboardUi } from "../../dist/lib/agent-onboard";
+import { printDashboardUi, verifyAgentBinaryAvailable } from "../../dist/lib/agent-onboard";
 import type { AgentDefinition } from "./agent-defs";
 
 function makeAgent(overrides: Partial<AgentDefinition> = {}): AgentDefinition {
@@ -128,10 +128,10 @@ describe("handleAgentSetup guards", () => {
     const source = fs.readFileSync(path.join(import.meta.dirname, "agent-onboard.ts"), "utf-8");
 
     expect(source).toContain("verifyAgentBinaryAvailable");
-    expect(source).toContain(
-      'resolved="$(command -v ${shellQuote(executable)} 2>/dev/null || true)"',
-    );
-    expect(source).toContain('[ "$resolved" = ${shellQuote(binaryPath)} ]');
+    expect(source).toContain("[ -e ${shellQuote(binaryPath)} ]");
+    expect(source).toContain("[ -x ${shellQuote(binaryPath)} ]");
+    expect(source).not.toContain('[ -n "$resolved" ] || { echo not_found');
+    expect(source).not.toContain("path_mismatch");
     expect(source).toMatch(
       /"sandbox",\s*"exec",\s*"-n",\s*sandboxName,\s*"--",\s*"sh",\s*"-lc",\s*script/,
     );
@@ -150,4 +150,36 @@ describe("handleAgentSetup guards", () => {
     expect(source).toContain('parsed.status === "ok"');
     expect(source).not.toContain('.includes("ok")');
   });
+
+  it("accepts an executable configured binary path when PATH lookup is empty", () => {
+    let script = "";
+    const result = verifyAgentBinaryAvailable(
+      "alpha",
+      makeAgent({ name: "hermes", binary_path: "/usr/local/bin/hermes" }),
+      (args) => {
+        script = String(args[7] || "");
+        return "ok";
+      },
+    );
+
+    expect(result).toEqual({ available: true });
+    expect(script).toContain("[ -e '/usr/local/bin/hermes' ]");
+    expect(script).toContain("[ -x '/usr/local/bin/hermes' ]");
+    expect(script).not.toContain("command -v 'hermes'");
+  });
+
+  it("does not reject a configured binary when PATH resolves the symlink target", () => {
+    let script = "";
+    const result = verifyAgentBinaryAvailable(
+      "alpha",
+      makeAgent({ name: "hermes", binary_path: "/usr/local/bin/hermes" }),
+      (args) => {
+        script = String(args[7] || "");
+        return "ok";
+      },
+    );
+
+    expect(result).toEqual({ available: true });
+    expect(script).not.toContain("path_mismatch");
+  });
 });
diff --git a/src/lib/agent-onboard.ts b/src/lib/agent-onboard.ts
index c5bad2dff1..aa536dccda 100644
--- a/src/lib/agent-onboard.ts
+++ b/src/lib/agent-onboard.ts
@@ -124,12 +124,12 @@ type AgentBinaryAvailability =
   | { available: true }
   | {
       available: false;
-      reason: "not_found" | "not_executable" | "path_mismatch";
+      reason: "not_found" | "not_executable";
       binaryPath?: string;
-      resolvedPath?: string;
     };
 
-function verifyAgentBinaryAvailable(
+// Exported for unit coverage of the sandbox-side guard without running onboarding.
+export function verifyAgentBinaryAvailable(
   sandboxName: string,
   agent: AgentDefinition,
   runCaptureOpenshell: OnboardContext["runCaptureOpenshell"],
@@ -138,10 +138,8 @@ function verifyAgentBinaryAvailable(
   const binaryPath = typeof agent.binary_path === "string" ? agent.binary_path.trim() : "";
   const script = binaryPath
     ? [
-        `resolved="$(command -v ${shellQuote(executable)} 2>/dev/null || true)"`,
-        `[ -n "$resolved" ] || { echo not_found; exit 1; }`,
+        `[ -e ${shellQuote(binaryPath)} ] || { echo not_found; exit 1; }`,
         `[ -x ${shellQuote(binaryPath)} ] || { echo not_executable; exit 1; }`,
-        `[ "$resolved" = ${shellQuote(binaryPath)} ] || { printf 'path_mismatch:%s\\n' "$resolved"; exit 1; }`,
         "echo ok",
       ].join(" && ")
     : `command -v ${shellQuote(executable)} >/dev/null 2>&1 && echo ok || echo not_found`;
@@ -156,15 +154,6 @@ function verifyAgentBinaryAvailable(
     return { available: true };
   }
   if (binaryPath && result) {
-    const mismatch = result.match(/path_mismatch:([^\n]+)/);
-    if (mismatch) {
-      return {
-        available: false,
-        reason: "path_mismatch",
-        binaryPath,
-        resolvedPath: mismatch[1].trim(),
-      };
-    }
     if (result.includes("not_executable")) {
       return { available: false, reason: "not_executable", binaryPath };
     }
@@ -178,9 +167,6 @@ function describeAgentBinaryFailure(
   result: Exclude<AgentBinaryAvailability, { available: true }>,
 ): string {
   const executable = agentExecutableName(agent);
-  if (result.reason === "path_mismatch") {
-    return `${agent.displayName} binary '${executable}' resolves to '${result.resolvedPath}', expected '${result.binaryPath}' inside sandbox '${sandboxName}'`;
-  }
   if (result.reason === "not_executable") {
     return `${agent.displayName} configured binary '${result.binaryPath}' is not executable inside sandbox '${sandboxName}'`;
   }
diff --git a/src/lib/agent-runtime.test.ts b/src/lib/agent-runtime.test.ts
index 9ae9e7024f..965729842f 100644
--- a/src/lib/agent-runtime.test.ts
+++ b/src/lib/agent-runtime.test.ts
@@ -3,7 +3,7 @@
 
 import { describe, it, expect } from "vitest";
 // Import from compiled dist/ so coverage is attributed correctly.
-import { buildRecoveryScript } from "../../dist/lib/agent-runtime";
+import { buildOpenClawRecoveryScript, buildRecoveryScript } from "../../dist/lib/agent-runtime";
 import type { AgentDefinition } from "./agent-defs";
 
 function makeAgent(overrides: Partial<AgentDefinition> = {}): AgentDefinition {
@@ -60,13 +60,13 @@ describe("buildRecoveryScript", () => {
   it("launches the default gateway command through the validated agent binary", () => {
     const script = buildRecoveryScript(minimalAgent, 19000);
     expect(script).toContain("command -v 'test-agent'");
-    expect(script).toContain('nohup "$AGENT_BIN" gateway run --port 19000');
+    expect(script).toContain('"$AGENT_BIN" gateway run --port 19000');
   });
 
   it("falls back to openclaw gateway run when gateway_command is absent", () => {
     const agent = makeAgent({ gateway_command: undefined });
     const script = buildRecoveryScript(agent, 19000);
-    expect(script).toContain('nohup "$AGENT_BIN" gateway run --port 19000');
+    expect(script).toContain('"$AGENT_BIN" gateway run --port 19000');
   });
 
   it("validates and launches custom gateway commands explicitly", () => {
@@ -123,17 +123,28 @@ describe("buildRecoveryScript", () => {
 
     it("writes the warning to gateway.log so it persists for sysadmin tail", () => {
       const script = buildRecoveryScript(minimalAgent, 19000);
-      // Both warnings must end up in /tmp/gateway.log, not just stderr —
+      // Both warnings must end up in the selected gateway log, not just stderr —
       // executeSandboxCommand silently discards stderr from the recovery
       // script, so a warning that only goes to stderr is invisible to
       // anyone debugging a crash-loop. (#2478)
-      expect(script).toContain('echo "$_W" >> /tmp/gateway.log');
+      expect(script).toContain('echo "$_W" >> "$_GATEWAY_LOG"');
       // And the warning must be deferred until AFTER gateway.log is
-      // freshly touched/chmod'd, otherwise the redirect targets a stale
-      // file that gets removed seconds later.
-      const touchIdx = script!.indexOf("touch /tmp/gateway.log");
-      const warnIdx = script!.indexOf('echo "$_W" >> /tmp/gateway.log');
-      expect(touchIdx).toBeLessThan(warnIdx);
+      // safely opened with O_NOFOLLOW, otherwise the redirect targets a
+      // stale or attacker-controlled file.
+      const gatewayPrepIdx = script!.indexOf(" /tmp/gateway.log || exit 1;");
+      const logSelectionIdx = script!.indexOf("_GATEWAY_LOG=/tmp/gateway.log");
+      const warnIdx = script!.indexOf('echo "$_W" >> "$_GATEWAY_LOG"');
+      expect(gatewayPrepIdx).toBeGreaterThanOrEqual(0);
+      expect(logSelectionIdx).toBeGreaterThanOrEqual(0);
+      expect(warnIdx).toBeGreaterThanOrEqual(0);
+      expect(gatewayPrepIdx).toBeLessThan(logSelectionIdx);
+      expect(logSelectionIdx).toBeLessThan(warnIdx);
+    });
+
+    it("stops recovery when hardened log setup fails", () => {
+      const script = buildOpenClawRecoveryScript(18789);
+      expect(script).toContain(" /tmp/gateway.log 'gateway' || exit 1;");
+      expect(script).toContain(" /tmp/auto-pair.log 'sandbox' || exit 1;");
     });
 
     it("appends (not truncates) gateway.log on launch so warnings survive", () => {
@@ -141,8 +152,72 @@ describe("buildRecoveryScript", () => {
       // Truncating with `>` wipes the [gateway-recovery] WARNING that the
       // recovery script wrote moments earlier — meaning a sysadmin tailing
       // gateway.log would see the eventual crash without the explanation.
-      expect(script).toContain(">> /tmp/gateway.log 2>&1 &");
+      expect(script).toContain('>> "$_GATEWAY_LOG" 2>&1 &');
       expect(script).not.toMatch(/[^>]> \/tmp\/gateway\.log 2>&1 &/);
     });
+
+    it("preserves an existing gateway.log and has a writable fallback log", () => {
+      const script = buildOpenClawRecoveryScript(18789);
+      expect(script).not.toContain("rm -f /tmp/gateway.log");
+      expect(script).toContain("_GATEWAY_LOG=/tmp/gateway.log");
+      expect(script).toContain("_GATEWAY_LOG=/tmp/gateway-recovery.log");
+      expect(script).toContain('echo "$_W" >> "$_GATEWAY_LOG"');
+      expect(script).toContain('tail -5 "$_GATEWAY_LOG"');
+      expect(script).not.toContain('echo "$_W" >> /tmp/gateway.log');
+      expect(script).not.toContain("cat /tmp/gateway.log");
+    });
+
+    it("rejects a symlinked gateway.log before preparing the log", () => {
+      const script = buildOpenClawRecoveryScript(18789);
+      const noFollowIdx = script.indexOf("O_NOFOLLOW");
+      const openIdx = script.indexOf("os.open(path, flags, 0o644)");
+      const fchownIdx = script.indexOf("os.fchown(fd");
+      expect(script).toContain("refusing to prepare symlinked /tmp/gateway.log");
+      expect(script).toContain("sys.exit(1)");
+      expect(script).not.toContain(": > /tmp/gateway.log");
+      expect(script).not.toContain("chown 'gateway:gateway' /tmp/gateway.log");
+      expect(noFollowIdx).toBeGreaterThanOrEqual(0);
+      expect(openIdx).toBeGreaterThanOrEqual(0);
+      expect(fchownIdx).toBeGreaterThanOrEqual(0);
+      expect(noFollowIdx).toBeLessThan(openIdx);
+      expect(openIdx).toBeLessThan(fchownIdx);
+    });
+
+    it("prepares gateway.log for the real gateway-owned sandbox log", () => {
+      const script = buildOpenClawRecoveryScript(18789);
+      expect(script).toContain("os.fchown(fd");
+      expect(script).toContain("pw.pw_gid");
+      expect(script).not.toContain("grp.getgrnam");
+      expect(script).toContain("owner_mode = 0o644");
+      expect(script).toContain("os.fchmod(fd, owner_mode)");
+      expect(script).toContain("/tmp/gateway.log 'gateway'");
+      expect(script).toContain("gosu 'gateway'");
+    });
+
+    it("terminates the conditional launch branch before capturing the gateway pid", () => {
+      const script = buildOpenClawRecoveryScript(18789);
+      expect(script).toContain(" fi; GPID=$!");
+      expect(script).not.toContain(" fi GPID=$!");
+    });
+
+    it("prepares auto-pair.log without unlinking or following symlinks", () => {
+      const script = buildOpenClawRecoveryScript(18789);
+      expect(script).toContain("refusing to prepare symlinked /tmp/auto-pair.log");
+      expect(script).toContain("/tmp/auto-pair.log 'sandbox'");
+      expect(script).toContain("owner_mode = 0o600");
+      expect(script).not.toContain("rm -f /tmp/auto-pair.log");
+      expect(script).not.toContain(": > /tmp/auto-pair.log");
+      expect(script).not.toContain("touch /tmp/auto-pair.log");
+      expect(script).not.toContain("chown sandbox:sandbox /tmp/auto-pair.log");
+      expect(script).not.toContain("chmod 600 /tmp/auto-pair.log");
+    });
+
+    it("does not force non-OpenClaw agents to run as the gateway user", () => {
+      const script = buildRecoveryScript(minimalAgent, 19000);
+      expect(script).not.toContain("chown gateway:gateway /tmp/gateway.log");
+      expect(script).not.toContain("chown 'gateway:gateway' /tmp/gateway.log");
+      expect(script).not.toContain("gosu gateway");
+      expect(script).not.toContain("gosu 'gateway'");
+    });
   });
 });
diff --git a/src/lib/agent-runtime.ts b/src/lib/agent-runtime.ts
index 8a22e557de..7d01f3e05a 100644
--- a/src/lib/agent-runtime.ts
+++ b/src/lib/agent-runtime.ts
@@ -48,6 +48,101 @@ export function getHealthProbeUrl(agent: AgentDefinition | null): string {
   return agent.healthProbe?.url || `http://127.0.0.1:${DASHBOARD_PORT}/`;
 }
 
+function buildNoFollowLogSetupCommand(
+  path: string,
+  logOwnerUser?: string,
+  ownerMode = "0o644",
+): string {
+  const displayPath = path.replace(/\\/g, "\\\\").replace(/'/g, "\\'");
+  const prepareLog = [
+    "import errno, os, pwd, stat, sys",
+    "path = sys.argv[1]",
+    "owner = sys.argv[2] if len(sys.argv) > 2 else ''",
+    `owner_mode = ${ownerMode}`,
+    "fallback_mode = 0o600",
+    "flags = os.O_WRONLY | os.O_CREAT | os.O_TRUNC | getattr(os, 'O_NOFOLLOW', 0)",
+    "try:",
+    "    fd = os.open(path, flags, 0o644)",
+    "except OSError as exc:",
+    "    if exc.errno == errno.ELOOP:",
+    `        print('[gateway-recovery] ERROR: refusing to prepare symlinked ${displayPath}', file=sys.stderr)`,
+    "        sys.exit(1)",
+    "    if exc.errno in (errno.EACCES, errno.EPERM):",
+    `        print('[gateway-recovery] ERROR: ${displayPath} is not writable by recovery user', file=sys.stderr)`,
+    "        sys.exit(0)",
+    `    print(f'[gateway-recovery] ERROR: cannot prepare ${displayPath}: {exc}', file=sys.stderr)`,
+    "    sys.exit(1)",
+    "try:",
+    "    if not stat.S_ISREG(os.fstat(fd).st_mode):",
+    `        print('[gateway-recovery] ERROR: ${displayPath} is not a regular file', file=sys.stderr)`,
+    "        sys.exit(1)",
+    "    if owner and os.geteuid() == 0:",
+    "        try:",
+    "            pw = pwd.getpwnam(owner)",
+    "        except KeyError:",
+    "            os.fchmod(fd, fallback_mode)",
+    "        else:",
+    "            os.fchown(fd, pw.pw_uid, pw.pw_gid)",
+    "            os.fchmod(fd, owner_mode)",
+    "    else:",
+    "        os.fchmod(fd, fallback_mode)",
+    "finally:",
+    "    os.close(fd)",
+  ].join("\n");
+  return [
+    "python3",
+    "-c",
+    shellQuote(prepareLog),
+    path,
+    ...(logOwnerUser ? [shellQuote(logOwnerUser)] : []),
+  ].join(" ");
+}
+
+function buildGatewayLogSetup(includeAutoPairLog = false, logOwnerUser?: string): string[] {
+  const lines = [`${buildNoFollowLogSetupCommand("/tmp/gateway.log", logOwnerUser)} || exit 1;`];
+  if (includeAutoPairLog) {
+    lines.push(
+      `${buildNoFollowLogSetupCommand("/tmp/auto-pair.log", "sandbox", "0o600")} || exit 1;`,
+    );
+  }
+  return lines;
+}
+
+function buildGatewayLogSelection(): string {
+  return '_GATEWAY_LOG=/tmp/gateway.log; if ! : >> "$_GATEWAY_LOG" 2>/dev/null; then _GATEWAY_LOG=/tmp/gateway-recovery.log; : >> "$_GATEWAY_LOG" 2>/dev/null || true; fi;';
+}
+
+function gatewayLaunchCommand(command: string, runAsUser?: string): string {
+  const logSelection = buildGatewayLogSelection();
+  const userLaunch = `nohup ${command} >> "$_GATEWAY_LOG" 2>&1 &`;
+  if (!runAsUser) {
+    return `${logSelection} ${userLaunch}`;
+  }
+  return `${logSelection} if [ "$(id -u)" = "0" ] && command -v gosu >/dev/null 2>&1 && id ${shellQuote(runAsUser)} >/dev/null 2>&1; then nohup gosu ${shellQuote(runAsUser)} ${command} >> "$_GATEWAY_LOG" 2>&1 & else ${userLaunch} fi;`;
+}
+
+/**
+ * Build the OpenClaw recovery shell script used by the default sandbox.
+ */
+export function buildOpenClawRecoveryScript(port: number): string {
+  return [
+    "if [ -r /tmp/nemoclaw-proxy-env.sh ]; then . /tmp/nemoclaw-proxy-env.sh; _PE_MISSING=0; else _PE_MISSING=1; fi;",
+    "[ -f ~/.bashrc ] && . ~/.bashrc;",
+    'case "${NODE_OPTIONS:-}" in *nemoclaw-sandbox-safety-net*) _SN_MISSING=0 ;; *) _SN_MISSING=1 ;; esac; case "${NODE_OPTIONS:-}" in *nemoclaw-ciao-network-guard*) _CIAO_MISSING=0 ;; *) _CIAO_MISSING=1 ;; esac; if [ "$_SN_MISSING" = "0" ] && [ "$_CIAO_MISSING" = "0" ]; then _GUARDS_MISSING=0; else _GUARDS_MISSING=1; fi;',
+    `if curl -sf --max-time 3 http://127.0.0.1:${port}/ > /dev/null 2>&1; then echo ALREADY_RUNNING; exit 0; fi;`,
+    "rm -rf /tmp/openclaw-*/gateway.*.lock 2>/dev/null;",
+    ...buildGatewayLogSetup(true, "gateway"),
+    buildGatewayLogSelection(),
+    '[ "$_PE_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing - gateway launching without library guards (#2478)"; echo "$_W" >&2; echo "$_W" >> "$_GATEWAY_LOG"; };',
+    '[ "$_GUARDS_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: NODE_OPTIONS missing safety-net preload or ciao preload - gateway may crash on unhandled library errors (#2478)"; echo "$_W" >&2; echo "$_W" >> "$_GATEWAY_LOG"; };',
+    'OPENCLAW="$(command -v openclaw)";',
+    'if [ -z "$OPENCLAW" ]; then echo OPENCLAW_MISSING; exit 1; fi;',
+    gatewayLaunchCommand('"$OPENCLAW" gateway run --port ' + port, "gateway"),
+    "GPID=$!; sleep 2;",
+    'if kill -0 "$GPID" 2>/dev/null; then echo "GATEWAY_PID=$GPID"; else echo GATEWAY_FAILED; tail -5 "$_GATEWAY_LOG" 2>/dev/null; fi',
+  ].join(" ");
+}
+
 /**
  * Build the recovery shell script for a non-OpenClaw agent.
  * Returns the script string, or null if agent is null (use existing inline
@@ -78,8 +173,8 @@ export function buildRecoveryScript(agent: AgentDefinition | null, port: number)
   // *why* the gateway is about to crash gets wiped by the same launch
   // that's about to crash on a missing guard. (#2478)
   const launchCommand = usesValidatedBinary
-    ? `nohup "$AGENT_BIN" gateway run --port ${port} >> /tmp/gateway.log 2>&1 &`
-    : `nohup ${configuredGatewayCommand} --port ${port} >> /tmp/gateway.log 2>&1 &`;
+    ? gatewayLaunchCommand(`"$AGENT_BIN" gateway run --port ${port}`)
+    : gatewayLaunchCommand(`${configuredGatewayCommand} --port ${port}`);
   const isHermes = agent.name === "hermes";
   const hermesHome = isHermes ? "export HERMES_HOME=/sandbox/.hermes; " : "";
 
@@ -106,14 +201,14 @@ export function buildRecoveryScript(agent: AgentDefinition | null, port: number)
     'case "${NODE_OPTIONS:-}" in *nemoclaw-sandbox-safety-net*) _SN_MISSING=0 ;; *) _SN_MISSING=1 ;; esac; case "${NODE_OPTIONS:-}" in *nemoclaw-ciao-network-guard*) _CIAO_MISSING=0 ;; *) _CIAO_MISSING=1 ;; esac; if [ "$_SN_MISSING" = "0" ] && [ "$_CIAO_MISSING" = "0" ]; then _GUARDS_MISSING=0; else _GUARDS_MISSING=1; fi;',
     hermesHome,
     `if curl -sf --max-time 3 ${shellQuote(probeUrl)} > /dev/null 2>&1; then echo ALREADY_RUNNING; exit 0; fi;`,
-    "rm -f /tmp/gateway.log;",
-    "touch /tmp/gateway.log; chmod 600 /tmp/gateway.log;",
-    '[ "$_PE_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing — gateway launching without library guards (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };',
-    '[ "$_GUARDS_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: NODE_OPTIONS missing safety-net preload or ciao preload — gateway may crash on unhandled library errors (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };',
+    ...buildGatewayLogSetup(false),
+    buildGatewayLogSelection(),
+    '[ "$_PE_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing - gateway launching without library guards (#2478)"; echo "$_W" >&2; echo "$_W" >> "$_GATEWAY_LOG"; };',
+    '[ "$_GUARDS_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: NODE_OPTIONS missing safety-net preload or ciao preload - gateway may crash on unhandled library errors (#2478)"; echo "$_W" >&2; echo "$_W" >> "$_GATEWAY_LOG"; };',
     ...validationSteps,
     launchCommand,
     "GPID=$!; sleep 2;",
-    'if kill -0 "$GPID" 2>/dev/null; then echo "GATEWAY_PID=$GPID"; else echo GATEWAY_FAILED; cat /tmp/gateway.log 2>/dev/null | tail -5; fi',
+    'if kill -0 "$GPID" 2>/dev/null; then echo "GATEWAY_PID=$GPID"; else echo GATEWAY_FAILED; tail -5 "$_GATEWAY_LOG" 2>/dev/null; fi',
   ].join(" ");
 }
 
diff --git a/src/lib/command-registry.ts b/src/lib/command-registry.ts
index df809c67b7..31014192dc 100644
--- a/src/lib/command-registry.ts
+++ b/src/lib/command-registry.ts
@@ -98,6 +98,7 @@ export const COMMANDS: readonly CommandDef[] = [
   {
     usage: "nemoclaw <name> connect",
     description: "Shell into a running sandbox",
+    flags: "[--probe-only]",
     group: "Sandbox Management",
     scope: "sandbox",
   },
@@ -427,7 +428,6 @@ export const COMMANDS: readonly CommandDef[] = [
     scope: "global",
     hidden: true,
   },
-
 ] as const;
 
 /** All global-scope commands. */
diff --git a/src/lib/nim.test.ts b/src/lib/nim.test.ts
index 4501c0f6c0..d1eff93d54 100644
--- a/src/lib/nim.test.ts
+++ b/src/lib/nim.test.ts
@@ -50,6 +50,17 @@ function hasCurlTimeoutArgs(cmd: string | string[]): boolean {
   return cmd[0] === "curl" && cmd[connectTimeout + 1] === "5" && cmd[maxTime + 1] === "5";
 }
 
+function timeoutForCommand(
+  runCapture: Mock,
+  predicate: (cmd: string | string[]) => boolean,
+): number | undefined {
+  const call = runCapture.mock.calls.find((mockCall) => {
+    const cmd = mockCall[0] as string | string[];
+    return predicate(cmd);
+  });
+  return (call?.[1] as { timeout?: number } | undefined)?.timeout;
+}
+
 describe("nim", () => {
   describe("listModels", () => {
     it("returns 5 models", () => {
@@ -353,6 +364,18 @@ describe("nim", () => {
           true,
         );
         expect(commands.some((c) => c[0] === "curl" && hasCurlTimeoutArgs(c))).toBe(true);
+        expect(
+          timeoutForCommand(
+            runCapture,
+            (c) => Array.isArray(c) && c[0] === "docker" && c.includes("inspect"),
+          ),
+        ).toBe(5000);
+        expect(
+          timeoutForCommand(
+            runCapture,
+            (c) => Array.isArray(c) && c[0] === "curl" && c.includes("http://127.0.0.1:9000/v1/models"),
+          ),
+        ).toBe(6000);
       } finally {
         restore();
       }
@@ -375,6 +398,18 @@ describe("nim", () => {
 
           expect(st).toMatchObject({ running: true, healthy: true, container: "foo", state: "running" });
           expect(commands.some((c) => c[0] === "docker" && c.includes("port"))).toBe(true);
+          expect(
+            timeoutForCommand(
+              runCapture,
+              (c) => Array.isArray(c) && c[0] === "docker" && c.includes("inspect"),
+            ),
+          ).toBe(5000);
+          expect(
+            timeoutForCommand(
+              runCapture,
+              (c) => Array.isArray(c) && c[0] === "docker" && c.includes("port"),
+            ),
+          ).toBe(5000);
         } finally {
           restore();
         }
@@ -410,6 +445,12 @@ describe("nim", () => {
       try {
         const st = nimModule.nimStatusByName("foo");
         expect(st).toMatchObject({ running: false, healthy: false, container: "foo", state: "exited" });
+        expect(
+          timeoutForCommand(
+            runCapture,
+            (c) => Array.isArray(c) && c[0] === "docker" && c.includes("inspect"),
+          ),
+        ).toBe(5000);
         expect(runCapture.mock.calls).toHaveLength(1);
       } finally {
         restore();
diff --git a/src/lib/nim.ts b/src/lib/nim.ts
index 068139a02a..e9a0265429 100644
--- a/src/lib/nim.ts
+++ b/src/lib/nim.ts
@@ -20,6 +20,7 @@ const nimImages = require("../../bin/lib/nim-images.json");
 import { VLLM_PORT } from "./ports";
 
 const UNIFIED_MEMORY_GPU_TAGS = ["GB10", "Thor", "Orin", "Xavier"];
+const NIM_STATUS_PROBE_TIMEOUT_MS = 5000;
 
 export interface NimModel {
   name: string;
@@ -338,14 +339,20 @@ export function nimStatus(sandboxName: string, port?: number): NimStatus {
 
 export function nimStatusByName(name: string, port?: number): NimStatus {
   try {
-    const state = dockerContainerInspectFormat("{{.State.Status}}", name, { ignoreError: true });
+    const state = dockerContainerInspectFormat("{{.State.Status}}", name, {
+      ignoreError: true,
+      timeout: NIM_STATUS_PROBE_TIMEOUT_MS,
+    });
     if (!state) return { running: false, container: name };
 
     let healthy = false;
     if (state === "running") {
       let resolvedHostPort = port != null ? Number(port) : 0;
       if (!resolvedHostPort) {
-        const mapping = dockerPort(name, "8000", { ignoreError: true });
+        const mapping = dockerPort(name, "8000", {
+          ignoreError: true,
+          timeout: NIM_STATUS_PROBE_TIMEOUT_MS,
+        });
         const m = mapping && mapping.match(/:(\d+)\s*$/);
         resolvedHostPort = m ? Number(m[1]) : VLLM_PORT;
       }
@@ -359,7 +366,7 @@ export function nimStatusByName(name: string, port?: number): NimStatus {
           "5",
           `http://127.0.0.1:${resolvedHostPort}/v1/models`,
         ],
-        { ignoreError: true },
+        { ignoreError: true, timeout: NIM_STATUS_PROBE_TIMEOUT_MS + 1000 },
       );
       healthy = !!health;
     }
diff --git a/src/lib/sandbox-version.test.ts b/src/lib/sandbox-version.test.ts
index 9b79eb8008..cf844b5288 100644
--- a/src/lib/sandbox-version.test.ts
+++ b/src/lib/sandbox-version.test.ts
@@ -50,6 +50,7 @@ vi.mock("child_process", async (importOriginal) => {
 import { checkAgentVersion, formatStalenessWarning } from "./sandbox-version.js";
 import * as registry from "./registry.js";
 import { captureOpenshellCommand } from "./openshell.js";
+import { OPENSHELL_PROBE_TIMEOUT_MS } from "./openshell-timeouts.js";
 import { spawnSync } from "child_process";
 
 describe("checkAgentVersion", () => {
@@ -130,6 +131,11 @@ describe("checkAgentVersion", () => {
     expect(result.detectionMethod).toBe("ssh-exec");
     expect(result.sandboxVersion).toBe("2026.4.24");
     expect(result.isStale).toBe(false);
+    expect(captureOpenshellCommand).toHaveBeenCalledWith(
+      "/usr/local/bin/openshell",
+      ["sandbox", "ssh-config", "test-sb"],
+      { ignoreError: true, timeout: OPENSHELL_PROBE_TIMEOUT_MS },
+    );
 
     // Should have cached the version in registry
     const updated = registry.getSandbox("test-sb");
diff --git a/src/lib/sandbox-version.ts b/src/lib/sandbox-version.ts
index 6d24c8c6cd..add8c0d223 100644
--- a/src/lib/sandbox-version.ts
+++ b/src/lib/sandbox-version.ts
@@ -18,6 +18,7 @@ import * as registry from "./registry.js";
 import { loadAgent } from "./agent-defs.js";
 import { resolveOpenshell } from "./resolve-openshell.js";
 import { captureOpenshellCommand } from "./openshell.js";
+import { OPENSHELL_PROBE_TIMEOUT_MS } from "./openshell-timeouts.js";
 
 export interface VersionCheckResult {
   sandboxVersion: string | null;
@@ -49,7 +50,7 @@ export function probeAgentVersion(sandboxName: string): string | null {
   const sshConfigResult = captureOpenshellCommand(
     openshellBinary,
     ["sandbox", "ssh-config", sandboxName],
-    { ignoreError: true },
+    { ignoreError: true, timeout: OPENSHELL_PROBE_TIMEOUT_MS },
   );
   if (sshConfigResult.status !== 0) return null;
 
diff --git a/src/lib/shields.ts b/src/lib/shields.ts
index f53d281cbe..2bac74c4d5 100644
--- a/src/lib/shields.ts
+++ b/src/lib/shields.ts
@@ -279,6 +279,17 @@ function applyStateDirLockMode(sandboxName: string, configDir: string, owner: st
     }
     try {
       kubectlExec(sandboxName, ["chmod", dirMode, dirPath]);
+    } catch {
+      // Silently skip
+    }
+    if (isLocking) {
+      try {
+        kubectlExec(sandboxName, ["chmod", "g-s", dirPath]);
+      } catch {
+        // Best effort; do not skip recursive write stripping.
+      }
+    }
+    try {
       kubectlExec(sandboxName, ["chmod", "-R", writeStrip, dirPath]);
     } catch {
       // Silently skip
@@ -301,6 +312,7 @@ for dir in "$config_dir"/workspace-*; do
   [ -d "$dir" ] || continue
   chown -R "$owner" "$dir" 2>/dev/null || true
   chmod "$dir_mode" "$dir" 2>/dev/null || true
+  if [ "$dir_mode" = "755" ]; then chmod g-s "$dir" 2>/dev/null || true; fi
   chmod -R "$write_strip" "$dir" 2>/dev/null || true
 done
 `,
@@ -483,6 +495,11 @@ function lockAgentConfig(
   } catch {
     errors.push("chmod 755 config dir");
   }
+  try {
+    kubectlExec(sandboxName, ["chmod", "g-s", target.configDir]);
+  } catch {
+    errors.push("chmod g-s config dir");
+  }
 
   try {
     kubectlExec(sandboxName, ["chown", "root:root", target.configDir]);
diff --git a/src/nemoclaw.ts b/src/nemoclaw.ts
index 82689a4790..fd375797bf 100644
--- a/src/nemoclaw.ts
+++ b/src/nemoclaw.ts
@@ -128,6 +128,8 @@ type SandboxCommandResult = {
   stderr: string;
 };
 
+const SANDBOX_EXEC_STARTED_MARKER = "__NEMOCLAW_SANDBOX_EXEC_STARTED__";
+
 type RecoveredSandboxMetadata = Partial<
   Pick<SandboxEntry, "model" | "provider" | "gpuEnabled" | "policies" | "nimContainer" | "agent">
 > & {
@@ -245,6 +247,7 @@ function executeSandboxCommand(sandboxName: string, command: string): SandboxCom
     timeout: OPENSHELL_PROBE_TIMEOUT_MS,
   });
   if (sshConfigResult.status !== 0) return null;
+  if (!sshConfigResult.output.trim()) return null;
 
   const tmpFile = path.join(os.tmpdir(), `nemoclaw-ssh-${process.pid}-${Date.now()}.conf`);
   fs.writeFileSync(tmpFile, sshConfigResult.output, { mode: 0o600 });
@@ -283,6 +286,46 @@ function executeSandboxCommand(sandboxName: string, command: string): SandboxCom
   }
 }
 
+function executeSandboxExecCommand(
+  sandboxName: string,
+  command: string,
+  timeout = 15000,
+): SandboxCommandResult | null {
+  const markedCommand = `printf '%s\\n' '${SANDBOX_EXEC_STARTED_MARKER}'; ${command}`;
+  try {
+    const result = spawnSync(
+      getOpenshellBinary(),
+      ["sandbox", "exec", "--name", sandboxName, "--", "sh", "-c", markedCommand],
+      {
+        cwd: ROOT,
+        encoding: "utf-8",
+        env: process.env,
+        stdio: ["ignore", "pipe", "pipe"],
+        timeout,
+      },
+    );
+    const stdout = (result.stdout || "").trim();
+    const stdoutLines = stdout.split(/\r?\n/);
+    const markerIndex = stdoutLines.indexOf(SANDBOX_EXEC_STARTED_MARKER);
+    if (markerIndex === -1) return null;
+    const commandStdoutLines = stdoutLines.slice(markerIndex + 1);
+    return {
+      status: result.error ? 1 : (result.status ?? 1),
+      stdout: commandStdoutLines.join("\n").trim(),
+      stderr: (result.stderr || "").trim(),
+    };
+  } catch {
+    return null;
+  }
+}
+
+function parseSandboxGatewayProbe(result: SandboxCommandResult | null): boolean | null {
+  if (!result) return null;
+  if (result.stdout === "RUNNING") return true;
+  if (result.stdout === "STOPPED") return false;
+  return null;
+}
+
 /**
  * Check whether the OpenClaw gateway process is running inside the sandbox.
  * Uses the gateway's HTTP endpoint (dashboard port) as the source of truth,
@@ -292,62 +335,72 @@ function executeSandboxCommand(sandboxName: string, command: string): SandboxCom
 function isSandboxGatewayRunning(sandboxName: string): boolean | null {
   const agent = agentRuntime.getSessionAgent(sandboxName);
   const probeUrl = agentRuntime.getHealthProbeUrl(agent);
-  const result = executeSandboxCommand(
-    sandboxName,
-    `curl -sf --max-time 3 ${shellQuote(probeUrl)} > /dev/null 2>&1 && echo RUNNING || echo STOPPED`,
-  );
-  if (!result) return null;
-  if (result.stdout === "RUNNING") return true;
-  if (result.stdout === "STOPPED") return false;
-  return null;
+  const command = `curl -sf --max-time 3 ${shellQuote(probeUrl)} > /dev/null 2>&1 && echo RUNNING || echo STOPPED`;
+  const execProbe = parseSandboxGatewayProbe(executeSandboxExecCommand(sandboxName, command));
+  if (execProbe !== null) return execProbe;
+  return parseSandboxGatewayProbe(executeSandboxCommand(sandboxName, command));
 }
 
 /**
- * Restart the OpenClaw gateway process inside the sandbox after a pod restart.
+ * Restart the gateway process inside the sandbox after a pod restart.
  * Cleans stale lock/temp files, sources proxy config, and launches the gateway
  * in the background. Returns true on success.
  */
 function recoverSandboxProcesses(sandboxName: string): boolean {
   const agent = agentRuntime.getSessionAgent(sandboxName);
   const agentScript = agentRuntime.buildRecoveryScript(agent, agent?.forwardPort ?? DASHBOARD_PORT);
-  const script =
-    agentScript ||
-    [
-      // Source /tmp/nemoclaw-proxy-env.sh explicitly so NODE_OPTIONS preload
-      // guards (safety-net, ciao, slack, …) survive gateway respawn. Without
-      // this, library errors crash-loop the gateway because the original
-      // .bashrc-only path silently failed when the env file was unreadable
-      // or the shell did not source ~/.bashrc. See #2478. Mirrors the
-      // hardened block in src/lib/agent-runtime.ts:buildRecoveryScript.
-      // Defer warning emission until AFTER touch+chmod gateway.log so
-      // warnings land in the persistent log a sysadmin would tail. Stderr
-      // alone hides them because executeSandboxCommand captures stderr
-      // without surfacing it. Mirrors src/lib/agent-runtime.ts.
-      "if [ -r /tmp/nemoclaw-proxy-env.sh ]; then . /tmp/nemoclaw-proxy-env.sh; _PE_MISSING=0; else _PE_MISSING=1; fi;",
-      "[ -f ~/.bashrc ] && . ~/.bashrc;",
-      'case "${NODE_OPTIONS:-}" in *nemoclaw-sandbox-safety-net*) _SN_MISSING=0 ;; *) _SN_MISSING=1 ;; esac; case "${NODE_OPTIONS:-}" in *nemoclaw-ciao-network-guard*) _CIAO_MISSING=0 ;; *) _CIAO_MISSING=1 ;; esac; if [ "$_SN_MISSING" = "0" ] && [ "$_CIAO_MISSING" = "0" ]; then _GUARDS_MISSING=0; else _GUARDS_MISSING=1; fi;',
-      `if curl -sf --max-time 3 http://127.0.0.1:${DASHBOARD_PORT}/ > /dev/null 2>&1; then echo ALREADY_RUNNING; exit 0; fi;`,
-      "rm -rf /tmp/openclaw-*/gateway.*.lock 2>/dev/null;",
-      "rm -f /tmp/gateway.log /tmp/auto-pair.log;",
-      "touch /tmp/gateway.log; chmod 600 /tmp/gateway.log;",
-      "touch /tmp/auto-pair.log; chmod 600 /tmp/auto-pair.log;",
-      '[ "$_PE_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: /tmp/nemoclaw-proxy-env.sh missing — gateway launching without library guards (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };',
-      '[ "$_GUARDS_MISSING" = "1" ] && { _W="[gateway-recovery] WARNING: NODE_OPTIONS missing safety-net preload or ciao preload — gateway may crash on unhandled library errors (#2478)"; echo "$_W" >&2; echo "$_W" >> /tmp/gateway.log; };',
-      'OPENCLAW="$(command -v openclaw)";',
-      'if [ -z "$OPENCLAW" ]; then echo OPENCLAW_MISSING; exit 1; fi;',
-      // Append rather than truncate so [gateway-recovery] WARNING lines
-      // written above survive past the launch. (#2478)
-      `nohup "$OPENCLAW" gateway run --port ${DASHBOARD_PORT} >> /tmp/gateway.log 2>&1 &`,
-      "GPID=$!; sleep 2;",
-      'if kill -0 "$GPID" 2>/dev/null; then echo "GATEWAY_PID=$GPID"; else echo GATEWAY_FAILED; cat /tmp/gateway.log 2>/dev/null | tail -5; fi',
-    ].join(" ");
-
-  const result = executeSandboxCommand(sandboxName, script);
-  if (!result) return false;
-  return (
-    result.status === 0 &&
-    (result.stdout.includes("GATEWAY_PID=") || result.stdout.includes("ALREADY_RUNNING"))
+  const hasRecoveryMarker = (result: SandboxCommandResult | null) =>
+    !!(
+      result &&
+      (result.stdout.includes("GATEWAY_PID=") || result.stdout.includes("ALREADY_RUNNING"))
+    );
+  const recoveredSsh = (result: SandboxCommandResult | null) =>
+    !!(result && result.status === 0 && hasRecoveryMarker(result));
+
+  if (agentScript) {
+    // Non-OpenClaw manifests do not yet declare a runtime user for root
+    // sandbox exec. Recover them over SSH so the launch inherits the sandbox
+    // login user instead of creating root-owned agent state under /sandbox.
+    return recoveredSsh(executeSandboxCommand(sandboxName, agentScript));
+  }
+
+  const script = agentRuntime.buildOpenClawRecoveryScript(DASHBOARD_PORT);
+  const execResult = executeSandboxExecCommand(sandboxName, script, 30000);
+  if (hasRecoveryMarker(execResult)) return true;
+  if (execResult !== null) return false;
+  return recoveredSsh(executeSandboxCommand(sandboxName, script));
+}
+
+function readNonNegativeNumberEnv(name: string, fallback: number): number {
+  const raw = process.env[name];
+  if (raw === undefined || raw.trim() === "") return fallback;
+  const parsed = Number(raw);
+  return Number.isFinite(parsed) && parsed >= 0 ? parsed : fallback;
+}
+
+function waitForRecoveredSandboxGateway(sandboxName: string): boolean {
+  const timeoutSeconds = readNonNegativeNumberEnv(
+    "NEMOCLAW_GATEWAY_RECOVERY_WAIT_SECONDS",
+    30,
   );
+  const intervalSeconds = readNonNegativeNumberEnv(
+    "NEMOCLAW_GATEWAY_RECOVERY_POLL_INTERVAL_SECONDS",
+    3,
+  );
+  const attempts =
+    intervalSeconds > 0
+      ? Math.max(1, Math.floor(timeoutSeconds / intervalSeconds) + 1)
+      : Math.max(1, Math.floor(timeoutSeconds) + 1);
+
+  for (let attempt = 0; attempt < attempts; attempt += 1) {
+    if (isSandboxGatewayRunning(sandboxName) === true) {
+      return true;
+    }
+    if (attempt < attempts - 1) {
+      sleepSeconds(intervalSeconds);
+    }
+  }
+  return false;
 }
 
 /**
@@ -392,9 +445,9 @@ function checkAndRecoverSandboxProcesses(
 
   const recovered = recoverSandboxProcesses(sandboxName);
   if (recovered) {
-    // Wait for gateway to bind its HTTP port before declaring success
-    sleepSeconds(3);
-    if (isSandboxGatewayRunning(sandboxName) !== true) {
+    // Wait for gateway to bind its HTTP port before declaring success. The
+    // recovered process can be alive before the OpenAI-compatible API is ready.
+    if (!waitForRecoveredSandboxGateway(sandboxName)) {
       if (!quiet) {
         console.error("  Gateway process started but is not responding.");
         console.error("  Check /tmp/gateway.log inside the sandbox for details.");
@@ -1233,10 +1286,90 @@ function printSandboxActionUsage(action: string): void {
 
 // ── Sandbox-scoped actions ───────────────────────────────────────
 
-async function sandboxConnect(sandboxName: string) {
+type SandboxConnectOptions = {
+  probeOnly?: boolean;
+};
+
+const SANDBOX_CONNECT_FLAGS = new Set(["--dangerously-skip-permissions", "--probe-only", "--help", "-h"]);
+
+function isSandboxConnectFlag(arg: string | undefined): boolean {
+  return typeof arg === "string" && SANDBOX_CONNECT_FLAGS.has(arg);
+}
+
+function printSandboxConnectHelp(sandboxName = "<name>") {
+  console.log("");
+  console.log(`  Usage: ${CLI_NAME} ${sandboxName} connect [--probe-only]`);
+  console.log("");
+  console.log("  Options:");
+  console.log(
+    "    --probe-only                    Run recovery checks and exit without opening SSH",
+  );
+  console.log("    -h, --help                      Show this help");
+  console.log("");
+}
+
+function parseSandboxConnectArgs(sandboxName: string, actionArgs: string[]): SandboxConnectOptions {
+  const options: SandboxConnectOptions = {};
+  for (const arg of actionArgs) {
+    if (!isSandboxConnectFlag(arg)) {
+      console.error(`  Unknown flag for connect: ${arg}`);
+      printSandboxConnectHelp(sandboxName);
+      process.exit(1);
+    }
+    switch (arg) {
+      case "--dangerously-skip-permissions":
+        console.error("  --dangerously-skip-permissions was removed; use shields commands instead.");
+        printSandboxConnectHelp(sandboxName);
+        process.exit(1);
+      case "--probe-only":
+        options.probeOnly = true;
+        break;
+      case "--help":
+      case "-h":
+        printSandboxConnectHelp(sandboxName);
+        process.exit(0);
+        break;
+    }
+  }
+  return options;
+}
+
+function runSandboxConnectProbe(sandboxName: string): void {
+  const processCheck = checkAndRecoverSandboxProcesses(sandboxName, { quiet: true });
+  const agent = agentRuntime.getSessionAgent(sandboxName);
+  const agentName = agentRuntime.getAgentDisplayName(agent);
+  if (!processCheck.checked) {
+    console.error(
+      `  Probe failed: could not inspect the ${agentName} gateway inside sandbox '${sandboxName}'.`,
+    );
+    process.exit(1);
+  }
+  if (processCheck.wasRunning) {
+    console.log(`  Probe complete: ${agentName} gateway is running in '${sandboxName}'.`);
+    return;
+  }
+  if (processCheck.recovered) {
+    console.log(`  Probe complete: recovered ${agentName} gateway in '${sandboxName}'.`);
+    return;
+  }
+  console.error(
+    `  Probe failed: ${agentName} gateway is not running in '${sandboxName}' and automatic recovery failed.`,
+  );
+  console.error("  Check /tmp/gateway.log inside the sandbox for details.");
+  process.exit(1);
+}
+
+async function sandboxConnect(
+  sandboxName: string,
+  { probeOnly = false }: SandboxConnectOptions = {},
+) {
   const { isSandboxReady, parseSandboxStatus } = require("./lib/onboard");
   await ensureLiveSandboxOrExit(sandboxName, { allowNonReadyPhase: true });
 
+  if (probeOnly) {
+    return runSandboxConnectProbe(sandboxName);
+  }
+
   // Version staleness check — warn but don't block
   try {
     const versionCheck = sandboxVersion.checkAgentVersion(sandboxName);
@@ -4682,11 +4815,24 @@ const [cmd, ...args] = process.argv.slice(2);
   }
 
   // Sandbox-scoped commands: nemoclaw <name> <action>
+  const firstSandboxArg = args[0];
+  const implicitConnectArg = isSandboxConnectFlag(firstSandboxArg);
+  const requestedSandboxAction =
+    !firstSandboxArg || implicitConnectArg ? "connect" : firstSandboxArg;
+  const requestedSandboxActionArgs = !firstSandboxArg || implicitConnectArg ? args : args.slice(1);
+  if (
+    requestedSandboxAction === "connect" &&
+    requestedSandboxActionArgs.some((arg) => arg === "--help" || arg === "-h")
+  ) {
+    validateName(cmd, "sandbox name");
+    printSandboxConnectHelp(cmd);
+    return;
+  }
+
   // If the registry doesn't know this name but the action is a sandbox-scoped
   // command, attempt recovery — the sandbox may still be live with a stale registry.
   // Derived from command registry — single source of truth
   const sandboxActions = sandboxActionTokens();
-  const requestedSandboxAction = args[0] || "connect";
   if (!registry.getSandbox(cmd) && sandboxActions.includes(requestedSandboxAction)) {
     validateName(cmd, "sandbox name");
     await recoverRegistryEntries({ requestedSandboxName: cmd });
@@ -4730,28 +4876,12 @@ const [cmd, ...args] = process.argv.slice(2);
   const sandbox = registry.getSandbox(cmd);
   if (sandbox) {
     validateName(cmd, "sandbox name");
-    const action = args[0] || "connect";
-    const actionArgs = args.slice(1);
+    const action = requestedSandboxAction;
+    const actionArgs = requestedSandboxActionArgs;
 
     switch (action) {
       case "connect":
-        if (actionArgs.length > 0) {
-          console.error(
-            `  Unknown connect argument${actionArgs.length === 1 ? "" : "s"}: ${actionArgs.join(" ")}`,
-          );
-          if (actionArgs.includes("--dangerously-skip-permissions")) {
-            console.error(
-              "  --dangerously-skip-permissions was removed; use shields commands instead.",
-            );
-          }
-          const reorderedCandidate = findRegisteredSandboxName(actionArgs);
-          if (reorderedCandidate) {
-            printConnectOrderHint(reorderedCandidate);
-          }
-          console.error(`  Usage: ${CLI_NAME} <name> connect`);
-          process.exit(1);
-        }
-        await sandboxConnect(cmd);
+        await sandboxConnect(cmd, parseSandboxConnectArgs(cmd, actionArgs));
         break;
       case "status":
         if (hasHelpFlag(actionArgs)) {
diff --git a/test/cli.test.ts b/test/cli.test.ts
index 448d5ed936..88611a055a 100644
--- a/test/cli.test.ts
+++ b/test/cli.test.ts
@@ -94,7 +94,41 @@ function readRecordedArgs(markerFile: string): string[] {
   return fs.readFileSync(markerFile, "utf8").trim().split(/\s+/);
 }
 
-function writeSandboxRegistry(home: string, sandboxName = "alpha"): void {
+type SandboxEntry = {
+  name: string;
+  model: string;
+  provider: string;
+  gpuEnabled: boolean;
+  policies: string[];
+  agent?: string;
+};
+
+function writeRecordingCommand(
+  binDir: string,
+  command: string,
+  markerFile: string,
+  exitCode: number,
+): void {
+  fs.writeFileSync(
+    path.join(binDir, command),
+    [
+      "#!/usr/bin/env bash",
+      `printf '%s\\n' "$*" >> ${JSON.stringify(markerFile)}`,
+      `exit ${exitCode}`,
+    ].join("\n"),
+    { mode: 0o755 },
+  );
+}
+
+function writeSandboxRegistry(
+  home: string,
+  sandboxNameOrOverrides: string | Partial<SandboxEntry> = "alpha",
+  sandboxOverridesArg: Partial<SandboxEntry> = {},
+): void {
+  const sandboxName =
+    typeof sandboxNameOrOverrides === "string" ? sandboxNameOrOverrides : "alpha";
+  const sandboxOverrides =
+    typeof sandboxNameOrOverrides === "string" ? sandboxOverridesArg : sandboxNameOrOverrides;
   const registryDir = path.join(home, ".nemoclaw");
   fs.mkdirSync(registryDir, { recursive: true });
   fs.writeFileSync(
@@ -107,6 +141,7 @@ function writeSandboxRegistry(home: string, sandboxName = "alpha"): void {
           provider: "nvidia-prod",
           gpuEnabled: false,
           policies: [],
+          ...sandboxOverrides,
         },
       },
       defaultSandbox: sandboxName,
@@ -212,6 +247,19 @@ function createDoctorTestSetup(prefix: string, openshellLines: string[], sandbox
   };
 }
 
+function createCloudflaredServiceDir(prefix: string): { sandboxName: string; serviceDir: string } {
+  const suffix = [
+    process.pid.toString(36),
+    Date.now().toString(36),
+    Math.random().toString(36).slice(2, 10),
+  ].join("-");
+  const sandboxName = `${prefix}${suffix}`;
+  const serviceDir = path.join("/tmp", `nemoclaw-services-${sandboxName}`);
+  fs.rmSync(serviceDir, { recursive: true, force: true });
+  fs.mkdirSync(serviceDir, { recursive: true });
+  return { sandboxName, serviceDir };
+}
+
 function createDebugCommandTestEnv(prefix: string): Record<string, string> {
   const home = fs.mkdtempSync(path.join(os.tmpdir(), prefix));
   const localBin = path.join(home, "bin");
@@ -919,7 +967,7 @@ describe("CLI dispatch", () => {
   });
 
   it("doctor treats a live non-cloudflared PID as stale", () => {
-    const sandboxName = `doctorpid-${process.pid}`;
+    const { sandboxName, serviceDir } = createCloudflaredServiceDir("doctorpid-");
     const setup = createDoctorTestSetup(
       "nemoclaw-cli-doctor-wrong-cloudflared-pid-",
       [
@@ -932,9 +980,6 @@ describe("CLI dispatch", () => {
       ],
       sandboxName,
     );
-    const serviceDir = path.join("/tmp", `nemoclaw-services-${sandboxName}`);
-    fs.rmSync(serviceDir, { recursive: true, force: true });
-    fs.mkdirSync(serviceDir, { recursive: true });
     const sleeper = spawn(process.execPath, ["-e", "setTimeout(() => {}, 30000)"], {
       stdio: "ignore",
     });
@@ -964,7 +1009,7 @@ describe("CLI dispatch", () => {
   });
 
   it("doctor accepts a live cloudflared PID", () => {
-    const sandboxName = `doctorcloudflared-${process.pid}`;
+    const { sandboxName, serviceDir } = createCloudflaredServiceDir("doctorcloudflared-");
     const setup = createDoctorTestSetup(
       "nemoclaw-cli-doctor-cloudflared-pid-",
       [
@@ -977,11 +1022,8 @@ describe("CLI dispatch", () => {
       ],
       sandboxName,
     );
-    const serviceDir = path.join("/tmp", `nemoclaw-services-${sandboxName}`);
     const shimDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cloudflared-shim-"));
     const cloudflaredBin = path.join(shimDir, "cloudflared");
-    fs.rmSync(serviceDir, { recursive: true, force: true });
-    fs.mkdirSync(serviceDir, { recursive: true });
     fs.symlinkSync(process.execPath, cloudflaredBin);
     const sleeper = spawn(cloudflaredBin, ["-e", "setTimeout(() => {}, 30000)"], {
       stdio: "ignore",
@@ -1887,6 +1929,464 @@ describe("CLI dispatch", () => {
     expect(calls.some((call) => call.startsWith("forward start --background 18789"))).toBe(false);
   });
 
+  it("shows connect help without opening an interactive session", () => {
+    const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-help-"));
+    const localBin = path.join(home, "bin");
+    const markerFile = path.join(home, "openshell-calls");
+    const sshMarkerFile = path.join(home, "ssh-calls");
+    fs.mkdirSync(localBin, { recursive: true });
+    fs.writeFileSync(
+      path.join(localBin, "openshell"),
+      [
+        "#!/usr/bin/env bash",
+        `printf '%s\\n' "$*" >> ${JSON.stringify(markerFile)}`,
+        "exit 99",
+      ].join("\n"),
+      { mode: 0o755 },
+    );
+    writeRecordingCommand(localBin, "ssh", sshMarkerFile, 98);
+
+    const r = runWithEnv("alpha connect --help", {
+      HOME: home,
+      PATH: `${localBin}:${process.env.PATH || ""}`,
+    });
+    const implicit = runWithEnv("alpha --help", {
+      HOME: home,
+      PATH: `${localBin}:${process.env.PATH || ""}`,
+    });
+
+    expect(r.code).toBe(0);
+    expect(r.out).toContain("Usage: nemoclaw alpha connect");
+    expect(r.out).toContain("--probe-only");
+    expect(implicit.code).toBe(0);
+    expect(implicit.out).toContain("Usage: nemoclaw alpha connect");
+    expect(fs.existsSync(markerFile)).toBe(false);
+    expect(fs.existsSync(sshMarkerFile)).toBe(false);
+  });
+
+  it("rejects the removed skip-permissions connect flag", () => {
+    const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-flags-"));
+    const localBin = path.join(home, "bin");
+    const markerFile = path.join(home, "openshell-calls");
+    const sshMarkerFile = path.join(home, "ssh-calls");
+    fs.mkdirSync(localBin, { recursive: true });
+    fs.writeFileSync(
+      path.join(localBin, "openshell"),
+      [
+        "#!/usr/bin/env bash",
+        `printf '%s\\n' "$*" >> ${JSON.stringify(markerFile)}`,
+        "exit 99",
+      ].join("\n"),
+      { mode: 0o755 },
+    );
+    writeRecordingCommand(localBin, "ssh", sshMarkerFile, 98);
+    writeSandboxRegistry(home);
+
+    const r = runWithEnv("alpha connect --dangerously-skip-permissions", {
+      HOME: home,
+      PATH: `${localBin}:${process.env.PATH || ""}`,
+    });
+
+    expect(r.code).toBe(1);
+    expect(r.out).toContain("--dangerously-skip-permissions was removed");
+    expect(fs.existsSync(markerFile)).toBe(false);
+    expect(fs.existsSync(sshMarkerFile)).toBe(false);
+  });
+
+  it("connect --probe-only recovers the gateway without opening SSH", () => {
+    const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-"));
+    const localBin = path.join(home, "bin");
+    const markerFile = path.join(home, "openshell-calls");
+    const sshMarkerFile = path.join(home, "ssh-calls");
+    const stateFile = path.join(home, "probe-state");
+    fs.mkdirSync(localBin, { recursive: true });
+    writeSandboxRegistry(home);
+    fs.writeFileSync(stateFile, "stopped");
+    fs.writeFileSync(
+      path.join(localBin, "openshell"),
+      [
+        "#!/usr/bin/env bash",
+        `marker_file=${JSON.stringify(markerFile)}`,
+        `state_file=${JSON.stringify(stateFile)}`,
+        'printf \'%s\\n\' "$*" >> "$marker_file"',
+        'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then',
+        "  echo 'Sandbox:'",
+        "  echo",
+        "  echo '  Id: abc'",
+        "  echo '  Name: alpha'",
+        "  echo '  Namespace: openshell'",
+        "  echo '  Phase: Ready'",
+        "  exit 0",
+        "fi",
+        'if [ "$1" = "sandbox" ] && [ "$2" = "exec" ] && [ "$3" = "--name" ] && [ "$4" = "alpha" ]; then',
+        '  cmd="$8"',
+        '  case "$cmd" in',
+        '    *"OPENCLAW="*)',
+        '      echo recovered > "$state_file"',
+        "      echo '__NEMOCLAW_SANDBOX_EXEC_STARTED__'",
+        "      echo 'GATEWAY_PID=123'",
+        "      exit 42",
+        "      ;;",
+        "    *'curl -sf'*)",
+        "      echo '__NEMOCLAW_SANDBOX_EXEC_STARTED__'",
+        '      if [ "$(cat "$state_file")" = recovered ]; then echo RUNNING; else echo STOPPED; fi',
+        "      exit 0",
+        "      ;;",
+        "  esac",
+        "fi",
+        "exit 0",
+      ].join("\n"),
+      { mode: 0o755 },
+    );
+    writeRecordingCommand(localBin, "ssh", sshMarkerFile, 98);
+
+    const r = runWithEnv("alpha connect --probe-only", {
+      HOME: home,
+      PATH: `${localBin}:${process.env.PATH || ""}`,
+    });
+
+    expect(r.code).toBe(0);
+    expect(r.out).toContain("Probe complete: recovered OpenClaw gateway");
+    const calls = fs.readFileSync(markerFile, "utf8").trim().split("\n").filter(Boolean);
+    expect(calls).toContain("sandbox get alpha");
+    expect(calls.some((call) => call.startsWith("sandbox exec --name alpha -- sh -c"))).toBe(true);
+    expect(calls).not.toContain("sandbox ssh-config alpha");
+    expect(calls).not.toContain("sandbox connect alpha");
+    expect(fs.existsSync(sshMarkerFile)).toBe(false);
+  });
+
+  it("waits for recovered gateway health before failing probe-only", () => {
+    const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-wait-"));
+    const localBin = path.join(home, "bin");
+    const markerFile = path.join(home, "openshell-calls");
+    const stateFile = path.join(home, "probe-state");
+    const readyCountFile = path.join(home, "ready-count");
+    fs.mkdirSync(localBin, { recursive: true });
+    writeSandboxRegistry(home);
+    fs.writeFileSync(stateFile, "stopped");
+    fs.writeFileSync(
+      path.join(localBin, "openshell"),
+      [
+        "#!/usr/bin/env bash",
+        `marker_file=${JSON.stringify(markerFile)}`,
+        `state_file=${JSON.stringify(stateFile)}`,
+        `ready_count_file=${JSON.stringify(readyCountFile)}`,
+        'printf \'%s\\n\' "$*" >> "$marker_file"',
+        'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then',
+        "  echo 'Sandbox:'",
+        "  echo",
+        "  echo '  Id: abc'",
+        "  echo '  Name: alpha'",
+        "  echo '  Namespace: openshell'",
+        "  echo '  Phase: Ready'",
+        "  exit 0",
+        "fi",
+        'if [ "$1" = "sandbox" ] && [ "$2" = "exec" ] && [ "$3" = "--name" ] && [ "$4" = "alpha" ]; then',
+        '  cmd="$8"',
+        '  case "$cmd" in',
+        '    *"OPENCLAW="*)',
+        '      echo recovered > "$state_file"',
+        "      echo '__NEMOCLAW_SANDBOX_EXEC_STARTED__'",
+        "      echo 'GATEWAY_PID=123'",
+        "      exit 0",
+        "      ;;",
+        "    *'curl -sf'*)",
+        "      echo '__NEMOCLAW_SANDBOX_EXEC_STARTED__'",
+        '      if [ "$(cat "$state_file")" != recovered ]; then echo STOPPED; exit 0; fi',
+        '      count=$(cat "$ready_count_file" 2>/dev/null || echo 0)',
+        "      count=$((count + 1))",
+        '      echo "$count" > "$ready_count_file"',
+        '      if [ "$count" -ge 3 ]; then echo RUNNING; else echo STOPPED; fi',
+        "      exit 0",
+        "      ;;",
+        "  esac",
+        "fi",
+        'if [ "$1" = "forward" ]; then',
+        "  exit 0",
+        "fi",
+        "exit 0",
+      ].join("\n"),
+      { mode: 0o755 },
+    );
+
+    const r = runWithEnv("alpha connect --probe-only", {
+      HOME: home,
+      PATH: `${localBin}:${process.env.PATH || ""}`,
+      NEMOCLAW_GATEWAY_RECOVERY_WAIT_SECONDS: "3",
+      NEMOCLAW_GATEWAY_RECOVERY_POLL_INTERVAL_SECONDS: "0",
+    });
+
+    expect(r.code).toBe(0);
+    expect(r.out).toContain("Probe complete: recovered OpenClaw gateway");
+    expect(fs.readFileSync(readyCountFile, "utf8").trim()).toBe("3");
+  });
+
+  it("treats leading --probe-only as an implicit connect probe", () => {
+    const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-leading-"));
+    const localBin = path.join(home, "bin");
+    const markerFile = path.join(home, "openshell-calls");
+    const sshMarkerFile = path.join(home, "ssh-calls");
+    fs.mkdirSync(localBin, { recursive: true });
+    writeSandboxRegistry(home);
+    fs.writeFileSync(
+      path.join(localBin, "openshell"),
+      [
+        "#!/usr/bin/env bash",
+        `marker_file=${JSON.stringify(markerFile)}`,
+        'printf \'%s\\n\' "$*" >> "$marker_file"',
+        'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then',
+        "  echo 'Sandbox:'",
+        "  echo",
+        "  echo '  Id: abc'",
+        "  echo '  Name: alpha'",
+        "  echo '  Namespace: openshell'",
+        "  echo '  Phase: Ready'",
+        "  exit 0",
+        "fi",
+        'if [ "$1" = "sandbox" ] && [ "$2" = "exec" ] && [ "$3" = "--name" ] && [ "$4" = "alpha" ]; then',
+        '  cmd="$8"',
+        '  if [[ "$cmd" == *"curl -sf"* ]]; then echo "__NEMOCLAW_SANDBOX_EXEC_STARTED__"; echo RUNNING; exit 0; fi',
+        '  if [[ "$cmd" == *"OPENCLAW="* ]]; then echo "__NEMOCLAW_SANDBOX_EXEC_STARTED__"; echo UNEXPECTED_RECOVERY; exit 1; fi',
+        "fi",
+        "exit 0",
+      ].join("\n"),
+      { mode: 0o755 },
+    );
+    writeRecordingCommand(localBin, "ssh", sshMarkerFile, 98);
+
+    const r = runWithEnv("alpha --probe-only", {
+      HOME: home,
+      PATH: `${localBin}:${process.env.PATH || ""}`,
+    });
+
+    expect(r.code).toBe(0);
+    expect(r.out).toContain("Probe complete: OpenClaw gateway is running");
+    const calls = fs.readFileSync(markerFile, "utf8").trim().split("\n").filter(Boolean);
+    expect(calls).toContain("sandbox get alpha");
+    expect(calls.some((call) => call.startsWith("sandbox exec --name alpha -- sh -c"))).toBe(true);
+    expect(calls).not.toContain("sandbox ssh-config alpha");
+    expect(calls).not.toContain("sandbox connect alpha");
+    expect(fs.existsSync(sshMarkerFile)).toBe(false);
+  });
+
+  it("connect --probe-only does not retry a failed sandbox exec recovery over SSH", () => {
+    const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-no-ssh-"));
+    const localBin = path.join(home, "bin");
+    const markerFile = path.join(home, "openshell-calls");
+    const sshMarkerFile = path.join(home, "ssh-calls");
+    fs.mkdirSync(localBin, { recursive: true });
+    writeSandboxRegistry(home);
+    fs.writeFileSync(
+      path.join(localBin, "openshell"),
+      [
+        "#!/usr/bin/env bash",
+        `marker_file=${JSON.stringify(markerFile)}`,
+        'printf \'%s\\n\' "$*" >> "$marker_file"',
+        'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then',
+        "  echo 'Sandbox:'",
+        "  echo",
+        "  echo '  Id: abc'",
+        "  echo '  Name: alpha'",
+        "  echo '  Namespace: openshell'",
+        "  echo '  Phase: Ready'",
+        "  exit 0",
+        "fi",
+        'if [ "$1" = "sandbox" ] && [ "$2" = "exec" ] && [ "$3" = "--name" ] && [ "$4" = "alpha" ]; then',
+        '  cmd="$8"',
+        '  if [[ "$cmd" == *"OPENCLAW="* ]]; then echo "__NEMOCLAW_SANDBOX_EXEC_STARTED__"; echo RECOVERY_FAILED >&2; exit 42; fi',
+        '  if [[ "$cmd" == *"curl -sf"* ]]; then echo "__NEMOCLAW_SANDBOX_EXEC_STARTED__"; echo STOPPED; exit 0; fi',
+        "fi",
+        'if [ "$1" = "sandbox" ] && [ "$2" = "ssh-config" ]; then',
+        "  echo 'Host openshell-alpha'",
+        "  exit 0",
+        "fi",
+        "exit 0",
+      ].join("\n"),
+      { mode: 0o755 },
+    );
+    writeRecordingCommand(localBin, "ssh", sshMarkerFile, 98);
+
+    const r = runWithEnv("alpha connect --probe-only", {
+      HOME: home,
+      PATH: `${localBin}:${process.env.PATH || ""}`,
+    });
+
+    expect(r.code).toBe(1);
+    const calls = fs.readFileSync(markerFile, "utf8").trim().split("\n").filter(Boolean);
+    expect(calls).toContain("sandbox get alpha");
+    expect(calls.some((call) => call.startsWith("sandbox exec --name alpha -- sh -c"))).toBe(true);
+    expect(calls).not.toContain("sandbox ssh-config alpha");
+    expect(fs.existsSync(sshMarkerFile)).toBe(false);
+  });
+
+  it("connect --probe-only falls back to SSH when sandbox exec never starts", () => {
+    const home = fs.mkdtempSync(
+      path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-exec-fallback-"),
+    );
+    const localBin = path.join(home, "bin");
+    const openshellCalls = path.join(home, "openshell-calls");
+    const sshCalls = path.join(home, "ssh-calls");
+    const stateFile = path.join(home, "probe-state");
+    fs.mkdirSync(localBin, { recursive: true });
+    writeSandboxRegistry(home);
+    fs.writeFileSync(stateFile, "stopped");
+    fs.writeFileSync(
+      path.join(localBin, "openshell"),
+      [
+        "#!/usr/bin/env bash",
+        `calls=${JSON.stringify(openshellCalls)}`,
+        'printf \'%s\\n\' "$*" >> "$calls"',
+        'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then',
+        "  echo 'Sandbox:'",
+        "  echo",
+        "  echo '  Id: abc'",
+        "  echo '  Name: alpha'",
+        "  echo '  Namespace: openshell'",
+        "  echo '  Phase: Ready'",
+        "  exit 0",
+        "fi",
+        'if [ "$1" = "sandbox" ] && [ "$2" = "exec" ]; then',
+        "  echo 'error: sandbox exec transport failed before command start' >&2",
+        "  exit 2",
+        "fi",
+        'if [ "$1" = "sandbox" ] && [ "$2" = "ssh-config" ] && [ "$3" = "alpha" ]; then',
+        "  echo 'Host openshell-alpha'",
+        "  echo '  HostName 127.0.0.1'",
+        "  echo '  User sandbox'",
+        "  exit 0",
+        "fi",
+        "exit 0",
+      ].join("\n"),
+      { mode: 0o755 },
+    );
+    fs.writeFileSync(
+      path.join(localBin, "ssh"),
+      [
+        "#!/usr/bin/env bash",
+        `calls=${JSON.stringify(sshCalls)}`,
+        `state_file=${JSON.stringify(stateFile)}`,
+        'cmd="${@: -1}"',
+        'printf \'ARGS %s\\n\' "$*" >> "$calls"',
+        'printf \'CMD %s\\n\' "$cmd" >> "$calls"',
+        'if [[ "$cmd" == *"OPENCLAW="* ]]; then',
+        '  echo recovered > "$state_file"',
+        "  echo 'GATEWAY_PID=456'",
+        "  exit 0",
+        "fi",
+        'if [[ "$cmd" == *"curl -sf"* ]]; then',
+        '  if [ "$(cat "$state_file")" = recovered ]; then echo RUNNING; else echo STOPPED; fi',
+        "  exit 0",
+        "fi",
+        "exit 1",
+      ].join("\n"),
+      { mode: 0o755 },
+    );
+
+    const r = runWithEnv("alpha connect --probe-only", {
+      HOME: home,
+      PATH: `${localBin}:${process.env.PATH || ""}`,
+    });
+
+    expect(r.code).toBe(0);
+    expect(r.out).toContain("Probe complete: recovered OpenClaw gateway");
+    const openshellLog = fs.readFileSync(openshellCalls, "utf8");
+    const sshLog = fs.readFileSync(sshCalls, "utf8");
+    expect(openshellLog).toContain("sandbox exec --name alpha -- sh -c");
+    expect(openshellLog).toContain("sandbox ssh-config alpha");
+    expect(openshellLog).not.toContain("sandbox connect");
+    expect(sshLog).toContain('OPENCLAW="$(command -v openclaw)"');
+    expect(sshLog).not.toMatch(/(^|\s)-tt?(\s|$)/);
+  });
+
+  it("recovers non-OpenClaw agents over SSH instead of root sandbox exec", () => {
+    const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-probe-agent-"));
+    const localBin = path.join(home, "bin");
+    const openshellCalls = path.join(home, "openshell-calls");
+    const sshCalls = path.join(home, "ssh-calls");
+    const stateFile = path.join(home, "probe-state");
+    fs.mkdirSync(localBin, { recursive: true });
+    writeSandboxRegistry(home, { agent: "hermes" });
+    fs.writeFileSync(stateFile, "stopped");
+    fs.writeFileSync(
+      path.join(localBin, "openshell"),
+      [
+        "#!/usr/bin/env bash",
+        `calls=${JSON.stringify(openshellCalls)}`,
+        `state_file=${JSON.stringify(stateFile)}`,
+        'printf \'%s\\n\' "$*" >> "$calls"',
+        'if [ "$1" = "sandbox" ] && [ "$2" = "get" ] && [ "$3" = "alpha" ]; then',
+        "  echo 'Sandbox:'",
+        "  echo",
+        "  echo '  Id: abc'",
+        "  echo '  Name: alpha'",
+        "  echo '  Namespace: openshell'",
+        "  echo '  Phase: Ready'",
+        "  exit 0",
+        "fi",
+        'if [ "$1" = "sandbox" ] && [ "$2" = "exec" ] && [ "$3" = "--name" ] && [ "$4" = "alpha" ]; then',
+        '  cmd="$8"',
+        '  if [[ "$cmd" == *"curl -sf"* ]]; then',
+        "    echo '__NEMOCLAW_SANDBOX_EXEC_STARTED__'",
+        '    if [ "$(cat "$state_file")" = recovered ]; then echo RUNNING; else echo STOPPED; fi',
+        "    exit 0",
+        "  fi",
+        '  if [[ "$cmd" == *"HERMES_HOME=/sandbox/.hermes"* || "$cmd" == *"AGENT_BIN="* ]]; then',
+        "    echo '__NEMOCLAW_SANDBOX_EXEC_STARTED__'",
+        "    echo UNEXPECTED_ROOT_EXEC_RECOVERY",
+        "    exit 1",
+        "  fi",
+        "fi",
+        'if [ "$1" = "sandbox" ] && [ "$2" = "ssh-config" ] && [ "$3" = "alpha" ]; then',
+        "  echo 'Host openshell-alpha'",
+        "  echo '  HostName 127.0.0.1'",
+        "  echo '  User sandbox'",
+        "  exit 0",
+        "fi",
+        'if [ "$1" = "forward" ]; then',
+        "  exit 0",
+        "fi",
+        "exit 0",
+      ].join("\n"),
+      { mode: 0o755 },
+    );
+    fs.writeFileSync(
+      path.join(localBin, "ssh"),
+      [
+        "#!/usr/bin/env bash",
+        `calls=${JSON.stringify(sshCalls)}`,
+        `state_file=${JSON.stringify(stateFile)}`,
+        'cmd="${@: -1}"',
+        'printf \'ARGS %s\\n\' "$*" >> "$calls"',
+        'printf \'CMD %s\\n\' "$cmd" >> "$calls"',
+        'if [[ "$cmd" == *"AGENT_BIN=\'/usr/local/bin/hermes\'"* ]]; then',
+        '  echo recovered > "$state_file"',
+        "  echo 'GATEWAY_PID=789'",
+        "  exit 0",
+        "fi",
+        "exit 1",
+      ].join("\n"),
+      { mode: 0o755 },
+    );
+
+    const r = runWithEnv("alpha connect --probe-only", {
+      HOME: home,
+      PATH: `${localBin}:${process.env.PATH || ""}`,
+    });
+
+    expect(r.code).toBe(0);
+    expect(r.out).toContain("Probe complete: recovered Hermes Agent gateway");
+    const openshellLog = fs.readFileSync(openshellCalls, "utf8");
+    const sshLog = fs.readFileSync(sshCalls, "utf8");
+    expect(openshellLog).toContain("sandbox exec --name alpha -- sh -c");
+    expect(openshellLog).toContain("sandbox ssh-config alpha");
+    expect(openshellLog).not.toContain("HERMES_HOME=/sandbox/.hermes");
+    expect(openshellLog).not.toContain("AGENT_BIN=");
+    expect(openshellLog).not.toContain("sandbox connect");
+    expect(sshLog).toContain("HERMES_HOME=/sandbox/.hermes");
+    expect(sshLog).toContain("AGENT_BIN='/usr/local/bin/hermes'");
+    expect(sshLog).not.toMatch(/(^|\s)-tt?(\s|$)/);
+  });
+
   it("waits for sandbox readiness before connecting", () => {
     const home = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-cli-connect-wait-"));
     const localBin = path.join(home, "bin");
diff --git a/test/e2e/test-issue-2478-crash-loop-recovery.sh b/test/e2e/test-issue-2478-crash-loop-recovery.sh
index c0878a03ba..966df30bf7 100755
--- a/test/e2e/test-issue-2478-crash-loop-recovery.sh
+++ b/test/e2e/test-issue-2478-crash-loop-recovery.sh
@@ -30,11 +30,12 @@
 #   2. Verifies the *initial* gateway has the safety-net + ciao guard active
 #      (via /proc/<pid>/environ on the gateway PID).
 #   3. Crash-recovery loop (NORMAL): kill the gateway 5x, each time triggers
-#      `nemoclaw <name> connect` (which calls recoverSandboxProcesses), and
-#      checks the respawned gateway still has guards in NODE_OPTIONS.
+#      `nemoclaw <name> connect --probe-only` (which calls
+#      recoverSandboxProcesses), and checks the respawned gateway still has
+#      guards in NODE_OPTIONS.
 #   4. Negative case: removes /tmp/nemoclaw-proxy-env.sh, kills the gateway,
 #      triggers recovery — expects the new "[gateway-recovery] WARNING"
-#      line in stderr (visible via gateway.log) instead of silent guard loss.
+#      line in gateway.log instead of silent guard loss.
 #   5. Soak: leaves the sandbox idle for $NEMOCLAW_E2E_SOAK_SECONDS
 #      (default 300) so the health-monitor restart cadence (~4 min in prod)
 #      gets at least one chance to fire, then asserts the gateway has not
@@ -245,6 +246,20 @@ gateway_diagnostics() {
   echo "  ---------------------------"
 }
 
+run_probe_only_or_fail() {
+  local context="$1"
+  local probe_out
+  probe_out="$(mktemp)"
+  if ! timeout 60 nemoclaw "$SANDBOX_NAME" connect --probe-only >"$probe_out" 2>&1; then
+    fail "${context}: connect --probe-only exited nonzero"
+    sed 's/^/    /' "$probe_out"
+    rm -f "$probe_out"
+    gateway_diagnostics ""
+    exit 1
+  fi
+  rm -f "$probe_out"
+}
+
 # Wait until gateway PID is non-empty (or timeout). Echoes pid, returns 0/1.
 wait_for_gateway_up() {
   local timeout="${1:-30}"
@@ -371,11 +386,18 @@ for cycle in $(seq 1 "$CRASH_CYCLES"); do
   info "Cycle $cycle/$CRASH_CYCLES — killing gateway pid=$prev_pid"
   sandbox_exec sh -c "kill -9 $prev_pid 2>/dev/null; sleep 1; pgrep -fo '[o]penclaw[ -]gateway' || echo DEAD" >/dev/null
 
-  # Trigger recovery via the actual code path: `nemoclaw <name> status`
-  # calls checkAndRecoverSandboxProcesses() → recoverSandboxProcesses(),
-  # which is the hardened path #2478 changes. Bound it with `timeout`
-  # so a hang in CLI internals cannot eat the whole 30-min job budget.
-  timeout 60 nemoclaw "$SANDBOX_NAME" status >/dev/null 2>&1 || true
+  # Trigger recovery via the actual operator probe path:
+  # `nemoclaw <name> connect --probe-only` calls
+  # checkAndRecoverSandboxProcesses() -> recoverSandboxProcesses() without
+  # opening an interactive SSH session. Bound it with `timeout` so a hang in
+  # CLI internals cannot eat the whole 30-min job budget.
+  run_probe_only_or_fail "Cycle $cycle after gateway kill"
+
+  if ! sandbox_exec sh -c 'test -s /tmp/gateway.log'; then
+    fail "Cycle $cycle: connect --probe-only did not leave /tmp/gateway.log evidence"
+    gateway_diagnostics ""
+    exit 1
+  fi
 
   new_pid="$(wait_for_gateway_up 45)"
   if [ -z "$new_pid" ]; then
@@ -435,9 +457,10 @@ info "Snapshotted proxy-env.sh ($SNAPSHOT_SIZE bytes, ${#SNAPSHOT_B64}-char base
 # recovery script (which is the only path that emits the warning).
 sandbox_exec sh -c 'rm -f /tmp/nemoclaw-proxy-env.sh' >/dev/null
 sandbox_exec sh -c "pkill -9 -f '[o]penclaw' 2>/dev/null; sleep 2; pgrep -af '[o]penclaw' || echo ALL_DEAD" >/dev/null
-timeout 60 nemoclaw "$SANDBOX_NAME" status >/dev/null 2>&1 || true
+run_probe_only_or_fail "Negative case after proxy-env removal"
 
-# The new gateway.log should contain the [gateway-recovery] WARNING line.
+# The new gateway.log should contain the [gateway-recovery] WARNING line and
+# recovery should have attempted a real gateway respawn.
 warn_seen=false
 for _ in 1 2 3 4 5; do
   if gateway_log_tail 100 | grep -q '\[gateway-recovery\] WARNING'; then
@@ -452,6 +475,13 @@ else
   fail "Recovery silently launched without warning (regression of #2478 fix)"
   gateway_log_tail 100
 fi
+NEGATIVE_PID="$(wait_for_gateway_up 45)"
+if [ -z "$NEGATIVE_PID" ]; then
+  fail "Recovery warning was logged, but gateway did not respawn within 45s"
+  gateway_diagnostics ""
+  exit 1
+fi
+info "Negative-case recovery respawned gateway pid=$NEGATIVE_PID"
 
 # Restore proxy-env.sh by base64-injecting the snapshot via argv. `openshell
 # sandbox exec` does not pipe stdin from the caller through to the subshell,
@@ -467,10 +497,10 @@ if [ "$restored_size" != "$SNAPSHOT_SIZE" ]; then
 fi
 info "proxy-env.sh restored (${restored_size} bytes verified)"
 
-# Stop the no-guard gateway started by the negative case, then trigger
-# recovery to bring the gateway back with guards intact.
+# Kill the guardless negative-case gateway, then trigger recovery to bring the
+# gateway back with guards intact from the restored env file.
 sandbox_exec sh -c "pkill -9 -f '[o]penclaw' 2>/dev/null; sleep 2; pgrep -af '[o]penclaw' || echo ALL_DEAD" >/dev/null
-timeout 60 nemoclaw "$SANDBOX_NAME" status >/dev/null 2>&1 || true
+run_probe_only_or_fail "Guard restore recovery"
 SOAK_START_PID="$(wait_for_gateway_up 30)"
 if [ -z "$SOAK_START_PID" ]; then
   fail "Gateway not up entering soak phase"
diff --git a/test/e2e/test-shields-config.sh b/test/e2e/test-shields-config.sh
index 061741b430..499584c6c7 100755
--- a/test/e2e/test-shields-config.sh
+++ b/test/e2e/test-shields-config.sh
@@ -156,16 +156,22 @@ pass "NemoClaw installed (sandbox: $SANDBOX_NAME)"
 # ══════════════════════════════════════════════════════════════════
 section "Phase 2: Config is writable (mutable default)"
 
-# Verify file permissions — should start as 600 sandbox:sandbox
+# Verify file permissions — fresh OpenClaw sandboxes start sandbox-owned.
+# Depending on whether the image entrypoint ran as root, the mutable-default
+# tree may already include the #2681 group-write/setgid contract or may only
+# gain it after the first host-side shields-down.
 PERMS=$(openshell sandbox exec --name "${SANDBOX_NAME}" -- \
   stat -c '%a %U:%G' "${CONFIG_PATH}" 2>/dev/null || true)
 info "Config perms (default): ${PERMS}"
 
-if [ "$(echo "$PERMS" | awk '{print $1}')" = "600" ]; then
-  pass "Config file mode is 600 (mutable default)"
-else
-  fail "Config file should start as mode 600: ${PERMS}"
-fi
+case "$(echo "$PERMS" | awk '{print $1}')" in
+  600 | 660)
+    pass "Config file mode is mutable default (${PERMS})"
+    ;;
+  *)
+    fail "Config file should start as mutable mode 600 or 660: ${PERMS}"
+    ;;
+esac
 
 if [ "$(echo "$PERMS" | awk '{print $2}')" = "sandbox:sandbox" ]; then
   pass "Config file owned by sandbox:sandbox (mutable default)"
@@ -177,10 +183,19 @@ DIR_PERMS=$(openshell sandbox exec --name "${SANDBOX_NAME}" -- \
   stat -c '%a %U:%G' "$(dirname "${CONFIG_PATH}")" 2>/dev/null || true)
 info "Config dir perms (default): ${DIR_PERMS}"
 
-if [ "$(echo "$DIR_PERMS" | awk '{print $1}')" = "700" ]; then
-  pass "Config directory mode is 700 (mutable default)"
+case "$(echo "$DIR_PERMS" | awk '{print $1}')" in
+  700 | 2700 | 2770)
+    pass "Config directory mode is mutable default (${DIR_PERMS})"
+    ;;
+  *)
+    fail "Config directory should be mutable mode 700, 2700, or 2770: ${DIR_PERMS}"
+    ;;
+esac
+
+if [ "$(echo "$DIR_PERMS" | awk '{print $2}')" = "sandbox:sandbox" ]; then
+  pass "Config directory owned by sandbox:sandbox (mutable default)"
 else
-  fail "Config directory should be mode 700: ${DIR_PERMS}"
+  fail "Config directory should be owned by sandbox:sandbox: ${DIR_PERMS}"
 fi
 
 STATUS_DEFAULT=$(nemoclaw "${SANDBOX_NAME}" shields status 2>&1)
@@ -321,15 +336,16 @@ else
   fail "shields down did not report success: ${SHIELDS_DOWN_OUTPUT}"
 fi
 
-# Check permissions changed — should be sandbox:sandbox 600/700 (doctor-aligned)
+# Check permissions changed — OpenClaw uses group-writable + setgid mutable mode
+# so the gateway UID and sandbox UID can both update config/state (#2681).
 PERMS_DOWN=$(openshell sandbox exec --name "${SANDBOX_NAME}" -- \
   stat -c '%a %U:%G' "${CONFIG_PATH}" 2>/dev/null || true)
 info "Config perms (shields DOWN): ${PERMS_DOWN}"
 
-if [ "$(echo "$PERMS_DOWN" | awk '{print $1}')" = "600" ]; then
-  pass "Config file mode is 600 (restored to mutable default)"
+if [ "$(echo "$PERMS_DOWN" | awk '{print $1}')" = "660" ]; then
+  pass "Config file mode is 660 (restored to mutable group-writable default)"
 else
-  fail "Config file should be mode 600 after shields down: ${PERMS_DOWN}"
+  fail "Config file should be mode 660 after shields down: ${PERMS_DOWN}"
 fi
 
 if [ "$(echo "$PERMS_DOWN" | awk '{print $2}')" = "sandbox:sandbox" ]; then
@@ -342,10 +358,10 @@ DIR_PERMS_DOWN=$(openshell sandbox exec --name "${SANDBOX_NAME}" -- \
   stat -c '%a %U:%G' "$(dirname "${CONFIG_PATH}")" 2>/dev/null || true)
 info "Config dir perms (shields DOWN): ${DIR_PERMS_DOWN}"
 
-if [ "$(echo "$DIR_PERMS_DOWN" | awk '{print $1}')" = "700" ]; then
-  pass "Config directory mode is 700 (restored to mutable default)"
+if [ "$(echo "$DIR_PERMS_DOWN" | awk '{print $1}')" = "2770" ]; then
+  pass "Config directory mode is 2770 (restored to mutable setgid default)"
 else
-  fail "Config directory should be mode 700 after shields down: ${DIR_PERMS_DOWN}"
+  fail "Config directory should be mode 2770 after shields down: ${DIR_PERMS_DOWN}"
 fi
 
 if [ "$(echo "$DIR_PERMS_DOWN" | awk '{print $2}')" = "sandbox:sandbox" ]; then
diff --git a/test/fetch-guard-patch-regression.test.ts b/test/fetch-guard-patch-regression.test.ts
index 8c42701f21..a12a859494 100644
--- a/test/fetch-guard-patch-regression.test.ts
+++ b/test/fetch-guard-patch-regression.test.ts
@@ -26,8 +26,10 @@ function dockerRunCommandBetween(startMarker: string, endMarker: string): string
     .replace(/^RUN\s+/, "")
     .split("\n")
     .filter((line) => !line.trimStart().startsWith("#"))
-    .join("\n");
-  return command.replace(/\\\s*$/, "");
+    .join("\n")
+    .replace(/\\\n/g, " ")
+    .replace(/\\\s*$/, "");
+  return command;
 }
 
 function runOpenClawUpgradeBlock(currentVersion: string) {
diff --git a/test/nemoclaw-start.test.ts b/test/nemoclaw-start.test.ts
index 9752c6314e..1f8c4001bb 100644
--- a/test/nemoclaw-start.test.ts
+++ b/test/nemoclaw-start.test.ts
@@ -1451,7 +1451,6 @@ describe("Telegram diagnostics (#2766)", () => {
         'harden_auth_profiles() { :; }',
         'chown() { :; }',
         'chown_tree_no_symlink_follow() { :; }',
-        'normalize_mutable_config_perms() { :; }',
         'start_persistent_gateway_log_mirror() { :; }',
         'gosu() { shift; "$@"; }',
         'validate_tmp_permissions() { printf "VALIDATE:%s\\n" "$*"; }',
diff --git a/test/repro-2681-group-writable.test.ts b/test/repro-2681-group-writable.test.ts
index a389584373..5d2f6c102c 100644
--- a/test/repro-2681-group-writable.test.ts
+++ b/test/repro-2681-group-writable.test.ts
@@ -16,7 +16,7 @@ import fs from "node:fs";
 import os from "node:os";
 import path from "node:path";
 import { spawnSync } from "node:child_process";
-import { describe, expect, it } from "vitest";
+import { describe, expect, it, vi } from "vitest";
 
 const START_SCRIPT = path.join(import.meta.dirname, "..", "scripts", "nemoclaw-start.sh");
 
@@ -40,6 +40,38 @@ function modeBits(filePath: string): number {
   return fs.statSync(filePath).mode;
 }
 
+function withMockedDockerExecFileSync<T>(calls: string[][], run: () => T): T {
+  // eslint-disable-next-line @typescript-eslint/no-require-imports
+  const dockerExecModule = require("../dist/lib/docker/exec.js") as {
+    dockerExecFileSync: (args: readonly string[]) => string;
+  };
+  const originalDockerExecFileSync = dockerExecModule.dockerExecFileSync;
+  const shieldsModulePath = require.resolve("../dist/lib/shields.js");
+  delete require.cache[shieldsModulePath];
+
+  dockerExecModule.dockerExecFileSync = vi.fn((args: readonly string[]) => {
+    const separator = args.indexOf("--");
+    const command = separator >= 0 ? args.slice(separator + 1) : [...args];
+    calls.push(command);
+    if (command[0] === "stat" && command[1] === "-c") {
+      return command.at(-1) === "/sandbox/.openclaw"
+        ? "2770 sandbox:sandbox\n"
+        : "660 sandbox:sandbox\n";
+    }
+    if (command[0] === "lsattr") {
+      return `---------------------- ${command.at(-1)}\n`;
+    }
+    return "";
+  });
+
+  try {
+    return run();
+  } finally {
+    dockerExecModule.dockerExecFileSync = originalDockerExecFileSync;
+    delete require.cache[shieldsModulePath];
+  }
+}
+
 function mkdtempOnPosixFs(prefix: string): string {
   const roots = process.platform === "linux" ? ["/tmp", os.tmpdir()] : [os.tmpdir()];
   let lastError: unknown = null;
@@ -93,50 +125,29 @@ describe("Issue #2681 — mutable OpenClaw config permissions", () => {
   });
 
   it("shields-down restores OpenClaw group-writable file modes and setgid dirs", () => {
-    const probe = spawnSync(
-      process.execPath,
-      [
-        "-e",
-        String.raw`
-const Module = require("node:module");
-const originalLoad = Module._load;
-const calls = [];
-Module._load = function patchedLoad(request, parent, isMain) {
-  if (request === "./docker/exec") {
-    return {
-      dockerExecFileSync(args) {
-        const separator = args.indexOf("--");
-        const command = separator >= 0 ? args.slice(separator + 1) : args;
-        calls.push(command);
-        if (command[0] === "stat" && command[1] === "-c") {
-          return command.at(-1) === "/sandbox/.openclaw"
-            ? "2770 sandbox:sandbox\n"
-            : "660 sandbox:sandbox\n";
-        }
-        if (command[0] === "lsattr") {
-          return "---------------------- " + command.at(-1) + "\n";
-        }
-        return "";
-      },
-    };
-  }
-  return originalLoad.call(this, request, parent, isMain);
-};
-const { unlockAgentConfig } = require("./dist/lib/shields.js");
-unlockAgentConfig("sandbox-pod", {
-  agentName: "openclaw",
-  configPath: "/sandbox/.openclaw/openclaw.json",
-  configDir: "/sandbox/.openclaw",
-  sensitiveFiles: ["/sandbox/.openclaw/.config-hash"],
-});
-process.stdout.write(JSON.stringify(calls));
-`,
-      ],
-      { encoding: "utf-8", timeout: 5000 },
-    );
+    const commands: string[][] = [];
+    withMockedDockerExecFileSync(commands, () => {
+      // eslint-disable-next-line @typescript-eslint/no-require-imports
+      const { unlockAgentConfig } = require("../dist/lib/shields.js") as {
+        unlockAgentConfig: (
+          sandboxName: string,
+          target: {
+            agentName?: string;
+            configPath: string;
+            configDir: string;
+            sensitiveFiles?: string[];
+          },
+        ) => void;
+      };
+
+      unlockAgentConfig("sandbox-pod", {
+        agentName: "openclaw",
+        configPath: "/sandbox/.openclaw/openclaw.json",
+        configDir: "/sandbox/.openclaw",
+        sensitiveFiles: ["/sandbox/.openclaw/.config-hash"],
+      });
+    });
 
-    expect(probe.status).toBe(0);
-    const commands = JSON.parse(probe.stdout) as string[][];
     expect(commands).toContainEqual(["chmod", "660", "/sandbox/.openclaw/openclaw.json"]);
     expect(commands).toContainEqual(["chmod", "660", "/sandbox/.openclaw/.config-hash"]);
     expect(commands).toContainEqual(["chmod", "2770", "/sandbox/.openclaw"]);
diff --git a/test/shields.test.ts b/test/shields.test.ts
index 6d3d31102d..49f5d73a64 100644
--- a/test/shields.test.ts
+++ b/test/shields.test.ts
@@ -418,6 +418,10 @@ describe("NC-2227-05: shields.ts locks state directories", () => {
     expect(src).toContain("function applyStateDirLockMode");
     expect(src).toContain("workspace-*");
     expect(fnBody).toContain("applyStateDirLockMode");
+    expect(fnBody).toContain('["chmod", "g-s", target.configDir]');
+    expect(src).toContain('["chmod", "g-s", dirPath]');
+    expect(src).toContain("Best effort; do not skip recursive write stripping.");
+    expect(src).toContain('if [ "$dir_mode" = "755" ]; then chmod g-s "$dir"');
     expect(fnBody).toContain("chown");
     expect(fnBody).toContain("root:root");
   });