Skip to content

Commit 4828ef8

Browse files
ericksoacv
andauthored
fix(recovery): add connect probe recovery path (#2646)
## Summary - adds `nemoclaw <sandbox> connect --probe-only` as a non-interactive gateway recovery probe, plus `connect --help` handling that does not require registry or OpenShell recovery - runs gateway health and recovery through `openshell sandbox exec` first, with SSH fallback, so recovery uses the same sandbox exec path as the adversarial tamper probe - centralizes OpenClaw recovery script generation and prepares `/tmp/gateway.log` as the real gateway-owned log before writing `[gateway-recovery] WARNING` and relaunching via `gosu gateway` when available - updates the #2478 e2e to use `connect --probe-only`, assert the warning, and verify a negative-case respawn ## Verification - `npm run build:cli` - `npx vitest run src/lib/agent-runtime.test.ts test/nemoclaw-cli-recovery.test.ts test/cli.test.ts -t "connect --probe-only|shows connect help|buildRecoveryScript|nemoclaw CLI runtime recovery"` - `npm run typecheck:cli` - `tmp_home=$(mktemp -d); HOME="$tmp_home" node bin/nemoclaw.js alpha connect --help` - `git diff --check` - `npx prettier --check src/nemoclaw.ts src/lib/agent-runtime.ts src/lib/agent-runtime.test.ts src/lib/command-registry.ts test/cli.test.ts` - `npx eslint src/nemoclaw.ts src/lib/agent-runtime.ts src/lib/agent-runtime.test.ts src/lib/command-registry.ts test/cli.test.ts` (exit 0; TS files ignored by local ESLint config warnings) ## Local full-hook notes - `npm run format:check -- ...` still runs the repo-wide configured globs and fails on pre-existing formatting issues in unrelated test files. Direct Prettier check on touched files passes. - The full pre-commit/pre-push CLI test hook times out locally in unrelated tests: `test/onboard.test.ts` and `test/sandbox-build-context.test.ts`; after building `nemoclaw/dist`, the focused regression tests above pass. - Initial SSH push was blocked by NVIDIA SAML for the local SSH key; branch was pushed through the authenticated HTTPS GitHub CLI path. <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **New Features** * Added --probe-only connect mode, implicit connect handling, and OpenClaw-aware recovery with safer gateway start/log selection. * **Bug Fixes** * Hardened gateway log preparation (no symlink-following, append redirection, fd-safe permission/ownership) and deterministic exec-based sandbox probing with SSH fallback. * Simplified/stricter agent-binary verification and improved cross-platform config-owner detection. * **Chores** * Expanded tests and e2e scripts; updated permission-mode expectations and added setgid handling for locking. <!-- end of auto-generated comment: release notes by coderabbit.ai --> --------- Signed-off-by: Aaron Erickson <aerickson@nvidia.com> Co-authored-by: Carlos Villela <cvillela@nvidia.com>
1 parent e59231a commit 4828ef8

19 files changed

Lines changed: 1154 additions & 202 deletions

scripts/nemoclaw-start.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@ normalize_mutable_config_perms() {
230230
# Detect shields-up. Config dir owned by root means shields are
231231
# currently locked; normalizing would weaken the contract.
232232
local config_dir_owner
233-
config_dir_owner="$(stat -c '%U' "$config_dir" 2>/dev/null || echo unknown)"
233+
config_dir_owner="$(stat -c '%U' "$config_dir" 2>/dev/null || stat -f '%Su' "$config_dir" 2>/dev/null || echo unknown)"
234234
if [ "$config_dir_owner" = "root" ]; then
235235
return 0
236236
fi

src/lib/agent-onboard.test.ts

Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import { describe, it, expect, beforeEach, afterEach, afterAll, vi } from "vites
55
import fs from "node:fs";
66
import path from "node:path";
77
// Import from compiled dist/ so coverage is attributed correctly.
8-
import { printDashboardUi } from "../../dist/lib/agent-onboard";
8+
import { printDashboardUi, verifyAgentBinaryAvailable } from "../../dist/lib/agent-onboard";
99
import type { AgentDefinition } from "./agent-defs";
1010

1111
function makeAgent(overrides: Partial<AgentDefinition> = {}): AgentDefinition {
@@ -128,10 +128,10 @@ describe("handleAgentSetup guards", () => {
128128
const source = fs.readFileSync(path.join(import.meta.dirname, "agent-onboard.ts"), "utf-8");
129129

130130
expect(source).toContain("verifyAgentBinaryAvailable");
131-
expect(source).toContain(
132-
'resolved="$(command -v ${shellQuote(executable)} 2>/dev/null || true)"',
133-
);
134-
expect(source).toContain('[ "$resolved" = ${shellQuote(binaryPath)} ]');
131+
expect(source).toContain("[ -e ${shellQuote(binaryPath)} ]");
132+
expect(source).toContain("[ -x ${shellQuote(binaryPath)} ]");
133+
expect(source).not.toContain('[ -n "$resolved" ] || { echo not_found');
134+
expect(source).not.toContain("path_mismatch");
135135
expect(source).toMatch(
136136
/"sandbox",\s*"exec",\s*"-n",\s*sandboxName,\s*"--",\s*"sh",\s*"-lc",\s*script/,
137137
);
@@ -150,4 +150,36 @@ describe("handleAgentSetup guards", () => {
150150
expect(source).toContain('parsed.status === "ok"');
151151
expect(source).not.toContain('.includes("ok")');
152152
});
153+
154+
it("accepts an executable configured binary path when PATH lookup is empty", () => {
155+
let script = "";
156+
const result = verifyAgentBinaryAvailable(
157+
"alpha",
158+
makeAgent({ name: "hermes", binary_path: "/usr/local/bin/hermes" }),
159+
(args) => {
160+
script = String(args[7] || "");
161+
return "ok";
162+
},
163+
);
164+
165+
expect(result).toEqual({ available: true });
166+
expect(script).toContain("[ -e '/usr/local/bin/hermes' ]");
167+
expect(script).toContain("[ -x '/usr/local/bin/hermes' ]");
168+
expect(script).not.toContain("command -v 'hermes'");
169+
});
170+
171+
it("does not reject a configured binary when PATH resolves the symlink target", () => {
172+
let script = "";
173+
const result = verifyAgentBinaryAvailable(
174+
"alpha",
175+
makeAgent({ name: "hermes", binary_path: "/usr/local/bin/hermes" }),
176+
(args) => {
177+
script = String(args[7] || "");
178+
return "ok";
179+
},
180+
);
181+
182+
expect(result).toEqual({ available: true });
183+
expect(script).not.toContain("path_mismatch");
184+
});
153185
});

src/lib/agent-onboard.ts

Lines changed: 4 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -124,12 +124,12 @@ type AgentBinaryAvailability =
124124
| { available: true }
125125
| {
126126
available: false;
127-
reason: "not_found" | "not_executable" | "path_mismatch";
127+
reason: "not_found" | "not_executable";
128128
binaryPath?: string;
129-
resolvedPath?: string;
130129
};
131130

132-
function verifyAgentBinaryAvailable(
131+
// Exported for unit coverage of the sandbox-side guard without running onboarding.
132+
export function verifyAgentBinaryAvailable(
133133
sandboxName: string,
134134
agent: AgentDefinition,
135135
runCaptureOpenshell: OnboardContext["runCaptureOpenshell"],
@@ -138,10 +138,8 @@ function verifyAgentBinaryAvailable(
138138
const binaryPath = typeof agent.binary_path === "string" ? agent.binary_path.trim() : "";
139139
const script = binaryPath
140140
? [
141-
`resolved="$(command -v ${shellQuote(executable)} 2>/dev/null || true)"`,
142-
`[ -n "$resolved" ] || { echo not_found; exit 1; }`,
141+
`[ -e ${shellQuote(binaryPath)} ] || { echo not_found; exit 1; }`,
143142
`[ -x ${shellQuote(binaryPath)} ] || { echo not_executable; exit 1; }`,
144-
`[ "$resolved" = ${shellQuote(binaryPath)} ] || { printf 'path_mismatch:%s\\n' "$resolved"; exit 1; }`,
145143
"echo ok",
146144
].join(" && ")
147145
: `command -v ${shellQuote(executable)} >/dev/null 2>&1 && echo ok || echo not_found`;
@@ -156,15 +154,6 @@ function verifyAgentBinaryAvailable(
156154
return { available: true };
157155
}
158156
if (binaryPath && result) {
159-
const mismatch = result.match(/path_mismatch:([^\n]+)/);
160-
if (mismatch) {
161-
return {
162-
available: false,
163-
reason: "path_mismatch",
164-
binaryPath,
165-
resolvedPath: mismatch[1].trim(),
166-
};
167-
}
168157
if (result.includes("not_executable")) {
169158
return { available: false, reason: "not_executable", binaryPath };
170159
}
@@ -178,9 +167,6 @@ function describeAgentBinaryFailure(
178167
result: Exclude<AgentBinaryAvailability, { available: true }>,
179168
): string {
180169
const executable = agentExecutableName(agent);
181-
if (result.reason === "path_mismatch") {
182-
return `${agent.displayName} binary '${executable}' resolves to '${result.resolvedPath}', expected '${result.binaryPath}' inside sandbox '${sandboxName}'`;
183-
}
184170
if (result.reason === "not_executable") {
185171
return `${agent.displayName} configured binary '${result.binaryPath}' is not executable inside sandbox '${sandboxName}'`;
186172
}

src/lib/agent-runtime.test.ts

Lines changed: 86 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
import { describe, it, expect } from "vitest";
55
// Import from compiled dist/ so coverage is attributed correctly.
6-
import { buildRecoveryScript } from "../../dist/lib/agent-runtime";
6+
import { buildOpenClawRecoveryScript, buildRecoveryScript } from "../../dist/lib/agent-runtime";
77
import type { AgentDefinition } from "./agent-defs";
88

99
function makeAgent(overrides: Partial<AgentDefinition> = {}): AgentDefinition {
@@ -60,13 +60,13 @@ describe("buildRecoveryScript", () => {
6060
it("launches the default gateway command through the validated agent binary", () => {
6161
const script = buildRecoveryScript(minimalAgent, 19000);
6262
expect(script).toContain("command -v 'test-agent'");
63-
expect(script).toContain('nohup "$AGENT_BIN" gateway run --port 19000');
63+
expect(script).toContain('"$AGENT_BIN" gateway run --port 19000');
6464
});
6565

6666
it("falls back to openclaw gateway run when gateway_command is absent", () => {
6767
const agent = makeAgent({ gateway_command: undefined });
6868
const script = buildRecoveryScript(agent, 19000);
69-
expect(script).toContain('nohup "$AGENT_BIN" gateway run --port 19000');
69+
expect(script).toContain('"$AGENT_BIN" gateway run --port 19000');
7070
});
7171

7272
it("validates and launches custom gateway commands explicitly", () => {
@@ -123,26 +123,101 @@ describe("buildRecoveryScript", () => {
123123

124124
it("writes the warning to gateway.log so it persists for sysadmin tail", () => {
125125
const script = buildRecoveryScript(minimalAgent, 19000);
126-
// Both warnings must end up in /tmp/gateway.log, not just stderr —
126+
// Both warnings must end up in the selected gateway log, not just stderr —
127127
// executeSandboxCommand silently discards stderr from the recovery
128128
// script, so a warning that only goes to stderr is invisible to
129129
// anyone debugging a crash-loop. (#2478)
130-
expect(script).toContain('echo "$_W" >> /tmp/gateway.log');
130+
expect(script).toContain('echo "$_W" >> "$_GATEWAY_LOG"');
131131
// And the warning must be deferred until AFTER gateway.log is
132-
// freshly touched/chmod'd, otherwise the redirect targets a stale
133-
// file that gets removed seconds later.
134-
const touchIdx = script!.indexOf("touch /tmp/gateway.log");
135-
const warnIdx = script!.indexOf('echo "$_W" >> /tmp/gateway.log');
136-
expect(touchIdx).toBeLessThan(warnIdx);
132+
// safely opened with O_NOFOLLOW, otherwise the redirect targets a
133+
// stale or attacker-controlled file.
134+
const gatewayPrepIdx = script!.indexOf(" /tmp/gateway.log || exit 1;");
135+
const logSelectionIdx = script!.indexOf("_GATEWAY_LOG=/tmp/gateway.log");
136+
const warnIdx = script!.indexOf('echo "$_W" >> "$_GATEWAY_LOG"');
137+
expect(gatewayPrepIdx).toBeGreaterThanOrEqual(0);
138+
expect(logSelectionIdx).toBeGreaterThanOrEqual(0);
139+
expect(warnIdx).toBeGreaterThanOrEqual(0);
140+
expect(gatewayPrepIdx).toBeLessThan(logSelectionIdx);
141+
expect(logSelectionIdx).toBeLessThan(warnIdx);
142+
});
143+
144+
it("stops recovery when hardened log setup fails", () => {
145+
const script = buildOpenClawRecoveryScript(18789);
146+
expect(script).toContain(" /tmp/gateway.log 'gateway' || exit 1;");
147+
expect(script).toContain(" /tmp/auto-pair.log 'sandbox' || exit 1;");
137148
});
138149

139150
it("appends (not truncates) gateway.log on launch so warnings survive", () => {
140151
const script = buildRecoveryScript(minimalAgent, 19000);
141152
// Truncating with `>` wipes the [gateway-recovery] WARNING that the
142153
// recovery script wrote moments earlier — meaning a sysadmin tailing
143154
// gateway.log would see the eventual crash without the explanation.
144-
expect(script).toContain(">> /tmp/gateway.log 2>&1 &");
155+
expect(script).toContain('>> "$_GATEWAY_LOG" 2>&1 &');
145156
expect(script).not.toMatch(/[^>]> \/tmp\/gateway\.log 2>&1 &/);
146157
});
158+
159+
it("preserves an existing gateway.log and has a writable fallback log", () => {
160+
const script = buildOpenClawRecoveryScript(18789);
161+
expect(script).not.toContain("rm -f /tmp/gateway.log");
162+
expect(script).toContain("_GATEWAY_LOG=/tmp/gateway.log");
163+
expect(script).toContain("_GATEWAY_LOG=/tmp/gateway-recovery.log");
164+
expect(script).toContain('echo "$_W" >> "$_GATEWAY_LOG"');
165+
expect(script).toContain('tail -5 "$_GATEWAY_LOG"');
166+
expect(script).not.toContain('echo "$_W" >> /tmp/gateway.log');
167+
expect(script).not.toContain("cat /tmp/gateway.log");
168+
});
169+
170+
it("rejects a symlinked gateway.log before preparing the log", () => {
171+
const script = buildOpenClawRecoveryScript(18789);
172+
const noFollowIdx = script.indexOf("O_NOFOLLOW");
173+
const openIdx = script.indexOf("os.open(path, flags, 0o644)");
174+
const fchownIdx = script.indexOf("os.fchown(fd");
175+
expect(script).toContain("refusing to prepare symlinked /tmp/gateway.log");
176+
expect(script).toContain("sys.exit(1)");
177+
expect(script).not.toContain(": > /tmp/gateway.log");
178+
expect(script).not.toContain("chown 'gateway:gateway' /tmp/gateway.log");
179+
expect(noFollowIdx).toBeGreaterThanOrEqual(0);
180+
expect(openIdx).toBeGreaterThanOrEqual(0);
181+
expect(fchownIdx).toBeGreaterThanOrEqual(0);
182+
expect(noFollowIdx).toBeLessThan(openIdx);
183+
expect(openIdx).toBeLessThan(fchownIdx);
184+
});
185+
186+
it("prepares gateway.log for the real gateway-owned sandbox log", () => {
187+
const script = buildOpenClawRecoveryScript(18789);
188+
expect(script).toContain("os.fchown(fd");
189+
expect(script).toContain("pw.pw_gid");
190+
expect(script).not.toContain("grp.getgrnam");
191+
expect(script).toContain("owner_mode = 0o644");
192+
expect(script).toContain("os.fchmod(fd, owner_mode)");
193+
expect(script).toContain("/tmp/gateway.log 'gateway'");
194+
expect(script).toContain("gosu 'gateway'");
195+
});
196+
197+
it("terminates the conditional launch branch before capturing the gateway pid", () => {
198+
const script = buildOpenClawRecoveryScript(18789);
199+
expect(script).toContain(" fi; GPID=$!");
200+
expect(script).not.toContain(" fi GPID=$!");
201+
});
202+
203+
it("prepares auto-pair.log without unlinking or following symlinks", () => {
204+
const script = buildOpenClawRecoveryScript(18789);
205+
expect(script).toContain("refusing to prepare symlinked /tmp/auto-pair.log");
206+
expect(script).toContain("/tmp/auto-pair.log 'sandbox'");
207+
expect(script).toContain("owner_mode = 0o600");
208+
expect(script).not.toContain("rm -f /tmp/auto-pair.log");
209+
expect(script).not.toContain(": > /tmp/auto-pair.log");
210+
expect(script).not.toContain("touch /tmp/auto-pair.log");
211+
expect(script).not.toContain("chown sandbox:sandbox /tmp/auto-pair.log");
212+
expect(script).not.toContain("chmod 600 /tmp/auto-pair.log");
213+
});
214+
215+
it("does not force non-OpenClaw agents to run as the gateway user", () => {
216+
const script = buildRecoveryScript(minimalAgent, 19000);
217+
expect(script).not.toContain("chown gateway:gateway /tmp/gateway.log");
218+
expect(script).not.toContain("chown 'gateway:gateway' /tmp/gateway.log");
219+
expect(script).not.toContain("gosu gateway");
220+
expect(script).not.toContain("gosu 'gateway'");
221+
});
147222
});
148223
});

0 commit comments

Comments
 (0)