evalops
diff --git a/‎bun.lockb‎
5.2 KB b/‎bun.lockb‎
5.2 KB
diff --git a/‎evals/tools/surface-smoke-cases.json‎
Lines changed: 4 additions & 5 deletions b/‎evals/tools/surface-smoke-cases.json‎
Lines changed: 4 additions & 5 deletions
diff --git a/‎package.json‎
Lines changed: 4 additions & 1 deletion b/‎package.json‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎packages/core/src/sandbox/daytona-sandbox.ts‎
Lines changed: 295 additions & 19 deletions b/‎packages/core/src/sandbox/daytona-sandbox.ts‎
Lines changed: 295 additions & 19 deletions
@@ -110,13 +110,12 @@
 		}
 	},
 	{
-		"name": "extract_document reads a local smoke fixture",
+		"name": "extract_document blocks a local smoke fixture",
 		"kind": "extractDocument",
-		"judgeRubric": "extract_document should preserve the text payload, recognize the text format, and report the downloaded filename.",
+		"judgeRubric": "extract_document should reject private or local document URLs instead of downloading them.",
 		"expected": {
-			"text": "Maestro extract document smoke test",
-			"format": "text",
-			"fileName": "fixture.txt"
+			"blocked": true,
+			"message": "Blocked document URL host: private or local address"
 		}
 	},
 	{
 
@@ -58,11 +58,12 @@
     "check:workflow-footguns": "node scripts/check-workflow-footguns.mjs",
     "check:test-skip-hygiene": "node scripts/check-test-skip-hygiene.mjs",
     "check:test-timing-wait-hygiene": "node scripts/check-test-timing-wait-hygiene.mjs",
+    "check:atomic-write-hygiene": "node scripts/check-atomic-write-hygiene.mjs",
     "check:codex-parity": "node scripts/check-codex-parity-conformance.mjs",
     "check:codex-operating-layer": "node scripts/check-codex-operating-layer-conformance.mjs",
     "check:platform-runtime-conformance": "node scripts/check-platform-runtime-conformance.mjs",
     "check:release-surface": "node scripts/check-release-surface-conformance.mjs",
-    "lint:evals": "bun run lint:headless-proto && node scripts/ensure-deps.js --no-install --workspace @evalops/contracts && node scripts/verify-evals.js && node scripts/verify-tool-versions.js && node scripts/validate-system-paths.js && node scripts/validate-package-boundaries.js && node scripts/validate-public-package-deps.js && node scripts/check-public-surface-boundary.mjs && npm run check:context-manifest && npm run check:session-wire-contract && npm run check:cli-runtime-conformance && npm run check:rpc-protocol-conformance && npm run check:evidence-integrity && npm run check:maestro-release-gate-events && npm run check:session-replay-fixtures && npm run check:agent-trajectory-fixtures && npm run check:agent-trajectory-replay-fixtures && npm run check:agent-trajectory-score-fixtures && npm run check:agent-trajectory-inspection-fixtures && npm run check:agent-trajectory-scenario-fixtures && npm run check:slack-teammate-runtime-scenarios && npm run check:scripted-scenario-fixtures && node scripts/session-wire-format-codegen.mjs --check && node scripts/headless-protocol-codegen.mjs --check && npm run check:app-server-schema && npm run check:drift-surfaces && npm run check:staged-rollout && npm run check:workflow-footguns && npm run check:test-skip-hygiene && npm run check:test-timing-wait-hygiene && npm run check:codex-parity && npm run check:codex-operating-layer && npm run check:platform-runtime-conformance && npm run check:release-surface && npm run developer-surface:check",
+    "lint:evals": "bun run lint:headless-proto && node scripts/ensure-deps.js --no-install --workspace @evalops/contracts && node scripts/verify-evals.js && node scripts/verify-tool-versions.js && node scripts/validate-system-paths.js && node scripts/validate-package-boundaries.js && node scripts/validate-public-package-deps.js && node scripts/check-public-surface-boundary.mjs && npm run check:context-manifest && npm run check:session-wire-contract && npm run check:cli-runtime-conformance && npm run check:rpc-protocol-conformance && npm run check:evidence-integrity && npm run check:maestro-release-gate-events && npm run check:session-replay-fixtures && npm run check:agent-trajectory-fixtures && npm run check:agent-trajectory-replay-fixtures && npm run check:agent-trajectory-score-fixtures && npm run check:agent-trajectory-inspection-fixtures && npm run check:agent-trajectory-scenario-fixtures && npm run check:slack-teammate-runtime-scenarios && npm run check:scripted-scenario-fixtures && node scripts/session-wire-format-codegen.mjs --check && node scripts/headless-protocol-codegen.mjs --check && npm run check:app-server-schema && npm run check:drift-surfaces && npm run check:staged-rollout && npm run check:workflow-footguns && npm run check:test-skip-hygiene && npm run check:test-timing-wait-hygiene && npm run check:atomic-write-hygiene && npm run check:codex-parity && npm run check:codex-operating-layer && npm run check:platform-runtime-conformance && npm run check:release-surface && npm run developer-surface:check",
     "platform:sdk-smoke": "tsx scripts/check-platform-sdk-contract.ts",
     "platform:agentruntime-e2e": "tsx scripts/smoke-platform-agentruntime-lifecycle.ts",
     "platform:timeline-e2e": "tsx scripts/smoke-platform-timeline-e2e.ts",
@@ -182,6 +183,7 @@
     "@bufbuild/protobuf": "^2.11.0",
     "@crosscopy/clipboard": "^0.2.8",
     "@modelcontextprotocol/sdk": "^1.29.0",
+    "@napi-rs/keyring": "^1.3.0",
     "@openai/codex": "^0.135.0",
     "@opentelemetry/api": "^1.9.1",
     "@opentelemetry/auto-instrumentations-node": "^0.76.0",
@@ -221,6 +223,7 @@
     "postgres": "^3.4.8",
     "smol-toml": "^1.6.1",
     "string-width": "^8.2.0",
+    "undici": "^7.25.0",
     "uuid": "^14.0.0",
     "vscode-jsonrpc": "^8.2.1",
     "ws": "^8.20.0",
 
@@ -8,8 +8,14 @@
  * Caches the sandbox handle to avoid redundant API calls per operation.
  */
 
+import { randomUUID } from "node:crypto";
+import { StringDecoder } from "node:string_decoder";
 import { Daytona } from "@daytonaio/sdk";
-import type { ExecResult, Sandbox } from "../../../../src/sandbox/types.js";
+import type {
+	ExecResult,
+	ExecWithArgsOptions,
+	Sandbox,
+} from "../../../../src/sandbox/types.js";
 
 export interface DaytonaSandboxConfig {
 	apiKey: string;
@@ -22,9 +28,244 @@ type SandboxHandle = Awaited<
 	ReturnType<InstanceType<typeof Daytona>["create"]>
 >;
 
+/**
+ * Command descriptor as consumed from `executeSessionCommand` and
+ * `getSessionCommand`. Both fields are optional, so callers must check
+ * them before use.
+ */
+type DaytonaSessionCommand = {
+	// Identifier used for follow-up state and log lookups.
+	cmdId?: string;
+	// A numeric value is treated as "the command has finished".
+	exitCode?: number;
+};
+
+/**
+ * Log payload as consumed from `getSessionCommandLogs`. Every field is
+ * optional; `execWithSession` prefers `stdout`, falls back to `output`,
+ * and treats a missing `stderr` as empty.
+ */
+type DaytonaSessionLogs = {
+	// Fallback used for stdout only when `stdout` is absent.
+	output?: string;
+	stdout?: string;
+	stderr?: string;
+};
+
+/**
+ * The sandbox handle's process API, widened with the session-oriented
+ * methods declared as optional so their presence can be probed at runtime
+ * (see `hasSessionApi`) before the session execution path is taken.
+ */
+type DaytonaProcessApi = SandboxHandle["process"] & {
+	createSession?: (sessionId: string) => Promise<void>;
+	deleteSession?: (sessionId: string) => Promise<void>;
+	// Starts `req.command` inside an existing session; `runAsync` is set by
+	// `execWithSession` so the call returns a command id to poll.
+	executeSessionCommand?: (
+		sessionId: string,
+		req: {
+			command: string;
+			runAsync?: boolean;
+			suppressInputEcho?: boolean;
+		},
+		timeout?: number,
+	) => Promise<DaytonaSessionCommand>;
+	// Current state of a previously started command (polled for `exitCode`).
+	getSessionCommand?: (
+		sessionId: string,
+		commandId: string,
+	) => Promise<DaytonaSessionCommand>;
+	// Captured output of a previously started command.
+	getSessionCommandLogs?: (
+		sessionId: string,
+		commandId: string,
+	) => Promise<DaytonaSessionLogs>;
+};
+
+// Delay, in milliseconds, between successive polls of a session command's state.
+const SESSION_POLL_MS = 100;
+// Wall-clock budget, in milliseconds, for a session command; the polling
+// loop throws once this much time has elapsed since the command started.
+const SESSION_COMMAND_TIMEOUT_MS = 90_000;
+// Default cap, in bytes (40 KiB), applied to captured command output.
+const EXEC_OUTPUT_MAX_BUFFER = 40 * 1024;
+
+/**
+ * Result reported when execution is abandoned because the caller's abort
+ * signal fired: no captured output and a failing exit code of 1.
+ */
+function cancelledExecResult(): ExecResult {
+	const cancelled: ExecResult = { stdout: "", stderr: "", exitCode: 1 };
+	return cancelled;
+}
+
+/**
+ * Render one argument so a POSIX shell reads it back as a single literal word.
+ *
+ * Values made only of the conservative safe set (letters, digits and
+ * `_ . / : = @ % + , -`) are returned untouched. Anything else — including
+ * the empty string — is wrapped in single quotes, with each embedded single
+ * quote rewritten as `'\''` (close quote, escaped quote, reopen quote).
+ */
+function quoteShellArg(value: string): string {
+	const isShellSafe = /^[A-Za-z0-9_./:=@%+,-]+$/u.test(value);
+	return isShellSafe ? value : `'${value.replace(/'/g, `'\\''`)}'`;
+}
+
+/**
+ * Cap `value` at `maxBuffer` bytes of UTF-8 without splitting a character.
+ *
+ * @param value - Captured command output.
+ * @param maxBuffer - Maximum size in bytes. `undefined` disables the cap;
+ *   zero or a negative number yields an empty string.
+ * @returns `value` unchanged when it already fits, otherwise the longest
+ *   prefix of whole characters whose UTF-8 encoding is at most `maxBuffer`
+ *   bytes.
+ */
+function truncateOutput(value: string, maxBuffer?: number): string {
+	if (maxBuffer === undefined) {
+		return value;
+	}
+	// A negative end index makes `subarray` count from the END of the buffer,
+	// which would return nearly the whole output instead of nothing. Clamp so
+	// a non-positive cap always means "keep zero bytes".
+	const limit = Math.max(0, maxBuffer);
+	// Measure first: the common under-limit case needs no buffer copy.
+	if (Buffer.byteLength(value, "utf8") <= limit) {
+		return value;
+	}
+	// `Buffer#toString("utf-8")` on a raw slice emits U+FFFD when the cut
+	// lands inside a multi-byte sequence. Decode through StringDecoder
+	// instead — `write()` returns only complete characters and buffers any
+	// trailing partial bytes internally. Since we discard everything past
+	// the limit, the buffered bytes are dropped silently, so the result
+	// is always ≤ maxBuffer bytes and never contains a replacement
+	// character at the boundary.
+	const decoder = new StringDecoder("utf8");
+	return decoder.write(Buffer.from(value, "utf8").subarray(0, limit));
+}
+
+/** Promise that resolves once a timer of `ms` milliseconds has fired. */
+function sleep(ms: number): Promise<void> {
+	return new Promise<void>((done) => {
+		setTimeout(done, ms);
+	});
+}
+
 export class DaytonaSandbox implements Sandbox {
 	private constructor(private handle: SandboxHandle) {}
 
+	/**
+	 * Report whether the process API exposes every session method that
+	 * `execWithSession` relies on. All five must be present; a partial set
+	 * counts as "no session support".
+	 */
+	private hasSessionApi(processApi: DaytonaProcessApi): boolean {
+		const requiredSessionMethods = [
+			processApi.createSession,
+			processApi.deleteSession,
+			processApi.executeSessionCommand,
+			processApi.getSessionCommand,
+			processApi.getSessionCommandLogs,
+		];
+		return requiredSessionMethods.every((method) => Boolean(method));
+	}
+
+	/**
+	 * Compose the shell line that is sent to the sandbox.
+	 *
+	 * Environment entries become `NAME='value'` assignments placed before
+	 * `command`, and a working directory becomes a leading `cd '<cwd>' &&`.
+	 * Values and the directory are single-quoted with embedded quotes
+	 * escaped; `command` itself is passed through verbatim.
+	 *
+	 * @throws Error when an environment variable name is not a valid shell
+	 *   identifier.
+	 */
+	private buildShellCommand(
+		command: string,
+		cwd?: string,
+		env?: Record<string, string>,
+	): string {
+		const escapeSingleQuotes = (text: string): string =>
+			text.replace(/'/g, "'\\''");
+
+		const assignments: string[] = [];
+		for (const [name, rawValue] of Object.entries(env ?? {})) {
+			if (!/^[A-Za-z_][A-Za-z0-9_]*$/.test(name)) {
+				throw new Error(`Invalid environment variable name: ${name}`);
+			}
+			assignments.push(`${name}='${escapeSingleQuotes(rawValue)}'`);
+		}
+
+		const withEnv =
+			assignments.length > 0 ? `${assignments.join(" ")} ${command}` : command;
+		return cwd ? `cd '${escapeSingleQuotes(cwd)}' && ${withEnv}` : withEnv;
+	}
+
+	/**
+	 * Run `command` inside a dedicated, throwaway Daytona session so that it
+	 * can be abandoned when `options.signal` aborts.
+	 *
+	 * Flow: create a session named `maestro-exec-<uuid>`, start the command
+	 * asynchronously, poll its state every `SESSION_POLL_MS` until an exit
+	 * code appears, then fetch the logs and return them truncated to
+	 * `options.maxBuffer`. The session is always deleted in `finally`.
+	 *
+	 * When the process API lacks session support this falls back to a plain
+	 * `executeCommand` (stdout only), unless a signal was supplied, in which
+	 * case it throws because the command could not be cancelled.
+	 *
+	 * @param command - Fully composed shell line to execute.
+	 * @param options - Only `signal` and `maxBuffer` are read here.
+	 * @returns The command's output and exit code, or the cancelled result
+	 *   (empty output, exit code 1) when the signal aborts at any checkpoint.
+	 * @throws Error when a signal is given without session API support, when
+	 *   no command id is returned, or when `SESSION_COMMAND_TIMEOUT_MS`
+	 *   elapses before the command finishes.
+	 */
+	private async execWithSession(
+		command: string,
+		options: ExecWithArgsOptions = {},
+	): Promise<ExecResult> {
+		const processApi = this.handle.process as DaytonaProcessApi;
+		if (!this.hasSessionApi(processApi)) {
+			if (options.signal?.aborted) {
+				return cancelledExecResult();
+			}
+			if (options.signal) {
+				throw new Error(
+					"Daytona abortable execution requires session API support",
+				);
+			}
+			const result = await processApi.executeCommand(command);
+			return {
+				stdout: truncateOutput(result.result, options.maxBuffer),
+				stderr: "",
+				exitCode: result.exitCode,
+			};
+		}
+
+		const sessionId = `maestro-exec-${randomUUID()}`;
+		let sessionDeleted = false;
+		let sessionDeletePromise: Promise<void> | undefined;
+		// Best-effort session teardown shared by the abort listener and the
+		// `finally` block. It waits for a delete already in flight and tries
+		// again only if that attempt did not succeed.
+		const deleteSession = async (): Promise<void> => {
+			if (sessionDeleted) {
+				return;
+			}
+			if (sessionDeletePromise) {
+				await sessionDeletePromise;
+				if (sessionDeleted) {
+					return;
+				}
+			}
+			sessionDeletePromise = (async () => {
+				try {
+					await processApi.deleteSession!(sessionId);
+					sessionDeleted = true;
+				} catch {
+					// The session may not exist yet during setup cancellation.
+				} finally {
+					sessionDeletePromise = undefined;
+				}
+			})();
+			await sessionDeletePromise;
+		};
+		// Tracks whether the async session command was started but never
+		// observed to complete. We use this to warn loudly if the caller
+		// aborts mid-execution: Daytona's `deleteSession` is documented to
+		// terminate the associated process (see
+		// `deleteSessionDeprecated`: "Delete a PTY session and terminate the
+		// associated process"), but the SDK exposes no direct
+		// command-cancellation endpoint, so the in-flight remote process
+		// outliving the session would be invisible to us without this log.
+		let inflightCmdId: string | null = null;
+		const abortSession = (): void => {
+			void deleteSession();
+		};
+		options.signal?.addEventListener("abort", abortSession, { once: true });
+
+		try {
+			if (options.signal?.aborted) {
+				return cancelledExecResult();
+			}
+			await processApi.createSession(sessionId);
+			if (options.signal?.aborted) {
+				return cancelledExecResult();
+			}
+
+			const response = await processApi.executeSessionCommand(sessionId, {
+				command,
+				runAsync: true,
+				suppressInputEcho: true,
+			});
+			if (!response.cmdId) {
+				throw new Error("Daytona session command did not return a command id");
+			}
+			inflightCmdId = response.cmdId;
+
+			const startedAt = Date.now();
+			while (!options.signal?.aborted) {
+				// The deadline is only evaluated between polls, so the effective
+				// timeout can overshoot by one state request plus one poll delay.
+				if (Date.now() - startedAt >= SESSION_COMMAND_TIMEOUT_MS) {
+					throw new Error("Daytona session command timed out");
+				}
+				const commandState = await processApi.getSessionCommand(
+					sessionId,
+					response.cmdId,
+				);
+				if (options.signal?.aborted) {
+					return cancelledExecResult();
+				}
+				if (typeof commandState.exitCode === "number") {
+					inflightCmdId = null;
+					const logs = await processApi.getSessionCommandLogs(
+						sessionId,
+						response.cmdId,
+					);
+					if (options.signal?.aborted) {
+						return cancelledExecResult();
+					}
+					return {
+						stdout: truncateOutput(
+							logs.stdout ?? logs.output ?? "",
+							options.maxBuffer,
+						),
+						stderr: truncateOutput(logs.stderr ?? "", options.maxBuffer),
+						exitCode: commandState.exitCode,
+					};
+				}
+				// NOTE(review): this delay is not interrupted by the signal, so an
+				// abort is noticed up to one poll interval late.
+				await sleep(SESSION_POLL_MS);
+			}
+
+			return cancelledExecResult();
+		} finally {
+			options.signal?.removeEventListener("abort", abortSession);
+			await deleteSession();
+			if (options.signal?.aborted && inflightCmdId) {
+				// Surface the residual-process risk so a stuck/long-lived
+				// remote command after an aborted session is at least
+				// observable. The Daytona session API does not currently
+				// expose a way for us to verify termination ourselves.
+				console.warn(
+					`[daytona] Session ${sessionId} aborted with command ${inflightCmdId} still in flight; relying on Daytona's documented deleteSession-terminates-process contract.`,
+				);
+			}
+		}
+	}
+
 	/**
 	 * Create a new Daytona sandbox. This is async because it provisions
 	 * a remote sandbox environment.
@@ -49,30 +290,65 @@ export class DaytonaSandbox implements Sandbox {
 		command: string,
 		cwd?: string,
 		env?: Record<string, string>,
+		signal?: AbortSignal,
 	): Promise<ExecResult> {
 		try {
-			// Build command with env vars and cwd if provided
-			let fullCommand = command;
-			if (env && Object.keys(env).length > 0) {
-				const envPrefix = Object.entries(env)
-					.map(([k, v]) => {
-						if (!/^[A-Za-z_][A-Za-z0-9_]*$/.test(k)) {
-							throw new Error(`Invalid environment variable name: ${k}`);
-						}
-						// Use single quotes to prevent shell interpretation
-						const escaped = v.replace(/'/g, "'\\''");
-						return `${k}='${escaped}'`;
-					})
-					.join(" ");
-				fullCommand = `${envPrefix} ${fullCommand}`;
+			const fullCommand = this.buildShellCommand(command, cwd, env);
+			const processApi = this.handle.process as DaytonaProcessApi;
+			if (signal?.aborted) {
+				return cancelledExecResult();
+			}
+			if (signal && this.hasSessionApi(processApi)) {
+				return await this.execWithSession(fullCommand, {
+					signal,
+					maxBuffer: EXEC_OUTPUT_MAX_BUFFER,
+				});
 			}
-			if (cwd) {
-				const escapedCwd = cwd.replace(/'/g, "'\\''");
-				fullCommand = `cd '${escapedCwd}' && ${fullCommand}`;
+			const result = await processApi.executeCommand(fullCommand);
+			// Apply the same `EXEC_OUTPUT_MAX_BUFFER` cap as the session
+			// path so a single sandbox can't accidentally load unbounded
+			// log output through one entry point but not the other
+			// (Cursor Bugbot rounds 4–5 on PR #2748).
+			return {
+				stdout: truncateOutput(result.result, EXEC_OUTPUT_MAX_BUFFER),
+				stderr: "",
+				exitCode: result.exitCode,
+			};
+		} catch (err) {
+			return {
+				stdout: "",
+				stderr: err instanceof Error ? err.message : String(err),
+				exitCode: 1,
+			};
+		}
+	}
+
+	async execWithArgs(
+		command: string,
+		args: string[] = [],
+		options: ExecWithArgsOptions = {},
+	): Promise<ExecResult> {
+		try {
+			const fullCommand = this.buildShellCommand(
+				[command, ...args].map(quoteShellArg).join(" "),
+				options.cwd,
+				options.env,
+			);
+			// Default `maxBuffer` to `EXEC_OUTPUT_MAX_BUFFER` so both the
+			// signal/session path and the plain executeCommand path apply
+			// the same cap. Without this default the caller could omit
+			// `maxBuffer` and load unbounded stdout — the inconsistency
+			// Cursor Bugbot flagged on PR #2748.
+			const maxBuffer = options.maxBuffer ?? EXEC_OUTPUT_MAX_BUFFER;
+			if (options.signal) {
+				return await this.execWithSession(fullCommand, {
+					...options,
+					maxBuffer,
+				});
 			}
 			const result = await this.handle.process.executeCommand(fullCommand);
 			return {
-				stdout: result.result,
+				stdout: truncateOutput(result.result, maxBuffer),
 				stderr: "",
 				exitCode: result.exitCode,
 			};
Original file line number	Diff line number	Diff line change
`@@ -110,13 +110,12 @@`
`110`	`110`	`}`
`111`	`111`	`},`
`112`	`112`	`{`
`113`		`- "name": "extract_document reads a local smoke fixture",`
	`113`	`+ "name": "extract_document blocks a local smoke fixture",`
`114`	`114`	`"kind": "extractDocument",`
`115`		`- "judgeRubric": "extract_document should preserve the text payload, recognize the text format, and report the downloaded filename.",`
	`115`	`+ "judgeRubric": "extract_document should reject private or local document URLs instead of downloading them.",`
`116`	`116`	`"expected": {`
`117`		`- "text": "Maestro extract document smoke test",`
`118`		`- "format": "text",`
`119`		`- "fileName": "fixture.txt"`
	`117`	`+ "blocked": true,`
	`118`	`+ "message": "Blocked document URL host: private or local address"`
`120`	`119`	`}`
`121`	`120`	`},`
`122`	`121`	`{`