import { Stagehand } from "../lib/v3/index.js";
import chalk from "chalk";

/**
 * Runs the GPT-5.4 computer-use (CUA) demo against Hacker News.
 *
 * Starts a local Stagehand session, creates a CUA-mode agent backed by
 * `openai/gpt-5.4`, navigates to Hacker News, and asks the agent to report on
 * and summarize the top story. Errors thrown while the agent is working are
 * logged, and the session is closed in every case once `init()` has succeeded.
 *
 * @returns A promise that settles when the demo has finished and the browser
 *   session has been closed.
 */
async function main(): Promise<void> {
  console.log(`\n${chalk.bold("Stagehand 🤘 GPT-5.4 CUA Demo")}\n`);

  const stagehand = new Stagehand({
    env: "LOCAL",
    verbose: 2,
  });
  await stagehand.init();

  try {
    const page = stagehand.context.pages()[0];

    const agent = stagehand.agent({
      mode: "cua",
      model: {
        modelName: "openai/gpt-5.4",
        apiKey: process.env.OPENAI_API_KEY,
      },
      systemPrompt: `You are a helpful assistant that can use a web browser.
      Do not ask follow up questions, the user will trust your judgement.
      Today's date is ${new Date().toLocaleDateString()}.`,
    });

    await page.goto("https://news.ycombinator.com");

    const instruction =
      "Find the top story on Hacker News and tell me its title, link, and point count and then click on it and extract a summary for me";
    console.log(`Instruction: ${chalk.white(instruction)}`);

    const result = await agent.execute({
      instruction,
      maxSteps: 10,
    });

    console.log(`\n${chalk.green("✓")} Done`);
    console.log(`${chalk.yellow("⤷")} ${result.message}`);
  } catch (error) {
    console.error(`${chalk.red("✗")} Error:`, error);
  } finally {
    // Always release the browser session, whether the run succeeded or not.
    await stagehand.close();
  }
}

// Failures outside the try block above (in `init()` or `close()`) reject the
// promise returned by `main()`. Report them in the same format and signal
// failure through the exit code instead of leaving the rejection unhandled.
main().catch((error: unknown) => {
  console.error(`${chalk.red("✗")} Error:`, error);
  process.exitCode = 1;
});
/**
 * Whether this client should talk to the model using the newer computer
 * tool format rather than the legacy preview format.
 *
 * @returns `true` when the configured model name begins with "gpt-5".
 */
private get usesNewComputerTool(): boolean {
  const newToolModelPrefix = "gpt-5";
  return this.modelName.startsWith(newToolModelPrefix);
}
format + const computerTool = this.usesNewComputerTool + ? { type: "computer" as const } + : { + type: "computer_use_preview" as const, display_width: this.currentViewport.width, display_height: this.currentViewport.height, environment: this.environment, - }, - ], + }; + + const requestParams: Record = { + model: this.modelName, + tools: [computerTool], input: inputItems, - truncation: "auto", + ...(this.usesNewComputerTool ? {} : { truncation: "auto" }), }; // Add custom tools if available @@ -601,29 +607,38 @@ export class OpenAICUAClient extends AgentClient { // Process each output item for (const item of output) { if (item.type === "computer_call" && this.isComputerCallItem(item)) { - // Handle computer calls + // Handle computer calls (both single-action and batched-actions formats) try { - const action = this.convertComputerCallToAction(item); + const actions = this.convertComputerCallToActions(item); - if (action && this.actionHandler) { - logger({ - category: "agent", - message: `Executing computer action: ${action.type}`, - level: 1, - }); - await this.actionHandler(action); + if (this.actionHandler) { + for (const action of actions) { + logger({ + category: "agent", + message: `Executing computer action: ${action.type}`, + level: 1, + }); + await this.actionHandler(action); + } } - // Capture a screenshot + // Capture a screenshot after all actions in the batch const screenshot = await this.captureScreenshot(); - // Create a computer_call_output for the next request + // Build the output โ€” use "computer_screenshot" for new format, "input_image" for legacy + const outputType = this.usesNewComputerTool + ? ("computer_screenshot" as const) + : ("input_image" as const); + const outputItem = { type: "computer_call_output" as const, call_id: item.call_id, output: { - type: "input_image" as const, + type: outputType, image_url: screenshot, + ...(this.usesNewComputerTool + ? 
{ detail: "original" as const } + : {}), }, } as ResponseInputItem; @@ -633,13 +648,13 @@ export class OpenAICUAClient extends AgentClient { level: 2, }); - // Add current URL if available - if (this.currentUrl) { + // Legacy format supports current_url on the output; new format does not + if (!this.usesNewComputerTool && this.currentUrl) { const computerCallOutput = outputItem as { type: "computer_call_output"; call_id: string; output: { - type: "input_image"; + type: "input_image" | "computer_screenshot"; image_url: string; current_url?: string; }; @@ -662,7 +677,7 @@ export class OpenAICUAClient extends AgentClient { type: "computer_call_output"; call_id: string; output: { - type: "input_image"; + type: "input_image" | "computer_screenshot"; image_url: string; }; acknowledged_safety_checks?: SafetyCheck[]; @@ -687,26 +702,31 @@ export class OpenAICUAClient extends AgentClient { }); try { - // Capture a screenshot even on error const screenshot = await this.captureScreenshot(); + const outputType = this.usesNewComputerTool + ? ("computer_screenshot" as const) + : ("input_image" as const); + const errorOutputItem = { type: "computer_call_output" as const, call_id: item.call_id, output: { - type: "input_image" as const, + type: outputType, image_url: screenshot, error: errorMessage, + ...(this.usesNewComputerTool + ? 
/**
 * Normalizes a `computer_call` item into a flat list of agent actions.
 *
 * Two response shapes are handled:
 * - batched: `call.actions` is an array, and each well-formed entry becomes
 *   one `AgentAction`;
 * - single: only `call.action` is set, and conversion is delegated to
 *   `convertComputerCallToAction`.
 *
 * Entries in a batch that are not objects, or that lack a string `type`, are
 * dropped rather than dereferenced, because the array comes from an API
 * response that is only checked with `Array.isArray`.
 *
 * @param call - The `computer_call` response item to convert.
 * @returns The actions to execute, in the order provided; an empty array when
 *   the call carries no usable action.
 */
private convertComputerCallToActions(call: ComputerCallItem): AgentAction[] {
  const { actions } = call;

  if (Array.isArray(actions)) {
    return actions
      .filter(
        (action) =>
          typeof action === "object" &&
          action !== null &&
          typeof action.type === "string",
      )
      // `type` is listed first so it stays the leading key of the result;
      // all other properties (x, y, button, ...) are spread directly onto it.
      .map((action) => ({ type: action.type, ...action }));
  }

  const single = this.convertComputerCallToAction(call);
  return single ? [single] : [];
}
// Verifies that V3CuaAgentHandler leaves screenshotting to the CUA client:
// running several actions through the handler's action callback must not
// trigger any page.screenshot call from the handler itself.
describe("v3 cua handler screenshot behavior", () => {
  let page: MockPage;
  let logs: LogLine[];
  let logger: (line: LogLine) => void;

  beforeEach(() => {
    page = new MockPage();
    logs = [];
    logger = (line) => {
      logs.push(line);
    };
    fakeCuaClient = new FakeCuaClient();
  });

  it("does not take per-action screenshots when a batch of actions runs", async () => {
    const screenshotSpy = vi.spyOn(page, "screenshot");
    const batchSize = 4;

    // Simulate the client dispatching a batch of actions within one turn.
    fakeCuaClient.executeImpl = vi.fn(async () => {
      for (let i = 0; i < batchSize; i += 1) {
        await fakeCuaClient.actionHandler?.({
          type: "scroll",
          x: 0,
          y: 0,
          scroll_x: 0,
          scroll_y: 100,
        });
      }
      return {
        success: true,
        message: "ok",
        actions: [],
        completed: true,
      };
    });

    const handler = new V3CuaAgentHandler(
      {
        context: {
          awaitActivePage: async () => page,
        },
        bus: { emit: vi.fn() },
        isCaptchaAutoSolveEnabled: false,
        isAdvancedStealth: false,
        configuredViewport: { width: 1288, height: 711 },
        isAgentReplayActive: () => false,
        updateMetrics: vi.fn(),
      } as never,
      logger,
      {
        modelName: "openai/gpt-5.4",
        clientOptions: { waitBetweenActions: 1 },
      } as never,
    );

    // Stub the private executeAction so no real page interaction happens;
    // the cast exists only to reach the private member for spying.
    vi.spyOn(
      handler as unknown as {
        executeAction: (action: Record<string, unknown>) => Promise<unknown>;
      },
      "executeAction",
    ).mockResolvedValue({ success: true });

    await handler.execute({
      instruction: "scroll to the bottom",
      highlightCursor: false,
    });

    // The handler must not call page.screenshot for each action in a batch —
    // the CUA client takes a single screenshot after all actions itself.
    expect(screenshotSpy).not.toHaveBeenCalled();
  });
});