Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions packages/core/examples/gpt5-4-cua-example.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import { Stagehand } from "../lib/v3/index.js";
import chalk from "chalk";

/**
 * Runs the GPT-5.4 computer-use (CUA) demo end to end.
 *
 * Creates a local Stagehand session, builds a CUA-mode agent backed by
 * `openai/gpt-5.4` (API key read from `OPENAI_API_KEY`), navigates the first
 * page to Hacker News, and asks the agent to report on and summarize the top
 * story. The agent's final message is printed when the run completes.
 *
 * Errors raised while the agent is running are logged and the process exit
 * code is set to 1 so shells and CI can detect the failure. The Stagehand
 * session is closed whether the run succeeds or fails.
 */
async function main(): Promise<void> {
  console.log(`\n${chalk.bold("Stagehand 🤘 GPT-5.4 CUA Demo")}\n`);

  const stagehand = new Stagehand({
    env: "LOCAL",
    verbose: 2,
  });
  await stagehand.init();

  try {
    const page = stagehand.context.pages()[0];

    const agent = stagehand.agent({
      mode: "cua",
      model: {
        modelName: "openai/gpt-5.4",
        apiKey: process.env.OPENAI_API_KEY,
      },
      systemPrompt: `You are a helpful assistant that can use a web browser.
Do not ask follow up questions, the user will trust your judgement.
Today's date is ${new Date().toLocaleDateString()}.`,
    });

    await page.goto("https://news.ycombinator.com");

    const instruction =
      "Find the top story on Hacker News and tell me its title, link, and point count and then click on it and extract a summary for me";
    console.log(`Instruction: ${chalk.white(instruction)}`);

    const result = await agent.execute({
      instruction,
      maxSteps: 10,
    });

    console.log(`\n${chalk.green("✓")} Done`);
    console.log(`${chalk.yellow("⤷")} ${result.message}`);
  } catch (error) {
    console.error(`${chalk.red("✗")} Error:`, error);
    // Without this the script would still exit with status 0 after a failed
    // run; setting exitCode (rather than calling process.exit) lets the
    // finally block close the session first.
    process.exitCode = 1;
  } finally {
    await stagehand.close();
  }
}

// Script entry point: start the demo. The returned promise is deliberately
// not awaited at module top level.
void main();
1 change: 1 addition & 0 deletions packages/core/lib/v3/agent/AgentProvider.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import { MicrosoftCUAClient } from "./MicrosoftCUAClient.js";

// Map model names to their provider types
export const modelToAgentProviderMap: Record<string, AgentProviderType> = {
"gpt-5.4": "openai",
"computer-use-preview": "openai",
"computer-use-preview-2025-03-11": "openai",
"claude-sonnet-4-20250514": "anthropic",
Expand Down
113 changes: 71 additions & 42 deletions packages/core/lib/v3/agent/OpenAICUAClient.ts
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ export class OpenAICUAClient extends AgentClient {
private tools?: ToolSet;
private safetyConfirmationHandler?: SafetyConfirmationHandler;

private get usesNewComputerTool(): boolean {
return this.modelName.startsWith("gpt-5");
Comment thread
miguelg719 marked this conversation as resolved.
}

constructor(
type: AgentType,
modelName: string,
Expand Down Expand Up @@ -293,15 +297,15 @@ export class OpenAICUAClient extends AgentClient {
if (item.type === "computer_call" && this.isComputerCallItem(item)) {
logger({
category: "agent",
message: `Found computer_call: ${item.action.type}, payload: ${JSON.stringify(item.action)}, call_id: ${item.call_id}`,
message: `Found computer_call with call_id: ${item.call_id}`,
level: 2,
});
const action = this.convertComputerCallToAction(item);
if (action) {
const actions = this.convertComputerCallToActions(item);
for (const action of actions) {
stepActions.push(action);
logger({
category: "agent",
message: `Converted computer_call to action: ${action.type}`,
message: `Found computer_call action: ${action.type}, payload: ${JSON.stringify(action)}, call_id: ${item.call_id}`,
level: 2,
});
}
Expand Down Expand Up @@ -385,8 +389,8 @@ export class OpenAICUAClient extends AgentClient {
return (
item.type === "computer_call" &&
"call_id" in item &&
"action" in item &&
typeof item.action === "object"
(("action" in item && typeof item.action === "object") ||
("actions" in item && Array.isArray(item.actions)))
);
}

Expand Down Expand Up @@ -487,19 +491,21 @@ export class OpenAICUAClient extends AgentClient {
usage: Record<string, number>;
}> {
try {
// Create the request parameters
const requestParams: Record<string, unknown> = {
model: this.modelName,
tools: [
{
type: "computer_use_preview",
// Create the request parameters, branching on tool format
const computerTool = this.usesNewComputerTool
? { type: "computer" as const }
: {
type: "computer_use_preview" as const,
display_width: this.currentViewport.width,
display_height: this.currentViewport.height,
environment: this.environment,
},
],
};

const requestParams: Record<string, unknown> = {
model: this.modelName,
tools: [computerTool],
input: inputItems,
truncation: "auto",
...(this.usesNewComputerTool ? {} : { truncation: "auto" }),
};

// Add custom tools if available
Expand Down Expand Up @@ -601,29 +607,38 @@ export class OpenAICUAClient extends AgentClient {
// Process each output item
for (const item of output) {
if (item.type === "computer_call" && this.isComputerCallItem(item)) {
// Handle computer calls
// Handle computer calls (both single-action and batched-actions formats)
try {
const action = this.convertComputerCallToAction(item);
const actions = this.convertComputerCallToActions(item);

if (action && this.actionHandler) {
logger({
category: "agent",
message: `Executing computer action: ${action.type}`,
level: 1,
});
await this.actionHandler(action);
if (this.actionHandler) {
for (const action of actions) {
logger({
category: "agent",
message: `Executing computer action: ${action.type}`,
level: 1,
});
await this.actionHandler(action);
}
}

// Capture a screenshot
// Capture a screenshot after all actions in the batch
const screenshot = await this.captureScreenshot();

// Create a computer_call_output for the next request
// Build the output — use "computer_screenshot" for new format, "input_image" for legacy
const outputType = this.usesNewComputerTool
? ("computer_screenshot" as const)
: ("input_image" as const);

const outputItem = {
type: "computer_call_output" as const,
call_id: item.call_id,
output: {
type: "input_image" as const,
type: outputType,
image_url: screenshot,
...(this.usesNewComputerTool
? { detail: "original" as const }
: {}),
},
} as ResponseInputItem;

Expand All @@ -633,13 +648,13 @@ export class OpenAICUAClient extends AgentClient {
level: 2,
});

// Add current URL if available
if (this.currentUrl) {
// Legacy format supports current_url on the output; new format does not
if (!this.usesNewComputerTool && this.currentUrl) {
const computerCallOutput = outputItem as {
type: "computer_call_output";
call_id: string;
output: {
type: "input_image";
type: "input_image" | "computer_screenshot";
image_url: string;
current_url?: string;
};
Expand All @@ -662,7 +677,7 @@ export class OpenAICUAClient extends AgentClient {
type: "computer_call_output";
call_id: string;
output: {
type: "input_image";
type: "input_image" | "computer_screenshot";
image_url: string;
};
acknowledged_safety_checks?: SafetyCheck[];
Expand All @@ -687,26 +702,31 @@ export class OpenAICUAClient extends AgentClient {
});

try {
// Capture a screenshot even on error
const screenshot = await this.captureScreenshot();

const outputType = this.usesNewComputerTool
? ("computer_screenshot" as const)
: ("input_image" as const);

const errorOutputItem = {
type: "computer_call_output" as const,
call_id: item.call_id,
output: {
type: "input_image" as const,
type: outputType,
image_url: screenshot,
error: errorMessage,
...(this.usesNewComputerTool
? { detail: "original" as const }
: {}),
},
} as ResponseInputItem;

// Add current URL if available
if (this.currentUrl) {
if (!this.usesNewComputerTool && this.currentUrl) {
const computerCallOutput = errorOutputItem as {
type: "computer_call_output";
call_id: string;
output: {
type: "input_image";
type: "input_image" | "computer_screenshot";
image_url: string;
current_url?: string;
};
Expand All @@ -729,7 +749,7 @@ export class OpenAICUAClient extends AgentClient {
type: "computer_call_output";
call_id: string;
output: {
type: "input_image";
type: "input_image" | "computer_screenshot";
image_url: string;
};
acknowledged_safety_checks?: SafetyCheck[];
Expand All @@ -744,14 +764,12 @@ export class OpenAICUAClient extends AgentClient {
if (screenshotError instanceof StagehandClosedError) {
throw screenshotError;
}
// If we can't capture a screenshot, just send the error
logger({
category: "agent",
message: `Error capturing screenshot: ${String(screenshotError)}`,
level: 0,
});

// For error cases without a screenshot, we need to use a string output
nextInputItems.push({
type: "computer_call_output",
call_id: item.call_id,
Expand Down Expand Up @@ -863,12 +881,11 @@ export class OpenAICUAClient extends AgentClient {
call: ComputerCallItem,
): AgentAction | null {
const { action } = call;
if (!action) return null;

// Instead of wrapping the action in a params object, spread the action properties directly
// This ensures properties like x, y, button, etc. are directly accessible on the AgentAction
return {
type: action.type as string,
...action, // Spread all properties from the action
...action,
};
}

Expand All @@ -894,6 +911,18 @@ export class OpenAICUAClient extends AgentClient {
}
}

/**
 * Normalizes a computer_call item into a list of agent actions.
 *
 * When the item carries an `actions` array, every entry becomes one
 * AgentAction (the entry's own properties are spread onto the result).
 * Otherwise the item is handed to `convertComputerCallToAction`, and its
 * result is wrapped in a one-element list, or an empty list when that
 * conversion yields nothing.
 *
 * @param call - The computer_call item to convert.
 * @returns The converted actions, in the order they appear in the item.
 */
private convertComputerCallToActions(call: ComputerCallItem): AgentAction[] {
  const batched = call.actions;

  // No batched actions present: fall back to the single-action path.
  if (!batched || !Array.isArray(batched)) {
    const legacyAction = this.convertComputerCallToAction(call);
    if (legacyAction) {
      return [legacyAction];
    }
    return [];
  }

  return batched.map((entry) => {
    return { type: entry.type as string, ...entry };
  });
}

private convertFunctionCallToAction(
call: FunctionCallItem,
): AgentAction | null {
Expand Down
18 changes: 4 additions & 14 deletions packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,8 @@ export class V3CuaAgentHandler {
}
}
await new Promise((r) => setTimeout(r, 300));
// Skip logging for screenshot actions - they're no-ops, the actual
// Page.screenshot in captureAndSendScreenshot() is logged separately
// Skip logging for screenshot actions - they're no-ops; the CUA client
// takes its own screenshot via screenshotProvider between API turns.
const shouldLog = action.type !== "screenshot";
if (shouldLog) {
await FlowLogger.runWithLogging(
Expand All @@ -151,17 +151,6 @@ export class V3CuaAgentHandler {
action.timestamp = Date.now();

await new Promise((r) => setTimeout(r, waitBetween));
try {
await this.captureAndSendScreenshot();
} catch (e) {
this.logger({
category: "agent",
message: `Warning: Failed to take screenshot after action: ${String(
(e as Error)?.message ?? e,
)}`,
level: 1,
});
}
} catch (error) {
const msg = (error as Error)?.message ?? String(error);
this.logger({
Expand Down Expand Up @@ -503,7 +492,8 @@ export class V3CuaAgentHandler {
return { success: true };
}
case "screenshot": {
// No-op - screenshot is captured by captureAndSendScreenshot() after all actions
// No-op - the CUA client captures a screenshot itself after each
// computer_call (or batch of actions) for the next request.
return { success: true };
}
case "goto": {
Expand Down
10 changes: 8 additions & 2 deletions packages/core/lib/v3/types/public/agent.ts
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,7 @@ export type AgentType =
| "bedrock";

export const AVAILABLE_CUA_MODELS = [
"openai/gpt-5.4",
Comment thread
miguelg719 marked this conversation as resolved.
"openai/computer-use-preview",
"openai/computer-use-preview-2025-03-11",
"anthropic/claude-opus-4-5-20251101",
Expand Down Expand Up @@ -577,10 +578,14 @@ export interface ResponseItem {
export interface ComputerCallItem extends ResponseItem {
type: "computer_call";
call_id: string;
action: {
action?: {
type: string;
[key: string]: unknown;
};
actions?: Array<{
type: string;
[key: string]: unknown;
}>;
pending_safety_checks?: Array<{
id: string;
code: string;
Expand All @@ -602,8 +607,9 @@ export type ResponseInputItem =
call_id: string;
output:
| {
type: "input_image";
type: "input_image" | "computer_screenshot";
image_url: string;
detail?: "original" | "high" | "low";
current_url?: string;
error?: string;
[key: string]: unknown;
Expand Down
Loading
Loading