Skip to content

Commit 503e2ff

Browse files
committed
feat: add support for gpt 5.4 native computer use
1 parent 20b601d commit 503e2ff

5 files changed

Lines changed: 125 additions & 48 deletions

File tree

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import { Stagehand } from "../lib/v3/index.js";
2+
import chalk from "chalk";
3+
4+
/**
 * Demo entry point: starts a local Stagehand session, creates a
 * computer-use ("cua") agent backed by the GPT-5.4 model, opens Hacker News
 * and asks the agent to report on and summarize the top story.
 *
 * Errors raised while the agent is running are logged and recorded in
 * `process.exitCode`; the Stagehand session is always closed afterwards.
 */
async function main(): Promise<void> {
  console.log(`\n${chalk.bold("Stagehand 🤘 GPT-5.4 CUA Demo")}\n`);

  const stagehand = new Stagehand({
    env: "LOCAL",
    verbose: 2,
  });
  // Kept outside the try block: whether close() is safe after a failed init
  // is not visible from here, so init failures surface via the top-level
  // catch handler instead of going through the finally clause.
  await stagehand.init();

  try {
    const page = stagehand.context.pages()[0];
    if (!page) {
      // Fail with a clear message rather than a TypeError on page.goto().
      throw new Error("No page is available in the Stagehand context");
    }

    const agent = stagehand.agent({
      mode: "cua",
      model: {
        modelName: "openai/gpt-5.4-2026-03-05",
        apiKey: process.env.OPENAI_API_KEY,
      },
      systemPrompt: `You are a helpful assistant that can use a web browser.
Do not ask follow up questions, the user will trust your judgement.
Today's date is ${new Date().toLocaleDateString()}.`,
    });

    await page.goto("https://news.ycombinator.com");

    const instruction =
      "Find the top story on Hacker News and tell me its title, link, and point count and then click on it and extract a summary for me";
    console.log(`Instruction: ${chalk.white(instruction)}`);

    const result = await agent.execute({
      instruction,
      maxSteps: 10,
    });

    console.log(`\n${chalk.green("✓")} Done`);
    console.log(`${chalk.yellow("⤷")} ${result.message}`);
  } catch (error) {
    console.error(`${chalk.red("✗")} Error:`, error);
    // Signal failure to the calling shell without cutting cleanup short.
    process.exitCode = 1;
  } finally {
    await stagehand.close();
  }
}

// Handle rejections that escape main() (e.g. from init() or close()) so the
// script never ends with an unhandled promise rejection.
main().catch((error: unknown) => {
  console.error(`${chalk.red("✗")} Error:`, error);
  process.exitCode = 1;
});

packages/core/lib/v3/agent/AgentProvider.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import { MicrosoftCUAClient } from "./MicrosoftCUAClient.js";
1414

1515
// Map model names to their provider types
1616
export const modelToAgentProviderMap: Record<string, AgentProviderType> = {
17+
"gpt-5.4-2026-03-05": "openai",
1718
"computer-use-preview": "openai",
1819
"computer-use-preview-2025-03-11": "openai",
1920
"claude-sonnet-4-20250514": "anthropic",

packages/core/lib/v3/agent/OpenAICUAClient.ts

Lines changed: 68 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,10 @@ export class OpenAICUAClient extends AgentClient {
5656
private tools?: ToolSet;
5757
private safetyConfirmationHandler?: SafetyConfirmationHandler;
5858

59+
/**
 * True when the configured model name begins with "gpt-5".
 *
 * Other code in this class reads this flag to choose between the newer
 * `computer` tool request format and the legacy `computer_use_preview` one.
 */
private get usesNewComputerTool(): boolean {
  const newToolModelPrefix = "gpt-5";
  return this.modelName.indexOf(newToolModelPrefix) === 0;
}
62+
5963
constructor(
6064
type: AgentType,
6165
modelName: string,
@@ -291,17 +295,12 @@ export class OpenAICUAClient extends AgentClient {
291295
const stepActions: AgentAction[] = [];
292296
for (const item of output) {
293297
if (item.type === "computer_call" && this.isComputerCallItem(item)) {
294-
logger({
295-
category: "agent",
296-
message: `Found computer_call: ${item.action.type}, payload: ${JSON.stringify(item.action)}, call_id: ${item.call_id}`,
297-
level: 2,
298-
});
299-
const action = this.convertComputerCallToAction(item);
300-
if (action) {
298+
const actions = this.convertComputerCallToActions(item);
299+
for (const action of actions) {
301300
stepActions.push(action);
302301
logger({
303302
category: "agent",
304-
message: `Converted computer_call to action: ${action.type}`,
303+
message: `Found computer_call action: ${action.type}, payload: ${JSON.stringify(action)}, call_id: ${item.call_id}`,
305304
level: 2,
306305
});
307306
}
@@ -385,8 +384,8 @@ export class OpenAICUAClient extends AgentClient {
385384
return (
386385
item.type === "computer_call" &&
387386
"call_id" in item &&
388-
"action" in item &&
389-
typeof item.action === "object"
387+
(("action" in item && typeof item.action === "object") ||
388+
("actions" in item && Array.isArray(item.actions)))
390389
);
391390
}
392391

@@ -487,19 +486,21 @@ export class OpenAICUAClient extends AgentClient {
487486
usage: Record<string, number>;
488487
}> {
489488
try {
490-
// Create the request parameters
491-
const requestParams: Record<string, unknown> = {
492-
model: this.modelName,
493-
tools: [
494-
{
495-
type: "computer_use_preview",
489+
// Create the request parameters, branching on tool format
490+
const computerTool = this.usesNewComputerTool
491+
? { type: "computer" as const }
492+
: {
493+
type: "computer_use_preview" as const,
496494
display_width: this.currentViewport.width,
497495
display_height: this.currentViewport.height,
498496
environment: this.environment,
499-
},
500-
],
497+
};
498+
499+
const requestParams: Record<string, unknown> = {
500+
model: this.modelName,
501+
tools: [computerTool],
501502
input: inputItems,
502-
truncation: "auto",
503+
...(this.usesNewComputerTool ? {} : { truncation: "auto" }),
503504
};
504505

505506
// Add custom tools if available
@@ -601,29 +602,36 @@ export class OpenAICUAClient extends AgentClient {
601602
// Process each output item
602603
for (const item of output) {
603604
if (item.type === "computer_call" && this.isComputerCallItem(item)) {
604-
// Handle computer calls
605+
// Handle computer calls (both single-action and batched-actions formats)
605606
try {
606-
const action = this.convertComputerCallToAction(item);
607+
const actions = this.convertComputerCallToActions(item);
607608

608-
if (action && this.actionHandler) {
609-
logger({
610-
category: "agent",
611-
message: `Executing computer action: ${action.type}`,
612-
level: 1,
613-
});
614-
await this.actionHandler(action);
609+
if (this.actionHandler) {
610+
for (const action of actions) {
611+
logger({
612+
category: "agent",
613+
message: `Executing computer action: ${action.type}`,
614+
level: 1,
615+
});
616+
await this.actionHandler(action);
617+
}
615618
}
616619

617-
// Capture a screenshot
620+
// Capture a screenshot after all actions in the batch
618621
const screenshot = await this.captureScreenshot();
619622

620-
// Create a computer_call_output for the next request
623+
// Build the output — use "computer_screenshot" for new format, "input_image" for legacy
624+
const outputType = this.usesNewComputerTool
625+
? ("computer_screenshot" as const)
626+
: ("input_image" as const);
627+
621628
const outputItem = {
622629
type: "computer_call_output" as const,
623630
call_id: item.call_id,
624631
output: {
625-
type: "input_image" as const,
632+
type: outputType,
626633
image_url: screenshot,
634+
...(this.usesNewComputerTool ? { detail: "original" as const } : {}),
627635
},
628636
} as ResponseInputItem;
629637

@@ -633,13 +641,13 @@ export class OpenAICUAClient extends AgentClient {
633641
level: 2,
634642
});
635643

636-
// Add current URL if available
637-
if (this.currentUrl) {
644+
// Legacy format supports current_url on the output; new format does not
645+
if (!this.usesNewComputerTool && this.currentUrl) {
638646
const computerCallOutput = outputItem as {
639647
type: "computer_call_output";
640648
call_id: string;
641649
output: {
642-
type: "input_image";
650+
type: "input_image" | "computer_screenshot";
643651
image_url: string;
644652
current_url?: string;
645653
};
@@ -662,7 +670,7 @@ export class OpenAICUAClient extends AgentClient {
662670
type: "computer_call_output";
663671
call_id: string;
664672
output: {
665-
type: "input_image";
673+
type: "input_image" | "computer_screenshot";
666674
image_url: string;
667675
};
668676
acknowledged_safety_checks?: SafetyCheck[];
@@ -687,26 +695,29 @@ export class OpenAICUAClient extends AgentClient {
687695
});
688696

689697
try {
690-
// Capture a screenshot even on error
691698
const screenshot = await this.captureScreenshot();
692699

700+
const outputType = this.usesNewComputerTool
701+
? ("computer_screenshot" as const)
702+
: ("input_image" as const);
703+
693704
const errorOutputItem = {
694705
type: "computer_call_output" as const,
695706
call_id: item.call_id,
696707
output: {
697-
type: "input_image" as const,
708+
type: outputType,
698709
image_url: screenshot,
699710
error: errorMessage,
711+
...(this.usesNewComputerTool ? { detail: "original" as const } : {}),
700712
},
701713
} as ResponseInputItem;
702714

703-
// Add current URL if available
704-
if (this.currentUrl) {
715+
if (!this.usesNewComputerTool && this.currentUrl) {
705716
const computerCallOutput = errorOutputItem as {
706717
type: "computer_call_output";
707718
call_id: string;
708719
output: {
709-
type: "input_image";
720+
type: "input_image" | "computer_screenshot";
710721
image_url: string;
711722
current_url?: string;
712723
};
@@ -729,7 +740,7 @@ export class OpenAICUAClient extends AgentClient {
729740
type: "computer_call_output";
730741
call_id: string;
731742
output: {
732-
type: "input_image";
743+
type: "input_image" | "computer_screenshot";
733744
image_url: string;
734745
};
735746
acknowledged_safety_checks?: SafetyCheck[];
@@ -744,14 +755,12 @@ export class OpenAICUAClient extends AgentClient {
744755
if (screenshotError instanceof StagehandClosedError) {
745756
throw screenshotError;
746757
}
747-
// If we can't capture a screenshot, just send the error
748758
logger({
749759
category: "agent",
750760
message: `Error capturing screenshot: ${String(screenshotError)}`,
751761
level: 0,
752762
});
753763

754-
// For error cases without a screenshot, we need to use a string output
755764
nextInputItems.push({
756765
type: "computer_call_output",
757766
call_id: item.call_id,
@@ -863,12 +872,11 @@ export class OpenAICUAClient extends AgentClient {
863872
call: ComputerCallItem,
864873
): AgentAction | null {
865874
const { action } = call;
875+
if (!action) return null;
866876

867-
// Instead of wrapping the action in a params object, spread the action properties directly
868-
// This ensures properties like x, y, button, etc. are directly accessible on the AgentAction
869877
return {
870878
type: action.type as string,
871-
...action, // Spread all properties from the action
879+
...action,
872880
};
873881
}
874882

@@ -894,6 +902,20 @@ export class OpenAICUAClient extends AgentClient {
894902
}
895903
}
896904

905+
/**
 * Converts a computer_call item into a list of agent actions.
 *
 * When the item carries an `actions` array, every entry becomes one
 * AgentAction whose properties are copied from the entry. Otherwise the
 * item is passed to `convertComputerCallToAction`, and the result is
 * wrapped in a one-element list (or an empty list when there is none).
 *
 * @param call - The computer_call item to convert.
 * @returns The converted actions, in their original order.
 */
private convertComputerCallToActions(
  call: ComputerCallItem,
): AgentAction[] {
  const batched = call.actions;

  if (!batched || !Array.isArray(batched)) {
    // No batch present: fall back to the single-action conversion.
    const lone = this.convertComputerCallToAction(call);
    return lone ? [lone] : [];
  }

  const toAgentAction = (entry: (typeof batched)[number]): AgentAction => ({
    type: entry.type as string,
    ...entry,
  });

  return batched.map((entry) => toAgentAction(entry));
}
918+
897919
private convertFunctionCallToAction(
898920
call: FunctionCallItem,
899921
): AgentAction | null {

packages/core/lib/v3/types/public/agent.ts

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -449,6 +449,7 @@ export type AgentType =
449449
| "bedrock";
450450

451451
export const AVAILABLE_CUA_MODELS = [
452+
"openai/gpt-5.4-2026-03-05",
452453
"openai/computer-use-preview",
453454
"openai/computer-use-preview-2025-03-11",
454455
"anthropic/claude-opus-4-5-20251101",
@@ -577,10 +578,14 @@ export interface ResponseItem {
577578
export interface ComputerCallItem extends ResponseItem {
578579
type: "computer_call";
579580
call_id: string;
580-
action: {
581+
action?: {
581582
type: string;
582583
[key: string]: unknown;
583584
};
585+
actions?: Array<{
586+
type: string;
587+
[key: string]: unknown;
588+
}>;
584589
pending_safety_checks?: Array<{
585590
id: string;
586591
code: string;
@@ -602,8 +607,9 @@ export type ResponseInputItem =
602607
call_id: string;
603608
output:
604609
| {
605-
type: "input_image";
610+
type: "input_image" | "computer_screenshot";
606611
image_url: string;
612+
detail?: "original" | "high" | "low";
607613
current_url?: string;
608614
error?: string;
609615
[key: string]: unknown;

packages/core/tests/unit/public-api/llm-and-agents.test.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ describe("LLM and Agents public API types", () => {
3737

3838
describe("AVAILABLE_CUA_MODELS", () => {
3939
const expectedModels = [
40+
"openai/gpt-5.4-2026-03-05",
4041
"openai/computer-use-preview",
4142
"openai/computer-use-preview-2025-03-11",
4243
"anthropic/claude-opus-4-5-20251101",

0 commit comments

Comments
 (0)