feat: add support for gpt 5.4 native computer use

miguelg719 · miguelg719 · commit 503e2ffc1f39 · 2026-04-21T15:05:40.000-07:00
diff --git a/packages/core/examples/gpt54-cua-example.ts b/packages/core/examples/gpt54-cua-example.ts
@@ -0,0 +1,61 @@
+import { Stagehand } from "../lib/v3/index.js";
+import chalk from "chalk";
+
+/**
+ * Demo: drives a local browser with a GPT-5.4 computer-use ("cua") agent.
+ *
+ * Starts a LOCAL Stagehand session, opens Hacker News, and asks the agent to
+ * report on the top story and summarize it. The OpenAI key is read from
+ * OPENAI_API_KEY. Once initialized, the session is closed whether or not the
+ * agent run succeeds.
+ */
+async function main(): Promise<void> {
+  console.log(`\n${chalk.bold("Stagehand 🤘 GPT-5.4 CUA Demo")}\n`);
+
+  const stagehand = new Stagehand({
+    env: "LOCAL",
+    verbose: 2,
+  });
+  await stagehand.init();
+
+  try {
+    const page = stagehand.context.pages()[0];
+
+    const agent = stagehand.agent({
+      mode: "cua",
+      model: {
+        modelName: "openai/gpt-5.4-2026-03-05",
+        apiKey: process.env.OPENAI_API_KEY,
+      },
+      systemPrompt: `You are a helpful assistant that can use a web browser.
+      Do not ask follow up questions, the user will trust your judgement.
+      Today's date is ${new Date().toLocaleDateString()}.`,
+    });
+
+    await page.goto("https://news.ycombinator.com");
+
+    const instruction =
+      "Find the top story on Hacker News and tell me its title, link, and point count and then click on it and extract a summary for me";
+    console.log(`Instruction: ${chalk.white(instruction)}`);
+
+    const result = await agent.execute({
+      instruction,
+      maxSteps: 10,
+    });
+
+    console.log(`\n${chalk.green("✓")} Done`);
+    console.log(`${chalk.yellow("⤷")} ${result.message}`);
+  } catch (error) {
+    console.error(`${chalk.red("✗")} Error:`, error);
+    // Surface the failure to the shell instead of exiting with status 0.
+    process.exitCode = 1;
+  } finally {
+    await stagehand.close();
+  }
+}
+
+// init() runs before the try block and close() can reject from `finally`,
+// so catch here rather than leaving a floating promise.
+main().catch((error: unknown) => {
+  console.error(`${chalk.red("✗")} Fatal:`, error);
+  process.exitCode = 1;
+});
diff --git a/packages/core/lib/v3/agent/AgentProvider.ts b/packages/core/lib/v3/agent/AgentProvider.ts
@@ -14,6 +14,7 @@ import { MicrosoftCUAClient } from "./MicrosoftCUAClient.js";
 
 // Map model names to their provider types
 export const modelToAgentProviderMap: Record<string, AgentProviderType> = {
+  "gpt-5.4-2026-03-05": "openai",
   "computer-use-preview": "openai",
   "computer-use-preview-2025-03-11": "openai",
   "claude-sonnet-4-20250514": "anthropic",
diff --git a/packages/core/lib/v3/agent/OpenAICUAClient.ts b/packages/core/lib/v3/agent/OpenAICUAClient.ts
@@ -56,6 +56,10 @@ export class OpenAICUAClient extends AgentClient {
   private tools?: ToolSet;
   private safetyConfirmationHandler?: SafetyConfirmationHandler;
 
+  private get usesNewComputerTool(): boolean {
+    return this.modelName.startsWith("gpt-5"); // selects the "computer" tool format over "computer_use_preview"
+  } // NOTE(review): prefix match accepts any "gpt-5*" model name — confirm all of them support the "computer" tool
+
   constructor(
     type: AgentType,
     modelName: string,
@@ -291,17 +295,12 @@ export class OpenAICUAClient extends AgentClient {
       const stepActions: AgentAction[] = [];
       for (const item of output) {
         if (item.type === "computer_call" && this.isComputerCallItem(item)) {
-          logger({
-            category: "agent",
-            message: `Found computer_call: ${item.action.type}, payload: ${JSON.stringify(item.action)}, call_id: ${item.call_id}`,
-            level: 2,
-          });
-          const action = this.convertComputerCallToAction(item);
-          if (action) {
+          const actions = this.convertComputerCallToActions(item);
+          for (const action of actions) {
             stepActions.push(action);
             logger({
               category: "agent",
-              message: `Converted computer_call to action: ${action.type}`,
+              message: `Found computer_call action: ${action.type}, payload: ${JSON.stringify(action)}, call_id: ${item.call_id}`,
               level: 2,
             });
           }
@@ -385,8 +384,8 @@ export class OpenAICUAClient extends AgentClient {
     return (
       item.type === "computer_call" &&
       "call_id" in item &&
-      "action" in item &&
-      typeof item.action === "object"
+      (("action" in item && typeof item.action === "object") ||
+        ("actions" in item && Array.isArray(item.actions)))
     );
   }
 
@@ -487,19 +486,21 @@ export class OpenAICUAClient extends AgentClient {
     usage: Record<string, number>;
   }> {
     try {
-      // Create the request parameters
-      const requestParams: Record<string, unknown> = {
-        model: this.modelName,
-        tools: [
-          {
-            type: "computer_use_preview",
+      // Create the request parameters, branching on tool format
+      const computerTool = this.usesNewComputerTool
+        ? { type: "computer" as const }
+        : {
+            type: "computer_use_preview" as const,
             display_width: this.currentViewport.width,
             display_height: this.currentViewport.height,
             environment: this.environment,
-          },
-        ],
+          };
+
+      const requestParams: Record<string, unknown> = {
+        model: this.modelName,
+        tools: [computerTool],
         input: inputItems,
-        truncation: "auto",
+        ...(this.usesNewComputerTool ? {} : { truncation: "auto" }),
       };
 
       // Add custom tools if available
@@ -601,29 +602,36 @@ export class OpenAICUAClient extends AgentClient {
     // Process each output item
     for (const item of output) {
       if (item.type === "computer_call" && this.isComputerCallItem(item)) {
-        // Handle computer calls
+        // Handle computer calls (both single-action and batched-actions formats)
         try {
-          const action = this.convertComputerCallToAction(item);
+          const actions = this.convertComputerCallToActions(item);
 
-          if (action && this.actionHandler) {
-            logger({
-              category: "agent",
-              message: `Executing computer action: ${action.type}`,
-              level: 1,
-            });
-            await this.actionHandler(action);
+          if (this.actionHandler) {
+            for (const action of actions) {
+              logger({
+                category: "agent",
+                message: `Executing computer action: ${action.type}`,
+                level: 1,
+              });
+              await this.actionHandler(action);
+            }
           }
 
-          // Capture a screenshot
+          // Capture a screenshot after all actions in the batch
           const screenshot = await this.captureScreenshot();
 
-          // Create a computer_call_output for the next request
+          // Build the output — use "computer_screenshot" for new format, "input_image" for legacy
+          const outputType = this.usesNewComputerTool
+            ? ("computer_screenshot" as const)
+            : ("input_image" as const);
+
           const outputItem = {
             type: "computer_call_output" as const,
             call_id: item.call_id,
             output: {
-              type: "input_image" as const,
+              type: outputType,
               image_url: screenshot,
+              ...(this.usesNewComputerTool ? { detail: "original" as const } : {}),
             },
           } as ResponseInputItem;
 
@@ -633,13 +641,13 @@ export class OpenAICUAClient extends AgentClient {
             level: 2,
           });
 
-          // Add current URL if available
-          if (this.currentUrl) {
+          // Legacy format supports current_url on the output; new format does not
+          if (!this.usesNewComputerTool && this.currentUrl) {
             const computerCallOutput = outputItem as {
               type: "computer_call_output";
               call_id: string;
               output: {
-                type: "input_image";
+                type: "input_image" | "computer_screenshot";
                 image_url: string;
                 current_url?: string;
               };
@@ -662,7 +670,7 @@ export class OpenAICUAClient extends AgentClient {
                 type: "computer_call_output";
                 call_id: string;
                 output: {
-                  type: "input_image";
+                  type: "input_image" | "computer_screenshot";
                   image_url: string;
                 };
                 acknowledged_safety_checks?: SafetyCheck[];
@@ -687,26 +695,29 @@ export class OpenAICUAClient extends AgentClient {
           });
 
           try {
-            // Capture a screenshot even on error
             const screenshot = await this.captureScreenshot();
 
+            const outputType = this.usesNewComputerTool
+              ? ("computer_screenshot" as const)
+              : ("input_image" as const);
+
             const errorOutputItem = {
               type: "computer_call_output" as const,
               call_id: item.call_id,
               output: {
-                type: "input_image" as const,
+                type: outputType,
                 image_url: screenshot,
                 error: errorMessage,
+                ...(this.usesNewComputerTool ? { detail: "original" as const } : {}),
               },
             } as ResponseInputItem;
 
-            // Add current URL if available
-            if (this.currentUrl) {
+            if (!this.usesNewComputerTool && this.currentUrl) {
               const computerCallOutput = errorOutputItem as {
                 type: "computer_call_output";
                 call_id: string;
                 output: {
-                  type: "input_image";
+                  type: "input_image" | "computer_screenshot";
                   image_url: string;
                   current_url?: string;
                 };
@@ -729,7 +740,7 @@ export class OpenAICUAClient extends AgentClient {
                   type: "computer_call_output";
                   call_id: string;
                   output: {
-                    type: "input_image";
+                    type: "input_image" | "computer_screenshot";
                     image_url: string;
                   };
                   acknowledged_safety_checks?: SafetyCheck[];
@@ -744,14 +755,12 @@ export class OpenAICUAClient extends AgentClient {
             if (screenshotError instanceof StagehandClosedError) {
               throw screenshotError;
             }
-            // If we can't capture a screenshot, just send the error
             logger({
               category: "agent",
               message: `Error capturing screenshot: ${String(screenshotError)}`,
               level: 0,
             });
 
-            // For error cases without a screenshot, we need to use a string output
             nextInputItems.push({
               type: "computer_call_output",
               call_id: item.call_id,
@@ -863,12 +872,11 @@ export class OpenAICUAClient extends AgentClient {
     call: ComputerCallItem,
   ): AgentAction | null {
     const { action } = call;
+    if (!action) return null;
 
-    // Instead of wrapping the action in a params object, spread the action properties directly
-    // This ensures properties like x, y, button, etc. are directly accessible on the AgentAction
     return {
       type: action.type as string,
-      ...action, // Spread all properties from the action
+      ...action,
     };
   }
 
@@ -894,6 +902,20 @@ export class OpenAICUAClient extends AgentClient {
     }
   }
 
+  /** Flattens a computer_call into AgentActions: the batched `actions` when present, else the single `action`. */
+  private convertComputerCallToActions(
+    call: ComputerCallItem,
+  ): AgentAction[] {
+    const { actions: batched } = call;
+    if (!batched || !Array.isArray(batched)) {
+      // No batch on this call: defer to the single-action converter.
+      const lone = this.convertComputerCallToAction(call);
+      return lone ? [lone] : [];
+    }
+    // Spreading after `type` lets the raw action's own fields take precedence.
+    return batched.map((entry) => ({ type: entry.type as string, ...entry }));
+  }
+
   private convertFunctionCallToAction(
     call: FunctionCallItem,
   ): AgentAction | null {
diff --git a/packages/core/lib/v3/types/public/agent.ts b/packages/core/lib/v3/types/public/agent.ts
@@ -449,6 +449,7 @@ export type AgentType =
   | "bedrock";
 
 export const AVAILABLE_CUA_MODELS = [
+  "openai/gpt-5.4-2026-03-05",
   "openai/computer-use-preview",
   "openai/computer-use-preview-2025-03-11",
   "anthropic/claude-opus-4-5-20251101",
@@ -577,10 +578,14 @@ export interface ResponseItem {
 export interface ComputerCallItem extends ResponseItem {
   type: "computer_call";
   call_id: string;
-  action: {
+  action?: {
     type: string;
     [key: string]: unknown;
   };
+  actions?: Array<{
+    type: string;
+    [key: string]: unknown;
+  }>;
   pending_safety_checks?: Array<{
     id: string;
     code: string;
@@ -602,8 +607,9 @@ export type ResponseInputItem =
       call_id: string;
       output:
         | {
-            type: "input_image";
+            type: "input_image" | "computer_screenshot";
             image_url: string;
+            detail?: "original" | "high" | "low";
             current_url?: string;
             error?: string;
             [key: string]: unknown;
diff --git a/packages/core/tests/unit/public-api/llm-and-agents.test.ts b/packages/core/tests/unit/public-api/llm-and-agents.test.ts
@@ -37,6 +37,7 @@ describe("LLM and Agents public API types", () => {
 
   describe("AVAILABLE_CUA_MODELS", () => {
     const expectedModels = [
+      "openai/gpt-5.4-2026-03-05",
       "openai/computer-use-preview",
       "openai/computer-use-preview-2025-03-11",
       "anthropic/claude-opus-4-5-20251101",