Remove error thrown on no action taken (#2027)

miguelg719 · cubic-dev-ai[bot] · web-flow · commit 21ba2defa8db · 2026-04-22T13:35:45.000-07:00
# why

After adding regex schema validation for element ids in `act`, the models were left with no way to say 'no candidate found/no action to be taken', since an empty string for the element id would fail validation. This PR corrects the regressed behavior.

# what changed

`act()` no longer throws when the model can't find a target. Callers get the structured `success: false` result, matching how `observe()` already returns `[]` when nothing matches.

- `packages/core/lib/inference.ts` — wrap the act fields (elementId, description, method, arguments, twoStep) under `action: {...}.nullable()`. When the model returns `action: null`, inference returns `element: undefined` and the existing handler branch returns `{ success: false, actions: [], message: "Failed to perform act: No action found" }`.
- Lift `twoStep` out of the nullable action object and back to the top level of the act schema — small models (gpt-4.1-nano) flatten booleans out of anyOf variants during structured output, causing spurious AI_NoObjectGeneratedError on otherwise-valid responses.
- `packages/core/lib/prompt.ts` — update buildActSystemPrompt, buildActPrompt, and buildStepTwoPrompt to instruct the model to set `action: null` when no element matches, with an explicit ban on empty strings / placeholder values.
- `packages/core/lib/v3/handlers/actHandler.ts` — no logic change; the existing no-action branch is now reachable.

`act` will return the following in these cases:

<img width="586" height="343" alt="Screenshot 2026-04-22 at 1 19 33 PM" src="https://github.com/user-attachments/assets/34efa7dd-7e80-4c6e-916f-de29d4e28083" />

# test plan

---

## Summary by cubic

Stops `act` from throwing when no element matches by making the response `action` nullable and updating prompts/parsing to treat “no action” as valid. Fixes the regression where empty element IDs failed validation. Aligns with Linear STG-1849.

- **Bug Fixes**
  - Changed `act` schema to `{ action: { elementId, description, method, arguments } | null, twoStep: boolean }` with the element ID regex; `twoStep` is top‑level and defaults to `false`.
  - Updated prompts to set `action: null` when no element matches and to not fabricate elements or use empty/placeholder values.
  - Adjusted parsing to handle `action === null` and return `element: undefined`, letting the existing no‑action path return `success: false`.
  - Updated integration tests and test utils to use the new `action` shape and top‑level `twoStep`.

<sup>Written for commit 1cb51fa. Summary will update on new commits. <a href="https://cubic.dev/pr/browserbase/stagehand/pull/2027">Review in cubic</a></sup>

---------

Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com>
diff --git a/packages/core/lib/inference.ts b/packages/core/lib/inference.ts
@@ -411,34 +411,41 @@ export async function act({
   const isGPT5 = llmClient.modelName.includes("gpt-5"); // TODO: remove this as we update support for gpt-5 configuration options
 
   const actSchema = z.object({
-    elementId: z
-      .string()
-      .regex(/^\d+-\d+$/)
-      .describe(
-        "the ID string associated with the element. Never include surrounding square brackets. This field must follow the format of 'number-number'. for example, '0-76' or '16-21'",
-      ),
-    description: z
-      .string()
-      .describe("a description of the accessible element and its purpose"),
-    method: z
-      .enum(
-        // Use Object.values() for Zod v3 compatibility - z.enum() in v3 doesn't accept TypeScript enums directly
-        Object.values(SupportedUnderstudyAction) as unknown as readonly [
-          string,
-          ...string[],
-        ],
-      )
+    action: z
+      .object({
+        elementId: z
+          .string()
+          .regex(/^\d+-\d+$/)
+          .describe(
+            "the ID string associated with the element. Never include surrounding square brackets. This field must follow the format of 'number-number'. for example, '0-76' or '16-21'",
+          ),
+        description: z
+          .string()
+          .describe("a description of the accessible element and its purpose"),
+        method: z
+          .enum(
+            // Use Object.values() for Zod v3 compatibility - z.enum() in v3 doesn't accept TypeScript enums directly
+            Object.values(SupportedUnderstudyAction) as unknown as readonly [
+              string,
+              ...string[],
+            ],
+          )
+          .describe(
+            "the candidate method/action to interact with the element. Select one of the available Understudy interaction methods.",
+          ),
+        arguments: z.array(
+          z
+            .string()
+            .describe(
+              "the arguments to pass to the method. For example, for a click, the arguments are empty, but for a fill, the arguments are the value to fill in.",
+            ),
+        ),
+      })
+      .nullable()
       .describe(
-        "the candidate method/action to interact with the element. Select one of the available Understudy interaction methods.",
+        "The element to act on. Return null if no element on the page matches the instruction — do NOT fabricate or guess an element, and never emit empty strings or placeholder values.",
       ),
-    arguments: z.array(
-      z
-        .string()
-        .describe(
-          "the arguments to pass to the method. For example, for a click, the arguments are empty, but for a fill, the arguments are the value to fill in.",
-        ),
-    ),
-    twoStep: z.boolean(),
+    twoStep: z.boolean().default(false),
   });
 
   type ActResponse = z.infer<typeof actSchema>;
@@ -512,12 +519,14 @@ export async function act({
     });
   }
 
-  const parsedElement = {
-    elementId: actData.elementId,
-    description: String(actData.description),
-    method: String(actData.method),
-    arguments: actData.arguments,
-  };
+  const parsedElement = actData.action
+    ? {
+        elementId: actData.action.elementId,
+        description: String(actData.action.description),
+        method: String(actData.action.method),
+        arguments: actData.action.arguments,
+      }
+    : undefined;
 
   return {
     element: parsedElement,
diff --git a/packages/core/lib/prompt.ts b/packages/core/lib/prompt.ts
@@ -169,7 +169,7 @@ You will be given:
 1. a user defined instruction about what action to take
 2. a hierarchical accessibility tree showing the semantic structure of the page. The tree is a hybrid of the DOM and the accessibility tree.
 
-Return the element that matches the instruction if it exists. Otherwise, return an empty object.`;
+Return the element that matches the instruction if it exists. If no element on the page matches the instruction, set \`action\` to null. Do not fabricate or guess an element — empty strings or placeholder values for elementId/description/method are not acceptable.`;
   const content = actSystemPrompt.replace(/\s+/g, " ");
 
   return {
@@ -206,8 +206,8 @@ export function buildActPrompt(
   General Instructions: 
     Provide an action for this element such as ${supportedActions.join(", ")}. Remember that to users, buttons and links look the same in most cases.
     When choosing non-left click actions, provide right or middle as the argument
-    If the action is completely unrelated to a potential action to be taken on the page, return an empty object. 
-    ONLY return one action. If multiple actions are relevant, return the most relevant one. 
+    If the action is completely unrelated to a potential action to be taken on the page, or no matching element exists, set \`action\` to null. Do not fabricate or guess an element.
+    ONLY return one action. If multiple actions are relevant, return the most relevant one.
     If the user is asking to scroll to a position on the page, e.g., 'halfway' or 0.75, etc, you must return the argument formatted as the correct percentage, e.g., '50%' or '75%', etc.
     If the user is asking to scroll to the next chunk/previous chunk, choose the nextChunk/prevChunk method. No arguments are required here.
     If the action implies a key press, e.g., 'press enter', 'press a', 'press space', etc., always choose the press method with the appropriate key as argument — e.g. 'a', 'Enter', 'Space'. Do not choose a click action on an on-screen keyboard. Capitalize the first character like 'Enter', 'Tab', 'Escape' only for special keys. 
@@ -246,8 +246,8 @@ export function buildStepTwoPrompt(
   
   General Instructions: 
   Provide an action for this element such as ${supportedActions.join(", ")}. Remember that to users, buttons and links look the same in most cases.
-  If the action is completely unrelated to a potential action to be taken on the page, return an empty object. 
-  ONLY return one action. If multiple actions are relevant, return the most relevant one. 
+  If the action is completely unrelated to a potential action to be taken on the page, or no matching element exists, set \`action\` to null. Do not fabricate or guess an element.
+  ONLY return one action. If multiple actions are relevant, return the most relevant one.
   If the user is asking to scroll to a position on the page, e.g., 'halfway' or 0.75, etc, you must return the argument formatted as the correct percentage, e.g., '50%' or '75%', etc.
   If the user is asking to scroll to the next chunk/previous chunk, choose the nextChunk/prevChunk method. No arguments are required here.
   If the action implies a key press, e.g., 'press enter', 'press a', 'press space', etc., always choose the press method with the appropriate key as argument — e.g. 'a', 'Enter', 'Space'. Do not choose a click action on an on-screen keyboard. Capitalize the first character like 'Enter', 'Tab', 'Escape' only for special keys. 
diff --git a/packages/core/tests/integration/flowLogger.spec.ts b/packages/core/tests/integration/flowLogger.spec.ts
@@ -213,10 +213,12 @@ test.describe("flow logger integration", () => {
     const llmClient = createScriptedAisdkTestLlmClient({
       jsonResponses: {
         act: (options) => ({
-          elementId: findLastEncodedId(options),
-          description: `click ${buttonText}`,
-          method: "click",
-          arguments: [],
+          action: {
+            elementId: findLastEncodedId(options),
+            description: `click ${buttonText}`,
+            method: "click",
+            arguments: [],
+          },
           twoStep: false,
         }),
       },
@@ -435,10 +437,12 @@ test.describe("flow logger integration", () => {
     const llmClient = createScriptedAisdkTestLlmClient({
       jsonResponses: {
         act: (options) => ({
-          elementId: findLastEncodedId(options),
-          description: `click ${buttonText}`,
-          method: "click",
-          arguments: [],
+          action: {
+            elementId: findLastEncodedId(options),
+            description: `click ${buttonText}`,
+            method: "click",
+            arguments: [],
+          },
           twoStep: false,
         }),
       },
diff --git a/packages/core/tests/integration/testUtils.ts b/packages/core/tests/integration/testUtils.ts
@@ -133,7 +133,7 @@ function resolveJsonResponseKey(
   };
   const properties = schema?.properties ?? {};
 
-  if ("elementId" in properties && "twoStep" in properties) {
+  if ("action" in properties && "twoStep" in properties) {
     return "act";
   }
 
diff --git a/packages/core/tests/integration/timeouts.spec.ts b/packages/core/tests/integration/timeouts.spec.ts
@@ -116,10 +116,12 @@ function createToolTimeoutTestLlmClient(
       if (responseModelName === "act") {
         return {
           data: {
-            elementId: "1-0",
-            description: "click body",
-            method: "click",
-            arguments: [],
+            action: {
+              elementId: "1-0",
+              description: "click body",
+              method: "click",
+              arguments: [],
+            },
             twoStep: false,
           },
           usage,

Original file line number	Diff line number	Diff line change
`@@ -133,7 +133,7 @@ function resolveJsonResponseKey(`
`133`	`133`	`};`
`134`	`134`	`const properties = schema?.properties ?? {};`
`135`	`135`
`136`		`- if ("elementId" in properties && "twoStep" in properties) {`
	`136`	`+ if ("action" in properties && "twoStep" in properties) {`
`137`	`137`	`return "act";`
`138`	`138`	`}`
`139`	`139`