Skip to content
Closed
169 changes: 88 additions & 81 deletions packages/core/lib/inference.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,14 @@ import type {
InferStagehandSchema,
StagehandZodObject,
} from "./v3/zodCompat.js";
import { SupportedUnderstudyAction } from "./v3/types/private/handlers.js";
import {
ElementRef,
ModelAction,
ModelActResponse,
modelActionSchema,
modelActResponseSchema,
} from "./v3/types/private/modelActions.js";
import type { EncodedId } from "./v3/types/private/internal.js";
import type { Variables } from "./v3/types/public/agent.js";

// Re-export for backward compatibility
Expand All @@ -30,6 +37,81 @@ function withLlmTimeout<T>(promise: Promise<T>, operation: string): Promise<T> {
);
}

/**
 * Flat, string-argument shape produced by `toLegacyInferenceAction` from a
 * structured `ModelAction`.
 */
type LegacyInferenceAction = {
/** Encoded element id, built by `encodeElementRef` as `<frameOrdinal>-<backendNodeId>`. */
elementId: EncodedId;
/** Description copied verbatim from the model action. */
description: string;
/** Method name of the model action, e.g. "click" or "fill". */
method: ModelAction["method"];
/** Positional string arguments for the method; empty when the method takes none. */
arguments: string[];
};

/**
 * Serializes an element reference into its `<frameOrdinal>-<backendNodeId>`
 * string form.
 *
 * @param ref - Reference holding the frame ordinal and backend node id.
 * @returns Both values joined by a single hyphen, e.g. `0-76`.
 */
function encodeElementRef(ref: ElementRef): EncodedId {
  const { frameOrdinal, backendNodeId } = ref;
  return `${frameOrdinal}-${backendNodeId}`;
}

/**
 * Flattens a structured `ModelAction` into a `LegacyInferenceAction`.
 *
 * Every variant contributes the same three fields (encoded target id,
 * description, method); only the positional `arguments` list differs per
 * method, so each `case` below supplies just that list.
 *
 * @param action - The model action to convert.
 * @returns The equivalent legacy action with string-only arguments.
 */
function toLegacyInferenceAction(action: ModelAction): LegacyInferenceAction {
  // Builds the result around the per-method argument list.
  const withArgs = (args: string[]): LegacyInferenceAction => ({
    elementId: encodeElementRef(action.target),
    description: action.description,
    method: action.method,
    arguments: args,
  });

  // Deliberately no `default` branch: falling off the switch keeps the
  // compiler's return-path check able to flag a method that is not handled.
  // NOTE(review): assumes these cases cover every `ModelAction` method —
  // confirm against the union's declaration.
  switch (action.method) {
    case "click":
      // The mouse button is only passed along when one is set.
      return withArgs(action.button ? [action.button] : []);
    case "fill":
      return withArgs([action.value]);
    case "type":
      return withArgs([action.text]);
    case "press":
      return withArgs([action.key]);
    case "scrollTo":
      return withArgs([action.position]);
    case "selectOptionFromDropdown":
      return withArgs([action.option]);
    case "dragAndDrop":
      // The drop destination is itself an element ref, so it is encoded too.
      return withArgs([encodeElementRef(action.destination)]);
    case "doubleClick":
    case "hover":
    case "nextChunk":
    case "prevChunk":
      // These methods take no arguments.
      return withArgs([]);
  }
}

export async function extract<T extends StagehandZodObject>({
instruction,
domElements,
Expand Down Expand Up @@ -164,7 +246,6 @@ export async function extract<T extends StagehandZodObject>({
response_model: {
name: "Metadata",
schema: metadataSchema,
strict: true,
},
temperature: isGPT5 ? 1 : 0.1,
top_p: 1,
Expand Down Expand Up @@ -265,39 +346,7 @@ export async function observe({

const observeSchema = z.object({
elements: z
.array(
z.object({
elementId: z
.string()
.regex(/^\d+-\d+$/)
.describe(
"the ID string associated with the element. Never include surrounding square brackets. This field must follow the format of 'number-number'.",
),
description: z
.string()
.describe(
"a description of the accessible element and its purpose",
),
method: z
.enum(
// Use Object.values() for Zod v3 compatibility - z.enum() in v3 doesn't accept TypeScript enums directly
Object.values(SupportedUnderstudyAction) as unknown as readonly [
string,
...string[],
],
)
.describe(
`the candidate method/action to interact with the element. Select one of the available Understudy interaction methods.`,
),
arguments: z.array(
z
.string()
.describe(
"the arguments to pass to the method. For example, for a click, the arguments are empty, but for a fill, the arguments are the value to fill in.",
),
),
}),
)
.array(modelActionSchema)
.describe("an array of accessible elements that match the instruction"),
});

Expand Down Expand Up @@ -334,7 +383,6 @@ export async function observe({
response_model: {
schema: observeSchema,
name: "Observation",
strict: true,
},
temperature: isGPT5 ? 1 : 0.1,
top_p: 1,
Expand Down Expand Up @@ -379,13 +427,7 @@ export async function observe({

const parsedElements =
observeData.elements?.map((el) => {
const base = {
elementId: el.elementId,
description: String(el.description),
method: String(el.method),
arguments: el.arguments,
};
return base;
return toLegacyInferenceAction(el);
}) ?? [];

return {
Expand Down Expand Up @@ -415,38 +457,9 @@ export async function act({
}) {
const isGPT5 = llmClient.modelName.includes("gpt-5"); // TODO: remove this as we update support for gpt-5 configuration options

const actSchema = z.object({
elementId: z
.string()
.regex(/^\d+-\d+$/)
.describe(
"the ID string associated with the element. Never include surrounding square brackets. This field must follow the format of 'number-number'. for example, '0-76' or '16-21'",
),
description: z
.string()
.describe("a description of the accessible element and its purpose"),
method: z
.enum(
// Use Object.values() for Zod v3 compatibility - z.enum() in v3 doesn't accept TypeScript enums directly
Object.values(SupportedUnderstudyAction) as unknown as readonly [
string,
...string[],
],
)
.describe(
"the candidate method/action to interact with the element. Select one of the available Understudy interaction methods.",
),
arguments: z.array(
z
.string()
.describe(
"the arguments to pass to the method. For example, for a click, the arguments are empty, but for a fill, the arguments are the value to fill in.",
),
),
twoStep: z.boolean(),
});
const actSchema = modelActResponseSchema;

type ActResponse = z.infer<typeof actSchema>;
type ActResponse = ModelActResponse;

const messages: ChatMessage[] = [
buildActSystemPrompt(userProvidedInstructions),
Expand Down Expand Up @@ -475,7 +488,6 @@ export async function act({
response_model: {
schema: actSchema,
name: "act",
strict: true,
},
temperature: isGPT5 ? 1 : 0.1,
top_p: 1,
Expand Down Expand Up @@ -518,12 +530,7 @@ export async function act({
});
}

const parsedElement = {
elementId: actData.elementId,
description: String(actData.description),
method: String(actData.method),
arguments: actData.arguments,
};
const parsedElement = toLegacyInferenceAction(actData.action);

return {
element: parsedElement,
Expand Down
5 changes: 4 additions & 1 deletion packages/core/lib/v3/agent/tools/act.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { tool } from "ai";
import { NoObjectGeneratedError, tool } from "ai";
import { z } from "zod";
import type { V3 } from "../../v3.js";
import type { Action } from "../../types/public/methods.js";
Expand Down Expand Up @@ -66,6 +66,9 @@ export const actTool = (
if (error instanceof TimeoutError) {
throw error;
}
if (NoObjectGeneratedError.isInstance(error)) {
throw error;
}
return {
success: false,
error: error?.message ?? String(error),
Expand Down
5 changes: 4 additions & 1 deletion packages/core/lib/v3/agent/tools/extract.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { tool } from "ai";
import { NoObjectGeneratedError, tool } from "ai";
import { z, ZodTypeAny } from "zod";
import type { V3 } from "../../v3.js";
import type { AgentModelConfig } from "../../types/public/agent.js";
Expand Down Expand Up @@ -103,6 +103,9 @@ export const extractTool = (
if (error instanceof TimeoutError) {
throw error;
}
if (NoObjectGeneratedError.isInstance(error)) {
throw error;
}
return { success: false, error: error?.message ?? String(error) };
}
},
Expand Down
5 changes: 4 additions & 1 deletion packages/core/lib/v3/agent/tools/fillform.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { tool } from "ai";
import { NoObjectGeneratedError, tool } from "ai";
import { z } from "zod";
import type { V3 } from "../../v3.js";
import type { Action } from "../../types/public/methods.js";
Expand Down Expand Up @@ -77,6 +77,9 @@ export const fillFormTool = (
if (error instanceof TimeoutError) {
throw error;
}
if (NoObjectGeneratedError.isInstance(error)) {
throw error;
}
return {
success: false,
error: error?.message ?? String(error),
Expand Down
4 changes: 0 additions & 4 deletions packages/core/lib/v3/external_clients/aisdk.ts
Original file line number Diff line number Diff line change
Expand Up @@ -134,10 +134,6 @@ export class AISdkClient extends LLMClient {
topP: options.top_p,
frequencyPenalty: options.frequency_penalty,
presencePenalty: options.presence_penalty,
providerOptions:
options.response_model.strict === false
? { openai: { strictJsonSchema: false } }
: undefined,
});

return {
Expand Down
Loading
Loading