Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions packages/core/src/common.ts
Original file line number Diff line number Diff line change
Expand Up @@ -561,6 +561,17 @@ export const ifMidsceneLocatorField = (field: any): boolean => {
return false;
};

/**
 * Builds the string form of an object-style user prompt for dumping.
 *
 * The result starts as `promptObj.prompt`. When `promptObj.images` is a
 * non-empty array, a suffix of the form ` (with N image)` / ` (with N images)`
 * is appended, where N is the array length. Only the count is recorded; the
 * image entries themselves are not included in the output.
 *
 * NOTE(review): the suffix changes the dumped prompt text relative to the
 * original `prompt` value. The review comment embedded below raises this as a
 * concern for cached flow replays — confirm against the callers before relying
 * on the dumped string being equal to the planned prompt.
 *
 * @param promptObj - The object variant of `TUserPrompt` (the plain-string
 *   variant is excluded by the parameter type).
 * @returns The prompt text, optionally followed by the image-count suffix.
 */
const formatPromptWithImages = (
promptObj: Exclude<TUserPrompt, string>,
): string => {
let promptString = promptObj.prompt;
// Only annotate when there is at least one image; a missing, non-array, or
// empty `images` value leaves the prompt text untouched.
if (Array.isArray(promptObj.images) && promptObj.images.length > 0) {
const imageCount = promptObj.images.length;
// Pluralize "image" only when the count is greater than one.
promptString += ` (with ${imageCount} image${imageCount > 1 ? 's' : ''})`;
}
Comment on lines +569 to +571
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Keep dumped prompt text identical for cache replays

Appending "(with N images)" here changes the locator prompt content that gets serialized into yamlFlow; that flow is later cached (packages/core/src/agent/agent.ts) and executed via runYaml, so cache hits replay a different prompt than the one originally planned. In multimodal runs, there is no later step that strips this suffix or restores images, so the cached execution can drift and miss elements when wording is sensitive. Preserve the original prompt text and serialize image metadata separately instead of mutating the prompt string.

Useful? React with 👍 / 👎.

return promptString;
};

export const dumpMidsceneLocatorField = (field: any): string => {
assert(
ifMidsceneLocatorField(field),
Expand All @@ -580,7 +591,7 @@ export const dumpMidsceneLocatorField = (field: any): string => {
}
// If prompt is a TUserPrompt object, extract the prompt string
if (typeof field.prompt === 'object' && field.prompt.prompt) {
return field.prompt.prompt; // TODO: dump images if necessary
return formatPromptWithImages(field.prompt);
}
}

Expand Down Expand Up @@ -648,7 +659,7 @@ export const dumpActionParam = (
fieldValue.prompt.prompt
) {
// If prompt is a TUserPrompt object, extract the prompt string
result[fieldName] = fieldValue.prompt.prompt;
result[fieldName] = formatPromptWithImages(fieldValue.prompt);
}
}
}
Expand Down
66 changes: 66 additions & 0 deletions packages/core/tests/unit-test/utils.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -713,6 +713,72 @@ describe('dumpActionParam', () => {
`);
});

// Verifies how dumpActionParam renders a locator field whose prompt is an
// object carrying an `images` array: two images, one image, and an empty array.
// In every case the expected dump contains only the prompt string for
// `locator`; `center` and `rect` do not appear in the snapshot.
it('should format locator fields that have image arrays', () => {
const schema = z.object({
locator: getMidsceneLocationSchema(),
});

// Case 1: two images -> plural suffix "(with 2 images)".
const inputWithImages = {
locator: {
midscene_location_field_flag: true,
prompt: {
prompt: 'find the button',
images: [
{ name: 'button1.png', url: 'data:image/png;base64,xyz' },
{ name: 'button2.png', url: 'https://example.com/img.png' },
],
},
center: [100, 200],
rect: { left: 50, top: 100, width: 100, height: 50 },
},
};

const resultWithImages = dumpActionParam(inputWithImages, schema);
expect(resultWithImages).toMatchInlineSnapshot(`
{
"locator": "find the button (with 2 images)",
}
`);

// Case 2: exactly one image -> singular suffix "(with 1 image)".
const inputWithOneImage = {
locator: {
midscene_location_field_flag: true,
prompt: {
prompt: 'find the text',
images: [{ name: 'text.png', url: 'data:image/png;base64,abc' }],
},
center: [100, 200],
rect: { left: 50, top: 100, width: 100, height: 50 },
},
};

const resultWithOneImage = dumpActionParam(inputWithOneImage, schema);
expect(resultWithOneImage).toMatchInlineSnapshot(`
{
"locator": "find the text (with 1 image)",
}
`);

// Case 3: empty images array -> prompt text is dumped with no suffix.
const inputWithEmptyImages = {
locator: {
midscene_location_field_flag: true,
prompt: {
prompt: 'find the link',
images: [],
},
center: [100, 200],
rect: { left: 50, top: 100, width: 100, height: 50 },
},
};

const resultWithEmptyImages = dumpActionParam(inputWithEmptyImages, schema);
expect(resultWithEmptyImages).toMatchInlineSnapshot(`
{
"locator": "find the link",
}
`);
});

it('should handle edge cases and invalid inputs', () => {
const schema = z.object({
foo: z.string(),
Expand Down
Loading