Skip to content

Commit 0b05ea0

Browse files
committed
fix(core): harden aiLocateAll contract
1 parent 88d4260 commit 0b05ea0

9 files changed

Lines changed: 148 additions & 21 deletions

File tree

apps/site/docs/en/api.mdx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1000,15 +1000,15 @@ function aiLocateAll(
10001000
height: number;
10011001
};
10021002
center: [number, number];
1003-
dpr: number; // device pixel ratio
1003+
dpr?: number; // device pixel ratio
10041004
}>
10051005
>;
10061006
```
10071007

10081008
- Parameters:
10091009

10101010
- `locate: string | Object` - A natural language description shared by all target elements, or [prompting with images](#prompting-with-images).
1011-
- `options?: Object` - Optional, a configuration object. `uiContext` and image prompt options are supported. Single-element optimizations such as `xpath`, cache lookup, and `deepLocate` are not applied to `aiLocateAll()`.
1011+
- `options?: Object` - Optional, a configuration object. `uiContext` and image prompt options are supported. Single-element optimizations such as `xpath`, cache lookup, and `deepLocate` are not applied to `aiLocateAll()`; passing the corresponding options (`xpath`, `cacheable`, `deepLocate`, `deepThink`, `fileChooserAccept`) throws an error.
10121012

10131013
- Return Value:
10141014

apps/site/docs/zh/api.mdx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -993,15 +993,15 @@ function aiLocateAll(
993993
height: number;
994994
};
995995
center: [number, number];
996-
dpr: number; // device pixel ratio
996+
dpr?: number; // device pixel ratio
997997
}>
998998
>;
999999
```
10001000

10011001
- 参数:
10021002

10031003
- `locate: string | Object` - 所有目标元素共用的自然语言描述,或[使用图片作为提示词](#使用图片作为提示词)
1004-
- `options?: Object` - 可选配置对象。支持 `uiContext` 和图片提示词相关配置。`xpath`、定位缓存、`deepLocate` 等单元素定位优化不会应用到 `aiLocateAll()`
1004+
- `options?: Object` - 可选配置对象。支持 `uiContext` 和图片提示词相关配置。`xpath`、定位缓存、`deepLocate` 等单元素定位优化不会应用到 `aiLocateAll()`;传入对应选项(`xpath`、`cacheable`、`deepLocate`、`deepThink`、`fileChooserAccept`)会抛错
10051005

10061006
- 返回值:
10071007

packages/core/src/agent/agent.ts

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import {
1818
type ExecutionRecorderItem,
1919
type ExecutionTask,
2020
type ExecutionTaskLog,
21+
type LocateAllOption,
2122
type LocateOption,
2223
type LocateResultElement,
2324
type OnTaskStartTip,
@@ -96,6 +97,10 @@ const defaultServiceExtractOption: ServiceExtractOption = {
9697
screenshotIncluded: true,
9798
};
9899

100+
/**
 * Shape of one entry in the array resolved by `aiLocateAll()`: the `rect` and
 * `center` fields picked from `LocateResultElement`, plus an optional `dpr`.
 */
type LocateAllResultItem = Pick<LocateResultElement, 'rect' | 'center'> & {
101+
dpr?: number; // device pixel ratio; optional, so it may be absent on an item
102+
};
103+
99104
export type AiActOptions = {
100105
cacheable?: boolean;
101106
fileChooserAccept?: string | string[];
@@ -111,6 +116,30 @@ type AiActInternalOptions = AiActOptions & {
111116
};
112117
};
113118

119+
/**
 * Option keys that `aiLocateAll` refuses to accept. They belong to the
 * single-element locate path and are rejected up front instead of being
 * silently ignored.
 */
const unsupportedLocateAllOptionKeys = [
  'deepLocate',
  'deepThink',
  'xpath',
  'cacheable',
  'fileChooserAccept',
] as const;

/**
 * Guard for `aiLocateAll` options: throws when `opt` carries any key listed in
 * `unsupportedLocateAllOptionKeys`.
 *
 * A key counts as "provided" only when it is an own property of `opt` AND its
 * value is not `undefined`. Callers that forward options from a wrapper
 * (`{ xpath: outer.xpath }`) therefore behave the same as callers that omit
 * the key, which matches how optional properties are normally treated.
 * Any other value, including falsy ones such as `cacheable: false`, is still
 * rejected.
 *
 * @param opt - Options handed to `aiLocateAll`. A missing or non-object value
 *   is accepted without further checks.
 * @throws Error (via the `assert` helper used throughout this file) naming
 *   every unsupported key that was provided, in the order of
 *   `unsupportedLocateAllOptionKeys`.
 */
function assertLocateAllOptionsSupported(opt?: LocateAllOption) {
  if (!opt || typeof opt !== 'object') {
    return;
  }

  const providedUnsupportedKeys = unsupportedLocateAllOptionKeys.filter(
    (key) =>
      Object.prototype.hasOwnProperty.call(opt, key) &&
      // These keys are deliberately not declared on LocateAllOption, so read
      // them reflectively rather than through the static type.
      Reflect.get(opt, key) !== undefined,
  );

  assert(
    providedUnsupportedKeys.length === 0,
    `aiLocateAll does not support these single-element locate options: ${providedUnsupportedKeys.join(
      ', ',
    )}. Supported options are uiContext and image prompt options.`,
  );
}
142+
114143
export class Agent<
115144
InterfaceType extends AbstractInterface = AbstractInterface,
116145
> {
@@ -1126,16 +1155,21 @@ export class Agent<
11261155
} as Pick<LocateResultElement, 'rect' | 'center'>;
11271156
}
11281157

1129-
async aiLocateAll(prompt: TUserPrompt, opt?: LocateOption) {
1158+
async aiLocateAll(
1159+
prompt: TUserPrompt,
1160+
opt?: LocateAllOption,
1161+
): Promise<LocateAllResultItem[]> {
1162+
assertLocateAllOptionsSupported(opt);
11301163
const locateParam = buildDetailedLocateParam(prompt, opt);
11311164
assert(locateParam, 'cannot get locate param for aiLocateAll');
1132-
const locatePlan = locatePlanForLocateAll(locateParam);
1165+
const locateAllParam = { prompt: locateParam.prompt };
1166+
const locatePlan = locatePlanForLocateAll(locateAllParam);
11331167
const plans = [locatePlan];
11341168
const defaultModel = this.resolveModelRuntime('default');
11351169
const planningModel = this.resolveModelRuntime('planning');
11361170

11371171
const { output } = await this.taskExecutor.runPlans(
1138-
taskTitleStr('Locate', locateParamStr(locateParam)),
1172+
taskTitleStr('LocateAll', locateParamStr(locateAllParam)),
11391173
plans,
11401174
planningModel,
11411175
defaultModel,

packages/core/src/agent/ui-utils.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@ export type TaskTitleType =
150150
| 'Assert'
151151
| 'WaitFor'
152152
| 'Locate'
153+
| 'LocateAll'
153154
| 'Markdown'
154155
| 'Boolean'
155156
| 'Number'

packages/core/src/ai-model/inspect.ts

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -515,9 +515,24 @@ export async function AiLocateAllElements(
515515
const errors: string[] = Array.isArray(res.content.errors)
516516
? [...res.content.errors]
517517
: [];
518-
const rawElements = Array.isArray(res.content.elements)
519-
? res.content.elements
520-
: [];
518+
if (!Array.isArray(res.content.elements)) {
519+
return {
520+
rawResponse,
521+
rawChoiceMessage: res.rawChoiceMessage,
522+
usage: res.usage,
523+
reasoning_content: res.reasoning_content,
524+
parseResult: {
525+
elements: [],
526+
errors: [
527+
...errors,
528+
'AI response error: locate all response must contain an elements array',
529+
],
530+
fatalError: true,
531+
},
532+
};
533+
}
534+
535+
const rawElements = res.content.elements;
521536
const elements: LocateResultElement[] = [];
522537

523538
rawElements.forEach((rawElement, index) => {
@@ -543,6 +558,11 @@ export async function AiLocateAllElements(
543558
}
544559
});
545560

561+
const fatalError = rawElements.length > 0 && elements.length === 0;
562+
if (fatalError) {
563+
errors.push('AI response error: failed to parse every locate all element');
564+
}
565+
546566
return {
547567
rawResponse,
548568
rawChoiceMessage: res.rawChoiceMessage,
@@ -551,6 +571,7 @@ export async function AiLocateAllElements(
551571
parseResult: {
552572
elements: sortAndDedupeLocateElements(elements),
553573
errors,
574+
fatalError,
554575
},
555576
};
556577
}

packages/core/src/yaml.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,11 @@ export interface LocateOption extends Partial<TMultimodalPrompt> {
1818
fileChooserAccept?: string | string[]; // file path(s) to upload when tapping triggers a file chooser
1919
}
2020

21+
/**
 * Options type used by `Agent.aiLocateAll()`.
 *
 * On top of the optional `TMultimodalPrompt` fields it declares only `prompt`
 * and `uiContext`.
 */
export interface LocateAllOption extends Partial<TMultimodalPrompt> {
22+
prompt?: TUserPrompt;
23+
uiContext?: UIContext;
24+
}
25+
2126
export interface ServiceExtractOption {
2227
domIncluded?: boolean | 'visible-only';
2328
screenshotIncluded?: boolean;

packages/core/tests/unit-test/agent-locate-all.test.ts

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -48,20 +48,15 @@ describe('Agent.aiLocateAll', () => {
4848
runner: {} as any,
4949
});
5050

51-
const result = await agent.aiLocateAll('all submit buttons', {
52-
cacheable: false,
53-
});
51+
const result = await agent.aiLocateAll('all submit buttons');
5452

5553
expect(runPlans).toHaveBeenCalledWith(
56-
'Locate - all submit buttons',
54+
'LocateAll - all submit buttons',
5755
[
5856
{
5957
type: 'LocateAll',
6058
param: {
6159
prompt: 'all submit buttons',
62-
cacheable: false,
63-
deepLocate: false,
64-
xpath: undefined,
6560
},
6661
thought: '',
6762
},
@@ -82,4 +77,31 @@ describe('Agent.aiLocateAll', () => {
8277
},
8378
]);
8479
});
80+
81+
// Verifies the option guard: each unsupported single-element option must make
// aiLocateAll() reject with the guard's error message, and no plan may be run.
it('rejects single-element locate options that are unsupported by aiLocateAll', async () => {
82+
const agent = new Agent(createMockInterface(), {
83+
generateReport: false,
84+
modelConfig,
85+
});
86+
// Spy (not mock) so we can assert afterwards that execution never started.
const runPlans = vi.spyOn(agent.taskExecutor, 'runPlans');
87+
88+
// `as any` is needed because these keys are not declared on LocateAllOption.
await expect(
89+
agent.aiLocateAll('all submit buttons', { cacheable: false } as any),
90+
).rejects.toThrow(
91+
/aiLocateAll does not support these single-element locate options: cacheable/,
92+
);
93+
await expect(
94+
agent.aiLocateAll('all submit buttons', { deepLocate: true } as any),
95+
).rejects.toThrow(
96+
/aiLocateAll does not support these single-element locate options: deepLocate/,
97+
);
98+
await expect(
99+
agent.aiLocateAll('all submit buttons', {
100+
xpath: '//*[@id="submit"]',
101+
} as any),
102+
).rejects.toThrow(
103+
/aiLocateAll does not support these single-element locate options: xpath/,
104+
);
105+
// The guard runs before any plan is built, so the executor must be untouched.
expect(runPlans).not.toHaveBeenCalled();
106+
});
85107
});

packages/core/tests/unit-test/inspect-locate-all.test.ts

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,4 +64,51 @@ describe('AiLocateAllElements', () => {
6464
expect(result.usage).toEqual({ total_tokens: 10 });
6565
expect(result.reasoning_content).toBe('all locate reasoning');
6666
});
67+
68+
// A model response whose content has no `elements` array must yield an empty
// element list, a descriptive error, and fatalError: true.
it('marks malformed top-level locate-all responses as fatal', async () => {
69+
vi.mocked(callAIWithObjectResponse).mockResolvedValue({
70+
// Content carries a top-level bbox and no `elements` array.
content: {
71+
bbox: [100, 100, 120, 120],
72+
},
73+
rawChoiceMessage: { role: 'assistant' },
74+
contentString: '{}',
75+
});
76+
77+
const result = await AiLocateAllElements({
78+
context: createFakeContext(),
79+
targetElementDescription: 'submit buttons',
80+
modelRuntime: getModelRuntime(modelConfig),
81+
});
82+
83+
expect(result.parseResult).toEqual({
84+
elements: [],
85+
errors: [
86+
'AI response error: locate all response must contain an elements array',
87+
],
88+
fatalError: true,
89+
});
90+
});
91+
92+
// When `elements` is a non-empty array but none of its entries can be parsed,
// the result must be flagged fatal and carry both the per-element parse error
// and the summary error.
it('marks responses as fatal when every returned candidate fails to parse', async () => {
93+
vi.mocked(callAIWithObjectResponse).mockResolvedValue({
94+
content: {
95+
// The only candidate has a NaN coordinate in its bbox.
elements: [{ bbox: [10, 20, Number.NaN, 40] }],
96+
},
97+
rawChoiceMessage: { role: 'assistant' },
98+
contentString: '{}',
99+
});
100+
101+
const result = await AiLocateAllElements({
102+
context: createFakeContext(),
103+
targetElementDescription: 'submit buttons',
104+
modelRuntime: getModelRuntime(modelConfig),
105+
});
106+
107+
expect(result.parseResult.elements).toEqual([]);
108+
expect(result.parseResult.errors).toEqual([
109+
expect.stringMatching(/Failed to parse locate result #1/),
110+
'AI response error: failed to parse every locate all element',
111+
]);
112+
expect(result.parseResult.fatalError).toBe(true);
113+
});
67114
});

packages/core/tests/unit-test/player-action-dispatch.test.ts

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -413,7 +413,6 @@ describe('player action dispatch ordering', () => {
413413
flow: [
414414
{
415415
aiLocateAll: 'all delete buttons',
416-
cacheable: false,
417416
name: 'buttons',
418417
},
419418
],
@@ -424,9 +423,7 @@ describe('player action dispatch ordering', () => {
424423

425424
await player.playTask(taskStatus, agent);
426425

427-
expect(agent.aiLocateAll).toHaveBeenCalledWith('all delete buttons', {
428-
cacheable: false,
429-
});
426+
expect(agent.aiLocateAll).toHaveBeenCalledWith('all delete buttons', {});
430427
expect(player.result.buttons).toBe(locateAllResult);
431428
});
432429

0 commit comments

Comments
 (0)