Skip to content

Commit e5ed0a7

Browse files
committed
feat(core): support aiAct prompt file uploads
1 parent 4178291 commit e5ed0a7

9 files changed

Lines changed: 299 additions & 64 deletions

File tree

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
import type { AbstractInterface, FileChooserHandler } from '@/device';
2+
import { normalizeFilePaths } from './utils';
3+
4+
export type FileChooserAccept = string | string[];
5+
6+
/**
 * Coerces a file-chooser argument into a list of file paths.
 *
 * A single string is wrapped into a one-element array; an array is used as is.
 * The list is then passed to `normalizeFilePaths` and that function's result
 * is returned unchanged.
 *
 * @param files - One file path, or a list of file paths.
 * @returns Whatever `normalizeFilePaths` produces for the given path(s).
 */
export function normalizeFileChooserAccept(files: FileChooserAccept): string[] {
  const filesArray = Array.isArray(files) ? files : [files];
  return normalizeFilePaths(filesArray);
}
10+
11+
/**
 * Runs `action` while a file-chooser listener is registered on the interface.
 *
 * When `fileChooserAccept` is `undefined` or empty, `action` is invoked
 * directly and the interface is not touched. Otherwise a listener is
 * registered that answers every file chooser with `fileChooserAccept`; after
 * `action` resolves, the error reported by the listener's `getError()` (if
 * any) is thrown. The listener is disposed in all cases once registered.
 *
 * @param interfaceInstance - Interface that may provide `registerFileChooserListener`.
 * @param fileChooserAccept - File paths to hand to the file chooser.
 * @param action - The operation expected to trigger the file chooser.
 * @returns The value resolved by `action`.
 * @throws Error if files are given but the interface has no
 *   `registerFileChooserListener`; also rethrows whatever `action` throws or
 *   `getError()` reports.
 */
export async function withFileChooser<T>(
  interfaceInstance: AbstractInterface,
  fileChooserAccept: string[] | undefined,
  action: () => Promise<T>,
): Promise<T> {
  // Nothing to upload: run the action without registering any listener.
  if (!fileChooserAccept?.length) {
    return action();
  }

  if (!interfaceInstance.registerFileChooserListener) {
    throw new Error(
      `File upload is not supported on ${interfaceInstance.interfaceType}`,
    );
  }

  // Bind the narrowed list to a const so the closure below captures a
  // `string[]` regardless of how the compiler narrows parameters in closures.
  const files = fileChooserAccept;
  const handler = async (chooser: FileChooserHandler) => {
    await chooser.accept(files);
  };

  const { dispose, getError } =
    await interfaceInstance.registerFileChooserListener(handler);
  try {
    const result = await action();
    // Surface errors that occurred while the listener handled the chooser.
    const error = await getError();
    if (error) {
      throw error;
    }
    return result;
  } finally {
    // Always unregister, whether the action succeeded or threw.
    dispose();
  }
}

packages/core/src/agent/task-builder.ts

Lines changed: 54 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import { sleep } from '@/utils';
2525
import { generateElementByRect } from '@midscene/shared/extractor';
2626
import { getDebug } from '@midscene/shared/logger';
2727
import { assert } from '@midscene/shared/utils';
28+
import { normalizeFileChooserAccept, withFileChooser } from './file-chooser';
2829
import type { TaskCache } from './task-cache';
2930
import { withUsageIntent } from './usage-intent';
3031
import {
@@ -88,6 +89,21 @@ function normalizeLocateParam(
8889
return deepLocate === undefined ? rest : { ...rest, deepLocate };
8990
}
9091

92+
/**
 * Extracts the files to upload from the parameters of a `Tap` action.
 *
 * Returns `undefined` when the action is not named `Tap`, when `param` is not
 * a non-null object, or when its `fileChooserAccept` property is falsy
 * (missing, `undefined`, or an empty string). Otherwise the property is passed
 * through `normalizeFileChooserAccept` and the resulting list is returned.
 *
 * @param actionName - Name of the action about to be executed.
 * @param param - Raw action parameters; only `fileChooserAccept` is read.
 * @returns The normalized file list, or `undefined` if there is nothing to upload.
 */
function getTapFileChooserAccept(
  actionName: string,
  param: unknown,
): string[] | undefined {
  if (actionName !== 'Tap' || !param || typeof param !== 'object') {
    return undefined;
  }

  const fileChooserAccept = (param as { fileChooserAccept?: string | string[] })
    .fileChooserAccept;
  return fileChooserAccept
    ? normalizeFileChooserAccept(fileChooserAccept)
    : undefined;
}
106+
91107
export function locatePlanForLocate(param: string | DetailedLocateParam) {
92108
const locate = normalizeLocateParam(param);
93109
const locatePlan: PlanningAction<PlanningLocateParam> = {
@@ -342,38 +358,47 @@ export class TaskBuilder {
342358

343359
debug('calling action', action.name);
344360
const actionFn = action.call.bind(this.interface);
345-
const actionResult = await actionFn(param, taskContext);
346-
setTimingFieldOnce(timing, 'callActionEnd');
347-
debug('called action', action.name, 'result:', actionResult);
348-
349-
setTimingFieldOnce(timing, 'afterInvokeActionHookStart');
361+
const fileChooserAccept = getTapFileChooserAccept(action.name, param);
362+
const actionResult = await withFileChooser(
363+
this.interface,
364+
fileChooserAccept,
365+
async () => {
366+
const result = await actionFn(param, taskContext);
367+
setTimingFieldOnce(timing, 'callActionEnd');
368+
debug('called action', action.name, 'result:', result);
369+
370+
setTimingFieldOnce(timing, 'afterInvokeActionHookStart');
371+
372+
const delayAfterRunner =
373+
action.delayAfterRunner ?? this.waitAfterAction ?? 300;
374+
if (delayAfterRunner > 0) {
375+
await sleep(delayAfterRunner);
376+
}
350377

351-
const delayAfterRunner =
352-
action.delayAfterRunner ?? this.waitAfterAction ?? 300;
353-
if (delayAfterRunner > 0) {
354-
await sleep(delayAfterRunner);
355-
}
378+
try {
379+
if (this.interface.afterInvokeAction) {
380+
debug(
381+
`will call "afterInvokeAction" for interface with action name ${action.name}`,
382+
);
383+
await this.interface.afterInvokeAction(action.name, param);
384+
debug(
385+
`called "afterInvokeAction" for interface with action name ${action.name}`,
386+
);
387+
}
388+
} catch (originalError: any) {
389+
const originalMessage =
390+
originalError?.message || String(originalError);
391+
throw new Error(
392+
`error in running afterInvokeAction for ${action.name}: ${originalMessage}`,
393+
{ cause: originalError },
394+
);
395+
}
356396

357-
try {
358-
if (this.interface.afterInvokeAction) {
359-
debug(
360-
`will call "afterInvokeAction" for interface with action name ${action.name}`,
361-
);
362-
await this.interface.afterInvokeAction(action.name, param);
363-
debug(
364-
`called "afterInvokeAction" for interface with action name ${action.name}`,
365-
);
366-
}
367-
} catch (originalError: any) {
368-
const originalMessage =
369-
originalError?.message || String(originalError);
370-
throw new Error(
371-
`error in running afterInvokeAction for ${action.name}: ${originalMessage}`,
372-
{ cause: originalError },
373-
);
374-
}
397+
setTimingFieldOnce(timing, 'afterInvokeActionHookEnd');
375398

376-
setTimingFieldOnce(timing, 'afterInvokeActionHookEnd');
399+
return result;
400+
},
401+
);
377402

378403
return {
379404
output: actionResult,

packages/core/src/agent/tasks.ts

Lines changed: 3 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import {
1010
userPromptToMultimodalPrompt,
1111
userPromptToString,
1212
} from '@/common';
13-
import type { AbstractInterface, FileChooserHandler } from '@/device';
13+
import type { AbstractInterface } from '@/device';
1414
import type Service from '@/service';
1515
import type { TaskRunner } from '@/task-runner';
1616
import { TaskExecutionError } from '@/task-runner';
@@ -35,6 +35,7 @@ import { ServiceError } from '@/types';
3535
import { getDebug } from '@midscene/shared/logger';
3636
import { assert } from '@midscene/shared/utils';
3737
import { ExecutionSession } from './execution-session';
38+
import { withFileChooser } from './file-chooser';
3839
import { TaskBuilder } from './task-builder';
3940
import type { TaskCache } from './task-cache';
4041
export { locatePlanForLocate } from './task-builder';
@@ -82,6 +83,7 @@ function truncatePlanningFeedback(feedback: string): string {
8283
}
8384

8485
export { TaskExecutionError };
86+
export { withFileChooser } from './file-chooser';
8587

8688
export class TaskExecutor {
8789
interface: AbstractInterface;
@@ -949,37 +951,3 @@ export class TaskExecutor {
949951
return session.appendErrorPlan(`waitFor timeout: ${errorThought}`);
950952
}
951953
}
952-
953-
export async function withFileChooser<T>(
954-
interfaceInstance: AbstractInterface,
955-
fileChooserAccept: string[] | undefined,
956-
action: () => Promise<T>,
957-
): Promise<T> {
958-
if (!fileChooserAccept?.length) {
959-
return action();
960-
}
961-
962-
if (!interfaceInstance.registerFileChooserListener) {
963-
throw new Error(
964-
`File upload is not supported on ${interfaceInstance.interfaceType}`,
965-
);
966-
}
967-
968-
const handler = async (chooser: FileChooserHandler) => {
969-
await chooser.accept(fileChooserAccept);
970-
};
971-
972-
const { dispose, getError } =
973-
await interfaceInstance.registerFileChooserListener(handler);
974-
try {
975-
const result = await action();
976-
// Check for errors that occurred during file chooser handling
977-
const error = await getError();
978-
if (error) {
979-
throw error;
980-
}
981-
return result;
982-
} finally {
983-
dispose();
984-
}
985-
}

packages/core/src/device/index.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,9 +285,16 @@ function defineLocatedPointAction<
285285
// Tap
286286
/**
 * Schema for the parameters of the Tap action.
 *
 * - `locate`: built from `getMidsceneLocationSchema()`; the element to tap.
 * - `fileChooserAccept`: optional; either a single string or an array of
 *   strings naming the file path(s) to upload when the tap opens a file
 *   chooser.
 */
export const actionTapParamSchema = z.object({
  locate: getMidsceneLocationSchema().describe('The element to be tapped'),
  fileChooserAccept: z
    .union([z.string(), z.array(z.string())])
    .optional()
    .describe(
      'Optional file path(s) to upload when this tap triggers a file chooser. Use only for file upload controls. If the user asks to upload a concrete file path, copy the exact path here.',
    ),
});
289295
/**
 * Parameter object type for the Tap action.
 */
export type ActionTapParam = {
  /** The element to tap. */
  locate: LocateResultElement;
  /** Optional file path, or list of file paths, for a file chooser opened by the tap. */
  fileChooserAccept?: string | string[];
};
292299

293300
export const defineActionTap = (
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
import { join, resolve } from 'node:path';
2+
import { TaskExecutor } from '@/agent';
3+
import { buildYamlFlowFromPlans } from '@/common';
4+
import { actionTapParamSchema } from '@/device';
5+
import type { DeviceAction, PlanningAction } from '@/types';
6+
import { describe, expect, it, vi } from 'vitest';
7+
8+
// File used as the upload target in the execution test below
// (presumably this test file itself — confirm the file name matches).
const fixtureFile = join(__dirname, 'ai-act-file-upload-tap.test.ts');
// Minimal shape of the file-chooser callback handed to the mock interface.
type TestFileChooserHandler = (chooser: {
  accept(files: string[]): Promise<void>;
}) => Promise<void>;

describe('aiAct Tap fileChooserAccept', () => {
  it('should keep fileChooserAccept in generated yaml flow', () => {
    const plans: PlanningAction[] = [
      {
        type: 'Tap',
        thought: 'upload id card',
        param: {
          locate: { prompt: 'the id card upload button' },
          fileChooserAccept: './fixtures/id-card.png',
        },
      },
    ];
    const actionSpace = [
      {
        name: 'Tap',
        description: 'Tap the element',
        interfaceAlias: 'aiTap',
        paramSchema: actionTapParamSchema,
        call: vi.fn(),
      },
    ] as unknown as DeviceAction[];

    // The relative path must survive into the yaml flow unchanged.
    expect(buildYamlFlowFromPlans(plans, actionSpace)).toEqual([
      {
        aiTap: '',
        locate: 'the id card upload button',
        fileChooserAccept: './fixtures/id-card.png',
      },
    ]);
  });

  it('should accept files when executing a planned Tap action', async () => {
    // Holds the listener registered by the code under test; cleared on dispose.
    let fileChooserHandler: TestFileChooserHandler | undefined;
    const dispose = vi.fn();
    const acceptedFiles: string[][] = [];
    // Simulates a file chooser opening: invokes the registered listener, if any,
    // with a chooser that records the accepted files.
    const triggerFileChooser = async () => {
      await fileChooserHandler?.({
        accept: async (files: string[]) => {
          acceptedFiles.push(files);
        },
      });
    };
    const actionCall = vi.fn(async () => {});

    const mockInterface = {
      interfaceType: 'playwright',
      registerFileChooserListener: vi.fn(
        async (handler: TestFileChooserHandler) => {
          fileChooserHandler = handler;
          return {
            dispose: () => {
              fileChooserHandler = undefined;
              dispose();
            },
            getError: () => undefined,
          };
        },
      ),
      // The chooser fires from this hook, so no file is recorded if the
      // listener has already been disposed by the time the hook runs.
      afterInvokeAction: vi.fn(async () => {
        await triggerFileChooser();
      }),
      actionSpace: () => [
        {
          name: 'Tap',
          description: 'Tap the element',
          interfaceAlias: 'aiTap',
          paramSchema: actionTapParamSchema,
          delayBeforeRunner: 0,
          delayAfterRunner: 0,
          call: actionCall,
        },
      ],
    } as any;

    const taskExecutor = new TaskExecutor(mockInterface, {} as any, {
      actionSpace: mockInterface.actionSpace(),
    });

    const plans: PlanningAction[] = [
      {
        type: 'Tap',
        thought: 'upload id card',
        param: {
          locate: { prompt: 'the id card upload button' },
          fileChooserAccept: fixtureFile,
        },
      },
    ];

    const { tasks } = await (taskExecutor as any).convertPlanToExecutable(
      plans,
    );
    // The Tap task is the last one produced for the plan.
    const tapTask = tasks[tasks.length - 1];
    // Substitute an already-located element for the locate prompt
    // (presumably so the executor needs no real locate result — confirm).
    tapTask.param.locate = {
      id: 'upload',
      center: [100, 200],
      rect: { left: 90, top: 190, width: 20, height: 20 },
    };

    await tapTask.executor(tapTask.param, {
      task: { timing: {} },
      uiContext: {
        shrunkShotToLogicalRatio: 1,
      },
    });

    expect(mockInterface.registerFileChooserListener).toHaveBeenCalledTimes(1);
    expect(actionCall).toHaveBeenCalledTimes(1);
    expect(mockInterface.afterInvokeAction).toHaveBeenCalledTimes(1);
    // The accepted path is expected in resolved (absolute) form.
    expect(acceptedFiles).toEqual([[resolve(fixtureFile)]]);
    expect(dispose).toHaveBeenCalledTimes(1);
  });
});

0 commit comments

Comments
 (0)