Skip to content

Commit 36a6ce2

Browse files
committed
feat(core): support aiAct prompt file uploads
1 parent 39ea1c7 commit 36a6ce2

10 files changed

Lines changed: 332 additions & 79 deletions

File tree

packages/core/src/agent/agent.ts

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,6 @@ import {
4949
parseYamlScript,
5050
} from '../yaml/index';
5151

52-
import { existsSync } from 'node:fs';
53-
import { resolve } from 'node:path';
5452
import type { AbstractInterface } from '@/device';
5553
import type { TaskRunner } from '@/task-runner';
5654
import {
@@ -64,6 +62,7 @@ import { getDebug } from '@midscene/shared/logger';
6462
import { assert, ifInBrowser, uuid } from '@midscene/shared/utils';
6563
import { defineActionSleep } from '../device';
6664
import { validateAgentCacheInput } from './cache-config';
65+
import { normalizeFileChooserAccept } from './file-chooser';
6766
import { TaskCache } from './task-cache';
6867
import {
6968
TaskExecutionError,
@@ -1527,19 +1526,7 @@ export class Agent<
15271526
}
15281527

15291528
private normalizeFilePaths(files: string[]): string[] {
1530-
if (ifInBrowser) {
1531-
throw new Error('File chooser is not supported in browser environment');
1532-
}
1533-
1534-
return files.map((file) => {
1535-
const absolutePath = resolve(file);
1536-
if (!existsSync(absolutePath)) {
1537-
throw new Error(
1538-
`File not found: ${file}. Resolved to: ${absolutePath}. Current working directory: ${process.cwd()}`,
1539-
);
1540-
}
1541-
return absolutePath;
1542-
});
1529+
return normalizeFileChooserAccept(files);
15431530
}
15441531

15451532
private normalizeFileInput(files: string | string[]): string[] {
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import { existsSync } from 'node:fs';
2+
import { resolve } from 'node:path';
3+
import type { AbstractInterface, FileChooserHandler } from '@/device';
4+
import { ifInBrowser } from '@midscene/shared/utils';
5+
6+
export type FileChooserAccept = string | string[];
7+
8+
export function normalizeFileChooserAccept(
9+
files: FileChooserAccept,
10+
): string[] {
11+
const filesArray = Array.isArray(files) ? files : [files];
12+
13+
if (ifInBrowser) {
14+
throw new Error('File chooser is not supported in browser environment');
15+
}
16+
17+
return filesArray.map((file) => {
18+
const absolutePath = resolve(file);
19+
if (!existsSync(absolutePath)) {
20+
throw new Error(
21+
`File not found: ${file}. Resolved to: ${absolutePath}. Current working directory: ${process.cwd()}`,
22+
);
23+
}
24+
return absolutePath;
25+
});
26+
}
27+
28+
export async function withFileChooser<T>(
29+
interfaceInstance: AbstractInterface,
30+
fileChooserAccept: string[] | undefined,
31+
action: () => Promise<T>,
32+
): Promise<T> {
33+
if (!fileChooserAccept?.length) {
34+
return action();
35+
}
36+
37+
if (!interfaceInstance.registerFileChooserListener) {
38+
throw new Error(
39+
`File upload is not supported on ${interfaceInstance.interfaceType}`,
40+
);
41+
}
42+
43+
const handler = async (chooser: FileChooserHandler) => {
44+
await chooser.accept(fileChooserAccept);
45+
};
46+
47+
const { dispose, getError } =
48+
await interfaceInstance.registerFileChooserListener(handler);
49+
try {
50+
const result = await action();
51+
const error = getError();
52+
if (error) {
53+
throw error;
54+
}
55+
return result;
56+
} finally {
57+
dispose();
58+
}
59+
}

packages/core/src/agent/task-builder.ts

Lines changed: 55 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import {
3333
transformLogicalElementToScreenshot,
3434
transformLogicalRectToScreenshotRect,
3535
} from './utils';
36+
import { normalizeFileChooserAccept, withFileChooser } from './file-chooser';
3637

3738
const debug = getDebug('agent:task-builder');
3839

@@ -48,6 +49,22 @@ function hasNonEmptyCache(cache: unknown): boolean {
4849
);
4950
}
5051

52+
function getTapFileChooserAccept(
53+
actionName: string,
54+
param: unknown,
55+
): string[] | undefined {
56+
if (actionName !== 'Tap' || !param || typeof param !== 'object') {
57+
return undefined;
58+
}
59+
60+
const fileChooserAccept = (
61+
param as { fileChooserAccept?: string | string[] }
62+
).fileChooserAccept;
63+
return fileChooserAccept
64+
? normalizeFileChooserAccept(fileChooserAccept)
65+
: undefined;
66+
}
67+
5168
export function locatePlanForLocate(param: string | DetailedLocateParam) {
5269
const locate = typeof param === 'string' ? { prompt: param } : param;
5370
const locatePlan: PlanningAction<PlanningLocateParam> = {
@@ -307,38 +324,47 @@ export class TaskBuilder {
307324

308325
debug('calling action', action.name);
309326
const actionFn = action.call.bind(this.interface);
310-
const actionResult = await actionFn(param, taskContext);
311-
setTimingFieldOnce(timing, 'callActionEnd');
312-
debug('called action', action.name, 'result:', actionResult);
313-
314-
setTimingFieldOnce(timing, 'afterInvokeActionHookStart');
327+
const fileChooserAccept = getTapFileChooserAccept(action.name, param);
328+
const actionResult = await withFileChooser(
329+
this.interface,
330+
fileChooserAccept,
331+
async () => {
332+
const result = await actionFn(param, taskContext);
333+
setTimingFieldOnce(timing, 'callActionEnd');
334+
debug('called action', action.name, 'result:', result);
335+
336+
setTimingFieldOnce(timing, 'afterInvokeActionHookStart');
337+
338+
const delayAfterRunner =
339+
action.delayAfterRunner ?? this.waitAfterAction ?? 300;
340+
if (delayAfterRunner > 0) {
341+
await sleep(delayAfterRunner);
342+
}
315343

316-
const delayAfterRunner =
317-
action.delayAfterRunner ?? this.waitAfterAction ?? 300;
318-
if (delayAfterRunner > 0) {
319-
await sleep(delayAfterRunner);
320-
}
344+
try {
345+
if (this.interface.afterInvokeAction) {
346+
debug(
347+
`will call "afterInvokeAction" for interface with action name ${action.name}`,
348+
);
349+
await this.interface.afterInvokeAction(action.name, param);
350+
debug(
351+
`called "afterInvokeAction" for interface with action name ${action.name}`,
352+
);
353+
}
354+
} catch (originalError: any) {
355+
const originalMessage =
356+
originalError?.message || String(originalError);
357+
throw new Error(
358+
`error in running afterInvokeAction for ${action.name}: ${originalMessage}`,
359+
{ cause: originalError },
360+
);
361+
}
321362

322-
try {
323-
if (this.interface.afterInvokeAction) {
324-
debug(
325-
`will call "afterInvokeAction" for interface with action name ${action.name}`,
326-
);
327-
await this.interface.afterInvokeAction(action.name, param);
328-
debug(
329-
`called "afterInvokeAction" for interface with action name ${action.name}`,
330-
);
331-
}
332-
} catch (originalError: any) {
333-
const originalMessage =
334-
originalError?.message || String(originalError);
335-
throw new Error(
336-
`error in running afterInvokeAction for ${action.name}: ${originalMessage}`,
337-
{ cause: originalError },
338-
);
339-
}
363+
setTimingFieldOnce(timing, 'afterInvokeActionHookEnd');
340364

341-
setTimingFieldOnce(timing, 'afterInvokeActionHookEnd');
365+
return result;
366+
},
367+
);
342368

343369
return {
344370
output: actionResult,

packages/core/src/agent/tasks.ts

Lines changed: 3 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ import {
1212
type TUserPrompt,
1313
getReadableTimeString,
1414
} from '@/common';
15-
import type { AbstractInterface, FileChooserHandler } from '@/device';
15+
import type { AbstractInterface } from '@/device';
1616
import type Service from '@/service';
1717
import type { TaskRunner } from '@/task-runner';
1818
import { TaskExecutionError } from '@/task-runner';
@@ -36,6 +36,7 @@ import type { IModelConfig } from '@midscene/shared/env';
3636
import { getDebug } from '@midscene/shared/logger';
3737
import { assert } from '@midscene/shared/utils';
3838
import { ExecutionSession } from './execution-session';
39+
import { withFileChooser } from './file-chooser';
3940
import { TaskBuilder } from './task-builder';
4041
import type { TaskCache } from './task-cache';
4142
export { locatePlanForLocate } from './task-builder';
@@ -63,6 +64,7 @@ const warnLog = getDebug('device-task-executor', { console: true });
6364
const maxErrorCountAllowedInOnePlanningLoop = 5;
6465

6566
export { TaskExecutionError };
67+
export { withFileChooser } from './file-chooser';
6668

6769
export class TaskExecutor {
6870
interface: AbstractInterface;
@@ -809,37 +811,3 @@ export class TaskExecutor {
809811
return session.appendErrorPlan(`waitFor timeout: ${errorThought}`);
810812
}
811813
}
812-
813-
export async function withFileChooser<T>(
814-
interfaceInstance: AbstractInterface,
815-
fileChooserAccept: string[] | undefined,
816-
action: () => Promise<T>,
817-
): Promise<T> {
818-
if (!fileChooserAccept?.length) {
819-
return action();
820-
}
821-
822-
if (!interfaceInstance.registerFileChooserListener) {
823-
throw new Error(
824-
`File upload is not supported on ${interfaceInstance.interfaceType}`,
825-
);
826-
}
827-
828-
const handler = async (chooser: FileChooserHandler) => {
829-
await chooser.accept(fileChooserAccept);
830-
};
831-
832-
const { dispose, getError } =
833-
await interfaceInstance.registerFileChooserListener(handler);
834-
try {
835-
const result = await action();
836-
// Check for errors that occurred during file chooser handling
837-
const error = getError();
838-
if (error) {
839-
throw error;
840-
}
841-
return result;
842-
} finally {
843-
dispose();
844-
}
845-
}

packages/core/src/device/index.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,9 +268,16 @@ function defineLocatedPointAction<
268268
// Tap
269269
export const actionTapParamSchema = z.object({
270270
locate: getMidsceneLocationSchema().describe('The element to be tapped'),
271+
fileChooserAccept: z
272+
.union([z.string(), z.array(z.string())])
273+
.optional()
274+
.describe(
275+
'Optional file path(s) to upload when this tap triggers a file chooser. Use only for file upload controls. If the user asks to upload a concrete file path, copy the exact path here.',
276+
),
271277
});
272278
export type ActionTapParam = {
273279
locate: LocateResultElement;
280+
fileChooserAccept?: string | string[];
274281
};
275282

276283
export const defineActionTap = (

0 commit comments

Comments
 (0)