Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions packages/core/src/agent/file-chooser.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import type { AbstractInterface, FileChooserHandler } from '@/device';
import { normalizeFilePaths } from './utils';

/** A single file path, or a list of file paths, destined for a file chooser. */
export type FileChooserAccept = string | string[];

/**
 * Coerces a file-chooser argument into list form and passes that list through
 * `normalizeFilePaths`.
 *
 * @param files - One path, or an array of paths.
 * @returns The result of `normalizeFilePaths` for the list form of `files`.
 */
export function normalizeFileChooserAccept(files: FileChooserAccept): string[] {
  if (Array.isArray(files)) {
    return normalizeFilePaths(files);
  }
  // A lone path is wrapped so the helper always receives an array.
  return normalizeFilePaths([files]);
}

/**
 * Runs `action` while a file-chooser listener is registered on the interface,
 * so that any chooser handed to the listener during the action is answered
 * with `fileChooserAccept`.
 *
 * When `fileChooserAccept` is missing or empty, `action` is invoked directly
 * and no listener is registered.
 *
 * @param interfaceInstance - Interface that may expose `registerFileChooserListener`.
 * @param fileChooserAccept - File paths to pass to `chooser.accept`.
 * @param action - The work to run while the listener is active.
 * @returns The value `action` resolves to.
 * @throws Error when files were requested but the interface has no
 *   `registerFileChooserListener`; any truthy value returned by the
 *   registration's `getError()`; or whatever `action` itself rejects with.
 */
export async function withFileChooser<T>(
  interfaceInstance: AbstractInterface,
  fileChooserAccept: string[] | undefined,
  action: () => Promise<T>,
): Promise<T> {
  const pathsToAccept = fileChooserAccept;
  if (!(pathsToAccept && pathsToAccept.length)) {
    // Nothing to upload: skip the listener machinery entirely.
    return action();
  }

  if (!interfaceInstance.registerFileChooserListener) {
    throw new Error(
      `File upload is not supported on ${interfaceInstance.interfaceType}`,
    );
  }

  const acceptRequestedFiles = async (chooser: FileChooserHandler) => {
    await chooser.accept(pathsToAccept);
  };

  const { dispose: releaseListener, getError: readListenerError } =
    await interfaceInstance.registerFileChooserListener(acceptRequestedFiles);
  try {
    const outcome = await action();
    // The listener error is only consulted once the action has completed.
    const listenerError = await readListenerError();
    if (!listenerError) {
      return outcome;
    }
    throw listenerError;
  } finally {
    // Always unregister, whether the action succeeded, failed, or the listener reported an error.
    releaseListener();
  }
}
83 changes: 54 additions & 29 deletions packages/core/src/agent/task-builder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import { sleep } from '@/utils';
import { generateElementByRect } from '@midscene/shared/extractor';
import { getDebug } from '@midscene/shared/logger';
import { assert } from '@midscene/shared/utils';
import { normalizeFileChooserAccept, withFileChooser } from './file-chooser';
import type { TaskCache } from './task-cache';
import { withUsageIntent } from './usage-intent';
import {
Expand Down Expand Up @@ -88,6 +89,21 @@ function normalizeLocateParam(
return deepLocate === undefined ? rest : { ...rest, deepLocate };
}

/**
 * Extracts the normalized `fileChooserAccept` list from a Tap action's param.
 *
 * @param actionName - Name of the action being executed; only `'Tap'` is considered.
 * @param param - The action's param, of unknown shape.
 * @returns The result of `normalizeFileChooserAccept` when the action is a Tap
 *   whose object param carries a truthy `fileChooserAccept`; otherwise `undefined`.
 */
function getTapFileChooserAccept(
  actionName: string,
  param: unknown,
): string[] | undefined {
  const isTapWithObjectParam =
    actionName === 'Tap' && Boolean(param) && typeof param === 'object';
  if (!isTapWithObjectParam) {
    return undefined;
  }

  const { fileChooserAccept: requestedFiles } = param as {
    fileChooserAccept?: string | string[];
  };
  // Falsy values (absent, empty string) mean "no upload requested".
  if (!requestedFiles) {
    return undefined;
  }
  return normalizeFileChooserAccept(requestedFiles);
}

export function locatePlanForLocate(param: string | DetailedLocateParam) {
const locate = normalizeLocateParam(param);
const locatePlan: PlanningAction<PlanningLocateParam> = {
Expand Down Expand Up @@ -342,38 +358,47 @@ export class TaskBuilder {

debug('calling action', action.name);
const actionFn = action.call.bind(this.interface);
const actionResult = await actionFn(param, taskContext);
setTimingFieldOnce(timing, 'callActionEnd');
debug('called action', action.name, 'result:', actionResult);

setTimingFieldOnce(timing, 'afterInvokeActionHookStart');
const fileChooserAccept = getTapFileChooserAccept(action.name, param);
const actionResult = await withFileChooser(
this.interface,
fileChooserAccept,
async () => {
const result = await actionFn(param, taskContext);
setTimingFieldOnce(timing, 'callActionEnd');
debug('called action', action.name, 'result:', result);

setTimingFieldOnce(timing, 'afterInvokeActionHookStart');

const delayAfterRunner =
action.delayAfterRunner ?? this.waitAfterAction ?? 300;
if (delayAfterRunner > 0) {
await sleep(delayAfterRunner);
}

const delayAfterRunner =
action.delayAfterRunner ?? this.waitAfterAction ?? 300;
if (delayAfterRunner > 0) {
await sleep(delayAfterRunner);
}
try {
if (this.interface.afterInvokeAction) {
debug(
`will call "afterInvokeAction" for interface with action name ${action.name}`,
);
await this.interface.afterInvokeAction(action.name, param);
debug(
`called "afterInvokeAction" for interface with action name ${action.name}`,
);
}
} catch (originalError: any) {
const originalMessage =
originalError?.message || String(originalError);
throw new Error(
`error in running afterInvokeAction for ${action.name}: ${originalMessage}`,
{ cause: originalError },
);
}

try {
if (this.interface.afterInvokeAction) {
debug(
`will call "afterInvokeAction" for interface with action name ${action.name}`,
);
await this.interface.afterInvokeAction(action.name, param);
debug(
`called "afterInvokeAction" for interface with action name ${action.name}`,
);
}
} catch (originalError: any) {
const originalMessage =
originalError?.message || String(originalError);
throw new Error(
`error in running afterInvokeAction for ${action.name}: ${originalMessage}`,
{ cause: originalError },
);
}
setTimingFieldOnce(timing, 'afterInvokeActionHookEnd');

setTimingFieldOnce(timing, 'afterInvokeActionHookEnd');
return result;
},
);

return {
output: actionResult,
Expand Down
38 changes: 3 additions & 35 deletions packages/core/src/agent/tasks.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import {
userPromptToMultimodalPrompt,
userPromptToString,
} from '@/common';
import type { AbstractInterface, FileChooserHandler } from '@/device';
import type { AbstractInterface } from '@/device';
import type Service from '@/service';
import type { TaskRunner } from '@/task-runner';
import { TaskExecutionError } from '@/task-runner';
Expand All @@ -35,6 +35,7 @@ import { ServiceError } from '@/types';
import { getDebug } from '@midscene/shared/logger';
import { assert } from '@midscene/shared/utils';
import { ExecutionSession } from './execution-session';
import { withFileChooser } from './file-chooser';
import { TaskBuilder } from './task-builder';
import type { TaskCache } from './task-cache';
export { locatePlanForLocate } from './task-builder';
Expand Down Expand Up @@ -82,6 +83,7 @@ function truncatePlanningFeedback(feedback: string): string {
}

export { TaskExecutionError };
export { withFileChooser } from './file-chooser';

export class TaskExecutor {
interface: AbstractInterface;
Expand Down Expand Up @@ -949,37 +951,3 @@ export class TaskExecutor {
return session.appendErrorPlan(`waitFor timeout: ${errorThought}`);
}
}

/**
 * Executes `action` with a file-chooser listener registered on the interface;
 * the listener answers each chooser it receives by accepting `fileChooserAccept`.
 *
 * If `fileChooserAccept` is missing or empty, `action` runs directly and no
 * listener is registered.
 *
 * @param interfaceInstance - Interface that may expose `registerFileChooserListener`.
 * @param fileChooserAccept - File paths to pass to `chooser.accept`.
 * @param action - The work to run while the listener is active.
 * @returns The value `action` resolves to.
 * @throws Error when files were requested but the interface has no
 *   `registerFileChooserListener`; any truthy value returned by the
 *   registration's `getError()`; or whatever `action` itself rejects with.
 */
export async function withFileChooser<T>(
  interfaceInstance: AbstractInterface,
  fileChooserAccept: string[] | undefined,
  action: () => Promise<T>,
): Promise<T> {
  const uploadPaths = fileChooserAccept;
  if (!(uploadPaths && uploadPaths.length)) {
    // No upload requested, so the listener API is never touched.
    return action();
  }

  if (!interfaceInstance.registerFileChooserListener) {
    throw new Error(
      `File upload is not supported on ${interfaceInstance.interfaceType}`,
    );
  }

  const respondToChooser = async (chooser: FileChooserHandler) => {
    await chooser.accept(uploadPaths);
  };

  const { dispose: unregister, getError: fetchChooserError } =
    await interfaceInstance.registerFileChooserListener(respondToChooser);
  try {
    const actionOutput = await action();
    // Surface any failure recorded while the chooser was being handled.
    const chooserError = await fetchChooserError();
    if (!chooserError) {
      return actionOutput;
    }
    throw chooserError;
  } finally {
    // Runs on success and on every failure path.
    unregister();
  }
}
7 changes: 7 additions & 0 deletions packages/core/src/device/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -285,9 +285,16 @@ function defineLocatedPointAction<
// Tap
/**
 * Schema for the parameters of the Tap action.
 *
 * - `locate`: the element to tap, validated by `getMidsceneLocationSchema()`.
 * - `fileChooserAccept`: optional; either a single string or an array of
 *   strings. Its description text tells the consumer of this schema to fill
 *   it only for file upload controls and to copy user-supplied paths verbatim.
 */
export const actionTapParamSchema = z.object({
locate: getMidsceneLocationSchema().describe('The element to be tapped'),
// Accepts one path or many; omitted entirely for ordinary taps.
fileChooserAccept: z
.union([z.string(), z.array(z.string())])
.optional()
.describe(
'Optional file path(s) to upload when this tap triggers a file chooser. Use only for file upload controls. If the user asks to upload a concrete file path, copy the exact path here.',
),
});
/**
 * Parameter type for the Tap action, declared alongside `actionTapParamSchema`.
 * Here `locate` is typed as a `LocateResultElement`, and `fileChooserAccept`
 * mirrors the schema's optional string-or-string-array field.
 */
export type ActionTapParam = {
// The element to tap.
locate: LocateResultElement;
// Optional file path(s) to supply if the tap opens a file chooser.
fileChooserAccept?: string | string[];
};

export const defineActionTap = (
Expand Down
125 changes: 125 additions & 0 deletions packages/core/tests/unit-test/ai-act-file-upload-tap.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import { join, resolve } from 'node:path';
import { TaskExecutor } from '@/agent';
import { buildYamlFlowFromPlans } from '@/common';
import { actionTapParamSchema } from '@/device';
import type { DeviceAction, PlanningAction } from '@/types';
import { describe, expect, it, vi } from 'vitest';

// Path used as the file to "upload"; the file name matches this test file itself.
const fixtureFile = join(__dirname, 'ai-act-file-upload-tap.test.ts');
// Minimal structural type for the handler that the mocked
// `registerFileChooserListener` receives: it is called with a chooser
// exposing only `accept(files)`.
type TestFileChooserHandler = (chooser: {
accept(files: string[]): Promise<void>;
}) => Promise<void>;

// Covers the `fileChooserAccept` field on planned Tap actions: first that it
// survives conversion to a yaml flow, then that executing the planned task
// accepts the files through the interface's file chooser listener.
describe('aiAct Tap fileChooserAccept', () => {
it('should keep fileChooserAccept in generated yaml flow', () => {
// A single planned Tap carrying a relative upload path.
const plans: PlanningAction[] = [
{
type: 'Tap',
thought: 'upload id card',
param: {
locate: { prompt: 'the id card upload button' },
fileChooserAccept: './fixtures/id-card.png',
},
},
];
// Partial Tap action definition; cast because only these fields are supplied.
const actionSpace = [
{
name: 'Tap',
description: 'Tap the element',
interfaceAlias: 'aiTap',
paramSchema: actionTapParamSchema,
call: vi.fn(),
},
] as unknown as DeviceAction[];

// The path is expected to appear unchanged in the generated flow item.
expect(buildYamlFlowFromPlans(plans, actionSpace)).toEqual([
{
aiTap: '',
locate: 'the id card upload button',
fileChooserAccept: './fixtures/id-card.png',
},
]);
});

it('should accept files when executing a planned Tap action', async () => {
// Holds the handler captured at registration; cleared again on dispose.
let fileChooserHandler: TestFileChooserHandler | undefined;
const dispose = vi.fn();
// Records every file list passed to `chooser.accept`.
const acceptedFiles: string[][] = [];
// Simulates a file chooser opening: invokes the registered handler, if any.
// When no handler is registered (before registration or after dispose) this is a no-op.
const triggerFileChooser = async () => {
await fileChooserHandler?.({
accept: async (files: string[]) => {
acceptedFiles.push(files);
},
});
};
const actionCall = vi.fn(async () => {});

const mockInterface = {
interfaceType: 'playwright',
registerFileChooserListener: vi.fn(
async (handler: TestFileChooserHandler) => {
fileChooserHandler = handler;
return {
dispose: () => {
fileChooserHandler = undefined;
dispose();
},
getError: () => undefined,
};
},
),
// The chooser is triggered from the after-action hook, so files are only
// recorded if the listener is still registered when this hook runs.
afterInvokeAction: vi.fn(async () => {
await triggerFileChooser();
}),
actionSpace: () => [
{
name: 'Tap',
description: 'Tap the element',
interfaceAlias: 'aiTap',
paramSchema: actionTapParamSchema,
// Zero delays keep the test from sleeping around the action.
delayBeforeRunner: 0,
delayAfterRunner: 0,
call: actionCall,
},
],
} as any;

const taskExecutor = new TaskExecutor(mockInterface, {} as any, {
actionSpace: mockInterface.actionSpace(),
});

// Same Tap plan as above, but pointing at a concrete file path.
const plans: PlanningAction[] = [
{
type: 'Tap',
thought: 'upload id card',
param: {
locate: { prompt: 'the id card upload button' },
fileChooserAccept: fixtureFile,
},
},
];

// `convertPlanToExecutable` is reached through an `any` cast.
const { tasks } = await (taskExecutor as any).convertPlanToExecutable(
plans,
);
// The Tap task is taken to be the last generated task.
const tapTask = tasks[tasks.length - 1];
// Replace the locate prompt with an already-located element before
// invoking the executor directly (presumably standing in for the result
// of a locate step that this test does not run; confirm against TaskBuilder).
tapTask.param.locate = {
id: 'upload',
center: [100, 200],
rect: { left: 90, top: 190, width: 20, height: 20 },
};

await tapTask.executor(tapTask.param, {
task: { timing: {} },
uiContext: {
shrunkShotToLogicalRatio: 1,
},
});

// One listener registration, one action call, one hook call; the files are
// accepted as the resolved fixture path; the listener is disposed exactly once.
expect(mockInterface.registerFileChooserListener).toHaveBeenCalledTimes(1);
expect(actionCall).toHaveBeenCalledTimes(1);
expect(mockInterface.afterInvokeAction).toHaveBeenCalledTimes(1);
expect(acceptedFiles).toEqual([[resolve(fixtureFile)]]);
expect(dispose).toHaveBeenCalledTimes(1);
});
});
Loading
Loading