diff --git a/packages/core/src/agent/file-chooser.ts b/packages/core/src/agent/file-chooser.ts new file mode 100644 index 0000000000..4bcd425680 --- /dev/null +++ b/packages/core/src/agent/file-chooser.ts @@ -0,0 +1,42 @@ +import type { AbstractInterface, FileChooserHandler } from '@/device'; +import { normalizeFilePaths } from './utils'; + +export type FileChooserAccept = string | string[]; + +export function normalizeFileChooserAccept(files: FileChooserAccept): string[] { + const filesArray = Array.isArray(files) ? files : [files]; + return normalizeFilePaths(filesArray); +} + +export async function withFileChooser<T>( + interfaceInstance: AbstractInterface, + fileChooserAccept: string[] | undefined, + action: () => Promise<T>, +): Promise<T> { + if (!fileChooserAccept?.length) { + return action(); + } + + if (!interfaceInstance.registerFileChooserListener) { + throw new Error( + `File upload is not supported on ${interfaceInstance.interfaceType}`, + ); + } + + const handler = async (chooser: FileChooserHandler) => { + await chooser.accept(fileChooserAccept); + }; + + const { dispose, getError } = + await interfaceInstance.registerFileChooserListener(handler); + try { + const result = await action(); + const error = await getError(); + if (error) { + throw error; + } + return result; + } finally { + dispose(); + } +} diff --git a/packages/core/src/agent/task-builder.ts b/packages/core/src/agent/task-builder.ts index 6487d2e88a..499973f597 100644 --- a/packages/core/src/agent/task-builder.ts +++ b/packages/core/src/agent/task-builder.ts @@ -25,6 +25,7 @@ import { sleep } from '@/utils'; import { generateElementByRect } from '@midscene/shared/extractor'; import { getDebug } from '@midscene/shared/logger'; import { assert } from '@midscene/shared/utils'; +import { normalizeFileChooserAccept, withFileChooser } from './file-chooser'; import type { TaskCache } from './task-cache'; import { withUsageIntent } from './usage-intent'; import { @@ -88,6 +89,21 @@ function 
normalizeLocateParam( return deepLocate === undefined ? rest : { ...rest, deepLocate }; } +function getTapFileChooserAccept( + actionName: string, + param: unknown, +): string[] | undefined { + if (actionName !== 'Tap' || !param || typeof param !== 'object') { + return undefined; + } + + const fileChooserAccept = (param as { fileChooserAccept?: string | string[] }) + .fileChooserAccept; + return fileChooserAccept + ? normalizeFileChooserAccept(fileChooserAccept) + : undefined; +} + export function locatePlanForLocate(param: string | DetailedLocateParam) { const locate = normalizeLocateParam(param); const locatePlan: PlanningAction = { @@ -342,38 +358,47 @@ export class TaskBuilder { debug('calling action', action.name); const actionFn = action.call.bind(this.interface); - const actionResult = await actionFn(param, taskContext); - setTimingFieldOnce(timing, 'callActionEnd'); - debug('called action', action.name, 'result:', actionResult); - - setTimingFieldOnce(timing, 'afterInvokeActionHookStart'); + const fileChooserAccept = getTapFileChooserAccept(action.name, param); + const actionResult = await withFileChooser( + this.interface, + fileChooserAccept, + async () => { + const result = await actionFn(param, taskContext); + setTimingFieldOnce(timing, 'callActionEnd'); + debug('called action', action.name, 'result:', result); + + setTimingFieldOnce(timing, 'afterInvokeActionHookStart'); + + const delayAfterRunner = + action.delayAfterRunner ?? this.waitAfterAction ?? 300; + if (delayAfterRunner > 0) { + await sleep(delayAfterRunner); + } - const delayAfterRunner = - action.delayAfterRunner ?? this.waitAfterAction ?? 
300; - if (delayAfterRunner > 0) { - await sleep(delayAfterRunner); - } + try { + if (this.interface.afterInvokeAction) { + debug( + `will call "afterInvokeAction" for interface with action name ${action.name}`, + ); + await this.interface.afterInvokeAction(action.name, param); + debug( + `called "afterInvokeAction" for interface with action name ${action.name}`, + ); + } + } catch (originalError: any) { + const originalMessage = + originalError?.message || String(originalError); + throw new Error( + `error in running afterInvokeAction for ${action.name}: ${originalMessage}`, + { cause: originalError }, + ); + } - try { - if (this.interface.afterInvokeAction) { - debug( - `will call "afterInvokeAction" for interface with action name ${action.name}`, - ); - await this.interface.afterInvokeAction(action.name, param); - debug( - `called "afterInvokeAction" for interface with action name ${action.name}`, - ); - } - } catch (originalError: any) { - const originalMessage = - originalError?.message || String(originalError); - throw new Error( - `error in running afterInvokeAction for ${action.name}: ${originalMessage}`, - { cause: originalError }, - ); - } + setTimingFieldOnce(timing, 'afterInvokeActionHookEnd'); - setTimingFieldOnce(timing, 'afterInvokeActionHookEnd'); + return result; + }, + ); return { output: actionResult, diff --git a/packages/core/src/agent/tasks.ts b/packages/core/src/agent/tasks.ts index 2dd8cbddeb..1f7514c3d7 100644 --- a/packages/core/src/agent/tasks.ts +++ b/packages/core/src/agent/tasks.ts @@ -10,7 +10,7 @@ import { userPromptToMultimodalPrompt, userPromptToString, } from '@/common'; -import type { AbstractInterface, FileChooserHandler } from '@/device'; +import type { AbstractInterface } from '@/device'; import type Service from '@/service'; import type { TaskRunner } from '@/task-runner'; import { TaskExecutionError } from '@/task-runner'; @@ -35,6 +35,7 @@ import { ServiceError } from '@/types'; import { getDebug } from 
'@midscene/shared/logger'; import { assert } from '@midscene/shared/utils'; import { ExecutionSession } from './execution-session'; +import { withFileChooser } from './file-chooser'; import { TaskBuilder } from './task-builder'; import type { TaskCache } from './task-cache'; export { locatePlanForLocate } from './task-builder'; @@ -82,6 +83,7 @@ function truncatePlanningFeedback(feedback: string): string { } export { TaskExecutionError }; +export { withFileChooser } from './file-chooser'; export class TaskExecutor { interface: AbstractInterface; @@ -949,37 +951,3 @@ export class TaskExecutor { return session.appendErrorPlan(`waitFor timeout: ${errorThought}`); } } - -export async function withFileChooser<T>( - interfaceInstance: AbstractInterface, - fileChooserAccept: string[] | undefined, - action: () => Promise<T>, -): Promise<T> { - if (!fileChooserAccept?.length) { - return action(); - } - - if (!interfaceInstance.registerFileChooserListener) { - throw new Error( - `File upload is not supported on ${interfaceInstance.interfaceType}`, - ); - } - - const handler = async (chooser: FileChooserHandler) => { - await chooser.accept(fileChooserAccept); - }; - - const { dispose, getError } = - await interfaceInstance.registerFileChooserListener(handler); - try { - const result = await action(); - // Check for errors that occurred during file chooser handling - const error = await getError(); - if (error) { - throw error; - } - return result; - } finally { - dispose(); - } -} diff --git a/packages/core/src/device/index.ts b/packages/core/src/device/index.ts index 011d98ec17..a8044c6a8d 100644 --- a/packages/core/src/device/index.ts +++ b/packages/core/src/device/index.ts @@ -285,9 +285,16 @@ function defineLocatedPointAction< // Tap export const actionTapParamSchema = z.object({ locate: getMidsceneLocationSchema().describe('The element to be tapped'), + fileChooserAccept: z + .union([z.string(), z.array(z.string())]) + .optional() + .describe( + 'Optional file path(s) to upload 
when this tap triggers a file chooser. Use only for file upload controls. If the user asks to upload a concrete file path, copy the exact path here.', + ), }); export type ActionTapParam = { locate: LocateResultElement; + fileChooserAccept?: string | string[]; }; export const defineActionTap = ( diff --git a/packages/core/tests/unit-test/ai-act-file-upload-tap.test.ts b/packages/core/tests/unit-test/ai-act-file-upload-tap.test.ts new file mode 100644 index 0000000000..952f5e387e --- /dev/null +++ b/packages/core/tests/unit-test/ai-act-file-upload-tap.test.ts @@ -0,0 +1,125 @@ +import { join, resolve } from 'node:path'; +import { TaskExecutor } from '@/agent'; +import { buildYamlFlowFromPlans } from '@/common'; +import { actionTapParamSchema } from '@/device'; +import type { DeviceAction, PlanningAction } from '@/types'; +import { describe, expect, it, vi } from 'vitest'; + +const fixtureFile = join(__dirname, 'ai-act-file-upload-tap.test.ts'); +type TestFileChooserHandler = (chooser: { + accept(files: string[]): Promise<void>; +}) => Promise<void>; + +describe('aiAct Tap fileChooserAccept', () => { + it('should keep fileChooserAccept in generated yaml flow', () => { + const plans: PlanningAction[] = [ + { + type: 'Tap', + thought: 'upload id card', + param: { + locate: { prompt: 'the id card upload button' }, + fileChooserAccept: './fixtures/id-card.png', + }, + }, + ]; + const actionSpace = [ + { + name: 'Tap', + description: 'Tap the element', + interfaceAlias: 'aiTap', + paramSchema: actionTapParamSchema, + call: vi.fn(), + }, + ] as unknown as DeviceAction[]; + + expect(buildYamlFlowFromPlans(plans, actionSpace)).toEqual([ + { + aiTap: '', + locate: 'the id card upload button', + fileChooserAccept: './fixtures/id-card.png', + }, + ]); + }); + + it('should accept files when executing a planned Tap action', async () => { + let fileChooserHandler: TestFileChooserHandler | undefined; + const dispose = vi.fn(); + const acceptedFiles: string[][] = []; + const triggerFileChooser 
= async () => { + await fileChooserHandler?.({ + accept: async (files: string[]) => { + acceptedFiles.push(files); + }, + }); + }; + const actionCall = vi.fn(async () => {}); + + const mockInterface = { + interfaceType: 'playwright', + registerFileChooserListener: vi.fn( + async (handler: TestFileChooserHandler) => { + fileChooserHandler = handler; + return { + dispose: () => { + fileChooserHandler = undefined; + dispose(); + }, + getError: () => undefined, + }; + }, + ), + afterInvokeAction: vi.fn(async () => { + await triggerFileChooser(); + }), + actionSpace: () => [ + { + name: 'Tap', + description: 'Tap the element', + interfaceAlias: 'aiTap', + paramSchema: actionTapParamSchema, + delayBeforeRunner: 0, + delayAfterRunner: 0, + call: actionCall, + }, + ], + } as any; + + const taskExecutor = new TaskExecutor(mockInterface, {} as any, { + actionSpace: mockInterface.actionSpace(), + }); + + const plans: PlanningAction[] = [ + { + type: 'Tap', + thought: 'upload id card', + param: { + locate: { prompt: 'the id card upload button' }, + fileChooserAccept: fixtureFile, + }, + }, + ]; + + const { tasks } = await (taskExecutor as any).convertPlanToExecutable( + plans, + ); + const tapTask = tasks[tasks.length - 1]; + tapTask.param.locate = { + id: 'upload', + center: [100, 200], + rect: { left: 90, top: 190, width: 20, height: 20 }, + }; + + await tapTask.executor(tapTask.param, { + task: { timing: {} }, + uiContext: { + shrunkShotToLogicalRatio: 1, + }, + }); + + expect(mockInterface.registerFileChooserListener).toHaveBeenCalledTimes(1); + expect(actionCall).toHaveBeenCalledTimes(1); + expect(mockInterface.afterInvokeAction).toHaveBeenCalledTimes(1); + expect(acceptedFiles).toEqual([[resolve(fixtureFile)]]); + expect(dispose).toHaveBeenCalledTimes(1); + }); +}); diff --git a/packages/web-integration/tests/ai/fixtures/ai-act-file-upload.html b/packages/web-integration/tests/ai/fixtures/ai-act-file-upload.html new file mode 100644 index 0000000000..f6629f9b5c --- 
/dev/null +++ b/packages/web-integration/tests/ai/fixtures/ai-act-file-upload.html @@ -0,0 +1,46 @@ + + + + AI Act File Upload + + + +

AI Act File Upload

+ + +
No file selected
+ + + + diff --git a/packages/web-integration/tests/ai/fixtures/bar.txt b/packages/web-integration/tests/ai/fixtures/bar.txt new file mode 100644 index 0000000000..5716ca5987 --- /dev/null +++ b/packages/web-integration/tests/ai/fixtures/bar.txt @@ -0,0 +1 @@ +bar diff --git a/packages/web-integration/tests/ai/fixtures/foo.txt b/packages/web-integration/tests/ai/fixtures/foo.txt new file mode 100644 index 0000000000..257cc5642c --- /dev/null +++ b/packages/web-integration/tests/ai/fixtures/foo.txt @@ -0,0 +1 @@ +foo diff --git a/packages/web-integration/tests/ai/web/playwright/file-upload.spec.ts b/packages/web-integration/tests/ai/web/playwright/file-upload.spec.ts index 9d830bafb2..c04d0a2fb7 100644 --- a/packages/web-integration/tests/ai/web/playwright/file-upload.spec.ts +++ b/packages/web-integration/tests/ai/web/playwright/file-upload.spec.ts @@ -3,6 +3,26 @@ import { expect } from 'playwright/test'; import { test } from './fixture'; test.describe('file upload functionality', () => { + test('should upload different files in one aiAct via prompt paths', async ({ + aiAct, + page, + }) => { + const fooFile = join(__dirname, '../../fixtures/foo.txt'); + const barFile = join(__dirname, '../../fixtures/bar.txt'); + const selectedFileName = page.locator('#selected-file-name'); + + await page.goto( + `file://${join(__dirname, '../../fixtures/ai-act-file-upload.html')}`, + ); + + await aiAct( + `First click the "Upload document" button and upload the file at this exact path: ${fooFile}. Wait until the page displays "foo.txt". Then click the "Upload document" button again and upload the file at this exact path: ${barFile}. Finish only after the page displays "bar.txt".`, + { cacheable: false }, + ); + + await expect(selectedFileName).toHaveText('bar.txt'); + }); + test('should upload single file', async ({ aiTap, aiAssert, page }) => { const testFile = join(__dirname, '../../fixtures/test-file.txt');