From 32c247d9a993b17b14cee923d95a7ad176f9a81d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Thu, 16 Apr 2026 15:06:25 +0200 Subject: [PATCH] feat: add runtime system and gesture commands --- COMMAND_OWNERSHIP.md | 23 + .../agent-device/references/macos-desktop.md | 2 +- src/__tests__/runtime-conformance.test.ts | 55 +++ src/__tests__/runtime-interactions.test.ts | 241 +++++++++- src/__tests__/runtime-public.test.ts | 24 +- src/__tests__/runtime-system.test.ts | 116 +++++ src/backend.ts | 126 +++++ src/commands/catalog.ts | 13 + src/commands/index.ts | 144 ++++++ src/commands/interaction-gestures.ts | 429 ++++++++++++++++++ src/commands/interaction-resolution.ts | 283 ++++++++++++ src/commands/interactions.ts | 367 +++------------ src/commands/router.ts | 158 +++++++ src/commands/system.ts | 398 ++++++++++++++++ src/index.ts | 13 + src/testing/conformance.ts | 149 ++++++ website/docs/docs/commands.md | 1 + 17 files changed, 2237 insertions(+), 305 deletions(-) create mode 100644 src/__tests__/runtime-system.test.ts create mode 100644 src/commands/interaction-gestures.ts create mode 100644 src/commands/interaction-resolution.ts create mode 100644 src/commands/system.ts diff --git a/COMMAND_OWNERSHIP.md b/COMMAND_OWNERSHIP.md index 13312379d..fceaf885f 100644 --- a/COMMAND_OWNERSHIP.md +++ b/COMMAND_OWNERSHIP.md @@ -69,6 +69,29 @@ Their semantics should live in `agent-device/commands` as they migrate. local file inputs remain command-policy gated. - `trigger-app-event`: runtime `apps.triggerEvent` implemented with event name and JSON payload validation. +- `back`: runtime `system.back` implemented with typed in-app/system modes. +- `home`: runtime `system.home` implemented. +- `rotate`: runtime `system.rotate` implemented with explicit orientation + validation. +- `keyboard`: runtime `system.keyboard` implemented with explicit status/get + and dismiss result shapes. +- `clipboard`: runtime `system.clipboard` implemented with read/write result + unions. +- `settings`: runtime `system.settings` implemented as a typed settings-open + primitive. +- `alert`: runtime `system.alert` implemented with explicit status, handled, + and wait result unions. +- `app-switcher`: runtime `system.appSwitcher` implemented. +- `focus`: runtime `interactions.focus` implemented for point, ref, and + selector targets. +- `longpress`: runtime `interactions.longPress` implemented for point, ref, and + selector targets. +- `swipe`: runtime `interactions.swipe` implemented with point, ref, selector, + and viewport-derived directional starts. +- `scroll`: runtime `interactions.scroll` implemented with viewport, point, ref, + and selector targets. +- `pinch`: runtime `interactions.pinch` implemented behind the typed backend + primitive. ## Boundary Requirements diff --git a/skills/agent-device/references/macos-desktop.md b/skills/agent-device/references/macos-desktop.md index 4f2d41aac..a411d370b 100644 --- a/skills/agent-device/references/macos-desktop.md +++ b/skills/agent-device/references/macos-desktop.md @@ -75,7 +75,7 @@ Use `snapshot --raw --platform macos` only when debugging AX structure or collec Things not to rely on: - Mobile-only helpers such as `install`, `reinstall`, or `push`. -- Desktop-global click or fill parity from `desktop` or `menubar` sessions. +- Desktop-global click, fill, or gesture parity from `desktop` or `menubar` sessions. - Raw coordinate assumptions across runs. Troubleshooting: diff --git a/src/__tests__/runtime-conformance.test.ts b/src/__tests__/runtime-conformance.test.ts index d1fbc9a0d..af01da093 100644 --- a/src/__tests__/runtime-conformance.test.ts +++ b/src/__tests__/runtime-conformance.test.ts @@ -31,6 +31,19 @@ test('command conformance suites run against a fixture backend', async () => { assert.equal(calls.includes('tap'), true); assert.equal(calls.includes('fill'), true); assert.equal(calls.includes('typeText'), true); + assert.equal(calls.includes('focus'), true); + assert.equal(calls.includes('longPress'), true); + assert.equal(calls.includes('swipe'), true); + assert.equal(calls.includes('scroll'), true); + assert.equal(calls.includes('pinch'), true); + assert.equal(calls.includes('pressBack'), true); + assert.equal(calls.includes('pressHome'), true); + assert.equal(calls.includes('rotate'), true); + assert.equal(calls.includes('setKeyboard'), true); + assert.equal(calls.includes('getClipboard'), true); + assert.equal(calls.includes('openSettings'), true); + assert.equal(calls.includes('handleAlert'), true); + assert.equal(calls.includes('openAppSwitcher'), true); assert.equal(calls.includes('openApp'), true); assert.equal(calls.includes('closeApp'), true); assert.equal(calls.includes('listApps'), true); @@ -78,6 +91,48 @@ function createFixtureBackend(calls: string[]): AgentDeviceBackend { typeText: async () => { calls.push('typeText'); }, + focus: async () => { + calls.push('focus'); + }, + longPress: async () => { + calls.push('longPress'); + }, + swipe: async () => { + calls.push('swipe'); + }, + scroll: async () => { + calls.push('scroll'); + }, + pinch: async () => { + calls.push('pinch'); + }, + pressBack: async () => { + calls.push('pressBack'); + }, + pressHome: async () => { + calls.push('pressHome'); + }, + rotate: async () => { + calls.push('rotate'); + }, + setKeyboard: async (_context, options) => { + calls.push('setKeyboard'); + return { action: options.action, visible: false }; + }, + getClipboard: async () => { + calls.push('getClipboard'); + return { text: 'copied' }; + }, + openSettings: async () => { + calls.push('openSettings'); + }, + handleAlert: async () => { + calls.push('handleAlert'); + return { kind: 'alertStatus', alert: null }; + }, + openAppSwitcher: async () => { + calls.push('openAppSwitcher'); + }, openApp: async () => { calls.push('openApp'); }, diff --git a/src/__tests__/runtime-interactions.test.ts b/src/__tests__/runtime-interactions.test.ts index de5021886..5f30a265b 100644 --- a/src/__tests__/runtime-interactions.test.ts +++ b/src/__tests__/runtime-interactions.test.ts @@ -193,6 +193,195 @@ test('runtime typeText validates refs and forwards text to the backend primitive ); }); +test('runtime focus and longPress share selector/ref target resolution', async () => { + const calls: unknown[] = []; + const device = createInteractionDevice(selectorSnapshot(), { + focus: async (_context, point) => { + calls.push({ command: 'focus', point }); + return { focused: true }; + }, + longPress: async (_context, point, options) => { + calls.push({ command: 'longPress', point, durationMs: options?.durationMs }); + }, + }); + + const focused = await device.interactions.focus(selector('label=Continue'), { + session: 'default', + }); + const longPressed = await device.interactions.longPress(ref('@e1'), { + session: 'default', + durationMs: 750, + }); + + assert.equal(focused.kind, 'selector'); + assert.deepEqual(focused.backendResult, { focused: true }); + assert.equal(longPressed.kind, 'ref'); + assert.deepEqual(calls, [ + { command: 'focus', point: { x: 60, y: 40 } }, + { command: 'longPress', point: { x: 60, y: 40 }, durationMs: 750 }, + ]); +}); + +test('runtime scroll resolves selector targets before calling the backend primitive', async () => { + const calls: unknown[] = []; + const device = createInteractionDevice(selectorSnapshot(), { + scroll: async (_context, target, options) => { + calls.push({ target, options }); + return { scrolled: true }; + }, + }); + + const selectorResult = await device.interactions.scroll({ + session: 'default', + target: selector('label=Continue'), + direction: 'down', + pixels: 120, + }); + const viewportResult = await device.interactions.scroll({ + direction: 'up', + amount: 0.5, + }); + + assert.equal(selectorResult.kind, 'selector'); + assert.equal(viewportResult.kind, 'viewport'); + assert.deepEqual(calls, [ + { + target: { kind: 'point', point: { x: 60, y: 40 } }, + options: { direction: 'down', pixels: 120 }, + }, + { + target: { kind: 'viewport' }, + options: { direction: 'up', amount: 0.5 }, + }, + ]); +}); + +test('runtime swipe supports explicit and viewport-derived targets', async () => { + const calls: unknown[] = []; + const device = createInteractionDevice(selectorSnapshot(), { + swipe: async (_context, from, to, options) => { + calls.push({ from, to, durationMs: options?.durationMs }); + }, + }); + + const explicit = await device.interactions.swipe({ + from: selector('label=Continue'), + to: { x: 200, y: 40 }, + durationMs: 300, + session: 'default', + }); + const directional = await device.interactions.swipe({ + direction: 'left', + distance: 25, + session: 'default', + }); + + assert.deepEqual(explicit.from, { x: 60, y: 40 }); + assert.deepEqual(directional.from, { x: 60, y: 40 }); + assert.deepEqual(directional.to, { x: 35, y: 40 }); + assert.deepEqual(calls, [ + { from: { x: 60, y: 40 }, to: { x: 200, y: 40 }, durationMs: 300 }, + { from: { x: 60, y: 40 }, to: { x: 35, y: 40 }, durationMs: undefined }, + ]); +}); + +test('runtime directional swipe uses the visible viewport instead of off-screen content bounds', async () => { + const calls: unknown[] = []; + const device = createInteractionDevice(snapshotWithOffscreenContent(), { + swipe: async (_context, from, to) => { + calls.push({ from, to }); + }, + }); + + const result = await device.interactions.swipe({ + direction: 'left', + distance: 25, + session: 'default', + }); + + assert.deepEqual(result.from, { x: 50, y: 50 }); + assert.deepEqual(result.to, { x: 25, y: 50 }); + assert.deepEqual(calls, [{ from: { x: 50, y: 50 }, to: { x: 25, y: 50 } }]); +}); + +test('runtime viewport gestures reject inspect-only macOS surfaces', async () => { + for (const surface of ['desktop', 'menubar'] as const) { + const device = createInteractionDevice(selectorSnapshot(), { + platform: 'macos', + sessionMetadata: { surface }, + scroll: async () => { + throw new Error(`${surface} scroll should be rejected before backend call`); + }, + swipe: async () => { + throw new Error(`${surface} swipe should be rejected before backend call`); + }, + pinch: async () => { + throw new Error(`${surface} pinch should be rejected before backend call`); + }, + }); + + await assert.rejects( + () => + device.interactions.scroll({ + direction: 'down', + target: { kind: 'viewport' }, + session: 'default', + }), + new RegExp(`scroll is not supported on macOS ${surface}`), + ); + await assert.rejects( + () => + device.interactions.swipe({ + direction: 'left', + session: 'default', + }), + new RegExp(`swipe is not supported on macOS ${surface}`), + ); + await assert.rejects( + () => + device.interactions.swipe({ + from: { x: 10, y: 20 }, + to: { x: 30, y: 20 }, + session: 'default', + }), + new RegExp(`swipe is not supported on macOS ${surface}`), + ); + await assert.rejects( + () => + device.interactions.pinch({ + scale: 1.2, + session: 'default', + }), + new RegExp(`pinch is not supported on macOS ${surface}`), + ); + } +}); + +test('runtime pinch is backend-gated and resolves optional center targets', async () => { + const calls: unknown[] = []; + const unsupported = createInteractionDevice(selectorSnapshot()); + await assert.rejects( + () => unsupported.interactions.pinch({ scale: 1.2 }), + /pinch is not supported/, + ); + + const device = createInteractionDevice(selectorSnapshot(), { + pinch: async (_context, options) => { + calls.push(options); + }, + }); + + const result = await device.interactions.pinch({ + scale: 0.8, + center: ref('@e1'), + session: 'default', + }); + + assert.equal(result.kind, 'pinch'); + assert.deepEqual(result.center, { x: 60, y: 40 }); + assert.deepEqual(calls, [{ scale: 0.8, center: { x: 60, y: 40 } }]); +}); + test('runtime interaction commands are available from the command namespace', async () => { const device = createInteractionDevice(selectorSnapshot(), { tap: async () => {}, @@ -233,9 +422,52 @@ function fillableSnapshot(): SnapshotState { ]); } +function snapshotWithOffscreenContent(): SnapshotState { + return makeSnapshotState([ + { + index: 0, + depth: 0, + type: 'Application', + label: 'Example', + rect: { x: 0, y: 0, width: 100, height: 100 }, + }, + { + index: 1, + depth: 1, + parentIndex: 0, + type: 'Button', + label: 'Visible', + rect: { x: 10, y: 10, width: 20, height: 20 }, + hittable: true, + }, + { + index: 2, + depth: 1, + parentIndex: 0, + type: 'Button', + label: 'Offscreen', + rect: { x: 10, y: 900, width: 20, height: 20 }, + hittable: true, + }, + ]); +} + function createInteractionDevice( snapshot: SnapshotState, - overrides: Partial> & { + overrides: Partial< + Pick< + AgentDeviceBackend, + | 'captureSnapshot' + | 'tap' + | 'fill' + | 'typeText' + | 'focus' + | 'longPress' + | 'scroll' + | 'swipe' + | 'pinch' + > + > & { platform?: AgentDeviceBackend['platform']; sessionMetadata?: Record; } = {}, @@ -248,6 +480,13 @@ function createInteractionDevice( tap: async (...args) => await overrides.tap?.(...args), fill: async (...args) => await overrides.fill?.(...args), typeText: async (...args) => await overrides.typeText?.(...args), + focus: overrides.focus ? async (...args) => await overrides.focus?.(...args) : undefined, + longPress: overrides.longPress + ? async (...args) => await overrides.longPress?.(...args) + : undefined, + scroll: overrides.scroll ? async (...args) => await overrides.scroll?.(...args) : undefined, + swipe: overrides.swipe ? async (...args) => await overrides.swipe?.(...args) : undefined, + pinch: overrides.pinch ? async (...args) => await overrides.pinch?.(...args) : undefined, } satisfies AgentDeviceBackend, artifacts: createLocalArtifactAdapter(), sessions: createMemorySessionStore([ diff --git a/src/__tests__/runtime-public.test.ts b/src/__tests__/runtime-public.test.ts index 9ff24f1b0..7989dc861 100644 --- a/src/__tests__/runtime-public.test.ts +++ b/src/__tests__/runtime-public.test.ts @@ -43,6 +43,7 @@ const backend = { getAppState: async (_context, app: string) => ({ bundleId: app, state: 'foreground' as const }), pushFile: async () => {}, triggerAppEvent: async () => {}, + pressHome: async () => {}, } satisfies AgentDeviceBackend; const artifacts = { @@ -76,6 +77,7 @@ test('package root exposes command runtime skeleton', async () => { assert.equal(device.policy.allowLocalInputPaths, false); assert.equal(typeof device.capture.screenshot, 'function'); assert.equal(typeof device.interactions.click, 'function'); + assert.equal(typeof device.system.back, 'function'); assert.equal(typeof device.apps.open, 'function'); const result = await device.capture.screenshot({}); assert.equal(result.path, '/tmp/path.png'); @@ -365,11 +367,24 @@ test('public backend, commands, io, and conformance subpaths are importable', () assert.equal(typeof commands.interactions.press, 'function'); assert.equal(typeof commands.interactions.fill, 'function'); assert.equal(typeof commands.interactions.typeText, 'function'); + assert.equal(typeof commands.interactions.focus, 'function'); + assert.equal(typeof commands.interactions.longPress, 'function'); + assert.equal(typeof commands.interactions.swipe, 'function'); + assert.equal(typeof commands.interactions.scroll, 'function'); + assert.equal(typeof commands.interactions.pinch, 'function'); + assert.equal(typeof commands.system.back, 'function'); + assert.equal(typeof commands.system.home, 'function'); + assert.equal(typeof commands.system.rotate, 'function'); + assert.equal(typeof commands.system.keyboard, 'function'); + assert.equal(typeof commands.system.clipboard, 'function'); + assert.equal(typeof commands.system.settings, 'function'); + assert.equal(typeof commands.system.alert, 'function'); + assert.equal(typeof commands.system.appSwitcher, 'function'); assert.equal( commandCatalog.some((entry) => entry.command === 'click' && entry.status === 'implemented'), true, ); - assert.equal(commandConformanceSuites.length, 4); + assert.equal(commandConformanceSuites.length, 5); assert.equal(typeof runCommandConformance, 'function'); assert.equal(target.name, 'fake'); }); @@ -421,6 +436,13 @@ test('command router dispatches implemented runtime commands and normalizes erro assert.equal(typed.ok, true); assert.equal(typed.ok && 'text' in typed.data ? typed.data.text : undefined, 'hello'); + const home = await router.dispatch({ + command: 'system.home', + options: {}, + }); + assert.equal(home.ok, true); + assert.equal(home.ok && 'kind' in home.data ? home.data.kind : undefined, 'systemHome'); + const opened = await router.dispatch({ command: 'apps.open', options: { diff --git a/src/__tests__/runtime-system.test.ts b/src/__tests__/runtime-system.test.ts new file mode 100644 index 000000000..2f73c75e7 --- /dev/null +++ b/src/__tests__/runtime-system.test.ts @@ -0,0 +1,116 @@ +import assert from 'node:assert/strict'; +import { test } from 'vitest'; +import type { + AgentDeviceBackend, + BackendAlertAction, + BackendDeviceOrientation, + BackendKeyboardOptions, +} from '../backend.ts'; +import { createLocalArtifactAdapter } from '../io.ts'; +import { createAgentDevice, localCommandPolicy } from '../runtime.ts'; + +test('runtime system commands call typed backend primitives', async () => { + const calls: unknown[] = []; + const device = createAgentDevice({ + backend: createSystemBackend(calls), + artifacts: createLocalArtifactAdapter(), + policy: localCommandPolicy(), + }); + + const back = await device.system.back({ session: 'default', mode: 'system' }); + const home = await device.system.home({ session: 'default' }); + const rotated = await device.system.rotate({ orientation: 'landscape-left' }); + const keyboard = await device.system.keyboard({ action: 'dismiss' }); + const clipboardRead = await device.system.clipboard({ action: 'read' }); + const clipboardWrite = await device.system.clipboard({ action: 'write', text: 'hello' }); + const settings = await device.system.settings({ target: 'privacy' }); + const alert = await device.system.alert({ action: 'accept', timeoutMs: 500 }); + const appSwitcher = await device.system.appSwitcher(); + + assert.equal(back.kind, 'systemBack'); + assert.equal(home.kind, 'systemHome'); + assert.equal(rotated.orientation, 'landscape-left'); + assert.equal(keyboard.kind, 'keyboardDismissed'); + assert.deepEqual(clipboardRead, { kind: 'clipboardText', action: 'read', text: 'copied' }); + assert.equal(clipboardWrite.kind, 'clipboardUpdated'); + assert.equal(clipboardWrite.textLength, 5); + assert.equal(settings.target, 'privacy'); + assert.equal(alert.kind, 'alertHandled'); + assert.equal(appSwitcher.kind, 'appSwitcherOpened'); + assert.deepEqual(calls, [ + { command: 'pressBack', mode: 'system', session: 'default' }, + { command: 'pressHome', session: 'default' }, + { command: 'rotate', orientation: 'landscape-left' }, + { command: 'setKeyboard', options: { action: 'dismiss' } }, + { command: 'getClipboard' }, + { command: 'setClipboard', text: 'hello' }, + { command: 'openSettings', target: 'privacy' }, + { command: 'handleAlert', action: 'accept', timeoutMs: 500 }, + { command: 'openAppSwitcher' }, + ]); +}); + +test('runtime system commands validate options before backend calls', async () => { + const calls: unknown[] = []; + const device = createAgentDevice({ + backend: createSystemBackend(calls), + artifacts: createLocalArtifactAdapter(), + policy: localCommandPolicy(), + }); + + await assert.rejects( + () => device.system.rotate({ orientation: 'sideways' as BackendDeviceOrientation }), + /orientation must be/, + ); + await assert.rejects( + () => device.system.keyboard({ action: 'hide' as BackendKeyboardOptions['action'] }), + /action must be/, + ); + await assert.rejects( + () => device.system.clipboard({ action: 'write', text: undefined as unknown as string }), + /requires text/, + ); + await assert.rejects( + () => device.system.alert({ action: 'tap' as BackendAlertAction }), + /action must be/, + ); + + assert.deepEqual(calls, []); +}); + +function createSystemBackend(calls: unknown[]): AgentDeviceBackend { + return { + platform: 'ios', + pressBack: async (context, options) => { + calls.push({ command: 'pressBack', mode: options?.mode, session: context.session }); + return { ok: true }; + }, + pressHome: async (context) => { + calls.push({ command: 'pressHome', session: context.session }); + }, + rotate: async (_context, orientation) => { + calls.push({ command: 'rotate', orientation }); + }, + setKeyboard: async (_context, options) => { + calls.push({ command: 'setKeyboard', options }); + return { action: options.action, dismissed: true, visible: false }; + }, + getClipboard: async () => { + calls.push({ command: 'getClipboard' }); + return { text: 'copied' }; + }, + setClipboard: async (_context, text) => { + calls.push({ command: 'setClipboard', text }); + }, + openSettings: async (_context, target) => { + calls.push({ command: 'openSettings', target }); + }, + handleAlert: async (_context, action, options) => { + calls.push({ command: 'handleAlert', action, timeoutMs: options?.timeoutMs }); + return { kind: 'alertHandled', handled: true, button: 'OK' }; + }, + openAppSwitcher: async () => { + calls.push({ command: 'openAppSwitcher' }); + }, + }; +} diff --git a/src/backend.ts b/src/backend.ts index 5945916f0..b17233f29 100644 --- a/src/backend.ts +++ b/src/backend.ts @@ -76,6 +76,61 @@ export type BackendScreenshotResult = { export type BackendActionResult = Record | void; +export type BackendDeviceOrientation = + | 'portrait' + | 'portrait-upside-down' + | 'landscape-left' + | 'landscape-right'; + +export type BackendBackOptions = { + mode?: 'in-app' | 'system'; +}; + +export type BackendKeyboardOptions = { + action: 'status' | 'get' | 'dismiss'; +}; + +export type BackendKeyboardResult = { + platform?: 'android' | 'ios' | 'macos' | 'linux'; + action?: BackendKeyboardOptions['action']; + visible?: boolean; + inputType?: string | null; + type?: string | null; + wasVisible?: boolean; + dismissed?: boolean; + attempts?: number; +}; + +export type BackendClipboardTextResult = { + text: string; +}; + +export type BackendAlertAction = 'get' | 'accept' | 'dismiss' | 'wait'; + +export type BackendAlertInfo = { + title?: string; + message?: string; + buttons?: string[]; +}; + +export type BackendAlertResult = + | { + kind: 'alertStatus'; + alert: BackendAlertInfo | null; + } + | { + kind: 'alertHandled'; + handled: boolean; + alert?: BackendAlertInfo; + button?: string; + } + | { + kind: 'alertWait'; + alert: BackendAlertInfo | null; + waitedMs?: number; + timedOut?: boolean; + }; + export type BackendTapOptions = { button?: 'primary' | 'secondary' | 'middle'; count?: number; @@ -89,6 +144,34 @@ export type BackendFillOptions = { delayMs?: number; }; +export type BackendLongPressOptions = { + durationMs?: number; +}; + +export type BackendSwipeOptions = { + durationMs?: number; +}; + +export type BackendScrollTarget = + | { + kind: 'viewport'; + } + | { + kind: 'point'; + point: Point; + }; + +export type BackendScrollOptions = { + direction: 'up' | 'down' | 'left' | 'right'; + amount?: number; + pixels?: number; +}; + +export type BackendPinchOptions = { + scale: number; + center?: Point; +}; + export type BackendOpenTarget = { /** * Generic app identifier accepted by the backend. Hosted adapters should @@ -217,11 +300,54 @@ export type AgentDeviceBackend = { text: string, options?: { delayMs?: number }, ): Promise; + focus?(context: BackendCommandContext, point: Point): Promise; + longPress?( + context: BackendCommandContext, + point: Point, + options?: BackendLongPressOptions, + ): Promise; + swipe?( + context: BackendCommandContext, + from: Point, + to: Point, + options?: BackendSwipeOptions, + ): Promise; + scroll?( + context: BackendCommandContext, + target: BackendScrollTarget, + options: BackendScrollOptions, + ): Promise; + pinch?( + context: BackendCommandContext, + options: BackendPinchOptions, + ): Promise; pressKey?( context: BackendCommandContext, key: string, options?: { modifiers?: string[] }, ): Promise; + pressBack?( + context: BackendCommandContext, + options?: BackendBackOptions, + ): Promise; + pressHome?(context: BackendCommandContext): Promise; + rotate?( + context: BackendCommandContext, + orientation: BackendDeviceOrientation, + ): Promise; + setKeyboard?( + context: BackendCommandContext, + options: BackendKeyboardOptions, + ): Promise; + getClipboard?(context: BackendCommandContext): Promise; + setClipboard?(context: BackendCommandContext, text: string): Promise; + openSettings?(context: BackendCommandContext, target?: string): Promise; + handleAlert?( + context: BackendCommandContext, + action: BackendAlertAction, + options?: { timeoutMs?: number }, + ): Promise; + openAppSwitcher?(context: BackendCommandContext): Promise; openApp?( context: BackendCommandContext, target: BackendOpenTarget, diff --git a/src/commands/catalog.ts b/src/commands/catalog.ts index a1ea402b6..5e1fdcd42 100644 --- a/src/commands/catalog.ts +++ b/src/commands/catalog.ts @@ -32,6 +32,11 @@ export const commandCatalog: readonly CommandCatalogEntry[] = [ { command: 'type', category: 'portable-runtime', status: 'implemented' }, { command: 'scroll', category: 'portable-runtime', status: 'planned' }, { command: 'pinch', category: 'portable-runtime', status: 'planned' }, + { command: 'interactions.focus', category: 'portable-runtime', status: 'implemented' }, + { command: 'interactions.longPress', category: 'portable-runtime', status: 'implemented' }, + { command: 'interactions.swipe', category: 'portable-runtime', status: 'implemented' }, + { command: 'interactions.scroll', category: 'portable-runtime', status: 'implemented' }, + { command: 'interactions.pinch', category: 'portable-runtime', status: 'implemented' }, { command: 'open', category: 'portable-runtime', status: 'planned' }, { command: 'close', category: 'portable-runtime', status: 'planned' }, { command: 'apps', category: 'portable-runtime', status: 'planned' }, @@ -49,6 +54,14 @@ export const commandCatalog: readonly CommandCatalogEntry[] = [ { command: 'keyboard', category: 'portable-runtime', status: 'planned' }, { command: 'clipboard', category: 'portable-runtime', status: 'planned' }, { command: 'settings', category: 'portable-runtime', status: 'planned' }, + { command: 'system.back', category: 'portable-runtime', status: 'implemented' }, + { command: 'system.home', category: 'portable-runtime', status: 'implemented' }, + { command: 'system.rotate', category: 'portable-runtime', status: 'implemented' }, + { command: 'system.appSwitcher', category: 'portable-runtime', status: 'implemented' }, + { command: 'system.keyboard', category: 'portable-runtime', status: 'implemented' }, + { command: 'system.clipboard', category: 'portable-runtime', status: 'implemented' }, + { command: 'system.settings', category: 'portable-runtime', status: 'implemented' }, + { command: 'system.alert', category: 'portable-runtime', status: 'implemented' }, { command: 'push', category: 'portable-runtime', status: 'planned' }, { command: 'trigger-app-event', category: 'portable-runtime', status: 'planned' }, { command: 'devices', category: 'backend-admin', status: 'planned' }, diff --git a/src/commands/index.ts b/src/commands/index.ts index 02a2a297b..f2946151f 100644 --- a/src/commands/index.ts +++ b/src/commands/index.ts @@ -40,17 +40,58 @@ import { import { clickCommand, fillCommand, + focusCommand, + longPressCommand, + pinchCommand, pressCommand, + scrollCommand, + swipeCommand, typeTextCommand, type ClickCommandOptions, type FillCommandOptions, type FillCommandResult, + type FocusCommandOptions, + type FocusCommandResult, type InteractionTarget, + type LongPressCommandOptions, + type LongPressCommandResult, + type PinchCommandOptions, + type PinchCommandResult, type PressCommandOptions, type PressCommandResult, + type ScrollCommandOptions, + type ScrollCommandResult, + type SwipeCommandOptions, + type SwipeCommandResult, type TypeTextCommandOptions, type TypeTextCommandResult, } from './interactions.ts'; +import { + alertCommand, + appSwitcherCommand, + backCommand, + clipboardCommand, + homeCommand, + keyboardCommand, + rotateCommand, + settingsCommand, + type SystemAlertCommandOptions, + type SystemAlertCommandResult, + type SystemAppSwitcherCommandOptions, + type SystemAppSwitcherCommandResult, + type SystemBackCommandOptions, + type SystemBackCommandResult, + type SystemClipboardCommandOptions, + type SystemClipboardCommandResult, + type SystemHomeCommandOptions, + type SystemHomeCommandResult, + type SystemKeyboardCommandOptions, + type SystemKeyboardCommandResult, + type SystemRotateCommandOptions, + type SystemRotateCommandResult, + type SystemSettingsCommandOptions, + type SystemSettingsCommandResult, +} from './system.ts'; import { closeAppCommand, getAppStateCommand, @@ -107,13 +148,44 @@ export type { ClickCommandOptions, FillCommandOptions, FillCommandResult, + FocusCommandOptions, + FocusCommandResult, InteractionTarget, + LongPressCommandOptions, + LongPressCommandResult, + PinchCommandOptions, + PinchCommandResult, PointTarget, PressCommandOptions, PressCommandResult, + ResolvedInteractionTarget, + ScrollCommandOptions, + ScrollCommandResult, + ScrollTarget, + SwipeCommandOptions, + SwipeCommandResult, + SwipeOptions, TypeTextCommandOptions, TypeTextCommandResult, } from './interactions.ts'; +export type { + SystemAlertCommandOptions, + SystemAlertCommandResult, + SystemAppSwitcherCommandOptions, + SystemAppSwitcherCommandResult, + SystemBackCommandOptions, + SystemBackCommandResult, + SystemClipboardCommandOptions, + SystemClipboardCommandResult, + SystemHomeCommandOptions, + SystemHomeCommandResult, + SystemKeyboardCommandOptions, + SystemKeyboardCommandResult, + SystemRotateCommandOptions, + SystemRotateCommandResult, + SystemSettingsCommandOptions, + SystemSettingsCommandResult, +} from './system.ts'; export type { AppPushInput, CloseAppCommandOptions, @@ -195,6 +267,24 @@ export type AgentDeviceCommands = { press: RuntimeCommand; fill: RuntimeCommand; typeText: RuntimeCommand; + focus: RuntimeCommand; + longPress: RuntimeCommand; + swipe: RuntimeCommand; + scroll: RuntimeCommand; + pinch: RuntimeCommand; + }; + system: { + back: RuntimeCommand; + home: RuntimeCommand; + rotate: RuntimeCommand; + keyboard: RuntimeCommand; + clipboard: RuntimeCommand; + settings: RuntimeCommand; + alert: RuntimeCommand; + appSwitcher: RuntimeCommand< + SystemAppSwitcherCommandOptions | undefined, + SystemAppSwitcherCommandResult + >; }; apps: { open: RuntimeCommand; @@ -257,6 +347,29 @@ export type BoundAgentDeviceCommands = { text: string, options?: Omit, ) => Promise; + focus: ( + target: InteractionTarget, + options?: Omit, + ) => Promise; + longPress: ( + target: InteractionTarget, + options?: Omit, + ) => Promise; + swipe: BoundRuntimeCommand; + scroll: BoundRuntimeCommand; + pinch: BoundRuntimeCommand; + }; + system: { + back: (options?: SystemBackCommandOptions) => Promise; + home: (options?: SystemHomeCommandOptions) => Promise; + rotate: BoundRuntimeCommand; + keyboard: (options?: SystemKeyboardCommandOptions) => Promise; + clipboard: BoundRuntimeCommand; + settings: (options?: SystemSettingsCommandOptions) => Promise; + alert: (options?: SystemAlertCommandOptions) => Promise; + appSwitcher: ( + options?: SystemAppSwitcherCommandOptions, + ) => Promise; }; apps: { open: BoundRuntimeCommand; @@ -291,6 +404,21 @@ export const commands: AgentDeviceCommands = { press: pressCommand, fill: fillCommand, typeText: typeTextCommand, + focus: focusCommand, + longPress: longPressCommand, + swipe: swipeCommand, + scroll: scrollCommand, + pinch: pinchCommand, + }, + system: { + back: backCommand, + home: homeCommand, + rotate: rotateCommand, + keyboard: keyboardCommand, + clipboard: clipboardCommand, + settings: settingsCommand, + alert: alertCommand, + appSwitcher: appSwitcherCommand, }, apps: { open: openAppCommand, @@ -333,6 +461,22 @@ export function bindCommands(runtime: AgentDeviceRuntime): BoundAgentDeviceComma commands.interactions.fill(runtime, { ...options, target, text }), typeText: (text, options = {}) => commands.interactions.typeText(runtime, { ...options, text }), + focus: (target, options = {}) => commands.interactions.focus(runtime, { ...options, target }), + longPress: (target, options = {}) => + commands.interactions.longPress(runtime, { ...options, target }), + swipe: (options) => commands.interactions.swipe(runtime, options), + scroll: (options) => commands.interactions.scroll(runtime, options), + pinch: (options) => commands.interactions.pinch(runtime, options), + }, + system: { + back: (options) => commands.system.back(runtime, options), + home: (options) => commands.system.home(runtime, options), + rotate: (options) => commands.system.rotate(runtime, options), + keyboard: (options) => commands.system.keyboard(runtime, options), + clipboard: (options) => commands.system.clipboard(runtime, options), + settings: (options) => commands.system.settings(runtime, options), + alert: (options) => commands.system.alert(runtime, options), + appSwitcher: (options) => commands.system.appSwitcher(runtime, options), }, apps: { open: (options) => commands.apps.open(runtime, options), diff --git a/src/commands/interaction-gestures.ts b/src/commands/interaction-gestures.ts new file mode 100644 index 000000000..838bd17a2 --- /dev/null +++ b/src/commands/interaction-gestures.ts @@ -0,0 +1,429 @@ +import { AppError } from '../utils/errors.ts'; +import type { Point, Rect, SnapshotNode, SnapshotState } from '../utils/snapshot.ts'; +import { centerOfRect } from '../utils/snapshot.ts'; +import type { AgentDeviceRuntime, CommandContext } from '../runtime.ts'; +import { requireIntInRange } from '../utils/validation.ts'; +import { successText } from '../utils/success-text.ts'; +import { isNodeVisibleInEffectiveViewport } from '../utils/mobile-snapshot-semantics.ts'; +import type { RuntimeCommand } from './index.ts'; +import { + assertSupportedInteractionSurface, + captureInteractionSnapshot, + type InteractionTarget, + type ResolvedInteractionTarget, + resolveInteractionTarget, +} from './interaction-resolution.ts'; +import { toBackendContext } from './selector-read-utils.ts'; + +export type FocusCommandOptions = CommandContext & { + target: InteractionTarget; +}; + +export type FocusCommandResult = ResolvedInteractionTarget & { + backendResult?: Record; + message?: string; +}; + +export type LongPressCommandOptions = CommandContext & { + target: InteractionTarget; + durationMs?: number; +}; + +export type LongPressCommandResult = ResolvedInteractionTarget & { + durationMs?: number; + backendResult?: Record; + message?: string; +}; + +export type GestureDirection = 'up' | 'down' | 'left' | 'right'; + +export type ScrollTarget = + | InteractionTarget + | { + kind: 'viewport'; + }; + +export type ScrollCommandOptions = CommandContext & { + target?: ScrollTarget; + direction: GestureDirection; + amount?: number; + pixels?: number; +}; + +export type ScrollCommandResult = + | { + kind: 'viewport'; + direction: GestureDirection; + amount?: number; + pixels?: number; + backendResult?: Record; + message?: string; + } + | (ResolvedInteractionTarget & { + direction: GestureDirection; + amount?: number; + pixels?: number; + backendResult?: Record; + message?: string; + }); + +type ResolvedScrollTarget = { kind: 'viewport' } | ResolvedInteractionTarget; + +export type SwipeOptions = { + from?: Point | InteractionTarget; + to?: Point; + direction?: GestureDirection; + distance?: number; + durationMs?: number; +}; + +export type SwipeCommandOptions = CommandContext & SwipeOptions; + +export type SwipeCommandResult = { + kind: 'swipe'; + from: Point; + to: Point; + direction?: GestureDirection; + distance?: number; + durationMs?: number; + fromTarget?: ResolvedInteractionTarget | { kind: 'viewport' }; + backendResult?: Record; + message?: string; +}; + +export type PinchCommandOptions = CommandContext & { + scale: number; + center?: InteractionTarget; +}; + +export type PinchCommandResult = { + kind: 'pinch'; + scale: number; + center?: Point; + centerTarget?: ResolvedInteractionTarget; + backendResult?: Record; + message?: string; +}; + +export const focusCommand: RuntimeCommand = async ( + runtime, + options, +): Promise => { + const resolved = await resolveInteractionTarget(runtime, options, { + action: 'focus', + requireInteractive: true, + promoteToHittableAncestor: false, + }); + if (!runtime.backend.focus) { + throw new AppError('UNSUPPORTED_OPERATION', 'focus is not supported by this backend'); + } + const backendResult = await runtime.backend.focus( + toBackendContext(runtime, options), + resolved.point, + ); + const formattedBackendResult = toBackendResult(backendResult); + return { + ...resolved, + ...(formattedBackendResult ? { backendResult: formattedBackendResult } : {}), + ...successText(`Focused (${resolved.point.x}, ${resolved.point.y})`), + }; +}; + +export const longPressCommand: RuntimeCommand< + LongPressCommandOptions, + LongPressCommandResult +> = async (runtime, options): Promise => { + const resolved = await resolveInteractionTarget(runtime, options, { + action: 'longPress', + requireInteractive: true, + promoteToHittableAncestor: true, + }); + if (!runtime.backend.longPress) { + throw new AppError('UNSUPPORTED_OPERATION', 'longPress is not supported by this backend'); + } + const durationMs = + options.durationMs === undefined + ? undefined + : requireIntInRange(options.durationMs, 'durationMs', 0, 120_000); + const backendResult = await runtime.backend.longPress( + toBackendContext(runtime, options), + resolved.point, + { durationMs }, + ); + const formattedBackendResult = toBackendResult(backendResult); + return { + ...resolved, + ...(durationMs !== undefined ? { durationMs } : {}), + ...(formattedBackendResult ? { backendResult: formattedBackendResult } : {}), + ...successText(`Long pressed (${resolved.point.x}, ${resolved.point.y})`), + }; +}; + +export const scrollCommand: RuntimeCommand = async ( + runtime, + options, +): Promise => { + if (!runtime.backend.scroll) { + throw new AppError('UNSUPPORTED_OPERATION', 'scroll is not supported by this backend'); + } + const direction = requireDirection(options.direction, 'scroll direction'); + const amount = normalizeOptionalPositiveNumber(options.amount, 'scroll amount'); + const pixels = normalizeOptionalPositiveInteger(options.pixels, 'scroll pixels'); + if (amount !== undefined && pixels !== undefined) { + throw new AppError('INVALID_ARGS', 'scroll accepts either amount or pixels, not both'); + } + + const resolved = await resolveScrollTarget(runtime, options); + const backendTarget = + resolved.kind === 'viewport' + ? { kind: 'viewport' as const } + : { kind: 'point' as const, point: resolved.point }; + const backendResult = await runtime.backend.scroll( + toBackendContext(runtime, options), + backendTarget, + { + direction, + ...(amount !== undefined ? { amount } : {}), + ...(pixels !== undefined ? { pixels } : {}), + }, + ); + const formattedBackendResult = toBackendResult(backendResult); + return { + ...resolved, + direction, + ...(amount !== undefined ? { amount } : {}), + ...(pixels !== undefined ? { pixels } : {}), + ...(formattedBackendResult ? { backendResult: formattedBackendResult } : {}), + ...successText( + pixels !== undefined + ? `Scrolled ${direction} by ${pixels}px` + : amount !== undefined + ? `Scrolled ${direction} by ${amount}` + : `Scrolled ${direction}`, + ), + }; +}; + +export const swipeCommand: RuntimeCommand = async ( + runtime, + options, +): Promise => { + if (!runtime.backend.swipe) { + throw new AppError('UNSUPPORTED_OPERATION', 'swipe is not supported by this backend'); + } + const resolvedFrom = await resolveSwipeFrom(runtime, options); + const to = resolveSwipeTo(resolvedFrom.point, options); + const durationMs = + options.durationMs === undefined + ? undefined + : requireIntInRange(options.durationMs, 'durationMs', 16, 10_000); + const backendResult = await runtime.backend.swipe( + toBackendContext(runtime, options), + resolvedFrom.point, + to.point, + { durationMs }, + ); + const formattedBackendResult = toBackendResult(backendResult); + return { + kind: 'swipe', + from: resolvedFrom.point, + to: to.point, + ...(to.direction ? { direction: to.direction } : {}), + ...(to.distance !== undefined ? { distance: to.distance } : {}), + ...(durationMs !== undefined ? { durationMs } : {}), + ...(resolvedFrom.target ? { fromTarget: resolvedFrom.target } : {}), + ...(formattedBackendResult ? { backendResult: formattedBackendResult } : {}), + ...successText('Swiped'), + }; +}; + +export const pinchCommand: RuntimeCommand = async ( + runtime, + options, +): Promise => { + if (!runtime.backend.pinch) { + throw new AppError('UNSUPPORTED_OPERATION', 'pinch is not supported by this backend'); + } + await assertSupportedInteractionSurface(runtime, options, 'pinch'); + const scale = normalizePositiveNumber(options.scale, 'pinch scale'); + const centerTarget = options.center + ? await resolveInteractionTarget( + runtime, + { ...options, target: options.center }, + { + action: 'pinch', + requireInteractive: false, + promoteToHittableAncestor: false, + }, + ) + : undefined; + const backendResult = await runtime.backend.pinch(toBackendContext(runtime, options), { + scale, + ...(centerTarget ? { center: centerTarget.point } : {}), + }); + const formattedBackendResult = toBackendResult(backendResult); + return { + kind: 'pinch', + scale, + ...(centerTarget ? { center: centerTarget.point, centerTarget } : {}), + ...(formattedBackendResult ? { backendResult: formattedBackendResult } : {}), + ...successText(`Pinched to scale ${scale}`), + }; +}; + +async function resolveScrollTarget( + runtime: AgentDeviceRuntime, + options: ScrollCommandOptions, +): Promise { + const target = options.target ?? { kind: 'viewport' as const }; + if (target.kind === 'viewport') { + await assertSupportedInteractionSurface(runtime, options, 'scroll'); + return { kind: 'viewport' }; + } + return await resolveInteractionTarget( + runtime, + { ...options, target }, + { + action: 'scroll', + requireInteractive: false, + promoteToHittableAncestor: false, + }, + ); +} + +async function resolveSwipeFrom( + runtime: AgentDeviceRuntime, + options: SwipeCommandOptions, +): Promise<{ + point: Point; + target?: ResolvedInteractionTarget | { kind: 'viewport' }; +}> { + if (options.from) { + if (isPointLike(options.from)) { + await assertSupportedInteractionSurface(runtime, options, 'swipe'); + return { point: requirePoint(options.from, 'from') }; + } + const target = await resolveInteractionTarget( + runtime, + { ...options, target: options.from }, + { + action: 'swipe', + requireInteractive: false, + promoteToHittableAncestor: false, + }, + ); + return { point: target.point, target }; + } + if (!options.direction) { + throw new AppError('INVALID_ARGS', 'swipe requires from+to or a direction'); + } + await assertSupportedInteractionSurface(runtime, options, 'swipe'); + const capture = await captureInteractionSnapshot(runtime, options, false); + const viewport = resolveSnapshotViewport(capture.snapshot.nodes); + return { + point: centerOfRect(viewport), + target: { kind: 'viewport' }, + }; +} + +function resolveSwipeTo( + from: Point, + options: SwipeCommandOptions, +): { point: Point; direction?: GestureDirection; distance?: number } { + if (options.to) return { point: requirePoint(options.to, 'to') }; + const direction = requireDirection(options.direction, 'swipe direction'); + const distance = normalizePositiveNumber(options.distance ?? 200, 'swipe distance'); + switch (direction) { + case 'up': + return { point: { x: from.x, y: from.y - distance }, direction, distance }; + case 'down': + return { point: { x: from.x, y: from.y + distance }, direction, distance }; + case 'left': + return { point: { x: from.x - distance, y: from.y }, direction, distance }; + case 'right': + return { point: { x: from.x + distance, y: from.y }, direction, distance }; + } +} + +function requireDirection( + direction: GestureDirection | undefined, + field: string, +): GestureDirection { + switch (direction) { + case 'up': + case 'down': + case 'left': + case 'right': + return direction; + default: + throw new AppError('INVALID_ARGS', `${field} must be up, down, left, or right`); + } +} + +function requirePoint(point: Point, field: string): Point { + const x = Number(point.x); + const y = Number(point.y); + if (!Number.isFinite(x) || !Number.isFinite(y)) { + throw new AppError('INVALID_ARGS', `${field} point requires finite x and y`); + } + return { x, y }; +} + +function isPointLike(value: Point | InteractionTarget): value is Point { + return 'x' in value && 'y' in value; +} + +function normalizeOptionalPositiveNumber( + value: number | undefined, + field: string, +): number | undefined { + return value === undefined ? undefined : normalizePositiveNumber(value, field); +} + +function normalizePositiveNumber(value: number, field: string): number { + if (!Number.isFinite(value) || value <= 0) { + throw new AppError('INVALID_ARGS', `${field} must be a positive number`); + } + return value; +} + +function normalizeOptionalPositiveInteger( + value: number | undefined, + field: string, +): number | undefined { + if (value === undefined) return undefined; + if (!Number.isFinite(value) || !Number.isInteger(value) || value <= 0) { + throw new AppError('INVALID_ARGS', `${field} must be a positive integer`); + } + return value; +} + +function resolveSnapshotViewport(nodes: SnapshotState['nodes']): Rect { + const visibleRects = nodes + .filter((node) => isNodeVisibleInEffectiveViewport(node, nodes)) + .map((node) => node.rect) + .filter(isUsableRect); + const rects = + visibleRects.length > 0 ? visibleRects : nodes.map((node) => node.rect).filter(isUsableRect); + if (rects.length === 0) { + throw new AppError('COMMAND_FAILED', 'Cannot infer viewport for directional swipe'); + } + const minX = Math.min(...rects.map((rect) => rect.x)); + const minY = Math.min(...rects.map((rect) => rect.y)); + const maxX = Math.max(...rects.map((rect) => rect.x + rect.width)); + const maxY = Math.max(...rects.map((rect) => rect.y + rect.height)); + return { + x: minX, + y: minY, + width: maxX - minX, + height: maxY - minY, + }; +} + +function isUsableRect(rect: SnapshotNode['rect']): rect is NonNullable { + return Boolean(rect && rect.width > 0 && rect.height > 0); +} + +function toBackendResult(result: unknown): Record | undefined { + return result && typeof result === 'object' ? (result as Record) : undefined; +} diff --git a/src/commands/interaction-resolution.ts b/src/commands/interaction-resolution.ts new file mode 100644 index 000000000..91478b7e7 --- /dev/null +++ b/src/commands/interaction-resolution.ts @@ -0,0 +1,283 @@ +import { AppError } from '../utils/errors.ts'; +import type { Point, SnapshotNode, SnapshotState } from '../utils/snapshot.ts'; +import { centerOfRect, findNodeByRef, normalizeRef } from '../utils/snapshot.ts'; +import type { AgentDeviceRuntime, CommandContext } from '../runtime.ts'; +import { formatSelectorFailure, parseSelectorChain, resolveSelectorChain } from '../selectors.ts'; +import { buildSelectorChainForNode } from '../utils/selector-build.ts'; +import { findNodeByLabel, resolveRefLabel } from '../utils/snapshot-processing.ts'; +import { + isNodeVisibleInEffectiveViewport, + resolveEffectiveViewportRect, +} from '../utils/mobile-snapshot-semantics.ts'; +import { resolveActionableTouchNode } from './interaction-targeting.ts'; +import type { ElementTarget, ResolvedTarget } from './selector-read.ts'; +import { now, toBackendContext } from './selector-read-utils.ts'; + +export type PointTarget = { + kind: 'point'; + x: number; + y: number; +}; + +export type InteractionTarget = ElementTarget | PointTarget; + +export type ResolvedInteractionTarget = + | { + kind: 'point'; + point: Point; + } + | { + kind: 'ref'; + point: Point; + target: Extract; + node: SnapshotNode; + selectorChain: string[]; + refLabel?: string; + } + | { + kind: 'selector'; + point: Point; + target: Extract; + node: SnapshotNode; + selectorChain: string[]; + refLabel?: string; + }; + +export type InteractionAction = + | 'click' + | 'press' + | 'fill' + | 'focus' + | 'longPress' + | 'scroll' + | 'swipe' + | 'pinch'; + +export type CapturedSnapshot = { + snapshot: SnapshotState; +}; + +export async function resolveInteractionTarget( + runtime: AgentDeviceRuntime, + options: CommandContext & { target: InteractionTarget }, + params: { + action: InteractionAction; + requireInteractive: boolean; + promoteToHittableAncestor: boolean; + }, +): Promise { + await assertSupportedInteractionSurface(runtime, options, params.action); + + if (options.target.kind === 'point') { + return { + kind: 'point', + point: { x: options.target.x, y: options.target.y }, + }; + } + + if (options.target.kind === 'ref') { + const capture = await resolveSnapshotForRef(runtime, options, options.target); + const resolved = capture.resolved; + const node = params.promoteToHittableAncestor + ? resolveActionableTouchNode(capture.snapshot.nodes, resolved.node) + : resolved.node; + assertVisibleRefTarget(node, capture.snapshot.nodes, options.target.ref, params.action); + const point = resolveNodeCenter( + node, + `Ref ${options.target.ref} not found or has invalid bounds`, + ); + return { + kind: 'ref', + point, + target: { kind: 'ref', ref: `@${resolved.ref}` }, + node, + selectorChain: buildSelectorChainForNode(node, runtime.backend.platform, { + action: params.action === 'fill' ? 'fill' : 'click', + }), + refLabel: resolveRefLabel(node, capture.snapshot.nodes), + }; + } + + const capture = await captureInteractionSnapshot(runtime, options, params.requireInteractive); + const chain = parseSelectorChain(options.target.selector); + const resolved = resolveSelectorChain(capture.snapshot.nodes, chain, { + platform: runtime.backend.platform, + requireRect: true, + requireUnique: true, + disambiguateAmbiguous: true, + }); + if (!resolved || !resolved.node.rect) { + throw new AppError( + 'COMMAND_FAILED', + formatSelectorFailure(chain, resolved?.diagnostics ?? [], { unique: true }), + ); + } + const node = params.promoteToHittableAncestor + ? resolveActionableTouchNode(capture.snapshot.nodes, resolved.node) + : resolved.node; + const point = resolveNodeCenter( + node, + `Selector ${resolved.selector.raw} resolved to invalid bounds`, + ); + return { + kind: 'selector', + point, + target: { kind: 'selector', selector: resolved.selector.raw }, + node, + selectorChain: buildSelectorChainForNode(node, runtime.backend.platform, { + action: params.action === 'fill' ? 'fill' : 'click', + }), + refLabel: resolveRefLabel(node, capture.snapshot.nodes), + }; +} + +export async function captureInteractionSnapshot( + runtime: AgentDeviceRuntime, + options: CommandContext, + interactiveOnly: boolean, +): Promise { + if (!runtime.backend.captureSnapshot) { + throw new AppError('UNSUPPORTED_OPERATION', 'snapshot is not supported by this backend'); + } + const sessionName = options.session ?? 'default'; + const session = await runtime.sessions.get(sessionName); + if (!session) throw new AppError('SESSION_NOT_FOUND', 'No active session. Run open first.'); + const result = await runtime.backend.captureSnapshot(toBackendContext(runtime, options), { + interactiveOnly, + compact: interactiveOnly, + }); + const snapshot = + result.snapshot ?? + ({ + nodes: result.nodes ?? [], + truncated: result.truncated, + backend: result.backend as SnapshotState['backend'], + createdAt: now(runtime), + } satisfies SnapshotState); + await runtime.sessions.set({ ...session, snapshot }); + return { snapshot }; +} + +export async function assertSupportedInteractionSurface( + runtime: AgentDeviceRuntime, + options: CommandContext, + action: InteractionAction, +): Promise { + if (runtime.backend.platform !== 'macos') return; + const surface = await resolveInteractionSurface(runtime, options); + if (surface !== 'desktop' && surface !== 'menubar') return; + // Menu bar button activation is supported by the existing daemon path; text entry is not. + if (surface === 'menubar' && (action === 'click' || action === 'press')) return; + throw new AppError( + 'UNSUPPORTED_OPERATION', + `${action} is not supported on macOS ${surface} sessions yet. Open an app session to act, or use the ${surface} surface to inspect.`, + ); +} + +async function resolveInteractionSurface( + runtime: AgentDeviceRuntime, + options: CommandContext, +): Promise { + const session = await runtime.sessions.get(options.session ?? 'default'); + return session?.metadata?.surface; +} + +async function resolveSnapshotForRef( + runtime: AgentDeviceRuntime, + options: CommandContext, + target: Extract, +): Promise { + const sessionName = options.session ?? 'default'; + const session = await runtime.sessions.get(sessionName); + if (!session) throw new AppError('SESSION_NOT_FOUND', 'No active session. Run open first.'); + if (!session.snapshot) { + throw new AppError('INVALID_ARGS', 'No snapshot in session. Run snapshot first.'); + } + + const fallbackLabel = target.fallbackLabel ?? ''; + const stored = tryResolveRefNode(session.snapshot.nodes, target.ref, { + fallbackLabel, + requireRect: true, + }); + if (stored) { + return { snapshot: session.snapshot, resolved: stored }; + } + + const capture = await captureInteractionSnapshot(runtime, options, true); + const refreshed = tryResolveRefNode(capture.snapshot.nodes, target.ref, { + fallbackLabel, + requireRect: true, + }); + if (!refreshed) { + throw new AppError('COMMAND_FAILED', `Ref ${target.ref} not found or has no bounds`); + } + return { ...capture, resolved: refreshed }; +} + +function tryResolveRefNode( + nodes: SnapshotState['nodes'], + refInput: string, + options: { + fallbackLabel: string; + requireRect: boolean; + }, +): { ref: string; node: SnapshotNode } | null { + const ref = normalizeRef(refInput); + if (!ref) throw new AppError('INVALID_ARGS', `Invalid ref: ${refInput}`); + const refNode = findNodeByRef(nodes, ref); + if (isUsableResolvedNode(refNode, options.requireRect)) return { ref, node: refNode }; + const fallbackNode = + options.fallbackLabel.length > 0 ? findNodeByLabel(nodes, options.fallbackLabel) : null; + if (isUsableResolvedNode(fallbackNode, options.requireRect)) { + return { ref, node: fallbackNode }; + } + return null; +} + +function resolveNodeCenter(node: SnapshotNode, message: string): Point { + if (!node.rect) throw new AppError('COMMAND_FAILED', message); + const point = centerOfRect(node.rect); + if (!Number.isFinite(point.x) || !Number.isFinite(point.y)) { + throw new AppError('COMMAND_FAILED', message); + } + return point; +} + +function isUsableResolvedNode( + node: SnapshotNode | null | undefined, + requireRect: boolean, +): node is SnapshotNode { + if (!node) return false; + if (!requireRect) return true; + if (!node.rect) return false; + const { x, y, width, height } = node.rect; + if ( + !Number.isFinite(Number(x)) || + !Number.isFinite(Number(y)) || + !Number.isFinite(Number(width)) || + !Number.isFinite(Number(height)) || + Number(width) < 0 || + Number(height) < 0 + ) { + return false; + } + const point = centerOfRect(node.rect); + return Number.isFinite(point.x) && Number.isFinite(point.y); +} + +function assertVisibleRefTarget( + node: SnapshotNode, + nodes: SnapshotState['nodes'], + refInput: string, + action: InteractionAction, +): void { + const viewport = node.rect ? resolveEffectiveViewportRect(node, nodes) : null; + if (!node.rect || !viewport || isNodeVisibleInEffectiveViewport(node, nodes)) return; + throw new AppError('COMMAND_FAILED', `Ref ${refInput} is off-screen and not safe to ${action}`, { + reason: 'offscreen_ref', + ref: normalizeRef(refInput), + rect: node.rect, + viewport, + hint: `Use scroll with the direction from the off-screen summary, take a fresh snapshot, then retry ${action} with the new ref or a selector.`, + }); +} diff --git a/src/commands/interactions.ts b/src/commands/interactions.ts index 1d3c789ff..3f0955e35 100644 --- a/src/commands/interactions.ts +++ b/src/commands/interactions.ts @@ -1,28 +1,44 @@ import { AppError } from '../utils/errors.ts'; -import type { Point, SnapshotNode, SnapshotState } from '../utils/snapshot.ts'; -import { centerOfRect, findNodeByRef, normalizeRef } from '../utils/snapshot.ts'; import type { AgentDeviceRuntime, CommandContext } from '../runtime.ts'; -import { formatSelectorFailure, parseSelectorChain, resolveSelectorChain } from '../selectors.ts'; -import { buildSelectorChainForNode } from '../utils/selector-build.ts'; -import { findNodeByLabel, isFillableType, resolveRefLabel } from '../utils/snapshot-processing.ts'; +import { isFillableType } from '../utils/snapshot-processing.ts'; import { requireIntInRange } from '../utils/validation.ts'; -import { - isNodeVisibleInEffectiveViewport, - resolveEffectiveViewportRect, -} from '../utils/mobile-snapshot-semantics.ts'; import { successText } from '../utils/success-text.ts'; -import { resolveActionableTouchNode } from './interaction-targeting.ts'; -import type { ElementTarget, ResolvedTarget } from './selector-read.ts'; -import { now, toBackendContext } from './selector-read-utils.ts'; +import type { ResolvedTarget } from './selector-read.ts'; +import { toBackendContext } from './selector-read-utils.ts'; import type { RuntimeCommand } from './index.ts'; - -export type PointTarget = { - kind: 'point'; - x: number; - y: number; -}; - -export type InteractionTarget = ElementTarget | PointTarget; +import { + type InteractionTarget, + type ResolvedInteractionTarget, + resolveInteractionTarget, +} from './interaction-resolution.ts'; + +export { + focusCommand, + longPressCommand, + pinchCommand, + scrollCommand, + swipeCommand, +} from './interaction-gestures.ts'; +export type { + FocusCommandOptions, + FocusCommandResult, + GestureDirection, + LongPressCommandOptions, + LongPressCommandResult, + PinchCommandOptions, + PinchCommandResult, + ScrollCommandOptions, + ScrollCommandResult, + ScrollTarget, + SwipeCommandOptions, + SwipeCommandResult, + SwipeOptions, +} from './interaction-gestures.ts'; +export type { + InteractionTarget, + PointTarget, + ResolvedInteractionTarget, +} from './interaction-resolution.ts'; export type PressCommandOptions = CommandContext & { target: InteractionTarget; @@ -36,28 +52,6 @@ export type PressCommandOptions = CommandContext & { export type ClickCommandOptions = PressCommandOptions; -type ResolvedInteractionTarget = - | { - kind: 'point'; - point: Point; - } - | { - kind: 'ref'; - point: Point; - target: Extract; - node: SnapshotNode; - selectorChain: string[]; - refLabel?: string; - } - | { - kind: 'selector'; - point: Point; - target: Extract; - node: SnapshotNode; - selectorChain: string[]; - refLabel?: string; - }; - export type PressCommandResult = ResolvedInteractionTarget & { backendResult?: Record; }; @@ -87,10 +81,6 @@ export type TypeTextCommandResult = { message?: string; }; -type CapturedSnapshot = { - snapshot: SnapshotState; -}; - export const pressCommand: RuntimeCommand = async ( runtime, options, @@ -101,37 +91,6 @@ export const clickCommand: RuntimeCommand => await tapCommand(runtime, options, 'click'); -async function tapCommand( - runtime: AgentDeviceRuntime, - options: PressCommandOptions, - action: 'click' | 'press', -): Promise { - const resolved = await resolveInteractionTarget(runtime, options, { - action, - requireInteractive: true, - promoteToHittableAncestor: true, - }); - if (!runtime.backend.tap) { - throw new AppError('UNSUPPORTED_OPERATION', 'tap is not supported by this backend'); - } - const backendResult = await runtime.backend.tap( - toBackendContext(runtime, options), - resolved.point, - { - button: options.button, - count: options.count, - intervalMs: options.intervalMs, - holdMs: options.holdMs, - jitterPx: options.jitterPx, - doubleTap: options.doubleTap, - }, - ); - return { - ...resolved, - ...(toBackendResult(backendResult) ? { backendResult: toBackendResult(backendResult) } : {}), - }; -} - export const fillCommand: RuntimeCommand = async ( runtime, options, @@ -151,6 +110,7 @@ export const fillCommand: RuntimeCommand options.text, { delayMs: options.delayMs }, ); + const formattedBackendResult = toBackendResult(backendResult); const nodeType = 'node' in resolved ? (resolved.node.type ?? '') : ''; const warning = nodeType && !isFillableType(nodeType, runtime.backend.platform) @@ -160,7 +120,7 @@ export const fillCommand: RuntimeCommand ...resolved, text: options.text, ...(warning ? { warning } : {}), - ...(toBackendResult(backendResult) ? { backendResult: toBackendResult(backendResult) } : {}), + ...(formattedBackendResult ? { backendResult: formattedBackendResult } : {}), }; }; @@ -187,241 +147,48 @@ export const typeTextCommand: RuntimeCommand< const backendResult = await runtime.backend.typeText(toBackendContext(runtime, options), text, { delayMs, }); - const message = formatTextLengthMessage('Typed', text); + const formattedBackendResult = toBackendResult(backendResult); return { kind: 'text', text, delayMs, - ...(toBackendResult(backendResult) ? { backendResult: toBackendResult(backendResult) } : {}), - ...successText(message), + ...(formattedBackendResult ? { backendResult: formattedBackendResult } : {}), + ...successText(`Typed ${Array.from(text).length} chars`), }; }; -async function resolveInteractionTarget( +async function tapCommand( runtime: AgentDeviceRuntime, - options: CommandContext & { target: InteractionTarget }, - params: { - action: 'click' | 'press' | 'fill'; - requireInteractive: boolean; - promoteToHittableAncestor: boolean; - }, -): Promise { - await assertSupportedInteractionSurface(runtime, options, params.action); - - if (options.target.kind === 'point') { - return { - kind: 'point', - point: { x: options.target.x, y: options.target.y }, - }; - } - - if (options.target.kind === 'ref') { - const capture = await resolveSnapshotForRef(runtime, options, options.target); - const resolved = capture.resolved; - const node = params.promoteToHittableAncestor - ? resolveActionableTouchNode(capture.snapshot.nodes, resolved.node) - : resolved.node; - assertVisibleRefTarget(node, capture.snapshot.nodes, options.target.ref, params.action); - const point = resolveNodeCenter( - node, - `Ref ${options.target.ref} not found or has invalid bounds`, - ); - return { - kind: 'ref', - point, - target: { kind: 'ref', ref: `@${resolved.ref}` }, - node, - selectorChain: buildSelectorChainForNode(node, runtime.backend.platform, { - action: params.action === 'fill' ? 'fill' : 'click', - }), - refLabel: resolveRefLabel(node, capture.snapshot.nodes), - }; - } - - const capture = await captureInteractionSnapshot(runtime, options, params.requireInteractive); - const chain = parseSelectorChain(options.target.selector); - const resolved = resolveSelectorChain(capture.snapshot.nodes, chain, { - platform: runtime.backend.platform, - requireRect: true, - requireUnique: true, - disambiguateAmbiguous: true, + options: PressCommandOptions, + action: 'click' | 'press', +): Promise { + const resolved = await resolveInteractionTarget(runtime, options, { + action, + requireInteractive: true, + promoteToHittableAncestor: true, }); - if (!resolved || !resolved.node.rect) { - throw new AppError( - 'COMMAND_FAILED', - formatSelectorFailure(chain, resolved?.diagnostics ?? [], { unique: true }), - ); + if (!runtime.backend.tap) { + throw new AppError('UNSUPPORTED_OPERATION', 'tap is not supported by this backend'); } - const node = params.promoteToHittableAncestor - ? resolveActionableTouchNode(capture.snapshot.nodes, resolved.node) - : resolved.node; - const point = resolveNodeCenter( - node, - `Selector ${resolved.selector.raw} resolved to invalid bounds`, + const backendResult = await runtime.backend.tap( + toBackendContext(runtime, options), + resolved.point, + { + button: options.button, + count: options.count, + intervalMs: options.intervalMs, + holdMs: options.holdMs, + jitterPx: options.jitterPx, + doubleTap: options.doubleTap, + }, ); + const formattedBackendResult = toBackendResult(backendResult); return { - kind: 'selector', - point, - target: { kind: 'selector', selector: resolved.selector.raw }, - node, - selectorChain: buildSelectorChainForNode(node, runtime.backend.platform, { - action: params.action === 'fill' ? 'fill' : 'click', - }), - refLabel: resolveRefLabel(node, capture.snapshot.nodes), + ...resolved, + ...(formattedBackendResult ? { backendResult: formattedBackendResult } : {}), }; } -async function assertSupportedInteractionSurface( - runtime: AgentDeviceRuntime, - options: CommandContext, - action: 'click' | 'press' | 'fill', -): Promise { - if (runtime.backend.platform !== 'macos') return; - const surface = await resolveInteractionSurface(runtime, options); - if (surface !== 'desktop' && surface !== 'menubar') return; - // Menu bar button activation is supported by the existing daemon path; text entry is not. - if (surface === 'menubar' && (action === 'click' || action === 'press')) return; - throw new AppError( - 'UNSUPPORTED_OPERATION', - `${action} is not supported on macOS ${surface} sessions yet. Open an app session to act, or use the ${surface} surface to inspect.`, - ); -} - -async function resolveInteractionSurface( - runtime: AgentDeviceRuntime, - options: CommandContext, -): Promise { - const session = await runtime.sessions.get(options.session ?? 'default'); - return session?.metadata?.surface; -} - -async function captureInteractionSnapshot( - runtime: AgentDeviceRuntime, - options: CommandContext, - interactiveOnly: boolean, -): Promise { - if (!runtime.backend.captureSnapshot) { - throw new AppError('UNSUPPORTED_OPERATION', 'snapshot is not supported by this backend'); - } - const sessionName = options.session ?? 'default'; - const session = await runtime.sessions.get(sessionName); - if (!session) throw new AppError('SESSION_NOT_FOUND', 'No active session. Run open first.'); - const result = await runtime.backend.captureSnapshot(toBackendContext(runtime, options), { - interactiveOnly, - compact: interactiveOnly, - }); - const snapshot = - result.snapshot ?? - ({ - nodes: result.nodes ?? [], - truncated: result.truncated, - backend: result.backend as SnapshotState['backend'], - createdAt: now(runtime), - } satisfies SnapshotState); - await runtime.sessions.set({ ...session, snapshot }); - return { snapshot }; -} - -async function resolveSnapshotForRef( - runtime: AgentDeviceRuntime, - options: CommandContext, - target: Extract, -): Promise { - const sessionName = options.session ?? 'default'; - const session = await runtime.sessions.get(sessionName); - if (!session) throw new AppError('SESSION_NOT_FOUND', 'No active session. Run open first.'); - if (!session.snapshot) { - throw new AppError('INVALID_ARGS', 'No snapshot in session. Run snapshot first.'); - } - - const fallbackLabel = target.fallbackLabel ?? ''; - const stored = tryResolveRefNode(session.snapshot.nodes, target.ref, { - fallbackLabel, - requireRect: true, - }); - if (stored) { - return { snapshot: session.snapshot, resolved: stored }; - } - - const capture = await captureInteractionSnapshot(runtime, options, true); - const refreshed = tryResolveRefNode(capture.snapshot.nodes, target.ref, { - fallbackLabel, - requireRect: true, - }); - if (!refreshed) { - throw new AppError('COMMAND_FAILED', `Ref ${target.ref} not found or has no bounds`); - } - return { ...capture, resolved: refreshed }; -} - -function tryResolveRefNode( - nodes: SnapshotState['nodes'], - refInput: string, - options: { - fallbackLabel: string; - requireRect: boolean; - }, -): { ref: string; node: SnapshotNode } | null { - const ref = normalizeRef(refInput); - if (!ref) throw new AppError('INVALID_ARGS', `Invalid ref: ${refInput}`); - const refNode = findNodeByRef(nodes, ref); - if (isUsableResolvedNode(refNode, options.requireRect)) return { ref, node: refNode }; - const fallbackNode = - options.fallbackLabel.length > 0 ? findNodeByLabel(nodes, options.fallbackLabel) : null; - if (isUsableResolvedNode(fallbackNode, options.requireRect)) { - return { ref, node: fallbackNode }; - } - return null; -} - -function resolveNodeCenter(node: SnapshotNode, message: string): Point { - if (!node.rect) throw new AppError('COMMAND_FAILED', message); - const point = centerOfRect(node.rect); - if (!Number.isFinite(point.x) || !Number.isFinite(point.y)) { - throw new AppError('COMMAND_FAILED', message); - } - return point; -} - -function isUsableResolvedNode( - node: SnapshotNode | null | undefined, - requireRect: boolean, -): node is SnapshotNode { - if (!node) return false; - if (!requireRect) return true; - if (!node.rect) return false; - const { x, y, width, height } = node.rect; - if ( - !Number.isFinite(Number(x)) || - !Number.isFinite(Number(y)) || - !Number.isFinite(Number(width)) || - !Number.isFinite(Number(height)) || - Number(width) < 0 || - Number(height) < 0 - ) { - return false; - } - const point = centerOfRect(node.rect); - return Number.isFinite(point.x) && Number.isFinite(point.y); -} - -function assertVisibleRefTarget( - node: SnapshotNode, - nodes: SnapshotState['nodes'], - refInput: string, - action: 'click' | 'press' | 'fill', -): void { - const viewport = node.rect ? resolveEffectiveViewportRect(node, nodes) : null; - if (!node.rect || !viewport || isNodeVisibleInEffectiveViewport(node, nodes)) return; - throw new AppError('COMMAND_FAILED', `Ref ${refInput} is off-screen and not safe to ${action}`, { - reason: 'offscreen_ref', - ref: normalizeRef(refInput), - rect: node.rect, - viewport, - hint: `Use scroll with the direction from the off-screen summary, take a fresh snapshot, then retry ${action} with the new ref or a selector.`, - }); -} - function toBackendResult(result: unknown): Record | undefined { return result && typeof result === 'object' ? (result as Record) : undefined; } @@ -446,7 +213,3 @@ function findMistargetedTypeRef(text: string): string | null { } return null; } - -function formatTextLengthMessage(action: 'Typed' | 'Filled', text: string): string { - return `${action} ${Array.from(text).length} chars`; -} diff --git a/src/commands/router.ts b/src/commands/router.ts index a5992921b..d7fb69684 100644 --- a/src/commands/router.ts +++ b/src/commands/router.ts @@ -29,16 +29,57 @@ import { import { clickCommand, fillCommand, + focusCommand, + longPressCommand, + pinchCommand, pressCommand, + scrollCommand, + swipeCommand, typeTextCommand, type ClickCommandOptions, type FillCommandOptions, type FillCommandResult, + type FocusCommandOptions, + type FocusCommandResult, + type LongPressCommandOptions, + type LongPressCommandResult, + type PinchCommandOptions, + type PinchCommandResult, type PressCommandOptions, type PressCommandResult, + type ScrollCommandOptions, + type ScrollCommandResult, + type SwipeCommandOptions, + type SwipeCommandResult, type TypeTextCommandOptions, type TypeTextCommandResult, } from './interactions.ts'; +import { + alertCommand, + appSwitcherCommand, + backCommand, + clipboardCommand, + homeCommand, + keyboardCommand, + rotateCommand, + settingsCommand, + type SystemAlertCommandOptions, + type SystemAlertCommandResult, + type SystemAppSwitcherCommandOptions, + type SystemAppSwitcherCommandResult, + type SystemBackCommandOptions, + type SystemBackCommandResult, + type SystemClipboardCommandOptions, + type SystemClipboardCommandResult, + type SystemHomeCommandOptions, + type SystemHomeCommandResult, + type SystemKeyboardCommandOptions, + type SystemKeyboardCommandResult, + type SystemRotateCommandOptions, + type SystemRotateCommandResult, + type SystemSettingsCommandOptions, + type SystemSettingsCommandResult, +} from './system.ts'; import { closeAppCommand, getAppStateCommand, @@ -127,6 +168,71 @@ export type CommandRouterRequest = options: TypeTextCommandOptions; context?: TContext; } + | { + command: 'interactions.focus'; + options: FocusCommandOptions; + context?: TContext; + } + | { + command: 'interactions.longPress'; + options: LongPressCommandOptions; + context?: TContext; + } + | { + command: 'interactions.swipe'; + options: SwipeCommandOptions; + context?: TContext; + } + | { + command: 'interactions.scroll'; + options: ScrollCommandOptions; + context?: TContext; + } + | { + command: 'interactions.pinch'; + options: PinchCommandOptions; + context?: TContext; + } + | { + command: 'system.back'; + options?: SystemBackCommandOptions; + context?: TContext; + } + | { + command: 'system.home'; + options?: SystemHomeCommandOptions; + context?: TContext; + } + | { + command: 'system.rotate'; + options: SystemRotateCommandOptions; + context?: TContext; + } + | { + command: 'system.keyboard'; + options?: SystemKeyboardCommandOptions; + context?: TContext; + } + | { + command: 'system.clipboard'; + options: SystemClipboardCommandOptions; + context?: TContext; + } + | { + command: 'system.settings'; + options?: SystemSettingsCommandOptions; + context?: TContext; + } + | { + command: 'system.alert'; + options?: SystemAlertCommandOptions; + context?: TContext; + } + | { + command: 'system.appSwitcher'; + options?: SystemAppSwitcherCommandOptions; + context?: TContext; + } | { command: 'apps.open'; options: OpenAppCommandOptions; @@ -170,6 +276,19 @@ export type CommandRouterResult = | PressCommandResult | FillCommandResult | TypeTextCommandResult + | FocusCommandResult + | LongPressCommandResult + | SwipeCommandResult + | ScrollCommandResult + | PinchCommandResult + | SystemBackCommandResult + | SystemHomeCommandResult + | SystemRotateCommandResult + | SystemKeyboardCommandResult + | SystemClipboardCommandResult + | SystemSettingsCommandResult + | SystemAlertCommandResult + | SystemAppSwitcherCommandResult | OpenAppCommandResult | CloseAppCommandResult | ListAppsCommandResult @@ -232,6 +351,19 @@ const implementedRouterCommands = new Set([ 'interactions.press', 'interactions.fill', 'interactions.typeText', + 'interactions.focus', + 'interactions.longPress', + 'interactions.swipe', + 'interactions.scroll', + 'interactions.pinch', + 'system.back', + 'system.home', + 'system.rotate', + 'system.keyboard', + 'system.clipboard', + 'system.settings', + 'system.alert', + 'system.appSwitcher', 'apps.open', 'apps.close', 'apps.list', @@ -284,6 +416,32 @@ async function dispatchRuntimeCommand( return await fillCommand(runtime, request.options); case 'interactions.typeText': return await typeTextCommand(runtime, request.options); + case 'interactions.focus': + return await focusCommand(runtime, request.options); + case 'interactions.longPress': + return await longPressCommand(runtime, request.options); + case 'interactions.swipe': + return await swipeCommand(runtime, request.options); + case 'interactions.scroll': + return await scrollCommand(runtime, request.options); + case 'interactions.pinch': + return await pinchCommand(runtime, request.options); + case 'system.back': + return await backCommand(runtime, request.options); + case 'system.home': + return await homeCommand(runtime, request.options); + case 'system.rotate': + return await rotateCommand(runtime, request.options); + case 'system.keyboard': + return await keyboardCommand(runtime, request.options); + case 'system.clipboard': + return await clipboardCommand(runtime, request.options); + case 'system.settings': + return await settingsCommand(runtime, request.options); + case 'system.alert': + return await alertCommand(runtime, request.options); + case 'system.appSwitcher': + return await appSwitcherCommand(runtime, request.options); case 'apps.open': return await openAppCommand(runtime, request.options); case 'apps.close': diff --git a/src/commands/system.ts b/src/commands/system.ts new file mode 100644 index 000000000..cf98b7079 --- /dev/null +++ b/src/commands/system.ts @@ -0,0 +1,398 @@ +import type { + BackendAlertAction, + BackendAlertInfo, + BackendAlertResult, + BackendDeviceOrientation, + BackendKeyboardResult, +} from '../backend.ts'; +import type { CommandContext } from '../runtime.ts'; +import { AppError } from '../utils/errors.ts'; +import { successText } from '../utils/success-text.ts'; +import { requireIntInRange } from '../utils/validation.ts'; +import type { RuntimeCommand } from './index.ts'; +import { toBackendContext } from './selector-read-utils.ts'; + +export type SystemBackCommandOptions = CommandContext & { + mode?: 'in-app' | 'system'; +}; + +export type SystemBackCommandResult = { + kind: 'systemBack'; + mode: 'in-app' | 'system'; + backendResult?: Record; + message?: string; +}; + +export type SystemHomeCommandOptions = CommandContext; + +export type SystemHomeCommandResult = { + kind: 'systemHome'; + backendResult?: Record; + message?: string; +}; + +export type SystemRotateCommandOptions = CommandContext & { + orientation: BackendDeviceOrientation; +}; + +export type SystemRotateCommandResult = { + kind: 'systemRotated'; + orientation: BackendDeviceOrientation; + backendResult?: Record; + message?: string; +}; + +export type SystemKeyboardCommandOptions = CommandContext & { + action?: 'status' | 'get' | 'dismiss'; +}; + +export type SystemKeyboardCommandResult = + | { + kind: 'keyboardState'; + action: 'status' | 'get'; + state: BackendKeyboardResult; + backendResult?: Record; + } + | { + kind: 'keyboardDismissed'; + action: 'dismiss'; + state: BackendKeyboardResult; + backendResult?: Record; + message?: string; + }; + +export type SystemClipboardCommandOptions = + | (CommandContext & { + action: 'read'; + }) + | (CommandContext & { + action: 'write'; + text: string; + }); + +export type SystemClipboardCommandResult = + | { + kind: 'clipboardText'; + action: 'read'; + text: string; + } + | { + kind: 'clipboardUpdated'; + action: 'write'; + textLength: number; + backendResult?: Record; + message?: string; + }; + +export type SystemSettingsCommandOptions = CommandContext & { + target?: string; +}; + +export type SystemSettingsCommandResult = { + kind: 'settingsOpened'; + target?: string; + backendResult?: Record; + message?: string; +}; + +export type SystemAlertCommandOptions = CommandContext & { + action?: BackendAlertAction; + timeoutMs?: number; +}; + +export type SystemAlertCommandResult = + | { + kind: 'alertStatus'; + action: 'get'; + alert: BackendAlertInfo | null; + } + | { + kind: 'alertHandled'; + action: 'accept' | 'dismiss'; + handled: boolean; + alert?: BackendAlertInfo; + button?: string; + message?: string; + } + | { + kind: 'alertWait'; + action: 'wait'; + alert: BackendAlertInfo | null; + waitedMs?: number; + timedOut?: boolean; + message?: string; + }; + +export type SystemAppSwitcherCommandOptions = CommandContext; + +export type SystemAppSwitcherCommandResult = { + kind: 'appSwitcherOpened'; + backendResult?: Record; + message?: string; +}; + +export const backCommand: RuntimeCommand< + SystemBackCommandOptions | undefined, + SystemBackCommandResult +> = async (runtime, options = {}): Promise => { + if (!runtime.backend.pressBack) { + throw new AppError('UNSUPPORTED_OPERATION', 'system.back is not supported by this backend'); + } + const mode = options.mode ?? 'in-app'; + if (mode !== 'in-app' && mode !== 'system') { + throw new AppError('INVALID_ARGS', 'system.back mode must be in-app or system'); + } + const backendResult = await runtime.backend.pressBack(toBackendContext(runtime, options), { + mode, + }); + const formattedBackendResult = toBackendResult(backendResult); + return { + kind: 'systemBack', + mode, + ...(formattedBackendResult ? { backendResult: formattedBackendResult } : {}), + ...successText('Back'), + }; +}; + +export const homeCommand: RuntimeCommand< + SystemHomeCommandOptions | undefined, + SystemHomeCommandResult +> = async (runtime, options = {}): Promise => { + if (!runtime.backend.pressHome) { + throw new AppError('UNSUPPORTED_OPERATION', 'system.home is not supported by this backend'); + } + const backendResult = await runtime.backend.pressHome(toBackendContext(runtime, options)); + const formattedBackendResult = toBackendResult(backendResult); + return { + kind: 'systemHome', + ...(formattedBackendResult ? { backendResult: formattedBackendResult } : {}), + ...successText('Home'), + }; +}; + +export const rotateCommand: RuntimeCommand< + SystemRotateCommandOptions, + SystemRotateCommandResult +> = async (runtime, options): Promise => { + if (!runtime.backend.rotate) { + throw new AppError('UNSUPPORTED_OPERATION', 'system.rotate is not supported by this backend'); + } + const orientation = requireOrientation(options.orientation); + const backendResult = await runtime.backend.rotate( + toBackendContext(runtime, options), + orientation, + ); + const formattedBackendResult = toBackendResult(backendResult); + return { + kind: 'systemRotated', + orientation, + ...(formattedBackendResult ? { backendResult: formattedBackendResult } : {}), + ...successText(`Rotated to ${orientation}`), + }; +}; + +export const keyboardCommand: RuntimeCommand< + SystemKeyboardCommandOptions | undefined, + SystemKeyboardCommandResult +> = async (runtime, options = {}): Promise => { + if (!runtime.backend.setKeyboard) { + throw new AppError('UNSUPPORTED_OPERATION', 'system.keyboard is not supported by this backend'); + } + const action = options.action ?? 'status'; + if (action !== 'status' && action !== 'get' && action !== 'dismiss') { + throw new AppError('INVALID_ARGS', 'system.keyboard action must be status, get, or dismiss'); + } + const state = await runtime.backend.setKeyboard(toBackendContext(runtime, options), { action }); + const formattedBackendResult = toBackendResult(state); + if (action === 'dismiss') { + const dismissed = isKeyboardResult(state) ? state.dismissed : undefined; + return { + kind: 'keyboardDismissed', + action, + state: isKeyboardResult(state) ? state : {}, + ...(formattedBackendResult ? { backendResult: formattedBackendResult } : {}), + ...successText(dismissed === false ? 'Keyboard already hidden' : 'Keyboard dismissed'), + }; + } + return { + kind: 'keyboardState', + action, + state: isKeyboardResult(state) ? state : {}, + ...(formattedBackendResult ? { backendResult: formattedBackendResult } : {}), + }; +}; + +export const clipboardCommand: RuntimeCommand< + SystemClipboardCommandOptions, + SystemClipboardCommandResult +> = async (runtime, options): Promise => { + if (options.action === 'read') { + if (!runtime.backend.getClipboard) { + throw new AppError( + 'UNSUPPORTED_OPERATION', + 'system.clipboard read is not supported by this backend', + ); + } + const result = await runtime.backend.getClipboard(toBackendContext(runtime, options)); + return { + kind: 'clipboardText', + action: 'read', + text: typeof result === 'string' ? result : result.text, + }; + } + + if (options.action !== 'write') { + throw new AppError('INVALID_ARGS', 'system.clipboard action must be read or write'); + } + if (!runtime.backend.setClipboard) { + throw new AppError( + 'UNSUPPORTED_OPERATION', + 'system.clipboard write is not supported by this backend', + ); + } + if (typeof options.text !== 'string') { + throw new AppError('INVALID_ARGS', 'system.clipboard write requires text'); + } + const backendResult = await runtime.backend.setClipboard( + toBackendContext(runtime, options), + options.text, + ); + const formattedBackendResult = toBackendResult(backendResult); + return { + kind: 'clipboardUpdated', + action: 'write', + textLength: Array.from(options.text).length, + ...(formattedBackendResult ? { backendResult: formattedBackendResult } : {}), + ...successText('Clipboard updated'), + }; +}; + +export const settingsCommand: RuntimeCommand< + SystemSettingsCommandOptions | undefined, + SystemSettingsCommandResult +> = async (runtime, options = {}): Promise => { + if (!runtime.backend.openSettings) { + throw new AppError('UNSUPPORTED_OPERATION', 'system.settings is not supported by this backend'); + } + const target = normalizeOptionalText(options.target, 'target'); + const backendResult = await runtime.backend.openSettings( + toBackendContext(runtime, options), + target, + ); + const formattedBackendResult = toBackendResult(backendResult); + return { + kind: 'settingsOpened', + ...(target ? { target } : {}), + ...(formattedBackendResult ? { backendResult: formattedBackendResult } : {}), + ...successText(target ? `Opened settings: ${target}` : 'Opened settings'), + }; +}; + +export const alertCommand: RuntimeCommand< + SystemAlertCommandOptions | undefined, + SystemAlertCommandResult +> = async (runtime, options = {}): Promise => { + if (!runtime.backend.handleAlert) { + throw new AppError('UNSUPPORTED_OPERATION', 'system.alert is not supported by this backend'); + } + const action = options.action ?? 'get'; + if (action !== 'get' && action !== 'accept' && action !== 'dismiss' && action !== 'wait') { + throw new AppError('INVALID_ARGS', 'system.alert action must be get, accept, dismiss, or wait'); + } + const timeoutMs = + options.timeoutMs === undefined + ? undefined + : requireIntInRange(options.timeoutMs, 'timeoutMs', 0, 120_000); + const result = await runtime.backend.handleAlert(toBackendContext(runtime, options), action, { + timeoutMs, + }); + return normalizeAlertResult(action, result); +}; + +export const appSwitcherCommand: RuntimeCommand< + SystemAppSwitcherCommandOptions | undefined, + SystemAppSwitcherCommandResult +> = async (runtime, options = {}): Promise => { + if (!runtime.backend.openAppSwitcher) { + throw new AppError( + 'UNSUPPORTED_OPERATION', + 'system.appSwitcher is not supported by this backend', + ); + } + const backendResult = await runtime.backend.openAppSwitcher(toBackendContext(runtime, options)); + const formattedBackendResult = toBackendResult(backendResult); + return { + kind: 'appSwitcherOpened', + ...(formattedBackendResult ? { backendResult: formattedBackendResult } : {}), + ...successText('Opened app switcher'), + }; +}; + +function requireOrientation(orientation: BackendDeviceOrientation): BackendDeviceOrientation { + switch (orientation) { + case 'portrait': + case 'portrait-upside-down': + case 'landscape-left': + case 'landscape-right': + return orientation; + default: + throw new AppError( + 'INVALID_ARGS', + 'system.rotate orientation must be portrait, portrait-upside-down, landscape-left, or landscape-right', + ); + } +} + +function normalizeAlertResult( + action: BackendAlertAction, + result: BackendAlertResult, +): SystemAlertCommandResult { + if (action === 'get') { + if (result.kind !== 'alertStatus') { + throw new AppError('COMMAND_FAILED', 'system.alert get returned an invalid backend result'); + } + return { kind: 'alertStatus', action, alert: result.alert }; + } + if (action === 'wait') { + if (result.kind !== 'alertWait') { + throw new AppError('COMMAND_FAILED', 'system.alert wait returned an invalid backend result'); + } + return { + kind: 'alertWait', + action, + alert: result.alert, + ...(result.waitedMs !== undefined ? { waitedMs: result.waitedMs } : {}), + ...(result.timedOut !== undefined ? { timedOut: result.timedOut } : {}), + ...successText(result.alert ? 'Alert visible' : 'Alert wait timed out'), + }; + } + if (result.kind !== 'alertHandled') { + throw new AppError( + 'COMMAND_FAILED', + `system.alert ${action} returned an invalid backend result`, + ); + } + return { + kind: 'alertHandled', + action, + handled: result.handled, + ...(result.alert ? { alert: result.alert } : {}), + ...(result.button ? { button: result.button } : {}), + ...successText(result.handled ? `Alert ${action}ed` : 'No alert handled'), + }; +} + +function normalizeOptionalText(value: string | undefined, field: string): string | undefined { + if (value === undefined) return undefined; + const text = value.trim(); + if (!text) throw new AppError('INVALID_ARGS', `${field} must be a non-empty string`); + return text; +} + +function isKeyboardResult(value: unknown): value is BackendKeyboardResult { + return Boolean(value && typeof value === 'object'); +} + +function toBackendResult(result: unknown): Record | undefined { + return result && typeof result === 'object' ? (result as Record) : undefined; +} diff --git a/src/index.ts b/src/index.ts index 753f37cb2..d16272db2 100644 --- a/src/index.ts +++ b/src/index.ts @@ -27,22 +27,34 @@ export type { AgentDeviceBackend, AgentDeviceBackendPlatform, BackendActionResult, + BackendAlertAction, + BackendAlertInfo, + BackendAlertResult, BackendAppEvent, BackendAppInfo, BackendAppListFilter, BackendAppState, + BackendBackOptions, BackendCapabilityName, BackendCapabilitySet, + BackendClipboardTextResult, BackendCommandContext, + BackendDeviceOrientation, BackendEscapeHatches, BackendFillOptions, BackendInstallTarget, BackendFindTextResult, + BackendKeyboardOptions, + BackendKeyboardResult, + BackendLongPressOptions, BackendOpenOptions, BackendOpenTarget, + BackendPinchOptions, BackendPushInput, BackendReadTextResult, BackendRunnerCommand, + BackendScrollOptions, + BackendScrollTarget, BackendSnapshotAnalysis, BackendSnapshotFreshness, BackendSnapshotOptions, @@ -50,6 +62,7 @@ export type { BackendScreenshotResult, BackendShellResult, BackendSnapshotResult, + BackendSwipeOptions, BackendTapOptions, } from './backend.ts'; diff --git a/src/testing/conformance.ts b/src/testing/conformance.ts index 0731da1b6..333b07f5c 100644 --- a/src/testing/conformance.ts +++ b/src/testing/conformance.ts @@ -15,6 +15,7 @@ export type CommandConformanceFixtures = { editableTarget: InteractionTarget; fillText: string; point: Point; + swipeTo: Point; }; export type CommandConformanceTarget = { @@ -75,6 +76,7 @@ export const defaultCommandConformanceFixtures: CommandConformanceFixtures = { editableTarget: selector('label=Email'), fillText: 'hello@example.com', point: { x: 4, y: 8 }, + swipeTo: { x: 24, y: 28 }, }; export const captureConformanceSuite = createCommandConformanceSuite({ @@ -207,6 +209,152 @@ export const interactionConformanceSuite = createCommandConformanceSuite({ assert.equal(result.text, fixtures.fillText); }, }, + { + name: 'focuses selector targets', + command: 'interactions.focus', + run: async (runtime, fixtures) => { + const result = await commands.interactions.focus(runtime, { + session: fixtures.session, + target: selector(fixtures.visibleSelector), + }); + assert.equal(result.kind, 'selector'); + }, + }, + { + name: 'long presses selector targets', + command: 'interactions.longPress', + run: async (runtime, fixtures) => { + const result = await commands.interactions.longPress(runtime, { + session: fixtures.session, + target: selector(fixtures.visibleSelector), + durationMs: 500, + }); + assert.equal(result.kind, 'selector'); + }, + }, + { + name: 'swipes explicit points', + command: 'interactions.swipe', + run: async (runtime, fixtures) => { + const result = await commands.interactions.swipe(runtime, { + session: fixtures.session, + from: fixtures.point, + to: fixtures.swipeTo, + }); + assert.deepEqual(result.from, fixtures.point); + }, + }, + { + name: 'scrolls viewport targets', + command: 'interactions.scroll', + run: async (runtime, fixtures) => { + const result = await commands.interactions.scroll(runtime, { + session: fixtures.session, + target: { kind: 'viewport' }, + direction: 'down', + }); + assert.equal(result.kind, 'viewport'); + }, + }, + { + name: 'pinches through the backend primitive', + command: 'interactions.pinch', + run: async (runtime) => { + const result = await commands.interactions.pinch(runtime, { + scale: 1.1, + }); + assert.equal(result.kind, 'pinch'); + }, + }, + ], +}); + +export const systemConformanceSuite = createCommandConformanceSuite({ + name: 'system', + cases: [ + { + name: 'presses back', + command: 'system.back', + run: async (runtime, fixtures) => { + const result = await commands.system.back(runtime, { + session: fixtures.session, + mode: 'in-app', + }); + assert.equal(result.kind, 'systemBack'); + }, + }, + { + name: 'presses home', + command: 'system.home', + run: async (runtime, fixtures) => { + const result = await commands.system.home(runtime, { session: fixtures.session }); + assert.equal(result.kind, 'systemHome'); + }, + }, + { + name: 'rotates devices', + command: 'system.rotate', + run: async (runtime, fixtures) => { + const result = await commands.system.rotate(runtime, { + session: fixtures.session, + orientation: 'portrait', + }); + assert.equal(result.orientation, 'portrait'); + }, + }, + { + name: 'reads keyboard state', + command: 'system.keyboard', + run: async (runtime, fixtures) => { + const result = await commands.system.keyboard(runtime, { + session: fixtures.session, + action: 'status', + }); + assert.equal(result.kind, 'keyboardState'); + }, + }, + { + name: 'reads clipboard text', + command: 'system.clipboard', + run: async (runtime, fixtures) => { + const result = await commands.system.clipboard(runtime, { + session: fixtures.session, + action: 'read', + }); + assert.equal(result.kind, 'clipboardText'); + }, + }, + { + name: 'opens settings', + command: 'system.settings', + run: async (runtime, fixtures) => { + const result = await commands.system.settings(runtime, { + session: fixtures.session, + }); + assert.equal(result.kind, 'settingsOpened'); + }, + }, + { + name: 'reads alert state', + command: 'system.alert', + run: async (runtime, fixtures) => { + const result = await commands.system.alert(runtime, { + session: fixtures.session, + action: 'get', + }); + assert.equal(result.kind, 'alertStatus'); + }, + }, + { + name: 'opens app switcher', + command: 'system.appSwitcher', + run: async (runtime, fixtures) => { + const result = await commands.system.appSwitcher(runtime, { + session: fixtures.session, + }); + assert.equal(result.kind, 'appSwitcherOpened'); + }, + }, ], }); @@ -290,6 +438,7 @@ export const commandConformanceSuites: readonly CommandConformanceSuite[] = [ captureConformanceSuite, selectorConformanceSuite, interactionConformanceSuite, + systemConformanceSuite, appsConformanceSuite, ]; diff --git a/website/docs/docs/commands.md b/website/docs/docs/commands.md index ad286f6d0..aa38d9872 100644 --- a/website/docs/docs/commands.md +++ b/website/docs/docs/commands.md @@ -491,6 +491,7 @@ agent-device clipboard write "" # clear clipboard ``` - `clipboard read` returns clipboard text for the selected target. +- Treat `clipboard read` output as sensitive data; it can include secrets copied by the user or app. - `clipboard write ` updates clipboard text on the selected target. - Works with an active session device or explicit selectors (`--platform`, `--device`, `--udid`, `--serial`). - Supported on macOS, Android emulator/device, and iOS simulator.