Skip to content

Commit afcd375

Browse files
authored
feat: add runtime system and gesture commands (#415)
1 parent 2c41225 commit afcd375

File tree

17 files changed

+2237
-305
lines changed

17 files changed

+2237
-305
lines changed

COMMAND_OWNERSHIP.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,29 @@ Their semantics should live in `agent-device/commands` as they migrate.
6969
local file inputs remain command-policy gated.
7070
- `trigger-app-event`: runtime `apps.triggerEvent` implemented with event name
7171
and JSON payload validation.
72+
- `back`: runtime `system.back` implemented with typed in-app/system modes.
73+
- `home`: runtime `system.home` implemented.
74+
- `rotate`: runtime `system.rotate` implemented with explicit orientation
75+
validation.
76+
- `keyboard`: runtime `system.keyboard` implemented with explicit status/get
77+
and dismiss result shapes.
78+
- `clipboard`: runtime `system.clipboard` implemented with read/write result
79+
unions.
80+
- `settings`: runtime `system.settings` implemented as a typed settings-open
81+
primitive.
82+
- `alert`: runtime `system.alert` implemented with explicit status, handled,
83+
and wait result unions.
84+
- `app-switcher`: runtime `system.appSwitcher` implemented.
85+
- `focus`: runtime `interactions.focus` implemented for point, ref, and
86+
selector targets.
87+
- `longpress`: runtime `interactions.longPress` implemented for point, ref, and
88+
selector targets.
89+
- `swipe`: runtime `interactions.swipe` implemented with point, ref, selector,
90+
and viewport-derived directional starts.
91+
- `scroll`: runtime `interactions.scroll` implemented with viewport, point, ref,
92+
and selector targets.
93+
- `pinch`: runtime `interactions.pinch` implemented behind the typed backend
94+
primitive.
7295

7396
## Boundary Requirements
7497

skills/agent-device/references/macos-desktop.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ Use `snapshot --raw --platform macos` only when debugging AX structure or collec
7575
Things not to rely on:
7676

7777
- Mobile-only helpers such as `install`, `reinstall`, or `push`.
78-
- Desktop-global click or fill parity from `desktop` or `menubar` sessions.
78+
- Desktop-global click, fill, or gesture parity from `desktop` or `menubar` sessions.
7979
- Raw coordinate assumptions across runs.
8080

8181
Troubleshooting:

src/__tests__/runtime-conformance.test.ts

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,19 @@ test('command conformance suites run against a fixture backend', async () => {
3131
assert.equal(calls.includes('tap'), true);
3232
assert.equal(calls.includes('fill'), true);
3333
assert.equal(calls.includes('typeText'), true);
34+
assert.equal(calls.includes('focus'), true);
35+
assert.equal(calls.includes('longPress'), true);
36+
assert.equal(calls.includes('swipe'), true);
37+
assert.equal(calls.includes('scroll'), true);
38+
assert.equal(calls.includes('pinch'), true);
39+
assert.equal(calls.includes('pressBack'), true);
40+
assert.equal(calls.includes('pressHome'), true);
41+
assert.equal(calls.includes('rotate'), true);
42+
assert.equal(calls.includes('setKeyboard'), true);
43+
assert.equal(calls.includes('getClipboard'), true);
44+
assert.equal(calls.includes('openSettings'), true);
45+
assert.equal(calls.includes('handleAlert'), true);
46+
assert.equal(calls.includes('openAppSwitcher'), true);
3447
assert.equal(calls.includes('openApp'), true);
3548
assert.equal(calls.includes('closeApp'), true);
3649
assert.equal(calls.includes('listApps'), true);
@@ -78,6 +91,48 @@ function createFixtureBackend(calls: string[]): AgentDeviceBackend {
7891
typeText: async () => {
7992
calls.push('typeText');
8093
},
94+
focus: async () => {
95+
calls.push('focus');
96+
},
97+
longPress: async () => {
98+
calls.push('longPress');
99+
},
100+
swipe: async () => {
101+
calls.push('swipe');
102+
},
103+
scroll: async () => {
104+
calls.push('scroll');
105+
},
106+
pinch: async () => {
107+
calls.push('pinch');
108+
},
109+
pressBack: async () => {
110+
calls.push('pressBack');
111+
},
112+
pressHome: async () => {
113+
calls.push('pressHome');
114+
},
115+
rotate: async () => {
116+
calls.push('rotate');
117+
},
118+
setKeyboard: async (_context, options) => {
119+
calls.push('setKeyboard');
120+
return { action: options.action, visible: false };
121+
},
122+
getClipboard: async () => {
123+
calls.push('getClipboard');
124+
return { text: 'copied' };
125+
},
126+
openSettings: async () => {
127+
calls.push('openSettings');
128+
},
129+
handleAlert: async () => {
130+
calls.push('handleAlert');
131+
return { kind: 'alertStatus', alert: null };
132+
},
133+
openAppSwitcher: async () => {
134+
calls.push('openAppSwitcher');
135+
},
81136
openApp: async () => {
82137
calls.push('openApp');
83138
},

src/__tests__/runtime-interactions.test.ts

Lines changed: 240 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,195 @@ test('runtime typeText validates refs and forwards text to the backend primitive
193193
);
194194
});
195195

196+
// Focus is driven through a selector target and long-press through a ref
// target; the assertions below expect both to reach the backend with the same
// point ({ x: 60, y: 40 }).
test('runtime focus and longPress share selector/ref target resolution', async () => {
  const recorded: unknown[] = [];
  const device = createInteractionDevice(selectorSnapshot(), {
    focus: async (_context, point) => {
      recorded.push({ command: 'focus', point });
      return { focused: true };
    },
    longPress: async (_context, point, options) => {
      const durationMs = options?.durationMs;
      recorded.push({ command: 'longPress', point, durationMs });
    },
  });

  const focusResult = await device.interactions.focus(selector('label=Continue'), {
    session: 'default',
  });
  const longPressResult = await device.interactions.longPress(ref('@e1'), {
    session: 'default',
    durationMs: 750,
  });

  // The result kind reflects how the target was addressed by the caller.
  assert.equal(focusResult.kind, 'selector');
  assert.deepEqual(focusResult.backendResult, { focused: true });
  assert.equal(longPressResult.kind, 'ref');

  const expectedCalls = [
    { command: 'focus', point: { x: 60, y: 40 } },
    { command: 'longPress', point: { x: 60, y: 40 }, durationMs: 750 },
  ];
  assert.deepEqual(recorded, expectedCalls);
});
224+
225+
// Scroll is exercised twice: once with a selector target (expected to arrive
// at the backend as a concrete point) and once with no target at all
// (expected to arrive as a viewport target).
test('runtime scroll resolves selector targets before calling the backend primitive', async () => {
  const backendCalls: unknown[] = [];
  const device = createInteractionDevice(selectorSnapshot(), {
    scroll: async (_context, target, options) => {
      backendCalls.push({ target, options });
      return { scrolled: true };
    },
  });

  const fromSelector = await device.interactions.scroll({
    session: 'default',
    target: selector('label=Continue'),
    direction: 'down',
    pixels: 120,
  });
  const fromViewport = await device.interactions.scroll({
    direction: 'up',
    amount: 0.5,
  });

  assert.equal(fromSelector.kind, 'selector');
  assert.equal(fromViewport.kind, 'viewport');

  const expectedSelectorCall = {
    target: { kind: 'point', point: { x: 60, y: 40 } },
    options: { direction: 'down', pixels: 120 },
  };
  const expectedViewportCall = {
    target: { kind: 'viewport' },
    options: { direction: 'up', amount: 0.5 },
  };
  assert.deepEqual(backendCalls, [expectedSelectorCall, expectedViewportCall]);
});
258+
259+
// Covers both swipe call shapes: explicit from/to endpoints, and a
// direction + distance pair with no explicit start point.
test('runtime swipe supports explicit and viewport-derived targets', async () => {
  const backendCalls: unknown[] = [];
  const device = createInteractionDevice(selectorSnapshot(), {
    swipe: async (_context, from, to, options) => {
      const durationMs = options?.durationMs;
      backendCalls.push({ from, to, durationMs });
    },
  });

  const explicitSwipe = await device.interactions.swipe({
    from: selector('label=Continue'),
    to: { x: 200, y: 40 },
    durationMs: 300,
    session: 'default',
  });
  const directionalSwipe = await device.interactions.swipe({
    direction: 'left',
    distance: 25,
    session: 'default',
  });

  assert.deepEqual(explicitSwipe.from, { x: 60, y: 40 });
  // A leftward swipe of 25 is expected to end 25 units left of its start.
  assert.deepEqual(directionalSwipe.from, { x: 60, y: 40 });
  assert.deepEqual(directionalSwipe.to, { x: 35, y: 40 });

  const expectedCalls = [
    { from: { x: 60, y: 40 }, to: { x: 200, y: 40 }, durationMs: 300 },
    { from: { x: 60, y: 40 }, to: { x: 35, y: 40 }, durationMs: undefined },
  ];
  assert.deepEqual(backendCalls, expectedCalls);
});
287+
288+
// Uses a snapshot that contains a node far below the root rect; the expected
// start point is { x: 50, y: 50 }, so that node must not influence where the
// directional swipe begins.
test('runtime directional swipe uses the visible viewport instead of off-screen content bounds', async () => {
  const backendCalls: unknown[] = [];
  const device = createInteractionDevice(snapshotWithOffscreenContent(), {
    swipe: async (_context, from, to) => {
      backendCalls.push({ from, to });
    },
  });

  const swipeResult = await device.interactions.swipe({
    direction: 'left',
    distance: 25,
    session: 'default',
  });

  assert.deepEqual(swipeResult.from, { x: 50, y: 50 });
  assert.deepEqual(swipeResult.to, { x: 25, y: 50 });

  const expectedCalls = [{ from: { x: 50, y: 50 }, to: { x: 25, y: 50 } }];
  assert.deepEqual(backendCalls, expectedCalls);
});
306+
307+
// For each macOS surface listed below, every gesture command must reject with
// a "not supported" error. The backend primitives throw if they are ever
// reached, so a missing guard shows up as the wrong error message.
test('runtime viewport gestures reject inspect-only macOS surfaces', async () => {
  for (const surface of ['desktop', 'menubar'] as const) {
    const device = createInteractionDevice(selectorSnapshot(), {
      platform: 'macos',
      sessionMetadata: { surface },
      scroll: async () => {
        throw new Error(`${surface} scroll should be rejected before backend call`);
      },
      swipe: async () => {
        throw new Error(`${surface} swipe should be rejected before backend call`);
      },
      pinch: async () => {
        throw new Error(`${surface} pinch should be rejected before backend call`);
      },
    });

    // Each entry pairs a gesture invocation with the rejection message it
    // must produce; entries are checked sequentially in this order.
    const rejections: Array<[() => Promise<unknown>, RegExp]> = [
      [
        () =>
          device.interactions.scroll({
            direction: 'down',
            target: { kind: 'viewport' },
            session: 'default',
          }),
        new RegExp(`scroll is not supported on macOS ${surface}`),
      ],
      [
        () =>
          device.interactions.swipe({
            direction: 'left',
            session: 'default',
          }),
        new RegExp(`swipe is not supported on macOS ${surface}`),
      ],
      [
        () =>
          device.interactions.swipe({
            from: { x: 10, y: 20 },
            to: { x: 30, y: 20 },
            session: 'default',
          }),
        new RegExp(`swipe is not supported on macOS ${surface}`),
      ],
      [
        () =>
          device.interactions.pinch({
            scale: 1.2,
            session: 'default',
          }),
        new RegExp(`pinch is not supported on macOS ${surface}`),
      ],
    ];

    for (const [attempt, expectedMessage] of rejections) {
      await assert.rejects(attempt, expectedMessage);
    }
  }
});
359+
360+
// Two scenarios: a device built without a pinch primitive must reject, and a
// device with one must receive the ref-based center as a concrete point.
test('runtime pinch is backend-gated and resolves optional center targets', async () => {
  const pinchCalls: unknown[] = [];

  // No pinch override is supplied here, so the command should be refused.
  const deviceWithoutPinch = createInteractionDevice(selectorSnapshot());
  const attemptUnsupportedPinch = () => deviceWithoutPinch.interactions.pinch({ scale: 1.2 });
  await assert.rejects(attemptUnsupportedPinch, /pinch is not supported/);

  const deviceWithPinch = createInteractionDevice(selectorSnapshot(), {
    pinch: async (_context, options) => {
      pinchCalls.push(options);
    },
  });

  const pinchResult = await deviceWithPinch.interactions.pinch({
    scale: 0.8,
    center: ref('@e1'),
    session: 'default',
  });

  const expectedCenter = { x: 60, y: 40 };
  assert.equal(pinchResult.kind, 'pinch');
  assert.deepEqual(pinchResult.center, expectedCenter);
  assert.deepEqual(pinchCalls, [{ scale: 0.8, center: { x: 60, y: 40 } }]);
});
384+
196385
test('runtime interaction commands are available from the command namespace', async () => {
197386
const device = createInteractionDevice(selectorSnapshot(), {
198387
tap: async () => {},
@@ -233,9 +422,52 @@ function fillableSnapshot(): SnapshotState {
233422
]);
234423
}
235424

425+
/**
 * Builds a snapshot fixture with a 100x100 `Application` root and two hittable
 * child buttons: one inside the root rect and one positioned at y=900, well
 * outside it.
 */
function snapshotWithOffscreenContent(): SnapshotState {
  const applicationRoot = {
    index: 0,
    depth: 0,
    type: 'Application',
    label: 'Example',
    rect: { x: 0, y: 0, width: 100, height: 100 },
  };
  const visibleButton = {
    index: 1,
    depth: 1,
    parentIndex: 0,
    type: 'Button',
    label: 'Visible',
    rect: { x: 10, y: 10, width: 20, height: 20 },
    hittable: true,
  };
  // Same size as the visible button, but placed far below the root's bounds.
  const offscreenButton = {
    index: 2,
    depth: 1,
    parentIndex: 0,
    type: 'Button',
    label: 'Offscreen',
    rect: { x: 10, y: 900, width: 20, height: 20 },
    hittable: true,
  };

  return makeSnapshotState([applicationRoot, visibleButton, offscreenButton]);
}
454+
236455
function createInteractionDevice(
237456
snapshot: SnapshotState,
238-
overrides: Partial<Pick<AgentDeviceBackend, 'captureSnapshot' | 'tap' | 'fill' | 'typeText'>> & {
457+
overrides: Partial<
458+
Pick<
459+
AgentDeviceBackend,
460+
| 'captureSnapshot'
461+
| 'tap'
462+
| 'fill'
463+
| 'typeText'
464+
| 'focus'
465+
| 'longPress'
466+
| 'scroll'
467+
| 'swipe'
468+
| 'pinch'
469+
>
470+
> & {
239471
platform?: AgentDeviceBackend['platform'];
240472
sessionMetadata?: Record<string, unknown>;
241473
} = {},
@@ -248,6 +480,13 @@ function createInteractionDevice(
248480
tap: async (...args) => await overrides.tap?.(...args),
249481
fill: async (...args) => await overrides.fill?.(...args),
250482
typeText: async (...args) => await overrides.typeText?.(...args),
483+
focus: overrides.focus ? async (...args) => await overrides.focus?.(...args) : undefined,
484+
longPress: overrides.longPress
485+
? async (...args) => await overrides.longPress?.(...args)
486+
: undefined,
487+
scroll: overrides.scroll ? async (...args) => await overrides.scroll?.(...args) : undefined,
488+
swipe: overrides.swipe ? async (...args) => await overrides.swipe?.(...args) : undefined,
489+
pinch: overrides.pinch ? async (...args) => await overrides.pinch?.(...args) : undefined,
251490
} satisfies AgentDeviceBackend,
252491
artifacts: createLocalArtifactAdapter(),
253492
sessions: createMemorySessionStore([

0 commit comments

Comments
 (0)