Skip to content

Commit 711e26b

Browse files
committed
Add gesture series controls for press and swipe
1 parent 37d1faf commit 711e26b

14 files changed

Lines changed: 349 additions & 11 deletions

File tree

README.md

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ The project is in early development and considered experimental. Pull requests a
1414

1515
## Features
1616
- Platforms: iOS (simulator + limited device support) and Android (emulator + device).
17-
- Core commands: `open`, `back`, `home`, `app-switcher`, `press`, `long-press`, `focus`, `type`, `fill`, `scroll`, `scrollintoview`, `wait`, `alert`, `screenshot`, `close`, `reinstall`.
17+
- Core commands: `open`, `back`, `home`, `app-switcher`, `press`, `long-press`, `swipe`, `focus`, `type`, `fill`, `scroll`, `scrollintoview`, `pinch`, `wait`, `alert`, `screenshot`, `close`, `reinstall`.
1818
- Inspection commands: `snapshot` (accessibility tree).
1919
- Device tooling: `adb` (Android), `simctl`/`devicectl` (iOS via Xcode).
2020
- Minimal dependencies; TypeScript executed directly on Node 22+ (no build step).
@@ -71,13 +71,21 @@ agent-device trace stop ./trace.log
7171
```
7272

7373
Coordinates:
74-
- All coordinate-based commands (`press`, `long-press`, `focus`, `fill`) use device coordinates with origin at top-left.
74+
- All coordinate-based commands (`press`, `long-press`, `swipe`, `focus`, `fill`) use device coordinates with origin at top-left.
7575
- X increases to the right, Y increases downward.
7676

77+
Gesture series examples:
78+
79+
```bash
80+
agent-device press 300 500 --count 12 --interval-ms 45
81+
agent-device press 300 500 --count 6 --hold-ms 120 --interval-ms 30 --jitter-px 2
82+
agent-device swipe 540 1500 540 500 120 --count 8 --pause-ms 30 --pattern ping-pong
83+
```
84+
7785
## Command Index
7886
- `boot`, `open`, `close`, `reinstall`, `home`, `back`, `app-switcher`
7987
- `snapshot`, `find`, `get`
80-
- `click`, `focus`, `type`, `fill`, `press`, `long-press`, `scroll`, `scrollintoview`, `is`
88+
- `click`, `focus`, `type`, `fill`, `press`, `long-press`, `swipe`, `scroll`, `scrollintoview`, `pinch`, `is`
8189
- `alert`, `wait`, `screenshot`
8290
- `trace start`, `trace stop`
8391
- `settings wifi|airplane|location on|off`
@@ -103,10 +111,20 @@ Flags:
103111
- `--serial <serial>` (Android)
104112
- `--activity <component>` (Android app launch only; package/Activity or package/.Activity; not for URL opens)
105113
- `--session <name>`
114+
- `--count <n>` repeat count for `press`/`swipe` (integer from 1 to 200; defaults to 1)
115+
- `--interval-ms <ms>` delay between `press` iterations
116+
- `--hold-ms <ms>` hold duration per `press` iteration
117+
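- `--jitter-px <n>` deterministic coordinate jitter for `press`: each iteration offsets the tap by up to `n` pixels per axis, following a fixed, repeating 9-step pattern (integer from 0 to 100; `0` disables jitter)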
118+
- `--pause-ms <ms>` delay between `swipe` iterations
119+
- `--pattern one-way|ping-pong` repeat pattern for `swipe`
106120
- `--verbose` for daemon and runner logs
107121
- `--json` for structured output
108122
- `--backend ax|xctest` (snapshot only; defaults to `xctest` on iOS)
109123

124+
Pinch:
125+
- `pinch` is supported on iOS simulators.
126+
- On Android, `pinch` currently returns `UNSUPPORTED_OPERATION` in the adb backend.
127+
110128
## Skills
111129
Install the automation skills listed in [SKILL.md](skills/agent-device/SKILL.md).
112130

ios-runner/AgentDeviceRunner/AgentDeviceRunnerUITests/RunnerTests.swift

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,13 @@ final class RunnerTests: XCTestCase {
251251
let duration = (command.durationMs ?? 800) / 1000.0
252252
longPressAt(app: activeApp, x: x, y: y, duration: duration)
253253
return Response(ok: true, data: DataPayload(message: "long pressed"))
254+
case .drag:
255+
guard let x = command.x, let y = command.y, let x2 = command.x2, let y2 = command.y2 else {
256+
return Response(ok: false, error: ErrorPayload(message: "drag requires x, y, x2, and y2"))
257+
}
258+
let holdDuration = min(max((command.durationMs ?? 250) / 1000.0, 0.016), 10.0)
259+
dragAt(app: activeApp, x: x, y: y, x2: x2, y2: y2, holdDuration: holdDuration)
260+
return Response(ok: true, data: DataPayload(message: "dragged"))
254261
case .type:
255262
guard let text = command.text else {
256263
return Response(ok: false, error: ErrorPayload(message: "type requires text"))
@@ -436,6 +443,20 @@ final class RunnerTests: XCTestCase {
436443
coordinate.press(forDuration: duration)
437444
}
438445

446+
/// Presses at (`x`, `y`), holds for `holdDuration`, then drags to (`x2`, `y2`).
///
/// Both points are built as offsets from the coordinate at the app's
/// normalized offset (0, 0), so they are measured from the app's top-left
/// corner (presumably in points — confirm against the caller's coordinate space).
///
/// - Parameters:
///   - app: Application whose coordinate space anchors the gesture.
///   - x: Horizontal offset of the press location.
///   - y: Vertical offset of the press location.
///   - x2: Horizontal offset of the drag destination.
///   - y2: Vertical offset of the drag destination.
///   - holdDuration: How long to hold the initial press before dragging.
private func dragAt(
  app: XCUIApplication,
  x: Double,
  y: Double,
  x2: Double,
  y2: Double,
  holdDuration: TimeInterval
) {
  let topLeft = app.coordinate(withNormalizedOffset: CGVector(dx: 0, dy: 0))
  let pressPoint = topLeft.withOffset(CGVector(dx: x, dy: y))
  let releasePoint = topLeft.withOffset(CGVector(dx: x2, dy: y2))
  pressPoint.press(forDuration: holdDuration, thenDragTo: releasePoint)
}
459+
439460
private func swipe(app: XCUIApplication, direction: SwipeDirection) {
440461
let target = app.windows.firstMatch.exists ? app.windows.firstMatch : app
441462
let start = target.coordinate(withNormalizedOffset: CGVector(dx: 0.5, dy: 0.2))
@@ -956,6 +977,7 @@ private func resolveRunnerPort() -> UInt16 {
956977
enum CommandType: String, Codable {
957978
case tap
958979
case longPress
980+
case drag
959981
case type
960982
case swipe
961983
case findText
@@ -984,6 +1006,8 @@ struct Command: Codable {
9841006
let action: String?
9851007
let x: Double?
9861008
let y: Double?
1009+
let x2: Double?
1010+
let y2: Double?
9871011
let durationMs: Double?
9881012
let direction: SwipeDirection?
9891013
let scale: Double?

skills/agent-device/SKILL.md

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -112,10 +112,14 @@ agent-device focus @e2
112112
agent-device fill @e2 "text" # Clear then type (Android: verifies value and retries once on mismatch)
113113
agent-device type "text" # Type into focused field without clearing
114114
agent-device press 300 500 # Tap by coordinates
115+
agent-device press 300 500 --count 12 --interval-ms 45
116+
agent-device press 300 500 --count 6 --hold-ms 120 --interval-ms 30 --jitter-px 2
117+
agent-device swipe 540 1500 540 500 120
118+
agent-device swipe 540 1500 540 500 120 --count 8 --pause-ms 30 --pattern ping-pong
115119
agent-device long-press 300 500 800 # Long press (where supported)
116120
agent-device scroll down 0.5
117-
agent-device pinch 2.0 # Zoom in 2x (iOS simulator + Android)
118-
agent-device pinch 0.5 200 400 # Zoom out at coordinates
121+
agent-device pinch 2.0 # Zoom in 2x (iOS simulator)
122+
agent-device pinch 0.5 200 400 # Zoom out at coordinates (iOS simulator)
119123
agent-device back
120124
agent-device home
121125
agent-device app-switcher
@@ -167,7 +171,9 @@ agent-device apps --platform android --user-installed
167171

168172
## Best practices
169173

170-
- Pinch (`pinch <scale> [x y]`) is supported on iOS simulators and Android; scale > 1 zooms in, < 1 zooms out. On Android, pinch uses multi-touch `sendevent` injection.
174+
- `press` supports gesture series controls: `--count`, `--interval-ms`, `--hold-ms`, `--jitter-px`.
175+
- `swipe` supports coordinate + timing controls and repeat patterns: `swipe x1 y1 x2 y2 [durationMs] --count --pause-ms --pattern`.
176+
- Pinch (`pinch <scale> [x y]`) is currently supported on iOS simulators only.
171177
- Snapshot refs are the core mechanism for interactive agent flows.
172178
- Use selectors for deterministic replay artifacts and assertions (e.g. in e2e test workflows).
173179
- Prefer `snapshot -i` to reduce output size.

src/core/__tests__/capabilities.test.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ test('iOS simulator + Android commands reject iOS devices', () => {
5252
'record',
5353
'screenshot',
5454
'scroll',
55+
'swipe',
5556
'settings',
5657
'snapshot',
5758
'type',

src/core/capabilities.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ const COMMAND_CAPABILITY_MATRIX: Record<string, CommandCapability> = {
3535
record: { ios: { simulator: true }, android: { emulator: true, device: true, unknown: true } },
3636
screenshot: { ios: { simulator: true }, android: { emulator: true, device: true, unknown: true } },
3737
scroll: { ios: { simulator: true }, android: { emulator: true, device: true, unknown: true } },
38+
swipe: { ios: { simulator: true }, android: { emulator: true, device: true, unknown: true } },
3839
settings: { ios: { simulator: true }, android: { emulator: true, device: true, unknown: true } },
3940
snapshot: { ios: { simulator: true }, android: { emulator: true, device: true, unknown: true } },
4041
type: { ios: { simulator: true }, android: { emulator: true, device: true, unknown: true } },

src/core/dispatch.ts

Lines changed: 89 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,12 @@ export type CommandFlags = {
3838
noRecord?: boolean;
3939
appsFilter?: 'launchable' | 'user-installed' | 'all';
4040
appsMetadata?: boolean;
41+
count?: number;
42+
intervalMs?: number;
43+
holdMs?: number;
44+
jitterPx?: number;
45+
pauseMs?: number;
46+
pattern?: 'one-way' | 'ping-pong';
4147
replayUpdate?: boolean;
4248
};
4349

@@ -91,6 +97,12 @@ export async function dispatchCommand(
9197
snapshotScope?: string;
9298
snapshotRaw?: boolean;
9399
snapshotBackend?: 'ax' | 'xctest';
100+
count?: number;
101+
intervalMs?: number;
102+
holdMs?: number;
103+
jitterPx?: number;
104+
pauseMs?: number;
105+
pattern?: 'one-way' | 'ping-pong';
94106
},
95107
): Promise<Record<string, unknown> | void> {
96108
const runnerCtx: RunnerContext = {
@@ -121,8 +133,48 @@ export async function dispatchCommand(
121133
case 'press': {
122134
const [x, y] = positionals.map(Number);
123135
if (Number.isNaN(x) || Number.isNaN(y)) throw new AppError('INVALID_ARGS', 'press requires x y');
124-
await interactor.tap(x, y);
125-
return { x, y };
136+
const count = requireIntInRange(context?.count ?? 1, 'count', 1, 200);
137+
const intervalMs = requireIntInRange(context?.intervalMs ?? 0, 'interval-ms', 0, 10_000);
138+
const holdMs = requireIntInRange(context?.holdMs ?? 0, 'hold-ms', 0, 10_000);
139+
const jitterPx = requireIntInRange(context?.jitterPx ?? 0, 'jitter-px', 0, 100);
140+
141+
for (let index = 0; index < count; index += 1) {
142+
const [dx, dy] = computeDeterministicJitter(index, jitterPx);
143+
const targetX = x + dx;
144+
const targetY = y + dy;
145+
if (holdMs > 0) await interactor.longPress(targetX, targetY, holdMs);
146+
else await interactor.tap(targetX, targetY);
147+
if (index < count - 1 && intervalMs > 0) await sleep(intervalMs);
148+
}
149+
150+
return { x, y, count, intervalMs, holdMs, jitterPx };
151+
}
152+
case 'swipe': {
153+
const x1 = Number(positionals[0]);
154+
const y1 = Number(positionals[1]);
155+
const x2 = Number(positionals[2]);
156+
const y2 = Number(positionals[3]);
157+
if ([x1, y1, x2, y2].some(Number.isNaN)) {
158+
throw new AppError('INVALID_ARGS', 'swipe requires x1 y1 x2 y2 [durationMs]');
159+
}
160+
161+
const rawDurationMs = positionals[4] ? Number(positionals[4]) : 250;
162+
const durationMs = requireIntInRange(rawDurationMs, 'durationMs', 16, 10_000);
163+
const count = requireIntInRange(context?.count ?? 1, 'count', 1, 200);
164+
const pauseMs = requireIntInRange(context?.pauseMs ?? 0, 'pause-ms', 0, 10_000);
165+
const pattern = context?.pattern ?? 'one-way';
166+
if (pattern !== 'one-way' && pattern !== 'ping-pong') {
167+
throw new AppError('INVALID_ARGS', `Invalid pattern: ${pattern}`);
168+
}
169+
170+
for (let index = 0; index < count; index += 1) {
171+
const reverse = pattern === 'ping-pong' && index % 2 === 1;
172+
if (reverse) await interactor.swipe(x2, y2, x1, y1, durationMs);
173+
else await interactor.swipe(x1, y1, x2, y2, durationMs);
174+
if (index < count - 1 && pauseMs > 0) await sleep(pauseMs);
175+
}
176+
177+
return { x1, y1, x2, y2, durationMs, count, pauseMs, pattern };
126178
}
127179
case 'long-press': {
128180
const x = Number(positionals[0]);
@@ -171,6 +223,12 @@ export async function dispatchCommand(
171223
return { text };
172224
}
173225
case 'pinch': {
226+
if (device.platform === 'android') {
227+
throw new AppError(
228+
'UNSUPPORTED_OPERATION',
229+
'Android pinch is not supported in current adb backend; requires instrumentation-based backend.',
230+
);
231+
}
174232
const scale = Number(positionals[0]);
175233
const x = positionals[1] ? Number(positionals[1]) : undefined;
176234
const y = positionals[2] ? Number(positionals[2]) : undefined;
@@ -280,3 +338,32 @@ export async function dispatchCommand(
280338
throw new AppError('INVALID_ARGS', `Unknown command: ${command}`);
281339
}
282340
}
341+
342+
/**
 * Unit offsets `[dx, dy]` cycled through by repeated `press` iterations.
 * Order: no offset, the four axis-aligned neighbours, then the four diagonals.
 * Each entry is scaled by the requested jitter distance before use.
 */
const DETERMINISTIC_JITTER_PATTERN: ReadonlyArray<readonly [number, number]> = [
  // centre
  [0, 0],
  // axis-aligned steps
  [1, 0], [0, 1], [-1, 0], [0, -1],
  // diagonal steps
  [1, 1], [-1, 1], [1, -1], [-1, -1],
];
353+
354+
/**
 * Validates that `value` is an integer within the inclusive range [`min`, `max`].
 *
 * @param value Candidate number (may be NaN when parsed from user input).
 * @param name Option name used in the error message.
 * @param min Smallest accepted value.
 * @param max Largest accepted value.
 * @returns `value` unchanged when it is valid.
 * @throws AppError with code `INVALID_ARGS` when `value` is not an integer or is out of range.
 */
function requireIntInRange(value: number, name: string, min: number, max: number): number {
  // Number.isInteger is already false for NaN and ±Infinity, so it covers the finiteness check too.
  const withinBounds = value >= min && value <= max;
  if (Number.isInteger(value) && withinBounds) {
    return value;
  }
  throw new AppError('INVALID_ARGS', `${name} must be an integer between ${min} and ${max}`);
}
360+
361+
/**
 * Returns the `[dx, dy]` offset to apply to iteration `index` of a repeated press.
 *
 * The offset is the pattern entry at `index` (wrapping around the pattern length)
 * scaled by `jitterPx`. A non-positive `jitterPx` disables jitter and yields `[0, 0]`.
 */
function computeDeterministicJitter(index: number, jitterPx: number): [number, number] {
  const jitterDisabled = jitterPx <= 0;
  if (jitterDisabled) {
    return [0, 0];
  }
  const slot = index % DETERMINISTIC_JITTER_PATTERN.length;
  const unitOffset = DETERMINISTIC_JITTER_PATTERN[slot];
  return [unitOffset[0] * jitterPx, unitOffset[1] * jitterPx];
}
366+
367+
/** Resolves once a timer of `ms` milliseconds has fired. */
async function sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}

src/daemon/context.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,12 @@ export type DaemonCommandContext = {
1212
snapshotScope?: string;
1313
snapshotBackend?: 'ax' | 'xctest';
1414
snapshotRaw?: boolean;
15+
count?: number;
16+
intervalMs?: number;
17+
holdMs?: number;
18+
jitterPx?: number;
19+
pauseMs?: number;
20+
pattern?: 'one-way' | 'ping-pong';
1521
};
1622

1723
export function contextFromFlags(
@@ -32,5 +38,11 @@ export function contextFromFlags(
3238
snapshotScope: flags?.snapshotScope,
3339
snapshotRaw: flags?.snapshotRaw,
3440
snapshotBackend: flags?.snapshotBackend,
41+
count: flags?.count,
42+
intervalMs: flags?.intervalMs,
43+
holdMs: flags?.holdMs,
44+
jitterPx: flags?.jitterPx,
45+
pauseMs: flags?.pauseMs,
46+
pattern: flags?.pattern,
3547
};
3648
}

src/platforms/android/__tests__/index.test.ts

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
import test from 'node:test';
22
import assert from 'node:assert/strict';
3-
import { openAndroidApp, parseAndroidLaunchComponent } from '../index.ts';
3+
import { promises as fs } from 'node:fs';
4+
import os from 'node:os';
5+
import path from 'node:path';
6+
import { openAndroidApp, parseAndroidLaunchComponent, swipeAndroid } from '../index.ts';
47
import type { DeviceInfo } from '../../../utils/device.ts';
58
import { AppError } from '../../../utils/errors.ts';
69
import { findBounds, parseUiHierarchy } from '../ui-hierarchy.ts';
@@ -110,3 +113,45 @@ test('openAndroidApp rejects activity override for deep link URLs', async () =>
110113
},
111114
);
112115
});
116+
117+
// Verifies the exact argv that swipeAndroid hands to `adb` by putting a stub
// `adb` script first on PATH; the stub writes each argument on its own line
// to the file named by AGENT_DEVICE_TEST_ARGS_FILE.
test('swipeAndroid invokes adb input swipe with duration', async () => {
  // Capture the environment before touching it so it can be restored exactly.
  const previousPath = process.env.PATH;
  const previousArgsFile = process.env.AGENT_DEVICE_TEST_ARGS_FILE;

  const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), 'agent-device-swipe-test-'));
  const adbPath = path.join(tmpDir, 'adb');
  const argsLogPath = path.join(tmpDir, 'args.log');

  const device: DeviceInfo = {
    platform: 'android',
    id: 'emulator-5554',
    name: 'Pixel',
    kind: 'emulator',
    booted: true,
  };

  // Everything after the temp directory exists runs inside try/finally so the
  // directory is removed even when writing the stub script fails.
  try {
    await fs.writeFile(
      adbPath,
      '#!/bin/sh\nprintf "%s\\n" "$@" > "$AGENT_DEVICE_TEST_ARGS_FILE"\nexit 0\n',
      'utf8',
    );
    await fs.chmod(adbPath, 0o755);

    process.env.PATH = `${tmpDir}${path.delimiter}${previousPath ?? ''}`;
    process.env.AGENT_DEVICE_TEST_ARGS_FILE = argsLogPath;

    await swipeAndroid(device, 10, 20, 30, 40, 250);
    const args = (await fs.readFile(argsLogPath, 'utf8'))
      .trim()
      .split('\n')
      .filter(Boolean);
    assert.deepEqual(args, ['-s', 'emulator-5554', 'shell', 'input', 'swipe', '10', '20', '30', '40', '250']);
  } finally {
    // Assigning `undefined` to a process.env key stores the string "undefined",
    // so a variable that was originally unset must be deleted instead.
    if (previousPath === undefined) {
      delete process.env.PATH;
    } else {
      process.env.PATH = previousPath;
    }
    if (previousArgsFile === undefined) {
      delete process.env.AGENT_DEVICE_TEST_ARGS_FILE;
    } else {
      process.env.AGENT_DEVICE_TEST_ARGS_FILE = previousArgsFile;
    }
    await fs.rm(tmpDir, { recursive: true, force: true });
  }
});

src/platforms/android/index.ts

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,29 @@ export async function pressAndroid(device: DeviceInfo, x: number, y: number): Pr
333333
await runCmd('adb', adbArgs(device, ['shell', 'input', 'tap', String(x), String(y)]));
334334
}
335335

336+
/**
 * Swipes from (`x1`, `y1`) to (`x2`, `y2`) by running `adb shell input swipe`.
 *
 * @param device Target device; passed to `adbArgs` to build the final adb argument list.
 * @param x1 Start X coordinate.
 * @param y1 Start Y coordinate.
 * @param x2 End X coordinate.
 * @param y2 End Y coordinate.
 * @param durationMs Swipe duration in milliseconds, forwarded as the last `input swipe` argument (default 250).
 */
export async function swipeAndroid(
  device: DeviceInfo,
  x1: number,
  y1: number,
  x2: number,
  y2: number,
  durationMs = 250,
): Promise<void> {
  // adb receives every numeric value as its decimal string form.
  const gestureArgs = [x1, y1, x2, y2, durationMs].map((value) => String(value));
  const shellCommand = ['shell', 'input', 'swipe', ...gestureArgs];
  await runCmd('adb', adbArgs(device, shellCommand));
}
358+
336359
export async function backAndroid(device: DeviceInfo): Promise<void> {
337360
await runCmd('adb', adbArgs(device, ['shell', 'input', 'keyevent', '4']));
338361
}

0 commit comments

Comments
 (0)