Skip to content

Commit 64d27c1

Browse files
authored
Add consolidated gesture controls for press and swipe (#54)
* Add gesture series controls for press and swipe
* Normalize iOS swipe timing to safe duration
* Harden daemon startup for integration runs
1 parent f1eca35 commit 64d27c1

15 files changed

Lines changed: 382 additions & 17 deletions

File tree

README.md

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ The project is in early development and considered experimental. Pull requests a
1414

1515
## Features
1616
- Platforms: iOS (simulator + limited device support) and Android (emulator + device).
17-
- Core commands: `open`, `back`, `home`, `app-switcher`, `press`, `long-press`, `focus`, `type`, `fill`, `scroll`, `scrollintoview`, `wait`, `alert`, `screenshot`, `close`, `reinstall`.
17+
- Core commands: `open`, `back`, `home`, `app-switcher`, `press`, `long-press`, `swipe`, `focus`, `type`, `fill`, `scroll`, `scrollintoview`, `pinch`, `wait`, `alert`, `screenshot`, `close`, `reinstall`.
1818
- Inspection commands: `snapshot` (accessibility tree).
1919
- Device tooling: `adb` (Android), `simctl`/`devicectl` (iOS via Xcode).
2020
- Minimal dependencies; TypeScript executed directly on Node 22+ (no build step).
@@ -71,13 +71,21 @@ agent-device trace stop ./trace.log
7171
```
7272

7373
Coordinates:
74-
- All coordinate-based commands (`press`, `long-press`, `focus`, `fill`) use device coordinates with origin at top-left.
74+
- All coordinate-based commands (`press`, `long-press`, `swipe`, `focus`, `fill`) use device coordinates with origin at top-left.
7575
- X increases to the right, Y increases downward.
7676

77+
Gesture series examples:
78+
79+
```bash
80+
agent-device press 300 500 --count 12 --interval-ms 45
81+
agent-device press 300 500 --count 6 --hold-ms 120 --interval-ms 30 --jitter-px 2
82+
agent-device swipe 540 1500 540 500 120 --count 8 --pause-ms 30 --pattern ping-pong
83+
```
84+
7785
## Command Index
7886
- `boot`, `open`, `close`, `reinstall`, `home`, `back`, `app-switcher`
7987
- `snapshot`, `find`, `get`
80-
- `click`, `focus`, `type`, `fill`, `press`, `long-press`, `scroll`, `scrollintoview`, `is`
88+
- `click`, `focus`, `type`, `fill`, `press`, `long-press`, `swipe`, `scroll`, `scrollintoview`, `pinch`, `is`
8189
- `alert`, `wait`, `screenshot`
8290
- `trace start`, `trace stop`
8391
- `settings wifi|airplane|location on|off`
@@ -103,10 +111,25 @@ Flags:
103111
- `--serial <serial>` (Android)
104112
- `--activity <component>` (Android app launch only; package/Activity or package/.Activity; not for URL opens)
105113
- `--session <name>`
114+
- `--count <n>` repeat count for `press`/`swipe`
115+
- `--interval-ms <ms>` delay between `press` iterations
116+
- `--hold-ms <ms>` hold duration per `press` iteration (values above `0` perform a long press instead of a tap)
117+
- `--jitter-px <n>` deterministic coordinate jitter for `press` (each iteration is offset by `0` or `±n` pixels per axis, following a fixed repeating pattern)
118+
- `--pause-ms <ms>` delay between `swipe` iterations
119+
- `--pattern one-way|ping-pong` repeat pattern for `swipe`
106120
- `--verbose` for daemon and runner logs
107121
- `--json` for structured output
108122
- `--backend ax|xctest` (snapshot only; defaults to `xctest` on iOS)
109123

124+
Pinch:
125+
- `pinch` is supported on iOS simulators.
126+
- On Android, `pinch` currently returns `UNSUPPORTED_OPERATION` in the adb backend.
127+
128+
Swipe timing:
129+
- `swipe` accepts optional `durationMs` (default `250`, range `16..10000`).
130+
- Android uses requested swipe duration directly.
131+
- iOS validates the requested duration but always uses a fixed, safe duration of 60 ms to avoid long-press side effects.
132+
110133
## Skills
111134
Install the automation skills listed in [SKILL.md](skills/agent-device/SKILL.md).
112135

ios-runner/AgentDeviceRunner/AgentDeviceRunnerUITests/RunnerTests.swift

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,13 @@ final class RunnerTests: XCTestCase {
251251
let duration = (command.durationMs ?? 800) / 1000.0
252252
longPressAt(app: activeApp, x: x, y: y, duration: duration)
253253
return Response(ok: true, data: DataPayload(message: "long pressed"))
254+
case .drag:
255+
guard let x = command.x, let y = command.y, let x2 = command.x2, let y2 = command.y2 else {
256+
return Response(ok: false, error: ErrorPayload(message: "drag requires x, y, x2, and y2"))
257+
}
258+
let holdDuration = min(max((command.durationMs ?? 60) / 1000.0, 0.016), 10.0)
259+
dragAt(app: activeApp, x: x, y: y, x2: x2, y2: y2, holdDuration: holdDuration)
260+
return Response(ok: true, data: DataPayload(message: "dragged"))
254261
case .type:
255262
guard let text = command.text else {
256263
return Response(ok: false, error: ErrorPayload(message: "type requires text"))
@@ -436,6 +443,20 @@ final class RunnerTests: XCTestCase {
436443
coordinate.press(forDuration: duration)
437444
}
438445

446+
/// Presses at the start point for `holdDuration`, then drags to the end point.
///
/// - Parameters:
///   - app: Application whose coordinate space the offsets are resolved in.
///   - x: Horizontal offset of the drag start, measured from the app's origin.
///   - y: Vertical offset of the drag start, measured from the app's origin.
///   - x2: Horizontal offset of the drag end, measured from the app's origin.
///   - y2: Vertical offset of the drag end, measured from the app's origin.
///   - holdDuration: How long to press at the start point before dragging.
private func dragAt(
447+
app: XCUIApplication,
448+
x: Double,
449+
y: Double,
450+
x2: Double,
451+
y2: Double,
452+
holdDuration: TimeInterval
453+
) {
454+
// Anchor at normalized offset (0, 0) so the absolute offsets below are relative to the app's origin.
let origin = app.coordinate(withNormalizedOffset: CGVector(dx: 0, dy: 0))
455+
let start = origin.withOffset(CGVector(dx: x, dy: y))
456+
let end = origin.withOffset(CGVector(dx: x2, dy: y2))
457+
// Single press-then-drag gesture from start to end.
start.press(forDuration: holdDuration, thenDragTo: end)
458+
}
459+
439460
private func swipe(app: XCUIApplication, direction: SwipeDirection) {
440461
let target = app.windows.firstMatch.exists ? app.windows.firstMatch : app
441462
let start = target.coordinate(withNormalizedOffset: CGVector(dx: 0.5, dy: 0.2))
@@ -956,6 +977,7 @@ private func resolveRunnerPort() -> UInt16 {
956977
enum CommandType: String, Codable {
957978
case tap
958979
case longPress
980+
case drag
959981
case type
960982
case swipe
961983
case findText
@@ -984,6 +1006,8 @@ struct Command: Codable {
9841006
let action: String?
9851007
let x: Double?
9861008
let y: Double?
1009+
let x2: Double?
1010+
let y2: Double?
9871011
let durationMs: Double?
9881012
let direction: SwipeDirection?
9891013
let scale: Double?

skills/agent-device/SKILL.md

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -112,10 +112,14 @@ agent-device focus @e2
112112
agent-device fill @e2 "text" # Clear then type (Android: verifies value and retries once on mismatch)
113113
agent-device type "text" # Type into focused field without clearing
114114
agent-device press 300 500 # Tap by coordinates
115+
agent-device press 300 500 --count 12 --interval-ms 45
116+
agent-device press 300 500 --count 6 --hold-ms 120 --interval-ms 30 --jitter-px 2
117+
agent-device swipe 540 1500 540 500 120
118+
agent-device swipe 540 1500 540 500 120 --count 8 --pause-ms 30 --pattern ping-pong
115119
agent-device long-press 300 500 800 # Long press (where supported)
116120
agent-device scroll down 0.5
117-
agent-device pinch 2.0 # Zoom in 2x (iOS simulator + Android)
118-
agent-device pinch 0.5 200 400 # Zoom out at coordinates
121+
agent-device pinch 2.0 # Zoom in 2x (iOS simulator)
122+
agent-device pinch 0.5 200 400 # Zoom out at coordinates (iOS simulator)
119123
agent-device back
120124
agent-device home
121125
agent-device app-switcher
@@ -167,7 +171,10 @@ agent-device apps --platform android --user-installed
167171

168172
## Best practices
169173

170-
- Pinch (`pinch <scale> [x y]`) is supported on iOS simulators and Android; scale > 1 zooms in, < 1 zooms out. On Android, pinch uses multi-touch `sendevent` injection.
174+
- `press` supports gesture series controls: `--count`, `--interval-ms`, `--hold-ms`, `--jitter-px`.
175+
- `swipe` supports coordinate + timing controls and repeat patterns: `swipe x1 y1 x2 y2 [durationMs] --count --pause-ms --pattern`.
176+
- `swipe` timing is platform-safe: Android uses the requested duration; iOS always uses a fixed, normalized 60 ms duration to avoid long-press side effects.
177+
- Pinch (`pinch <scale> [x y]`) is currently supported on iOS simulators only.
171178
- Snapshot refs are the core mechanism for interactive agent flows.
172179
- Use selectors for deterministic replay artifacts and assertions (e.g. in e2e test workflows).
173180
- Prefer `snapshot -i` to reduce output size.

src/core/__tests__/capabilities.test.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ test('iOS simulator + Android commands reject iOS devices', () => {
5252
'record',
5353
'screenshot',
5454
'scroll',
55+
'swipe',
5556
'settings',
5657
'snapshot',
5758
'type',

src/core/capabilities.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ const COMMAND_CAPABILITY_MATRIX: Record<string, CommandCapability> = {
3535
record: { ios: { simulator: true }, android: { emulator: true, device: true, unknown: true } },
3636
screenshot: { ios: { simulator: true }, android: { emulator: true, device: true, unknown: true } },
3737
scroll: { ios: { simulator: true }, android: { emulator: true, device: true, unknown: true } },
38+
swipe: { ios: { simulator: true }, android: { emulator: true, device: true, unknown: true } },
3839
settings: { ios: { simulator: true }, android: { emulator: true, device: true, unknown: true } },
3940
snapshot: { ios: { simulator: true }, android: { emulator: true, device: true, unknown: true } },
4041
type: { ios: { simulator: true }, android: { emulator: true, device: true, unknown: true } },

src/core/dispatch.ts

Lines changed: 101 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,12 @@ export type CommandFlags = {
3838
noRecord?: boolean;
3939
appsFilter?: 'launchable' | 'user-installed' | 'all';
4040
appsMetadata?: boolean;
41+
count?: number;
42+
intervalMs?: number;
43+
holdMs?: number;
44+
jitterPx?: number;
45+
pauseMs?: number;
46+
pattern?: 'one-way' | 'ping-pong';
4147
replayUpdate?: boolean;
4248
};
4349

@@ -91,6 +97,12 @@ export async function dispatchCommand(
9197
snapshotScope?: string;
9298
snapshotRaw?: boolean;
9399
snapshotBackend?: 'ax' | 'xctest';
100+
count?: number;
101+
intervalMs?: number;
102+
holdMs?: number;
103+
jitterPx?: number;
104+
pauseMs?: number;
105+
pattern?: 'one-way' | 'ping-pong';
94106
},
95107
): Promise<Record<string, unknown> | void> {
96108
const runnerCtx: RunnerContext = {
@@ -121,8 +133,60 @@ export async function dispatchCommand(
121133
case 'press': {
122134
const [x, y] = positionals.map(Number);
123135
if (Number.isNaN(x) || Number.isNaN(y)) throw new AppError('INVALID_ARGS', 'press requires x y');
124-
await interactor.tap(x, y);
125-
return { x, y };
136+
const count = requireIntInRange(context?.count ?? 1, 'count', 1, 200);
137+
const intervalMs = requireIntInRange(context?.intervalMs ?? 0, 'interval-ms', 0, 10_000);
138+
const holdMs = requireIntInRange(context?.holdMs ?? 0, 'hold-ms', 0, 10_000);
139+
const jitterPx = requireIntInRange(context?.jitterPx ?? 0, 'jitter-px', 0, 100);
140+
141+
for (let index = 0; index < count; index += 1) {
142+
const [dx, dy] = computeDeterministicJitter(index, jitterPx);
143+
const targetX = x + dx;
144+
const targetY = y + dy;
145+
if (holdMs > 0) await interactor.longPress(targetX, targetY, holdMs);
146+
else await interactor.tap(targetX, targetY);
147+
if (index < count - 1 && intervalMs > 0) await sleep(intervalMs);
148+
}
149+
150+
return { x, y, count, intervalMs, holdMs, jitterPx };
151+
}
152+
case 'swipe': {
153+
const x1 = Number(positionals[0]);
154+
const y1 = Number(positionals[1]);
155+
const x2 = Number(positionals[2]);
156+
const y2 = Number(positionals[3]);
157+
if ([x1, y1, x2, y2].some(Number.isNaN)) {
158+
throw new AppError('INVALID_ARGS', 'swipe requires x1 y1 x2 y2 [durationMs]');
159+
}
160+
161+
const requestedDurationMs = positionals[4] ? Number(positionals[4]) : 250;
162+
const durationMs = requireIntInRange(requestedDurationMs, 'durationMs', 16, 10_000);
163+
const effectiveDurationMs = device.platform === 'ios' ? 60 : durationMs;
164+
const count = requireIntInRange(context?.count ?? 1, 'count', 1, 200);
165+
const pauseMs = requireIntInRange(context?.pauseMs ?? 0, 'pause-ms', 0, 10_000);
166+
const pattern = context?.pattern ?? 'one-way';
167+
if (pattern !== 'one-way' && pattern !== 'ping-pong') {
168+
throw new AppError('INVALID_ARGS', `Invalid pattern: ${pattern}`);
169+
}
170+
171+
for (let index = 0; index < count; index += 1) {
172+
const reverse = pattern === 'ping-pong' && index % 2 === 1;
173+
if (reverse) await interactor.swipe(x2, y2, x1, y1, effectiveDurationMs);
174+
else await interactor.swipe(x1, y1, x2, y2, effectiveDurationMs);
175+
if (index < count - 1 && pauseMs > 0) await sleep(pauseMs);
176+
}
177+
178+
return {
179+
x1,
180+
y1,
181+
x2,
182+
y2,
183+
durationMs,
184+
effectiveDurationMs,
185+
timingMode: device.platform === 'ios' ? 'safe-normalized' : 'direct',
186+
count,
187+
pauseMs,
188+
pattern,
189+
};
126190
}
127191
case 'long-press': {
128192
const x = Number(positionals[0]);
@@ -171,6 +235,12 @@ export async function dispatchCommand(
171235
return { text };
172236
}
173237
case 'pinch': {
238+
if (device.platform === 'android') {
239+
throw new AppError(
240+
'UNSUPPORTED_OPERATION',
241+
'Android pinch is not supported in current adb backend; requires instrumentation-based backend.',
242+
);
243+
}
174244
const scale = Number(positionals[0]);
175245
const x = positionals[1] ? Number(positionals[1]) : undefined;
176246
const y = positionals[2] ? Number(positionals[2]) : undefined;
@@ -280,3 +350,32 @@ export async function dispatchCommand(
280350
throw new AppError('INVALID_ARGS', `Unknown command: ${command}`);
281351
}
282352
}
353+
354+
// Fixed cycle of unit [dx, dy] offsets used for press jitter: the centre first,
// then the four axis-aligned neighbours, then the four diagonals.
// computeDeterministicJitter indexes this table by iteration and scales each entry by jitterPx.
const DETERMINISTIC_JITTER_PATTERN: ReadonlyArray<readonly [number, number]> = [
355+
[0, 0],
356+
[1, 0],
357+
[0, 1],
358+
[-1, 0],
359+
[0, -1],
360+
[1, 1],
361+
[-1, 1],
362+
[1, -1],
363+
[-1, -1],
364+
];
365+
366+
/**
 * Validates that `value` is an integer within the inclusive range [min, max].
 *
 * @param value Number to validate.
 * @param name Label used for the value in the error message.
 * @param min Smallest accepted value (inclusive).
 * @param max Largest accepted value (inclusive).
 * @returns `value` unchanged when it passes validation.
 * @throws AppError with code 'INVALID_ARGS' when `value` is NaN, infinite,
 *   non-integer, or outside the range.
 */
function requireIntInRange(value: number, name: string, min: number, max: number): number {
367+
// Rejects NaN, +/-Infinity, fractional values, and anything outside [min, max].
if (!Number.isFinite(value) || !Number.isInteger(value) || value < min || value > max) {
368+
throw new AppError('INVALID_ARGS', `${name} must be an integer between ${min} and ${max}`);
369+
}
370+
return value;
371+
}
372+
373+
/**
 * Returns the [dx, dy] offset to apply to the press at iteration `index`.
 *
 * @param index Zero-based iteration number; wraps around the pattern table.
 * @param jitterPx Scale applied to the unit offset; values <= 0 disable jitter.
 * @returns `[0, 0]` when jitter is disabled, otherwise the pattern entry for
 *   `index` multiplied by `jitterPx` on each axis.
 */
function computeDeterministicJitter(index: number, jitterPx: number): [number, number] {
374+
if (jitterPx <= 0) return [0, 0];
375+
// The modulo makes the pattern repeat once every entry has been used.
const [dx, dy] = DETERMINISTIC_JITTER_PATTERN[index % DETERMINISTIC_JITTER_PATTERN.length];
376+
return [dx * jitterPx, dy * jitterPx];
377+
}
378+
379+
/**
 * Resolves after a `setTimeout` of `ms` milliseconds has fired.
 *
 * @param ms Delay passed to `setTimeout`.
 */
async function sleep(ms: number): Promise<void> {
380+
await new Promise((resolve) => setTimeout(resolve, ms));
381+
}

src/daemon-client.ts

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,9 @@ export async function sendToDaemon(req: Omit<DaemonRequest, 'token'>): Promise<D
3535
async function ensureDaemon(): Promise<DaemonInfo> {
3636
const existing = readDaemonInfo();
3737
const localVersion = readVersion();
38-
if (existing && existing.version === localVersion && (await canConnect(existing))) return existing;
39-
if (existing && (existing.version !== localVersion || !(await canConnect(existing)))) {
38+
const existingReachable = existing ? await canConnect(existing) : false;
39+
if (existing && existing.version === localVersion && existingReachable) return existing;
40+
if (existing && (existing.version !== localVersion || !existingReachable)) {
4041
removeDaemonInfo();
4142
}
4243

@@ -67,7 +68,11 @@ function readDaemonInfo(): DaemonInfo | null {
6768
}
6869

6970
function removeDaemonInfo(): void {
70-
if (fs.existsSync(infoPath)) fs.unlinkSync(infoPath);
71+
try {
72+
if (fs.existsSync(infoPath)) fs.unlinkSync(infoPath);
73+
} catch {
74+
// Best-effort cleanup only; daemon can still overwrite this file on startup.
75+
}
7176
}
7277

7378
async function canConnect(info: DaemonInfo): Promise<boolean> {
@@ -87,11 +92,14 @@ async function startDaemon(): Promise<void> {
8792
const distPath = path.join(root, 'dist', 'src', 'daemon.js');
8893
const srcPath = path.join(root, 'src', 'daemon.ts');
8994

90-
const useDist = fs.existsSync(distPath);
91-
if (!useDist && !fs.existsSync(srcPath)) {
95+
const hasDist = fs.existsSync(distPath);
96+
const hasSrc = fs.existsSync(srcPath);
97+
if (!hasDist && !hasSrc) {
9298
throw new AppError('COMMAND_FAILED', 'Daemon entry not found', { distPath, srcPath });
9399
}
94-
const args = useDist ? [distPath] : ['--experimental-strip-types', srcPath];
100+
const runningFromSource = process.execArgv.includes('--experimental-strip-types');
101+
const useSrc = runningFromSource ? hasSrc : !hasDist && hasSrc;
102+
const args = useSrc ? ['--experimental-strip-types', srcPath] : [distPath];
95103

96104
runCmdDetached(process.execPath, args);
97105
}

src/daemon/context.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,12 @@ export type DaemonCommandContext = {
1212
snapshotScope?: string;
1313
snapshotBackend?: 'ax' | 'xctest';
1414
snapshotRaw?: boolean;
15+
count?: number;
16+
intervalMs?: number;
17+
holdMs?: number;
18+
jitterPx?: number;
19+
pauseMs?: number;
20+
pattern?: 'one-way' | 'ping-pong';
1521
};
1622

1723
export function contextFromFlags(
@@ -32,5 +38,11 @@ export function contextFromFlags(
3238
snapshotScope: flags?.snapshotScope,
3339
snapshotRaw: flags?.snapshotRaw,
3440
snapshotBackend: flags?.snapshotBackend,
41+
count: flags?.count,
42+
intervalMs: flags?.intervalMs,
43+
holdMs: flags?.holdMs,
44+
jitterPx: flags?.jitterPx,
45+
pauseMs: flags?.pauseMs,
46+
pattern: flags?.pattern,
3547
};
3648
}

0 commit comments

Comments
 (0)