Skip to content

Commit 99cc684

Browse files
authored
fix: align press/click behavior and honor fast press cadence on iOS (#70)
* fix: align press and click semantics and reduce iOS tap latency
* perf: batch repeated iOS taps in runner
* perf: add opt-in tap batching for repeated iOS presses
* perf: batch repeated iOS swipes in runner
* fix: preserve interaction series semantics in record/replay
* test: cover click/press interaction handler paths
* refactor: dedupe replay and interaction series helpers
* fix: unify press/click double-tap behavior
* refactor: add interactor doubleTap and alias docs
* refactor: normalize click alias to press path
1 parent 040b518 commit 99cc684

21 files changed

Lines changed: 928 additions & 124 deletions

File tree

README.md

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,14 +34,15 @@ npx agent-device open SampleApp
3434
## Quick Start
3535

3636
Use refs for agent-driven exploration and normal automation flows.
37+
Use `press` as the canonical tap command; `click` is an equivalent alias.
3738

3839
```bash
3940
agent-device open Contacts --platform ios # creates session on iOS Simulator
4041
agent-device snapshot
41-
agent-device click @e5
42+
agent-device press @e5
4243
agent-device fill @e6 "John"
4344
agent-device fill @e7 "Doe"
44-
agent-device click @e3
45+
agent-device press @e3
4546
agent-device close
4647
```
4748

@@ -102,7 +103,7 @@ Basic flow:
102103
```bash
103104
agent-device open SampleApp
104105
agent-device snapshot
105-
agent-device click @e7
106+
agent-device press @e7
106107
agent-device fill @e8 "hello"
107108
agent-device close SampleApp
108109
```
@@ -119,20 +120,23 @@ agent-device trace stop ./trace.log
119120
Coordinates:
120121
- All coordinate-based commands (`press`, `long-press`, `swipe`, `focus`, `fill`) use device coordinates with origin at top-left.
121122
- X increases to the right, Y increases downward.
123+
- `press` is the canonical tap command.
124+
- `click` is an equivalent alias and accepts the same targets (`x y`, `@ref`, selector) and flags.
122125

123126
Gesture series examples:
124127

125128
```bash
126129
agent-device press 300 500 --count 12 --interval-ms 45
127130
agent-device press 300 500 --count 6 --hold-ms 120 --interval-ms 30 --jitter-px 2
131+
agent-device press @e5 --count 5 --double-tap
128132
agent-device swipe 540 1500 540 500 120 --count 8 --pause-ms 30 --pattern ping-pong
129133
```
130134

131135
## Command Index
132136
- `boot`, `open`, `close`, `reinstall`, `home`, `back`, `app-switcher`
133137
- `batch`
134138
- `snapshot`, `find`, `get`
135-
- `click`, `focus`, `type`, `fill`, `press`, `long-press`, `swipe`, `scroll`, `scrollintoview`, `pinch`, `is`
139+
- `press` (alias: `click`), `focus`, `type`, `fill`, `long-press`, `swipe`, `scroll`, `scrollintoview`, `pinch`, `is`
136140
- `alert`, `wait`, `screenshot`
137141
- `trace start`, `trace stop`
138142
- `settings wifi|airplane|location on|off`
@@ -157,6 +161,7 @@ Flags:
157161
- `--interval-ms <ms>` delay between `press` iterations
158162
- `--hold-ms <ms>` hold duration per `press` iteration
159163
- `--jitter-px <n>` deterministic coordinate jitter for `press`
164+
- `--double-tap` use a double-tap gesture per `press`/`click` iteration (cannot be combined with `--hold-ms` or `--jitter-px`)
160165
- `--pause-ms <ms>` delay between `swipe` iterations
161166
- `--pattern one-way|ping-pong` repeat pattern for `swipe`
162167
- `--verbose` for daemon and runner logs

ios-runner/AgentDeviceRunner/AgentDeviceRunnerUITests/RunnerTests.swift

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -219,20 +219,27 @@ final class RunnerTests: XCTestCase {
219219
let normalizedBundleId = command.appBundleId?
220220
.trimmingCharacters(in: .whitespacesAndNewlines)
221221
let requestedBundleId = (normalizedBundleId?.isEmpty == true) ? nil : normalizedBundleId
222+
let switchedApp: Bool
222223
if let bundleId = requestedBundleId, currentBundleId != bundleId {
223224
let target = XCUIApplication(bundleIdentifier: bundleId)
224225
NSLog("AGENT_DEVICE_RUNNER_ACTIVATE bundle=%@ state=%d", bundleId, target.state.rawValue)
225226
// activate avoids terminating and relaunching the target app
226227
target.activate()
227228
currentApp = target
228229
currentBundleId = bundleId
230+
switchedApp = true
229231
} else if requestedBundleId == nil {
230232
// Do not reuse stale bundle targets when the caller does not explicitly request one.
231233
currentApp = nil
232234
currentBundleId = nil
235+
switchedApp = false
236+
} else {
237+
switchedApp = false
233238
}
234239
let activeApp = currentApp ?? app
235-
_ = activeApp.waitForExistence(timeout: 5)
240+
if switchedApp {
241+
_ = activeApp.waitForExistence(timeout: 5)
242+
}
236243

237244
switch command.command {
238245
case .shutdown:
@@ -250,6 +257,23 @@ final class RunnerTests: XCTestCase {
250257
return Response(ok: true, data: DataPayload(message: "tapped"))
251258
}
252259
return Response(ok: false, error: ErrorPayload(message: "tap requires text or x/y"))
260+
case .tapSeries:
261+
guard let x = command.x, let y = command.y else {
262+
return Response(ok: false, error: ErrorPayload(message: "tapSeries requires x and y"))
263+
}
264+
let count = max(Int(command.count ?? 1), 1)
265+
let intervalMs = max(command.intervalMs ?? 0, 0)
266+
let doubleTap = command.doubleTap ?? false
267+
if doubleTap {
268+
runSeries(count: count, pauseMs: intervalMs) { _ in
269+
doubleTapAt(app: activeApp, x: x, y: y)
270+
}
271+
return Response(ok: true, data: DataPayload(message: "tap series"))
272+
}
273+
runSeries(count: count, pauseMs: intervalMs) { _ in
274+
tapAt(app: activeApp, x: x, y: y)
275+
}
276+
return Response(ok: true, data: DataPayload(message: "tap series"))
253277
case .longPress:
254278
guard let x = command.x, let y = command.y else {
255279
return Response(ok: false, error: ErrorPayload(message: "longPress requires x and y"))
@@ -264,6 +288,26 @@ final class RunnerTests: XCTestCase {
264288
let holdDuration = min(max((command.durationMs ?? 60) / 1000.0, 0.016), 10.0)
265289
dragAt(app: activeApp, x: x, y: y, x2: x2, y2: y2, holdDuration: holdDuration)
266290
return Response(ok: true, data: DataPayload(message: "dragged"))
291+
case .dragSeries:
292+
guard let x = command.x, let y = command.y, let x2 = command.x2, let y2 = command.y2 else {
293+
return Response(ok: false, error: ErrorPayload(message: "dragSeries requires x, y, x2, and y2"))
294+
}
295+
let count = max(Int(command.count ?? 1), 1)
296+
let pauseMs = max(command.pauseMs ?? 0, 0)
297+
let pattern = command.pattern ?? "one-way"
298+
if pattern != "one-way" && pattern != "ping-pong" {
299+
return Response(ok: false, error: ErrorPayload(message: "dragSeries pattern must be one-way or ping-pong"))
300+
}
301+
let holdDuration = min(max((command.durationMs ?? 60) / 1000.0, 0.016), 10.0)
302+
runSeries(count: count, pauseMs: pauseMs) { idx in
303+
let reverse = pattern == "ping-pong" && (idx % 2 == 1)
304+
if reverse {
305+
dragAt(app: activeApp, x: x2, y: y2, x2: x, y2: y, holdDuration: holdDuration)
306+
} else {
307+
dragAt(app: activeApp, x: x, y: y, x2: x2, y2: y2, holdDuration: holdDuration)
308+
}
309+
}
310+
return Response(ok: true, data: DataPayload(message: "drag series"))
267311
case .type:
268312
guard let text = command.text else {
269313
return Response(ok: false, error: ErrorPayload(message: "type requires text"))
@@ -443,6 +487,12 @@ final class RunnerTests: XCTestCase {
443487
coordinate.tap()
444488
}
445489

490+
/// Performs a double-tap gesture at the point (`x`, `y`) relative to the origin of `app`.
///
/// - Parameters:
///   - app: Application whose coordinate space anchors the gesture.
///   - x: Horizontal offset from the app's origin.
///   - y: Vertical offset from the app's origin.
private func doubleTapAt(app: XCUIApplication, x: Double, y: Double) {
491+
// Anchor at normalized offset (0, 0) of the app, then shift by the requested x/y offset.
let origin = app.coordinate(withNormalizedOffset: CGVector(dx: 0, dy: 0))
492+
let coordinate = origin.withOffset(CGVector(dx: x, dy: y))
493+
coordinate.doubleTap()
494+
}
495+
446496
private func longPressAt(app: XCUIApplication, x: Double, y: Double, duration: TimeInterval) {
447497
let origin = app.coordinate(withNormalizedOffset: CGVector(dx: 0, dy: 0))
448498
let coordinate = origin.withOffset(CGVector(dx: x, dy: y))
@@ -463,6 +513,17 @@ final class RunnerTests: XCTestCase {
463513
start.press(forDuration: holdDuration, thenDragTo: end)
464514
}
465515

516+
/// Invokes `operation` repeatedly in sequence, optionally sleeping between consecutive iterations.
///
/// - Parameters:
///   - count: Requested number of iterations; values below 1 are treated as 1.
///   - pauseMs: Delay between iterations in milliseconds; negative values are treated as 0.
///   - operation: Closure called once per iteration with the zero-based iteration index.
private func runSeries(count: Int, pauseMs: Double, operation: (Int) -> Void) {
517+
// Clamp both inputs so the loop always runs at least once and never sleeps a negative interval.
let total = max(count, 1)
518+
let pause = max(pauseMs, 0)
519+
for idx in 0..<total {
520+
operation(idx)
521+
// Sleep only between iterations: skipped after the final one and when the pause is zero.
if idx < total - 1 && pause > 0 {
522+
// Blocks the calling thread; pause is converted from milliseconds to seconds.
Thread.sleep(forTimeInterval: pause / 1000.0)
523+
}
524+
}
525+
}
526+
466527
private func swipe(app: XCUIApplication, direction: SwipeDirection) {
467528
let target = app.windows.firstMatch.exists ? app.windows.firstMatch : app
468529
let start = target.coordinate(withNormalizedOffset: CGVector(dx: 0.5, dy: 0.2))
@@ -982,8 +1043,10 @@ private func resolveRunnerPort() -> UInt16 {
9821043

9831044
enum CommandType: String, Codable {
9841045
case tap
1046+
case tapSeries
9851047
case longPress
9861048
case drag
1049+
case dragSeries
9871050
case type
9881051
case swipe
9891052
case findText
@@ -1012,6 +1075,11 @@ struct Command: Codable {
10121075
let action: String?
10131076
let x: Double?
10141077
let y: Double?
1078+
let count: Double?
1079+
let intervalMs: Double?
1080+
let doubleTap: Bool?
1081+
let pauseMs: Double?
1082+
let pattern: String?
10151083
let x2: Double?
10161084
let y2: Double?
10171085
let durationMs: Double?

skills/agent-device/SKILL.md

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ For agent-driven exploration: use refs. For deterministic replay scripts: use se
1212
```bash
1313
agent-device open Settings --platform ios
1414
agent-device snapshot -i
15-
agent-device click @e3
15+
agent-device press @e3
1616
agent-device wait text "Camera"
1717
agent-device alert wait 10000
1818
agent-device fill @e5 "test"
@@ -29,7 +29,7 @@ npx -y agent-device
2929

3030
1. Open app or deep link: `open [app|url] [url]` (`open` handles target selection + boot/activation in the normal flow)
3131
2. Snapshot: `snapshot` to get refs from accessibility tree
32-
3. Interact using refs (`click @ref`, `fill @ref "text"`)
32+
3. Interact using refs (`press @ref`, `fill @ref "text"`; `click` is an alias of `press`)
3333
4. Re-snapshot after navigation/UI changes
3434
5. Close session when done
3535

@@ -109,13 +109,15 @@ agent-device appstate
109109
### Interactions (use @refs from snapshot)
110110

111111
```bash
112-
agent-device click @e1
112+
agent-device press @e1 # Canonical tap command (`click` is an alias)
113113
agent-device focus @e2
114114
agent-device fill @e2 "text" # Clear then type (Android: verifies value and retries once on mismatch)
115115
agent-device type "text" # Type into focused field without clearing
116116
agent-device press 300 500 # Tap by coordinates
117117
agent-device press 300 500 --count 12 --interval-ms 45
118118
agent-device press 300 500 --count 6 --hold-ms 120 --interval-ms 30 --jitter-px 2
119+
agent-device press @e1 --count 5 # Repeat taps on the same target
120+
agent-device press @e1 --count 5 --double-tap # Use double-tap gesture per iteration
119121
agent-device swipe 540 1500 540 500 120
120122
agent-device swipe 540 1500 540 500 120 --count 8 --pause-ms 30 --pattern ping-pong
121123
agent-device long-press 300 500 800 # Long press (where supported)
@@ -222,7 +224,10 @@ agent-device apps --platform android --user-installed
222224

223225
## Best practices
224226

225-
- `press` supports gesture series controls: `--count`, `--interval-ms`, `--hold-ms`, `--jitter-px`.
227+
- `press` is the canonical tap command; `click` is an alias with the same behavior.
228+
- `press` (and `click`) accepts `x y`, `@ref`, and selector targets.
229+
- `press`/`click` support gesture series controls: `--count`, `--interval-ms`, `--hold-ms`, `--jitter-px`, `--double-tap`.
230+
- `--double-tap` cannot be combined with `--hold-ms` or `--jitter-px`.
226231
- `swipe` supports coordinate + timing controls and repeat patterns: `swipe x1 y1 x2 y2 [durationMs] --count --pause-ms --pattern`.
227232
- `swipe` timing is platform-safe: Android uses requested duration; iOS uses normalized safe timing to avoid long-press side effects.
228233
- Pinch (`pinch <scale> [x y]`) is iOS simulator-only; scale > 1 zooms in, < 1 zooms out.

skills/agent-device/references/snapshot-refs.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
## Purpose
44

55
Refs are useful for discovery/debugging. For deterministic scripts, use selectors.
6+
For tap interactions, `press` is canonical; `click` is an equivalent alias.
67

78
## Snapshot
89

@@ -24,14 +25,14 @@ App: com.apple.Preferences
2425
## Using refs (discovery/debug)
2526

2627
```bash
27-
agent-device click @e2
28+
agent-device press @e2
2829
agent-device fill @e5 "test"
2930
```
3031

3132
## Using selectors (deterministic)
3233

3334
```bash
34-
agent-device click 'id="camera_row" || label="Camera" role=button'
35+
agent-device press 'id="camera_row" || label="Camera" role=button'
3536
agent-device fill 'id="search_input" editable=true' "test"
3637
agent-device is visible 'id="camera_settings_anchor"'
3738
```

src/cli.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -175,12 +175,12 @@ export async function runCli(argv: string[], deps: CliDeps = DEFAULT_CLI_DEPS):
175175
if (logTailStopper) logTailStopper();
176176
return;
177177
}
178-
if (command === 'click') {
178+
if (command === 'click' || command === 'press') {
179179
const ref = (response.data as any)?.ref ?? '';
180180
const x = (response.data as any)?.x;
181181
const y = (response.data as any)?.y;
182182
if (ref && typeof x === 'number' && typeof y === 'number') {
183-
process.stdout.write(`Clicked @${ref} (${x}, ${y})\n`);
183+
process.stdout.write(`Tapped @${ref} (${x}, ${y})\n`);
184184
}
185185
if (logTailStopper) logTailStopper();
186186
return;
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
import test from 'node:test';
2+
import assert from 'node:assert/strict';
3+
import { shouldUseIosDragSeries, shouldUseIosTapSeries } from '../dispatch.ts';
4+
import type { DeviceInfo } from '../../utils/device.ts';
5+
6+
// Fixture: a booted iOS simulator, used as the device that is expected to qualify for the series fast paths.
const iosDevice: DeviceInfo = {
7+
platform: 'ios',
8+
id: 'ios-1',
9+
name: 'iPhone 15',
10+
kind: 'simulator',
11+
booted: true,
12+
};
13+
14+
// Fixture: a booted Android emulator, used to check that the iOS-only fast paths stay disabled.
const androidDevice: DeviceInfo = {
15+
platform: 'android',
16+
id: 'android-1',
17+
name: 'Pixel',
18+
kind: 'emulator',
19+
booted: true,
20+
};
21+
22+
// NOTE(review): the 2nd argument looks like the repeat count and the 3rd/4th like the
// hold-ms and jitter-px modifiers — confirm against the signature in dispatch.ts.
// An iOS device with a second argument above 1 and both trailing arguments at 0 selects the fast path.
test('shouldUseIosTapSeries enables fast path for repeated plain iOS taps', () => {
23+
assert.equal(shouldUseIosTapSeries(iosDevice, 5, 0, 0), true);
24+
});
25+
26+
// The fast path is rejected when the second argument is 1 or when either trailing argument is non-zero.
test('shouldUseIosTapSeries disables fast path for single press or modified gestures', () => {
27+
assert.equal(shouldUseIosTapSeries(iosDevice, 1, 0, 0), false);
28+
assert.equal(shouldUseIosTapSeries(iosDevice, 5, 100, 0), false);
29+
assert.equal(shouldUseIosTapSeries(iosDevice, 5, 0, 1), false);
30+
});
31+
32+
// The same arguments that enable the fast path on iOS must not enable it for an Android device.
test('shouldUseIosTapSeries disables fast path for non-iOS devices', () => {
33+
assert.equal(shouldUseIosTapSeries(androidDevice, 5, 0, 0), false);
34+
});
35+
36+
// NOTE(review): the 2nd argument looks like the swipe repeat count — confirm against dispatch.ts.
// An iOS device with a second argument above 1 selects the drag-series fast path.
test('shouldUseIosDragSeries enables fast path for repeated iOS swipes', () => {
37+
assert.equal(shouldUseIosDragSeries(iosDevice, 3), true);
38+
});
39+
40+
// The fast path is rejected when the second argument is 1 on iOS, and for an Android device.
test('shouldUseIosDragSeries disables fast path for single swipe and non-iOS', () => {
41+
assert.equal(shouldUseIosDragSeries(iosDevice, 1), false);
42+
assert.equal(shouldUseIosDragSeries(androidDevice, 3), false);
43+
});

0 commit comments

Comments
 (0)