Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 26 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ The project is in early development and considered experimental. Pull requests a

## Features
- Platforms: iOS (simulator + limited device support) and Android (emulator + device).
- Core commands: `open`, `back`, `home`, `app-switcher`, `press`, `long-press`, `focus`, `type`, `fill`, `scroll`, `scrollintoview`, `wait`, `alert`, `screenshot`, `close`, `reinstall`.
- Core commands: `open`, `back`, `home`, `app-switcher`, `press`, `long-press`, `swipe`, `focus`, `type`, `fill`, `scroll`, `scrollintoview`, `pinch`, `wait`, `alert`, `screenshot`, `close`, `reinstall`.
- Inspection commands: `snapshot` (accessibility tree).
- Device tooling: `adb` (Android), `simctl`/`devicectl` (iOS via Xcode).
- Minimal dependencies; TypeScript executed directly on Node 22+ (no build step).
Expand Down Expand Up @@ -71,13 +71,21 @@ agent-device trace stop ./trace.log
```

Coordinates:
- All coordinate-based commands (`press`, `long-press`, `focus`, `fill`) use device coordinates with origin at top-left.
- All coordinate-based commands (`press`, `long-press`, `swipe`, `focus`, `fill`) use device coordinates with origin at top-left.
- X increases to the right, Y increases downward.

Gesture series examples:

```bash
agent-device press 300 500 --count 12 --interval-ms 45
agent-device press 300 500 --count 6 --hold-ms 120 --interval-ms 30 --jitter-px 2
agent-device swipe 540 1500 540 500 120 --count 8 --pause-ms 30 --pattern ping-pong
```

## Command Index
- `boot`, `open`, `close`, `reinstall`, `home`, `back`, `app-switcher`
- `snapshot`, `find`, `get`
- `click`, `focus`, `type`, `fill`, `press`, `long-press`, `scroll`, `scrollintoview`, `is`
- `click`, `focus`, `type`, `fill`, `press`, `long-press`, `swipe`, `scroll`, `scrollintoview`, `pinch`, `is`
- `alert`, `wait`, `screenshot`
- `trace start`, `trace stop`
- `settings wifi|airplane|location on|off`
Expand All @@ -103,10 +111,25 @@ Flags:
- `--serial <serial>` (Android)
- `--activity <component>` (Android app launch only; package/Activity or package/.Activity; not for URL opens)
- `--session <name>`
- `--count <n>` repeat count for `press`/`swipe`
- `--interval-ms <ms>` delay between `press` iterations
- `--hold-ms <ms>` hold duration per `press` iteration
- `--jitter-px <n>` deterministic coordinate jitter for `press`
- `--pause-ms <ms>` delay between `swipe` iterations
- `--pattern one-way|ping-pong` repeat pattern for `swipe`
- `--verbose` for daemon and runner logs
- `--json` for structured output
- `--backend ax|xctest` (snapshot only; defaults to `xctest` on iOS)

Pinch:
- `pinch` is supported on iOS simulators.
- On Android, `pinch` currently returns `UNSUPPORTED_OPERATION` in the adb backend.

Swipe timing:
- `swipe` accepts optional `durationMs` (default `250`, range `16..10000`).
- Android uses requested swipe duration directly.
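- iOS ignores the requested duration and currently uses a fixed short duration (60 ms) to avoid long-press side effects; the value actually used is reported as `effectiveDurationMs` in the command result.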

## Skills
Install the automation skills listed in [SKILL.md](skills/agent-device/SKILL.md).

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,13 @@ final class RunnerTests: XCTestCase {
let duration = (command.durationMs ?? 800) / 1000.0
longPressAt(app: activeApp, x: x, y: y, duration: duration)
return Response(ok: true, data: DataPayload(message: "long pressed"))
case .drag:
guard let x = command.x, let y = command.y, let x2 = command.x2, let y2 = command.y2 else {
return Response(ok: false, error: ErrorPayload(message: "drag requires x, y, x2, and y2"))
}
let holdDuration = min(max((command.durationMs ?? 60) / 1000.0, 0.016), 10.0)
dragAt(app: activeApp, x: x, y: y, x2: x2, y2: y2, holdDuration: holdDuration)
return Response(ok: true, data: DataPayload(message: "dragged"))
case .type:
guard let text = command.text else {
return Response(ok: false, error: ErrorPayload(message: "type requires text"))
Expand Down Expand Up @@ -436,6 +443,20 @@ final class RunnerTests: XCTestCase {
coordinate.press(forDuration: duration)
}

/// Presses at (`x`, `y`) for `holdDuration`, then drags to (`x2`, `y2`).
///
/// Both points are expressed as offsets from the coordinate at the app's
/// normalized offset (0, 0).
///
/// - Parameters:
///   - app: Application whose coordinate space anchors the gesture.
///   - x: Horizontal offset of the drag start.
///   - y: Vertical offset of the drag start.
///   - x2: Horizontal offset of the drag end.
///   - y2: Vertical offset of the drag end.
///   - holdDuration: Seconds to keep the press down before the drag begins.
private func dragAt(
  app: XCUIApplication,
  x: Double,
  y: Double,
  x2: Double,
  y2: Double,
  holdDuration: TimeInterval
) {
  let anchor = app.coordinate(withNormalizedOffset: CGVector(dx: 0, dy: 0))
  // Resolve an (dx, dy) pair against the shared anchor.
  let point = { (dx: Double, dy: Double) in
    anchor.withOffset(CGVector(dx: dx, dy: dy))
  }
  let dragStart = point(x, y)
  let dragEnd = point(x2, y2)
  dragStart.press(forDuration: holdDuration, thenDragTo: dragEnd)
}

private func swipe(app: XCUIApplication, direction: SwipeDirection) {
let target = app.windows.firstMatch.exists ? app.windows.firstMatch : app
let start = target.coordinate(withNormalizedOffset: CGVector(dx: 0.5, dy: 0.2))
Expand Down Expand Up @@ -956,6 +977,7 @@ private func resolveRunnerPort() -> UInt16 {
enum CommandType: String, Codable {
case tap
case longPress
case drag
case type
case swipe
case findText
Expand Down Expand Up @@ -984,6 +1006,8 @@ struct Command: Codable {
let action: String?
let x: Double?
let y: Double?
let x2: Double?
let y2: Double?
let durationMs: Double?
let direction: SwipeDirection?
let scale: Double?
Expand Down
13 changes: 10 additions & 3 deletions skills/agent-device/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,10 +112,14 @@ agent-device focus @e2
agent-device fill @e2 "text" # Clear then type (Android: verifies value and retries once on mismatch)
agent-device type "text" # Type into focused field without clearing
agent-device press 300 500 # Tap by coordinates
agent-device press 300 500 --count 12 --interval-ms 45
agent-device press 300 500 --count 6 --hold-ms 120 --interval-ms 30 --jitter-px 2
agent-device swipe 540 1500 540 500 120
agent-device swipe 540 1500 540 500 120 --count 8 --pause-ms 30 --pattern ping-pong
agent-device long-press 300 500 800 # Long press (where supported)
agent-device scroll down 0.5
agent-device pinch 2.0 # Zoom in 2x (iOS simulator + Android)
agent-device pinch 0.5 200 400 # Zoom out at coordinates
agent-device pinch 2.0 # Zoom in 2x (iOS simulator)
agent-device pinch 0.5 200 400 # Zoom out at coordinates (iOS simulator)
agent-device back
agent-device home
agent-device app-switcher
Expand Down Expand Up @@ -167,7 +171,10 @@ agent-device apps --platform android --user-installed

## Best practices

- Pinch (`pinch <scale> [x y]`) is supported on iOS simulators and Android; scale > 1 zooms in, < 1 zooms out. On Android, pinch uses multi-touch `sendevent` injection.
- `press` supports gesture series controls: `--count`, `--interval-ms`, `--hold-ms`, `--jitter-px`.
- `swipe` supports coordinate + timing controls and repeat patterns: `swipe x1 y1 x2 y2 [durationMs] --count --pause-ms --pattern`.
- `swipe` timing is platform-dependent: Android uses the requested duration; iOS ignores it and uses a fixed short duration (currently 60 ms) to avoid long-press side effects.
- Pinch (`pinch <scale> [x y]`) is currently supported on iOS simulators only.
- Snapshot refs are the core mechanism for interactive agent flows.
- Use selectors for deterministic replay artifacts and assertions (e.g. in e2e test workflows).
- Prefer `snapshot -i` to reduce output size.
Expand Down
1 change: 1 addition & 0 deletions src/core/__tests__/capabilities.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ test('iOS simulator + Android commands reject iOS devices', () => {
'record',
'screenshot',
'scroll',
'swipe',
'settings',
'snapshot',
'type',
Expand Down
1 change: 1 addition & 0 deletions src/core/capabilities.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ const COMMAND_CAPABILITY_MATRIX: Record<string, CommandCapability> = {
record: { ios: { simulator: true }, android: { emulator: true, device: true, unknown: true } },
screenshot: { ios: { simulator: true }, android: { emulator: true, device: true, unknown: true } },
scroll: { ios: { simulator: true }, android: { emulator: true, device: true, unknown: true } },
swipe: { ios: { simulator: true }, android: { emulator: true, device: true, unknown: true } },
settings: { ios: { simulator: true }, android: { emulator: true, device: true, unknown: true } },
snapshot: { ios: { simulator: true }, android: { emulator: true, device: true, unknown: true } },
type: { ios: { simulator: true }, android: { emulator: true, device: true, unknown: true } },
Expand Down
103 changes: 101 additions & 2 deletions src/core/dispatch.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@ export type CommandFlags = {
noRecord?: boolean;
appsFilter?: 'launchable' | 'user-installed' | 'all';
appsMetadata?: boolean;
count?: number;
intervalMs?: number;
holdMs?: number;
jitterPx?: number;
pauseMs?: number;
pattern?: 'one-way' | 'ping-pong';
replayUpdate?: boolean;
};

Expand Down Expand Up @@ -91,6 +97,12 @@ export async function dispatchCommand(
snapshotScope?: string;
snapshotRaw?: boolean;
snapshotBackend?: 'ax' | 'xctest';
count?: number;
intervalMs?: number;
holdMs?: number;
jitterPx?: number;
pauseMs?: number;
pattern?: 'one-way' | 'ping-pong';
},
): Promise<Record<string, unknown> | void> {
const runnerCtx: RunnerContext = {
Expand Down Expand Up @@ -121,8 +133,60 @@ export async function dispatchCommand(
case 'press': {
const [x, y] = positionals.map(Number);
if (Number.isNaN(x) || Number.isNaN(y)) throw new AppError('INVALID_ARGS', 'press requires x y');
await interactor.tap(x, y);
return { x, y };
const count = requireIntInRange(context?.count ?? 1, 'count', 1, 200);
const intervalMs = requireIntInRange(context?.intervalMs ?? 0, 'interval-ms', 0, 10_000);
const holdMs = requireIntInRange(context?.holdMs ?? 0, 'hold-ms', 0, 10_000);
const jitterPx = requireIntInRange(context?.jitterPx ?? 0, 'jitter-px', 0, 100);

for (let index = 0; index < count; index += 1) {
const [dx, dy] = computeDeterministicJitter(index, jitterPx);
const targetX = x + dx;
const targetY = y + dy;
if (holdMs > 0) await interactor.longPress(targetX, targetY, holdMs);
else await interactor.tap(targetX, targetY);
if (index < count - 1 && intervalMs > 0) await sleep(intervalMs);
}

return { x, y, count, intervalMs, holdMs, jitterPx };
}
case 'swipe': {
const x1 = Number(positionals[0]);
const y1 = Number(positionals[1]);
const x2 = Number(positionals[2]);
const y2 = Number(positionals[3]);
if ([x1, y1, x2, y2].some(Number.isNaN)) {
throw new AppError('INVALID_ARGS', 'swipe requires x1 y1 x2 y2 [durationMs]');
}

const requestedDurationMs = positionals[4] ? Number(positionals[4]) : 250;
const durationMs = requireIntInRange(requestedDurationMs, 'durationMs', 16, 10_000);
const effectiveDurationMs = device.platform === 'ios' ? 60 : durationMs;
const count = requireIntInRange(context?.count ?? 1, 'count', 1, 200);
const pauseMs = requireIntInRange(context?.pauseMs ?? 0, 'pause-ms', 0, 10_000);
const pattern = context?.pattern ?? 'one-way';
if (pattern !== 'one-way' && pattern !== 'ping-pong') {
throw new AppError('INVALID_ARGS', `Invalid pattern: ${pattern}`);
}

for (let index = 0; index < count; index += 1) {
const reverse = pattern === 'ping-pong' && index % 2 === 1;
if (reverse) await interactor.swipe(x2, y2, x1, y1, effectiveDurationMs);
else await interactor.swipe(x1, y1, x2, y2, effectiveDurationMs);
if (index < count - 1 && pauseMs > 0) await sleep(pauseMs);
}

return {
x1,
y1,
x2,
y2,
durationMs,
effectiveDurationMs,
timingMode: device.platform === 'ios' ? 'safe-normalized' : 'direct',
count,
pauseMs,
pattern,
};
}
case 'long-press': {
const x = Number(positionals[0]);
Expand Down Expand Up @@ -171,6 +235,12 @@ export async function dispatchCommand(
return { text };
}
case 'pinch': {
if (device.platform === 'android') {
throw new AppError(
'UNSUPPORTED_OPERATION',
'Android pinch is not supported in current adb backend; requires instrumentation-based backend.',
);
}
const scale = Number(positionals[0]);
const x = positionals[1] ? Number(positionals[1]) : undefined;
const y = positionals[2] ? Number(positionals[2]) : undefined;
Expand Down Expand Up @@ -280,3 +350,32 @@ export async function dispatchCommand(
throw new AppError('INVALID_ARGS', `Unknown command: ${command}`);
}
}

// Unit [dx, dy] offsets used by computeDeterministicJitter: the entry is picked by
// `index % length` and both components are multiplied by the requested jitter in pixels.
// Order: no offset, the four axis-aligned neighbours, then the four diagonals.
const DETERMINISTIC_JITTER_PATTERN: ReadonlyArray<readonly [number, number]> = [
[0, 0],
[1, 0],
[0, 1],
[-1, 0],
[0, -1],
[1, 1],
[-1, 1],
[1, -1],
[-1, -1],
];

/**
 * Validates that `value` is an integer inside the inclusive range [`min`, `max`].
 *
 * @param value - Candidate number to validate.
 * @param name - Label inserted into the error message.
 * @param min - Smallest accepted value (inclusive).
 * @param max - Largest accepted value (inclusive).
 * @returns `value` unchanged when it passes validation.
 * @throws AppError with code `INVALID_ARGS` when `value` is not an integer or is out of range.
 */
function requireIntInRange(value: number, name: string, min: number, max: number): number {
  // Number.isInteger is false for NaN and ±Infinity, so it also covers the finiteness check.
  const withinBounds = Number.isInteger(value) && value >= min && value <= max;
  if (withinBounds) return value;
  throw new AppError('INVALID_ARGS', `${name} must be an integer between ${min} and ${max}`);
}

/**
 * Returns the [dx, dy] pixel offset to apply to the `index`-th repetition of a gesture.
 *
 * The offset is taken from DETERMINISTIC_JITTER_PATTERN (wrapping around by index)
 * and scaled by `jitterPx`, so the same index and jitter always give the same offset.
 *
 * @param index - Zero-based iteration number.
 * @param jitterPx - Jitter magnitude in pixels; values `<= 0` disable jitter.
 * @returns `[0, 0]` when jitter is disabled, otherwise the scaled pattern entry.
 */
function computeDeterministicJitter(index: number, jitterPx: number): [number, number] {
  if (jitterPx <= 0) return [0, 0];
  const slot = index % DETERMINISTIC_JITTER_PATTERN.length;
  const unit = DETERMINISTIC_JITTER_PATTERN[slot];
  const offsetX = unit[0] * jitterPx;
  const offsetY = unit[1] * jitterPx;
  return [offsetX, offsetY];
}

/**
 * Resolves after a `setTimeout` of `ms` milliseconds has fired.
 *
 * @param ms - Delay passed to `setTimeout`.
 */
async function sleep(ms: number): Promise<void> {
  const timerElapsed = new Promise((wake) => {
    setTimeout(wake, ms);
  });
  await timerElapsed;
}
20 changes: 14 additions & 6 deletions src/daemon-client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,9 @@ export async function sendToDaemon(req: Omit<DaemonRequest, 'token'>): Promise<D
async function ensureDaemon(): Promise<DaemonInfo> {
const existing = readDaemonInfo();
const localVersion = readVersion();
if (existing && existing.version === localVersion && (await canConnect(existing))) return existing;
if (existing && (existing.version !== localVersion || !(await canConnect(existing)))) {
const existingReachable = existing ? await canConnect(existing) : false;
if (existing && existing.version === localVersion && existingReachable) return existing;
if (existing && (existing.version !== localVersion || !existingReachable)) {
removeDaemonInfo();
}

Expand Down Expand Up @@ -67,7 +68,11 @@ function readDaemonInfo(): DaemonInfo | null {
}

function removeDaemonInfo(): void {
if (fs.existsSync(infoPath)) fs.unlinkSync(infoPath);
try {
if (fs.existsSync(infoPath)) fs.unlinkSync(infoPath);
} catch {
// Best-effort cleanup only; daemon can still overwrite this file on startup.
}
}

async function canConnect(info: DaemonInfo): Promise<boolean> {
Expand All @@ -87,11 +92,14 @@ async function startDaemon(): Promise<void> {
const distPath = path.join(root, 'dist', 'src', 'daemon.js');
const srcPath = path.join(root, 'src', 'daemon.ts');

const useDist = fs.existsSync(distPath);
if (!useDist && !fs.existsSync(srcPath)) {
const hasDist = fs.existsSync(distPath);
const hasSrc = fs.existsSync(srcPath);
if (!hasDist && !hasSrc) {
throw new AppError('COMMAND_FAILED', 'Daemon entry not found', { distPath, srcPath });
}
const args = useDist ? [distPath] : ['--experimental-strip-types', srcPath];
const runningFromSource = process.execArgv.includes('--experimental-strip-types');
const useSrc = runningFromSource ? hasSrc : !hasDist && hasSrc;
const args = useSrc ? ['--experimental-strip-types', srcPath] : [distPath];

runCmdDetached(process.execPath, args);
}
Expand Down
12 changes: 12 additions & 0 deletions src/daemon/context.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@ export type DaemonCommandContext = {
snapshotScope?: string;
snapshotBackend?: 'ax' | 'xctest';
snapshotRaw?: boolean;
count?: number;
intervalMs?: number;
holdMs?: number;
jitterPx?: number;
pauseMs?: number;
pattern?: 'one-way' | 'ping-pong';
};

export function contextFromFlags(
Expand All @@ -32,5 +38,11 @@ export function contextFromFlags(
snapshotScope: flags?.snapshotScope,
snapshotRaw: flags?.snapshotRaw,
snapshotBackend: flags?.snapshotBackend,
count: flags?.count,
intervalMs: flags?.intervalMs,
holdMs: flags?.holdMs,
jitterPx: flags?.jitterPx,
pauseMs: flags?.pauseMs,
pattern: flags?.pattern,
};
}
Loading