Skip to content

Commit a6f37c4

Browse files
committed
perf: speed up scrollintoview ref path
1 parent 1883fc2 commit a6f37c4

5 files changed

Lines changed: 318 additions & 4 deletions

File tree

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,8 @@ agent-device press 300 500 --count 12 --interval-ms 45
130130
agent-device press 300 500 --count 6 --hold-ms 120 --interval-ms 30 --jitter-px 2
131131
agent-device press @e5 --count 5 --double-tap
132132
agent-device swipe 540 1500 540 500 120 --count 8 --pause-ms 30 --pattern ping-pong
133+
agent-device scrollintoview "Sign in"
134+
agent-device scrollintoview @e42
133135
```
134136

135137
## Command Index
@@ -180,6 +182,7 @@ Swipe timing:
180182
- `swipe` accepts optional `durationMs` (default `250`, range `16..10000`).
181183
- Android uses requested swipe duration directly.
182184
- iOS uses a safe normalized duration to avoid longpress side effects.
185+
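- `scrollintoview` accepts either plain text or a snapshot ref (`@eN`); ref mode requires an existing snapshot and uses geometry-based scrolling, computing the swipes from the ref's bounds relative to the viewport.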
183186

184187
## Skills
185188
Install the automation skills listed in [SKILL.md](skills/agent-device/SKILL.md).

src/daemon/handlers/__tests__/interaction.test.ts

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,3 +185,117 @@ test('press coordinates does not treat extra trailing args as selector', async (
185185
assert.deepEqual(dispatchCalls[0]?.positionals, ['100', '200']);
186186
assert.equal(sessionStore.get(sessionName)?.actions.length, 1);
187187
});
188+
189+
// A ref whose bounds lie far below the application rect should be reached with a
// single batched `swipe` dispatch (one-way pattern, no pause, more than one repetition),
// and the action must be recorded under the original `scrollintoview` command.
test('scrollintoview @ref dispatches geometry-based swipe series', async () => {
  type RecordedDispatch = {
    command: string;
    positionals: string[];
    context: Record<string, unknown> | undefined;
  };

  const sessionStore = makeSessionStore();
  const sessionName = 'default';
  const session = makeSession(sessionName);

  // Application-sized root plus a text node positioned well beyond the root's bottom edge.
  const nodes = attachRefs([
    {
      index: 0,
      type: 'Application',
      rect: { x: 0, y: 0, width: 390, height: 844 },
    },
    {
      index: 1,
      type: 'XCUIElementTypeStaticText',
      label: 'Far item',
      rect: { x: 20, y: 2600, width: 120, height: 40 },
    },
  ]);
  session.snapshot = { nodes, createdAt: Date.now(), backend: 'xctest' };
  sessionStore.set(sessionName, session);

  const recorded: RecordedDispatch[] = [];
  const response = await handleInteractionCommands({
    req: {
      token: 't',
      session: sessionName,
      command: 'scrollintoview',
      positionals: ['@e2'],
      flags: {},
    },
    sessionName,
    sessionStore,
    contextFromFlags,
    // Stub dispatcher: capture every call instead of touching a device.
    dispatch: async (_device, command, positionals, _out, context) => {
      recorded.push({ command, positionals, context: context as Record<string, unknown> | undefined });
      return { ok: true };
    },
  });

  assert.ok(response);
  assert.equal(response.ok, true);
  assert.equal(recorded.length, 1);

  // The only dispatch must be a swipe carrying x1/y1/x2/y2/duration and batch options.
  const [swipeCall] = recorded;
  assert.equal(swipeCall?.command, 'swipe');
  assert.equal(swipeCall?.positionals.length, 5);
  assert.equal(swipeCall?.context?.pattern, 'one-way');
  assert.equal(swipeCall?.context?.pauseMs, 0);
  const swipeCount = swipeCall?.context?.count;
  assert.equal(typeof swipeCount, 'number');
  assert.ok((swipeCount as number) > 1);

  // The session history records the user-facing command together with the strategy used.
  const stored = sessionStore.get(sessionName);
  assert.ok(stored);
  assert.equal(stored?.actions.length, 1);
  const recordedAction = stored?.actions[0];
  assert.equal(recordedAction?.command, 'scrollintoview');
  const result = (recordedAction?.result ?? {}) as Record<string, unknown>;
  assert.equal(result.ref, 'e2');
  assert.equal(result.strategy, 'ref-geometry');
});
252+
253+
// A ref whose centre already sits inside the viewport's safe band must short-circuit:
// nothing is dispatched and the response reports zero attempts.
test('scrollintoview @ref returns immediately when target is already in viewport safe band', async () => {
  const sessionStore = makeSessionStore();
  const sessionName = 'default';
  const session = makeSession(sessionName);

  // Application-sized root plus a text node located in the middle of it.
  const nodes = attachRefs([
    {
      index: 0,
      type: 'Application',
      rect: { x: 0, y: 0, width: 390, height: 844 },
    },
    {
      index: 1,
      type: 'XCUIElementTypeStaticText',
      label: 'Visible item',
      rect: { x: 20, y: 320, width: 120, height: 40 },
    },
  ]);
  session.snapshot = { nodes, createdAt: Date.now(), backend: 'xctest' };
  sessionStore.set(sessionName, session);

  const recorded: Array<{ command: string }> = [];
  const response = await handleInteractionCommands({
    req: {
      token: 't',
      session: sessionName,
      command: 'scrollintoview',
      positionals: ['@e2'],
      flags: {},
    },
    sessionName,
    sessionStore,
    contextFromFlags,
    // Stub dispatcher: any call recorded here means the handler tried to scroll.
    dispatch: async (_device, command) => {
      recorded.push({ command });
      return { ok: true };
    },
  });

  assert.ok(response);
  assert.equal(response.ok, true);
  assert.equal(recorded.length, 0);
  // Narrow to the success variant before inspecting the payload.
  if (response.ok) {
    const payload = response.data;
    assert.equal(payload?.attempts, 0);
    assert.equal(payload?.alreadyVisible, true);
  }
});

src/daemon/handlers/interaction.ts

Lines changed: 194 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,13 @@
11
import { dispatchCommand, type CommandFlags } from '../../core/dispatch.ts';
22
import { isCommandSupportedOnDevice } from '../../core/capabilities.ts';
3-
import { attachRefs, centerOfRect, findNodeByRef, normalizeRef, type RawSnapshotNode } from '../../utils/snapshot.ts';
3+
import {
4+
attachRefs,
5+
centerOfRect,
6+
findNodeByRef,
7+
normalizeRef,
8+
type RawSnapshotNode,
9+
type Rect,
10+
} from '../../utils/snapshot.ts';
411
import type { DaemonCommandContext } from '../context.ts';
512
import type { DaemonRequest, DaemonResponse, SessionState } from '../types.ts';
613
import { SessionStore } from '../session-store.ts';
@@ -548,6 +555,94 @@ export async function handleInteractionCommands(params: {
548555
return { ok: true, data: { predicate, pass: true, selector: resolved.selector.raw } };
549556
}
550557

558+
if (command === 'scrollintoview') {
559+
const session = sessionStore.get(sessionName);
560+
if (!session) {
561+
return {
562+
ok: false,
563+
error: { code: 'SESSION_NOT_FOUND', message: 'No active session. Run open first.' },
564+
};
565+
}
566+
const targetInput = req.positionals?.[0] ?? '';
567+
if (!targetInput.startsWith('@')) {
568+
return null;
569+
}
570+
const invalidRefFlagsResponse = refSnapshotFlagGuardResponse('scrollintoview', req.flags);
571+
if (invalidRefFlagsResponse) return invalidRefFlagsResponse;
572+
if (!session.snapshot) {
573+
return { ok: false, error: { code: 'INVALID_ARGS', message: 'No snapshot in session. Run snapshot first.' } };
574+
}
575+
const ref = normalizeRef(targetInput);
576+
if (!ref) {
577+
return {
578+
ok: false,
579+
error: { code: 'INVALID_ARGS', message: 'scrollintoview requires a ref like @e2' },
580+
};
581+
}
582+
let node = findNodeByRef(session.snapshot.nodes, ref);
583+
if (!node?.rect && req.positionals && req.positionals.length > 1) {
584+
const fallbackLabel = req.positionals.slice(1).join(' ').trim();
585+
if (fallbackLabel.length > 0) {
586+
node = findNodeByLabel(session.snapshot.nodes, fallbackLabel);
587+
}
588+
}
589+
if (!node?.rect) {
590+
return {
591+
ok: false,
592+
error: { code: 'COMMAND_FAILED', message: `Ref ${targetInput} not found or has no bounds` },
593+
};
594+
}
595+
const viewportRect = resolveViewportRect(session.snapshot.nodes, node.rect);
596+
const plan = buildScrollIntoViewPlan(node.rect, viewportRect);
597+
const refLabel = resolveRefLabel(node, session.snapshot.nodes);
598+
const selectorChain = buildSelectorChainForNode(node, session.device.platform, { action: 'get' });
599+
if (!plan) {
600+
sessionStore.recordAction(session, {
601+
command,
602+
positionals: req.positionals ?? [],
603+
flags: req.flags ?? {},
604+
result: { ref, attempts: 0, alreadyVisible: true, strategy: 'ref-geometry', refLabel, selectorChain },
605+
});
606+
return { ok: true, data: { ref, attempts: 0, alreadyVisible: true, strategy: 'ref-geometry' } };
607+
}
608+
const data = await dispatch(
609+
session.device,
610+
'swipe',
611+
[String(plan.x), String(plan.startY), String(plan.x), String(plan.endY), '60'],
612+
req.flags?.out,
613+
{
614+
...contextFromFlags(req.flags, session.appBundleId, session.trace?.outPath),
615+
count: plan.count,
616+
pauseMs: 0,
617+
pattern: 'one-way',
618+
},
619+
);
620+
sessionStore.recordAction(session, {
621+
command,
622+
positionals: req.positionals ?? [],
623+
flags: req.flags ?? {},
624+
result: {
625+
...(data ?? {}),
626+
ref,
627+
attempts: plan.count,
628+
direction: plan.direction,
629+
strategy: 'ref-geometry',
630+
refLabel,
631+
selectorChain,
632+
},
633+
});
634+
return {
635+
ok: true,
636+
data: {
637+
...(data ?? {}),
638+
ref,
639+
attempts: plan.count,
640+
direction: plan.direction,
641+
strategy: 'ref-geometry',
642+
},
643+
};
644+
}
645+
551646
return null;
552647
}
553648

@@ -593,7 +688,7 @@ const REF_UNSUPPORTED_FLAG_MAP: ReadonlyArray<[keyof CommandFlags, string]> = [
593688
];
594689

595690
function refSnapshotFlagGuardResponse(
596-
command: 'press' | 'fill' | 'get',
691+
command: 'press' | 'fill' | 'get' | 'scrollintoview',
597692
flags: CommandFlags | undefined,
598693
): DaemonResponse | null {
599694
const unsupported = unsupportedRefSnapshotFlags(flags);
@@ -623,3 +718,100 @@ export function unsupportedRefSnapshotFlags(flags: CommandFlags | undefined): st
623718
}
624719
return unsupported;
625720
}
721+
722+
/**
 * Picks the rect to treat as the viewport when planning a scroll towards `targetRect`.
 *
 * Candidates are tried in this order, taking the largest-area rect of each group:
 *   1. application/window nodes whose rect contains the target's centre,
 *   2. any application/window node,
 *   3. any node whose rect contains the target's centre,
 *   4. the target rect itself when nothing else qualifies.
 *
 * @param nodes snapshot nodes; nodes without a finite rect are ignored.
 * @param targetRect bounds of the element that should be scrolled into view.
 * @returns the rect chosen as the viewport.
 */
function resolveViewportRect(nodes: RawSnapshotNode[], targetRect: Rect): Rect {
  const { x: centerX, y: centerY } = centerOfRect(targetRect);
  const coversTargetCenter = (rect: Rect): boolean => containsPoint(rect, centerX, centerY);

  // Single pass: collect every usable rect, and separately those owned by
  // application/window-like nodes (matched case-insensitively on the type name).
  const boundedRects: Rect[] = [];
  const viewportRects: Rect[] = [];
  for (const node of nodes) {
    if (!hasValidRect(node.rect)) continue;
    boundedRects.push(node.rect);
    const kind = (node.type ?? '').toLowerCase();
    if (kind.includes('application') || kind.includes('window')) {
      viewportRects.push(node.rect);
    }
  }

  return (
    pickLargestRect(viewportRects.filter(coversTargetCenter)) ??
    pickLargestRect(viewportRects) ??
    pickLargestRect(boundedRects.filter(coversTargetCenter)) ??
    targetRect
  );
}
749+
750+
/**
 * Computes the repeated vertical swipe needed to bring `targetRect` into the
 * middle band of `viewportRect`.
 *
 * The "safe band" is the central half of the viewport (25%..75% of its height).
 * When the target's vertical centre is already inside that band no scrolling is
 * required and `null` is returned.
 *
 * Otherwise the gesture runs along the viewport's horizontal centre between the
 * 78% and 22% height marks. Each swipe is assumed to move the content by 90% of
 * that travel, and the repeat count is the remaining distance divided by that
 * step, rounded up and clamped to 1..50.
 *
 * @param targetRect bounds of the element to reveal.
 * @param viewportRect bounds of the visible area.
 * @returns swipe coordinates, repeat count and scroll direction, or `null` when
 *          the target is already inside the safe band.
 */
function buildScrollIntoViewPlan(
  targetRect: Rect,
  viewportRect: Rect,
): { x: number; startY: number; endY: number; count: number; direction: 'up' | 'down' } | null {
  // Guard against a degenerate viewport so the ratios below stay meaningful.
  const height = Math.max(1, viewportRect.height);
  const top = viewportRect.y;
  const bottom = top + height;
  const bandTop = top + height * 0.25;
  const bandBottom = bottom - height * 0.25;
  const centerY = targetRect.y + targetRect.height / 2;

  const insideBand = centerY >= bandTop && centerY <= bandBottom;
  if (insideBand) {
    return null;
  }

  const x = Math.round(viewportRect.x + viewportRect.width / 2);
  const lowerY = Math.round(top + height * 0.78);
  const upperY = Math.round(top + height * 0.22);
  // Expected content movement per swipe: 90% of the finger travel, at least 1px.
  const stepPx = Math.max(1, Math.abs(lowerY - upperY) * 0.9);

  // Target below the band: drag the finger upwards (content scrolls down).
  // Any other case falls through to the opposite gesture.
  const scrollDown = centerY > bandBottom;
  const distance = scrollDown ? centerY - bandBottom : bandTop - centerY;
  const count = clampInt(Math.ceil(distance / stepPx), 1, 50);

  return scrollDown
    ? { x, startY: lowerY, endY: upperY, count, direction: 'down' }
    : { x, startY: upperY, endY: lowerY, count, direction: 'up' };
}
792+
793+
/**
 * Type guard: true when `rect` is present and all four of its components
 * (`x`, `y`, `width`, `height`) are finite numbers.
 */
function hasValidRect(rect: Rect | undefined): rect is Rect {
  if (!rect) return false;
  const components = [rect.x, rect.y, rect.width, rect.height];
  return components.every((component) => Number.isFinite(component));
}
797+
798+
/**
 * Reports whether the point (`x`, `y`) lies inside `rect`; points exactly on
 * any edge count as inside.
 */
function containsPoint(rect: Rect, x: number, y: number): boolean {
  const right = rect.x + rect.width;
  const bottom = rect.y + rect.height;
  const withinColumns = rect.x <= x && x <= right;
  const withinRows = rect.y <= y && y <= bottom;
  return withinColumns && withinRows;
}
801+
802+
/**
 * Returns the rect with the greatest `width * height`, or `null` when the list
 * is empty or no rect has an area above -1. On equal areas the earliest rect wins.
 */
function pickLargestRect(rects: Rect[]): Rect | null {
  const leader = rects.reduce<{ rect: Rect | null; area: number }>(
    (current, candidate) => {
      const candidateArea = candidate.width * candidate.height;
      // Strict comparison keeps the first rect on ties and skips NaN areas.
      return candidateArea > current.area ? { rect: candidate, area: candidateArea } : current;
    },
    { rect: null, area: -1 },
  );
  return leader.rect;
}
814+
815+
/**
 * Rounds `value` to the nearest integer and constrains the result to the
 * range `min`..`max`.
 */
function clampInt(value: number, min: number, max: number): number {
  const rounded = Math.round(value);
  const raisedToMin = Math.max(min, rounded);
  return Math.min(max, raisedToMin);
}

src/utils/command-schema.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -483,8 +483,10 @@ export const COMMAND_SCHEMAS: Record<string, CommandSchema> = {
483483
allowedFlags: [],
484484
},
485485
scrollintoview: {
486-
description: 'Scroll until text appears',
487-
positionalArgs: ['text'],
486+
usageOverride: 'scrollintoview <text|@ref>',
487+
description: 'Scroll until text appears or a snapshot ref is brought into view',
488+
positionalArgs: ['target'],
489+
allowsExtraPositionals: true,
488490
allowedFlags: [],
489491
},
490492
pinch: {

website/docs/docs/commands.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@ agent-device swipe 540 1500 540 500 120
5656
agent-device swipe 540 1500 540 500 120 --count 8 --pause-ms 30 --pattern ping-pong
5757
agent-device longpress 300 500 800
5858
agent-device scroll down 0.5
59+
agent-device scrollintoview "Sign in"
60+
agent-device scrollintoview @e42
5961
agent-device pinch 2.0 # zoom in 2x (iOS simulator)
6062
agent-device pinch 0.5 200 400 # zoom out at coordinates (iOS simulator)
6163
```
@@ -64,6 +66,7 @@ agent-device pinch 0.5 200 400 # zoom out at coordinates (iOS simulator)
6466
On Android, `fill` also verifies text and performs one clear-and-retry pass on mismatch.
6567
`swipe` accepts an optional `durationMs` argument (default `250ms`, range `16..10000`).
6668
On iOS, swipe timing uses a safe normalized duration to avoid longpress side effects.
69+
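`scrollintoview` accepts plain text or a snapshot ref (`@eN`); ref mode requires an existing snapshot and uses geometry-based scrolling, computing the swipes from the ref's bounds relative to the viewport.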
6770
`longpress` is supported on iOS and Android.
6871
`pinch` is iOS simulator-only.
6972

0 commit comments

Comments
 (0)