Skip to content

Commit d8440a6

Browse files
authored
fix: make fill clear input and progressively fill the input (#17)
* fix: make fill clear input and progressively fill the input * fixup ios * improve text fill reliability
1 parent 22bfcee commit d8440a6

11 files changed

Lines changed: 257 additions & 66 deletions

File tree

AGENTS.md

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,19 +26,16 @@ Instructions for AI coding agents working with this codebase.
2626
2) Daemon (`src/daemon.ts`) resolves device, tracks session state, and calls `dispatchCommand`.
2727
3) `dispatchCommand` selects platform interactor.
2828
4) iOS simulator path:
29-
- Prefer `simctl` input when available (`simctlSupportsInput`).
30-
- Snapshot default backend: AX (`snapshotAx`), fallback to iOS runner if AX is unavailable.
31-
- Fallback to iOS runner via `runIosRunnerCommand` for tap/type/swipe/list.
29+
- Snapshot default backend: XCTest for everything, fallback to AX if XCTest returns 0 nodes.
3230
5) iOS runner uses xcodebuild `test-without-building` with an injected `.xctestrun`,
3331
starts an `NWListener` HTTP server inside the UI test bundle, and executes UI actions.
3432

3533
## Key commands (local)
3634

37-
- Build iOS runner: `xcodebuild build-for-testing -project ios-runner/AgentDeviceRunner/AgentDeviceRunner.xcodeproj -scheme AgentDeviceRunner -destination "platform=iOS Simulator,id=<UDID>" -derivedDataPath ~/.agent-device/ios-runner/derived`
38-
- Build AX snapshot tool: `swift build -c release` in `ios-runner/AXSnapshot`
35+
- Build iOS runner: `pnpm build:xcuitest`
36+
- Build AX snapshot tool: `pnpm build:axsnapshot`
3937
- Build everything: `pnpm build:all`
40-
- Run command: `node bin/agent-device.mjs --platform ios --udid <UDID> open settings --verbose`
41-
- Scroll: `node bin/agent-device.mjs --platform ios --udid <UDID> scroll down 0.5 --verbose`
38+
- Run command: `pnpm ad`
4239

4340
## Run and Test
4441

README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,13 @@ Find (semantic):
121121
- `find text|label|value|role|id <value> <action> [value]` for specific locators.
122122
- Actions: `click` (default), `fill`, `type`, `focus`, `get text`, `get attrs`, `wait [timeout]`, `exists`.
123123

124+
Android fill reliability:
125+
- `fill` clears the current value, then enters text.
126+
- `type` enters text into the focused field without clearing.
127+
- `fill` now verifies the entered value on Android.
128+
- If the entered value does not match, agent-device clears the field and retries once with slower typing.
129+
- This reduces IME-related character swaps on long strings (e.g. emails and IDs).
130+
124131
Settings helpers (simulators):
125132
- `settings wifi on|off`
126133
- `settings airplane on|off`

ios-runner/AgentDeviceRunner/AgentDeviceRunnerUITests/RunnerTests.swift

Lines changed: 57 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -81,9 +81,7 @@ final class RunnerTests: XCTestCase {
8181
return
8282
}
8383
NSLog("AGENT_DEVICE_RUNNER_WAITING")
84-
let timeout = resolveRunnerTimeout()
85-
let effectiveTimeout = timeout > 0 ? timeout : 24 * 60 * 60
86-
let result = XCTWaiter.wait(for: [expectation], timeout: effectiveTimeout)
84+
let result = XCTWaiter.wait(for: [expectation], timeout: 24 * 60 * 60)
8785
NSLog("AGENT_DEVICE_RUNNER_WAIT_RESULT=%@", String(describing: result))
8886
if result != .completed {
8987
XCTFail("runner wait ended with \(result)")
@@ -238,7 +236,19 @@ final class RunnerTests: XCTestCase {
238236
guard let text = command.text else {
239237
return Response(ok: false, error: ErrorPayload(message: "type requires text"))
240238
}
241-
activeApp.typeText(text)
239+
if command.clearFirst == true {
240+
guard let focused = focusedTextInput(app: activeApp) else {
241+
return Response(ok: false, error: ErrorPayload(message: "no focused text input to clear"))
242+
}
243+
clearTextInput(focused)
244+
focused.typeText(text)
245+
return Response(ok: true, data: DataPayload(message: "typed"))
246+
}
247+
if let focused = focusedTextInput(app: activeApp) {
248+
focused.typeText(text)
249+
} else {
250+
activeApp.typeText(text)
251+
}
242252
return Response(ok: true, data: DataPayload(message: "typed"))
243253
case .swipe:
244254
guard let direction = command.direction else {
@@ -343,6 +353,48 @@ final class RunnerTests: XCTestCase {
343353
return element.exists ? element : nil
344354
}
345355

356+
/// Empties a text input by tapping it via `moveCaretToEnd(element:)` and then typing
/// a run of delete keystrokes into it.
///
/// The number of deletes comes from `estimatedDeleteCount(for:)`, which is evaluated
/// only after the tap has been performed.
///
/// - Parameter element: The text input whose contents should be removed.
private func clearTextInput(_ element: XCUIElement) {
    // Tap first so the delete count is read from the element after it has been tapped.
    moveCaretToEnd(element: element)
    let backspaces = String(
        repeating: XCUIKeyboardKey.delete.rawValue,
        count: estimatedDeleteCount(for: element)
    )
    // All deletes are sent as a single typeText call.
    element.typeText(backspaces)
}
362+
363+
/// Looks up the element that reports keyboard focus and returns it only when it is a
/// text-entry control.
///
/// - Parameter app: Application whose descendants are searched.
/// - Returns: The first descendant matching `hasKeyboardFocus == 1` when it exists and its
///   element type is a text field, secure text field, search field, or text view;
///   `nil` in every other case.
private func focusedTextInput(app: XCUIApplication) -> XCUIElement? {
    let focusPredicate = NSPredicate(format: "hasKeyboardFocus == 1")
    let candidate = app.descendants(matching: .any).matching(focusPredicate).firstMatch
    if !candidate.exists {
        return nil
    }

    // Only these element types are treated as text inputs.
    let textInputTypes: [XCUIElement.ElementType] = [
        .textField, .secureTextField, .searchField, .textView,
    ]
    return textInputTypes.contains(candidate.elementType) ? candidate : nil
}
377+
378+
/// Taps near the right-hand edge of `element`, vertically centred.
///
/// When the element's frame is empty the element itself is tapped instead. Otherwise the
/// tap lands at an offset from the element's top-left corner: 4 points short of the full
/// width horizontally and half the height vertically, each at least 2 points.
/// NOTE(review): presumably this places the caret after the last character for
/// left-to-right, single-line fields; confirm for multi-line or right-to-left inputs.
///
/// - Parameter element: The text input to tap.
private func moveCaretToEnd(element: XCUIElement) {
    let bounds = element.frame
    if bounds.isEmpty {
        // No usable geometry: fall back to the element's default tap.
        element.tap()
        return
    }

    let horizontalOffset = max(2, bounds.width - 4)
    let verticalOffset = max(2, bounds.height / 2)
    element
        .coordinate(withNormalizedOffset: CGVector(dx: 0, dy: 0))
        .withOffset(CGVector(dx: horizontalOffset, dy: verticalOffset))
        .tap()
}
390+
391+
/// Estimates how many delete keystrokes to send in order to clear `element`.
///
/// The element's `value` (or an empty string when it is `nil`) is turned into text with
/// `String(describing:)`, trimmed of surrounding whitespace and newlines, and its
/// character count is padded by 8. The result is clamped to the range 24...120.
/// NOTE(review): because of the upper bound, a value longer than 112 characters
/// receives only 120 deletes; confirm that this cap is intended.
///
/// - Parameter element: The text input whose current value is inspected.
/// - Returns: A delete count between 24 and 120 inclusive.
private func estimatedDeleteCount(for element: XCUIElement) -> Int {
    let rawValue: Any = element.value ?? ""
    let trimmed = String(describing: rawValue)
        .trimmingCharacters(in: .whitespacesAndNewlines)
    // An empty value pads to 8, which the lower bound raises to 24.
    let padded = trimmed.count + 8
    return min(120, max(24, padded))
}
397+
346398
private func findScopeElement(app: XCUIApplication, scope: String) -> XCUIElement? {
347399
let predicate = NSPredicate(
348400
format: "label CONTAINS[c] %@ OR identifier CONTAINS[c] %@",
@@ -738,14 +790,6 @@ private func resolveRunnerPort() -> UInt16 {
738790
return 0
739791
}
740792

741-
private func resolveRunnerTimeout() -> TimeInterval {
742-
if let env = ProcessInfo.processInfo.environment["AGENT_DEVICE_RUNNER_TIMEOUT"],
743-
let parsed = Double(env) {
744-
return parsed
745-
}
746-
return 0
747-
}
748-
749793
enum CommandType: String, Codable {
750794
case tap
751795
case type
@@ -772,6 +816,7 @@ struct Command: Codable {
772816
let command: CommandType
773817
let appBundleId: String?
774818
let text: String?
819+
let clearFirst: Bool?
775820
let action: String?
776821
let x: Double?
777822
let y: Double?

skills/agent-device/SKILL.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,8 +97,8 @@ agent-device apps --metadata --platform android
9797
```bash
9898
agent-device click @e1
9999
agent-device focus @e2
100-
agent-device fill @e2 "text" # Tap then type
101-
agent-device type "text" # Type into focused field
100+
agent-device fill @e2 "text" # Clear then type (Android: verifies value and retries once on mismatch)
101+
agent-device type "text" # Type into focused field without clearing
102102
agent-device press 300 500 # Tap by coordinates
103103
agent-device long-press 300 500 800 # Long press (where supported)
104104
agent-device scroll down 0.5
@@ -150,6 +150,9 @@ agent-device apps --platform android --user-installed
150150
- If AX returns the Simulator window or empty tree, restart Simulator or use `--backend xctest`.
151151
- Use `--session <name>` for parallel sessions; avoid device contention.
152152
- Use `--activity <component>` on Android to launch a specific activity (e.g. TV apps with LEANBACK).
153+
- Use `fill` when you want clear-then-type semantics.
154+
- Use `type` when you want to append/enter text without clearing.
155+
- On Android, prefer `fill` for important fields; it verifies entered text and retries once when IME reorders characters.
153156

154157
## References
155158

src/core/dispatch.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ export async function dispatchCommand(
174174
);
175175
await runIosRunnerCommand(
176176
device,
177-
{ command: 'type', text, appBundleId: context?.appBundleId },
177+
{ command: 'type', text, clearFirst: true, appBundleId: context?.appBundleId },
178178
{ verbose: context?.verbose, logPath: context?.logPath, traceLogPath: context?.traceLogPath },
179179
);
180180
} else {

src/daemon.ts

Lines changed: 29 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1036,32 +1036,17 @@ async function handleRequest(req: DaemonRequest): Promise<DaemonResponse> {
10361036
if (!node?.rect) {
10371037
return { ok: false, error: { code: 'COMMAND_FAILED', message: `Ref ${req.positionals[0]} not found or has no bounds` } };
10381038
}
1039-
const refLabel = resolveRefLabel(node, session.snapshot.nodes);
1040-
const label = node.label?.trim();
1041-
if (session.device.platform === 'ios' && session.device.kind === 'simulator' && isTextInputType(node.type)) {
1042-
const coords = node.rect ? centerOfRect(node.rect) : null;
1043-
if (!coords) {
1044-
return {
1045-
ok: false,
1046-
error: { code: 'COMMAND_FAILED', message: `Ref ${req.positionals[0]} not found or has no bounds` },
1047-
};
1048-
}
1049-
await dispatchCommand(session.device, 'focus', [String(coords.x), String(coords.y)], req.flags?.out, {
1050-
...contextFromFlags(req.flags, session.appBundleId, session.trace?.outPath),
1051-
});
1052-
await runIosRunnerCommand(
1053-
session.device,
1054-
{ command: 'type', text, appBundleId: session.appBundleId },
1055-
{ verbose: req.flags?.verbose, logPath, traceLogPath: session?.trace?.outPath },
1056-
);
1057-
recordAction(session, {
1058-
command,
1059-
positionals: req.positionals ?? [],
1060-
flags: req.flags ?? {},
1061-
result: { ref, refLabel: refLabel ?? label, action: 'fill', text },
1062-
});
1063-
return { ok: true, data: { ref } };
1039+
const nodeType = node.type ?? '';
1040+
if (nodeType && !isFillableType(nodeType, session.device.platform)) {
1041+
return {
1042+
ok: false,
1043+
error: {
1044+
code: 'INVALID_ARGS',
1045+
message: `fill requires a text input element, got "${nodeType}" for ${req.positionals[0]}. Select a text input ref or use click/focus + type.`,
1046+
},
1047+
};
10641048
}
1049+
const refLabel = resolveRefLabel(node, session.snapshot.nodes);
10651050
const { x, y } = centerOfRect(node.rect);
10661051
const data = await dispatchCommand(
10671052
session.device,
@@ -1649,16 +1634,6 @@ function isLabelUnique(nodes: SnapshotState['nodes'], label: string): boolean {
16491634
return count === 1;
16501635
}
16511636

1652-
function isTextInputType(type: string | undefined): boolean {
1653-
const normalized = normalizeType(type ?? '');
1654-
return (
1655-
normalized === 'textfield' ||
1656-
normalized === 'textview' ||
1657-
normalized === 'searchfield' ||
1658-
normalized === 'textarea'
1659-
);
1660-
}
1661-
16621637
function pruneGroupNodes(nodes: RawSnapshotNode[]): RawSnapshotNode[] {
16631638
const skippedDepths: number[] = [];
16641639
const result: RawSnapshotNode[] = [];
@@ -1690,6 +1665,25 @@ function normalizeType(type: string): string {
16901665
return value;
16911666
}
16921667

1668+
/**
 * Decides whether a snapshot node type can be the target of `fill`.
 *
 * The type is passed through `normalizeType` first. An empty normalized type is
 * accepted. On Android the normalized type must contain `edittext` or
 * `autocompletetextview`; on any other platform it must contain one of the text-input
 * fragments listed below or be exactly `search`.
 *
 * @param type Raw node type from the snapshot.
 * @param platform Platform of the device the node came from.
 * @returns `true` when the type is accepted as fillable.
 */
function isFillableType(type: string, platform: 'ios' | 'android'): boolean {
  const normalized = normalizeType(type);
  if (normalized === '') return true;

  const containsAny = (fragments: string[]): boolean =>
    fragments.some((fragment) => normalized.includes(fragment));

  if (platform === 'android') {
    return containsAny(['edittext', 'autocompletetextview']);
  }

  return (
    containsAny(['textfield', 'securetextfield', 'searchfield', 'textview', 'textarea']) ||
    normalized === 'search'
  );
}
1686+
16931687
function findNearestHittableAncestor(
16941688
nodes: SnapshotState['nodes'],
16951689
node: SnapshotState['nodes'][number],

0 commit comments

Comments
 (0)