Skip to content

Commit 55f805a

Browse files
committed
test: cover device workflow edge cases
1 parent a40167b commit 55f805a

6 files changed

Lines changed: 132 additions & 21 deletions

File tree

src/__tests__/cli-help.test.ts

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,23 @@ test('help workflow prints agent workflow topic and skips daemon dispatch', asyn
6161
assert.match(result.stdout, /Do not use CSS selectors/);
6262
});
6363

64+
test('help workflow preserves known device workaround guidance', async () => {
65+
const result = await runCliCapture(['help', 'workflow']);
66+
assert.equal(result.code, 0);
67+
assert.equal(result.calls.length, 0);
68+
assert.match(result.stdout, /disabled\/hittable:false/);
69+
assert.match(result.stdout, /snapshot -i -c --json/);
70+
assert.match(result.stdout, /@Label_Name/);
71+
assert.match(result.stdout, /press @e12/);
72+
assert.match(result.stdout, /Snapshot legend:/);
73+
assert.match(result.stdout, /preview="Leave at side\.\.\." truncated/);
74+
assert.match(result.stdout, /wait text/);
75+
assert.match(result.stdout, /Never use args/);
76+
assert.match(result.stdout, /Never use args, step/);
77+
assert.match(result.stdout, /scrollintoview/);
78+
assert.match(result.stdout, /--delay-ms/);
79+
});
80+
6481
test('help unknown command prints error plus global usage and skips daemon dispatch', async () => {
6582
const result = await runCliCapture(['help', 'not-a-command']);
6683
assert.equal(result.code, 1);

src/utils/__tests__/args.test.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -789,7 +789,7 @@ test('usage includes agent workflows, config, environment, and examples footers'
789789
assert.match(usageText, /Default loop: devices\/apps -> open -> snapshot -i/);
790790
assert.match(usageText, /Use selectors or refs as positional targets/);
791791
assert.match(usageText, /Plain snapshot reads state; snapshot -i is required/);
792-
assert.match(usageText, /Truncated text\/input preview: expand first with snapshot -s @ref/);
792+
assert.match(usageText, /Truncated text\/input preview: expand first with snapshot -s @e12/);
793793
assert.match(usageText, /RN warning\/error overlays can block taps: snapshot -i/);
794794
assert.match(usageText, /Expo Go\/dev clients need their provided exp:\/\//);
795795
assert.match(usageText, /fill 'id="field-email"' "qa@example\.com" replaces/);
@@ -834,6 +834,8 @@ test('usageForCommand resolves workflow help topic', () => {
834834
assert.match(help, /agent-device help workflow/);
835835
assert.match(help, /Use selectors as positional targets/);
836836
assert.match(help, /Do not use CSS selectors/);
837+
assert.match(help, /Snapshot legend:/);
838+
assert.match(help, /@e12 \[button\] label="Add to cart"/);
837839
assert.match(help, /Truncated text\/input previews: do not use get text first/);
838840
assert.match(help, /snapshot -s @e7/);
839841
assert.match(help, /Read-only visible\/state question: use snapshot\/get\/is\/find/);
@@ -860,7 +862,7 @@ test('workflow help keeps common copyable command forms', () => {
860862
assert.match(help, /connect --remote-config/);
861863
assert.match(help, /metro reload/);
862864
assert.match(help, /screenshot --overlay-refs/);
863-
assert.match(help, /snapshot -s @ref/);
865+
assert.match(help, /snapshot -s @e7/);
864866
});
865867

866868
test('usageForCommand resolves remote help topic', () => {

src/utils/command-schema.ts

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -175,14 +175,16 @@ const AGENT_WORKFLOWS = [
175175

176176
const AGENT_QUICKSTART_LINES = [
177177
'Default loop: devices/apps -> open -> snapshot -i -> press/fill/get/is/wait/find -> verify -> close.',
178-
'Use selectors or refs as positional targets: id="submit", label="Allow", or @ref after snapshot -i.',
178+
'Use selectors or refs as positional targets: id="submit", label="Allow", or @e12 from snapshot -i.',
179179
'Plain snapshot reads state; snapshot -i is required to refresh interactive refs.',
180180
'Read-only visible/state question: use snapshot/get/is/find; use snapshot -i only when refs are needed.',
181-
'Truncated text/input preview: expand first with snapshot -s @ref, not get text.',
181+
'Truncated text/input preview: expand first with snapshot -s @e12, not get text.',
182182
'RN warning/error overlays can block taps: snapshot -i, dismiss/close, then diff snapshot -i.',
183183
'Expo Go/dev clients need their provided exp:// or dev-client URL; do not invent app ids.',
184184
'Text: fill \'id="field-email"\' "qa@example.com" replaces; type appends after press.',
185185
'After mutation: diff snapshot -i. Off-screen hints: scroll, then snapshot -i.',
186+
'Raw coordinates are fallback-only: use snapshot -i -c --json rects when iOS refs no-op or child refs are missing.',
187+
'Batch JSON steps use "command", "positionals", "flags"; never "args" or "step".',
186188
'Navigation: app-owned back uses back; system back uses back --system.',
187189
'Verification commands must name the expected text/selector; bare screenshots/snapshots are not enough.',
188190
'Debug evidence: logs clear/mark/path; trace start ./path; trace stop ./path; network dump --include headers.',
@@ -235,9 +237,8 @@ Command shape:
235237
Put subcommand first, then positionals, then flags:
236238
agent-device open com.example.app --session checkout --platform android --relaunch
237239
agent-device record start ./checkout.mp4 --session checkout
238-
Unknown current ref placeholder: @ref. Use provided labels/ids/selectors when known. Never invent @e#.
239-
After snapshot -i, use @ref in plans when the exact @e number is unknown.
240-
If a task explicitly says to act by @ref, output press @ref or click @ref after refreshing refs.
240+
Snapshot refs look like @e12. After snapshot -i, use the exact @eN ref from that output.
241+
If the exact ref is not known yet, first output snapshot -i, then use a concrete example shape like press @e12 in the next command; do not write @<ref>, @ref, @Label_Name, or @eN placeholders.
241242
Close means agent-device close. App-owned back means back; system back means back --system.
242243
Taps are press or click. Gestures are direct commands: swipe, longpress, pinch.
243244
@@ -249,14 +250,21 @@ Bootstrap:
249250
agent-device install com.example.app ./dist/app.apk --platform android
250251
agent-device reinstall com.example.app ./build/MyApp.app --platform ios
251252
agent-device install-from-source --github-actions-artifact org/repo:app-debug --platform android
252-
If app id is unknown, plan devices, apps, then open <discovered-app-id>. Install arguments are app/package id then artifact path. Fresh install state: open with --relaunch.
253+
agent-device open com.example.app --platform android --relaunch
254+
If app id is unknown, plan devices, apps, then open <discovered-app-id>. Install arguments are app/package id then artifact path. After install, install-from-source, or reinstall, open the installed id with --relaunch for fresh runtime state.
253255
Do not open artifact paths or invent package ids. If apps lookup misses the target and no URL/artifact is provided, ask or stop.
254256
255257
Snapshots and refs:
256258
snapshot reads visible state. snapshot -i gets current interactive refs.
259+
Snapshot legend:
260+
@e12 [button] label="Add to cart" id="add-cart" enabled hittable -> press @e12 or press 'id="add-cart"'.
261+
@e13 [textinput] label="Notes" preview="Leave at side..." truncated -> snapshot -s @e13 before reading.
262+
[off-screen below] 4 items: "Privacy", "About" -> scroll down, then snapshot -i; those are hints, not refs.
257263
Re-snapshot after navigation, submit, modal/list/reload/dynamic changes.
258264
Off-screen summaries are scroll hints; use scroll, not swipe, then snapshot -i.
265+
Missing target in a long list: use a short manual scroll + snapshot loop with a max attempt count; do not rely on unbounded scrollintoview.
259266
Truncated text/input previews: do not use get text first; expand with snapshot -s on the exact ref (for example snapshot -s @e7), then read the scoped output.
267+
Rare iOS accessibility gaps: if a row ref is shown disabled/hittable:false and pressing that ref reports success but the UI does not change, or a horizontal tab/filter bar is collapsed into one composite/seekbar with no child refs, run agent-device snapshot -i -c --json to read rects, compute the target center, press x y, then diff snapshot -i. Coordinates are fallback-only; document why you used them.
260268
261269
Selectors:
262270
Use selectors as positional targets: id="field-email" or label="Allow".
@@ -273,20 +281,24 @@ Text entry:
273281
agent-device press 'id="product-note"'
274282
agent-device type "Handle with care" --delay-ms 80
275283
Debounced field with no result selector: agent-device wait 1000. Keyboard read-only: keyboard status/get. Blocked control: keyboard dismiss.
284+
Search-as-you-type fields on iOS can drop characters when driven too fast; use --delay-ms on fill/type before trying clipboard paste.
276285
277286
Read-only and waits:
278287
Read-only visible/state question: use snapshot/get/is/find.
279288
agent-device snapshot
280289
agent-device get text 'id="product-title"'
281290
agent-device get attrs @e4
282291
agent-device is visible 'label="Online"'
283-
agent-device wait visible 'label="Refreshing metrics..."' 3000
292+
agent-device wait text "Refreshing metrics..." 3000
293+
agent-device wait 'label="Ready"' 3000
284294
agent-device find "Increment" press --json
295+
For async/list text presence, prefer wait text over is visible when no interaction is needed.
285296
Use snapshot -i only when refs are needed for an action or targeted query.
286297
Ambiguous find: add --first or --last. If info is not visible/exposed, report that gap instead of typing/searching/navigating to reveal it.
287298
288299
Navigation and gestures:
289300
Use scroll for lists; swipe for coordinate gestures/carousels.
301+
If app-owned back is ambiguous or has just misrouted, prefer a visible nav/back button ref, tab-bar ref, or deep link over repeated back/system back.
290302
Keep count/pause/pattern on one swipe; flags are --count, --pause-ms, --pattern ping-pong.
291303
longpress duration and pinch scale/center are positional:
292304
agent-device longpress 300 500 800
@@ -302,6 +314,9 @@ Validation and evidence:
302314
Startup/CPU/memory: perf --json or metrics. Replay maintenance: replay -u ./flow.ad.
303315
Recording: record start/stop. Tracing: trace start ./trace.log, trace stop ./trace.log. Paths are positional.
304316
Stable known flow: batch ./steps.json, not workflow batch.
317+
Inline batch JSON example:
318+
agent-device batch --steps '[{"command":"open","positionals":["settings"],"flags":{}},{"command":"wait","positionals":["100"],"flags":{}}]'
319+
Batch step keys are command, positionals, flags, and optional runtime. Never use args, step, text, or target as batch step fields.
305320
Android animations: settings animations off/on, not animations disable/restore.
306321
Network headers: network dump --include headers.
307322
Remote config: connect --remote-config ./remote-config.json, open, snapshot, disconnect.

test/skillgym/README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ The included suite focuses on the first two layers so it stays stable and CI-saf
1616

1717
- `../../examples/test-app/`: minimal Expo SDK 55 fixture app for broad UI coverage
1818
- `skillgym.config.ts`: starter config that runs Codex and Claude Haiku against this repo
19-
- `suites/agent-device-smoke-suite.ts`: 48-case suite for skill routing, fixture-aware planning, and skill-guidance regressions
19+
- `suites/agent-device-smoke-suite.ts`: 64-case suite for skill routing, fixture-aware planning, and skill-guidance regressions
2020

2121
## Current coverage
2222

@@ -35,12 +35,12 @@ Fixture smoke cases cover concrete app surfaces:
3535
Skill-guidance regression cases cover distinct command-planning habits:
3636

3737
- read-only inspection versus mutation
38-
- fresh `@ref` targeting, durable selectors, and off-screen scroll recovery
38+
- fresh `@ref` targeting, durable selectors, raw-rect fallbacks, and off-screen scroll recovery
3939
- text replacement, append semantics, keyboard status, and keyboard dismiss
40-
- install/open setup, app discovery, session scoping, and in-app back navigation
40+
- install/open setup, app discovery, session scoping, and app-owned navigation fallbacks
4141
- Metro reload, logs, network dump, alert fallback, and screenshot evidence
4242
- performance metrics, React DevTools profiling, gestures, settings, and trace capture
43-
- remote config, macOS menu bar surfaces, replay update, and batch during recording
43+
- remote config, macOS menu bar surfaces, replay update, batch step schema, and batch during recording
4444

4545
`assertAgentDeviceEvidence` is intentionally soft when a runner does not expose skill-detection telemetry. When telemetry exists, the suite asserts that `agent-device` was loaded; when it is absent, the cases still judge command-planning output instead of failing on missing runner metadata.
4646

test/skillgym/skillgym.config.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ const config: SkillGymConfig = {
1313
cwd: '../..',
1414
outputDir: './.skillgym-results',
1515
reporter: 'standard',
16-
schedule: 'parallel',
16+
schedule: 'isolated-by-runner',
1717
},
1818
defaults: {
1919
timeoutMs: 600_000,

test/skillgym/suites/agent-device-smoke-suite.ts

Lines changed: 84 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ function assertExpectedOutput(report: SessionReport, matchers: Array<string | Re
114114
const RAW_COORDINATE_TARGET =
115115
/(?:^|\n)(?:agent-device\s+)?(?:click|fill|press)\s+-?\d+(?:\.\d+)?\s+-?\d+(?:\.\d+)?/i;
116116
const PSEUDO_ASSERTION_COMMAND = /(?:^|\n)\s*(?:assert|assertVisible|waitFor|waitForText)\b/i;
117+
const COMPACT_RECT_SNAPSHOT = /snapshot\b(?=[^\n]*(?:-c\b|--compact\b))(?=[^\n]*(?:--json|--raw))/i;
117118

118119
function makeCase(options: {
119120
id: string;
@@ -419,17 +420,34 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [
419420
contract: [
420421
'App name: Agent Device Tester',
421422
'Current screen: Home tab',
422-
'Control label: Lab online',
423-
'The current @ref is unknown until a fresh interactive snapshot is captured',
423+
'The target control has no stable selector in the task context',
424+
'Fresh interactive snapshot will expose the target as @e12',
424425
],
425-
task: 'Plan the commands to capture fresh interactive refs, press the Lab online control by @ref, then verify the nearby change with diff snapshot -i.',
426+
task: 'Plan the commands to capture fresh interactive refs, press the target control with its @e12 ref, then verify the nearby change with diff snapshot -i.',
426427
outputs: [
427428
/snapshot -i/i,
428-
/(?:^|\n)(?:agent-device\s+)?(?:click|press)\s+@(?:e\d+|ref)\b/i,
429+
/(?:^|\n)(?:agent-device\s+)?(?:click|press)\s+@e12\b/i,
429430
/(?:diff snapshot -i|snapshot\b.*(?:-i\b.*--diff|--diff\b.*-i\b))/i,
430431
],
431432
forbiddenOutputs: [RAW_COORDINATE_TARGET, /\btestID=/i],
432433
}),
434+
makeCase({
435+
id: 'ios-disabled-row-raw-rect-fallback',
436+
contract: [
437+
'Platform: iOS simulator',
438+
'Current screen: Orders list',
439+
'Fresh interactive snapshot shows @e12 [disabled] hittable:false label "Order #1042"',
440+
'press @e12 already returned success, but diff snapshot showed no navigation',
441+
'Compact raw JSON rect center for @e12 is x=196 y=318',
442+
],
443+
task: 'Plan the fallback commands to inspect raw compact snapshot rects, press the row center, then verify the nearby change.',
444+
outputs: [
445+
COMPACT_RECT_SNAPSHOT,
446+
/(?:^|\n)(?:agent-device\s+)?(?:press|click)\s+196\s+318\b/i,
447+
/(?:diff snapshot|snapshot\b.*--diff)/i,
448+
],
449+
forbiddenOutputs: [/(?:^|\n)(?:agent-device\s+)?(?:press|click)\s+@e12\b/i, /scrollintoview/i],
450+
}),
433451
makeCase({
434452
id: 'truncated-text-input-scope-ref',
435453
contract: [
@@ -505,6 +523,39 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [
505523
/(?:^|\n)(?:agent-device\s+)?(?:click|press)\s+@(?:e\d+|ref)/i,
506524
],
507525
}),
526+
makeCase({
527+
id: 'ios-composite-horizontal-tabs-coordinate-fallback',
528+
contract: [
529+
'Platform: iOS simulator',
530+
'Current screen: Catalog filters',
531+
'Horizontal filter tabs are collapsed into one [seekbar] in snapshot -i',
532+
'The individual Bakery tab has no @ref or selector on iOS',
533+
'Compact raw JSON plus visual inspection gives Bakery center x=84 y=220',
534+
],
535+
task: 'Plan commands to handle the missing child refs by inspecting raw compact rects, tapping the Bakery center, and verifying the selected filter changed.',
536+
outputs: [
537+
COMPACT_RECT_SNAPSHOT,
538+
/(?:^|\n)(?:agent-device\s+)?(?:press|click)\s+84\s+220\b/i,
539+
/(?:diff snapshot -i|snapshot\b.*(?:-i\b.*--diff|--diff\b.*-i\b)|snapshot -i|Berry Tart|Bakery)/i,
540+
],
541+
forbiddenOutputs: [
542+
/(?:^|\n)(?:agent-device\s+)?(?:press|click)\s+@(?:e\d+|ref)\b/i,
543+
/scrollintoview/i,
544+
],
545+
}),
546+
makeCase({
547+
id: 'list-text-presence-prefers-wait-text',
548+
contract: [
549+
'App name: Agent Device Tester',
550+
'Current screen: Catalog tab',
551+
'List content loads asynchronously',
552+
'Expected list item text: Trip ideas',
553+
'No interaction is needed; this is a presence check for visible text in a list',
554+
],
555+
task: 'Plan the robust read-only command to wait for the Trip ideas list text to appear.',
556+
outputs: [commandPattern('wait'), /Trip ideas/i],
557+
forbiddenOutputs: [commandPattern('is visible'), /snapshot -i/i, commandPattern('press')],
558+
}),
508559
makeCase({
509560
id: 'navigation-back-in-app',
510561
contract: [
@@ -516,6 +567,19 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [
516567
outputs: [commandPattern('back')],
517568
forbiddenOutputs: [/back\s+--system/i],
518569
}),
570+
makeCase({
571+
id: 'navigation-back-ambiguous-use-visible-nav',
572+
contract: [
573+
'App name: Agent Device Tester',
574+
'Current screen: nested product detail',
575+
'Previous back command navigated to an unexpected screen',
576+
'Visible app nav button selector: id="nav-back"',
577+
'Goal: return one level inside the app, not trigger system back',
578+
],
579+
task: 'Plan the next navigation command using the visible app-owned control instead of retrying back.',
580+
outputs: [/nav-back/i, commandAlternativesPattern(['press', 'click'])],
581+
forbiddenOutputs: [commandPattern('back'), /back\s+--system/i],
582+
}),
519583
makeCase({
520584
id: 'setup-unknown-app-discover-first',
521585
contract: [
@@ -930,9 +994,9 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [
930994
],
931995
task: 'Plan a self-contained remote script that opens the app, captures a snapshot, and disconnects using the remote config on every command.',
932996
outputs: [
933-
/open\b[^\n]*--remote-config\s+\.\/remote-config\.json/i,
934-
/snapshot\b[^\n]*--remote-config\s+\.\/remote-config\.json/i,
935-
/disconnect\b[^\n]*--remote-config\s+\.\/remote-config\.json/i,
997+
/(?:--remote-config\s+\.\/remote-config\.json[^\n]*open|open\b[^\n]*--remote-config\s+\.\/remote-config\.json)/i,
998+
/(?:--remote-config\s+\.\/remote-config\.json[^\n]*snapshot|snapshot\b[^\n]*--remote-config\s+\.\/remote-config\.json)/i,
999+
/(?:--remote-config\s+\.\/remote-config\.json[^\n]*disconnect|disconnect\b[^\n]*--remote-config\s+\.\/remote-config\.json)/i,
9361000
],
9371001
forbiddenOutputs: [/--daemon-base-url/i, /--tenant/i, /--run-id/i],
9381002
}),
@@ -993,6 +1057,19 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [
9931057
],
9941058
forbiddenOutputs: [PSEUDO_ASSERTION_COMMAND, /workflow batch/i, commandPattern('trace')],
9951059
}),
1060+
makeCase({
1061+
id: 'batch-inline-step-schema-positionals',
1062+
contract: [
1063+
'Need one inline batch command',
1064+
'Step 1: open settings',
1065+
'Step 2: wait 100 ms',
1066+
'Batch step schema supports command, positionals, flags, and runtime',
1067+
'The args field is invalid and must not be used',
1068+
],
1069+
task: 'Plan the batch command with inline JSON steps using the supported step field for positional arguments.',
1070+
outputs: [commandPattern('batch'), /--steps/i, /"positionals"\s*:/i, /"open"/i, /"wait"/i],
1071+
forbiddenOutputs: [/"args"\s*:/i, /workflow batch/i],
1072+
}),
9961073
];
9971074

9981075
const suite: TestCase[] = [...FIXTURE_SMOKE_CASES, ...SKILL_GUIDANCE_CASES];

0 commit comments

Comments
 (0)