test: cover device workflow edge cases

thymikee · thymikee · commit 55f805a0999e · 2026-04-27T17:33:04.000-04:00
diff --git a/src/__tests__/cli-help.test.ts b/src/__tests__/cli-help.test.ts
@@ -61,6 +61,23 @@ test('help workflow prints agent workflow topic and skips daemon dispatch', asyn
   assert.match(result.stdout, /Do not use CSS selectors/);
 });
 
+test('help workflow preserves known device workaround guidance', async () => {
+  const result = await runCliCapture(['help', 'workflow']);
+  assert.equal(result.code, 0);
+  assert.equal(result.calls.length, 0); // presumably recorded daemon calls; help must not dispatch any
+  assert.match(result.stdout, /disabled\/hittable:false/);
+  assert.match(result.stdout, /snapshot -i -c --json/);
+  assert.match(result.stdout, /@Label_Name/);
+  assert.match(result.stdout, /press @e12/);
+  assert.match(result.stdout, /Snapshot legend:/);
+  assert.match(result.stdout, /preview="Leave at side\.\.\." truncated/);
+  assert.match(result.stdout, /wait text/);
+  assert.match(result.stdout, /Batch step keys are command, positionals, flags/); // allowed-key sentence
+  assert.match(result.stdout, /Never use args, step/); // forbidden-key sentence (also covers "Never use args")
+  assert.match(result.stdout, /scrollintoview/);
+  assert.match(result.stdout, /--delay-ms/);
+});
+
 test('help unknown command prints error plus global usage and skips daemon dispatch', async () => {
   const result = await runCliCapture(['help', 'not-a-command']);
   assert.equal(result.code, 1);
diff --git a/src/utils/__tests__/args.test.ts b/src/utils/__tests__/args.test.ts
@@ -789,7 +789,7 @@ test('usage includes agent workflows, config, environment, and examples footers'
   assert.match(usageText, /Default loop: devices\/apps -> open -> snapshot -i/);
   assert.match(usageText, /Use selectors or refs as positional targets/);
   assert.match(usageText, /Plain snapshot reads state; snapshot -i is required/);
-  assert.match(usageText, /Truncated text\/input preview: expand first with snapshot -s @ref/);
+  assert.match(usageText, /Truncated text\/input preview: expand first with snapshot -s @e12/);
   assert.match(usageText, /RN warning\/error overlays can block taps: snapshot -i/);
   assert.match(usageText, /Expo Go\/dev clients need their provided exp:\/\//);
   assert.match(usageText, /fill 'id="field-email"' "qa@example\.com" replaces/);
@@ -834,6 +834,8 @@ test('usageForCommand resolves workflow help topic', () => {
   assert.match(help, /agent-device help workflow/);
   assert.match(help, /Use selectors as positional targets/);
   assert.match(help, /Do not use CSS selectors/);
+  assert.match(help, /Snapshot legend:/);
+  assert.match(help, /@e12 \[button\] label="Add to cart"/);
   assert.match(help, /Truncated text\/input previews: do not use get text first/);
   assert.match(help, /snapshot -s @e7/);
   assert.match(help, /Read-only visible\/state question: use snapshot\/get\/is\/find/);
@@ -860,7 +862,7 @@ test('workflow help keeps common copyable command forms', () => {
   assert.match(help, /connect --remote-config/);
   assert.match(help, /metro reload/);
   assert.match(help, /screenshot --overlay-refs/);
-  assert.match(help, /snapshot -s @ref/);
+  assert.match(help, /snapshot -s @e7/);
 });
 
 test('usageForCommand resolves remote help topic', () => {
diff --git a/src/utils/command-schema.ts b/src/utils/command-schema.ts
@@ -175,14 +175,16 @@ const AGENT_WORKFLOWS = [
 
 const AGENT_QUICKSTART_LINES = [
   'Default loop: devices/apps -> open -> snapshot -i -> press/fill/get/is/wait/find -> verify -> close.',
-  'Use selectors or refs as positional targets: id="submit", label="Allow", or @ref after snapshot -i.',
+  'Use selectors or refs as positional targets: id="submit", label="Allow", or @e12 from snapshot -i.',
   'Plain snapshot reads state; snapshot -i is required to refresh interactive refs.',
   'Read-only visible/state question: use snapshot/get/is/find; use snapshot -i only when refs are needed.',
-  'Truncated text/input preview: expand first with snapshot -s @ref, not get text.',
+  'Truncated text/input preview: expand first with snapshot -s @e12, not get text.',
   'RN warning/error overlays can block taps: snapshot -i, dismiss/close, then diff snapshot -i.',
   'Expo Go/dev clients need their provided exp:// or dev-client URL; do not invent app ids.',
   'Text: fill \'id="field-email"\' "qa@example.com" replaces; type appends after press.',
   'After mutation: diff snapshot -i. Off-screen hints: scroll, then snapshot -i.',
+  'Raw coordinates are fallback-only: use snapshot -i -c --json rects when iOS refs no-op or child refs are missing.',
+  'Batch JSON steps use "command", "positionals", "flags"; never "args" or "step".',
   'Navigation: app-owned back uses back; system back uses back --system.',
   'Verification commands must name the expected text/selector; bare screenshots/snapshots are not enough.',
   'Debug evidence: logs clear/mark/path; trace start ./path; trace stop ./path; network dump --include headers.',
@@ -235,9 +237,8 @@ Command shape:
   Put subcommand first, then positionals, then flags:
     agent-device open com.example.app --session checkout --platform android --relaunch
     agent-device record start ./checkout.mp4 --session checkout
-  Unknown current ref placeholder: @ref. Use provided labels/ids/selectors when known. Never invent @e#.
-  After snapshot -i, use @ref in plans when the exact @e number is unknown.
-  If a task explicitly says to act by @ref, output press @ref or click @ref after refreshing refs.
+  Snapshot refs look like @e12. After snapshot -i, use the exact @eN ref from that output.
+  If the exact ref is not known yet, first output snapshot -i, then use a concrete example shape like press @e12 in the next command; do not write @<ref>, @ref, @Label_Name, or @eN placeholders.
   Close means agent-device close. App-owned back means back; system back means back --system.
   Taps are press or click. Gestures are direct commands: swipe, longpress, pinch.
 
@@ -249,14 +250,21 @@ Bootstrap:
   agent-device install com.example.app ./dist/app.apk --platform android
   agent-device reinstall com.example.app ./build/MyApp.app --platform ios
   agent-device install-from-source --github-actions-artifact org/repo:app-debug --platform android
-  If app id is unknown, plan devices, apps, then open <discovered-app-id>. Install arguments are app/package id then artifact path. Fresh install state: open with --relaunch.
+  agent-device open com.example.app --platform android --relaunch
+  If app id is unknown, plan devices, apps, then open <discovered-app-id>. Install arguments are app/package id then artifact path. After install, install-from-source, or reinstall, open the installed id with --relaunch for fresh runtime state.
   Do not open artifact paths or invent package ids. If apps lookup misses the target and no URL/artifact is provided, ask or stop.
 
 Snapshots and refs:
   snapshot reads visible state. snapshot -i gets current interactive refs.
+  Snapshot legend:
+    @e12 [button] label="Add to cart" id="add-cart" enabled hittable -> press @e12 or press 'id="add-cart"'.
+    @e13 [textinput] label="Notes" preview="Leave at side..." truncated -> snapshot -s @e13 before reading.
+    [off-screen below] 4 items: "Privacy", "About" -> scroll down, then snapshot -i; those are hints, not refs.
   Re-snapshot after navigation, submit, modal/list/reload/dynamic changes.
   Off-screen summaries are scroll hints; use scroll, not swipe, then snapshot -i.
+  Missing target in a long list: use a short manual scroll + snapshot loop with a max attempt count; do not rely on unbounded scrollintoview.
   Truncated text/input previews: do not use get text first; expand with snapshot -s @ref (for example snapshot -s @e7), then read the scoped output.
+  Rare iOS accessibility gaps: if a row ref is shown disabled/hittable:false and pressing that ref reports success but produces no UI change, or a horizontal tab/filter bar is collapsed into one composite/seekbar with no child refs, run agent-device snapshot -i -c --json to read rects, compute the target center, press x y, then diff snapshot -i. Coordinates are fallback-only; document why you used them.
 
 Selectors:
   Use selectors as positional targets: id="field-email" or label="Allow".
@@ -273,20 +281,24 @@ Text entry:
     agent-device press 'id="product-note"'
     agent-device type "Handle with care" --delay-ms 80
   Debounced field with no result selector: agent-device wait 1000. Keyboard read-only: keyboard status/get. Blocked control: keyboard dismiss.
+  Search-as-you-type fields on iOS can drop characters when driven too fast; use --delay-ms on fill/type before trying clipboard paste.
 
 Read-only and waits:
   Read-only visible/state question: use snapshot/get/is/find.
   agent-device snapshot
   agent-device get text 'id="product-title"'
   agent-device get attrs @e4
   agent-device is visible 'label="Online"'
-  agent-device wait visible 'label="Refreshing metrics..."' 3000
+  agent-device wait text "Refreshing metrics..." 3000
+  agent-device wait 'label="Ready"' 3000
   agent-device find "Increment" press --json
+  For async/list text presence, prefer wait text over is visible when no interaction is needed.
   Use snapshot -i only when refs are needed for an action or targeted query.
   Ambiguous find: add --first or --last. If info is not visible/exposed, report that gap instead of typing/searching/navigating to reveal it.
 
 Navigation and gestures:
   Use scroll for lists; swipe for coordinate gestures/carousels.
+  If app-owned back is ambiguous or has just misrouted, prefer a visible nav/back button ref, tab-bar ref, or deep link over repeated back/system back.
   Keep count/pause/pattern on one swipe; flags are --count, --pause-ms, --pattern ping-pong.
   longpress duration and pinch scale/center are positional:
     agent-device longpress 300 500 800
@@ -302,6 +314,9 @@ Validation and evidence:
   Startup/CPU/memory: perf --json or metrics. Replay maintenance: replay -u ./flow.ad.
   Recording: record start/stop. Tracing: trace start ./trace.log, trace stop ./trace.log. Paths are positional.
   Stable known flow: batch ./steps.json, not workflow batch.
+  Inline batch JSON example:
+    agent-device batch --steps '[{"command":"open","positionals":["settings"],"flags":{}},{"command":"wait","positionals":["100"],"flags":{}}]'
+  Batch step keys are command, positionals, flags, and optional runtime. Never use args, step, text, or target as batch step fields.
   Android animations: settings animations off/on, not animations disable/restore.
   Network headers: network dump --include headers.
   Remote config: connect --remote-config ./remote-config.json, open, snapshot, disconnect.
diff --git a/test/skillgym/README.md b/test/skillgym/README.md
@@ -16,7 +16,7 @@ The included suite focuses on the first two layers so it stays stable and CI-saf
 
 - `../../examples/test-app/`: minimal Expo SDK 55 fixture app for broad UI coverage
 - `skillgym.config.ts`: starter config that runs Codex and Claude Haiku against this repo
-- `suites/agent-device-smoke-suite.ts`: 48-case suite for skill routing, fixture-aware planning, and skill-guidance regressions
+- `suites/agent-device-smoke-suite.ts`: 64-case suite for skill routing, fixture-aware planning, and skill-guidance regressions
 
 ## Current coverage
 
@@ -35,12 +35,12 @@ Fixture smoke cases cover concrete app surfaces:
 Skill-guidance regression cases cover distinct command-planning habits:
 
 - read-only inspection versus mutation
-- fresh `@ref` targeting, durable selectors, and off-screen scroll recovery
+- fresh `@ref` targeting, durable selectors, raw-rect fallbacks, and off-screen scroll recovery
 - text replacement, append semantics, keyboard status, and keyboard dismiss
-- install/open setup, app discovery, session scoping, and in-app back navigation
+- install/open setup, app discovery, session scoping, and app-owned navigation fallbacks
 - Metro reload, logs, network dump, alert fallback, and screenshot evidence
 - performance metrics, React DevTools profiling, gestures, settings, and trace capture
-- remote config, macOS menu bar surfaces, replay update, and batch during recording
+- remote config, macOS menu bar surfaces, replay update, and batch schema/recording
 
 `assertAgentDeviceEvidence` is intentionally soft when a runner does not expose skill-detection telemetry. When telemetry exists, the suite asserts that `agent-device` was loaded; when it is absent, the cases still judge command-planning output instead of failing on missing runner metadata.
 
diff --git a/test/skillgym/skillgym.config.ts b/test/skillgym/skillgym.config.ts
@@ -13,7 +13,7 @@ const config: SkillGymConfig = {
     cwd: '../..',
     outputDir: './.skillgym-results',
     reporter: 'standard',
-    schedule: 'parallel',
+    schedule: 'isolated-by-runner',
   },
   defaults: {
     timeoutMs: 600_000,
diff --git a/test/skillgym/suites/agent-device-smoke-suite.ts b/test/skillgym/suites/agent-device-smoke-suite.ts
@@ -114,6 +114,7 @@ function assertExpectedOutput(report: SessionReport, matchers: Array<string | Re
 const RAW_COORDINATE_TARGET =
   /(?:^|\n)(?:agent-device\s+)?(?:click|fill|press)\s+-?\d+(?:\.\d+)?\s+-?\d+(?:\.\d+)?/i;
 const PSEUDO_ASSERTION_COMMAND = /(?:^|\n)\s*(?:assert|assertVisible|waitFor|waitForText)\b/i;
+const COMPACT_RECT_SNAPSHOT = /snapshot\b(?=[^\n]*(?:-c\b|--compact\b))(?=[^\n]*(?:--json|--raw))/i; // "snapshot" followed on the same line by -c/--compact and by --json/--raw, in either order
 
 function makeCase(options: {
   id: string;
@@ -419,17 +420,34 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [
     contract: [
       'App name: Agent Device Tester',
       'Current screen: Home tab',
-      'Control label: Lab online',
-      'The current @ref is unknown until a fresh interactive snapshot is captured',
+      'The target control has no stable selector in the task context',
+      'Fresh interactive snapshot will expose the target as @e12',
     ],
-    task: 'Plan the commands to capture fresh interactive refs, press the Lab online control by @ref, then verify the nearby change with diff snapshot -i.',
+    task: 'Plan the commands to capture fresh interactive refs, press the target control with its @e12 ref, then verify the nearby change with diff snapshot -i.',
     outputs: [
       /snapshot -i/i,
-      /(?:^|\n)(?:agent-device\s+)?(?:click|press)\s+@(?:e\d+|ref)\b/i,
+      /(?:^|\n)(?:agent-device\s+)?(?:click|press)\s+@e12\b/i,
       /(?:diff snapshot -i|snapshot\b.*(?:-i\b.*--diff|--diff\b.*-i\b))/i,
     ],
     forbiddenOutputs: [RAW_COORDINATE_TARGET, /\btestID=/i],
   }),
+  makeCase({
+    id: 'ios-disabled-row-raw-rect-fallback', // the ref press already no-oped; the plan must fall back to coordinates
+    contract: [
+      'Platform: iOS simulator',
+      'Current screen: Orders list',
+      'Fresh interactive snapshot shows @e12 [disabled] hittable:false label "Order #1042"',
+      'press @e12 already returned success, but diff snapshot showed no navigation',
+      'Compact raw JSON rect center for @e12 is x=196 y=318',
+    ],
+    task: 'Plan the fallback commands to inspect raw compact snapshot rects, press the row center, then verify the nearby change.',
+    outputs: [
+      COMPACT_RECT_SNAPSHOT,
+      /(?:^|\n)(?:agent-device\s+)?(?:press|click)\s+196\s+318\b/i, // tap at the rect center stated in the contract
+      /(?:diff snapshot|snapshot\b.*--diff)/i,
+    ],
+    forbiddenOutputs: [/(?:^|\n)(?:agent-device\s+)?(?:press|click)\s+@e12\b/i, /scrollintoview/i], // no retry of the failing ref press
+  }),
   makeCase({
     id: 'truncated-text-input-scope-ref',
     contract: [
@@ -505,6 +523,39 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [
       /(?:^|\n)(?:agent-device\s+)?(?:click|press)\s+@(?:e\d+|ref)/i,
     ],
   }),
+  makeCase({
+    id: 'ios-composite-horizontal-tabs-coordinate-fallback', // child tabs have no refs; expect rect inspection, tap, verify
+    contract: [
+      'Platform: iOS simulator',
+      'Current screen: Catalog filters',
+      'Horizontal filter tabs are collapsed into one [seekbar] in snapshot -i',
+      'The individual Bakery tab has no @ref or selector on iOS',
+      'Compact raw JSON plus visual inspection gives Bakery center x=84 y=220',
+    ],
+    task: 'Plan commands to handle the missing child refs by inspecting raw compact rects, tapping the Bakery center, and verifying the selected filter changed.',
+    outputs: [
+      COMPACT_RECT_SNAPSHOT,
+      /(?:^|\n)(?:agent-device\s+)?(?:press|click)\s+84\s+220\b/i,
+      /(?:press|click)\s+84\s+220\b[\s\S]*?\n\s*(?:agent-device\s+)?(?:diff\s+snapshot|snapshot|is|wait|get|find)\b/i, // a verification command must come after the tap; a bare "Bakery" mention or the pre-tap snapshot no longer satisfies this
+    ],
+    forbiddenOutputs: [
+      /(?:^|\n)(?:agent-device\s+)?(?:press|click)\s+@(?:e\d+|ref)\b/i,
+      /scrollintoview/i,
+    ],
+  }),
+  makeCase({
+    id: 'list-text-presence-prefers-wait-text', // read-only presence check for asynchronously loaded list text
+    contract: [
+      'App name: Agent Device Tester',
+      'Current screen: Catalog tab',
+      'List content loads asynchronously',
+      'Expected list item text: Trip ideas',
+      'No interaction is needed; this is a presence check for visible text in a list',
+    ],
+    task: 'Plan the robust read-only command to wait for the Trip ideas list text to appear.',
+    outputs: [commandPattern('wait'), /Trip ideas/i],
+    forbiddenOutputs: [commandPattern('is visible'), /snapshot -i/i, commandPattern('press')], // plan must not use is visible, snapshot -i, or press
+  }),
   makeCase({
     id: 'navigation-back-in-app',
     contract: [
@@ -516,6 +567,19 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [
     outputs: [commandPattern('back')],
     forbiddenOutputs: [/back\s+--system/i],
   }),
+  makeCase({
+    id: 'navigation-back-ambiguous-use-visible-nav', // back already misrouted; expect a press/click on the visible nav control
+    contract: [
+      'App name: Agent Device Tester',
+      'Current screen: nested product detail',
+      'Previous back command navigated to an unexpected screen',
+      'Visible app nav button selector: id="nav-back"',
+      'Goal: return one level inside the app, not trigger system back',
+    ],
+    task: 'Plan the next navigation command using the visible app-owned control instead of retrying back.',
+    outputs: [/nav-back/i, commandAlternativesPattern(['press', 'click'])],
+    forbiddenOutputs: [commandPattern('back'), /back\s+--system/i], // NOTE(review): confirm commandPattern('back') cannot match the "nav-back" selector text required above
+  }),
   makeCase({
     id: 'setup-unknown-app-discover-first',
     contract: [
@@ -930,9 +994,9 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [
     ],
     task: 'Plan a self-contained remote script that opens the app, captures a snapshot, and disconnects using the remote config on every command.',
     outputs: [
-      /open\b[^\n]*--remote-config\s+\.\/remote-config\.json/i,
-      /snapshot\b[^\n]*--remote-config\s+\.\/remote-config\.json/i,
-      /disconnect\b[^\n]*--remote-config\s+\.\/remote-config\.json/i,
+      /(?:--remote-config\s+\.\/remote-config\.json[^\n]*open|open\b[^\n]*--remote-config\s+\.\/remote-config\.json)/i,
+      /(?:--remote-config\s+\.\/remote-config\.json[^\n]*snapshot|snapshot\b[^\n]*--remote-config\s+\.\/remote-config\.json)/i,
+      /(?:--remote-config\s+\.\/remote-config\.json[^\n]*disconnect|disconnect\b[^\n]*--remote-config\s+\.\/remote-config\.json)/i,
     ],
     forbiddenOutputs: [/--daemon-base-url/i, /--tenant/i, /--run-id/i],
   }),
@@ -993,6 +1057,19 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [
     ],
     forbiddenOutputs: [PSEUDO_ASSERTION_COMMAND, /workflow batch/i, commandPattern('trace')],
   }),
+  makeCase({
+    id: 'batch-inline-step-schema-positionals', // inline batch JSON must use "positionals", never "args"
+    contract: [
+      'Need one inline batch command',
+      'Step 1: open settings',
+      'Step 2: wait 100 ms',
+      'Batch step schema supports command, positionals, flags, and runtime',
+      'The args field is invalid and must not be used',
+    ],
+    task: 'Plan the batch command with inline JSON steps using the supported step field for positional arguments.',
+    outputs: [commandPattern('batch'), /--steps/i, /"positionals"\s*:/i, /"open"/i, /"wait"/i], // "open"/"wait" must appear as quoted JSON strings
+    forbiddenOutputs: [/"args"\s*:/i, /workflow batch/i], // "args" used as a JSON key is rejected
+  }),
 ];
 
 const suite: TestCase[] = [...FIXTURE_SMOKE_CASES, ...SKILL_GUIDANCE_CASES];