callstack · thymikee · May 31, 2026 · May 30, 2026 · May 31, 2026 · May 31, 2026
diff --git a/.fallowrc.json b/.fallowrc.json
@@ -25,6 +25,7 @@
   ],
   "ignorePatterns": [
     "examples/test-app/**",
+    "scripts/perf/**",
     "ios-runner/AgentDeviceRunner/AgentDeviceRunnerUITests.xctestplan",
     "scripts/write-xcuitest-cache-metadata.mjs"
   ],

diff --git a/.github/workflows/perf-nightly.yml b/.github/workflows/perf-nightly.yml
@@ -0,0 +1,127 @@
+name: Perf Nightly
+
+# End-to-end command perf benchmark (scripts/perf). Scheduled + manual only — perf timing on
+# shared CI runners is noisy, so treat this as a trend/regression signal, not absolute numbers.
+# Reuses the same build artifacts as the device suites: the cached iOS XCUITest runner
+# (setup-apple-replay, ios-runner-prebuilt cache) and the Android replay host, and runs the CLI
+# from source via --experimental-strip-types (no dist build), matching the replay workflows.
+
+on:
+  schedule:
+    - cron: "0 4 * * *"
+  workflow_dispatch:
+    inputs:
+      rounds:
+        description: "Measured rounds per command (samples)"
+        required: false
+        default: "5"
+
+permissions:
+  contents: read
+
+concurrency:
+  group: ci-${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+env:
+  AGENT_DEVICE_PERF_CLI: "--experimental-strip-types src/bin.ts"
+  PERF_ROUNDS: ${{ github.event.inputs.rounds || '5' }}
+
+jobs:
+  perf-ios:
+    name: iOS Command Perf
+    runs-on: macos-26
+    timeout-minutes: 80
+    env:
+      IOS_RUNTIME_VERSION: "26.2"
+      AGENT_DEVICE_IOS_RUNNER_DERIVED_PATH: ${{ github.workspace }}/.tmp/ios-runner-derived
+    steps:
+      - name: Checkout
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      - name: Setup toolchain
+        uses: ./.github/actions/setup-node-pnpm
+
+      - name: Setup Apple replay
+        id: apple-replay
+        uses: ./.github/actions/setup-apple-replay
+        with:
+          derived-path: ${{ env.AGENT_DEVICE_IOS_RUNNER_DERIVED_PATH }}
+          cache-key-prefix: ios-runner-prebuilt
+          cache-key-suffix: -ios-${{ env.IOS_RUNTIME_VERSION }}
+          build-command: sh ./scripts/build-xcuitest-apple.sh
+          xcuitest-platform: ios
+          xcuitest-destination: generic/platform=iOS Simulator
+          clean-derived: "1"
+
+      - name: Boot iOS test simulator
+        uses: ./.github/actions/boot-ios-test-simulator
+        with:
+          runtime-version: ${{ env.IOS_RUNTIME_VERSION }}
+          preferred-device-name: iPhone 17 Pro
+
+      - name: Run iOS command perf benchmark
+        run: |
+          pnpm clean:daemon
+          node --experimental-strip-types scripts/perf/run.ts \
+            --platform ios \
+            --device "iPhone 17 Pro" \
+            --n "$PERF_ROUNDS" --warmup 1 \
+            --out-dir "$GITHUB_WORKSPACE/perf-results"
+
+      - name: Upload iOS perf report
+        if: always()
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          name: perf-ios
+          path: perf-results/
+          if-no-files-found: warn
+
+  perf-android:
+    name: Android Command Perf
+    runs-on: ubuntu-latest
+    timeout-minutes: 80
+    steps:
+      - name: Checkout
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      - name: Setup toolchain
+        uses: ./.github/actions/setup-node-pnpm
+
+      - name: Setup Android replay host
+        id: android-replay-host
+        uses: ./.github/actions/setup-android-replay-host
+
+      - name: Package npm-bundled Android helpers
+        run: |
+          pnpm package:android-snapshot-helper:npm
+          pnpm package:android-multitouch-helper:npm
+
+      - name: Run Android command perf benchmark
+        uses: reactivecircus/android-emulator-runner@b530d96654c385303d652368551fb075bc2f0b6b # v2.35.0
+        with:
+          api-level: 36
+          arch: x86_64
+          profile: pixel_7
+          target: google_apis_playstore
+          emulator-options: -no-window -gpu swiftshader_indirect -no-snapshot -noaudio -no-boot-anim -no-metrics
+          script: |
+            # NOTE: android-emulator-runner splits this script on newlines and runs each line
+            # in its own `sh -c`. A standalone `set -e` therefore has no effect on later lines,
+            # and a backslash continuation would break one command into several separate
+            # invocations. Keep every command on a single line; a line that exits non-zero
+            # already fails the step.
+            # Disable animations up front so accessibility dumps don't time out (the harness
+            # also runs `settings animations off`, this is belt-and-suspenders).
+            adb -s emulator-5554 shell settings put global window_animation_scale 0 || true
+            adb -s emulator-5554 shell settings put global transition_animation_scale 0 || true
+            adb -s emulator-5554 shell settings put global animator_duration_scale 0 || true
+            node --experimental-strip-types scripts/perf/run.ts --platform android --serial emulator-5554 --n "$PERF_ROUNDS" --warmup 1 --out-dir "$GITHUB_WORKSPACE/perf-results"
+
+      - name: Upload Android perf report
+        if: always()
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          name: perf-android
+          path: perf-results/
+          if-no-files-found: warn
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 node_modules/
+scripts/perf/.results/
 .pnpm-store/
 .fallow/
 dist/

diff --git a/package.json b/package.json
@@ -98,6 +98,9 @@
     "ad": "node bin/agent-device.mjs",
     "size": "node scripts/size-report.mjs",
     "size:markdown": "node scripts/size-report.mjs --json .tmp/size-report.json --markdown .tmp/size-report.md",
+    "perf": "node --experimental-strip-types scripts/perf/run.ts",
+    "perf:ios": "node --experimental-strip-types scripts/perf/run.ts --platform ios",
+    "perf:android": "node --experimental-strip-types scripts/perf/run.ts --platform android",
     "lint": "oxlint . --deny-warnings",
     "format": "oxfmt --write src test skills package.json tsconfig.json tsconfig.lib.json rslib.config.ts vitest.config.ts .github/actions/setup-node-pnpm/action.yml .oxlintrc.json .oxfmtrc.json '!test/skillgym/.skillgym-results/**'",
     "fallow": "fallow --summary",

diff --git a/scripts/perf/cli.ts b/scripts/perf/cli.ts
@@ -0,0 +1,99 @@
+import { performance } from 'node:perf_hooks';
+import { runCmdSync } from '../../src/utils/exec.ts';
+import { resolveCliArgv, REPO_ROOT } from './config.ts';
+import type { BatchStepSpec } from './scenario.ts';
+import type { CliResult } from './types.ts';
+
+const MAX_BUFFER = 64 * 1024 * 1024; // 64 MiB; passed to runCmdSync as maxBuffer
+const CLI_ARGV = resolveCliArgv(); // argv prefix that selects the CLI entry point; resolved once at module load
+
+function tryParseJson(stdout: string): unknown { // best-effort parse: returns the parsed value, or undefined if nothing parses
+  const trimmed = stdout.trim();
+  if (!trimmed) return undefined; // empty or whitespace-only output
+  try {
+    return JSON.parse(trimmed);
+  } catch {
+    // Some commands print extra text around the JSON; retry on the span from the first '{' to the last '}'.
+    const start = trimmed.indexOf('{');
+    const end = trimmed.lastIndexOf('}');
+    if (start >= 0 && end > start) {
+      try {
+        return JSON.parse(trimmed.slice(start, end + 1));
+      } catch {
+        return undefined;
+      }
+    }
+    return undefined;
+  }
+}
+
+function jsonOk(json: unknown): boolean { // false only when json is a non-null object whose `ok` is exactly false
+  return !(json !== null && typeof json === 'object' && (json as { ok?: unknown }).ok === false);
+}
+
+// Invoke the CLI once through process.execPath, appending `--json`. `args` includes the command + positionals + dash-flags;
+// `baseFlags` carries the isolation + device flags shared by every call. Errors thrown by the spawn are caught and returned as a failed result.
+export function invokeCli(args: string[], baseFlags: string[]): CliResult {
+  const full = [...CLI_ARGV, ...args, ...baseFlags, '--json'];
+  const t0 = performance.now();
+  let stdout = '';
+  let stderr = '';
+  let exitCode = -1; // stays -1 when runCmdSync throws
+  try {
+    // allowFailure so non-zero exits are recorded as samples instead of thrown; maxBuffer
+    // raised because snapshot payloads exceed Node's ~1MB default.
+    const r = runCmdSync(process.execPath, full, {
+      cwd: REPO_ROOT,
+      maxBuffer: MAX_BUFFER,
+      allowFailure: true,
+    });
+    stdout = r.stdout;
+    stderr = r.stderr;
+    exitCode = r.exitCode;
+  } catch (error) {
+    // Spawn-level failures (missing executable, timeout) — record as a failed sample.
+    stderr = error instanceof Error ? error.message : String(error);
+  }
+  const wallClockMs = performance.now() - t0; // wall-clock time of the whole child invocation
+  const json = tryParseJson(stdout);
+  return { exitCode, wallClockMs, stdout, stderr, json, ok: exitCode === 0 && jsonOk(json) }; // ok needs exit code 0 AND no explicit ok:false in the payload
+}
+
+// Wrap a single command in its own one-element `batch` invocation so its per-step durationMs can be read from the result.
+export function invokeBatchStep(spec: BatchStepSpec, baseFlags: string[]): CliResult {
+  const result = invokeCli(['batch', '--steps', JSON.stringify([spec])], baseFlags);
+  // Defensive: today's stop-only batch surfaces a failed step as a top-level non-zero/ok:false
+  // (already caught by invokeCli). But if a future on-error mode keeps the batch ok while a step
+  // fails, don't silently count that step as a success — downgrade ok from the step's own ok.
+  const stepOk = firstBatchResult(result.json)?.ok;
+  if (result.ok && stepOk === false) {
+    return { ...result, ok: false }; // copy with ok downgraded; every other field is preserved
+  }
+  return result;
+}
+
+function firstBatchResult(json: unknown): Record<string, unknown> | undefined { // json.data.results[0] when it is a non-null object, else undefined
+  const data = (json as { data?: { results?: unknown[] } } | undefined)?.data;
+  const first = data?.results?.[0];
+  return first && typeof first === 'object' ? (first as Record<string, unknown>) : undefined;
+}
+
+export function readBatchStepDurationMs(result: CliResult): number | undefined { // first batch step's durationMs when it is a number, else undefined
+  const v = firstBatchResult(result.json)?.durationMs;
+  return typeof v === 'number' ? v : undefined;
+}
+
+export function readBatchStepError(result: CliResult): { code?: string; message?: string } { // reads the top-level `error` of the response, not the per-step result
+  const err = (result.json as { error?: { code?: string; message?: string } } | undefined)?.error;
+  return { code: err?.code, message: err?.message }; // both fields are undefined when no error object is present
+}
+
+// Proxy for a11y-tree size: snapshot node count (falls back to distinct @eN refs). Undefined when the first step has no object payload.
+export function countElements(result: CliResult): number | undefined {
+  const stepData = firstBatchResult(result.json)?.data;
+  if (stepData === null || typeof stepData !== 'object') return undefined; // typeof null is 'object', so null must be rejected explicitly
+  const nodes = (stepData as { nodes?: unknown }).nodes;
+  if (Array.isArray(nodes)) return nodes.length;
+  const matches = JSON.stringify(stepData).match(/@e\d+/g); // no `nodes` array: count unique @eN refs anywhere in the payload
+  return matches ? new Set(matches).size : 0;
+}
diff --git a/scripts/perf/config.ts b/scripts/perf/config.ts
@@ -0,0 +1,94 @@
+import path from 'node:path';
+import { fileURLToPath } from 'node:url';
+import type { Platform } from './types.ts';
+
+const HERE = path.dirname(fileURLToPath(import.meta.url)); // directory containing this module
+export const REPO_ROOT = path.resolve(HERE, '..', '..'); // two directory levels above this module
+const CLI_BIN = path.join(REPO_ROOT, 'bin', 'agent-device.mjs'); // default CLI entry point used by resolveCliArgv
+const DEFAULT_OUT_DIR = path.join(HERE, '.results'); // default outDir when --out-dir is not given
+
+export type PerfConfig = {
+  platform: Platform; // target platform; parseConfig defaults to 'ios'
+  rounds: number; // measured rounds (samples per command)
+  warmup: number; // leading rounds dropped from stats
+  keepArtifacts: boolean; // keep temp state dir + leave device booted
+  outDir: string; // output directory; --out-dir values are resolved to an absolute path
+  udid?: string; // iOS device override (UDID)
+  device?: string; // device override by name (e.g. "iPhone 17 Pro"); preferred over udid
+  serial?: string; // Android device override
+};
+
+// How to invoke the CLI: returns the argv prefix placed before the command. Defaults to the built dist binary (bin/agent-device.mjs).
+// Set AGENT_DEVICE_PERF_CLI to run from source instead, e.g. on CI:
+//   AGENT_DEVICE_PERF_CLI="--experimental-strip-types src/bin.ts"
+// (matches the device workflows, which run from source and skip the dist build).
+export function resolveCliArgv(): string[] {
+  const override = process.env.AGENT_DEVICE_PERF_CLI?.trim();
+  if (override) return override.split(/\s+/); // plain whitespace split: no quoting, so paths containing spaces are not supported
+  return [CLI_BIN];
+}
+
+export function usesSourceCli(): boolean { // true when AGENT_DEVICE_PERF_CLI is set to a non-blank value
+  return Boolean(process.env.AGENT_DEVICE_PERF_CLI?.trim());
+}
+
+function readValue(argv: string[], i: number, flag: string): string { // returns argv[i + 1]; `flag` is only used in the error message
+  const v = argv[i + 1];
+  if (v === undefined) throw new Error(`Missing value for ${flag}`); // the flag was the last token
+  return v;
+}
+
+function readIntValue(argv: string[], i: number, flag: string, min: number): number { // value after `flag` as a decimal integer >= min; throws otherwise
+  const raw = readValue(argv, i, flag);
+  const n = /^-?\d+$/.test(raw) ? Number(raw) : Number.NaN; // Number('') and Number(' ') are 0, so require explicit digits before converting
+  if (!Number.isInteger(n) || n < min) {
+    throw new Error(`${flag} must be an integer >= ${min} (got ${JSON.stringify(raw)})`);
+  }
+  return n;
+}
+
+export function parseConfig(argv: string[]): PerfConfig { // parses harness flags into a PerfConfig; throws on unknown flags or invalid values
+  const cfg: PerfConfig = {
+    platform: 'ios',
+    rounds: 5,
+    warmup: 1,
+    keepArtifacts: false,
+    outDir: DEFAULT_OUT_DIR,
+  };
+  for (let i = 0; i < argv.length; i++) {
+    const a = argv[i];
+    switch (a) {
+      case '--platform': {
+        const v = readValue(argv, i++, a); // i++ passes the flag's index, then steps past the consumed value
+        if (v !== 'ios' && v !== 'android') throw new Error(`Unknown platform: ${v}`);
+        cfg.platform = v;
+        break;
+      }
+      case '--n': // alias of --rounds
+      case '--rounds':
+        cfg.rounds = readIntValue(argv, i++, a, 1);
+        break;
+      case '--warmup':
+        cfg.warmup = readIntValue(argv, i++, a, 0);
+        break;
+      case '--keep-artifacts': // boolean switch: takes no value
+        cfg.keepArtifacts = true;
+        break;
+      case '--out-dir':
+        cfg.outDir = path.resolve(readValue(argv, i++, a)); // resolved against the current working directory
+        break;
+      case '--udid':
+        cfg.udid = readValue(argv, i++, a);
+        break;
+      case '--device':
+        cfg.device = readValue(argv, i++, a);
+        break;
+      case '--serial':
+        cfg.serial = readValue(argv, i++, a);
+        break;
+      default:
+        throw new Error(`Unknown flag: ${a}`);
+    }
+  }
+  return cfg;
+}