Skip to content

Commit 45cfad5

Browse files
authored
feat: e2e command perf benchmark harness + nightly CI (#630)
* feat: add e2e command perf benchmark harness + nightly CI Adds scripts/perf, a cheap end-to-end perf benchmark that drives the built CLI through an ordered Settings tour of ~24 commands for N rounds, on a fully isolated daemon/state-dir and self-cleaning device, and emits JSON + Markdown reports. Per-command timing comes from wrapping each batchable command in its own single-step batch (daemon durationMs) plus wall-clock around the process. Wires a scheduled + workflow_dispatch CI job (perf-nightly.yml) that reuses the cached iOS XCUITest runner (setup-apple-replay) and the Android replay host, and runs the CLI from source via --experimental-strip-types (no dist build). * refactor(perf): drive the harness CLI via runCmdSync, not spawnSync Review (P2): repo rule is to spawn processes through src/utils/exec.ts, not node:child_process directly. Switch the perf harness's invokeCli to runCmdSync (allowFailure so non-zero exits are recorded as samples) and add a maxBuffer option to ExecOptions/runCmdSync (snapshot payloads exceed Node's ~1MB default). * perf(harness): warm the runner after open so the first measured command is clean The first interaction after open/relaunch pays the one-time iOS XCUITest runner startup (~10s+ cold) and a per-relaunch first-AX-query settle cost (~4s). That was landing on the first measured command each round (snapshot -i), inflating it ~10x vs the next snapshot. Run an untimed warmup snapshot -i after establishSession, after each round's reset-open, and after every freshRoot relaunch, so no measured command absorbs runner startup. Noted in the report header. * refactor(perf): address review + fix Fallow CI - exec.ts: extract spawnRejectionError + commandCloseFailure helpers, deduping the error/close handler clones (Fallow duplication ✗ that surfaced once the maxBuffer change pulled exec.ts into the audit scope). 
- .fallowrc: exclude scripts/perf/** (non-shipped benchmark tooling, like examples/test-app) so its naturally-moderate functions don't trip the complexity gate. - config.ts: drop unused exports CLI_BIN/DEFAULT_OUT_DIR; add readIntValue so --n/--rounds/--warmup report the actual flag + reject non-integers clearly. - harness.ts: extract toSample(); type sampleError param as CliResult. - scenario.ts: ScenarioStep is now a discriminated union on execMode (removes step.step! / step.args ?? []). - comment/legend rewords (platform defaults are local-convenience/CI-overridden; elements = node count). check:fallow now green; typecheck/lint/unit pass. * perf(harness): downgrade sample ok when a batch step reports ok:false Defensive belt-and-suspenders for the Codex review note: stop-only batch already surfaces a failed step as a top-level failure (caught by invokeCli), but if an on-error=continue mode ever keeps the batch ok while a step fails, don't silently count that step as a successful sample — derive ok from the step's own result.ok.
1 parent efc0b21 commit 45cfad5

14 files changed

Lines changed: 969 additions & 23 deletions

File tree

.fallowrc.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
],
2626
"ignorePatterns": [
2727
"examples/test-app/**",
28+
"scripts/perf/**",
2829
"ios-runner/AgentDeviceRunner/AgentDeviceRunnerUITests.xctestplan",
2930
"scripts/write-xcuitest-cache-metadata.mjs"
3031
],

.github/workflows/perf-nightly.yml

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
name: Perf Nightly
2+
3+
# End-to-end command perf benchmark (scripts/perf). Scheduled + manual only — perf timing on
4+
# shared CI runners is noisy, so treat this as a trend/regression signal, not absolute numbers.
5+
# Reuses the same build artifacts as the device suites: the cached iOS XCUITest runner
6+
# (setup-apple-replay, ios-runner-prebuilt cache) and the Android replay host, and runs the CLI
7+
# from source via --experimental-strip-types (no dist build), matching the replay workflows.
8+
9+
on:
10+
schedule:
11+
- cron: "0 4 * * *"
12+
workflow_dispatch:
13+
inputs:
14+
rounds:
15+
description: "Measured rounds per command (samples)"
16+
required: false
17+
default: "5"
18+
19+
permissions:
20+
contents: read
21+
22+
concurrency:
23+
group: ci-${{ github.workflow }}-${{ github.ref }}
24+
cancel-in-progress: true
25+
26+
env:
27+
AGENT_DEVICE_PERF_CLI: "--experimental-strip-types src/bin.ts"
28+
PERF_ROUNDS: ${{ github.event.inputs.rounds || '5' }}
29+
30+
jobs:
31+
perf-ios:
32+
name: iOS Command Perf
33+
runs-on: macos-26
34+
timeout-minutes: 80
35+
env:
36+
IOS_RUNTIME_VERSION: "26.2"
37+
AGENT_DEVICE_IOS_RUNNER_DERIVED_PATH: ${{ github.workspace }}/.tmp/ios-runner-derived
38+
steps:
39+
- name: Checkout
40+
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
41+
42+
- name: Setup toolchain
43+
uses: ./.github/actions/setup-node-pnpm
44+
45+
- name: Setup Apple replay
46+
id: apple-replay
47+
uses: ./.github/actions/setup-apple-replay
48+
with:
49+
derived-path: ${{ env.AGENT_DEVICE_IOS_RUNNER_DERIVED_PATH }}
50+
cache-key-prefix: ios-runner-prebuilt
51+
cache-key-suffix: -ios-${{ env.IOS_RUNTIME_VERSION }}
52+
build-command: sh ./scripts/build-xcuitest-apple.sh
53+
xcuitest-platform: ios
54+
xcuitest-destination: generic/platform=iOS Simulator
55+
clean-derived: "1"
56+
57+
- name: Boot iOS test simulator
58+
uses: ./.github/actions/boot-ios-test-simulator
59+
with:
60+
runtime-version: ${{ env.IOS_RUNTIME_VERSION }}
61+
preferred-device-name: iPhone 17 Pro
62+
63+
- name: Run iOS command perf benchmark
64+
run: |
65+
pnpm clean:daemon
66+
node --experimental-strip-types scripts/perf/run.ts \
67+
--platform ios \
68+
--device "iPhone 17 Pro" \
69+
--n "$PERF_ROUNDS" --warmup 1 \
70+
--out-dir "$GITHUB_WORKSPACE/perf-results"
71+
72+
- name: Upload iOS perf report
73+
if: always()
74+
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
75+
with:
76+
name: perf-ios
77+
path: perf-results/
78+
if-no-files-found: warn
79+
80+
perf-android:
81+
name: Android Command Perf
82+
runs-on: ubuntu-latest
83+
timeout-minutes: 80
84+
steps:
85+
- name: Checkout
86+
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
87+
88+
- name: Setup toolchain
89+
uses: ./.github/actions/setup-node-pnpm
90+
91+
- name: Setup Android replay host
92+
id: android-replay-host
93+
uses: ./.github/actions/setup-android-replay-host
94+
95+
- name: Package npm-bundled Android helpers
96+
run: |
97+
pnpm package:android-snapshot-helper:npm
98+
pnpm package:android-multitouch-helper:npm
99+
100+
- name: Run Android command perf benchmark
101+
uses: reactivecircus/android-emulator-runner@b530d96654c385303d652368551fb075bc2f0b6b # v2.35.0
102+
with:
103+
api-level: 36
104+
arch: x86_64
105+
profile: pixel_7
106+
target: google_apis_playstore
107+
emulator-options: -no-window -gpu swiftshader_indirect -no-snapshot -noaudio -no-boot-anim -no-metrics
108+
script: |
109+
set -e
110+
# Disable animations up front so accessibility dumps don't time out (the harness
111+
# also runs `settings animations off`, this is belt-and-suspenders).
112+
adb -s emulator-5554 shell settings put global window_animation_scale 0 || true
113+
adb -s emulator-5554 shell settings put global transition_animation_scale 0 || true
114+
adb -s emulator-5554 shell settings put global animator_duration_scale 0 || true
115+
node --experimental-strip-types scripts/perf/run.ts \
116+
--platform android \
117+
--serial emulator-5554 \
118+
--n "$PERF_ROUNDS" --warmup 1 \
119+
--out-dir "$GITHUB_WORKSPACE/perf-results"
120+
121+
- name: Upload Android perf report
122+
if: always()
123+
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
124+
with:
125+
name: perf-android
126+
path: perf-results/
127+
if-no-files-found: warn

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
node_modules/
2+
scripts/perf/.results/
23
.pnpm-store/
34
.fallow/
45
dist/

package.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,9 @@
9898
"ad": "node bin/agent-device.mjs",
9999
"size": "node scripts/size-report.mjs",
100100
"size:markdown": "node scripts/size-report.mjs --json .tmp/size-report.json --markdown .tmp/size-report.md",
101+
"perf": "node --experimental-strip-types scripts/perf/run.ts",
102+
"perf:ios": "node --experimental-strip-types scripts/perf/run.ts --platform ios",
103+
"perf:android": "node --experimental-strip-types scripts/perf/run.ts --platform android",
101104
"lint": "oxlint . --deny-warnings",
102105
"format": "oxfmt --write src test skills package.json tsconfig.json tsconfig.lib.json rslib.config.ts vitest.config.ts .github/actions/setup-node-pnpm/action.yml .oxlintrc.json .oxfmtrc.json '!test/skillgym/.skillgym-results/**'",
103106
"fallow": "fallow --summary",

scripts/perf/cli.ts

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import { performance } from 'node:perf_hooks';
2+
import { runCmdSync } from '../../src/utils/exec.ts';
3+
import { resolveCliArgv, REPO_ROOT } from './config.ts';
4+
import type { BatchStepSpec } from './scenario.ts';
5+
import type { CliResult } from './types.ts';
6+
7+
/** Output cap (64 * 1024 * 1024) passed to `runCmdSync` as `maxBuffer` for every CLI call. */
const MAX_BUFFER = 64 * 1024 ** 2;
/** Argv prefix used to launch the CLI, resolved once when this module is loaded. */
const CLI_ARGV = resolveCliArgv();
9+
10+
/**
 * Best-effort JSON extraction from CLI stdout.
 *
 * The trimmed output is parsed as-is first. If that fails, one more attempt is made on the
 * span from the first `{` to the last `}` (some commands print extra text around the JSON).
 *
 * @param stdout Raw standard output captured from the CLI.
 * @returns The parsed value, or `undefined` when stdout is blank or nothing parses.
 */
function tryParseJson(stdout: string): unknown {
  const text = stdout.trim();
  if (text.length === 0) return undefined;

  // Candidates in priority order: the whole output, then the outermost `{ … }` span.
  const candidates = [text];
  const open = text.indexOf('{');
  const close = text.lastIndexOf('}');
  if (open >= 0 && close > open) candidates.push(text.slice(open, close + 1));

  for (const candidate of candidates) {
    try {
      return JSON.parse(candidate);
    } catch {
      // Not valid JSON; move on to the next candidate, if any.
    }
  }
  return undefined;
}
29+
30+
/**
 * Reports whether a parsed CLI payload should count as successful.
 *
 * Only an object that explicitly carries `ok: false` is treated as a failure; anything else
 * (including `undefined`, `null`, primitives, and objects without an `ok` field) passes.
 */
function jsonOk(json: unknown): boolean {
  if (json === null || typeof json !== 'object') return true;
  return (json as { ok?: unknown }).ok !== false;
}
33+
34+
/**
 * Invoke the CLI once, synchronously, and capture the outcome.
 *
 * The child is launched with `process.execPath` from `REPO_ROOT`, using the argv
 * `CLI_ARGV + args + baseFlags + --json`.
 *
 * @param args Command name plus its positionals and dash-flags.
 * @param baseFlags Isolation + device flags shared by every call.
 * @returns Exit code, wall-clock time around the `runCmdSync` call, raw stdout/stderr, the
 *   parsed JSON (if any), and `ok` — true only when the exit code is 0 and the JSON payload
 *   does not carry `ok: false`. If `runCmdSync` throws, `exitCode` stays at -1 and `stderr`
 *   holds the error message.
 */
export function invokeCli(args: string[], baseFlags: string[]): CliResult {
  const argv = CLI_ARGV.concat(args, baseFlags, ['--json']);
  const captured = { stdout: '', stderr: '', exitCode: -1 };

  const startedAt = performance.now();
  try {
    // allowFailure: non-zero exits become samples rather than exceptions.
    // maxBuffer: raised because snapshot payloads exceed Node's ~1MB default.
    const run = runCmdSync(process.execPath, argv, {
      cwd: REPO_ROOT,
      maxBuffer: MAX_BUFFER,
      allowFailure: true,
    });
    captured.stdout = run.stdout;
    captured.stderr = run.stderr;
    captured.exitCode = run.exitCode;
  } catch (error) {
    // Spawn-level failures (missing executable, timeout) — record as a failed sample.
    captured.stderr = error instanceof Error ? error.message : String(error);
  }
  const wallClockMs = performance.now() - startedAt;

  const json = tryParseJson(captured.stdout);
  const succeeded = captured.exitCode === 0 && jsonOk(json);
  return {
    exitCode: captured.exitCode,
    wallClockMs,
    stdout: captured.stdout,
    stderr: captured.stderr,
    json,
    ok: succeeded,
  };
}
61+
62+
/**
 * Run one command wrapped in its own single-step `batch` invocation, so the per-step
 * `durationMs` can be read from the batch result.
 *
 * Defensive downgrade: today's stop-only batch surfaces a failed step as a top-level
 * non-zero/ok:false (already caught by `invokeCli`). Should a future on-error mode keep the
 * batch ok while the step itself fails, the sample is still reported as not ok.
 *
 * @param spec The single batch step to execute.
 * @param baseFlags Isolation + device flags shared by every call.
 */
export function invokeBatchStep(spec: BatchStepSpec, baseFlags: string[]): CliResult {
  const steps = JSON.stringify([spec]);
  const outcome = invokeCli(['batch', '--steps', steps], baseFlags);
  const stepFailed = firstBatchResult(outcome.json)?.ok === false;
  return outcome.ok && stepFailed ? { ...outcome, ok: false } : outcome;
}
74+
75+
/**
 * Pull the first entry of `data.results` out of a parsed batch payload.
 *
 * @returns That entry when it is a non-null object; otherwise `undefined` (missing payload,
 *   missing `data`/`results`, empty results, or a non-object first entry).
 */
function firstBatchResult(json: unknown): Record<string, unknown> | undefined {
  const envelope = json as { data?: { results?: unknown[] } } | undefined;
  const head = envelope?.data?.results?.[0];
  if (typeof head !== 'object' || head === null) return undefined;
  return head as Record<string, unknown>;
}
80+
81+
/**
 * Read `durationMs` from the first batch step result of a CLI invocation.
 *
 * @returns The duration when it is a number; `undefined` when the step result or the field
 *   is missing or not numeric.
 */
export function readBatchStepDurationMs(result: CliResult): number | undefined {
  const duration = firstBatchResult(result.json)?.durationMs;
  if (typeof duration !== 'number') return undefined;
  return duration;
}
85+
86+
/**
 * Extract the top-level `error` object's `code` and `message` from a CLI result's JSON.
 *
 * @returns An object with both keys always present; each is `undefined` when the payload,
 *   its `error`, or the individual field is absent.
 */
export function readBatchStepError(result: CliResult): { code?: string; message?: string } {
  const { json } = result;
  const failure = (json as { error?: { code?: string; message?: string } } | undefined)?.error;
  const code = failure?.code;
  const message = failure?.message;
  return { code, message };
}
90+
91+
/**
 * Proxy for a11y-tree size, taken from the first batch step's `data`.
 *
 * Uses the length of `data.nodes` when it is an array; otherwise falls back to the number of
 * distinct `@eN` refs found in the serialized step data.
 *
 * @param result A CLI result produced by a single-step batch invocation.
 * @returns The element count (0 when neither nodes nor refs are present), or `undefined`
 *   when the step has no object-valued `data` (missing, `null`, or a primitive).
 */
export function countElements(result: CliResult): number | undefined {
  const stepData = firstBatchResult(result.json)?.data;
  // `typeof null === 'object'`, so null has to be ruled out explicitly; otherwise the
  // property read below would throw on a step whose `data` is null.
  if (stepData === null || typeof stepData !== 'object') return undefined;
  const { nodes } = stepData as { nodes?: unknown };
  if (Array.isArray(nodes)) return nodes.length;
  const refs = JSON.stringify(stepData).match(/@e\d+/g);
  return refs ? new Set(refs).size : 0;
}

scripts/perf/config.ts

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
import path from 'node:path';
2+
import { fileURLToPath } from 'node:url';
3+
import type { Platform } from './types.ts';
4+
5+
// Absolute directory containing this module.
const HERE = path.dirname(fileURLToPath(import.meta.url));
/** Repository root: two directory levels above this module's directory. */
export const REPO_ROOT = path.resolve(HERE, '../..');
// Default CLI entry point: bin/agent-device.mjs under the repository root.
const CLI_BIN = path.resolve(REPO_ROOT, 'bin', 'agent-device.mjs');
// Default report directory: `.results` next to this module.
const DEFAULT_OUT_DIR = path.resolve(HERE, '.results');
9+
10+
/** Benchmark options as produced by `parseConfig`. */
export type PerfConfig = {
  /** Target platform. */
  platform: Platform;
  /** Measured rounds (samples per command). */
  rounds: number;
  /** Leading rounds dropped from stats. */
  warmup: number;
  /** Keep temp state dir + leave device booted. */
  keepArtifacts: boolean;
  /** Directory the reports are written to. */
  outDir: string;
  /** iOS device override (UDID). */
  udid?: string;
  /** Device override by name (e.g. "iPhone 17 Pro"); preferred over udid. */
  device?: string;
  /** Android device override. */
  serial?: string;
};
20+
21+
/**
 * Decide how the CLI is invoked.
 *
 * Defaults to the built dist binary (bin/agent-device.mjs). Set AGENT_DEVICE_PERF_CLI to run
 * from source instead, e.g. on CI:
 *   AGENT_DEVICE_PERF_CLI="--experimental-strip-types src/bin.ts"
 * (matches the device workflows, which run from source and skip the dist build).
 *
 * @returns The argv prefix: the override split on whitespace, or `[CLI_BIN]` when the
 *   variable is unset or blank.
 */
export function resolveCliArgv(): string[] {
  const fromEnv = (process.env.AGENT_DEVICE_PERF_CLI ?? '').trim();
  return fromEnv === '' ? [CLI_BIN] : fromEnv.split(/\s+/);
}
30+
31+
/** True when AGENT_DEVICE_PERF_CLI is set to a non-blank value, i.e. a CLI override is active. */
export function usesSourceCli(): boolean {
  const raw = process.env.AGENT_DEVICE_PERF_CLI;
  return raw !== undefined && raw.trim().length > 0;
}
34+
35+
/**
 * Return the argument that follows the flag at index `i`.
 *
 * @param argv Full argument list being parsed.
 * @param i Index of the flag itself; the value is read from `i + 1`.
 * @param flag Flag name, used only in the error message.
 * @throws Error when there is no argument after the flag.
 */
function readValue(argv: string[], i: number, flag: string): string {
  const value = argv[i + 1];
  if (value === undefined) throw new Error(`Missing value for ${flag}`);
  return value;
}

/**
 * Read the value following the flag at index `i` and parse it as an integer >= `min`.
 *
 * @param argv Full argument list being parsed.
 * @param i Index of the flag itself; the value is read from `i + 1`.
 * @param flag Flag name, used in error messages.
 * @param min Smallest accepted value (inclusive).
 * @throws Error when the value is missing, blank, not a safe integer, or below `min`.
 */
function readIntValue(argv: string[], i: number, flag: string, min: number): number {
  const raw = readValue(argv, i, flag);
  // `Number('')` and `Number('   ')` evaluate to 0, so a blank value (e.g. an empty env var
  // expanded into `--warmup ""`) must be rejected before conversion rather than read as 0.
  const n = raw.trim() === '' ? Number.NaN : Number(raw);
  // isSafeInteger also rejects magnitudes such as 1e21 that cannot be represented exactly.
  if (!Number.isSafeInteger(n) || n < min) {
    throw new Error(`${flag} must be an integer >= ${min} (got ${JSON.stringify(raw)})`);
  }
  return n;
}
49+
50+
/**
 * Build a `PerfConfig` from command-line arguments.
 *
 * Starts from the defaults (platform `ios`, 5 rounds, 1 warmup round, artifacts not kept,
 * output in `DEFAULT_OUT_DIR`) and applies each recognized flag in order, so a later
 * occurrence of a flag overrides an earlier one.
 *
 * @param argv Arguments to parse (flags only, no executable/script entries).
 * @throws Error on an unknown flag, an unknown platform, a missing flag value, or an
 *   invalid integer for `--n` / `--rounds` / `--warmup`.
 */
export function parseConfig(argv: string[]): PerfConfig {
  const config: PerfConfig = {
    platform: 'ios',
    rounds: 5,
    warmup: 1,
    keepArtifacts: false,
    outDir: DEFAULT_OUT_DIR,
  };

  let cursor = 0;
  while (cursor < argv.length) {
    const flag = argv[cursor];

    // The only boolean switch: consumes a single argument.
    if (flag === '--keep-artifacts') {
      config.keepArtifacts = true;
      cursor += 1;
      continue;
    }

    // Every remaining flag takes exactly one value.
    if (flag === '--platform') {
      const platform = readValue(argv, cursor, flag);
      if (platform !== 'ios' && platform !== 'android') {
        throw new Error(`Unknown platform: ${platform}`);
      }
      config.platform = platform;
    } else if (flag === '--n' || flag === '--rounds') {
      config.rounds = readIntValue(argv, cursor, flag, 1);
    } else if (flag === '--warmup') {
      config.warmup = readIntValue(argv, cursor, flag, 0);
    } else if (flag === '--out-dir') {
      config.outDir = path.resolve(readValue(argv, cursor, flag));
    } else if (flag === '--udid') {
      config.udid = readValue(argv, cursor, flag);
    } else if (flag === '--device') {
      config.device = readValue(argv, cursor, flag);
    } else if (flag === '--serial') {
      config.serial = readValue(argv, cursor, flag);
    } else {
      throw new Error(`Unknown flag: ${flag}`);
    }
    cursor += 2;
  }
  return config;
}

0 commit comments

Comments
 (0)