Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .fallowrc.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
],
"ignorePatterns": [
"examples/test-app/**",
"scripts/perf/**",
"ios-runner/AgentDeviceRunner/AgentDeviceRunnerUITests.xctestplan",
"scripts/write-xcuitest-cache-metadata.mjs"
],
Expand Down
127 changes: 127 additions & 0 deletions .github/workflows/perf-nightly.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
name: Perf Nightly

# End-to-end command perf benchmark (scripts/perf). Scheduled + manual only — perf timing on
# shared CI runners is noisy, so treat this as a trend/regression signal, not absolute numbers.
# Reuses the same build artifacts as the device suites: the cached iOS XCUITest runner
# (setup-apple-replay, ios-runner-prebuilt cache) and the Android replay host, and runs the CLI
# from source via --experimental-strip-types (no dist build), matching the replay workflows.

on:
schedule:
- cron: "0 4 * * *"
workflow_dispatch:
inputs:
rounds:
description: "Measured rounds per command (samples)"
required: false
default: "5"

permissions:
contents: read

concurrency:
group: ci-${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

env:
AGENT_DEVICE_PERF_CLI: "--experimental-strip-types src/bin.ts"
PERF_ROUNDS: ${{ github.event.inputs.rounds || '5' }}

jobs:
perf-ios:
name: iOS Command Perf
runs-on: macos-26
timeout-minutes: 80
env:
IOS_RUNTIME_VERSION: "26.2"
AGENT_DEVICE_IOS_RUNNER_DERIVED_PATH: ${{ github.workspace }}/.tmp/ios-runner-derived
steps:
- name: Checkout
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

- name: Setup toolchain
uses: ./.github/actions/setup-node-pnpm

- name: Setup Apple replay
id: apple-replay
uses: ./.github/actions/setup-apple-replay
with:
derived-path: ${{ env.AGENT_DEVICE_IOS_RUNNER_DERIVED_PATH }}
cache-key-prefix: ios-runner-prebuilt
cache-key-suffix: -ios-${{ env.IOS_RUNTIME_VERSION }}
build-command: sh ./scripts/build-xcuitest-apple.sh
xcuitest-platform: ios
xcuitest-destination: generic/platform=iOS Simulator
clean-derived: "1"

- name: Boot iOS test simulator
uses: ./.github/actions/boot-ios-test-simulator
with:
runtime-version: ${{ env.IOS_RUNTIME_VERSION }}
preferred-device-name: iPhone 17 Pro

- name: Run iOS command perf benchmark
run: |
pnpm clean:daemon
node --experimental-strip-types scripts/perf/run.ts \
--platform ios \
--device "iPhone 17 Pro" \
--n "$PERF_ROUNDS" --warmup 1 \
--out-dir "$GITHUB_WORKSPACE/perf-results"

- name: Upload iOS perf report
if: always()
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: perf-ios
path: perf-results/
if-no-files-found: warn

perf-android:
name: Android Command Perf
runs-on: ubuntu-latest
timeout-minutes: 80
steps:
- name: Checkout
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

- name: Setup toolchain
uses: ./.github/actions/setup-node-pnpm

- name: Setup Android replay host
id: android-replay-host
uses: ./.github/actions/setup-android-replay-host

- name: Package npm-bundled Android helpers
run: |
pnpm package:android-snapshot-helper:npm
pnpm package:android-multitouch-helper:npm

- name: Run Android command perf benchmark
uses: reactivecircus/android-emulator-runner@b530d96654c385303d652368551fb075bc2f0b6b # v2.35.0
with:
api-level: 36
arch: x86_64
profile: pixel_7
target: google_apis_playstore
emulator-options: -no-window -gpu swiftshader_indirect -no-snapshot -noaudio -no-boot-anim -no-metrics
script: |
set -e
# Disable animations up front so accessibility dumps don't time out (the harness
# also runs `settings animations off`, this is belt-and-suspenders).
adb -s emulator-5554 shell settings put global window_animation_scale 0 || true
adb -s emulator-5554 shell settings put global transition_animation_scale 0 || true
adb -s emulator-5554 shell settings put global animator_duration_scale 0 || true
node --experimental-strip-types scripts/perf/run.ts \
--platform android \
--serial emulator-5554 \
--n "$PERF_ROUNDS" --warmup 1 \
--out-dir "$GITHUB_WORKSPACE/perf-results"

- name: Upload Android perf report
if: always()
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: perf-android
path: perf-results/
if-no-files-found: warn
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
node_modules/
scripts/perf/.results/
.pnpm-store/
.fallow/
dist/
Expand Down
3 changes: 3 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,9 @@
"ad": "node bin/agent-device.mjs",
"size": "node scripts/size-report.mjs",
"size:markdown": "node scripts/size-report.mjs --json .tmp/size-report.json --markdown .tmp/size-report.md",
"perf": "node --experimental-strip-types scripts/perf/run.ts",
"perf:ios": "node --experimental-strip-types scripts/perf/run.ts --platform ios",
"perf:android": "node --experimental-strip-types scripts/perf/run.ts --platform android",
"lint": "oxlint . --deny-warnings",
"format": "oxfmt --write src test skills package.json tsconfig.json tsconfig.lib.json rslib.config.ts vitest.config.ts .github/actions/setup-node-pnpm/action.yml .oxlintrc.json .oxfmtrc.json '!test/skillgym/.skillgym-results/**'",
"fallow": "fallow --summary",
Expand Down
99 changes: 99 additions & 0 deletions scripts/perf/cli.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import { performance } from 'node:perf_hooks';
import { runCmdSync } from '../../src/utils/exec.ts';
import { resolveCliArgv, REPO_ROOT } from './config.ts';
import type { BatchStepSpec } from './scenario.ts';
import type { CliResult } from './types.ts';

// 64 MiB cap on captured CLI output; passed as `maxBuffer` to runCmdSync in invokeCli.
const MAX_BUFFER = 64 * 1024 * 1024;
// Leading argv (after the node executable) used to launch the CLI; resolved once at module load.
const CLI_ARGV = resolveCliArgv();

// Best-effort JSON extraction from CLI stdout. Returns the parsed value, or undefined when
// stdout is blank or nothing parseable is found.
function tryParseJson(stdout: string): unknown {
  const text = stdout.trim();
  if (text.length === 0) return undefined;

  try {
    return JSON.parse(text);
  } catch {
    // Not pure JSON; fall through and retry on the outermost {...} span.
  }

  // Some commands print extra lines around the JSON, so retry on the text between the first
  // '{' and the last '}'.
  const open = text.indexOf('{');
  const close = text.lastIndexOf('}');
  if (open < 0 || close <= open) return undefined;

  try {
    return JSON.parse(text.slice(open, close + 1));
  } catch {
    return undefined;
  }
}

// A payload counts as OK unless it is a non-null object whose `ok` field is exactly `false`.
// Missing or unparseable JSON (undefined), null and primitives are all treated as OK.
function jsonOk(json: unknown): boolean {
  if (json === null || typeof json !== 'object') return true;
  const { ok } = json as { ok?: unknown };
  return ok !== false;
}

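// Invoke the CLI once, using the argv prefix from resolveCliArgv() (the built binary by default,
// or the source entry point when AGENT_DEVICE_PERF_CLI is set). `args` includes the command +
// positionals + dash-flags; `baseFlags` carries the isolation + device flags shared by every call.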
export function invokeCli(args: string[], baseFlags: string[]): CliResult {
const full = [...CLI_ARGV, ...args, ...baseFlags, '--json'];
const t0 = performance.now();
let stdout = '';
let stderr = '';
let exitCode = -1;
try {
// allowFailure so non-zero exits are recorded as samples instead of thrown; maxBuffer
// raised because snapshot payloads exceed Node's ~1MB default.
const r = runCmdSync(process.execPath, full, {
cwd: REPO_ROOT,
maxBuffer: MAX_BUFFER,
allowFailure: true,
});
stdout = r.stdout;
stderr = r.stderr;
exitCode = r.exitCode;
} catch (error) {
// Spawn-level failures (missing executable, timeout) — record as a failed sample.
stderr = error instanceof Error ? error.message : String(error);
}
const wallClockMs = performance.now() - t0;
const json = tryParseJson(stdout);
return { exitCode, wallClockMs, stdout, stderr, json, ok: exitCode === 0 && jsonOk(json) };
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Honor failed batch step results

When a single-step batch --json invocation reports a nested failure as data.results[0].ok === false while the top-level response remains successful, this line still marks the sample as successful because jsonOk only checks top-level ok. In the nightly perf harness, any failed selector/screenshot/logs batch step will be included in medians with no failure note, corrupting the benchmark report; the batch path should inspect the first result's ok and nested error as well.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed defensively. With today's stop-only batch this can't occur — a failed step yields a top-level ok:false/non-zero exit, already caught by invokeCli. But invokeBatchStep now also downgrades the sample's ok from the step's own result.ok, so a future on-error=continue mode can't silently count a failed step as a successful sample.

}

// Runs one command as a single-step `batch` invocation so the per-step durationMs can be read
// from the response.
export function invokeBatchStep(spec: BatchStepSpec, baseFlags: string[]): CliResult {
  const steps = JSON.stringify([spec]);
  const outcome = invokeCli(['batch', '--steps', steps], baseFlags);
  // Already failed at the top level (non-zero exit or ok:false): nothing further to downgrade.
  if (!outcome.ok) return outcome;
  // Defensive: today's stop-only batch reports a failed step as a top-level failure, which
  // invokeCli already catches. If a future on-error mode keeps the batch ok while a step fails,
  // the step's own `ok: false` must still mark the sample as failed.
  const step = firstBatchResult(outcome.json);
  return step?.ok === false ? { ...outcome, ok: false } : outcome;
}

// Returns the first entry of `data.results` in a batch `--json` payload, or undefined when the
// payload has no such entry or the entry is not a non-null object.
function firstBatchResult(json: unknown): Record<string, unknown> | undefined {
  const data = (json as { data?: { results?: unknown[] } } | undefined)?.data;
  const first = data?.results?.[0];
  return first && typeof first === 'object' ? (first as Record<string, unknown>) : undefined;
}

// Reads the first batch step's `durationMs`; undefined when it is absent or not a number.
export function readBatchStepDurationMs(result: CliResult): number | undefined {
  const v = firstBatchResult(result.json)?.durationMs;
  return typeof v === 'number' ? v : undefined;
}

// Reads the error code/message for a batch sample. The top-level `error` wins when present.
// Otherwise falls back to the first step's own `error`, so a sample that invokeBatchStep
// downgraded because the step reported `ok: false` still carries a failure reason.
// NOTE(review): the step-level `error` shape is assumed to mirror the top-level
// `{ code, message }`; it is read defensively and non-string fields are dropped. Confirm against
// the batch response schema.
export function readBatchStepError(result: CliResult): { code?: string; message?: string } {
  const topLevel = (result.json as { error?: { code?: string; message?: string } } | undefined)
    ?.error;
  if (topLevel) return { code: topLevel.code, message: topLevel.message };

  const stepError = firstBatchResult(result.json)?.error;
  if (stepError !== null && typeof stepError === 'object') {
    const { code, message } = stepError as { code?: unknown; message?: unknown };
    return {
      code: typeof code === 'string' ? code : undefined,
      message: typeof message === 'string' ? message : undefined,
    };
  }
  return { code: undefined, message: undefined };
}

// Proxy for a11y-tree size: snapshot node count (falls back to distinct @eN refs).
// Returns undefined when the first step has no object `data`.
export function countElements(result: CliResult): number | undefined {
  const stepData = firstBatchResult(result.json)?.data;
  // `typeof null` is 'object', so null must be excluded explicitly or the `.nodes` read throws.
  if (stepData === null || typeof stepData !== 'object') return undefined;
  const nodes = (stepData as { nodes?: unknown }).nodes;
  if (Array.isArray(nodes)) return nodes.length;
  const matches = JSON.stringify(stepData).match(/@e\d+/g);
  return matches ? new Set(matches).size : 0;
}
94 changes: 94 additions & 0 deletions scripts/perf/config.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import type { Platform } from './types.ts';

// Directory containing this module.
const HERE = path.dirname(fileURLToPath(import.meta.url));
// Repository root: two directory levels above this module's directory.
export const REPO_ROOT = path.resolve(HERE, '..', '..');
// Default CLI entry point, used by resolveCliArgv when AGENT_DEVICE_PERF_CLI is not set.
const CLI_BIN = path.join(REPO_ROOT, 'bin', 'agent-device.mjs');
// Default `outDir` for parseConfig: a `.results` directory next to this module.
const DEFAULT_OUT_DIR = path.join(HERE, '.results');

// Options for one perf run. parseConfig builds this from command-line flags, starting from its
// built-in defaults; the device override fields stay undefined unless their flag is passed.
export type PerfConfig = {
  platform: Platform;
  rounds: number; // measured rounds (samples per command)
  warmup: number; // leading rounds dropped from stats
  keepArtifacts: boolean; // keep temp state dir + leave device booted
  outDir: string;
  udid?: string; // iOS device override (UDID)
  device?: string; // device override by name (e.g. "iPhone 17 Pro"); preferred over udid
  serial?: string; // Android device override
};

// Argv prefix used to launch the CLI. Without an override this is the built dist binary
// (bin/agent-device.mjs). Setting AGENT_DEVICE_PERF_CLI replaces it with the variable's
// whitespace-separated tokens, e.g. on CI to run from source:
//   AGENT_DEVICE_PERF_CLI="--experimental-strip-types src/bin.ts"
// (matches the device workflows, which run from source and skip the dist build).
export function resolveCliArgv(): string[] {
  const raw = process.env.AGENT_DEVICE_PERF_CLI;
  const trimmed = raw === undefined ? '' : raw.trim();
  return trimmed.length > 0 ? trimmed.split(/\s+/) : [CLI_BIN];
}

// True when AGENT_DEVICE_PERF_CLI is set to a non-blank value, i.e. when resolveCliArgv returns
// the override instead of the default binary.
export function usesSourceCli(): boolean {
  const override = process.env.AGENT_DEVICE_PERF_CLI;
  return override !== undefined && override.trim() !== '';
}

// Returns the argument following the flag at index `i`. Throws when the flag is the last
// element of `argv` and therefore has no value.
function readValue(argv: string[], i: number, flag: string): string {
  const v = argv[i + 1];
  if (v === undefined) throw new Error(`Missing value for ${flag}`);
  return v;
}

// Reads the value following the flag at index `i` and parses it as an integer >= `min`.
// Throws when the value is missing, blank, not an integer, or below `min`.
function readIntValue(argv: string[], i: number, flag: string, min: number): number {
  const raw = readValue(argv, i, flag);
  // Number('') and Number('   ') are 0, so a blank value must be rejected explicitly; otherwise
  // it would silently pass as 0 whenever min <= 0.
  const n = raw.trim() === '' ? Number.NaN : Number(raw);
  if (!Number.isInteger(n) || n < min) {
    throw new Error(`${flag} must be an integer >= ${min} (got ${JSON.stringify(raw)})`);
  }
  return n;
}

// Builds a PerfConfig from command-line arguments. Starts from the defaults below and applies
// each flag in order, so a repeated flag's last occurrence wins. Throws on an unknown flag, a
// flag missing its value, an unknown platform, or an integer that fails readIntValue's check.
export function parseConfig(argv: string[]): PerfConfig {
  const config: PerfConfig = {
    platform: 'ios',
    rounds: 5,
    warmup: 1,
    keepArtifacts: false,
    outDir: DEFAULT_OUT_DIR,
  };

  let index = 0;
  while (index < argv.length) {
    const flag = argv[index];

    if (flag === '--keep-artifacts') {
      // The only flag that takes no value.
      config.keepArtifacts = true;
      index += 1;
      continue;
    }

    if (flag === '--platform') {
      const platform = readValue(argv, index, flag);
      if (platform !== 'ios' && platform !== 'android') {
        throw new Error(`Unknown platform: ${platform}`);
      }
      config.platform = platform;
    } else if (flag === '--n' || flag === '--rounds') {
      config.rounds = readIntValue(argv, index, flag, 1);
    } else if (flag === '--warmup') {
      config.warmup = readIntValue(argv, index, flag, 0);
    } else if (flag === '--out-dir') {
      config.outDir = path.resolve(readValue(argv, index, flag));
    } else if (flag === '--udid') {
      config.udid = readValue(argv, index, flag);
    } else if (flag === '--device') {
      config.device = readValue(argv, index, flag);
    } else if (flag === '--serial') {
      config.serial = readValue(argv, index, flag);
    } else {
      throw new Error(`Unknown flag: ${flag}`);
    }

    // Every flag handled above consumed one value, so step past both the flag and its value.
    index += 2;
  }

  return config;
}
Loading
Loading