Skip to content

Commit fe64572

Browse files
authored
feat(benchmarks): Support local Claude UI benchmark suites (#429)
1 parent 3eaed16 commit fe64572

26 files changed

Lines changed: 3215 additions & 833 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -118,3 +118,4 @@ DerivedData
118118
/repros
119119
/.xcodebuildmcp
120120
/out.nosync
121+
/benchmarks/claude-ui/local/

benchmarks/claude-ui/README.md

Lines changed: 111 additions & 51 deletions
Large diffs are not rendered by default.

benchmarks/claude-ui/parse_claude_conversation.py

Lines changed: 19 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,12 @@
11
#!/usr/bin/env python3
22
"""Parse a Claude Code NDJSON session log into per-turn files.
33
4-
Filters to: user prompts, assistant text replies, and XcodeBuildMCP tool
4+
Filters to: user prompts, assistant text replies, and configured tool
55
calls/results. Strips screenshot image blobs to a short placeholder.
66
77
Usage:
88
parse_claude_conversation.py <session.jsonl> [output_dir] \\
9-
[--tool-prefix=mcp__xcodebuildmcp]
9+
[--tool-prefix=mcp__xcodebuildmcp] [--tool-name=Bash]
1010
"""
1111

1212
from __future__ import annotations
@@ -101,10 +101,14 @@ def extract_user_text(entry: dict) -> str:
101101
return "\n\n".join(parts)
102102

103103

104-
def parse(path: Path, out_dir: Path, tool_prefix: str) -> bool:
104+
def matches_tool_name(name: str, tool_prefixes: list[str], tool_names: set[str]) -> bool:
105+
return name in tool_names or any(name.startswith(prefix) for prefix in tool_prefixes)
106+
107+
108+
def parse(path: Path, out_dir: Path, tool_prefixes: list[str], tool_names: set[str]) -> bool:
105109
out_dir.mkdir(parents=True, exist_ok=True)
106110

107-
# Track tool_use_ids that target our prefix so we keep matching results.
111+
# Track tool_use_ids that target configured tools so we keep matching results.
108112
tracked_ids: set[str] = set()
109113
tool_name_by_id: dict[str, str] = {}
110114
counter = 0
@@ -185,7 +189,7 @@ def next_path(kind: str, label: str | None = None) -> Path:
185189
)
186190
elif btype == "tool_use":
187191
name = block.get("name", "")
188-
if not name.startswith(tool_prefix):
192+
if not matches_tool_name(name, tool_prefixes, tool_names):
189193
continue
190194
tool_id = block.get("id", "")
191195
tracked_ids.add(tool_id)
@@ -220,17 +224,25 @@ def main() -> int:
220224
)
221225
ap.add_argument(
222226
"--tool-prefix",
223-
default="mcp__xcodebuildmcp",
227+
action="append",
228+
default=None,
224229
help="Only include tool calls whose name starts with this prefix",
225230
)
231+
ap.add_argument(
232+
"--tool-name",
233+
action="append",
234+
default=[],
235+
help="Also include tool calls whose name exactly matches this value",
236+
)
226237
args = ap.parse_args()
227238

228239
if not args.jsonl.is_file():
229240
print(f"error: not a file: {args.jsonl}", file=sys.stderr)
230241
return 1
231242

232243
out = args.output or args.jsonl.with_name(f"{args.jsonl.stem}_conversation")
233-
return 0 if parse(args.jsonl, out, args.tool_prefix) else 1
244+
tool_prefixes = args.tool_prefix or ["mcp__xcodebuildmcp"]
245+
return 0 if parse(args.jsonl, out, tool_prefixes, set(args.tool_name)) else 1
234246

235247

236248
if __name__ == "__main__":

benchmarks/claude-ui/suites/contacts.yml

Lines changed: 13 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -9,38 +9,38 @@ firstRunPromptDismissals:
99
- Continue
1010
- Not Now
1111
- OK
12-
timeoutSeconds: 8
12+
timeoutSeconds: 30
1313
baseline:
14-
totalToolCalls: 14
15-
mcpToolCalls: 13
16-
uiAutomationCalls: 11
17-
wallClockSeconds: 97
14+
totalToolCalls: 19
15+
trackedToolCalls: 18
16+
mcpToolCalls: 18
17+
uiAutomationCalls: 16
18+
wallClockSeconds: 102.94
1819
tools:
1920
session_show_defaults: 1
2021
launch_app_sim: 1
21-
snapshot_ui: 1
22+
snapshot_ui: 6
2223
tap: 5
2324
type_text: 5
24-
allowedVariance:
25-
totalToolCalls: 3
26-
mcpToolCalls: 3
27-
uiAutomationCalls: 3
28-
wallClockSeconds: 45
29-
toolCalls: 2
30-
expectedToolSequence:
25+
baselineToolSequence:
3126
- session_show_defaults
3227
- launch_app_sim
3328
- snapshot_ui
3429
- tap
30+
- snapshot_ui
3531
- tap
3632
- type_text
33+
- snapshot_ui
3734
- type_text
3835
- type_text
3936
- tap
37+
- snapshot_ui
4038
- type_text
4139
- tap
40+
- snapshot_ui
4241
- type_text
4342
- tap
43+
- snapshot_ui
4444
failurePatterns:
4545
- STALE_ELEMENT_REF
4646
- SNAPSHOT_MISSING

benchmarks/claude-ui/suites/reminders.yml

Lines changed: 11 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -8,31 +8,29 @@ firstRunPromptDismissals:
88
labels:
99
- Continue
1010
- Not Now
11-
timeoutSeconds: 12
11+
timeoutSeconds: 30
1212
baseline:
13-
totalToolCalls: 15
14-
mcpToolCalls: 14
15-
uiAutomationCalls: 12
16-
wallClockSeconds: 85
13+
totalToolCalls: 17
14+
trackedToolCalls: 16
15+
mcpToolCalls: 16
16+
uiAutomationCalls: 14
17+
wallClockSeconds: 92.79
1718
tools:
1819
session_show_defaults: 1
1920
launch_app_sim: 1
2021
snapshot_ui: 1
21-
tap: 4
22+
tap: 5
23+
wait_for_ui: 1
2224
type_text: 4
2325
key_press: 2
2426
batch: 1
25-
allowedVariance:
26-
totalToolCalls: 5
27-
mcpToolCalls: 5
28-
uiAutomationCalls: 5
29-
wallClockSeconds: 60
30-
toolCalls: 3
31-
expectedToolSequence:
27+
baselineToolSequence:
3228
- session_show_defaults
3329
- launch_app_sim
3430
- snapshot_ui
3531
- tap
32+
- wait_for_ui
33+
- tap
3634
- type_text
3735
- tap
3836
- tap

benchmarks/claude-ui/suites/weather.yml

Lines changed: 8 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -6,25 +6,20 @@ sessionDefaults:
66
scheme: Weather
77
simulatorName: iPhone 17 Pro Max
88
baseline:
9-
totalToolCalls: 13
10-
mcpToolCalls: 12
11-
uiAutomationCalls: 10
12-
wallClockSeconds: 98
9+
totalToolCalls: 14
10+
trackedToolCalls: 13
11+
mcpToolCalls: 13
12+
uiAutomationCalls: 11
13+
wallClockSeconds: 100.03
1314
tools:
1415
session_show_defaults: 1
1516
build_run_sim: 1
16-
snapshot_ui: 1
17+
snapshot_ui: 2
1718
tap: 6
1819
batch: 1
1920
type_text: 1
2021
swipe: 1
21-
allowedVariance:
22-
totalToolCalls: 2
23-
mcpToolCalls: 2
24-
uiAutomationCalls: 2
25-
wallClockSeconds: 45
26-
toolCalls: 2
27-
expectedToolSequence:
22+
baselineToolSequence:
2823
- session_show_defaults
2924
- build_run_sim
3025
- snapshot_ui
@@ -36,6 +31,7 @@ expectedToolSequence:
3631
- tap
3732
- tap
3833
- swipe
34+
- snapshot_ui
3935
- tap
4036
failurePatterns:
4137
- STALE_ELEMENT_REF

0 commit comments

Comments
 (0)