Skip to content

Commit 3dccc3b

Browse files
cameroncookecodex
andcommitted
fix(benchmarks): Harden Claude UI review feedback
Validate activated benchmark skills before setup, preserve authoritative Claude stream results only for harness-terminated runs, and make transcript failure accounting robust for missing or duplicate Bash tool results. Co-Authored-By: OpenAI Codex <noreply@openai.com>
1 parent b8e9353 commit 3dccc3b

7 files changed

Lines changed: 247 additions & 100 deletions

File tree

src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,42 @@ describe('Claude UI benchmark analysis', () => {
359359
).toThrow('weather.yml.claude.activateSkill: requires skillDirs');
360360
});
361361

362+
it('rejects activateSkill that does not match skillDirs when loading config', () => {
363+
expect(() =>
364+
readConfig(
365+
{
366+
name: 'weather',
367+
prompt: 'prompt.md',
368+
claude: {
369+
skillDirs: ['benchmarks/claude-ui/local/skills/vendor-cli'],
370+
activateSkill: 'other-skill',
371+
isolatedWorkingDirectory: true,
372+
},
373+
},
374+
'weather.yml',
375+
),
376+
).toThrow('weather.yml.claude.activateSkill: must match a basename from skillDirs');
377+
});
378+
379+
it('rejects duplicate skillDir basenames when loading config', () => {
380+
expect(() =>
381+
readConfig(
382+
{
383+
name: 'weather',
384+
prompt: 'prompt.md',
385+
claude: {
386+
skillDirs: [
387+
'benchmarks/claude-ui/local/skills/vendor-cli',
388+
'benchmarks/claude-ui/fixtures/skills/vendor-cli',
389+
],
390+
isolatedWorkingDirectory: true,
391+
},
392+
},
393+
'weather.yml',
394+
),
395+
).toThrow("weather.yml.claude.skillDirs: duplicate basename 'vendor-cli'");
396+
});
397+
362398
it('rejects invalid session defaults when loading config', () => {
363399
expect(() =>
364400
readConfig(

src/benchmarks/claude-ui/__tests__/claude-ui-tool-config.test.ts

Lines changed: 164 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,10 @@
1-
import { spawn } from 'node:child_process';
2-
import { mkdtemp, readdir, readFile, rm, writeFile } from 'node:fs/promises';
3-
import { tmpdir } from 'node:os';
41
import path from 'node:path';
5-
import { fileURLToPath } from 'node:url';
62
import { buildClaudeArgs } from '../claude-invocation.ts';
73
import { compareBenchmark } from '../compare.ts';
84
import { readConfig } from '../config.ts';
95
import { analyzeClaudeJsonl } from '../transcript.ts';
106
import type { BenchmarkArtifacts, BenchmarkRunMetadata } from '../types.ts';
117

12-
const repoRoot = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '../../../..');
13-
148
function line(value: unknown): string {
159
return JSON.stringify(value);
1610
}
@@ -42,28 +36,6 @@ function runMetadata(wallClockSeconds: number): BenchmarkRunMetadata {
4236
};
4337
}
4438

45-
function runParserScript(args: string[]): Promise<{
46-
exitCode: number | null;
47-
stdout: string;
48-
stderr: string;
49-
}> {
50-
return new Promise((resolve, reject) => {
51-
const child = spawn('python3', args, { stdio: ['ignore', 'pipe', 'pipe'] });
52-
const stdout: Buffer[] = [];
53-
const stderr: Buffer[] = [];
54-
child.stdout.on('data', (chunk: Buffer) => stdout.push(chunk));
55-
child.stderr.on('data', (chunk: Buffer) => stderr.push(chunk));
56-
child.on('error', reject);
57-
child.on('close', (exitCode) => {
58-
resolve({
59-
exitCode,
60-
stdout: Buffer.concat(stdout).toString('utf8'),
61-
stderr: Buffer.concat(stderr).toString('utf8'),
62-
});
63-
});
64-
});
65-
}
66-
6739
describe('Claude UI benchmark tool configuration', () => {
6840
it('loads Claude invocation and tool analysis from suite config', () => {
6941
const config = readConfig(
@@ -612,6 +584,120 @@ describe('Claude UI benchmark tool configuration', () => {
612584
});
613585
});
614586

587+
it('handles tracked tool results without content', () => {
588+
const config = readConfig(
589+
{
590+
name: 'private CLI weather',
591+
prompt: 'weather.md',
592+
failurePatterns: ['WAIT_TIMEOUT'],
593+
toolAnalysis: {
594+
matchers: [
595+
{
596+
kind: 'bashCommand',
597+
commandPrefix: 'privatecli',
598+
shortName: 'privatecli.other',
599+
},
600+
],
601+
},
602+
},
603+
'private-cli.yml',
604+
);
605+
const transcript = [
606+
line({
607+
type: 'assistant',
608+
message: {
609+
content: [
610+
{
611+
type: 'tool_use',
612+
id: 'tool-1',
613+
name: 'Bash',
614+
input: { command: 'privatecli --version' },
615+
},
616+
],
617+
},
618+
}),
619+
line({
620+
type: 'user',
621+
message: {
622+
content: [{ type: 'tool_result', tool_use_id: 'tool-1', is_error: true }],
623+
},
624+
}),
625+
].join('\n');
626+
627+
const audit = analyzeClaudeJsonl(transcript, {
628+
toolAnalysis: config.toolAnalysis,
629+
failurePatterns: config.failurePatterns,
630+
});
631+
632+
expect(audit.failures).toEqual([
633+
{
634+
id: 'tool-1',
635+
fullName: 'Bash',
636+
shortName: 'privatecli.other',
637+
line: 2,
638+
message: '',
639+
},
640+
]);
641+
expect(audit.patternFailures).toEqual([]);
642+
});
643+
644+
it('counts repeated matches in one Bash failure result once', () => {
645+
const config = readConfig(
646+
{
647+
name: 'private CLI weather',
648+
prompt: 'weather.md',
649+
toolAnalysis: {
650+
matchers: [
651+
{
652+
kind: 'bashCommand',
653+
commandPrefix: 'privatecli',
654+
shortName: 'privatecli.other',
655+
},
656+
],
657+
},
658+
},
659+
'private-cli.yml',
660+
);
661+
const transcript = [
662+
line({
663+
type: 'assistant',
664+
message: {
665+
content: [
666+
{
667+
type: 'tool_use',
668+
id: 'tool-1',
669+
name: 'Bash',
670+
input: { command: 'privatecli one && privatecli two' },
671+
},
672+
],
673+
},
674+
}),
675+
line({
676+
type: 'user',
677+
message: {
678+
content: [
679+
{
680+
type: 'tool_result',
681+
tool_use_id: 'tool-1',
682+
is_error: true,
683+
content: 'Exit code 1',
684+
},
685+
],
686+
},
687+
}),
688+
].join('\n');
689+
690+
const audit = analyzeClaudeJsonl(transcript, { toolAnalysis: config.toolAnalysis });
691+
const result = compareBenchmark(config, audit, runMetadata(600));
692+
693+
expect(audit.trackedSequence.map((call) => call.shortName)).toEqual([
694+
'privatecli.other',
695+
'privatecli.other',
696+
]);
697+
expect(audit.failures).toHaveLength(1);
698+
expect(result.completion.issueCount).toBe(1);
699+
});
700+
615701
it('marks the benchmark incomplete when Claude exits non-zero', () => {
616702
const config = readConfig(
617703
{
@@ -633,53 +719,55 @@ describe('Claude UI benchmark tool configuration', () => {
633719
});
634720
});
635721

636-
it('lets the parser include configured non-MCP tool names', async () => {
637-
const dir = await mkdtemp(path.join(tmpdir(), 'claude-ui-parser-'));
638-
try {
639-
const jsonlPath = path.join(dir, 'claude.jsonl');
640-
const outputPath = path.join(dir, 'parsed');
641-
await writeFile(
642-
jsonlPath,
643-
[
644-
line({
645-
type: 'assistant',
646-
message: {
647-
content: [
648-
{
649-
type: 'tool_use',
650-
id: 'tool-1',
651-
name: 'Bash',
652-
input: { command: 'vendorcli ui screen --json' },
653-
},
654-
],
655-
},
656-
}),
657-
line({
658-
type: 'user',
659-
message: { content: [{ type: 'tool_result', tool_use_id: 'tool-1', content: 'ok' }] },
660-
}),
661-
].join('\n'),
662-
'utf8',
663-
);
664-
665-
const result = await runParserScript([
666-
path.join(repoRoot, 'benchmarks/claude-ui/parse_claude_conversation.py'),
667-
jsonlPath,
668-
outputPath,
669-
'--tool-prefix=mcp__xcodebuildmcp',
670-
'--tool-name=Bash',
671-
]);
672-
673-
expect(result.exitCode).toBe(0);
674-
expect(await readdir(outputPath)).toEqual([
675-
'0001_tool_call_Bash.md',
676-
'0002_tool_result_Bash.md',
677-
]);
678-
expect(await readFile(path.join(outputPath, '0001_tool_call_Bash.md'), 'utf8')).toContain(
679-
'vendorcli ui screen --json',
680-
);
681-
} finally {
682-
await rm(dir, { recursive: true, force: true });
683-
}
722+
it('keeps configured non-MCP tool names in transcript analysis', () => {
723+
const config = readConfig(
724+
{
725+
name: 'vendor CLI weather',
726+
prompt: 'weather.md',
727+
toolAnalysis: {
728+
matchers: [
729+
{
730+
kind: 'bashCommand',
731+
commandPrefix: 'vendorcli ui screen',
732+
shortName: 'vendorcli.screen',
733+
},
734+
],
735+
},
736+
},
737+
'vendor-cli.yml',
738+
);
739+
const transcript = [
740+
line({
741+
type: 'assistant',
742+
message: {
743+
content: [
744+
{
745+
type: 'tool_use',
746+
id: 'tool-1',
747+
name: 'Bash',
748+
input: { command: 'vendorcli ui screen --json' },
749+
},
750+
],
751+
},
752+
}),
753+
].join('\n');
754+
755+
const audit = analyzeClaudeJsonl(transcript, { toolAnalysis: config.toolAnalysis });
756+
757+
expect(
758+
audit.trackedSequence.map((call) => ({
759+
fullName: call.fullName,
760+
shortName: call.shortName,
761+
isUiAutomation: call.isUiAutomation,
762+
line: call.line,
763+
})),
764+
).toEqual([
765+
{
766+
fullName: 'Bash',
767+
shortName: 'vendorcli.screen',
768+
isUiAutomation: false,
769+
line: 1,
770+
},
771+
]);
684772
});
685773
});

src/benchmarks/claude-ui/config.ts

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import { readFile } from 'node:fs/promises';
2+
import path from 'node:path';
23
import { parse as parseYaml } from 'yaml';
34
import * as z from 'zod';
45
import { sessionDefaultsSchema } from '../../utils/session-defaults-schema.ts';
@@ -116,10 +117,27 @@ function readClaudeInvocationConfig(
116117
throw new Error(`${source}.permissionMode: expected 'default' or 'bypassPermissions'`);
117118
}
118119
const skillDirs = readOptionalStringArray(raw, 'skillDirs', source);
120+
if (skillDirs !== undefined) {
121+
const basenames = new Set<string>();
122+
for (const skillDir of skillDirs) {
123+
const basename = path.basename(skillDir);
124+
if (basenames.has(basename)) {
125+
throw new Error(`${source}.skillDirs: duplicate basename '${basename}'`);
126+
}
127+
basenames.add(basename);
128+
}
129+
}
119130
const activateSkill = readOptionalString(raw, 'activateSkill', source);
120131
if (activateSkill !== undefined && (!skillDirs || skillDirs.length === 0)) {
121132
throw new Error(`${source}.activateSkill: requires skillDirs`);
122133
}
134+
if (
135+
activateSkill !== undefined &&
136+
skillDirs !== undefined &&
137+
!skillDirs.some((skillDir) => path.basename(skillDir) === activateSkill)
138+
) {
139+
throw new Error(`${source}.activateSkill: must match a basename from skillDirs`);
140+
}
123141

124142
return {
125143
useMcpServer: readOptionalBoolean(raw, 'useMcpServer', source),

src/benchmarks/claude-ui/harness.ts

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,7 @@ function runCommand(opts: {
218218
const started = process.hrtime.bigint();
219219
let stdoutBuffer = '';
220220
let terminalResultExitCode: number | undefined;
221+
let terminalResultRequestedTermination = false;
221222
let terminalResultTimer: NodeJS.Timeout | undefined;
222223
let timeoutTimer: NodeJS.Timeout | undefined;
223224
let hardKillTimer: NodeJS.Timeout | undefined;
@@ -272,13 +273,20 @@ function runCommand(opts: {
272273
if (terminalResultExitCode !== undefined || opts.terminalJsonResultGraceMs === undefined)
273274
return;
274275
terminalResultExitCode = result.is_error === true ? 1 : 0;
275-
terminalResultTimer = setTimeout(terminateChild, opts.terminalJsonResultGraceMs);
276+
terminalResultTimer = setTimeout(() => {
277+
terminalResultRequestedTermination = true;
278+
terminateChild();
279+
}, opts.terminalJsonResultGraceMs);
276280
terminalResultTimer.unref();
277281
};
278282

279283
if (opts.timeoutMs !== undefined) {
280284
timeoutTimer = setTimeout(() => {
281-
timedOut = true;
285+
if (terminalResultExitCode === undefined) {
286+
timedOut = true;
287+
} else {
288+
terminalResultRequestedTermination = true;
289+
}
282290
terminateChild();
283291
}, opts.timeoutMs);
284292
timeoutTimer.unref();
@@ -325,12 +333,19 @@ function runCommand(opts: {
325333
clearTimeoutTimer();
326334
clearHardKillTimer();
327335
const durationSeconds = Number(process.hrtime.bigint() - started) / 1_000_000_000;
336+
const resolvedExitCode =
337+
terminalResultExitCode !== undefined &&
338+
(terminalResultRequestedTermination || exitCode === 0 || exitCode === null)
339+
? terminalResultExitCode
340+
: timedOut
341+
? 143
342+
: (exitCode ?? null);
328343
stdout.end();
329344
stderr.end();
330345
Promise.all([finished(stdout), finished(stderr)])
331346
.then(() =>
332347
resolve({
333-
exitCode: timedOut ? 143 : (exitCode ?? terminalResultExitCode ?? null),
348+
exitCode: resolvedExitCode,
334349
durationSeconds,
335350
}),
336351
)

0 commit comments

Comments
 (0)