Skip to content

Commit d1625f5

Browse files
cameroncookecodex
andcommitted
fix(benchmarks): Harden Claude UI benchmark validation
Tighten transcript failure suppression, validate Claude timeout config, and make aggregate artifact roots path-aware. Co-Authored-By: Codex <noreply@openai.com>
1 parent 2889338 commit d1625f5

5 files changed

Lines changed: 157 additions & 4 deletions

File tree

src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import { tmpdir } from 'node:os';
33
import path from 'node:path';
44
import { fileURLToPath } from 'node:url';
55
import { compareBenchmark, diffToolSequence } from '../compare.ts';
6+
import { renderAggregate } from '../render.ts';
67
import { readConfig } from '../config.ts';
78
import {
89
listSuitePaths,
@@ -336,6 +337,21 @@ describe('Claude UI benchmark analysis', () => {
336337
);
337338
});
338339

340+
it('rejects invalid Claude timeout values when loading config', () => {
341+
for (const maxClaudeSeconds of [0, -1, Number.NaN, Number.POSITIVE_INFINITY]) {
342+
expect(() =>
343+
readConfig(
344+
{
345+
name: 'weather',
346+
prompt: 'prompt.md',
347+
claude: { maxClaudeSeconds },
348+
},
349+
'weather.yml',
350+
),
351+
).toThrow('weather.yml.claude.maxClaudeSeconds: expected finite positive number');
352+
}
353+
});
354+
339355
it('rejects malformed failure pattern regexes when loading config', () => {
340356
expect(() =>
341357
readConfig(
@@ -602,6 +618,35 @@ describe('Claude UI benchmark analysis', () => {
602618
expect(result.completed).toBe(false);
603619
});
604620

621+
it('renders path-aware aggregate artifact roots', () => {
622+
const first = compareBenchmark(
623+
{ name: 'first', prompt: 'prompt.md' },
624+
analyzeClaudeJsonl('', { mcpToolPrefix: toolPrefix }),
625+
{
626+
...runMetadata(10),
627+
artifacts: {
628+
...runMetadata(10).artifacts,
629+
runDirectory: '/tmp/run/first/20260101T000000Z',
630+
},
631+
},
632+
);
633+
const second = compareBenchmark(
634+
{ name: 'second', prompt: 'prompt.md' },
635+
analyzeClaudeJsonl('', { mcpToolPrefix: toolPrefix }),
636+
{
637+
...runMetadata(20),
638+
artifacts: {
639+
...runMetadata(20).artifacts,
640+
runDirectory: '/tmp/run-extra/second/20260101T000000Z',
641+
},
642+
},
643+
);
644+
645+
expect(renderAggregate([first, second], { color: false, cwd: '/tmp' })).toContain(
646+
'Artifacts: /tmp/',
647+
);
648+
});
649+
605650
it('returns no sequence hunks when expected and actual match', () => {
606651
expect(diffToolSequence(['a', 'b'], ['a', 'b'])).toEqual([]);
607652
});

src/benchmarks/claude-ui/__tests__/claude-ui-tool-config.test.ts

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,73 @@ describe('Claude UI benchmark tool configuration', () => {
322322
]);
323323
});
324324

325+
it('reports real failures when ignored and reportable patterns share a result', () => {
326+
const config = readConfig(
327+
{
328+
name: 'private CLI weather',
329+
prompt: 'weather.md',
330+
failurePatterns: ['WAIT_TIMEOUT'],
331+
ignoredFailurePatterns: ['element_disabled'],
332+
toolAnalysis: {
333+
matchers: [
334+
{
335+
kind: 'bashCommand',
336+
commandPrefix: 'privatecli wait',
337+
shortName: 'privatecli.wait',
338+
uiAutomation: true,
339+
},
340+
],
341+
},
342+
},
343+
'private-cli.yml',
344+
);
345+
const transcript = [
346+
line({
347+
type: 'assistant',
348+
message: {
349+
content: [
350+
{
351+
type: 'tool_use',
352+
id: 'tool-1',
353+
name: 'Bash',
354+
input: { command: 'privatecli wait element --label Weather --timeout 1' },
355+
},
356+
],
357+
},
358+
}),
359+
line({
360+
type: 'user',
361+
message: {
362+
content: [
363+
{
364+
type: 'tool_result',
365+
tool_use_id: 'tool-1',
366+
is_error: true,
367+
content: 'Exit code 1\n{"error":{"code":"element_disabled"}}\nWAIT_TIMEOUT',
368+
},
369+
],
370+
},
371+
}),
372+
].join('\n');
373+
374+
const audit = analyzeClaudeJsonl(transcript, {
375+
toolAnalysis: config.toolAnalysis,
376+
failurePatterns: config.failurePatterns,
377+
ignoredFailurePatterns: config.ignoredFailurePatterns,
378+
});
379+
const result = compareBenchmark(config, audit, runMetadata(10));
380+
381+
expect(audit.failures).toHaveLength(1);
382+
expect(audit.patternFailures).toEqual([
383+
{
384+
pattern: 'WAIT_TIMEOUT',
385+
line: 2,
386+
excerpt: 'Exit code 1\n{"error":{"code":"element_disabled"}}\nWAIT_TIMEOUT',
387+
},
388+
]);
389+
expect(result.completed).toBe(false);
390+
});
391+
325392
it('ignores configured non-terminal tool failures', () => {
326393
const config = readConfig(
327394
{

src/benchmarks/claude-ui/config.ts

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,19 @@ function readOptionalNumber(
9191
return raw;
9292
}
9393

94+
function readOptionalPositiveFiniteNumber(
95+
value: Record<string, unknown>,
96+
key: string,
97+
source: string,
98+
): number | undefined {
99+
const raw = readOptionalNumber(value, key, source);
100+
if (raw === undefined) return undefined;
101+
if (!Number.isFinite(raw) || raw <= 0) {
102+
throw new Error(`${source}.${key}: expected finite positive number`);
103+
}
104+
return raw;
105+
}
106+
94107
function readNumberMap(value: unknown, source: string): Record<string, number> | undefined {
95108
if (value === undefined) return undefined;
96109
if (!isRecord(value)) throw new Error(`${source}: expected object`);
@@ -150,7 +163,7 @@ function readClaudeInvocationConfig(
150163
skillDirs,
151164
activateSkill,
152165
isolatedWorkingDirectory: readOptionalBoolean(raw, 'isolatedWorkingDirectory', source),
153-
maxClaudeSeconds: readOptionalNumber(raw, 'maxClaudeSeconds', source),
166+
maxClaudeSeconds: readOptionalPositiveFiniteNumber(raw, 'maxClaudeSeconds', source),
154167
};
155168
}
156169

src/benchmarks/claude-ui/render.ts

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -373,12 +373,17 @@ export function renderSuiteReport(result: BenchmarkResult, options?: RenderOptio
373373
return `${sections.join('\n')}\n`;
374374
}
375375

376+
function pathContainsOrEquals(root: string, target: string): boolean {
377+
const relative = path.relative(root, target);
378+
return relative === '' || (!relative.startsWith('..') && !path.isAbsolute(relative));
379+
}
380+
376381
function commonArtifactRoot(results: readonly BenchmarkResult[]): string | undefined {
377382
if (results.length === 0) return undefined;
378383
const dirs = results.map((r) => path.dirname(r.run.artifacts.runDirectory));
379384
let root = dirs[0]!;
380385
for (const dir of dirs.slice(1)) {
381-
while (!dir.startsWith(root)) {
386+
while (!pathContainsOrEquals(root, dir)) {
382387
const next = path.dirname(root);
383388
if (next === root) return root;
384389
root = next;

src/benchmarks/claude-ui/transcript.ts

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,24 @@ function matchesAnyPattern(
148148
return matchers.some((matcher) => matcher.regex.test(text));
149149
}
150150

151+
function patternMatcherIsIgnored(
152+
matcher: { pattern: string },
153+
ignoredFailureMatchers: Array<{ pattern: string; regex: RegExp }>,
154+
): boolean {
155+
return matchesAnyPattern(matcher.pattern, ignoredFailureMatchers);
156+
}
157+
158+
function hasReportablePatternMatch(
159+
text: string,
160+
patternMatchers: Array<{ pattern: string; regex: RegExp }>,
161+
ignoredFailureMatchers: Array<{ pattern: string; regex: RegExp }>,
162+
): boolean {
163+
return patternMatchers.some(
164+
(matcher) =>
165+
matcher.regex.test(text) && !patternMatcherIsIgnored(matcher, ignoredFailureMatchers),
166+
);
167+
}
168+
151169
function appendPatternFailures(opts: {
152170
text: string;
153171
line: number;
@@ -156,8 +174,8 @@ function appendPatternFailures(opts: {
156174
ignoredFailureMatchers: Array<{ pattern: string; regex: RegExp }>;
157175
patternFailures: PatternFailureRecord[];
158176
}): void {
159-
if (matchesAnyPattern(opts.text, opts.ignoredFailureMatchers)) return;
160177
for (const matcher of opts.patternMatchers) {
178+
if (patternMatcherIsIgnored(matcher, opts.ignoredFailureMatchers)) continue;
161179
if (matcher.regex.test(opts.text)) {
162180
opts.patternFailures.push({
163181
pattern: matcher.pattern,
@@ -405,7 +423,12 @@ export function analyzeClaudeJsonl(text: string, options: AnalyzeOptions): Trans
405423
}
406424
if (!resultDidError(block, structured)) continue;
407425

408-
if (matchesAnyPattern(message, ignoredFailureMatchers)) continue;
426+
if (
427+
matchesAnyPattern(message, ignoredFailureMatchers) &&
428+
!hasReportablePatternMatch(message, patternMatchers, ignoredFailureMatchers)
429+
) {
430+
continue;
431+
}
409432

410433
for (const trackedTool of trackedTools) {
411434
const failureKey = [id, trackedTool.fullName, trackedTool.shortName, line, message].join(

0 commit comments

Comments
 (0)