fix(benchmarks): Harden Claude UI benchmark validation

cameroncooke · codex · cameroncooke · commit d1625f502ea0 · 2026-05-26T20:59:41.000+01:00
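Tighten transcript failure suppression so an ignored pattern no longer
hides a reportable failure pattern in the same tool result, reject
Claude timeout values (maxClaudeSeconds) that are not finite positive
numbers, and compute the aggregate artifact root by path containment
rather than by string prefix.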

Co-Authored-By: Codex <noreply@openai.com>
diff --git a/src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts b/src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts
@@ -3,6 +3,7 @@ import { tmpdir } from 'node:os';
 import path from 'node:path';
 import { fileURLToPath } from 'node:url';
 import { compareBenchmark, diffToolSequence } from '../compare.ts';
+import { renderAggregate } from '../render.ts';
 import { readConfig } from '../config.ts';
 import {
   listSuitePaths,
@@ -336,6 +337,21 @@ describe('Claude UI benchmark analysis', () => {
     );
   });
 
+  it('rejects invalid Claude timeout values when loading config', () => {
+    for (const maxClaudeSeconds of [0, -1, Number.NaN, Number.POSITIVE_INFINITY]) {
+      expect(() =>
+        readConfig(
+          {
+            name: 'weather',
+            prompt: 'prompt.md',
+            claude: { maxClaudeSeconds },
+          },
+          'weather.yml',
+        ),
+      ).toThrow('weather.yml.claude.maxClaudeSeconds: expected finite positive number');
+    }
+  });
+
   it('rejects malformed failure pattern regexes when loading config', () => {
     expect(() =>
       readConfig(
@@ -602,6 +618,35 @@ describe('Claude UI benchmark analysis', () => {
     expect(result.completed).toBe(false);
   });
 
+  it('renders path-aware aggregate artifact roots', () => {
+    const first = compareBenchmark(
+      { name: 'first', prompt: 'prompt.md' },
+      analyzeClaudeJsonl('', { mcpToolPrefix: toolPrefix }),
+      {
+        ...runMetadata(10),
+        artifacts: {
+          ...runMetadata(10).artifacts,
+          runDirectory: '/tmp/run/first/20260101T000000Z',
+        },
+      },
+    );
+    const second = compareBenchmark(
+      { name: 'second', prompt: 'prompt.md' },
+      analyzeClaudeJsonl('', { mcpToolPrefix: toolPrefix }),
+      {
+        ...runMetadata(20),
+        artifacts: {
+          ...runMetadata(20).artifacts,
+          runDirectory: '/tmp/run-extra/second/20260101T000000Z',
+        },
+      },
+    );
+
+    expect(renderAggregate([first, second], { color: false, cwd: '/tmp' })).toContain(
+      'Artifacts: /tmp/',
+    );
+  });
+
   it('returns no sequence hunks when expected and actual match', () => {
     expect(diffToolSequence(['a', 'b'], ['a', 'b'])).toEqual([]);
   });
diff --git a/src/benchmarks/claude-ui/__tests__/claude-ui-tool-config.test.ts b/src/benchmarks/claude-ui/__tests__/claude-ui-tool-config.test.ts
@@ -322,6 +322,73 @@ describe('Claude UI benchmark tool configuration', () => {
     ]);
   });
 
+  it('reports real failures when ignored and reportable patterns share a result', () => {
+    const config = readConfig(
+      {
+        name: 'private CLI weather',
+        prompt: 'weather.md',
+        failurePatterns: ['WAIT_TIMEOUT'],
+        ignoredFailurePatterns: ['element_disabled'],
+        toolAnalysis: {
+          matchers: [
+            {
+              kind: 'bashCommand',
+              commandPrefix: 'privatecli wait',
+              shortName: 'privatecli.wait',
+              uiAutomation: true,
+            },
+          ],
+        },
+      },
+      'private-cli.yml',
+    );
+    const transcript = [
+      line({
+        type: 'assistant',
+        message: {
+          content: [
+            {
+              type: 'tool_use',
+              id: 'tool-1',
+              name: 'Bash',
+              input: { command: 'privatecli wait element --label Weather --timeout 1' },
+            },
+          ],
+        },
+      }),
+      line({
+        type: 'user',
+        message: {
+          content: [
+            {
+              type: 'tool_result',
+              tool_use_id: 'tool-1',
+              is_error: true,
+              content: 'Exit code 1\n{"error":{"code":"element_disabled"}}\nWAIT_TIMEOUT',
+            },
+          ],
+        },
+      }),
+    ].join('\n');
+
+    const audit = analyzeClaudeJsonl(transcript, {
+      toolAnalysis: config.toolAnalysis,
+      failurePatterns: config.failurePatterns,
+      ignoredFailurePatterns: config.ignoredFailurePatterns,
+    });
+    const result = compareBenchmark(config, audit, runMetadata(10));
+
+    expect(audit.failures).toHaveLength(1);
+    expect(audit.patternFailures).toEqual([
+      {
+        pattern: 'WAIT_TIMEOUT',
+        line: 2,
+        excerpt: 'Exit code 1\n{"error":{"code":"element_disabled"}}\nWAIT_TIMEOUT',
+      },
+    ]);
+    expect(result.completed).toBe(false);
+  });
+
   it('ignores configured non-terminal tool failures', () => {
     const config = readConfig(
       {
diff --git a/src/benchmarks/claude-ui/config.ts b/src/benchmarks/claude-ui/config.ts
@@ -91,6 +91,19 @@ function readOptionalNumber(
   return raw;
 }
 
+function readOptionalPositiveFiniteNumber(
+  value: Record<string, unknown>,
+  key: string,
+  source: string,
+): number | undefined {
+  const raw = readOptionalNumber(value, key, source);
+  if (raw === undefined) return undefined;
+  if (!Number.isFinite(raw) || raw <= 0) {
+    throw new Error(`${source}.${key}: expected finite positive number`);
+  }
+  return raw;
+}
+
 function readNumberMap(value: unknown, source: string): Record<string, number> | undefined {
   if (value === undefined) return undefined;
   if (!isRecord(value)) throw new Error(`${source}: expected object`);
@@ -150,7 +163,7 @@ function readClaudeInvocationConfig(
     skillDirs,
     activateSkill,
     isolatedWorkingDirectory: readOptionalBoolean(raw, 'isolatedWorkingDirectory', source),
-    maxClaudeSeconds: readOptionalNumber(raw, 'maxClaudeSeconds', source),
+    maxClaudeSeconds: readOptionalPositiveFiniteNumber(raw, 'maxClaudeSeconds', source),
   };
 }
 
diff --git a/src/benchmarks/claude-ui/render.ts b/src/benchmarks/claude-ui/render.ts
@@ -373,12 +373,17 @@ export function renderSuiteReport(result: BenchmarkResult, options?: RenderOptio
   return `${sections.join('\n')}\n`;
 }
 
+function pathContainsOrEquals(root: string, target: string): boolean {
+  const relative = path.relative(root, target);
+  return relative === '' || (!relative.startsWith('..') && !path.isAbsolute(relative));
+}
+
 function commonArtifactRoot(results: readonly BenchmarkResult[]): string | undefined {
   if (results.length === 0) return undefined;
   const dirs = results.map((r) => path.dirname(r.run.artifacts.runDirectory));
   let root = dirs[0]!;
   for (const dir of dirs.slice(1)) {
-    while (!dir.startsWith(root)) {
+    while (!pathContainsOrEquals(root, dir)) {
       const next = path.dirname(root);
       if (next === root) return root;
       root = next;
diff --git a/src/benchmarks/claude-ui/transcript.ts b/src/benchmarks/claude-ui/transcript.ts
@@ -148,6 +148,24 @@ function matchesAnyPattern(
   return matchers.some((matcher) => matcher.regex.test(text));
 }
 
+function patternMatcherIsIgnored(
+  matcher: { pattern: string },
+  ignoredFailureMatchers: Array<{ pattern: string; regex: RegExp }>,
+): boolean {
+  return matchesAnyPattern(matcher.pattern, ignoredFailureMatchers);
+}
+
+function hasReportablePatternMatch(
+  text: string,
+  patternMatchers: Array<{ pattern: string; regex: RegExp }>,
+  ignoredFailureMatchers: Array<{ pattern: string; regex: RegExp }>,
+): boolean {
+  return patternMatchers.some(
+    (matcher) =>
+      matcher.regex.test(text) && !patternMatcherIsIgnored(matcher, ignoredFailureMatchers),
+  );
+}
+
 function appendPatternFailures(opts: {
   text: string;
   line: number;
@@ -156,8 +174,8 @@ function appendPatternFailures(opts: {
   ignoredFailureMatchers: Array<{ pattern: string; regex: RegExp }>;
   patternFailures: PatternFailureRecord[];
 }): void {
-  if (matchesAnyPattern(opts.text, opts.ignoredFailureMatchers)) return;
   for (const matcher of opts.patternMatchers) {
+    if (patternMatcherIsIgnored(matcher, opts.ignoredFailureMatchers)) continue;
     if (matcher.regex.test(opts.text)) {
       opts.patternFailures.push({
         pattern: matcher.pattern,
@@ -405,7 +423,12 @@ export function analyzeClaudeJsonl(text: string, options: AnalyzeOptions): Trans
         }
         if (!resultDidError(block, structured)) continue;
 
-        if (matchesAnyPattern(message, ignoredFailureMatchers)) continue;
+        if (
+          matchesAnyPattern(message, ignoredFailureMatchers) &&
+          !hasReportablePatternMatch(message, patternMatchers, ignoredFailureMatchers)
+        ) {
+          continue;
+        }
 
         for (const trackedTool of trackedTools) {
           const failureKey = [id, trackedTool.fullName, trackedTool.shortName, line, message].join(