fix(benchmarks): Enforce Claude UI failure conditions

cameroncooke · codex · cameroncooke · commit 3ab280ad53b6 · 2026-05-26T11:23:30.000Z
Treat configured failure pattern matches as incomplete benchmark runs so
CI exits non-zero for explicitly declared failure conditions. Reject
activateSkill configs without skillDirs during suite parsing to avoid
late failures after expensive setup.

Co-Authored-By: OpenAI Codex <noreply@openai.com>
diff --git a/src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts b/src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts
@@ -231,6 +231,52 @@ describe('Claude UI benchmark analysis', () => {
     );
 
     expect(result.completion.issueCount).toBe(1);
+    expect(result.completion.completed).toBe(false);
+    expect(result.completed).toBe(false);
+  });
+
+  it('marks the benchmark incomplete when configured failure patterns match', () => {
+    const transcript = [
+      line({
+        type: 'assistant',
+        message: {
+          content: [
+            { type: 'tool_use', id: 'tool-1', name: `${toolPrefix}wait_for_ui`, input: {} },
+          ],
+        },
+      }),
+      line({
+        type: 'user',
+        message: {
+          content: [
+            {
+              type: 'tool_result',
+              tool_use_id: 'tool-1',
+              is_error: false,
+              content: 'BUILD FAILED',
+            },
+          ],
+        },
+      }),
+    ].join('\n');
+
+    const audit = analyzeClaudeJsonl(transcript, {
+      mcpToolPrefix: toolPrefix,
+      failurePatterns: ['BUILD FAILED'],
+    });
+
+    expect(audit.failures).toEqual([]);
+    expect(audit.patternFailures).toHaveLength(1);
+
+    const result = compareBenchmark(
+      { name: 'weather', prompt: 'prompt.md' },
+      audit,
+      runMetadata(10),
+    );
+
+    expect(result.completion.issueCount).toBe(1);
+    expect(result.completion.completed).toBe(false);
+    expect(result.completed).toBe(false);
   });
 
   it('counts parser failures once when malformed JSONL also records parse errors', () => {
@@ -284,6 +330,22 @@ describe('Claude UI benchmark analysis', () => {
     ).toThrow('weather.yml.failurePatterns[1]: invalid regular expression');
   });
 
+  it('rejects activateSkill without skillDirs when loading config', () => {
+    expect(() =>
+      readConfig(
+        {
+          name: 'weather',
+          prompt: 'prompt.md',
+          claude: {
+            activateSkill: 'vendor-cli',
+            isolatedWorkingDirectory: true,
+          },
+        },
+        'weather.yml',
+      ),
+    ).toThrow('weather.yml.claude.activateSkill: requires skillDirs');
+  });
+
   it('rejects invalid session defaults when loading config', () => {
     expect(() =>
       readConfig(
diff --git a/src/benchmarks/claude-ui/compare.ts b/src/benchmarks/claude-ui/compare.ts
@@ -134,6 +134,7 @@ function processCompleted(run: BenchmarkRunMetadata, audit: TranscriptAudit): bo
   if (audit.parseErrors.length > 0) return false;
   if (run.claudeExitCode !== 0) return false;
   if (run.parserExitCode !== 0) return false;
+  if (audit.patternFailures.length > 0) return false;
   return !audit.failures.some(isTerminalClaudeFailure);
 }
 
diff --git a/src/benchmarks/claude-ui/config.ts b/src/benchmarks/claude-ui/config.ts
@@ -115,6 +115,12 @@ function readClaudeInvocationConfig(
   ) {
     throw new Error(`${source}.permissionMode: expected 'default' or 'bypassPermissions'`);
   }
+  const skillDirs = readOptionalStringArray(raw, 'skillDirs', source);
+  const activateSkill = readOptionalString(raw, 'activateSkill', source);
+  if (activateSkill !== undefined && (!skillDirs || skillDirs.length === 0)) {
+    throw new Error(`${source}.activateSkill: requires skillDirs`);
+  }
+
   return {
     useMcpServer: readOptionalBoolean(raw, 'useMcpServer', source),
     permissionMode,
@@ -123,8 +129,8 @@ function readClaudeInvocationConfig(
     appendSystemPrompt: readOptionalString(raw, 'appendSystemPrompt', source),
     extraArgs: readOptionalStringArray(raw, 'extraArgs', source),
     pluginDirs: readOptionalStringArray(raw, 'pluginDirs', source),
-    skillDirs: readOptionalStringArray(raw, 'skillDirs', source),
-    activateSkill: readOptionalString(raw, 'activateSkill', source),
+    skillDirs,
+    activateSkill,
     isolatedWorkingDirectory: readOptionalBoolean(raw, 'isolatedWorkingDirectory', source),
     maxClaudeSeconds: readOptionalNumber(raw, 'maxClaudeSeconds', source),
   };

Original file line number	Diff line number	Diff line change
`@@ -134,6 +134,7 @@ function processCompleted(run: BenchmarkRunMetadata, audit: TranscriptAudit): bo`
`134`	`134`	`if (audit.parseErrors.length > 0) return false;`
`135`	`135`	`if (run.claudeExitCode !== 0) return false;`
`136`	`136`	`if (run.parserExitCode !== 0) return false;`
	`137`	`+ if (audit.patternFailures.length > 0) return false;`
`137`	`138`	`return !audit.failures.some(isTerminalClaudeFailure);`
`138`	`139`	`}`
`139`	`140`