Skip to content

Commit 3ab280a

Browse files
cameroncooke and codex committed
fix(benchmarks): Enforce Claude UI failure conditions
Treat configured failure pattern matches as incomplete benchmark runs so CI exits non-zero for explicitly declared failure conditions.

Reject activateSkill configs without skillDirs during suite parsing to avoid late failures after expensive setup.

Co-Authored-By: OpenAI Codex <noreply@openai.com>
1 parent fdbe02b commit 3ab280a

3 files changed

Lines changed: 71 additions & 2 deletions

File tree

src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts

Lines changed: 62 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -231,6 +231,52 @@ describe('Claude UI benchmark analysis', () => {
231231
);
232232

233233
expect(result.completion.issueCount).toBe(1);
234+
expect(result.completion.completed).toBe(false);
235+
expect(result.completed).toBe(false);
236+
});
237+
238+
it('marks the benchmark incomplete when configured failure patterns match', () => {
239+
const transcript = [
240+
line({
241+
type: 'assistant',
242+
message: {
243+
content: [
244+
{ type: 'tool_use', id: 'tool-1', name: `${toolPrefix}wait_for_ui`, input: {} },
245+
],
246+
},
247+
}),
248+
line({
249+
type: 'user',
250+
message: {
251+
content: [
252+
{
253+
type: 'tool_result',
254+
tool_use_id: 'tool-1',
255+
is_error: false,
256+
content: 'BUILD FAILED',
257+
},
258+
],
259+
},
260+
}),
261+
].join('\n');
262+
263+
const audit = analyzeClaudeJsonl(transcript, {
264+
mcpToolPrefix: toolPrefix,
265+
failurePatterns: ['BUILD FAILED'],
266+
});
267+
268+
expect(audit.failures).toEqual([]);
269+
expect(audit.patternFailures).toHaveLength(1);
270+
271+
const result = compareBenchmark(
272+
{ name: 'weather', prompt: 'prompt.md' },
273+
audit,
274+
runMetadata(10),
275+
);
276+
277+
expect(result.completion.issueCount).toBe(1);
278+
expect(result.completion.completed).toBe(false);
279+
expect(result.completed).toBe(false);
234280
});
235281

236282
it('counts parser failures once when malformed JSONL also records parse errors', () => {
@@ -284,6 +330,22 @@ describe('Claude UI benchmark analysis', () => {
284330
).toThrow('weather.yml.failurePatterns[1]: invalid regular expression');
285331
});
286332

333+
it('rejects activateSkill without skillDirs when loading config', () => {
334+
expect(() =>
335+
readConfig(
336+
{
337+
name: 'weather',
338+
prompt: 'prompt.md',
339+
claude: {
340+
activateSkill: 'vendor-cli',
341+
isolatedWorkingDirectory: true,
342+
},
343+
},
344+
'weather.yml',
345+
),
346+
).toThrow('weather.yml.claude.activateSkill: requires skillDirs');
347+
});
348+
287349
it('rejects invalid session defaults when loading config', () => {
288350
expect(() =>
289351
readConfig(

src/benchmarks/claude-ui/compare.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ function processCompleted(run: BenchmarkRunMetadata, audit: TranscriptAudit): bo
134134
if (audit.parseErrors.length > 0) return false;
135135
if (run.claudeExitCode !== 0) return false;
136136
if (run.parserExitCode !== 0) return false;
137+
if (audit.patternFailures.length > 0) return false;
137138
return !audit.failures.some(isTerminalClaudeFailure);
138139
}
139140

src/benchmarks/claude-ui/config.ts

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,12 @@ function readClaudeInvocationConfig(
115115
) {
116116
throw new Error(`${source}.permissionMode: expected 'default' or 'bypassPermissions'`);
117117
}
118+
const skillDirs = readOptionalStringArray(raw, 'skillDirs', source);
119+
const activateSkill = readOptionalString(raw, 'activateSkill', source);
120+
if (activateSkill !== undefined && (!skillDirs || skillDirs.length === 0)) {
121+
throw new Error(`${source}.activateSkill: requires skillDirs`);
122+
}
123+
118124
return {
119125
useMcpServer: readOptionalBoolean(raw, 'useMcpServer', source),
120126
permissionMode,
@@ -123,8 +129,8 @@ function readClaudeInvocationConfig(
123129
appendSystemPrompt: readOptionalString(raw, 'appendSystemPrompt', source),
124130
extraArgs: readOptionalStringArray(raw, 'extraArgs', source),
125131
pluginDirs: readOptionalStringArray(raw, 'pluginDirs', source),
126-
skillDirs: readOptionalStringArray(raw, 'skillDirs', source),
127-
activateSkill: readOptionalString(raw, 'activateSkill', source),
132+
skillDirs,
133+
activateSkill,
128134
isolatedWorkingDirectory: readOptionalBoolean(raw, 'isolatedWorkingDirectory', source),
129135
maxClaudeSeconds: readOptionalNumber(raw, 'maxClaudeSeconds', source),
130136
};

0 commit comments

Comments
 (0)