Skip to content

Commit 81a374e

Browse files
committed
fix: recalibrate success criteria thresholds for correction-aware metrics
First-attempt now means zero corrections, which is stricter than before. Lower the firstAttemptPassRate threshold from 90% to 30% (aspirational), add withCorrectionPassRate at 90% as the primary quality gate, and keep withRetryPassRate at 95%.
1 parent 03984c0 commit 81a374e

2 files changed

Lines changed: 46 additions & 26 deletions

File tree

tests/evals/success-criteria.spec.ts

Lines changed: 44 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -2,65 +2,74 @@ import { describe, it, expect } from 'vitest';
22
import { validateResults, DEFAULT_CRITERIA, type SuccessCriteria } from './success-criteria.js';
33
import type { EvalResult } from './types.js';
44

5-
function makeResult(passed: boolean, attempts: number = 1): EvalResult {
5+
function makeResult(passed: boolean, attempts: number = 1, correctionAttempts: number = 0): EvalResult {
66
return {
77
scenario: `test-${Math.random().toString(36).slice(2)}`,
88
passed,
99
duration: 1000,
1010
attempts,
11+
correctionAttempts,
1112
};
1213
}
1314

1415
describe('success-criteria', () => {
1516
describe('DEFAULT_CRITERIA', () => {
1617
it('has expected default thresholds', () => {
17-
expect(DEFAULT_CRITERIA.firstAttemptPassRate).toBe(0.9);
18+
expect(DEFAULT_CRITERIA.firstAttemptPassRate).toBe(0.3);
19+
expect(DEFAULT_CRITERIA.withCorrectionPassRate).toBe(0.9);
1820
expect(DEFAULT_CRITERIA.withRetryPassRate).toBe(0.95);
1921
});
2022
});
2123

2224
describe('validateResults', () => {
2325
it('returns passed=true when all criteria met', () => {
24-
// 10 results, 9 passed on first attempt, 1 passed on retry
26+
// 10 results: 4 clean (40% > 30%), 5 corrected (9/10 = 90% correction), 1 retried (100% retry)
2527
const results: EvalResult[] = [
26-
...Array(9)
28+
...Array(4)
2729
.fill(null)
28-
.map(() => makeResult(true, 1)),
30+
.map(() => makeResult(true, 1, 0)),
31+
...Array(5)
32+
.fill(null)
33+
.map(() => makeResult(true, 1, 1)),
2934
makeResult(true, 2),
3035
];
3136

3237
const validation = validateResults(results);
3338

3439
expect(validation.passed).toBe(true);
3540
expect(validation.failures).toHaveLength(0);
36-
expect(validation.actual.firstAttemptPassRate).toBe(0.9);
41+
expect(validation.actual.firstAttemptPassRate).toBe(0.4);
42+
expect(validation.actual.withCorrectionPassRate).toBe(0.9);
3743
expect(validation.actual.withRetryPassRate).toBe(1);
3844
});
3945

4046
it('returns passed=false when first-attempt rate below threshold', () => {
41-
// 10 results, only 8 passed on first attempt
47+
// 10 results: only 2 passed on first attempt (20% < 30% threshold); the other 8 passed on attempt 2
4248
const results: EvalResult[] = [
43-
...Array(8)
49+
...Array(2)
4450
.fill(null)
4551
.map(() => makeResult(true, 1)),
46-
makeResult(true, 2),
52+
...Array(7)
53+
.fill(null)
54+
.map(() => makeResult(true, 2)),
4755
makeResult(true, 2),
4856
];
4957

5058
const validation = validateResults(results);
5159

5260
expect(validation.passed).toBe(false);
53-
expect(validation.failures).toHaveLength(1);
54-
expect(validation.failures[0]).toContain('First-attempt');
55-
expect(validation.failures[0]).toContain('80.0%');
61+
expect(validation.failures.some((f) => f.includes('First-attempt'))).toBe(true);
5662
});
5763

5864
it('returns passed=false when with-retry rate below threshold', () => {
59-
// 10 results, 9 passed first attempt, 1 failed entirely
65+
// 10 results: 4 clean, 5 corrected (90% correction), 1 failed → 90% retry < 95%
6066
const results: EvalResult[] = [
61-
...Array(9)
67+
...Array(4)
6268
.fill(null)
63-
.map(() => makeResult(true, 1)),
69+
.map(() => makeResult(true, 1, 0)),
70+
...Array(5)
71+
.fill(null)
72+
.map(() => makeResult(true, 1, 1)),
6473
makeResult(false, 3),
6574
];
6675

@@ -71,21 +80,24 @@ describe('success-criteria', () => {
7180
expect(validation.failures[0]).toContain('With-retry');
7281
});
7382

74-
it('returns both failures when both criteria not met', () => {
75-
// 10 results, 7 passed first attempt, 1 failed
83+
it('returns both failures when multiple criteria not met', () => {
84+
// 10 results: 2 passed first attempt (20% < 30%), 4 passed on attempt 2, 4 failed entirely (60% < 95% retry)
7685
const results: EvalResult[] = [
77-
...Array(7)
86+
...Array(2)
7887
.fill(null)
7988
.map(() => makeResult(true, 1)),
80-
makeResult(true, 2),
81-
makeResult(true, 2),
82-
makeResult(false, 3),
89+
...Array(4)
90+
.fill(null)
91+
.map(() => makeResult(true, 2)),
92+
...Array(4)
93+
.fill(null)
94+
.map(() => makeResult(false, 3)),
8395
];
8496

8597
const validation = validateResults(results);
8698

8799
expect(validation.passed).toBe(false);
88-
expect(validation.failures).toHaveLength(2);
100+
expect(validation.failures.length).toBeGreaterThanOrEqual(2);
89101
});
90102

91103
it('handles empty results array', () => {
@@ -120,11 +132,18 @@ describe('success-criteria', () => {
120132
});
121133

122134
it('passes when exactly at threshold', () => {
123-
// Exactly 90% first-attempt, 95% with-retry
135+
// 20 results:
136+
// 6 clean first-attempt (attempt=1, corrections=0) → 30% first-attempt
137+
// 12 self-corrected (attempt=1, corrections=1) → 18/20 = 90% with-correction
138+
// 1 passed on scenario retry (attempt=2) → 19/20 = 95% with-retry
139+
// 1 failed (attempt=3)
124140
const results: EvalResult[] = [
125-
...Array(18)
141+
...Array(6)
126142
.fill(null)
127-
.map(() => makeResult(true, 1)),
143+
.map(() => makeResult(true, 1, 0)),
144+
...Array(12)
145+
.fill(null)
146+
.map(() => makeResult(true, 1, 1)),
128147
makeResult(true, 2),
129148
makeResult(false, 3),
130149
];

tests/evals/success-criteria.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@ export interface SuccessCriteria {
1717

1818
/** Default thresholds for CI enforcement */
1919
export const DEFAULT_CRITERIA: SuccessCriteria = {
20-
firstAttemptPassRate: 0.9,
20+
firstAttemptPassRate: 0.3,
21+
withCorrectionPassRate: 0.9,
2122
withRetryPassRate: 0.95,
2223
};
2324

0 commit comments

Comments
 (0)