@@ -2,65 +2,74 @@ import { describe, it, expect } from 'vitest';
22import { validateResults , DEFAULT_CRITERIA , type SuccessCriteria } from './success-criteria.js' ;
33import type { EvalResult } from './types.js' ;
44
// Test helper: builds an EvalResult fixture from the given outcome fields.
// The '-' line is the previous signature; the '+' line adds an optional
// correctionAttempts parameter (default 0), so existing two-argument calls
// keep working and produce correctionAttempts = 0.
5- function makeResult ( passed : boolean , attempts : number = 1 ) : EvalResult {
5+ function makeResult ( passed : boolean , attempts : number = 1 , correctionAttempts : number = 0 ) : EvalResult {
66 return {
// Scenario name is "test-" plus a random base-36 suffix, so names differ
// between calls and between runs (not reproducible).
77 scenario : `test-${ Math . random ( ) . toString ( 36 ) . slice ( 2 ) } ` ,
88 passed,
// Fixed duration for every fixture; the visible tests never assert on it.
99 duration : 1000 ,
1010 attempts,
11+ correctionAttempts,
1112 } ;
1213}
1314
1415describe ( 'success-criteria' , ( ) => {
// Asserts the exact threshold values exported as DEFAULT_CRITERIA.
1516 describe ( 'DEFAULT_CRITERIA' , ( ) => {
1617 it ( 'has expected default thresholds' , ( ) => {
// Removed expectation: the first-attempt threshold used to be asserted as 0.9.
17- expect ( DEFAULT_CRITERIA . firstAttemptPassRate ) . toBe ( 0.9 ) ;
// Added expectations: the first-attempt threshold is now 0.3, and the new
// withCorrectionPassRate threshold is 0.9.
18+ expect ( DEFAULT_CRITERIA . firstAttemptPassRate ) . toBe ( 0.3 ) ;
19+ expect ( DEFAULT_CRITERIA . withCorrectionPassRate ) . toBe ( 0.9 ) ;
// Unchanged: the with-retry threshold is still asserted as 0.95.
1820 expect ( DEFAULT_CRITERIA . withRetryPassRate ) . toBe ( 0.95 ) ;
1921 } ) ;
2022 } ) ;
2123
2224 describe ( 'validateResults' , ( ) => {
// Happy path: a 10-result fixture for which validateResults is expected to
// report passed = true with an empty failures list.
// validateResults is called without explicit criteria; presumably it falls
// back to DEFAULT_CRITERIA — confirm in success-criteria.
2325 it ( 'returns passed=true when all criteria met' , ( ) => {
24- // 10 results, 9 passed on first attempt , 1 passed on retry
26+ // 10 results: 4 clean (40% > 30%), 5 corrected (9/10 = 90% correction) , 1 retried (100% retry)
2527 const results : EvalResult [ ] = [
// New fixture: 4 passes with attempts = 1 and correctionAttempts = 0.
26- ...Array ( 9 )
28+ ...Array ( 4 )
2729 . fill ( null )
28- . map ( ( ) => makeResult ( true , 1 ) ) ,
30+ . map ( ( ) => makeResult ( true , 1 , 0 ) ) ,
// New fixture: 5 passes with attempts = 1 and correctionAttempts = 1.
31+ ...Array ( 5 )
32+ . fill ( null )
33+ . map ( ( ) => makeResult ( true , 1 , 1 ) ) ,
// One pass on the second attempt; correctionAttempts takes its default of 0.
2934 makeResult ( true , 2 ) ,
3035 ] ;
3136
3237 const validation = validateResults ( results ) ;
3338
3439 expect ( validation . passed ) . toBe ( true ) ;
3540 expect ( validation . failures ) . toHaveLength ( 0 ) ;
// Expected rates for the new fixture: 4/10, 9/10 and 10/10.
// NOTE(review): 0.9 for with-correction implies the attempts = 2 pass is not
// counted there — confirm against validateResults.
36- expect ( validation . actual . firstAttemptPassRate ) . toBe ( 0.9 ) ;
41+ expect ( validation . actual . firstAttemptPassRate ) . toBe ( 0.4 ) ;
42+ expect ( validation . actual . withCorrectionPassRate ) . toBe ( 0.9 ) ;
3743 expect ( validation . actual . withRetryPassRate ) . toBe ( 1 ) ;
3844 } ) ;
3945
// Failure path: every result passes eventually, but only 2 of 10 pass with
// attempts = 1, so the test expects passed = false and a 'First-attempt' failure.
4046 it ( 'returns passed=false when first-attempt rate below threshold' , ( ) => {
41- // 10 results, only 8 passed on first attempt
47+ // 10 results, only 2 passed on first attempt (20% < 30% threshold)
4248 const results : EvalResult [ ] = [
// New fixture: 2 passes on attempt 1 (correctionAttempts defaults to 0).
43- ...Array ( 8 )
49+ ...Array ( 2 )
4450 . fill ( null )
4551 . map ( ( ) => makeResult ( true , 1 ) ) ,
// New fixture: 7 passes on attempt 2, replacing one of the two single entries.
46- makeResult ( true , 2 ) ,
52+ ...Array ( 7 )
53+ . fill ( null )
54+ . map ( ( ) => makeResult ( true , 2 ) ) ,
// Tenth result: one more pass on attempt 2.
4755 makeResult ( true , 2 ) ,
4856 ] ;
4957
5058 const validation = validateResults ( results ) ;
5159
5260 expect ( validation . passed ) . toBe ( false ) ;
// The exact-count, failures[0] and '80.0%' assertions are removed; the new check
// only requires that some failure message mentions 'First-attempt'.
// NOTE(review): presumably loosened because this fixture may also fail the
// with-correction criterion, making count and order unreliable — confirm.
53- expect ( validation . failures ) . toHaveLength ( 1 ) ;
54- expect ( validation . failures [ 0 ] ) . toContain ( 'First-attempt' ) ;
55- expect ( validation . failures [ 0 ] ) . toContain ( '80.0%' ) ;
61+ expect ( validation . failures . some ( ( f ) => f . includes ( 'First-attempt' ) ) ) . toBe ( true ) ;
5662 } ) ;
5763
5864 it ( 'returns passed=false when with-retry rate below threshold' , ( ) => {
59- // 10 results, 9 passed first attempt , 1 failed entirely
65+ // 10 results: 4 clean, 5 corrected (90% correction) , 1 failed → 90% retry < 95%
6066 const results : EvalResult [ ] = [
61- ...Array ( 9 )
67+ ...Array ( 4 )
6268 . fill ( null )
63- . map ( ( ) => makeResult ( true , 1 ) ) ,
69+ . map ( ( ) => makeResult ( true , 1 , 0 ) ) ,
70+ ...Array ( 5 )
71+ . fill ( null )
72+ . map ( ( ) => makeResult ( true , 1 , 1 ) ) ,
6473 makeResult ( false , 3 ) ,
6574 ] ;
6675
@@ -71,21 +80,24 @@ describe('success-criteria', () => {
7180 expect ( validation . failures [ 0 ] ) . toContain ( 'With-retry' ) ;
7281 } ) ;
7382
// Failure path with several criteria missed at once: expects passed = false
// and at least two failure messages.
// NOTE(review): the new title still says "both" while the assertion allows
// more than two failures — consider rewording the title.
74- it ( 'returns both failures when both criteria not met' , ( ) => {
75- // 10 results, 7 passed first attempt, 1 failed
83+ it ( 'returns both failures when multiple criteria not met' , ( ) => {
84+ // 10 results, 2 passed first attempt (20% < 30%), 4 failed entirely (60% < 95% retry)
7685 const results : EvalResult [ ] = [
// New fixture: 2 passes on attempt 1.
77- ...Array ( 7 )
86+ ...Array ( 2 )
7887 . fill ( null )
7988 . map ( ( ) => makeResult ( true , 1 ) ) ,
// New fixture: 4 passes on attempt 2, then 4 results with passed = false and attempts = 3.
80- makeResult ( true , 2 ) ,
81- makeResult ( true , 2 ) ,
82- makeResult ( false , 3 ) ,
89+ ...Array ( 4 )
90+ . fill ( null )
91+ . map ( ( ) => makeResult ( true , 2 ) ) ,
92+ ...Array ( 4 )
93+ . fill ( null )
94+ . map ( ( ) => makeResult ( false , 3 ) ) ,
8395 ] ;
8496
8597 const validation = validateResults ( results ) ;
8698
8799 expect ( validation . passed ) . toBe ( false ) ;
// Loosened from exactly 2 failures to a lower bound of 2.
// NOTE(review): presumably the with-correction criterion can add a third
// failure for this fixture — confirm against validateResults.
88- expect ( validation . failures ) . toHaveLength ( 2 ) ;
100+ expect ( validation . failures . length ) . toBeGreaterThanOrEqual ( 2 ) ;
89101 } ) ;
90102
91103 it ( 'handles empty results array' , ( ) => {
@@ -120,11 +132,18 @@ describe('success-criteria', () => {
120132 } ) ;
121133
122134 it ( 'passes when exactly at threshold' , ( ) => {
123- // Exactly 90% first-attempt, 95% with-retry
135+ // 20 results:
136+ // 6 clean first-attempt (attempt=1, corrections=0) → 30% first-attempt
137+ // 12 self-corrected (attempt=1, corrections=1) → 18/20 = 90% with-correction
138+ // 1 passed on scenario retry (attempt=2) → 19/20 = 95% with-retry
139+ // 1 failed (attempt=3)
124140 const results : EvalResult [ ] = [
125- ...Array ( 18 )
141+ ...Array ( 6 )
126142 . fill ( null )
127- . map ( ( ) => makeResult ( true , 1 ) ) ,
143+ . map ( ( ) => makeResult ( true , 1 , 0 ) ) ,
144+ ...Array ( 12 )
145+ . fill ( null )
146+ . map ( ( ) => makeResult ( true , 1 , 1 ) ) ,
128147 makeResult ( true , 2 ) ,
129148 makeResult ( false , 3 ) ,
130149 ] ;
0 commit comments