Skip to content

Commit b37834e

Browse files
christso and claude authored
feat(compare): add normalized gain metric (#1101)
* feat(compare): add normalized gain metric to agentv compare

  Add Hake's normalized gain (g) to compare output, measuring improvement relative to remaining headroom rather than raw absolute delta.

  Formula: g = (score_candidate − score_baseline) / (1 − score_baseline)

  This separates genuine scaffolding from ceiling effects — a +5pp gain from a 90% baseline (g=0.5) is proportionally much larger than +5pp from a 10% baseline (g=0.056).

  Shown as "Norm. gain" in table output and "g" in matrix pairwise summary. Available as mean_normalized_gain in JSON output. Returns null when baseline is 1.0 (perfect score, no headroom).

  Closes #1100

  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* refactor(compare): use standard symbol 'g' for normalized gain

  Use 'g' consistently in both table summary and matrix pairwise output, matching the standard notation from Hake (1998) and SkillsBench paper.

  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* docs(compare): document normalized gain metric

  Add normalized gain (g) to compare docs: formula, interpretation table, updated table/JSON output examples, and tips section.

  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 5855aad commit b37834e

3 files changed

Lines changed: 247 additions & 35 deletions

File tree

apps/cli/src/commands/compare/index.ts

Lines changed: 43 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ interface MatchedResult {
4040
score1: number;
4141
score2: number;
4242
delta: number;
43+
normalizedGain: number | null;
4344
outcome: 'win' | 'loss' | 'tie';
4445
}
4546

@@ -53,6 +54,7 @@ export interface ComparisonOutput {
5354
losses: number;
5455
ties: number;
5556
meanDelta: number;
57+
meanNormalizedGain: number | null;
5658
};
5759
baseline?: string;
5860
candidate?: string;
@@ -111,6 +113,20 @@ export function loadCombinedResults(filePath: string): Map<string, EvalResult[]>
111113
return groups;
112114
}
113115

116+
/**
 * Hake's normalized gain: g = (score_candidate − score_baseline) / (1 − score_baseline)
 *
 * Measures improvement relative to the headroom remaining above the baseline,
 * rather than as a raw absolute delta.
 * Reference: Hake (1998), used by SkillsBench (arXiv:2602.12670).
 *
 * @param baselineScore - Score of the baseline run.
 * @param candidateScore - Score of the candidate run.
 * @returns The normalized gain, or `null` when it is undefined: the baseline is
 *   1.0 or higher (a perfect score leaves no room for improvement), or either
 *   score is not a finite number.
 */
export function computeNormalizedGain(
  baselineScore: number,
  candidateScore: number,
): number | null {
  // NaN fails every comparison, so it would slip past the headroom guard below
  // and come back as NaN; an infinite score would yield NaN or ±Infinity.
  // Report "undefined gain" explicitly instead of returning a non-finite number.
  if (!Number.isFinite(baselineScore) || !Number.isFinite(candidateScore)) return null;

  // No headroom left: the denominator would be zero (or negative).
  if (baselineScore >= 1.0) return null;

  return (candidateScore - baselineScore) / (1 - baselineScore);
}
129+
114130
export function classifyOutcome(delta: number, threshold: number): 'win' | 'loss' | 'tie' {
115131
if (delta >= threshold) return 'win';
116132
if (delta <= -threshold) return 'loss';
@@ -137,6 +153,7 @@ export function compareResults(
137153
score1,
138154
score2,
139155
delta,
156+
normalizedGain: computeNormalizedGain(score1, score2),
140157
outcome: classifyOutcome(delta, threshold),
141158
});
142159
matchedIds.add(testId);
@@ -153,6 +170,12 @@ export function compareResults(
153170
const meanDelta =
154171
matched.length > 0 ? matched.reduce((sum, m) => sum + m.delta, 0) / matched.length : 0;
155172

173+
const gainValues = matched.map((m) => m.normalizedGain).filter((g): g is number => g !== null);
174+
const meanNormalizedGain =
175+
gainValues.length > 0
176+
? Math.round((gainValues.reduce((sum, g) => sum + g, 0) / gainValues.length) * 1000) / 1000
177+
: null;
178+
156179
return {
157180
matched,
158181
unmatched: { file1: unmatchedFile1, file2: unmatchedFile2 },
@@ -163,6 +186,7 @@ export function compareResults(
163186
losses,
164187
ties,
165188
meanDelta: Math.round(meanDelta * 1000) / 1000,
189+
meanNormalizedGain,
166190
},
167191
};
168192
}
@@ -323,7 +347,7 @@ export function formatTable(comparison: ComparisonOutput, file1: string, file2:
323347

324348
// Summary
325349
lines.push('');
326-
const { wins, losses, ties, meanDelta } = comparison.summary;
350+
const { wins, losses, ties, meanDelta, meanNormalizedGain } = comparison.summary;
327351

328352
const winStr =
329353
wins > 0 ? `${c.green}${wins} win${wins !== 1 ? 's' : ''}${c.reset}` : `${wins} wins`;
@@ -340,9 +364,15 @@ export function formatTable(comparison: ComparisonOutput, file1: string, file2:
340364
? `${c.red}regressed${c.reset}`
341365
: `${c.gray}neutral${c.reset}`;
342366

343-
lines.push(
344-
`${c.bold}Summary:${c.reset} ${winStr}, ${lossStr}, ${tieStr} | Mean Δ: ${deltaColor}${deltaSign}${meanDelta.toFixed(3)}${c.reset} | Status: ${status}`,
345-
);
367+
let summaryLine = `${c.bold}Summary:${c.reset} ${winStr}, ${lossStr}, ${tieStr} | Mean Δ: ${deltaColor}${deltaSign}${meanDelta.toFixed(3)}${c.reset}`;
368+
if (meanNormalizedGain != null) {
369+
const gColor = meanNormalizedGain > 0 ? c.green : meanNormalizedGain < 0 ? c.red : c.gray;
370+
const gSign = meanNormalizedGain >= 0 ? '+' : '';
371+
summaryLine += ` | g: ${gColor}${gSign}${meanNormalizedGain.toFixed(3)}${c.reset}`;
372+
}
373+
summaryLine += ` | Status: ${status}`;
374+
375+
lines.push(summaryLine);
346376
lines.push('');
347377

348378
return lines.join('\n');
@@ -414,13 +444,18 @@ export function formatMatrix(matrixOutput: MatrixOutput, baselineTarget?: string
414444
...pairwise.map((pw) => ` ${pw.baseline}${pw.candidate}:`.length),
415445
);
416446
for (const p of pairwise) {
417-
const { wins, losses, ties, meanDelta } = p.summary;
447+
const { wins, losses, ties, meanDelta, meanNormalizedGain } = p.summary;
418448
const sign = meanDelta >= 0 ? '+' : '';
419449
const deltaColor = meanDelta > 0 ? c.green : meanDelta < 0 ? c.red : c.gray;
420450
const label = ` ${p.baseline}${p.candidate}:`;
421-
lines.push(
422-
`${padRight(label, maxLabelLen)} ${wins} win${wins !== 1 ? 's' : ''}, ${losses} loss${losses !== 1 ? 'es' : ''}, ${ties} tie${ties !== 1 ? 's' : ''} (${c.bold}Δ${c.reset} ${deltaColor}${sign}${meanDelta.toFixed(3)}${c.reset})`,
423-
);
451+
let pairLine = `${padRight(label, maxLabelLen)} ${wins} win${wins !== 1 ? 's' : ''}, ${losses} loss${losses !== 1 ? 'es' : ''}, ${ties} tie${ties !== 1 ? 's' : ''} (${c.bold}Δ${c.reset} ${deltaColor}${sign}${meanDelta.toFixed(3)}${c.reset}`;
452+
if (meanNormalizedGain != null) {
453+
const gColor = meanNormalizedGain > 0 ? c.green : meanNormalizedGain < 0 ? c.red : c.gray;
454+
const gSign = meanNormalizedGain >= 0 ? '+' : '';
455+
pairLine += `, ${c.bold}g${c.reset} ${gColor}${gSign}${meanNormalizedGain.toFixed(3)}${c.reset}`;
456+
}
457+
pairLine += ')';
458+
lines.push(pairLine);
424459
}
425460
}
426461

apps/cli/test/commands/compare/compare.test.ts

Lines changed: 164 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import {
77
classifyOutcome,
88
compareMatrix,
99
compareResults,
10+
computeNormalizedGain,
1011
determineExitCode,
1112
determineMatrixExitCode,
1213
formatMatrix,
@@ -459,7 +460,15 @@ describe('compare command', () => {
459460
{
460461
matched: [],
461462
unmatched: { file1: 0, file2: 0 },
462-
summary: { total: 2, matched: 1, wins: 1, losses: 0, ties: 0, meanDelta: 0.1 },
463+
summary: {
464+
total: 2,
465+
matched: 1,
466+
wins: 1,
467+
losses: 0,
468+
ties: 0,
469+
meanDelta: 0.1,
470+
meanNormalizedGain: null,
471+
},
463472
baseline: 'base',
464473
candidate: 'cand',
465474
},
@@ -476,14 +485,30 @@ describe('compare command', () => {
476485
{
477486
matched: [],
478487
unmatched: { file1: 0, file2: 0 },
479-
summary: { total: 2, matched: 1, wins: 0, losses: 1, ties: 0, meanDelta: -0.1 },
488+
summary: {
489+
total: 2,
490+
matched: 1,
491+
wins: 0,
492+
losses: 1,
493+
ties: 0,
494+
meanDelta: -0.1,
495+
meanNormalizedGain: null,
496+
},
480497
baseline: 'base',
481498
candidate: 'cand1',
482499
},
483500
{
484501
matched: [],
485502
unmatched: { file1: 0, file2: 0 },
486-
summary: { total: 2, matched: 1, wins: 1, losses: 0, ties: 0, meanDelta: 0.1 },
503+
summary: {
504+
total: 2,
505+
matched: 1,
506+
wins: 1,
507+
losses: 0,
508+
ties: 0,
509+
meanDelta: 0.1,
510+
meanNormalizedGain: null,
511+
},
487512
baseline: 'base',
488513
candidate: 'cand2',
489514
},
@@ -500,14 +525,30 @@ describe('compare command', () => {
500525
{
501526
matched: [],
502527
unmatched: { file1: 0, file2: 0 },
503-
summary: { total: 2, matched: 1, wins: 1, losses: 0, ties: 0, meanDelta: 0.05 },
528+
summary: {
529+
total: 2,
530+
matched: 1,
531+
wins: 1,
532+
losses: 0,
533+
ties: 0,
534+
meanDelta: 0.05,
535+
meanNormalizedGain: null,
536+
},
504537
baseline: 'base',
505538
candidate: 'cand1',
506539
},
507540
{
508541
matched: [],
509542
unmatched: { file1: 0, file2: 0 },
510-
summary: { total: 2, matched: 1, wins: 0, losses: 1, ties: 0, meanDelta: -0.2 },
543+
summary: {
544+
total: 2,
545+
matched: 1,
546+
wins: 0,
547+
losses: 1,
548+
ties: 0,
549+
meanDelta: -0.2,
550+
meanNormalizedGain: null,
551+
},
511552
baseline: 'cand1',
512553
candidate: 'cand2',
513554
},
@@ -530,7 +571,15 @@ describe('compare command', () => {
530571
matched: [],
531572
unmatched: { file1: 0, file2: 0 },
532573
// delta > 0 means candidate (zeta/baseline) scored higher → alpha regressed
533-
summary: { total: 2, matched: 1, wins: 1, losses: 0, ties: 0, meanDelta: 0.2 },
574+
summary: {
575+
total: 2,
576+
matched: 1,
577+
wins: 1,
578+
losses: 0,
579+
ties: 0,
580+
meanDelta: 0.2,
581+
meanNormalizedGain: null,
582+
},
534583
baseline: 'alpha',
535584
candidate: 'zeta',
536585
},
@@ -550,7 +599,15 @@ describe('compare command', () => {
550599
unmatched: { file1: 0, file2: 0 },
551600
// delta < 0 means candidate (zeta/baseline) scored lower → alpha is better
552601
// That means alpha did NOT regress vs baseline zeta
553-
summary: { total: 2, matched: 1, wins: 0, losses: 1, ties: 0, meanDelta: -0.1 },
602+
summary: {
603+
total: 2,
604+
matched: 1,
605+
wins: 0,
606+
losses: 1,
607+
ties: 0,
608+
meanDelta: -0.1,
609+
meanNormalizedGain: null,
610+
},
554611
baseline: 'alpha',
555612
candidate: 'zeta',
556613
},
@@ -584,7 +641,15 @@ describe('compare command', () => {
584641
{
585642
matched: [],
586643
unmatched: { file1: 0, file2: 0 },
587-
summary: { total: 4, matched: 2, wins: 1, losses: 1, ties: 0, meanDelta: 0.025 },
644+
summary: {
645+
total: 4,
646+
matched: 2,
647+
wins: 1,
648+
losses: 1,
649+
ties: 0,
650+
meanDelta: 0.025,
651+
meanNormalizedGain: null,
652+
},
588653
baseline: 'model-a',
589654
candidate: 'model-b',
590655
},
@@ -622,7 +687,15 @@ describe('compare command', () => {
622687
{
623688
matched: [],
624689
unmatched: { file1: 0, file2: 0 },
625-
summary: { total: 2, matched: 1, wins: 1, losses: 0, ties: 0, meanDelta: 0.1 },
690+
summary: {
691+
total: 2,
692+
matched: 1,
693+
wins: 1,
694+
losses: 0,
695+
ties: 0,
696+
meanDelta: 0.1,
697+
meanNormalizedGain: null,
698+
},
626699
baseline: 'a',
627700
candidate: 'b',
628701
},
@@ -648,4 +721,86 @@ describe('compare command', () => {
648721
expect(output).toContain('--');
649722
});
650723
});
724+
725+
describe('computeNormalizedGain', () => {
  // Each case pins g = (candidate − baseline) / (1 − baseline) for a hand-computed pair.

  it('should compute gain relative to remaining headroom', () => {
    // 0.25 gained out of the 0.5 headroom above the baseline → 0.5
    const gain = computeNormalizedGain(0.5, 0.75);
    expect(gain).toBeCloseTo(0.5, 10);
  });

  it('should return 1.0 when candidate reaches perfect score', () => {
    const gain = computeNormalizedGain(0.5, 1.0);
    expect(gain).toBeCloseTo(1.0, 10);
  });

  it('should return negative values when candidate regresses', () => {
    // 0.25 lost, measured against the 0.5 headroom → -0.5
    const gain = computeNormalizedGain(0.5, 0.25);
    expect(gain).toBeCloseTo(-0.5, 10);
  });

  it('should return null when baseline is perfect (no headroom)', () => {
    // With a baseline of 1.0 the result is null whatever the candidate scored.
    for (const candidateScore of [1.0, 0.5]) {
      expect(computeNormalizedGain(1.0, candidateScore)).toBeNull();
    }
  });

  it('should return 0 when scores are equal', () => {
    const gain = computeNormalizedGain(0.5, 0.5);
    expect(gain).toBeCloseTo(0, 10);
  });

  it('should handle low baseline correctly', () => {
    // 0.45 gained out of the 0.9 headroom above the baseline → 0.5
    const gain = computeNormalizedGain(0.1, 0.55);
    expect(gain).toBeCloseTo(0.5, 10);
  });
});
754+
755+
describe('compareResults normalized gain', () => {
  it('should include normalizedGain in matched results', () => {
    const baselineRun = [{ testId: 'case-1', score: 0.5 }];
    const candidateRun = [{ testId: 'case-1', score: 0.75 }];

    const { matched } = compareResults(baselineRun, candidateRun, 0.1);

    expect(matched[0].normalizedGain).toBeCloseTo(0.5, 10);
  });

  it('should compute meanNormalizedGain in summary', () => {
    // Both cases recover half of their remaining headroom:
    //   case-1: 0.25 / 0.5 = 0.5
    //   case-2: 0.1 / 0.2 = 0.5
    const baselineRun = [
      { testId: 'case-1', score: 0.5 },
      { testId: 'case-2', score: 0.8 },
    ];
    const candidateRun = [
      { testId: 'case-1', score: 0.75 },
      { testId: 'case-2', score: 0.9 },
    ];

    const { summary } = compareResults(baselineRun, candidateRun, 0.1);

    expect(summary.meanNormalizedGain).toBeCloseTo(0.5, 10);
  });

  it('should set normalizedGain to null when baseline is 1.0', () => {
    const baselineRun = [{ testId: 'case-1', score: 1.0 }];
    const candidateRun = [{ testId: 'case-1', score: 1.0 }];

    const { matched, summary } = compareResults(baselineRun, candidateRun, 0.1);

    expect(matched[0].normalizedGain).toBeNull();
    expect(summary.meanNormalizedGain).toBeNull();
  });

  it('should exclude null gains from mean computation', () => {
    // case-2 has a perfect baseline, so its gain is null.
    const baselineRun = [
      { testId: 'case-1', score: 0.5 },
      { testId: 'case-2', score: 1.0 },
    ];
    const candidateRun = [
      { testId: 'case-1', score: 0.75 }, // g = 0.5
      { testId: 'case-2', score: 1.0 },
    ];

    const { summary } = compareResults(baselineRun, candidateRun, 0.1);

    // Only case-1 (g = 0.5) contributes to the mean; case-2 is excluded.
    expect(summary.meanNormalizedGain).toBeCloseTo(0.5, 10);
  });
});
651806
});

0 commit comments

Comments
 (0)