feat(compare): add normalized gain metric (#1101)

christso · claude · web-flow · commit b37834eb5ad3 · 2026-04-15T10:22:14.000+10:00
* feat(compare): add normalized gain metric to agentv compare Add Hake's normalized gain (g) to compare output, measuring improvement relative to remaining headroom rather than raw absolute delta. Formula: g = (score_candidate − score_baseline) / (1 − score_baseline) This separates genuine scaffolding from ceiling effects — a +5pp gain from a 90% baseline (g=0.5) is proportionally much larger than +5pp from a 10% baseline (g=0.056). Shown as "Norm. gain" in table output and "g" in matrix pairwise summary. Available as mean_normalized_gain in JSON output. Returns null when baseline is 1.0 (perfect score, no headroom). Closes #1100 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * refactor(compare): use standard symbol 'g' for normalized gain Use 'g' consistently in both table summary and matrix pairwise output, matching the standard notation from Hake (1998) and SkillsBench paper. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * docs(compare): document normalized gain metric Add normalized gain (g) to compare docs: formula, interpretation table, updated table/JSON output examples, and tips section. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/apps/cli/src/commands/compare/index.ts b/apps/cli/src/commands/compare/index.ts
@@ -40,6 +40,7 @@ interface MatchedResult {
   score1: number;
   score2: number;
   delta: number;
+  normalizedGain: number | null;
   outcome: 'win' | 'loss' | 'tie';
 }
 
@@ -53,6 +54,7 @@ export interface ComparisonOutput {
     losses: number;
     ties: number;
     meanDelta: number;
+    meanNormalizedGain: number | null;
   };
   baseline?: string;
   candidate?: string;
@@ -111,6 +113,20 @@ export function loadCombinedResults(filePath: string): Map<string, EvalResult[]>
   return groups;
 }
 
+/**
+ * Hake's normalized gain: g = (score_candidate − score_baseline) / (1 − score_baseline).
+ * Measures improvement relative to remaining headroom; negative when the candidate regresses.
+ * Returns null when baselineScore >= 1.0 (no headroom left, so the ratio is undefined).
+ * Reference: Hake (1998), used by SkillsBench (arXiv:2602.12670).
+ */
+export function computeNormalizedGain(
+  baselineScore: number,
+  candidateScore: number,
+): number | null {
+  if (baselineScore >= 1.0) return null;
+  return (candidateScore - baselineScore) / (1 - baselineScore);
+}
+
 export function classifyOutcome(delta: number, threshold: number): 'win' | 'loss' | 'tie' {
   if (delta >= threshold) return 'win';
   if (delta <= -threshold) return 'loss';
@@ -137,6 +153,7 @@ export function compareResults(
         score1,
         score2,
         delta,
+        normalizedGain: computeNormalizedGain(score1, score2),
         outcome: classifyOutcome(delta, threshold),
       });
       matchedIds.add(testId);
@@ -153,6 +170,12 @@ export function compareResults(
   const meanDelta =
     matched.length > 0 ? matched.reduce((sum, m) => sum + m.delta, 0) / matched.length : 0;
 
+  const gainValues = matched.map((m) => m.normalizedGain).filter((g): g is number => g !== null);
+  const meanNormalizedGain =
+    gainValues.length > 0
+      ? Math.round((gainValues.reduce((sum, g) => sum + g, 0) / gainValues.length) * 1000) / 1000
+      : null;
+
   return {
     matched,
     unmatched: { file1: unmatchedFile1, file2: unmatchedFile2 },
@@ -163,6 +186,7 @@ export function compareResults(
       losses,
       ties,
       meanDelta: Math.round(meanDelta * 1000) / 1000,
+      meanNormalizedGain,
     },
   };
 }
@@ -323,7 +347,7 @@ export function formatTable(comparison: ComparisonOutput, file1: string, file2:
 
   // Summary
   lines.push('');
-  const { wins, losses, ties, meanDelta } = comparison.summary;
+  const { wins, losses, ties, meanDelta, meanNormalizedGain } = comparison.summary;
 
   const winStr =
     wins > 0 ? `${c.green}${wins} win${wins !== 1 ? 's' : ''}${c.reset}` : `${wins} wins`;
@@ -340,9 +364,15 @@ export function formatTable(comparison: ComparisonOutput, file1: string, file2:
         ? `${c.red}regressed${c.reset}`
         : `${c.gray}neutral${c.reset}`;
 
-  lines.push(
-    `${c.bold}Summary:${c.reset} ${winStr}, ${lossStr}, ${tieStr} | Mean Δ: ${deltaColor}${deltaSign}${meanDelta.toFixed(3)}${c.reset} | Status: ${status}`,
-  );
+  let summaryLine = `${c.bold}Summary:${c.reset} ${winStr}, ${lossStr}, ${tieStr} | Mean Δ: ${deltaColor}${deltaSign}${meanDelta.toFixed(3)}${c.reset}`;
+  if (meanNormalizedGain != null) {
+    const gColor = meanNormalizedGain > 0 ? c.green : meanNormalizedGain < 0 ? c.red : c.gray;
+    const gSign = meanNormalizedGain >= 0 ? '+' : '';
+    summaryLine += ` | g: ${gColor}${gSign}${meanNormalizedGain.toFixed(3)}${c.reset}`;
+  }
+  summaryLine += ` | Status: ${status}`;
+
+  lines.push(summaryLine);
   lines.push('');
 
   return lines.join('\n');
@@ -414,13 +444,18 @@ export function formatMatrix(matrixOutput: MatrixOutput, baselineTarget?: string
       ...pairwise.map((pw) => `  ${pw.baseline} → ${pw.candidate}:`.length),
     );
     for (const p of pairwise) {
-      const { wins, losses, ties, meanDelta } = p.summary;
+      const { wins, losses, ties, meanDelta, meanNormalizedGain } = p.summary;
       const sign = meanDelta >= 0 ? '+' : '';
       const deltaColor = meanDelta > 0 ? c.green : meanDelta < 0 ? c.red : c.gray;
       const label = `  ${p.baseline} → ${p.candidate}:`;
-      lines.push(
-        `${padRight(label, maxLabelLen)}  ${wins} win${wins !== 1 ? 's' : ''}, ${losses} loss${losses !== 1 ? 'es' : ''}, ${ties} tie${ties !== 1 ? 's' : ''}  (${c.bold}Δ${c.reset} ${deltaColor}${sign}${meanDelta.toFixed(3)}${c.reset})`,
-      );
+      let pairLine = `${padRight(label, maxLabelLen)}  ${wins} win${wins !== 1 ? 's' : ''}, ${losses} loss${losses !== 1 ? 'es' : ''}, ${ties} tie${ties !== 1 ? 's' : ''}  (${c.bold}Δ${c.reset} ${deltaColor}${sign}${meanDelta.toFixed(3)}${c.reset}`;
+      if (meanNormalizedGain != null) {
+        const gColor = meanNormalizedGain > 0 ? c.green : meanNormalizedGain < 0 ? c.red : c.gray;
+        const gSign = meanNormalizedGain >= 0 ? '+' : '';
+        pairLine += `, ${c.bold}g${c.reset} ${gColor}${gSign}${meanNormalizedGain.toFixed(3)}${c.reset}`;
+      }
+      pairLine += ')';
+      lines.push(pairLine);
     }
   }
 
diff --git a/apps/cli/test/commands/compare/compare.test.ts b/apps/cli/test/commands/compare/compare.test.ts
@@ -7,6 +7,7 @@ import {
   classifyOutcome,
   compareMatrix,
   compareResults,
+  computeNormalizedGain,
   determineExitCode,
   determineMatrixExitCode,
   formatMatrix,
@@ -459,7 +460,15 @@ describe('compare command', () => {
           {
             matched: [],
             unmatched: { file1: 0, file2: 0 },
-            summary: { total: 2, matched: 1, wins: 1, losses: 0, ties: 0, meanDelta: 0.1 },
+            summary: {
+              total: 2,
+              matched: 1,
+              wins: 1,
+              losses: 0,
+              ties: 0,
+              meanDelta: 0.1,
+              meanNormalizedGain: null,
+            },
             baseline: 'base',
             candidate: 'cand',
           },
@@ -476,14 +485,30 @@ describe('compare command', () => {
           {
             matched: [],
             unmatched: { file1: 0, file2: 0 },
-            summary: { total: 2, matched: 1, wins: 0, losses: 1, ties: 0, meanDelta: -0.1 },
+            summary: {
+              total: 2,
+              matched: 1,
+              wins: 0,
+              losses: 1,
+              ties: 0,
+              meanDelta: -0.1,
+              meanNormalizedGain: null,
+            },
             baseline: 'base',
             candidate: 'cand1',
           },
           {
             matched: [],
             unmatched: { file1: 0, file2: 0 },
-            summary: { total: 2, matched: 1, wins: 1, losses: 0, ties: 0, meanDelta: 0.1 },
+            summary: {
+              total: 2,
+              matched: 1,
+              wins: 1,
+              losses: 0,
+              ties: 0,
+              meanDelta: 0.1,
+              meanNormalizedGain: null,
+            },
             baseline: 'base',
             candidate: 'cand2',
           },
@@ -500,14 +525,30 @@ describe('compare command', () => {
           {
             matched: [],
             unmatched: { file1: 0, file2: 0 },
-            summary: { total: 2, matched: 1, wins: 1, losses: 0, ties: 0, meanDelta: 0.05 },
+            summary: {
+              total: 2,
+              matched: 1,
+              wins: 1,
+              losses: 0,
+              ties: 0,
+              meanDelta: 0.05,
+              meanNormalizedGain: null,
+            },
             baseline: 'base',
             candidate: 'cand1',
           },
           {
             matched: [],
             unmatched: { file1: 0, file2: 0 },
-            summary: { total: 2, matched: 1, wins: 0, losses: 1, ties: 0, meanDelta: -0.2 },
+            summary: {
+              total: 2,
+              matched: 1,
+              wins: 0,
+              losses: 1,
+              ties: 0,
+              meanDelta: -0.2,
+              meanNormalizedGain: null,
+            },
             baseline: 'cand1',
             candidate: 'cand2',
           },
@@ -530,7 +571,15 @@ describe('compare command', () => {
             matched: [],
             unmatched: { file1: 0, file2: 0 },
             // delta > 0 means candidate (zeta/baseline) scored higher → alpha regressed
-            summary: { total: 2, matched: 1, wins: 1, losses: 0, ties: 0, meanDelta: 0.2 },
+            summary: {
+              total: 2,
+              matched: 1,
+              wins: 1,
+              losses: 0,
+              ties: 0,
+              meanDelta: 0.2,
+              meanNormalizedGain: null,
+            },
             baseline: 'alpha',
             candidate: 'zeta',
           },
@@ -550,7 +599,15 @@ describe('compare command', () => {
             unmatched: { file1: 0, file2: 0 },
             // delta < 0 means candidate (zeta/baseline) scored lower → alpha is better
             // That means alpha did NOT regress vs baseline zeta
-            summary: { total: 2, matched: 1, wins: 0, losses: 1, ties: 0, meanDelta: -0.1 },
+            summary: {
+              total: 2,
+              matched: 1,
+              wins: 0,
+              losses: 1,
+              ties: 0,
+              meanDelta: -0.1,
+              meanNormalizedGain: null,
+            },
             baseline: 'alpha',
             candidate: 'zeta',
           },
@@ -584,7 +641,15 @@ describe('compare command', () => {
           {
             matched: [],
             unmatched: { file1: 0, file2: 0 },
-            summary: { total: 4, matched: 2, wins: 1, losses: 1, ties: 0, meanDelta: 0.025 },
+            summary: {
+              total: 4,
+              matched: 2,
+              wins: 1,
+              losses: 1,
+              ties: 0,
+              meanDelta: 0.025,
+              meanNormalizedGain: null,
+            },
             baseline: 'model-a',
             candidate: 'model-b',
           },
@@ -622,7 +687,15 @@ describe('compare command', () => {
           {
             matched: [],
             unmatched: { file1: 0, file2: 0 },
-            summary: { total: 2, matched: 1, wins: 1, losses: 0, ties: 0, meanDelta: 0.1 },
+            summary: {
+              total: 2,
+              matched: 1,
+              wins: 1,
+              losses: 0,
+              ties: 0,
+              meanDelta: 0.1,
+              meanNormalizedGain: null,
+            },
             baseline: 'a',
             candidate: 'b',
           },
@@ -648,4 +721,86 @@ describe('compare command', () => {
       expect(output).toContain('--');
     });
   });
+
+  describe('computeNormalizedGain', () => {
+    it('should compute gain relative to remaining headroom', () => {
+      // baseline 0.5, candidate 0.75 → gained 0.25 out of 0.5 headroom = 0.5
+      expect(computeNormalizedGain(0.5, 0.75)).toBeCloseTo(0.5, 10);
+    });
+
+    it('should return 1.0 when candidate reaches perfect score', () => {
+      expect(computeNormalizedGain(0.5, 1.0)).toBeCloseTo(1.0, 10); // gained 0.5 out of 0.5 headroom
+    });
+
+    it('should return negative values when candidate regresses', () => {
+      // baseline 0.5, candidate 0.25 → lost 0.25 out of 0.5 headroom = -0.5
+      expect(computeNormalizedGain(0.5, 0.25)).toBeCloseTo(-0.5, 10);
+    });
+
+    it('should return null when baseline is perfect (no headroom)', () => {
+      expect(computeNormalizedGain(1.0, 1.0)).toBeNull(); // candidate also perfect
+      expect(computeNormalizedGain(1.0, 0.5)).toBeNull(); // regression from a perfect baseline is null too
+    });
+
+    it('should return 0 when scores are equal', () => {
+      expect(computeNormalizedGain(0.5, 0.5)).toBeCloseTo(0, 10); // zero numerator over 0.5 headroom
+    });
+
+    it('should handle low baseline correctly', () => {
+      // baseline 0.1, candidate 0.55 → gained 0.45 out of 0.9 headroom = 0.5
+      expect(computeNormalizedGain(0.1, 0.55)).toBeCloseTo(0.5, 10);
+    });
+  });
+
+  describe('compareResults normalized gain', () => {
+    it('should include normalizedGain in matched results', () => {
+      const results1 = [{ testId: 'case-1', score: 0.5 }];
+      const results2 = [{ testId: 'case-1', score: 0.75 }];
+
+      const comparison = compareResults(results1, results2, 0.1);
+
+      expect(comparison.matched[0].normalizedGain).toBeCloseTo(0.5, 10); // 0.25 gained / 0.5 headroom
+    });
+
+    it('should compute meanNormalizedGain in summary', () => {
+      const results1 = [
+        { testId: 'case-1', score: 0.5 },
+        { testId: 'case-2', score: 0.8 },
+      ];
+      const results2 = [
+        { testId: 'case-1', score: 0.75 }, // g = 0.25/0.5 = 0.5
+        { testId: 'case-2', score: 0.9 }, // g = 0.1/0.2 = 0.5
+      ];
+
+      const comparison = compareResults(results1, results2, 0.1);
+
+      expect(comparison.summary.meanNormalizedGain).toBeCloseTo(0.5, 10); // (0.5 + 0.5) / 2
+    });
+
+    it('should set normalizedGain to null when baseline is 1.0', () => {
+      const results1 = [{ testId: 'case-1', score: 1.0 }];
+      const results2 = [{ testId: 'case-1', score: 1.0 }];
+
+      const comparison = compareResults(results1, results2, 0.1);
+
+      expect(comparison.matched[0].normalizedGain).toBeNull(); // baseline >= 1.0 → per-case gain is null
+      expect(comparison.summary.meanNormalizedGain).toBeNull(); // no non-null gains left to average
+    });
+
+    it('should exclude null gains from mean computation', () => {
+      const results1 = [
+        { testId: 'case-1', score: 0.5 },
+        { testId: 'case-2', score: 1.0 }, // perfect baseline, gain is null
+      ];
+      const results2 = [
+        { testId: 'case-1', score: 0.75 }, // g = 0.5
+        { testId: 'case-2', score: 1.0 },
+      ];
+
+      const comparison = compareResults(results1, results2, 0.1);
+
+      // Only case-1 contributes to mean (g=0.5); case-2 is excluded
+      expect(comparison.summary.meanNormalizedGain).toBeCloseTo(0.5, 10);
+    });
+  });
 });
diff --git a/apps/web/src/content/docs/docs/tools/compare.mdx b/apps/web/src/content/docs/docs/tools/compare.mdx