Add Copeland pairwise scoring as alternative recommendation method (#104)

that-github-user · unknown · claude · web-flow · commit 102b0e6047e2 · 2026-03-28T14:08:14.000-07:00
Implement social choice theory-based scoring: agents compared pairwise on tests, convergence, and files changed. Per-criterion wins tracked. --scoring copeland flag enables it alongside existing weighted method. Agent #5 chosen over #3 via MANUAL review (not thinktank scoring) — better edge case tests (all-identical, non-transitive, single agent) and per-criterion breakdown in CopelandScore type. Closes #103 Co-authored-by: unknown <that-github-user@github.com> Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/docs/architecture.md b/docs/architecture.md
@@ -117,6 +117,26 @@ Normal-sized and thorough diffs all receive the full 10 points. Only outlier-lar
 
 The agent with the highest total score is recommended. Ties broken by the first agent.
 
+### Copeland Pairwise Scoring (alternative)
+
+Enabled with `--scoring copeland`. Instead of assigning absolute point values, Copeland scoring compares every pair of agents head-to-head on three criteria:
+
+| Criterion | Better = |
+|-----------|----------|
+| Tests passed | Passed > Failed |
+| Convergence group size | Larger group > Smaller group |
+| Files changed | Fewer files > More files |
+
+For each pair (A, B):
+1. Count how many criteria A wins vs B wins
+2. If A wins more criteria: A gets +1, B gets −1
+3. If B wins more criteria: B gets +1, A gets −1
+4. If tied on criteria count: both get 0
+
+The agent with the highest cumulative Copeland score is recommended. Ties are broken by the first agent, as in weighted scoring.
+
+**When to use Copeland:** Copeland scoring avoids arbitrary point weights and is resistant to scale distortion. It works well when you want each criterion to have equal importance regardless of magnitude. However, it can produce more ties than weighted scoring, especially with few agents.
+
 ### Why these weights?
 - Tests (100) dominate because correctness trumps everything
 - Convergence (50) is secondary — agreement without tests is weaker evidence
diff --git a/src/cli.ts b/src/cli.ts
@@ -42,6 +42,7 @@ program
     "Convergence clustering similarity threshold (0.0-1.0)",
     String(cfg.threshold),
   )
+  .option("--scoring <method>", "Scoring method: weighted (default) or copeland", "weighted")
   .option("--verbose", "Show detailed output from each agent")
   .action(async (promptArg: string | undefined, opts) => {
     const prompt = resolvePrompt(promptArg, opts.file);
@@ -70,6 +71,12 @@ program
       process.exit(1);
     }
 
+    const validScoring = ["weighted", "copeland"];
+    if (!validScoring.includes(opts.scoring)) {
+      console.error(`Error: --scoring must be one of: ${validScoring.join(", ")}`);
+      process.exit(1);
+    }
+
     const knownModels = ["sonnet", "opus", "haiku"];
     if (!knownModels.includes(opts.model) && !opts.model.startsWith("claude-")) {
       console.warn(
@@ -86,6 +93,7 @@ program
       model: opts.model,
       threshold,
       runner: opts.runner,
+      scoring: opts.scoring,
       verbose: opts.verbose ?? false,
     });
   });
diff --git a/src/commands/run.test.ts b/src/commands/run.test.ts
@@ -12,6 +12,7 @@ function makeOpts(overrides: Partial<RunOptions> = {}): RunOptions {
     model: "sonnet",
     threshold: 0.3,
     verbose: false,
+    scoring: "weighted",
     ...overrides,
   };
 }
diff --git a/src/commands/run.ts b/src/commands/run.ts
@@ -1,7 +1,7 @@
 import { mkdir, writeFile } from "node:fs/promises";
 import { join } from "node:path";
 import { getDefaultRunner, getRunner } from "../runners/registry.js";
-import { analyzeConvergence, recommend } from "../scoring/convergence.js";
+import { analyzeConvergence, copelandRecommend, recommend } from "../scoring/convergence.js";
 import { runTests, validateTestCommand } from "../scoring/test-runner.js";
 import type { AgentResult, EnsembleResult, RunOptions } from "../types.js";
 import { displayApplyInstructions, displayHeader, displayResults } from "../utils/display.js";
@@ -129,18 +129,23 @@ export async function run(opts: RunOptions): Promise<void> {
   const convergence = analyzeConvergence(agents, opts.threshold);
 
   // Phase 5: Recommendation
-  const { recommended, scores } = recommend(agents, testResults, convergence);
+  const { recommended: weightedRec, scores } = recommend(agents, testResults, convergence);
+  const copeland = copelandRecommend(agents, testResults, convergence);
+
+  const recommended = opts.scoring === "copeland" ? copeland.recommended : weightedRec;
 
   // Build result object
   const result: EnsembleResult = {
     prompt: opts.prompt,
     model: opts.model,
     timestamp: new Date().toISOString(),
+    scoring: opts.scoring,
     agents,
     tests: testResults,
     convergence,
     recommended,
     scores,
+    copelandScores: copeland.scores,
   };
 
   // Display results
diff --git a/src/scoring/convergence.test.ts b/src/scoring/convergence.test.ts
@@ -1,7 +1,7 @@
 import assert from "node:assert/strict";
 import { describe, it } from "node:test";
 import type { AgentResult } from "../types.js";
-import { analyzeConvergence, recommend } from "./convergence.js";
+import { analyzeConvergence, copelandRecommend, recommend } from "./convergence.js";
 
 const DIFF_A = `diff --git a/a.ts b/a.ts
 --- a/a.ts
@@ -245,3 +245,146 @@ describe("recommend", () => {
     assert.ok(score1.diffSizePoints < 10);
   });
 });
+
+describe("copelandRecommend", () => {
+  it("returns null for no completed agents", () => {
+    const agents = [makeAgent({ id: 1, status: "error", diff: "" })]; // errored agent with an empty diff is filtered out
+    const result = copelandRecommend(agents, [], []);
+    assert.equal(result.recommended, null);
+    assert.deepEqual(result.scores, []);
+  });
+
+  it("recommends the agent that dominates all criteria", () => {
+    // Agent 1: passes tests, fewer files
+    // Agent 2: fails tests, more files (NOTE(review): group sizes come from analyzeConvergence and look tied with two agents — confirm)
+    const agents = [
+      makeAgent({ id: 1, diff: DIFF_A, filesChanged: ["a.ts"] }),
+      makeAgent({ id: 2, diff: DIFF_B, filesChanged: ["b.ts", "c.ts"] }),
+    ];
+    const tests = [
+      { agentId: 1, passed: true },
+      { agentId: 2, passed: false },
+    ];
+    const convergence = analyzeConvergence(agents);
+    const result = copelandRecommend(agents, tests, convergence);
+
+    assert.equal(result.recommended, 1);
+    const score1 = result.scores.find((s) => s.agentId === 1);
+    assert.ok(score1);
+    assert.equal(score1.copelandTotal, 1); // wins the one pairwise matchup
+    assert.ok(score1.testsWins > 0);
+  });
+
+  it("all agents identical gives zero Copeland scores", () => {
+    const agents = [
+      makeAgent({ id: 1, diff: DIFF_A, filesChanged: ["a.ts"] }),
+      makeAgent({ id: 2, diff: DIFF_A, filesChanged: ["a.ts"] }),
+      makeAgent({ id: 3, diff: DIFF_A, filesChanged: ["a.ts"] }),
+    ];
+    const tests = [
+      { agentId: 1, passed: true },
+      { agentId: 2, passed: true },
+      { agentId: 3, passed: true },
+    ];
+    const convergence = analyzeConvergence(agents);
+    const result = copelandRecommend(agents, tests, convergence);
+
+    // All agents tie on every criterion — all Copeland scores should be 0
+    for (const score of result.scores) {
+      assert.equal(score.copelandTotal, 0, `Agent #${score.agentId} should have Copeland score 0`);
+      assert.equal(score.testsWins, 0);
+      assert.equal(score.convergenceWins, 0);
+      assert.equal(score.filesChangedWins, 0);
+    }
+    // Still recommends someone (ties keep the first agent; only non-null is asserted here)
+    assert.ok(result.recommended !== null);
+  });
+
+  it("handles agents with different strengths on different criteria (non-transitive)", () => {
+    // Agent 1: passes tests, many files, small group
+    // Agent 2: fails tests, few files, large group
+    // Agent 3: fails tests, many files, large group
+    const agents = [
+      makeAgent({ id: 1, diff: DIFF_A, filesChanged: ["a.ts", "b.ts", "c.ts"] }),
+      makeAgent({ id: 2, diff: DIFF_B, filesChanged: ["x.ts"] }),
+      makeAgent({ id: 3, diff: DIFF_B, filesChanged: ["x.ts", "y.ts", "z.ts"] }),
+    ];
+    const tests = [
+      { agentId: 1, passed: true },
+      { agentId: 2, passed: false },
+      { agentId: 3, passed: false },
+    ];
+    const convergence = analyzeConvergence(agents);
+    const result = copelandRecommend(agents, tests, convergence);
+
+    // Agent 1 vs Agent 2: tests(+1), convergence(-1), files(-1) → Agent 2 wins
+    // Agent 1 vs Agent 3: tests(+1), convergence(-1), files(tie) → tie
+    // Agent 2 vs Agent 3: tests(tie), convergence(tie), files(+1 for 2) → Agent 2 wins
+    // So Agent 2 should have the best Copeland score (+2, against -1 for each of the others)
+    assert.equal(result.recommended, 2);
+  });
+
+  it("prefers agent with test pass when other criteria are tied", () => {
+    const agents = [
+      makeAgent({ id: 1, diff: DIFF_A, filesChanged: ["a.ts"] }),
+      makeAgent({ id: 2, diff: DIFF_A, filesChanged: ["a.ts"] }),
+    ];
+    const tests = [
+      { agentId: 1, passed: true },
+      { agentId: 2, passed: false },
+    ];
+    const convergence = analyzeConvergence(agents);
+    const result = copelandRecommend(agents, tests, convergence);
+
+    assert.equal(result.recommended, 1);
+    const score1 = result.scores.find((s) => s.agentId === 1);
+    assert.ok(score1);
+    assert.equal(score1.testsWins, 1);
+    assert.equal(score1.copelandTotal, 1);
+  });
+
+  it("prefers fewer files changed when other criteria are equal", () => {
+    const agents = [
+      makeAgent({ id: 1, diff: DIFF_A, filesChanged: ["a.ts", "b.ts", "c.ts"] }),
+      makeAgent({ id: 2, diff: DIFF_A, filesChanged: ["a.ts"] }),
+    ];
+    const convergence = analyzeConvergence(agents);
+    const result = copelandRecommend(agents, [], convergence); // no test results: both count as not passed, so tests tie
+
+    assert.equal(result.recommended, 2);
+  });
+
+  it("returns per-agent criterion breakdowns", () => {
+    const agents = [
+      makeAgent({ id: 1, diff: DIFF_A, filesChanged: ["a.ts"] }),
+      makeAgent({ id: 2, diff: DIFF_B, filesChanged: ["b.ts", "c.ts"] }),
+    ];
+    const tests = [
+      { agentId: 1, passed: true },
+      { agentId: 2, passed: false },
+    ];
+    const convergence = analyzeConvergence(agents);
+    const result = copelandRecommend(agents, tests, convergence);
+
+    assert.equal(result.scores.length, 2);
+    const score1 = result.scores.find((s) => s.agentId === 1);
+    const score2 = result.scores.find((s) => s.agentId === 2);
+    assert.ok(score1);
+    assert.ok(score2);
+
+    // Score1 wins tests and files; score2 loses both, so its tallies are the negated values
+    assert.equal(score1.testsWins, 1);
+    assert.equal(score2.testsWins, -1);
+    assert.equal(score1.filesChangedWins, 1);
+    assert.equal(score2.filesChangedWins, -1);
+  });
+
+  it("handles single agent", () => {
+    const agents = [makeAgent({ id: 1, diff: DIFF_A })];
+    const result = copelandRecommend(agents, [], []);
+
+    assert.equal(result.recommended, 1);
+    assert.equal(result.scores.length, 1);
+    assert.equal(result.scores[0]!.copelandTotal, 0); // no opponent, so no pairwise matchup is scored
+  });
+});
diff --git a/src/scoring/convergence.ts b/src/scoring/convergence.ts
@@ -1,4 +1,4 @@
-import type { AgentResult, AgentScore, ConvergenceGroup } from "../types.js";
+import type { AgentResult, AgentScore, ConvergenceGroup, CopelandScore } from "../types.js";
 import { pairwiseSimilarity } from "./diff-parser.js";
 
 /**
@@ -176,3 +176,106 @@ export function recommend(
 
   return { recommended: bestId, scores: agentScores };
 }
+
+/**
+ * Copeland pairwise scoring: every pair of completed agents (status "success", non-empty diff)
+ * is compared on three criteria: tests passed, convergence group size, and files changed (fewer is better).
+ * For each pair, the agent winning more criteria gets +1 copelandTotal and the other -1; equal counts give 0.
+ * Returns the highest-scoring agent id (first agent on ties, null if none completed) and every agent's score.
+ */
+export function copelandRecommend(
+  agents: AgentResult[],
+  testResults: Array<{ agentId: number; passed: boolean }>,
+  convergence: ConvergenceGroup[],
+): { recommended: number | null; scores: CopelandScore[] } {
+  const completed = agents.filter((a) => a.status === "success" && a.diff.length > 0);
+  if (completed.length === 0) return { recommended: null, scores: [] };
+
+  // Pre-compute each completed agent's raw value for the three criteria
+  const agentData = completed.map((agent) => {
+    const test = testResults.find((t) => t.agentId === agent.id);
+    const testsPassed = test?.passed ? 1 : 0; // an agent with no test result counts as not passed
+    const group = convergence.find((g) => g.agents.includes(agent.id));
+    const groupSize = group ? group.agents.length : 0; // an agent listed in no group gets size 0
+    const filesChanged = agent.filesChanged.length;
+    return { id: agent.id, testsPassed, groupSize, filesChanged };
+  });
+
+  // Start every completed agent at zero on each tally
+  const scoreMap = new Map<number, CopelandScore>();
+  for (const data of agentData) {
+    scoreMap.set(data.id, {
+      agentId: data.id,
+      testsWins: 0,
+      convergenceWins: 0,
+      filesChangedWins: 0,
+      copelandTotal: 0,
+    });
+  }
+
+  // Visit each unordered pair exactly once (j starts at i + 1)
+  for (let i = 0; i < agentData.length; i++) {
+    for (let j = i + 1; j < agentData.length; j++) {
+      const a = agentData[i]!;
+      const b = agentData[j]!;
+
+      let aWins = 0;
+      let bWins = 0;
+
+      // Criterion 1: tests passed (more is better); the winner's tally goes up, the loser's down, a tie changes nothing
+      if (a.testsPassed > b.testsPassed) {
+        aWins++;
+        scoreMap.get(a.id)!.testsWins++;
+        scoreMap.get(b.id)!.testsWins--;
+      } else if (b.testsPassed > a.testsPassed) {
+        bWins++;
+        scoreMap.get(b.id)!.testsWins++;
+        scoreMap.get(a.id)!.testsWins--;
+      }
+
+      // Criterion 2: convergence group size (larger is better); same win/loss bookkeeping
+      if (a.groupSize > b.groupSize) {
+        aWins++;
+        scoreMap.get(a.id)!.convergenceWins++;
+        scoreMap.get(b.id)!.convergenceWins--;
+      } else if (b.groupSize > a.groupSize) {
+        bWins++;
+        scoreMap.get(b.id)!.convergenceWins++;
+        scoreMap.get(a.id)!.convergenceWins--;
+      }
+
+      // Criterion 3: files changed (fewer is better — minimal changes preferred); same win/loss bookkeeping
+      if (a.filesChanged < b.filesChanged) {
+        aWins++;
+        scoreMap.get(a.id)!.filesChangedWins++;
+        scoreMap.get(b.id)!.filesChangedWins--;
+      } else if (b.filesChanged < a.filesChanged) {
+        bWins++;
+        scoreMap.get(b.id)!.filesChangedWins++;
+        scoreMap.get(a.id)!.filesChangedWins--;
+      }
+
+      // Overall Copeland: winner of more criteria gets +1, loser -1; equal criteria counts leave both totals unchanged
+      if (aWins > bWins) {
+        scoreMap.get(a.id)!.copelandTotal++;
+        scoreMap.get(b.id)!.copelandTotal--;
+      } else if (bWins > aWins) {
+        scoreMap.get(b.id)!.copelandTotal++;
+        scoreMap.get(a.id)!.copelandTotal--;
+      }
+    }
+  }
+
+  const copelandScores = [...scoreMap.values()]; // Map iteration follows insertion order, i.e. the order of `completed`
+
+  let bestId: number | null = null;
+  let bestScore = -Infinity;
+  for (const score of copelandScores) {
+    if (score.copelandTotal > bestScore) { // strict > keeps the first agent when totals tie
+      bestScore = score.copelandTotal;
+      bestId = score.agentId;
+    }
+  }
+
+  return { recommended: bestId, scores: copelandScores };
+}
diff --git a/src/types.ts b/src/types.ts
@@ -8,6 +8,7 @@ export interface RunOptions {
   threshold: number;
   verbose: boolean;
   runner?: string;
+  scoring: "weighted" | "copeland";
 }
 
 export interface AgentResult {
@@ -46,13 +47,23 @@ export interface AgentScore {
   total: number;
 }
 
+export interface CopelandScore { // per-agent result of Copeland pairwise scoring
+  agentId: number; // id of the agent this score belongs to
+  testsWins: number; // net tally on the tests criterion: +1 per pairing won, -1 per pairing lost
+  convergenceWins: number; // net tally on convergence group size, same +1/-1 bookkeeping
+  filesChangedWins: number; // net tally on files changed (fewer files wins), same +1/-1 bookkeeping
+  copelandTotal: number; // +1 per pairing where the agent won more criteria, -1 where it won fewer; used to pick the recommendation
+}
+
 export interface EnsembleResult {
   prompt: string;
   model: string;
   timestamp: string;
+  scoring: "weighted" | "copeland"; // scoring method selected for the run; decides which recommendation is reported
   agents: AgentResult[];
   tests: TestResult[];
   convergence: ConvergenceGroup[];
   recommended: number | null;
   scores: AgentScore[];
+  copelandScores?: CopelandScore[]; // Copeland breakdown; run() computes and stores it regardless of the selected method
 }
diff --git a/src/utils/display.ts b/src/utils/display.ts

Original file line number	Diff line number	Diff line change
`@@ -12,6 +12,7 @@ function makeOpts(overrides: Partial<RunOptions> = {}): RunOptions {`
`12`	`12`	`model: "sonnet",`
`13`	`13`	`threshold: 0.3,`
`14`	`14`	`verbose: false,`
	`15`	`+ scoring: "weighted",`
`15`	`16`	`...overrides,`
`16`	`17`	`};`
`17`	`18`	`}`