From 4fb3b3ddd7d83f1302d6ffda7783beff85f8858f Mon Sep 17 00:00:00 2001 From: unknown Date: Sat, 28 Mar 2026 14:06:38 -0700 Subject: [PATCH] Add Copeland pairwise scoring as alternative recommendation method MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement social choice theory-based scoring: agents compared pairwise on tests, convergence, and files changed. Per-criterion wins tracked. --scoring copeland flag enables it alongside existing weighted method. Agent #5 chosen over #3 via MANUAL review (not thinktank scoring) — better edge case tests (all-identical, non-transitive, single agent) and per-criterion breakdown in CopelandScore type. Closes #103 Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/architecture.md | 20 +++++ src/cli.ts | 8 ++ src/commands/run.test.ts | 1 + src/commands/run.ts | 9 +- src/scoring/convergence.test.ts | 145 +++++++++++++++++++++++++++++++- src/scoring/convergence.ts | 105 ++++++++++++++++++++++- src/types.ts | 11 +++ src/utils/display.ts | 33 +++++++- 8 files changed, 327 insertions(+), 5 deletions(-) diff --git a/docs/architecture.md b/docs/architecture.md index f4edf30..05be7d5 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -117,6 +117,26 @@ Normal-sized and thorough diffs all receive the full 10 points. Only outlier-lar The agent with the highest total score is recommended. Ties broken by the first agent. +### Copeland Pairwise Scoring (alternative) + +Enabled with `--scoring copeland`. Instead of assigning absolute point values, Copeland scoring compares every pair of agents head-to-head on three criteria: + +| Criterion | Better = | +|-----------|----------| +| Tests passed | Passed > Failed | +| Convergence group size | Larger group > Smaller group | +| Files changed | Fewer files > More files | + +For each pair (A, B): +1. Count how many criteria A wins vs B wins +2. If A wins more criteria: A gets +1, B gets −1 +3. 
If B wins more criteria: B gets +1, A gets −1
+4. If tied on criteria count: both get 0
+
+The agent with the highest cumulative Copeland score is recommended.
+
+**When to use Copeland:** Copeland scoring avoids arbitrary point weights and is resistant to scale distortion. It works well when you want each criterion to have equal importance regardless of magnitude. However, it can produce more ties than weighted scoring, especially with few agents.
+
 ### Why these weights?
 - Tests (100) dominate because correctness trumps everything
 - Convergence (50) is secondary — agreement without tests is weaker evidence
diff --git a/src/cli.ts b/src/cli.ts
index adad376..3c8d854 100644
--- a/src/cli.ts
+++ b/src/cli.ts
@@ -42,6 +42,7 @@ program
     "Convergence clustering similarity threshold (0.0-1.0)",
     String(cfg.threshold),
   )
+  .option("--scoring <method>", "Scoring method: weighted (default) or copeland", "weighted")
   .option("--verbose", "Show detailed output from each agent")
   .action(async (promptArg: string | undefined, opts) => {
     const prompt = resolvePrompt(promptArg, opts.file);
@@ -70,6 +71,12 @@ program
       process.exit(1);
     }
 
+    const validScoring = ["weighted", "copeland"];
+    if (!validScoring.includes(opts.scoring)) {
+      console.error(`Error: --scoring must be one of: ${validScoring.join(", ")}`);
+      process.exit(1);
+    }
+
     const knownModels = ["sonnet", "opus", "haiku"];
     if (!knownModels.includes(opts.model) && !opts.model.startsWith("claude-")) {
       console.warn(
@@ -86,6 +93,7 @@ program
       model: opts.model,
       threshold,
       runner: opts.runner,
+      scoring: opts.scoring,
       verbose: opts.verbose ??
false, }); }); diff --git a/src/commands/run.test.ts b/src/commands/run.test.ts index 7cefd89..0e972c3 100644 --- a/src/commands/run.test.ts +++ b/src/commands/run.test.ts @@ -12,6 +12,7 @@ function makeOpts(overrides: Partial = {}): RunOptions { model: "sonnet", threshold: 0.3, verbose: false, + scoring: "weighted", ...overrides, }; } diff --git a/src/commands/run.ts b/src/commands/run.ts index 9e77429..6c7b118 100644 --- a/src/commands/run.ts +++ b/src/commands/run.ts @@ -1,7 +1,7 @@ import { mkdir, writeFile } from "node:fs/promises"; import { join } from "node:path"; import { getDefaultRunner, getRunner } from "../runners/registry.js"; -import { analyzeConvergence, recommend } from "../scoring/convergence.js"; +import { analyzeConvergence, copelandRecommend, recommend } from "../scoring/convergence.js"; import { runTests, validateTestCommand } from "../scoring/test-runner.js"; import type { AgentResult, EnsembleResult, RunOptions } from "../types.js"; import { displayApplyInstructions, displayHeader, displayResults } from "../utils/display.js"; @@ -129,18 +129,23 @@ export async function run(opts: RunOptions): Promise { const convergence = analyzeConvergence(agents, opts.threshold); // Phase 5: Recommendation - const { recommended, scores } = recommend(agents, testResults, convergence); + const { recommended: weightedRec, scores } = recommend(agents, testResults, convergence); + const copeland = copelandRecommend(agents, testResults, convergence); + + const recommended = opts.scoring === "copeland" ? 
copeland.recommended : weightedRec; // Build result object const result: EnsembleResult = { prompt: opts.prompt, model: opts.model, timestamp: new Date().toISOString(), + scoring: opts.scoring, agents, tests: testResults, convergence, recommended, scores, + copelandScores: copeland.scores, }; // Display results diff --git a/src/scoring/convergence.test.ts b/src/scoring/convergence.test.ts index 1a99c74..b9a5c60 100644 --- a/src/scoring/convergence.test.ts +++ b/src/scoring/convergence.test.ts @@ -1,7 +1,7 @@ import assert from "node:assert/strict"; import { describe, it } from "node:test"; import type { AgentResult } from "../types.js"; -import { analyzeConvergence, recommend } from "./convergence.js"; +import { analyzeConvergence, copelandRecommend, recommend } from "./convergence.js"; const DIFF_A = `diff --git a/a.ts b/a.ts --- a/a.ts @@ -245,3 +245,146 @@ describe("recommend", () => { assert.ok(score1.diffSizePoints < 10); }); }); + +describe("copelandRecommend", () => { + it("returns null for no completed agents", () => { + const agents = [makeAgent({ id: 1, status: "error", diff: "" })]; + const result = copelandRecommend(agents, [], []); + assert.equal(result.recommended, null); + assert.deepEqual(result.scores, []); + }); + + it("recommends the agent that dominates all criteria", () => { + // Agent 1: passes tests, in larger convergence group, fewer files + // Agent 2: fails tests, alone, more files + const agents = [ + makeAgent({ id: 1, diff: DIFF_A, filesChanged: ["a.ts"] }), + makeAgent({ id: 2, diff: DIFF_B, filesChanged: ["b.ts", "c.ts"] }), + ]; + const tests = [ + { agentId: 1, passed: true }, + { agentId: 2, passed: false }, + ]; + const convergence = analyzeConvergence(agents); + const result = copelandRecommend(agents, tests, convergence); + + assert.equal(result.recommended, 1); + const score1 = result.scores.find((s) => s.agentId === 1); + assert.ok(score1); + assert.equal(score1.copelandTotal, 1); // wins the one pairwise matchup + 
assert.ok(score1.testsWins > 0); + }); + + it("all agents identical gives zero Copeland scores", () => { + const agents = [ + makeAgent({ id: 1, diff: DIFF_A, filesChanged: ["a.ts"] }), + makeAgent({ id: 2, diff: DIFF_A, filesChanged: ["a.ts"] }), + makeAgent({ id: 3, diff: DIFF_A, filesChanged: ["a.ts"] }), + ]; + const tests = [ + { agentId: 1, passed: true }, + { agentId: 2, passed: true }, + { agentId: 3, passed: true }, + ]; + const convergence = analyzeConvergence(agents); + const result = copelandRecommend(agents, tests, convergence); + + // All agents tie on every criterion — all Copeland scores should be 0 + for (const score of result.scores) { + assert.equal(score.copelandTotal, 0, `Agent #${score.agentId} should have Copeland score 0`); + assert.equal(score.testsWins, 0); + assert.equal(score.convergenceWins, 0); + assert.equal(score.filesChangedWins, 0); + } + // Still recommends someone (first agent) + assert.ok(result.recommended !== null); + }); + + it("handles agents with different strengths on different criteria (non-transitive)", () => { + // Agent 1: passes tests, many files, small group + // Agent 2: fails tests, few files, large group + // Agent 3: fails tests, many files, large group + const agents = [ + makeAgent({ id: 1, diff: DIFF_A, filesChanged: ["a.ts", "b.ts", "c.ts"] }), + makeAgent({ id: 2, diff: DIFF_B, filesChanged: ["x.ts"] }), + makeAgent({ id: 3, diff: DIFF_B, filesChanged: ["x.ts", "y.ts", "z.ts"] }), + ]; + const tests = [ + { agentId: 1, passed: true }, + { agentId: 2, passed: false }, + { agentId: 3, passed: false }, + ]; + const convergence = analyzeConvergence(agents); + const result = copelandRecommend(agents, tests, convergence); + + // Agent 1 vs Agent 2: tests(+1), convergence(-1), files(-1) → Agent 2 wins + // Agent 1 vs Agent 3: tests(+1), convergence(-1), files(tie) → tie + // Agent 2 vs Agent 3: tests(tie), convergence(tie), files(+1 for 2) → Agent 2 wins + // So Agent 2 should have the best Copeland score + 
assert.equal(result.recommended, 2); + }); + + it("prefers agent with test pass when other criteria are tied", () => { + const agents = [ + makeAgent({ id: 1, diff: DIFF_A, filesChanged: ["a.ts"] }), + makeAgent({ id: 2, diff: DIFF_A, filesChanged: ["a.ts"] }), + ]; + const tests = [ + { agentId: 1, passed: true }, + { agentId: 2, passed: false }, + ]; + const convergence = analyzeConvergence(agents); + const result = copelandRecommend(agents, tests, convergence); + + assert.equal(result.recommended, 1); + const score1 = result.scores.find((s) => s.agentId === 1); + assert.ok(score1); + assert.equal(score1.testsWins, 1); + assert.equal(score1.copelandTotal, 1); + }); + + it("prefers fewer files changed when other criteria are equal", () => { + const agents = [ + makeAgent({ id: 1, diff: DIFF_A, filesChanged: ["a.ts", "b.ts", "c.ts"] }), + makeAgent({ id: 2, diff: DIFF_A, filesChanged: ["a.ts"] }), + ]; + const convergence = analyzeConvergence(agents); + const result = copelandRecommend(agents, [], convergence); + + assert.equal(result.recommended, 2); + }); + + it("returns per-agent criterion breakdowns", () => { + const agents = [ + makeAgent({ id: 1, diff: DIFF_A, filesChanged: ["a.ts"] }), + makeAgent({ id: 2, diff: DIFF_B, filesChanged: ["b.ts", "c.ts"] }), + ]; + const tests = [ + { agentId: 1, passed: true }, + { agentId: 2, passed: false }, + ]; + const convergence = analyzeConvergence(agents); + const result = copelandRecommend(agents, tests, convergence); + + assert.equal(result.scores.length, 2); + const score1 = result.scores.find((s) => s.agentId === 1); + const score2 = result.scores.find((s) => s.agentId === 2); + assert.ok(score1); + assert.ok(score2); + + // Score1 wins tests and files, score2 wins neither + assert.equal(score1.testsWins, 1); + assert.equal(score2.testsWins, -1); + assert.equal(score1.filesChangedWins, 1); + assert.equal(score2.filesChangedWins, -1); + }); + + it("handles single agent", () => { + const agents = [makeAgent({ id: 1, 
diff: DIFF_A })];
+    const result = copelandRecommend(agents, [], []);
+
+    assert.equal(result.recommended, 1);
+    assert.equal(result.scores.length, 1);
+    assert.equal(result.scores[0]!.copelandTotal, 0);
+  });
+});
diff --git a/src/scoring/convergence.ts b/src/scoring/convergence.ts
index 5c5666a..8351c49 100644
--- a/src/scoring/convergence.ts
+++ b/src/scoring/convergence.ts
@@ -1,4 +1,4 @@
-import type { AgentResult, AgentScore, ConvergenceGroup } from "../types.js";
+import type { AgentResult, AgentScore, ConvergenceGroup, CopelandScore } from "../types.js";
 import { pairwiseSimilarity } from "./diff-parser.js";
 
 /**
@@ -176,3 +176,106 @@ export function recommend(
 
   return { recommended: bestId, scores: agentScores };
 }
+
+/**
+ * Copeland pairwise scoring: an alternative to the weighted `recommend`.
+ *
+ * Every pair of completed agents (status "success" with a non-empty diff) is
+ * compared head-to-head on three criteria:
+ *   1. tests passed      — a pass beats a failure or a missing result
+ *   2. convergence group — the larger group wins; no group counts as size 0
+ *   3. files changed     — the agent touching fewer files wins
+ * Whoever takes more criteria in a matchup gets +1 and the other gets -1; an
+ * even split leaves both totals unchanged.
+ *
+ * Agent ids are assumed to be unique across `agents`.
+ *
+ * @param agents - All agent results; agents that did not complete are skipped.
+ * @param testResults - Per-agent test outcomes; the first entry per agent is used.
+ * @param convergence - Convergence groups; the first group listing an agent is used.
+ * @returns `recommended` is the id with the highest Copeland total (the earliest
+ *   agent wins ties; `null` when no agent completed). `scores` holds one entry
+ *   per completed agent with its net wins per criterion and overall.
+ */
+export function copelandRecommend(
+  agents: AgentResult[],
+  testResults: Array<{ agentId: number; passed: boolean }>,
+  convergence: ConvergenceGroup[],
+): { recommended: number | null; scores: CopelandScore[] } {
+  const completed = agents.filter((a) => a.status === "success" && a.diff.length > 0);
+  if (completed.length === 0) return { recommended: null, scores: [] };
+
+  // Index the inputs once instead of rescanning them for every agent. The first
+  // entry wins, matching a left-to-right `find` over the same arrays.
+  const passedById = new Map<number, boolean>();
+  for (const result of testResults) {
+    if (!passedById.has(result.agentId)) {
+      passedById.set(result.agentId, result.passed);
+    }
+  }
+  const groupSizeById = new Map<number, number>();
+  for (const group of convergence) {
+    for (const id of group.agents) {
+      if (!groupSizeById.has(id)) {
+        groupSizeById.set(id, group.agents.length);
+      }
+    }
+  }
+
+  // One contender per completed agent: raw criterion values plus its tally.
+  const contenders = completed.map((agent) => {
+    const score: CopelandScore = {
+      agentId: agent.id,
+      testsWins: 0,
+      convergenceWins: 0,
+      filesChangedWins: 0,
+      copelandTotal: 0,
+    };
+    return {
+      score,
+      testsPassed: passedById.get(agent.id) ? 1 : 0,
+      groupSize: groupSizeById.get(agent.id) ?? 0,
+      filesChanged: agent.filesChanged.length,
+    };
+  });
+
+  // 1 when x beats y, -1 when y beats x, 0 on a tie (higher is better).
+  const duel = (x: number, y: number): number => {
+    if (x > y) return 1;
+    if (x < y) return -1;
+    return 0;
+  };
+
+  for (const [i, a] of contenders.entries()) {
+    for (const b of contenders.slice(i + 1)) {
+      const tests = duel(a.testsPassed, b.testsPassed);
+      const group = duel(a.groupSize, b.groupSize);
+      // Arguments swapped: fewer files changed is the better outcome.
+      const files = duel(b.filesChanged, a.filesChanged);
+
+      a.score.testsWins += tests;
+      b.score.testsWins -= tests;
+      a.score.convergenceWins += group;
+      b.score.convergenceWins -= group;
+      a.score.filesChangedWins += files;
+      b.score.filesChangedWins -= files;
+
+      // Majority of criteria decides the matchup; an even split scores nothing.
+      const verdict = duel(tests + group + files, 0);
+      a.score.copelandTotal += verdict;
+      b.score.copelandTotal -= verdict;
+    }
+  }
+
+  const scores = contenders.map((c) => c.score);
+
+  // Strict `>` keeps the earliest agent when totals tie.
+  let best: CopelandScore | null = null;
+  for (const score of scores) {
+    if (best === null || score.copelandTotal > best.copelandTotal) {
+      best = score;
+    }
+  }
+
+  return { recommended: best?.agentId ?? null, scores };
+}
diff --git a/src/types.ts b/src/types.ts
index 5e4cbeb..760430c 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -8,6 +8,7 @@ export interface RunOptions {
   threshold: number;
   verbose: boolean;
   runner?: string;
+  scoring: "weighted" | "copeland";
 }
 
 export interface AgentResult {
@@ -46,13 +47,23 @@ export interface AgentScore {
   total: number;
 }
 
+export interface CopelandScore {
+  agentId: number;
+  testsWins: number;
+  convergenceWins: number;
+  filesChangedWins: number;
+  copelandTotal: number;
+}
+
 export interface EnsembleResult {
   prompt: string;
   model: string;
   timestamp: string;
+  scoring: "weighted" | "copeland";
   agents: AgentResult[];
   tests: TestResult[];
   convergence: ConvergenceGroup[];
   recommended: number | null;
   scores: AgentScore[];
+  copelandScores?: CopelandScore[];
 }
diff --git a/src/utils/display.ts b/src/utils/display.ts
index 3a4797b..6f6c6fa 100644
--- a/src/utils/display.ts
+++ b/src/utils/display.ts
@@ -98,11 +98,42 @@ export function displayResults(result: EnsembleResult): void {
     console.log();
   }
 
+  // Copeland scoring breakdown
+  if (result.copelandScores && result.copelandScores.length > 0) {
+    console.log(pc.bold("Copeland Pairwise Scoring"));
+    console.log(pc.dim("─".repeat(60)));
+    console.log(
+      " " +
+        padRight("Agent", 8) +
+        padRight("Tests", 10) +
+        padRight("Converge", 10) +
+        padRight("Files", 10) +
+        padRight("Copeland", 10),
+    );
+    console.log(" " + pc.dim("─".repeat(48)));
+
+    for (const score of result.copelandScores) {
+      const isRecommended = result.scoring === "copeland" && result.recommended === score.agentId;
+      const prefix = isRecommended ? pc.cyan(">>") : " ";
+      const fmt = (n: number): string => (n > 0 ?
`+${n}` : String(n));
+      console.log(
+        prefix +
+          padRight(`#${score.agentId}`, 8) +
+          padRight(fmt(score.testsWins), 10) +
+          padRight(fmt(score.convergenceWins), 10) +
+          padRight(fmt(score.filesChangedWins), 10) +
+          padRight(fmt(score.copelandTotal), 10),
+      );
+    }
+    console.log();
+  }
+
   // Recommendation
   if (result.recommended !== null) {
+    const method = result.scoring === "copeland" ? "Copeland pairwise" : "weighted";
     console.log(
       pc.cyan(` Recommended: Agent #${result.recommended}`) +
-        pc.dim(" (highest score based on tests + convergence + diff size)"),
+        pc.dim(` (${method} scoring: tests + convergence + change size)`),
     );
     console.log();
   }