Skip to content

Commit 102b0e6

Browse files
that-github-userunknownclaude
authored
Add Copeland pairwise scoring as alternative recommendation method (#104)
Implement social choice theory-based scoring: agents compared pairwise on tests, convergence, and files changed. Per-criterion wins tracked. --scoring copeland flag enables it alongside existing weighted method. Agent #5 chosen over #3 via MANUAL review (not thinktank scoring) — better edge case tests (all-identical, non-transitive, single agent) and per-criterion breakdown in CopelandScore type. Closes #103 Co-authored-by: unknown <that-github-user@github.com> Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 353e120 commit 102b0e6

8 files changed

Lines changed: 327 additions & 5 deletions

File tree

docs/architecture.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,26 @@ Normal-sized and thorough diffs all receive the full 10 points. Only outlier-lar
117117

118118
The agent with the highest total score is recommended. Ties broken by the first agent.
119119

120+
### Copeland Pairwise Scoring (alternative)
121+
122+
Enabled with `--scoring copeland`. Instead of assigning absolute point values, Copeland scoring compares every pair of agents head-to-head on three criteria:
123+
124+
| Criterion | Better = |
125+
|-----------|----------|
126+
| Tests passed | Passed > Failed |
127+
| Convergence group size | Larger group > Smaller group |
128+
| Files changed | Fewer files > More files |
129+
130+
For each pair (A, B):
131+
1. Count how many criteria A wins vs B wins
132+
2. If A wins more criteria: A gets +1, B gets −1
133+
3. If B wins more criteria: B gets +1, A gets −1
134+
4. If tied on criteria count: both get 0
135+
136+
The agent with the highest cumulative Copeland score is recommended. As with weighted scoring, ties are broken in favor of the first agent.
137+
138+
**When to use Copeland:** Copeland scoring avoids arbitrary point weights and is resistant to scale distortion. It works well when you want each criterion to have equal importance regardless of magnitude. However, it can produce more ties than weighted scoring, especially with few agents.
139+
120140
### Why these weights?
121141
- Tests (100) dominate because correctness trumps everything
122142
- Convergence (50) is secondary — agreement without tests is weaker evidence

src/cli.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ program
4242
"Convergence clustering similarity threshold (0.0-1.0)",
4343
String(cfg.threshold),
4444
)
45+
.option("--scoring <method>", "Scoring method: weighted (default) or copeland", "weighted")
4546
.option("--verbose", "Show detailed output from each agent")
4647
.action(async (promptArg: string | undefined, opts) => {
4748
const prompt = resolvePrompt(promptArg, opts.file);
@@ -70,6 +71,12 @@ program
7071
process.exit(1);
7172
}
7273

74+
const validScoring = ["weighted", "copeland"];
75+
if (!validScoring.includes(opts.scoring)) {
76+
console.error(`Error: --scoring must be one of: ${validScoring.join(", ")}`);
77+
process.exit(1);
78+
}
79+
7380
const knownModels = ["sonnet", "opus", "haiku"];
7481
if (!knownModels.includes(opts.model) && !opts.model.startsWith("claude-")) {
7582
console.warn(
@@ -86,6 +93,7 @@ program
8693
model: opts.model,
8794
threshold,
8895
runner: opts.runner,
96+
scoring: opts.scoring,
8997
verbose: opts.verbose ?? false,
9098
});
9199
});

src/commands/run.test.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ function makeOpts(overrides: Partial<RunOptions> = {}): RunOptions {
1212
model: "sonnet",
1313
threshold: 0.3,
1414
verbose: false,
15+
scoring: "weighted",
1516
...overrides,
1617
};
1718
}

src/commands/run.ts

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import { mkdir, writeFile } from "node:fs/promises";
22
import { join } from "node:path";
33
import { getDefaultRunner, getRunner } from "../runners/registry.js";
4-
import { analyzeConvergence, recommend } from "../scoring/convergence.js";
4+
import { analyzeConvergence, copelandRecommend, recommend } from "../scoring/convergence.js";
55
import { runTests, validateTestCommand } from "../scoring/test-runner.js";
66
import type { AgentResult, EnsembleResult, RunOptions } from "../types.js";
77
import { displayApplyInstructions, displayHeader, displayResults } from "../utils/display.js";
@@ -129,18 +129,23 @@ export async function run(opts: RunOptions): Promise<void> {
129129
const convergence = analyzeConvergence(agents, opts.threshold);
130130

131131
// Phase 5: Recommendation
132-
const { recommended, scores } = recommend(agents, testResults, convergence);
132+
const { recommended: weightedRec, scores } = recommend(agents, testResults, convergence);
133+
const copeland = copelandRecommend(agents, testResults, convergence);
134+
135+
const recommended = opts.scoring === "copeland" ? copeland.recommended : weightedRec;
133136

134137
// Build result object
135138
const result: EnsembleResult = {
136139
prompt: opts.prompt,
137140
model: opts.model,
138141
timestamp: new Date().toISOString(),
142+
scoring: opts.scoring,
139143
agents,
140144
tests: testResults,
141145
convergence,
142146
recommended,
143147
scores,
148+
copelandScores: copeland.scores,
144149
};
145150

146151
// Display results

src/scoring/convergence.test.ts

Lines changed: 144 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import assert from "node:assert/strict";
22
import { describe, it } from "node:test";
33
import type { AgentResult } from "../types.js";
4-
import { analyzeConvergence, recommend } from "./convergence.js";
4+
import { analyzeConvergence, copelandRecommend, recommend } from "./convergence.js";
55

66
const DIFF_A = `diff --git a/a.ts b/a.ts
77
--- a/a.ts
@@ -245,3 +245,146 @@ describe("recommend", () => {
245245
assert.ok(score1.diffSizePoints < 10);
246246
});
247247
});
248+
249+
// Test suite for copelandRecommend: empty input, clear dominance, full ties,
// mixed strengths, single-criterion wins, per-criterion breakdown, single agent.
describe("copelandRecommend", () => {
  it("returns null for no completed agents", () => {
    // An errored agent with an empty diff is not eligible for scoring.
    const agents = [makeAgent({ id: 1, status: "error", diff: "" })];
    const result = copelandRecommend(agents, [], []);
    assert.equal(result.recommended, null);
    assert.deepEqual(result.scores, []);
  });

  it("recommends the agent that dominates all criteria", () => {
    // Agent 1: passes tests, fewer files.
    // Agent 2: fails tests, more files.
    // Winning tests and files is already 2 of 3 criteria, so agent 1 takes the
    // matchup whatever the convergence grouping turns out to be.
    const agents = [
      makeAgent({ id: 1, diff: DIFF_A, filesChanged: ["a.ts"] }),
      makeAgent({ id: 2, diff: DIFF_B, filesChanged: ["b.ts", "c.ts"] }),
    ];
    const tests = [
      { agentId: 1, passed: true },
      { agentId: 2, passed: false },
    ];
    const convergence = analyzeConvergence(agents);
    const result = copelandRecommend(agents, tests, convergence);

    assert.equal(result.recommended, 1);
    const score1 = result.scores.find((s) => s.agentId === 1);
    assert.ok(score1);
    assert.equal(score1.copelandTotal, 1); // wins the one pairwise matchup
    assert.ok(score1.testsWins > 0);
  });

  it("all agents identical gives zero Copeland scores", () => {
    const agents = [
      makeAgent({ id: 1, diff: DIFF_A, filesChanged: ["a.ts"] }),
      makeAgent({ id: 2, diff: DIFF_A, filesChanged: ["a.ts"] }),
      makeAgent({ id: 3, diff: DIFF_A, filesChanged: ["a.ts"] }),
    ];
    const tests = [
      { agentId: 1, passed: true },
      { agentId: 2, passed: true },
      { agentId: 3, passed: true },
    ];
    const convergence = analyzeConvergence(agents);
    const result = copelandRecommend(agents, tests, convergence);

    // All agents tie on every criterion — all Copeland scores should be 0
    for (const score of result.scores) {
      assert.equal(score.copelandTotal, 0, `Agent #${score.agentId} should have Copeland score 0`);
      assert.equal(score.testsWins, 0);
      assert.equal(score.convergenceWins, 0);
      assert.equal(score.filesChangedWins, 0);
    }
    // A full tie is broken in favor of the first agent. Pin the exact id so a
    // change to the tie-break rule is caught, not just "someone was picked".
    assert.equal(result.recommended, 1);
  });

  it("handles agents with different strengths on different criteria (non-transitive)", () => {
    // Agent 1: passes tests, many files, small group
    // Agent 2: fails tests, few files, large group
    // Agent 3: fails tests, many files, large group
    const agents = [
      makeAgent({ id: 1, diff: DIFF_A, filesChanged: ["a.ts", "b.ts", "c.ts"] }),
      makeAgent({ id: 2, diff: DIFF_B, filesChanged: ["x.ts"] }),
      makeAgent({ id: 3, diff: DIFF_B, filesChanged: ["x.ts", "y.ts", "z.ts"] }),
    ];
    const tests = [
      { agentId: 1, passed: true },
      { agentId: 2, passed: false },
      { agentId: 3, passed: false },
    ];
    const convergence = analyzeConvergence(agents);
    const result = copelandRecommend(agents, tests, convergence);

    // Agent 1 vs Agent 2: tests(+1), convergence(-1), files(-1) → Agent 2 wins
    // Agent 1 vs Agent 3: tests(+1), convergence(-1), files(tie) → tie
    // Agent 2 vs Agent 3: tests(tie), convergence(tie), files(+1 for 2) → Agent 2 wins
    // So Agent 2 should have the best Copeland score.
    // Note: despite the title, these outcomes contain no preference cycle;
    // agent 2 beats both opponents outright.
    assert.equal(result.recommended, 2);
  });

  it("prefers agent with test pass when other criteria are tied", () => {
    const agents = [
      makeAgent({ id: 1, diff: DIFF_A, filesChanged: ["a.ts"] }),
      makeAgent({ id: 2, diff: DIFF_A, filesChanged: ["a.ts"] }),
    ];
    const tests = [
      { agentId: 1, passed: true },
      { agentId: 2, passed: false },
    ];
    const convergence = analyzeConvergence(agents);
    const result = copelandRecommend(agents, tests, convergence);

    assert.equal(result.recommended, 1);
    const score1 = result.scores.find((s) => s.agentId === 1);
    assert.ok(score1);
    assert.equal(score1.testsWins, 1);
    assert.equal(score1.copelandTotal, 1);
  });

  it("prefers fewer files changed when other criteria are equal", () => {
    const agents = [
      makeAgent({ id: 1, diff: DIFF_A, filesChanged: ["a.ts", "b.ts", "c.ts"] }),
      makeAgent({ id: 2, diff: DIFF_A, filesChanged: ["a.ts"] }),
    ];
    const convergence = analyzeConvergence(agents);
    // No test results supplied: both agents tie on the tests criterion.
    const result = copelandRecommend(agents, [], convergence);

    assert.equal(result.recommended, 2);
  });

  it("returns per-agent criterion breakdowns", () => {
    const agents = [
      makeAgent({ id: 1, diff: DIFF_A, filesChanged: ["a.ts"] }),
      makeAgent({ id: 2, diff: DIFF_B, filesChanged: ["b.ts", "c.ts"] }),
    ];
    const tests = [
      { agentId: 1, passed: true },
      { agentId: 2, passed: false },
    ];
    const convergence = analyzeConvergence(agents);
    const result = copelandRecommend(agents, tests, convergence);

    assert.equal(result.scores.length, 2);
    const score1 = result.scores.find((s) => s.agentId === 1);
    const score2 = result.scores.find((s) => s.agentId === 2);
    assert.ok(score1);
    assert.ok(score2);

    // Score1 wins tests and files, score2 wins neither.
    // The per-criterion fields are net tallies, so the loser goes negative.
    assert.equal(score1.testsWins, 1);
    assert.equal(score2.testsWins, -1);
    assert.equal(score1.filesChangedWins, 1);
    assert.equal(score2.filesChangedWins, -1);
  });

  it("handles single agent", () => {
    // With nobody to compare against there are no matchups, so the total stays 0.
    const agents = [makeAgent({ id: 1, diff: DIFF_A })];
    const result = copelandRecommend(agents, [], []);

    assert.equal(result.recommended, 1);
    assert.equal(result.scores.length, 1);
    assert.equal(result.scores[0]!.copelandTotal, 0);
  });
});

src/scoring/convergence.ts

Lines changed: 104 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import type { AgentResult, AgentScore, ConvergenceGroup } from "../types.js";
1+
import type { AgentResult, AgentScore, ConvergenceGroup, CopelandScore } from "../types.js";
22
import { pairwiseSimilarity } from "./diff-parser.js";
33

44
/**
@@ -176,3 +176,106 @@ export function recommend(
176176

177177
return { recommended: bestId, scores: agentScores };
178178
}
179+
180+
/**
 * Copeland pairwise scoring: compare every pair of completed agents
 * head-to-head on three criteria (tests passed, convergence group size,
 * files changed) and recommend the agent with the best win/loss record.
 *
 * Only agents with `status === "success"` and a non-empty diff take part.
 * For each pair, every criterion adds +1 to the better agent's per-criterion
 * tally and -1 to the other's (nothing on a tie). The agent that wins more
 * criteria in the pair then gets +1 on `copelandTotal`, the other gets -1,
 * and a pair split evenly changes neither total.
 *
 * @param agents - All agent results. Ids are expected to be unique.
 * @param testResults - Test outcomes by agent id. An agent with no entry is
 *   treated the same as one whose tests did not pass.
 * @param convergence - Convergence groups. An agent that appears in no group
 *   is given a group size of 0.
 * @returns The recommended agent id (`null` when no agent completed) and one
 *   score per completed agent, in the same order as `agents`. When several
 *   agents share the highest `copelandTotal`, the earliest one is recommended.
 */
export function copelandRecommend(
  agents: AgentResult[],
  testResults: Array<{ agentId: number; passed: boolean }>,
  convergence: ConvergenceGroup[],
): { recommended: number | null; scores: CopelandScore[] } {
  /** Criterion values for one agent plus the score record being accumulated. */
  interface Contender {
    testsPassed: number;
    groupSize: number;
    filesChanged: number;
    score: CopelandScore;
  }
  type CriterionKey = "testsWins" | "convergenceWins" | "filesChangedWins";

  const completed = agents.filter((a) => a.status === "success" && a.diff.length > 0);
  if (completed.length === 0) return { recommended: null, scores: [] };

  // Index test outcomes and group sizes once instead of scanning per agent.
  // The first occurrence wins, matching a left-to-right `find`.
  const passedById = new Map<number, boolean>();
  for (const t of testResults) {
    if (!passedById.has(t.agentId)) passedById.set(t.agentId, t.passed);
  }
  const groupSizeById = new Map<number, number>();
  for (const group of convergence) {
    for (const id of group.agents) {
      if (!groupSizeById.has(id)) groupSizeById.set(id, group.agents.length);
    }
  }

  const contenders: Contender[] = completed.map((agent) => ({
    testsPassed: passedById.get(agent.id) ? 1 : 0,
    groupSize: groupSizeById.get(agent.id) ?? 0,
    filesChanged: agent.filesChanged.length,
    score: {
      agentId: agent.id,
      testsWins: 0,
      convergenceWins: 0,
      filesChangedWins: 0,
      copelandTotal: 0,
    },
  }));

  // Each criterion maps a contender to a rank where higher is better.
  const criteria: ReadonlyArray<{ key: CriterionKey; rank: (c: Contender) => number }> = [
    { key: "testsWins", rank: (c) => c.testsPassed },
    { key: "convergenceWins", rank: (c) => c.groupSize },
    // Fewer files changed is better, so negate to keep "higher wins".
    { key: "filesChangedWins", rank: (c) => -c.filesChanged },
  ];

  // Visit every unordered pair exactly once.
  contenders.forEach((a, i) => {
    for (const b of contenders.slice(i + 1)) {
      // Criteria won by `a` minus criteria won by `b` in this matchup.
      let net = 0;
      for (const { key, rank } of criteria) {
        const rankA = rank(a);
        const rankB = rank(b);
        const outcome = rankA > rankB ? 1 : rankA < rankB ? -1 : 0;
        a.score[key] += outcome;
        b.score[key] -= outcome;
        net += outcome;
      }

      // Overall Copeland: winner of more criteria gets +1, loser -1.
      const overall = net > 0 ? 1 : net < 0 ? -1 : 0;
      a.score.copelandTotal += overall;
      b.score.copelandTotal -= overall;
    }
  });

  const scores = contenders.map((c) => c.score);

  // Strict `>` keeps the earliest agent when totals are tied.
  let best: CopelandScore | undefined;
  for (const score of scores) {
    if (best === undefined || score.copelandTotal > best.copelandTotal) best = score;
  }

  return { recommended: best?.agentId ?? null, scores };
}

src/types.ts

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ export interface RunOptions {
88
threshold: number;
99
verbose: boolean;
1010
runner?: string;
11+
scoring: "weighted" | "copeland";
1112
}
1213

1314
export interface AgentResult {
@@ -46,13 +47,23 @@ export interface AgentScore {
4647
total: number;
4748
}
4849

50+
/**
 * Per-agent result of Copeland pairwise scoring.
 *
 * The three `*Wins` fields are net tallies rather than plain win counts:
 * each head-to-head matchup adds 1 when this agent is better on that
 * criterion and subtracts 1 when the opponent is better, so they can be
 * negative.
 */
export interface CopelandScore {
51+
/** Id of the agent this score belongs to. */
agentId: number;
52+
/** Net matchups won on the tests-passed criterion. */
testsWins: number;
53+
/** Net matchups won on convergence group size (larger group is better). */
convergenceWins: number;
54+
/** Net matchups won on files changed (fewer files is better). */
filesChangedWins: number;
55+
/**
 * Overall Copeland score: +1 for each matchup in which this agent wins more
 * criteria than its opponent, -1 for each in which it wins fewer.
 */
copelandTotal: number;
56+
}
57+
4958
/** Outcome of one ensemble run, as assembled by the `run` command. */
export interface EnsembleResult {
5059
prompt: string;
5160
model: string;
5261
timestamp: string;
62+
/** Scoring method that was used to choose `recommended`. */
scoring: "weighted" | "copeland";
5363
agents: AgentResult[];
5464
tests: TestResult[];
5565
convergence: ConvergenceGroup[];
5666
/** Id of the agent picked by the selected scoring method, or null if none was picked. */
recommended: number | null;
5767
/** Weighted-method scores. */
scores: AgentScore[];
68+
/**
 * Copeland pairwise scores. Optional in the type; `run` fills it in
 * regardless of which scoring method was selected.
 */
copelandScores?: CopelandScore[];
5869
}

0 commit comments

Comments
 (0)