Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions docs/architecture.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,26 @@ Normal-sized and thorough diffs all receive the full 10 points. Only outlier-lar

The agent with the highest total score is recommended. Ties are broken in favor of the first agent.

### Copeland Pairwise Scoring (alternative)

Enabled with `--scoring copeland`. Instead of assigning absolute point values, Copeland scoring compares every pair of agents head-to-head on three criteria:

| Criterion | Better = |
|-----------|----------|
| Tests passed | Passed > Failed |
| Convergence group size | Larger group > Smaller group |
| Files changed | Fewer files > More files |

For each pair (A, B):
1. Count how many criteria A wins vs B wins
2. If A wins more criteria: A gets +1, B gets −1
3. If B wins more criteria: B gets +1, A gets −1
4. If tied on criteria count: both get 0

The agent with the highest cumulative Copeland score is recommended. Ties are broken in favor of the first agent.

**When to use Copeland:** Copeland scoring avoids arbitrary point weights and is resistant to scale distortion. It works well when you want each criterion to have equal importance regardless of magnitude. However, it can produce more ties than weighted scoring, especially with few agents.

### Why these weights?
- Tests (100) dominate because correctness trumps everything
- Convergence (50) is secondary — agreement without tests is weaker evidence
Expand Down
8 changes: 8 additions & 0 deletions src/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ program
"Convergence clustering similarity threshold (0.0-1.0)",
String(cfg.threshold),
)
.option("--scoring <method>", "Scoring method: weighted (default) or copeland", "weighted")
.option("--verbose", "Show detailed output from each agent")
.action(async (promptArg: string | undefined, opts) => {
const prompt = resolvePrompt(promptArg, opts.file);
Expand Down Expand Up @@ -70,6 +71,12 @@ program
process.exit(1);
}

const validScoring = ["weighted", "copeland"];
if (!validScoring.includes(opts.scoring)) {
console.error(`Error: --scoring must be one of: ${validScoring.join(", ")}`);
process.exit(1);
}

const knownModels = ["sonnet", "opus", "haiku"];
if (!knownModels.includes(opts.model) && !opts.model.startsWith("claude-")) {
console.warn(
Expand All @@ -86,6 +93,7 @@ program
model: opts.model,
threshold,
runner: opts.runner,
scoring: opts.scoring,
verbose: opts.verbose ?? false,
});
});
Expand Down
1 change: 1 addition & 0 deletions src/commands/run.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ function makeOpts(overrides: Partial<RunOptions> = {}): RunOptions {
model: "sonnet",
threshold: 0.3,
verbose: false,
scoring: "weighted",
...overrides,
};
}
Expand Down
9 changes: 7 additions & 2 deletions src/commands/run.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { mkdir, writeFile } from "node:fs/promises";
import { join } from "node:path";
import { getDefaultRunner, getRunner } from "../runners/registry.js";
import { analyzeConvergence, recommend } from "../scoring/convergence.js";
import { analyzeConvergence, copelandRecommend, recommend } from "../scoring/convergence.js";
import { runTests, validateTestCommand } from "../scoring/test-runner.js";
import type { AgentResult, EnsembleResult, RunOptions } from "../types.js";
import { displayApplyInstructions, displayHeader, displayResults } from "../utils/display.js";
Expand Down Expand Up @@ -129,18 +129,23 @@ export async function run(opts: RunOptions): Promise<void> {
const convergence = analyzeConvergence(agents, opts.threshold);

// Phase 5: Recommendation
const { recommended, scores } = recommend(agents, testResults, convergence);
const { recommended: weightedRec, scores } = recommend(agents, testResults, convergence);
const copeland = copelandRecommend(agents, testResults, convergence);

const recommended = opts.scoring === "copeland" ? copeland.recommended : weightedRec;

// Build result object
const result: EnsembleResult = {
prompt: opts.prompt,
model: opts.model,
timestamp: new Date().toISOString(),
scoring: opts.scoring,
agents,
tests: testResults,
convergence,
recommended,
scores,
copelandScores: copeland.scores,
};

// Display results
Expand Down
145 changes: 144 additions & 1 deletion src/scoring/convergence.test.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import assert from "node:assert/strict";
import { describe, it } from "node:test";
import type { AgentResult } from "../types.js";
import { analyzeConvergence, recommend } from "./convergence.js";
import { analyzeConvergence, copelandRecommend, recommend } from "./convergence.js";

const DIFF_A = `diff --git a/a.ts b/a.ts
--- a/a.ts
Expand Down Expand Up @@ -245,3 +245,146 @@ describe("recommend", () => {
assert.ok(score1.diffSizePoints < 10);
});
});

// Test suite for copelandRecommend: checks the recommended agent id, the
// per-criterion net win counters, and the overall Copeland total across
// empty, single-agent, tied, dominated and mixed-strength line-ups.
describe("copelandRecommend", () => {
// An agent with status "error" and an empty diff yields no recommendation and no scores.
it("returns null for no completed agents", () => {
const agents = [makeAgent({ id: 1, status: "error", diff: "" })];
const result = copelandRecommend(agents, [], []);
assert.equal(result.recommended, null);
assert.deepEqual(result.scores, []);
});

it("recommends the agent that dominates all criteria", () => {
// Agent 1: passes tests, fewer files
// Agent 2: fails tests, more files
// NOTE(review): with only two agents carrying different diffs, the convergence
// criterion presumably ties (each agent alone) — confirm against analyzeConvergence.
const agents = [
makeAgent({ id: 1, diff: DIFF_A, filesChanged: ["a.ts"] }),
makeAgent({ id: 2, diff: DIFF_B, filesChanged: ["b.ts", "c.ts"] }),
];
const tests = [
{ agentId: 1, passed: true },
{ agentId: 2, passed: false },
];
const convergence = analyzeConvergence(agents);
const result = copelandRecommend(agents, tests, convergence);

assert.equal(result.recommended, 1);
const score1 = result.scores.find((s) => s.agentId === 1);
assert.ok(score1);
assert.equal(score1.copelandTotal, 1); // wins the one pairwise matchup
assert.ok(score1.testsWins > 0);
});

it("all agents identical gives zero Copeland scores", () => {
// Three agents share the same diff, the same file list and the same test outcome.
const agents = [
makeAgent({ id: 1, diff: DIFF_A, filesChanged: ["a.ts"] }),
makeAgent({ id: 2, diff: DIFF_A, filesChanged: ["a.ts"] }),
makeAgent({ id: 3, diff: DIFF_A, filesChanged: ["a.ts"] }),
];
const tests = [
{ agentId: 1, passed: true },
{ agentId: 2, passed: true },
{ agentId: 3, passed: true },
];
const convergence = analyzeConvergence(agents);
const result = copelandRecommend(agents, tests, convergence);

// All agents tie on every criterion — all Copeland scores should be 0
for (const score of result.scores) {
assert.equal(score.copelandTotal, 0, `Agent #${score.agentId} should have Copeland score 0`);
assert.equal(score.testsWins, 0);
assert.equal(score.convergenceWins, 0);
assert.equal(score.filesChangedWins, 0);
}
// A full tie still produces a recommendation; this assertion only checks it is non-null.
assert.ok(result.recommended !== null);
});

it("handles agents with different strengths on different criteria (non-transitive)", () => {
// Agent 1: passes tests, many files, small group
// Agent 2: fails tests, few files, large group
// Agent 3: fails tests, many files, large group
// (group sizes assume analyzeConvergence clusters the two DIFF_B agents together — TODO confirm)
const agents = [
makeAgent({ id: 1, diff: DIFF_A, filesChanged: ["a.ts", "b.ts", "c.ts"] }),
makeAgent({ id: 2, diff: DIFF_B, filesChanged: ["x.ts"] }),
makeAgent({ id: 3, diff: DIFF_B, filesChanged: ["x.ts", "y.ts", "z.ts"] }),
];
const tests = [
{ agentId: 1, passed: true },
{ agentId: 2, passed: false },
{ agentId: 3, passed: false },
];
const convergence = analyzeConvergence(agents);
const result = copelandRecommend(agents, tests, convergence);

// Agent 1 vs Agent 2: tests(+1), convergence(-1), files(-1) → Agent 2 wins
// Agent 1 vs Agent 3: tests(+1), convergence(-1), files(tie) → tie
// Agent 2 vs Agent 3: tests(tie), convergence(tie), files(+1 for 2) → Agent 2 wins
// So Agent 2 should have the best Copeland score
assert.equal(result.recommended, 2);
});

it("prefers agent with test pass when other criteria are tied", () => {
// Same diff and same file list for both agents; only the test outcome differs.
const agents = [
makeAgent({ id: 1, diff: DIFF_A, filesChanged: ["a.ts"] }),
makeAgent({ id: 2, diff: DIFF_A, filesChanged: ["a.ts"] }),
];
const tests = [
{ agentId: 1, passed: true },
{ agentId: 2, passed: false },
];
const convergence = analyzeConvergence(agents);
const result = copelandRecommend(agents, tests, convergence);

assert.equal(result.recommended, 1);
const score1 = result.scores.find((s) => s.agentId === 1);
assert.ok(score1);
assert.equal(score1.testsWins, 1);
assert.equal(score1.copelandTotal, 1);
});

it("prefers fewer files changed when other criteria are equal", () => {
// Same diff for both agents and no test results supplied; only the file counts differ.
const agents = [
makeAgent({ id: 1, diff: DIFF_A, filesChanged: ["a.ts", "b.ts", "c.ts"] }),
makeAgent({ id: 2, diff: DIFF_A, filesChanged: ["a.ts"] }),
];
const convergence = analyzeConvergence(agents);
const result = copelandRecommend(agents, [], convergence);

assert.equal(result.recommended, 2);
});

it("returns per-agent criterion breakdowns", () => {
const agents = [
makeAgent({ id: 1, diff: DIFF_A, filesChanged: ["a.ts"] }),
makeAgent({ id: 2, diff: DIFF_B, filesChanged: ["b.ts", "c.ts"] }),
];
const tests = [
{ agentId: 1, passed: true },
{ agentId: 2, passed: false },
];
const convergence = analyzeConvergence(agents);
const result = copelandRecommend(agents, tests, convergence);

assert.equal(result.scores.length, 2);
const score1 = result.scores.find((s) => s.agentId === 1);
const score2 = result.scores.find((s) => s.agentId === 2);
assert.ok(score1);
assert.ok(score2);

// Counters are net values: the winner of a criterion gets +1 and the loser −1.
// Score1 wins tests and files, score2 wins neither
assert.equal(score1.testsWins, 1);
assert.equal(score2.testsWins, -1);
assert.equal(score1.filesChangedWins, 1);
assert.equal(score2.filesChangedWins, -1);
});

it("handles single agent", () => {
// With one agent there is no pairwise matchup, so the total is expected to stay at 0.
const agents = [makeAgent({ id: 1, diff: DIFF_A })];
const result = copelandRecommend(agents, [], []);

assert.equal(result.recommended, 1);
assert.equal(result.scores.length, 1);
assert.equal(result.scores[0]!.copelandTotal, 0);
});
});
105 changes: 104 additions & 1 deletion src/scoring/convergence.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import type { AgentResult, AgentScore, ConvergenceGroup } from "../types.js";
import type { AgentResult, AgentScore, ConvergenceGroup, CopelandScore } from "../types.js";
import { pairwiseSimilarity } from "./diff-parser.js";

/**
Expand Down Expand Up @@ -176,3 +176,106 @@ export function recommend(

return { recommended: bestId, scores: agentScores };
}

/**
 * Recommends an agent through Copeland-style pairwise matchups.
 *
 * Only agents whose status is "success" and whose diff is non-empty take part.
 * Every unordered pair is compared on three criteria: tests passed, size of the
 * convergence group containing the agent (larger wins), and number of files
 * changed (fewer wins). Each criterion adds +1 to the winner's counter for that
 * criterion and −1 to the loser's. The agent that takes more criteria in a
 * matchup gains one Copeland point and its opponent loses one; an even split
 * leaves both totals untouched.
 *
 * @param agents - All agent results; ineligible ones are ignored.
 * @param testResults - Per-agent test outcomes. An agent without an entry is
 *   ranked the same as one whose tests failed.
 * @param convergence - Convergence groups. An agent that belongs to no group is
 *   treated as having group size 0.
 * @returns The id of the first agent holding the highest Copeland total
 *   (`null` when no agent is eligible) together with every eligible agent's
 *   breakdown.
 */
export function copelandRecommend(
  agents: AgentResult[],
  testResults: Array<{ agentId: number; passed: boolean }>,
  convergence: ConvergenceGroup[],
): { recommended: number | null; scores: CopelandScore[] } {
  type Counter = "testsWins" | "convergenceWins" | "filesChangedWins" | "copelandTotal";
  type Verdict = 1 | 0 | -1;

  const eligible = agents.filter((agent) => agent.status === "success" && agent.diff.length > 0);
  if (eligible.length === 0) {
    return { recommended: null, scores: [] };
  }

  // Reduce each eligible agent to the three numbers the matchups rely on.
  const profiles = eligible.map((agent) => {
    const outcome = testResults.find((entry) => entry.agentId === agent.id);
    const cluster = convergence.find((candidate) => candidate.agents.includes(agent.id));
    return {
      id: agent.id,
      testsPassed: outcome?.passed ? 1 : 0,
      groupSize: cluster ? cluster.agents.length : 0,
      filesChanged: agent.filesChanged.length,
    };
  });

  // One zeroed tally per agent id, kept in first-seen order.
  const tallies = new Map<number, CopelandScore>(
    profiles.map((profile): [number, CopelandScore] => [
      profile.id,
      {
        agentId: profile.id,
        testsWins: 0,
        convergenceWins: 0,
        filesChangedWins: 0,
        copelandTotal: 0,
      },
    ]),
  );

  // Every profile id was registered above, so the lookup cannot miss.
  const tallyOf = (id: number): CopelandScore => tallies.get(id)!;

  // 1 when the left value is strictly greater, -1 when the right one is, 0 on a tie.
  const judge = (left: number, right: number): Verdict => {
    if (left > right) return 1;
    if (right > left) return -1;
    return 0;
  };

  // Credit the winner first, then debit the loser.
  const award = (winner: CopelandScore, loser: CopelandScore, counter: Counter): void => {
    winner[counter] += 1;
    loser[counter] -= 1;
  };

  profiles.forEach((first, position) => {
    for (const second of profiles.slice(position + 1)) {
      const firstTally = tallyOf(first.id);
      const secondTally = tallyOf(second.id);

      const rounds: Array<[Counter, Verdict]> = [
        ["testsWins", judge(first.testsPassed, second.testsPassed)],
        ["convergenceWins", judge(first.groupSize, second.groupSize)],
        // Arguments are swapped because fewer changed files is the better result.
        ["filesChangedWins", judge(second.filesChanged, first.filesChanged)],
      ];

      // Criteria won by `first` minus criteria won by `second`.
      let margin = 0;
      for (const [counter, verdict] of rounds) {
        if (verdict === 0) continue;
        margin += verdict;
        if (verdict > 0) {
          award(firstTally, secondTally, counter);
        } else {
          award(secondTally, firstTally, counter);
        }
      }

      if (margin > 0) {
        award(firstTally, secondTally, "copelandTotal");
      } else if (margin < 0) {
        award(secondTally, firstTally, "copelandTotal");
      }
    }
  });

  const scores = [...tallies.values()];

  // A strict comparison keeps the earliest agent when totals are equal.
  const leader = scores.reduce<CopelandScore | null>(
    (best, current) =>
      best === null || current.copelandTotal > best.copelandTotal ? current : best,
    null,
  );

  return { recommended: leader ? leader.agentId : null, scores };
}
11 changes: 11 additions & 0 deletions src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ export interface RunOptions {
threshold: number;
verbose: boolean;
runner?: string;
scoring: "weighted" | "copeland";
}

export interface AgentResult {
Expand Down Expand Up @@ -46,13 +47,23 @@ export interface AgentScore {
total: number;
}

/**
 * Per-agent tally produced by Copeland pairwise scoring.
 *
 * The three `*Wins` counters are net values accumulated over every pairwise
 * matchup: +1 for each opponent beaten on that criterion, −1 for each opponent
 * lost to, unchanged on a tie. They can therefore be negative.
 */
export interface CopelandScore {
/** Id of the agent this tally belongs to. */
agentId: number;
/** Net result on the "tests passed" criterion. */
testsWins: number;
/** Net result on the "convergence group size" criterion (larger group wins). */
convergenceWins: number;
/** Net result on the "files changed" criterion (fewer files wins). */
filesChangedWins: number;
/** Overall Copeland score: +1 per matchup won on more criteria, −1 per matchup lost, 0 for an even split. */
copelandTotal: number;
}

/**
 * Result record assembled at the end of an ensemble run.
 */
export interface EnsembleResult {
/** Prompt the run was started with. */
prompt: string;
/** Model name the run was started with. */
model: string;
/** Creation time of this record as an ISO 8601 string. */
timestamp: string;
/** Scoring method that determined `recommended`. */
scoring: "weighted" | "copeland";
/** Results of the individual agents. */
agents: AgentResult[];
/** Test outcomes collected for the agents. */
tests: TestResult[];
/** Convergence groups computed from the agents' results. */
convergence: ConvergenceGroup[];
/** Id of the agent picked by the selected scoring method, or `null` when none could be picked. */
recommended: number | null;
/** Weighted point scores; populated regardless of the selected scoring method. */
scores: AgentScore[];
/** Copeland pairwise tallies. Optional in the type — presumably so records written before Copeland scoring existed still conform (TODO confirm). */
copelandScores?: CopelandScore[];
}
Loading
Loading