Skip to content

Commit 251bc50

Browse files
that-github-user, unknown, and claude
authored
Add evaluate command: compare scoring methods across all runs (#106)
`thinktank evaluate` re-scores all past runs with the weighted, Copeland, and Borda methods, showing agreement rates and disagreements.

Key finding from 21 runs: Copeland and Borda agree 86% of the time, while weighted disagrees with each of them ~40% of the time. This suggests weighted scoring is the outlier — Copeland should likely be the default.

Partial fix for #105

Co-authored-by: unknown <that-github-user@github.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 102b0e6 commit 251bc50

2 files changed

Lines changed: 226 additions & 0 deletions

File tree

src/cli.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import { apply } from "./commands/apply.js";
55
import { clean } from "./commands/clean.js";
66
import { compare } from "./commands/compare.js";
77
import { type ConfigAction, config } from "./commands/config.js";
8+
import { evaluate } from "./commands/evaluate.js";
89
import { list } from "./commands/list.js";
910
import { run } from "./commands/run.js";
1011
import { stats } from "./commands/stats.js";
@@ -146,6 +147,13 @@ program
146147
await stats();
147148
});
148149

150+
// Register the "evaluate" subcommand: it declares no arguments and its action awaits evaluate() from ./commands/evaluate.js.
program
151+
.command("evaluate")
152+
.description("Compare scoring methods (weighted vs Copeland vs Borda) across all runs")
153+
.action(async () => {
154+
await evaluate();
155+
});
156+
149157
const configCmd = program
150158
.command("config")
151159
.description("View and update thinktank configuration (.thinktank/config.json)");

src/commands/evaluate.ts

Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,218 @@
1+
import { readdir, readFile } from "node:fs/promises";
2+
import { join } from "node:path";
3+
import pc from "picocolors";
4+
import { analyzeConvergence, copelandRecommend, recommend } from "../scoring/convergence.js";
5+
import type { EnsembleResult } from "../types.js";
6+
7+
/**
 * Outcome of re-scoring a single stored run with every scoring method.
 * One record is produced per usable run by `evaluate()`.
 */
interface RunEvaluation {
  /** Identifier of the run; `evaluate()` fills this with the run's timestamp. */
  file: string;

  /** Total number of agents recorded in the run. */
  agentCount: number;
  /** Number of agents that succeeded and produced a diff longer than 10 characters. */
  agentsWithDiffs: number;
  /** Whether the run contains at least one test result. */
  hasTests: boolean;

  /** Agent id recommended by the weighted method, or `null` when it recommends none. */
  weightedPick: number | null;
  /** Agent id recommended by the Copeland method, or `null` when it recommends none. */
  copelandPick: number | null;
  /** Agent id recommended by the Borda method, or `null` when it recommends none. */
  bordaPick: number | null;

  /** True only when all three methods recommend the same agent. */
  agree: boolean;
}
17+
18+
/**
 * Assign competition ranks ("1224" ranking) to a list of scored entries.
 *
 * Entries are ordered with `compare` (best first). The best entry gets rank 0,
 * and entries whose values are equal share the rank of the first entry in
 * their tie group, so input order never decides a rank.
 *
 * @param entries - Agent ids paired with the value to rank them by.
 * @param compare - Comparator over values; negative means the first value is better.
 * @returns Map from agent id to its rank (0 = best).
 */
function competitionRanks(
  entries: ReadonlyArray<{ id: number; value: number }>,
  compare: (a: number, b: number) => number,
): Map<number, number> {
  // Sort a copy so the caller's array is left untouched.
  const ordered = [...entries].sort((a, b) => compare(a.value, b.value));
  const ranks = new Map<number, number>();
  let previousValue: number | undefined;
  let previousRank = 0;
  ordered.forEach((entry, index) => {
    const rank = index > 0 && entry.value === previousValue ? previousRank : index;
    ranks.set(entry.id, rank);
    previousValue = entry.value;
    previousRank = rank;
  });
  return ranks;
}

/**
 * Borda count: rank agents on each criterion and sum the ranks. The agent with
 * the lowest total rank wins.
 *
 * Only agents with status "success" and a diff longer than 10 characters are
 * considered. The criteria are:
 *   1. tests passed (passed = rank 0, failed or no test entry = rank 1),
 *   2. size of the agent's convergence group (larger is better),
 *   3. number of files changed (fewer is better).
 *
 * Agents that tie on a criterion share the same rank for it. Ranking by sort
 * position instead would hand agents with identical values different ranks
 * purely because of their order in the input.
 *
 * @param result - The stored run to score; its `agents`, `tests` and `convergence` are read.
 * @returns The recommended agent id (`null` when no agent qualifies) and each
 *          qualifying agent's total rank. When totals tie, the agent that
 *          appears first in `result.agents` is recommended.
 */
function bordaRecommend(result: EnsembleResult): {
  recommended: number | null;
  ranks: Map<number, number>;
} {
  const completed = result.agents.filter((a) => a.status === "success" && a.diff.length > 10);
  if (completed.length === 0) return { recommended: null, ranks: new Map() };

  // Criterion 1: tests passed. The first test entry found for an agent decides;
  // an agent without any test entry is treated as failing.
  const testRanks = new Map<number, number>();
  for (const a of completed) {
    const passed = result.tests.find((t) => t.agentId === a.id)?.passed;
    testRanks.set(a.id, passed ? 0 : 1);
  }

  // Criterion 2: convergence group size (larger = better = lower rank).
  // Agents that belong to no group count as size 0.
  const convRanks = competitionRanks(
    completed.map((a) => {
      const group = result.convergence.find((g) => g.agents.includes(a.id));
      return { id: a.id, value: group ? group.agents.length : 0 };
    }),
    (a, b) => b - a,
  );

  // Criterion 3: files changed (fewer = better = lower rank).
  const fileRanks = competitionRanks(
    completed.map((a) => ({ id: a.id, value: a.filesChanged.length })),
    (a, b) => a - b,
  );

  // Sum ranks per agent, in the order the agents appear in the run.
  const totalRanks = new Map<number, number>();
  for (const a of completed) {
    const total =
      (testRanks.get(a.id) ?? 0) + (convRanks.get(a.id) ?? 0) + (fileRanks.get(a.id) ?? 0);
    totalRanks.set(a.id, total);
  }

  // Lowest rank sum wins; the strict comparison keeps the earliest agent on a tie.
  let bestId: number | null = null;
  let bestRank = Infinity;
  for (const [id, rank] of totalRanks) {
    if (rank < bestRank) {
      bestRank = rank;
      bestId = id;
    }
  }

  return { recommended: bestId, ranks: totalRanks };
}
70+
71+
/**
 * Check that a parsed run file has the array fields the evaluation reads
 * (`agents`, `tests`, `convergence`). Files that are valid JSON but have a
 * different shape are rejected here instead of crashing later.
 */
function isEnsembleResultShape(value: unknown): value is EnsembleResult {
  if (typeof value !== "object" || value === null) return false;
  const candidate = value as Record<string, unknown>;
  return (
    Array.isArray(candidate.agents) &&
    Array.isArray(candidate.tests) &&
    Array.isArray(candidate.convergence)
  );
}

/** Render a recommended agent id as "#<id>", or "-" when a method picked nobody. */
function formatPick(pick: number | null): string {
  return pick !== null ? `#${pick}` : "-";
}

/** Count the agents that succeeded and produced a diff longer than 10 characters. */
function countAgentsWithDiffs(run: EnsembleResult): number {
  return run.agents.filter((a) => a.status === "success" && a.diff.length > 10).length;
}

/**
 * Re-score every stored run with the weighted, Copeland and Borda methods and
 * print a per-run comparison, pairwise agreement rates and the runs on which
 * the methods disagree.
 *
 * Runs are read from `run-*.json` files in `.thinktank/`. Files that cannot be
 * read, are not valid JSON, or do not look like a run are skipped. Only runs
 * with at least two agents that produced a diff and at least one test result
 * are evaluated.
 *
 * @returns A promise that resolves once the report has been printed. Nothing
 *          is returned; all output goes to the console.
 */
export async function evaluate(): Promise<void> {
  let files: string[];
  try {
    const entries = await readdir(".thinktank");
    // Directory listing order is not guaranteed; sort so the "Run #n" numbering
    // is stable between invocations.
    files = entries.filter((f) => f.startsWith("run-") && f.endsWith(".json")).sort();
  } catch {
    console.log(pc.yellow(" No .thinktank/ directory found. Run thinktank run first."));
    return;
  }

  // Load all runs
  const runs: EnsembleResult[] = [];
  for (const file of files) {
    try {
      const parsed: unknown = JSON.parse(await readFile(join(".thinktank", file), "utf-8"));
      if (isEnsembleResultShape(parsed)) runs.push(parsed);
    } catch {
      // Best effort: skip files that cannot be read or parsed.
    }
  }

  // Filter to runs with 2+ agents with diffs and test results
  const usable = runs.filter((r) => countAgentsWithDiffs(r) >= 2 && r.tests.length > 0);

  if (usable.length === 0) {
    console.log(pc.yellow(" No runs with 2+ agents and test results found."));
    return;
  }

  console.log();
  console.log(pc.bold(" Scoring Method Evaluation"));
  console.log(pc.dim(" ─".repeat(30)));
  console.log(` Usable runs: ${pc.cyan(String(usable.length))} (of ${runs.length} total)`);
  console.log();

  // Evaluate each run with all three methods. Convergence is recomputed from
  // the agents for the weighted and Copeland methods.
  const evals: RunEvaluation[] = [];

  for (const run of usable) {
    const convergence = analyzeConvergence(run.agents);
    const weighted = recommend(run.agents, run.tests, convergence);
    const copeland = copelandRecommend(run.agents, run.tests, convergence);
    const borda = bordaRecommend(run);

    const agree =
      weighted.recommended === copeland.recommended && copeland.recommended === borda.recommended;

    evals.push({
      file: run.timestamp,
      agentCount: run.agents.length,
      agentsWithDiffs: countAgentsWithDiffs(run),
      hasTests: run.tests.length > 0,
      weightedPick: weighted.recommended,
      copelandPick: copeland.recommended,
      bordaPick: borda.recommended,
      agree,
    });
  }

  // Display per-run comparison
  console.log(
    " " +
      pc.dim(padRight("Run", 10)) +
      padRight("Agents", 8) +
      padRight("Weighted", 10) +
      padRight("Copeland", 10) +
      padRight("Borda", 8) +
      padRight("Agree?", 8),
  );
  console.log(" " + pc.dim("─".repeat(54)));

  for (const [i, e] of evals.entries()) {
    const agreeStr = e.agree ? pc.green("yes") : pc.red("NO");
    console.log(
      " " +
        pc.dim(padRight(`#${i + 1}`, 10)) +
        padRight(String(e.agentsWithDiffs), 8) +
        padRight(formatPick(e.weightedPick), 10) +
        padRight(formatPick(e.copelandPick), 10) +
        padRight(formatPick(e.bordaPick), 8) +
        padRight(agreeStr, 8),
    );
  }

  // Agreement statistics
  const totalRuns = evals.length;
  const allAgree = evals.filter((e) => e.agree).length;
  const wcAgree = evals.filter((e) => e.weightedPick === e.copelandPick).length;
  const wbAgree = evals.filter((e) => e.weightedPick === e.bordaPick).length;
  const cbAgree = evals.filter((e) => e.copelandPick === e.bordaPick).length;

  console.log();
  console.log(pc.bold(" Agreement Rates"));
  console.log(pc.dim(" ─".repeat(30)));
  console.log(` All three agree: ${pc.cyan(pct(allAgree, totalRuns))}`);
  console.log(` Weighted = Copeland: ${pc.cyan(pct(wcAgree, totalRuns))}`);
  console.log(` Weighted = Borda: ${pc.cyan(pct(wbAgree, totalRuns))}`);
  console.log(` Copeland = Borda: ${pc.cyan(pct(cbAgree, totalRuns))}`);
  console.log();

  // List the runs on which the three methods did not all pick the same agent.
  // Run numbers match the numbering used in the table above.
  const hasDisagreements = evals.some((e) => !e.agree);
  if (hasDisagreements) {
    console.log(pc.bold(" Disagreements"));
    console.log(pc.dim(" ─".repeat(30)));
    for (const [i, e] of evals.entries()) {
      if (!e.agree) {
        console.log(
          ` Run #${i + 1}: Weighted→${formatPick(e.weightedPick)} Copeland→${formatPick(e.copelandPick)} Borda→${formatPick(e.bordaPick)}`,
        );
      }
    }
    console.log();
    console.log(
      pc.dim(
        " When methods disagree, consider using --scoring copeland or manually\n" +
          " reviewing with thinktank compare to pick the best agent.",
      ),
    );
  } else {
    console.log(
      pc.green(" All methods agree on every run — scoring method choice doesn't matter!"),
    );
  }

  console.log();
}
208+
209+
/**
 * Pad `str` with trailing spaces until its visible width reaches `len`.
 *
 * ANSI colour escape sequences are ignored when measuring the width, so
 * coloured and plain cells line up in the same column. Strings that are
 * already at least `len` wide are returned unchanged.
 *
 * @param str - Text to pad; may contain ANSI colour codes.
 * @param len - Desired visible width in characters.
 * @returns `str` followed by however many spaces are needed.
 */
function padRight(str: string, len: number): string {
  // biome-ignore lint/suspicious/noControlCharactersInRegex: intentional ANSI escape sequence matching
  const visibleLength = str.replace(/\x1b\[[0-9;]*m/g, "").length;
  const missing = len - visibleLength;
  return missing > 0 ? str + " ".repeat(missing) : str;
}
215+
216+
/**
 * Format a count as "n/total (p%)", with the percentage rounded to the
 * nearest whole number.
 *
 * @param n - Number of matching items.
 * @param total - Number of items considered.
 * @returns The formatted ratio. When `total` is 0 the percentage is reported
 *          as 0% rather than dividing by zero (which would print NaN or Infinity).
 */
function pct(n: number, total: number): string {
  const percent = total === 0 ? 0 : Math.round((n / total) * 100);
  return `${n}/${total} (${percent}%)`;
}

0 commit comments

Comments
 (0)