|
| 1 | +import { readdir, readFile } from "node:fs/promises"; |
| 2 | +import { join } from "node:path"; |
| 3 | +import pc from "picocolors"; |
| 4 | +import { analyzeConvergence, copelandRecommend, recommend } from "../scoring/convergence.js"; |
| 5 | +import type { EnsembleResult } from "../types.js"; |
| 6 | + |
/** Outcome of scoring one saved run with each of the three methods. */
interface RunEvaluation {
  /** Identifier of the run; populated from the run's `timestamp`. */
  file: string;

  /** Total number of agents recorded in the run. */
  agentCount: number;
  /** Number of agents that succeeded and produced a diff longer than 10 characters. */
  agentsWithDiffs: number;
  /** Whether the run contains at least one test result. */
  hasTests: boolean;

  /** Agent id picked by the weighted method, or `null` when it picked none. */
  weightedPick: number | null;
  /** Agent id picked by the Copeland method, or `null` when it picked none. */
  copelandPick: number | null;
  /** Agent id picked by the Borda method, or `null` when it picked none. */
  bordaPick: number | null;

  /** True when all three methods picked the same agent. */
  agree: boolean;
}
| 17 | + |
/**
 * Assigns tie-aware ranks from a map of agent id -> cost (lower cost is better).
 *
 * Uses standard competition ranking: an agent's rank is the number of agents
 * with a strictly lower cost, so agents with equal cost share the same rank.
 */
function competitionRanks(costs: ReadonlyMap<number, number>): Map<number, number> {
  const ascending = [...costs.values()].sort((x, y) => x - y);
  const firstIndexOf = new Map<number, number>();
  ascending.forEach((cost, index) => {
    if (!firstIndexOf.has(cost)) firstIndexOf.set(cost, index);
  });

  const ranks = new Map<number, number>();
  for (const [id, cost] of costs) ranks.set(id, firstIndexOf.get(cost) ?? 0);
  return ranks;
}

/**
 * Borda count: rank agents on each criterion, sum ranks. Lowest total rank wins.
 *
 * Only agents with status "success" and a diff longer than 10 characters are
 * considered. Three criteria contribute to an agent's total:
 *   1. tests: 0 if the agent's first test entry passed, 1 otherwise
 *      (an agent with no test entry counts as failed);
 *   2. convergence group size: larger group = lower rank;
 *   3. files changed: fewer files = lower rank.
 *
 * Agents that tie on a criterion share the same rank for it, so input order
 * never decides a criterion. Input order only breaks ties between equal totals
 * (the earliest agent wins).
 *
 * @param result - The run to score; reads `agents`, `tests` and `convergence`.
 * @returns The winning agent id (or `null` when no agent qualifies) and the
 *          summed rank per qualifying agent id.
 */
function bordaRecommend(result: EnsembleResult): {
  recommended: number | null;
  ranks: Map<number, number>;
} {
  const completed = result.agents.filter((a) => a.status === "success" && a.diff.length > 10);
  if (completed.length === 0) return { recommended: null, ranks: new Map() };

  // First test entry per agent wins, matching a front-to-back `find`.
  const passedByAgent = new Map<number, boolean>();
  for (const t of result.tests) {
    if (!passedByAgent.has(t.agentId)) passedByAgent.set(t.agentId, Boolean(t.passed));
  }

  // First convergence group containing the agent wins, matching a front-to-back `find`.
  const groupSizeByAgent = new Map<number, number>();
  for (const group of result.convergence) {
    for (const agentId of group.agents) {
      if (!groupSizeByAgent.has(agentId)) groupSizeByAgent.set(agentId, group.agents.length);
    }
  }

  // Express every criterion as a cost so one ranking helper handles both.
  const convCosts = new Map<number, number>();
  const fileCosts = new Map<number, number>();
  for (const a of completed) {
    // Negated: a larger group must yield a lower (better) cost.
    convCosts.set(a.id, -(groupSizeByAgent.get(a.id) ?? 0));
    fileCosts.set(a.id, a.filesChanged.length);
  }
  const convRanks = competitionRanks(convCosts);
  const fileRanks = competitionRanks(fileCosts);

  // Sum ranks; track the lowest total, keeping the earliest agent on ties.
  const totalRanks = new Map<number, number>();
  for (const a of completed) {
    const testRank = passedByAgent.get(a.id) ? 0 : 1;
    totalRanks.set(a.id, testRank + (convRanks.get(a.id) ?? 0) + (fileRanks.get(a.id) ?? 0));
  }

  let bestId: number | null = null;
  let bestRank = Infinity;
  for (const [id, rank] of totalRanks) {
    if (rank < bestRank) {
      bestRank = rank;
      bestId = id;
    }
  }

  return { recommended: bestId, ranks: totalRanks };
}
| 70 | + |
/** Renders a method's pick as `#<id>`, or `-` when the method picked no agent. */
function formatPick(pick: number | null): string {
  return pick !== null ? `#${pick}` : "-";
}

/**
 * Compares the weighted, Copeland and Borda scoring methods across saved runs.
 *
 * Reads every `run-*.json` file in `.thinktank/`, keeps the runs that have at
 * least two successful agents with a diff longer than 10 characters plus at
 * least one test result, and prints to the console:
 *   - a per-run table with each method's pick,
 *   - pairwise and overall agreement rates,
 *   - the list of runs where the methods disagree.
 *
 * Files that cannot be read or parsed are skipped. Returns early with a notice
 * when the directory is missing or no usable run exists.
 */
export async function evaluate(): Promise<void> {
  let files: string[];
  try {
    const entries = await readdir(".thinktank");
    // Directory listing order is not guaranteed; sort so run numbering is stable.
    files = entries.filter((f) => f.startsWith("run-") && f.endsWith(".json")).sort();
  } catch {
    console.log(pc.yellow(" No .thinktank/ directory found. Run thinktank run first."));
    return;
  }

  // Load all runs
  const runs: EnsembleResult[] = [];
  for (const file of files) {
    try {
      const raw = await readFile(join(".thinktank", file), "utf-8");
      runs.push(JSON.parse(raw) as EnsembleResult);
    } catch {
      // Best effort: skip unreadable or malformed run files.
    }
  }

  // Filter to runs with 2+ agents with diffs and test results
  const usable = runs.filter((r) => {
    const withDiffs = r.agents.filter((a) => a.status === "success" && a.diff.length > 10);
    return withDiffs.length >= 2 && r.tests.length > 0;
  });

  if (usable.length === 0) {
    console.log(pc.yellow(" No runs with 2+ agents and test results found."));
    return;
  }

  console.log();
  console.log(pc.bold(" Scoring Method Evaluation"));
  console.log(pc.dim(" ─".repeat(30)));
  console.log(` Usable runs: ${pc.cyan(String(usable.length))} (of ${runs.length} total)`);
  console.log();

  // Evaluate each run with all three methods
  const evals: RunEvaluation[] = [];

  for (const run of usable) {
    const convergence = analyzeConvergence(run.agents);
    const weighted = recommend(run.agents, run.tests, convergence);
    const copeland = copelandRecommend(run.agents, run.tests, convergence);
    // Feed Borda the same freshly computed convergence as the other two methods
    // so the comparison does not depend on whatever was stored with the run.
    const borda = bordaRecommend({ ...run, convergence });

    const agentsWithDiffs = run.agents.filter(
      (a) => a.status === "success" && a.diff.length > 10,
    ).length;
    const agree =
      weighted.recommended === copeland.recommended && copeland.recommended === borda.recommended;

    evals.push({
      file: run.timestamp,
      agentCount: run.agents.length,
      agentsWithDiffs,
      hasTests: run.tests.length > 0,
      weightedPick: weighted.recommended,
      copelandPick: copeland.recommended,
      bordaPick: borda.recommended,
      agree,
    });
  }

  // Display per-run comparison
  console.log(
    " " +
      pc.dim(padRight("Run", 10)) +
      padRight("Agents", 8) +
      padRight("Weighted", 10) +
      padRight("Copeland", 10) +
      padRight("Borda", 8) +
      padRight("Agree?", 8),
  );
  console.log(" " + pc.dim("─".repeat(54)));

  evals.forEach((e, i) => {
    const agreeStr = e.agree ? pc.green("yes") : pc.red("NO");
    console.log(
      " " +
        pc.dim(padRight(`#${i + 1}`, 10)) +
        padRight(String(e.agentsWithDiffs), 8) +
        padRight(formatPick(e.weightedPick), 10) +
        padRight(formatPick(e.copelandPick), 10) +
        padRight(formatPick(e.bordaPick), 8) +
        padRight(agreeStr, 8),
    );
  });

  // Agreement statistics
  const totalRuns = evals.length;
  const allAgree = evals.filter((e) => e.agree).length;
  const wcAgree = evals.filter((e) => e.weightedPick === e.copelandPick).length;
  const wbAgree = evals.filter((e) => e.weightedPick === e.bordaPick).length;
  const cbAgree = evals.filter((e) => e.copelandPick === e.bordaPick).length;

  console.log();
  console.log(pc.bold(" Agreement Rates"));
  console.log(pc.dim(" ─".repeat(30)));
  console.log(` All three agree: ${pc.cyan(pct(allAgree, totalRuns))}`);
  console.log(` Weighted = Copeland: ${pc.cyan(pct(wcAgree, totalRuns))}`);
  console.log(` Weighted = Borda: ${pc.cyan(pct(wbAgree, totalRuns))}`);
  console.log(` Copeland = Borda: ${pc.cyan(pct(cbAgree, totalRuns))}`);
  console.log();

  // List the runs where the methods picked different agents, keeping the
  // same run numbers as the table above.
  if (allAgree < totalRuns) {
    console.log(pc.bold(" Disagreements"));
    console.log(pc.dim(" ─".repeat(30)));
    evals.forEach((e, i) => {
      if (e.agree) return;
      console.log(
        ` Run #${i + 1}: Weighted→${formatPick(e.weightedPick)} Copeland→${formatPick(e.copelandPick)} Borda→${formatPick(e.bordaPick)}`,
      );
    });
    console.log();
    console.log(
      pc.dim(
        " When methods disagree, consider using --scoring copeland or manually\n" +
          " reviewing with thinktank compare to pick the best agent.",
      ),
    );
  } else {
    console.log(
      pc.green(" All methods agree on every run — scoring method choice doesn't matter!"),
    );
  }

  console.log();
}
| 208 | + |
/**
 * Right-pads `str` with spaces until its visible width reaches `len`.
 *
 * ANSI colour escape sequences are ignored when measuring the width, so
 * coloured text lines up with plain text. Strings already at least `len`
 * visible characters wide are returned unchanged.
 */
function padRight(str: string, len: number): string {
  // biome-ignore lint/suspicious/noControlCharactersInRegex: intentional ANSI escape sequence matching
  const visibleWidth = str.replace(/\x1b\[[0-9;]*m/g, "").length;
  const missing = len - visibleWidth;
  if (missing <= 0) {
    return str;
  }
  return `${str}${" ".repeat(missing)}`;
}
| 215 | + |
/**
 * Formats a ratio as `n/total (P%)`, with the percentage rounded to the
 * nearest integer.
 *
 * A `total` of 0 is reported as 0% instead of dividing by zero.
 */
function pct(n: number, total: number): string {
  const percent = total === 0 ? 0 : Math.round((n / total) * 100);
  return `${n}/${total} (${percent}%)`;
}
0 commit comments