From 7e0201a452eafdde51fa349483efed77a2d21e01 Mon Sep 17 00:00:00 2001 From: unknown Date: Sat, 28 Mar 2026 20:32:40 -0700 Subject: [PATCH 1/2] Major README update: add all commands, Copeland scoring, new features MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit README was severely outdated — missing Copeland scoring, evaluate, undo, clean, config, compare, stats filters, --retry, --file, --scoring, --whitespace-insensitive, --no-color, --output-format, --preview, --dry-run, Bedrock support, and technical report link. Added: Commands section with all 10 commands and flags, Scoring section explaining Copeland pairwise, updated example output with Copeland table, updated comparison table, technical reports link. Co-Authored-By: Claude Opus 4.6 (1M context) --- README.md | 131 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 105 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 818d40a..0fc4d8d 100644 --- a/README.md +++ b/README.md @@ -14,13 +14,14 @@

Quick Start · How It Works · + Commands · Contributing · References

--- -Run N parallel Claude Code agents on the same task, then select the best result via test execution and convergence analysis. Based on the principle that **the aggregate of independent attempts outperforms any single attempt** — proven in [ensemble ML](https://en.wikipedia.org/wiki/Ensemble_learning), [superforecasting](https://en.wikipedia.org/wiki/Superforecasting), and [LLM code generation research](#references). +Run N parallel Claude Code agents on the same task, then select the best result via test execution and **Copeland pairwise scoring**. Based on the principle that **the aggregate of independent attempts outperforms any single attempt** — proven in [ensemble ML](https://en.wikipedia.org/wiki/Ensemble_learning), [superforecasting](https://en.wikipedia.org/wiki/Superforecasting), and [LLM code generation research](#references). ## Quick start @@ -36,8 +37,15 @@ thinktank run "fix the authentication bypass" # Run 5 agents with test verification thinktank run "fix the race condition" -n 5 -t "npm test" +# Read prompt from a file (avoids shell expansion issues) +thinktank run -f task.md -n 5 -t "npm test" + # Apply the best result thinktank apply + +# Set persistent defaults +thinktank config set attempts 5 +thinktank config set model opus ``` Requires [Claude Code CLI](https://docs.anthropic.com/en/docs/claude-code) installed and authenticated. @@ -82,9 +90,17 @@ Use `--model` to select a Claude model: `sonnet` (default), `opus`, `haiku`, or 2. Each agent independently solves the task (no shared context = true independence) 3. Runs your **test suite** on each result 4. Analyzes **convergence** — did the agents agree on an approach? -5. **Recommends** the best candidate (tests passing + consensus + smallest diff) +5. **Recommends** the best candidate via Copeland pairwise scoring 6. You review and `thinktank apply` +## Scoring + +The default scoring method is **Copeland pairwise ranking**. Every agent is compared head-to-head against every other agent across four criteria: tests passed, convergence group size, minimal file scope, and test files contributed. The agent that wins the most pairwise matchups is recommended. + +An alternative `--scoring weighted` method is also available, which assigns point values to tests (100), convergence (50), and diff size (10). + +Use `thinktank evaluate` to compare how different scoring methods rank your results. See [docs/scoring-evaluation.md](docs/scoring-evaluation.md) for the full analysis. + ## Why this works Every model ever benchmarked shows **pass@5 >> pass@1**. The gap between "one attempt" and "best of five" is one of the largest free reliability gains in AI coding. But no tool exposes this — until now. @@ -104,28 +120,82 @@ The key insight: **parallel attempts cost more tokens but not more time.** All a - **Complex refactors** — many files, easy to miss something - **Unfamiliar codebases** — agents might go the wrong direction -## Usage +## Commands -```bash -# Run with defaults (3 agents, sonnet model) -thinktank run "add rate limiting to the API" +### `thinktank run [prompt]` -# Run 5 agents with test verification -thinktank run "fix the race condition in the cache layer" -n 5 -t "npm test" +Run N parallel agents on a task. -# Use a specific model -thinktank run "migrate callbacks to async/await" --model opus -n 3 +| Flag | Description | +|------|-------------| +| `-n, --attempts ` | Number of parallel agents (default: 3, max: 20) | +| `-f, --file ` | Read prompt from a file | +| `-t, --test-cmd ` | Test command to verify results | +| `--test-timeout ` | Timeout for test command (default: 120s) | +| `--timeout ` | Timeout per agent (default: 600s) | +| `--model ` | Claude model: sonnet, opus, haiku, or full ID | +| `--scoring ` | Scoring method: `copeland` (default) or `weighted` | +| `--threshold <0-1>` | Convergence clustering similarity threshold | +| `--whitespace-insensitive` | Ignore whitespace in convergence comparison | +| `--retry` | Re-run only failed/timed-out agents from the last run | +| `--output-format ` | Output format: `text` (default) or `json` | +| `--no-color` | Disable colored output | +| `--verbose` | Show detailed agent output | -# Apply the recommended result -thinktank apply +### `thinktank apply` + +Apply the recommended agent's changes to your working tree. + +| Flag | Description | +|------|-------------| +| `-a, --agent ` | Apply a specific agent's result | +| `-p, --preview` | Show the diff without applying | +| `-d, --dry-run` | Show what would be applied without making changes | + +### `thinktank undo` + +Reverse the last applied diff. + +### `thinktank list [run-number]` + +List all past runs, or show details for a specific run. -# Apply a specific agent's result -thinktank apply --agent 2 +### `thinktank compare ` -# View the last run's results -thinktank list +Compare two agents' results side by side. + +### `thinktank stats` + +Show aggregate statistics across all runs. + +| Flag | Description | +|------|-------------| +| `--model ` | Filter to runs using a specific model | +| `--since ` | Show runs from this date onward (ISO 8601) | +| `--until ` | Show runs up to this date (ISO 8601) | +| `--passed-only` | Only runs where at least one agent passed tests | + +### `thinktank evaluate` + +Compare scoring methods (weighted vs Copeland vs Borda) across all runs to see how they differ in recommendations. + +### `thinktank clean` + +Remove thinktank worktrees and branches. Add `--all` to also delete `.thinktank/` run history. + +### `thinktank config set|get|list` + +View and update persistent configuration (stored in `.thinktank/config.json`). + +```bash +thinktank config set attempts 5 # persistent default +thinktank config set model opus +thinktank config get attempts +thinktank config list # show all values ``` +Available keys: `attempts`, `model`, `timeout`, `runner`, `threshold`, `testTimeout`. + ## Example output ``` @@ -152,21 +222,27 @@ Convergence Strong consensus — 3/5 agents changed the same files Files: src/middleware/auth.ts, tests/auth.test.ts - Agents [3]: ████░░░░░░░░░░░░░░░░ 20% - Divergent approach — 1/5 agents went a different direction - Files: src/middleware/auth.ts, src/utils/jwt.ts, tests/auth.test.ts +Copeland Pairwise Scoring +──────────────────────────────────────────────────────────── + Agent Tests Converge Scope TestCov Copeland + ────────────────────────────────────────────────────────── +>> #1 +3 +1 0 +1 +4 + #2 +3 +1 0 +1 +4 + #3 +3 -4 -4 +1 -4 + #4 -4 +1 +4 -4 -4 + #5 +3 +1 0 +1 +4 - Recommended: Agent #1 (highest score based on tests + convergence + diff size) + Recommended: Agent #1 (Copeland winner) ``` ## How it compares -| Approach | Reliability | Cost | Speed | -|----------|-------------|------|-------| -| Single Claude Code run | pass@1 | 1x | Fastest | -| **thinktank (N=3)** | **~pass@3** | **3x** | **Same wall time** | -| **thinktank (N=5)** | **~pass@5** | **5x** | **Same wall time** | -| Manual retry loop | pass@k (sequential) | kx | k × slower | +| Approach | Reliability | Cost | Speed | Selection | +|----------|-------------|------|-------|-----------| +| Single Claude Code run | pass@1 | 1x | Fastest | N/A | +| **thinktank (N=3)** | **~pass@3** | **3x** | **Same wall time** | **Copeland pairwise** | +| **thinktank (N=5)** | **~pass@5** | **5x** | **Same wall time** | **Copeland pairwise** | +| Manual retry loop | pass@k (sequential) | kx | k × slower | Manual | ## References @@ -183,3 +259,6 @@ Convergence ### Ensemble theory - *Superforecasting* — Tetlock & Gardner. The aggregate of independent forecasters consistently beats individuals. - *The Wisdom of Crowds* — Surowiecki. Independent estimates, when aggregated, converge on truth. + +### Technical reports +- [Scoring Method Evaluation](docs/scoring-evaluation.md) — Copeland vs Weighted vs Borda across 21 runs. Key finding: Copeland and Borda agree 86%, weighted disagrees ~40%. From bfbf1fd847242955945919b019f8d7b2484c5525 Mon Sep 17 00:00:00 2001 From: unknown Date: Sat, 28 Mar 2026 20:42:25 -0700 Subject: [PATCH 2/2] Major README update + fix .git file deletion during agent runs README: Add all 10 commands, Copeland scoring section, all new flags (--scoring, --retry, --file, --whitespace-insensitive, --no-color, --output-format, --threshold, --test-timeout, --dry-run, --preview), Bedrock support, technical report link, updated example output. Runner: Backup .git pointer file before spawning agent, restore it after agent completes if deleted. Fixes the critical dogfooding bug where long-running Opus agents would lose the worktree git context. Co-Authored-By: Claude Opus 4.6 (1M context) --- README.md | 37 ++++++++++++++++++++++++------------- src/runners/claude-code.ts | 20 ++++++++++++++++++++ 2 files changed, 44 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 0fc4d8d..845c849 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,9 @@ thinktank run "fix the race condition" -n 5 -t "npm test" # Read prompt from a file (avoids shell expansion issues) thinktank run -f task.md -n 5 -t "npm test" +# Pipe prompt from stdin +echo "refactor the parser" | thinktank run -n 3 + # Apply the best result thinktank apply @@ -97,9 +100,9 @@ Use `--model` to select a Claude model: `sonnet` (default), `opus`, `haiku`, or The default scoring method is **Copeland pairwise ranking**. Every agent is compared head-to-head against every other agent across four criteria: tests passed, convergence group size, minimal file scope, and test files contributed. The agent that wins the most pairwise matchups is recommended. -An alternative `--scoring weighted` method is also available, which assigns point values to tests (100), convergence (50), and diff size (10). +An alternative `--scoring weighted` method is also available, which assigns point values to tests (100), convergence (50), and diff size (10). A third method, **Borda count** (rank aggregation), is available for comparison via `thinktank evaluate`. -Use `thinktank evaluate` to compare how different scoring methods rank your results. See [docs/scoring-evaluation.md](docs/scoring-evaluation.md) for the full analysis. +Use `thinktank evaluate` to compare how all three scoring methods rank your results. See [docs/scoring-evaluation.md](docs/scoring-evaluation.md) for the full analysis. ## Why this works @@ -118,7 +121,7 @@ The key insight: **parallel attempts cost more tokens but not more time.** All a - **High-stakes changes** — auth, payments, security, data migrations - **Ambiguous tasks** — multiple valid approaches, need to see the spread - **Complex refactors** — many files, easy to miss something -- **Unfamiliar codebases** — agents might go the wrong direction +- **Unfamiliar codebases** — multiple attempts reduce the chance of going down the wrong path ## Commands @@ -131,11 +134,12 @@ Run N parallel agents on a task. | `-n, --attempts ` | Number of parallel agents (default: 3, max: 20) | | `-f, --file ` | Read prompt from a file | | `-t, --test-cmd ` | Test command to verify results | -| `--test-timeout ` | Timeout for test command (default: 120s) | -| `--timeout ` | Timeout per agent (default: 600s) | +| `--test-timeout ` | Timeout for test command in seconds (default: 120, max: 600) | +| `--timeout ` | Timeout per agent in seconds (default: 600, max: 1800) | | `--model ` | Claude model: sonnet, opus, haiku, or full ID | +| `-r, --runner ` | AI coding tool to use (default: claude-code) | | `--scoring ` | Scoring method: `copeland` (default) or `weighted` | -| `--threshold <0-1>` | Convergence clustering similarity threshold | +| `--threshold ` | Convergence clustering similarity threshold, 0.0–1.0 (default: 0.3) | | `--whitespace-insensitive` | Ignore whitespace in convergence comparison | | `--retry` | Re-run only failed/timed-out agents from the last run | | `--output-format ` | Output format: `text` (default) or `json` | @@ -148,9 +152,9 @@ Apply the recommended agent's changes to your working tree. | Flag | Description | |------|-------------| -| `-a, --agent ` | Apply a specific agent's result | +| `-a, --agent ` | Apply a specific agent's result instead of the recommended one | | `-p, --preview` | Show the diff without applying | -| `-d, --dry-run` | Show what would be applied without making changes | +| `-d, --dry-run` | Same as `--preview` (alias) | ### `thinktank undo` @@ -196,6 +200,13 @@ thinktank config list # show all values Available keys: `attempts`, `model`, `timeout`, `runner`, `threshold`, `testTimeout`. +## Pre-flight checks + +Before spawning agents, thinktank validates the environment: + +1. **Disk space** — warns if there isn't enough room for N worktrees +2. **Test suite** — if `--test-cmd` is set, runs the tests once on the main branch to verify the suite passes before spending tokens on parallel agents + ## Example output ``` @@ -226,11 +237,11 @@ Copeland Pairwise Scoring ──────────────────────────────────────────────────────────── Agent Tests Converge Scope TestCov Copeland ────────────────────────────────────────────────────────── ->> #1 +3 +1 0 +1 +4 - #2 +3 +1 0 +1 +4 - #3 +3 -4 -4 +1 -4 - #4 -4 +1 +4 -4 -4 - #5 +3 +1 0 +1 +4 +>> #1 +1 +2 0 +1 +4 + #2 +1 +2 0 0 +3 + #3 +1 -3 -4 0 -6 + #4 -4 -3 +4 -1 -4 + #5 +1 +2 0 0 +3 Recommended: Agent #1 (Copeland winner) ``` diff --git a/src/runners/claude-code.ts b/src/runners/claude-code.ts index 1ce9b59..75fc715 100644 --- a/src/runners/claude-code.ts +++ b/src/runners/claude-code.ts @@ -1,4 +1,6 @@ import { spawn } from "node:child_process"; +import { readFile, writeFile } from "node:fs/promises"; +import { join } from "node:path"; import type { AgentResult } from "../types.js"; import { getDiff, getDiffStats } from "../utils/git.js"; import type { Runner, RunnerOptions } from "./base.js"; @@ -20,6 +22,15 @@ export const claudeCodeRunner: Runner = { async run(id: number, opts: RunnerOptions): Promise { const start = Date.now(); + // Backup the .git pointer file — agents sometimes delete it during long runs + const gitFilePath = join(opts.worktreePath, ".git"); + let gitFileBackup: string | null = null; + try { + gitFileBackup = await readFile(gitFilePath, "utf-8"); + } catch { + // Not a worktree or .git is a directory — skip backup + } + return new Promise((resolve) => { let output = ""; let error = ""; @@ -86,6 +97,15 @@ export const claudeCodeRunner: Runner = { if (settled) return; settled = true; + // Restore .git file if the agent deleted it during execution + if (gitFileBackup) { + try { + await readFile(gitFilePath, "utf-8"); + } catch { + await writeFile(gitFilePath, gitFileBackup).catch(() => {}); + } + } + const duration = Date.now() - start; const diff = await getDiff(opts.worktreePath); const stats = await getDiffStats(opts.worktreePath);