Skip to content

Commit 37f1b46

Browse files
authored
feat: AutoResearch framework + V2EX test suite (60 tasks, SKILL.md optimization) (#717)
* feat: AutoResearch framework + V2EX test suite (40 tasks)

AutoResearch framework (Karpathy-style autonomous iteration):
- engine.ts: 8-phase loop (review → modify → commit → verify → guard → decide → log)
- config.ts: typed config + CLI parser + metric extraction
- logger.ts: TSV append-only results log
- commands/run.ts: main loop spawning Claude Code per iteration
- commands/plan.ts: interactive config wizard
- commands/fix.ts: auto-detect broken state, iteratively fix
- commands/debug.ts: hypothesis-driven debugging for failing tasks

V2EX test suite (5 layers, 40 tasks):
- L1 Atomic (10): open, state, click, scroll, eval, back, wait
- L2 Single Page (10): hot topics, node list, topic meta, pagination
- L3 Multi-Step (10): click-read, navigate-node, tab-then-topic, pagination
- L4 Write Ops (5): reply typing, favorite detection, form detection
- L5 Complex Chain (5): cross-page collect, multi-node compare, full workflow

Presets: operate-reliability, skill-quality, v2ex-reliability

* test: V2EX test suite 60/60 — fix selectors, add harder tasks

- Fix v2ex-collect-hot-authors selector (pathname-based member link detection)
- Fix v2ex-wait-text judge (accept "appeared")
- Fix trailing commas in eval step strings
- Add 20 harder tasks: state+click interaction + long chain workflows
- Baseline: 60/60 across all layers

* docs: optimize SKILL.md for efficiency — aggressive chaining, minimize turns

- Add Rule #7: minimize total tool calls (3-5 per task, not 15-20)
- Strengthen Rule #5: chain aggressively with &&
- Add explicit good/bad chaining examples
- Add click+wait+state chaining pattern
- Add type+verify chaining pattern

Before: 21 turns for complex V2EX reply task
After: 12 turns for same task (-43% turns, -28% cost)
1 parent 2d005d1 commit 37f1b46

15 files changed

Lines changed: 2175 additions & 6 deletions

File tree

autoresearch/commands/debug.ts

Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
#!/usr/bin/env npx tsx
2+
/**
3+
* /autoresearch:debug — Hypothesis-driven debugging for specific failing tasks.
4+
*
5+
* Scientific method: Gather → Hypothesize → Test → Classify → Log → Repeat
6+
*
7+
* Usage:
8+
* npx tsx autoresearch/commands/debug.ts --task extract-npm-description
9+
* npx tsx autoresearch/commands/debug.ts --task bench-imdb-matrix --iterations 5
10+
*/
11+
12+
import { execSync } from 'node:child_process';
13+
import { readFileSync, appendFileSync, writeFileSync, existsSync } from 'node:fs';
14+
import { join, dirname } from 'node:path';
15+
import { fileURLToPath } from 'node:url';
16+
import { parseArgs } from '../config.js';
17+
18+
const __dirname = dirname(fileURLToPath(import.meta.url));
19+
const ROOT = join(__dirname, '..', '..');
20+
const TASKS_FILE = join(__dirname, '..', 'browse-tasks.json');
21+
const DEBUG_LOG = join(ROOT, 'debug-results.tsv');
22+
23+
/** One entry of browse-tasks.json as read by this command. */
interface BrowseTask {
  /** Identifier compared against the `--task` CLI argument. */
  name: string;
  /** Shell commands run in order; the output of the last one is what gets inspected. */
  steps: string[];
  /** Pass/fail criteria; only `type` is mandatory. */
  judge: {
    type: string;
    value?: string;
    minLength?: number;
    pattern?: string;
  };
}
28+
29+
/**
 * Run a shell command from the repository root and return its trimmed stdout.
 *
 * Never throws: when the command fails (non-zero exit, 30 s timeout, ...) the
 * stdout captured so far is returned instead, falling back to the error
 * message and finally to an empty string.
 *
 * @param cmd Shell command line to execute.
 * @returns Trimmed stdout, or the best available failure text.
 */
function exec(cmd: string): string {
  try {
    const stdout = execSync(cmd, {
      cwd: ROOT,
      timeout: 30_000,
      encoding: 'utf-8',
      stdio: ['pipe', 'pipe', 'pipe'],
    });
    return stdout.trim();
  } catch (failure: any) {
    const captured = failure.stdout?.trim();
    // Only fall back when stdout is missing entirely; an empty string is kept.
    if (captured != null) return captured;
    return failure.message ?? '';
  }
}
39+
40+
/** Create the debug log with its title and TSV header row unless the file already exists. */
function initLog(): void {
  if (existsSync(DEBUG_LOG)) return;

  const header = '# AutoResearch Debug Log\niteration\ttask\thypothesis\tresult\tverdict\tdescription\n';
  writeFileSync(DEBUG_LOG, header, 'utf-8');
}
45+
46+
/**
 * Append one row to the debug TSV log.
 *
 * Every field is flattened to a single line first: tabs and line breaks inside
 * a value (command output and model text routinely contain them) would
 * otherwise be read back as extra columns or extra rows.
 *
 * @param iteration   1-based debug iteration number.
 * @param task        Name of the task being debugged.
 * @param hypothesis  Hypothesis text for this iteration.
 * @param result      Short excerpt of the task output.
 * @param verdict     Classification of the hypothesis.
 * @param description Free-form explanation.
 */
function appendLog(iteration: number, task: string, hypothesis: string, result: string, verdict: string, description: string): void {
  const flatten = (field: string): string => field.replace(/[\t\r\n]+/g, ' ').trim();
  const row = [String(iteration), task, hypothesis, result, verdict, description]
    .map(flatten)
    .join('\t');
  appendFileSync(DEBUG_LOG, `${row}\n`, 'utf-8');
}
49+
50+
/**
 * CLI entry point for `/autoresearch:debug`.
 *
 * Without `--task`: prints usage, runs every task and lists those whose last
 * step produced no output, then exits with status 1.
 *
 * With `--task`: runs the task once to capture its output, then asks Claude
 * Code for one hypothesis per iteration (at most `--iterations`, default 10).
 * Each verdict is appended to DEBUG_LOG and the loop stops at the first
 * CONFIRMED verdict. The task is re-run after every iteration so the next
 * prompt sees fresh output.
 */
async function main(): Promise<void> {
  const args = parseArgs(process.argv.slice(2));
  const taskName = args.task;
  const maxIterations = args.iterations ?? 10;

  // Best-effort reset of the browser session between runs.
  const closeSession = (): void => {
    try { exec('opencli operate close'); } catch { /* nothing to close */ }
  };
  // Execute every step in order and return the output of the last one.
  const runSteps = (steps: string[]): string => {
    let output = '';
    for (const step of steps) output = exec(step);
    return output;
  };

  if (!taskName) {
    console.error('Usage: npx tsx autoresearch/commands/debug.ts --task <task-name> [--iterations N]');
    console.error('\nAvailable tasks:');
    const tasks: BrowseTask[] = JSON.parse(readFileSync(TASKS_FILE, 'utf-8'));
    // Show only failing tasks
    for (const task of tasks) {
      closeSession();
      const passed = runSteps(task.steps).trim().length > 0; // simplified check
      if (!passed) console.error(` ✗ ${task.name}`);
    }
    process.exit(1);
  }

  const tasks: BrowseTask[] = JSON.parse(readFileSync(TASKS_FILE, 'utf-8'));
  const task = tasks.find(t => t.name === taskName);
  if (!task) {
    console.error(`Task not found: ${taskName}`);
    process.exit(1);
  }

  console.log(`\n🔍 AutoResearch Debug: ${taskName}`);
  console.log(` Steps: ${task.steps.length}`);
  console.log(` Judge: ${task.judge.type}${task.judge.value ? ` "${task.judge.value}"` : ''}`);
  console.log(` Max iterations: ${maxIterations}\n`);

  initLog();

  // Phase 1: Gather — run the task and capture output
  console.log('Phase 1: Gathering symptoms...');
  closeSession();

  let lastOutput = '';
  for (let i = 0; i < task.steps.length; i++) {
    const step = task.steps[i];
    console.log(` Step ${i + 1}: ${step.slice(0, 80)}`);
    lastOutput = exec(step);
    if (i < task.steps.length - 1) {
      console.log(` → ${lastOutput.slice(0, 100)}`);
    }
  }
  console.log(`\n Final output: ${lastOutput.slice(0, 200)}`);
  console.log(` Judge expects: ${JSON.stringify(task.judge)}`);

  // The prompt embeds task steps and raw page output. Passing it as a single
  // argv entry (no shell) keeps `$(...)`, backticks, quotes and backslashes in
  // that text from being interpreted as shell syntax.
  const { execFileSync } = await import('node:child_process');
  const claudeFlags = [
    '-p',
    '--dangerously-skip-permissions',
    '--allowedTools', 'Bash(opencli:*),Bash(npm:*),Read,Grep,Glob',
    '--output-format', 'text',
    '--no-session-persistence',
  ];

  // Phase 2: Hypothesize + investigate via Claude Code
  for (let iter = 1; iter <= maxIterations; iter++) {
    console.log(`\n━━━ Debug Iteration ${iter}/${maxIterations} ━━━`);

    const prompt = [
      'You are debugging a failing browser automation task.',
      '',
      `## Task: ${taskName}`,
      'Steps:',
      task.steps.map((s, i) => ` ${i + 1}. ${s}`).join('\n'),
      '',
      '## Judge criteria',
      JSON.stringify(task.judge),
      '',
      '## Last output',
      lastOutput.slice(0, 500),
      '',
      '## Instructions',
      '1. Form a SPECIFIC, FALSIFIABLE hypothesis about why this task fails',
      '2. Run the MINIMUM experiment to test your hypothesis (e.g. run one step, check output)',
      '3. Classify: CONFIRMED (bug found), DISPROVEN (try different hypothesis), INCONCLUSIVE',
      '4. If CONFIRMED: describe the root cause and suggest a fix',
      '5. Output format: one line "HYPOTHESIS: ...", one line "RESULT: CONFIRMED|DISPROVEN|INCONCLUSIVE — ..."',
      '',
      'Do NOT fix the code — just diagnose. Use opencli operate commands to investigate.',
    ].join('\n');

    try {
      const result = execFileSync('claude', [...claudeFlags, prompt], {
        cwd: ROOT,
        timeout: 120_000,
        encoding: 'utf-8',
        stdio: ['pipe', 'pipe', 'pipe'],
      }).trim();

      // Extract hypothesis and result. The prompt asks for an em dash after the
      // verdict, so accept hyphen, en dash and em dash alike.
      const hypMatch = result.match(/HYPOTHESIS:\s*(.+)/i);
      const resMatch = result.match(/RESULT:\s*(CONFIRMED|DISPROVEN|INCONCLUSIVE)\s*[-–—]\s*(.+)/i);

      const hypothesis = hypMatch?.[1]?.trim() ?? 'unknown';
      // The match is case-insensitive; normalise so the CONFIRMED check below
      // also fires for "Confirmed" / "confirmed".
      const verdict = resMatch?.[1]?.trim().toUpperCase() ?? 'INCONCLUSIVE';
      const description = resMatch?.[2]?.trim() ?? result.split('\n').pop()?.trim() ?? '';

      console.log(` Hypothesis: ${hypothesis.slice(0, 100)}`);
      console.log(` Verdict: ${verdict} — ${description.slice(0, 100)}`);

      appendLog(iter, taskName, hypothesis, lastOutput.slice(0, 50), verdict, description);

      if (verdict === 'CONFIRMED') {
        console.log(`\n✅ Root cause found at iteration ${iter}!`);
        console.log(` ${description}`);
        break;
      }
    } catch (err: unknown) {
      const message = err instanceof Error ? err.message : String(err);
      console.error(` Error: ${message.slice(0, 100)}`);
      appendLog(iter, taskName, 'error', '', 'CRASH', message.slice(0, 80));
    }

    // Re-run task for fresh output
    closeSession();
    for (const step of task.steps) lastOutput = exec(step);
  }

  closeSession();
  console.log(`\nDebug log saved to: ${DEBUG_LOG}\n`);
}

main();

autoresearch/commands/fix.ts

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
#!/usr/bin/env npx tsx
2+
/**
3+
* /autoresearch:fix — Iterative error elimination.
4+
*
5+
* Auto-detects broken state (build → test → browse tests) and iteratively
6+
* fixes errors one at a time. Stops when error count reaches 0.
7+
*
8+
* Priority: build errors → test failures → browse task failures
9+
*
10+
* Usage:
11+
* npx tsx autoresearch/commands/fix.ts
12+
* npx tsx autoresearch/commands/fix.ts --iterations 10
13+
*/
14+
15+
import { execSync } from 'node:child_process';
16+
import { join, dirname } from 'node:path';
17+
import { fileURLToPath } from 'node:url';
18+
import { parseArgs } from '../config.js';
19+
import { Engine, type ModifyContext } from '../engine.js';
20+
21+
const __dirname = dirname(fileURLToPath(import.meta.url));
22+
const ROOT = join(__dirname, '..', '..');
23+
24+
/**
 * Run a shell command from the repository root (120 s timeout) without throwing.
 *
 * @param cmd Shell command line to execute.
 * @returns `ok: true` with trimmed stdout on success; `ok: false` with stdout
 *   and stderr joined by a newline when the command fails.
 */
function exec(cmd: string): { ok: boolean; output: string } {
  let stdout: string;
  try {
    stdout = execSync(cmd, {
      cwd: ROOT,
      timeout: 120_000,
      encoding: 'utf-8',
      stdio: ['pipe', 'pipe', 'pipe'],
    }).trim();
  } catch (failure: any) {
    const combined = (failure.stdout ?? '') + '\n' + (failure.stderr ?? '');
    return { ok: false, output: combined };
  }
  return { ok: true, output: stdout };
}
35+
36+
/**
 * Probe the project in priority order (build → tests → browse tasks) and
 * report the first stage that is broken.
 *
 * @returns The shell command that re-measures the error count, the error count
 *   observed right now, and a human-readable summary; `null` when no stage
 *   reported a failure.
 */
function detectBrokenState(): { verify: string; errors: number; description: string } | null {
  // 1. Build
  const build = exec('npm run build 2>&1');
  if (!build.ok) {
    // A failed build with no "error TS" line still counts as one error.
    const errorCount = (build.output.match(/error TS/g) || []).length || 1;
    return {
      verify: 'npm run build 2>&1 | grep -c "error TS" || echo 0',
      errors: errorCount,
      description: `${errorCount} TypeScript build error(s)`,
    };
  }

  // 2. Tests
  const test = exec('npm test 2>&1');
  if (!test.ok) {
    const failMatch = test.output.match(/(\d+)\s+fail/i);
    const errorCount = failMatch ? parseInt(failMatch[1], 10) : 1;
    return {
      // Two portable `grep -Eo` stages instead of `grep -oP`: BSD/macOS grep has
      // no -P, and its failure would make `|| echo 0` report zero failures.
      // The second grep exits non-zero when nothing matched, so the fallback
      // still prints 0 in that case.
      verify: 'npm test 2>&1 | grep -Eo "[0-9]+ fail" | grep -Eo "[0-9]+" || echo 0',
      errors: errorCount,
      description: `${errorCount} test failure(s)`,
    };
  }

  // 3. Browse tests
  const browse = exec('npx tsx autoresearch/eval-browse.ts 2>&1');
  const scoreMatch = browse.output.match(/SCORE=(\d+)\/(\d+)/);
  // NOTE(review): when the output carries no SCORE= line (for example because
  // the eval script itself crashed) this stage is treated as clean — confirm
  // that is intended.
  if (scoreMatch) {
    const passed = parseInt(scoreMatch[1], 10);
    const total = parseInt(scoreMatch[2], 10);
    const failures = total - passed;
    if (failures > 0) {
      return {
        verify: 'npx tsx autoresearch/eval-browse.ts 2>&1 | tail -1',
        errors: failures,
        description: `${failures} browse task failure(s) (${passed}/${total})`,
      };
    }
  }

  return null; // all clean
}
79+
80+
/**
 * CLI entry point for `/autoresearch:fix`.
 *
 * Detects the first broken stage, then drives the Engine with an
 * `error_count` metric (lower is better) for at most `--iterations` rounds
 * (default 20). Each round asks Claude Code to make one focused fix. Exits
 * with status 1 if the engine itself throws.
 */
async function main(): Promise<void> {
  const args = parseArgs(process.argv.slice(2));
  const maxIterations = args.iterations ?? 20;

  console.log('\n🔧 AutoResearch Fix — Detecting broken state...\n');

  const broken = detectBrokenState();
  if (!broken) {
    console.log(' ✓ All clean — nothing to fix!\n');
    return;
  }

  console.log(` Found: ${broken.description}`);
  console.log(` Verify: ${broken.verify}\n`);

  const config = {
    goal: `Fix all errors: ${broken.description}`,
    scope: ['src/**/*.ts', 'extension/src/**/*.ts'],
    metric: 'error_count',
    direction: 'lower' as const,
    verify: broken.verify,
    guard: 'npm run build',
    iterations: maxIterations,
    minDelta: 1,
  };

  // The prompt carries engine-supplied text (the stuck hint). Handing it over
  // as one argv entry, without a shell, means quotes, `$`, backticks and
  // backslashes in that text cannot break or alter the command line.
  const { execFileSync } = await import('node:child_process');
  const claudeFlags = [
    '-p',
    '--dangerously-skip-permissions',
    '--allowedTools', 'Bash(npm:*),Bash(npx:*),Read,Edit,Write,Glob,Grep',
    '--output-format', 'text',
    '--no-session-persistence',
  ];

  const logPath = join(ROOT, 'autoresearch-results.tsv');
  const engine = new Engine(config, logPath, {
    // Ask Claude Code for a single fix; resolves to a short summary line, or
    // null when the invocation failed.
    modify: async (ctx: ModifyContext) => {
      const prompt = [
        `Fix ONE error. Current error count: ${ctx.currentMetric}. Goal: 0 errors.`,
        '',
        'Read the error output, understand the root cause, and make ONE focused fix.',
        'Do NOT fix multiple unrelated errors at once.',
        'Do NOT modify test files.',
        '',
        ctx.stuckHint ? `STUCK HINT: ${ctx.stuckHint}` : '',
      ].join('\n');

      try {
        const result = execFileSync('claude', [...claudeFlags, prompt], {
          cwd: ROOT,
          timeout: 180_000,
          encoding: 'utf-8',
          stdio: ['pipe', 'pipe', 'pipe'],
        }).trim();
        // The last non-blank output line serves as the change description.
        const lines = result.split('\n').filter(l => l.trim());
        return lines[lines.length - 1]?.trim()?.slice(0, 120) || 'fix attempt';
      } catch {
        return null;
      }
    },
    onStatus: (msg) => console.log(msg),
  });

  try {
    const results = await engine.run();
    // With no recorded results, fall back to the count seen during detection.
    const finalMetric = results[results.length - 1]?.metric ?? broken.errors;
    if (finalMetric === 0) {
      console.log('\n✅ All errors fixed!\n');
    } else {
      console.log(`\n⚠ ${finalMetric} error(s) remaining after ${maxIterations} iterations.\n`);
    }
  } catch (err: unknown) {
    const message = err instanceof Error ? err.message : String(err);
    console.error(`\n❌ ${message}`);
    process.exit(1);
  }
}

main();

autoresearch/commands/plan.ts

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
#!/usr/bin/env npx tsx
2+
/**
3+
* /autoresearch:plan — Interactive configuration wizard.
4+
*
5+
* Walks through goal, scope, metric, verify, guard settings
6+
* and outputs a ready-to-paste run command.
7+
*
8+
* Usage:
9+
* npx tsx autoresearch/commands/plan.ts
10+
*/
11+
12+
import { execSync } from 'node:child_process';
13+
import { createInterface } from 'node:readline';
14+
import { join, dirname } from 'node:path';
15+
import { fileURLToPath } from 'node:url';
16+
import { PRESETS } from '../presets/index.js';
17+
18+
const __dirname = dirname(fileURLToPath(import.meta.url));
19+
const ROOT = join(__dirname, '..', '..');
20+
21+
// Readline interface bound to the terminal; main() closes it when the wizard ends.
const rl = createInterface({ input: process.stdin, output: process.stdout });

/** Show the prompt `q` and resolve with the line the user enters. */
const ask = (q: string): Promise<string> =>
  new Promise((resolve) => {
    rl.question(q, resolve);
  });
23+
24+
/**
 * Wrap `value` in single quotes so a POSIX shell passes it through verbatim;
 * embedded single quotes are emitted as `'\''`.
 */
function shellQuote(value: string): string {
  return `'${value.replace(/'/g, `'\\''`)}'`;
}

/**
 * Interactive configuration wizard.
 *
 * Offers the built-in presets first; otherwise collects goal, scope, metric,
 * direction, verify and guard settings, dry-runs the verify command, and
 * prints a ready-to-paste `run.ts` command line. The readline interface is
 * closed on every exit path.
 */
async function main(): Promise<void> {
  try {
    console.log('\n🔬 AutoResearch — Configuration Wizard\n');

    // Offer presets first
    const presetNames = Object.keys(PRESETS);
    console.log('Available presets:');
    presetNames.forEach((name, i) => {
      console.log(` [${i + 1}] ${name} — ${PRESETS[name].goal}`);
    });
    console.log(` [0] Custom config\n`);

    const choice = await ask('Choose preset or 0 for custom: ');
    const idx = parseInt(choice, 10);

    // Non-numeric input parses to NaN and falls through to the custom flow.
    if (idx > 0 && idx <= presetNames.length) {
      const name = presetNames[idx - 1];
      const iterations = await ask('Iterations (empty = unbounded): ');
      const iterFlag = iterations ? ` --iterations ${iterations}` : '';
      console.log(`\n✅ Ready to run:\n`);
      console.log(` npx tsx autoresearch/commands/run.ts --preset ${name}${iterFlag}\n`);
      return;
    }

    // Custom config
    const goal = await ask('Goal (what to improve): ');
    const scope = await ask('Scope (file globs, comma-separated): ');
    const metric = await ask('Metric name (e.g. pass_count, coverage): ');

    // Keep asking until the answer is one of the two accepted values, so the
    // generated command never carries an invalid --direction.
    let direction = '';
    while (direction !== 'higher' && direction !== 'lower') {
      direction = (await ask('Direction (higher/lower): ')).trim().toLowerCase();
    }

    const verify = await ask('Verify command (must output a number): ');

    // Dry-run verify
    console.log('\n Dry-running verify command...');
    try {
      const output = execSync(verify, { cwd: ROOT, timeout: 120_000, encoding: 'utf-8', stdio: ['pipe', 'pipe', 'pipe'] }).trim();
      const { extractMetric } = await import('../config.js');
      const value = extractMetric(output);
      if (value != null) {
        console.log(` ✓ Verify works — current ${metric}: ${value}`);
      } else {
        console.log(` ⚠ Verify ran but no number extracted from output:\n ${output.slice(0, 200)}`);
      }
    } catch (err: unknown) {
      const message = err instanceof Error ? err.message : String(err);
      console.log(` ✗ Verify failed: ${message.slice(0, 100)}`);
    }

    const guard = await ask('Guard command (optional, press Enter to skip): ');
    const iterations = await ask('Iterations (empty = unbounded): ');

    // Free-text answers are single-quoted: verify/guard pipelines usually
    // contain double quotes, `$` or backticks, which would corrupt a
    // double-quoted argument when the command is pasted into a shell.
    const parts = ['npx tsx autoresearch/commands/run.ts'];
    parts.push(`--goal ${shellQuote(goal)}`);
    parts.push(`--scope ${shellQuote(scope)}`);
    parts.push(`--metric ${shellQuote(metric)}`);
    parts.push(`--direction ${direction}`);
    parts.push(`--verify ${shellQuote(verify)}`);
    if (guard) parts.push(`--guard ${shellQuote(guard)}`);
    if (iterations) parts.push(`--iterations ${iterations}`);

    console.log(`\n✅ Ready to run:\n`);
    console.log(` ${parts.join(' \\\n ')}\n`);
  } finally {
    rl.close();
  }
}

main();

0 commit comments

Comments
 (0)