Skip to content

Commit d4e5ad3

Browse files
committed
feat(benchmark): 添加真实仓库基准测试工具和更详细的无头事件
- 新增真实仓库基准测试工具,包含三类固定任务(分析、窄范围修复、跨模块修复)
- 增强无头事件协议,添加阶段事件以区分搜索、检查、目标命中等状态
- 改进工具事件,包含目标路径和工具类型信息
- 更新确认提示组件,根据规划模式和最大轮次调整快捷键提示
- 添加基准测试相关脚本和测试覆盖
- 修复类型安全问题和测试中的生成器函数
1 parent deabaee commit d4e5ad3

22 files changed

Lines changed: 1181 additions & 44 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ blade-context/
7171

7272
# Local settings
7373
.blade/settings.local.json
74+
.blade/benchmarks/
7475
.claude/settings.local.json
7576

7677
# Monorepo - 各包的构建输出

packages/cli/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
"test:security": "node scripts/test.js security",
3030
"test:coverage": "node scripts/test.js all --coverage",
3131
"test:watch": "vitest --watch --config vitest.config.ts",
32+
"benchmark:repo": "node scripts/run-bun.js run scripts/run-real-repo-benchmark.ts",
3233
"test:update-snapshots": "node scripts/test.js snapshot --update",
3334
"lint": "biome lint src tests",
3435
"lint:fix": "biome lint --write src tests",
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import {
2+
DEFAULT_REAL_REPO_BENCHMARK_CASES,
3+
runRealRepoBenchmark,
4+
} from '../src/commands/headlessBenchmark.js';
5+
/**
 * Looks up the value supplied for a command-line flag in `process.argv`.
 *
 * Both the space-separated form (`--model gpt`) and the inline form
 * (`--model=gpt`) are accepted; the first matching occurrence wins.
 *
 * @param flag - The flag to look for, including its leading dashes.
 * @returns The flag's value, or `undefined` when the flag is absent, is the
 *   last argument, or is immediately followed by another `--` option.
 */
function getArgValue(flag: string): string | undefined {
  const inlinePrefix = `${flag}=`;
  const args = process.argv;

  for (const [position, arg] of args.entries()) {
    if (arg === flag) {
      const next = args[position + 1];
      // A following option means the flag was given without a value; do not
      // mistake that option's name for this flag's value.
      if (next === undefined || next.startsWith('--')) return undefined;
      return next;
    }
    if (arg.startsWith(inlinePrefix)) {
      return arg.slice(inlinePrefix.length);
    }
  }

  return undefined;
}
11+
/**
 * Runs the real-repo benchmark using the optional `--model` and
 * `--history-path` command-line values, then prints one pipe-delimited line
 * per benchmark case followed by a pipe-delimited summary line.
 */
async function main() {
  const outcome = await runRealRepoBenchmark({
    model: getArgValue('--model'),
    historyPath: getArgValue('--history-path'),
  });

  console.log('Real repo benchmark completed.');
  console.log(`History: ${outcome.historyPath}`);

  // Per-case metrics, one line each.
  for (const entry of outcome.results) {
    const fields = [
      `${entry.label}`,
      `success=${entry.success}`,
      `duration_ms=${entry.durationMs.toFixed(1)}`,
      `tokens=${entry.totalTokens}`,
      `read_files=${entry.readFilesCount}`,
      `blind_search=${entry.blindSearchEvents}`,
      `target_hit=${entry.targetHitEvents}`,
    ];
    console.log(fields.join(' | '));
  }

  // Aggregate metrics across all cases.
  const summaryFields = [
    'summary',
    `success_rate=${(outcome.summary.successRate * 100).toFixed(1)}%`,
    `avg_duration_ms=${outcome.summary.averageDurationMs.toFixed(1)}`,
    `avg_tokens=${outcome.summary.averageTotalTokens.toFixed(1)}`,
    `avg_read_files=${outcome.summary.averageReadFilesCount.toFixed(1)}`,
    `cases=${DEFAULT_REAL_REPO_BENCHMARK_CASES.length}`,
  ];
  console.log(summaryFields.join(' | '));
}
43+
// Script entry point. Report a failed run explicitly and signal it through the
// exit code instead of leaving the promise rejection unhandled.
main().catch((error: unknown) => {
  console.error('Real repo benchmark failed:', error);
  process.exitCode = 1;
});

packages/cli/src/agent/loop/StreamingToolExecutor.ts

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,11 @@ function combineAbortSignals(...signals: AbortSignal[]): AbortSignal {
5050
if (validSignals.length === 1) return validSignals[0];
5151

5252
// Use AbortSignal.any if available (Node 20+)
53-
if ('any' in AbortSignal && typeof (AbortSignal as any).any === 'function') {
54-
return (AbortSignal as any).any(validSignals);
53+
const abortSignalWithAny = AbortSignal as typeof AbortSignal & {
54+
any?: (signals: AbortSignal[]) => AbortSignal;
55+
};
56+
if (typeof abortSignalWithAny.any === 'function') {
57+
return abortSignalWithAny.any(validSignals);
5558
}
5659
// Fallback: manual composite
5760
for (const s of validSignals) {
@@ -356,4 +359,4 @@ export class StreamingToolExecutor {
356359
this.activeAborts.delete(toolCall.id);
357360
}
358361
}
359-
}
362+
}

packages/cli/src/commands/headless.ts

Lines changed: 228 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,25 @@ interface HeadlessStreamSnapshot {
104104
wroteAssistantContent: boolean;
105105
}
106106

/** Optional details attached to a phase event. */
interface HeadlessPhaseContext {
  /** Turn number the phase event belongs to. */
  turn?: number;
  /** Name of the tool that triggered the phase event. */
  toolName?: string;
  /** Target (path, pattern, URL, ...) extracted from the tool's parameters. */
  target?: string;
}

/** Names of the phases a headless run can report. */
type HeadlessPhaseName =
  | 'turn'
  | 'searching'
  | 'inspecting'
  | 'target_hit'
  | 'executing'
  | 'completed';

/** Status reported alongside a phase name. */
type HeadlessPhaseStatus = 'ongoing' | 'hit' | 'done';

/** Mutable phase-tracking state kept for the duration of a headless run. */
interface HeadlessPhaseState {
  /** Whether a target has already been locked during this run. */
  targetLocked: boolean;
}
125+
107126
class HeadlessStreamState {
108127
private openedThinking = false;
109128
private wroteAssistantContent = false;
@@ -245,6 +264,113 @@ function resolveOutputFormat(outputFormat?: string): HeadlessOutputFormat {
245264
return outputFormat === 'jsonl' ? 'jsonl' : 'text';
246265
}
247266

/**
 * For each tool with a known target, the parameter names to inspect, in
 * priority order. Tools not listed here have no extractable target.
 */
const TOOL_TARGET_PARAM_KEYS: ReadonlyMap<string, readonly string[]> = new Map([
  ['Read', ['file_path']],
  ['Edit', ['file_path']],
  ['Write', ['file_path']],
  ['UndoEdit', ['file_path']],
  ['NotebookEdit', ['notebook_path']],
  ['Grep', ['path', 'pattern']],
  ['Glob', ['pattern']],
  ['WebFetch', ['url']],
  ['WebSearch', ['query']],
  ['Bash', ['description', 'command']],
  ['Task', ['description']],
  ['LSP', ['filePath', 'operation']],
]);

/**
 * Picks the value that best identifies what a tool call operates on (a file
 * path, search pattern, URL, command description, ...).
 *
 * @param toolName - Name of the tool being called.
 * @param params - Parsed tool arguments.
 * @returns The first non-blank string among the tool's candidate parameters,
 *   returned untrimmed; `undefined` when the tool is not recognised, no
 *   candidate holds a non-blank string, or `params` is not an object.
 */
function extractToolTarget(
  toolName: string,
  params: Record<string, unknown>
): string | undefined {
  // Callers pass the result of JSON.parse, which can legitimately be null or a
  // primitive; indexing null would throw, so treat those as "no target".
  if (params === null || typeof params !== 'object') {
    return undefined;
  }

  const candidateKeys = TOOL_TARGET_PARAM_KEYS.get(toolName);
  if (candidateKeys === undefined) {
    return undefined;
  }

  for (const key of candidateKeys) {
    const value = params[key];
    if (typeof value === 'string' && value.trim() !== '') {
      return value;
    }
  }
  return undefined;
}
307+
/** Tools reported as `searching` while no target is locked. */
const PHASE_SEARCH_TOOLS: ReadonlySet<string> = new Set([
  'Glob',
  'Grep',
  'WebSearch',
  'LS',
]);

/** Tools reported as `inspecting` while no target is locked. */
const PHASE_READ_TOOLS: ReadonlySet<string> = new Set(['Read', 'WebFetch']);

/** Tools whose first use locks the target. */
const PHASE_ACTION_TOOLS: ReadonlySet<string> = new Set([
  'Edit',
  'Write',
  'NotebookEdit',
  'Bash',
  'LSP',
  'UndoEdit',
]);

/**
 * Classifies a tool call into the phase event that should be reported for it.
 *
 * This function does not mutate `state`; the caller is expected to set
 * `state.targetLocked` when `shouldLockTarget` is true.
 *
 * @param toolName - Name of the tool being called.
 * @param summary - Human-readable summary of the call, embedded in `message`.
 * @param target - Target extracted from the call's parameters, if any. Only
 *   affects the wording of the message once a target is locked.
 * @param state - Current phase-tracking state.
 * @returns The phase name, status, and message to report, plus whether this
 *   call is the one that locks the target.
 */
function getPhaseForTool(
  toolName: string,
  summary: string,
  target: string | undefined,
  state: HeadlessPhaseState
): {
  phase: HeadlessPhaseName;
  status: HeadlessPhaseStatus;
  message: string;
  shouldLockTarget: boolean;
} {
  // Once a target is locked, every tool call counts as work on that target.
  if (state.targetLocked) {
    return {
      phase: 'executing',
      status: 'hit',
      message: target ? `Working within target: ${summary}` : `Executing: ${summary}`,
      shouldLockTarget: false,
    };
  }

  // From here on no target is locked, so every remaining status is either the
  // locking 'hit' or 'ongoing'.
  if (PHASE_ACTION_TOOLS.has(toolName)) {
    return {
      phase: 'target_hit',
      status: 'hit',
      message: `Target locked: ${summary}`,
      shouldLockTarget: true,
    };
  }

  if (PHASE_SEARCH_TOOLS.has(toolName)) {
    return {
      phase: 'searching',
      status: 'ongoing',
      message: `Still searching: ${summary}`,
      shouldLockTarget: false,
    };
  }

  if (PHASE_READ_TOOLS.has(toolName)) {
    return {
      phase: 'inspecting',
      status: 'ongoing',
      message: `Inspecting candidate: ${summary}`,
      shouldLockTarget: false,
    };
  }

  // Any other tool is reported as generic execution without locking a target.
  return {
    phase: 'executing',
    status: 'ongoing',
    message: `Executing: ${summary}`,
    shouldLockTarget: false,
  };
}
373+
248374
function createEventWriter(
249375
io: HeadlessIO,
250376
outputFormat: HeadlessOutputFormat
@@ -303,20 +429,59 @@ function createEventWriter(
303429
}
304430
writeLine(io.stdout, content);
305431
},
306-
toolStart(toolName: string, summary: string) {
432+
toolStart(
433+
toolName: string,
434+
summary: string,
435+
target?: string,
436+
toolKind?: 'readonly' | 'write' | 'execute'
437+
) {
307438
if (outputFormat === 'jsonl') {
308-
writeJsonl('tool_start', { tool_name: toolName, summary });
439+
writeJsonl('tool_start', {
440+
tool_name: toolName,
441+
summary,
442+
target,
443+
tool_kind: toolKind,
444+
});
309445
return;
310446
}
311447
writeLine(io.stderr, `[tool:start] ${summary}`);
312448
},
313-
toolResult(toolName: string, summary: string) {
449+
toolResult(
450+
toolName: string,
451+
summary: string,
452+
target?: string,
453+
toolKind?: 'readonly' | 'write' | 'execute'
454+
) {
314455
if (outputFormat === 'jsonl') {
315-
writeJsonl('tool_result', { tool_name: toolName, summary });
456+
writeJsonl('tool_result', {
457+
tool_name: toolName,
458+
summary,
459+
target,
460+
tool_kind: toolKind,
461+
});
316462
return;
317463
}
318464
writeLine(io.stderr, `[tool:result] ${summary}`);
319465
},
/**
 * Emits a phase event.
 *
 * In `jsonl` mode a `phase` record is written carrying the phase, status,
 * message and the optional context fields (`turn`, `tool_name`, `target`).
 * In any other mode only `[phase:<phase>] <message>` is written to stderr.
 */
phase(
  phase: HeadlessPhaseName,
  status: HeadlessPhaseStatus,
  message: string,
  context: HeadlessPhaseContext = {}
) {
  if (outputFormat !== 'jsonl') {
    writeLine(io.stderr, `[phase:${phase}] ${message}`);
    return;
  }

  const { turn, toolName, target } = context;
  writeJsonl('phase', {
    phase,
    status,
    message,
    turn,
    tool_name: toolName,
    target,
  });
},
320485
toolDetail(toolName: string, detail: string) {
321486
if (outputFormat === 'jsonl') {
322487
writeJsonl('tool_detail', { tool_name: toolName, detail });
@@ -399,6 +564,7 @@ export async function runHeadless(
399564
let outputFormat: HeadlessOutputFormat = 'text';
400565
let eventWriter = createEventWriter(io, outputFormat);
401566
const streamState = new HeadlessStreamState();
567+
const phaseState: HeadlessPhaseState = { targetLocked: false };
402568

403569
try {
404570
const validatedOptions = validateHeadlessOptions(options);
@@ -481,17 +647,61 @@ export async function runHeadless(
481647
try {
482648
const params = JSON.parse(toolCall.function.arguments);
483649
const summary = formatToolCallSummary(toolCall.function.name, params);
484-
eventWriter.toolStart(toolCall.function.name, summary);
650+
const target = extractToolTarget(toolCall.function.name, params);
651+
const phaseInfo = getPhaseForTool(
652+
toolCall.function.name,
653+
summary,
654+
target,
655+
phaseState
656+
);
657+
if (phaseInfo.shouldLockTarget) {
658+
phaseState.targetLocked = true;
659+
}
660+
eventWriter.phase(phaseInfo.phase, phaseInfo.status, phaseInfo.message, {
661+
toolName: toolCall.function.name,
662+
target,
663+
});
664+
eventWriter.toolStart(
665+
toolCall.function.name,
666+
summary,
667+
target,
668+
event.toolKind
669+
);
485670
} catch {
486-
eventWriter.toolStart(toolCall.function.name, toolCall.function.name);
671+
eventWriter.phase(
672+
phaseState.targetLocked ? 'executing' : 'searching',
673+
phaseState.targetLocked ? 'hit' : 'ongoing',
674+
phaseState.targetLocked
675+
? `Working within target: ${toolCall.function.name}`
676+
: `Still searching: ${toolCall.function.name}`,
677+
{ toolName: toolCall.function.name }
678+
);
679+
eventWriter.toolStart(
680+
toolCall.function.name,
681+
toolCall.function.name,
682+
undefined,
683+
event.toolKind
684+
);
487685
}
488686
break;
489687
}
490688
case 'tool_result': {
491689
const toolCall = event.toolCall;
492690
if (!('function' in toolCall)) break;
691+
let target: string | undefined;
692+
try {
693+
const params = JSON.parse(toolCall.function.arguments);
694+
target = extractToolTarget(toolCall.function.name, params);
695+
} catch {
696+
target = undefined;
697+
}
493698
const display = formatToolDisplay(toolCall.function.name, event.result);
494-
eventWriter.toolResult(toolCall.function.name, display.summary);
699+
eventWriter.toolResult(
700+
toolCall.function.name,
701+
display.summary,
702+
target,
703+
undefined
704+
);
495705
if (display.detail) {
496706
eventWriter.toolDetail(toolCall.function.name, display.detail);
497707
}
@@ -520,6 +730,15 @@ export async function runHeadless(
520730

521731
// --- 系统事件 ---
522732
case 'turn_start':
733+
if (event.turn === 1) {
734+
phaseState.targetLocked = false;
735+
}
736+
eventWriter.phase(
737+
'turn',
738+
phaseState.targetLocked ? 'hit' : 'ongoing',
739+
`Turn ${event.turn} started`,
740+
{ turn: event.turn }
741+
);
523742
break;
524743

525744
default: {
@@ -537,6 +756,8 @@ export async function runHeadless(
537756
);
538757
}
539758

759+
eventWriter.phase('completed', 'done', 'Headless run completed');
760+
540761
return 0;
541762
} catch (error) {
542763
if (streamState.hasOpenThinking() && outputFormat === 'text') {

0 commit comments

Comments
 (0)