Skip to content

Commit 3ecc4d5

Browse files
committed
Add five-lane ContextBench scoring script
1 parent 786b807 commit 3ecc4d5

1 file changed

Lines changed: 189 additions & 0 deletions

File tree

Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
2+
import { join } from 'node:path';
3+
import { spawnSync } from 'node:child_process';
4+
5+
// --- Configuration and inputs -------------------------------------------------
// Everything is driven by environment variables; TARGET_TASK_ID, ROOT and
// SELECTIONS_PATH have defaults, TASK_PAYLOADS and OFFICIAL_CONTEXTBENCH do not.
const targetTaskId = process.env.TARGET_TASK_ID || 'SWE-Bench-Pro__go__maintenance__bugfix__4df06349';
const root = process.env.ROOT || '/tmp/contextbench-five-lane-score';
const officialContextBench = process.env.OFFICIAL_CONTEXTBENCH;
const selectionsPath = process.env.SELECTIONS_PATH || 'scripts/contextbench-five-lane-selections.json';

// Validate required inputs before reading anything, so a missing variable is
// reported by name instead of as a TypeError from readFileSync(undefined).
if (!process.env.TASK_PAYLOADS) throw new Error('TASK_PAYLOADS is required');
if (!officialContextBench) throw new Error('OFFICIAL_CONTEXTBENCH is required');
if (!existsSync(selectionsPath)) throw new Error(`selection file missing: ${selectionsPath}`);

// Locate the single task this run scores inside the payload file.
const payloads = JSON.parse(readFileSync(process.env.TASK_PAYLOADS, 'utf8'));
const payloadTasks = Array.isArray(payloads?.tasks) ? payloads.tasks : [];
const task = payloadTasks.find((candidate) => candidate?.instance_id === targetTaskId);
if (!task) throw new Error(`target task ${targetTaskId} missing from payloads`);

// Load the per-lane selections; a non-array laneSelections is treated as empty
// so it is rejected here rather than failing later during iteration.
const selections = JSON.parse(readFileSync(selectionsPath, 'utf8'));
const laneSelections = Array.isArray(selections?.laneSelections) ? selections.laneSelections : [];
if (laneSelections.length === 0) throw new Error('selection file has no laneSelections');
17+
18+
/**
 * Run a command synchronously and return a plain, JSON-serializable record of
 * what happened. Never throws on a failed or missing command; failures are
 * reported through `status`, `signal` and `error`.
 *
 * @param {string} cmd - Executable to spawn.
 * @param {string[]} args - Arguments passed to the executable.
 * @param {{cwd?: string, env?: object, timeoutMs?: number}} [opts]
 *   `cwd` defaults to the current working directory, `env` to `process.env`,
 *   and `timeoutMs` to 600000. An explicit `timeoutMs` of 0 is forwarded to
 *   `spawnSync` as-is instead of being replaced by the default.
 * @returns {{command: string, cwd: string, status: (number|null), signal: *,
 *   error: (string|null), durationMs: number, stdout: string, stderr: string}}
 */
function run(cmd, args, opts = {}) {
  const cwd = opts.cwd || process.cwd();
  const started = Date.now();
  const r = spawnSync(cmd, args, {
    cwd,
    env: opts.env || process.env,
    encoding: 'utf8',
    // `??` rather than `||`: only a missing value falls back to the default.
    timeout: opts.timeoutMs ?? 600000,
    maxBuffer: 128 * 1024 * 1024,
  });
  return {
    command: [cmd, ...args].join(' '),
    cwd,
    // status is not a number when the process never exited normally.
    status: typeof r.status === 'number' ? r.status : null,
    signal: r.signal,
    error: r.error?.message || null,
    durationMs: Date.now() - started,
    stdout: r.stdout || '',
    stderr: r.stderr || '',
  };
}
38+
39+
/**
 * Coerce a raw line value to a whole, finite, non-zero number.
 *
 * @param {*} value - Raw value (number, numeric string, anything else).
 * @param {number} fallback - Returned when `value` is not usable.
 * @returns {number} Truncated value, or `fallback` for NaN, ±Infinity and 0.
 */
function toLineNumber(value, fallback) {
  const n = Math.trunc(Number(value));
  return Number.isFinite(n) && n !== 0 ? n : fallback;
}

/**
 * Append a `{ start, end }` line span to the list stored under a normalized
 * file path in `map`. Entries with an empty file name are ignored.
 *
 * The path is normalized by converting backslashes to forward slashes and
 * stripping one leading "./". Line numbers are clamped so that `start >= 1`
 * and `end >= start`; unusable values (missing, NaN, infinite) fall back to 1
 * for `start` and to `start` for `end`, and fractions are truncated, so the
 * stored span always serializes to integers in JSON.
 *
 * @param {Map<string, Array<{start: number, end: number}>>} map - Mutated in place.
 * @param {*} file - File path of the span.
 * @param {*} start - First line of the span.
 * @param {*} end - Last line of the span.
 * @returns {void}
 */
function addSpan(map, file, start, end) {
  const clean = String(file || '').replaceAll('\\', '/').replace(/^\.\//, '');
  if (!clean) return;
  const s = Math.max(1, toLineNumber(start, 1));
  const e = Math.max(s, toLineNumber(end, s));
  const list = map.get(clean) || [];
  list.push({ start: s, end: e });
  map.set(clean, list);
}
48+
49+
/**
 * Flatten one scored row into the compact shape used by the results table.
 *
 * Reads coverage/precision for the file, symbol, span and line granularities
 * from `row.score.final`, plus recall/precision from `row.score.editloc`.
 * Any metric that is missing (or whose container is missing) becomes `null`.
 *
 * @param {object} row - Row carrying `lane_id`, `task_id` and optional `score`.
 * @returns {object} Flat record: lane, task, then the ten metric fields.
 */
function resultTableRow(row) {
  const { score } = row;
  const finalScores = score?.final || {};
  const orNull = (value) => value ?? null;

  const flat = { lane: row.lane_id, task: row.task_id };
  // Insertion order matches the table's column order.
  for (const level of ['file', 'symbol', 'span', 'line']) {
    const metrics = finalScores[level];
    flat[`${level}Coverage`] = orNull(metrics?.coverage);
    flat[`${level}Precision`] = orNull(metrics?.precision);
  }
  const editloc = score?.editloc;
  flat.editlocRecall = orNull(editloc?.recall);
  flat.editlocPrecision = orNull(editloc?.precision);
  return flat;
}
66+
67+
// --- Run directory and gold materialization ------------------------------------
// All artifacts for this run live under <root>/five-lane-score. The selections
// are copied there so the run directory is self-describing.
const runDir = join(root, 'five-lane-score');
mkdirSync(runDir, { recursive: true });
writeFileSync(join(runDir, 'selections.json'), JSON.stringify(selections, null, 2));

// Ask the slice-selection script to write the gold file for the target task.
const goldPath = join(runDir, 'gold.json');
const gold = run(
  'node',
  [
    'scripts/contextbench-select-slice.mjs',
    '--write-gold',
    '--task-id',
    task.instance_id,
    '--out',
    goldPath,
    '--payloads',
    process.env.TASK_PAYLOADS,
  ],
  { timeoutMs: 600000 },
);
// Persist the full command record before checking the outcome so a failure
// can still be inspected on disk.
writeFileSync(join(runDir, 'gold-command.json'), JSON.stringify(gold, null, 2));
if (gold.status !== 0) {
  // A spawn failure or a killed process produces no stderr/stdout, so fall
  // back to the spawn error or the terminating signal for the reason.
  const reason =
    gold.stderr ||
    gold.stdout ||
    gold.error ||
    (gold.signal ? `terminated by ${gold.signal}` : 'no output');
  throw new Error(`gold materialization failed: ${reason}`);
}
88+
89+
// --- Per-lane scoring -----------------------------------------------------------
// For every lane selection: write the selection, build a prediction file, run
// the official evaluator on it, and record one row describing the outcome.
const rows = [];

// Same normalization addSpan applies to span files (backslashes to "/", one
// leading "./" stripped), so explicit `files` entries and span files that name
// the same file collapse into a single prediction entry.
const toRepoPath = (file) => String(file || '').replaceAll('\\', '/').replace(/^\.\//, '');

for (const [index, selection] of laneSelections.entries()) {
  const lane = selection?.lane_id || selection?.lane;
  // The lane id doubles as a directory name; fail with a clear message rather
  // than a path-type error from join().
  if (typeof lane !== 'string' || lane === '') {
    throw new Error(`laneSelections[${index}] is missing a lane_id`);
  }
  const laneDir = join(runDir, lane);
  mkdirSync(laneDir, { recursive: true });

  // Ignore span entries that are not objects (e.g. null) instead of crashing.
  const rawSpans = Array.isArray(selection.spans) ? selection.spans : [];
  const spans = rawSpans.filter((span) => span !== null && typeof span === 'object');
  const files = Array.isArray(selection.files) ? selection.files : [];

  const spanMap = new Map();
  for (const span of spans) addSpan(spanMap, span.file, span.start, span.end);

  const candidatePaths = [...files, ...spans.map((span) => span.file)].map(toRepoPath);
  const predFiles = [...new Set(candidatePaths)].filter(Boolean);
  const predSpans = Object.fromEntries(spanMap.entries());
  const nonEmptyPrediction = predFiles.length > 0 && spans.length > 0;

  // Readiness metadata may sit under `readiness` or directly on the selection.
  const readiness = selection.readiness || {};
  const rowBase = {
    lane_id: lane,
    task_id: task.instance_id,
    model: selections.model || 'gpt-5.4-mini-high',
    predictionSource: selection.predictionSource || selections.predictionSource || 'gpt-5.4-mini-high subagent over real lane candidate pack',
    setupStatus: readiness.setupStatus || selection.setupStatus || 'unknown',
    indexStatus: readiness.indexStatus || selection.indexStatus || 'unknown',
    toolCallable: Boolean(readiness.toolCallable ?? selection.toolCallable),
    candidateCount: Number(readiness.candidateCount ?? selection.candidateCount ?? 0),
    setupIndex: readiness.setupIndex || selection.setupIndex || null,
    nonEmptyPrediction,
    predFiles: predFiles.length,
    predSpans: spans.length,
    rationale: selection.rationale || null,
  };

  writeFileSync(join(laneDir, 'selection.json'), JSON.stringify(selection, null, 2));
  if (!nonEmptyPrediction) {
    // Nothing to score: record the lane without invoking the evaluator.
    rows.push({ ...rowBase, status: 'empty_prediction', officialEvaluatorScoreable: false, score: null });
    continue;
  }

  const prediction = {
    instance_id: task.instance_id,
    repo_url: task.repo_checkout_path,
    commit: task.base_commit,
    traj_data: {
      pred_steps: [{ files: predFiles, spans: predSpans }],
      pred_files: predFiles,
      pred_spans: predSpans,
    },
    model_patch: '',
  };
  const predictionPath = join(laneDir, 'prediction.json');
  writeFileSync(predictionPath, JSON.stringify(prediction, null, 2));

  const scorePath = join(laneDir, 'official-score.jsonl');
  const evaluator = run(
    'python',
    [
      '-m',
      'contextbench.evaluate',
      '--gold',
      goldPath,
      '--pred',
      predictionPath,
      '--cache',
      join(laneDir, 'repo-cache'),
      '--out',
      scorePath,
    ],
    { cwd: officialContextBench, timeoutMs: 1200000 },
  );
  writeFileSync(join(laneDir, 'evaluator-command.json'), JSON.stringify(evaluator, null, 2));

  // The score is the last non-empty line of the evaluator's JSONL output.
  let score = null;
  let scoreParseError = null;
  if (existsSync(scorePath)) {
    const lines = readFileSync(scorePath, 'utf8').trim().split(/\n+/).filter(Boolean);
    if (lines.length > 0) {
      try {
        score = JSON.parse(lines.at(-1));
      } catch (err) {
        // A corrupt score line fails this lane only; the other lanes' rows
        // and the summary are still produced.
        scoreParseError = err instanceof Error ? err.message : String(err);
      }
    }
  }
  const scoreable = evaluator.status === 0 && Boolean(score);
  rows.push({
    ...rowBase,
    status: scoreable ? 'completed' : 'judge_failed',
    officialEvaluatorScoreable: scoreable,
    score,
    ...(scoreParseError ? { scoreParseError } : {}),
  });
}
169+
170+
// --- Summary and exit status ------------------------------------------------------
// Number of lanes that must be scoreable for the run to count as a success.
// Used both in the written summary and in the exit gate so the two cannot drift.
const requiredCompetitors = 5;

const scoreableRows = rows.filter((row) => row.officialEvaluatorScoreable);
const summary = {
  createdAt: new Date().toISOString(),
  attemptedRows: rows.length,
  scoreableRows: scoreableRows.length,
  requiredCompetitors,
  setupIndexCostReportedSeparately: true,
  model: selections.model || 'gpt-5.4-mini-high',
  predictionSource: selections.predictionSource || 'gpt-5.4-mini-high subagent selections over real lane candidate packs',
  caveats: selections.caveats || [],
  // Only scoreable rows appear in the table; `rows` keeps every attempt.
  resultsTable: scoreableRows.map(resultTableRow),
  rows,
};

// The same summary is written to the run directory and to the root, and echoed
// to stdout between START/END marker lines.
const summaryJson = JSON.stringify(summary, null, 2);
writeFileSync(join(runDir, 'summary.json'), summaryJson);
writeFileSync(join(root, 'summary.json'), summaryJson);
console.log('CONTEXTBENCH_FIVE_LANE_SCORE_JSON_START');
console.log(summaryJson);
console.log('CONTEXTBENCH_FIVE_LANE_SCORE_JSON_END');

// Fail the process unless every attempted lane was scored and there are at
// least the required number of them.
const everyRowScored = scoreableRows.length === rows.length;
if (!everyRowScored || scoreableRows.length < requiredCompetitors) process.exitCode = 1;

0 commit comments

Comments
 (0)