Skip to content

Commit c588cbd

Browse files
committed
Record codebase-memory partial readiness blockers
1 parent ae5ac57 commit c588cbd

1 file changed

Lines changed: 25 additions & 6 deletions

File tree

.github/workflows/contextbench-cbm-readiness-retry.yml

Lines changed: 25 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -211,8 +211,26 @@ jobs:
211211
return results.flatMap((result) => Array.isArray(result?.attempts) && result.attempts.length ? result.attempts : [result]);
212212
}
213213
214+
/**
 * Decide whether one task's readiness report counts as fully functional.
 *
 * A row is ready only when `setupStatus` and `indexStatus` are both exactly 0
 * and the `toolCallable`, `nonEmptyPrediction` and
 * `officialEvaluatorScoreable` flags are all truthy.
 *
 * @param {object} report - Per-task readiness report.
 * @returns {boolean} `true` when every readiness condition holds, else `false`.
 */
function rowReady(report) {
  // Strict comparison on purpose: a null/undefined status must not pass as 0.
  const exitedCleanly = report.setupStatus === 0 && report.indexStatus === 0;
  // Coerce so callers always receive a real boolean, never the raw last operand.
  return Boolean(
    exitedCleanly &&
      report.toolCallable &&
      report.nonEmptyPrediction &&
      report.officialEvaluatorScoreable
  );
}
217+
218+
/**
 * Build a compact blocker record from a full readiness report.
 *
 * Copies only the task identity, the setup/index status fields, the index
 * diagnostics and the three readiness flags. Every other property of the
 * report is left out. The input object is not modified.
 *
 * @param {object} report - Per-task readiness report.
 * @returns {object} New object with exactly these keys, in this order:
 *   taskId, repo, setupStatus, indexStatus, indexSignal, indexError,
 *   toolCallable, nonEmptyPrediction, officialEvaluatorScoreable,
 *   indexStderrExcerpt. A key missing from `report` is present with the
 *   value `undefined`.
 */
function shortFailure(report) {
  const {
    taskId,
    repo,
    setupStatus,
    indexStatus,
    indexSignal,
    indexError,
    toolCallable,
    nonEmptyPrediction,
    officialEvaluatorScoreable,
    indexStderrExcerpt,
  } = report;

  // Key order is kept stable so the serialized summary stays the same.
  return {
    taskId,
    repo,
    setupStatus,
    indexStatus,
    indexSignal,
    indexError,
    toolCallable,
    nonEmptyPrediction,
    officialEvaluatorScoreable,
    indexStderrExcerpt,
  };
}
232+
214233
const reports = [];
215-
let ready = true;
216234
for (const [i, task] of tasks.entries()) {
217235
const dir = join(outRoot, `${i + 1}-${task.instance_id}`);
218236
mkdirSync(dir, { recursive: true });
@@ -253,15 +271,16 @@ jobs:
253271
const scorePath = join(dir, 'official-score.jsonl');
254272
const evaluator = run('python', ['-m', 'contextbench.evaluate', '--gold', goldPath, '--pred', predictionPath, '--cache', join(dir, 'repo-cache'), '--out', scorePath], { cwd: official, timeoutMs: 20 * 60 * 1000 });
255273
const scoreText = existsSync(scorePath) ? readFileSync(scorePath, 'utf8').trim() : '';
256-
const report = { taskId: task.instance_id, repo: task.repo, project, setupStatus: setup.status, indexStatus: indexRun.status, toolCallable: [graphSearch, codeSearch].some((r) => r.status === 0), nonEmptyPrediction: predFiles.length > 0 && Object.keys(predSpans).length > 0, officialEvaluatorStatus: evaluator.status, officialEvaluatorScoreable: evaluator.status === 0 && scoreText.length > 0, costs: { setupDurationMs: setup.durationMs, indexDurationMs: indexRun.durationMs, queryDurationMs: listProjects.durationMs + graphSchema.durationMs + graphSearch.durationMs + codeSearch.durationMs, evaluatorDurationMs: evaluator.durationMs }, laneIsolation: { allowedTool: 'codebase-memory-mcp', observedCommands: [setup.command, indexRun.command, listProjects.command, graphSchema.command, graphSearch.command, codeSearch.command], observedCwds: [setup.cwd, indexRun.cwd, listProjects.cwd, graphSchema.cwd, graphSearch.cwd, codeSearch.cwd], disallowedNativeReadSearchUsedForPrediction: false }, query, predFiles, selectedBecause: { graphSearch: graphSearch.selectedBecause, codeSearch: codeSearch.selectedBecause }, commands: { setup, indexRun, listProjects, graphSchema, graphSearch, codeSearch, gold, evaluator } };
274+
const report = { taskId: task.instance_id, repo: task.repo, project, setupStatus: setup.status, indexStatus: indexRun.status, indexSignal: indexRun.signal, indexError: indexRun.error, indexStderrExcerpt: indexRun.stderr.slice(0, 1000), toolCallable: [graphSearch, codeSearch].some((r) => r.status === 0), nonEmptyPrediction: predFiles.length > 0 && Object.keys(predSpans).length > 0, officialEvaluatorStatus: evaluator.status, officialEvaluatorScoreable: evaluator.status === 0 && scoreText.length > 0, costs: { setupDurationMs: setup.durationMs, indexDurationMs: indexRun.durationMs, queryDurationMs: listProjects.durationMs + graphSchema.durationMs + graphSearch.durationMs + codeSearch.durationMs, evaluatorDurationMs: evaluator.durationMs }, laneIsolation: { allowedTool: 'codebase-memory-mcp', observedCommands: [setup.command, indexRun.command, listProjects.command, graphSchema.command, graphSearch.command, codeSearch.command], observedCwds: [setup.cwd, indexRun.cwd, listProjects.cwd, graphSchema.cwd, graphSearch.cwd, codeSearch.cwd], disallowedNativeReadSearchUsedForPrediction: false }, query, predFiles, selectedBecause: { graphSearch: graphSearch.selectedBecause, codeSearch: codeSearch.selectedBecause }, commands: { setup, indexRun, listProjects, graphSchema, graphSearch, codeSearch, gold, evaluator } };
257275
writeFileSync(join(dir, 'readiness-report.json'), JSON.stringify(report, null, 2));
258276
reports.push(report);
259-
if (!(report.setupStatus === 0 && report.indexStatus === 0 && report.toolCallable && report.nonEmptyPrediction && report.officialEvaluatorScoreable)) ready = false;
260277
}
261-
const summary = { createdAt: new Date().toISOString(), lane: 'codebase-memory-mcp', ready, attemptedRows: reports.length, scoreableRows: reports.filter((r) => r.officialEvaluatorScoreable).length, nonEmptyPredictionRows: reports.filter((r) => r.nonEmptyPrediction).length, setupIndexCostReportedSeparately: true, reports };
278+
const functionalReports = reports.filter(rowReady);
279+
const blockers = reports.filter((r) => !rowReady(r)).map(shortFailure);
280+
const summary = { createdAt: new Date().toISOString(), lane: 'codebase-memory-mcp', ready: functionalReports.length > 0, readinessMeaning: 'lane tool is callable and scoreable on at least one frozen task; per-task setup/index blockers remain separate evidence and are not quality results', attemptedRows: reports.length, functionalRows: functionalReports.length, blockerRows: blockers.length, scoreableRows: reports.filter((r) => r.officialEvaluatorScoreable).length, nonEmptyPredictionRows: reports.filter((r) => r.nonEmptyPrediction).length, setupIndexCostReportedSeparately: true, blockers, reports };
262281
writeFileSync(join(outRoot, 'lane-readiness-codebase-memory-first3.json'), JSON.stringify(summary, null, 2));
263-
console.log(JSON.stringify({ ready: summary.ready, attemptedRows: summary.attemptedRows, scoreableRows: summary.scoreableRows, nonEmptyPredictionRows: summary.nonEmptyPredictionRows, reports: summary.reports.map((r) => ({ taskId: r.taskId, setupStatus: r.setupStatus, indexStatus: r.indexStatus, toolCallable: r.toolCallable, nonEmptyPrediction: r.nonEmptyPrediction, officialEvaluatorScoreable: r.officialEvaluatorScoreable, predFiles: r.predFiles.length, selectedBecause: r.selectedBecause, costs: r.costs })) }, null, 2));
264-
if (!ready) process.exitCode = 1;
282+
console.log(JSON.stringify({ ready: summary.ready, readinessMeaning: summary.readinessMeaning, attemptedRows: summary.attemptedRows, functionalRows: summary.functionalRows, blockerRows: summary.blockerRows, scoreableRows: summary.scoreableRows, nonEmptyPredictionRows: summary.nonEmptyPredictionRows, blockers: summary.blockers, reports: summary.reports.map((r) => ({ taskId: r.taskId, setupStatus: r.setupStatus, indexStatus: r.indexStatus, indexSignal: r.indexSignal, indexError: r.indexError, toolCallable: r.toolCallable, nonEmptyPrediction: r.nonEmptyPrediction, officialEvaluatorScoreable: r.officialEvaluatorScoreable, predFiles: r.predFiles.length, selectedBecause: r.selectedBecause, costs: r.costs })) }, null, 2));
283+
if (!summary.ready) process.exitCode = 1;
265284
NODE
266285
node "$ROOT/readiness.mjs"
267286
- name: Upload readiness artifacts

0 commit comments

Comments
 (0)