Skip to content

Commit ae5ac57

Browse files
committed
Harden codebase-memory readiness fallback selection
1 parent ed46c22 commit ae5ac57

1 file changed

Lines changed: 47 additions & 8 deletions

File tree

.github/workflows/contextbench-cbm-readiness-retry.yml

Lines changed: 47 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -92,9 +92,22 @@ jobs:
9292
for (const args of candidates) {
9393
const attempt = run(cbm, args, opts);
9494
attempts.push(attempt);
95-
if (attempt.status === 0) return { ...attempt, label, attempts };
95+
if (attempt.status === 0) return { ...attempt, label, attempts, selectedBecause: 'status0' };
9696
}
97-
return { ...(attempts.at(-1) || {}), label, attempts };
97+
return { ...(attempts.at(-1) || {}), label, attempts, selectedBecause: 'no_status0' };
98+
}
99+
100+
/**
 * Try each candidate argument list in order and pick the most useful outcome.
 *
 * Every candidate is executed via `run(cbm, args, opts)` and recorded. Selection
 * priority:
 *   1. the first attempt with `status === 0` for which `hasUsefulLocations`
 *      returns true (`selectedBecause: 'status0_with_locations'`), returned
 *      immediately without running the remaining candidates;
 *   2. otherwise the first attempt with `status === 0`
 *      (`selectedBecause: 'status0_without_locations'`);
 *   3. otherwise the last attempt, or an empty object when there were no
 *      candidates (`selectedBecause: 'no_status0'`).
 *
 * @param {string} label - Name attached to the returned result.
 * @param {Iterable<Array>} candidates - Argument lists to try, in order.
 * @param {object} opts - Options forwarded unchanged to `run`.
 * @returns {object} Shallow copy of the chosen attempt with `label`,
 *   `attempts` (everything that was run) and `selectedBecause` added.
 */
function firstUseful(label, candidates, opts) {
  const history = [];
  let fallback = null;

  for (const args of candidates) {
    const outcome = run(cbm, args, opts);
    history.push(outcome);

    // Only successful invocations can be selected inside the loop.
    if (outcome.status !== 0) continue;

    // Remember the earliest success in case none of them carries locations.
    if (!fallback) fallback = outcome;

    if (hasUsefulLocations(outcome)) {
      return { ...outcome, label, attempts: history, selectedBecause: 'status0_with_locations' };
    }
  }

  if (fallback) {
    return { ...fallback, label, attempts: history, selectedBecause: 'status0_without_locations' };
  }

  const last = history.at(-1) || {};
  return { ...last, label, attempts: history, selectedBecause: 'no_status0' };
}
99112
100113
function queryOf(text) {
@@ -171,10 +184,33 @@ jobs:
171184
while ((m = re.exec(String(s || ''))) !== null) add(spans, m[1], m[2] || 1, m[2] || 1);
172185
}
173186
187+
/**
 * Feed a command result's stdout and stderr (in that order) into `spans`.
 *
 * For each stream the text is passed to `jsonish`; when that yields a truthy
 * value it is handed to `walk(parsed, spans)`. The raw text is then always
 * passed to `textPaths(text, spans)` as well.
 * NOTE(review): `jsonish`, `walk` and `textPaths` are defined outside this
 * block; presumably they parse JSON-like output and collect file locations
 * into `spans` — confirm against their definitions.
 *
 * @param {object|null|undefined} result - Command result; may be nullish.
 * @param {Map} spans - Accumulator handed through to `walk` / `textPaths`.
 */
function addLocationsFrom(result, spans) {
  const streams = [result?.stdout, result?.stderr];
  streams.forEach((text) => {
    const parsed = jsonish(text);
    if (parsed) {
      walk(parsed, spans);
    }
    textPaths(text, spans);
  });
}
194+
195+
/**
 * Decide whether a command result carries any usable search hits.
 *
 * Returns true as soon as one payload from `payloadsFrom(result)` has a
 * non-empty `results` or `raw_matches` array, or a `total_results`,
 * `raw_match_count` or `total_grep_matches` value that is numerically greater
 * than zero. If no payload qualifies, the result's output is scanned with
 * `addLocationsFrom` into a fresh Map and the answer is whether that Map ended
 * up non-empty.
 * NOTE(review): `payloadsFrom` is defined outside this block; it is assumed to
 * yield non-null objects — confirm.
 *
 * @param {object} result - Command result to inspect.
 * @returns {boolean} True when the result appears to contain locations.
 */
function hasUsefulLocations(result) {
  const isFilledArray = (value) => Array.isArray(value) && value.length > 0;
  const isPositiveCount = (value) => Number(value || 0) > 0;

  for (const payload of payloadsFrom(result)) {
    if (isFilledArray(payload.results)) return true;
    if (isFilledArray(payload.raw_matches)) return true;
    if (
      isPositiveCount(payload.total_results) ||
      isPositiveCount(payload.raw_match_count) ||
      isPositiveCount(payload.total_grep_matches)
    ) {
      return true;
    }
  }

  // No structured hit counts: fall back to scanning the raw output.
  const found = new Map();
  addLocationsFrom(result, found);
  return found.size > 0;
}
205+
174206
/**
 * Escape a value so it can be embedded in a regular expression and match
 * literally.
 *
 * Falsy input (`null`, `undefined`, `''`, `0`, `false`, `NaN`) becomes the
 * empty string; anything else is converted with `String()`. Each of the
 * characters `. * + ? ^ $ { } ( ) | [ ] \` is prefixed with a backslash.
 *
 * @param {*} value - Text (or value to stringify) to escape.
 * @returns {string} The escaped text.
 */
function regexLiteral(value) {
  const metaChars = /[.*+?^${}()|[\]\\]/g;
  const text = value ? String(value) : '';
  // '$&' in the replacement stands for the matched character itself.
  return text.replace(metaChars, '\\$&');
}
177209
210+
/**
 * Expand a list of results into the individual attempts behind them.
 *
 * A result that has a non-empty `attempts` array contributes each of those
 * attempts; any other value (including `null` / `undefined` or a result with
 * an empty or non-array `attempts`) contributes itself unchanged.
 *
 * @param {...*} results - Results to expand.
 * @returns {Array} Flat list of attempts, in argument order.
 */
function allAttempts(...results) {
  const flattened = [];
  for (const entry of results) {
    const nested = entry?.attempts;
    if (Array.isArray(nested) && nested.length > 0) {
      flattened.push(...nested);
    } else {
      flattened.push(entry);
    }
  }
  return flattened;
}
213+
178214
const reports = [];
179215
let ready = true;
180216
for (const [i, task] of tasks.entries()) {
@@ -189,19 +225,21 @@ jobs:
189225
const listProjects = firstOk('list_projects', [['cli', 'list_projects'], ['cli', 'list_projects', '{}']], opts);
190226
const project = projectFrom(indexRun, listProjects) || basename(task.repo_checkout_path);
191227
const graphSchema = firstOk('get_graph_schema', [['cli', 'get_graph_schema', JSON.stringify({ project })]], opts);
192-
const graphSearch = firstOk('search_graph', [
228+
const graphSearch = firstUseful('search_graph', [
193229
['cli', 'search_graph', JSON.stringify({ project, query, limit: 25 })],
194230
['cli', 'search_graph', JSON.stringify({ project, label: 'Function', query: firstTerm, limit: 25 })],
195-
['cli', 'search_graph', JSON.stringify({ project, label: 'Function', name_pattern: `.*${regexLiteral(firstTerm)}.*`, limit: 25 })]
231+
['cli', 'search_graph', JSON.stringify({ project, label: 'Function', name_pattern: `.*${regexLiteral(firstTerm)}.*`, limit: 25 })],
232+
['cli', 'search_graph', JSON.stringify({ project, label: 'Function', limit: 25 })],
233+
['cli', 'search_graph', JSON.stringify({ project, label: 'Class', limit: 25 })]
196234
], opts);
197-
const codeSearch = firstOk('search_code', [
235+
const codeSearch = firstUseful('search_code', [
198236
['cli', 'search_code', JSON.stringify({ project, pattern: query, mode: 'compact', limit: 25 })],
199237
['cli', 'search_code', JSON.stringify({ project, pattern: firstTerm, mode: 'compact', limit: 25 })],
200238
['cli', 'search_code', JSON.stringify({ project, pattern: '.', mode: 'compact', limit: 25 })]
201239
], opts);
202240
203241
const spans = new Map();
204-
for (const r of [listProjects, graphSchema, graphSearch, codeSearch]) for (const text of [r.stdout, r.stderr]) { const parsed = jsonish(text); if (parsed) walk(parsed, spans); textPaths(text, spans); }
242+
for (const r of allAttempts(listProjects, graphSchema, graphSearch, codeSearch)) addLocationsFrom(r, spans);
205243
const predFiles = [...spans.keys()].slice(0, 20);
206244
const predSpans = Object.fromEntries([...spans.entries()].slice(0, 20));
207245
const predictionPath = join(dir, 'prediction.json');
@@ -214,14 +252,15 @@ jobs:
214252
if (!existsSync(join(official, 'contextbench', 'evaluate.py'))) run('git', ['clone', '--depth', '1', 'https://github.com/EuniAI/ContextBench.git', official], { timeoutMs: 10 * 60 * 1000 });
215253
const scorePath = join(dir, 'official-score.jsonl');
216254
const evaluator = run('python', ['-m', 'contextbench.evaluate', '--gold', goldPath, '--pred', predictionPath, '--cache', join(dir, 'repo-cache'), '--out', scorePath], { cwd: official, timeoutMs: 20 * 60 * 1000 });
217-
const report = { taskId: task.instance_id, repo: task.repo, project, setupStatus: setup.status, indexStatus: indexRun.status, toolCallable: [listProjects, graphSchema, graphSearch, codeSearch].some((r) => r.status === 0), nonEmptyPrediction: predFiles.length > 0 && Object.keys(predSpans).length > 0, officialEvaluatorStatus: evaluator.status, officialEvaluatorScoreable: evaluator.status === 0 && existsSync(scorePath), costs: { setupDurationMs: setup.durationMs, indexDurationMs: indexRun.durationMs, queryDurationMs: listProjects.durationMs + graphSchema.durationMs + graphSearch.durationMs + codeSearch.durationMs, evaluatorDurationMs: evaluator.durationMs }, laneIsolation: { allowedTool: 'codebase-memory-mcp', observedCommands: [setup.command, indexRun.command, listProjects.command, graphSchema.command, graphSearch.command, codeSearch.command], observedCwds: [setup.cwd, indexRun.cwd, listProjects.cwd, graphSchema.cwd, graphSearch.cwd, codeSearch.cwd], disallowedNativeReadSearchUsedForPrediction: false }, query, predFiles, commands: { setup, indexRun, listProjects, graphSchema, graphSearch, codeSearch, gold, evaluator } };
255+
const scoreText = existsSync(scorePath) ? readFileSync(scorePath, 'utf8').trim() : '';
256+
const report = { taskId: task.instance_id, repo: task.repo, project, setupStatus: setup.status, indexStatus: indexRun.status, toolCallable: [graphSearch, codeSearch].some((r) => r.status === 0), nonEmptyPrediction: predFiles.length > 0 && Object.keys(predSpans).length > 0, officialEvaluatorStatus: evaluator.status, officialEvaluatorScoreable: evaluator.status === 0 && scoreText.length > 0, costs: { setupDurationMs: setup.durationMs, indexDurationMs: indexRun.durationMs, queryDurationMs: listProjects.durationMs + graphSchema.durationMs + graphSearch.durationMs + codeSearch.durationMs, evaluatorDurationMs: evaluator.durationMs }, laneIsolation: { allowedTool: 'codebase-memory-mcp', observedCommands: [setup.command, indexRun.command, listProjects.command, graphSchema.command, graphSearch.command, codeSearch.command], observedCwds: [setup.cwd, indexRun.cwd, listProjects.cwd, graphSchema.cwd, graphSearch.cwd, codeSearch.cwd], disallowedNativeReadSearchUsedForPrediction: false }, query, predFiles, selectedBecause: { graphSearch: graphSearch.selectedBecause, codeSearch: codeSearch.selectedBecause }, commands: { setup, indexRun, listProjects, graphSchema, graphSearch, codeSearch, gold, evaluator } };
218257
writeFileSync(join(dir, 'readiness-report.json'), JSON.stringify(report, null, 2));
219258
reports.push(report);
220259
if (!(report.setupStatus === 0 && report.indexStatus === 0 && report.toolCallable && report.nonEmptyPrediction && report.officialEvaluatorScoreable)) ready = false;
221260
}
222261
const summary = { createdAt: new Date().toISOString(), lane: 'codebase-memory-mcp', ready, attemptedRows: reports.length, scoreableRows: reports.filter((r) => r.officialEvaluatorScoreable).length, nonEmptyPredictionRows: reports.filter((r) => r.nonEmptyPrediction).length, setupIndexCostReportedSeparately: true, reports };
223262
writeFileSync(join(outRoot, 'lane-readiness-codebase-memory-first3.json'), JSON.stringify(summary, null, 2));
224-
console.log(JSON.stringify(summary, null, 2));
263+
console.log(JSON.stringify({ ready: summary.ready, attemptedRows: summary.attemptedRows, scoreableRows: summary.scoreableRows, nonEmptyPredictionRows: summary.nonEmptyPredictionRows, reports: summary.reports.map((r) => ({ taskId: r.taskId, setupStatus: r.setupStatus, indexStatus: r.indexStatus, toolCallable: r.toolCallable, nonEmptyPrediction: r.nonEmptyPrediction, officialEvaluatorScoreable: r.officialEvaluatorScoreable, predFiles: r.predFiles.length, selectedBecause: r.selectedBecause, costs: r.costs })) }, null, 2));
225264
if (!ready) process.exitCode = 1;
226265
NODE
227266
node "$ROOT/readiness.mjs"

0 commit comments

Comments
 (0)