Record codebase-memory partial readiness blockers

PatrickSys · PatrickSys · commit c588cbd6fd02 · 2026-05-11T00:07:48.000+02:00
diff --git a/.github/workflows/contextbench-cbm-readiness-retry.yml b/.github/workflows/contextbench-cbm-readiness-retry.yml
@@ -211,8 +211,26 @@ jobs:
             return results.flatMap((result) => Array.isArray(result?.attempts) && result.attempts.length ? result.attempts : [result]);
           }
 
+          function rowReady(report) { // Readiness predicate for one task report: truthy only when every gate below passed.
+            return report.setupStatus === 0 && report.indexStatus === 0 && report.toolCallable && report.nonEmptyPrediction && report.officialEvaluatorScoreable; // setup and index both exited 0, a tool call worked, a prediction exists, and the evaluator output is scoreable
+          }
+
+          function shortFailure(report) { // Projects a report onto its identifying and gate-status fields, dropping the bulky command/cost payload.
+            return {
+              taskId: report.taskId,
+              repo: report.repo,
+              setupStatus: report.setupStatus,
+              indexStatus: report.indexStatus,
+              indexSignal: report.indexSignal, // signal recorded for the index run
+              indexError: report.indexError, // error recorded for the index run
+              toolCallable: report.toolCallable,
+              nonEmptyPrediction: report.nonEmptyPrediction,
+              officialEvaluatorScoreable: report.officialEvaluatorScoreable,
+              indexStderrExcerpt: report.indexStderrExcerpt // truncated stderr of the index run, kept short for the summary
+            };
+          }
+
           const reports = [];
-          let ready = true;
           for (const [i, task] of tasks.entries()) {
             const dir = join(outRoot, `${i + 1}-${task.instance_id}`);
             mkdirSync(dir, { recursive: true });
@@ -253,15 +271,16 @@ jobs:
             const scorePath = join(dir, 'official-score.jsonl');
             const evaluator = run('python', ['-m', 'contextbench.evaluate', '--gold', goldPath, '--pred', predictionPath, '--cache', join(dir, 'repo-cache'), '--out', scorePath], { cwd: official, timeoutMs: 20 * 60 * 1000 });
             const scoreText = existsSync(scorePath) ? readFileSync(scorePath, 'utf8').trim() : '';
-            const report = { taskId: task.instance_id, repo: task.repo, project, setupStatus: setup.status, indexStatus: indexRun.status, toolCallable: [graphSearch, codeSearch].some((r) => r.status === 0), nonEmptyPrediction: predFiles.length > 0 && Object.keys(predSpans).length > 0, officialEvaluatorStatus: evaluator.status, officialEvaluatorScoreable: evaluator.status === 0 && scoreText.length > 0, costs: { setupDurationMs: setup.durationMs, indexDurationMs: indexRun.durationMs, queryDurationMs: listProjects.durationMs + graphSchema.durationMs + graphSearch.durationMs + codeSearch.durationMs, evaluatorDurationMs: evaluator.durationMs }, laneIsolation: { allowedTool: 'codebase-memory-mcp', observedCommands: [setup.command, indexRun.command, listProjects.command, graphSchema.command, graphSearch.command, codeSearch.command], observedCwds: [setup.cwd, indexRun.cwd, listProjects.cwd, graphSchema.cwd, graphSearch.cwd, codeSearch.cwd], disallowedNativeReadSearchUsedForPrediction: false }, query, predFiles, selectedBecause: { graphSearch: graphSearch.selectedBecause, codeSearch: codeSearch.selectedBecause }, commands: { setup, indexRun, listProjects, graphSchema, graphSearch, codeSearch, gold, evaluator } };
+            const report = { taskId: task.instance_id, repo: task.repo, project, setupStatus: setup.status, indexStatus: indexRun.status, indexSignal: indexRun.signal, indexError: indexRun.error, indexStderrExcerpt: indexRun.stderr.slice(0, 1000), toolCallable: [graphSearch, codeSearch].some((r) => r.status === 0), nonEmptyPrediction: predFiles.length > 0 && Object.keys(predSpans).length > 0, officialEvaluatorStatus: evaluator.status, officialEvaluatorScoreable: evaluator.status === 0 && scoreText.length > 0, costs: { setupDurationMs: setup.durationMs, indexDurationMs: indexRun.durationMs, queryDurationMs: listProjects.durationMs + graphSchema.durationMs + graphSearch.durationMs + codeSearch.durationMs, evaluatorDurationMs: evaluator.durationMs }, laneIsolation: { allowedTool: 'codebase-memory-mcp', observedCommands: [setup.command, indexRun.command, listProjects.command, graphSchema.command, graphSearch.command, codeSearch.command], observedCwds: [setup.cwd, indexRun.cwd, listProjects.cwd, graphSchema.cwd, graphSearch.cwd, codeSearch.cwd], disallowedNativeReadSearchUsedForPrediction: false }, query, predFiles, selectedBecause: { graphSearch: graphSearch.selectedBecause, codeSearch: codeSearch.selectedBecause }, commands: { setup, indexRun, listProjects, graphSchema, graphSearch, codeSearch, gold, evaluator } };
             writeFileSync(join(dir, 'readiness-report.json'), JSON.stringify(report, null, 2));
             reports.push(report);
-            if (!(report.setupStatus === 0 && report.indexStatus === 0 && report.toolCallable && report.nonEmptyPrediction && report.officialEvaluatorScoreable)) ready = false;
           }
-          const summary = { createdAt: new Date().toISOString(), lane: 'codebase-memory-mcp', ready, attemptedRows: reports.length, scoreableRows: reports.filter((r) => r.officialEvaluatorScoreable).length, nonEmptyPredictionRows: reports.filter((r) => r.nonEmptyPrediction).length, setupIndexCostReportedSeparately: true, reports };
+          const functionalReports = reports.filter(rowReady); // rows that passed every readiness gate
+          const blockers = reports.filter((r) => !rowReady(r)).map(shortFailure); // condensed entry for each row that failed at least one gate
+          const summary = { createdAt: new Date().toISOString(), lane: 'codebase-memory-mcp', ready: functionalReports.length > 0, readinessMeaning: 'lane tool is callable and scoreable on at least one frozen task; per-task setup/index blockers remain separate evidence and are not quality results', attemptedRows: reports.length, functionalRows: functionalReports.length, blockerRows: blockers.length, scoreableRows: reports.filter((r) => r.officialEvaluatorScoreable).length, nonEmptyPredictionRows: reports.filter((r) => r.nonEmptyPrediction).length, setupIndexCostReportedSeparately: true, blockers, reports }; // ready is true as soon as one row is functional; failing rows are still listed under blockers
           writeFileSync(join(outRoot, 'lane-readiness-codebase-memory-first3.json'), JSON.stringify(summary, null, 2));
-          console.log(JSON.stringify({ ready: summary.ready, attemptedRows: summary.attemptedRows, scoreableRows: summary.scoreableRows, nonEmptyPredictionRows: summary.nonEmptyPredictionRows, reports: summary.reports.map((r) => ({ taskId: r.taskId, setupStatus: r.setupStatus, indexStatus: r.indexStatus, toolCallable: r.toolCallable, nonEmptyPrediction: r.nonEmptyPrediction, officialEvaluatorScoreable: r.officialEvaluatorScoreable, predFiles: r.predFiles.length, selectedBecause: r.selectedBecause, costs: r.costs })) }, null, 2));
-          if (!ready) process.exitCode = 1;
+          console.log(JSON.stringify({ ready: summary.ready, readinessMeaning: summary.readinessMeaning, attemptedRows: summary.attemptedRows, functionalRows: summary.functionalRows, blockerRows: summary.blockerRows, scoreableRows: summary.scoreableRows, nonEmptyPredictionRows: summary.nonEmptyPredictionRows, blockers: summary.blockers, reports: summary.reports.map((r) => ({ taskId: r.taskId, setupStatus: r.setupStatus, indexStatus: r.indexStatus, indexSignal: r.indexSignal, indexError: r.indexError, toolCallable: r.toolCallable, nonEmptyPrediction: r.nonEmptyPrediction, officialEvaluatorScoreable: r.officialEvaluatorScoreable, predFiles: r.predFiles.length, selectedBecause: r.selectedBecause, costs: r.costs })) }, null, 2)); // print a compact summary; each report keeps its status fields and predFiles is reduced to a count
+          if (!summary.ready) process.exitCode = 1; // set a non-zero exit code only when no row was fully ready
           NODE
           node "$ROOT/readiness.mjs"
       - name: Upload readiness artifacts