diff --git a/docs/capabilities.md b/docs/capabilities.md index 6e0eca5..1c399a4 100644 --- a/docs/capabilities.md +++ b/docs/capabilities.md @@ -282,12 +282,19 @@ Notes: Reproducible evaluation is shipped as a CLI entrypoint backed by shared scoring/reporting code. -- **Command:** `npm run eval -- <codebaseA>` (builds first, then runs `scripts/run-eval.mjs`) -- **Shared implementation:** `src/eval/harness.ts` + `src/eval/types.ts` (tests and CLI use the same scoring) -- **Frozen fixtures:** - - `tests/fixtures/eval-angular-spotify.json` (real-world) - - `tests/fixtures/eval-controlled.json` + `tests/fixtures/codebases/eval-controlled/` (offline controlled) -- **Reported metrics:** Top-1 accuracy, Top-3 recall, spec contamination rate, and a gate pass/fail +- **Command:** `npm run eval -- <codebaseA> [codebaseB] --mode retrieval|discovery [--competitor-results <path>]` (builds first, then runs `scripts/run-eval.mjs`) +- **Shared implementation:** `src/eval/harness.ts`, `src/eval/discovery-harness.ts`, and `src/eval/types.ts` +- **Frozen retrieval fixtures:** + - `tests/fixtures/eval-angular-spotify.json` + - `tests/fixtures/eval-controlled.json` + `tests/fixtures/codebases/eval-controlled/` +- **Frozen discovery fixtures:** + - `tests/fixtures/discovery-angular-spotify.json` + - `tests/fixtures/discovery-excalidraw.json` + - `tests/fixtures/discovery-benchmark-protocol.json` +- **Retrieval metrics:** Top-1 accuracy, Top-3 recall, spec contamination rate, and a gate pass/fail +- **Discovery metrics:** usefulness score, payload bytes, estimated tokens, first relevant hit, and best-example usefulness +- **Discovery gate:** discovery mode evaluates the frozen ship gate only when the full public suite and comparator metrics are available; missing comparator evidence is reported as pending, not silently treated as pass/fail +- **Limits:** discovery mode is discovery-only, uses current shipped surfaces only, and does not claim implementation quality; named competitor runs remain a documented 
hybrid/manual lane rather than a built-in automated benchmark ## Limitations diff --git a/scripts/run-eval.mjs b/scripts/run-eval.mjs index d1ed3ca..688b4a2 100644 --- a/scripts/run-eval.mjs +++ b/scripts/run-eval.mjs @@ -11,6 +11,12 @@ import { analyzerRegistry } from '../dist/core/analyzer-registry.js'; import { AngularAnalyzer } from '../dist/analyzers/angular/index.js'; import { GenericAnalyzer } from '../dist/analyzers/generic/index.js'; import { evaluateFixture, formatEvalReport } from '../dist/eval/harness.js'; +import { + combineDiscoverySummaries, + evaluateDiscoveryGate, + evaluateDiscoveryFixture, + formatDiscoveryReport +} from '../dist/eval/discovery-harness.js'; const __dirname = path.dirname(fileURLToPath(import.meta.url)); const projectRoot = path.join(__dirname, '..'); @@ -20,13 +26,34 @@ const packageJson = JSON.parse(readFileSync(packageJsonPath, 'utf-8')); const defaultFixtureA = path.join(projectRoot, 'tests', 'fixtures', 'eval-angular-spotify.json'); const defaultFixtureB = path.join(projectRoot, 'tests', 'fixtures', 'eval-controlled.json'); +const defaultDiscoveryFixtureA = path.join( + projectRoot, + 'tests', + 'fixtures', + 'discovery-angular-spotify.json' +); +const defaultDiscoveryFixtureB = path.join( + projectRoot, + 'tests', + 'fixtures', + 'discovery-excalidraw.json' +); +const defaultDiscoveryProtocol = path.join( + projectRoot, + 'tests', + 'fixtures', + 'discovery-benchmark-protocol.json' +); const usage = [ `Usage: node scripts/run-eval.mjs <codebaseA> [codebaseB] [options]`, ``, `Options:`, + ` --mode=<retrieval|discovery> Select benchmark mode (default: retrieval)`, ` --fixture-a=<path> Override fixture for codebaseA`, ` --fixture-b=<path> Override fixture for codebaseB`, + ` --protocol=<path> Override discovery benchmark protocol`, + ` --competitor-results=<path> JSON file with comparator metrics for discovery gate evaluation`, ` --skip-reindex Skip re-index phase`, ` --no-rerank Disable ambiguity reranker`, ` --no-redact Show full file paths in report`, @@ -87,6 +114,7 @@ async 
function runSingleEvaluation({ label, codebasePath, fixturePath, + mode, skipReindex, noRerank, redactPaths @@ -98,36 +126,81 @@ async function runSingleEvaluation({ console.log(`\n=== Codebase: ${label} ===`); console.log(`Target: ${resolvedCodebase}`); console.log(`Fixture: ${resolvedFixture}`); - console.log( - `Reranker: ${noRerank ? 'DISABLED' : 'enabled (ambiguity-triggered, Xenova/ms-marco-MiniLM-L-6-v2)'}` - ); + console.log(`Mode: ${mode}`); + if (mode === 'retrieval') { + console.log( + `Reranker: ${noRerank ? 'DISABLED' : 'enabled (ambiguity-triggered, Xenova/ms-marco-MiniLM-L-6-v2)'}` + ); + } console.log(`Path output: ${redactPaths ? 'REDACTED' : 'FULL'}`); await maybeReindex(resolvedCodebase, skipReindex); - console.log(`\n--- Phase 2: Running ${fixture.queries.length}-query eval harness ---`); - const searcher = new CodebaseSearcher(resolvedCodebase); - const summary = await evaluateFixture({ - fixture, - searcher, - limit: 5, - searchOptions: { - enableReranker: !noRerank - } - }); + let summary; + let report; - const report = formatEvalReport({ - codebaseLabel: label, - fixturePath: resolvedFixture, - summary, - redactPaths - }); + if (mode === 'discovery') { + console.log(`\n--- Phase 2: Running ${fixture.tasks.length}-task discovery harness ---`); + summary = await evaluateDiscoveryFixture({ + fixture, + rootPath: resolvedCodebase + }); + report = formatDiscoveryReport({ + codebaseLabel: label, + fixturePath: resolvedFixture, + summary + }); + } else { + console.log(`\n--- Phase 2: Running ${fixture.queries.length}-query eval harness ---`); + const searcher = new CodebaseSearcher(resolvedCodebase); + summary = await evaluateFixture({ + fixture, + searcher, + limit: 5, + searchOptions: { + enableReranker: !noRerank + } + }); + + report = formatEvalReport({ + codebaseLabel: label, + fixturePath: resolvedFixture, + summary, + redactPaths + }); + } console.log(report); return summary; } -function printCombinedSummary(summaries) { +function 
printCombinedSummary(summaries, mode) { + if (mode === 'discovery') { + const totalTasks = summaries.reduce((sum, summary) => sum + summary.totalTasks, 0); + const avgUsefulness = + totalTasks > 0 + ? summaries.reduce((sum, summary) => sum + summary.averageUsefulness * summary.totalTasks, 0) / + totalTasks + : 0; + const avgPayload = + totalTasks > 0 + ? summaries.reduce((sum, summary) => sum + summary.averagePayloadBytes * summary.totalTasks, 0) / + totalTasks + : 0; + const avgTokens = + totalTasks > 0 + ? summaries.reduce((sum, summary) => sum + summary.averageEstimatedTokens * summary.totalTasks, 0) / + totalTasks + : 0; + + console.log(`\n=== Combined Discovery Summary ===`); + console.log(`Average usefulness: ${(avgUsefulness * 100).toFixed(0)}%`); + console.log(`Average payload: ${Math.round(avgPayload)} bytes`); + console.log(`Average estimated tokens: ${Math.round(avgTokens)}`); + console.log(`=================================\n`); + return; + } + const total = summaries.reduce((sum, summary) => sum + summary.total, 0); const top1Correct = summaries.reduce((sum, summary) => sum + summary.top1Correct, 0); const top3RecallCount = summaries.reduce((sum, summary) => sum + summary.top3RecallCount, 0); @@ -156,8 +229,11 @@ async function main() { 'skip-reindex': { type: 'boolean', default: false }, 'no-rerank': { type: 'boolean', default: false }, 'no-redact': { type: 'boolean', default: false }, + mode: { type: 'string', default: 'retrieval' }, 'fixture-a': { type: 'string' }, - 'fixture-b': { type: 'string' } + 'fixture-b': { type: 'string' }, + protocol: { type: 'string' }, + 'competitor-results': { type: 'string' } }, allowPositionals: true }); @@ -176,10 +252,26 @@ async function main() { const codebaseA = positionals[0]; const codebaseB = positionals[1]; - const fixtureA = values['fixture-a'] ? path.resolve(values['fixture-a']) : defaultFixtureA; - const fixtureB = values['fixture-b'] ? 
path.resolve(values['fixture-b']) : defaultFixtureB; + const mode = values.mode === 'discovery' ? 'discovery' : 'retrieval'; + const fixtureA = values['fixture-a'] + ? path.resolve(values['fixture-a']) + : mode === 'discovery' + ? defaultDiscoveryFixtureA + : defaultFixtureA; + const fixtureB = values['fixture-b'] + ? path.resolve(values['fixture-b']) + : mode === 'discovery' + ? defaultDiscoveryFixtureB + : defaultFixtureB; + const protocolPath = values.protocol + ? path.resolve(values.protocol) + : defaultDiscoveryProtocol; + const comparatorResultsPath = values['competitor-results'] + ? path.resolve(values['competitor-results']) + : null; const sharedOptions = { + mode, skipReindex: values['skip-reindex'], noRerank: values['no-rerank'], redactPaths: !values['no-redact'] @@ -193,7 +285,6 @@ async function main() { }); const summaries = [summaryA]; - let passesAllGates = summaryA.passesGate; if (codebaseB) { const summaryB = await runSingleEvaluation({ @@ -204,10 +295,30 @@ async function main() { }); summaries.push(summaryB); - passesAllGates = passesAllGates && summaryB.passesGate; - printCombinedSummary(summaries); } + if (mode === 'discovery') { + const combinedSummary = combineDiscoverySummaries(summaries); + const protocol = loadFixture(protocolPath); + const comparatorEvidence = comparatorResultsPath ? loadFixture(comparatorResultsPath) : undefined; + const gate = evaluateDiscoveryGate({ + summary: combinedSummary, + protocol, + comparatorEvidence, + suiteComplete: summaries.length > 1 + }); + combinedSummary.gate = gate; + printCombinedSummary([combinedSummary], mode); + console.log(formatDiscoveryReport({ + codebaseLabel: 'combined-suite', + fixturePath: protocolPath, + summary: combinedSummary + })); + process.exit(gate.status === 'failed' ? 1 : 0); + } + + const passesAllGates = summaries.every((summary) => summary.passesGate); + printCombinedSummary(summaries, mode); process.exit(passesAllGates ? 
0 : 1); } diff --git a/src/eval/discovery-harness.ts b/src/eval/discovery-harness.ts new file mode 100644 index 0000000..8ca2aa9 --- /dev/null +++ b/src/eval/discovery-harness.ts @@ -0,0 +1,479 @@ +import { createProjectState, type ProjectState } from '../project-state.js'; +import { handle as searchCodebaseHandle } from '../tools/search-codebase.js'; +import { handle as getCodebaseMetadataHandle } from '../tools/get-codebase-metadata.js'; +import { handle as getTeamPatternsHandle } from '../tools/get-team-patterns.js'; +import type { SearchResponse, PatternResponse } from '../tools/types.js'; +import type { + DiscoveryBenchmarkProtocol, + DiscoveryComparatorEvidence, + DiscoveryComparatorGateResult, + DiscoveryGateEvaluation, + DiscoveryMetricComparison, + DiscoveryMetricName, + DiscoverySummary as DiscoverySummaryType, + DiscoverySurface, + DiscoverySurfaceResult as DiscoverySurfaceResultType, + DiscoverySurfaceRunner, + DiscoveryTask as DiscoveryTaskType, + DiscoveryTaskResult as DiscoveryTaskResultEval, + EvaluateDiscoveryFixtureParams, + FormatDiscoveryReportParams +} from './types.js'; +import { countUtf8Bytes, estimateTokenCountFromBytes } from './harness.js'; +import { generateCodebaseIntelligence } from '../resources/codebase-intelligence.js'; + +type JsonRecord = Record; + +function normalizeText(value: string): string { + return value.toLowerCase().replace(/\\/g, '/'); +} + +function parseJsonPayload(payload: string): JsonRecord { + const parsed = JSON.parse(payload) as unknown; + if (typeof parsed === 'object' && parsed !== null) { + return parsed as JsonRecord; + } + return {}; +} + +function createToolProject(rootPath: string): ProjectState { + const project = createProjectState(rootPath); + project.indexState.status = 'ready'; + return project; +} + +async function runSearchCodebase( + task: DiscoveryTaskType, + rootPath: string +): Promise { + const project = createToolProject(rootPath); + const response = await searchCodebaseHandle(task.args ?? 
{ query: task.prompt }, { + indexState: project.indexState, + paths: project.paths, + rootPath: project.rootPath, + performIndexing: () => undefined + }); + const payload = response.content?.[0]?.text ?? '{}'; + const parsed = parseJsonPayload(payload) as SearchResponse & JsonRecord; + const results = Array.isArray(parsed.results) ? parsed.results : []; + const topFiles = results + .map((entry) => (typeof entry.file === 'string' ? entry.file.split(':')[0] : '')) + .filter((entry): entry is string => Boolean(entry)); + const preflight = parsed.preflight; + const bestExample = + preflight && typeof preflight === 'object' && preflight !== null && 'bestExample' in preflight + ? ((preflight.bestExample as string | undefined) ?? null) + : null; + + return { + payload, + topFiles, + bestExample + }; +} + +async function runCodebaseMetadata( + _task: DiscoveryTaskType, + rootPath: string +): Promise { + const project = createToolProject(rootPath); + const response = await getCodebaseMetadataHandle( + {}, + { + indexState: project.indexState, + paths: project.paths, + rootPath: project.rootPath, + performIndexing: () => undefined + } + ); + return { payload: response.content?.[0]?.text ?? '{}' }; +} + +async function runTeamPatterns( + task: DiscoveryTaskType, + rootPath: string +): Promise { + const project = createToolProject(rootPath); + const response = await getTeamPatternsHandle(task.args ?? { category: 'all' }, { + indexState: project.indexState, + paths: project.paths, + rootPath: project.rootPath, + performIndexing: () => undefined + }); + const payload = response.content?.[0]?.text ?? '{}'; + const parsed = parseJsonPayload(payload) as PatternResponse & JsonRecord; + const goldenFiles = Array.isArray(parsed.goldenFiles) ? parsed.goldenFiles : []; + return { + payload, + bestExample: + goldenFiles.length > 0 && typeof goldenFiles[0]?.file === 'string' + ? 
goldenFiles[0].file + : null + }; +} + +async function runCodebaseContext( + _task: DiscoveryTaskType, + rootPath: string +): Promise { + const project = createToolProject(rootPath); + return { + payload: await generateCodebaseIntelligence(project) + }; +} + +const DEFAULT_SURFACE_RUNNERS: Record = { + search_codebase: runSearchCodebase, + get_codebase_metadata: runCodebaseMetadata, + get_team_patterns: runTeamPatterns, + 'codebase://context': runCodebaseContext +}; + +function matchPatterns(candidates: string[], patterns: string[] | undefined): number | null { + if (!patterns || patterns.length === 0) return null; + const normalizedPatterns = patterns.map(normalizeText); + for (let index = 0; index < candidates.length; index++) { + const normalizedCandidate = normalizeText(candidates[index]); + if (normalizedPatterns.some((pattern) => normalizedCandidate.includes(pattern))) { + return index + 1; + } + } + return null; +} + +function evaluateDiscoveryTask( + task: DiscoveryTaskType, + result: DiscoverySurfaceResultType +): DiscoveryTaskResultEval { + const normalizedPayload = normalizeText(result.payload); + const matchedSignals = task.expectedSignals.filter((signal) => + normalizedPayload.includes(normalizeText(signal)) + ); + const missingSignals = task.expectedSignals.filter((signal) => !matchedSignals.includes(signal)); + const forbiddenHits = (task.forbiddenSignals ?? []).filter((signal) => + normalizedPayload.includes(normalizeText(signal)) + ); + const payloadBytes = countUtf8Bytes(result.payload); + const estimatedTokens = estimateTokenCountFromBytes(payloadBytes); + const usefulnessDenominator = Math.max(task.expectedSignals.length, 1); + const usefulnessScore = Math.max( + 0, + (matchedSignals.length - forbiddenHits.length) / usefulnessDenominator + ); + const firstRelevantHit = matchPatterns(result.topFiles ?? [], task.expectedFilePatterns); + const bestExampleUseful = + task.expectedBestExamplePatterns && task.expectedBestExamplePatterns.length > 0 + ? 
task.expectedBestExamplePatterns.some((pattern) => + normalizeText(result.bestExample ?? '').includes(normalizeText(pattern)) + ) + : undefined; + + return { + taskId: task.id, + title: task.title, + job: task.job, + surface: task.surface, + usefulnessScore, + matchedSignals, + missingSignals, + forbiddenHits, + payloadBytes, + estimatedTokens, + ...(firstRelevantHit !== null ? { firstRelevantHit } : {}), + ...(typeof bestExampleUseful === 'boolean' ? { bestExampleUseful } : {}) + }; +} + +function summarizeDiscoveryResults(results: DiscoveryTaskResultEval[]): DiscoverySummaryType { + const totalTasks = results.length; + const averageUsefulness = + totalTasks > 0 + ? results.reduce((sum, result) => sum + result.usefulnessScore, 0) / totalTasks + : 0; + const averagePayloadBytes = + totalTasks > 0 ? results.reduce((sum, result) => sum + result.payloadBytes, 0) / totalTasks : 0; + const averageEstimatedTokens = + totalTasks > 0 + ? results.reduce((sum, result) => sum + result.estimatedTokens, 0) / totalTasks + : 0; + const searchResults = results.filter((result) => result.job === 'search'); + const findResults = results.filter((result) => result.job === 'find'); + const mapResults = results.filter((result) => result.job === 'map'); + const searchHits = searchResults + .map((result) => result.firstRelevantHit) + .filter((value): value is number => typeof value === 'number'); + const bestExampleResults = findResults + .map((result) => result.bestExampleUseful) + .filter((value): value is boolean => typeof value === 'boolean'); + + return { + totalTasks, + averageUsefulness, + averagePayloadBytes, + averageEstimatedTokens, + searchTasks: searchResults.length, + findTasks: findResults.length, + mapTasks: mapResults.length, + averageFirstRelevantHit: + searchHits.length > 0 + ? searchHits.reduce((sum, value) => sum + value, 0) / searchHits.length + : null, + bestExampleUsefulnessRate: + bestExampleResults.length > 0 + ? 
bestExampleResults.filter(Boolean).length / bestExampleResults.length + : null, + results + }; +} + +function getDiscoveryMetricValue( + summary: DiscoverySummaryType | DiscoveryComparatorEvidence[string] | undefined, + metric: DiscoveryMetricName +): number | null { + if (!summary) return null; + const value = summary[metric]; + return typeof value === 'number' && Number.isFinite(value) ? value : null; +} + +function compareMetric( + actualValue: number | null, + comparatorValue: number | null, + metric: DiscoveryMetricName +): DiscoveryMetricComparison { + const lowerIsBetter = + metric === 'averagePayloadBytes' || + metric === 'averageEstimatedTokens' || + metric === 'averageFirstRelevantHit'; + const passes = + actualValue !== null && + comparatorValue !== null && + (lowerIsBetter ? actualValue <= comparatorValue : actualValue >= comparatorValue); + + return { + metric, + comparatorValue, + actualValue, + passes + }; +} + +function compareMetricWithinTolerance( + actualValue: number | null, + comparatorValue: number | null, + metric: DiscoveryMetricName, + tolerancePercent: number +): DiscoveryMetricComparison { + const lowerIsBetter = + metric === 'averagePayloadBytes' || + metric === 'averageEstimatedTokens' || + metric === 'averageFirstRelevantHit'; + const multiplier = 1 + tolerancePercent / 100; + const passes = + actualValue !== null && + comparatorValue !== null && + (lowerIsBetter + ? 
actualValue <= comparatorValue * multiplier + : actualValue >= comparatorValue * (1 - tolerancePercent / 100)); + + return { + metric, + comparatorValue, + actualValue, + passes + }; +} + +function evaluateBaselineGate( + summary: DiscoverySummaryType, + protocol: DiscoveryBenchmarkProtocol, + comparatorEvidence: DiscoveryComparatorEvidence | undefined +) { + const baselineConfig = protocol.shipGate.baseline; + const baselineMetrics = comparatorEvidence?.[baselineConfig.comparatorName]; + const comparisons = [ + compareMetric( + getDiscoveryMetricValue(summary, baselineConfig.payloadMetric), + getDiscoveryMetricValue(baselineMetrics, baselineConfig.payloadMetric), + baselineConfig.payloadMetric + ), + ...baselineConfig.usefulnessMetrics.map((metric) => + compareMetric( + getDiscoveryMetricValue(summary, metric), + getDiscoveryMetricValue(baselineMetrics, metric), + metric + ) + ) + ]; + const missingMetrics = comparisons + .filter((comparison) => comparison.comparatorValue === null) + .map((comparison) => comparison.metric); + const payloadMetricPassed = comparisons[0]?.passes ?? false; + const beatenUsefulnessMetrics = comparisons + .slice(1) + .filter((comparison) => comparison.passes) + .map((comparison) => comparison.metric); + + return { + comparatorName: baselineConfig.comparatorName, + status: + missingMetrics.length > 0 + ? 'pending_evidence' + : payloadMetricPassed && beatenUsefulnessMetrics.length > 0 + ? 
'passed' + : 'failed', + payloadMetric: baselineConfig.payloadMetric, + payloadMetricPassed, + beatenUsefulnessMetrics, + missingMetrics, + comparisons + } as const; +} + +function evaluateComparatorGate( + summary: DiscoverySummaryType, + comparatorName: string, + protocol: DiscoveryBenchmarkProtocol, + comparatorEvidence: DiscoveryComparatorEvidence | undefined +): DiscoveryComparatorGateResult { + const tolerancePercent = protocol.shipGate.comparators.tolerancePercent; + const comparatorMetrics = comparatorEvidence?.[comparatorName]; + const comparisons = protocol.shipGate.comparators.usefulnessMetrics.map((metric) => + compareMetricWithinTolerance( + getDiscoveryMetricValue(summary, metric), + getDiscoveryMetricValue(comparatorMetrics, metric), + metric, + tolerancePercent + ) + ); + const missingMetrics = comparisons + .filter((comparison) => comparison.comparatorValue === null) + .map((comparison) => comparison.metric); + + return { + comparatorName, + status: + missingMetrics.length > 0 + ? 'pending_evidence' + : comparisons.every((comparison) => comparison.passes) + ? 
'passed' + : 'failed', + tolerancePercent, + missingMetrics, + comparisons + }; +} + +export function combineDiscoverySummaries(summaries: DiscoverySummaryType[]): DiscoverySummaryType { + return summarizeDiscoveryResults(summaries.flatMap((summary) => summary.results)); +} + +export function evaluateDiscoveryGate({ + summary, + protocol, + comparatorEvidence, + suiteComplete +}: { + summary: DiscoverySummaryType; + protocol: DiscoveryBenchmarkProtocol; + comparatorEvidence?: DiscoveryComparatorEvidence; + suiteComplete: boolean; +}): DiscoveryGateEvaluation { + const baseline = evaluateBaselineGate(summary, protocol, comparatorEvidence); + const comparators = protocol.shipGate.comparators.requiredNames.map((name) => + evaluateComparatorGate(summary, name, protocol, comparatorEvidence) + ); + const missingEvidence: string[] = []; + + if (!suiteComplete) { + missingEvidence.push('fixed public discovery suite is incomplete'); + } + + if (baseline.status === 'pending_evidence') { + missingEvidence.push(`${baseline.comparatorName} baseline metrics missing`); + } + + for (const comparator of comparators) { + if (comparator.status === 'pending_evidence') { + missingEvidence.push(`${comparator.comparatorName} comparator metrics missing`); + } + } + + const status = + missingEvidence.length > 0 + ? 'pending_evidence' + : baseline.status === 'passed' && + comparators.every((comparator) => comparator.status === 'passed') + ? 'passed' + : 'failed'; + + return { + status, + suiteStatus: suiteComplete ? 'complete' : 'incomplete', + baseline, + comparators, + missingEvidence, + claimAllowed: status === 'passed' + }; +} + +export async function evaluateDiscoveryFixture({ + fixture, + rootPath, + surfaceRunners +}: EvaluateDiscoveryFixtureParams): Promise { + const runners: Record = { + ...DEFAULT_SURFACE_RUNNERS, + ...(surfaceRunners ?? 
{}) + }; + const results: DiscoveryTaskResultEval[] = []; + + for (const task of fixture.tasks) { + const runner = runners[task.surface]; + const payload = await runner(task, rootPath); + results.push(evaluateDiscoveryTask(task, payload)); + } + + return summarizeDiscoveryResults(results); +} + +export function formatDiscoveryReport({ + codebaseLabel, + fixturePath, + summary +}: FormatDiscoveryReportParams): string { + const lines: string[] = []; + lines.push(`\n=== Discovery Eval Report: ${codebaseLabel} ===`); + lines.push(`Fixture: ${fixturePath}`); + lines.push(`Tasks: ${summary.totalTasks}`); + lines.push(`Average usefulness: ${(summary.averageUsefulness * 100).toFixed(0)}%`); + lines.push(`Average payload: ${Math.round(summary.averagePayloadBytes)} bytes`); + lines.push(`Average estimated tokens: ${Math.round(summary.averageEstimatedTokens)}`); + lines.push( + `Average first relevant hit: ${ + summary.averageFirstRelevantHit === null ? 'n/a' : summary.averageFirstRelevantHit.toFixed(2) + }` + ); + lines.push( + `Best-example usefulness: ${ + summary.bestExampleUsefulnessRate === null + ? 'n/a' + : `${(summary.bestExampleUsefulnessRate * 100).toFixed(0)}%` + }` + ); + if (summary.gate) { + lines.push(`Gate: ${summary.gate.status.toUpperCase()}`); + lines.push(`Claim allowed: ${summary.gate.claimAllowed ? 
'yes' : 'no'}`); + if (summary.gate.missingEvidence.length > 0) { + lines.push(`Missing evidence: ${summary.gate.missingEvidence.join('; ')}`); + } + } + lines.push(''); + lines.push('Task results:'); + for (const result of summary.results) { + lines.push( + `- ${result.taskId} [${result.job}/${result.surface}] usefulness ${(result.usefulnessScore * 100).toFixed(0)}%, payload ${result.payloadBytes}B/${result.estimatedTokens} tok` + ); + } + lines.push('================================'); + return lines.join('\n'); +} diff --git a/src/eval/harness.ts b/src/eval/harness.ts index 7ef0d7e..dfc5e5e 100644 --- a/src/eval/harness.ts +++ b/src/eval/harness.ts @@ -101,6 +101,14 @@ function hashPath(filePath: string): string { return crypto.createHash('sha1').update(normalizePath(filePath)).digest('hex').slice(0, 8); } +export function countUtf8Bytes(value: string): number { + return Buffer.byteLength(value, 'utf-8'); +} + +export function estimateTokenCountFromBytes(bytes: number): number { + return Math.max(1, Math.ceil(bytes / 4)); +} + function formatPath(filePath: string | null, redactPaths: boolean): string { if (!filePath) { return 'none'; diff --git a/src/eval/types.ts b/src/eval/types.ts index 209f24f..1178cb4 100644 --- a/src/eval/types.ts +++ b/src/eval/types.ts @@ -63,3 +63,182 @@ export interface FormatEvalReportParams { summary: EvalSummary; redactPaths?: boolean; } + +export type DiscoveryJob = 'map' | 'find' | 'search'; + +export type DiscoverySurface = + | 'search_codebase' + | 'get_codebase_metadata' + | 'get_team_patterns' + | 'codebase://context'; + +export interface DiscoveryTask { + id: string; + title: string; + job: DiscoveryJob; + surface: DiscoverySurface; + prompt: string; + args?: Record; + expectedSignals: string[]; + expectedFilePatterns?: string[]; + expectedBestExamplePatterns?: string[]; + forbiddenSignals?: string[]; + notes?: string; +} + +export interface DiscoveryFixture { + description?: string; + codebase?: string; + repository?: 
string; + repositoryUrl?: string; + repositoryRef?: string; + frozenDate?: string; + notes?: string; + tasks: DiscoveryTask[]; +} + +export interface DiscoveryTaskResult { + taskId: string; + title: string; + job: DiscoveryJob; + surface: DiscoverySurface; + usefulnessScore: number; + matchedSignals: string[]; + missingSignals: string[]; + forbiddenHits: string[]; + payloadBytes: number; + estimatedTokens: number; + firstRelevantHit?: number | null; + bestExampleUseful?: boolean; +} + +export interface DiscoverySummary { + totalTasks: number; + averageUsefulness: number; + averagePayloadBytes: number; + averageEstimatedTokens: number; + searchTasks: number; + findTasks: number; + mapTasks: number; + averageFirstRelevantHit: number | null; + bestExampleUsefulnessRate: number | null; + gate?: DiscoveryGateEvaluation; + results: DiscoveryTaskResult[]; +} + +export interface EvaluateDiscoveryFixtureParams { + fixture: DiscoveryFixture; + rootPath: string; + surfaceRunners?: Partial>; +} + +export interface FormatDiscoveryReportParams { + codebaseLabel: string; + fixturePath: string; + summary: DiscoverySummary; +} + +export type DiscoverySurfaceRunner = ( + task: DiscoveryTask, + rootPath: string +) => Promise; + +export interface DiscoverySurfaceResult { + payload: string; + topFiles?: string[]; + bestExample?: string | null; +} + +export type DiscoveryMetricName = + | 'averageUsefulness' + | 'averagePayloadBytes' + | 'averageEstimatedTokens' + | 'averageFirstRelevantHit' + | 'bestExampleUsefulnessRate'; + +export interface DiscoveryComparatorProtocol { + name: string; + kind: 'baseline' | 'mcp-comparator'; + execution: 'direct-tool' | 'manual-log-capture'; + notes?: string; +} + +export interface DiscoveryGateProtocol { + baselineRule: string; + comparatorRule: string; + claimRule: string; + baseline: { + comparatorName: string; + payloadMetric: DiscoveryMetricName; + usefulnessMetrics: DiscoveryMetricName[]; + }; + comparators: { + requiredNames: string[]; + 
tolerancePercent: number; + usefulnessMetrics: DiscoveryMetricName[]; + }; +} + +export interface DiscoveryBenchmarkProtocol { + name: string; + frozenDate: string; + scope: 'discovery-only'; + jobs: DiscoveryJob[]; + allowedSurfaces: DiscoverySurface[]; + forbiddenSurfaces: string[]; + primaryLane: 'direct-tool'; + secondaryLane: 'manual-log-capture'; + comparators: DiscoveryComparatorProtocol[]; + metrics: { + payloadCost: DiscoveryMetricName[]; + usefulness: DiscoveryMetricName[]; + }; + fairnessRules: string[]; + shipGate: DiscoveryGateProtocol; +} + +export interface DiscoveryComparatorMetrics { + averageUsefulness?: number | null; + averagePayloadBytes?: number | null; + averageEstimatedTokens?: number | null; + averageFirstRelevantHit?: number | null; + bestExampleUsefulnessRate?: number | null; +} + +export interface DiscoveryComparatorEvidence { + [comparatorName: string]: DiscoveryComparatorMetrics; +} + +export interface DiscoveryMetricComparison { + metric: DiscoveryMetricName; + comparatorValue: number | null; + actualValue: number | null; + passes: boolean; +} + +export interface DiscoveryBaselineGateResult { + comparatorName: string; + status: 'passed' | 'failed' | 'pending_evidence'; + payloadMetric: DiscoveryMetricName; + payloadMetricPassed: boolean; + beatenUsefulnessMetrics: DiscoveryMetricName[]; + missingMetrics: DiscoveryMetricName[]; + comparisons: DiscoveryMetricComparison[]; +} + +export interface DiscoveryComparatorGateResult { + comparatorName: string; + status: 'passed' | 'failed' | 'pending_evidence'; + tolerancePercent: number; + missingMetrics: DiscoveryMetricName[]; + comparisons: DiscoveryMetricComparison[]; +} + +export interface DiscoveryGateEvaluation { + status: 'passed' | 'failed' | 'pending_evidence'; + suiteStatus: 'complete' | 'incomplete'; + baseline: DiscoveryBaselineGateResult; + comparators: DiscoveryComparatorGateResult[]; + missingEvidence: string[]; + claimAllowed: boolean; +} diff --git a/src/index.ts b/src/index.ts 
index e590f55..9ae677d 100644 --- a/src/index.ts +++ b/src/index.ts @@ -50,6 +50,7 @@ import { getProjectPathFromContextResourceUri, isContextResourceUri } from './resources/uri.js'; +import { generateCodebaseIntelligence } from './resources/codebase-intelligence.js'; import { EXCLUDED_GLOB_PATTERNS } from './constants/codebase-context.js'; import { discoverProjectsWithinRoot, @@ -852,7 +853,7 @@ export function registerHandlers(target: Server): void { { uri: buildProjectContextResourceUri(project.rootPath), mimeType: 'text/plain', - text: await generateCodebaseContext(project) + text: await generateCodebaseIntelligence(project) } ] }; @@ -865,7 +866,9 @@ export function registerHandlers(target: Server): void { { uri: CONTEXT_RESOURCE_URI, mimeType: 'text/plain', - text: project ? await generateCodebaseContext(project) : buildProjectSelectionMessage() + text: project + ? await generateCodebaseIntelligence(project) + : buildProjectSelectionMessage() } ] }; @@ -903,7 +906,7 @@ export function registerHandlers(target: Server): void { type: 'text', text: JSON.stringify({ status: 'indexing', - message: 'Index build in progress — please retry shortly' + message: 'Index build in progress - please retry shortly' }) } ] @@ -930,7 +933,7 @@ export function registerHandlers(target: Server): void { type: 'text', text: JSON.stringify({ status: 'indexing', - message: 'Index rebuild in progress — please retry shortly', + message: 'Index rebuild in progress - please retry shortly', index: indexSignal }) } @@ -1011,7 +1014,7 @@ function buildResources(): Resource[] { return resources; } -async function generateCodebaseContext(project: ProjectState): Promise { +async function _generateCodebaseContext(project: ProjectState): Promise { const intelligencePath = project.paths.intelligence; const index = await ensureValidIndexOrAutoHeal(project); @@ -1033,7 +1036,7 @@ async function generateCodebaseContext(project: ProjectState): Promise { lines.push(''); lines.push( `Index: 
${index.status} (${index.confidence}, ${index.action})${ - index.reason ? ` — ${index.reason}` : '' + index.reason ? ` - ${index.reason}` : '' }` ); lines.push(''); @@ -1571,14 +1574,14 @@ function ensureProjectWatcher(project: ProjectState, debounceMs: number): void { if (!shouldRunNow) { if (process.env.CODEBASE_CONTEXT_DEBUG) { console.error( - `[file-watcher] Index in progress — queueing auto-refresh: ${project.rootPath}` + `[file-watcher] Index in progress - queueing auto-refresh: ${project.rootPath}` ); } return; } if (process.env.CODEBASE_CONTEXT_DEBUG) { console.error( - `[file-watcher] Changes detected — incremental reindex starting: ${project.rootPath}` + `[file-watcher] Changes detected - incremental reindex starting: ${project.rootPath}` ); } void performIndexing(project, true); diff --git a/src/resources/codebase-intelligence.ts b/src/resources/codebase-intelligence.ts new file mode 100644 index 0000000..f1c813e --- /dev/null +++ b/src/resources/codebase-intelligence.ts @@ -0,0 +1,192 @@ +import { promises as fs } from 'fs'; +import path from 'path'; +import type { ProjectState } from '../project-state.js'; +import type { IntelligenceData, PatternsData, PatternCandidate } from '../types/index.js'; +import { + isComplementaryPatternCategory, + shouldSkipLegacyTestingFrameworkCategory +} from '../patterns/semantics.js'; +import { RELATIONSHIPS_FILENAME } from '../constants/codebase-context.js'; + +async function fileExists(filePath: string): Promise<boolean> { + try { + await fs.access(filePath); + return true; + } catch { + return false; + } +} + +async function readIndexSignal(project: ProjectState): Promise<{ + status: 'ready' | 'stale'; + confidence: 'high' | 'low'; + action: 'served'; + reason?: string; +}> { + const hasIntelligence = await fileExists(project.paths.intelligence); + const hasRelationships = await fileExists( + path.join(path.dirname(project.paths.intelligence), RELATIONSHIPS_FILENAME) + ); + + if (!hasIntelligence) { + return { + status: 
'stale', + confidence: 'low', + action: 'served', + reason: 'Intelligence artifact missing' + }; + } + + return { + status: 'ready', + confidence: hasRelationships ? 'high' : 'low', + action: 'served', + ...(hasRelationships ? {} : { reason: 'Relationships artifact missing' }) + }; +} + +export async function generateCodebaseIntelligence(project: ProjectState): Promise<string> { + const intelligencePath = project.paths.intelligence; + const index = await readIndexSignal(project); + + try { + const content = await fs.readFile(intelligencePath, 'utf-8'); + const intelligence = JSON.parse(content) as IntelligenceData; + + const lines: string[] = []; + lines.push('# Codebase Intelligence'); + lines.push(''); + lines.push( + `Index: ${index.status} (${index.confidence}, ${index.action})${ + index.reason ? ` - ${index.reason}` : '' + }` + ); + lines.push(''); + lines.push('WARNING: This is what YOUR codebase actually uses, not generic recommendations.'); + lines.push('These are FACTS from analyzing your code, not best practices from the internet.'); + lines.push(''); + + const libraryEntries = Object.entries(intelligence.libraryUsage || {}) + .map(([lib, data]) => ({ + lib, + count: data.count + })) + .sort((a, b) => b.count - a.count); + + if (libraryEntries.length > 0) { + lines.push('## Libraries Actually Used (Top 15)'); + lines.push(''); + + for (const { lib, count } of libraryEntries.slice(0, 15)) { + lines.push(`- **${lib}** (${count} uses)`); + } + lines.push(''); + } + + if (intelligence.tsconfigPaths && Object.keys(intelligence.tsconfigPaths).length > 0) { + lines.push('## Import Aliases (from tsconfig.json)'); + lines.push(''); + lines.push('These path aliases map to internal project code:'); + for (const [alias, paths] of Object.entries(intelligence.tsconfigPaths)) { + lines.push(`- \`${alias}\` -> ${(paths as string[]).join(', ')}`); + } + lines.push(''); + } + + if (intelligence.patterns && Object.keys(intelligence.patterns).length > 0) { + const patterns: 
PatternsData = intelligence.patterns; + lines.push("## YOUR Codebase's Actual Patterns (Not Generic Best Practices)"); + lines.push(''); + lines.push('These patterns were detected by analyzing your actual code.'); + lines.push('This is what YOUR team does in practice, not what tutorials recommend.'); + lines.push(''); + + for (const [category, data] of Object.entries(patterns)) { + if (shouldSkipLegacyTestingFrameworkCategory(category, patterns)) { + continue; + } + + const primary: PatternCandidate | undefined = data.primary; + const alternatives: PatternCandidate[] = data.alsoDetected ?? []; + + if (!primary) continue; + + if ( + isComplementaryPatternCategory( + category, + [primary.name, ...alternatives.map((alt) => alt.name)].filter(Boolean) + ) + ) { + const secondary = alternatives[0]; + if (secondary) { + const categoryName = category + .replace(/([A-Z])/g, ' $1') + .trim() + .replace(/^./, (str: string) => str.toUpperCase()); + lines.push( + `### ${categoryName}: **${primary.name}** (${primary.frequency}) + **${secondary.name}** (${secondary.frequency})` + ); + lines.push( + ' -> Computed and effect are complementary Signals primitives and are commonly used together.' 
+ ); + lines.push(' -> Treat this as balanced usage, not a hard split decision.'); + lines.push(''); + continue; + } + } + + const percentage = Number.parseInt(primary.frequency, 10); + const categoryName = category + .replace(/([A-Z])/g, ' $1') + .trim() + .replace(/^./, (str: string) => str.toUpperCase()); + + if (percentage === 100) { + lines.push(`### ${categoryName}: **${primary.name}** (${primary.frequency} - unanimous)`); + lines.push(` -> Your codebase is 100% consistent - ALWAYS use ${primary.name}`); + } else if (percentage >= 80) { + lines.push( + `### ${categoryName}: **${primary.name}** (${primary.frequency} - strong consensus)` + ); + lines.push(` -> Your team strongly prefers ${primary.name}`); + if (alternatives.length) { + const alt = alternatives[0]; + lines.push( + ` -> Minority pattern: ${alt.name} (${alt.frequency}) - avoid for new code` + ); + } + } else if (percentage >= 60) { + lines.push(`### ${categoryName}: **${primary.name}** (${primary.frequency} - majority)`); + lines.push(` -> Most code uses ${primary.name}, but not unanimous`); + if (alternatives.length) { + lines.push( + ` -> Also detected: ${alternatives[0].name} (${alternatives[0].frequency})` + ); + } + } else { + lines.push(`### ${categoryName}: WARNING: NO TEAM CONSENSUS`); + lines.push(' Your codebase is split between multiple approaches:'); + lines.push(` - ${primary.name} (${primary.frequency})`); + if (alternatives.length) { + for (const alt of alternatives.slice(0, 2)) { + lines.push(` - ${alt.name} (${alt.frequency})`); + } + } + lines.push(' -> ASK the team which approach to use for new features'); + } + lines.push(''); + } + } + + lines.push('---'); + lines.push(`Generated: ${intelligence.generatedAt || new Date().toISOString()}`); + + return lines.join('\n'); + } catch (error) { + return ( + '# Codebase Intelligence\n\n' + + 'Intelligence data not yet generated. Run indexing first.\n' + + `Error: ${error instanceof Error ? 
error.message : String(error)}` + ); + } +} diff --git a/tests/discovery-harness.test.ts b/tests/discovery-harness.test.ts new file mode 100644 index 0000000..f7fa200 --- /dev/null +++ b/tests/discovery-harness.test.ts @@ -0,0 +1,305 @@ +import { describe, expect, it } from 'vitest'; +import { + combineDiscoverySummaries, + evaluateDiscoveryGate, + evaluateDiscoveryFixture, + formatDiscoveryReport +} from '../src/eval/discovery-harness.js'; +import type { + DiscoveryBenchmarkProtocol, + DiscoveryFixture, + DiscoverySummary, + DiscoverySurfaceResult +} from '../src/eval/types.js'; +import angularDiscoveryFixture from './fixtures/discovery-angular-spotify.json'; +import excalidrawDiscoveryFixture from './fixtures/discovery-excalidraw.json'; +import discoveryProtocol from './fixtures/discovery-benchmark-protocol.json'; + +describe('Discovery benchmark fixtures', () => { + it('keeps angular-spotify discovery fixture frozen at 12 tasks with balanced job coverage', () => { + expect(angularDiscoveryFixture.tasks).toHaveLength(12); + const counts = angularDiscoveryFixture.tasks.reduce<Record<string, number>>((acc, task) => { + acc[task.job] = (acc[task.job] ?? 0) + 1; + return acc; + }, {}); + expect(counts.map).toBe(4); + expect(counts.find).toBe(4); + expect(counts.search).toBe(4); + }); + + it('keeps excalidraw discovery fixture frozen at 12 tasks with balanced job coverage', () => { + expect(excalidrawDiscoveryFixture.tasks).toHaveLength(12); + const counts = excalidrawDiscoveryFixture.tasks.reduce<Record<string, number>>((acc, task) => { + acc[task.job] = (acc[task.job] ?? 
0) + 1; + return acc; + }, {}); + expect(counts.map).toBe(4); + expect(counts.find).toBe(4); + expect(counts.search).toBe(4); + }); + + it('freezes the discovery protocol around current shipped surfaces only', () => { + expect(discoveryProtocol.allowedSurfaces).toEqual([ + 'search_codebase', + 'get_codebase_metadata', + 'get_team_patterns', + 'codebase://context' + ]); + expect(discoveryProtocol.forbiddenSurfaces).toContain('get_codebase_map'); + expect(discoveryProtocol.comparators).toHaveLength(4); + }); + + it('pins both public discovery fixtures to concrete repository refs', () => { + expect(angularDiscoveryFixture.repositoryRef).toMatch(/^[0-9a-f]{40}$/); + expect(excalidrawDiscoveryFixture.repositoryRef).toMatch(/^[0-9a-f]{40}$/); + }); +}); + +describe('Discovery harness scoring', () => { + it('scores expected signals, first relevant hit, and best-example usefulness deterministically', async () => { + const fixture: DiscoveryFixture = { + description: 'unit discovery fixture', + tasks: [ + { + id: 'map-1', + title: 'Map task', + job: 'map', + surface: 'codebase://context', + prompt: 'map', + expectedSignals: ['libraries actually used', 'patterns'] + }, + { + id: 'find-1', + title: 'Find task', + job: 'find', + surface: 'get_team_patterns', + prompt: 'find', + expectedSignals: ['dependencyInjection'], + expectedBestExamplePatterns: ['src/auth/auth.interceptor.ts'] + }, + { + id: 'search-1', + title: 'Search task', + job: 'search', + surface: 'search_codebase', + prompt: 'search', + expectedSignals: ['results', 'searchQuality'], + expectedFilePatterns: ['auth.interceptor.ts'] + } + ] + }; + + const summary = await evaluateDiscoveryFixture({ + fixture, + rootPath: 'C:/repo', + surfaceRunners: { + 'codebase://context': async () => ({ + payload: '# Codebase Intelligence\n\n## Libraries Actually Used\n\n## Patterns' + }), + get_team_patterns: async () => ({ + payload: '{"patterns":{"dependencyInjection":{"primary":{"name":"inject()","frequency":"90%"}}}}', + 
bestExample: 'src/auth/auth.interceptor.ts' + }), + search_codebase: async () => ({ + payload: '{"status":"success","searchQuality":{"status":"ok"},"results":[{"file":"src/auth/auth.interceptor.ts:1-10"}]}', + topFiles: ['src/auth/auth.interceptor.ts'] + }) + } + }); + + expect(summary.totalTasks).toBe(3); + expect(summary.averageUsefulness).toBeCloseTo(1, 4); + expect(summary.averageFirstRelevantHit).toBe(1); + expect(summary.bestExampleUsefulnessRate).toBe(1); + expect(summary.results[2]?.firstRelevantHit).toBe(1); + }); + + it('formats a compact discovery report', async () => { + const fixture: DiscoveryFixture = { + tasks: [ + { + id: 'search-1', + title: 'Search task', + job: 'search', + surface: 'search_codebase', + prompt: 'search', + expectedSignals: ['results'], + expectedFilePatterns: ['player-api.ts'] + } + ] + }; + + const summary = await evaluateDiscoveryFixture({ + fixture, + rootPath: 'C:/repo', + surfaceRunners: { + search_codebase: async (): Promise<DiscoverySurfaceResult> => ({ + payload: '{"results":[{"file":"src/player-api.ts:1-4"}]}', + topFiles: ['src/player-api.ts'] + }) + } + }); + + const report = formatDiscoveryReport({ + codebaseLabel: 'fixture-repo', + fixturePath: 'tests/fixtures/discovery-angular-spotify.json', + summary + }); + + expect(report).toContain('Discovery Eval Report'); + expect(report).toContain('Average usefulness'); + expect(report).toContain('search-1'); + }); +}); + +describe('Discovery gate evaluation', () => { + const protocol = discoveryProtocol as DiscoveryBenchmarkProtocol; + + function createSummary( + overrides: Partial<DiscoverySummary> = {} + ): DiscoverySummary { + return { + totalTasks: 24, + averageUsefulness: 0.9, + averagePayloadBytes: 1200, + averageEstimatedTokens: 300, + searchTasks: 8, + findTasks: 8, + mapTasks: 8, + averageFirstRelevantHit: 1.2, + bestExampleUsefulnessRate: 0.9, + results: [], + ...overrides + }; + } + + it('combines multiple discovery summaries before gate evaluation', () => { + const combined = combineDiscoverySummaries([ 
+ createSummary({ + results: [ + { + taskId: 'one', + title: 'one', + job: 'map', + surface: 'codebase://context', + usefulnessScore: 0.8, + matchedSignals: [], + missingSignals: [], + forbiddenHits: [], + payloadBytes: 100, + estimatedTokens: 25 + } + ] + }), + createSummary({ + results: [ + { + taskId: 'two', + title: 'two', + job: 'search', + surface: 'search_codebase', + usefulnessScore: 1, + matchedSignals: [], + missingSignals: [], + forbiddenHits: [], + payloadBytes: 80, + estimatedTokens: 20, + firstRelevantHit: 1 + } + ] + }) + ]); + + expect(combined.totalTasks).toBe(2); + expect(combined.averageEstimatedTokens).toBe(22.5); + expect(combined.averageFirstRelevantHit).toBe(1); + }); + + it('marks the gate pending when required comparator evidence is missing', () => { + const summary = createSummary(); + const gate = evaluateDiscoveryGate({ + summary, + protocol, + suiteComplete: false + }); + + expect(gate.status).toBe('pending_evidence'); + expect(gate.claimAllowed).toBe(false); + expect(gate.missingEvidence).toContain('fixed public discovery suite is incomplete'); + }); + + it('passes the gate when baseline and comparator metrics satisfy the frozen rules', () => { + const summary = createSummary(); + const gate = evaluateDiscoveryGate({ + summary, + protocol, + suiteComplete: true, + comparatorEvidence: { + 'raw Claude Code': { + averageEstimatedTokens: 450, + averageUsefulness: 0.75, + averageFirstRelevantHit: 1.5, + bestExampleUsefulnessRate: 0.8 + }, + GrepAI: { + averageUsefulness: 0.92, + averageFirstRelevantHit: 1.1, + bestExampleUsefulnessRate: 0.95 + }, + jCodeMunch: { + averageUsefulness: 0.98, + averageFirstRelevantHit: 1.25, + bestExampleUsefulnessRate: 0.98 + }, + 'codebase-memory-mcp': { + averageUsefulness: 0.93, + averageFirstRelevantHit: 1.3, + bestExampleUsefulnessRate: 0.96 + } + } + }); + + expect(gate.status).toBe('passed'); + expect(gate.baseline.payloadMetricPassed).toBe(true); + 
expect(gate.baseline.beatenUsefulnessMetrics.length).toBeGreaterThan(0); + expect(gate.comparators.every((comparator) => comparator.status === 'passed')).toBe(true); + }); + + it('fails the gate when usefulness falls outside the frozen 15% comparator tolerance', () => { + const summary = createSummary({ + averageUsefulness: 0.6, + bestExampleUsefulnessRate: 0.6 + }); + const gate = evaluateDiscoveryGate({ + summary, + protocol, + suiteComplete: true, + comparatorEvidence: { + 'raw Claude Code': { + averageEstimatedTokens: 450, + averageUsefulness: 0.55, + averageFirstRelevantHit: 1.6, + bestExampleUsefulnessRate: 0.55 + }, + GrepAI: { + averageUsefulness: 0.9, + averageFirstRelevantHit: 1.0, + bestExampleUsefulnessRate: 0.9 + }, + jCodeMunch: { + averageUsefulness: 0.91, + averageFirstRelevantHit: 1.0, + bestExampleUsefulnessRate: 0.91 + }, + 'codebase-memory-mcp': { + averageUsefulness: 0.92, + averageFirstRelevantHit: 1.0, + bestExampleUsefulnessRate: 0.92 + } + } + }); + + expect(gate.status).toBe('failed'); + expect(gate.comparators.some((comparator) => comparator.status === 'failed')).toBe(true); + }); +}); diff --git a/tests/eval-harness.test.ts b/tests/eval-harness.test.ts index 14efe40..9d483bb 100644 --- a/tests/eval-harness.test.ts +++ b/tests/eval-harness.test.ts @@ -2,7 +2,13 @@ import { describe, expect, it, vi } from 'vitest'; import { CodebaseSearcher } from '../src/core/search.js'; import type { CodeChunk, SearchResult } from '../src/types/index.js'; import type { EvalFixture, EvalQuery } from '../src/eval/types.js'; -import { evaluateFixture, summarizeEvaluation, formatEvalReport } from '../src/eval/harness.js'; +import { + countUtf8Bytes, + estimateTokenCountFromBytes, + evaluateFixture, + summarizeEvaluation, + formatEvalReport +} from '../src/eval/harness.js'; import angularFixture from './fixtures/eval-angular-spotify.json'; import controlledFixture from './fixtures/eval-controlled.json'; @@ -82,6 +88,12 @@ describe('Eval Harness - fixtures 
loaded', () => { }); describe('Eval Harness - scoring logic', () => { + it('estimates payload cost with a fixed bytes-to-token heuristic', () => { + const bytes = countUtf8Bytes('auth interceptor'); + expect(bytes).toBeGreaterThan(0); + expect(estimateTokenCountFromBytes(bytes)).toBe(Math.ceil(bytes / 4)); + }); + it('marks correct top-1 when implementation file is first', async () => { const query: EvalQuery = { id: 7, diff --git a/tests/fixtures/README.md b/tests/fixtures/README.md index e69ae67..18d954c 100644 --- a/tests/fixtures/README.md +++ b/tests/fixtures/README.md @@ -1,11 +1,14 @@ # Evaluation Fixtures -This directory contains frozen evaluation sets for testing code search quality. +This directory contains frozen evaluation sets for testing retrieval and discovery quality. ## Files -- `eval-angular-spotify.json` - 20 semantic queries against [angular-spotify](https://github.com/trungk18/angular-spotify) (public, reproducible) -- `eval-controlled.json` - 20 frozen queries for the in-repo controlled fixture codebase +- `eval-angular-spotify.json` - 20 semantic retrieval queries against [angular-spotify](https://github.com/trungk18/angular-spotify) +- `eval-controlled.json` - 20 frozen retrieval queries for the in-repo controlled fixture codebase +- `discovery-angular-spotify.json` - 12 discovery tasks for `angular-spotify` +- `discovery-excalidraw.json` - 12 discovery tasks for `Excalidraw` +- `discovery-benchmark-protocol.json` - frozen scope, comparator set, fairness rules, and ship gate for the discovery benchmark ## Running Evaluations @@ -24,18 +27,30 @@ npm install npm run build ``` -### Run Evaluation +### Run Retrieval Evaluation ```bash -node scripts/run-eval.mjs /path/to/angular-spotify --fixture tests/fixtures/eval-angular-spotify.json +node scripts/run-eval.mjs /path/to/angular-spotify --mode retrieval --fixture-a tests/fixtures/eval-angular-spotify.json # Controlled fixture example (no network) -node scripts/run-eval.mjs 
tests/fixtures/codebases/eval-controlled --fixture tests/fixtures/eval-controlled.json +node scripts/run-eval.mjs tests/fixtures/codebases/eval-controlled --mode retrieval --fixture-a tests/fixtures/eval-controlled.json +``` + +### Run Discovery Evaluation + +```bash +node scripts/run-eval.mjs /path/to/angular-spotify /path/to/excalidraw --mode discovery +``` + +Optional comparator evidence file: + +```bash +node scripts/run-eval.mjs /path/to/angular-spotify /path/to/excalidraw --mode discovery --competitor-results /path/to/discovery-comparator-results.json ``` ### Output Format -The eval script outputs: +The retrieval harness outputs: - **Top-1 Accuracy**: % of queries where the best result matches expected patterns - **Top-3 Recall**: % of queries where top-3 results include a match @@ -43,15 +58,29 @@ The eval script outputs: - **Per-category breakdown**: Accuracy by query type (exact-name, conceptual, multi-concept, structural) - **Failure analysis**: Which queries failed and why +The discovery harness outputs: + +- **Average usefulness**: expected-signal match rate with forbidden-signal penalties +- **Average payload**: UTF-8 bytes returned by the current shipped surface +- **Average estimated tokens**: fixed bytes-to-token heuristic for fair comparison +- **Average first relevant hit**: position of the first relevant file for search tasks +- **Best-example usefulness**: whether find tasks surfaced the expected exemplar + ## Evaluation Integrity Rules -⚠️ **CRITICAL**: These eval fixtures are FROZEN. Once committed: +⚠️ **CRITICAL**: These fixtures are FROZEN. Once committed: 1. **DO NOT** adjust expected results to match system output 2. **DO NOT** add queries during development to "improve" scores 3. **DO NOT** remove "hard" queries that the system fails 4. **DO NOT** tune the system on this eval set then report scores +For discovery specifically: + +5. **DO NOT** benchmark an unreleased `map` command or a new MCP map tool +6. 
**DO NOT** claim implementation quality from this benchmark +7. **DO** keep comparator setup limitations explicit when a lane requires manual log capture + ### Proper Usage ✅ **CORRECT**: @@ -124,18 +153,112 @@ Example: To reproduce published results: -1. Clone the exact codebase version: +1. Clone the exact codebase versions: ```bash -git clone https://github.com/trungk18/angular-spotify -cd angular-spotify -git checkout +git clone https://github.com/trungk18/angular-spotify /path/to/angular-spotify +git -C /path/to/angular-spotify checkout ff9efa765c53cfde78c9a172c62d515ae8ef9fe0 + +git clone https://github.com/excalidraw/excalidraw /path/to/excalidraw +git -C /path/to/excalidraw checkout e18c1dd213000dde0ae94ef7eb00aab537b39708 ``` 2. Use the frozen eval fixture (committed before measurements) -3. Run eval on both baseline and new version +3. Run eval on both pinned repos 4. Compare metrics transparently +## Discovery Benchmark Scope + +Phase 5 freezes discovery around three jobs only: + +1. **Map** - repo orientation and subsystem awareness +2. **Find** - dominant local pattern and best-example discovery +3. 
**Search** - targeted file and symbol discovery with low noise + +Allowed current-surface lane: + +- `search_codebase` +- `get_codebase_metadata` +- `get_team_patterns` +- `codebase://context` + +Explicitly out of bounds: + +- unreleased `map` CLI behavior +- a new MCP `get_codebase_map` tool +- implementation-quality or code-generation claims + +## Comparator Notes + +- `raw Claude Code` is the primary baseline and uses a manual log-capture lane +- `GrepAI`, `jCodeMunch`, and `codebase-memory-mcp` are the named MCP comparators +- `codebase-memory-mcp` is the heavier structural comparator, not the primary public baseline +- If a comparator cannot be run fairly via direct tool calls, document the public setup and use the manual lane rather than inventing a fake automation path + +## Comparator Setup Commands + +These commands document the public setup path for the named comparator set. They do not convert the benchmark into a built-in automation path; the named comparators still run through the documented manual log-capture lane. + +### raw Claude Code + +Public install path: + +```bash +npm install -g @anthropic-ai/claude-code +cd /path/to/angular-spotify +claude +``` + +Use the same Claude Code version, model, and base instructions across all baseline captures. 
+ +### GrepAI + +Public install path from the project README: + +```bash +curl -sSL https://raw.githubusercontent.com/yoanbernabeu/grepai/main/install.sh | sh +ollama pull nomic-embed-text +cd /path/to/angular-spotify +grepai init +grepai watch +``` + +Windows PowerShell install path: + +```powershell +irm https://raw.githubusercontent.com/yoanbernabeu/grepai/main/install.ps1 | iex +``` + +### jCodeMunch + +Public install path from PyPI: + +```bash +pip install jcodemunch-mcp +claude mcp add jcodemunch uvx jcodemunch-mcp +cd /path/to/angular-spotify +claude +``` + +Use the same MCP-enabled Claude Code session style for the benchmark capture and let jCodeMunch index the project through its documented first-run flow. + +### codebase-memory-mcp + +Public install path from the project README: + +```bash +curl -fsSL https://raw.githubusercontent.com/DeusData/codebase-memory-mcp/main/install.sh | bash +cd /path/to/angular-spotify +``` + +Windows PowerShell install path: + +```powershell +powershell -ExecutionPolicy ByPass -c "irm https://raw.githubusercontent.com/DeusData/codebase-memory-mcp/main/install.ps1 | iex" +``` + +After installation, restart the coding agent and use the documented prompt to index the project before running the manual benchmark capture. 
+ ## Adding New Eval Sets When creating new eval sets: diff --git a/tests/fixtures/discovery-angular-spotify.json b/tests/fixtures/discovery-angular-spotify.json new file mode 100644 index 0000000..709e38b --- /dev/null +++ b/tests/fixtures/discovery-angular-spotify.json @@ -0,0 +1,123 @@ +{ + "description": "Frozen discovery benchmark tasks for angular-spotify using current shipped surfaces only.", + "codebase": "angular-spotify", + "repository": "trungk18/angular-spotify", + "repositoryUrl": "https://github.com/trungk18/angular-spotify", + "repositoryRef": "ff9efa765c53cfde78c9a172c62d515ae8ef9fe0", + "frozenDate": "2026-04-04", + "notes": "Tasks are discovery-only and intentionally benchmark the current resource and tool surface, not future Phase 7 or 8 behavior.", + "tasks": [ + { + "id": "as-map-01", + "title": "Identify the main app areas from the current map surface", + "job": "map", + "surface": "codebase://context", + "prompt": "What are the main app areas in this repo?", + "expectedSignals": ["libraries actually used", "patterns", "generated:"] + }, + { + "id": "as-map-02", + "title": "Orient to the framework and architecture", + "job": "map", + "surface": "get_codebase_metadata", + "prompt": "What framework and architecture shape does this repo use?", + "expectedSignals": ["framework", "architecture", "statistics"] + }, + { + "id": "as-map-03", + "title": "Find store-heavy subsystems in the repo overview", + "job": "map", + "surface": "codebase://context", + "prompt": "Which subsystems look state-heavy?", + "expectedSignals": ["state", "patterns", "libraries actually used"] + }, + { + "id": "as-map-04", + "title": "See whether internal path aliases exist", + "job": "map", + "surface": "codebase://context", + "prompt": "Does this repo expose import aliases?", + "expectedSignals": ["import aliases", "tsconfig"] + }, + { + "id": "as-find-01", + "title": "Find the dominant dependency injection pattern", + "job": "find", + "surface": "get_team_patterns", + 
"prompt": "What DI pattern dominates here?", + "args": { "category": "di" }, + "expectedSignals": ["dependencyInjection"], + "expectedBestExamplePatterns": ["src", "service"] + }, + { + "id": "as-find-02", + "title": "Find the dominant state-management pattern", + "job": "find", + "surface": "get_team_patterns", + "prompt": "What state-management pattern should an agent imitate?", + "args": { "category": "state" }, + "expectedSignals": ["stateManagement"], + "expectedBestExamplePatterns": ["store", "state"] + }, + { + "id": "as-find-03", + "title": "Find a best local example before editing auth behavior", + "job": "find", + "surface": "search_codebase", + "prompt": "What should I imitate for auth-related request handling?", + "args": { "query": "auth interceptor request handling", "intent": "edit", "limit": 5 }, + "expectedSignals": ["preflight", "bestExample", "patterns"], + "expectedBestExamplePatterns": ["auth", "interceptor"] + }, + { + "id": "as-find-04", + "title": "Find testing conventions and likely examples", + "job": "find", + "surface": "get_team_patterns", + "prompt": "What testing setup dominates here?", + "args": { "category": "testing" }, + "expectedSignals": ["unitTestFramework", "test"], + "expectedBestExamplePatterns": ["spec", "test"] + }, + { + "id": "as-search-01", + "title": "Target the playback API entrypoint", + "job": "search", + "surface": "search_codebase", + "prompt": "skip to next song", + "args": { "query": "skip to next song", "limit": 5 }, + "expectedSignals": ["results", "searchQuality"], + "expectedFilePatterns": ["player-api", "player/api"] + }, + { + "id": "as-search-02", + "title": "Find persistence wiring", + "job": "search", + "surface": "search_codebase", + "prompt": "persist data across browser sessions", + "args": { "query": "persist data across browser sessions", "limit": 5 }, + "expectedSignals": ["results", "searchQuality"], + "expectedFilePatterns": ["storage", "local-storage"] + }, + { + "id": "as-search-03", + 
"title": "Find where auth headers are added", + "job": "search", + "surface": "search_codebase", + "prompt": "add authorization token to API requests", + "args": { "query": "add authorization token to API requests", "limit": 5 }, + "expectedSignals": ["results", "searchQuality"], + "expectedFilePatterns": ["auth", "interceptor"] + }, + { + "id": "as-search-04", + "title": "Find album selectors or equivalent state wiring", + "job": "search", + "surface": "search_codebase", + "prompt": "album selectors ngrx", + "args": { "query": "album selectors ngrx", "limit": 5 }, + "expectedSignals": ["results", "searchQuality"], + "expectedFilePatterns": ["selector", "store", "album"] + } + ] +} diff --git a/tests/fixtures/discovery-benchmark-protocol.json b/tests/fixtures/discovery-benchmark-protocol.json new file mode 100644 index 0000000..82e7378 --- /dev/null +++ b/tests/fixtures/discovery-benchmark-protocol.json @@ -0,0 +1,74 @@ +{ + "name": "v2-discovery-benchmark", + "frozenDate": "2026-04-04", + "scope": "discovery-only", + "jobs": ["map", "find", "search"], + "allowedSurfaces": [ + "search_codebase", + "get_codebase_metadata", + "get_team_patterns", + "codebase://context" + ], + "forbiddenSurfaces": ["map", "get_codebase_map"], + "primaryLane": "direct-tool", + "secondaryLane": "manual-log-capture", + "comparators": [ + { + "name": "raw Claude Code", + "kind": "baseline", + "execution": "manual-log-capture", + "notes": "Use native grep/glob/read exploration only. Record payload cost and usefulness from captured session logs." + }, + { + "name": "GrepAI", + "kind": "mcp-comparator", + "execution": "manual-log-capture", + "notes": "Run from public install/setup commands only. If an equivalent direct-tool run is unavailable, keep it in the manual lane." + }, + { + "name": "jCodeMunch", + "kind": "mcp-comparator", + "execution": "manual-log-capture", + "notes": "Run from public install/setup commands only. Preserve its documented init and policy flow when collecting logs." 
+ }, + { + "name": "codebase-memory-mcp", + "kind": "mcp-comparator", + "execution": "manual-log-capture", + "notes": "Treat as the heavier structural comparator, not the primary public baseline." + } + ], + "metrics": { + "payloadCost": ["averagePayloadBytes", "averageEstimatedTokens"], + "usefulness": ["averageUsefulness", "averageFirstRelevantHit", "bestExampleUsefulnessRate"] + }, + "fairnessRules": [ + "Use only current shipped codebase-context surfaces in the direct-tool lane.", + "Do not benchmark unreleased map functionality.", + "Do not change task wording or expected signals after product-shaping implementation starts.", + "Do not claim implementation-quality or code-generation quality from this benchmark." + ], + "shipGate": { + "baselineRule": "Beat raw Claude Code on payload cost and at least one usefulness metric across the frozen public tasks.", + "comparatorRule": "Stay within 15% of GrepAI, jCodeMunch, and codebase-memory-mcp on frozen usefulness metrics.", + "claimRule": "If the gate is missed on any slice, report the loss and do not broaden relaunch claims.", + "baseline": { + "comparatorName": "raw Claude Code", + "payloadMetric": "averageEstimatedTokens", + "usefulnessMetrics": [ + "averageUsefulness", + "averageFirstRelevantHit", + "bestExampleUsefulnessRate" + ] + }, + "comparators": { + "requiredNames": ["GrepAI", "jCodeMunch", "codebase-memory-mcp"], + "tolerancePercent": 15, + "usefulnessMetrics": [ + "averageUsefulness", + "averageFirstRelevantHit", + "bestExampleUsefulnessRate" + ] + } + } +} diff --git a/tests/fixtures/discovery-excalidraw.json b/tests/fixtures/discovery-excalidraw.json new file mode 100644 index 0000000..4e1e223 --- /dev/null +++ b/tests/fixtures/discovery-excalidraw.json @@ -0,0 +1,123 @@ +{ + "description": "Frozen discovery benchmark tasks for Excalidraw using current shipped surfaces only.", + "codebase": "Excalidraw", + "repository": "excalidraw/excalidraw", + "repositoryUrl": 
"https://github.com/excalidraw/excalidraw", + "repositoryRef": "e18c1dd213000dde0ae94ef7eb00aab537b39708", + "frozenDate": "2026-04-04", + "notes": "Tasks stay broad enough to benchmark discovery honestly against the current product surface without assuming a future compact map command.", + "tasks": [ + { + "id": "ex-map-01", + "title": "Orient to the main subsystems", + "job": "map", + "surface": "get_codebase_metadata", + "prompt": "What are the main app areas in Excalidraw?", + "expectedSignals": ["framework", "architecture", "statistics"] + }, + { + "id": "ex-map-02", + "title": "Use the existing context resource as a repo overview", + "job": "map", + "surface": "codebase://context", + "prompt": "Give me the current codebase intelligence overview.", + "expectedSignals": ["codebase intelligence", "libraries actually used", "patterns"] + }, + { + "id": "ex-map-03", + "title": "Detect whether the repo uses internal path aliases", + "job": "map", + "surface": "codebase://context", + "prompt": "Does Excalidraw expose import aliases?", + "expectedSignals": ["import aliases", "tsconfig"] + }, + { + "id": "ex-map-04", + "title": "Identify likely scene and element-heavy areas", + "job": "map", + "surface": "codebase://context", + "prompt": "Which areas look scene or element heavy?", + "expectedSignals": ["patterns", "libraries actually used", "generated:"] + }, + { + "id": "ex-find-01", + "title": "Find the dominant state-management pattern", + "job": "find", + "surface": "get_team_patterns", + "prompt": "What state-management pattern dominates here?", + "args": { "category": "state" }, + "expectedSignals": ["stateManagement"], + "expectedBestExamplePatterns": ["appstate", "state", "store"] + }, + { + "id": "ex-find-02", + "title": "Find a best example before editing scene behavior", + "job": "find", + "surface": "search_codebase", + "prompt": "What local example should I imitate for scene updates?", + "args": { "query": "scene update flow", "intent": "edit", "limit": 5 
}, + "expectedSignals": ["preflight", "bestExample", "patterns"], + "expectedBestExamplePatterns": ["scene", "app", "element"] + }, + { + "id": "ex-find-03", + "title": "Find testing conventions for UI-heavy code", + "job": "find", + "surface": "get_team_patterns", + "prompt": "What testing setup dominates here?", + "args": { "category": "testing" }, + "expectedSignals": ["test", "framework"], + "expectedBestExamplePatterns": ["test", "spec"] + }, + { + "id": "ex-find-04", + "title": "Find the dominant dependency pattern", + "job": "find", + "surface": "get_team_patterns", + "prompt": "What dependency injection or dependency pattern dominates here?", + "args": { "category": "di" }, + "expectedSignals": ["dependencyInjection"], + "expectedBestExamplePatterns": ["src", "app", "component"] + }, + { + "id": "ex-search-01", + "title": "Find where element types or definitions live", + "job": "search", + "surface": "search_codebase", + "prompt": "element type definitions", + "args": { "query": "element type definitions", "limit": 5 }, + "expectedSignals": ["results", "searchQuality"], + "expectedFilePatterns": ["element", "type"] + }, + { + "id": "ex-search-02", + "title": "Find scene serialization or export logic", + "job": "search", + "surface": "search_codebase", + "prompt": "scene serialization export json", + "args": { "query": "scene serialization export json", "limit": 5 }, + "expectedSignals": ["results", "searchQuality"], + "expectedFilePatterns": ["scene", "json", "data"] + }, + { + "id": "ex-search-03", + "title": "Find app state wiring", + "job": "search", + "surface": "search_codebase", + "prompt": "app state selection and updates", + "args": { "query": "app state selection and updates", "limit": 5 }, + "expectedSignals": ["results", "searchQuality"], + "expectedFilePatterns": ["appstate", "state", "app"] + }, + { + "id": "ex-search-04", + "title": "Find the main canvas or app entry surface", + "job": "search", + "surface": "search_codebase", + "prompt": 
"main canvas app entry", + "args": { "query": "main canvas app entry", "limit": 5 }, + "expectedSignals": ["results", "searchQuality"], + "expectedFilePatterns": ["app", "excalidraw", "canvas"] + } + ] +}