diff --git a/docs/capabilities.md b/docs/capabilities.md index 6e0eca5..1c399a4 100644 --- a/docs/capabilities.md +++ b/docs/capabilities.md @@ -282,12 +282,19 @@ Notes: Reproducible evaluation is shipped as a CLI entrypoint backed by shared scoring/reporting code. -- **Command:** `npm run eval -- <codebaseA>` (builds first, then runs `scripts/run-eval.mjs`) -- **Shared implementation:** `src/eval/harness.ts` + `src/eval/types.ts` (tests and CLI use the same scoring) -- **Frozen fixtures:** - - `tests/fixtures/eval-angular-spotify.json` (real-world) - - `tests/fixtures/eval-controlled.json` + `tests/fixtures/codebases/eval-controlled/` (offline controlled) -- **Reported metrics:** Top-1 accuracy, Top-3 recall, spec contamination rate, and a gate pass/fail +- **Command:** `npm run eval -- <codebaseA> [codebaseB] --mode retrieval|discovery [--competitor-results <path>]` (builds first, then runs `scripts/run-eval.mjs`) +- **Shared implementation:** `src/eval/harness.ts`, `src/eval/discovery-harness.ts`, and `src/eval/types.ts` +- **Frozen retrieval fixtures:** + - `tests/fixtures/eval-angular-spotify.json` + - `tests/fixtures/eval-controlled.json` + `tests/fixtures/codebases/eval-controlled/` +- **Frozen discovery fixtures:** + - `tests/fixtures/discovery-angular-spotify.json` + - `tests/fixtures/discovery-excalidraw.json` + - `tests/fixtures/discovery-benchmark-protocol.json` +- **Retrieval metrics:** Top-1 accuracy, Top-3 recall, spec contamination rate, and a gate pass/fail +- **Discovery metrics:** usefulness score, payload bytes, estimated tokens, first relevant hit, and best-example usefulness +- **Discovery gate:** discovery mode evaluates the frozen ship gate only when the full public suite and comparator metrics are available; missing comparator evidence is reported as pending, not silently treated as pass/fail +- **Limits:** discovery mode is discovery-only, uses current shipped surfaces only, and does not claim implementation quality; named competitor runs remain a documented 
hybrid/manual lane rather than a built-in automated benchmark ## Limitations diff --git a/scripts/run-eval.mjs b/scripts/run-eval.mjs index d1ed3ca..688b4a2 100644 --- a/scripts/run-eval.mjs +++ b/scripts/run-eval.mjs @@ -11,6 +11,12 @@ import { analyzerRegistry } from '../dist/core/analyzer-registry.js'; import { AngularAnalyzer } from '../dist/analyzers/angular/index.js'; import { GenericAnalyzer } from '../dist/analyzers/generic/index.js'; import { evaluateFixture, formatEvalReport } from '../dist/eval/harness.js'; +import { + combineDiscoverySummaries, + evaluateDiscoveryGate, + evaluateDiscoveryFixture, + formatDiscoveryReport +} from '../dist/eval/discovery-harness.js'; const __dirname = path.dirname(fileURLToPath(import.meta.url)); const projectRoot = path.join(__dirname, '..'); @@ -20,13 +26,34 @@ const packageJson = JSON.parse(readFileSync(packageJsonPath, 'utf-8')); const defaultFixtureA = path.join(projectRoot, 'tests', 'fixtures', 'eval-angular-spotify.json'); const defaultFixtureB = path.join(projectRoot, 'tests', 'fixtures', 'eval-controlled.json'); +const defaultDiscoveryFixtureA = path.join( + projectRoot, + 'tests', + 'fixtures', + 'discovery-angular-spotify.json' +); +const defaultDiscoveryFixtureB = path.join( + projectRoot, + 'tests', + 'fixtures', + 'discovery-excalidraw.json' +); +const defaultDiscoveryProtocol = path.join( + projectRoot, + 'tests', + 'fixtures', + 'discovery-benchmark-protocol.json' +); const usage = [ `Usage: node scripts/run-eval.mjs <codebaseA> [codebaseB] [options]`, ``, `Options:`, + ` --mode=<retrieval|discovery> Select benchmark mode (default: retrieval)`, ` --fixture-a=<path> Override fixture for codebaseA`, ` --fixture-b=<path> Override fixture for codebaseB`, + ` --protocol=<path> Override discovery benchmark protocol`, + ` --competitor-results=<path> JSON file with comparator metrics for discovery gate evaluation`, ` --skip-reindex Skip re-index phase`, ` --no-rerank Disable ambiguity reranker`, ` --no-redact Show full file paths in report`, @@ -87,6 +114,7 @@ async 
function runSingleEvaluation({ label, codebasePath, fixturePath, + mode, skipReindex, noRerank, redactPaths @@ -98,36 +126,81 @@ async function runSingleEvaluation({ console.log(`\n=== Codebase: ${label} ===`); console.log(`Target: ${resolvedCodebase}`); console.log(`Fixture: ${resolvedFixture}`); - console.log( - `Reranker: ${noRerank ? 'DISABLED' : 'enabled (ambiguity-triggered, Xenova/ms-marco-MiniLM-L-6-v2)'}` - ); + console.log(`Mode: ${mode}`); + if (mode === 'retrieval') { + console.log( + `Reranker: ${noRerank ? 'DISABLED' : 'enabled (ambiguity-triggered, Xenova/ms-marco-MiniLM-L-6-v2)'}` + ); + } console.log(`Path output: ${redactPaths ? 'REDACTED' : 'FULL'}`); await maybeReindex(resolvedCodebase, skipReindex); - console.log(`\n--- Phase 2: Running ${fixture.queries.length}-query eval harness ---`); - const searcher = new CodebaseSearcher(resolvedCodebase); - const summary = await evaluateFixture({ - fixture, - searcher, - limit: 5, - searchOptions: { - enableReranker: !noRerank - } - }); + let summary; + let report; - const report = formatEvalReport({ - codebaseLabel: label, - fixturePath: resolvedFixture, - summary, - redactPaths - }); + if (mode === 'discovery') { + console.log(`\n--- Phase 2: Running ${fixture.tasks.length}-task discovery harness ---`); + summary = await evaluateDiscoveryFixture({ + fixture, + rootPath: resolvedCodebase + }); + report = formatDiscoveryReport({ + codebaseLabel: label, + fixturePath: resolvedFixture, + summary + }); + } else { + console.log(`\n--- Phase 2: Running ${fixture.queries.length}-query eval harness ---`); + const searcher = new CodebaseSearcher(resolvedCodebase); + summary = await evaluateFixture({ + fixture, + searcher, + limit: 5, + searchOptions: { + enableReranker: !noRerank + } + }); + + report = formatEvalReport({ + codebaseLabel: label, + fixturePath: resolvedFixture, + summary, + redactPaths + }); + } console.log(report); return summary; } -function printCombinedSummary(summaries) { +function 
printCombinedSummary(summaries, mode) { + if (mode === 'discovery') { + const totalTasks = summaries.reduce((sum, summary) => sum + summary.totalTasks, 0); + const avgUsefulness = + totalTasks > 0 + ? summaries.reduce((sum, summary) => sum + summary.averageUsefulness * summary.totalTasks, 0) / + totalTasks + : 0; + const avgPayload = + totalTasks > 0 + ? summaries.reduce((sum, summary) => sum + summary.averagePayloadBytes * summary.totalTasks, 0) / + totalTasks + : 0; + const avgTokens = + totalTasks > 0 + ? summaries.reduce((sum, summary) => sum + summary.averageEstimatedTokens * summary.totalTasks, 0) / + totalTasks + : 0; + + console.log(`\n=== Combined Discovery Summary ===`); + console.log(`Average usefulness: ${(avgUsefulness * 100).toFixed(0)}%`); + console.log(`Average payload: ${Math.round(avgPayload)} bytes`); + console.log(`Average estimated tokens: ${Math.round(avgTokens)}`); + console.log(`=================================\n`); + return; + } + const total = summaries.reduce((sum, summary) => sum + summary.total, 0); const top1Correct = summaries.reduce((sum, summary) => sum + summary.top1Correct, 0); const top3RecallCount = summaries.reduce((sum, summary) => sum + summary.top3RecallCount, 0); @@ -156,8 +229,11 @@ async function main() { 'skip-reindex': { type: 'boolean', default: false }, 'no-rerank': { type: 'boolean', default: false }, 'no-redact': { type: 'boolean', default: false }, + mode: { type: 'string', default: 'retrieval' }, 'fixture-a': { type: 'string' }, - 'fixture-b': { type: 'string' } + 'fixture-b': { type: 'string' }, + protocol: { type: 'string' }, + 'competitor-results': { type: 'string' } }, allowPositionals: true }); @@ -176,10 +252,26 @@ async function main() { const codebaseA = positionals[0]; const codebaseB = positionals[1]; - const fixtureA = values['fixture-a'] ? path.resolve(values['fixture-a']) : defaultFixtureA; - const fixtureB = values['fixture-b'] ? 
path.resolve(values['fixture-b']) : defaultFixtureB; + const mode = values.mode === 'discovery' ? 'discovery' : 'retrieval'; + const fixtureA = values['fixture-a'] + ? path.resolve(values['fixture-a']) + : mode === 'discovery' + ? defaultDiscoveryFixtureA + : defaultFixtureA; + const fixtureB = values['fixture-b'] + ? path.resolve(values['fixture-b']) + : mode === 'discovery' + ? defaultDiscoveryFixtureB + : defaultFixtureB; + const protocolPath = values.protocol + ? path.resolve(values.protocol) + : defaultDiscoveryProtocol; + const comparatorResultsPath = values['competitor-results'] + ? path.resolve(values['competitor-results']) + : null; const sharedOptions = { + mode, skipReindex: values['skip-reindex'], noRerank: values['no-rerank'], redactPaths: !values['no-redact'] @@ -193,7 +285,6 @@ async function main() { }); const summaries = [summaryA]; - let passesAllGates = summaryA.passesGate; if (codebaseB) { const summaryB = await runSingleEvaluation({ @@ -204,10 +295,30 @@ async function main() { }); summaries.push(summaryB); - passesAllGates = passesAllGates && summaryB.passesGate; - printCombinedSummary(summaries); } + if (mode === 'discovery') { + const combinedSummary = combineDiscoverySummaries(summaries); + const protocol = loadFixture(protocolPath); + const comparatorEvidence = comparatorResultsPath ? loadFixture(comparatorResultsPath) : undefined; + const gate = evaluateDiscoveryGate({ + summary: combinedSummary, + protocol, + comparatorEvidence, + suiteComplete: summaries.length > 1 + }); + combinedSummary.gate = gate; + printCombinedSummary([combinedSummary], mode); + console.log(formatDiscoveryReport({ + codebaseLabel: 'combined-suite', + fixturePath: protocolPath, + summary: combinedSummary + })); + process.exit(gate.status === 'failed' ? 1 : 0); + } + + const passesAllGates = summaries.every((summary) => summary.passesGate); + printCombinedSummary(summaries, mode); process.exit(passesAllGates ? 
0 : 1); } diff --git a/src/eval/discovery-harness.ts b/src/eval/discovery-harness.ts new file mode 100644 index 0000000..8ca2aa9 --- /dev/null +++ b/src/eval/discovery-harness.ts @@ -0,0 +1,479 @@ +import { createProjectState, type ProjectState } from '../project-state.js'; +import { handle as searchCodebaseHandle } from '../tools/search-codebase.js'; +import { handle as getCodebaseMetadataHandle } from '../tools/get-codebase-metadata.js'; +import { handle as getTeamPatternsHandle } from '../tools/get-team-patterns.js'; +import type { SearchResponse, PatternResponse } from '../tools/types.js'; +import type { + DiscoveryBenchmarkProtocol, + DiscoveryComparatorEvidence, + DiscoveryComparatorGateResult, + DiscoveryGateEvaluation, + DiscoveryMetricComparison, + DiscoveryMetricName, + DiscoverySummary as DiscoverySummaryType, + DiscoverySurface, + DiscoverySurfaceResult as DiscoverySurfaceResultType, + DiscoverySurfaceRunner, + DiscoveryTask as DiscoveryTaskType, + DiscoveryTaskResult as DiscoveryTaskResultEval, + EvaluateDiscoveryFixtureParams, + FormatDiscoveryReportParams +} from './types.js'; +import { countUtf8Bytes, estimateTokenCountFromBytes } from './harness.js'; +import { generateCodebaseIntelligence } from '../resources/codebase-intelligence.js'; + +type JsonRecord = Record; + +function normalizeText(value: string): string { + return value.toLowerCase().replace(/\\/g, '/'); +} + +function parseJsonPayload(payload: string): JsonRecord { + const parsed = JSON.parse(payload) as unknown; + if (typeof parsed === 'object' && parsed !== null) { + return parsed as JsonRecord; + } + return {}; +} + +function createToolProject(rootPath: string): ProjectState { + const project = createProjectState(rootPath); + project.indexState.status = 'ready'; + return project; +} + +async function runSearchCodebase( + task: DiscoveryTaskType, + rootPath: string +): Promise { + const project = createToolProject(rootPath); + const response = await searchCodebaseHandle(task.args ?? 
{ query: task.prompt }, { + indexState: project.indexState, + paths: project.paths, + rootPath: project.rootPath, + performIndexing: () => undefined + }); + const payload = response.content?.[0]?.text ?? '{}'; + const parsed = parseJsonPayload(payload) as SearchResponse & JsonRecord; + const results = Array.isArray(parsed.results) ? parsed.results : []; + const topFiles = results + .map((entry) => (typeof entry.file === 'string' ? entry.file.split(':')[0] : '')) + .filter((entry): entry is string => Boolean(entry)); + const preflight = parsed.preflight; + const bestExample = + preflight && typeof preflight === 'object' && preflight !== null && 'bestExample' in preflight + ? ((preflight.bestExample as string | undefined) ?? null) + : null; + + return { + payload, + topFiles, + bestExample + }; +} + +async function runCodebaseMetadata( + _task: DiscoveryTaskType, + rootPath: string +): Promise { + const project = createToolProject(rootPath); + const response = await getCodebaseMetadataHandle( + {}, + { + indexState: project.indexState, + paths: project.paths, + rootPath: project.rootPath, + performIndexing: () => undefined + } + ); + return { payload: response.content?.[0]?.text ?? '{}' }; +} + +async function runTeamPatterns( + task: DiscoveryTaskType, + rootPath: string +): Promise { + const project = createToolProject(rootPath); + const response = await getTeamPatternsHandle(task.args ?? { category: 'all' }, { + indexState: project.indexState, + paths: project.paths, + rootPath: project.rootPath, + performIndexing: () => undefined + }); + const payload = response.content?.[0]?.text ?? '{}'; + const parsed = parseJsonPayload(payload) as PatternResponse & JsonRecord; + const goldenFiles = Array.isArray(parsed.goldenFiles) ? parsed.goldenFiles : []; + return { + payload, + bestExample: + goldenFiles.length > 0 && typeof goldenFiles[0]?.file === 'string' + ? 
goldenFiles[0].file + : null + }; +} + +async function runCodebaseContext( + _task: DiscoveryTaskType, + rootPath: string +): Promise { + const project = createToolProject(rootPath); + return { + payload: await generateCodebaseIntelligence(project) + }; +} + +const DEFAULT_SURFACE_RUNNERS: Record = { + search_codebase: runSearchCodebase, + get_codebase_metadata: runCodebaseMetadata, + get_team_patterns: runTeamPatterns, + 'codebase://context': runCodebaseContext +}; + +function matchPatterns(candidates: string[], patterns: string[] | undefined): number | null { + if (!patterns || patterns.length === 0) return null; + const normalizedPatterns = patterns.map(normalizeText); + for (let index = 0; index < candidates.length; index++) { + const normalizedCandidate = normalizeText(candidates[index]); + if (normalizedPatterns.some((pattern) => normalizedCandidate.includes(pattern))) { + return index + 1; + } + } + return null; +} + +function evaluateDiscoveryTask( + task: DiscoveryTaskType, + result: DiscoverySurfaceResultType +): DiscoveryTaskResultEval { + const normalizedPayload = normalizeText(result.payload); + const matchedSignals = task.expectedSignals.filter((signal) => + normalizedPayload.includes(normalizeText(signal)) + ); + const missingSignals = task.expectedSignals.filter((signal) => !matchedSignals.includes(signal)); + const forbiddenHits = (task.forbiddenSignals ?? []).filter((signal) => + normalizedPayload.includes(normalizeText(signal)) + ); + const payloadBytes = countUtf8Bytes(result.payload); + const estimatedTokens = estimateTokenCountFromBytes(payloadBytes); + const usefulnessDenominator = Math.max(task.expectedSignals.length, 1); + const usefulnessScore = Math.max( + 0, + (matchedSignals.length - forbiddenHits.length) / usefulnessDenominator + ); + const firstRelevantHit = matchPatterns(result.topFiles ?? [], task.expectedFilePatterns); + const bestExampleUseful = + task.expectedBestExamplePatterns && task.expectedBestExamplePatterns.length > 0 + ? 
task.expectedBestExamplePatterns.some((pattern) => + normalizeText(result.bestExample ?? '').includes(normalizeText(pattern)) + ) + : undefined; + + return { + taskId: task.id, + title: task.title, + job: task.job, + surface: task.surface, + usefulnessScore, + matchedSignals, + missingSignals, + forbiddenHits, + payloadBytes, + estimatedTokens, + ...(firstRelevantHit !== null ? { firstRelevantHit } : {}), + ...(typeof bestExampleUseful === 'boolean' ? { bestExampleUseful } : {}) + }; +} + +function summarizeDiscoveryResults(results: DiscoveryTaskResultEval[]): DiscoverySummaryType { + const totalTasks = results.length; + const averageUsefulness = + totalTasks > 0 + ? results.reduce((sum, result) => sum + result.usefulnessScore, 0) / totalTasks + : 0; + const averagePayloadBytes = + totalTasks > 0 ? results.reduce((sum, result) => sum + result.payloadBytes, 0) / totalTasks : 0; + const averageEstimatedTokens = + totalTasks > 0 + ? results.reduce((sum, result) => sum + result.estimatedTokens, 0) / totalTasks + : 0; + const searchResults = results.filter((result) => result.job === 'search'); + const findResults = results.filter((result) => result.job === 'find'); + const mapResults = results.filter((result) => result.job === 'map'); + const searchHits = searchResults + .map((result) => result.firstRelevantHit) + .filter((value): value is number => typeof value === 'number'); + const bestExampleResults = findResults + .map((result) => result.bestExampleUseful) + .filter((value): value is boolean => typeof value === 'boolean'); + + return { + totalTasks, + averageUsefulness, + averagePayloadBytes, + averageEstimatedTokens, + searchTasks: searchResults.length, + findTasks: findResults.length, + mapTasks: mapResults.length, + averageFirstRelevantHit: + searchHits.length > 0 + ? searchHits.reduce((sum, value) => sum + value, 0) / searchHits.length + : null, + bestExampleUsefulnessRate: + bestExampleResults.length > 0 + ? 
bestExampleResults.filter(Boolean).length / bestExampleResults.length + : null, + results + }; +} + +function getDiscoveryMetricValue( + summary: DiscoverySummaryType | DiscoveryComparatorEvidence[string] | undefined, + metric: DiscoveryMetricName +): number | null { + if (!summary) return null; + const value = summary[metric]; + return typeof value === 'number' && Number.isFinite(value) ? value : null; +} + +function compareMetric( + actualValue: number | null, + comparatorValue: number | null, + metric: DiscoveryMetricName +): DiscoveryMetricComparison { + const lowerIsBetter = + metric === 'averagePayloadBytes' || + metric === 'averageEstimatedTokens' || + metric === 'averageFirstRelevantHit'; + const passes = + actualValue !== null && + comparatorValue !== null && + (lowerIsBetter ? actualValue <= comparatorValue : actualValue >= comparatorValue); + + return { + metric, + comparatorValue, + actualValue, + passes + }; +} + +function compareMetricWithinTolerance( + actualValue: number | null, + comparatorValue: number | null, + metric: DiscoveryMetricName, + tolerancePercent: number +): DiscoveryMetricComparison { + const lowerIsBetter = + metric === 'averagePayloadBytes' || + metric === 'averageEstimatedTokens' || + metric === 'averageFirstRelevantHit'; + const multiplier = 1 + tolerancePercent / 100; + const passes = + actualValue !== null && + comparatorValue !== null && + (lowerIsBetter + ? 
actualValue <= comparatorValue * multiplier + : actualValue >= comparatorValue * (1 - tolerancePercent / 100)); + + return { + metric, + comparatorValue, + actualValue, + passes + }; +} + +function evaluateBaselineGate( + summary: DiscoverySummaryType, + protocol: DiscoveryBenchmarkProtocol, + comparatorEvidence: DiscoveryComparatorEvidence | undefined +) { + const baselineConfig = protocol.shipGate.baseline; + const baselineMetrics = comparatorEvidence?.[baselineConfig.comparatorName]; + const comparisons = [ + compareMetric( + getDiscoveryMetricValue(summary, baselineConfig.payloadMetric), + getDiscoveryMetricValue(baselineMetrics, baselineConfig.payloadMetric), + baselineConfig.payloadMetric + ), + ...baselineConfig.usefulnessMetrics.map((metric) => + compareMetric( + getDiscoveryMetricValue(summary, metric), + getDiscoveryMetricValue(baselineMetrics, metric), + metric + ) + ) + ]; + const missingMetrics = comparisons + .filter((comparison) => comparison.comparatorValue === null) + .map((comparison) => comparison.metric); + const payloadMetricPassed = comparisons[0]?.passes ?? false; + const beatenUsefulnessMetrics = comparisons + .slice(1) + .filter((comparison) => comparison.passes) + .map((comparison) => comparison.metric); + + return { + comparatorName: baselineConfig.comparatorName, + status: + missingMetrics.length > 0 + ? 'pending_evidence' + : payloadMetricPassed && beatenUsefulnessMetrics.length > 0 + ? 
'passed' + : 'failed', + payloadMetric: baselineConfig.payloadMetric, + payloadMetricPassed, + beatenUsefulnessMetrics, + missingMetrics, + comparisons + } as const; +} + +function evaluateComparatorGate( + summary: DiscoverySummaryType, + comparatorName: string, + protocol: DiscoveryBenchmarkProtocol, + comparatorEvidence: DiscoveryComparatorEvidence | undefined +): DiscoveryComparatorGateResult { + const tolerancePercent = protocol.shipGate.comparators.tolerancePercent; + const comparatorMetrics = comparatorEvidence?.[comparatorName]; + const comparisons = protocol.shipGate.comparators.usefulnessMetrics.map((metric) => + compareMetricWithinTolerance( + getDiscoveryMetricValue(summary, metric), + getDiscoveryMetricValue(comparatorMetrics, metric), + metric, + tolerancePercent + ) + ); + const missingMetrics = comparisons + .filter((comparison) => comparison.comparatorValue === null) + .map((comparison) => comparison.metric); + + return { + comparatorName, + status: + missingMetrics.length > 0 + ? 'pending_evidence' + : comparisons.every((comparison) => comparison.passes) + ? 
'passed' + : 'failed', + tolerancePercent, + missingMetrics, + comparisons + }; +} + +export function combineDiscoverySummaries(summaries: DiscoverySummaryType[]): DiscoverySummaryType { + return summarizeDiscoveryResults(summaries.flatMap((summary) => summary.results)); +} + +export function evaluateDiscoveryGate({ + summary, + protocol, + comparatorEvidence, + suiteComplete +}: { + summary: DiscoverySummaryType; + protocol: DiscoveryBenchmarkProtocol; + comparatorEvidence?: DiscoveryComparatorEvidence; + suiteComplete: boolean; +}): DiscoveryGateEvaluation { + const baseline = evaluateBaselineGate(summary, protocol, comparatorEvidence); + const comparators = protocol.shipGate.comparators.requiredNames.map((name) => + evaluateComparatorGate(summary, name, protocol, comparatorEvidence) + ); + const missingEvidence: string[] = []; + + if (!suiteComplete) { + missingEvidence.push('fixed public discovery suite is incomplete'); + } + + if (baseline.status === 'pending_evidence') { + missingEvidence.push(`${baseline.comparatorName} baseline metrics missing`); + } + + for (const comparator of comparators) { + if (comparator.status === 'pending_evidence') { + missingEvidence.push(`${comparator.comparatorName} comparator metrics missing`); + } + } + + const status = + missingEvidence.length > 0 + ? 'pending_evidence' + : baseline.status === 'passed' && + comparators.every((comparator) => comparator.status === 'passed') + ? 'passed' + : 'failed'; + + return { + status, + suiteStatus: suiteComplete ? 'complete' : 'incomplete', + baseline, + comparators, + missingEvidence, + claimAllowed: status === 'passed' + }; +} + +export async function evaluateDiscoveryFixture({ + fixture, + rootPath, + surfaceRunners +}: EvaluateDiscoveryFixtureParams): Promise { + const runners: Record = { + ...DEFAULT_SURFACE_RUNNERS, + ...(surfaceRunners ?? 
{}) + }; + const results: DiscoveryTaskResultEval[] = []; + + for (const task of fixture.tasks) { + const runner = runners[task.surface]; + const payload = await runner(task, rootPath); + results.push(evaluateDiscoveryTask(task, payload)); + } + + return summarizeDiscoveryResults(results); +} + +export function formatDiscoveryReport({ + codebaseLabel, + fixturePath, + summary +}: FormatDiscoveryReportParams): string { + const lines: string[] = []; + lines.push(`\n=== Discovery Eval Report: ${codebaseLabel} ===`); + lines.push(`Fixture: ${fixturePath}`); + lines.push(`Tasks: ${summary.totalTasks}`); + lines.push(`Average usefulness: ${(summary.averageUsefulness * 100).toFixed(0)}%`); + lines.push(`Average payload: ${Math.round(summary.averagePayloadBytes)} bytes`); + lines.push(`Average estimated tokens: ${Math.round(summary.averageEstimatedTokens)}`); + lines.push( + `Average first relevant hit: ${ + summary.averageFirstRelevantHit === null ? 'n/a' : summary.averageFirstRelevantHit.toFixed(2) + }` + ); + lines.push( + `Best-example usefulness: ${ + summary.bestExampleUsefulnessRate === null + ? 'n/a' + : `${(summary.bestExampleUsefulnessRate * 100).toFixed(0)}%` + }` + ); + if (summary.gate) { + lines.push(`Gate: ${summary.gate.status.toUpperCase()}`); + lines.push(`Claim allowed: ${summary.gate.claimAllowed ? 
'yes' : 'no'}`); + if (summary.gate.missingEvidence.length > 0) { + lines.push(`Missing evidence: ${summary.gate.missingEvidence.join('; ')}`); + } + } + lines.push(''); + lines.push('Task results:'); + for (const result of summary.results) { + lines.push( + `- ${result.taskId} [${result.job}/${result.surface}] usefulness ${(result.usefulnessScore * 100).toFixed(0)}%, payload ${result.payloadBytes}B/${result.estimatedTokens} tok` + ); + } + lines.push('================================'); + return lines.join('\n'); +} diff --git a/src/eval/harness.ts b/src/eval/harness.ts index 7ef0d7e..dfc5e5e 100644 --- a/src/eval/harness.ts +++ b/src/eval/harness.ts @@ -101,6 +101,14 @@ function hashPath(filePath: string): string { return crypto.createHash('sha1').update(normalizePath(filePath)).digest('hex').slice(0, 8); } +export function countUtf8Bytes(value: string): number { + return Buffer.byteLength(value, 'utf-8'); +} + +export function estimateTokenCountFromBytes(bytes: number): number { + return Math.max(1, Math.ceil(bytes / 4)); +} + function formatPath(filePath: string | null, redactPaths: boolean): string { if (!filePath) { return 'none'; diff --git a/src/eval/types.ts b/src/eval/types.ts index 209f24f..1178cb4 100644 --- a/src/eval/types.ts +++ b/src/eval/types.ts @@ -63,3 +63,182 @@ export interface FormatEvalReportParams { summary: EvalSummary; redactPaths?: boolean; } + +export type DiscoveryJob = 'map' | 'find' | 'search'; + +export type DiscoverySurface = + | 'search_codebase' + | 'get_codebase_metadata' + | 'get_team_patterns' + | 'codebase://context'; + +export interface DiscoveryTask { + id: string; + title: string; + job: DiscoveryJob; + surface: DiscoverySurface; + prompt: string; + args?: Record; + expectedSignals: string[]; + expectedFilePatterns?: string[]; + expectedBestExamplePatterns?: string[]; + forbiddenSignals?: string[]; + notes?: string; +} + +export interface DiscoveryFixture { + description?: string; + codebase?: string; + repository?: 
string; + repositoryUrl?: string; + repositoryRef?: string; + frozenDate?: string; + notes?: string; + tasks: DiscoveryTask[]; +} + +export interface DiscoveryTaskResult { + taskId: string; + title: string; + job: DiscoveryJob; + surface: DiscoverySurface; + usefulnessScore: number; + matchedSignals: string[]; + missingSignals: string[]; + forbiddenHits: string[]; + payloadBytes: number; + estimatedTokens: number; + firstRelevantHit?: number | null; + bestExampleUseful?: boolean; +} + +export interface DiscoverySummary { + totalTasks: number; + averageUsefulness: number; + averagePayloadBytes: number; + averageEstimatedTokens: number; + searchTasks: number; + findTasks: number; + mapTasks: number; + averageFirstRelevantHit: number | null; + bestExampleUsefulnessRate: number | null; + gate?: DiscoveryGateEvaluation; + results: DiscoveryTaskResult[]; +} + +export interface EvaluateDiscoveryFixtureParams { + fixture: DiscoveryFixture; + rootPath: string; + surfaceRunners?: Partial>; +} + +export interface FormatDiscoveryReportParams { + codebaseLabel: string; + fixturePath: string; + summary: DiscoverySummary; +} + +export type DiscoverySurfaceRunner = ( + task: DiscoveryTask, + rootPath: string +) => Promise; + +export interface DiscoverySurfaceResult { + payload: string; + topFiles?: string[]; + bestExample?: string | null; +} + +export type DiscoveryMetricName = + | 'averageUsefulness' + | 'averagePayloadBytes' + | 'averageEstimatedTokens' + | 'averageFirstRelevantHit' + | 'bestExampleUsefulnessRate'; + +export interface DiscoveryComparatorProtocol { + name: string; + kind: 'baseline' | 'mcp-comparator'; + execution: 'direct-tool' | 'manual-log-capture'; + notes?: string; +} + +export interface DiscoveryGateProtocol { + baselineRule: string; + comparatorRule: string; + claimRule: string; + baseline: { + comparatorName: string; + payloadMetric: DiscoveryMetricName; + usefulnessMetrics: DiscoveryMetricName[]; + }; + comparators: { + requiredNames: string[]; + 
tolerancePercent: number; + usefulnessMetrics: DiscoveryMetricName[]; + }; +} + +export interface DiscoveryBenchmarkProtocol { + name: string; + frozenDate: string; + scope: 'discovery-only'; + jobs: DiscoveryJob[]; + allowedSurfaces: DiscoverySurface[]; + forbiddenSurfaces: string[]; + primaryLane: 'direct-tool'; + secondaryLane: 'manual-log-capture'; + comparators: DiscoveryComparatorProtocol[]; + metrics: { + payloadCost: DiscoveryMetricName[]; + usefulness: DiscoveryMetricName[]; + }; + fairnessRules: string[]; + shipGate: DiscoveryGateProtocol; +} + +export interface DiscoveryComparatorMetrics { + averageUsefulness?: number | null; + averagePayloadBytes?: number | null; + averageEstimatedTokens?: number | null; + averageFirstRelevantHit?: number | null; + bestExampleUsefulnessRate?: number | null; +} + +export interface DiscoveryComparatorEvidence { + [comparatorName: string]: DiscoveryComparatorMetrics; +} + +export interface DiscoveryMetricComparison { + metric: DiscoveryMetricName; + comparatorValue: number | null; + actualValue: number | null; + passes: boolean; +} + +export interface DiscoveryBaselineGateResult { + comparatorName: string; + status: 'passed' | 'failed' | 'pending_evidence'; + payloadMetric: DiscoveryMetricName; + payloadMetricPassed: boolean; + beatenUsefulnessMetrics: DiscoveryMetricName[]; + missingMetrics: DiscoveryMetricName[]; + comparisons: DiscoveryMetricComparison[]; +} + +export interface DiscoveryComparatorGateResult { + comparatorName: string; + status: 'passed' | 'failed' | 'pending_evidence'; + tolerancePercent: number; + missingMetrics: DiscoveryMetricName[]; + comparisons: DiscoveryMetricComparison[]; +} + +export interface DiscoveryGateEvaluation { + status: 'passed' | 'failed' | 'pending_evidence'; + suiteStatus: 'complete' | 'incomplete'; + baseline: DiscoveryBaselineGateResult; + comparators: DiscoveryComparatorGateResult[]; + missingEvidence: string[]; + claimAllowed: boolean; +} diff --git a/src/index.ts b/src/index.ts 
index e590f55..9ae677d 100644 --- a/src/index.ts +++ b/src/index.ts @@ -50,6 +50,7 @@ import { getProjectPathFromContextResourceUri, isContextResourceUri } from './resources/uri.js'; +import { generateCodebaseIntelligence } from './resources/codebase-intelligence.js'; import { EXCLUDED_GLOB_PATTERNS } from './constants/codebase-context.js'; import { discoverProjectsWithinRoot, @@ -852,7 +853,7 @@ export function registerHandlers(target: Server): void { { uri: buildProjectContextResourceUri(project.rootPath), mimeType: 'text/plain', - text: await generateCodebaseContext(project) + text: await generateCodebaseIntelligence(project) } ] }; @@ -865,7 +866,9 @@ export function registerHandlers(target: Server): void { { uri: CONTEXT_RESOURCE_URI, mimeType: 'text/plain', - text: project ? await generateCodebaseContext(project) : buildProjectSelectionMessage() + text: project + ? await generateCodebaseIntelligence(project) + : buildProjectSelectionMessage() } ] }; @@ -903,7 +906,7 @@ export function registerHandlers(target: Server): void { type: 'text', text: JSON.stringify({ status: 'indexing', - message: 'Index build in progress — please retry shortly' + message: 'Index build in progress - please retry shortly' }) } ] @@ -930,7 +933,7 @@ export function registerHandlers(target: Server): void { type: 'text', text: JSON.stringify({ status: 'indexing', - message: 'Index rebuild in progress — please retry shortly', + message: 'Index rebuild in progress - please retry shortly', index: indexSignal }) } @@ -1011,7 +1014,7 @@ function buildResources(): Resource[] { return resources; } -async function generateCodebaseContext(project: ProjectState): Promise { +async function _generateCodebaseContext(project: ProjectState): Promise { const intelligencePath = project.paths.intelligence; const index = await ensureValidIndexOrAutoHeal(project); @@ -1033,7 +1036,7 @@ async function generateCodebaseContext(project: ProjectState): Promise { lines.push(''); lines.push( `Index: 
${index.status} (${index.confidence}, ${index.action})${ - index.reason ? ` — ${index.reason}` : '' + index.reason ? ` - ${index.reason}` : '' }` ); lines.push(''); @@ -1571,14 +1574,14 @@ function ensureProjectWatcher(project: ProjectState, debounceMs: number): void { if (!shouldRunNow) { if (process.env.CODEBASE_CONTEXT_DEBUG) { console.error( - `[file-watcher] Index in progress — queueing auto-refresh: ${project.rootPath}` + `[file-watcher] Index in progress - queueing auto-refresh: ${project.rootPath}` ); } return; } if (process.env.CODEBASE_CONTEXT_DEBUG) { console.error( - `[file-watcher] Changes detected — incremental reindex starting: ${project.rootPath}` + `[file-watcher] Changes detected - incremental reindex starting: ${project.rootPath}` ); } void performIndexing(project, true); diff --git a/src/resources/codebase-intelligence.ts b/src/resources/codebase-intelligence.ts new file mode 100644 index 0000000..f1c813e --- /dev/null +++ b/src/resources/codebase-intelligence.ts @@ -0,0 +1,192 @@ +import { promises as fs } from 'fs'; +import path from 'path'; +import type { ProjectState } from '../project-state.js'; +import type { IntelligenceData, PatternsData, PatternCandidate } from '../types/index.js'; +import { + isComplementaryPatternCategory, + shouldSkipLegacyTestingFrameworkCategory +} from '../patterns/semantics.js'; +import { RELATIONSHIPS_FILENAME } from '../constants/codebase-context.js'; + +async function fileExists(filePath: string): Promise<boolean> { + try { + await fs.access(filePath); + return true; + } catch { + return false; + } +} + +async function readIndexSignal(project: ProjectState): Promise<{ + status: 'ready' | 'stale'; + confidence: 'high' | 'low'; + action: 'served'; + reason?: string; +}> { + const hasIntelligence = await fileExists(project.paths.intelligence); + const hasRelationships = await fileExists( + path.join(path.dirname(project.paths.intelligence), RELATIONSHIPS_FILENAME) + ); + + if (!hasIntelligence) { + return { + status: 
'stale', + confidence: 'low', + action: 'served', + reason: 'Intelligence artifact missing' + }; + } + + return { + status: 'ready', + confidence: hasRelationships ? 'high' : 'low', + action: 'served', + ...(hasRelationships ? {} : { reason: 'Relationships artifact missing' }) + }; +} + +export async function generateCodebaseIntelligence(project: ProjectState): Promise<string> { + const intelligencePath = project.paths.intelligence; + const index = await readIndexSignal(project); + + try { + const content = await fs.readFile(intelligencePath, 'utf-8'); + const intelligence = JSON.parse(content) as IntelligenceData; + + const lines: string[] = []; + lines.push('# Codebase Intelligence'); + lines.push(''); + lines.push( + `Index: ${index.status} (${index.confidence}, ${index.action})${ + index.reason ? ` - ${index.reason}` : '' + }` + ); + lines.push(''); + lines.push('WARNING: This is what YOUR codebase actually uses, not generic recommendations.'); + lines.push('These are FACTS from analyzing your code, not best practices from the internet.'); + lines.push(''); + + const libraryEntries = Object.entries(intelligence.libraryUsage || {}) + .map(([lib, data]) => ({ + lib, + count: data.count + })) + .sort((a, b) => b.count - a.count); + + if (libraryEntries.length > 0) { + lines.push('## Libraries Actually Used (Top 15)'); + lines.push(''); + + for (const { lib, count } of libraryEntries.slice(0, 15)) { + lines.push(`- **${lib}** (${count} uses)`); + } + lines.push(''); + } + + if (intelligence.tsconfigPaths && Object.keys(intelligence.tsconfigPaths).length > 0) { + lines.push('## Import Aliases (from tsconfig.json)'); + lines.push(''); + lines.push('These path aliases map to internal project code:'); + for (const [alias, paths] of Object.entries(intelligence.tsconfigPaths)) { + lines.push(`- \`${alias}\` -> ${(paths as string[]).join(', ')}`); + } + lines.push(''); + } + + if (intelligence.patterns && Object.keys(intelligence.patterns).length > 0) { + const patterns: 
PatternsData = intelligence.patterns; + lines.push("## YOUR Codebase's Actual Patterns (Not Generic Best Practices)"); + lines.push(''); + lines.push('These patterns were detected by analyzing your actual code.'); + lines.push('This is what YOUR team does in practice, not what tutorials recommend.'); + lines.push(''); + + for (const [category, data] of Object.entries(patterns)) { + if (shouldSkipLegacyTestingFrameworkCategory(category, patterns)) { + continue; + } + + const primary: PatternCandidate | undefined = data.primary; + const alternatives: PatternCandidate[] = data.alsoDetected ?? []; + + if (!primary) continue; + + if ( + isComplementaryPatternCategory( + category, + [primary.name, ...alternatives.map((alt) => alt.name)].filter(Boolean) + ) + ) { + const secondary = alternatives[0]; + if (secondary) { + const categoryName = category + .replace(/([A-Z])/g, ' $1') + .trim() + .replace(/^./, (str: string) => str.toUpperCase()); + lines.push( + `### ${categoryName}: **${primary.name}** (${primary.frequency}) + **${secondary.name}** (${secondary.frequency})` + ); + lines.push( + ' -> Computed and effect are complementary Signals primitives and are commonly used together.' 
+ ); + lines.push(' -> Treat this as balanced usage, not a hard split decision.'); + lines.push(''); + continue; + } + } + + const percentage = Number.parseInt(primary.frequency, 10); + const categoryName = category + .replace(/([A-Z])/g, ' $1') + .trim() + .replace(/^./, (str: string) => str.toUpperCase()); + + if (percentage === 100) { + lines.push(`### ${categoryName}: **${primary.name}** (${primary.frequency} - unanimous)`); + lines.push(` -> Your codebase is 100% consistent - ALWAYS use ${primary.name}`); + } else if (percentage >= 80) { + lines.push( + `### ${categoryName}: **${primary.name}** (${primary.frequency} - strong consensus)` + ); + lines.push(` -> Your team strongly prefers ${primary.name}`); + if (alternatives.length) { + const alt = alternatives[0]; + lines.push( + ` -> Minority pattern: ${alt.name} (${alt.frequency}) - avoid for new code` + ); + } + } else if (percentage >= 60) { + lines.push(`### ${categoryName}: **${primary.name}** (${primary.frequency} - majority)`); + lines.push(` -> Most code uses ${primary.name}, but not unanimous`); + if (alternatives.length) { + lines.push( + ` -> Also detected: ${alternatives[0].name} (${alternatives[0].frequency})` + ); + } + } else { + lines.push(`### ${categoryName}: WARNING: NO TEAM CONSENSUS`); + lines.push(' Your codebase is split between multiple approaches:'); + lines.push(` - ${primary.name} (${primary.frequency})`); + if (alternatives.length) { + for (const alt of alternatives.slice(0, 2)) { + lines.push(` - ${alt.name} (${alt.frequency})`); + } + } + lines.push(' -> ASK the team which approach to use for new features'); + } + lines.push(''); + } + } + + lines.push('---'); + lines.push(`Generated: ${intelligence.generatedAt || new Date().toISOString()}`); + + return lines.join('\n'); + } catch (error) { + return ( + '# Codebase Intelligence\n\n' + + 'Intelligence data not yet generated. Run indexing first.\n' + + `Error: ${error instanceof Error ? 
error.message : String(error)}` + ); + } +} diff --git a/tests/discovery-harness.test.ts b/tests/discovery-harness.test.ts new file mode 100644 index 0000000..f7fa200 --- /dev/null +++ b/tests/discovery-harness.test.ts @@ -0,0 +1,305 @@ +import { describe, expect, it } from 'vitest'; +import { + combineDiscoverySummaries, + evaluateDiscoveryGate, + evaluateDiscoveryFixture, + formatDiscoveryReport +} from '../src/eval/discovery-harness.js'; +import type { + DiscoveryBenchmarkProtocol, + DiscoveryFixture, + DiscoverySummary, + DiscoverySurfaceResult +} from '../src/eval/types.js'; +import angularDiscoveryFixture from './fixtures/discovery-angular-spotify.json'; +import excalidrawDiscoveryFixture from './fixtures/discovery-excalidraw.json'; +import discoveryProtocol from './fixtures/discovery-benchmark-protocol.json'; + +describe('Discovery benchmark fixtures', () => { + it('keeps angular-spotify discovery fixture frozen at 12 tasks with balanced job coverage', () => { + expect(angularDiscoveryFixture.tasks).toHaveLength(12); + const counts = angularDiscoveryFixture.tasks.reduce<Record<string, number>>((acc, task) => { + acc[task.job] = (acc[task.job] ?? 0) + 1; + return acc; + }, {}); + expect(counts.map).toBe(4); + expect(counts.find).toBe(4); + expect(counts.search).toBe(4); + }); + + it('keeps excalidraw discovery fixture frozen at 12 tasks with balanced job coverage', () => { + expect(excalidrawDiscoveryFixture.tasks).toHaveLength(12); + const counts = excalidrawDiscoveryFixture.tasks.reduce<Record<string, number>>((acc, task) => { + acc[task.job] = (acc[task.job] ?? 
0) + 1; + return acc; + }, {}); + expect(counts.map).toBe(4); + expect(counts.find).toBe(4); + expect(counts.search).toBe(4); + }); + + it('freezes the discovery protocol around current shipped surfaces only', () => { + expect(discoveryProtocol.allowedSurfaces).toEqual([ + 'search_codebase', + 'get_codebase_metadata', + 'get_team_patterns', + 'codebase://context' + ]); + expect(discoveryProtocol.forbiddenSurfaces).toContain('get_codebase_map'); + expect(discoveryProtocol.comparators).toHaveLength(4); + }); + + it('pins both public discovery fixtures to concrete repository refs', () => { + expect(angularDiscoveryFixture.repositoryRef).toMatch(/^[0-9a-f]{40}$/); + expect(excalidrawDiscoveryFixture.repositoryRef).toMatch(/^[0-9a-f]{40}$/); + }); +}); + +describe('Discovery harness scoring', () => { + it('scores expected signals, first relevant hit, and best-example usefulness deterministically', async () => { + const fixture: DiscoveryFixture = { + description: 'unit discovery fixture', + tasks: [ + { + id: 'map-1', + title: 'Map task', + job: 'map', + surface: 'codebase://context', + prompt: 'map', + expectedSignals: ['libraries actually used', 'patterns'] + }, + { + id: 'find-1', + title: 'Find task', + job: 'find', + surface: 'get_team_patterns', + prompt: 'find', + expectedSignals: ['dependencyInjection'], + expectedBestExamplePatterns: ['src/auth/auth.interceptor.ts'] + }, + { + id: 'search-1', + title: 'Search task', + job: 'search', + surface: 'search_codebase', + prompt: 'search', + expectedSignals: ['results', 'searchQuality'], + expectedFilePatterns: ['auth.interceptor.ts'] + } + ] + }; + + const summary = await evaluateDiscoveryFixture({ + fixture, + rootPath: 'C:/repo', + surfaceRunners: { + 'codebase://context': async () => ({ + payload: '# Codebase Intelligence\n\n## Libraries Actually Used\n\n## Patterns' + }), + get_team_patterns: async () => ({ + payload: '{"patterns":{"dependencyInjection":{"primary":{"name":"inject()","frequency":"90%"}}}}', + 
bestExample: 'src/auth/auth.interceptor.ts' + }), + search_codebase: async () => ({ + payload: '{"status":"success","searchQuality":{"status":"ok"},"results":[{"file":"src/auth/auth.interceptor.ts:1-10"}]}', + topFiles: ['src/auth/auth.interceptor.ts'] + }) + } + }); + + expect(summary.totalTasks).toBe(3); + expect(summary.averageUsefulness).toBeCloseTo(1, 4); + expect(summary.averageFirstRelevantHit).toBe(1); + expect(summary.bestExampleUsefulnessRate).toBe(1); + expect(summary.results[2]?.firstRelevantHit).toBe(1); + }); + + it('formats a compact discovery report', async () => { + const fixture: DiscoveryFixture = { + tasks: [ + { + id: 'search-1', + title: 'Search task', + job: 'search', + surface: 'search_codebase', + prompt: 'search', + expectedSignals: ['results'], + expectedFilePatterns: ['player-api.ts'] + } + ] + }; + + const summary = await evaluateDiscoveryFixture({ + fixture, + rootPath: 'C:/repo', + surfaceRunners: { + search_codebase: async (): Promise<DiscoverySurfaceResult> => ({ + payload: '{"results":[{"file":"src/player-api.ts:1-4"}]}', + topFiles: ['src/player-api.ts'] + }) + } + }); + + const report = formatDiscoveryReport({ + codebaseLabel: 'fixture-repo', + fixturePath: 'tests/fixtures/discovery-angular-spotify.json', + summary + }); + + expect(report).toContain('Discovery Eval Report'); + expect(report).toContain('Average usefulness'); + expect(report).toContain('search-1'); + }); +}); + +describe('Discovery gate evaluation', () => { + const protocol = discoveryProtocol as DiscoveryBenchmarkProtocol; + + function createSummary( + overrides: Partial<DiscoverySummary> = {} + ): DiscoverySummary { + return { + totalTasks: 24, + averageUsefulness: 0.9, + averagePayloadBytes: 1200, + averageEstimatedTokens: 300, + searchTasks: 8, + findTasks: 8, + mapTasks: 8, + averageFirstRelevantHit: 1.2, + bestExampleUsefulnessRate: 0.9, + results: [], + ...overrides + }; + } + + it('combines multiple discovery summaries before gate evaluation', () => { + const combined = combineDiscoverySummaries([ 
+ createSummary({ + results: [ + { + taskId: 'one', + title: 'one', + job: 'map', + surface: 'codebase://context', + usefulnessScore: 0.8, + matchedSignals: [], + missingSignals: [], + forbiddenHits: [], + payloadBytes: 100, + estimatedTokens: 25 + } + ] + }), + createSummary({ + results: [ + { + taskId: 'two', + title: 'two', + job: 'search', + surface: 'search_codebase', + usefulnessScore: 1, + matchedSignals: [], + missingSignals: [], + forbiddenHits: [], + payloadBytes: 80, + estimatedTokens: 20, + firstRelevantHit: 1 + } + ] + }) + ]); + + expect(combined.totalTasks).toBe(2); + expect(combined.averageEstimatedTokens).toBe(22.5); + expect(combined.averageFirstRelevantHit).toBe(1); + }); + + it('marks the gate pending when required comparator evidence is missing', () => { + const summary = createSummary(); + const gate = evaluateDiscoveryGate({ + summary, + protocol, + suiteComplete: false + }); + + expect(gate.status).toBe('pending_evidence'); + expect(gate.claimAllowed).toBe(false); + expect(gate.missingEvidence).toContain('fixed public discovery suite is incomplete'); + }); + + it('passes the gate when baseline and comparator metrics satisfy the frozen rules', () => { + const summary = createSummary(); + const gate = evaluateDiscoveryGate({ + summary, + protocol, + suiteComplete: true, + comparatorEvidence: { + 'raw Claude Code': { + averageEstimatedTokens: 450, + averageUsefulness: 0.75, + averageFirstRelevantHit: 1.5, + bestExampleUsefulnessRate: 0.8 + }, + GrepAI: { + averageUsefulness: 0.92, + averageFirstRelevantHit: 1.1, + bestExampleUsefulnessRate: 0.95 + }, + jCodeMunch: { + averageUsefulness: 0.98, + averageFirstRelevantHit: 1.25, + bestExampleUsefulnessRate: 0.98 + }, + 'codebase-memory-mcp': { + averageUsefulness: 0.93, + averageFirstRelevantHit: 1.3, + bestExampleUsefulnessRate: 0.96 + } + } + }); + + expect(gate.status).toBe('passed'); + expect(gate.baseline.payloadMetricPassed).toBe(true); + 
expect(gate.baseline.beatenUsefulnessMetrics.length).toBeGreaterThan(0); + expect(gate.comparators.every((comparator) => comparator.status === 'passed')).toBe(true); + }); + + it('fails the gate when usefulness falls outside the frozen 15% comparator tolerance', () => { + const summary = createSummary({ + averageUsefulness: 0.6, + bestExampleUsefulnessRate: 0.6 + }); + const gate = evaluateDiscoveryGate({ + summary, + protocol, + suiteComplete: true, + comparatorEvidence: { + 'raw Claude Code': { + averageEstimatedTokens: 450, + averageUsefulness: 0.55, + averageFirstRelevantHit: 1.6, + bestExampleUsefulnessRate: 0.55 + }, + GrepAI: { + averageUsefulness: 0.9, + averageFirstRelevantHit: 1.0, + bestExampleUsefulnessRate: 0.9 + }, + jCodeMunch: { + averageUsefulness: 0.91, + averageFirstRelevantHit: 1.0, + bestExampleUsefulnessRate: 0.91 + }, + 'codebase-memory-mcp': { + averageUsefulness: 0.92, + averageFirstRelevantHit: 1.0, + bestExampleUsefulnessRate: 0.92 + } + } + }); + + expect(gate.status).toBe('failed'); + expect(gate.comparators.some((comparator) => comparator.status === 'failed')).toBe(true); + }); +}); diff --git a/tests/eval-harness.test.ts b/tests/eval-harness.test.ts index 14efe40..9d483bb 100644 --- a/tests/eval-harness.test.ts +++ b/tests/eval-harness.test.ts @@ -2,7 +2,13 @@ import { describe, expect, it, vi } from 'vitest'; import { CodebaseSearcher } from '../src/core/search.js'; import type { CodeChunk, SearchResult } from '../src/types/index.js'; import type { EvalFixture, EvalQuery } from '../src/eval/types.js'; -import { evaluateFixture, summarizeEvaluation, formatEvalReport } from '../src/eval/harness.js'; +import { + countUtf8Bytes, + estimateTokenCountFromBytes, + evaluateFixture, + summarizeEvaluation, + formatEvalReport +} from '../src/eval/harness.js'; import angularFixture from './fixtures/eval-angular-spotify.json'; import controlledFixture from './fixtures/eval-controlled.json'; @@ -82,6 +88,12 @@ describe('Eval Harness - fixtures 
loaded', () => { }); describe('Eval Harness - scoring logic', () => { + it('estimates payload cost with a fixed bytes-to-token heuristic', () => { + const bytes = countUtf8Bytes('auth interceptor'); + expect(bytes).toBeGreaterThan(0); + expect(estimateTokenCountFromBytes(bytes)).toBe(Math.ceil(bytes / 4)); + }); + it('marks correct top-1 when implementation file is first', async () => { const query: EvalQuery = { id: 7, diff --git a/tests/fixtures/README.md b/tests/fixtures/README.md index e69ae67..18d954c 100644 --- a/tests/fixtures/README.md +++ b/tests/fixtures/README.md @@ -1,11 +1,14 @@ # Evaluation Fixtures -This directory contains frozen evaluation sets for testing code search quality. +This directory contains frozen evaluation sets for testing retrieval and discovery quality. ## Files -- `eval-angular-spotify.json` - 20 semantic queries against [angular-spotify](https://github.com/trungk18/angular-spotify) (public, reproducible) -- `eval-controlled.json` - 20 frozen queries for the in-repo controlled fixture codebase +- `eval-angular-spotify.json` - 20 semantic retrieval queries against [angular-spotify](https://github.com/trungk18/angular-spotify) +- `eval-controlled.json` - 20 frozen retrieval queries for the in-repo controlled fixture codebase +- `discovery-angular-spotify.json` - 12 discovery tasks for `angular-spotify` +- `discovery-excalidraw.json` - 12 discovery tasks for `Excalidraw` +- `discovery-benchmark-protocol.json` - frozen scope, comparator set, fairness rules, and ship gate for the discovery benchmark ## Running Evaluations @@ -24,18 +27,30 @@ npm install npm run build ``` -### Run Evaluation +### Run Retrieval Evaluation ```bash -node scripts/run-eval.mjs /path/to/angular-spotify --fixture tests/fixtures/eval-angular-spotify.json +node scripts/run-eval.mjs /path/to/angular-spotify --mode retrieval --fixture-a tests/fixtures/eval-angular-spotify.json # Controlled fixture example (no network) -node scripts/run-eval.mjs 
tests/fixtures/codebases/eval-controlled --fixture tests/fixtures/eval-controlled.json +node scripts/run-eval.mjs tests/fixtures/codebases/eval-controlled --mode retrieval --fixture-a tests/fixtures/eval-controlled.json +``` + +### Run Discovery Evaluation + +```bash +node scripts/run-eval.mjs /path/to/angular-spotify /path/to/excalidraw --mode discovery +``` + +Optional comparator evidence file: + +```bash +node scripts/run-eval.mjs /path/to/angular-spotify /path/to/excalidraw --mode discovery --competitor-results /path/to/discovery-comparator-results.json ``` ### Output Format -The eval script outputs: +The retrieval harness outputs: - **Top-1 Accuracy**: % of queries where the best result matches expected patterns - **Top-3 Recall**: % of queries where top-3 results include a match @@ -43,15 +58,29 @@ The eval script outputs: - **Per-category breakdown**: Accuracy by query type (exact-name, conceptual, multi-concept, structural) - **Failure analysis**: Which queries failed and why +The discovery harness outputs: + +- **Average usefulness**: expected-signal match rate with forbidden-signal penalties +- **Average payload**: UTF-8 bytes returned by the current shipped surface +- **Average estimated tokens**: fixed bytes-to-token heuristic for fair comparison +- **Average first relevant hit**: position of the first relevant file for search tasks +- **Best-example usefulness**: whether find tasks surfaced the expected exemplar + ## Evaluation Integrity Rules -⚠️ **CRITICAL**: These eval fixtures are FROZEN. Once committed: +⚠️ **CRITICAL**: These fixtures are FROZEN. Once committed: 1. **DO NOT** adjust expected results to match system output 2. **DO NOT** add queries during development to "improve" scores 3. **DO NOT** remove "hard" queries that the system fails 4. **DO NOT** tune the system on this eval set then report scores +For discovery specifically: + +5. **DO NOT** benchmark an unreleased `map` command or a new MCP map tool +6. 
**DO NOT** claim implementation quality from this benchmark +7. **DO** keep comparator setup limitations explicit when a lane requires manual log capture + ### Proper Usage ✅ **CORRECT**: @@ -124,18 +153,112 @@ Example: To reproduce published results: -1. Clone the exact codebase version: +1. Clone the exact codebase versions: ```bash -git clone https://github.com/trungk18/angular-spotify -cd angular-spotify -git checkout +git clone https://github.com/trungk18/angular-spotify /path/to/angular-spotify +git -C /path/to/angular-spotify checkout ff9efa765c53cfde78c9a172c62d515ae8ef9fe0 + +git clone https://github.com/excalidraw/excalidraw /path/to/excalidraw +git -C /path/to/excalidraw checkout e18c1dd213000dde0ae94ef7eb00aab537b39708 ``` 2. Use the frozen eval fixture (committed before measurements) -3. Run eval on both baseline and new version +3. Run eval on both pinned repos 4. Compare metrics transparently +## Discovery Benchmark Scope + +Phase 5 freezes discovery around three jobs only: + +1. **Map** - repo orientation and subsystem awareness +2. **Find** - dominant local pattern and best-example discovery +3. 
**Search** - targeted file and symbol discovery with low noise + +Allowed current-surface lane: + +- `search_codebase` +- `get_codebase_metadata` +- `get_team_patterns` +- `codebase://context` + +Explicitly out of bounds: + +- unreleased `map` CLI behavior +- a new MCP `get_codebase_map` tool +- implementation-quality or code-generation claims + +## Comparator Notes + +- `raw Claude Code` is the primary baseline and uses a manual log-capture lane +- `GrepAI`, `jCodeMunch`, and `codebase-memory-mcp` are the named MCP comparators +- `codebase-memory-mcp` is the heavier structural comparator, not the primary public baseline +- If a comparator cannot be run fairly via direct tool calls, document the public setup and use the manual lane rather than inventing a fake automation path + +## Comparator Setup Commands + +These commands document the public setup path for the named comparator set. They do not convert the benchmark into a built-in automation path; the named comparators still run through the documented manual log-capture lane. + +### raw Claude Code + +Public install path: + +```bash +npm install -g @anthropic-ai/claude-code +cd /path/to/angular-spotify +claude +``` + +Use the same Claude Code version, model, and base instructions across all baseline captures. 
+ +### GrepAI + +Public install path from the project README: + +```bash +curl -sSL https://raw.githubusercontent.com/yoanbernabeu/grepai/main/install.sh | sh +ollama pull nomic-embed-text +cd /path/to/angular-spotify +grepai init +grepai watch +``` + +Windows PowerShell install path: + +```powershell +irm https://raw.githubusercontent.com/yoanbernabeu/grepai/main/install.ps1 | iex +``` + +### jCodeMunch + +Public install path from PyPI: + +```bash +pip install jcodemunch-mcp +claude mcp add jcodemunch uvx jcodemunch-mcp +cd /path/to/angular-spotify +claude +``` + +Use the same MCP-enabled Claude Code session style for the benchmark capture and let jCodeMunch index the project through its documented first-run flow. + +### codebase-memory-mcp + +Public install path from the project README: + +```bash +curl -fsSL https://raw.githubusercontent.com/DeusData/codebase-memory-mcp/main/install.sh | bash +cd /path/to/angular-spotify +``` + +Windows PowerShell install path: + +```powershell +powershell -ExecutionPolicy ByPass -c "irm https://raw.githubusercontent.com/DeusData/codebase-memory-mcp/main/install.ps1 | iex" +``` + +After installation, restart the coding agent and use the documented prompt to index the project before running the manual benchmark capture. 
+ ## Adding New Eval Sets When creating new eval sets: diff --git a/tests/fixtures/discovery-angular-spotify.json b/tests/fixtures/discovery-angular-spotify.json new file mode 100644 index 0000000..709e38b --- /dev/null +++ b/tests/fixtures/discovery-angular-spotify.json @@ -0,0 +1,123 @@ +{ + "description": "Frozen discovery benchmark tasks for angular-spotify using current shipped surfaces only.", + "codebase": "angular-spotify", + "repository": "trungk18/angular-spotify", + "repositoryUrl": "https://github.com/trungk18/angular-spotify", + "repositoryRef": "ff9efa765c53cfde78c9a172c62d515ae8ef9fe0", + "frozenDate": "2026-04-04", + "notes": "Tasks are discovery-only and intentionally benchmark the current resource and tool surface, not future Phase 7 or 8 behavior.", + "tasks": [ + { + "id": "as-map-01", + "title": "Identify the main app areas from the current map surface", + "job": "map", + "surface": "codebase://context", + "prompt": "What are the main app areas in this repo?", + "expectedSignals": ["libraries actually used", "patterns", "generated:"] + }, + { + "id": "as-map-02", + "title": "Orient to the framework and architecture", + "job": "map", + "surface": "get_codebase_metadata", + "prompt": "What framework and architecture shape does this repo use?", + "expectedSignals": ["framework", "architecture", "statistics"] + }, + { + "id": "as-map-03", + "title": "Find store-heavy subsystems in the repo overview", + "job": "map", + "surface": "codebase://context", + "prompt": "Which subsystems look state-heavy?", + "expectedSignals": ["state", "patterns", "libraries actually used"] + }, + { + "id": "as-map-04", + "title": "See whether internal path aliases exist", + "job": "map", + "surface": "codebase://context", + "prompt": "Does this repo expose import aliases?", + "expectedSignals": ["import aliases", "tsconfig"] + }, + { + "id": "as-find-01", + "title": "Find the dominant dependency injection pattern", + "job": "find", + "surface": "get_team_patterns", + 
"prompt": "What DI pattern dominates here?", + "args": { "category": "di" }, + "expectedSignals": ["dependencyInjection"], + "expectedBestExamplePatterns": ["src", "service"] + }, + { + "id": "as-find-02", + "title": "Find the dominant state-management pattern", + "job": "find", + "surface": "get_team_patterns", + "prompt": "What state-management pattern should an agent imitate?", + "args": { "category": "state" }, + "expectedSignals": ["stateManagement"], + "expectedBestExamplePatterns": ["store", "state"] + }, + { + "id": "as-find-03", + "title": "Find a best local example before editing auth behavior", + "job": "find", + "surface": "search_codebase", + "prompt": "What should I imitate for auth-related request handling?", + "args": { "query": "auth interceptor request handling", "intent": "edit", "limit": 5 }, + "expectedSignals": ["preflight", "bestExample", "patterns"], + "expectedBestExamplePatterns": ["auth", "interceptor"] + }, + { + "id": "as-find-04", + "title": "Find testing conventions and likely examples", + "job": "find", + "surface": "get_team_patterns", + "prompt": "What testing setup dominates here?", + "args": { "category": "testing" }, + "expectedSignals": ["unitTestFramework", "test"], + "expectedBestExamplePatterns": ["spec", "test"] + }, + { + "id": "as-search-01", + "title": "Target the playback API entrypoint", + "job": "search", + "surface": "search_codebase", + "prompt": "skip to next song", + "args": { "query": "skip to next song", "limit": 5 }, + "expectedSignals": ["results", "searchQuality"], + "expectedFilePatterns": ["player-api", "player/api"] + }, + { + "id": "as-search-02", + "title": "Find persistence wiring", + "job": "search", + "surface": "search_codebase", + "prompt": "persist data across browser sessions", + "args": { "query": "persist data across browser sessions", "limit": 5 }, + "expectedSignals": ["results", "searchQuality"], + "expectedFilePatterns": ["storage", "local-storage"] + }, + { + "id": "as-search-03", + 
"title": "Find where auth headers are added", + "job": "search", + "surface": "search_codebase", + "prompt": "add authorization token to API requests", + "args": { "query": "add authorization token to API requests", "limit": 5 }, + "expectedSignals": ["results", "searchQuality"], + "expectedFilePatterns": ["auth", "interceptor"] + }, + { + "id": "as-search-04", + "title": "Find album selectors or equivalent state wiring", + "job": "search", + "surface": "search_codebase", + "prompt": "album selectors ngrx", + "args": { "query": "album selectors ngrx", "limit": 5 }, + "expectedSignals": ["results", "searchQuality"], + "expectedFilePatterns": ["selector", "store", "album"] + } + ] +} diff --git a/tests/fixtures/discovery-benchmark-protocol.json b/tests/fixtures/discovery-benchmark-protocol.json new file mode 100644 index 0000000..82e7378 --- /dev/null +++ b/tests/fixtures/discovery-benchmark-protocol.json @@ -0,0 +1,74 @@ +{ + "name": "v2-discovery-benchmark", + "frozenDate": "2026-04-04", + "scope": "discovery-only", + "jobs": ["map", "find", "search"], + "allowedSurfaces": [ + "search_codebase", + "get_codebase_metadata", + "get_team_patterns", + "codebase://context" + ], + "forbiddenSurfaces": ["map", "get_codebase_map"], + "primaryLane": "direct-tool", + "secondaryLane": "manual-log-capture", + "comparators": [ + { + "name": "raw Claude Code", + "kind": "baseline", + "execution": "manual-log-capture", + "notes": "Use native grep/glob/read exploration only. Record payload cost and usefulness from captured session logs." + }, + { + "name": "GrepAI", + "kind": "mcp-comparator", + "execution": "manual-log-capture", + "notes": "Run from public install/setup commands only. If an equivalent direct-tool run is unavailable, keep it in the manual lane." + }, + { + "name": "jCodeMunch", + "kind": "mcp-comparator", + "execution": "manual-log-capture", + "notes": "Run from public install/setup commands only. Preserve its documented init and policy flow when collecting logs." 
+ }, + { + "name": "codebase-memory-mcp", + "kind": "mcp-comparator", + "execution": "manual-log-capture", + "notes": "Treat as the heavier structural comparator, not the primary public baseline." + } + ], + "metrics": { + "payloadCost": ["averagePayloadBytes", "averageEstimatedTokens"], + "usefulness": ["averageUsefulness", "averageFirstRelevantHit", "bestExampleUsefulnessRate"] + }, + "fairnessRules": [ + "Use only current shipped codebase-context surfaces in the direct-tool lane.", + "Do not benchmark unreleased map functionality.", + "Do not change task wording or expected signals after product-shaping implementation starts.", + "Do not claim implementation-quality or code-generation quality from this benchmark." + ], + "shipGate": { + "baselineRule": "Beat raw Claude Code on payload cost and at least one usefulness metric across the frozen public tasks.", + "comparatorRule": "Stay within 15% of GrepAI, jCodeMunch, and codebase-memory-mcp on frozen usefulness metrics.", + "claimRule": "If the gate is missed on any slice, report the loss and do not broaden relaunch claims.", + "baseline": { + "comparatorName": "raw Claude Code", + "payloadMetric": "averageEstimatedTokens", + "usefulnessMetrics": [ + "averageUsefulness", + "averageFirstRelevantHit", + "bestExampleUsefulnessRate" + ] + }, + "comparators": { + "requiredNames": ["GrepAI", "jCodeMunch", "codebase-memory-mcp"], + "tolerancePercent": 15, + "usefulnessMetrics": [ + "averageUsefulness", + "averageFirstRelevantHit", + "bestExampleUsefulnessRate" + ] + } + } +} diff --git a/tests/fixtures/discovery-excalidraw.json b/tests/fixtures/discovery-excalidraw.json new file mode 100644 index 0000000..4e1e223 --- /dev/null +++ b/tests/fixtures/discovery-excalidraw.json @@ -0,0 +1,123 @@ +{ + "description": "Frozen discovery benchmark tasks for Excalidraw using current shipped surfaces only.", + "codebase": "Excalidraw", + "repository": "excalidraw/excalidraw", + "repositoryUrl": 
"https://github.com/excalidraw/excalidraw", + "repositoryRef": "e18c1dd213000dde0ae94ef7eb00aab537b39708", + "frozenDate": "2026-04-04", + "notes": "Tasks stay broad enough to benchmark discovery honestly against the current product surface without assuming a future compact map command.", + "tasks": [ + { + "id": "ex-map-01", + "title": "Orient to the main subsystems", + "job": "map", + "surface": "get_codebase_metadata", + "prompt": "What are the main app areas in Excalidraw?", + "expectedSignals": ["framework", "architecture", "statistics"] + }, + { + "id": "ex-map-02", + "title": "Use the existing context resource as a repo overview", + "job": "map", + "surface": "codebase://context", + "prompt": "Give me the current codebase intelligence overview.", + "expectedSignals": ["codebase intelligence", "libraries actually used", "patterns"] + }, + { + "id": "ex-map-03", + "title": "Detect whether the repo uses internal path aliases", + "job": "map", + "surface": "codebase://context", + "prompt": "Does Excalidraw expose import aliases?", + "expectedSignals": ["import aliases", "tsconfig"] + }, + { + "id": "ex-map-04", + "title": "Identify likely scene and element-heavy areas", + "job": "map", + "surface": "codebase://context", + "prompt": "Which areas look scene or element heavy?", + "expectedSignals": ["patterns", "libraries actually used", "generated:"] + }, + { + "id": "ex-find-01", + "title": "Find the dominant state-management pattern", + "job": "find", + "surface": "get_team_patterns", + "prompt": "What state-management pattern dominates here?", + "args": { "category": "state" }, + "expectedSignals": ["stateManagement"], + "expectedBestExamplePatterns": ["appstate", "state", "store"] + }, + { + "id": "ex-find-02", + "title": "Find a best example before editing scene behavior", + "job": "find", + "surface": "search_codebase", + "prompt": "What local example should I imitate for scene updates?", + "args": { "query": "scene update flow", "intent": "edit", "limit": 5 
}, + "expectedSignals": ["preflight", "bestExample", "patterns"], + "expectedBestExamplePatterns": ["scene", "app", "element"] + }, + { + "id": "ex-find-03", + "title": "Find testing conventions for UI-heavy code", + "job": "find", + "surface": "get_team_patterns", + "prompt": "What testing setup dominates here?", + "args": { "category": "testing" }, + "expectedSignals": ["test", "framework"], + "expectedBestExamplePatterns": ["test", "spec"] + }, + { + "id": "ex-find-04", + "title": "Find the dominant dependency pattern", + "job": "find", + "surface": "get_team_patterns", + "prompt": "What dependency injection or dependency pattern dominates here?", + "args": { "category": "di" }, + "expectedSignals": ["dependencyInjection"], + "expectedBestExamplePatterns": ["src", "app", "component"] + }, + { + "id": "ex-search-01", + "title": "Find where element types or definitions live", + "job": "search", + "surface": "search_codebase", + "prompt": "element type definitions", + "args": { "query": "element type definitions", "limit": 5 }, + "expectedSignals": ["results", "searchQuality"], + "expectedFilePatterns": ["element", "type"] + }, + { + "id": "ex-search-02", + "title": "Find scene serialization or export logic", + "job": "search", + "surface": "search_codebase", + "prompt": "scene serialization export json", + "args": { "query": "scene serialization export json", "limit": 5 }, + "expectedSignals": ["results", "searchQuality"], + "expectedFilePatterns": ["scene", "json", "data"] + }, + { + "id": "ex-search-03", + "title": "Find app state wiring", + "job": "search", + "surface": "search_codebase", + "prompt": "app state selection and updates", + "args": { "query": "app state selection and updates", "limit": 5 }, + "expectedSignals": ["results", "searchQuality"], + "expectedFilePatterns": ["appstate", "state", "app"] + }, + { + "id": "ex-search-04", + "title": "Find the main canvas or app entry surface", + "job": "search", + "surface": "search_codebase", + "prompt": 
"main canvas app entry", + "args": { "query": "main canvas app entry", "limit": 5 }, + "expectedSignals": ["results", "searchQuality"], + "expectedFilePatterns": ["app", "excalidraw", "canvas"] + } + ] +}