// Source: optave/ops-codegraph-tool — tests/benchmarks/resolution/resolution-benchmark.test.ts @ 78840a1e7bc76cd218cefb5997e5cff2cd03ea47
/**
 * Call Resolution Precision/Recall Benchmark Suite (Roadmap 4.4)
 *
 * Builds codegraph for each hand-annotated fixture project, then compares
 * the resolved call edges against the expected-edges.json manifest.
 *
 * Reports precision (correct / total resolved) and recall (correct / total expected)
 * per language and per resolution mode (static, receiver-typed, interface-dispatched).
 *
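 * CI gate: fails when a language scores below its baseline in THRESHOLDS
 * (languages without an entry default to precision ≥ 85%, recall ≥ 80%).
 * The stated target for JavaScript and TypeScript is precision ≥ 85%, recall ≥ 80%.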
 */

import fs from 'node:fs';
import os from 'node:os';
import path from 'node:path';
import { afterAll, beforeAll, describe, expect, test } from 'vitest';
import { openReadonlyOrFail } from '../../../src/db/index.js';
import { buildGraph } from '../../../src/domain/graph/builder.js';

// ── Types ─────────────────────────────────────────────────────────────────

/**
 * One call edge as read back from the graph database.
 * Field names mirror the SQL column aliases selected in extractResolvedEdges.
 */
interface ResolvedEdge {
  /** Name of the calling node (`src.name`). */
  source_name: string;
  /** File of the calling node (`src.file`). */
  source_file: string;
  /** Name of the called node (`tgt.name`). */
  target_name: string;
  /** File of the called node (`tgt.file`). */
  target_file: string;
  /** Edge kind (`e.kind`); the query only selects rows where this is 'calls'. */
  kind: string;
  /** Value of the edge's `confidence` column; not used by the metrics below. */
  confidence: number;
}

/**
 * One hand-annotated call edge, as listed under `edges` in a fixture's
 * expected-edges.json manifest.
 */
interface ExpectedEdge {
  /** Caller: symbol name and the file it lives in. */
  source: { name: string; file: string };
  /** Callee: symbol name and the file it lives in. */
  target: { name: string; file: string };
  /**
   * Resolution mode label used for the per-mode breakdown
   * (the tests look up 'static' and 'receiver-typed').
   * Edges without a mode are counted under 'unknown'.
   */
  mode?: string;
}

/** Recall bookkeeping for a single resolution mode. */
interface ModeMetrics {
  /** Number of manifest edges tagged with this mode. */
  expected: number;
  /** How many of those edges were also found among the resolved edges. */
  resolved: number;
  /**
   * resolved / expected. Optional because computeMetrics fills it in
   * only after all edges have been counted.
   */
  recall?: number;
}

/** Result of comparing resolved edges with the expected-edges manifest for one language. */
interface BenchmarkMetrics {
  /** truePositives / totalResolved (0 when nothing was resolved). */
  precision: number;
  /** truePositives / totalExpected (0 when nothing was expected). */
  recall: number;
  /** Count of distinct edge keys present in both resolved and expected sets. */
  truePositives: number;
  /** Count of distinct edge keys resolved but not expected. */
  falsePositives: number;
  /** Count of distinct edge keys expected but not resolved. */
  falseNegatives: number;
  /** Number of distinct resolved edge keys. */
  totalResolved: number;
  /** Number of distinct expected edge keys. */
  totalExpected: number;
  /** Per-mode recall breakdown, keyed by the manifest's `mode` label. */
  byMode: Record<string, ModeMetrics>;
  /** Edge keys behind `falsePositives`, kept for debugging output. */
  falsePositiveEdges: string[];
  /** Edge keys behind `falseNegatives`, kept for debugging output. */
  falseNegativeEdges: string[];
}

// ── Configuration ────────────────────────────────────────────────────────

/**
 * Directory next to this test file that holds one sub-directory per fixture
 * language; each sub-directory is expected to contain an expected-edges.json manifest.
 */
const FIXTURES_DIR = path.join(import.meta.dirname, 'fixtures');

/** Minimum scores a fixture language must reach for the benchmark to pass. */
interface LanguageThresholds {
  /** Minimum overall precision (0–1). */
  precision: number;
  /** Minimum overall recall (0–1). */
  recall: number;
  /** Minimum recall over edges tagged with the 'static' mode (0–1). */
  staticRecall: number;
  /** Minimum recall over edges tagged with the 'receiver-typed' mode (0–1). */
  receiverRecall: number;
}

/**
 * Thresholds are baselines — they ratchet up as resolution improves.
 * Current values reflect measured capabilities as of the initial benchmark.
 * Target: precision ≥85%, recall ≥80% for both JS and TS.
 *
 * Receiver-typed recall thresholds are tracked separately and start lower
 * because cross-file receiver dispatch is still maturing.
 *
 * Keyed by fixture directory name. The map is typed as a partial string-keyed
 * record because the tests index it with whatever language name was discovered
 * on disk and fall back to defaults when there is no entry; a plain object
 * literal type would reject that lookup under `strict`.
 */
const THRESHOLDS: Partial<Record<string, LanguageThresholds>> = {
  javascript: { precision: 0.85, recall: 0.55, staticRecall: 0.6, receiverRecall: 0.3 },
  typescript: { precision: 0.85, recall: 0.58, staticRecall: 0.9, receiverRecall: 0.45 },
};

// ── Helpers ──────────────────────────────────────────────────────────────

/**
 * Clone a language fixture into a fresh temp directory so that buildGraph can
 * create its .codegraph/ output there instead of inside the repository.
 *
 * Only regular files directly inside the fixture directory are copied, and the
 * expected-edges.json manifest is left behind.
 *
 * @param lang - Name of the fixture sub-directory under FIXTURES_DIR.
 * @returns Path of the newly created temp directory.
 */
function copyFixture(lang: string): string {
  const sourceDir = path.join(FIXTURES_DIR, lang);
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), `codegraph-resolution-${lang}-`));

  const copyable = fs
    .readdirSync(sourceDir, { withFileTypes: true })
    .filter((entry) => entry.isFile() && entry.name !== 'expected-edges.json');

  copyable.forEach((entry) => {
    fs.copyFileSync(path.join(sourceDir, entry.name), path.join(tempDir, entry.name));
  });

  return tempDir;
}

/**
 * Run a full (non-incremental) graph build over a copied fixture using the
 * wasm engine, with the dataflow, cfg and ast options switched off.
 *
 * @param fixtureDir - Directory returned by copyFixture.
 */
async function buildFixtureGraph(fixtureDir: string): Promise<void> {
  const buildOptions = {
    incremental: false,
    engine: 'wasm',
    dataflow: false,
    cfg: false,
    ast: false,
  } as const;

  await buildGraph(fixtureDir, buildOptions);
}

/**
 * Read every 'calls' edge whose source node is a function or method from the
 * graph database written under `<fixtureDir>/.codegraph/graph.db`.
 *
 * Each returned row carries the snake_case columns selected below:
 * source_name, source_file, target_name, target_file, kind, confidence.
 * The database handle is closed even if the query throws.
 *
 * @param fixtureDir - Fixture directory that buildFixtureGraph was run on.
 */
function extractResolvedEdges(fixtureDir: string) {
  const callEdgesSql = `
      SELECT
        src.name  AS source_name,
        src.file  AS source_file,
        tgt.name  AS target_name,
        tgt.file  AS target_file,
        e.kind    AS kind,
        e.confidence AS confidence
      FROM edges e
      JOIN nodes src ON e.source_id = src.id
      JOIN nodes tgt ON e.target_id = tgt.id
      WHERE e.kind = 'calls'
        AND src.kind IN ('function', 'method')
    `;

  const db = openReadonlyOrFail(path.join(fixtureDir, '.codegraph', 'graph.db'));
  try {
    return db.prepare(callEdgesSql).all();
  } finally {
    db.close();
  }
}

/**
 * Reduce a file path to its final segment so that edges can be compared
 * without caring which directory the fixture was copied to.
 *
 * @param filePath - Absolute or relative path.
 * @returns The basename of `filePath`.
 */
function normalizeFile(filePath: string): string {
  const baseName = path.basename(filePath);
  return baseName;
}

/**
 * Turn an edge into a canonical string of the form
 * `caller@file -> callee@file`, with both files reduced to their basename,
 * so edges can be stored in and looked up from a Set.
 */
function edgeKey(
  sourceName: string,
  sourceFile: string,
  targetName: string,
  targetFile: string,
): string {
  const caller = `${sourceName}@${normalizeFile(sourceFile)}`;
  const callee = `${targetName}@${normalizeFile(targetFile)}`;
  return [caller, callee].join(' -> ');
}

/**
 * Score the resolver's output against the hand-annotated manifest.
 *
 * Both edge lists are reduced to string keys via edgeKey, so duplicate edges
 * collapse and files are matched by basename. Overall precision and recall are
 * computed over those distinct keys; the per-mode breakdown counts every
 * manifest entry individually.
 *
 * @param resolvedEdges - Call edges read from the graph database.
 * @param expectedEdges - Edges listed in the fixture's manifest.
 * @returns Overall precision/recall, per-mode recall, and the offending edge keys.
 */
function computeMetrics(
  resolvedEdges: ResolvedEdge[],
  expectedEdges: ExpectedEdge[],
): BenchmarkMetrics {
  // Distinct keys of everything the resolver produced.
  const resolvedKeys = new Set<string>();
  for (const edge of resolvedEdges) {
    resolvedKeys.add(
      edgeKey(edge.source_name, edge.source_file, edge.target_name, edge.target_file),
    );
  }

  // One pass over the manifest collects the expected keys and the per-mode tallies.
  const expectedKeys = new Set<string>();
  const byMode: Record<string, ModeMetrics> = {};
  for (const edge of expectedEdges) {
    const key = edgeKey(edge.source.name, edge.source.file, edge.target.name, edge.target.file);
    expectedKeys.add(key);

    const mode = edge.mode || 'unknown';
    let tally = byMode[mode];
    if (!tally) {
      tally = { expected: 0, resolved: 0 };
      byMode[mode] = tally;
    }
    tally.expected += 1;
    if (resolvedKeys.has(key)) tally.resolved += 1;
  }

  // Per-mode recall can only be derived once all counts are final.
  for (const tally of Object.values(byMode)) {
    tally.recall = tally.expected > 0 ? tally.resolved / tally.expected : 0;
  }

  // Split resolved keys into hits and extras, preserving insertion order.
  const matched: string[] = [];
  const unexpected: string[] = [];
  for (const key of resolvedKeys) {
    (expectedKeys.has(key) ? matched : unexpected).push(key);
  }

  // Expected keys the resolver never produced.
  const missing = [...expectedKeys].filter((key) => !resolvedKeys.has(key));

  const truePositives = matched.length;
  return {
    precision: resolvedKeys.size > 0 ? truePositives / resolvedKeys.size : 0,
    recall: expectedKeys.size > 0 ? truePositives / expectedKeys.size : 0,
    truePositives,
    falsePositives: unexpected.length,
    falseNegatives: missing.length,
    totalResolved: resolvedKeys.size,
    totalExpected: expectedKeys.size,
    byMode,
    // Detailed lists for debugging
    falsePositiveEdges: unexpected,
    falseNegativeEdges: missing,
  };
}

/**
 * Render one language's metrics as an indented, multi-line console report.
 *
 * The report lists overall precision and recall, recall per resolution mode,
 * every missing edge, and at most the first ten unexpected edges.
 *
 * @param lang - Fixture language name; upper-cased in the heading.
 * @param metrics - Metrics to render.
 * @returns The report lines joined with newlines (the heading starts with a blank line).
 */
function formatReport(lang: string, metrics: BenchmarkMetrics): string {
  // False positives can be numerous; list only the first few and summarise the rest.
  const maxListedFalsePositives = 10;
  const asPercent = (ratio: number): string => (ratio * 100).toFixed(1);

  const lines = [
    `\n  ── ${lang.toUpperCase()} Resolution Metrics ──`,
    `  Precision: ${asPercent(metrics.precision)}% (${metrics.truePositives} correct / ${metrics.totalResolved} resolved)`,
    `  Recall:    ${asPercent(metrics.recall)}% (${metrics.truePositives} correct / ${metrics.totalExpected} expected)`,
    '',
    '  By resolution mode:',
  ];

  for (const [mode, data] of Object.entries(metrics.byMode)) {
    // `recall` is optional on ModeMetrics; derive it from the counts when it is
    // absent instead of printing "NaN%".
    const recall = data.recall ?? (data.expected > 0 ? data.resolved / data.expected : 0);
    lines.push(`    ${mode}: ${data.resolved}/${data.expected} (${asPercent(recall)}% recall)`);
  }

  if (metrics.falseNegativeEdges.length > 0) {
    lines.push('', '  Missing edges (false negatives):');
    for (const e of metrics.falseNegativeEdges) {
      lines.push(`    - ${e}`);
    }
  }

  if (metrics.falsePositiveEdges.length > 0) {
    lines.push('', '  Unexpected edges (false positives):');
    for (const e of metrics.falsePositiveEdges.slice(0, maxListedFalsePositives)) {
      lines.push(`    + ${e}`);
    }
    const hidden = metrics.falsePositiveEdges.length - maxListedFalsePositives;
    if (hidden > 0) {
      lines.push(`    ... and ${hidden} more`);
    }
  }

  return lines.join('\n');
}

// ── Tests ────────────────────────────────────────────────────────────────

/**
 * List the fixture languages available on disk.
 *
 * A language counts as available when FIXTURES_DIR contains an entry of that
 * name holding an expected-edges.json manifest.
 *
 * @returns The matching entry names, or an empty array when FIXTURES_DIR does not exist.
 */
function discoverFixtures(): string[] {
  if (!fs.existsSync(FIXTURES_DIR)) return [];

  const hasManifest = (dir: string): boolean =>
    fs.existsSync(path.join(FIXTURES_DIR, dir, 'expected-edges.json'));

  return fs.readdirSync(FIXTURES_DIR).filter(hasManifest);
}

/** Fixture languages found on disk at module load; one nested suite is registered per entry. */
const languages = discoverFixtures();

/**
 * Metrics per language, written by each language suite's beforeAll hook and
 * read by the top-level afterAll hook to print the final summary.
 */
const allResults: Record<string, BenchmarkMetrics> = {};

// Top-level suite: registers one nested suite per discovered fixture language
// and prints a combined report after all of them have run.
describe('Call Resolution Precision/Recall', () => {
  afterAll(() => {
    // Print combined summary for every language that stored metrics in allResults
    const summaryLines = [
      '\n╔══════════════════════════════════════════╗',
      '║  Resolution Benchmark Summary            ║',
      '╚══════════════════════════════════════════╝',
    ];
    for (const [lang, metrics] of Object.entries(allResults)) {
      summaryLines.push(formatReport(lang, metrics));
    }
    summaryLines.push('');
    console.log(summaryLines.join('\n'));
  });

  for (const lang of languages) {
    describe(lang, () => {
      // Shared state, populated once in beforeAll and read by every test below.
      let fixtureDir: string;
      let resolvedEdges: ResolvedEdge[];
      let expectedEdges: ExpectedEdge[];
      let metrics: BenchmarkMetrics;

      beforeAll(async () => {
        // Build the graph on a temp copy so the repo fixture stays untouched.
        fixtureDir = copyFixture(lang);
        await buildFixtureGraph(fixtureDir);

        resolvedEdges = extractResolvedEdges(fixtureDir);

        // The manifest is read from the repo, since copyFixture does not copy it.
        const manifestPath = path.join(FIXTURES_DIR, lang, 'expected-edges.json');
        const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8'));
        expectedEdges = manifest.edges;

        metrics = computeMetrics(resolvedEdges, expectedEdges);
        // Publish for the combined summary printed by the outer afterAll.
        allResults[lang] = metrics;
      }, 60_000); // hook timeout in milliseconds

      afterAll(() => {
        // fixtureDir is unset if copyFixture threw, so guard before deleting.
        if (fixtureDir) {
          fs.rmSync(fixtureDir, { recursive: true, force: true });
        }
      });

      // Sanity check: the build produced at least one call edge.
      test('builds graph successfully', () => {
        expect(resolvedEdges).toBeDefined();
        expect(resolvedEdges.length).toBeGreaterThan(0);
      });

      // Sanity check: an empty manifest would make the recall figures meaningless.
      test('expected edges manifest is non-empty', () => {
        expect(expectedEdges.length).toBeGreaterThan(0);
      });

      test(`precision meets threshold`, () => {
        // Languages without a THRESHOLDS entry fall back to 0.85.
        const threshold = THRESHOLDS[lang]?.precision ?? 0.85;
        expect(
          metrics.precision,
          `${lang} precision ${(metrics.precision * 100).toFixed(1)}% is below ${(threshold * 100).toFixed(0)}% threshold.\n` +
            `False positives:\n${metrics.falsePositiveEdges.map((e) => `  + ${e}`).join('\n')}`,
        ).toBeGreaterThanOrEqual(threshold);
      });

      test(`recall meets threshold`, () => {
        // Languages without a THRESHOLDS entry fall back to 0.8.
        const threshold = THRESHOLDS[lang]?.recall ?? 0.8;
        expect(
          metrics.recall,
          `${lang} recall ${(metrics.recall * 100).toFixed(1)}% is below ${(threshold * 100).toFixed(0)}% threshold.\n` +
            `Missing edges:\n${metrics.falseNegativeEdges.map((e) => `  - ${e}`).join('\n')}`,
        ).toBeGreaterThanOrEqual(threshold);
      });

      test('static call resolution recall', () => {
        const staticMode = metrics.byMode.static;
        // The test passes trivially when the manifest has no 'static' edges.
        if (!staticMode) return; // no static edges in manifest
        const threshold = THRESHOLDS[lang]?.staticRecall ?? 0.8;
        expect(
          staticMode.recall,
          `${lang} static recall ${(staticMode.recall * 100).toFixed(1)}% — ` +
            `${staticMode.resolved}/${staticMode.expected} resolved`,
        ).toBeGreaterThanOrEqual(threshold);
      });

      test('receiver-typed call resolution recall', () => {
        const receiverMode = metrics.byMode['receiver-typed'];
        // The test passes trivially when the manifest has no 'receiver-typed' edges.
        if (!receiverMode) return; // no receiver-typed edges in manifest
        const threshold = THRESHOLDS[lang]?.receiverRecall ?? 0.5;
        expect(
          receiverMode.recall,
          `${lang} receiver-typed recall ${(receiverMode.recall * 100).toFixed(1)}% — ` +
            `${receiverMode.resolved}/${receiverMode.expected} resolved`,
        ).toBeGreaterThanOrEqual(threshold);
      });
    });
  }
});