ops-codegraph-tool/tests/benchmarks/resolution/resolution-benchmark.test.ts at d697929e38b2b3efe94f613aade132bebcad8dcf · optave/ops-codegraph-tool · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
/**
 * Call Resolution Precision/Recall Benchmark Suite (Roadmap 4.4)
 *
 * Builds codegraph for each hand-annotated fixture project, then compares
 * the resolved call edges against the expected-edges.json manifest.
 *
 * Reports precision (correct / total resolved) and recall (correct / total expected)
 * per language and per resolution mode.
 *
 * CI gate: fails if precision or recall drops below per-language thresholds.
 *
 * **Artifact mode (CI):** when `RESOLUTION_RESULT_JSON` points at a result
 * file produced by `scripts/resolution-benchmark.ts`, the suite reads those
 * pre-computed metrics and skips the fixture rebuild — avoiding the duplicate
 * work that doubled pre-publish CI time (issue #1052). Local runs without
 * the env var fall back to the build-from-fixtures path.
 */

import fs from 'node:fs';
import os from 'node:os';
import path from 'node:path';
import { afterAll, beforeAll, describe, expect, test } from 'vitest';
import { openReadonlyOrFail } from '../../../src/db/index.js';
import { buildGraph } from '../../../src/domain/graph/builder.js';

// ── Types ─────────────────────────────────────────────────────────────────

interface ResolvedEdge {
  source_name: string;
  source_file: string;
  target_name: string;
  target_file: string;
  kind: string;
  confidence: number;
}

interface ExpectedEdge {
  source: { name: string; file: string };
  target: { name: string; file: string };
  mode?: string;
}

interface ModeMetrics {
  expected: number;
  resolved: number;
  recall?: number;
}

interface BenchmarkMetrics {
  precision: number;
  recall: number;
  truePositives: number;
  falsePositives: number;
  falseNegatives: number;
  totalResolved: number;
  totalExpected: number;
  byMode: Record<string, ModeMetrics>;
  falsePositiveEdges: string[];
  falseNegativeEdges: string[];
}

// ── Configuration ────────────────────────────────────────────────────────

const FIXTURES_DIR = path.join(import.meta.dirname, 'fixtures');

/**
 * Per-language thresholds. Thresholds ratchet up as resolution improves.
 *
 * Languages with mature resolution (JS/TS) have higher bars.
 * Newer languages start with lower thresholds to avoid blocking CI
 * while still tracking regressions.
 */
const THRESHOLDS: Record<string, { precision: number; recall: number }> = {
  // Mature — high bars (100% precision, high recall)
  javascript: { precision: 0.85, recall: 0.5 },
  typescript: { precision: 0.85, recall: 0.5 },
  tsx: { precision: 0.85, recall: 0.8 },
  // TODO: raise thresholds once bash call resolution is implemented
  bash: { precision: 0.0, recall: 0.0 },
  // TODO: raise thresholds once ruby call resolution is reliable
  ruby: { precision: 0.0, recall: 0.0 },
  c: { precision: 0.6, recall: 0.2 },
  // Established — medium bars
  python: { precision: 0.7, recall: 0.3 },
  go: { precision: 0.7, recall: 0.3 },
  java: { precision: 0.7, recall: 0.3 },
  csharp: { precision: 0.5, recall: 0.2 },
  kotlin: { precision: 0.6, recall: 0.2 },
  // Lower bars — resolution still maturing
  rust: { precision: 0.6, recall: 0.2 },
  cpp: { precision: 0.6, recall: 0.2 },
  swift: { precision: 0.5, recall: 0.15 },
  // TODO(#872): raise haskell thresholds once call resolution lands
  haskell: { precision: 0.0, recall: 0.0 },
  // TODO(#873): raise lua thresholds once call resolution lands
  lua: { precision: 0.0, recall: 0.0 },
  // TODO(#874): raise ocaml thresholds once call resolution lands
  ocaml: { precision: 0.0, recall: 0.0 },
  // Minimal — call resolution not yet implemented or grammar unavailable
  // TODO(#875): raise scala thresholds once call resolution lands
  scala: { precision: 0.0, recall: 0.0 },
  php: { precision: 0.6, recall: 0.2 },
  // TODO: raise thresholds below once call resolution is implemented for each language
  elixir: { precision: 0.0, recall: 0.0 },
  dart: { precision: 0.0, recall: 0.0 },
  zig: { precision: 0.0, recall: 0.0 },
  fsharp: { precision: 0.0, recall: 0.0 },
  gleam: { precision: 0.0, recall: 0.0 },
  clojure: { precision: 0.0, recall: 0.0 },
  julia: { precision: 0.0, recall: 0.0 },
  r: { precision: 0.0, recall: 0.0 },
  erlang: { precision: 0.0, recall: 0.0 },
  solidity: { precision: 0.0, recall: 0.0 },
  // New fixture languages — no parser or call resolution yet
  objc: { precision: 0.0, recall: 0.0 },
  cuda: { precision: 0.0, recall: 0.0 },
  groovy: { precision: 0.0, recall: 0.0 },
  verilog: { precision: 0.0, recall: 0.0 },
  hcl: { precision: 0.0, recall: 0.0 },
};

/** Default thresholds for languages not explicitly listed. */
const DEFAULT_THRESHOLD = { precision: 0.5, recall: 0.15 };

// Files to skip when copying fixtures (not source code for codegraph)
const SKIP_FILES = new Set(['expected-edges.json', 'driver.mjs']);

// ── Helpers ──────────────────────────────────────────────────────────────

function copyFixture(lang: string): string {
  const src = path.join(FIXTURES_DIR, lang);
  const tmp = fs.mkdtempSync(path.join(os.tmpdir(), `codegraph-resolution-${lang}-`));
  for (const entry of fs.readdirSync(src, { withFileTypes: true })) {
    if (SKIP_FILES.has(entry.name)) continue;
    if (!entry.isFile()) continue;
    fs.copyFileSync(path.join(src, entry.name), path.join(tmp, entry.name));
  }
  return tmp;
}

function buildFixtureGraph(fixtureDir: string): Promise<void> {
  return buildGraph(fixtureDir, {
    incremental: false,
    engine: 'wasm',
    dataflow: false,
    cfg: false,
    ast: false,
  });
}

function extractResolvedEdges(fixtureDir: string) {
  const dbPath = path.join(fixtureDir, '.codegraph', 'graph.db');
  const db = openReadonlyOrFail(dbPath);
  try {
    return db
      .prepare(`
      SELECT
        src.name  AS source_name,
        src.file  AS source_file,
        tgt.name  AS target_name,
        tgt.file  AS target_file,
        e.kind    AS kind,
        e.confidence AS confidence
      FROM edges e
      JOIN nodes src ON e.source_id = src.id
      JOIN nodes tgt ON e.target_id = tgt.id
      WHERE e.kind = 'calls'
        AND src.kind IN ('function', 'method')
    `)
      .all();
  } finally {
    db.close();
  }
}

function normalizeFile(filePath: string): string {
  return path.basename(filePath);
}

function edgeKey(
  sourceName: string,
  sourceFile: string,
  targetName: string,
  targetFile: string,
): string {
  return `${sourceName}@${normalizeFile(sourceFile)} -> ${targetName}@${normalizeFile(targetFile)}`;
}

function computeMetrics(
  resolvedEdges: ResolvedEdge[],
  expectedEdges: ExpectedEdge[],
): BenchmarkMetrics {
  const resolvedSet = new Set(
    resolvedEdges.map((e) => edgeKey(e.source_name, e.source_file, e.target_name, e.target_file)),
  );

  const expectedSet = new Set(
    expectedEdges.map((e) => edgeKey(e.source.name, e.source.file, e.target.name, e.target.file)),
  );

  const truePositives = new Set([...resolvedSet].filter((k) => expectedSet.has(k)));
  const falsePositives = new Set([...resolvedSet].filter((k) => !expectedSet.has(k)));
  const falseNegatives = new Set([...expectedSet].filter((k) => !resolvedSet.has(k)));

  const precision = resolvedSet.size > 0 ? truePositives.size / resolvedSet.size : 0;
  const recall = expectedSet.size > 0 ? truePositives.size / expectedSet.size : 0;

  const byMode: Record<string, ModeMetrics> = {};
  for (const edge of expectedEdges) {
    const mode = edge.mode || 'unknown';
    if (!byMode[mode]) byMode[mode] = { expected: 0, resolved: 0 };
    byMode[mode].expected++;
    const key = edgeKey(edge.source.name, edge.source.file, edge.target.name, edge.target.file);
    if (resolvedSet.has(key)) byMode[mode].resolved++;
  }

  for (const mode of Object.keys(byMode)) {
    const m = byMode[mode];
    m.recall = m.expected > 0 ? m.resolved / m.expected : 0;
  }

  return {
    precision,
    recall,
    truePositives: truePositives.size,
    falsePositives: falsePositives.size,
    falseNegatives: falseNegatives.size,
    totalResolved: resolvedSet.size,
    totalExpected: expectedSet.size,
    byMode,
    falsePositiveEdges: [...falsePositives],
    falseNegativeEdges: [...falseNegatives],
  };
}

function formatReport(lang: string, metrics: BenchmarkMetrics): string {
  const lines = [
    `\n  ── ${lang.toUpperCase()} Resolution Metrics ──`,
    `  Precision: ${(metrics.precision * 100).toFixed(1)}% (${metrics.truePositives} correct / ${metrics.totalResolved} resolved)`,
    `  Recall:    ${(metrics.recall * 100).toFixed(1)}% (${metrics.truePositives} correct / ${metrics.totalExpected} expected)`,
    '',
    '  By resolution mode:',
  ];

  for (const [mode, data] of Object.entries(metrics.byMode)) {
    lines.push(
      `    ${mode}: ${data.resolved}/${data.expected} (${((data.recall ?? 0) * 100).toFixed(1)}% recall)`,
    );
  }

  if (metrics.falseNegativeEdges.length > 0) {
    lines.push('', '  Missing edges (false negatives):');
    for (const e of metrics.falseNegativeEdges) {
      lines.push(`    - ${e}`);
    }
  }

  if (metrics.falsePositiveEdges.length > 0) {
    lines.push('', '  Unexpected edges (false positives):');
    for (const e of metrics.falsePositiveEdges.slice(0, 10)) {
      lines.push(`    + ${e}`);
    }
    if (metrics.falsePositiveEdges.length > 10) {
      lines.push(`    ... and ${metrics.falsePositiveEdges.length - 10} more`);
    }
  }

  return lines.join('\n');
}

// ── Artifact loading (CI dedup, issue #1052) ─────────────────────────────

const ARTIFACT_PATH = process.env.RESOLUTION_RESULT_JSON;

interface ArtifactLangResult {
  precision: number;
  recall: number;
  truePositives: number;
  falsePositives: number;
  falseNegatives: number;
  totalResolved: number;
  totalExpected: number;
  byMode: Record<string, ModeMetrics>;
  falsePositiveEdges?: string[];
  falseNegativeEdges?: string[];
}

function loadArtifact(artifactPath: string): Record<string, ArtifactLangResult> {
  if (!fs.existsSync(artifactPath)) {
    throw new Error(
      `RESOLUTION_RESULT_JSON=${artifactPath} not found — run scripts/resolution-benchmark.ts first.`,
    );
  }
  return JSON.parse(fs.readFileSync(artifactPath, 'utf-8'));
}

function metricsFromArtifact(lang: string, raw: ArtifactLangResult): BenchmarkMetrics {
  if (!Array.isArray(raw.falsePositiveEdges) || !Array.isArray(raw.falseNegativeEdges)) {
    throw new Error(
      `Resolution artifact for ${lang} is missing falsePositiveEdges/falseNegativeEdges — regenerate with the current resolution-benchmark.ts.`,
    );
  }
  return {
    precision: raw.precision,
    recall: raw.recall,
    truePositives: raw.truePositives,
    falsePositives: raw.falsePositives,
    falseNegatives: raw.falseNegatives,
    totalResolved: raw.totalResolved,
    totalExpected: raw.totalExpected,
    byMode: raw.byMode,
    falsePositiveEdges: raw.falsePositiveEdges,
    falseNegativeEdges: raw.falseNegativeEdges,
  };
}

// ── Tests ────────────────────────────────────────────────────────────────

function discoverFixtures(): string[] {
  if (!fs.existsSync(FIXTURES_DIR)) return [];
  const languages: string[] = [];
  for (const dir of fs.readdirSync(FIXTURES_DIR)) {
    const manifestPath = path.join(FIXTURES_DIR, dir, 'expected-edges.json');
    if (fs.existsSync(manifestPath)) {
      languages.push(dir);
    }
  }
  return languages;
}

const artifact = ARTIFACT_PATH ? loadArtifact(ARTIFACT_PATH) : null;
// In artifact mode, drive the suite from the keys in the artifact so we never
// silently skip a language the script reported. In local mode, discover from
// the filesystem like before.
const languages = artifact ? Object.keys(artifact).sort() : discoverFixtures();

/** Stores all results for the final summary */
const allResults: Record<string, BenchmarkMetrics> = {};

describe('Call Resolution Precision/Recall', () => {
  afterAll(() => {
    // Print combined summary
    const summaryLines = [
      '\n╔══════════════════════════════════════════╗',
      '║  Resolution Benchmark Summary            ║',
      '╚══════════════════════════════════════════╝',
    ];
    for (const [lang, metrics] of Object.entries(allResults)) {
      summaryLines.push(formatReport(lang, metrics));
    }

    // Print a compact table for quick scanning
    summaryLines.push('\n  ── Summary Table ──');
    summaryLines.push('  Language     | Precision | Recall  | TP  | FP  | FN');
    summaryLines.push('  ------------|-----------|---------|-----|-----|----');
    for (const [lang, m] of Object.entries(allResults)) {
      summaryLines.push(
        `  ${lang.padEnd(12)} | ${(m.precision * 100).toFixed(1).padStart(7)}%  | ${(m.recall * 100).toFixed(1).padStart(5)}%  | ${String(m.truePositives).padStart(3)} | ${String(m.falsePositives).padStart(3)} | ${String(m.falseNegatives).padStart(3)}`,
      );
    }

    summaryLines.push('');
    console.log(summaryLines.join('\n'));
  });

  for (const lang of languages) {
    describe(lang, () => {
      let fixtureDir: string | null = null;
      let metrics: BenchmarkMetrics;

      beforeAll(async () => {
        if (artifact) {
          metrics = metricsFromArtifact(lang, artifact[lang]);
        } else {
          fixtureDir = copyFixture(lang);
          await buildFixtureGraph(fixtureDir);

          const resolvedEdges = extractResolvedEdges(fixtureDir) as ResolvedEdge[];

          const manifestPath = path.join(FIXTURES_DIR, lang, 'expected-edges.json');
          const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8'));
          const expectedEdges: ExpectedEdge[] = manifest.edges;

          metrics = computeMetrics(resolvedEdges, expectedEdges);
        }
        allResults[lang] = metrics;
      }, 60_000);

      afterAll(() => {
        if (fixtureDir) {
          fs.rmSync(fixtureDir, { recursive: true, force: true });
        }
      });

      test('metrics are populated', () => {
        expect(metrics).toBeDefined();
        expect(metrics.totalResolved).toBeGreaterThanOrEqual(0);
      });

      test('expected edges manifest is non-empty', () => {
        expect(metrics.totalExpected).toBeGreaterThan(0);
      });

      test('precision meets threshold', () => {
        const threshold = THRESHOLDS[lang]?.precision ?? DEFAULT_THRESHOLD.precision;
        expect(
          metrics.precision,
          `${lang} precision ${(metrics.precision * 100).toFixed(1)}% is below ${(threshold * 100).toFixed(0)}% threshold.\n` +
            `False positives:\n${metrics.falsePositiveEdges.map((e) => `  + ${e}`).join('\n')}`,
        ).toBeGreaterThanOrEqual(threshold);
      });

      test('recall meets threshold', () => {
        const threshold = THRESHOLDS[lang]?.recall ?? DEFAULT_THRESHOLD.recall;
        expect(
          metrics.recall,
          `${lang} recall ${(metrics.recall * 100).toFixed(1)}% is below ${(threshold * 100).toFixed(0)}% threshold.\n` +
            `Missing edges:\n${metrics.falseNegativeEdges.map((e) => `  - ${e}`).join('\n')}`,
        ).toBeGreaterThanOrEqual(threshold);
      });

      // Per-mode recall tests — run for every mode present in the manifest
      test('per-mode recall breakdown', () => {
        for (const [mode, data] of Object.entries(metrics.byMode)) {
          const modeRecall = data.recall ?? 0;
          // Log per-mode results for visibility (not a hard gate)
          console.log(
            `    [${lang}] ${mode}: ${data.resolved}/${data.expected} (${(modeRecall * 100).toFixed(1)}% recall)`,
          );
        }
        // At least verify that some mode data exists
        expect(Object.keys(metrics.byMode).length).toBeGreaterThan(0);
      });
    });
  }
});