diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 7437c3b2e..666f0a7eb 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -333,6 +333,12 @@ jobs: - name: Gate on resolution thresholds timeout-minutes: 30 + # Reuse the metrics produced by the previous step instead of rebuilding + # every fixture from scratch (issue #1052). The gate test falls back to + # the build-from-fixtures path when this env var is unset, so local + # runs (`npx vitest run …`) still work standalone. + env: + RESOLUTION_RESULT_JSON: ${{ github.workspace }}/resolution-result.json run: npx vitest run tests/benchmarks/resolution/resolution-benchmark.test.ts --reporter=verbose - name: Run tracer validation (same-file edge recall) diff --git a/scripts/resolution-benchmark.ts b/scripts/resolution-benchmark.ts index 9252028e4..e91016e9d 100644 --- a/scripts/resolution-benchmark.ts +++ b/scripts/resolution-benchmark.ts @@ -64,6 +64,10 @@ interface LangResult { totalResolved: number; totalExpected: number; byMode: Record; + // Edge lists are included so the gate test can reuse this artifact + // instead of rebuilding fixtures from scratch (see issue #1052). + falsePositiveEdges: string[]; + falseNegativeEdges: string[]; dynamicEdges?: number; dynamicConfirmed?: number; } @@ -123,15 +127,20 @@ function computeMetrics(resolvedEdges: ResolvedEdge[], expectedEdges: ExpectedEd m.recall = m.expected > 0 ? m.resolved / m.expected : 0; } + // Keep full precision so the artifact-mode gate compares the exact same + // values the fixture-mode gate would compute. Rounding here let a near-miss + // like 0.8497 round up to 0.850 and silently clear a 0.85 threshold. return { - precision: Math.round(precision * 1000) / 1000, - recall: Math.round(recall * 1000) / 1000, + precision, + recall, truePositives: truePositives.size, falsePositives: falsePositives.size, falseNegatives: falseNegatives.size, totalResolved: resolvedSet.size, totalExpected: expectedSet.size, byMode, + falsePositiveEdges: [...falsePositives], + falseNegativeEdges: [...falseNegatives], }; } diff --git a/tests/benchmarks/resolution/resolution-benchmark.test.ts b/tests/benchmarks/resolution/resolution-benchmark.test.ts index 0b4032057..1fa79a4f8 100644 --- a/tests/benchmarks/resolution/resolution-benchmark.test.ts +++ b/tests/benchmarks/resolution/resolution-benchmark.test.ts @@ -8,6 +8,12 @@ * per language and per resolution mode. * * CI gate: fails if precision or recall drops below per-language thresholds. + * + * **Artifact mode (CI):** when `RESOLUTION_RESULT_JSON` points at a result + * file produced by `scripts/resolution-benchmark.ts`, the suite reads those + * pre-computed metrics and skips the fixture rebuild — avoiding the duplicate + * work that doubled pre-publish CI time (issue #1052). Local runs without + * the env var fall back to the build-from-fixtures path. */ import fs from 'node:fs'; @@ -262,6 +268,79 @@ function formatReport(lang: string, metrics: BenchmarkMetrics): string { return lines.join('\n'); } +// ── Artifact loading (CI dedup, issue #1052) ───────────────────────────── + +const ARTIFACT_PATH = process.env.RESOLUTION_RESULT_JSON; + +interface ArtifactLangResult { + precision: number; + recall: number; + truePositives: number; + falsePositives: number; + falseNegatives: number; + totalResolved: number; + totalExpected: number; + byMode: Record; + falsePositiveEdges?: string[]; + falseNegativeEdges?: string[]; +} + +function loadArtifact(artifactPath: string): Record { + if (!fs.existsSync(artifactPath)) { + throw new Error( + `RESOLUTION_RESULT_JSON=${artifactPath} not found — run scripts/resolution-benchmark.ts first.`, + ); + } + const parsed = JSON.parse(fs.readFileSync(artifactPath, 'utf-8')) as Record< + string, + ArtifactLangResult + >; + // Refuse to proceed on an empty artifact: with zero languages, vitest would + // register no describe blocks and exit 0, silently passing the gate without + // evaluating a single threshold. + if (!parsed || typeof parsed !== 'object' || Object.keys(parsed).length === 0) { + throw new Error( + `RESOLUTION_RESULT_JSON=${artifactPath} contains no language results — regenerate with scripts/resolution-benchmark.ts.`, + ); + } + return parsed; +} + +function metricsFromArtifact(lang: string, raw: ArtifactLangResult): BenchmarkMetrics { + if ( + typeof raw.precision !== 'number' || + typeof raw.recall !== 'number' || + typeof raw.truePositives !== 'number' || + typeof raw.falsePositives !== 'number' || + typeof raw.falseNegatives !== 'number' || + typeof raw.totalResolved !== 'number' || + typeof raw.totalExpected !== 'number' || + !raw.byMode || + typeof raw.byMode !== 'object' + ) { + throw new Error( + `Resolution artifact for ${lang} is missing required numeric fields — regenerate with the current resolution-benchmark.ts.`, + ); + } + if (!Array.isArray(raw.falsePositiveEdges) || !Array.isArray(raw.falseNegativeEdges)) { + throw new Error( + `Resolution artifact for ${lang} is missing falsePositiveEdges/falseNegativeEdges — regenerate with the current resolution-benchmark.ts.`, + ); + } + return { + precision: raw.precision, + recall: raw.recall, + truePositives: raw.truePositives, + falsePositives: raw.falsePositives, + falseNegatives: raw.falseNegatives, + totalResolved: raw.totalResolved, + totalExpected: raw.totalExpected, + byMode: raw.byMode, + falsePositiveEdges: raw.falsePositiveEdges, + falseNegativeEdges: raw.falseNegativeEdges, + }; +} + // ── Tests ──────────────────────────────────────────────────────────────── function discoverFixtures(): string[] { @@ -276,7 +355,11 @@ function discoverFixtures(): string[] { return languages; } -const languages = discoverFixtures(); +const artifact = ARTIFACT_PATH ? loadArtifact(ARTIFACT_PATH) : null; +// In artifact mode, drive the suite from the keys in the artifact so we never +// silently skip a language the script reported. In local mode, discover from +// the filesystem like before. +const languages = artifact ? Object.keys(artifact).sort() : discoverFixtures(); /** Stores all results for the final summary */ const allResults: Record = {}; @@ -309,22 +392,24 @@ describe('Call Resolution Precision/Recall', () => { for (const lang of languages) { describe(lang, () => { - let fixtureDir: string; - let resolvedEdges: ResolvedEdge[]; - let expectedEdges: ExpectedEdge[]; + let fixtureDir: string | null = null; let metrics: BenchmarkMetrics; beforeAll(async () => { - fixtureDir = copyFixture(lang); - await buildFixtureGraph(fixtureDir); + if (artifact) { + metrics = metricsFromArtifact(lang, artifact[lang]); + } else { + fixtureDir = copyFixture(lang); + await buildFixtureGraph(fixtureDir); - resolvedEdges = extractResolvedEdges(fixtureDir); + const resolvedEdges = extractResolvedEdges(fixtureDir) as ResolvedEdge[]; - const manifestPath = path.join(FIXTURES_DIR, lang, 'expected-edges.json'); - const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8')); - expectedEdges = manifest.edges; + const manifestPath = path.join(FIXTURES_DIR, lang, 'expected-edges.json'); + const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8')); + const expectedEdges: ExpectedEdge[] = manifest.edges; - metrics = computeMetrics(resolvedEdges, expectedEdges); + metrics = computeMetrics(resolvedEdges, expectedEdges); + } allResults[lang] = metrics; }, 60_000); @@ -334,16 +419,13 @@ describe('Call Resolution Precision/Recall', () => { } }); - test('builds graph successfully', () => { - expect(resolvedEdges).toBeDefined(); - expect(Array.isArray(resolvedEdges)).toBe(true); - // Some languages may have 0 resolved call edges if resolution isn't - // implemented yet — that's okay, the precision/recall tests will - // catch it at the appropriate threshold level. + test('metrics are populated', () => { + expect(metrics).toBeDefined(); + expect(metrics.totalResolved).toBeGreaterThanOrEqual(0); }); test('expected edges manifest is non-empty', () => { - expect(expectedEdges.length).toBeGreaterThan(0); + expect(metrics.totalExpected).toBeGreaterThan(0); }); test('precision meets threshold', () => {