From d697929e38b2b3efe94f613aade132bebcad8dcf Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Tue, 19 May 2026 01:54:49 -0600 Subject: [PATCH 1/2] refactor(ci): let resolution gate reuse benchmark artifact MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pre-publish-benchmark job's `Run resolution benchmark` step builds codegraphs for ~34 language fixtures, computes precision/recall, and writes resolution-result.json. The `Gate on resolution thresholds` step that follows then ran the same vitest suite which independently copied every fixture and rebuilt the graphs again — doubling the most expensive slice of the publish pipeline. Extend the script's per-language LangResult to include falsePositiveEdges and falseNegativeEdges so the gate test has everything it needs for the existing precision/recall threshold assertions and failure messages. Refactor the gate test to consume that artifact when RESOLUTION_RESULT_JSON is set, falling back to the build-from-fixtures path when unset so devs can still run `npx vitest run tests/benchmarks/resolution/...` standalone. Wire the env var through the workflow's Gate step. Verified locally: gate test in artifact mode passes 170/170 in ~0.5s against an artifact produced by scripts/resolution-benchmark.ts, and the legacy build-from-fixtures path still passes for the javascript fixture. Closes #1052 --- .github/workflows/publish.yml | 6 ++ scripts/resolution-benchmark.ts | 6 ++ .../resolution/resolution-benchmark.test.ts | 91 +++++++++++++++---- 3 files changed, 85 insertions(+), 18 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 7437c3b2e..666f0a7eb 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -333,6 +333,12 @@ jobs: - name: Gate on resolution thresholds timeout-minutes: 30 + # Reuse the metrics produced by the previous step instead of rebuilding + # every fixture from scratch (issue #1052). The gate test falls back to + # the build-from-fixtures path when this env var is unset, so local + # runs (`npx vitest run …`) still work standalone. + env: + RESOLUTION_RESULT_JSON: ${{ github.workspace }}/resolution-result.json run: npx vitest run tests/benchmarks/resolution/resolution-benchmark.test.ts --reporter=verbose - name: Run tracer validation (same-file edge recall) diff --git a/scripts/resolution-benchmark.ts b/scripts/resolution-benchmark.ts index 9252028e4..b4b09b892 100644 --- a/scripts/resolution-benchmark.ts +++ b/scripts/resolution-benchmark.ts @@ -64,6 +64,10 @@ interface LangResult { totalResolved: number; totalExpected: number; byMode: Record; + // Edge lists are included so the gate test can reuse this artifact + // instead of rebuilding fixtures from scratch (see issue #1052). + falsePositiveEdges: string[]; + falseNegativeEdges: string[]; dynamicEdges?: number; dynamicConfirmed?: number; } @@ -132,6 +136,8 @@ function computeMetrics(resolvedEdges: ResolvedEdge[], expectedEdges: ExpectedEd totalResolved: resolvedSet.size, totalExpected: expectedSet.size, byMode, + falsePositiveEdges: [...falsePositives], + falseNegativeEdges: [...falseNegatives], }; } diff --git a/tests/benchmarks/resolution/resolution-benchmark.test.ts b/tests/benchmarks/resolution/resolution-benchmark.test.ts index 0b4032057..6034eba86 100644 --- a/tests/benchmarks/resolution/resolution-benchmark.test.ts +++ b/tests/benchmarks/resolution/resolution-benchmark.test.ts @@ -8,6 +8,12 @@ * per language and per resolution mode. * * CI gate: fails if precision or recall drops below per-language thresholds. + * + * **Artifact mode (CI):** when `RESOLUTION_RESULT_JSON` points at a result + * file produced by `scripts/resolution-benchmark.ts`, the suite reads those + * pre-computed metrics and skips the fixture rebuild — avoiding the duplicate + * work that doubled pre-publish CI time (issue #1052). Local runs without + * the env var fall back to the build-from-fixtures path. */ import fs from 'node:fs'; @@ -262,6 +268,52 @@ function formatReport(lang: string, metrics: BenchmarkMetrics): string { return lines.join('\n'); } +// ── Artifact loading (CI dedup, issue #1052) ───────────────────────────── + +const ARTIFACT_PATH = process.env.RESOLUTION_RESULT_JSON; + +interface ArtifactLangResult { + precision: number; + recall: number; + truePositives: number; + falsePositives: number; + falseNegatives: number; + totalResolved: number; + totalExpected: number; + byMode: Record; + falsePositiveEdges?: string[]; + falseNegativeEdges?: string[]; +} + +function loadArtifact(artifactPath: string): Record { + if (!fs.existsSync(artifactPath)) { + throw new Error( + `RESOLUTION_RESULT_JSON=${artifactPath} not found — run scripts/resolution-benchmark.ts first.`, + ); + } + return JSON.parse(fs.readFileSync(artifactPath, 'utf-8')); +} + +function metricsFromArtifact(lang: string, raw: ArtifactLangResult): BenchmarkMetrics { + if (!Array.isArray(raw.falsePositiveEdges) || !Array.isArray(raw.falseNegativeEdges)) { + throw new Error( + `Resolution artifact for ${lang} is missing falsePositiveEdges/falseNegativeEdges — regenerate with the current resolution-benchmark.ts.`, + ); + } + return { + precision: raw.precision, + recall: raw.recall, + truePositives: raw.truePositives, + falsePositives: raw.falsePositives, + falseNegatives: raw.falseNegatives, + totalResolved: raw.totalResolved, + totalExpected: raw.totalExpected, + byMode: raw.byMode, + falsePositiveEdges: raw.falsePositiveEdges, + falseNegativeEdges: raw.falseNegativeEdges, + }; +} + // ── Tests ──────────────────────────────────────────────────────────────── function discoverFixtures(): string[] { @@ -276,7 +328,11 @@ function discoverFixtures(): string[] { return languages; } -const languages = discoverFixtures(); +const artifact = ARTIFACT_PATH ? loadArtifact(ARTIFACT_PATH) : null; +// In artifact mode, drive the suite from the keys in the artifact so we never +// silently skip a language the script reported. In local mode, discover from +// the filesystem like before. +const languages = artifact ? Object.keys(artifact).sort() : discoverFixtures(); /** Stores all results for the final summary */ const allResults: Record = {}; @@ -309,22 +365,24 @@ describe('Call Resolution Precision/Recall', () => { for (const lang of languages) { describe(lang, () => { - let fixtureDir: string; - let resolvedEdges: ResolvedEdge[]; - let expectedEdges: ExpectedEdge[]; + let fixtureDir: string | null = null; let metrics: BenchmarkMetrics; beforeAll(async () => { - fixtureDir = copyFixture(lang); - await buildFixtureGraph(fixtureDir); + if (artifact) { + metrics = metricsFromArtifact(lang, artifact[lang]); + } else { + fixtureDir = copyFixture(lang); + await buildFixtureGraph(fixtureDir); - resolvedEdges = extractResolvedEdges(fixtureDir); + const resolvedEdges = extractResolvedEdges(fixtureDir) as ResolvedEdge[]; - const manifestPath = path.join(FIXTURES_DIR, lang, 'expected-edges.json'); - const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8')); - expectedEdges = manifest.edges; + const manifestPath = path.join(FIXTURES_DIR, lang, 'expected-edges.json'); + const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8')); + const expectedEdges: ExpectedEdge[] = manifest.edges; - metrics = computeMetrics(resolvedEdges, expectedEdges); + metrics = computeMetrics(resolvedEdges, expectedEdges); + } allResults[lang] = metrics; }, 60_000); @@ -334,16 +392,13 @@ describe('Call Resolution Precision/Recall', () => { } }); - test('builds graph successfully', () => { - expect(resolvedEdges).toBeDefined(); - expect(Array.isArray(resolvedEdges)).toBe(true); - // Some languages may have 0 resolved call edges if resolution isn't - // implemented yet — that's okay, the precision/recall tests will - // catch it at the appropriate threshold level. + test('metrics are populated', () => { + expect(metrics).toBeDefined(); + expect(metrics.totalResolved).toBeGreaterThanOrEqual(0); }); test('expected edges manifest is non-empty', () => { - expect(expectedEdges.length).toBeGreaterThan(0); + expect(metrics.totalExpected).toBeGreaterThan(0); }); test('precision meets threshold', () => { From e7b2f27cf83c078c445e35a6802cad5114781760 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Tue, 19 May 2026 02:03:07 -0600 Subject: [PATCH 2/2] fix(ci): preserve full precision and guard malformed resolution artifacts (#1167) - scripts/resolution-benchmark.ts: stop rounding precision/recall to 3 decimals before writing the artifact. The rounding let a near-miss like 0.8497 round up to 0.850 and silently clear a 0.85 threshold in CI artifact mode while failing in fixture mode. - tests/benchmarks/resolution/resolution-benchmark.test.ts: validate numeric fields in metricsFromArtifact so a stale or malformed artifact surfaces a clear 'regenerate' error instead of a confusing TypeError at the threshold assertions. - tests/benchmarks/resolution/resolution-benchmark.test.ts: reject an empty artifact in loadArtifact. Without this guard, an empty {} would register zero describe blocks and vitest would exit 0 with '0 tests', silently passing the gate. --- scripts/resolution-benchmark.ts | 7 +++-- .../resolution/resolution-benchmark.test.ts | 29 ++++++++++++++++++- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/scripts/resolution-benchmark.ts b/scripts/resolution-benchmark.ts index b4b09b892..e91016e9d 100644 --- a/scripts/resolution-benchmark.ts +++ b/scripts/resolution-benchmark.ts @@ -127,9 +127,12 @@ function computeMetrics(resolvedEdges: ResolvedEdge[], expectedEdges: ExpectedEd m.recall = m.expected > 0 ? m.resolved / m.expected : 0; } + // Keep full precision so the artifact-mode gate compares the exact same + // values the fixture-mode gate would compute. Rounding here let a near-miss + // like 0.8497 round up to 0.850 and silently clear a 0.85 threshold. return { - precision: Math.round(precision * 1000) / 1000, - recall: Math.round(recall * 1000) / 1000, + precision, + recall, truePositives: truePositives.size, falsePositives: falsePositives.size, falseNegatives: falseNegatives.size, diff --git a/tests/benchmarks/resolution/resolution-benchmark.test.ts b/tests/benchmarks/resolution/resolution-benchmark.test.ts index 6034eba86..1fa79a4f8 100644 --- a/tests/benchmarks/resolution/resolution-benchmark.test.ts +++ b/tests/benchmarks/resolution/resolution-benchmark.test.ts @@ -291,10 +291,37 @@ function loadArtifact(artifactPath: string): Record `RESOLUTION_RESULT_JSON=${artifactPath} not found — run scripts/resolution-benchmark.ts first.`, ); } - return JSON.parse(fs.readFileSync(artifactPath, 'utf-8')); + const parsed = JSON.parse(fs.readFileSync(artifactPath, 'utf-8')) as Record< + string, + ArtifactLangResult + >; + // Refuse to proceed on an empty artifact: with zero languages, vitest would + // register no describe blocks and exit 0, silently passing the gate without + // evaluating a single threshold. + if (!parsed || typeof parsed !== 'object' || Object.keys(parsed).length === 0) { + throw new Error( + `RESOLUTION_RESULT_JSON=${artifactPath} contains no language results — regenerate with scripts/resolution-benchmark.ts.`, + ); + } + return parsed; } function metricsFromArtifact(lang: string, raw: ArtifactLangResult): BenchmarkMetrics { + if ( + typeof raw.precision !== 'number' || + typeof raw.recall !== 'number' || + typeof raw.truePositives !== 'number' || + typeof raw.falsePositives !== 'number' || + typeof raw.falseNegatives !== 'number' || + typeof raw.totalResolved !== 'number' || + typeof raw.totalExpected !== 'number' || + !raw.byMode || + typeof raw.byMode !== 'object' + ) { + throw new Error( + `Resolution artifact for ${lang} is missing required numeric fields — regenerate with the current resolution-benchmark.ts.`, + ); + } if (!Array.isArray(raw.falsePositiveEdges) || !Array.isArray(raw.falseNegativeEdges)) { throw new Error( `Resolution artifact for ${lang} is missing falsePositiveEdges/falseNegativeEdges — regenerate with the current resolution-benchmark.ts.`,