refactor(ci): let resolution gate reuse benchmark artifact (#1167)

carlos-alm · web-flow · commit 383cbbef09c9 · 2026-05-19T18:29:18.000-06:00
* refactor(ci): let resolution gate reuse benchmark artifact The pre-publish-benchmark job's `Run resolution benchmark` step builds codegraphs for ~34 language fixtures, computes precision/recall, and writes resolution-result.json. The `Gate on resolution thresholds` step that follows then ran the same vitest suite which independently copied every fixture and rebuilt the graphs again — doubling the most expensive slice of the publish pipeline. Extend the script's per-language LangResult to include falsePositiveEdges and falseNegativeEdges so the gate test has everything it needs for the existing precision/recall threshold assertions and failure messages. Refactor the gate test to consume that artifact when RESOLUTION_RESULT_JSON is set, falling back to the build-from-fixtures path when unset so devs can still run `npx vitest run tests/benchmarks/resolution/...` standalone. Wire the env var through the workflow's Gate step. Verified locally: gate test in artifact mode passes 170/170 in ~0.5s against an artifact produced by scripts/resolution-benchmark.ts, and the legacy build-from-fixtures path still passes for the javascript fixture. Closes #1052 * fix(ci): preserve full precision and guard malformed resolution artifacts (#1167) - scripts/resolution-benchmark.ts: stop rounding precision/recall to 3 decimals before writing the artifact. The rounding let a near-miss like 0.8497 round up to 0.850 and silently clear a 0.85 threshold in CI artifact mode while failing in fixture mode. - tests/benchmarks/resolution/resolution-benchmark.test.ts: validate numeric fields in metricsFromArtifact so a stale or malformed artifact surfaces a clear 'regenerate' error instead of a confusing TypeError at the threshold assertions. - tests/benchmarks/resolution/resolution-benchmark.test.ts: reject an empty artifact in loadArtifact. Without this guard, an empty {} would register zero describe blocks and vitest would exit 0 with '0 tests', silently passing the gate.
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -333,6 +333,12 @@ jobs:
 
       - name: Gate on resolution thresholds
         timeout-minutes: 30
+        # Reuse the metrics produced by the previous step instead of rebuilding
+        # every fixture from scratch (issue #1052). The gate test falls back to
+        # the build-from-fixtures path when this env var is unset, so local
+        # runs (`npx vitest run …`) still work standalone.
+        env:
+          RESOLUTION_RESULT_JSON: ${{ github.workspace }}/resolution-result.json
         run: npx vitest run tests/benchmarks/resolution/resolution-benchmark.test.ts --reporter=verbose
 
       - name: Run tracer validation (same-file edge recall)
diff --git a/scripts/resolution-benchmark.ts b/scripts/resolution-benchmark.ts
@@ -64,6 +64,10 @@ interface LangResult {
 	totalResolved: number;
 	totalExpected: number;
 	byMode: Record<string, ModeMetrics>;
+	// Edge lists are included so the gate test can reuse this artifact
+	// instead of rebuilding fixtures from scratch (see issue #1052).
+	falsePositiveEdges: string[];
+	falseNegativeEdges: string[];
 	dynamicEdges?: number;
 	dynamicConfirmed?: number;
 }
@@ -123,15 +127,20 @@ function computeMetrics(resolvedEdges: ResolvedEdge[], expectedEdges: ExpectedEd
 		m.recall = m.expected > 0 ? m.resolved / m.expected : 0;
 	}
 
+	// Keep full precision so the artifact-mode gate compares the exact same
+	// values the fixture-mode gate would compute. Rounding here let a near-miss
+	// like 0.8497 round up to 0.850 and silently clear a 0.85 threshold.
 	return {
-		precision: Math.round(precision * 1000) / 1000,
-		recall: Math.round(recall * 1000) / 1000,
+		precision,
+		recall,
 		truePositives: truePositives.size,
 		falsePositives: falsePositives.size,
 		falseNegatives: falseNegatives.size,
 		totalResolved: resolvedSet.size,
 		totalExpected: expectedSet.size,
 		byMode,
+		falsePositiveEdges: [...falsePositives],
+		falseNegativeEdges: [...falseNegatives],
 	};
 }
 
diff --git a/tests/benchmarks/resolution/resolution-benchmark.test.ts b/tests/benchmarks/resolution/resolution-benchmark.test.ts
@@ -8,6 +8,12 @@
  * per language and per resolution mode.
  *
  * CI gate: fails if precision or recall drops below per-language thresholds.
+ *
+ * **Artifact mode (CI):** when `RESOLUTION_RESULT_JSON` points at a result
+ * file produced by `scripts/resolution-benchmark.ts`, the suite reads those
+ * pre-computed metrics and skips the fixture rebuild — avoiding the duplicate
+ * work that doubled pre-publish CI time (issue #1052). Local runs without
+ * the env var fall back to the build-from-fixtures path.
  */
 
 import fs from 'node:fs';
@@ -262,6 +268,79 @@ function formatReport(lang: string, metrics: BenchmarkMetrics): string {
   return lines.join('\n');
 }
 
+// ── Artifact loading (CI dedup, issue #1052) ─────────────────────────────
+
+const ARTIFACT_PATH = process.env.RESOLUTION_RESULT_JSON;
+
+interface ArtifactLangResult {
+  precision: number;
+  recall: number;
+  truePositives: number;
+  falsePositives: number;
+  falseNegatives: number;
+  totalResolved: number;
+  totalExpected: number;
+  byMode: Record<string, ModeMetrics>;
+  falsePositiveEdges?: string[];
+  falseNegativeEdges?: string[];
+}
+
+function loadArtifact(artifactPath: string): Record<string, ArtifactLangResult> {
+  if (!fs.existsSync(artifactPath)) {
+    throw new Error(
+      `RESOLUTION_RESULT_JSON=${artifactPath} not found — run scripts/resolution-benchmark.ts first.`,
+    );
+  }
+  const parsed = JSON.parse(fs.readFileSync(artifactPath, 'utf-8')) as Record<
+    string,
+    ArtifactLangResult
+  >;
+  // Refuse to proceed on an empty artifact: with zero languages, vitest would
+  // register no describe blocks and exit 0, silently passing the gate without
+  // evaluating a single threshold.
+  if (!parsed || typeof parsed !== 'object' || Object.keys(parsed).length === 0) {
+    throw new Error(
+      `RESOLUTION_RESULT_JSON=${artifactPath} contains no language results — regenerate with scripts/resolution-benchmark.ts.`,
+    );
+  }
+  return parsed;
+}
+
+function metricsFromArtifact(lang: string, raw: ArtifactLangResult): BenchmarkMetrics {
+  if (
+    typeof raw.precision !== 'number' ||
+    typeof raw.recall !== 'number' ||
+    typeof raw.truePositives !== 'number' ||
+    typeof raw.falsePositives !== 'number' ||
+    typeof raw.falseNegatives !== 'number' ||
+    typeof raw.totalResolved !== 'number' ||
+    typeof raw.totalExpected !== 'number' ||
+    !raw.byMode ||
+    typeof raw.byMode !== 'object'
+  ) {
+    throw new Error(
+      `Resolution artifact for ${lang} is missing required numeric fields — regenerate with the current resolution-benchmark.ts.`,
+    );
+  }
+  if (!Array.isArray(raw.falsePositiveEdges) || !Array.isArray(raw.falseNegativeEdges)) {
+    throw new Error(
+      `Resolution artifact for ${lang} is missing falsePositiveEdges/falseNegativeEdges — regenerate with the current resolution-benchmark.ts.`,
+    );
+  }
+  return {
+    precision: raw.precision,
+    recall: raw.recall,
+    truePositives: raw.truePositives,
+    falsePositives: raw.falsePositives,
+    falseNegatives: raw.falseNegatives,
+    totalResolved: raw.totalResolved,
+    totalExpected: raw.totalExpected,
+    byMode: raw.byMode,
+    falsePositiveEdges: raw.falsePositiveEdges,
+    falseNegativeEdges: raw.falseNegativeEdges,
+  };
+}
+
 // ── Tests ────────────────────────────────────────────────────────────────
 
 function discoverFixtures(): string[] {
@@ -276,7 +355,11 @@ function discoverFixtures(): string[] {
   return languages;
 }
 
-const languages = discoverFixtures();
+const artifact = ARTIFACT_PATH ? loadArtifact(ARTIFACT_PATH) : null;
+// In artifact mode, drive the suite from the keys in the artifact so we never
+// silently skip a language the script reported. In local mode, discover from
+// the filesystem like before.
+const languages = artifact ? Object.keys(artifact).sort() : discoverFixtures();
 
 /** Stores all results for the final summary */
 const allResults: Record<string, BenchmarkMetrics> = {};
@@ -309,22 +392,24 @@ describe('Call Resolution Precision/Recall', () => {
 
   for (const lang of languages) {
     describe(lang, () => {
-      let fixtureDir: string;
-      let resolvedEdges: ResolvedEdge[];
-      let expectedEdges: ExpectedEdge[];
+      let fixtureDir: string | null = null;
       let metrics: BenchmarkMetrics;
 
       beforeAll(async () => {
-        fixtureDir = copyFixture(lang);
-        await buildFixtureGraph(fixtureDir);
+        if (artifact) {
+          metrics = metricsFromArtifact(lang, artifact[lang]);
+        } else {
+          fixtureDir = copyFixture(lang);
+          await buildFixtureGraph(fixtureDir);
 
-        resolvedEdges = extractResolvedEdges(fixtureDir);
+          const resolvedEdges = extractResolvedEdges(fixtureDir) as ResolvedEdge[];
 
-        const manifestPath = path.join(FIXTURES_DIR, lang, 'expected-edges.json');
-        const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8'));
-        expectedEdges = manifest.edges;
+          const manifestPath = path.join(FIXTURES_DIR, lang, 'expected-edges.json');
+          const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8'));
+          const expectedEdges: ExpectedEdge[] = manifest.edges;
 
-        metrics = computeMetrics(resolvedEdges, expectedEdges);
+          metrics = computeMetrics(resolvedEdges, expectedEdges);
+        }
         allResults[lang] = metrics;
       }, 60_000);
 
@@ -334,16 +419,13 @@ describe('Call Resolution Precision/Recall', () => {
         }
       });
 
-      test('builds graph successfully', () => {
-        expect(resolvedEdges).toBeDefined();
-        expect(Array.isArray(resolvedEdges)).toBe(true);
-        // Some languages may have 0 resolved call edges if resolution isn't
-        // implemented yet — that's okay, the precision/recall tests will
-        // catch it at the appropriate threshold level.
+      test('metrics are populated', () => {
+        expect(metrics).toBeDefined();
+        expect(metrics.totalResolved).toBeGreaterThanOrEqual(0);
       });
 
       test('expected edges manifest is non-empty', () => {
-        expect(expectedEdges.length).toBeGreaterThan(0);
+        expect(metrics.totalExpected).toBeGreaterThan(0);
       });
 
       test('precision meets threshold', () => {