Skip to content

Commit 383cbbe

Browse files
authored
refactor(ci): let resolution gate reuse benchmark artifact (#1167)
* refactor(ci): let resolution gate reuse benchmark artifact The pre-publish-benchmark job's `Run resolution benchmark` step builds codegraphs for ~34 language fixtures, computes precision/recall, and writes resolution-result.json. The `Gate on resolution thresholds` step that follows then ran the same vitest suite which independently copied every fixture and rebuilt the graphs again — doubling the most expensive slice of the publish pipeline. Extend the script's per-language LangResult to include falsePositiveEdges and falseNegativeEdges so the gate test has everything it needs for the existing precision/recall threshold assertions and failure messages. Refactor the gate test to consume that artifact when RESOLUTION_RESULT_JSON is set, falling back to the build-from-fixtures path when unset so devs can still run `npx vitest run tests/benchmarks/resolution/...` standalone. Wire the env var through the workflow's Gate step. Verified locally: gate test in artifact mode passes 170/170 in ~0.5s against an artifact produced by scripts/resolution-benchmark.ts, and the legacy build-from-fixtures path still passes for the javascript fixture. Closes #1052 * fix(ci): preserve full precision and guard malformed resolution artifacts (#1167) - scripts/resolution-benchmark.ts: stop rounding precision/recall to 3 decimals before writing the artifact. The rounding let a near-miss like 0.8497 round up to 0.850 and silently clear a 0.85 threshold in CI artifact mode while failing in fixture mode. - tests/benchmarks/resolution/resolution-benchmark.test.ts: validate numeric fields in metricsFromArtifact so a stale or malformed artifact surfaces a clear 'regenerate' error instead of a confusing TypeError at the threshold assertions. - tests/benchmarks/resolution/resolution-benchmark.test.ts: reject an empty artifact in loadArtifact. Without this guard, an empty {} would register zero describe blocks and vitest would exit 0 with '0 tests', silently passing the gate.
1 parent 0198c57 commit 383cbbe

3 files changed

Lines changed: 117 additions & 20 deletions

File tree

.github/workflows/publish.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,12 @@ jobs:
333333
334334
- name: Gate on resolution thresholds
335335
timeout-minutes: 30
336+
# Reuse the metrics produced by the previous step instead of rebuilding
337+
# every fixture from scratch (issue #1052). The gate test falls back to
338+
# the build-from-fixtures path when this env var is unset, so local
339+
# runs (`npx vitest run …`) still work standalone.
340+
env:
341+
RESOLUTION_RESULT_JSON: ${{ github.workspace }}/resolution-result.json
336342
run: npx vitest run tests/benchmarks/resolution/resolution-benchmark.test.ts --reporter=verbose
337343

338344
- name: Run tracer validation (same-file edge recall)

scripts/resolution-benchmark.ts

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,10 @@ interface LangResult {
6464
totalResolved: number;
6565
totalExpected: number;
6666
byMode: Record<string, ModeMetrics>;
67+
// Edge lists are included so the gate test can reuse this artifact
68+
// instead of rebuilding fixtures from scratch (see issue #1052).
69+
falsePositiveEdges: string[];
70+
falseNegativeEdges: string[];
6771
dynamicEdges?: number;
6872
dynamicConfirmed?: number;
6973
}
@@ -123,15 +127,20 @@ function computeMetrics(resolvedEdges: ResolvedEdge[], expectedEdges: ExpectedEd
123127
m.recall = m.expected > 0 ? m.resolved / m.expected : 0;
124128
}
125129

130+
// Keep full precision so the artifact-mode gate compares the exact same
131+
// values the fixture-mode gate would compute. Rounding here let a near-miss
132+
// like 0.8497 round up to 0.850 and silently clear a 0.85 threshold.
126133
return {
127-
precision: Math.round(precision * 1000) / 1000,
128-
recall: Math.round(recall * 1000) / 1000,
134+
precision,
135+
recall,
129136
truePositives: truePositives.size,
130137
falsePositives: falsePositives.size,
131138
falseNegatives: falseNegatives.size,
132139
totalResolved: resolvedSet.size,
133140
totalExpected: expectedSet.size,
134141
byMode,
142+
falsePositiveEdges: [...falsePositives],
143+
falseNegativeEdges: [...falseNegatives],
135144
};
136145
}
137146

tests/benchmarks/resolution/resolution-benchmark.test.ts

Lines changed: 100 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,12 @@
88
* per language and per resolution mode.
99
*
1010
* CI gate: fails if precision or recall drops below per-language thresholds.
11+
*
12+
* **Artifact mode (CI):** when `RESOLUTION_RESULT_JSON` points at a result
13+
* file produced by `scripts/resolution-benchmark.ts`, the suite reads those
14+
* pre-computed metrics and skips the fixture rebuild — avoiding the duplicate
15+
* work that doubled pre-publish CI time (issue #1052). Local runs without
16+
* the env var fall back to the build-from-fixtures path.
1117
*/
1218

1319
import fs from 'node:fs';
@@ -262,6 +268,79 @@ function formatReport(lang: string, metrics: BenchmarkMetrics): string {
262268
return lines.join('\n');
263269
}
264270

271+
// ── Artifact loading (CI dedup, issue #1052) ─────────────────────────────
272+
273+
const ARTIFACT_PATH = process.env.RESOLUTION_RESULT_JSON;
274+
275+
interface ArtifactLangResult {
276+
precision: number;
277+
recall: number;
278+
truePositives: number;
279+
falsePositives: number;
280+
falseNegatives: number;
281+
totalResolved: number;
282+
totalExpected: number;
283+
byMode: Record<string, ModeMetrics>;
284+
falsePositiveEdges?: string[];
285+
falseNegativeEdges?: string[];
286+
}
287+
288+
function loadArtifact(artifactPath: string): Record<string, ArtifactLangResult> {
289+
if (!fs.existsSync(artifactPath)) {
290+
throw new Error(
291+
`RESOLUTION_RESULT_JSON=${artifactPath} not found — run scripts/resolution-benchmark.ts first.`,
292+
);
293+
}
294+
const parsed = JSON.parse(fs.readFileSync(artifactPath, 'utf-8')) as Record<
295+
string,
296+
ArtifactLangResult
297+
>;
298+
// Refuse to proceed on an empty artifact: with zero languages, vitest would
299+
// register no describe blocks and exit 0, silently passing the gate without
300+
// evaluating a single threshold.
301+
if (!parsed || typeof parsed !== 'object' || Object.keys(parsed).length === 0) {
302+
throw new Error(
303+
`RESOLUTION_RESULT_JSON=${artifactPath} contains no language results — regenerate with scripts/resolution-benchmark.ts.`,
304+
);
305+
}
306+
return parsed;
307+
}
308+
309+
function metricsFromArtifact(lang: string, raw: ArtifactLangResult): BenchmarkMetrics {
310+
if (
311+
typeof raw.precision !== 'number' ||
312+
typeof raw.recall !== 'number' ||
313+
typeof raw.truePositives !== 'number' ||
314+
typeof raw.falsePositives !== 'number' ||
315+
typeof raw.falseNegatives !== 'number' ||
316+
typeof raw.totalResolved !== 'number' ||
317+
typeof raw.totalExpected !== 'number' ||
318+
!raw.byMode ||
319+
typeof raw.byMode !== 'object'
320+
) {
321+
throw new Error(
322+
`Resolution artifact for ${lang} is missing required numeric fields — regenerate with the current resolution-benchmark.ts.`,
323+
);
324+
}
325+
if (!Array.isArray(raw.falsePositiveEdges) || !Array.isArray(raw.falseNegativeEdges)) {
326+
throw new Error(
327+
`Resolution artifact for ${lang} is missing falsePositiveEdges/falseNegativeEdges — regenerate with the current resolution-benchmark.ts.`,
328+
);
329+
}
330+
return {
331+
precision: raw.precision,
332+
recall: raw.recall,
333+
truePositives: raw.truePositives,
334+
falsePositives: raw.falsePositives,
335+
falseNegatives: raw.falseNegatives,
336+
totalResolved: raw.totalResolved,
337+
totalExpected: raw.totalExpected,
338+
byMode: raw.byMode,
339+
falsePositiveEdges: raw.falsePositiveEdges,
340+
falseNegativeEdges: raw.falseNegativeEdges,
341+
};
342+
}
343+
265344
// ── Tests ────────────────────────────────────────────────────────────────
266345

267346
function discoverFixtures(): string[] {
@@ -276,7 +355,11 @@ function discoverFixtures(): string[] {
276355
return languages;
277356
}
278357

279-
const languages = discoverFixtures();
358+
const artifact = ARTIFACT_PATH ? loadArtifact(ARTIFACT_PATH) : null;
359+
// In artifact mode, drive the suite from the keys in the artifact so we never
360+
// silently skip a language the script reported. In local mode, discover from
361+
// the filesystem like before.
362+
const languages = artifact ? Object.keys(artifact).sort() : discoverFixtures();
280363

281364
/** Stores all results for the final summary */
282365
const allResults: Record<string, BenchmarkMetrics> = {};
@@ -309,22 +392,24 @@ describe('Call Resolution Precision/Recall', () => {
309392

310393
for (const lang of languages) {
311394
describe(lang, () => {
312-
let fixtureDir: string;
313-
let resolvedEdges: ResolvedEdge[];
314-
let expectedEdges: ExpectedEdge[];
395+
let fixtureDir: string | null = null;
315396
let metrics: BenchmarkMetrics;
316397

317398
beforeAll(async () => {
318-
fixtureDir = copyFixture(lang);
319-
await buildFixtureGraph(fixtureDir);
399+
if (artifact) {
400+
metrics = metricsFromArtifact(lang, artifact[lang]);
401+
} else {
402+
fixtureDir = copyFixture(lang);
403+
await buildFixtureGraph(fixtureDir);
320404

321-
resolvedEdges = extractResolvedEdges(fixtureDir);
405+
const resolvedEdges = extractResolvedEdges(fixtureDir) as ResolvedEdge[];
322406

323-
const manifestPath = path.join(FIXTURES_DIR, lang, 'expected-edges.json');
324-
const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8'));
325-
expectedEdges = manifest.edges;
407+
const manifestPath = path.join(FIXTURES_DIR, lang, 'expected-edges.json');
408+
const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8'));
409+
const expectedEdges: ExpectedEdge[] = manifest.edges;
326410

327-
metrics = computeMetrics(resolvedEdges, expectedEdges);
411+
metrics = computeMetrics(resolvedEdges, expectedEdges);
412+
}
328413
allResults[lang] = metrics;
329414
}, 60_000);
330415

@@ -334,16 +419,13 @@ describe('Call Resolution Precision/Recall', () => {
334419
}
335420
});
336421

337-
test('builds graph successfully', () => {
338-
expect(resolvedEdges).toBeDefined();
339-
expect(Array.isArray(resolvedEdges)).toBe(true);
340-
// Some languages may have 0 resolved call edges if resolution isn't
341-
// implemented yet — that's okay, the precision/recall tests will
342-
// catch it at the appropriate threshold level.
422+
test('metrics are populated', () => {
423+
expect(metrics).toBeDefined();
424+
expect(metrics.totalResolved).toBeGreaterThanOrEqual(0);
343425
});
344426

345427
test('expected edges manifest is non-empty', () => {
346-
expect(expectedEdges.length).toBeGreaterThan(0);
428+
expect(metrics.totalExpected).toBeGreaterThan(0);
347429
});
348430

349431
test('precision meets threshold', () => {

0 commit comments

Comments
 (0)