Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,12 @@ jobs:

- name: Gate on resolution thresholds
timeout-minutes: 30
# Reuse the metrics produced by the previous step instead of rebuilding
# every fixture from scratch (issue #1052). The gate test falls back to
# the build-from-fixtures path when this env var is unset, so local
# runs (`npx vitest run …`) still work standalone.
env:
RESOLUTION_RESULT_JSON: ${{ github.workspace }}/resolution-result.json
run: npx vitest run tests/benchmarks/resolution/resolution-benchmark.test.ts --reporter=verbose

- name: Run tracer validation (same-file edge recall)
Expand Down
6 changes: 6 additions & 0 deletions scripts/resolution-benchmark.ts
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@ interface LangResult {
totalResolved: number;
totalExpected: number;
byMode: Record<string, ModeMetrics>;
// Edge lists are included so the gate test can reuse this artifact
// instead of rebuilding fixtures from scratch (see issue #1052).
falsePositiveEdges: string[];
falseNegativeEdges: string[];
dynamicEdges?: number;
dynamicConfirmed?: number;
}
Expand Down Expand Up @@ -132,6 +136,8 @@ function computeMetrics(resolvedEdges: ResolvedEdge[], expectedEdges: ExpectedEd
totalResolved: resolvedSet.size,
totalExpected: expectedSet.size,
byMode,
falsePositiveEdges: [...falsePositives],
falseNegativeEdges: [...falseNegatives],
};
}

Expand Down
91 changes: 73 additions & 18 deletions tests/benchmarks/resolution/resolution-benchmark.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@
* per language and per resolution mode.
*
* CI gate: fails if precision or recall drops below per-language thresholds.
*
* **Artifact mode (CI):** when `RESOLUTION_RESULT_JSON` points at a result
* file produced by `scripts/resolution-benchmark.ts`, the suite reads those
* pre-computed metrics and skips the fixture rebuild — avoiding the duplicate
* work that doubled pre-publish CI time (issue #1052). Local runs without
* the env var fall back to the build-from-fixtures path.
*/

import fs from 'node:fs';
Expand Down Expand Up @@ -262,6 +268,52 @@ function formatReport(lang: string, metrics: BenchmarkMetrics): string {
return lines.join('\n');
}

// ── Artifact loading (CI dedup, issue #1052) ─────────────────────────────

const ARTIFACT_PATH = process.env.RESOLUTION_RESULT_JSON;

interface ArtifactLangResult {
precision: number;
recall: number;
truePositives: number;
falsePositives: number;
falseNegatives: number;
totalResolved: number;
totalExpected: number;
byMode: Record<string, ModeMetrics>;
falsePositiveEdges?: string[];
falseNegativeEdges?: string[];
}

function loadArtifact(artifactPath: string): Record<string, ArtifactLangResult> {
if (!fs.existsSync(artifactPath)) {
throw new Error(
`RESOLUTION_RESULT_JSON=${artifactPath} not found — run scripts/resolution-benchmark.ts first.`,
);
}
return JSON.parse(fs.readFileSync(artifactPath, 'utf-8'));
}

function metricsFromArtifact(lang: string, raw: ArtifactLangResult): BenchmarkMetrics {
if (!Array.isArray(raw.falsePositiveEdges) || !Array.isArray(raw.falseNegativeEdges)) {
throw new Error(
`Resolution artifact for ${lang} is missing falsePositiveEdges/falseNegativeEdges — regenerate with the current resolution-benchmark.ts.`,
);
}
Comment on lines +309 to +329

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 metricsFromArtifact validates the two edge-list arrays but leaves every numeric field (precision, recall, truePositives, totalResolved, totalExpected, byMode) unchecked. A stale or hand-edited artifact with null or a missing key for one of these fields will throw a confusing TypeError at the threshold assertions rather than the clear "regenerate" message.

Suggested change
function metricsFromArtifact(lang: string, raw: ArtifactLangResult): BenchmarkMetrics {
if (!Array.isArray(raw.falsePositiveEdges) || !Array.isArray(raw.falseNegativeEdges)) {
throw new Error(
`Resolution artifact for ${lang} is missing falsePositiveEdges/falseNegativeEdges — regenerate with the current resolution-benchmark.ts.`,
);
}
function metricsFromArtifact(lang: string, raw: ArtifactLangResult): BenchmarkMetrics {
if (
typeof raw.precision !== 'number' ||
typeof raw.recall !== 'number' ||
typeof raw.truePositives !== 'number' ||
typeof raw.falsePositives !== 'number' ||
typeof raw.falseNegatives !== 'number' ||
typeof raw.totalResolved !== 'number' ||
typeof raw.totalExpected !== 'number' ||
!raw.byMode
) {
throw new Error(
`Resolution artifact for ${lang} is missing required numeric fields — regenerate with the current resolution-benchmark.ts.`,
);
}
if (!Array.isArray(raw.falsePositiveEdges) || !Array.isArray(raw.falseNegativeEdges)) {
throw new Error(
`Resolution artifact for ${lang} is missing falsePositiveEdges/falseNegativeEdges — regenerate with the current resolution-benchmark.ts.`,
);
}

Fix in Claude Code

return {
precision: raw.precision,
recall: raw.recall,
truePositives: raw.truePositives,
falsePositives: raw.falsePositives,
falseNegatives: raw.falseNegatives,
totalResolved: raw.totalResolved,
totalExpected: raw.totalExpected,
byMode: raw.byMode,
falsePositiveEdges: raw.falsePositiveEdges,
falseNegativeEdges: raw.falseNegativeEdges,
};
}

// ── Tests ────────────────────────────────────────────────────────────────

function discoverFixtures(): string[] {
Expand All @@ -276,7 +328,11 @@ function discoverFixtures(): string[] {
return languages;
}

const languages = discoverFixtures();
const artifact = ARTIFACT_PATH ? loadArtifact(ARTIFACT_PATH) : null;
// In artifact mode, drive the suite from the keys in the artifact so we never
// silently skip a language the script reported. In local mode, discover from
// the filesystem like before.
const languages = artifact ? Object.keys(artifact).sort() : discoverFixtures();
Comment on lines +358 to +362

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Empty artifact silently passes the gate

If resolution-result.json is valid JSON but contains {} (e.g., the benchmark script ran with no discoverable fixtures), Object.keys(artifact) returns [], no describe blocks are registered, and vitest exits 0 with "0 tests". The gate would pass without evaluating a single threshold. A guard after loadArtifact throwing on an empty result would make this failure mode explicit.

Fix in Claude Code


/** Stores all results for the final summary */
const allResults: Record<string, BenchmarkMetrics> = {};
Expand Down Expand Up @@ -309,22 +365,24 @@ describe('Call Resolution Precision/Recall', () => {

for (const lang of languages) {
describe(lang, () => {
let fixtureDir: string;
let resolvedEdges: ResolvedEdge[];
let expectedEdges: ExpectedEdge[];
let fixtureDir: string | null = null;
let metrics: BenchmarkMetrics;

beforeAll(async () => {
fixtureDir = copyFixture(lang);
await buildFixtureGraph(fixtureDir);
if (artifact) {
metrics = metricsFromArtifact(lang, artifact[lang]);
} else {
fixtureDir = copyFixture(lang);
await buildFixtureGraph(fixtureDir);

resolvedEdges = extractResolvedEdges(fixtureDir);
const resolvedEdges = extractResolvedEdges(fixtureDir) as ResolvedEdge[];

const manifestPath = path.join(FIXTURES_DIR, lang, 'expected-edges.json');
const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8'));
expectedEdges = manifest.edges;
const manifestPath = path.join(FIXTURES_DIR, lang, 'expected-edges.json');
const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8'));
const expectedEdges: ExpectedEdge[] = manifest.edges;

metrics = computeMetrics(resolvedEdges, expectedEdges);
metrics = computeMetrics(resolvedEdges, expectedEdges);
}
allResults[lang] = metrics;
}, 60_000);

Expand All @@ -334,16 +392,13 @@ describe('Call Resolution Precision/Recall', () => {
}
});

test('builds graph successfully', () => {
expect(resolvedEdges).toBeDefined();
expect(Array.isArray(resolvedEdges)).toBe(true);
// Some languages may have 0 resolved call edges if resolution isn't
// implemented yet — that's okay, the precision/recall tests will
// catch it at the appropriate threshold level.
test('metrics are populated', () => {
expect(metrics).toBeDefined();
expect(metrics.totalResolved).toBeGreaterThanOrEqual(0);
});

test('expected edges manifest is non-empty', () => {
expect(expectedEdges.length).toBeGreaterThan(0);
expect(metrics.totalExpected).toBeGreaterThan(0);
});

test('precision meets threshold', () => {
Expand Down
Loading