Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -333,10 +333,22 @@ jobs:

- name: Gate on resolution thresholds
timeout-minutes: 30
# Reuse the metrics produced by the previous step instead of rebuilding
# every fixture from scratch (issue #1052). The gate test falls back to
# the build-from-fixtures path when this env var is unset, so local
# runs (`npx vitest run …`) still work standalone.
env:
RESOLUTION_RESULT_JSON: ${{ github.workspace }}/resolution-result.json
run: npx vitest run tests/benchmarks/resolution/resolution-benchmark.test.ts --reporter=verbose

- name: Run tracer validation (same-file edge recall)
timeout-minutes: 10
# Reuse the tracer edges captured by Run resolution benchmark instead
# of re-spawning the per-language tracer subprocess (issue #1166). The
# test falls back to spawning run-tracer.mjs when this env var is
# unset, so local runs still work standalone.
env:
RESOLUTION_RESULT_JSON: ${{ github.workspace }}/resolution-result.json
run: npx vitest run tests/benchmarks/resolution/tracer/tracer-validation.test.ts --reporter=verbose

- name: Merge resolution into build result
Expand Down
69 changes: 53 additions & 16 deletions scripts/resolution-benchmark.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,14 @@ interface DynamicEdge {
target_file: string;
}

/**
 * Outcome of one dynamic-tracer run for a language fixture, stored in the
 * result artifact alongside the static resolution metrics.
 */
interface TracerArtifact {
  /**
   * `'skipped'`: the tracer's toolchain was unavailable, so no edges were
   * captured — this is not a recall failure.
   * `'ok'`: the tracer ran; `edges` may still be empty when the language's
   * tracer legitimately produced no calls.
   */
  status: 'ok' | 'skipped';
  /** Call edges captured by the tracer (always empty when `status` is `'skipped'`). */
  edges: DynamicEdge[];
}

interface LangResult {
precision: number;
recall: number;
Expand All @@ -64,6 +72,14 @@ interface LangResult {
totalResolved: number;
totalExpected: number;
byMode: Record<string, ModeMetrics>;
// Edge lists are included so the gate test can reuse this artifact
// instead of rebuilding fixtures from scratch (see issue #1052).
falsePositiveEdges: string[];
falseNegativeEdges: string[];
// Raw tracer output so tests/benchmarks/resolution/tracer/tracer-validation.test.ts
// can reuse it instead of re-running the per-language tracer subprocess
// (see issue #1166).
tracer: TracerArtifact;
dynamicEdges?: number;
dynamicConfirmed?: number;
}
Expand Down Expand Up @@ -95,7 +111,10 @@ function edgeKey(sourceName: string, sourceFile: string, targetName: string, tar
return `${sourceName}@${normalizeFile(sourceFile)} -> ${targetName}@${normalizeFile(targetFile)}`;
}

function computeMetrics(resolvedEdges: ResolvedEdge[], expectedEdges: ExpectedEdge[]): LangResult {
function computeMetrics(
resolvedEdges: ResolvedEdge[],
expectedEdges: ExpectedEdge[],
): Omit<LangResult, 'tracer' | 'dynamicEdges' | 'dynamicConfirmed'> {
const resolvedSet = new Set(
resolvedEdges.map((e) => edgeKey(e.source_name, e.source_file, e.target_name, e.target_file)),
);
Expand Down Expand Up @@ -123,15 +142,20 @@ function computeMetrics(resolvedEdges: ResolvedEdge[], expectedEdges: ExpectedEd
m.recall = m.expected > 0 ? m.resolved / m.expected : 0;
}

// Keep full precision so the artifact-mode gate compares the exact same
// values the fixture-mode gate would compute. Rounding here let a near-miss
// like 0.8497 round up to 0.850 and silently clear a 0.85 threshold.
return {
precision: Math.round(precision * 1000) / 1000,
recall: Math.round(recall * 1000) / 1000,
precision,
recall,
truePositives: truePositives.size,
falsePositives: falsePositives.size,
falseNegatives: falseNegatives.size,
totalResolved: resolvedSet.size,
totalExpected: expectedSet.size,
byMode,
falsePositiveEdges: [...falsePositives],
falseNegativeEdges: [...falseNegatives],
};
}

Expand All @@ -153,10 +177,15 @@ const TRACER_SCRIPT = path.join(root, 'tests', 'benchmarks', 'resolution', 'trac

/**
* Attempt to run the dynamic call tracer for a language fixture.
* Returns captured edges on success, empty array on failure or unavailability.
*
* Returns `{ status: 'ok', edges }` on a successful run (edges may be empty if
* the tracer produced no calls) and `{ status: 'skipped', edges: [] }` when
* the tracer subprocess failed or its toolchain wasn't available. The status
* distinction lets tests reusing this artifact (see issue #1166) preserve the
* tracer-validation suite's skip-on-toolchain-missing behavior.
*/
function runDynamicTracer(lang: string): DynamicEdge[] {
if (!fs.existsSync(TRACER_SCRIPT)) return [];
function runDynamicTracer(lang: string): TracerArtifact {
if (!fs.existsSync(TRACER_SCRIPT)) return { status: 'skipped', edges: [] };

const fixtureDir = path.join(FIXTURES_DIR, lang);
try {
Expand All @@ -167,12 +196,17 @@ function runDynamicTracer(lang: string): DynamicEdge[] {
stdio: ['pipe', 'pipe', 'pipe'],
});
const parsed = JSON.parse(result);
if (parsed.error) {
const edges: DynamicEdge[] = Array.isArray(parsed.edges) ? parsed.edges : [];
// Mirror tracer-validation.test.ts: when run-tracer.mjs reports an error
// and produced no edges, treat the run as a toolchain-missing skip
// rather than a recall failure.
if (parsed.error && edges.length === 0) {
console.error(` Dynamic tracer for ${lang}: ${parsed.error}`);
return { status: 'skipped', edges: [] };
}
return Array.isArray(parsed.edges) ? parsed.edges : [];
return { status: 'ok', edges };
} catch {
return [];
return { status: 'skipped', edges: [] };
}
}

Expand Down Expand Up @@ -276,20 +310,23 @@ try {
const expectedEdges: ExpectedEdge[] = manifest.edges;

// Run dynamic tracer if available
const dynamicEdges = runDynamicTracer(lang);
const { dynamicConfirmed } = mergeWithDynamic(expectedEdges, dynamicEdges);
const tracer = runDynamicTracer(lang);
const { dynamicConfirmed } = mergeWithDynamic(expectedEdges, tracer.edges);

// Use only expected edges for metrics (dynamic edges are supplemental)
const metrics = computeMetrics(resolvedEdges, expectedEdges);
if (dynamicEdges.length > 0) {
metrics.dynamicEdges = dynamicEdges.length;
const metrics: LangResult = {
...computeMetrics(resolvedEdges, expectedEdges),
tracer,
};
if (tracer.edges.length > 0) {
metrics.dynamicEdges = tracer.edges.length;
metrics.dynamicConfirmed = dynamicConfirmed;
}
results[lang] = metrics;

const dynamicInfo =
dynamicEdges.length > 0
? ` dynamic=${dynamicEdges.length} confirmed=${dynamicConfirmed}`
tracer.edges.length > 0
? ` dynamic=${tracer.edges.length} confirmed=${dynamicConfirmed}`
: '';
console.error(
` ${lang}: precision=${(metrics.precision * 100).toFixed(1)}% recall=${(metrics.recall * 100).toFixed(1)}%${dynamicInfo}`,
Expand Down
118 changes: 100 additions & 18 deletions tests/benchmarks/resolution/resolution-benchmark.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@
* per language and per resolution mode.
*
* CI gate: fails if precision or recall drops below per-language thresholds.
*
* **Artifact mode (CI):** when `RESOLUTION_RESULT_JSON` points at a result
* file produced by `scripts/resolution-benchmark.ts`, the suite reads those
* pre-computed metrics and skips the fixture rebuild — avoiding the duplicate
* work that doubled pre-publish CI time (issue #1052). Local runs without
* the env var fall back to the build-from-fixtures path.
*/

import fs from 'node:fs';
Expand Down Expand Up @@ -262,6 +268,79 @@ function formatReport(lang: string, metrics: BenchmarkMetrics): string {
return lines.join('\n');
}

// ── Artifact loading (CI dedup, issue #1052) ─────────────────────────────

// Path to a pre-computed resolution result file, read from the
// RESOLUTION_RESULT_JSON environment variable. When it is unset (or empty)
// the suite does not load an artifact and builds metrics from fixtures.
const ARTIFACT_PATH = process.env.RESOLUTION_RESULT_JSON;

/**
 * Shape of a single language's entry in the pre-computed result artifact,
 * as this suite expects to find it after `JSON.parse`.
 */
interface ArtifactLangResult {
  /** Overall precision for the language. */
  precision: number;
  /** Overall recall for the language. */
  recall: number;
  /** Count of true-positive edges. */
  truePositives: number;
  /** Count of false-positive edges. */
  falsePositives: number;
  /** Count of false-negative edges. */
  falseNegatives: number;
  /** Number of distinct resolved edges. */
  totalResolved: number;
  /** Number of distinct expected edges. */
  totalExpected: number;
  /** Per-resolution-mode metrics, keyed by mode name. */
  byMode: Record<string, ModeMetrics>;
  /**
   * False-positive edge keys. Declared optional in the type, but
   * `metricsFromArtifact` rejects an entry where this is not an array.
   */
  falsePositiveEdges?: string[];
  /**
   * False-negative edge keys. Declared optional in the type, but
   * `metricsFromArtifact` rejects an entry where this is not an array.
   */
  falseNegativeEdges?: string[];
}

/**
 * Read and sanity-check the pre-computed resolution result artifact.
 *
 * Only the top-level container is validated here (it must be a non-empty,
 * non-array JSON object keyed by language); per-language entries are
 * validated when they are converted to metrics.
 *
 * @param artifactPath - Filesystem path of the JSON artifact.
 * @returns The parsed artifact, keyed by language.
 * @throws Error if the file does not exist, is not valid JSON, or does not
 *   contain at least one language entry.
 */
function loadArtifact(artifactPath: string): Record<string, ArtifactLangResult> {
  if (!fs.existsSync(artifactPath)) {
    throw new Error(
      `RESOLUTION_RESULT_JSON=${artifactPath} not found — run scripts/resolution-benchmark.ts first.`,
    );
  }

  const text = fs.readFileSync(artifactPath, 'utf-8');

  // Parse into `unknown` so nothing is trusted before the shape checks below,
  // and attach the path to parse failures: a bare SyntaxError gives no hint
  // about which file was truncated or corrupted.
  let parsed: unknown;
  try {
    parsed = JSON.parse(text);
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(
      `RESOLUTION_RESULT_JSON=${artifactPath} is not valid JSON (${reason}) — regenerate with scripts/resolution-benchmark.ts.`,
    );
  }

  // Refuse to proceed on an empty artifact: with zero languages, vitest would
  // register no describe blocks and exit 0, silently passing the gate without
  // evaluating a single threshold. Arrays are rejected as well — they are
  // `typeof 'object'` with index keys, which would otherwise be mistaken for
  // language names.
  if (
    parsed === null ||
    typeof parsed !== 'object' ||
    Array.isArray(parsed) ||
    Object.keys(parsed).length === 0
  ) {
    throw new Error(
      `RESOLUTION_RESULT_JSON=${artifactPath} contains no language results — regenerate with scripts/resolution-benchmark.ts.`,
    );
  }

  // Safe to narrow: the container is a non-empty plain JSON object. Entry
  // contents are still unverified at this point.
  return parsed as Record<string, ArtifactLangResult>;
}

/**
 * Convert one language's entry from the pre-computed result artifact into
 * the metrics object consumed by the threshold tests.
 *
 * The entry originates from `JSON.parse`, so its static type is only a hint;
 * every field is checked at runtime before it is used.
 *
 * @param lang - Language key, used to label error messages.
 * @param raw - The artifact entry for `lang`.
 * @returns A new metrics object holding the validated fields.
 * @throws Error if the entry is not an object, if a required numeric field or
 *   `byMode` is missing or mistyped, or if either edge list is not an array.
 */
function metricsFromArtifact(lang: string, raw: ArtifactLangResult): BenchmarkMetrics {
  // A `null`, primitive, or array entry (e.g. `{"js": null}`) must produce the
  // same actionable error as a missing field rather than a TypeError from the
  // property reads below.
  const isRecord = raw !== null && typeof raw === 'object' && !Array.isArray(raw);
  if (
    !isRecord ||
    typeof raw.precision !== 'number' ||
    typeof raw.recall !== 'number' ||
    typeof raw.truePositives !== 'number' ||
    typeof raw.falsePositives !== 'number' ||
    typeof raw.falseNegatives !== 'number' ||
    typeof raw.totalResolved !== 'number' ||
    typeof raw.totalExpected !== 'number' ||
    !raw.byMode ||
    typeof raw.byMode !== 'object' ||
    // Arrays are `typeof 'object'` too, but are not a mode-keyed record.
    Array.isArray(raw.byMode)
  ) {
    throw new Error(
      `Resolution artifact for ${lang} is missing required numeric fields — regenerate with the current resolution-benchmark.ts.`,
    );
  }
  // The edge lists are optional in the artifact type but required here.
  if (!Array.isArray(raw.falsePositiveEdges) || !Array.isArray(raw.falseNegativeEdges)) {
    throw new Error(
      `Resolution artifact for ${lang} is missing falsePositiveEdges/falseNegativeEdges — regenerate with the current resolution-benchmark.ts.`,
    );
  }
  return {
    precision: raw.precision,
    recall: raw.recall,
    truePositives: raw.truePositives,
    falsePositives: raw.falsePositives,
    falseNegatives: raw.falseNegatives,
    totalResolved: raw.totalResolved,
    totalExpected: raw.totalExpected,
    byMode: raw.byMode,
    falsePositiveEdges: raw.falsePositiveEdges,
    falseNegativeEdges: raw.falseNegativeEdges,
  };
}

// ── Tests ────────────────────────────────────────────────────────────────

function discoverFixtures(): string[] {
Expand All @@ -276,7 +355,11 @@ function discoverFixtures(): string[] {
return languages;
}

const languages = discoverFixtures();
const artifact = ARTIFACT_PATH ? loadArtifact(ARTIFACT_PATH) : null;
// In artifact mode, drive the suite from the keys in the artifact so we never
// silently skip a language the script reported. In local mode, discover from
// the filesystem like before.
const languages = artifact ? Object.keys(artifact).sort() : discoverFixtures();

/** Stores all results for the final summary */
const allResults: Record<string, BenchmarkMetrics> = {};
Expand Down Expand Up @@ -309,22 +392,24 @@ describe('Call Resolution Precision/Recall', () => {

for (const lang of languages) {
describe(lang, () => {
let fixtureDir: string;
let resolvedEdges: ResolvedEdge[];
let expectedEdges: ExpectedEdge[];
let fixtureDir: string | null = null;
let metrics: BenchmarkMetrics;

beforeAll(async () => {
fixtureDir = copyFixture(lang);
await buildFixtureGraph(fixtureDir);
if (artifact) {
metrics = metricsFromArtifact(lang, artifact[lang]);
} else {
fixtureDir = copyFixture(lang);
await buildFixtureGraph(fixtureDir);

resolvedEdges = extractResolvedEdges(fixtureDir);
const resolvedEdges = extractResolvedEdges(fixtureDir) as ResolvedEdge[];

const manifestPath = path.join(FIXTURES_DIR, lang, 'expected-edges.json');
const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8'));
expectedEdges = manifest.edges;
const manifestPath = path.join(FIXTURES_DIR, lang, 'expected-edges.json');
const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8'));
const expectedEdges: ExpectedEdge[] = manifest.edges;

metrics = computeMetrics(resolvedEdges, expectedEdges);
metrics = computeMetrics(resolvedEdges, expectedEdges);
}
allResults[lang] = metrics;
}, 60_000);

Expand All @@ -334,16 +419,13 @@ describe('Call Resolution Precision/Recall', () => {
}
});

test('builds graph successfully', () => {
expect(resolvedEdges).toBeDefined();
expect(Array.isArray(resolvedEdges)).toBe(true);
// Some languages may have 0 resolved call edges if resolution isn't
// implemented yet — that's okay, the precision/recall tests will
// catch it at the appropriate threshold level.
test('metrics are populated', () => {
expect(metrics).toBeDefined();
expect(metrics.totalResolved).toBeGreaterThanOrEqual(0);
});

test('expected edges manifest is non-empty', () => {
expect(expectedEdges.length).toBeGreaterThan(0);
expect(metrics.totalExpected).toBeGreaterThan(0);
});

test('precision meets threshold', () => {
Expand Down
Loading
Loading