Skip to content

Commit 5c5bb87

Browse files
christso and claude authored
feat(studio): comparison analytics charts for skills/workflow benchmarking (#1104)
* feat(studio): add analytics charts with baseline comparison

  Rename Compare tab to Analytics. Add recharts for visualization.
  Implement ?baseline=<target> query param on /api/compare endpoint to compute delta and normalized gain (g) per cell.
  Add collapsible analytics section below the aggregated matrix with:
  - Normalized gain bar chart (horizontal, color-coded by effect)
  - Tag × target pass rate heatmap
  - Negative delta regression table
  - Score distribution histogram
  - Trend-over-time line chart

  Closes #1102

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix(studio): address code review findings for analytics charts

  - Fix serve.ts: build cells with delta/normalized_gain fields upfront instead of mutating via type bypass
  - Fix query key collision: use distinct keys for compare vs baseline queries
  - Add `enabled: !!baseline` guard to prevent unnecessary API calls
  - Remove dead CostVsImprovement component and unused recharts imports
  - Fix misleading GainRow.testId → experiment naming
  - Rename "Compare runs" heading to "Analyze runs"
  - Fix biome formatting issues

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* refactor(studio): rename CompareTab to AnalyticsTab and update docs

  - Rename CompareTab.tsx → AnalyticsTab.tsx with updated exports
  - Update imports in index.tsx and $benchmarkId.tsx route files
  - Update studio.mdx docs: rename Compare section to Analytics
  - Add analytics charts documentation with baseline selector, normalized gain chart, tag heatmap, negative delta table, score distribution, and trend-over-time chart descriptions
  - Add three new screenshots: aggregated matrix, charts with baseline selector, and score trend over time

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 7766f39 commit 5c5bb87

13 files changed

Lines changed: 750 additions & 36 deletions

File tree

apps/cli/src/commands/results/serve.ts

Lines changed: 32 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -695,6 +695,23 @@ async function handleCompare(c: C, { searchDir, agentvDir }: DataContext) {
695695
}
696696
}
697697

698+
// ── Baseline delta / normalized-gain computation ─────────────────────
699+
const baselineTarget = c.req.query('baseline') ?? '';
700+
if (baselineTarget && !targetsSet.has(baselineTarget)) {
701+
return c.json({ error: `Baseline target "${baselineTarget}" does not exist in the data` }, 400);
702+
}
703+
704+
// Build baseline lookup before constructing cells so we can include
705+
// delta/normalized_gain in the initial cell objects (no mutation needed).
706+
const baselineScores = new Map<string, number>();
707+
if (baselineTarget) {
708+
for (const entry of cellMap.values()) {
709+
if (entry.target === baselineTarget && entry.evalCount > 0) {
710+
baselineScores.set(entry.experiment, entry.scoreSum / entry.evalCount);
711+
}
712+
}
713+
}
714+
698715
const cells = [...cellMap.values()].map((entry) => {
699716
// Deduplicate tests: keep only the latest entry per test_id (last wins by insertion order)
700717
const dedupMap = new Map<string, (typeof entry.tests)[number]>();
@@ -706,15 +723,28 @@ async function handleCompare(c: C, { searchDir, agentvDir }: DataContext) {
706723
// Cap to most recent entries to prevent unbounded payloads
707724
const cappedTests = dedupedTests.slice(-MAX_TESTS_PER_CELL);
708725

709-
return {
726+
const avgScore = entry.evalCount > 0 ? entry.scoreSum / entry.evalCount : 0;
727+
const cell: Record<string, unknown> = {
710728
experiment: entry.experiment,
711729
target: entry.target,
712730
eval_count: entry.evalCount,
713731
passed_count: entry.passedCount,
714732
pass_rate: entry.evalCount > 0 ? entry.passedCount / entry.evalCount : 0,
715-
avg_score: entry.evalCount > 0 ? entry.scoreSum / entry.evalCount : 0,
733+
avg_score: avgScore,
716734
tests: cappedTests,
717735
};
736+
737+
// Append baseline comparison fields when a baseline is selected
738+
if (baselineTarget && entry.target !== baselineTarget) {
739+
const baseAvg = baselineScores.get(entry.experiment);
740+
if (baseAvg !== undefined) {
741+
cell.delta = Math.round((avgScore - baseAvg) * 1000) / 1000;
742+
cell.normalized_gain =
743+
baseAvg >= 1.0 ? null : Math.round(((avgScore - baseAvg) / (1 - baseAvg)) * 1000) / 1000;
744+
}
745+
}
746+
747+
return cell;
718748
});
719749

720750
// Per-run entries sorted by timestamp descending (newest first).

apps/studio/package.json

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,8 @@
1313
"@tanstack/react-query": "^5.75.5",
1414
"@tanstack/react-router": "^1.120.3",
1515
"react": "^19.1.0",
16-
"react-dom": "^19.1.0"
16+
"react-dom": "^19.1.0",
17+
"recharts": "^3.8.1"
1718
},
1819
"devDependencies": {
1920
"@tailwindcss/vite": "^4.1.7",

0 commit comments

Comments
 (0)