From f9e510e1565bb7ceaa1101aab4288fd14069f523 Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Mon, 15 Jun 2026 00:22:41 -0500 Subject: [PATCH 1/3] feat(inference): add per-run benchmark fetch and run-coverage queries --- .../src/app/api/v1/benchmarks/route.test.ts | 26 +++++++- .../app/src/app/api/v1/benchmarks/route.ts | 23 ++++++- .../app/api/v1/workflow-info/route.test.ts | 31 ++++++--- .../app/src/app/api/v1/workflow-info/route.ts | 7 ++- .../app/src/hooks/api/use-benchmarks.test.ts | 22 ++++++- packages/app/src/hooks/api/use-benchmarks.ts | 15 ++++- packages/app/src/lib/api.ts | 22 +++++++ packages/db/src/json-provider.ts | 63 +++++++++++++++++++ packages/db/src/queries/benchmarks.ts | 51 +++++++++++++++ packages/db/src/queries/workflow-info.ts | 41 ++++++++++++ 10 files changed, 284 insertions(+), 17 deletions(-) diff --git a/packages/app/src/app/api/v1/benchmarks/route.test.ts b/packages/app/src/app/api/v1/benchmarks/route.test.ts index 5a5251f5..8b7d573e 100644 --- a/packages/app/src/app/api/v1/benchmarks/route.test.ts +++ b/packages/app/src/app/api/v1/benchmarks/route.test.ts @@ -1,7 +1,8 @@ import { describe, expect, it, vi, beforeEach } from 'vitest'; -const { mockGetLatestBenchmarks, mockGetDb } = vi.hoisted(() => ({ +const { mockGetLatestBenchmarks, mockGetBenchmarksForRun, mockGetDb } = vi.hoisted(() => ({ mockGetLatestBenchmarks: vi.fn(), + mockGetBenchmarksForRun: vi.fn(), mockGetDb: vi.fn(() => 'mock-sql'), })); @@ -13,6 +14,7 @@ vi.mock('@semianalysisai/inferencex-db/connection', () => ({ vi.mock('@semianalysisai/inferencex-db/queries/benchmarks', () => ({ getLatestBenchmarks: mockGetLatestBenchmarks, + getBenchmarksForRun: mockGetBenchmarksForRun, })); vi.mock('@/lib/api-cache', () => ({ @@ -125,6 +127,28 @@ describe('GET /api/v1/benchmarks', () => { ); }); + it('routes exactRun=true + runId to the exact-run query', async () => { + const runRows = [{ id: 1, hardware: 'mi300x' }]; + 
mockGetBenchmarksForRun.mockResolvedValueOnce(runRows); + + const res = await GET( + req('/api/v1/benchmarks?model=DeepSeek-R1-0528&runId=27489075807&exactRun=true'), + ); + expect(res.status).toBe(200); + expect(await res.json()).toEqual(runRows); + expect(mockGetBenchmarksForRun).toHaveBeenCalledWith('mock-sql', ['dsr1'], '27489075807'); + expect(mockGetLatestBenchmarks).not.toHaveBeenCalled(); + }); + + it('ignores exactRun without a runId (falls back to latest)', async () => { + mockGetLatestBenchmarks.mockResolvedValueOnce([]); + + const res = await GET(req('/api/v1/benchmarks?model=DeepSeek-R1-0528&exactRun=true')); + expect(res.status).toBe(200); + expect(mockGetBenchmarksForRun).not.toHaveBeenCalled(); + expect(mockGetLatestBenchmarks).toHaveBeenCalled(); + }); + it('returns 500 when query throws', async () => { mockGetLatestBenchmarks.mockRejectedValueOnce(new Error('DB down')); diff --git a/packages/app/src/app/api/v1/benchmarks/route.ts b/packages/app/src/app/api/v1/benchmarks/route.ts index 59d30853..dd2267e3 100644 --- a/packages/app/src/app/api/v1/benchmarks/route.ts +++ b/packages/app/src/app/api/v1/benchmarks/route.ts @@ -3,7 +3,10 @@ import { type NextRequest, NextResponse } from 'next/server'; import { DISPLAY_MODEL_TO_DB } from '@semianalysisai/inferencex-constants'; import { FIXTURES_MODE, JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection'; import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider'; -import { getLatestBenchmarks } from '@semianalysisai/inferencex-db/queries/benchmarks'; +import { + getBenchmarksForRun, + getLatestBenchmarks, +} from '@semianalysisai/inferencex-db/queries/benchmarks'; import { cachedJson, cachedQuery } from '@/lib/api-cache'; import { loadFixture } from '@/lib/test-fixtures'; @@ -20,6 +23,17 @@ const getCachedBenchmarks = cachedQuery( { blobOnly: true }, ); +// Exactly one run's results (GPU comparison of individual same-day runs). 
Cached +// under a distinct key prefix so it never collides with the latest/as-of query. +const getCachedBenchmarksForRun = cachedQuery( + (dbModelKeys: string[], runId: string) => { + if (JSON_MODE) return Promise.resolve(jsonProvider.getBenchmarksForRun(dbModelKeys, runId)); + return getBenchmarksForRun(getDb(), dbModelKeys, runId); + }, + 'benchmarks-run', + { blobOnly: true }, +); + export async function GET(request: NextRequest) { const params = request.nextUrl.searchParams; const model = params.get('model') ?? ''; @@ -28,6 +42,8 @@ export async function GET(request: NextRequest) { // Numeric GitHub run id only — anything else is ignored (treated as "latest"). const runIdParam = params.get('runId'); const runId = runIdParam && /^\d+$/u.test(runIdParam) ? runIdParam : undefined; + // exactRun=true → return exactly this run's results (GPU comparison of same-day runs). + const exactRun = params.get('exactRun') === 'true'; const dbModelKeys = DISPLAY_MODEL_TO_DB[model]; if (!dbModelKeys || dbModelKeys.length === 0) { return NextResponse.json({ error: 'Unknown model' }, { status: 400 }); @@ -35,7 +51,10 @@ export async function GET(request: NextRequest) { if (FIXTURES_MODE) return cachedJson(loadFixture('benchmarks')); try { - const rows = await getCachedBenchmarks(dbModelKeys, date, exact || undefined, runId); + const rows = + exactRun && runId + ? 
await getCachedBenchmarksForRun(dbModelKeys, runId) + : await getCachedBenchmarks(dbModelKeys, date, exact || undefined, runId); return cachedJson(rows); } catch (error) { console.error('Error fetching benchmarks:', error); diff --git a/packages/app/src/app/api/v1/workflow-info/route.test.ts b/packages/app/src/app/api/v1/workflow-info/route.test.ts index 4eb6765e..fde062f2 100644 --- a/packages/app/src/app/api/v1/workflow-info/route.test.ts +++ b/packages/app/src/app/api/v1/workflow-info/route.test.ts @@ -1,12 +1,18 @@ import { describe, expect, it, vi, beforeEach } from 'vitest'; -const { mockGetWorkflowRunsByDate, mockGetChangelogByDate, mockGetDateConfigs, mockGetDb } = - vi.hoisted(() => ({ - mockGetWorkflowRunsByDate: vi.fn(), - mockGetChangelogByDate: vi.fn(), - mockGetDateConfigs: vi.fn(), - mockGetDb: vi.fn(() => 'mock-sql'), - })); +const { + mockGetWorkflowRunsByDate, + mockGetChangelogByDate, + mockGetDateConfigs, + mockGetRunConfigsByDate, + mockGetDb, +} = vi.hoisted(() => ({ + mockGetWorkflowRunsByDate: vi.fn(), + mockGetChangelogByDate: vi.fn(), + mockGetDateConfigs: vi.fn(), + mockGetRunConfigsByDate: vi.fn(), + mockGetDb: vi.fn(() => 'mock-sql'), +})); vi.mock('@semianalysisai/inferencex-db/connection', () => ({ getDb: mockGetDb, @@ -18,6 +24,7 @@ vi.mock('@semianalysisai/inferencex-db/queries/workflow-info', () => ({ getWorkflowRunsByDate: mockGetWorkflowRunsByDate, getChangelogByDate: mockGetChangelogByDate, getDateConfigs: mockGetDateConfigs, + getRunConfigsByDate: mockGetRunConfigsByDate, })); vi.mock('@/lib/api-cache', () => ({ @@ -60,9 +67,13 @@ describe('GET /api/v1/workflow-info', () => { const mockRuns = [{ id: 1, status: 'completed' }]; const mockChangelogs = [{ version: '1.0', changes: 'Initial' }]; const mockConfigs = [{ model: 'dsr1', gpu: 'h200' }]; + const mockRunConfigs = [ + { github_run_id: 1, model: 'dsr1', hardware: 'h200', framework: 'vllm' }, + ]; mockGetWorkflowRunsByDate.mockResolvedValueOnce(mockRuns); 
mockGetChangelogByDate.mockResolvedValueOnce(mockChangelogs); mockGetDateConfigs.mockResolvedValueOnce(mockConfigs); + mockGetRunConfigsByDate.mockResolvedValueOnce(mockRunConfigs); const res = await GET(req('/api/v1/workflow-info?date=2026-03-01')); expect(res.status).toBe(200); @@ -71,21 +82,24 @@ describe('GET /api/v1/workflow-info', () => { runs: mockRuns, changelogs: mockChangelogs, configs: mockConfigs, + runConfigs: mockRunConfigs, }); expect(mockGetWorkflowRunsByDate).toHaveBeenCalledWith('mock-sql', '2026-03-01'); expect(mockGetChangelogByDate).toHaveBeenCalledWith('mock-sql', '2026-03-01'); expect(mockGetDateConfigs).toHaveBeenCalledWith('mock-sql', '2026-03-01'); + expect(mockGetRunConfigsByDate).toHaveBeenCalledWith('mock-sql', '2026-03-01'); }); it('accepts empty date param (returns all)', async () => { mockGetWorkflowRunsByDate.mockResolvedValueOnce([]); mockGetChangelogByDate.mockResolvedValueOnce([]); mockGetDateConfigs.mockResolvedValueOnce([]); + mockGetRunConfigsByDate.mockResolvedValueOnce([]); const res = await GET(req('/api/v1/workflow-info')); expect(res.status).toBe(200); const body = await res.json(); - expect(body).toEqual({ runs: [], changelogs: [], configs: [] }); + expect(body).toEqual({ runs: [], changelogs: [], configs: [], runConfigs: [] }); expect(mockGetWorkflowRunsByDate).toHaveBeenCalledWith('mock-sql', ''); }); @@ -93,6 +107,7 @@ describe('GET /api/v1/workflow-info', () => { mockGetWorkflowRunsByDate.mockRejectedValueOnce(new Error('Timeout')); mockGetChangelogByDate.mockResolvedValueOnce([]); mockGetDateConfigs.mockResolvedValueOnce([]); + mockGetRunConfigsByDate.mockResolvedValueOnce([]); const res = await GET(req('/api/v1/workflow-info?date=2026-03-01')); expect(res.status).toBe(500); diff --git a/packages/app/src/app/api/v1/workflow-info/route.ts b/packages/app/src/app/api/v1/workflow-info/route.ts index 66af69d5..fc17db31 100644 --- a/packages/app/src/app/api/v1/workflow-info/route.ts +++ 
b/packages/app/src/app/api/v1/workflow-info/route.ts @@ -5,6 +5,7 @@ import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider'; import { getChangelogByDate, getDateConfigs, + getRunConfigsByDate, getWorkflowRunsByDate, } from '@semianalysisai/inferencex-db/queries/workflow-info'; @@ -19,15 +20,17 @@ const getCachedWorkflowInfo = cachedQuery(async (date: string) => { runs: jsonProvider.getWorkflowRunsByDate(date), changelogs: jsonProvider.getChangelogByDate(date), configs: jsonProvider.getDateConfigs(date), + runConfigs: jsonProvider.getRunConfigsByDate(date), }; } const sql = getDb(); - const [runs, changelogs, configs] = await Promise.all([ + const [runs, changelogs, configs, runConfigs] = await Promise.all([ getWorkflowRunsByDate(sql, date), getChangelogByDate(sql, date), getDateConfigs(sql, date), + getRunConfigsByDate(sql, date), ]); - return { runs, changelogs, configs }; + return { runs, changelogs, configs, runConfigs }; }, 'workflow-info'); export async function GET(request: NextRequest) { diff --git a/packages/app/src/hooks/api/use-benchmarks.test.ts b/packages/app/src/hooks/api/use-benchmarks.test.ts index 21876018..48a861d1 100644 --- a/packages/app/src/hooks/api/use-benchmarks.test.ts +++ b/packages/app/src/hooks/api/use-benchmarks.test.ts @@ -11,12 +11,20 @@ describe('benchmarkQueryOptions', () => { '2026-03-01', 'latest', 'all', + 'asof', ]); }); it('builds exact query key when exact=true', () => { const opts = benchmarkQueryOptions('DeepSeek-R1-0528', '2026-03-01', true, true); - expect(opts.queryKey).toEqual(['benchmarks', 'DeepSeek-R1-0528', '2026-03-01', 'exact', 'all']); + expect(opts.queryKey).toEqual([ + 'benchmarks', + 'DeepSeek-R1-0528', + '2026-03-01', + 'exact', + 'all', + 'asof', + ]); }); it('includes the runId in the query key for the as-of-run view', () => { @@ -33,9 +41,21 @@ describe('benchmarkQueryOptions', () => { '2026-03-01', 'latest', '27489075807', + 'asof', ]); }); + it('marks the key as an exact-run query 
when exactRun=true', () => { + const opts = benchmarkQueryOptions('m', '', true, false, '27489075807', true); + expect(opts.queryKey).toEqual(['benchmarks', 'm', '', 'latest', '27489075807', 'run']); + }); + + it('produces distinct keys for as-of vs exact-run with the same runId', () => { + const asof = benchmarkQueryOptions('m', '2026-03-01', true, false, '100', false); + const exact = benchmarkQueryOptions('m', '2026-03-01', true, false, '100', true); + expect(asof.queryKey).not.toEqual(exact.queryKey); + }); + it('produces distinct keys for different runIds (no cache collision)', () => { const a = benchmarkQueryOptions('m', '2026-03-01', true, false, '100'); const b = benchmarkQueryOptions('m', '2026-03-01', true, false, '101'); diff --git a/packages/app/src/hooks/api/use-benchmarks.ts b/packages/app/src/hooks/api/use-benchmarks.ts index ff32b250..a8d634f1 100644 --- a/packages/app/src/hooks/api/use-benchmarks.ts +++ b/packages/app/src/hooks/api/use-benchmarks.ts @@ -8,13 +8,22 @@ export function benchmarkQueryOptions( date: string, enabled = true, exact?: boolean, - /** GitHub run id for the "as of run" view (main chart only). */ + /** GitHub run id for the "as of run" view (main chart) or the exact-run comparison. */ runId?: string, + /** When true with a runId, fetch exactly that run's results (GPU comparison). */ + exactRun?: boolean, ) { return { - queryKey: ['benchmarks', model, date, exact ? 'exact' : 'latest', runId ?? 'all'] as const, + queryKey: [ + 'benchmarks', + model, + date, + exact ? 'exact' : 'latest', + runId ?? 'all', + exactRun ? 
'run' : 'asof', + ] as const, queryFn: ({ signal }: { signal: AbortSignal }) => - fetchBenchmarks(model, date, exact, signal, runId), + fetchBenchmarks(model, date, exact, signal, runId, exactRun), enabled: enabled && Boolean(model), }; } diff --git a/packages/app/src/lib/api.ts b/packages/app/src/lib/api.ts index 49d8d4d0..0dac5883 100644 --- a/packages/app/src/lib/api.ts +++ b/packages/app/src/lib/api.ts @@ -74,10 +74,29 @@ export interface DateConfigRow { disagg: boolean; } +/** + * Per-(run, config) coverage for a date — which workflow runs produced benchmark + * data for which configs. Data-driven, so a run that shipped data without a + * changelog entry still appears (used to enumerate every run on a date). + */ +export interface RunConfigRow { + github_run_id: number; + run_started_at: string | null; + html_url: string | null; + head_sha: string | null; + model: string; + precision: string; + hardware: string; + framework: string; + spec_method: string; + disagg: boolean; +} + export interface WorkflowInfoResponse { runs: WorkflowRunRow[]; changelogs: ChangelogRow[]; configs: DateConfigRow[]; + runConfigs: RunConfigRow[]; } export interface ReliabilityRow { @@ -127,11 +146,14 @@ export function fetchBenchmarks( exact?: boolean, signal?: AbortSignal, runId?: string, + /** When true with a runId, fetch exactly that run's results (GPU comparison). 
*/ + exactRun?: boolean, ) { const params = new URLSearchParams({ model }); if (date) params.set('date', date); if (exact) params.set('exact', 'true'); if (runId) params.set('runId', runId); + if (exactRun) params.set('exactRun', 'true'); return fetchJson(`/api/v1/benchmarks?${params}`, signal); } diff --git a/packages/db/src/json-provider.ts b/packages/db/src/json-provider.ts index c77ef687..1ed155c9 100644 --- a/packages/db/src/json-provider.ts +++ b/packages/db/src/json-provider.ts @@ -19,6 +19,7 @@ import type { AvailabilityRow, ChangelogRow, DateConfigRow, + RunConfigRow, WorkflowRunRow, } from './queries/workflow-info.js'; @@ -409,6 +410,32 @@ export function getLatestBenchmarks( }); } +/** In-memory mirror of {@link import('./queries/benchmarks.js').getBenchmarksForRun}. */ +export function getBenchmarksForRun( + modelKey: string | string[], + githubRunId: string | number, +): BenchmarkRow[] { + const s = getStore(); + const modelKeys = new Set(Array.isArray(modelKey) ? modelKey : [modelKey]); + const run = s.latestRuns.get(Number(githubRunId)); + if (!run) return []; + + const seen = new Map(); + for (const br of s.benchmarks) { + if (br.error !== null && br.error !== undefined) continue; + if (br.workflow_run_id !== run.id) continue; + const c = s.configs.get(br.config_id); + if (!c || !modelKeys.has(c.model)) continue; + const key = `${br.config_id}:${br.conc}:${br.isl}:${br.osl}`; + if (!seen.has(key)) seen.set(key, br); + } + + return [...seen.values()].map((br) => { + const c = s.configs.get(br.config_id)!; + return toBenchmarkRow(br, c, run); + }); +} + export function getAllBenchmarksForHistory( modelKey: string | string[], isl: number, @@ -621,6 +648,42 @@ export function getDateConfigs(date: string): DateConfigRow[] { return rows; } +export function getRunConfigsByDate(date: string): RunConfigRow[] { + const s = getStore(); + const dateStr = toDateString(date); + + const seen = new Set(); + const rows: RunConfigRow[] = []; + + for (const br of 
s.benchmarks) { + if (br.error !== null && br.error !== undefined) continue; + if (toDateString(br.date) !== dateStr) continue; + const wr = s.latestRunsById.get(br.workflow_run_id); + if (!wr) continue; + const c = s.configs.get(br.config_id); + if (!c) continue; + + const key = `${wr.github_run_id}|${c.model}|${c.precision}|${c.hardware}|${c.framework}|${c.spec_method}|${c.disagg}`; + if (seen.has(key)) continue; + seen.add(key); + + rows.push({ + github_run_id: wr.github_run_id, + run_started_at: wr.run_started_at ?? wr.created_at, + html_url: wr.html_url, + head_sha: wr.head_sha, + model: c.model, + precision: c.precision, + hardware: c.hardware, + framework: c.framework, + spec_method: c.spec_method, + disagg: c.disagg, + }); + } + + return rows; +} + export function getServerLog(benchmarkResultId: number): string | null { const s = getStore(); const logId = s.benchmarkServerLogMap.get(benchmarkResultId); diff --git a/packages/db/src/queries/benchmarks.ts b/packages/db/src/queries/benchmarks.ts index 47e3c328..49c60604 100644 --- a/packages/db/src/queries/benchmarks.ts +++ b/packages/db/src/queries/benchmarks.ts @@ -170,6 +170,57 @@ export async function getLatestBenchmarks( return rows as unknown as BenchmarkRow[]; } +/** + * Fetch the benchmark results produced by ONE specific workflow run (by GitHub + * run id). Unlike {@link getLatestBenchmarks}, this returns exactly what that run + * measured — used by the GPU comparison view to plot individual same-day runs as + * distinct series (e.g. comparing a day-zero sweep against a same-day re-sweep). + * Returns an empty array if the run produced no results for the model. + */ +export async function getBenchmarksForRun( + sql: DbClient, + modelKey: string | string[], + githubRunId: string | number, +): Promise<BenchmarkRow[]> { + const modelKeys = Array.isArray(modelKey) ? 
modelKey : [modelKey]; + const rows = await sql` + SELECT DISTINCT ON (br.config_id, br.conc, br.isl, br.osl) + c.hardware, + c.framework, + c.model, + c.precision, + c.spec_method, + c.disagg, + c.is_multinode, + c.prefill_tp, + c.prefill_ep, + c.prefill_dp_attention, + c.prefill_num_workers, + c.decode_tp, + c.decode_ep, + c.decode_dp_attention, + c.decode_num_workers, + c.num_prefill_gpu, + c.num_decode_gpu, + br.isl, + br.osl, + br.conc, + br.image, + br.metrics, + br.workers, + br.date::text, + CASE WHEN wr.html_url IS NOT NULL THEN wr.html_url || '/attempts/' || wr.run_attempt ELSE NULL END AS run_url + FROM benchmark_results br + JOIN configs c ON c.id = br.config_id + JOIN latest_workflow_runs wr ON wr.id = br.workflow_run_id + WHERE c.model = ANY(${modelKeys}) + AND br.error IS NULL + AND wr.github_run_id = ${Number(githubRunId)} + ORDER BY br.config_id, br.conc, br.isl, br.osl, br.date DESC + `; + return rows as unknown as BenchmarkRow[]; +} + /** * Fetch ALL benchmark results for a model + sequence across ALL dates. * No DISTINCT ON — returns every successful result, one per (config, conc, date). 
diff --git a/packages/db/src/queries/workflow-info.ts b/packages/db/src/queries/workflow-info.ts index b4e4f255..dfcb9e9f 100644 --- a/packages/db/src/queries/workflow-info.ts +++ b/packages/db/src/queries/workflow-info.ts @@ -65,6 +65,47 @@ export async function getChangelogByDate(sql: DbClient, date: string): Promise { + const rows = await sql` + SELECT DISTINCT + wr.github_run_id, + to_char(COALESCE(wr.run_started_at, wr.created_at), 'YYYY-MM-DD"T"HH24:MI:SS"Z"') as run_started_at, + wr.html_url, + wr.head_sha, + c.model, + c.precision, + c.hardware, + c.framework, + c.spec_method, + c.disagg + FROM benchmark_results br + JOIN configs c ON c.id = br.config_id + JOIN latest_workflow_runs wr ON wr.id = br.workflow_run_id + WHERE br.date = ${date}::date + AND br.error IS NULL + `; + return rows as unknown as RunConfigRow[]; +} + /** Get distinct model/sequence/precision/hardware combos for a date. */ export async function getDateConfigs(sql: DbClient, date: string): Promise { const rows = await sql` From 02323f03006e743d73eac9089b62d19a283881f8 Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Mon, 15 Jun 2026 00:22:58 -0500 Subject: [PATCH 2/3] feat(inference): add comparison run-entry model and run enumeration helpers --- .../inference/utils/comparisonEntry.test.ts | 94 ++++++++++++++ .../inference/utils/comparisonEntry.ts | 122 ++++++++++++++++++ .../inference/utils/runEnumeration.test.ts | 107 +++++++++++++++ .../inference/utils/runEnumeration.ts | 80 ++++++++++++ 4 files changed, 403 insertions(+) create mode 100644 packages/app/src/components/inference/utils/comparisonEntry.test.ts create mode 100644 packages/app/src/components/inference/utils/comparisonEntry.ts create mode 100644 packages/app/src/components/inference/utils/runEnumeration.test.ts create mode 100644 packages/app/src/components/inference/utils/runEnumeration.ts diff --git a/packages/app/src/components/inference/utils/comparisonEntry.test.ts 
b/packages/app/src/components/inference/utils/comparisonEntry.test.ts new file mode 100644 index 00000000..9eafa341 --- /dev/null +++ b/packages/app/src/components/inference/utils/comparisonEntry.test.ts @@ -0,0 +1,94 @@ +import { describe, expect, it } from 'vitest'; + +import { + buildRunNumbering, + comparisonEntryDate, + comparisonEntryLabel, + comparisonEntrySortValue, + isRunComparisonEntry, + makeRunComparisonEntry, + parseComparisonEntry, + resolveComparisonEntries, +} from './comparisonEntry'; + +describe('comparisonEntry', () => { + const runEntry = makeRunComparisonEntry('2026-06-14', '27489075807'); + + it('round-trips a run entry through make/parse', () => { + expect(runEntry).toBe('2026-06-14~r27489075807'); + expect(parseComparisonEntry(runEntry)).toEqual({ + raw: runEntry, + date: '2026-06-14', + runId: '27489075807', + }); + }); + + it('parses the legacy baked-index form, ignoring the index', () => { + const legacy = '2026-06-14~r27489075807~3of3'; + expect(isRunComparisonEntry(legacy)).toBe(true); + expect(parseComparisonEntry(legacy)).toEqual({ + raw: legacy, + date: '2026-06-14', + runId: '27489075807', + }); + }); + + it('treats a plain date as a non-run entry', () => { + expect(parseComparisonEntry('2026-06-14')).toEqual({ raw: '2026-06-14', date: '2026-06-14' }); + expect(isRunComparisonEntry('2026-06-14')).toBe(false); + expect(comparisonEntryDate(runEntry)).toBe('2026-06-14'); + }); + + it('numbers run entries sequentially in chronological order, gap-free', () => { + // Two non-adjacent run ids (e.g. skipping a same-day MTP run) must still be #1, #2. 
+ const a = makeRunComparisonEntry('2026-06-14', '27485974465'); // earlier + const b = makeRunComparisonEntry('2026-06-14', '27489075807'); // later + const numbering = buildRunNumbering(['2026-06-13', b, a]); + expect(numbering.get(a)).toBe(1); + expect(numbering.get(b)).toBe(2); + expect(numbering.has('2026-06-13')).toBe(false); // plain dates unnumbered + }); + + it('labels run entries with their sequential number and plain dates as-is', () => { + const a = makeRunComparisonEntry('2026-06-14', '27485974465'); + const numbering = buildRunNumbering([a]); + expect(comparisonEntryLabel(a, numbering)).toBe('2026-06-14 #1'); + expect(comparisonEntryLabel('2026-06-14', numbering)).toBe('2026-06-14'); + expect(comparisonEntryLabel(a)).toBe('2026-06-14'); // no numbering → bare date + }); + + it('sorts by date then run id (plain date first within a day)', () => { + const later = makeRunComparisonEntry('2026-06-14', '300'); + const earlier = makeRunComparisonEntry('2026-06-14', '100'); + const sorted = ['2026-06-13', later, earlier, '2026-06-14'].toSorted((a, b) => { + const [ta, ia] = comparisonEntrySortValue(a); + const [tb, ib] = comparisonEntrySortValue(b); + return ta - tb || ia - ib; + }); + expect(sorted).toEqual(['2026-06-13', '2026-06-14', earlier, later]); + }); + + describe('resolveComparisonEntries', () => { + const range = { startDate: '2026-06-13', endDate: '2026-06-14' }; + + it('keeps both range endpoints when no run entries overlap', () => { + expect(resolveComparisonEntries([], range)).toEqual(['2026-06-13', '2026-06-14']); + }); + + it('drops a range endpoint whose date has specific run entries', () => { + const r1 = makeRunComparisonEntry('2026-06-14', '100'); + const r2 = makeRunComparisonEntry('2026-06-14', '200'); + // 2026-06-14 endpoint dropped (runs cover it); 2026-06-13 endpoint kept. 
+ expect(resolveComparisonEntries([r1, r2], range)).toEqual(['2026-06-13', r1, r2]); + }); + + it('returns just the added entries when no range is set', () => { + const r1 = makeRunComparisonEntry('2026-06-14', '100'); + expect(resolveComparisonEntries([r1], { startDate: '', endDate: '' })).toEqual([r1]); + }); + }); + + it('contains no characters that break a CSS class selector', () => { + expect(runEntry).not.toMatch(/[.#\s]/u); + }); +}); diff --git a/packages/app/src/components/inference/utils/comparisonEntry.ts b/packages/app/src/components/inference/utils/comparisonEntry.ts new file mode 100644 index 00000000..ad87cd2b --- /dev/null +++ b/packages/app/src/components/inference/utils/comparisonEntry.ts @@ -0,0 +1,122 @@ +/** + * Comparison selections (the `selectedDates` array / `i_dates` URL param) are + * plain strings so they flow unchanged through the date-keyed GPU comparison + * pipeline (grouping, activeDates, colors, legend). An entry is one of: + * + * - a plain date — "2026-06-14" → the whole day's latest run + * - a run entry — "2026-06-14~r27489075807" → one specific run + * + * The run's display number (#1, #2, …) is NOT baked into the string — it is + * derived at render time from the set of runs actually on the chart (see + * {@link buildRunNumbering}) so it is always sequential and in chronological + * order, and never goes stale when the run list changes. + * + * The separator is `~` (not `_`) because GPUGraph derives ids as + * `${entry}_${hwKey}_${precision}` and splits on the last `_`; a `_` in the entry + * would corrupt that. `~` is URL-safe and never appears in dates or run ids. + * (CSS selectors built from these ids are escaped — see d3-chart/layers/rooflines.) + */ + +// Accepts the current `date~r` form and the legacy `date~r~of` form +// (older saved selections) — the baked index, if present, is ignored. 
+const RUN_ENTRY_RE = /^(?<date>\d{4}-\d{2}-\d{2})~r(?<runId>\d+)(?:~\d+of\d+)?$/u; + +export interface ParsedComparisonEntry { + /** The original entry string (series key). */ + raw: string; + /** Calendar date the entry belongs to. */ + date: string; + /** GitHub run id, when this entry pins a specific run. */ + runId?: string; +} + +/** Build the entry string for a specific run within a date. */ +export function makeRunComparisonEntry(date: string, runId: string): string { + return `${date}~r${runId}`; +} + +/** Parse an entry string into its date and (optional) run components. */ +export function parseComparisonEntry(raw: string): ParsedComparisonEntry { + const m = RUN_ENTRY_RE.exec(raw); + if (!m?.groups) return { raw, date: raw }; + return { raw, date: m.groups.date, runId: m.groups.runId }; +} + +/** True when the entry pins a specific run (vs. the date's latest). */ +export function isRunComparisonEntry(raw: string): boolean { + return RUN_ENTRY_RE.test(raw); +} + +/** Underlying calendar date — used for chronological sorting and matching. */ +export function comparisonEntryDate(raw: string): string { + return parseComparisonEntry(raw).date; +} + +/** + * Sort key for ordering comparison series: by date, then by run id (which grows + * monotonically with time) so a date's runs read earliest → latest. A plain-date + * entry sorts first within its day (run id 0). + */ +export function comparisonEntrySortValue(raw: string): [number, number] { + const { date, runId } = parseComparisonEntry(raw); + const t = new Date(date).getTime(); + return [Number.isNaN(t) ? 0 : t, runId ? Number(runId) : 0]; +} + +/** + * Assign sequential, chronological 1-based numbers to the run entries in a set, + * grouped by date (run ids sort chronologically). Plain-date entries are not + * numbered. The result is gap-free regardless of which runs were selected. 
+ */ +export function buildRunNumbering(entries: string[]): Map<string, number> { + const byDate = new Map<string, ParsedComparisonEntry[]>(); + for (const raw of entries) { + const parsed = parseComparisonEntry(raw); + if (!parsed.runId) continue; + const list = byDate.get(parsed.date) ?? []; + list.push(parsed); + byDate.set(parsed.date, list); + } + const numbering = new Map<string, number>(); + for (const list of byDate.values()) { + list + .toSorted((a, b) => Number(a.runId) - Number(b.runId)) + .forEach((e, i) => { + numbering.set(e.raw, i + 1); + }); + } + return numbering; +} + +/** + * Human-readable label for legends/line labels, e.g. "2026-06-14 #2". Pass the + * numbering from {@link buildRunNumbering} (built from the chart's current series) + * so run entries get their sequential number; plain dates render as the date. + */ +export function comparisonEntryLabel(raw: string, numbering?: Map<string, number>): string { + const n = numbering?.get(raw); + return n ? `${comparisonEntryDate(raw)} #${n}` : comparisonEntryDate(raw); +} + +/** + * Resolve the final set of comparison series entries from the user's selections: + * the date-range endpoints plus individually-added dates/runs, de-duplicated. A + * range endpoint is dropped when that same date has specific run entries selected + * — the whole-day "latest" line would just duplicate one of the numbered runs. 
+ */ +export function resolveComparisonEntries( + selectedDates: string[], + range: { startDate: string; endDate: string }, +): string[] { + const datesWithRuns = new Set( + selectedDates.filter(isRunComparisonEntry).map(comparisonEntryDate), + ); + const entries: string[] = []; + if (range.startDate && range.endDate) { + for (const d of [range.startDate, range.endDate]) { + if (!datesWithRuns.has(d)) entries.push(d); + } + } + entries.push(...selectedDates); + return [...new Set(entries)]; +} diff --git a/packages/app/src/components/inference/utils/runEnumeration.test.ts b/packages/app/src/components/inference/utils/runEnumeration.test.ts new file mode 100644 index 00000000..fbb94280 --- /dev/null +++ b/packages/app/src/components/inference/utils/runEnumeration.test.ts @@ -0,0 +1,107 @@ +import { describe, expect, it } from 'vitest'; + +import type { RunConfigRow } from '@/lib/api'; + +import { dataRunsForDate } from './runEnumeration'; + +function rc(over: Partial<RunConfigRow>): RunConfigRow { + return { + github_run_id: 1, + run_started_at: '2026-06-14T00:00:00Z', + html_url: null, + head_sha: null, + model: 'minimaxm3', + precision: 'fp8', + hardware: 'mi300x', + framework: 'vllm', + spec_method: 'none', + disagg: false, + ...over, + }; +} + +const SCOPE = { + modelDbKeys: ['minimaxm3'], + selectedGPUs: ['mi300x_vllm'], + selectedPrecisions: ['fp8'], +}; + +describe('dataRunsForDate', () => { + it('enumerates distinct runs for the selected config, earliest first', () => { + const rows = [ + rc({ github_run_id: 27489075807, run_started_at: '2026-06-14T06:43:25Z' }), + rc({ github_run_id: 27485974465, run_started_at: '2026-06-14T04:08:16Z' }), + rc({ github_run_id: 27510667862, run_started_at: '2026-06-14T23:22:40Z' }), + ]; + const runs = dataRunsForDate(rows, SCOPE); + expect(runs.map((r) => r.runId)).toEqual(['27485974465', '27489075807', '27510667862']); + }); + + it('dedupes a run that appears in multiple matching rows into one entry', () => { + const rows = [ 
+ rc({ 
github_run_id: 100 }), + // same run id appearing again (e.g. another covered row) — still one run + rc({ github_run_id: 100 }), + ]; + const runs = dataRunsForDate(rows, SCOPE); + expect(runs).toHaveLength(1); + expect(runs[0].runId).toBe('100'); + }); + + it('excludes MTP runs when a non-MTP GPU key is selected', () => { + const rows = [ + rc({ github_run_id: 1, spec_method: 'none' }), + rc({ github_run_id: 2, spec_method: 'mtp' }), + ]; + const runs = dataRunsForDate(rows, SCOPE); + expect(runs.map((r) => r.runId)).toEqual(['1']); + }); + + it('includes only MTP runs when the MTP GPU key is selected', () => { + const rows = [ + rc({ github_run_id: 1, spec_method: 'none' }), + rc({ github_run_id: 2, spec_method: 'mtp' }), + ]; + const runs = dataRunsForDate(rows, { ...SCOPE, selectedGPUs: ['mi300x_vllm_mtp'] }); + expect(runs.map((r) => r.runId)).toEqual(['2']); + }); + + it('excludes runs for other models, precisions, and GPUs', () => { + const rows = [ + rc({ github_run_id: 1 }), // matches + rc({ github_run_id: 2, model: 'dsr1' }), // other model + rc({ github_run_id: 3, precision: 'fp4' }), // other precision + rc({ github_run_id: 4, hardware: 'b200' }), // other gpu + rc({ github_run_id: 5, framework: 'sglang' }), // other framework + ]; + const runs = dataRunsForDate(rows, SCOPE); + expect(runs.map((r) => r.runId)).toEqual(['1']); + }); + + it('includes a run for any selected GPU (union across GPUs)', () => { + const rows = [ + rc({ github_run_id: 1, hardware: 'mi300x', framework: 'vllm' }), + rc({ github_run_id: 2, hardware: 'b200', framework: 'vllm' }), + ]; + const runs = dataRunsForDate(rows, { ...SCOPE, selectedGPUs: ['mi300x_vllm', 'b200_vllm'] }); + expect(runs.map((r) => r.runId).toSorted()).toEqual(['1', '2']); + }); + + it('carries run url and head sha through', () => { + const rows = [ + rc({ + github_run_id: 7, + html_url: 'https://github.com/x/actions/runs/7', + head_sha: 'abc123', + }), + ]; + const [run] = dataRunsForDate(rows, SCOPE); + 
expect(run.runUrl).toBe('https://github.com/x/actions/runs/7'); + expect(run.headSha).toBe('abc123'); + }); + + it('returns nothing when no run matches the selection', () => { + expect(dataRunsForDate([], SCOPE)).toEqual([]); + expect(dataRunsForDate([rc({ model: 'dsr1' })], SCOPE)).toEqual([]); + }); +}); diff --git a/packages/app/src/components/inference/utils/runEnumeration.ts b/packages/app/src/components/inference/utils/runEnumeration.ts new file mode 100644 index 00000000..fda1ae30 --- /dev/null +++ b/packages/app/src/components/inference/utils/runEnumeration.ts @@ -0,0 +1,80 @@ +/** + * Enumerates the workflow runs that produced benchmark data for the currently + * selected model / GPU / precision on a given date. This is the single source of + * truth for "how many runs are on this date" used by both the changelog (to render + * a block per run) and the chart (to expand a plain-date selection into per-run + * series). + * + * It is intentionally DATA-driven (keyed off `runConfigs`, which comes from the + * benchmark rows) rather than changelog-driven: a run can ship data without a + * changelog entry, and that newest run is exactly the one the plain-date "latest" + * view shows — so enumerating from changelog entries alone would silently drop it. + * + * Runs are scoped to the selected GPUs using the canonical {@link getHardwareKey} + * so MTP and disagg variants (separate hw keys) are kept distinct, exactly as the + * chart keys them. + */ + +import type { AggDataEntry } from '@/components/inference/types'; +import type { RunConfigRow } from '@/lib/api'; +import { getHardwareKey } from '@/lib/chart-utils'; + +export interface DataRun { + /** GitHub run id (string). */ + runId: string; + /** ISO-8601 start time (or created_at fallback); orders runs chronologically. */ + runStartedAt: string; + /** Workflow run URL, when known. */ + runUrl?: string; + /** Head commit sha, for the Git Commit link. 
*/ + headSha?: string; +} + +export interface RunScope { + /** DB model keys for the selected display model, e.g. ['minimaxm3']. */ + modelDbKeys: string[]; + /** Selected GPU hw keys, e.g. ['mi300x_vllm']. */ + selectedGPUs: string[]; + /** Selected DB precisions, e.g. ['fp8']. */ + selectedPrecisions: string[]; +} + +/** The hw key a runConfig maps to, built the same way the chart builds series keys. */ +function runConfigHwKey(rc: RunConfigRow): string { + return getHardwareKey({ + hw: rc.hardware, + framework: rc.framework, + disagg: rc.disagg, + spec_decoding: rc.spec_method, + } as unknown as AggDataEntry); +} + +/** + * Distinct runs that produced data for the selected config on a date, earliest + * first. De-duplicated by run id; ordered by start time so the #1/#2/#3 the UI + * assigns read in the order the runs actually happened. + */ +export function dataRunsForDate(runConfigs: RunConfigRow[], scope: RunScope): DataRun[] { + const { modelDbKeys, selectedGPUs, selectedPrecisions } = scope; + const precSet = new Set(selectedPrecisions); + const gpuSet = new Set(selectedGPUs); + const byRun = new Map(); + + for (const rc of runConfigs) { + if (!modelDbKeys.includes(rc.model)) continue; + if (!precSet.has(rc.precision)) continue; + if (!gpuSet.has(runConfigHwKey(rc))) continue; + + const id = String(rc.github_run_id); + if (!byRun.has(id)) { + byRun.set(id, { + runId: id, + runStartedAt: rc.run_started_at ?? '', + runUrl: rc.html_url ?? undefined, + headSha: rc.head_sha ?? 
undefined, + }); + } + } + + return [...byRun.values()].toSorted((a, b) => a.runStartedAt.localeCompare(b.runStartedAt)); +} From a4855d29ad8e895fca98c766782f97f1c2908a5e Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Mon, 15 Jun 2026 00:23:07 -0500 Subject: [PATCH 3/3] feat(inference): compare individual runs with two-way legend/changelog binding --- .../components/inference/InferenceContext.tsx | 13 +- .../inference/hooks/useChartData.ts | 27 +- .../app/src/components/inference/types.ts | 11 +- .../components/inference/ui/ChartDisplay.tsx | 86 ++++- .../inference/ui/ComparisonChangelog.tsx | 358 +++++++++++++----- .../src/components/inference/ui/GPUGraph.tsx | 85 ++++- .../hooks/api/use-comparison-changelogs.ts | 72 +++- .../app/src/lib/d3-chart/layers/rooflines.ts | 12 +- 8 files changed, 514 insertions(+), 150 deletions(-) diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx index c2a6a2e7..f8e9f647 100644 --- a/packages/app/src/components/inference/InferenceContext.tsx +++ b/packages/app/src/components/inference/InferenceContext.tsx @@ -2,6 +2,7 @@ import { type ReactNode, + type SetStateAction, createContext, useCallback, useContext, @@ -57,6 +58,7 @@ import { import { filterRunsByModel, getDisplayLabel } from '@/lib/utils'; import { useChartData } from './hooks/useChartData'; +import { resolveComparisonEntries } from './utils/comparisonEntry'; /** @internal Exported for test provider wrapping only. 
*/ export const InferenceContext = createContext(undefined); @@ -416,7 +418,10 @@ export function InferenceProvider({ [setSelectedGPUs, clearPresetOnChange], ); const setSelectedDatesAndClear = useCallback( - (v: string[]) => { + // Accept a React state updater (value OR function) so callers adding several + // dates/runs in quick succession can use the functional form and avoid the + // stale-closure race where each click overwrites the last. + (v: SetStateAction) => { setSelectedDates(v); clearPresetOnChange(); }, @@ -564,11 +569,7 @@ export function InferenceProvider({ ); const allDateIds = useMemo(() => { - const dates: string[] = []; - if (selectedDateRange.startDate && selectedDateRange.endDate) { - dates.push(selectedDateRange.startDate, selectedDateRange.endDate); - } - dates.push(...selectedDates); + const dates = resolveComparisonEntries(selectedDates, selectedDateRange); const allIds = new Set(); selectedGPUs.forEach((gpu) => { dates.forEach((date) => allIds.add(`${date}_${gpu}`)); diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts index 3797318c..da8ac27a 100644 --- a/packages/app/src/components/inference/hooks/useChartData.ts +++ b/packages/app/src/components/inference/hooks/useChartData.ts @@ -13,6 +13,10 @@ import type { YAxisMetricKey, } from '@/components/inference/types'; import { filterDataByCostLimit } from '@/components/inference/utils'; +import { + parseComparisonEntry, + resolveComparisonEntries, +} from '@/components/inference/utils/comparisonEntry'; import { useBenchmarks, benchmarkQueryOptions } from '@/hooks/api/use-benchmarks'; import { GPU_ALIAS_TO_CANONICAL, @@ -31,12 +35,11 @@ export function buildComparisonDates( selectedRunDate: string | undefined, ): string[] { if (selectedGPUs.length === 0) return []; - const dates: string[] = []; - if (selectedDateRange.startDate && selectedDateRange.endDate) { - dates.push(selectedDateRange.startDate, 
selectedDateRange.endDate); - } - dates.push(...selectedDates); - return [...new Set(dates.filter((d) => d !== selectedRunDate))]; + // Range endpoints + individually-added dates/runs (redundant same-day range + // endpoints dropped), minus the main run date which the primary query covers. + return resolveComparisonEntries(selectedDates, selectedDateRange).filter( + (d) => d !== selectedRunDate, + ); } /** Filter data by GPU key, resolving aliases to canonical keys. */ @@ -116,10 +119,16 @@ export function useChartData( [selectedGPUs, selectedDates, selectedDateRange, selectedRunDate], ); + // Each comparison entry is either a plain date (latest run that day, exact-date + // query) or a specific run encoded as `date~r~of` (exact-run query) so + // multiple same-day runs can be compared as distinct series. const comparisonQueries = useQueries({ - queries: comparisonDates.map((date) => - benchmarkQueryOptions(selectedModel, date, enabled, true), - ), + queries: comparisonDates.map((entry) => { + const parsed = parseComparisonEntry(entry); + return parsed.runId + ? benchmarkQueryOptions(selectedModel, '', enabled, false, parsed.runId, true) + : benchmarkQueryOptions(selectedModel, entry, enabled, true); + }), }); const comparisonLoading = comparisonQueries.some((q) => q.isLoading); diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts index cbf64787..cf468741 100644 --- a/packages/app/src/components/inference/types.ts +++ b/packages/app/src/components/inference/types.ts @@ -481,6 +481,14 @@ export interface ScatterGraphProps { * playback). */ niceAxes?: boolean; + /** + * Stable run numbering (entry string `date~rRunId` → 1-based number) shared with + * the comparison changelog so legend labels match it exactly. Numbers index ALL + * of a date's runs (not just the ones on the chart), so a removed run leaves a + * gap that lines up with the changelog's still-listed "Add to chart" run. 
When + * omitted, GPUGraph falls back to gap-free numbering of the on-chart series. + */ + runNumbering?: Map; } /** * @file types.ts @@ -639,7 +647,8 @@ export interface InferenceChartContextType { setSelectedGPUs: (gpus: string[]) => void; availableGPUs: { value: string; label: string }[]; selectedDates: string[]; - setSelectedDates: (dates: string[]) => void; + /** Accepts a value or a state-updater fn (for safe rapid successive adds). */ + setSelectedDates: (dates: string[] | ((prev: string[]) => string[])) => void; selectedDateRange: { startDate: string; endDate: string }; setSelectedDateRange: (dateRange: { startDate: string; endDate: string }) => void; userCosts: Record | null; diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx index 47dfe600..3c44a433 100644 --- a/packages/app/src/components/inference/ui/ChartDisplay.tsx +++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx @@ -1,7 +1,8 @@ 'use client'; +import { DISPLAY_MODEL_TO_DB } from '@semianalysisai/inferencex-constants'; import { track } from '@/lib/analytics'; import dynamic from 'next/dynamic'; -import { useMemo, useRef, useState } from 'react'; +import { useEffect, useMemo, useRef, useState } from 'react'; import { BarChart3, ChevronDown, Table2, X } from 'lucide-react'; import chartDefinitions from '@/components/inference/inference-chart-config.json'; @@ -14,6 +15,11 @@ import type { TrendDataPoint, } from '@/components/inference/types'; import { processOverlayChartData } from '@/components/inference/utils'; +import { + isRunComparisonEntry, + makeRunComparisonEntry, +} from '@/components/inference/utils/comparisonEntry'; +import { dataRunsForDate } from '@/components/inference/utils/runEnumeration'; import InferenceTable from '@/components/inference/ui/InferenceTable'; import ScatterGraph from '@/components/inference/ui/ScatterGraph'; import { Card } from '@/components/ui/card'; @@ -165,6 +171,58 @@ export 
default function ChartDisplay() { totalDatesQueried, } = useComparisonChangelogs(selectedGPUs, selectedDateRange, dateRangeAvailableDates); + const modelDbKeys = useMemo( + () => DISPLAY_MODEL_TO_DB[selectedModel] ?? [selectedModel], + [selectedModel], + ); + + // Stable run numbering shared by the changelog and the chart legend: each of a + // date's runs gets a fixed 1-based number (by start time) regardless of which + // are on the chart, so the two surfaces always show the same #N for a run and a + // removed run leaves a matching gap. Built from the same data-run enumeration + // the changelog uses. + const runNumbering = useMemo(() => { + const map = new Map(); + for (const c of changelogs) { + dataRunsForDate(c.runConfigs, { modelDbKeys, selectedGPUs, selectedPrecisions }).forEach( + (run, idx) => { + map.set(makeRunComparisonEntry(c.date, run.runId), idx + 1); + }, + ); + } + return map; + }, [changelogs, modelDbKeys, selectedGPUs, selectedPrecisions]); + + // Expand a plain-date selection into one entry per run once that date's runs are + // known. Picking a date that has multiple runs shows each run as its own series + // (matching the changelog, which renders a block per run) instead of a single + // merged "latest" line with no changelog row — keeping the legend and changelog + // in sync. Idempotent: after expansion no expandable plain date remains. + useEffect(() => { + const runConfigsByDate = new Map(changelogs.map((c) => [c.date, c.runConfigs])); + const scope = { modelDbKeys, selectedGPUs, selectedPrecisions }; + setSelectedDates((prev) => { + let changed = false; + const out: string[] = []; + for (const entry of prev) { + if (isRunComparisonEntry(entry)) { + out.push(entry); + continue; + } + const rc = runConfigsByDate.get(entry); + const runs = rc ? 
dataRunsForDate(rc, scope) : []; + if (runs.length > 1) { + changed = true; + for (const run of runs) out.push(makeRunComparisonEntry(entry, run.runId)); + } else { + out.push(entry); + } + } + if (!changed) return prev; + return [...new Set(out)]; + }); + }, [changelogs, modelDbKeys, selectedGPUs, selectedPrecisions, selectedDates, setSelectedDates]); + const [viewModes, setViewModes] = useState>({}); const replayHandlesRef = useRef>({}); const getViewMode = (index: number): InferenceViewMode => viewModes[index] ?? 'chart'; @@ -342,7 +400,7 @@ export default function ChartDisplay() { - + )) : effectiveGraphs.length === 0 @@ -546,9 +604,9 @@ export default function ChartDisplay() { ); } - return selectedDateRange.startDate && - selectedDateRange.endDate && - selectedGPUs.length > 0 ? ( + return selectedGPUs.length > 0 && + ((selectedDateRange.startDate && selectedDateRange.endDate) || + selectedDates.length > 0) ? ( ) : (
@@ -583,10 +642,11 @@ export default function ChartDisplay() { } /> {selectedGPUs.length > 0 && - (!selectedDateRange.startDate || !selectedDateRange.endDate) && ( + (!selectedDateRange.startDate || !selectedDateRange.endDate) && + selectedDates.length === 0 && (

- Select a date range to view GPU comparison + Select a date range or add a run to view GPU comparison

)} @@ -637,21 +697,21 @@ export default function ChartDisplay() { changelogs={changelogs} selectedGPUs={selectedGPUs} selectedPrecisions={selectedPrecisions} + modelDbKeys={modelDbKeys} loading={changelogsLoading} totalDatesQueried={totalDatesQueried} selectedDates={selectedDates} selectedDateRange={selectedDateRange} onAddDate={(date) => { - if (!selectedDates.includes(date)) { - setSelectedDates([...selectedDates, date]); - } + // Functional updater: adding several runs in quick succession must + // each build on the latest state, not the value captured at render. + setSelectedDates((prev) => (prev.includes(date) ? prev : [...prev, date])); }} onRemoveDate={(date) => { - setSelectedDates(selectedDates.filter((d) => d !== date)); + setSelectedDates((prev) => prev.filter((d) => d !== date)); }} onAddAllDates={(dates) => { - const merged = [...new Set([...selectedDates, ...dates])]; - setSelectedDates(merged); + setSelectedDates((prev) => [...new Set([...prev, ...dates])]); }} firstAvailableDate={dateRangeAvailableDates[0]} /> diff --git a/packages/app/src/components/inference/ui/ComparisonChangelog.tsx b/packages/app/src/components/inference/ui/ComparisonChangelog.tsx index 8106fe90..61a7e28c 100644 --- a/packages/app/src/components/inference/ui/ComparisonChangelog.tsx +++ b/packages/app/src/components/inference/ui/ComparisonChangelog.tsx @@ -11,13 +11,35 @@ import { configKeyMatchesHwKey, formatChangelogDescription, } from '@/components/inference/utils/changelogFormatters'; +import { makeRunComparisonEntry } from '@/components/inference/utils/comparisonEntry'; +import { dataRunsForDate } from '@/components/inference/utils/runEnumeration'; import { getHardwareConfig } from '@/lib/constants'; import { getDisplayLabel, updateRepoUrl } from '@/lib/utils'; +/** One changelog entry's description. The GPU and run # are shown in the entry title. 
*/ +function renderDescription( + entry: { config_keys: string[]; description: string }, + key: number | string, +) { + return ( +
+ {formatChangelogDescription(entry.description)} +
+ ); +} + interface ComparisonChangelogProps { changelogs: ComparisonChangelogType[]; selectedGPUs: string[]; selectedPrecisions: string[]; + /** + * DB model keys for the currently selected model (e.g. ['dsv4']). Changelog + * config keys are `<model>-<precision>-<gpu>-<framework>` and a GPU+framework + * like `b200-vllm` is shared across models, so without this filter the run list + * would offer other models' runs — which then plot nothing (the data fetch is + * model-scoped). + */ + modelDbKeys: string[]; loading?: boolean; totalDatesQueried: number; selectedDates: string[]; @@ -33,6 +55,7 @@ export default function ComparisonChangelog({ changelogs, selectedGPUs, selectedPrecisions, + modelDbKeys, loading, totalDatesQueried, selectedDates, @@ -63,7 +86,9 @@ export default function ComparisonChangelog({ entry.config_keys.some((key) => { const precision = key.split('-')[1]; return ( - precSet.has(precision) && selectedGPUs.some((gpu) => configKeyMatchesHwKey(key, gpu)) + modelDbKeys.some((m) => key.startsWith(`${m}-`)) && + precSet.has(precision) && + selectedGPUs.some((gpu) => configKeyMatchesHwKey(key, gpu)) ); }), ), @@ -72,14 +97,14 @@ export default function ComparisonChangelog({ // Ensure pinned dates are always present for (const date of pinnedDates) { if (!mapped.some((item) => item.date === date)) { - mapped.push({ date, entries: [] }); + mapped.push({ date, entries: [], runs: [], runConfigs: [] }); } } return mapped .filter((item) => item.entries.length > 0 || pinnedDates.has(item.date)) .toSorted((a, b) => new Date(a.date).getTime() - new Date(b.date).getTime()); - }, [changelogs, selectedGPUs, selectedPrecisions, pinnedDates]); + }, [changelogs, modelDbKeys, selectedGPUs, selectedPrecisions, pinnedDates]); const datesOnChart = useMemo(() => { const set = new Set(selectedDates); @@ -88,17 +113,82 @@ export default function ComparisonChangelog({ return set; }, [selectedDates, selectedDateRange]); - const addableDates = useMemo( - () => filteredChangelogs.map((c) => c.date).filter((d) => 
!datesOnChart.has(d)), - [filteredChangelogs, datesOnChart], + // True when a changelog entry touches one of the selected GPU configs at a + // selected precision — the same predicate used to filter the date list, reused + // to attach changelog notes to the runs that are worth offering as series. + const entryMatchesSelection = useMemo(() => { + const precSet = new Set(selectedPrecisions); + return (configKeys: string[]): boolean => + configKeys.some((key) => { + const precision = key.split('-')[1]; + return ( + modelDbKeys.some((m) => key.startsWith(`${m}-`)) && + precSet.has(precision) && + selectedGPUs.some((gpu) => configKeyMatchesHwKey(key, gpu)) + ); + }); + }, [modelDbKeys, selectedPrecisions, selectedGPUs]); + + /** + * Every run that produced data for the selected config on a date, earliest + * first, with its changelog notes (if any) attached. Data-driven so a run that + * shipped data without a changelog entry still appears as its own series. + */ + const runMetaFor = useMemo( + () => (item: (typeof filteredChangelogs)[number]) => { + const clByRun = new Map(item.runs.map((r) => [r.runId, r])); + return dataRunsForDate(item.runConfigs, { + modelDbKeys, + selectedGPUs, + selectedPrecisions, + }).map((run) => { + const cl = clByRun.get(run.runId); + return { + runId: run.runId, + headRef: cl?.headRef ?? run.headSha, + runUrl: cl?.runUrl ?? run.runUrl, + entries: (cl?.entries ?? []).filter((e) => entryMatchesSelection(e.config_keys)), + }; + }); + }, + [modelDbKeys, selectedGPUs, selectedPrecisions, entryMatchesSelection], ); + // Entries the "Add all to chart" button would add: every run not yet on the + // chart (run-level for multi-run dates, the plain date for single-run dates). 
+ const addableEntries = useMemo(() => { + const out: string[] = []; + for (const item of filteredChangelogs) { + const runs = runMetaFor(item); + if (runs.length > 1) { + for (const run of runs) { + const entry = makeRunComparisonEntry(item.date, run.runId); + if (!selectedDates.includes(entry)) out.push(entry); + } + } else if (!datesOnChart.has(item.date)) { + out.push(item.date); + } + } + return out; + }, [filteredChangelogs, runMetaFor, selectedDates, datesOnChart]); + const handleToggle = () => { const newState = !isExpanded; setIsExpanded(newState); track('inference_comparison_changelog_toggled', { expanded: newState }); }; + /** Display labels of the selected GPUs that a set of changelog entries touches. */ + const gpuLabelsFor = (entries: { config_keys: string[] }[]): string => { + if (selectedGPUs.length <= 1) return ''; + return selectedGPUs + .filter((gpu) => + entries.some((e) => e.config_keys.some((k) => configKeyMatchesHwKey(k, gpu))), + ) + .map((gpu) => getDisplayLabel(getHardwareConfig(gpu))) + .join(', '); + }; + const label = filteredChangelogs.length > 0 ? `Config Changelog (${filteredChangelogs.length} date${filteredChangelogs.length === 1 ? '' : 's'} with changes)` @@ -125,12 +215,12 @@ export default function ComparisonChangelog({ )} - {isExpanded && addableDates.length > 0 && ( + {isExpanded && addableEntries.length > 0 && ( + ) : ( + + )} +
+ {run.entries.length > 0 ? ( + run.entries.map((e, i) => renderDescription(e, i)) + ) : ( + + No changelog notes for this run + )} - {item.runUrl && ( - + ); + }); + } + + // Single (or no) matching run → one block keyed by the date. + const dateGpuLabel = gpuLabelsFor(item.entries); + return ( +
+
+ + {item.date} + {dateGpuLabel ? ` ${dateGpuLabel}` : ''} + + {item.entries.length > 0 && ( + <> + + {item.headRef && ( + + Git Commit + + + )} + {item.runUrl && ( + + Workflow Run + + + )} + + )} + {datesOnChart.has(item.date) ? ( + selectedDates.includes(item.date) ? ( + + ) : ( + + + On chart + + ) + ) : ( - ) : ( - - - On chart - - ) + )} +
+ {item.entries.length > 0 ? ( + item.entries.map((entry, entryIndex) => renderDescription(entry, entryIndex)) ) : ( - + + {item.date === firstAvailableDate + ? 'First benchmark run for this configuration' + : item.date < '2025-12-30' + ? 'No changelog data (tracking began Dec 30, 2025)' + : filteredChangelogs.some( + (c) => c.date < item.date && c.entries.length > 0, + ) + ? 'No config changes — same configuration as previous run' + : 'Initial configuration — no changelog entry recorded'} + )}
- {item.entries.length > 0 ? ( - item.entries.map((entry, entryIndex) => ( -
- {selectedGPUs.length > 1 && - (() => { - const matchingGpus = selectedGPUs.filter((gpu) => - entry.config_keys.some((key) => configKeyMatchesHwKey(key, gpu)), - ); - const labels = matchingGpus.map((gpu) => - getDisplayLabel(getHardwareConfig(gpu)), - ); - return labels.length > 0 ? ( - - {labels.join(', ')} - - ) : null; - })()} - {formatChangelogDescription(entry.description)} -
- )) - ) : ( - - {item.date === firstAvailableDate - ? 'First benchmark run for this configuration' - : item.date < '2025-12-30' - ? 'No changelog data (tracking began Dec 30, 2025)' - : filteredChangelogs.some((c) => c.date < item.date && c.entries.length > 0) - ? 'No config changes — same configuration as previous run' - : 'Initial configuration — no changelog entry recorded'} - - )} - - )) + ); + }) )} diff --git a/packages/app/src/components/inference/ui/GPUGraph.tsx b/packages/app/src/components/inference/ui/GPUGraph.tsx index a6e76fcc..e7737a2e 100644 --- a/packages/app/src/components/inference/ui/GPUGraph.tsx +++ b/packages/app/src/components/inference/ui/GPUGraph.tsx @@ -38,6 +38,12 @@ import type { InferenceData, ScatterGraphProps, } from '@/components/inference/types'; +import { + buildRunNumbering, + comparisonEntryLabel, + comparisonEntrySortValue, + resolveComparisonEntries, +} from '@/components/inference/utils/comparisonEntry'; import { generateGPUGraphTooltipContent, getPointLabel, @@ -55,16 +61,24 @@ const CHART_MARGIN = { top: 24, right: 10, bottom: 60, left: 60 }; // both dimensions of the GPU comparison view are legible on the chart, // not only the legend. Falls back to the raw hwKey if the config // lookup misses (legacy data). -function labelTextFor(pts: InferenceData[]): string { +function labelTextFor(pts: InferenceData[], numbering: Map): string { const hwKey = String(pts[0].hwKey); - const date = String(pts[0].date); const cfg = getHardwareConfig(hwKey); const hwLabel = cfg ? 
getDisplayLabel(cfg) : hwKey; - return `${hwLabel} • ${date}`; + return `${hwLabel} • ${comparisonEntryLabel(String(pts[0].date), numbering)}`; } const GPUGraph = React.memo( - ({ chartId, modelLabel, data, xLabel, yLabel, chartDefinition, caption }: ScatterGraphProps) => { + ({ + chartId, + modelLabel, + data, + xLabel, + yLabel, + chartDefinition, + caption, + runNumbering: providedRunNumbering, + }: ScatterGraphProps) => { const { hardwareConfig, selectedPrecisions, @@ -72,6 +86,7 @@ const GPUGraph = React.memo( selectedGPUs, selectedDateRange, selectedDates, + setSelectedDates, toggleActiveDate, removeActiveDate, activeDates, @@ -94,21 +109,48 @@ const GPUGraph = React.memo( const { resolvedTheme } = useTheme(); const chartRef = useRef(null); - // Shared date+GPU pairs + // Shared date+GPU pairs. `dates` holds comparison-series entries (plain dates + // and/or specific-run entries); a same-day range endpoint is dropped when that + // date also has run entries (resolveComparisonEntries), then sorted earliest → + // latest so a day's runs read #1 → #N. const gpuDatePairs = useMemo(() => { - const dates: string[] = []; - if (selectedDateRange.startDate && selectedDateRange.endDate && selectedGPUs.length > 0) { - dates.push(selectedDateRange.startDate, selectedDateRange.endDate); - } - dates.push(...selectedDates); - const deduplicated = [...new Set(dates)]; - deduplicated.sort((a, b) => new Date(a).getTime() - new Date(b).getTime()); + const deduplicated = resolveComparisonEntries(selectedDates, selectedDateRange); + deduplicated.sort((a, b) => { + const [ta, ia] = comparisonEntrySortValue(a); + const [tb, ib] = comparisonEntrySortValue(b); + return ta - tb || ia - ib; + }); const sortedGPUs = [...selectedGPUs].toSorted( (a, b) => getModelSortIndex(a) - getModelSortIndex(b) || a.localeCompare(b), ); return { dates: deduplicated, sortedGPUs }; }, [selectedDateRange, selectedDates, selectedGPUs]); + // Run numbers for legend/line labels. 
Prefer the stable numbering passed by + // the parent (shared with the changelog, so labels match it and removed runs + // leave a gap); fall back to gap-free numbering of the on-chart series. + const runNumbering = useMemo( + () => providedRunNumbering ?? buildRunNumbering(gpuDatePairs.dates), + [providedRunNumbering, gpuDatePairs.dates], + ); + + // Removing a series from the legend should also drop it from the comparison + // selection so the config changelog stays in sync (two-way binding). Legend + // ids are `${entry}_${gpu}`; strip the gpu suffix to recover the entry. Range + // endpoints aren't individual selections, so those fall back to a visibility hide. + const handleLegendRemove = useCallback( + (id: string) => { + const gpu = selectedGPUs.find((g) => id.endsWith(`_${g}`)); + const entry = gpu ? id.slice(0, id.length - gpu.length - 1) : id; + if (selectedDates.includes(entry)) { + setSelectedDates((prev) => prev.filter((e) => e !== entry)); + } else { + removeActiveDate(id); + } + }, + [selectedGPUs, selectedDates, setSelectedDates, removeActiveDate], + ); + const graphIdentifiers = useMemo(() => { const ids: string[] = []; gpuDatePairs.sortedGPUs.forEach((gpu) => @@ -391,7 +433,7 @@ const GPUGraph = React.memo( pts[Math.max(0, Math.floor((pts.length * 2) / 3))], pts.at(-1)!, ]; - const labelText = labelTextFor(pts); + const labelText = labelTextFor(pts, runNumbering); let placedLabel = false; for (const pt of candidates) { const px = xScale(pt.x); @@ -431,7 +473,7 @@ const GPUGraph = React.memo( lineLabels.push({ key, graphId, - label: labelTextFor(pts), + label: labelTextFor(pts, runNumbering), color: getRooflineColor(key), x: xScale(pt.x), y: yScale(pt.y), @@ -596,7 +638,14 @@ const GPUGraph = React.memo( }); }, }), - [showLineLabels, rooflines, isRooflineVisible, getRooflineColor, chartDefinition.chartType], + [ + showLineLabels, + rooflines, + isRooflineVisible, + getRooflineColor, + chartDefinition.chartType, + runNumbering, + ], ); // Dismiss 
tooltip when pinned point's combo is hidden @@ -782,13 +831,13 @@ const GPUGraph = React.memo( disableActiveSort={true} onItemHover={handleLegendHover} onItemHoverEnd={handleLegendHoverEnd} - onItemRemove={removeActiveDate} + onItemRemove={handleLegendRemove} legendItems={allGraphs .filter(({ id }) => idsWithData.has(id)) .map(({ date, color, hwKey, id }) => ({ - name: `${hwKey} ${date}`, + name: `${hwKey} ${comparisonEntryLabel(date, runNumbering)}`, hw: id, - label: date, + label: comparisonEntryLabel(date, runNumbering), color, title: getDisplayLabel(getHardwareConfig(hwKey)), isActive: activeDates.has(id), diff --git a/packages/app/src/hooks/api/use-comparison-changelogs.ts b/packages/app/src/hooks/api/use-comparison-changelogs.ts index 7d3e2556..fd6a5ca9 100644 --- a/packages/app/src/hooks/api/use-comparison-changelogs.ts +++ b/packages/app/src/hooks/api/use-comparison-changelogs.ts @@ -1,17 +1,44 @@ import { useQueries } from '@tanstack/react-query'; import { useMemo } from 'react'; -import { fetchWorkflowInfo, type ChangelogRow, type WorkflowInfoResponse } from '@/lib/api'; +import { + fetchWorkflowInfo, + type ChangelogRow, + type RunConfigRow, + type WorkflowInfoResponse, +} from '@/lib/api'; + +export interface ComparisonChangelogEntry { + config_keys: string[]; + description: string; + pr_link: string | null; +} + +/** One workflow run on a date, with its own changelog entries. */ +export interface ComparisonRun { + /** GitHub run id (string). */ + runId: string; + headRef?: string; + runUrl?: string; + /** Changelog entries attributed to this specific run. */ + entries: ComparisonChangelogEntry[]; +} export interface ComparisonChangelog { date: string; headRef?: string; runUrl?: string; - entries: { - config_keys: string[]; - description: string; - pr_link: string | null; - }[]; + /** All of the date's changelog entries (flattened across runs). 
*/ + entries: ComparisonChangelogEntry[]; + /** Individual runs on this date, in chronological order (earliest first). */ + runs: ComparisonRun[]; + /** + * Per-(run, config) coverage from the benchmark data itself. Used to enumerate + * every run that produced data on this date — including runs without a changelog + * entry, which `runs` omits. The comparison UI keys run series off this so the + * newest run never silently vanishes just because it lacked changelog notes. + */ + runConfigs: RunConfigRow[]; } export function useComparisonChangelogs( @@ -57,6 +84,37 @@ export function useComparisonChangelogs( const data = query.data as WorkflowInfoResponse; if (!data.changelogs || data.changelogs.length === 0) continue; + // Group changelog entries by the run that produced them. In the API + // response, changelog.workflow_run_id is the GitHub run id (see + // getChangelogByDate's `wr.github_run_id as workflow_run_id`). + const entriesByRun = new Map(); + for (const c of data.changelogs) { + const list = entriesByRun.get(c.workflow_run_id) ?? []; + list.push(c); + entriesByRun.set(c.workflow_run_id, list); + } + + // Order runs chronologically (earliest first) so the #1/#2/#3 indices the + // UI assigns read in the order the runs actually happened. + const orderedRuns = [...data.runs].toSorted((a, b) => + a.created_at.localeCompare(b.created_at), + ); + const runs: ComparisonRun[] = orderedRuns + .map((run) => { + const runEntries = entriesByRun.get(run.github_run_id) ?? []; + return { + runId: String(run.github_run_id), + headRef: runEntries.at(-1)?.head_ref, + runUrl: run.html_url ?? 
undefined, + entries: runEntries.map((c) => ({ + config_keys: c.config_keys, + description: c.description, + pr_link: c.pr_link, + })), + }; + }) + .filter((r) => r.entries.length > 0); + results.push({ date: datesToQuery[i], headRef: data.changelogs.at(-1)?.head_ref, @@ -66,6 +124,8 @@ export function useComparisonChangelogs( description: c.description, pr_link: c.pr_link, })), + runs, + runConfigs: data.runConfigs ?? [], }); } diff --git a/packages/app/src/lib/d3-chart/layers/rooflines.ts b/packages/app/src/lib/d3-chart/layers/rooflines.ts index 6f6b36d1..5513ef12 100644 --- a/packages/app/src/lib/d3-chart/layers/rooflines.ts +++ b/packages/app/src/lib/d3-chart/layers/rooflines.ts @@ -87,9 +87,19 @@ export function updateRooflinesOnZoom( Object.entries(rooflines).forEach(([key, points]) => { if (points.length < 2) return; - const selection = zoomGroup.select(`.roofline-${key}`); + // Keys can contain characters that are invalid in a CSS selector (e.g. `~` + // from run-comparison series ids), so escape before selecting by class. + const selection = zoomGroup.select(`.${cssEscapeToken(`roofline-${key}`)}`); if (!selection.empty()) { selection.attr('d', lineGenerator(points) as string); } }); } + +/** Escape a class token for safe use in a CSS selector. */ +function cssEscapeToken(token: string): string { + if (typeof CSS !== 'undefined' && typeof CSS.escape === 'function') return CSS.escape(token); + // Fallback (non-DOM environments): escape everything outside the CSS-safe set. + // The token always starts with "roofline-", so a leading digit is never escaped. + return token.replaceAll(/[^a-zA-Z0-9_-]/gu, (c) => `\\${c}`); +}