Skip to content

Commit 8d8f505

Browse files
authored
feat: compare by run (#462)
* feat(inference): add per-run benchmark fetch and run-coverage queries
* feat(inference): add comparison run-entry model and run enumeration helpers
* feat(inference): compare individual runs with two-way legend/changelog binding
1 parent 528c414 commit 8d8f505

22 files changed

Lines changed: 1201 additions & 167 deletions

packages/app/src/app/api/v1/benchmarks/route.test.ts

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
import { describe, expect, it, vi, beforeEach } from 'vitest';
22

3-
const { mockGetLatestBenchmarks, mockGetDb } = vi.hoisted(() => ({
3+
const { mockGetLatestBenchmarks, mockGetBenchmarksForRun, mockGetDb } = vi.hoisted(() => ({
44
mockGetLatestBenchmarks: vi.fn(),
5+
mockGetBenchmarksForRun: vi.fn(),
56
mockGetDb: vi.fn(() => 'mock-sql'),
67
}));
78

@@ -13,6 +14,7 @@ vi.mock('@semianalysisai/inferencex-db/connection', () => ({
1314

1415
vi.mock('@semianalysisai/inferencex-db/queries/benchmarks', () => ({
1516
getLatestBenchmarks: mockGetLatestBenchmarks,
17+
getBenchmarksForRun: mockGetBenchmarksForRun,
1618
}));
1719

1820
vi.mock('@/lib/api-cache', () => ({
@@ -125,6 +127,28 @@ describe('GET /api/v1/benchmarks', () => {
125127
);
126128
});
127129

130+
it('routes exactRun=true + runId to the exact-run query', async () => {
131+
const runRows = [{ id: 1, hardware: 'mi300x' }];
132+
mockGetBenchmarksForRun.mockResolvedValueOnce(runRows);
133+
134+
const res = await GET(
135+
req('/api/v1/benchmarks?model=DeepSeek-R1-0528&runId=27489075807&exactRun=true'),
136+
);
137+
expect(res.status).toBe(200);
138+
expect(await res.json()).toEqual(runRows);
139+
expect(mockGetBenchmarksForRun).toHaveBeenCalledWith('mock-sql', ['dsr1'], '27489075807');
140+
expect(mockGetLatestBenchmarks).not.toHaveBeenCalled();
141+
});
142+
143+
it('ignores exactRun without a runId (falls back to latest)', async () => {
144+
mockGetLatestBenchmarks.mockResolvedValueOnce([]);
145+
146+
const res = await GET(req('/api/v1/benchmarks?model=DeepSeek-R1-0528&exactRun=true'));
147+
expect(res.status).toBe(200);
148+
expect(mockGetBenchmarksForRun).not.toHaveBeenCalled();
149+
expect(mockGetLatestBenchmarks).toHaveBeenCalled();
150+
});
151+
128152
it('returns 500 when query throws', async () => {
129153
mockGetLatestBenchmarks.mockRejectedValueOnce(new Error('DB down'));
130154

packages/app/src/app/api/v1/benchmarks/route.ts

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,10 @@ import { type NextRequest, NextResponse } from 'next/server';
33
import { DISPLAY_MODEL_TO_DB } from '@semianalysisai/inferencex-constants';
44
import { FIXTURES_MODE, JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection';
55
import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider';
6-
import { getLatestBenchmarks } from '@semianalysisai/inferencex-db/queries/benchmarks';
6+
import {
7+
getBenchmarksForRun,
8+
getLatestBenchmarks,
9+
} from '@semianalysisai/inferencex-db/queries/benchmarks';
710

811
import { cachedJson, cachedQuery } from '@/lib/api-cache';
912
import { loadFixture } from '@/lib/test-fixtures';
@@ -20,6 +23,17 @@ const getCachedBenchmarks = cachedQuery(
2023
{ blobOnly: true },
2124
);
2225

26+
// Exactly one run's results (GPU comparison of individual same-day runs). Cached
27+
// under a distinct key prefix so it never collides with the latest/as-of query.
28+
const getCachedBenchmarksForRun = cachedQuery(
29+
(dbModelKeys: string[], runId: string) => {
30+
if (JSON_MODE) return Promise.resolve(jsonProvider.getBenchmarksForRun(dbModelKeys, runId));
31+
return getBenchmarksForRun(getDb(), dbModelKeys, runId);
32+
},
33+
'benchmarks-run',
34+
{ blobOnly: true },
35+
);
36+
2337
export async function GET(request: NextRequest) {
2438
const params = request.nextUrl.searchParams;
2539
const model = params.get('model') ?? '';
@@ -28,14 +42,19 @@ export async function GET(request: NextRequest) {
2842
// Numeric GitHub run id only — anything else is ignored (treated as "latest").
2943
const runIdParam = params.get('runId');
3044
const runId = runIdParam && /^\d+$/u.test(runIdParam) ? runIdParam : undefined;
45+
// exactRun=true → return exactly this run's results (GPU comparison of same-day runs).
46+
const exactRun = params.get('exactRun') === 'true';
3147
const dbModelKeys = DISPLAY_MODEL_TO_DB[model];
3248
if (!dbModelKeys || dbModelKeys.length === 0) {
3349
return NextResponse.json({ error: 'Unknown model' }, { status: 400 });
3450
}
3551
if (FIXTURES_MODE) return cachedJson(loadFixture('benchmarks'));
3652

3753
try {
38-
const rows = await getCachedBenchmarks(dbModelKeys, date, exact || undefined, runId);
54+
const rows =
55+
exactRun && runId
56+
? await getCachedBenchmarksForRun(dbModelKeys, runId)
57+
: await getCachedBenchmarks(dbModelKeys, date, exact || undefined, runId);
3958
return cachedJson(rows);
4059
} catch (error) {
4160
console.error('Error fetching benchmarks:', error);

packages/app/src/app/api/v1/workflow-info/route.test.ts

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,18 @@
11
import { describe, expect, it, vi, beforeEach } from 'vitest';
22

3-
const { mockGetWorkflowRunsByDate, mockGetChangelogByDate, mockGetDateConfigs, mockGetDb } =
4-
vi.hoisted(() => ({
5-
mockGetWorkflowRunsByDate: vi.fn(),
6-
mockGetChangelogByDate: vi.fn(),
7-
mockGetDateConfigs: vi.fn(),
8-
mockGetDb: vi.fn(() => 'mock-sql'),
9-
}));
3+
const {
4+
mockGetWorkflowRunsByDate,
5+
mockGetChangelogByDate,
6+
mockGetDateConfigs,
7+
mockGetRunConfigsByDate,
8+
mockGetDb,
9+
} = vi.hoisted(() => ({
10+
mockGetWorkflowRunsByDate: vi.fn(),
11+
mockGetChangelogByDate: vi.fn(),
12+
mockGetDateConfigs: vi.fn(),
13+
mockGetRunConfigsByDate: vi.fn(),
14+
mockGetDb: vi.fn(() => 'mock-sql'),
15+
}));
1016

1117
vi.mock('@semianalysisai/inferencex-db/connection', () => ({
1218
getDb: mockGetDb,
@@ -18,6 +24,7 @@ vi.mock('@semianalysisai/inferencex-db/queries/workflow-info', () => ({
1824
getWorkflowRunsByDate: mockGetWorkflowRunsByDate,
1925
getChangelogByDate: mockGetChangelogByDate,
2026
getDateConfigs: mockGetDateConfigs,
27+
getRunConfigsByDate: mockGetRunConfigsByDate,
2128
}));
2229

2330
vi.mock('@/lib/api-cache', () => ({
@@ -60,9 +67,13 @@ describe('GET /api/v1/workflow-info', () => {
6067
const mockRuns = [{ id: 1, status: 'completed' }];
6168
const mockChangelogs = [{ version: '1.0', changes: 'Initial' }];
6269
const mockConfigs = [{ model: 'dsr1', gpu: 'h200' }];
70+
const mockRunConfigs = [
71+
{ github_run_id: 1, model: 'dsr1', hardware: 'h200', framework: 'vllm' },
72+
];
6373
mockGetWorkflowRunsByDate.mockResolvedValueOnce(mockRuns);
6474
mockGetChangelogByDate.mockResolvedValueOnce(mockChangelogs);
6575
mockGetDateConfigs.mockResolvedValueOnce(mockConfigs);
76+
mockGetRunConfigsByDate.mockResolvedValueOnce(mockRunConfigs);
6677

6778
const res = await GET(req('/api/v1/workflow-info?date=2026-03-01'));
6879
expect(res.status).toBe(200);
@@ -71,28 +82,32 @@ describe('GET /api/v1/workflow-info', () => {
7182
runs: mockRuns,
7283
changelogs: mockChangelogs,
7384
configs: mockConfigs,
85+
runConfigs: mockRunConfigs,
7486
});
7587
expect(mockGetWorkflowRunsByDate).toHaveBeenCalledWith('mock-sql', '2026-03-01');
7688
expect(mockGetChangelogByDate).toHaveBeenCalledWith('mock-sql', '2026-03-01');
7789
expect(mockGetDateConfigs).toHaveBeenCalledWith('mock-sql', '2026-03-01');
90+
expect(mockGetRunConfigsByDate).toHaveBeenCalledWith('mock-sql', '2026-03-01');
7891
});
7992

8093
it('accepts empty date param (returns all)', async () => {
8194
mockGetWorkflowRunsByDate.mockResolvedValueOnce([]);
8295
mockGetChangelogByDate.mockResolvedValueOnce([]);
8396
mockGetDateConfigs.mockResolvedValueOnce([]);
97+
mockGetRunConfigsByDate.mockResolvedValueOnce([]);
8498

8599
const res = await GET(req('/api/v1/workflow-info'));
86100
expect(res.status).toBe(200);
87101
const body = await res.json();
88-
expect(body).toEqual({ runs: [], changelogs: [], configs: [] });
102+
expect(body).toEqual({ runs: [], changelogs: [], configs: [], runConfigs: [] });
89103
expect(mockGetWorkflowRunsByDate).toHaveBeenCalledWith('mock-sql', '');
90104
});
91105

92106
it('returns 500 when any query throws', async () => {
93107
mockGetWorkflowRunsByDate.mockRejectedValueOnce(new Error('Timeout'));
94108
mockGetChangelogByDate.mockResolvedValueOnce([]);
95109
mockGetDateConfigs.mockResolvedValueOnce([]);
110+
mockGetRunConfigsByDate.mockResolvedValueOnce([]);
96111

97112
const res = await GET(req('/api/v1/workflow-info?date=2026-03-01'));
98113
expect(res.status).toBe(500);

packages/app/src/app/api/v1/workflow-info/route.ts

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider';
55
import {
66
getChangelogByDate,
77
getDateConfigs,
8+
getRunConfigsByDate,
89
getWorkflowRunsByDate,
910
} from '@semianalysisai/inferencex-db/queries/workflow-info';
1011

@@ -19,15 +20,17 @@ const getCachedWorkflowInfo = cachedQuery(async (date: string) => {
1920
runs: jsonProvider.getWorkflowRunsByDate(date),
2021
changelogs: jsonProvider.getChangelogByDate(date),
2122
configs: jsonProvider.getDateConfigs(date),
23+
runConfigs: jsonProvider.getRunConfigsByDate(date),
2224
};
2325
}
2426
const sql = getDb();
25-
const [runs, changelogs, configs] = await Promise.all([
27+
const [runs, changelogs, configs, runConfigs] = await Promise.all([
2628
getWorkflowRunsByDate(sql, date),
2729
getChangelogByDate(sql, date),
2830
getDateConfigs(sql, date),
31+
getRunConfigsByDate(sql, date),
2932
]);
30-
return { runs, changelogs, configs };
33+
return { runs, changelogs, configs, runConfigs };
3134
}, 'workflow-info');
3235

3336
export async function GET(request: NextRequest) {

packages/app/src/components/inference/InferenceContext.tsx

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import {
44
type ReactNode,
5+
type SetStateAction,
56
createContext,
67
useCallback,
78
useContext,
@@ -57,6 +58,7 @@ import {
5758
import { filterRunsByModel, getDisplayLabel } from '@/lib/utils';
5859

5960
import { useChartData } from './hooks/useChartData';
61+
import { resolveComparisonEntries } from './utils/comparisonEntry';
6062

6163
/** @internal Exported for test provider wrapping only. */
6264
export const InferenceContext = createContext<InferenceChartContextType | undefined>(undefined);
@@ -416,7 +418,10 @@ export function InferenceProvider({
416418
[setSelectedGPUs, clearPresetOnChange],
417419
);
418420
const setSelectedDatesAndClear = useCallback(
419-
(v: string[]) => {
421+
// Accept a React state updater (value OR function) so callers adding several
422+
// dates/runs in quick succession can use the functional form and avoid the
423+
// stale-closure race where each click overwrites the last.
424+
(v: SetStateAction<string[]>) => {
420425
setSelectedDates(v);
421426
clearPresetOnChange();
422427
},
@@ -564,11 +569,7 @@ export function InferenceProvider({
564569
);
565570

566571
const allDateIds = useMemo(() => {
567-
const dates: string[] = [];
568-
if (selectedDateRange.startDate && selectedDateRange.endDate) {
569-
dates.push(selectedDateRange.startDate, selectedDateRange.endDate);
570-
}
571-
dates.push(...selectedDates);
572+
const dates = resolveComparisonEntries(selectedDates, selectedDateRange);
572573
const allIds = new Set<string>();
573574
selectedGPUs.forEach((gpu) => {
574575
dates.forEach((date) => allIds.add(`${date}_${gpu}`));

packages/app/src/components/inference/hooks/useChartData.ts

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@ import type {
1313
YAxisMetricKey,
1414
} from '@/components/inference/types';
1515
import { filterDataByCostLimit } from '@/components/inference/utils';
16+
import {
17+
parseComparisonEntry,
18+
resolveComparisonEntries,
19+
} from '@/components/inference/utils/comparisonEntry';
1620
import { useBenchmarks, benchmarkQueryOptions } from '@/hooks/api/use-benchmarks';
1721
import {
1822
GPU_ALIAS_TO_CANONICAL,
@@ -31,12 +35,11 @@ export function buildComparisonDates(
3135
selectedRunDate: string | undefined,
3236
): string[] {
3337
if (selectedGPUs.length === 0) return [];
34-
const dates: string[] = [];
35-
if (selectedDateRange.startDate && selectedDateRange.endDate) {
36-
dates.push(selectedDateRange.startDate, selectedDateRange.endDate);
37-
}
38-
dates.push(...selectedDates);
39-
return [...new Set(dates.filter((d) => d !== selectedRunDate))];
38+
// Range endpoints + individually-added dates/runs (redundant same-day range
39+
// endpoints dropped), minus the main run date which the primary query covers.
40+
return resolveComparisonEntries(selectedDates, selectedDateRange).filter(
41+
(d) => d !== selectedRunDate,
42+
);
4043
}
4144

4245
/** Filter data by GPU key, resolving aliases to canonical keys. */
@@ -116,10 +119,16 @@ export function useChartData(
116119
[selectedGPUs, selectedDates, selectedDateRange, selectedRunDate],
117120
);
118121

122+
// Each comparison entry is either a plain date (latest run that day, exact-date
123+
// query) or a specific run encoded as `date~r<id>~<i>of<n>` (exact-run query) so
124+
// multiple same-day runs can be compared as distinct series.
119125
const comparisonQueries = useQueries({
120-
queries: comparisonDates.map((date) =>
121-
benchmarkQueryOptions(selectedModel, date, enabled, true),
122-
),
126+
queries: comparisonDates.map((entry) => {
127+
const parsed = parseComparisonEntry(entry);
128+
return parsed.runId
129+
? benchmarkQueryOptions(selectedModel, '', enabled, false, parsed.runId, true)
130+
: benchmarkQueryOptions(selectedModel, entry, enabled, true);
131+
}),
123132
});
124133

125134
const comparisonLoading = comparisonQueries.some((q) => q.isLoading);

packages/app/src/components/inference/types.ts

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -481,6 +481,14 @@ export interface ScatterGraphProps {
481481
* playback).
482482
*/
483483
niceAxes?: boolean;
484+
/**
485+
* Stable run numbering (entry string `date~rRunId` → 1-based number) shared with
486+
* the comparison changelog so legend labels match it exactly. Numbers index ALL
487+
* of a date's runs (not just the ones on the chart), so a removed run leaves a
488+
* gap that lines up with the changelog's still-listed "Add to chart" run. When
489+
* omitted, GPUGraph falls back to gap-free numbering of the on-chart series.
490+
*/
491+
runNumbering?: Map<string, number>;
484492
}
485493
/**
486494
* @file types.ts
@@ -639,7 +647,8 @@ export interface InferenceChartContextType {
639647
setSelectedGPUs: (gpus: string[]) => void;
640648
availableGPUs: { value: string; label: string }[];
641649
selectedDates: string[];
642-
setSelectedDates: (dates: string[]) => void;
650+
/** Accepts a value or a state-updater fn (for safe rapid successive adds). */
651+
setSelectedDates: (dates: string[] | ((prev: string[]) => string[])) => void;
643652
selectedDateRange: { startDate: string; endDate: string };
644653
setSelectedDateRange: (dateRange: { startDate: string; endDate: string }) => void;
645654
userCosts: Record<string, number | undefined> | null;

0 commit comments

Comments
 (0)