feat(inference): time-travel chart data to the selected run (as-of-run filtering) (#459)

adibarra · web-flow · commit 13038d79021f · 2026-06-14T02:43:33.000-05:00
diff --git a/packages/app/src/app/api/v1/benchmarks/route.test.ts b/packages/app/src/app/api/v1/benchmarks/route.test.ts
@@ -59,6 +59,7 @@ describe('GET /api/v1/benchmarks', () => {
       ['dsr1'],
       undefined,
       undefined,
+      undefined,
     );
   });
 
@@ -72,6 +73,7 @@ describe('GET /api/v1/benchmarks', () => {
       ['dsr1'],
       '2026-03-01',
       undefined,
+      undefined,
     );
   });
 
@@ -82,7 +84,45 @@ describe('GET /api/v1/benchmarks', () => {
       req('/api/v1/benchmarks?model=DeepSeek-R1-0528&date=2026-03-01&exact=true'),
     );
     expect(res.status).toBe(200);
-    expect(mockGetLatestBenchmarks).toHaveBeenCalledWith('mock-sql', ['dsr1'], '2026-03-01', true);
+    expect(mockGetLatestBenchmarks).toHaveBeenCalledWith(
+      'mock-sql',
+      ['dsr1'],
+      '2026-03-01',
+      true,
+      undefined,
+    );
+  });
+
+  it('passes a numeric runId through to the query', async () => {
+    mockGetLatestBenchmarks.mockResolvedValueOnce([]);
+
+    const res = await GET(
+      req('/api/v1/benchmarks?model=DeepSeek-R1-0528&date=2026-03-01&runId=27489075807'),
+    );
+    expect(res.status).toBe(200);
+    expect(mockGetLatestBenchmarks).toHaveBeenCalledWith(
+      'mock-sql',
+      ['dsr1'],
+      '2026-03-01',
+      undefined,
+      '27489075807',
+    );
+  });
+
+  it('ignores a non-numeric runId (treated as latest)', async () => {
+    mockGetLatestBenchmarks.mockResolvedValueOnce([]);
+
+    const res = await GET(
+      req('/api/v1/benchmarks?model=DeepSeek-R1-0528&date=2026-03-01&runId=not-a-run'),
+    );
+    expect(res.status).toBe(200);
+    expect(mockGetLatestBenchmarks).toHaveBeenCalledWith(
+      'mock-sql',
+      ['dsr1'],
+      '2026-03-01',
+      undefined,
+      undefined,
+    );
   });
 
   it('returns 500 when query throws', async () => {
diff --git a/packages/app/src/app/api/v1/benchmarks/route.ts b/packages/app/src/app/api/v1/benchmarks/route.ts
@@ -11,10 +11,10 @@ import { loadFixture } from '@/lib/test-fixtures';
 export const dynamic = 'force-dynamic';
 
 const getCachedBenchmarks = cachedQuery(
-  (dbModelKeys: string[], date?: string, exact?: boolean) => {
+  (dbModelKeys: string[], date?: string, exact?: boolean, runId?: string) => {
     if (JSON_MODE)
-      return Promise.resolve(jsonProvider.getLatestBenchmarks(dbModelKeys, date, exact));
-    return getLatestBenchmarks(getDb(), dbModelKeys, date, exact);
+      return Promise.resolve(jsonProvider.getLatestBenchmarks(dbModelKeys, date, exact, runId));
+    return getLatestBenchmarks(getDb(), dbModelKeys, date, exact, runId);
   },
   'benchmarks',
   { blobOnly: true },
@@ -25,14 +25,17 @@ export async function GET(request: NextRequest) {
   const model = params.get('model') ?? '';
   const date = params.get('date') ?? undefined;
   const exact = params.get('exact') === 'true';
+  // Numeric GitHub run id only — anything else is ignored (treated as "latest").
+  const runIdParam = params.get('runId');
+  const runId = runIdParam && /^\d+$/u.test(runIdParam) ? runIdParam : undefined;
   const dbModelKeys = DISPLAY_MODEL_TO_DB[model];
   if (!dbModelKeys || dbModelKeys.length === 0) {
     return NextResponse.json({ error: 'Unknown model' }, { status: 400 });
   }
   if (FIXTURES_MODE) return cachedJson(loadFixture('benchmarks'));
 
   try {
-    const rows = await getCachedBenchmarks(dbModelKeys, date, exact || undefined);
+    const rows = await getCachedBenchmarks(dbModelKeys, date, exact || undefined, runId);
     return cachedJson(rows);
   } catch (error) {
     console.error('Error fetching benchmarks:', error);
diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
@@ -202,6 +202,43 @@ export function InferenceProvider({
   // ── Data fetching (gated by isActive) ──────────────────────────────────────
   const latestDate = availableDates.length > 0 ? availableDates.at(-1) : undefined;
 
+  // Runs available for the current model selection, and which one is selected.
+  // Computed here (above useChartData) so the chart can query "as of" the selected
+  // run. Re-exposed on the context value below.
+  const modelPrefixes = useMemo(
+    () =>
+      Object.entries(MODEL_PREFIX_MAPPING)
+        .filter(([, model]) => model === selectedModel)
+        .map(([prefix]) => prefix),
+    [selectedModel],
+  );
+
+  const filteredAvailableRuns = useMemo(
+    () => filterRunsByModel(availableRuns, modelPrefixes, [...effectivePrecisions]),
+    [availableRuns, modelPrefixes, effectivePrecisions],
+  );
+
+  // Run ids are digit strings that grow over time, so "newest" must compare length before
+  // text: a plain string comparison ranks '9' above '10' once the id gains a digit.
+  const latestRunIdForModel = useMemo(() => {
+    const ids = filteredAvailableRuns ? Object.keys(filteredAvailableRuns) : [];
+    const newer = (a: string, b: string) => a.length > b.length || (a.length === b.length && a > b);
+    return ids.reduce((max, id) => (newer(id, max) ? id : max), '');
+  }, [filteredAvailableRuns]);
+
+  // Keep the selection when it exists for this model; otherwise fall back to the newest run.
+  const effectiveSelectedRunId = useMemo(() => {
+    const ids = filteredAvailableRuns ? Object.keys(filteredAvailableRuns) : [];
+    return ids.length === 0 || ids.includes(selectedRunId) ? selectedRunId : latestRunIdForModel;
+  }, [filteredAvailableRuns, latestRunIdForModel, selectedRunId]);
+
+  // Only constrain the query when an earlier-than-latest run is selected; otherwise
+  // the chart shows the full latest view (and reuses the materialized-view fast path).
+  const asOfRunId =
+    effectiveSelectedRunId && latestRunIdForModel && effectiveSelectedRunId !== latestRunIdForModel
+      ? effectiveSelectedRunId
+      : undefined;
+
   const {
     graphs,
     loading: chartDataLoading,
@@ -223,6 +260,7 @@ export function InferenceProvider({
     isActive,
     latestDate,
     compareGpuPair ?? null,
+    asOfRunId,
   );
 
   // For GPU comparison date picker — use shared availability data from global filters
@@ -699,14 +737,6 @@ export function InferenceProvider({
       setUserPowers((prev) => (prev === null ? prev : null));
   }, [selectedModel, effectiveSequence, effectivePrecisions, selectedYAxisMetric]);
 
-  const modelPrefixes = useMemo(
-    () =>
-      Object.entries(MODEL_PREFIX_MAPPING)
-        .filter(([, model]) => model === selectedModel)
-        .map(([prefix]) => prefix),
-    [selectedModel],
-  );
-
   // ── Debounced GPU selection tracking ─────────────────────────────────────
   // Fire after 3s of no changes so we capture the "settled" selection.
   // Skip the first render (initial data load) to avoid noise.
@@ -922,19 +952,9 @@ export function InferenceProvider({
   }, [applyPreset]);
 
   // ── Filtered runs ─────────────────────────────────────────────────────────
-
-  const filteredAvailableRuns = useMemo(
-    () => filterRunsByModel(availableRuns, modelPrefixes, [...effectivePrecisions]),
-    [availableRuns, modelPrefixes, effectivePrecisions],
-  );
-
-  const effectiveSelectedRunId = useMemo(() => {
-    if (!filteredAvailableRuns) return selectedRunId;
-    const filteredRunIds = Object.keys(filteredAvailableRuns);
-    if (filteredRunIds.length === 0 || filteredRunIds.includes(selectedRunId)) return selectedRunId;
-    return filteredRunIds.reduce((max, id) => (id > max ? id : max), filteredRunIds[0]);
-  }, [filteredAvailableRuns, selectedRunId]);
-
+  // filteredAvailableRuns / effectiveSelectedRunId are computed above the data
+  // fetch (so the chart can query "as of" the selected run).
+  //
   // NOTE: We intentionally do NOT sync effectiveSelectedRunId back to
   // GlobalFilterContext (setSelectedRunId). That would cause a full tree
   // re-render on every precision change because filteredAvailableRuns
diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -85,20 +85,30 @@ export function useChartData(
   latestAvailableDate?: string,
   /** When set, only series for these two registry GPU keys are shown (compare pages). */
   compareGpuPair?: readonly [string, string] | null,
+  /**
+   * GitHub run id for the "as of run" view. Set only when an earlier-than-latest
+   * run is selected; the chart then shows the data as it stood at that run.
+   */
+  asOfRunId?: string,
 ) {
   // When the selected date is the latest available, use '' (empty string) to match
   // the initial no-date query key, reusing the eagerly-fetched benchmarks from the
   // materialized view instead of firing a redundant second fetch with identical data.
-  const queryDate =
-    selectedRunDate && latestAvailableDate && selectedRunDate === latestAvailableDate
+  //
+  // The '' shortcut hits the materialized view, which has no run-level filter, so it
+  // is only valid for the latest run. When an earlier run is selected (asOfRunId set)
+  // we must query the date-filtered path so the run cutoff applies.
+  const queryDate = asOfRunId
+    ? (selectedRunDate ?? '')
+    : selectedRunDate && latestAvailableDate && selectedRunDate === latestAvailableDate
       ? ''
       : selectedRunDate;
 
   const {
     data: allRows,
     isLoading: queryLoading,
     error: queryError,
-  } = useBenchmarks(selectedModel, queryDate, enabled);
+  } = useBenchmarks(selectedModel, queryDate, enabled, asOfRunId);
 
   // GPU comparison: fetch data for each additional comparison date
   const comparisonDates = useMemo(
diff --git a/packages/app/src/hooks/api/use-benchmarks.test.ts b/packages/app/src/hooks/api/use-benchmarks.test.ts
@@ -5,12 +5,41 @@ import { benchmarkQueryOptions } from '@/hooks/api/use-benchmarks';
 describe('benchmarkQueryOptions', () => {
   it('builds query key from model and date', () => {
     const opts = benchmarkQueryOptions('DeepSeek-R1-0528', '2026-03-01');
-    expect(opts.queryKey).toEqual(['benchmarks', 'DeepSeek-R1-0528', '2026-03-01', 'latest']);
+    expect(opts.queryKey).toEqual([
+      'benchmarks',
+      'DeepSeek-R1-0528',
+      '2026-03-01',
+      'latest',
+      'all',
+    ]);
   });
 
   it('builds exact query key when exact=true', () => {
     const opts = benchmarkQueryOptions('DeepSeek-R1-0528', '2026-03-01', true, true);
-    expect(opts.queryKey).toEqual(['benchmarks', 'DeepSeek-R1-0528', '2026-03-01', 'exact']);
+    expect(opts.queryKey).toEqual(['benchmarks', 'DeepSeek-R1-0528', '2026-03-01', 'exact', 'all']);
+  });
+
+  it('includes the runId in the query key for the as-of-run view', () => {
+    const opts = benchmarkQueryOptions(
+      'DeepSeek-R1-0528',
+      '2026-03-01',
+      true,
+      false,
+      '27489075807',
+    );
+    expect(opts.queryKey).toEqual([
+      'benchmarks',
+      'DeepSeek-R1-0528',
+      '2026-03-01',
+      'latest',
+      '27489075807',
+    ]);
+  });
+
+  it('produces distinct keys for different runIds (no cache collision)', () => {
+    const a = benchmarkQueryOptions('m', '2026-03-01', true, false, '100');
+    const b = benchmarkQueryOptions('m', '2026-03-01', true, false, '101');
+    expect(a.queryKey).not.toEqual(b.queryKey);
   });
 
   it('produces distinct keys for different models', () => {
diff --git a/packages/app/src/hooks/api/use-benchmarks.ts b/packages/app/src/hooks/api/use-benchmarks.ts
@@ -8,14 +8,17 @@ export function benchmarkQueryOptions(
   date: string,
   enabled = true,
   exact?: boolean,
+  /** GitHub run id for the "as of run" view (main chart only). */
+  runId?: string,
 ) {
   return {
-    queryKey: ['benchmarks', model, date, exact ? 'exact' : 'latest'] as const,
-    queryFn: ({ signal }: { signal: AbortSignal }) => fetchBenchmarks(model, date, exact, signal),
+    queryKey: ['benchmarks', model, date, exact ? 'exact' : 'latest', runId ?? 'all'] as const,
+    queryFn: ({ signal }: { signal: AbortSignal }) =>
+      fetchBenchmarks(model, date, exact, signal, runId),
     enabled: enabled && Boolean(model),
   };
 }
 
-export function useBenchmarks(model: string, date?: string, enabled = true) {
-  return useQuery(benchmarkQueryOptions(model, date ?? 'latest', enabled));
+export function useBenchmarks(model: string, date?: string, enabled = true, runId?: string) {
+  return useQuery(benchmarkQueryOptions(model, date ?? 'latest', enabled, undefined, runId));
 }
diff --git a/packages/app/src/lib/api.ts b/packages/app/src/lib/api.ts
@@ -126,10 +126,12 @@ export function fetchBenchmarks(
   date?: string,
   exact?: boolean,
   signal?: AbortSignal,
+  runId?: string,
 ) {
   const params = new URLSearchParams({ model });
   if (date) params.set('date', date);
   if (exact) params.set('exact', 'true');
+  if (runId) params.set('runId', runId);
   return fetchJson<BenchmarkRow[]>(`/api/v1/benchmarks?${params}`, signal);
 }
 
diff --git a/packages/db/src/json-provider.asof.test.ts b/packages/db/src/json-provider.asof.test.ts
diff --git a/packages/db/src/json-provider.ts b/packages/db/src/json-provider.ts
diff --git a/packages/db/src/queries/benchmarks.ts b/packages/db/src/queries/benchmarks.ts