From f9e510e1565bb7ceaa1101aab4288fd14069f523 Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Mon, 15 Jun 2026 00:22:41 -0500
Subject: [PATCH 1/3] feat(inference): add per-run benchmark fetch and
 run-coverage queries

---
 .../src/app/api/v1/benchmarks/route.test.ts   | 26 +++++++-
 .../app/src/app/api/v1/benchmarks/route.ts    | 23 ++++++-
 .../app/api/v1/workflow-info/route.test.ts    | 31 ++++++---
 .../app/src/app/api/v1/workflow-info/route.ts |  7 ++-
 .../app/src/hooks/api/use-benchmarks.test.ts  | 22 ++++++-
 packages/app/src/hooks/api/use-benchmarks.ts  | 15 ++++-
 packages/app/src/lib/api.ts                   | 22 +++++++
 packages/db/src/json-provider.ts              | 63 +++++++++++++++++++
 packages/db/src/queries/benchmarks.ts         | 51 +++++++++++++++
 packages/db/src/queries/workflow-info.ts      | 41 ++++++++++++
 10 files changed, 284 insertions(+), 17 deletions(-)

diff --git a/packages/app/src/app/api/v1/benchmarks/route.test.ts b/packages/app/src/app/api/v1/benchmarks/route.test.ts
index 5a5251f5..8b7d573e 100644
--- a/packages/app/src/app/api/v1/benchmarks/route.test.ts
+++ b/packages/app/src/app/api/v1/benchmarks/route.test.ts
@@ -1,7 +1,8 @@
 import { describe, expect, it, vi, beforeEach } from 'vitest';
 
-const { mockGetLatestBenchmarks, mockGetDb } = vi.hoisted(() => ({
+const { mockGetLatestBenchmarks, mockGetBenchmarksForRun, mockGetDb } = vi.hoisted(() => ({
   mockGetLatestBenchmarks: vi.fn(),
+  mockGetBenchmarksForRun: vi.fn(),
   mockGetDb: vi.fn(() => 'mock-sql'),
 }));
 
@@ -13,6 +14,7 @@ vi.mock('@semianalysisai/inferencex-db/connection', () => ({
 
 vi.mock('@semianalysisai/inferencex-db/queries/benchmarks', () => ({
   getLatestBenchmarks: mockGetLatestBenchmarks,
+  getBenchmarksForRun: mockGetBenchmarksForRun,
 }));
 
 vi.mock('@/lib/api-cache', () => ({
@@ -125,6 +127,28 @@ describe('GET /api/v1/benchmarks', () => {
     );
   });
 
+  it('routes exactRun=true + runId to the exact-run query', async () => {
+    const runRows = [{ id: 1, hardware: 'mi300x' }];
+    mockGetBenchmarksForRun.mockResolvedValueOnce(runRows);
+
+    const res = await GET(
+      req('/api/v1/benchmarks?model=DeepSeek-R1-0528&runId=27489075807&exactRun=true'),
+    );
+    expect(res.status).toBe(200);
+    expect(await res.json()).toEqual(runRows);
+    expect(mockGetBenchmarksForRun).toHaveBeenCalledWith('mock-sql', ['dsr1'], '27489075807');
+    expect(mockGetLatestBenchmarks).not.toHaveBeenCalled();
+  });
+
+  it('ignores exactRun without a runId (falls back to latest)', async () => {
+    mockGetLatestBenchmarks.mockResolvedValueOnce([]);
+
+    const res = await GET(req('/api/v1/benchmarks?model=DeepSeek-R1-0528&exactRun=true'));
+    expect(res.status).toBe(200);
+    expect(mockGetBenchmarksForRun).not.toHaveBeenCalled();
+    expect(mockGetLatestBenchmarks).toHaveBeenCalled();
+  });
+
   it('returns 500 when query throws', async () => {
     mockGetLatestBenchmarks.mockRejectedValueOnce(new Error('DB down'));
 
diff --git a/packages/app/src/app/api/v1/benchmarks/route.ts b/packages/app/src/app/api/v1/benchmarks/route.ts
index 59d30853..dd2267e3 100644
--- a/packages/app/src/app/api/v1/benchmarks/route.ts
+++ b/packages/app/src/app/api/v1/benchmarks/route.ts
@@ -3,7 +3,10 @@ import { type NextRequest, NextResponse } from 'next/server';
 import { DISPLAY_MODEL_TO_DB } from '@semianalysisai/inferencex-constants';
 import { FIXTURES_MODE, JSON_MODE, getDb } from '@semianalysisai/inferencex-db/connection';
 import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider';
-import { getLatestBenchmarks } from '@semianalysisai/inferencex-db/queries/benchmarks';
+import {
+  getBenchmarksForRun,
+  getLatestBenchmarks,
+} from '@semianalysisai/inferencex-db/queries/benchmarks';
 
 import { cachedJson, cachedQuery } from '@/lib/api-cache';
 import { loadFixture } from '@/lib/test-fixtures';
@@ -20,6 +23,17 @@ const getCachedBenchmarks = cachedQuery(
   { blobOnly: true },
 );
 
+// Exactly one run's results (GPU comparison of individual same-day runs). JSON mode
+// reads the in-memory provider; otherwise the run query goes through getDb().
+const getCachedBenchmarksForRun = cachedQuery(
+  (dbModelKeys: string[], runId: string) => {
+    if (JSON_MODE) return Promise.resolve(jsonProvider.getBenchmarksForRun(dbModelKeys, runId));
+    return getBenchmarksForRun(getDb(), dbModelKeys, runId);
+  },
+  'benchmarks-run', // distinct cache key prefix: never collides with the latest/as-of query
+  { blobOnly: true },
+);
+
 export async function GET(request: NextRequest) {
   const params = request.nextUrl.searchParams;
   const model = params.get('model') ?? '';
@@ -28,6 +42,8 @@ export async function GET(request: NextRequest) {
   // Numeric GitHub run id only — anything else is ignored (treated as "latest").
   const runIdParam = params.get('runId');
   const runId = runIdParam && /^\d+$/u.test(runIdParam) ? runIdParam : undefined;
+  // exactRun=true → return exactly this run's results (GPU comparison of same-day runs).
+  const exactRun = params.get('exactRun') === 'true';
   const dbModelKeys = DISPLAY_MODEL_TO_DB[model];
   if (!dbModelKeys || dbModelKeys.length === 0) {
     return NextResponse.json({ error: 'Unknown model' }, { status: 400 });
@@ -35,7 +51,10 @@ export async function GET(request: NextRequest) {
   if (FIXTURES_MODE) return cachedJson(loadFixture('benchmarks'));
 
   try {
-    const rows = await getCachedBenchmarks(dbModelKeys, date, exact || undefined, runId);
+    const rows =
+      exactRun && runId
+        ? await getCachedBenchmarksForRun(dbModelKeys, runId)
+        : await getCachedBenchmarks(dbModelKeys, date, exact || undefined, runId);
     return cachedJson(rows);
   } catch (error) {
     console.error('Error fetching benchmarks:', error);
diff --git a/packages/app/src/app/api/v1/workflow-info/route.test.ts b/packages/app/src/app/api/v1/workflow-info/route.test.ts
index 4eb6765e..fde062f2 100644
--- a/packages/app/src/app/api/v1/workflow-info/route.test.ts
+++ b/packages/app/src/app/api/v1/workflow-info/route.test.ts
@@ -1,12 +1,18 @@
 import { describe, expect, it, vi, beforeEach } from 'vitest';
 
-const { mockGetWorkflowRunsByDate, mockGetChangelogByDate, mockGetDateConfigs, mockGetDb } =
-  vi.hoisted(() => ({
-    mockGetWorkflowRunsByDate: vi.fn(),
-    mockGetChangelogByDate: vi.fn(),
-    mockGetDateConfigs: vi.fn(),
-    mockGetDb: vi.fn(() => 'mock-sql'),
-  }));
+const {
+  mockGetWorkflowRunsByDate,
+  mockGetChangelogByDate,
+  mockGetDateConfigs,
+  mockGetRunConfigsByDate,
+  mockGetDb,
+} = vi.hoisted(() => ({
+  mockGetWorkflowRunsByDate: vi.fn(),
+  mockGetChangelogByDate: vi.fn(),
+  mockGetDateConfigs: vi.fn(),
+  mockGetRunConfigsByDate: vi.fn(),
+  mockGetDb: vi.fn(() => 'mock-sql'),
+}));
 
 vi.mock('@semianalysisai/inferencex-db/connection', () => ({
   getDb: mockGetDb,
@@ -18,6 +24,7 @@ vi.mock('@semianalysisai/inferencex-db/queries/workflow-info', () => ({
   getWorkflowRunsByDate: mockGetWorkflowRunsByDate,
   getChangelogByDate: mockGetChangelogByDate,
   getDateConfigs: mockGetDateConfigs,
+  getRunConfigsByDate: mockGetRunConfigsByDate,
 }));
 
 vi.mock('@/lib/api-cache', () => ({
@@ -60,9 +67,13 @@ describe('GET /api/v1/workflow-info', () => {
     const mockRuns = [{ id: 1, status: 'completed' }];
     const mockChangelogs = [{ version: '1.0', changes: 'Initial' }];
     const mockConfigs = [{ model: 'dsr1', gpu: 'h200' }];
+    const mockRunConfigs = [
+      { github_run_id: 1, model: 'dsr1', hardware: 'h200', framework: 'vllm' },
+    ];
     mockGetWorkflowRunsByDate.mockResolvedValueOnce(mockRuns);
     mockGetChangelogByDate.mockResolvedValueOnce(mockChangelogs);
     mockGetDateConfigs.mockResolvedValueOnce(mockConfigs);
+    mockGetRunConfigsByDate.mockResolvedValueOnce(mockRunConfigs);
 
     const res = await GET(req('/api/v1/workflow-info?date=2026-03-01'));
     expect(res.status).toBe(200);
@@ -71,21 +82,24 @@ describe('GET /api/v1/workflow-info', () => {
       runs: mockRuns,
       changelogs: mockChangelogs,
       configs: mockConfigs,
+      runConfigs: mockRunConfigs,
     });
     expect(mockGetWorkflowRunsByDate).toHaveBeenCalledWith('mock-sql', '2026-03-01');
     expect(mockGetChangelogByDate).toHaveBeenCalledWith('mock-sql', '2026-03-01');
     expect(mockGetDateConfigs).toHaveBeenCalledWith('mock-sql', '2026-03-01');
+    expect(mockGetRunConfigsByDate).toHaveBeenCalledWith('mock-sql', '2026-03-01');
   });
 
   it('accepts empty date param (returns all)', async () => {
     mockGetWorkflowRunsByDate.mockResolvedValueOnce([]);
     mockGetChangelogByDate.mockResolvedValueOnce([]);
     mockGetDateConfigs.mockResolvedValueOnce([]);
+    mockGetRunConfigsByDate.mockResolvedValueOnce([]);
 
     const res = await GET(req('/api/v1/workflow-info'));
     expect(res.status).toBe(200);
     const body = await res.json();
-    expect(body).toEqual({ runs: [], changelogs: [], configs: [] });
+    expect(body).toEqual({ runs: [], changelogs: [], configs: [], runConfigs: [] });
     expect(mockGetWorkflowRunsByDate).toHaveBeenCalledWith('mock-sql', '');
   });
 
@@ -93,6 +107,7 @@ describe('GET /api/v1/workflow-info', () => {
     mockGetWorkflowRunsByDate.mockRejectedValueOnce(new Error('Timeout'));
     mockGetChangelogByDate.mockResolvedValueOnce([]);
     mockGetDateConfigs.mockResolvedValueOnce([]);
+    mockGetRunConfigsByDate.mockResolvedValueOnce([]);
 
     const res = await GET(req('/api/v1/workflow-info?date=2026-03-01'));
     expect(res.status).toBe(500);
diff --git a/packages/app/src/app/api/v1/workflow-info/route.ts b/packages/app/src/app/api/v1/workflow-info/route.ts
index 66af69d5..fc17db31 100644
--- a/packages/app/src/app/api/v1/workflow-info/route.ts
+++ b/packages/app/src/app/api/v1/workflow-info/route.ts
@@ -5,6 +5,7 @@ import * as jsonProvider from '@semianalysisai/inferencex-db/json-provider';
 import {
   getChangelogByDate,
   getDateConfigs,
+  getRunConfigsByDate,
   getWorkflowRunsByDate,
 } from '@semianalysisai/inferencex-db/queries/workflow-info';
 
@@ -19,15 +20,17 @@ const getCachedWorkflowInfo = cachedQuery(async (date: string) => {
       runs: jsonProvider.getWorkflowRunsByDate(date),
       changelogs: jsonProvider.getChangelogByDate(date),
       configs: jsonProvider.getDateConfigs(date),
+      runConfigs: jsonProvider.getRunConfigsByDate(date),
     };
   }
   const sql = getDb();
-  const [runs, changelogs, configs] = await Promise.all([
+  const [runs, changelogs, configs, runConfigs] = await Promise.all([
     getWorkflowRunsByDate(sql, date),
     getChangelogByDate(sql, date),
     getDateConfigs(sql, date),
+    getRunConfigsByDate(sql, date),
   ]);
-  return { runs, changelogs, configs };
+  return { runs, changelogs, configs, runConfigs };
 }, 'workflow-info');
 
 export async function GET(request: NextRequest) {
diff --git a/packages/app/src/hooks/api/use-benchmarks.test.ts b/packages/app/src/hooks/api/use-benchmarks.test.ts
index 21876018..48a861d1 100644
--- a/packages/app/src/hooks/api/use-benchmarks.test.ts
+++ b/packages/app/src/hooks/api/use-benchmarks.test.ts
@@ -11,12 +11,20 @@ describe('benchmarkQueryOptions', () => {
       '2026-03-01',
       'latest',
       'all',
+      'asof',
     ]);
   });
 
   it('builds exact query key when exact=true', () => {
     const opts = benchmarkQueryOptions('DeepSeek-R1-0528', '2026-03-01', true, true);
-    expect(opts.queryKey).toEqual(['benchmarks', 'DeepSeek-R1-0528', '2026-03-01', 'exact', 'all']);
+    expect(opts.queryKey).toEqual([
+      'benchmarks',
+      'DeepSeek-R1-0528',
+      '2026-03-01',
+      'exact',
+      'all',
+      'asof',
+    ]);
   });
 
   it('includes the runId in the query key for the as-of-run view', () => {
@@ -33,9 +41,21 @@ describe('benchmarkQueryOptions', () => {
       '2026-03-01',
       'latest',
       '27489075807',
+      'asof',
     ]);
   });
 
+  it('marks the key as an exact-run query when exactRun=true', () => {
+    const opts = benchmarkQueryOptions('m', '', true, false, '27489075807', true);
+    expect(opts.queryKey).toEqual(['benchmarks', 'm', '', 'latest', '27489075807', 'run']);
+  });
+
+  it('produces distinct keys for as-of vs exact-run with the same runId', () => {
+    const asof = benchmarkQueryOptions('m', '2026-03-01', true, false, '100', false);
+    const exact = benchmarkQueryOptions('m', '2026-03-01', true, false, '100', true);
+    expect(asof.queryKey).not.toEqual(exact.queryKey);
+  });
+
   it('produces distinct keys for different runIds (no cache collision)', () => {
     const a = benchmarkQueryOptions('m', '2026-03-01', true, false, '100');
     const b = benchmarkQueryOptions('m', '2026-03-01', true, false, '101');
diff --git a/packages/app/src/hooks/api/use-benchmarks.ts b/packages/app/src/hooks/api/use-benchmarks.ts
index ff32b250..a8d634f1 100644
--- a/packages/app/src/hooks/api/use-benchmarks.ts
+++ b/packages/app/src/hooks/api/use-benchmarks.ts
@@ -8,13 +8,22 @@ export function benchmarkQueryOptions(
   date: string,
   enabled = true,
   exact?: boolean,
-  /** GitHub run id for the "as of run" view (main chart only). */
+  /** GitHub run id for the "as of run" view (main chart) or the exact-run comparison. */
   runId?: string,
+  /** When true with a runId, fetch exactly that run's results (GPU comparison). */
+  exactRun?: boolean,
 ) {
   return {
-    queryKey: ['benchmarks', model, date, exact ? 'exact' : 'latest', runId ?? 'all'] as const,
+    queryKey: [
+      'benchmarks',
+      model,
+      date,
+      exact ? 'exact' : 'latest',
+      runId ?? 'all',
+      exactRun ? 'run' : 'asof',
+    ] as const,
     queryFn: ({ signal }: { signal: AbortSignal }) =>
-      fetchBenchmarks(model, date, exact, signal, runId),
+      fetchBenchmarks(model, date, exact, signal, runId, exactRun),
     enabled: enabled && Boolean(model),
   };
 }
diff --git a/packages/app/src/lib/api.ts b/packages/app/src/lib/api.ts
index 49d8d4d0..0dac5883 100644
--- a/packages/app/src/lib/api.ts
+++ b/packages/app/src/lib/api.ts
@@ -74,10 +74,29 @@ export interface DateConfigRow {
   disagg: boolean;
 }
 
+/**
+ * Per-(run, config) coverage for a date — which workflow runs produced benchmark
+ * data for which configs (a run without a changelog entry still appears). Mirrors
+ * `RunConfigRow` in packages/db/src/queries/workflow-info.ts; keep both in sync.
+ */
+export interface RunConfigRow {
+  github_run_id: number;
+  run_started_at: string | null;
+  html_url: string | null;
+  head_sha: string | null;
+  model: string;
+  precision: string;
+  hardware: string;
+  framework: string;
+  spec_method: string;
+  disagg: boolean;
+}
+
 export interface WorkflowInfoResponse {
   runs: WorkflowRunRow[];
   changelogs: ChangelogRow[];
   configs: DateConfigRow[];
+  runConfigs: RunConfigRow[]; // per-(run, config) coverage rows for the requested date
 }
 
 export interface ReliabilityRow {
@@ -127,11 +146,14 @@ export function fetchBenchmarks(
   exact?: boolean,
   signal?: AbortSignal,
   runId?: string,
+  /** When true with a runId, fetch exactly that run's results (GPU comparison). */
+  exactRun?: boolean,
 ) {
   const params = new URLSearchParams({ model });
   if (date) params.set('date', date);
   if (exact) params.set('exact', 'true');
   if (runId) params.set('runId', runId);
+  if (exactRun) params.set('exactRun', 'true');
   return fetchJson<BenchmarkRow[]>(`/api/v1/benchmarks?${params}`, signal);
 }
 
diff --git a/packages/db/src/json-provider.ts b/packages/db/src/json-provider.ts
index c77ef687..1ed155c9 100644
--- a/packages/db/src/json-provider.ts
+++ b/packages/db/src/json-provider.ts
@@ -19,6 +19,7 @@ import type {
   AvailabilityRow,
   ChangelogRow,
   DateConfigRow,
+  RunConfigRow,
   WorkflowRunRow,
 } from './queries/workflow-info.js';
 
@@ -409,6 +410,32 @@ export function getLatestBenchmarks(
   });
 }
 
+/** In-memory mirror of {@link import('./queries/benchmarks.js').getBenchmarksForRun}. */
+export function getBenchmarksForRun(
+  modelKey: string | string[],
+  githubRunId: string | number,
+): BenchmarkRow[] {
+  const s = getStore();
+  const modelKeys = new Set(Array.isArray(modelKey) ? modelKey : [modelKey]);
+  const run = s.latestRuns.get(Number(githubRunId)); // looked up by numeric GitHub run id
+  if (!run) return []; // unknown run id → no rows
+
+  const seen = new Map<string, RawBenchmarkResult>();
+  for (const br of s.benchmarks) {
+    if (br.error !== null && br.error !== undefined) continue; // skip results that recorded an error
+    if (br.workflow_run_id !== run.id) continue; // keep only rows produced by this run
+    const c = s.configs.get(br.config_id);
+    if (!c || !modelKeys.has(c.model)) continue; // unknown config or a different model
+    const key = `${br.config_id}:${br.conc}:${br.isl}:${br.osl}`;
+    if (!seen.has(key)) seen.set(key, br); // first row wins; NOTE(review): SQL keeps the latest date — confirm they agree
+  }
+
+  return [...seen.values()].map((br) => {
+    const c = s.configs.get(br.config_id)!; // non-null: every row in `seen` passed the config lookup above
+    return toBenchmarkRow(br, c, run);
+  });
+}
+
 export function getAllBenchmarksForHistory(
   modelKey: string | string[],
   isl: number,
@@ -621,6 +648,42 @@ export function getDateConfigs(date: string): DateConfigRow[] {
   return rows;
 }
 
+export function getRunConfigsByDate(date: string): RunConfigRow[] { // JSON-mode counterpart of the SQL query of the same name
+  const s = getStore();
+  const dateStr = toDateString(date);
+
+  const seen = new Set<string>();
+  const rows: RunConfigRow[] = [];
+
+  for (const br of s.benchmarks) {
+    if (br.error !== null && br.error !== undefined) continue; // skip results that recorded an error
+    if (toDateString(br.date) !== dateStr) continue; // keep only rows for the requested date
+    const wr = s.latestRunsById.get(br.workflow_run_id);
+    if (!wr) continue; // run not present in latestRunsById → not reported
+    const c = s.configs.get(br.config_id);
+    if (!c) continue;
+
+    const key = `${wr.github_run_id}|${c.model}|${c.precision}|${c.hardware}|${c.framework}|${c.spec_method}|${c.disagg}`;
+    if (seen.has(key)) continue; // emit each (run, config identity) pair once
+    seen.add(key);
+
+    rows.push({
+      github_run_id: wr.github_run_id,
+      run_started_at: wr.run_started_at ?? wr.created_at, // NOTE(review): the SQL query formats this timestamp; here it is passed through
+      html_url: wr.html_url,
+      head_sha: wr.head_sha,
+      model: c.model,
+      precision: c.precision,
+      hardware: c.hardware,
+      framework: c.framework,
+      spec_method: c.spec_method,
+      disagg: c.disagg,
+    });
+  }
+
+  return rows;
+}
+
 export function getServerLog(benchmarkResultId: number): string | null {
   const s = getStore();
   const logId = s.benchmarkServerLogMap.get(benchmarkResultId);
diff --git a/packages/db/src/queries/benchmarks.ts b/packages/db/src/queries/benchmarks.ts
index 47e3c328..49c60604 100644
--- a/packages/db/src/queries/benchmarks.ts
+++ b/packages/db/src/queries/benchmarks.ts
@@ -170,6 +170,57 @@ export async function getLatestBenchmarks(
   return rows as unknown as BenchmarkRow[];
 }
 
+/**
+ * Fetch the benchmark results produced by ONE specific workflow run (by GitHub
+ * run id). Unlike {@link getLatestBenchmarks}, this returns exactly what that run
+ * measured — used by the GPU comparison view to plot individual same-day runs.
+ * Rows with an error are excluded; one row is kept per (config_id, conc, isl, osl),
+ * preferring the latest date. Returns an empty array when nothing matches.
+ */
+export async function getBenchmarksForRun(
+  sql: DbClient,
+  modelKey: string | string[],
+  githubRunId: string | number,
+): Promise<BenchmarkRow[]> {
+  const modelKeys = Array.isArray(modelKey) ? modelKey : [modelKey];
+  const rows = await sql`
+    SELECT DISTINCT ON (br.config_id, br.conc, br.isl, br.osl)
+      c.hardware,
+      c.framework,
+      c.model,
+      c.precision,
+      c.spec_method,
+      c.disagg,
+      c.is_multinode,
+      c.prefill_tp,
+      c.prefill_ep,
+      c.prefill_dp_attention,
+      c.prefill_num_workers,
+      c.decode_tp,
+      c.decode_ep,
+      c.decode_dp_attention,
+      c.decode_num_workers,
+      c.num_prefill_gpu,
+      c.num_decode_gpu,
+      br.isl,
+      br.osl,
+      br.conc,
+      br.image,
+      br.metrics,
+      br.workers,
+      br.date::text,
+      CASE WHEN wr.html_url IS NOT NULL THEN wr.html_url || '/attempts/' || wr.run_attempt ELSE NULL END AS run_url
+    FROM benchmark_results br
+    JOIN configs c ON c.id = br.config_id
+    JOIN latest_workflow_runs wr ON wr.id = br.workflow_run_id
+    WHERE c.model = ANY(${modelKeys})
+      AND br.error IS NULL
+      AND wr.github_run_id = ${Number(githubRunId)}
+    ORDER BY br.config_id, br.conc, br.isl, br.osl, br.date DESC
+  `;
+  return rows as unknown as BenchmarkRow[];
+}
+
 /**
  * Fetch ALL benchmark results for a model + sequence across ALL dates.
  * No DISTINCT ON — returns every successful result, one per (config, conc, date).
diff --git a/packages/db/src/queries/workflow-info.ts b/packages/db/src/queries/workflow-info.ts
index b4e4f255..dfcb9e9f 100644
--- a/packages/db/src/queries/workflow-info.ts
+++ b/packages/db/src/queries/workflow-info.ts
@@ -65,6 +65,47 @@ export async function getChangelogByDate(sql: DbClient, date: string): Promise<C
   return rows as unknown as ChangelogRow[];
 }
 
+export interface RunConfigRow { // one (workflow run, config) pair that has benchmark data
+  github_run_id: number; // NOTE(review): ids such as 27489075807 exceed int32 — confirm the driver returns a number
+  run_started_at: string | null; // run start, falling back to created_at when it is null
+  html_url: string | null;
+  head_sha: string | null;
+  model: string;
+  precision: string;
+  hardware: string;
+  framework: string;
+  spec_method: string;
+  disagg: boolean;
+}
+
+/**
+ * Per-(run, config) coverage for a date: which workflow runs produced error-free
+ * benchmark rows for which configs. Data-driven (joins benchmark_results), so a run
+ * that shipped data without a changelog entry still surfaces for the comparison UI.
+ * NOTE(review): the timestamp is formatted with a literal "Z" — confirm it is UTC.
+ */
+export async function getRunConfigsByDate(sql: DbClient, date: string): Promise<RunConfigRow[]> {
+  const rows = await sql`
+    SELECT DISTINCT
+      wr.github_run_id,
+      to_char(COALESCE(wr.run_started_at, wr.created_at), 'YYYY-MM-DD"T"HH24:MI:SS"Z"') as run_started_at,
+      wr.html_url,
+      wr.head_sha,
+      c.model,
+      c.precision,
+      c.hardware,
+      c.framework,
+      c.spec_method,
+      c.disagg
+    FROM benchmark_results br
+    JOIN configs c ON c.id = br.config_id
+    JOIN latest_workflow_runs wr ON wr.id = br.workflow_run_id
+    WHERE br.date = ${date}::date
+      AND br.error IS NULL
+  `;
+  return rows as unknown as RunConfigRow[];
+}
+
 /** Get distinct model/sequence/precision/hardware combos for a date. */
 export async function getDateConfigs(sql: DbClient, date: string): Promise<DateConfigRow[]> {
   const rows = await sql`

From 02323f03006e743d73eac9089b62d19a283881f8 Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Mon, 15 Jun 2026 00:22:58 -0500
Subject: [PATCH 2/3] feat(inference): add comparison run-entry model and run
 enumeration helpers

---
 .../inference/utils/comparisonEntry.test.ts   |  94 ++++++++++++++
 .../inference/utils/comparisonEntry.ts        | 122 ++++++++++++++++++
 .../inference/utils/runEnumeration.test.ts    | 107 +++++++++++++++
 .../inference/utils/runEnumeration.ts         |  80 ++++++++++++
 4 files changed, 403 insertions(+)
 create mode 100644 packages/app/src/components/inference/utils/comparisonEntry.test.ts
 create mode 100644 packages/app/src/components/inference/utils/comparisonEntry.ts
 create mode 100644 packages/app/src/components/inference/utils/runEnumeration.test.ts
 create mode 100644 packages/app/src/components/inference/utils/runEnumeration.ts

diff --git a/packages/app/src/components/inference/utils/comparisonEntry.test.ts b/packages/app/src/components/inference/utils/comparisonEntry.test.ts
new file mode 100644
index 00000000..9eafa341
--- /dev/null
+++ b/packages/app/src/components/inference/utils/comparisonEntry.test.ts
@@ -0,0 +1,94 @@
+import { describe, expect, it } from 'vitest';
+
+import {
+  buildRunNumbering,
+  comparisonEntryDate,
+  comparisonEntryLabel,
+  comparisonEntrySortValue,
+  isRunComparisonEntry,
+  makeRunComparisonEntry,
+  parseComparisonEntry,
+  resolveComparisonEntries,
+} from './comparisonEntry';
+
+describe('comparisonEntry', () => {
+  const runEntry = makeRunComparisonEntry('2026-06-14', '27489075807');
+
+  it('round-trips a run entry through make/parse', () => {
+    expect(runEntry).toBe('2026-06-14~r27489075807');
+    expect(parseComparisonEntry(runEntry)).toEqual({
+      raw: runEntry,
+      date: '2026-06-14',
+      runId: '27489075807',
+    });
+  });
+
+  it('parses the legacy baked-index form, ignoring the index', () => {
+    const legacy = '2026-06-14~r27489075807~3of3';
+    expect(isRunComparisonEntry(legacy)).toBe(true);
+    expect(parseComparisonEntry(legacy)).toEqual({
+      raw: legacy,
+      date: '2026-06-14',
+      runId: '27489075807',
+    });
+  });
+
+  it('treats a plain date as a non-run entry', () => {
+    expect(parseComparisonEntry('2026-06-14')).toEqual({ raw: '2026-06-14', date: '2026-06-14' });
+    expect(isRunComparisonEntry('2026-06-14')).toBe(false);
+    expect(comparisonEntryDate(runEntry)).toBe('2026-06-14');
+  });
+
+  it('numbers run entries sequentially in chronological order, gap-free', () => {
+    // Two non-adjacent run ids (e.g. skipping a same-day MTP run) must still be #1, #2.
+    const a = makeRunComparisonEntry('2026-06-14', '27485974465'); // earlier
+    const b = makeRunComparisonEntry('2026-06-14', '27489075807'); // later
+    const numbering = buildRunNumbering(['2026-06-13', b, a]);
+    expect(numbering.get(a)).toBe(1);
+    expect(numbering.get(b)).toBe(2);
+    expect(numbering.has('2026-06-13')).toBe(false); // plain dates unnumbered
+  });
+
+  it('labels run entries with their sequential number and plain dates as-is', () => {
+    const a = makeRunComparisonEntry('2026-06-14', '27485974465');
+    const numbering = buildRunNumbering([a]);
+    expect(comparisonEntryLabel(a, numbering)).toBe('2026-06-14 #1');
+    expect(comparisonEntryLabel('2026-06-14', numbering)).toBe('2026-06-14');
+    expect(comparisonEntryLabel(a)).toBe('2026-06-14'); // no numbering → bare date
+  });
+
+  it('sorts by date then run id (plain date first within a day)', () => {
+    const later = makeRunComparisonEntry('2026-06-14', '300');
+    const earlier = makeRunComparisonEntry('2026-06-14', '100');
+    const sorted = ['2026-06-13', later, earlier, '2026-06-14'].toSorted((a, b) => {
+      const [ta, ia] = comparisonEntrySortValue(a);
+      const [tb, ib] = comparisonEntrySortValue(b);
+      return ta - tb || ia - ib;
+    });
+    expect(sorted).toEqual(['2026-06-13', '2026-06-14', earlier, later]);
+  });
+
+  describe('resolveComparisonEntries', () => {
+    const range = { startDate: '2026-06-13', endDate: '2026-06-14' };
+
+    it('keeps both range endpoints when no run entries overlap', () => {
+      expect(resolveComparisonEntries([], range)).toEqual(['2026-06-13', '2026-06-14']);
+    });
+
+    it('drops a range endpoint whose date has specific run entries', () => {
+      const r1 = makeRunComparisonEntry('2026-06-14', '100');
+      const r2 = makeRunComparisonEntry('2026-06-14', '200');
+      // 2026-06-14 endpoint dropped (runs cover it); 2026-06-13 endpoint kept.
+      expect(resolveComparisonEntries([r1, r2], range)).toEqual(['2026-06-13', r1, r2]);
+    });
+
+    it('returns just the added entries when no range is set', () => {
+      const r1 = makeRunComparisonEntry('2026-06-14', '100');
+      expect(resolveComparisonEntries([r1], { startDate: '', endDate: '' })).toEqual([r1]);
+    });
+  });
+
+  it('contains no characters that break a CSS class selector', () => {
+    expect(runEntry).not.toMatch(/[.#\s]/u); // NOTE(review): "~" is itself a CSS combinator, so selectors built from entries still need escaping
+  });
+});
diff --git a/packages/app/src/components/inference/utils/comparisonEntry.ts b/packages/app/src/components/inference/utils/comparisonEntry.ts
new file mode 100644
index 00000000..ad87cd2b
--- /dev/null
+++ b/packages/app/src/components/inference/utils/comparisonEntry.ts
@@ -0,0 +1,122 @@
+/**
+ * Comparison selections (the `selectedDates` array / `i_dates` URL param) are
+ * plain strings so they flow unchanged through the date-keyed GPU comparison
+ * pipeline (grouping, activeDates, colors, legend). An entry is one of:
+ *
+ *   - a plain date  — "2026-06-14"             → the whole day's latest run
+ *   - a run entry   — "2026-06-14~r27489075807" → one specific run
+ *
+ * The run's display number (#1, #2, …) is NOT baked into the string — it is
+ * derived at render time from the set of runs actually on the chart (see
+ * {@link buildRunNumbering}) so it is always sequential and in chronological
+ * order, and never goes stale when the run list changes.
+ *
+ * The separator is `~` (not `_`) because GPUGraph derives ids as
+ * `${entry}_${hwKey}_${precision}` and splits on the last `_`; a `_` in the entry
+ * would corrupt that. `~` is URL-safe and never appears in dates or run ids.
+ * (CSS selectors built from these ids are escaped — see d3-chart/layers/rooflines.)
+ */
+
+// Accepts `date~r<id>` and the legacy `date~r<id>~<i>of<n>` form (older saved selections);
+// a baked index is ignored. No g/y flag, so test()/exec() keep no lastIndex state.
+const RUN_ENTRY_RE = /^(?<date>\d{4}-\d{2}-\d{2})~r(?<runId>\d+)(?:~\d+of\d+)?$/u;
+
+export interface ParsedComparisonEntry {
+  /** The original entry string (series key). */
+  raw: string;
+  /** Calendar date of a run entry; for any other input, the raw string (not validated). */
+  date: string;
+  /** GitHub run id (digits only), present only when the entry pins a specific run. */
+  runId?: string;
+}
+
+/** Build the `date~r<runId>` entry string for one run on a date (inputs are not validated). */
+export function makeRunComparisonEntry(date: string, runId: string): string {
+  return `${date}~r${runId}`;
+}
+
+/** Parse an entry into its date and (optional) run id; non-run input comes back with date === raw. */
+export function parseComparisonEntry(raw: string): ParsedComparisonEntry {
+  const m = RUN_ENTRY_RE.exec(raw);
+  if (!m?.groups) return { raw, date: raw }; // not a run entry: the whole string is treated as the date
+  return { raw, date: m.groups.date, runId: m.groups.runId };
+}
+
+/** True when the entry pins a specific run (current or legacy form) rather than the date's latest. */
+export function isRunComparisonEntry(raw: string): boolean {
+  return RUN_ENTRY_RE.test(raw);
+}
+
+/** Underlying calendar date, for chronological sorting and matching; non-run entries are returned as-is. */
+export function comparisonEntryDate(raw: string): string {
+  return parseComparisonEntry(raw).date;
+}
+
+/**
+ * Sort key `[date as epoch ms, run id]`: run ids grow monotonically with time, so a
+ * date's runs read earliest → latest; a plain-date entry uses run id 0 and sorts
+ * first within its day. An unparsable date maps to 0.
+ */
+export function comparisonEntrySortValue(raw: string): [number, number] {
+  const { date, runId } = parseComparisonEntry(raw);
+  const t = new Date(date).getTime(); // NaN when the date string cannot be parsed
+  return [Number.isNaN(t) ? 0 : t, runId ? Number(runId) : 0];
+}
+
+/**
+ * Assign sequential 1-based numbers to the run entries in a set, restarting at 1
+ * for each date and ordered by numeric run id (run ids sort chronologically).
+ * Plain-date entries get no number. The map is keyed by the raw entry string.
+ */
+export function buildRunNumbering(entries: string[]): Map<string, number> {
+  const byDate = new Map<string, ParsedComparisonEntry[]>();
+  for (const raw of entries) {
+    const parsed = parseComparisonEntry(raw);
+    if (!parsed.runId) continue; // plain dates are never numbered
+    const list = byDate.get(parsed.date) ?? [];
+    list.push(parsed);
+    byDate.set(parsed.date, list);
+  }
+  const numbering = new Map<string, number>();
+  for (const list of byDate.values()) {
+    list
+      .toSorted((a, b) => Number(a.runId) - Number(b.runId)) // numeric compare: run ids are digit strings
+      .forEach((e, i) => {
+        numbering.set(e.raw, i + 1); // NOTE(review): a run present in both legacy and current form is numbered twice
+      });
+  }
+  return numbering;
+}
+
+/**
+ * Human-readable label for legends/line labels, e.g. "2026-06-14 #2". Pass the
+ * numbering from {@link buildRunNumbering} so run entries get their sequential
+ * number; plain dates, and run entries missing from the map, render as the date.
+ */
+export function comparisonEntryLabel(raw: string, numbering?: Map<string, number>): string {
+  const n = numbering?.get(raw); // undefined → falls back to the bare date below
+  return n ? `${comparisonEntryDate(raw)} #${n}` : comparisonEntryDate(raw);
+}
+
+/**
+ * Resolve the final set of comparison series entries from the user's selections:
+ * the date-range endpoints plus individually-added dates/runs, de-duplicated. A
+ * range endpoint is dropped when that same date has specific run entries selected
+ * — the whole-day "latest" line would just duplicate one of the numbered runs.
+ */
+export function resolveComparisonEntries(
+  selectedDates: string[],
+  range: { startDate: string; endDate: string },
+): string[] {
+  const datesWithRuns = new Set(
+    selectedDates.filter(isRunComparisonEntry).map(comparisonEntryDate),
+  );
+  const entries: string[] = [];
+  if (range.startDate && range.endDate) { // endpoints count only when both ends of the range are set
+    for (const d of [range.startDate, range.endDate]) {
+      if (!datesWithRuns.has(d)) entries.push(d);
+    }
+  }
+  entries.push(...selectedDates); // NOTE(review): a plain date listed here is kept even when that date has run entries
+  return [...new Set(entries)]; // de-duplicate, keeping first-seen order
+}
diff --git a/packages/app/src/components/inference/utils/runEnumeration.test.ts b/packages/app/src/components/inference/utils/runEnumeration.test.ts
new file mode 100644
index 00000000..fbb94280
--- /dev/null
+++ b/packages/app/src/components/inference/utils/runEnumeration.test.ts
@@ -0,0 +1,107 @@
+import { describe, expect, it } from 'vitest';
+
+import type { RunConfigRow } from '@/lib/api';
+
+import { dataRunsForDate } from './runEnumeration';
+
+function rc(over: Partial<RunConfigRow>): RunConfigRow {
+  const base: RunConfigRow = {
+    github_run_id: 1,
+    run_started_at: '2026-06-14T00:00:00Z',
+    html_url: null,
+    head_sha: null,
+    model: 'minimaxm3',
+    precision: 'fp8',
+    hardware: 'mi300x',
+    framework: 'vllm',
+    spec_method: 'none',
+    disagg: false,
+  };
+  return { ...base, ...over };
+}
+
+const SCOPE = {
+  modelDbKeys: ['minimaxm3'], // same model as the rc() defaults
+  selectedGPUs: ['mi300x_vllm'], // key the default (mi300x, vllm, non-MTP) row is expected to match
+  selectedPrecisions: ['fp8'], // same precision as the rc() defaults
+};
+
+describe('dataRunsForDate', () => {
+  it('enumerates distinct runs for the selected config, earliest first', () => {
+    const rows = [
+      rc({ github_run_id: 27489075807, run_started_at: '2026-06-14T06:43:25Z' }),
+      rc({ github_run_id: 27485974465, run_started_at: '2026-06-14T04:08:16Z' }),
+      rc({ github_run_id: 27510667862, run_started_at: '2026-06-14T23:22:40Z' }),
+    ];
+    const runs = dataRunsForDate(rows, SCOPE);
+    expect(runs.map((r) => r.runId)).toEqual(['27485974465', '27489075807', '27510667862']);
+  });
+
+  it('dedupes a run that appears in multiple matching rows into one entry', () => {
+    const rows = [
+      rc({ github_run_id: 100 }),
+      // Same run id on a second matching row (identical scope) must not add an entry.
+      rc({ github_run_id: 100 }),
+    ];
+    const runs = dataRunsForDate(rows, SCOPE);
+    expect(runs).toHaveLength(1);
+    expect(runs[0].runId).toBe('100');
+  });
+
+  it('excludes MTP runs when a non-MTP GPU key is selected', () => {
+    const rows = [
+      rc({ github_run_id: 1, spec_method: 'none' }),
+      rc({ github_run_id: 2, spec_method: 'mtp' }),
+    ];
+    const runs = dataRunsForDate(rows, SCOPE);
+    expect(runs.map((r) => r.runId)).toEqual(['1']);
+  });
+
+  it('includes only MTP runs when the MTP GPU key is selected', () => {
+    const rows = [
+      rc({ github_run_id: 1, spec_method: 'none' }),
+      rc({ github_run_id: 2, spec_method: 'mtp' }),
+    ];
+    const runs = dataRunsForDate(rows, { ...SCOPE, selectedGPUs: ['mi300x_vllm_mtp'] });
+    expect(runs.map((r) => r.runId)).toEqual(['2']);
+  });
+
+  it('excludes runs for other models, precisions, and GPUs', () => {
+    const rows = [
+      rc({ github_run_id: 1 }), // matches SCOPE on every field
+      rc({ github_run_id: 2, model: 'dsr1' }), // model not in modelDbKeys
+      rc({ github_run_id: 3, precision: 'fp4' }), // precision not selected
+      rc({ github_run_id: 4, hardware: 'b200' }), // hardware not in any selected GPU key
+      rc({ github_run_id: 5, framework: 'sglang' }), // framework not in any selected GPU key
+    ];
+    const runs = dataRunsForDate(rows, SCOPE);
+    expect(runs.map((r) => r.runId)).toEqual(['1']);
+  });
+
+  it('includes a run for any selected GPU (union across GPUs)', () => {
+    const rows = [
+      rc({ github_run_id: 1, hardware: 'mi300x', framework: 'vllm' }),
+      rc({ github_run_id: 2, hardware: 'b200', framework: 'vllm' }),
+    ];
+    const runs = dataRunsForDate(rows, { ...SCOPE, selectedGPUs: ['mi300x_vllm', 'b200_vllm'] });
+    expect(runs.map((r) => r.runId).toSorted()).toEqual(['1', '2']);
+  });
+
+  it('carries run url and head sha through', () => {
+    const rows = [
+      rc({
+        github_run_id: 7,
+        html_url: 'https://github.com/x/actions/runs/7',
+        head_sha: 'abc123',
+      }),
+    ];
+    const [run] = dataRunsForDate(rows, SCOPE);
+    expect(run.runUrl).toBe('https://github.com/x/actions/runs/7');
+    expect(run.headSha).toBe('abc123');
+  });
+
+  it('returns nothing when no run matches the selection', () => {
+    expect(dataRunsForDate([], SCOPE)).toEqual([]);
+    expect(dataRunsForDate([rc({ model: 'dsr1' })], SCOPE)).toEqual([]);
+  });
+});
diff --git a/packages/app/src/components/inference/utils/runEnumeration.ts b/packages/app/src/components/inference/utils/runEnumeration.ts
new file mode 100644
index 00000000..fda1ae30
--- /dev/null
+++ b/packages/app/src/components/inference/utils/runEnumeration.ts
@@ -0,0 +1,80 @@
+/**
+ * Enumerates the workflow runs that produced benchmark data for the currently
+ * selected model / GPU / precision on a given date. This is the single source of
+ * truth for "how many runs are on this date" used by both the changelog (to render
+ * a block per run) and the chart (to expand a plain-date selection into per-run
+ * series).
+ *
+ * It is intentionally DATA-driven (keyed off `runConfigs`, which comes from the
+ * benchmark rows) rather than changelog-driven: a run can ship data without a
+ * changelog entry, and that newest run is exactly the one the plain-date "latest"
+ * view shows — so enumerating from changelog entries alone would silently drop it.
+ *
+ * Runs are scoped to the selected GPUs using the canonical {@link getHardwareKey}
+ * so MTP and disagg variants (separate hw keys) are kept distinct, exactly as the
+ * chart keys them.
+ */
+
+import type { AggDataEntry } from '@/components/inference/types';
+import type { RunConfigRow } from '@/lib/api';
+import { getHardwareKey } from '@/lib/chart-utils';
+
+export interface DataRun {
+  /** GitHub workflow run id, stringified from the row's `github_run_id`. */
+  runId: string;
+  /** Start time from `run_started_at` (expected ISO-8601; '' when null); orders the runs. */
+  runStartedAt: string;
+  /** Workflow run URL (`html_url`); undefined when the row has none. */
+  runUrl?: string;
+  /** Head commit sha (`head_sha`), for the Git Commit link; undefined when absent. */
+  headSha?: string;
+}
+
+export interface RunScope {
+  /** DB model keys for the selected display model, e.g. ['minimaxm3']; a row may match any. */
+  modelDbKeys: string[];
+  /** Selected GPU hw keys, e.g. ['mi300x_vllm']; a row's hw key must equal one of them. */
+  selectedGPUs: string[];
+  /** Selected DB precisions, e.g. ['fp8']; a row's precision must equal one of them. */
+  selectedPrecisions: string[];
+}
+
+/**
+ * Hw key for a runConfig row, built via {@link getHardwareKey} like the chart's series keys.
+ */
+function runConfigHwKey(rc: RunConfigRow): string {
+  const { hardware: hw, framework, disagg, spec_method: spec_decoding } = rc;
+  // Only these four fields are supplied, so the object is cast to the full entry type.
+  const entryLike = { hw, framework, disagg, spec_decoding } as unknown as AggDataEntry;
+  return getHardwareKey(entryLike);
+}
+
+/**
+ * Distinct workflow runs with data for the scoped model, precision and GPUs,
+ * sorted ascending by start time so the UI's #1/#2/#3 follow the order the runs
+ * happened. A run id found on several rows yields one entry (first row wins).
+ */
+export function dataRunsForDate(runConfigs: RunConfigRow[], scope: RunScope): DataRun[] {
+  const wantedPrecisions = new Set(scope.selectedPrecisions);
+  const wantedGpus = new Set(scope.selectedGPUs);
+  const inScope = (row: RunConfigRow): boolean =>
+    scope.modelDbKeys.includes(row.model) &&
+    wantedPrecisions.has(row.precision) &&
+    wantedGpus.has(runConfigHwKey(row));
+
+  const seen = new Map<string, DataRun>();
+  for (const row of runConfigs.filter(inScope)) {
+    const runId = String(row.github_run_id);
+    if (seen.has(runId)) continue; // first matching row wins
+    seen.set(runId, {
+      runId,
+      runStartedAt: row.run_started_at ?? '',
+      runUrl: row.html_url ?? undefined,
+      headSha: row.head_sha ?? undefined,
+    });
+  }
+
+  const byStartTime = (a: DataRun, b: DataRun): number =>
+    a.runStartedAt.localeCompare(b.runStartedAt);
+  return [...seen.values()].toSorted(byStartTime);
+}

From a4855d29ad8e895fca98c766782f97f1c2908a5e Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Mon, 15 Jun 2026 00:23:07 -0500
Subject: [PATCH 3/3] feat(inference): compare individual runs with two-way
 legend/changelog binding

---
 .../components/inference/InferenceContext.tsx |  13 +-
 .../inference/hooks/useChartData.ts           |  27 +-
 .../app/src/components/inference/types.ts     |  11 +-
 .../components/inference/ui/ChartDisplay.tsx  |  86 ++++-
 .../inference/ui/ComparisonChangelog.tsx      | 358 +++++++++++++-----
 .../src/components/inference/ui/GPUGraph.tsx  |  85 ++++-
 .../hooks/api/use-comparison-changelogs.ts    |  72 +++-
 .../app/src/lib/d3-chart/layers/rooflines.ts  |  12 +-
 8 files changed, 514 insertions(+), 150 deletions(-)

diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index c2a6a2e7..f8e9f647 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -2,6 +2,7 @@
 
 import {
   type ReactNode,
+  type SetStateAction,
   createContext,
   useCallback,
   useContext,
@@ -57,6 +58,7 @@ import {
 import { filterRunsByModel, getDisplayLabel } from '@/lib/utils';
 
 import { useChartData } from './hooks/useChartData';
+import { resolveComparisonEntries } from './utils/comparisonEntry';
 
 /** @internal Exported for test provider wrapping only. */
 export const InferenceContext = createContext<InferenceChartContextType | undefined>(undefined);
@@ -416,7 +418,10 @@ export function InferenceProvider({
     [setSelectedGPUs, clearPresetOnChange],
   );
   const setSelectedDatesAndClear = useCallback(
-    (v: string[]) => {
+    // Accept a React state updater (value OR function) so callers adding several
+    // dates/runs in quick succession can use the functional form and avoid the
+    // stale-closure race where each click overwrites the last.
+    (v: SetStateAction<string[]>) => {
       setSelectedDates(v);
       clearPresetOnChange();
     },
@@ -564,11 +569,7 @@ export function InferenceProvider({
   );
 
   const allDateIds = useMemo(() => {
-    const dates: string[] = [];
-    if (selectedDateRange.startDate && selectedDateRange.endDate) {
-      dates.push(selectedDateRange.startDate, selectedDateRange.endDate);
-    }
-    dates.push(...selectedDates);
+    const dates = resolveComparisonEntries(selectedDates, selectedDateRange);
     const allIds = new Set<string>();
     selectedGPUs.forEach((gpu) => {
       dates.forEach((date) => allIds.add(`${date}_${gpu}`));
diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 3797318c..da8ac27a 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -13,6 +13,10 @@ import type {
   YAxisMetricKey,
 } from '@/components/inference/types';
 import { filterDataByCostLimit } from '@/components/inference/utils';
+import {
+  parseComparisonEntry,
+  resolveComparisonEntries,
+} from '@/components/inference/utils/comparisonEntry';
 import { useBenchmarks, benchmarkQueryOptions } from '@/hooks/api/use-benchmarks';
 import {
   GPU_ALIAS_TO_CANONICAL,
@@ -31,12 +35,11 @@ export function buildComparisonDates(
   selectedRunDate: string | undefined,
 ): string[] {
   if (selectedGPUs.length === 0) return [];
-  const dates: string[] = [];
-  if (selectedDateRange.startDate && selectedDateRange.endDate) {
-    dates.push(selectedDateRange.startDate, selectedDateRange.endDate);
-  }
-  dates.push(...selectedDates);
-  return [...new Set(dates.filter((d) => d !== selectedRunDate))];
+  // Range endpoints + individually-added dates/runs (redundant same-day range
+  // endpoints dropped), minus the main run date which the primary query covers.
+  return resolveComparisonEntries(selectedDates, selectedDateRange).filter(
+    (d) => d !== selectedRunDate,
+  );
 }
 
 /** Filter data by GPU key, resolving aliases to canonical keys. */
@@ -116,10 +119,16 @@ export function useChartData(
     [selectedGPUs, selectedDates, selectedDateRange, selectedRunDate],
   );
 
+  // Each comparison entry is either a plain date (latest run that day, exact-date
+  // query) or a specific run encoded as `date~r<runId>` (exact-run query) so
+  // multiple same-day runs can be compared as distinct series.
   const comparisonQueries = useQueries({
-    queries: comparisonDates.map((date) =>
-      benchmarkQueryOptions(selectedModel, date, enabled, true),
-    ),
+    queries: comparisonDates.map((entry) => {
+      const parsed = parseComparisonEntry(entry);
+      return parsed.runId
+        ? benchmarkQueryOptions(selectedModel, '', enabled, false, parsed.runId, true)
+        : benchmarkQueryOptions(selectedModel, entry, enabled, true);
+    }),
   });
 
   const comparisonLoading = comparisonQueries.some((q) => q.isLoading);
diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts
index cbf64787..cf468741 100644
--- a/packages/app/src/components/inference/types.ts
+++ b/packages/app/src/components/inference/types.ts
@@ -481,6 +481,14 @@ export interface ScatterGraphProps {
    * playback).
    */
   niceAxes?: boolean;
+  /**
+   * Stable run numbering (entry string `date~rRunId` → 1-based number) shared with
+   * the comparison changelog so legend labels match it exactly. Numbers index ALL
+   * of a date's runs (not just the ones on the chart), so a removed run leaves a
+   * gap that lines up with the changelog's still-listed "Add to chart" run. When
+   * omitted, GPUGraph falls back to gap-free numbering of the on-chart series.
+   */
+  runNumbering?: Map<string, number>;
 }
 /**
  * @file types.ts
@@ -639,7 +647,8 @@ export interface InferenceChartContextType {
   setSelectedGPUs: (gpus: string[]) => void;
   availableGPUs: { value: string; label: string }[];
   selectedDates: string[];
-  setSelectedDates: (dates: string[]) => void;
+  /** Accepts a value or a state-updater fn (for safe rapid successive adds). */
+  setSelectedDates: (dates: string[] | ((prev: string[]) => string[])) => void;
   selectedDateRange: { startDate: string; endDate: string };
   setSelectedDateRange: (dateRange: { startDate: string; endDate: string }) => void;
   userCosts: Record<string, number | undefined> | null;
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index 47dfe600..3c44a433 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -1,7 +1,8 @@
 'use client';
+import { DISPLAY_MODEL_TO_DB } from '@semianalysisai/inferencex-constants';
 import { track } from '@/lib/analytics';
 import dynamic from 'next/dynamic';
-import { useMemo, useRef, useState } from 'react';
+import { useEffect, useMemo, useRef, useState } from 'react';
 import { BarChart3, ChevronDown, Table2, X } from 'lucide-react';
 
 import chartDefinitions from '@/components/inference/inference-chart-config.json';
@@ -14,6 +15,11 @@ import type {
   TrendDataPoint,
 } from '@/components/inference/types';
 import { processOverlayChartData } from '@/components/inference/utils';
+import {
+  isRunComparisonEntry,
+  makeRunComparisonEntry,
+} from '@/components/inference/utils/comparisonEntry';
+import { dataRunsForDate } from '@/components/inference/utils/runEnumeration';
 import InferenceTable from '@/components/inference/ui/InferenceTable';
 import ScatterGraph from '@/components/inference/ui/ScatterGraph';
 import { Card } from '@/components/ui/card';
@@ -165,6 +171,58 @@ export default function ChartDisplay() {
     totalDatesQueried,
   } = useComparisonChangelogs(selectedGPUs, selectedDateRange, dateRangeAvailableDates);
 
+  const modelDbKeys = useMemo(
+    () => DISPLAY_MODEL_TO_DB[selectedModel] ?? [selectedModel],
+    [selectedModel],
+  );
+
+  // Stable run numbering shared by the changelog and the chart legend: each of a
+  // date's runs gets a fixed 1-based number (by start time) regardless of which
+  // are on the chart, so the two surfaces always show the same #N for a run and a
+  // removed run leaves a matching gap. Built from the same data-run enumeration
+  // the changelog uses.
+  const runNumbering = useMemo(() => {
+    const map = new Map<string, number>();
+    for (const c of changelogs) {
+      dataRunsForDate(c.runConfigs, { modelDbKeys, selectedGPUs, selectedPrecisions }).forEach(
+        (run, idx) => {
+          map.set(makeRunComparisonEntry(c.date, run.runId), idx + 1);
+        },
+      );
+    }
+    return map;
+  }, [changelogs, modelDbKeys, selectedGPUs, selectedPrecisions]);
+
+  // Expand a plain-date selection into one entry per run once that date's runs are
+  // known. Picking a date that has multiple runs shows each run as its own series
+  // (matching the changelog, which renders a block per run) instead of a single
+  // merged "latest" line with no changelog row — keeping the legend and changelog
+  // in sync. Idempotent: after expansion no expandable plain date remains.
+  useEffect(() => {
+    const runConfigsByDate = new Map(changelogs.map((c) => [c.date, c.runConfigs]));
+    const scope = { modelDbKeys, selectedGPUs, selectedPrecisions };
+    setSelectedDates((prev) => {
+      let changed = false;
+      const out: string[] = [];
+      for (const entry of prev) {
+        if (isRunComparisonEntry(entry)) {
+          out.push(entry);
+          continue;
+        }
+        const rc = runConfigsByDate.get(entry);
+        const runs = rc ? dataRunsForDate(rc, scope) : [];
+        if (runs.length > 1) {
+          changed = true;
+          for (const run of runs) out.push(makeRunComparisonEntry(entry, run.runId));
+        } else {
+          out.push(entry);
+        }
+      }
+      if (!changed) return prev;
+      return [...new Set(out)];
+    });
+  }, [changelogs, modelDbKeys, selectedGPUs, selectedPrecisions, selectedDates, setSelectedDates]);
+
   const [viewModes, setViewModes] = useState<Record<number, InferenceViewMode>>({});
   const replayHandlesRef = useRef<Record<number, ReplayLauncherHandle | null>>({});
   const getViewMode = (index: number): InferenceViewMode => viewModes[index] ?? 'chart';
@@ -342,7 +400,7 @@ export default function ChartDisplay() {
         <Card key={`skeleton-${index}`}>
           <Skeleton className="h-7 w-2/4 mb-1" />
           <Skeleton className="h-5 w-3/4 mb-2" />
-          <Skeleton className="h-[600px] w-full" />
+          <Skeleton className="h-150 w-full" />
         </Card>
       ))
     : effectiveGraphs.length === 0
@@ -546,9 +604,9 @@ export default function ChartDisplay() {
                       );
                     }
 
-                    return selectedDateRange.startDate &&
-                      selectedDateRange.endDate &&
-                      selectedGPUs.length > 0 ? (
+                    return selectedGPUs.length > 0 &&
+                      ((selectedDateRange.startDate && selectedDateRange.endDate) ||
+                        selectedDates.length > 0) ? (
                       <GPUGraph
                         chartId={`chart-${graphIndex}`}
                         modelLabel={graph.model}
@@ -561,6 +619,7 @@ export default function ChartDisplay() {
                         }`}
                         chartDefinition={graph.chartDefinition}
                         caption={chartCaption}
+                        runNumbering={runNumbering}
                       />
                     ) : (
                       <div className="relative">
@@ -583,10 +642,11 @@ export default function ChartDisplay() {
                           }
                         />
                         {selectedGPUs.length > 0 &&
-                          (!selectedDateRange.startDate || !selectedDateRange.endDate) && (
+                          (!selectedDateRange.startDate || !selectedDateRange.endDate) &&
+                          selectedDates.length === 0 && (
                             <div className="absolute inset-0 flex items-center justify-center bg-background/60 backdrop-blur-[2px] rounded-lg z-10">
                               <p className="text-sm font-medium text-muted-foreground bg-background/90 border border-border rounded-md px-4 py-2 shadow-sm">
-                                Select a date range to view GPU comparison
+                                Select a date range or add a run to view GPU comparison
                               </p>
                             </div>
                           )}
@@ -637,21 +697,21 @@ export default function ChartDisplay() {
                 changelogs={changelogs}
                 selectedGPUs={selectedGPUs}
                 selectedPrecisions={selectedPrecisions}
+                modelDbKeys={modelDbKeys}
                 loading={changelogsLoading}
                 totalDatesQueried={totalDatesQueried}
                 selectedDates={selectedDates}
                 selectedDateRange={selectedDateRange}
                 onAddDate={(date) => {
-                  if (!selectedDates.includes(date)) {
-                    setSelectedDates([...selectedDates, date]);
-                  }
+                  // Functional updater: adding several runs in quick succession must
+                  // each build on the latest state, not the value captured at render.
+                  setSelectedDates((prev) => (prev.includes(date) ? prev : [...prev, date]));
                 }}
                 onRemoveDate={(date) => {
-                  setSelectedDates(selectedDates.filter((d) => d !== date));
+                  setSelectedDates((prev) => prev.filter((d) => d !== date));
                 }}
                 onAddAllDates={(dates) => {
-                  const merged = [...new Set([...selectedDates, ...dates])];
-                  setSelectedDates(merged);
+                  setSelectedDates((prev) => [...new Set([...prev, ...dates])]);
                 }}
                 firstAvailableDate={dateRangeAvailableDates[0]}
               />
diff --git a/packages/app/src/components/inference/ui/ComparisonChangelog.tsx b/packages/app/src/components/inference/ui/ComparisonChangelog.tsx
index 8106fe90..61a7e28c 100644
--- a/packages/app/src/components/inference/ui/ComparisonChangelog.tsx
+++ b/packages/app/src/components/inference/ui/ComparisonChangelog.tsx
@@ -11,13 +11,35 @@ import {
   configKeyMatchesHwKey,
   formatChangelogDescription,
 } from '@/components/inference/utils/changelogFormatters';
+import { makeRunComparisonEntry } from '@/components/inference/utils/comparisonEntry';
+import { dataRunsForDate } from '@/components/inference/utils/runEnumeration';
 import { getHardwareConfig } from '@/lib/constants';
 import { getDisplayLabel, updateRepoUrl } from '@/lib/utils';
 
+/** One entry's formatted description (reads only `description`); GPU and run # sit in the title. */
+function renderDescription(
+  entry: { config_keys: string[]; description: string },
+  key: number | string,
+) {
+  return (
+    <div key={key} className="text-sm text-muted-foreground pl-5">
+      {formatChangelogDescription(entry.description)}
+    </div>
+  );
+}
+
 interface ComparisonChangelogProps {
   changelogs: ComparisonChangelogType[];
   selectedGPUs: string[];
   selectedPrecisions: string[];
+  /**
+   * DB model keys for the currently selected model (e.g. ['dsv4']). Changelog
+   * config keys are `<model>-<precision>-<gpu>-<framework>` and a GPU+framework
+   * like `b200-vllm` is shared across models, so without this filter the run list
+   * would offer other models' runs — which then plot nothing (the data fetch is
+   * model-scoped).
+   */
+  modelDbKeys: string[];
   loading?: boolean;
   totalDatesQueried: number;
   selectedDates: string[];
@@ -33,6 +55,7 @@ export default function ComparisonChangelog({
   changelogs,
   selectedGPUs,
   selectedPrecisions,
+  modelDbKeys,
   loading,
   totalDatesQueried,
   selectedDates,
@@ -63,7 +86,9 @@ export default function ComparisonChangelog({
         entry.config_keys.some((key) => {
           const precision = key.split('-')[1];
           return (
-            precSet.has(precision) && selectedGPUs.some((gpu) => configKeyMatchesHwKey(key, gpu))
+            modelDbKeys.some((m) => key.startsWith(`${m}-`)) &&
+            precSet.has(precision) &&
+            selectedGPUs.some((gpu) => configKeyMatchesHwKey(key, gpu))
           );
         }),
       ),
@@ -72,14 +97,14 @@ export default function ComparisonChangelog({
     // Ensure pinned dates are always present
     for (const date of pinnedDates) {
       if (!mapped.some((item) => item.date === date)) {
-        mapped.push({ date, entries: [] });
+        mapped.push({ date, entries: [], runs: [], runConfigs: [] });
       }
     }
 
     return mapped
       .filter((item) => item.entries.length > 0 || pinnedDates.has(item.date))
       .toSorted((a, b) => new Date(a.date).getTime() - new Date(b.date).getTime());
-  }, [changelogs, selectedGPUs, selectedPrecisions, pinnedDates]);
+  }, [changelogs, modelDbKeys, selectedGPUs, selectedPrecisions, pinnedDates]);
 
   const datesOnChart = useMemo(() => {
     const set = new Set(selectedDates);
@@ -88,17 +113,82 @@ export default function ComparisonChangelog({
     return set;
   }, [selectedDates, selectedDateRange]);
 
-  const addableDates = useMemo(
-    () => filteredChangelogs.map((c) => c.date).filter((d) => !datesOnChart.has(d)),
-    [filteredChangelogs, datesOnChart],
+  // True when a changelog entry touches one of the selected GPU configs at a
+  // selected precision — the same predicate used to filter the date list, reused
+  // to attach changelog notes to the runs that are worth offering as series.
+  const entryMatchesSelection = useMemo(() => {
+    const precSet = new Set(selectedPrecisions);
+    return (configKeys: string[]): boolean =>
+      configKeys.some((key) => {
+        const precision = key.split('-')[1];
+        return (
+          modelDbKeys.some((m) => key.startsWith(`${m}-`)) &&
+          precSet.has(precision) &&
+          selectedGPUs.some((gpu) => configKeyMatchesHwKey(key, gpu))
+        );
+      });
+  }, [modelDbKeys, selectedPrecisions, selectedGPUs]);
+
+  /**
+   * Every run that produced data for the selected config on a date, earliest
+   * first, with its changelog notes (if any) attached. Data-driven so a run that
+   * shipped data without a changelog entry still appears as its own series.
+   */
+  const runMetaFor = useMemo(
+    () => (item: (typeof filteredChangelogs)[number]) => {
+      const clByRun = new Map(item.runs.map((r) => [r.runId, r]));
+      return dataRunsForDate(item.runConfigs, {
+        modelDbKeys,
+        selectedGPUs,
+        selectedPrecisions,
+      }).map((run) => {
+        const cl = clByRun.get(run.runId);
+        return {
+          runId: run.runId,
+          headRef: cl?.headRef ?? run.headSha,
+          runUrl: cl?.runUrl ?? run.runUrl,
+          entries: (cl?.entries ?? []).filter((e) => entryMatchesSelection(e.config_keys)),
+        };
+      });
+    },
+    [modelDbKeys, selectedGPUs, selectedPrecisions, entryMatchesSelection],
   );
 
+  // Entries the "Add all to chart" button would add: every run not yet on the
+  // chart (run-level for multi-run dates, the plain date for single-run dates).
+  const addableEntries = useMemo(() => {
+    const out: string[] = [];
+    for (const item of filteredChangelogs) {
+      const runs = runMetaFor(item);
+      if (runs.length > 1) {
+        for (const run of runs) {
+          const entry = makeRunComparisonEntry(item.date, run.runId);
+          if (!selectedDates.includes(entry)) out.push(entry);
+        }
+      } else if (!datesOnChart.has(item.date)) {
+        out.push(item.date);
+      }
+    }
+    return out;
+  }, [filteredChangelogs, runMetaFor, selectedDates, datesOnChart]);
+
   const handleToggle = () => {
     const newState = !isExpanded;
     setIsExpanded(newState);
     track('inference_comparison_changelog_toggled', { expanded: newState });
   };
 
+  /** Display labels of the selected GPUs that a set of changelog entries touches. */
+  const gpuLabelsFor = (entries: { config_keys: string[] }[]): string => {
+    if (selectedGPUs.length <= 1) return '';
+    return selectedGPUs
+      .filter((gpu) =>
+        entries.some((e) => e.config_keys.some((k) => configKeyMatchesHwKey(k, gpu))),
+      )
+      .map((gpu) => getDisplayLabel(getHardwareConfig(gpu)))
+      .join(', ');
+  };
+
   const label =
     filteredChangelogs.length > 0
       ? `Config Changelog (${filteredChangelogs.length} date${filteredChangelogs.length === 1 ? '' : 's'} with changes)`
@@ -125,12 +215,12 @@ export default function ComparisonChangelog({
             <ChevronDown className="size-4 shrink-0 text-muted-foreground" />
           )}
         </button>
-        {isExpanded && addableDates.length > 0 && (
+        {isExpanded && addableEntries.length > 0 && (
           <button
             type="button"
             onClick={() => {
-              onAddAllDates(addableDates);
-              track('inference_changelog_add_all_dates', { count: addableDates.length });
+              onAddAllDates(addableEntries);
+              track('inference_changelog_add_all_dates', { count: addableEntries.length });
             }}
             className="text-xs font-medium text-brand hover:text-brand/80 transition-colors flex items-center gap-1"
           >
@@ -142,7 +232,7 @@ export default function ComparisonChangelog({
 
       <div
         className={`overflow-hidden transition-all duration-200 ease-in-out ${
-          isExpanded ? 'max-h-[4000px] opacity-100' : 'max-h-0 opacity-0'
+          isExpanded ? 'max-h-1000 opacity-100' : 'max-h-0 opacity-0'
         }`}
       >
         <div className="px-4 pt-2 pb-4 flex flex-col gap-3">
@@ -152,103 +242,179 @@ export default function ComparisonChangelog({
               range. Changelog tracking began Dec 30, 2025.
             </p>
           ) : (
-            filteredChangelogs.map((item) => (
-              <div key={item.date} className="flex flex-col gap-1">
-                <div className="flex items-center gap-2 flex-wrap">
-                  <span className="text-sm font-semibold">{item.date}</span>
-                  {item.entries.length > 0 && (
-                    <>
-                      <span className="text-muted-foreground">&mdash;</span>
-                      {item.headRef && (
-                        <a
-                          href={`https://github.com/SemiAnalysisAI/InferenceX/commit/${item.headRef}`}
-                          target="_blank"
-                          rel="noopener noreferrer"
-                          className="text-sm hover:underline text-foreground underline"
-                        >
-                          Git Commit
-                          <ExternalLinkIcon />
-                        </a>
+            filteredChangelogs.map((item) => {
+              const runs = runMetaFor(item);
+
+              // Multiple runs produced data for the selected config on this date →
+              // render each run as its own first-class entry (its own #, changelog,
+              // and add/remove). Includes runs with no changelog notes so the newest
+              // run is never dropped just because it lacked an entry.
+              if (runs.length > 1) {
+                return runs.map((run, idx) => {
+                  const entry = makeRunComparisonEntry(item.date, run.runId);
+                  const onChart = selectedDates.includes(entry);
+                  const gpuLabel = gpuLabelsFor(run.entries);
+                  return (
+                    <div key={entry} className="flex flex-col gap-1">
+                      <div className="flex items-center gap-2 flex-wrap">
+                        <span className="text-sm font-semibold">
+                          {item.date}
+                          {gpuLabel ? ` ${gpuLabel}` : ''} #{idx + 1}
+                        </span>
+                        <span className="text-muted-foreground">&mdash;</span>
+                        {run.headRef && (
+                          <a
+                            href={`https://github.com/SemiAnalysisAI/InferenceX/commit/${run.headRef}`}
+                            target="_blank"
+                            rel="noopener noreferrer"
+                            className="text-sm hover:underline text-foreground underline"
+                          >
+                            Git Commit
+                            <ExternalLinkIcon />
+                          </a>
+                        )}
+                        {run.runUrl && (
+                          <a
+                            href={updateRepoUrl(run.runUrl)}
+                            target="_blank"
+                            rel="noopener noreferrer"
+                            className="text-sm hover:underline text-foreground underline"
+                          >
+                            Workflow Run
+                            <ExternalLinkIcon />
+                          </a>
+                        )}
+                        {onChart ? (
+                          <button
+                            type="button"
+                            onClick={() => {
+                              onRemoveDate(entry);
+                              track('inference_changelog_remove_run', {
+                                date: item.date,
+                                run: run.runId,
+                              });
+                            }}
+                            className="text-xs font-medium text-muted-foreground hover:text-destructive transition-colors flex items-center gap-0.5"
+                          >
+                            <Minus className="size-3" />
+                            Remove from chart
+                          </button>
+                        ) : (
+                          <button
+                            type="button"
+                            onClick={() => {
+                              onAddDate(entry);
+                              track('inference_changelog_add_run', {
+                                date: item.date,
+                                run: run.runId,
+                              });
+                            }}
+                            className="text-xs font-medium text-brand hover:text-brand/80 transition-colors flex items-center gap-0.5"
+                          >
+                            <Plus className="size-3" />
+                            Add to chart
+                          </button>
+                        )}
+                      </div>
+                      {run.entries.length > 0 ? (
+                        run.entries.map((e, i) => renderDescription(e, i))
+                      ) : (
+                        <span className="text-sm text-muted-foreground italic pl-5">
+                          No changelog notes for this run
+                        </span>
                       )}
-                      {item.runUrl && (
-                        <a
-                          href={updateRepoUrl(item.runUrl)}
-                          target="_blank"
-                          rel="noopener noreferrer"
-                          className="text-sm hover:underline text-foreground underline"
+                    </div>
+                  );
+                });
+              }
+
+              // Single (or no) matching run → one block keyed by the date.
+              const dateGpuLabel = gpuLabelsFor(item.entries);
+              return (
+                <div key={item.date} className="flex flex-col gap-1">
+                  <div className="flex items-center gap-2 flex-wrap">
+                    <span className="text-sm font-semibold">
+                      {item.date}
+                      {dateGpuLabel ? ` ${dateGpuLabel}` : ''}
+                    </span>
+                    {item.entries.length > 0 && (
+                      <>
+                        <span className="text-muted-foreground">&mdash;</span>
+                        {item.headRef && (
+                          <a
+                            href={`https://github.com/SemiAnalysisAI/InferenceX/commit/${item.headRef}`}
+                            target="_blank"
+                            rel="noopener noreferrer"
+                            className="text-sm hover:underline text-foreground underline"
+                          >
+                            Git Commit
+                            <ExternalLinkIcon />
+                          </a>
+                        )}
+                        {item.runUrl && (
+                          <a
+                            href={updateRepoUrl(item.runUrl)}
+                            target="_blank"
+                            rel="noopener noreferrer"
+                            className="text-sm hover:underline text-foreground underline"
+                          >
+                            Workflow Run
+                            <ExternalLinkIcon />
+                          </a>
+                        )}
+                      </>
+                    )}
+                    {datesOnChart.has(item.date) ? (
+                      selectedDates.includes(item.date) ? (
+                        <button
+                          type="button"
+                          onClick={() => {
+                            onRemoveDate(item.date);
+                            track('inference_changelog_remove_date', { date: item.date });
+                          }}
+                          className="text-xs font-medium text-muted-foreground hover:text-destructive transition-colors flex items-center gap-0.5"
                         >
-                          Workflow Run
-                          <ExternalLinkIcon />
-                        </a>
-                      )}
-                    </>
-                  )}
-                  {datesOnChart.has(item.date) ? (
-                    selectedDates.includes(item.date) ? (
+                          <Minus className="size-3" />
+                          Remove from chart
+                        </button>
+                      ) : (
+                        <span className="text-xs text-muted-foreground flex items-center gap-0.5">
+                          <Lock className="size-3" />
+                          On chart
+                        </span>
+                      )
+                    ) : (
                       <button
                         type="button"
                         onClick={() => {
-                          onRemoveDate(item.date);
-                          track('inference_changelog_remove_date', { date: item.date });
+                          onAddDate(item.date);
+                          track('inference_changelog_add_date', { date: item.date });
                         }}
-                        className="text-xs font-medium text-muted-foreground hover:text-destructive transition-colors flex items-center gap-0.5"
+                        className="text-xs font-medium text-brand hover:text-brand/80 transition-colors flex items-center gap-0.5"
                       >
-                        <Minus className="size-3" />
-                        Remove from chart
+                        <Plus className="size-3" />
+                        Add to chart
                       </button>
-                    ) : (
-                      <span className="text-xs text-muted-foreground flex items-center gap-0.5">
-                        <Lock className="size-3" />
-                        On chart
-                      </span>
-                    )
+                    )}
+                  </div>
+                  {item.entries.length > 0 ? (
+                    item.entries.map((entry, entryIndex) => renderDescription(entry, entryIndex))
                   ) : (
-                    <button
-                      type="button"
-                      onClick={() => {
-                        onAddDate(item.date);
-                        track('inference_changelog_add_date', { date: item.date });
-                      }}
-                      className="text-xs font-medium text-brand hover:text-brand/80 transition-colors flex items-center gap-0.5"
-                    >
-                      <Plus className="size-3" />
-                      Add to chart
-                    </button>
+                    <span className="text-sm text-muted-foreground italic pl-5">
+                      {item.date === firstAvailableDate
+                        ? 'First benchmark run for this configuration'
+                        : item.date < '2025-12-30'
+                          ? 'No changelog data (tracking began Dec 30, 2025)'
+                          : filteredChangelogs.some(
+                                (c) => c.date < item.date && c.entries.length > 0,
+                              )
+                            ? 'No config changes — same configuration as previous run'
+                            : 'Initial configuration — no changelog entry recorded'}
+                    </span>
                   )}
                 </div>
-                {item.entries.length > 0 ? (
-                  item.entries.map((entry, entryIndex) => (
-                    <div key={entryIndex} className="text-sm text-muted-foreground pl-5">
-                      {selectedGPUs.length > 1 &&
-                        (() => {
-                          const matchingGpus = selectedGPUs.filter((gpu) =>
-                            entry.config_keys.some((key) => configKeyMatchesHwKey(key, gpu)),
-                          );
-                          const labels = matchingGpus.map((gpu) =>
-                            getDisplayLabel(getHardwareConfig(gpu)),
-                          );
-                          return labels.length > 0 ? (
-                            <span className="text-xs font-medium text-foreground/70">
-                              {labels.join(', ')}
-                            </span>
-                          ) : null;
-                        })()}
-                      {formatChangelogDescription(entry.description)}
-                    </div>
-                  ))
-                ) : (
-                  <span className="text-sm text-muted-foreground italic pl-5">
-                    {item.date === firstAvailableDate
-                      ? 'First benchmark run for this configuration'
-                      : item.date < '2025-12-30'
-                        ? 'No changelog data (tracking began Dec 30, 2025)'
-                        : filteredChangelogs.some((c) => c.date < item.date && c.entries.length > 0)
-                          ? 'No config changes — same configuration as previous run'
-                          : 'Initial configuration — no changelog entry recorded'}
-                  </span>
-                )}
-              </div>
-            ))
+              );
+            })
           )}
         </div>
       </div>
diff --git a/packages/app/src/components/inference/ui/GPUGraph.tsx b/packages/app/src/components/inference/ui/GPUGraph.tsx
index a6e76fcc..e7737a2e 100644
--- a/packages/app/src/components/inference/ui/GPUGraph.tsx
+++ b/packages/app/src/components/inference/ui/GPUGraph.tsx
@@ -38,6 +38,12 @@ import type {
   InferenceData,
   ScatterGraphProps,
 } from '@/components/inference/types';
+import {
+  buildRunNumbering,
+  comparisonEntryLabel,
+  comparisonEntrySortValue,
+  resolveComparisonEntries,
+} from '@/components/inference/utils/comparisonEntry';
 import {
   generateGPUGraphTooltipContent,
   getPointLabel,
@@ -55,16 +61,24 @@ const CHART_MARGIN = { top: 24, right: 10, bottom: 60, left: 60 };
 // both dimensions of the GPU comparison view are legible on the chart,
 // not only the legend. Falls back to the raw hwKey if the config
 // lookup misses (legacy data).
-function labelTextFor(pts: InferenceData[]): string {
+function labelTextFor(pts: InferenceData[], numbering: Map<string, number>): string {
   const hwKey = String(pts[0].hwKey);
-  const date = String(pts[0].date);
   const cfg = getHardwareConfig(hwKey);
   const hwLabel = cfg ? getDisplayLabel(cfg) : hwKey;
-  return `${hwLabel} • ${date}`;
+  return `${hwLabel} • ${comparisonEntryLabel(String(pts[0].date), numbering)}`;
 }
 
 const GPUGraph = React.memo(
-  ({ chartId, modelLabel, data, xLabel, yLabel, chartDefinition, caption }: ScatterGraphProps) => {
+  ({
+    chartId,
+    modelLabel,
+    data,
+    xLabel,
+    yLabel,
+    chartDefinition,
+    caption,
+    runNumbering: providedRunNumbering,
+  }: ScatterGraphProps) => {
     const {
       hardwareConfig,
       selectedPrecisions,
@@ -72,6 +86,7 @@ const GPUGraph = React.memo(
       selectedGPUs,
       selectedDateRange,
       selectedDates,
+      setSelectedDates,
       toggleActiveDate,
       removeActiveDate,
       activeDates,
@@ -94,21 +109,48 @@ const GPUGraph = React.memo(
     const { resolvedTheme } = useTheme();
     const chartRef = useRef<D3ChartHandle>(null);
 
-    // Shared date+GPU pairs
+    // Shared date+GPU pairs. `dates` holds comparison-series entries (plain dates
+    // and/or specific-run entries); a same-day range endpoint is dropped when that
+    // date also has run entries (resolveComparisonEntries), then sorted earliest →
+    // latest so a day's runs read #1 → #N.
     const gpuDatePairs = useMemo(() => {
-      const dates: string[] = [];
-      if (selectedDateRange.startDate && selectedDateRange.endDate && selectedGPUs.length > 0) {
-        dates.push(selectedDateRange.startDate, selectedDateRange.endDate);
-      }
-      dates.push(...selectedDates);
-      const deduplicated = [...new Set(dates)];
-      deduplicated.sort((a, b) => new Date(a).getTime() - new Date(b).getTime());
+      const deduplicated = resolveComparisonEntries(selectedDates, selectedDateRange);
+      deduplicated.sort((a, b) => {
+        const [ta, ia] = comparisonEntrySortValue(a);
+        const [tb, ib] = comparisonEntrySortValue(b);
+        return ta - tb || ia - ib;
+      });
       const sortedGPUs = [...selectedGPUs].toSorted(
         (a, b) => getModelSortIndex(a) - getModelSortIndex(b) || a.localeCompare(b),
       );
       return { dates: deduplicated, sortedGPUs };
     }, [selectedDateRange, selectedDates, selectedGPUs]);
 
+    // Run numbers for legend/line labels. Prefer the stable numbering passed by
+    // the parent (shared with the changelog, so labels match it and removed runs
+    // leave a gap); fall back to gap-free numbering of the on-chart series.
+    const runNumbering = useMemo(
+      () => providedRunNumbering ?? buildRunNumbering(gpuDatePairs.dates),
+      [providedRunNumbering, gpuDatePairs.dates],
+    );
+
+    // Removing a series from the legend should also drop it from the comparison
+    // selection so the config changelog stays in sync (two-way binding). Legend
+    // ids are `${entry}_${gpu}`, so look up the selected entry that rebuilds `id`
+    // exactly; suffix-stripping is ambiguous when one GPU key ends with `_${other}`.
+    // Range endpoints aren't individual selections, so those fall back to a visibility hide.
+    const handleLegendRemove = useCallback(
+      (id: string) => {
+        const entry = selectedDates.find((d) => selectedGPUs.some((g) => id === `${d}_${g}`));
+        if (entry !== undefined) {
+          setSelectedDates((prev) => prev.filter((e) => e !== entry));
+        } else {
+          removeActiveDate(id);
+        }
+      },
+      [selectedGPUs, selectedDates, setSelectedDates, removeActiveDate],
+    );
+
     const graphIdentifiers = useMemo(() => {
       const ids: string[] = [];
       gpuDatePairs.sortedGPUs.forEach((gpu) =>
@@ -391,7 +433,7 @@ const GPUGraph = React.memo(
                 pts[Math.max(0, Math.floor((pts.length * 2) / 3))],
                 pts.at(-1)!,
               ];
-              const labelText = labelTextFor(pts);
+              const labelText = labelTextFor(pts, runNumbering);
               let placedLabel = false;
               for (const pt of candidates) {
                 const px = xScale(pt.x);
@@ -431,7 +473,7 @@ const GPUGraph = React.memo(
               lineLabels.push({
                 key,
                 graphId,
-                label: labelTextFor(pts),
+                label: labelTextFor(pts, runNumbering),
                 color: getRooflineColor(key),
                 x: xScale(pt.x),
                 y: yScale(pt.y),
@@ -596,7 +638,14 @@ const GPUGraph = React.memo(
           });
         },
       }),
-      [showLineLabels, rooflines, isRooflineVisible, getRooflineColor, chartDefinition.chartType],
+      [
+        showLineLabels,
+        rooflines,
+        isRooflineVisible,
+        getRooflineColor,
+        chartDefinition.chartType,
+        runNumbering,
+      ],
     );
 
     // Dismiss tooltip when pinned point's combo is hidden
@@ -782,13 +831,13 @@ const GPUGraph = React.memo(
             disableActiveSort={true}
             onItemHover={handleLegendHover}
             onItemHoverEnd={handleLegendHoverEnd}
-            onItemRemove={removeActiveDate}
+            onItemRemove={handleLegendRemove}
             legendItems={allGraphs
               .filter(({ id }) => idsWithData.has(id))
               .map(({ date, color, hwKey, id }) => ({
-                name: `${hwKey} ${date}`,
+                name: `${hwKey} ${comparisonEntryLabel(date, runNumbering)}`,
                 hw: id,
-                label: date,
+                label: comparisonEntryLabel(date, runNumbering),
                 color,
                 title: getDisplayLabel(getHardwareConfig(hwKey)),
                 isActive: activeDates.has(id),
diff --git a/packages/app/src/hooks/api/use-comparison-changelogs.ts b/packages/app/src/hooks/api/use-comparison-changelogs.ts
index 7d3e2556..fd6a5ca9 100644
--- a/packages/app/src/hooks/api/use-comparison-changelogs.ts
+++ b/packages/app/src/hooks/api/use-comparison-changelogs.ts
@@ -1,17 +1,44 @@
 import { useQueries } from '@tanstack/react-query';
 import { useMemo } from 'react';
 
-import { fetchWorkflowInfo, type ChangelogRow, type WorkflowInfoResponse } from '@/lib/api';
+import {
+  fetchWorkflowInfo,
+  type ChangelogRow,
+  type RunConfigRow,
+  type WorkflowInfoResponse,
+} from '@/lib/api';
+
+export interface ComparisonChangelogEntry {
+  config_keys: string[];
+  description: string;
+  pr_link: string | null;
+}
+
+/** One workflow run on a date, with its own changelog entries. */
+export interface ComparisonRun {
+  /** GitHub run id (string). */
+  runId: string;
+  headRef?: string;
+  runUrl?: string;
+  /** Changelog entries attributed to this specific run. */
+  entries: ComparisonChangelogEntry[];
+}
 
 export interface ComparisonChangelog {
   date: string;
   headRef?: string;
   runUrl?: string;
-  entries: {
-    config_keys: string[];
-    description: string;
-    pr_link: string | null;
-  }[];
+  /** All of the date's changelog entries (flattened across runs). */
+  entries: ComparisonChangelogEntry[];
+  /** Runs on this date that have changelog entries, in chronological order (earliest first). */
+  runs: ComparisonRun[];
+  /**
+   * Per-(run, config) coverage from the benchmark data itself. Used to enumerate
+   * every run that produced data on this date — including runs without a changelog
+   * entry, which `runs` omits. The comparison UI keys run series off this so the
+   * newest run never silently vanishes just because it lacked changelog notes.
+   */
+  runConfigs: RunConfigRow[];
 }
 
 export function useComparisonChangelogs(
@@ -57,6 +84,37 @@ export function useComparisonChangelogs(
       const data = query.data as WorkflowInfoResponse;
       if (!data.changelogs || data.changelogs.length === 0) continue;
 
+      // Group changelog entries by the run that produced them. In the API
+      // response, changelog.workflow_run_id is the GitHub run id (see
+      // getChangelogByDate's `wr.github_run_id as workflow_run_id`).
+      const entriesByRun = new Map<number, ChangelogRow[]>();
+      for (const c of data.changelogs) {
+        const list = entriesByRun.get(c.workflow_run_id) ?? [];
+        list.push(c);
+        entriesByRun.set(c.workflow_run_id, list);
+      }
+
+      // Order runs chronologically (earliest first) so the #1/#2/#3 indices the
+      // UI assigns read in the order the runs actually happened.
+      const orderedRuns = [...data.runs].toSorted((a, b) =>
+        a.created_at.localeCompare(b.created_at),
+      );
+      const runs: ComparisonRun[] = orderedRuns
+        .map((run) => {
+          const runEntries = entriesByRun.get(run.github_run_id) ?? [];
+          return {
+            runId: String(run.github_run_id),
+            headRef: runEntries.at(-1)?.head_ref,
+            runUrl: run.html_url ?? undefined,
+            entries: runEntries.map((c) => ({
+              config_keys: c.config_keys,
+              description: c.description,
+              pr_link: c.pr_link,
+            })),
+          };
+        })
+        .filter((r) => r.entries.length > 0);
+
       results.push({
         date: datesToQuery[i],
         headRef: data.changelogs.at(-1)?.head_ref,
@@ -66,6 +124,8 @@ export function useComparisonChangelogs(
           description: c.description,
           pr_link: c.pr_link,
         })),
+        runs,
+        runConfigs: data.runConfigs ?? [],
       });
     }
 
diff --git a/packages/app/src/lib/d3-chart/layers/rooflines.ts b/packages/app/src/lib/d3-chart/layers/rooflines.ts
index 6f6b36d1..5513ef12 100644
--- a/packages/app/src/lib/d3-chart/layers/rooflines.ts
+++ b/packages/app/src/lib/d3-chart/layers/rooflines.ts
@@ -87,9 +87,19 @@ export function updateRooflinesOnZoom<T extends { x: number; y: number }>(
 
   Object.entries(rooflines).forEach(([key, points]) => {
     if (points.length < 2) return;
-    const selection = zoomGroup.select<SVGPathElement>(`.roofline-${key}`);
+    // Keys can contain characters that are invalid in a CSS selector (e.g. `~`
+    // from run-comparison series ids), so escape before selecting by class.
+    const selection = zoomGroup.select<SVGPathElement>(`.${cssEscapeToken(`roofline-${key}`)}`);
     if (!selection.empty()) {
       selection.attr('d', lineGenerator(points) as string);
     }
   });
 }
+
+/** Escape a class token for safe use in a CSS selector. */
+function cssEscapeToken(token: string): string {
+  if (typeof CSS !== 'undefined' && typeof CSS.escape === 'function') return CSS.escape(token);
+  // Fallback (no CSS.escape available): backslash-escape every char outside [a-zA-Z0-9_-].
+  // NOTE: skips CSS.escape's leading-digit handling; fine while tokens start with "roofline-".
+  return token.replaceAll(/[^a-zA-Z0-9_-]/gu, (c) => `\\${c}`);
+}