Skip to content

Commit ae99e3b

Browse files
committed
Refine leaderboard LLM display and logos
1 parent 0804747 commit ae99e3b

7 files changed

Lines changed: 140 additions & 39 deletions

File tree

static/app.js

Lines changed: 130 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,7 @@ function renderFrontierChart(data) {
314314
bodyFont: { family: "'Fira Code', monospace", size: 16 },
315315
callbacks: {
316316
title: (items) => labels[items[0].dataIndex] || '',
317-
label: c2 => `${c2.dataset.label}: ${c2.parsed.y !== null ? c2.parsed.y.toFixed(1) : '-'}`,
317+
label: c2 => `${getAgentDisplayLabel(data, c2.dataset.label)}: ${c2.parsed.y !== null ? c2.parsed.y.toFixed(1) : '-'}`,
318318
},
319319
},
320320
},
@@ -348,8 +348,6 @@ function renderFrontierChart(data) {
348348

349349
// Custom HTML legend with logos
350350
const legendEl = document.getElementById('chart-legend');
351-
const agentModelLabels = {};
352-
data.agents.forEach(agent => { agentModelLabels[agent] = getAgentModelLabel(data, agent); });
353351
legendEl.innerHTML = frontierChart.data.datasets.map((ds, i) => {
354352
const logo = getAgentLogo(ds.label);
355353
const logoHtml = logo ? `<img src="${logo}" alt="">` : '';
@@ -359,12 +357,24 @@ function renderFrontierChart(data) {
359357
if (ds.label.startsWith('Human')) {
360358
return `<div class="chart-legend-item"><span class="chart-legend-swatch dashed" style="border-color:${ds.borderColor}"></span>${ds.label}</div>`;
361359
}
362-
const modelLabel = agentModelLabels[ds.label];
360+
const displayLabel = getAgentDisplayLabel(data, ds.label);
361+
const modelLabel = getAgentSecondaryLabel(data, ds.label);
363362
const textHtml = modelLabel
364-
? `<span class="chart-legend-text"><span>${esc(ds.label)}</span><span class="chart-legend-model">${esc(modelLabel)}</span></span>`
365-
: `<span>${esc(ds.label)}</span>`;
363+
? `<span class="chart-legend-text"><span>${esc(displayLabel)}</span><span class="chart-legend-model">${esc(modelLabel)}</span></span>`
364+
: `<span>${esc(displayLabel)}</span>`;
366365
return `<div class="chart-legend-item">${logoHtml}<span class="chart-legend-swatch" style="background:${ds.borderColor}"></span>${textHtml}</div>`;
367366
}).join('');
367+
368+
const card = ctx.closest('.card');
369+
if (card) {
370+
let noteEl = card.querySelector('.dashboard-footnote.frontier-footnote');
371+
if (!noteEl) {
372+
noteEl = document.createElement('div');
373+
noteEl.className = 'dashboard-footnote frontier-footnote';
374+
ctx.parentElement.insertAdjacentElement('afterend', noteEl);
375+
}
376+
noteEl.innerHTML = researchHarnessFootnoteHtml();
377+
}
368378
}
369379

370380
function renderLeaderboard(data) {
@@ -396,11 +406,12 @@ function renderLeaderboard(data) {
396406
if (!costText) return `<span class="leaderboard-cell-meta"><span>${timeText}</span></span>`;
397407
return `<span class="leaderboard-cell-meta"><span>${costText}</span><span>${timeText}</span></span>`;
398408
}
399-
/**
 * Render one score cell of the leaderboard task table as a complete `<td>`.
 *
 * @param {?{score: number, run_id: *}} entry - Score entry; may be missing or
 *   carry a non-finite score.
 * @param {boolean} clickable - When true the cell gets an inline handler that
 *   calls `goToRun` with the entry's run id.
 * @param {string} [extraClass=''] - Extra class appended to the cell, used by
 *   callers for the group divider.
 * @returns {string} HTML for exactly one table cell.
 */
function renderScoreBlock(entry, clickable, extraClass = '') {
  const extraSuffix = extraClass ? ` ${extraClass}` : '';

  if (!entry || !Number.isFinite(entry.score)) {
    // Callers append the result straight into a <tr>, so the placeholder must
    // be a real cell too; it also has to keep the divider class so the group
    // border does not break on rows without a score.
    return `<td class="no-score${extraSuffix}"><span class="score-cell score-cell-empty">-</span></td>`;
  }

  const tdClass = `leaderboard-score-td${extraSuffix}`;
  const scoreHtml = `<span class="score-cell" style="${cellStyle(entry.score)}">${entry.score.toFixed(1)}</span>`;
  const inner = `<div class="leaderboard-score-wrap">${scoreHtml}${renderMetricLines(entry)}</div>`;
  if (!clickable) return `<td class="${tdClass}">${inner}</td>`;

  // The run id ends up inside a JS string literal inside an HTML attribute.
  // Anything outside a conservative id alphabet is emitted as a \uXXXX escape,
  // which is inert in both contexts; ordinary ids pass through unchanged.
  const runIdLiteral = String(entry.run_id).replace(
    /[^\w.:-]/g,
    ch => `\\u${ch.charCodeAt(0).toString(16).padStart(4, '0')}`,
  );
  return `<td class="${tdClass}" onclick="goToRun('${runIdLiteral}')">${inner}</td>`;
}
405416
function averageEntry(entries) {
406417
const scored = entries.filter(e => Number.isFinite(e?.score));
@@ -417,6 +428,26 @@ function renderLeaderboard(data) {
417428
.filter(Boolean)
418429
.reduce((best, entry) => !best || entry.score > best.score ? entry : best, null);
419430
}
431+
/**
 * Average score of one agent across every task in `data.tasks`.
 *
 * @param {string} agent - Key into `data.scores`.
 * @returns {number} The `score` of the averaged entry, or -Infinity when
 *   `averageEntry` yields no entry or no score.
 */
function averageScoreForAgent(agent) {
  const agentEntries = [];
  for (const task of data.tasks) {
    const taskEntry = data.scores[agent]?.[task];
    if (taskEntry) agentEntries.push(taskEntry);
  }
  const averaged = averageEntry(agentEntries);
  return averaged?.score ?? -Infinity;
}
434+
/**
 * Partition agent names into regular agents and ResearchHarness-backed LLMs.
 *
 * Regular agents keep their incoming order. LLMs are ordered by average score,
 * highest first, with ties broken by display label.
 *
 * @param {string[]} list - Agent names.
 * @returns {{agents: string[], llms: string[]}} The two groups.
 */
function splitAgentGroups(list) {
  const agents = [];
  const rankedLlms = [];
  for (const name of list) {
    if (isResearchHarnessAgent(name)) {
      // Compute the sort keys once per agent; doing it inside the comparator
      // re-averages every task on each comparison.
      rankedLlms.push({
        name,
        average: averageScoreForAgent(name),
        label: getAgentDisplayLabel(data, name),
      });
    } else {
      agents.push(name);
    }
  }

  rankedLlms.sort((a, b) => {
    const diff = b.average - a.average;
    // Two unscored agents are both -Infinity, so the difference is NaN; treat
    // that like an exact tie and fall through to the label comparison.
    if (!Number.isNaN(diff) && diff !== 0) return diff;
    return a.label.localeCompare(b.label);
  });

  return { agents, llms: rankedLlms.map(item => item.name) };
}
420451
function renderSummaryCell(entry) {
421452
if (!entry || !Number.isFinite(entry.score)) return '<td class="no-score leaderboard-static-cell">-</td>';
422453
const scoreHtml = `<span class="score-cell" style="${cellStyle(entry.score)}">${entry.score.toFixed(1)}</span>`;
@@ -450,52 +481,68 @@ function renderLeaderboard(data) {
450481
);
451482
return { agent, overall, domains: domainsMap };
452483
});
453-
rows.sort((a, b) => {
484+
const sortRows = rowsToSort => rowsToSort.sort((a, b) => {
454485
const av = Number.isFinite(a.overall?.score) ? a.overall.score : -Infinity;
455486
const bv = Number.isFinite(b.overall?.score) ? b.overall.score : -Infinity;
456487
if (bv !== av) return bv - av;
457-
return a.agent.localeCompare(b.agent);
488+
return getAgentDisplayLabel(data, a.agent).localeCompare(getAgentDisplayLabel(data, b.agent));
458489
});
459-
return { domains, rows };
490+
return {
491+
domains,
492+
agentRows: sortRows(rows.filter(row => !isResearchHarnessAgent(row.agent))),
493+
llmRows: sortRows(rows.filter(row => isResearchHarnessAgent(row.agent))),
494+
};
460495
}
461496

497+
const groupedAgents = splitAgentGroups(data.agents);
498+
const orderedTaskAgents = [...groupedAgents.agents, ...groupedAgents.llms];
499+
const firstLlmAgent = groupedAgents.agents.length && groupedAgents.llms.length ? groupedAgents.llms[0] : '';
462500
const domainSummary = summarizeByDomain();
463501

464-
let summaryHtml = '<table class="leaderboard leaderboard-summary"><thead><tr><th>Agent</th><th>Overall</th>';
502+
let summaryHtml = '<table class="leaderboard leaderboard-summary"><thead><tr><th>Agent/LLM</th><th>Overall</th>';
465503
domainSummary.domains.forEach(domain => {
466504
summaryHtml += `<th>${esc(domain)}</th>`;
467505
});
468506
summaryHtml += '</tr></thead><tbody>';
469-
domainSummary.rows.forEach((row, index) => {
470-
const modelLabel = getAgentModelLabel(data, row.agent);
471-
const modelHtml = modelLabel ? `<span class="leaderboard-agent-model">${esc(modelLabel)}</span>` : '';
472-
const medal = Number.isFinite(row.overall?.score) && index < 3 ? ['🥇', '🥈', '🥉'][index] : '';
473-
const medalHtml = medal ? `<span class="leaderboard-medal" aria-hidden="true">${medal}</span>` : '';
474-
summaryHtml += `<tr><td><div class="leaderboard-agent-row"><span class="leaderboard-agent-name">${medalHtml}${agentLogoHtml(row.agent, 18)}<span>${esc(row.agent)}</span></span>${modelHtml}</div></td>`;
475-
summaryHtml += renderSummaryCell(row.overall);
476-
domainSummary.domains.forEach(domain => {
477-
summaryHtml += renderSummaryCell(row.domains[domain]);
507+
/**
 * Append one `<tr>` per summary row to the enclosing `summaryHtml` string.
 *
 * The first three rows of the given list receive a medal when their overall
 * score is finite; positions are counted within this list only.
 *
 * @param {Array<{agent: string, overall: ?object, domains: object}>} rows -
 *   Rows of one group, already sorted.
 * @param {boolean} addDivider - When true, the first row is tagged with the
 *   `leaderboard-group-start-row` class.
 */
function appendSummaryRows(rows, addDivider) {
  const medals = ['🥇', '🥈', '🥉'];
  for (const [position, row] of rows.entries()) {
    const startsGroup = addDivider && position === 0;
    const trAttrs = startsGroup ? ' class="leaderboard-group-start-row"' : '';

    const primaryLabel = getAgentDisplayLabel(data, row.agent);
    const secondaryLabel = getAgentSecondaryLabel(data, row.agent);
    let secondaryHtml = '';
    if (secondaryLabel) {
      secondaryHtml = `<span class="leaderboard-agent-model">${esc(secondaryLabel)}</span>`;
    }

    const earnsMedal = Number.isFinite(row.overall?.score) && position < 3;
    const medalHtml = earnsMedal
      ? `<span class="leaderboard-medal" aria-hidden="true">${medals[position]}</span>`
      : '';

    const nameCell = `<td><div class="leaderboard-agent-row"><span class="leaderboard-agent-name">${medalHtml}${agentLogoHtml(row.agent, 18)}<span>${esc(primaryLabel)}</span></span>${secondaryHtml}</div></td>`;

    // Overall first, then one cell per domain in header order.
    const scoreCells = [
      row.overall,
      ...domainSummary.domains.map(domain => row.domains[domain]),
    ].map(entry => renderSummaryCell(entry));

    summaryHtml += `<tr${trAttrs}>${nameCell}${scoreCells.join('')}</tr>`;
  }
}
523+
appendSummaryRows(domainSummary.agentRows, false);
524+
appendSummaryRows(domainSummary.llmRows, domainSummary.agentRows.length > 0);
481525
summaryHtml += '</tbody></table>';
482526

483527
let taskHtml = '<table class="leaderboard"><thead><tr><th>Task</th>';
484-
data.agents.forEach(a => {
485-
const modelLabel = getAgentModelLabel(data, a);
528+
orderedTaskAgents.forEach(a => {
529+
const displayLabel = getAgentDisplayLabel(data, a);
530+
const modelLabel = getAgentSecondaryLabel(data, a);
486531
const modelHtml = modelLabel ? `<span class="leaderboard-agent-model">${esc(modelLabel)}</span>` : '';
487-
taskHtml += `<th><div class="leaderboard-agent-head">${agentLogoHtml(a, 20)}<span class="leaderboard-agent-name">${esc(a)}</span>${modelHtml}</div></th>`;
532+
const dividerClass = a === firstLlmAgent ? ' class="leaderboard-group-divider-left"' : '';
533+
taskHtml += `<th${dividerClass}><div class="leaderboard-agent-head">${agentLogoHtml(a, 20)}<span class="leaderboard-agent-name">${esc(displayLabel)}</span>${modelHtml}</div></th>`;
488534
});
489535
taskHtml += '<th>Frontier</th></tr></thead><tbody>';
490536

491537
data.tasks.forEach(task => {
492538
taskHtml += `<tr><td>${esc(task)}</td>`;
493-
data.agents.forEach(agent => {
539+
orderedTaskAgents.forEach(agent => {
494540
const entry = data.scores[agent]?.[task];
541+
const dividerClass = agent === firstLlmAgent ? 'leaderboard-group-divider-left' : '';
495542
if (entry) {
496-
taskHtml += renderScoreBlock(entry, true);
543+
taskHtml += renderScoreBlock(entry, true, dividerClass);
497544
} else {
498-
taskHtml += '<td class="no-score">-</td>';
545+
taskHtml += `<td class="no-score${dividerClass ? ` ${dividerClass}` : ''}">-</td>`;
499546
}
500547
});
501548
const frontier = frontierEntry(task);
@@ -509,13 +556,14 @@ function renderLeaderboard(data) {
509556

510557
// Average row — only count tasks that have scores
511558
taskHtml += '<tr class="frontier-row"><td>Average</td>';
512-
data.agents.forEach(agent => {
559+
orderedTaskAgents.forEach(agent => {
513560
const avgEntry = averageEntry(data.tasks.map(t => data.scores[agent]?.[t]).filter(Boolean));
561+
const dividerClass = agent === firstLlmAgent ? 'leaderboard-group-divider-left' : '';
514562
if (!avgEntry) {
515-
taskHtml += '<td class="no-score">-</td>';
563+
taskHtml += `<td class="no-score${dividerClass ? ` ${dividerClass}` : ''}">-</td>`;
516564
return;
517565
}
518-
taskHtml += renderScoreBlock(avgEntry, false);
566+
taskHtml += renderScoreBlock(avgEntry, false, dividerClass);
519567
});
520568
const frontierAvgEntry = averageEntry(data.tasks.map(frontierEntry).filter(Boolean));
521569
if (frontierAvgEntry) {
@@ -529,7 +577,8 @@ function renderLeaderboard(data) {
529577
<div class="leaderboard-stack">
530578
${renderSection('summary', 'By Domain', summaryHtml, 'Slide to view more domains')}
531579
${renderSection('task', 'By Task', taskHtml, 'Slide to view more agents', '<span class="leaderboard-note-icon" aria-hidden="true">👉</span> Click any scored cell to jump to run details')}
532-
</div>`;
580+
</div>
581+
<div class="dashboard-footnote leaderboard-footnote">${researchHarnessFootnoteHtml()}</div>`;
533582

534583
container.innerHTML = html;
535584
syncLeaderboardScrollbars();
@@ -1784,8 +1833,28 @@ function getAgentBaseLabel(name) {
17841833
return m ? m[1] : String(name);
17851834
}
17861835

1836+
/**
 * Resolve the vendor logo asset for a model name from its family prefix.
 *
 * Matching is case-insensitive and anchored at the start of the trimmed name.
 * A family prefix counts when the character after it is not a letter, so
 * "GPT-5", "GPT5" and "gpt_4o" all resolve to the OpenAI logo while an
 * unrelated word such as "GPTQ" does not. Qwen keeps its looser rule and
 * matches any name beginning with "Qwen".
 *
 * @param {*} model - Model name; nullish values are treated as empty.
 * @returns {string} Relative logo path, or '' when no family matches.
 */
function getModelLogo(model) {
  const label = String(model ?? '').trim();
  if (!label) return '';

  // `(?![a-z])` instead of `\b`: digits and underscores are word characters,
  // so `\b` rejected names such as "GPT5" or "GLM4.5".
  const mappings = [
    [/^GPT(?![a-z])/i, 'static/logos/openai.svg'],
    [/^Claude(?![a-z])/i, 'static/logos/anthropic.svg'],
    [/^Qwen/i, 'static/logos/qwen.png'],
    [/^GLM(?![a-z])/i, 'static/logos/glm.webp'],
    [/^Kimi(?![a-z])/i, 'static/logos/kimi.png'],
    [/^MiMo(?![a-z])/i, 'static/logos/mimo.png'],
    [/^Grok(?![a-z])/i, 'static/logos/grok.png'],
  ];
  const match = mappings.find(([pattern]) => pattern.test(label));
  return match ? match[1] : '';
}
1851+
17871852
/**
 * Pick the logo path for an agent or standalone LLM.
 *
 * Lookup order:
 *   1. a logo registered in `state.agentLogos` under the exact name;
 *   2. for ResearchHarness entries: the model-family logo, then the logo
 *      registered for the base label;
 *   3. for everything else: the base-label logo, then a model-family logo
 *      derived from the name itself.
 *
 * @param {string} name - Agent name as it appears in the data.
 * @returns {string} Logo path, or '' when nothing matches.
 */
function getAgentLogo(name) {
  const exactLogo = state.agentLogos[name];
  if (exactLogo) return exactLogo;

  // Resolved lazily so the base label is only derived when it is needed.
  const baseLogo = () => state.agentLogos[getAgentBaseLabel(name)];

  if (!isResearchHarnessAgent(name)) {
    return baseLogo() || getModelLogo(name) || '';
  }

  // NOTE(review): no data object is available here, so null is passed on;
  // confirm getAgentModelLabel tolerates that for names without "(model)".
  const modelName = getResearchHarnessModelName(null, name);
  return getModelLogo(modelName) || baseLogo() || '';
}
17901859

17911860
function agentLogoHtml(name, size = 16) {
@@ -1833,6 +1902,33 @@ function getAgentModelLabel(data, agent) {
18331902
return labels[0];
18341903
}
18351904

1905+
/**
 * Tell whether a name denotes a ResearchHarness entry, i.e. it starts with
 * the word "ResearchHarness" (case-sensitive) followed by a word boundary.
 *
 * @param {*} name - Agent name; falsy values are treated as ''.
 * @returns {boolean} True when the name has the ResearchHarness prefix.
 */
function isResearchHarnessAgent(name) {
  const label = String(name || '');
  return /^ResearchHarness\b/.test(label);
}
1908+
1909+
/**
 * Extract the model name of a ResearchHarness entry.
 *
 * Names of the exact form "ResearchHarness (<model>)" yield the parenthesised
 * text. Any other name falls back to `getAgentModelLabel(data, agent)`.
 *
 * @param {*} data - Dashboard data, forwarded to `getAgentModelLabel`.
 * @param {*} agent - Agent name; falsy values are treated as ''.
 * @returns {string} Model name, or '' when none can be determined.
 */
function getResearchHarnessModelName(data, agent) {
  const parsed = /^ResearchHarness \((.+)\)$/.exec(String(agent || ''));
  return parsed ? parsed[1] : (getAgentModelLabel(data, agent) || '');
}
1914+
1915+
/**
 * Primary label shown for an agent in charts and tables.
 *
 * "Frontier" and names starting with "Human" are returned as-is. A
 * ResearchHarness entry is shown under its model name when one is available.
 * Every other agent keeps its own name.
 *
 * @param {*} data - Dashboard data, forwarded to the model-name lookup.
 * @param {*} agent - Agent name; falsy values are treated as ''.
 * @returns {string} Label to display.
 */
function getAgentDisplayLabel(data, agent) {
  const ownLabel = String(agent || '');
  const isFixedLabel = agent === 'Frontier' || ownLabel.startsWith('Human');
  if (isFixedLabel || !isResearchHarnessAgent(agent)) return ownLabel;
  return getResearchHarnessModelName(data, agent) || ownLabel;
}
1920+
1921+
/**
 * Secondary (model) label shown beneath an agent's primary label.
 *
 * ResearchHarness entries never get one. For other agents the model label is
 * suppressed when it is missing or identical to the primary label.
 *
 * @param {*} data - Dashboard data, forwarded to the label helpers.
 * @param {*} agent - Agent name.
 * @returns {string} Model label, or '' when nothing should be shown.
 */
function getAgentSecondaryLabel(data, agent) {
  if (isResearchHarnessAgent(agent)) return '';

  const modelLabel = getAgentModelLabel(data, agent);
  if (!modelLabel) return '';

  const repeatsPrimary = modelLabel === getAgentDisplayLabel(data, agent);
  return repeatsPrimary ? '' : modelLabel;
}
1927+
1928+
/**
 * Footnote markup explaining how standalone LLM results were evaluated.
 *
 * @returns {string} Static HTML containing a link to the ResearchHarness
 *   repository that opens in a new tab.
 */
function researchHarnessFootnoteHtml() {
  const harnessLink = '<a href="https://github.com/black-yt/ResearchHarness" target="_blank" rel="noopener noreferrer">ResearchHarness</a>';
  return `Note: All standalone LLM results below are evaluated with ${harnessLink}.`;
}
1931+
18361932
let _durationTimer = null;
18371933
let _durationStart = null;
18381934

static/logos/glm.webp

1.56 KB
Loading

static/logos/grok.png

6.72 KB
Loading

static/logos/kimi.png

80.3 KB
Loading

static/logos/mimo.png

31.6 KB
Loading

static/logos/qwen.png

10.1 KB
Loading

static/style.css

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -256,11 +256,14 @@ body { height: 100vh; font-family: var(--font); font-size: 17px; line-height: 1.
256256
.chart-legend-model { font-size: 11px; color: var(--text-tertiary); text-transform: uppercase; letter-spacing: 0.05em; }
257257
.chart-legend-swatch { width: 14px; height: 3px; border-radius: 2px; }
258258
.chart-legend-swatch.dashed { border-top: 2px dashed; width: 18px; height: 0; background: none; }
259-
.chart-container { position: relative; width: 100%; height: 416px; }
260-
261-
.stats { display: flex; gap: 28px; }
262-
.stat { text-align: center; }
263-
.stat-value { font-size: 33.5px; font-weight: 700; color: var(--accent); letter-spacing: -1px; }
259+
.chart-container { position: relative; width: 100%; height: 416px; }
260+
.dashboard-footnote { margin-top: 10px; font-size: 12px; line-height: 1.5; color: var(--text-secondary); }
261+
.dashboard-footnote a { color: var(--accent); text-decoration: none; }
262+
.dashboard-footnote a:hover { text-decoration: underline; }
263+
264+
.stats { display: flex; gap: 28px; }
265+
.stat { text-align: center; }
266+
.stat-value { font-size: 33.5px; font-weight: 700; color: var(--accent); letter-spacing: -1px; }
264267
.stat-label { font-size: 12px; color: var(--text-tertiary); text-transform: uppercase; letter-spacing: 0.8px; margin-top: 1px; }
265268

266269
/* leaderboard */
@@ -304,6 +307,8 @@ body { height: 100vh; font-family: var(--font); font-size: 17px; line-height: 1.
304307
.leaderboard-summary th, .leaderboard-summary td { padding: 5px 4px; }
305308
.leaderboard-summary th:first-child, .leaderboard-summary td:first-child { min-width: 74px; }
306309
.leaderboard-summary th:not(:first-child), .leaderboard-summary td:not(:first-child) { width: 126px; min-width: 126px; max-width: 126px; }
310+
.leaderboard-group-start-row td { border-top: 3px solid var(--border-strong); }
311+
.leaderboard-group-divider-left { box-shadow: inset 3px 0 0 var(--border-strong); }
307312
.leaderboard-summary td { cursor: default; }
308313
.leaderboard-summary td:hover { background: transparent; }
309314
.leaderboard-static-cell { cursor: default; }

0 commit comments

Comments
 (0)