Skip to content

Commit 3e92541

Browse files
committed
feat: add scoring notes for all datasets
Show "How is this scored?" on dataset pages (after the results table) and run detail pages (under the judge verdict). Each dataset explains its grading method: BEAM uses per-rubric-item scoring from the paper; LLM-judged datasets use pass/fail grading against gold references; MCQ datasets use exact match.
1 parent fb63307 commit 3e92541

8 files changed

Lines changed: 90 additions & 69 deletions

File tree

catalog.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
"datasets": {
33
"beam": {
44
"description": "Long-context memory benchmark: 100 conversations (100K\u201310M tokens), 2 000 questions across 10 memory ability categories.",
5+
"scoring_note": "Scoring follows the formula defined in the BEAM paper. Each question has a rubric with multiple items, and an LLM judge scores each item on a 0/0.5/1 scale; the question score is the average across its rubric items. For event ordering questions, Kendall tau-b rank correlation measures how well the predicted order matches the reference. The overall benchmark score is the mean of all individual question scores. Because partial credit is common (scoring 0.5 is straightforward) while achieving a perfect 1.0 requires nailing every rubric item, BEAM scores sit on a different scale than pass/fail benchmarks.",
56
"task": "LLM-judged",
67
"splits": [
78
"100k",
@@ -12,27 +13,31 @@
1213
},
1314
"lifebench": {
1415
"description": "Long-horizon multi-source personalized memory benchmark across 10 users.",
16+
"scoring_note": "An LLM judge compares each answer against a gold-standard reference and grades it pass or fail. The overall score is the pass rate across all questions.",
1517
"task": "LLM-judged",
1618
"splits": [
1719
"en"
1820
]
1921
},
2022
"locomo": {
2123
"description": "Multi-session long-term conversations with 1,986 QA pairs.",
24+
"scoring_note": "An LLM judge compares each answer against a gold-standard reference and grades it pass or fail. The overall score is the pass rate across all questions.",
2225
"task": "LLM-judged",
2326
"splits": [
2427
"locomo10"
2528
]
2629
},
2730
"longmemeval": {
2831
"description": "Long-term memory evaluation in LLM-based chat assistants.",
32+
"scoring_note": "An LLM judge compares each answer against a gold-standard reference and grades it pass or fail, with special handling for time and date questions. The overall score is the pass rate across all questions.",
2933
"task": "LLM-judged",
3034
"splits": [
3135
"s"
3236
]
3337
},
3438
"membench": {
3539
"description": "Agent memory at different abstraction levels and perspectives.",
40+
"scoring_note": "Multiple-choice questions with a single correct answer. The model must select the right option; the score is the fraction of questions answered correctly.",
3641
"task": "MCQ",
3742
"splits": [
3843
"FirstAgentLowLevel",
@@ -43,6 +48,7 @@
4348
},
4449
"memsim": {
4550
"description": "Chinese daily-life memory simulation with diverse QA types.",
51+
"scoring_note": "Multiple-choice questions with a single correct answer. The model must select the right option; the score is the fraction of questions answered correctly.",
4652
"task": "MCQ",
4753
"splits": [
4854
"simple",
@@ -55,6 +61,7 @@
5561
},
5662
"personamem": {
5763
"description": "Long-horizon personal preference tracking across conversation sessions.",
64+
"scoring_note": "Multiple-choice questions with a single correct answer. The model must select the right option; the score is the fraction of questions answered correctly.",
5865
"task": "MCQ",
5966
"splits": [
6067
"32k",

ui/dist/assets/index-Be5t4Uo-.js

Lines changed: 65 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ui/dist/assets/index-Bx2-Tt8l.css

Lines changed: 0 additions & 1 deletion
This file was deleted.

ui/dist/assets/index-CyAqyTbc.js

Lines changed: 0 additions & 65 deletions
This file was deleted.

ui/dist/assets/index-Ud8aUk_I.css

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ui/dist/index.html

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,8 @@
3636

3737
<!-- Analytics -->
3838
<script async src="https://analytics.hindsight.vectorize.io/script.js" data-website-id="d018abae-77ea-464d-a89d-404b1b305425"></script>
39-
<script type="module" crossorigin src="/assets/index-CyAqyTbc.js"></script>
40-
<link rel="stylesheet" crossorigin href="/assets/index-Bx2-Tt8l.css">
39+
<script type="module" crossorigin src="/assets/index-Be5t4Uo-.js"></script>
40+
<link rel="stylesheet" crossorigin href="/assets/index-Ud8aUk_I.css">
4141
</head>
4242
<body>
4343
<div id="app"></div>

ui/src/pages/DatasetDetail.vue

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -483,6 +483,12 @@ function hasCategoryData(local, split) {
483483
</template>
484484
</template>
485485
486+
<!-- Scoring note -->
487+
<div v-if="catalog.datasets?.[dataset]?.scoring_note" class="mt-6 mb-2 rounded-md bg-muted/50 px-4 py-3 max-w-3xl">
488+
<p class="text-xs font-medium text-foreground/80 mb-1">How is this scored?</p>
489+
<p class="text-xs text-muted-foreground leading-relaxed">{{ catalog.datasets[dataset].scoring_note }}</p>
490+
</div>
491+
486492
<!-- Unverified -->
487493
<template v-if="external.length">
488494
<div class="mb-3 mt-8">

ui/src/pages/RunDetail.vue

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ const runPath = computed(() => {
1818
})
1919
2020
const data = ref(null)
21+
const catalog = ref({ datasets: {} })
2122
const loading = ref(true)
2223
const error = ref(null)
2324
const filter = ref('all')
@@ -34,7 +35,10 @@ async function load() {
3435
finally { loading.value = false }
3536
}
3637
37-
onMounted(load)
38+
onMounted(() => {
39+
load()
40+
fetch('/api/catalog').then(r => r.json()).then(d => catalog.value = d).catch(() => {})
41+
})
3842
watch(runPath, load)
3943
4044
const results = computed(() => data.value?.results ?? [])
@@ -341,6 +345,10 @@ function toggleCat(axis, cat) {
341345
<span class="font-semibold">{{ active.score != null ? 'Score: ' + active.score.toFixed(3) : (active.correct ? '✓ Correct' : '✗ Incorrect') }}</span>
342346
<span v-if="active.judge_reason" class="ml-2 opacity-75 font-normal">— {{ active.judge_reason }}</span>
343347
</div>
348+
<details v-if="catalog.datasets?.[data?.dataset]?.scoring_note" class="mt-2 text-xs text-muted-foreground">
349+
<summary class="cursor-pointer hover:text-foreground/80 transition-colors">How is this scored?</summary>
350+
<p class="mt-1.5 leading-relaxed pl-3 border-l-2 border-muted">{{ catalog.datasets[data.dataset].scoring_note }}</p>
351+
</details>
344352
</section>
345353
346354
<hr class="border-border/40" />

0 commit comments

Comments
 (0)