vectorize-io
diff --git a/‎catalog.json‎
Lines changed: 7 additions & 0 deletions b/‎catalog.json‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎ui/dist/assets/index-Be5t4Uo-.js‎
Lines changed: 65 additions & 0 deletions b/‎ui/dist/assets/index-Be5t4Uo-.js‎
Lines changed: 65 additions & 0 deletions
diff --git a/‎ui/dist/assets/index-Bx2-Tt8l.css‎
Lines changed: 0 additions & 1 deletion b/‎ui/dist/assets/index-Bx2-Tt8l.css‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎ui/dist/assets/index-CyAqyTbc.js‎
Lines changed: 0 additions & 65 deletions b/‎ui/dist/assets/index-CyAqyTbc.js‎
Lines changed: 0 additions & 65 deletions
diff --git a/‎ui/dist/assets/index-Ud8aUk_I.css‎
Lines changed: 1 addition & 0 deletions b/‎ui/dist/assets/index-Ud8aUk_I.css‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎ui/dist/index.html‎
Lines changed: 2 additions & 2 deletions b/‎ui/dist/index.html‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎ui/src/pages/DatasetDetail.vue‎
Lines changed: 6 additions & 0 deletions b/‎ui/src/pages/DatasetDetail.vue‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎ui/src/pages/RunDetail.vue‎
Lines changed: 9 additions & 1 deletion b/‎ui/src/pages/RunDetail.vue‎
Lines changed: 9 additions & 1 deletion
@@ -2,6 +2,7 @@
   "datasets": {
     "beam": {
       "description": "Long-context memory benchmark: 100 conversations (100K\u201310M tokens), 2 000 questions across 10 memory ability categories.",
+      "scoring_note": "Scoring follows the formula defined in the BEAM paper. Each question has a rubric with multiple items, and an LLM judge scores each item on a 0/0.5/1 scale; the question score is the average across its rubric items. For event-ordering questions, the Kendall tau-b rank correlation measures how well the predicted order matches the reference. The overall benchmark score is the mean of all individual question scores. Because partial credit is common (earning 0.5 on a rubric item is relatively easy) while a perfect 1.0 requires nailing every rubric item, BEAM scores sit on a different scale than pass/fail benchmarks.",
       "task": "LLM-judged",
       "splits": [
         "100k",
@@ -12,27 +13,31 @@
     },
     "lifebench": {
       "description": "Long-horizon multi-source personalized memory benchmark across 10 users.",
+      "scoring_note": "An LLM judge compares each answer against a gold-standard reference and grades it pass or fail. The overall score is the pass rate across all questions.",
       "task": "LLM-judged",
       "splits": [
         "en"
       ]
     },
     "locomo": {
       "description": "Multi-session long-term conversations with 1,986 QA pairs.",
+      "scoring_note": "An LLM judge compares each answer against a gold-standard reference and grades it pass or fail. The overall score is the pass rate across all questions.",
       "task": "LLM-judged",
       "splits": [
         "locomo10"
       ]
     },
     "longmemeval": {
       "description": "Long-term memory evaluation in LLM-based chat assistants.",
+      "scoring_note": "An LLM judge compares each answer against a gold-standard reference and grades it pass or fail, with special handling for time and date questions. The overall score is the pass rate across all questions.",
       "task": "LLM-judged",
       "splits": [
         "s"
       ]
     },
     "membench": {
       "description": "Agent memory at different abstraction levels and perspectives.",
+      "scoring_note": "Multiple-choice questions with a single correct answer. The model must select the right option; the score is the fraction of questions answered correctly.",
       "task": "MCQ",
       "splits": [
         "FirstAgentLowLevel",
@@ -43,6 +48,7 @@
     },
     "memsim": {
       "description": "Chinese daily-life memory simulation with diverse QA types.",
+      "scoring_note": "Multiple-choice questions with a single correct answer. The model must select the right option; the score is the fraction of questions answered correctly.",
       "task": "MCQ",
       "splits": [
         "simple",
@@ -55,6 +61,7 @@
     },
     "personamem": {
       "description": "Long-horizon personal preference tracking across conversation sessions.",
+      "scoring_note": "Multiple-choice questions with a single correct answer. The model must select the right option; the score is the fraction of questions answered correctly.",
       "task": "MCQ",
       "splits": [
         "32k",
 
@@ -36,8 +36,8 @@
 
   <!-- Analytics -->
   <script async src="https://analytics.hindsight.vectorize.io/script.js" data-website-id="d018abae-77ea-464d-a89d-404b1b305425"></script>
-  <script type="module" crossorigin src="/assets/index-CyAqyTbc.js"></script>
-  <link rel="stylesheet" crossorigin href="/assets/index-Bx2-Tt8l.css">
+  <script type="module" crossorigin src="/assets/index-Be5t4Uo-.js"></script>
+  <link rel="stylesheet" crossorigin href="/assets/index-Ud8aUk_I.css">
 </head>
 <body>
   <div id="app"></div>
 
@@ -483,6 +483,12 @@ function hasCategoryData(local, split) {
             </template>
           </template>
 
+          <!-- Scoring note -->
+          <div v-if="catalog.datasets?.[dataset]?.scoring_note" class="mt-6 mb-2 rounded-md bg-muted/50 px-4 py-3 max-w-3xl">
+            <p class="text-xs font-medium text-foreground/80 mb-1">How is this scored?</p>
+            <p class="text-xs text-muted-foreground leading-relaxed">{{ catalog.datasets[dataset].scoring_note }}</p>
+          </div>
+
           <!-- Unverified -->
           <template v-if="external.length">
             <div class="mb-3 mt-8">
 
@@ -18,6 +18,7 @@ const runPath = computed(() => {
 })
 
 const data        = ref(null)
+const catalog     = ref({ datasets: {} })
 const loading     = ref(true)
 const error       = ref(null)
 const filter      = ref('all')
@@ -34,7 +35,10 @@ async function load() {
   finally { loading.value = false }
 }
 
-onMounted(load)
+onMounted(() => {
+  load()
+  fetch('/api/catalog').then(r => r.json()).then(d => catalog.value = d).catch(() => {})
+})
 watch(runPath, load)
 
 const results = computed(() => data.value?.results ?? [])
@@ -341,6 +345,10 @@ function toggleCat(axis, cat) {
               <span class="font-semibold">{{ active.score != null ? 'Score: ' + active.score.toFixed(3) : (active.correct ? '✓ Correct' : '✗ Incorrect') }}</span>
               <span v-if="active.judge_reason" class="ml-2 opacity-75 font-normal">— {{ active.judge_reason }}</span>
             </div>
+            <details v-if="catalog.datasets?.[data?.dataset]?.scoring_note" class="mt-2 text-xs text-muted-foreground">
+              <summary class="cursor-pointer hover:text-foreground/80 transition-colors">How is this scored?</summary>
+              <p class="mt-1.5 leading-relaxed pl-3 border-l-2 border-muted">{{ catalog.datasets[data.dataset].scoring_note }}</p>
+            </details>
           </section>
 
           <hr class="border-border/40" />