Skip to content

Commit ef3f932

Browse files
committed
benchmark: add LoCoMo dataset conversion + runner support
- Convert LoCoMo conversations to BM markdown notes with proper frontmatter, observations, relations, and MEMORY.md
- Runner now accepts --project, --queries, --limit flags
- LoCoMo conv-0 first run: 73.3% R@5, 90% multi-hop, 52% single-hop
- Dataset files gitignored (2.8MB JSON + 302 generated markdown files)
1 parent 8a17314 commit ef3f932

File tree

3 files changed

+355
-4
lines changed

3 files changed

+355
-4
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,3 +142,5 @@ vite.config.ts.timestamp-*
142142
bun.lock
143143

144144
.idea/
145+
benchmark/datasets/
146+
benchmark/corpus-locomo/

benchmark/convert-locomo.ts

Lines changed: 342 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,342 @@
1+
#!/usr/bin/env bun
2+
/**
3+
* Convert LoCoMo dataset into Basic Memory markdown corpus + queries.
4+
*
5+
* LoCoMo conversations → daily session notes (like an agent's memory)
6+
* LoCoMo QA annotations → benchmark queries with ground truth
7+
*
8+
* Usage:
9+
* bun benchmark/convert-locomo.ts # Convert all 10 conversations
10+
* bun benchmark/convert-locomo.ts --conv=0 # Convert conversation 0 only
11+
* bun benchmark/convert-locomo.ts --conv=0 --conv=1 # Multiple conversations
12+
*/
13+
14+
import { mkdir, readFile, writeFile } from "node:fs/promises"
15+
import { resolve } from "node:path"
16+
17+
// ---------------------------------------------------------------------------
18+
// Types
19+
// ---------------------------------------------------------------------------
20+
21+
/** A single utterance inside a LoCoMo session. */
interface LoCoMoTurn {
  /** Name of the participant who said this turn. */
  speaker: string
  /** The utterance itself; may span several lines. */
  text: string
  /** Dialog identifier such as "D1:3" (the number after "D" is the session). */
  dia_id: string
  /** Optional image URL attached to the turn (not read by this script). */
  img_url?: string
  /** Optional caption for the attached image (not read by this script). */
  blip_caption?: string
}
28+
29+
/** One question/answer annotation attached to a LoCoMo conversation. */
interface LoCoMoQA {
  /** The question text; becomes the benchmark query. */
  question: string
  /** Reference answer, when the annotation provides one. */
  answer?: string
  /** Answer field used as a fallback when `answer` is absent. */
  adversarial_answer?: string
  /** Numeric category code; translated to a label through CATEGORY_MAP. */
  category: number
  /** Dialog IDs (e.g. "D1:3") of the turns that support the answer. */
  evidence: string[]
}
36+
37+
/** One entry of the LoCoMo dataset: a multi-session conversation plus its annotations. */
interface LoCoMoConversation {
  /** Dataset-level identifier of the sample (not read by this script). */
  sample_id: string
  /**
   * Raw conversation payload. Keys read by the converter:
   * `speaker_a`, `speaker_b`, `session_<n>` (array of turns) and
   * `session_<n>_date_time` (e.g. "8:56 pm on 20 July, 2023").
   * Left as `any` because the value type differs per key.
   */
  conversation: Record<string, any>
  /** Question/answer annotations used to build the benchmark queries. */
  qa: LoCoMoQA[]
  /**
   * Per-session observations keyed by `session_<n>_observation`.
   * The converter accepts either a plain string or an object of the form
   * `{ "Speaker": [["observation text", "D1:3"], ...] }`, so the value is
   * `unknown` and must be narrowed at the point of use.
   */
  observation?: Record<string, unknown>
  /** Per-session summaries keyed by `session_<n>_summary`. */
  session_summary?: Record<string, string>
  /** Event summaries; currently unused by this script. */
  event_summary?: Record<string, unknown>
}
45+
46+
/** A single benchmark query as written to `queries.json`. */
interface BenchmarkQuery {
  /** Unique query id, e.g. "locomo_c0_q12". */
  id: string
  /** The question text to run against the corpus. */
  query: string
  /** Category label (a CATEGORY_MAP value, or "cat_<n>" for unknown codes). */
  category: string
  /** Corpus-relative paths of the notes that contain the evidence. */
  ground_truth: string[]
  /** Short answer text, when one is available and short enough to include. */
  expected_content?: string
  /** Free-form annotation, used for adversarial questions. */
  note?: string
}
54+
55+
// ---------------------------------------------------------------------------
56+
// Config
57+
// ---------------------------------------------------------------------------
58+
59+
// Directory that contains this script; inputs and outputs are resolved against it.
const BENCHMARK_DIR = resolve(import.meta.dirname!)
// Location of the LoCoMo dataset JSON read by main().
const DATASET_PATH = resolve(BENCHMARK_DIR, "datasets", "locomo10.json")
61+
62+
/**
 * Numeric LoCoMo QA category → label used in the generated queries.
 * Codes that are missing here are emitted as "cat_<n>" by the converter.
 *
 * NOTE(review): nothing in this file cross-checks this numbering — confirm it
 * against the LoCoMo dataset's own category definitions before comparing
 * per-category scores with published results.
 */
const CATEGORY_MAP: Record<number, string> = Object.fromEntries([
  [1, "single_hop"],
  [2, "multi_hop"],
  [3, "temporal"],
  [4, "open_domain"],
  [5, "adversarial"],
])
69+
70+
// ---------------------------------------------------------------------------
71+
// Helpers
72+
// ---------------------------------------------------------------------------
73+
74+
/** Lower-cased English month name → zero-padded month number. */
const MONTH_NUMBER_BY_NAME = new Map<string, string>([
  ["january", "01"], ["february", "02"], ["march", "03"], ["april", "04"],
  ["may", "05"], ["june", "06"], ["july", "07"], ["august", "08"],
  ["september", "09"], ["october", "10"], ["november", "11"], ["december", "12"],
])

/**
 * Parse a LoCoMo session timestamp into ISO-style date and 24-hour time.
 *
 * Example: "8:56 pm on 20 July, 2023" → { date: "2023-07-20", time: "20:56" }
 *
 * @param dateStr Timestamp text containing "<h>:<mm> am|pm on <day> <Month>[,] <year>".
 * @returns The parsed date/time, or `null` when the text does not match the
 *          pattern or the month name is not a full English month name.
 */
function parseDateTime(dateStr: string): { date: string; time: string } | null {
  const match = dateStr.match(
    /(\d{1,2}):(\d{2})\s*(am|pm)\s+on\s+(\d{1,2})\s+(\w+),?\s+(\d{4})/i,
  )
  if (!match) return null

  const [, hour = "", min = "", ampm = "", day = "", month = "", year = ""] = match

  // Convert the 12-hour clock to 24-hour: 12 am → 00, 12 pm stays 12.
  let h = Number.parseInt(hour, 10)
  const meridiem = ampm.toLowerCase()
  if (meridiem === "pm" && h !== 12) h += 12
  if (meridiem === "am" && h === 12) h = 0

  // The regex is case-insensitive, so the month lookup has to be as well.
  // A Map (rather than an object literal) keeps inherited keys such as
  // "constructor" from being mistaken for a month.
  const m = MONTH_NUMBER_BY_NAME.get(month.toLowerCase())
  if (!m) return null

  return {
    date: `${year}-${m}-${day.padStart(2, "0")}`,
    time: `${String(h).padStart(2, "0")}:${min}`,
  }
}
100+
101+
/**
 * Extract the session number from a LoCoMo dialog ID.
 *
 * Examples: "D1:3" → 1, "D15:7" → 15.
 *
 * Surrounding whitespace is ignored so that IDs taken from loosely formatted
 * evidence lists are still recognised.
 *
 * @param diaId Dialog identifier of the form "D<session>:<turn>".
 * @returns The session number, or `null` when the ID does not start with "D<digits>:".
 */
function dialogIdToSessionNum(diaId: string): number | null {
  const match = diaId.trim().match(/^D(\d+):/)
  const digits = match?.[1]
  return digits === undefined ? null : Number.parseInt(digits, 10)
}
106+
107+
// ---------------------------------------------------------------------------
108+
// Conversion
109+
// ---------------------------------------------------------------------------
110+
111+
/** Numeric suffix of a "session_<n>" key. */
function sessionKeyNumber(sessionKey: string): number {
  return Number.parseInt(sessionKey.slice("session_".length), 10)
}

/** Corpus-relative path of a speaker's note: lower-cased, whitespace runs → "-". */
function personNotePath(name: string): string {
  return `people/${name.toLowerCase().replace(/\s+/g, "-")}.md`
}

/** Markdown body of the note describing one conversation participant. */
function renderPersonNote(name: string, partner: string): string {
  return [
    "---",
    `title: ${name}`,
    "type: Person",
    "---",
    "",
    `# ${name}`,
    "",
    "## Observations",
    "- [role] Conversation participant",
    `- [relationship] Regularly chats with ${partner}`,
    "",
  ].join("\n")
}

/**
 * Date and time of a session, shared by note generation and ground-truth
 * mapping so both always agree on the note's file name.
 * Falls back to a synthetic "2023-01-<n>" date and "12:00" when the session has
 * no parseable `<sessionKey>_date_time`. The synthetic date is only an
 * identifier; it is not a valid calendar date for n > 31.
 */
function sessionStamp(
  c: LoCoMoConversation["conversation"],
  sessionKey: string,
  sessionNum: number,
): { date: string; time: string } {
  const raw: unknown = c[`${sessionKey}_date_time`]
  const parsed = typeof raw === "string" ? parseDateTime(raw) : null
  return {
    date: parsed?.date ?? `2023-01-${String(sessionNum).padStart(2, "0")}`,
    time: parsed?.time ?? "12:00",
  }
}

/**
 * Flatten a session observation into markdown bullet lines.
 * Accepts a plain string (returned as-is) or an object shaped like
 * `{ "Speaker": [["observation text", "D1:3"], ...] }`; anything else yields "".
 */
function renderObservation(rawObs: unknown): string {
  if (typeof rawObs === "string") return rawObs
  if (!rawObs || typeof rawObs !== "object") return ""

  const lines: string[] = []
  for (const [speaker, obs] of Object.entries(rawObs)) {
    if (!Array.isArray(obs)) continue
    for (const item of obs) {
      const text: unknown = Array.isArray(item) ? item[0] : item
      if (typeof text === "string") lines.push(`- [${speaker.toLowerCase()}] ${text}`)
    }
  }
  return lines.join("\n")
}

/**
 * First non-empty candidate rendered as text.
 * Parsed JSON is not guaranteed to match the declared `string` type (an answer
 * may be stored as a bare number), so every candidate is coerced explicitly.
 */
function firstAnswerText(...candidates: unknown[]): string {
  for (const candidate of candidates) {
    if (candidate === undefined || candidate === null) continue
    const text = String(candidate)
    if (text !== "") return text
  }
  return ""
}

/**
 * Convert one LoCoMo conversation into a markdown corpus and benchmark queries.
 *
 * Produces, keyed by corpus-relative path:
 * - `people/<slug>.md` for each of the two speakers,
 * - `conversations/<date>-session-<n>.md` for every `session_<n>` array,
 * - `MEMORY.md` listing the first observation of each session.
 *
 * Each QA annotation becomes a query whose ground truth is the set of session
 * notes referenced by its evidence dialog IDs.
 *
 * @param conv      The LoCoMo conversation to convert.
 * @param convIndex Index of the conversation in the dataset; used in query ids.
 * @returns The generated files (path → content) and the benchmark queries.
 */
function convertConversation(
  conv: LoCoMoConversation,
  convIndex: number,
): { files: Map<string, string>; queries: BenchmarkQuery[] } {
  const c = conv.conversation
  const speakerA: string = c.speaker_a || "Speaker A"
  const speakerB: string = c.speaker_b || "Speaker B"
  const files = new Map<string, string>()

  // Every "session_<n>" key that holds a turn array, in numeric order.
  const sessionKeys = Object.keys(c)
    .filter((k) => /^session_\d+$/.test(k) && Array.isArray(c[k]))
    .sort((a, b) => sessionKeyNumber(a) - sessionKeyNumber(b))

  // One person note per speaker.
  files.set(personNotePath(speakerA), renderPersonNote(speakerA, speakerB))
  files.set(personNotePath(speakerB), renderPersonNote(speakerB, speakerA))

  // MEMORY.md accumulates one key fact per session.
  const memoryLines: string[] = [
    `# Long-Term Memory`,
    "",
    `## People`,
    `- ${speakerA} and ${speakerB} are close friends who chat regularly`,
    "",
    `## Key Events`,
  ]

  // Convert each session to a dated note.
  for (const sessionKey of sessionKeys) {
    const sessionNum = sessionKeyNumber(sessionKey)
    const turns: LoCoMoTurn[] = c[sessionKey]
    const { date, time } = sessionStamp(c, sessionKey, sessionNum)

    const summary = conv.session_summary?.[`${sessionKey}_summary`] || ""
    const observation = renderObservation(conv.observation?.[`${sessionKey}_observation`])

    let content = [
      "---",
      `title: ${date} Session ${sessionNum}`,
      "type: note",
      `date: ${date}`,
      "---",
      "",
      `# ${date} — Session ${sessionNum}`,
      "",
      // Separator keeps the second speaker's name from running into the time.
      `*${speakerA} and ${speakerB} — ${time}*`,
      "",
      "",
    ].join("\n")

    // Observations take precedence over the free-text session summary.
    const overview = observation || summary
    if (overview) content += `## Summary\n${overview}\n\n`

    content += `## Conversation\n`
    for (const turn of turns) {
      // Continuation lines of a multi-line utterance are rendered as a quote.
      const text = (turn.text ?? "").replace(/\n/g, "\n> ")
      content += `**${turn.speaker}:** ${text}\n\n`
    }

    content += `## Relations\n`
    content += `- mentions [[${speakerA}]]\n`
    content += `- mentions [[${speakerB}]]\n`

    if (observation) {
      // Strip the leading "- [speaker] " tag; speaker names may contain spaces
      // or punctuation, so match anything up to the closing bracket.
      const firstObs = observation.split("\n")[0]?.replace(/^- \[[^\]]*\] /, "") || ""
      if (firstObs) memoryLines.push(`- [${date}] ${firstObs}`)
    }

    files.set(`conversations/${date}-session-${sessionNum}.md`, content)
  }

  files.set("MEMORY.md", memoryLines.join("\n") + "\n")

  // Convert QA annotations to benchmark queries.
  const queries: BenchmarkQuery[] = []
  for (const [qIdx, qa] of conv.qa.entries()) {
    const category = CATEGORY_MAP[qa.category] || `cat_${qa.category}`
    const answer = firstAnswerText(qa.answer, qa.adversarial_answer)

    // Map evidence dialog IDs to the session notes that contain them.
    const groundTruth = new Set<string>()
    for (const ev of qa.evidence || []) {
      if (typeof ev !== "string") continue
      // A single evidence entry may list several IDs ("D8:6; D9:17");
      // presumably rare — verify against the dataset — but harmless to support.
      for (const part of ev.split(/[;,]/)) {
        const sessionNum = dialogIdToSessionNum(part.trim())
        if (sessionNum === null) continue
        const { date } = sessionStamp(c, `session_${sessionNum}`, sessionNum)
        groundTruth.add(`conversations/${date}-session-${sessionNum}.md`)
      }
    }

    // For adversarial questions the answer is reported in `note` instead of
    // `expected_content`; the evidence files are still kept as ground truth.
    const isAdversarial = qa.category === 5

    queries.push({
      id: `locomo_c${convIndex}_q${qIdx}`,
      query: qa.question,
      category,
      ground_truth: [...groundTruth],
      expected_content: isAdversarial ? undefined : answer.length < 100 ? answer : undefined,
      note: isAdversarial ? `Adversarial: correct answer is "${answer}"` : undefined,
    })
  }

  return { files, queries }
}
276+
277+
// ---------------------------------------------------------------------------
278+
// Main
279+
// ---------------------------------------------------------------------------
280+
281+
/**
 * CLI entry point: load the LoCoMo dataset, convert the requested
 * conversations and write each corpus plus its `queries.json` under
 * `corpus-locomo/conv-<index>/`.
 *
 * Flags: `--conv=<index>` (repeatable) restricts the run to those
 * conversations; without it every conversation is converted. Invalid or
 * out-of-range indices are reported and skipped.
 *
 * @throws When the dataset cannot be read, is not valid JSON, or is not an array.
 */
async function main() {
  const convFlag = "--conv="
  const requested = process.argv
    .slice(2)
    .filter((a) => a.startsWith(convFlag))
    .map((a) => a.slice(convFlag.length))

  // Keep only well-formed indices and drop duplicates so a conversation is
  // never converted (and counted) twice.
  const convIndices: number[] = []
  for (const value of requested) {
    if (!/^\d+$/.test(value)) {
      console.error(` Ignoring invalid --conv value "${value}" (expected a non-negative integer)`)
      continue
    }
    const idx = Number.parseInt(value, 10)
    if (!convIndices.includes(idx)) convIndices.push(idx)
  }

  console.log("Loading LoCoMo dataset...")
  const raw = await readFile(DATASET_PATH, "utf-8")
  const parsed: unknown = JSON.parse(raw)
  if (!Array.isArray(parsed)) {
    throw new Error(`Expected ${DATASET_PATH} to contain a JSON array of conversations`)
  }
  const data = parsed as LoCoMoConversation[]
  console.log(` ${data.length} conversations loaded`)

  // An explicit --conv selection is honoured even if every value was invalid;
  // only a run without any --conv flag converts the whole dataset.
  const indices = requested.length > 0 ? convIndices : data.map((_, i) => i)
  let totalFiles = 0
  let totalQueries = 0
  let converted = 0

  for (const idx of indices) {
    const conv = data[idx]
    if (!conv) {
      console.error(` Conversation ${idx} not found, skipping`)
      continue
    }

    const convDir = `corpus-locomo/conv-${idx}`
    const outDir = resolve(BENCHMARK_DIR, convDir)

    console.log(`\nConverting conversation ${idx} (${conv.conversation.speaker_a} & ${conv.conversation.speaker_b})...`)

    const { files, queries } = convertConversation(conv, idx)

    // Write the markdown notes, creating parent directories on demand.
    for (const [path, content] of files) {
      const fullPath = resolve(outDir, path)
      await mkdir(resolve(fullPath, ".."), { recursive: true })
      await writeFile(fullPath, content)
    }

    // Write the queries next to the corpus.
    const queriesPath = resolve(outDir, "queries.json")
    await writeFile(queriesPath, JSON.stringify(queries, null, 2))

    console.log(` ${files.size} markdown files, ${queries.length} queries`)
    totalFiles += files.size
    totalQueries += queries.length
    converted += 1

    // Per-category breakdown, sorted by category label.
    const cats: Record<string, number> = {}
    for (const q of queries) {
      cats[q.category] = (cats[q.category] || 0) + 1
    }
    for (const [cat, count] of Object.entries(cats).sort()) {
      console.log(` ${cat}: ${count}`)
    }
  }

  // Report conversations actually converted, not merely requested.
  console.log(`\n✅ Total: ${totalFiles} files, ${totalQueries} queries across ${converted} conversations`)
  console.log(` Output: benchmark/corpus-locomo/`)
}
338+
339+
/** Log a fatal conversion error and terminate with a non-zero exit code. */
function reportConversionFailure(failure: unknown): never {
  console.error("Conversion failed:", failure)
  process.exit(1)
}

// Run the script; any rejection from main() ends the process with status 1.
main().catch(reportConversionFailure)

benchmark/run.ts

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -83,11 +83,17 @@ interface BenchmarkSummary {
8383

8484
const BENCHMARK_DIR = dirname(new URL(import.meta.url).pathname)
8585
const RESULTS_DIR = resolve(BENCHMARK_DIR, "results")
86-
const QUERIES_PATH = resolve(BENCHMARK_DIR, "queries.json")
8786

8887
const CORPUS_SIZE =
8988
process.argv.find((a) => a.startsWith("--corpus="))?.split("=")[1] || "small"
90-
const BM_PROJECT = "benchmark"
89+
const BM_PROJECT =
90+
process.argv.find((a) => a.startsWith("--project="))?.split("=")[1] || "benchmark"
91+
const QUERIES_PATH =
92+
process.argv.find((a) => a.startsWith("--queries="))?.split("=")[1] ||
93+
resolve(BENCHMARK_DIR, "queries.json")
94+
const QUERY_LIMIT = Number.parseInt(
95+
process.argv.find((a) => a.startsWith("--limit="))?.split("=")[1] || "0",
96+
) || 0
9197

9298
// ---------------------------------------------------------------------------
9399
// MCP Client
@@ -225,8 +231,9 @@ async function main() {
225231
console.log()
226232

227233
// Load queries
228-
const queriesRaw = await readFile(QUERIES_PATH, "utf-8")
229-
const queries: Query[] = JSON.parse(queriesRaw)
234+
const queriesRaw = await readFile(resolve(QUERIES_PATH), "utf-8")
235+
let queries: Query[] = JSON.parse(queriesRaw)
236+
if (QUERY_LIMIT > 0) queries = queries.slice(0, QUERY_LIMIT)
230237
console.log(
231238
`Loaded ${queries.length} queries across ${new Set(queries.map((q) => q.category)).size} categories`,
232239
)

0 commit comments

Comments
 (0)