{
"datasets": {
"beam": {
"description": "Long-context memory benchmark: 100 conversations (100K\u201310M tokens), 2 000 questions across 10 memory ability categories.",
"scoring_note": "Scoring follows the formula defined in the BEAM paper. Each question has a rubric with multiple items, and an LLM judge scores each item on a 0/0.5/1 scale; the question score is the average across its rubric items. For event ordering questions, Kendall tau-b rank correlation measures how well the predicted order matches the reference. The overall benchmark score is the mean of all individual question scores. Because partial credit is common (scoring 0.5 is straightforward) while achieving a perfect 1.0 requires nailing every rubric item, BEAM scores sit on a different scale than pass/fail benchmarks.",
"task": "LLM-judged",
"splits": [
"100k",
"500k",
"1m",
"10m"
]
},
"lifebench": {
"description": "Long-horizon multi-source personalized memory benchmark across 10 users.",
"scoring_note": "An LLM judge compares each answer against a gold-standard reference and grades it pass or fail. The overall score is the pass rate across all questions.",
"task": "LLM-judged",
"splits": [
"en"
]
},
"locomo": {
"description": "Multi-session long-term conversations with 1,986 QA pairs.",
"scoring_note": "An LLM judge compares each answer against a gold-standard reference and grades it pass or fail. The overall score is the pass rate across all questions.",
"task": "LLM-judged",
"splits": [
"locomo10"
]
},
"longmemeval": {
"description": "Long-term memory evaluation in LLM-based chat assistants.",
"scoring_note": "An LLM judge compares each answer against a gold-standard reference and grades it pass or fail, with special handling for time and date questions. The overall score is the pass rate across all questions.",
"task": "LLM-judged",
"splits": [
"s"
]
},
"membench": {
"description": "Agent memory at different abstraction levels and perspectives.",
"scoring_note": "Multiple-choice questions with a single correct answer. The model must select the right option; the score is the fraction of questions answered correctly.",
"task": "MCQ",
"splits": [
"FirstAgentLowLevel",
"FirstAgentHighLevel",
"ThirdAgentLowLevel",
"ThirdAgentHighLevel"
]
},
"memsim": {
"description": "Chinese daily-life memory simulation with diverse QA types.",
"scoring_note": "Multiple-choice questions with a single correct answer. The model must select the right option; the score is the fraction of questions answered correctly.",
"task": "MCQ",
"splits": [
"simple",
"conditional",
"comparative",
"aggregative",
"post_processing",
"noisy"
]
},
"personamem": {
"description": "Long-horizon personal preference tracking across conversation sessions.",
"scoring_note": "Multiple-choice questions with a single correct answer. The model must select the right option; the score is the fraction of questions answered correctly.",
"task": "MCQ",
"splits": [
"32k",
"128k",
"1M"
]
}
},
"providers": {
"bm25": {
"key": "bm25",
"description": "Keyword search baseline. No embeddings \u2014 splits docs into chunks and uses BM25 ranking.",
"kind": "local",
"link": null,
"logo": null
},
"cognee": {
"key": "cognee",
"description": "Graph-based knowledge extraction with FastEmbed (BAAI/bge-small-en-v1.5) + OpenAI LLM.",
"kind": "local",
"link": "https://cognee.ai",
"logo": "https://www.google.com/s2/favicons?sz=32&domain=cognee.ai"
},
"hindsight": {
"link": "https://hindsight.vectorize.io",
"logo": "https://www.google.com/s2/favicons?sz=32&domain=hindsight.vectorize.io",
"variants": {
"local": {
"key": "hindsight",
"description": "Embedded Hindsight fact store using gemini-2.5-flash-lite as the extraction model. Recall uses all memory types (world + experience + observation) with no type filter applied.",
"kind": "local"
},
"cloud": {
"key": "hindsight-cloud",
"description": "Hindsight hosted cloud API. Recall uses all memory types (world + experience + observation) with no type filter applied.",
"kind": "cloud"
},
"http": {
"key": "hindsight-http",
"description": "Hindsight via a self-hosted HTTP endpoint. Recall uses all memory types (world + experience + observation) with no type filter applied.",
"kind": "cloud"
}
}
},
"mastra": {
"key": "mastra",
"description": "Mastra semantic recall with LibSQL store and FastEmbed embeddings. topK=10.",
"kind": "local",
"link": "https://mastra.ai",
"logo": "https://www.google.com/s2/favicons?sz=32&domain=mastra.ai"
},
"mastra-om": {
"key": "mastra-om",
"description": "Mastra Observational Memory: observer/reflector pattern with Gemini 2.5 Flash + GPT-4o.",
"kind": "local",
"link": "https://mastra.ai",
"logo": "https://www.google.com/s2/favicons?sz=32&domain=mastra.ai"
},
"mem0": {
"link": "https://mem0.ai",
"logo": "https://www.google.com/s2/favicons?sz=32&domain=mem0.ai",
"variants": {
"local": {
"key": "mem0",
"description": "Agentic memory with Gemini 2.0 Flash for reflective extraction + local Qdrant store.",
"kind": "local"
},
"cloud": {
"key": "mem0-cloud",
"description": "Mem0 cloud API. Async indexing \u2014 waits for indexing to complete before eval. k=20.",
"kind": "cloud"
}
}
},
"hybrid-search": {
"key": "qdrant",
"description": "Hybrid dense + sparse vector search via Qdrant with RRF fusion. Dense: Qwen3-Embedding-0.6B (1024d, asymmetric query/doc encoding). Sparse: BM42 (bm42-all-minilm-l6-v2-attentions). Documents chunked into 512-token windows before indexing. Retrieves top-k=50 chunks (prefetch 100 per branch).",
"kind": "local",
"link": "https://qdrant.tech",
"logo": "https://www.google.com/s2/favicons?sz=32&domain=qdrant.tech"
},
"supermemory": {
"key": "supermemory",
"description": "Supermemory cloud API with temporal metadata support.",
"kind": "cloud",
"link": "https://supermemory.ai",
"logo": "https://www.google.com/s2/favicons?sz=32&domain=supermemory.ai"
}
},
"modes": {
"rag": {
"description": "Default. Provider retrieves top-k documents; they are injected into an LLM prompt as context. Supports both MCQ and open-ended questions."
},
"agentic-rag": {
"description": "The LLM acts as an agent with a recall tool and can make multiple retrieval calls with different queries before finalising its answer."
},
"agent": {
"description": "Bypasses the benchmark retrieval pipeline entirely \u2014 calls the provider's own native direct_answer() for providers that have built-in agentic answering."
}
}
}