Skip to content

Commit 2a6fdad

Browse files
feat: add EdTech student-tutor benchmark scenario (#18)
Adds a domain-specific benchmark scenario modelling AI-tutor conversations with hierarchical student facts (name, grade, school, favourite subject, GPA, learning style) and four fact updates that simulate real learning progressions.

Changes:
- simulator/scenarios/edtech.py — EDTECH_FACTS, EDTECH_FILLER_TURNS, and EDTECH_PERSONA_POOL (5 diverse student personas for multi-seed runs)
- simulator/conversation.py — generate_conversation() now accepts an optional filler_turns list for domain-specific Q&A interleaving
- evaluation/benchmark.py — run_benchmark() accepts a filler_turns kwarg and run_benchmark_multi_seed() accepts filler_turns and persona_pool kwargs, so any scenario can be plugged in
- main.py — --scenario flag (choices: default | edtech) resolves the right facts, filler, and persona pool before calling the benchmark

Usage:
    python main.py --scenario edtech
    python main.py --scenario edtech --seeds 5

Closes #13, closes #4
1 parent fd942b9 commit 2a6fdad

5 files changed

Lines changed: 162 additions & 10 deletions

File tree

evaluation/benchmark.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -77,16 +77,19 @@ def run_benchmark(
7777
provider: Optional["LLMProvider"] = None,
7878
decay: str = "ebbinghaus",
7979
progress: Optional[Callable[[str], None]] = None,
80+
filler_turns: Optional[List[str]] = None,
8081
) -> Dict[str, BackendResult]:
8182
"""
8283
Run the full MemoryLens benchmark.
8384
8485
Parameters
8586
----------
86-
decay : temporal decay function for CascadingTemporalMemory
87-
'ebbinghaus' (default) | 'exponential' | 'linear' | 'default'
88-
provider : LLMProvider | None
89-
When supplied, LLM answer+judge pass runs at every checkpoint.
87+
decay : temporal decay function for CascadingTemporalMemory
88+
'ebbinghaus' (default) | 'exponential' | 'linear' | 'default'
89+
provider : LLMProvider | None
90+
When supplied, LLM answer+judge pass runs at every checkpoint.
91+
filler_turns : domain-specific filler question list; defaults to the generic
92+
tech-QA set when None or empty.
9093
"""
9194
if eval_checkpoints is None:
9295
eval_checkpoints = [10, 25, 50, 75, 100]
@@ -96,7 +99,7 @@ def run_benchmark(
9699
backends = ["naive", "rag", "cascading"]
97100

98101
total_turns = max(total_turns, max(eval_checkpoints))
99-
events = generate_conversation(facts, total_turns)
102+
events = generate_conversation(facts, total_turns, filler_turns=filler_turns)
100103
checkpoint_set = set(eval_checkpoints)
101104
results: Dict[str, BackendResult] = {}
102105

@@ -234,6 +237,8 @@ def run_benchmark_multi_seed(
234237
provider: Optional["LLMProvider"] = None,
235238
decay: str = "ebbinghaus",
236239
progress: Optional[Callable[[str], None]] = None,
240+
persona_pool: Optional[List[List[Fact]]] = None,
241+
filler_turns: Optional[List[str]] = None,
237242
) -> Dict:
238243
"""
239244
Run the benchmark across multiple personas and aggregate with mean ± std.
@@ -251,11 +256,12 @@ def run_benchmark_multi_seed(
251256
if backends is None:
252257
backends = ["naive", "rag", "cascading"]
253258

254-
n_seeds = min(n_seeds, len(PERSONA_POOL))
259+
pool = persona_pool if persona_pool is not None else PERSONA_POOL
260+
n_seeds = min(n_seeds, len(pool))
255261
all_runs: List[Dict[str, BackendResult]] = []
256262

257263
for seed_idx in range(n_seeds):
258-
persona_facts = PERSONA_POOL[seed_idx]
264+
persona_facts = pool[seed_idx]
259265
if progress:
260266
progress(f"Seed {seed_idx + 1}/{n_seeds}{persona_facts[0].value} ...")
261267
run = run_benchmark(
@@ -265,6 +271,7 @@ def run_benchmark_multi_seed(
265271
backends=backends,
266272
provider=provider,
267273
decay=decay,
274+
filler_turns=filler_turns,
268275
)
269276
all_runs.append(run)
270277

main.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,9 @@ def main() -> None:
6666
parser.add_argument("--decay", type=str, default="ebbinghaus",
6767
choices=["ebbinghaus", "exponential", "linear", "default"],
6868
help="Temporal decay function for CascadingMemory warm tier")
69+
parser.add_argument("--scenario", type=str, default="default",
70+
choices=["default", "edtech"],
71+
help="Conversation scenario: default (tech Q&A) | edtech (student-tutor)")
6972
args = parser.parse_args()
7073

7174
# ── List providers ────────────────────────────────────────────────────────
@@ -101,6 +104,19 @@ def main() -> None:
101104

102105
multi_seed = args.seeds > 1
103106

107+
# ── Resolve scenario ─────────────────────────────────────────────────────
108+
scenario_facts = None
109+
scenario_filler = None
110+
scenario_pool = None
111+
112+
if args.scenario == "edtech":
113+
from simulator.scenarios.edtech import (
114+
EDTECH_FACTS, EDTECH_FILLER_TURNS, EDTECH_PERSONA_POOL,
115+
)
116+
scenario_facts = EDTECH_FACTS
117+
scenario_filler = EDTECH_FILLER_TURNS
118+
scenario_pool = EDTECH_PERSONA_POOL
119+
104120
# ── Banner ───────────────────────────────────────────────────────────────
105121
print("=" * 65)
106122
print(" MemoryLens -- LLM Memory Decay Benchmark")
@@ -109,6 +125,7 @@ def main() -> None:
109125
print(f" Checkpoints : {sorted(args.checkpoints)}")
110126
print(f" Backends : {args.backends}")
111127
print(f" Decay : {args.decay}")
128+
print(f" Scenario : {args.scenario}")
112129
if multi_seed:
113130
print(f" Seeds : {args.seeds} (multi-seed -- will report mean +/- std)")
114131
print(f" LLM eval : {'ON (' + provider.name + ')' if provider else 'OFF (content-only)'}")
@@ -125,6 +142,8 @@ def main() -> None:
125142
provider=provider,
126143
decay=args.decay,
127144
progress=print,
145+
persona_pool=scenario_pool,
146+
filler_turns=scenario_filler,
128147
)
129148
_print_multi_seed_results(aggregated, args.backends)
130149
_save(aggregated, args.output)
@@ -143,10 +162,12 @@ def main() -> None:
143162
raw = run_benchmark(
144163
total_turns=args.turns,
145164
eval_checkpoints=sorted(args.checkpoints),
165+
facts=scenario_facts,
146166
backends=args.backends,
147167
provider=provider,
148168
decay=args.decay,
149169
progress=print,
170+
filler_turns=scenario_filler,
150171
)
151172
display = results_to_display_dict(raw)
152173
_print_single_seed_results(display, args.backends)

simulator/conversation.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import List, Dict
1+
from typing import List, Dict, Optional
22
from .facts import Fact, BENCHMARK_FACTS
33

44
FILLER_TURNS = [
@@ -25,11 +25,24 @@
2525
]
2626

2727

28-
def generate_conversation(facts: List[Fact], total_turns: int) -> List[Dict]:
28+
def generate_conversation(
29+
facts: List[Fact],
30+
total_turns: int,
31+
filler_turns: Optional[List[str]] = None,
32+
) -> List[Dict]:
2933
"""
3034
Generate a list of conversation events across `total_turns` turns.
3135
Each event: {turn, role, content, is_fact, fact_key, is_update}
36+
37+
Parameters
38+
----------
39+
filler_turns : optional list of domain-specific filler questions.
40+
Defaults to the generic tech-QA FILLER_TURNS list when None or empty.
41+
Pass a domain-specific list (e.g. EDTECH_FILLER_TURNS) to
42+
run a scenario benchmark in a different conversational domain.
3243
"""
44+
_fillers = filler_turns if filler_turns else FILLER_TURNS
45+
3346
injection_map: Dict[int, Fact] = {f.injected_at: f for f in facts}
3447
update_map: Dict[int, Fact] = {
3548
f.updated_at: f for f in facts if f.updated_at is not None
@@ -54,7 +67,7 @@ def generate_conversation(facts: List[Fact], total_turns: int) -> List[Dict]:
5467
"is_fact": True, "fact_key": fact.key, "is_update": True,
5568
})
5669
else:
57-
msg = FILLER_TURNS[filler_idx % len(FILLER_TURNS)]
70+
msg = _fillers[filler_idx % len(_fillers)]
5871
filler_idx += 1
5972
events.append({
6073
"turn": turn, "role": "user",

simulator/scenarios/__init__.py

Whitespace-only changes.

simulator/scenarios/edtech.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
"""
2+
simulator/scenarios/edtech.py — EdTech (student-tutor) benchmark scenario.
3+
4+
Models a student interacting with an AI tutor over many turns. Facts cover
5+
hierarchical personal and academic attributes; updates simulate real learning
6+
progressions (grade improvement, subject change, learning-style refinement).
7+
8+
The filler turns are domain-specific tutoring requests (concept explanations,
9+
problem-solving help, study strategy questions) rather than generic tech Q&A,
10+
making this a harder benchmark for memory systems that rely on keyword overlap.
11+
12+
Closes #13 / #4.
13+
"""
14+
15+
from typing import List
16+
from simulator.facts import Fact
17+
18+
19+
# ── Fact sets ─────────────────────────────────────────────────────────────────
20+
21+
EDTECH_FACTS: List[Fact] = [
22+
# Identity
23+
Fact("name", "Priya Nair", injected_at=0),
24+
Fact("grade", "10th grade", injected_at=1, updated_at=45, updated_value="11th grade"),
25+
Fact("school", "Sunrise Academy", injected_at=2),
26+
# Academic profile
27+
Fact("favourite subject","mathematics", injected_at=4, updated_at=55, updated_value="physics"),
28+
Fact("weakest subject", "history", injected_at=6),
29+
Fact("current GPA", "3.2", injected_at=8, updated_at=70, updated_value="3.6"),
30+
# Learning preferences
31+
Fact("learning style", "visual learner", injected_at=10, updated_at=60, updated_value="hands-on learner"),
32+
Fact("study hours per day","2 hours", injected_at=12),
33+
]
34+
35+
36+
# ── Persona pool for multi-seed runs ─────────────────────────────────────────
37+
38+
EDTECH_PERSONA_POOL: List[List[Fact]] = [
39+
# Persona 0 — Priya Nair (baseline)
40+
EDTECH_FACTS,
41+
# Persona 1 — Carlos Mendez
42+
[
43+
Fact("name", "Carlos Mendez", injected_at=0),
44+
Fact("grade", "9th grade", injected_at=1, updated_at=45, updated_value="10th grade"),
45+
Fact("school", "Westbrook High", injected_at=2),
46+
Fact("favourite subject","biology", injected_at=4, updated_at=55, updated_value="chemistry"),
47+
Fact("weakest subject", "algebra", injected_at=6),
48+
Fact("current GPA", "2.9", injected_at=8, updated_at=70, updated_value="3.3"),
49+
Fact("learning style", "auditory learner", injected_at=10, updated_at=60, updated_value="reading/writing learner"),
50+
Fact("study hours per day","1.5 hours", injected_at=12),
51+
],
52+
# Persona 2 — Aisha Kamara
53+
[
54+
Fact("name", "Aisha Kamara", injected_at=0),
55+
Fact("grade", "11th grade", injected_at=1, updated_at=45, updated_value="12th grade"),
56+
Fact("school", "Greenfield IB", injected_at=2),
57+
Fact("favourite subject","literature", injected_at=4, updated_at=55, updated_value="psychology"),
58+
Fact("weakest subject", "calculus", injected_at=6),
59+
Fact("current GPA", "3.5", injected_at=8, updated_at=70, updated_value="3.8"),
60+
Fact("learning style", "reading/writing learner", injected_at=10, updated_at=60, updated_value="visual learner"),
61+
Fact("study hours per day","3 hours", injected_at=12),
62+
],
63+
# Persona 3 — Haruto Tanaka
64+
[
65+
Fact("name", "Haruto Tanaka", injected_at=0),
66+
Fact("grade", "8th grade", injected_at=1, updated_at=45, updated_value="9th grade"),
67+
Fact("school", "Sakura Middle", injected_at=2),
68+
Fact("favourite subject","computer science", injected_at=4, updated_at=55, updated_value="mathematics"),
69+
Fact("weakest subject", "essay writing", injected_at=6),
70+
Fact("current GPA", "3.0", injected_at=8, updated_at=70, updated_value="3.4"),
71+
Fact("learning style", "hands-on learner", injected_at=10, updated_at=60, updated_value="visual learner"),
72+
Fact("study hours per day","2.5 hours", injected_at=12),
73+
],
74+
# Persona 4 — Amelia Brooks
75+
[
76+
Fact("name", "Amelia Brooks", injected_at=0),
77+
Fact("grade", "12th grade", injected_at=1, updated_at=45, updated_value="1st year university"),
78+
Fact("school", "Oakdale High", injected_at=2),
79+
Fact("favourite subject","economics", injected_at=4, updated_at=55, updated_value="statistics"),
80+
Fact("weakest subject", "organic chemistry", injected_at=6),
81+
Fact("current GPA", "3.7", injected_at=8, updated_at=70, updated_value="3.9"),
82+
Fact("learning style", "auditory learner", injected_at=10, updated_at=60, updated_value="hands-on learner"),
83+
Fact("study hours per day","4 hours", injected_at=12),
84+
],
85+
]
86+
87+
88+
# ── Domain-specific filler turns ─────────────────────────────────────────────
89+
90+
EDTECH_FILLER_TURNS: List[str] = [
91+
"Can you explain the Pythagorean theorem with a real-world example?",
92+
"I'm struggling to understand photosynthesis. Can you break it down simply?",
93+
"What is the difference between mitosis and meiosis?",
94+
"How do I solve simultaneous equations using substitution?",
95+
"Can you explain Newton's three laws of motion?",
96+
"What are the key themes in Romeo and Juliet?",
97+
"How do I write a strong thesis statement for an essay?",
98+
"What is the difference between speed and velocity?",
99+
"Can you explain the water cycle to me?",
100+
"How do I factorise a quadratic expression?",
101+
"What caused the First World War?",
102+
"Can you help me understand the concept of supply and demand?",
103+
"What is the difference between an atom and a molecule?",
104+
"How do I find the area of a circle?",
105+
"Can you explain what DNA replication is?",
106+
"What is the significance of the French Revolution?",
107+
"How do I convert fractions to decimals?",
108+
"What are the main parts of a cell and their functions?",
109+
"Can you explain the concept of gravity?",
110+
"How do I improve my reading comprehension skills?",
111+
]

0 commit comments

Comments
 (0)