feat: add EdTech student-tutor benchmark scenario (#18)

Priyanshu-byte-coder · web-flow · commit 2a6fdad4a00e · 2026-05-24T10:03:18.000+05:30
Adds a domain-specific benchmark scenario modelling AI-tutor conversations with hierarchical student facts (name, grade, school, favourite subject, weakest subject, GPA, learning style, study hours per day) and four fact updates (grade, favourite subject, GPA, learning style) that simulate real learning progressions.

Changes:
- simulator/scenarios/edtech.py — adds EDTECH_FACTS, EDTECH_FILLER_TURNS, and EDTECH_PERSONA_POOL (5 diverse student personas for multi-seed runs)
- simulator/conversation.py — generate_conversation() now accepts an optional filler_turns list for domain-specific Q&A interleaving
- evaluation/benchmark.py — run_benchmark() accepts a filler_turns kwarg, and run_benchmark_multi_seed() accepts filler_turns and persona_pool kwargs, so any scenario can be plugged in
- main.py — new --scenario flag (choices: default | edtech) resolves the right facts, filler turns, and persona pool before calling the benchmark

Usage:
    python main.py --scenario edtech
    python main.py --scenario edtech --seeds 5

Closes #13, closes #4
diff --git a/evaluation/benchmark.py b/evaluation/benchmark.py
@@ -77,16 +77,19 @@ def run_benchmark(
     provider:         Optional["LLMProvider"]      = None,
     decay:            str                          = "ebbinghaus",
     progress:         Optional[Callable[[str], None]] = None,
+    filler_turns:     Optional[List[str]]          = None,
 ) -> Dict[str, BackendResult]:
     """
     Run the full MemoryLens benchmark.
 
     Parameters
     ----------
-    decay    : temporal decay function for CascadingTemporalMemory
-               'ebbinghaus' (default) | 'exponential' | 'linear' | 'default'
-    provider : LLMProvider | None
-               When supplied, LLM answer+judge pass runs at every checkpoint.
+    decay        : temporal decay function for CascadingTemporalMemory
+                   'ebbinghaus' (default) | 'exponential' | 'linear' | 'default'
+    provider     : LLMProvider | None
+                   When supplied, LLM answer+judge pass runs at every checkpoint.
+    filler_turns : domain-specific filler question list; falls back to the
+                   generic tech-QA set when None or empty.
     """
     if eval_checkpoints is None:
         eval_checkpoints = [10, 25, 50, 75, 100]
@@ -96,7 +99,7 @@ def run_benchmark(
         backends = ["naive", "rag", "cascading"]
 
     total_turns = max(total_turns, max(eval_checkpoints))
-    events = generate_conversation(facts, total_turns)
+    events = generate_conversation(facts, total_turns, filler_turns=filler_turns)
     checkpoint_set = set(eval_checkpoints)
     results: Dict[str, BackendResult] = {}
 
@@ -234,6 +237,8 @@ def run_benchmark_multi_seed(
     provider:         Optional["LLMProvider"]      = None,
     decay:            str                          = "ebbinghaus",
     progress:         Optional[Callable[[str], None]] = None,
+    persona_pool:     Optional[List[List[Fact]]]   = None,
+    filler_turns:     Optional[List[str]]          = None,
 ) -> Dict:
     """
     Run the benchmark across multiple personas and aggregate with mean ± std.
@@ -251,11 +256,12 @@ def run_benchmark_multi_seed(
     if backends is None:
         backends = ["naive", "rag", "cascading"]
 
-    n_seeds = min(n_seeds, len(PERSONA_POOL))
+    pool = persona_pool if persona_pool is not None else PERSONA_POOL
+    n_seeds = min(n_seeds, len(pool))
     all_runs: List[Dict[str, BackendResult]] = []
 
     for seed_idx in range(n_seeds):
-        persona_facts = PERSONA_POOL[seed_idx]
+        persona_facts = pool[seed_idx]
         if progress:
             progress(f"Seed {seed_idx + 1}/{n_seeds} — {persona_facts[0].value} ...")
         run = run_benchmark(
@@ -265,6 +271,7 @@ def run_benchmark_multi_seed(
             backends=backends,
             provider=provider,
             decay=decay,
+            filler_turns=filler_turns,
         )
         all_runs.append(run)
 
diff --git a/main.py b/main.py
@@ -66,6 +66,9 @@ def main() -> None:
     parser.add_argument("--decay",       type=str,  default="ebbinghaus",
                         choices=["ebbinghaus", "exponential", "linear", "default"],
                         help="Temporal decay function for CascadingMemory warm tier")
+    parser.add_argument("--scenario",    type=str,  default="default",
+                        choices=["default", "edtech"],
+                        help="Conversation scenario: default (tech Q&A) | edtech (student-tutor)")
     args = parser.parse_args()
 
     # ── List providers ────────────────────────────────────────────────────────
@@ -101,6 +104,19 @@ def main() -> None:
 
     multi_seed = args.seeds > 1
 
+    # ── Resolve scenario ─────────────────────────────────────────────────────
+    scenario_facts  = None
+    scenario_filler = None
+    scenario_pool   = None
+
+    if args.scenario == "edtech":
+        from simulator.scenarios.edtech import (
+            EDTECH_FACTS, EDTECH_FILLER_TURNS, EDTECH_PERSONA_POOL,
+        )
+        scenario_facts  = EDTECH_FACTS
+        scenario_filler = EDTECH_FILLER_TURNS
+        scenario_pool   = EDTECH_PERSONA_POOL
+
     # ── Banner ───────────────────────────────────────────────────────────────
     print("=" * 65)
     print("  MemoryLens -- LLM Memory Decay Benchmark")
@@ -109,6 +125,7 @@ def main() -> None:
     print(f"  Checkpoints : {sorted(args.checkpoints)}")
     print(f"  Backends    : {args.backends}")
     print(f"  Decay       : {args.decay}")
+    print(f"  Scenario    : {args.scenario}")
     if multi_seed:
         print(f"  Seeds       : {args.seeds} (multi-seed -- will report mean +/- std)")
     print(f"  LLM eval    : {'ON  (' + provider.name + ')' if provider else 'OFF (content-only)'}")
@@ -125,6 +142,8 @@ def main() -> None:
             provider=provider,
             decay=args.decay,
             progress=print,
+            persona_pool=scenario_pool,
+            filler_turns=scenario_filler,
         )
         _print_multi_seed_results(aggregated, args.backends)
         _save(aggregated, args.output)
@@ -143,10 +162,12 @@ def main() -> None:
         raw = run_benchmark(
             total_turns=args.turns,
             eval_checkpoints=sorted(args.checkpoints),
+            facts=scenario_facts,
             backends=args.backends,
             provider=provider,
             decay=args.decay,
             progress=print,
+            filler_turns=scenario_filler,
         )
         display = results_to_display_dict(raw)
         _print_single_seed_results(display, args.backends)
diff --git a/simulator/conversation.py b/simulator/conversation.py
@@ -1,4 +1,4 @@
-from typing import List, Dict
+from typing import List, Dict, Optional
 from .facts import Fact, BENCHMARK_FACTS
 
 FILLER_TURNS = [
@@ -25,11 +25,24 @@
 ]
 
 
-def generate_conversation(facts: List[Fact], total_turns: int) -> List[Dict]:
+def generate_conversation(
+    facts: List[Fact],
+    total_turns: int,
+    filler_turns: Optional[List[str]] = None,
+) -> List[Dict]:
     """
     Generate a list of conversation events across `total_turns` turns.
     Each event: {turn, role, content, is_fact, fact_key, is_update}
+
+    Parameters
+    ----------
+    filler_turns : optional list of domain-specific filler questions.
+                   Falls back to the generic tech-QA FILLER_TURNS list when None or empty.
+                   Pass a domain-specific list (e.g. EDTECH_FILLER_TURNS) to
+                   run a scenario benchmark in a different conversational domain.
     """
+    _fillers = filler_turns if filler_turns else FILLER_TURNS
+
     injection_map: Dict[int, Fact] = {f.injected_at: f for f in facts}
     update_map: Dict[int, Fact] = {
         f.updated_at: f for f in facts if f.updated_at is not None
@@ -54,7 +67,7 @@ def generate_conversation(facts: List[Fact], total_turns: int) -> List[Dict]:
                 "is_fact": True, "fact_key": fact.key, "is_update": True,
             })
         else:
-            msg = FILLER_TURNS[filler_idx % len(FILLER_TURNS)]
+            msg = _fillers[filler_idx % len(_fillers)]
             filler_idx += 1
             events.append({
                 "turn": turn, "role": "user",
diff --git a/simulator/scenarios/__init__.py b/simulator/scenarios/__init__.py
diff --git a/simulator/scenarios/edtech.py b/simulator/scenarios/edtech.py
@@ -0,0 +1,111 @@
+"""
+simulator/scenarios/edtech.py — EdTech (student-tutor) benchmark scenario.
+
+Models a student interacting with an AI tutor over many turns.  Facts cover
+hierarchical personal and academic attributes; updates simulate real learning
+progressions (grade improvement, subject change, learning-style refinement).
+
+The filler turns are domain-specific tutoring requests (concept explanations,
+problem-solving help, study strategy questions) rather than generic tech Q&A,
+making this a harder benchmark for memory systems that rely on keyword overlap.
+
+Closes #13 / #4.
+"""
+
+from typing import List
+from simulator.facts import Fact
+
+
+# ── Fact sets ─────────────────────────────────────────────────────────────────
+
+EDTECH_FACTS: List[Fact] = [
+    # Identity
+    Fact("name",            "Priya Nair",       injected_at=0),
+    Fact("grade",           "10th grade",       injected_at=1,  updated_at=45, updated_value="11th grade"),
+    Fact("school",          "Sunrise Academy",  injected_at=2),
+    # Academic profile
+    Fact("favourite subject","mathematics",      injected_at=4,  updated_at=55, updated_value="physics"),
+    Fact("weakest subject",  "history",          injected_at=6),
+    Fact("current GPA",      "3.2",              injected_at=8,  updated_at=70, updated_value="3.6"),
+    # Learning preferences
+    Fact("learning style",   "visual learner",  injected_at=10, updated_at=60, updated_value="hands-on learner"),
+    Fact("study hours per day","2 hours",        injected_at=12),
+]
+
+
+# ── Persona pool for multi-seed runs ─────────────────────────────────────────
+
+EDTECH_PERSONA_POOL: List[List[Fact]] = [
+    # Persona 0 — Priya Nair (baseline)
+    EDTECH_FACTS,
+    # Persona 1 — Carlos Mendez
+    [
+        Fact("name",             "Carlos Mendez",   injected_at=0),
+        Fact("grade",            "9th grade",       injected_at=1,  updated_at=45, updated_value="10th grade"),
+        Fact("school",           "Westbrook High",  injected_at=2),
+        Fact("favourite subject","biology",          injected_at=4,  updated_at=55, updated_value="chemistry"),
+        Fact("weakest subject",  "algebra",          injected_at=6),
+        Fact("current GPA",      "2.9",              injected_at=8,  updated_at=70, updated_value="3.3"),
+        Fact("learning style",   "auditory learner", injected_at=10, updated_at=60, updated_value="reading/writing learner"),
+        Fact("study hours per day","1.5 hours",      injected_at=12),
+    ],
+    # Persona 2 — Aisha Kamara
+    [
+        Fact("name",             "Aisha Kamara",    injected_at=0),
+        Fact("grade",            "11th grade",      injected_at=1,  updated_at=45, updated_value="12th grade"),
+        Fact("school",           "Greenfield IB",   injected_at=2),
+        Fact("favourite subject","literature",       injected_at=4,  updated_at=55, updated_value="psychology"),
+        Fact("weakest subject",  "calculus",         injected_at=6),
+        Fact("current GPA",      "3.5",              injected_at=8,  updated_at=70, updated_value="3.8"),
+        Fact("learning style",   "reading/writing learner", injected_at=10, updated_at=60, updated_value="visual learner"),
+        Fact("study hours per day","3 hours",        injected_at=12),
+    ],
+    # Persona 3 — Haruto Tanaka
+    [
+        Fact("name",             "Haruto Tanaka",   injected_at=0),
+        Fact("grade",            "8th grade",       injected_at=1,  updated_at=45, updated_value="9th grade"),
+        Fact("school",           "Sakura Middle",   injected_at=2),
+        Fact("favourite subject","computer science", injected_at=4,  updated_at=55, updated_value="mathematics"),
+        Fact("weakest subject",  "essay writing",   injected_at=6),
+        Fact("current GPA",      "3.0",             injected_at=8,  updated_at=70, updated_value="3.4"),
+        Fact("learning style",   "hands-on learner", injected_at=10, updated_at=60, updated_value="visual learner"),
+        Fact("study hours per day","2.5 hours",      injected_at=12),
+    ],
+    # Persona 4 — Amelia Brooks
+    [
+        Fact("name",             "Amelia Brooks",   injected_at=0),
+        Fact("grade",            "12th grade",      injected_at=1,  updated_at=45, updated_value="1st year university"),
+        Fact("school",           "Oakdale High",    injected_at=2),
+        Fact("favourite subject","economics",        injected_at=4,  updated_at=55, updated_value="statistics"),
+        Fact("weakest subject",  "organic chemistry", injected_at=6),
+        Fact("current GPA",      "3.7",             injected_at=8,  updated_at=70, updated_value="3.9"),
+        Fact("learning style",   "auditory learner", injected_at=10, updated_at=60, updated_value="hands-on learner"),
+        Fact("study hours per day","4 hours",        injected_at=12),
+    ],
+]
+
+
+# ── Domain-specific filler turns ─────────────────────────────────────────────
+
+EDTECH_FILLER_TURNS: List[str] = [
+    "Can you explain the Pythagorean theorem with a real-world example?",
+    "I'm struggling to understand photosynthesis. Can you break it down simply?",
+    "What is the difference between mitosis and meiosis?",
+    "How do I solve simultaneous equations using substitution?",
+    "Can you explain Newton's three laws of motion?",
+    "What are the key themes in Romeo and Juliet?",
+    "How do I write a strong thesis statement for an essay?",
+    "What is the difference between speed and velocity?",
+    "Can you explain the water cycle to me?",
+    "How do I factorise a quadratic expression?",
+    "What caused the First World War?",
+    "Can you help me understand the concept of supply and demand?",
+    "What is the difference between an atom and a molecule?",
+    "How do I find the area of a circle?",
+    "Can you explain what DNA replication is?",
+    "What is the significance of the French Revolution?",
+    "How do I convert fractions to decimals?",
+    "What are the main parts of a cell and their functions?",
+    "Can you explain the concept of gravity?",
+    "How do I improve my reading comprehension skills?",
+]