From ba9ad87f35b4847fb98d1c4424dd8abb15ef6c82 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Wed, 4 Mar 2026 11:05:55 -0500 Subject: [PATCH] fix: prefer multilevel demo files over plain .txt in eval scripts When both {task_id}_multilevel.txt and {task_id}.txt exist in the demo directory, all demo file lookup paths now prefer the multilevel (Option D) format. Falls back to plain .txt, then .json for backwards compatibility. Files changed: - scripts/run_dc_eval.py - scripts/run_eval_pipeline.py - openadapt_evals/benchmarks/cli.py (_suite_find_demo) - openadapt_evals/benchmarks/comparison_viewer.py Co-Authored-By: Claude Opus 4.6 --- openadapt_evals/benchmarks/cli.py | 19 ++++++++++++------- .../benchmarks/comparison_viewer.py | 11 ++++++++--- scripts/run_dc_eval.py | 5 ++++- scripts/run_eval_pipeline.py | 10 ++++++++-- 4 files changed, 32 insertions(+), 13 deletions(-) diff --git a/openadapt_evals/benchmarks/cli.py b/openadapt_evals/benchmarks/cli.py index 3ee401b..6a6eb30 100644 --- a/openadapt_evals/benchmarks/cli.py +++ b/openadapt_evals/benchmarks/cli.py @@ -2049,15 +2049,20 @@ def _suite_task_short_name(task_id: str) -> str: def _suite_find_demo(demo_dir: Path, task_id: str) -> Path | None: - """Find demo file (.txt preferred, then .json) for a task ID. + """Find demo file for a task ID. - Prefers .txt (natural language) over .json because: - - .txt demos describe actions in human-readable terms - - .json demos contain normalized coordinates (0-1) that mismatch - the agent's pixel coordinate action space + Preference order: + 1. ``{task_id}_multilevel.txt`` — Option D multi-level conditioning format + 2. ``{task_id}.txt`` — plain natural-language demo + 3. ``{task_id}.json`` — structured JSON (normalized coords, less useful) + + Multilevel demos are preferred because the rigid plain-text format can + cause demo-conditioned agents to abandon the task when UI state doesn't + exactly match the description. The multilevel format uses PLAN + + {Think, Action, Expect} steps with "adapt if needed" framing. """ - for ext in (".txt", ".json"): - p = demo_dir / f"{task_id}{ext}" + for suffix in ("_multilevel.txt", ".txt", ".json"): + p = demo_dir / f"{task_id}{suffix}" if p.exists(): return p return None diff --git a/openadapt_evals/benchmarks/comparison_viewer.py b/openadapt_evals/benchmarks/comparison_viewer.py index 1e6f2c7..0034bbf 100644 --- a/openadapt_evals/benchmarks/comparison_viewer.py +++ b/openadapt_evals/benchmarks/comparison_viewer.py @@ -118,12 +118,17 @@ def generate_comparison_viewer( # Match tasks across runs matched = _match_tasks(loaded_runs) - # Load demo prompts if available + # Load demo prompts if available. + # When both {id}.txt and {id}_multilevel.txt exist, prefer multilevel. demo_prompts: dict[str, str] = {} if demo_prompts_dir and demo_prompts_dir.exists(): - for f in demo_prompts_dir.glob("*.txt"): + for f in sorted(demo_prompts_dir.glob("*.txt")): + stem = f.stem + # Strip _multilevel suffix for the lookup key so multilevel + # demos replace plain ones (sorted order: plain before _multilevel). + key = stem.removesuffix("_multilevel") with open(f) as fh: - demo_prompts[f.stem] = fh.read() + demo_prompts[key] = fh.read() # Build the data structure for JS comparison_data: list[dict[str, Any]] = [] diff --git a/scripts/run_dc_eval.py b/scripts/run_dc_eval.py index 87214f8..3379f65 100644 --- a/scripts/run_dc_eval.py +++ b/scripts/run_dc_eval.py @@ -265,7 +265,10 @@ def main() -> int: if not args.dc_only: conditions.append((tid, f"val_zs_{sid}", None)) if not args.zs_only: - demo_path = demo_dir / f"{tid}.txt" + # Prefer multilevel demo (Option D format) over plain .txt + demo_path = demo_dir / f"{tid}_multilevel.txt" + if not demo_path.exists(): + demo_path = demo_dir / f"{tid}.txt" if not demo_path.exists(): demo_path = demo_dir / f"{tid}.json" if not demo_path.exists(): diff --git a/scripts/run_eval_pipeline.py b/scripts/run_eval_pipeline.py index 5f7565c..02bca0a 100644 --- a/scripts/run_eval_pipeline.py +++ b/scripts/run_eval_pipeline.py @@ -85,7 +85,10 @@ def _find_recordings_needing_demos( if not any(task_id.startswith(f) for f in task_filter): continue - demo_path = demo_dir / f"{task_id}.txt" + # Prefer multilevel demo (Option D format) over plain .txt + demo_path = demo_dir / f"{task_id}_multilevel.txt" + if not demo_path.exists(): + demo_path = demo_dir / f"{task_id}.txt" if not demo_path.exists(): missing.append((task_dir, task_id)) @@ -410,7 +413,10 @@ def _build_conditions( if not dc_only: conditions.append((tid, f"val_zs_{sid}", None)) if not zs_only: - demo_path = demo_dir / f"{tid}.txt" + # Prefer multilevel demo (Option D format) over plain .txt + demo_path = demo_dir / f"{tid}_multilevel.txt" + if not demo_path.exists(): + demo_path = demo_dir / f"{tid}.txt" if not demo_path.exists(): demo_path = demo_dir / f"{tid}.json" if demo_path.exists():