From ba9ad87f35b4847fb98d1c4424dd8abb15ef6c82 Mon Sep 17 00:00:00 2001
From: Richard Abrich <richard.abrich@gmail.com>
Date: Wed, 4 Mar 2026 11:05:55 -0500
Subject: [PATCH] fix: prefer multilevel demo files over plain .txt in eval
 scripts

When both {task_id}_multilevel.txt and {task_id}.txt exist in the demo
directory, all demo file lookup paths now prefer the multilevel (Option D)
format. Each lookup falls back to plain .txt and then, where that lookup
already accepted JSON demos, to .json for backwards compatibility.

Files changed:
- scripts/run_dc_eval.py
- scripts/run_eval_pipeline.py
- openadapt_evals/benchmarks/cli.py (_suite_find_demo)
- openadapt_evals/benchmarks/comparison_viewer.py

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 openadapt_evals/benchmarks/cli.py             | 19 ++++++++++++-------
 .../benchmarks/comparison_viewer.py           | 11 ++++++++---
 scripts/run_dc_eval.py                        |  5 ++++-
 scripts/run_eval_pipeline.py                  | 10 ++++++++--
 4 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/openadapt_evals/benchmarks/cli.py b/openadapt_evals/benchmarks/cli.py
index 3ee401b..6a6eb30 100644
--- a/openadapt_evals/benchmarks/cli.py
+++ b/openadapt_evals/benchmarks/cli.py
@@ -2049,15 +2049,20 @@ def _suite_task_short_name(task_id: str) -> str:
 
 
 def _suite_find_demo(demo_dir: Path, task_id: str) -> Path | None:
-    """Find demo file (.txt preferred, then .json) for a task ID.
+    """Find demo file for a task ID.
 
-    Prefers .txt (natural language) over .json because:
-    - .txt demos describe actions in human-readable terms
-    - .json demos contain normalized coordinates (0-1) that mismatch
-      the agent's pixel coordinate action space
+    Preference order:
+    1. ``{task_id}_multilevel.txt`` — Option D multi-level conditioning format
+    2. ``{task_id}.txt`` — plain natural-language demo
+    3. ``{task_id}.json`` — structured JSON (normalized coords, less useful)
+
+    Multilevel demos are preferred because the rigid plain-text format can
+    cause demo-conditioned agents to abandon the task when UI state doesn't
+    exactly match the description.  The multilevel format uses PLAN +
+    {Think, Action, Expect} steps with "adapt if needed" framing.
     """
-    for ext in (".txt", ".json"):
-        p = demo_dir / f"{task_id}{ext}"
+    for suffix in ("_multilevel.txt", ".txt", ".json"):
+        p = demo_dir / f"{task_id}{suffix}"
         if p.exists():
             return p
     return None
diff --git a/openadapt_evals/benchmarks/comparison_viewer.py b/openadapt_evals/benchmarks/comparison_viewer.py
index 1e6f2c7..0034bbf 100644
--- a/openadapt_evals/benchmarks/comparison_viewer.py
+++ b/openadapt_evals/benchmarks/comparison_viewer.py
@@ -118,12 +118,17 @@ def generate_comparison_viewer(
     # Match tasks across runs
     matched = _match_tasks(loaded_runs)
 
-    # Load demo prompts if available
+    # Load demo prompts if available.
+    # When both {id}.txt and {id}_multilevel.txt exist, prefer multilevel.
     demo_prompts: dict[str, str] = {}
     if demo_prompts_dir and demo_prompts_dir.exists():
-        for f in demo_prompts_dir.glob("*.txt"):
+        for f in sorted(demo_prompts_dir.glob("*.txt")):
+            stem = f.stem
+            # Strip _multilevel suffix for the lookup key so multilevel demos
+            # overwrite plain ones ("." sorts before "_", so plain loads first).
+            key = stem.removesuffix("_multilevel")
             with open(f) as fh:
-                demo_prompts[f.stem] = fh.read()
+                demo_prompts[key] = fh.read()
 
     # Build the data structure for JS
     comparison_data: list[dict[str, Any]] = []
diff --git a/scripts/run_dc_eval.py b/scripts/run_dc_eval.py
index 87214f8..3379f65 100644
--- a/scripts/run_dc_eval.py
+++ b/scripts/run_dc_eval.py
@@ -265,7 +265,10 @@ def main() -> int:
         if not args.dc_only:
             conditions.append((tid, f"val_zs_{sid}", None))
         if not args.zs_only:
-            demo_path = demo_dir / f"{tid}.txt"
+            # Prefer multilevel demo (Option D format) over plain .txt
+            demo_path = demo_dir / f"{tid}_multilevel.txt"
+            if not demo_path.exists():
+                demo_path = demo_dir / f"{tid}.txt"
             if not demo_path.exists():
                 demo_path = demo_dir / f"{tid}.json"
             if not demo_path.exists():
diff --git a/scripts/run_eval_pipeline.py b/scripts/run_eval_pipeline.py
index 5f7565c..02bca0a 100644
--- a/scripts/run_eval_pipeline.py
+++ b/scripts/run_eval_pipeline.py
@@ -85,7 +85,10 @@ def _find_recordings_needing_demos(
             if not any(task_id.startswith(f) for f in task_filter):
                 continue
 
-        demo_path = demo_dir / f"{task_id}.txt"
+        # Prefer multilevel demo (Option D format) over plain .txt
+        demo_path = demo_dir / f"{task_id}_multilevel.txt"
+        if not demo_path.exists():
+            demo_path = demo_dir / f"{task_id}.txt"
         if not demo_path.exists():
             missing.append((task_dir, task_id))
 
@@ -410,7 +413,10 @@ def _build_conditions(
         if not dc_only:
             conditions.append((tid, f"val_zs_{sid}", None))
         if not zs_only:
-            demo_path = demo_dir / f"{tid}.txt"
+            # Prefer multilevel demo (Option D format) over plain .txt
+            demo_path = demo_dir / f"{tid}_multilevel.txt"
+            if not demo_path.exists():
+                demo_path = demo_dir / f"{tid}.txt"
             if not demo_path.exists():
                 demo_path = demo_dir / f"{tid}.json"
             if demo_path.exists():