File tree Expand file tree Collapse file tree
openadapt_evals/benchmarks Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -2049,15 +2049,20 @@ def _suite_task_short_name(task_id: str) -> str:
20492049
20502050
def _suite_find_demo(demo_dir: Path, task_id: str) -> Path | None:
    """Find the demo file for a task ID.

    Candidates are tried in this preference order and the first one that
    exists as a regular file is returned:

    1. ``{task_id}_multilevel.txt`` — Option D multi-level conditioning format
    2. ``{task_id}.txt`` — plain natural-language demo
    3. ``{task_id}.json`` — structured JSON (normalized coords, less useful)

    Multilevel demos are preferred because the rigid plain-text format can
    cause demo-conditioned agents to abandon the task when UI state doesn't
    exactly match the description. The multilevel format uses PLAN +
    {Think, Action, Expect} steps with "adapt if needed" framing.

    Args:
        demo_dir: Directory that holds the demo files.
        task_id: Task identifier used as the file-name stem.

    Returns:
        Path of the highest-priority demo file found, or ``None`` when no
        candidate exists.
    """
    for suffix in ("_multilevel.txt", ".txt", ".json"):
        candidate = demo_dir / f"{task_id}{suffix}"
        # is_file() rather than exists(): a directory that happens to share
        # a demo file name is not a usable demo and must not shadow a
        # lower-priority real file.
        if candidate.is_file():
            return candidate
    return None
Original file line number Diff line number Diff line change @@ -118,12 +118,17 @@ def generate_comparison_viewer(
118118 # Match tasks across runs
119119 matched = _match_tasks (loaded_runs )
120120
121- # Load demo prompts if available
121+ # Load demo prompts if available.
122+ # When both {id}.txt and {id}_multilevel.txt exist, prefer multilevel.
122123 demo_prompts : dict [str , str ] = {}
123124 if demo_prompts_dir and demo_prompts_dir .exists ():
124- for f in demo_prompts_dir .glob ("*.txt" ):
125+ for f in sorted (demo_prompts_dir .glob ("*.txt" )):
126+ stem = f .stem
127+ # Strip the _multilevel suffix so both variants share one lookup key.
128+ # "." sorts before "_", so sorted() yields {id}.txt before {id}_multilevel.txt and the multilevel text overwrites the plain one.
129+ key = stem .removesuffix ("_multilevel" )
125130 with open (f ) as fh :
126- demo_prompts [f . stem ] = fh .read ()
131+ demo_prompts [key ] = fh .read ()
127132
128133 # Build the data structure for JS
129134 comparison_data : list [dict [str , Any ]] = []
Original file line number Diff line number Diff line change @@ -265,7 +265,10 @@ def main() -> int:
265265 if not args .dc_only :
266266 conditions .append ((tid , f"val_zs_{ sid } " , None ))
267267 if not args .zs_only :
268- demo_path = demo_dir / f"{ tid } .txt"
268+ # Prefer multilevel demo (Option D format) over plain .txt
269+ demo_path = demo_dir / f"{ tid } _multilevel.txt"
270+ if not demo_path .exists ():
271+ demo_path = demo_dir / f"{ tid } .txt"
269272 if not demo_path .exists ():
270273 demo_path = demo_dir / f"{ tid } .json"
271274 if not demo_path .exists ():
Original file line number Diff line number Diff line change @@ -85,7 +85,10 @@ def _find_recordings_needing_demos(
8585 if not any (task_id .startswith (f ) for f in task_filter ):
8686 continue
8787
88- demo_path = demo_dir / f"{ task_id } .txt"
88+ # Prefer multilevel demo (Option D format) over plain .txt
89+ demo_path = demo_dir / f"{ task_id } _multilevel.txt"
90+ if not demo_path .exists ():
91+ demo_path = demo_dir / f"{ task_id } .txt"
8992 if not demo_path .exists ():
9093 missing .append ((task_dir , task_id ))
9194
@@ -410,7 +413,10 @@ def _build_conditions(
410413 if not dc_only :
411414 conditions .append ((tid , f"val_zs_{ sid } " , None ))
412415 if not zs_only :
413- demo_path = demo_dir / f"{ tid } .txt"
416+ # Prefer multilevel demo (Option D format) over plain .txt
417+ demo_path = demo_dir / f"{ tid } _multilevel.txt"
418+ if not demo_path .exists ():
419+ demo_path = demo_dir / f"{ tid } .txt"
414420 if not demo_path .exists ():
415421 demo_path = demo_dir / f"{ tid } .json"
416422 if demo_path .exists ():
You can’t perform that action at this time.
0 commit comments