Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 12 additions & 7 deletions openadapt_evals/benchmarks/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2049,15 +2049,20 @@ def _suite_task_short_name(task_id: str) -> str:


def _suite_find_demo(demo_dir: Path, task_id: str) -> Path | None:
"""Find demo file (.txt preferred, then .json) for a task ID.
"""Find demo file for a task ID.

Prefers .txt (natural language) over .json because:
- .txt demos describe actions in human-readable terms
- .json demos contain normalized coordinates (0-1) that mismatch
the agent's pixel coordinate action space
Preference order:
1. ``{task_id}_multilevel.txt`` — Option D multi-level conditioning format
2. ``{task_id}.txt`` — plain natural-language demo
3. ``{task_id}.json`` — structured JSON (normalized coords, less useful)

Multilevel demos are preferred because the rigid plain-text format can
cause demo-conditioned agents to abandon the task when UI state doesn't
exactly match the description. The multilevel format uses PLAN +
{Think, Action, Expect} steps with "adapt if needed" framing.
"""
for ext in (".txt", ".json"):
p = demo_dir / f"{task_id}{ext}"
for suffix in ("_multilevel.txt", ".txt", ".json"):
p = demo_dir / f"{task_id}{suffix}"
if p.exists():
return p
return None
Expand Down
11 changes: 8 additions & 3 deletions openadapt_evals/benchmarks/comparison_viewer.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,12 +118,17 @@ def generate_comparison_viewer(
# Match tasks across runs
matched = _match_tasks(loaded_runs)

# Load demo prompts if available
# Load demo prompts if available.
# When both {id}.txt and {id}_multilevel.txt exist, prefer multilevel.
demo_prompts: dict[str, str] = {}
if demo_prompts_dir and demo_prompts_dir.exists():
for f in demo_prompts_dir.glob("*.txt"):
for f in sorted(demo_prompts_dir.glob("*.txt")):
stem = f.stem
# Strip _multilevel suffix for the lookup key so multilevel
# demos replace plain ones (sorted order: plain before _multilevel).
key = stem.removesuffix("_multilevel")
with open(f) as fh:
demo_prompts[f.stem] = fh.read()
demo_prompts[key] = fh.read()

# Build the data structure for JS
comparison_data: list[dict[str, Any]] = []
Expand Down
5 changes: 4 additions & 1 deletion scripts/run_dc_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,10 @@ def main() -> int:
if not args.dc_only:
conditions.append((tid, f"val_zs_{sid}", None))
if not args.zs_only:
demo_path = demo_dir / f"{tid}.txt"
# Prefer multilevel demo (Option D format) over plain .txt
demo_path = demo_dir / f"{tid}_multilevel.txt"
if not demo_path.exists():
demo_path = demo_dir / f"{tid}.txt"
if not demo_path.exists():
demo_path = demo_dir / f"{tid}.json"
if not demo_path.exists():
Expand Down
10 changes: 8 additions & 2 deletions scripts/run_eval_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,10 @@ def _find_recordings_needing_demos(
if not any(task_id.startswith(f) for f in task_filter):
continue

demo_path = demo_dir / f"{task_id}.txt"
# Prefer multilevel demo (Option D format) over plain .txt
demo_path = demo_dir / f"{task_id}_multilevel.txt"
if not demo_path.exists():
demo_path = demo_dir / f"{task_id}.txt"
if not demo_path.exists():
missing.append((task_dir, task_id))

Expand Down Expand Up @@ -410,7 +413,10 @@ def _build_conditions(
if not dc_only:
conditions.append((tid, f"val_zs_{sid}", None))
if not zs_only:
demo_path = demo_dir / f"{tid}.txt"
# Prefer multilevel demo (Option D format) over plain .txt
demo_path = demo_dir / f"{tid}_multilevel.txt"
if not demo_path.exists():
demo_path = demo_dir / f"{tid}.txt"
if not demo_path.exists():
demo_path = demo_dir / f"{tid}.json"
if demo_path.exists():
Expand Down