Skip to content

Commit eb9bc3e

Browse files
abrichr and claude authored
fix: prefer multilevel demo files over plain .txt in eval scripts (#103)
When both {task_id}_multilevel.txt and {task_id}.txt exist in the demo directory, all demo file lookup paths now prefer the multilevel (Option D) format. They fall back to plain .txt, then .json, for backwards compatibility.

Files changed:
- scripts/run_dc_eval.py
- scripts/run_eval_pipeline.py
- openadapt_evals/benchmarks/cli.py (_suite_find_demo)
- openadapt_evals/benchmarks/comparison_viewer.py

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 7a63fa1 commit eb9bc3e

4 files changed

Lines changed: 32 additions & 13 deletions

File tree

openadapt_evals/benchmarks/cli.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2049,15 +2049,20 @@ def _suite_task_short_name(task_id: str) -> str:
20492049

20502050

20512051
def _suite_find_demo(demo_dir: Path, task_id: str) -> Path | None:
2052-
"""Find demo file (.txt preferred, then .json) for a task ID.
2052+
"""Find demo file for a task ID.
20532053
2054-
Prefers .txt (natural language) over .json because:
2055-
- .txt demos describe actions in human-readable terms
2056-
- .json demos contain normalized coordinates (0-1) that mismatch
2057-
the agent's pixel coordinate action space
2054+
Preference order:
2055+
1. ``{task_id}_multilevel.txt`` — Option D multi-level conditioning format
2056+
2. ``{task_id}.txt`` — plain natural-language demo
2057+
3. ``{task_id}.json`` — structured JSON (normalized coords, less useful)
2058+
2059+
Multilevel demos are preferred because the rigid plain-text format can
2060+
cause demo-conditioned agents to abandon the task when UI state doesn't
2061+
exactly match the description. The multilevel format uses PLAN +
2062+
{Think, Action, Expect} steps with "adapt if needed" framing.
20582063
"""
2059-
for ext in (".txt", ".json"):
2060-
p = demo_dir / f"{task_id}{ext}"
2064+
for suffix in ("_multilevel.txt", ".txt", ".json"):
2065+
p = demo_dir / f"{task_id}{suffix}"
20612066
if p.exists():
20622067
return p
20632068
return None

openadapt_evals/benchmarks/comparison_viewer.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -118,12 +118,17 @@ def generate_comparison_viewer(
118118
# Match tasks across runs
119119
matched = _match_tasks(loaded_runs)
120120

121-
# Load demo prompts if available
121+
# Load demo prompts if available.
122+
# When both {id}.txt and {id}_multilevel.txt exist, prefer multilevel.
122123
demo_prompts: dict[str, str] = {}
123124
if demo_prompts_dir and demo_prompts_dir.exists():
124-
for f in demo_prompts_dir.glob("*.txt"):
125+
for f in sorted(demo_prompts_dir.glob("*.txt")):
126+
stem = f.stem
127+
# Strip _multilevel suffix for the lookup key so multilevel
128+
# demos replace plain ones (sorted order: plain before _multilevel).
129+
key = stem.removesuffix("_multilevel")
125130
with open(f) as fh:
126-
demo_prompts[f.stem] = fh.read()
131+
demo_prompts[key] = fh.read()
127132

128133
# Build the data structure for JS
129134
comparison_data: list[dict[str, Any]] = []

scripts/run_dc_eval.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,10 @@ def main() -> int:
265265
if not args.dc_only:
266266
conditions.append((tid, f"val_zs_{sid}", None))
267267
if not args.zs_only:
268-
demo_path = demo_dir / f"{tid}.txt"
268+
# Prefer multilevel demo (Option D format) over plain .txt
269+
demo_path = demo_dir / f"{tid}_multilevel.txt"
270+
if not demo_path.exists():
271+
demo_path = demo_dir / f"{tid}.txt"
269272
if not demo_path.exists():
270273
demo_path = demo_dir / f"{tid}.json"
271274
if not demo_path.exists():

scripts/run_eval_pipeline.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,10 @@ def _find_recordings_needing_demos(
8585
if not any(task_id.startswith(f) for f in task_filter):
8686
continue
8787

88-
demo_path = demo_dir / f"{task_id}.txt"
88+
# Prefer multilevel demo (Option D format) over plain .txt
89+
demo_path = demo_dir / f"{task_id}_multilevel.txt"
90+
if not demo_path.exists():
91+
demo_path = demo_dir / f"{task_id}.txt"
8992
if not demo_path.exists():
9093
missing.append((task_dir, task_id))
9194

@@ -410,7 +413,10 @@ def _build_conditions(
410413
if not dc_only:
411414
conditions.append((tid, f"val_zs_{sid}", None))
412415
if not zs_only:
413-
demo_path = demo_dir / f"{tid}.txt"
416+
# Prefer multilevel demo (Option D format) over plain .txt
417+
demo_path = demo_dir / f"{tid}_multilevel.txt"
418+
if not demo_path.exists():
419+
demo_path = demo_dir / f"{tid}.txt"
414420
if not demo_path.exists():
415421
demo_path = demo_dir / f"{tid}.json"
416422
if demo_path.exists():

0 commit comments

Comments
 (0)