Skip to content

Commit 4c26720

Browse files
abrichr and claude
committed
feat: add done-gate to prevent agents from prematurely declaring task complete
When enabled via --done-gate, the evaluation runner calls adapter.evaluate() when the agent signals "done" to verify that the task is actually complete. If the score is below the threshold (default 1.0), the runner overrides the "done" signal, appends a continuation message to the task instruction, and lets the agent continue. The number of overrides is capped by a configurable maximum (default 3) to prevent infinite loops.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent b877297 commit 4c26720

3 files changed

Lines changed: 161 additions & 9 deletions

File tree

openadapt_evals/benchmarks/cli.py

Lines changed: 39 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -271,12 +271,18 @@ def cmd_mock(args: argparse.Namespace) -> int:
271271
return 1
272272

273273
# Create config for trace collection
274+
done_gate = getattr(args, "done_gate", False)
275+
done_gate_max_overrides = getattr(args, "done_gate_max_overrides", 3)
276+
done_gate_threshold = getattr(args, "done_gate_threshold", 1.0)
274277
config = None
275-
if args.output:
278+
if args.output or done_gate:
276279
config = EvaluationConfig(
277-
save_execution_traces=True,
278-
output_dir=args.output,
280+
save_execution_traces=bool(args.output),
281+
output_dir=args.output or "benchmark_results",
279282
run_name=args.run_name or "mock_eval",
283+
done_gate=done_gate,
284+
done_gate_max_overrides=done_gate_max_overrides,
285+
done_gate_threshold=done_gate_threshold,
280286
)
281287

282288
# Run evaluation
@@ -438,6 +444,9 @@ def cmd_run(args: argparse.Namespace) -> int:
438444
save_execution_traces=True,
439445
output_dir=args.output,
440446
run_name=args.run_name,
447+
done_gate=getattr(args, "done_gate", False),
448+
done_gate_max_overrides=getattr(args, "done_gate_max_overrides", 3),
449+
done_gate_threshold=getattr(args, "done_gate_threshold", 1.0),
441450
)
442451

443452
print(f"Running {len(task_ids)} task(s): {', '.join(task_ids)}")
@@ -652,12 +661,18 @@ def cmd_live(args: argparse.Namespace) -> int:
652661
return 1
653662

654663
# Create config for trace collection
664+
done_gate = getattr(args, "done_gate", False)
665+
done_gate_max_overrides = getattr(args, "done_gate_max_overrides", 3)
666+
done_gate_threshold = getattr(args, "done_gate_threshold", 1.0)
655667
eval_config = None
656-
if args.output:
668+
if args.output or done_gate:
657669
eval_config = EvaluationConfig(
658-
save_execution_traces=True,
659-
output_dir=args.output,
670+
save_execution_traces=bool(args.output),
671+
output_dir=args.output or "benchmark_results",
660672
run_name=args.run_name or "live_eval",
673+
done_gate=done_gate,
674+
done_gate_max_overrides=done_gate_max_overrides,
675+
done_gate_threshold=done_gate_threshold,
661676
)
662677

663678
# Load tasks
@@ -2345,6 +2360,12 @@ def main() -> int:
23452360
mock_parser.add_argument("--use-a11y-tree", action="store_true", help="Enable accessibility tree grounding for Qwen3VL")
23462361
mock_parser.add_argument("--output", type=str, help="Output directory for traces")
23472362
mock_parser.add_argument("--run-name", type=str, help="Name for this evaluation run")
2363+
mock_parser.add_argument("--done-gate", action="store_true",
2364+
help="Verify task completion before accepting agent's 'done' signal")
2365+
mock_parser.add_argument("--done-gate-max-overrides", type=int, default=3,
2366+
help="Max times to override premature 'done' (default: 3)")
2367+
mock_parser.add_argument("--done-gate-threshold", type=float, default=1.0,
2368+
help="Minimum score to accept 'done' (default: 1.0)")
23482369

23492370
# Simplified run command (recommended for live evaluation)
23502371
run_parser = subparsers.add_parser(
@@ -2387,6 +2408,12 @@ def main() -> int:
23872408
help="Force network/audio tray icons visible for stable click-coordinate tasks")
23882409
run_parser.add_argument("--waa-image-version", type=str, default=None,
23892410
help="Pinned WAA image version label to record in run metadata")
2411+
run_parser.add_argument("--done-gate", action="store_true",
2412+
help="Verify task completion before accepting agent's 'done' signal")
2413+
run_parser.add_argument("--done-gate-max-overrides", type=int, default=3,
2414+
help="Max times to override premature 'done' (default: 3)")
2415+
run_parser.add_argument("--done-gate-threshold", type=float, default=1.0,
2416+
help="Minimum score to accept 'done' (default: 1.0)")
23902417

23912418
# Live evaluation (full control)
23922419
live_parser = subparsers.add_parser("live", help="Run live evaluation against WAA server (full control)")
@@ -2415,6 +2442,12 @@ def main() -> int:
24152442
help="Force network/audio tray icons visible for stable click-coordinate tasks")
24162443
live_parser.add_argument("--waa-image-version", type=str, default=None,
24172444
help="Pinned WAA image version label to record in run metadata")
2445+
live_parser.add_argument("--done-gate", action="store_true",
2446+
help="Verify task completion before accepting agent's 'done' signal")
2447+
live_parser.add_argument("--done-gate-max-overrides", type=int, default=3,
2448+
help="Max times to override premature 'done' (default: 3)")
2449+
live_parser.add_argument("--done-gate-threshold", type=float, default=1.0,
2450+
help="Minimum score to accept 'done' (default: 1.0)")
24182451

24192452
# Probe server
24202453
probe_parser = subparsers.add_parser("probe", help="Check if WAA server is reachable")

openadapt_evals/benchmarks/runner.py

Lines changed: 99 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
from __future__ import annotations
1616

17+
import copy
1718
import logging
1819
import time
1920
from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -58,6 +59,9 @@ class EvaluationConfig:
5859
run_name: Name for this evaluation run.
5960
enable_live_tracking: Whether to enable live evaluation progress tracking.
6061
live_tracking_file: Path to live tracking JSON file.
62+
done_gate: Whether to verify task completion before accepting agent's "done".
63+
done_gate_max_overrides: Max times to override a premature "done" (default 3).
64+
done_gate_threshold: Minimum score to accept "done" (default 1.0).
6165
"""
6266

6367
max_steps: int = 50
@@ -72,6 +76,9 @@ class EvaluationConfig:
7276
run_name: str | None = None
7377
enable_live_tracking: bool = True
7478
live_tracking_file: str = "benchmark_live.json"
79+
done_gate: bool = False
80+
done_gate_max_overrides: int = 3
81+
done_gate_threshold: float = 1.0
7582

7683

7784
def evaluate_agent_on_benchmark(
@@ -319,6 +326,7 @@ def _run_single_task(
319326
done = False
320327
steps = 0
321328
action = None
329+
done_gate_overrides = 0
322330
max_steps = task.time_limit_steps or config.max_steps
323331

324332
while not done and steps < max_steps:
@@ -367,10 +375,98 @@ def _run_single_task(
367375
if action.type in ("done", "error"):
368376
if action.type == "error":
369377
logger.error(f"Step {steps}: Agent error: {action.raw_action}")
378+
done = True
379+
break
380+
381+
# Agent says "done" — apply done-gate if enabled
382+
logger.info(f"Step {steps}: Agent signaled task completion")
383+
384+
if (
385+
config.done_gate
386+
and done_gate_overrides < config.done_gate_max_overrides
387+
):
388+
logger.info(
389+
f"Step {steps}: Done-gate active — evaluating task "
390+
f"(override {done_gate_overrides + 1}/{config.done_gate_max_overrides})"
391+
)
392+
try:
393+
gate_result = adapter.evaluate(task)
394+
gate_score = gate_result.score
395+
except Exception as e:
396+
logger.warning(
397+
f"Step {steps}: Done-gate evaluation failed: {e}. "
398+
"Accepting 'done' to avoid infinite loop."
399+
)
400+
done = True
401+
break
402+
403+
if gate_score >= config.done_gate_threshold:
404+
logger.info(
405+
f"Step {steps}: Done-gate PASSED "
406+
f"(score={gate_score:.2f} >= {config.done_gate_threshold:.2f})"
407+
)
408+
done = True
409+
break
410+
411+
# Override the premature "done"
412+
done_gate_overrides += 1
413+
logger.warning(
414+
f"Step {steps}: Done-gate REJECTED premature 'done' "
415+
f"(score={gate_score:.2f} < {config.done_gate_threshold:.2f}, "
416+
f"override {done_gate_overrides}/{config.done_gate_max_overrides})"
417+
)
418+
419+
# Modify the task instruction to tell the agent to continue.
420+
# Strip any previous done-gate message before appending the new one.
421+
_DONE_GATE_MARKER = "\n\n[SYSTEM: The task is NOT yet complete"
422+
continuation_msg = (
423+
"\n\n[SYSTEM: The task is NOT yet complete based on automated "
424+
"evaluation (score: {score:.0%}). Your previous 'done' signal "
425+
"was overridden ({n}/{max}). Please examine the current screen "
426+
"carefully and continue working on the task. Do NOT declare "
427+
"'done' unless the task is truly finished.]"
428+
).format(
429+
score=gate_score,
430+
n=done_gate_overrides,
431+
max=config.done_gate_max_overrides,
432+
)
433+
434+
# Create a modified task with continuation message
435+
task = copy.copy(task)
436+
# Remove previous done-gate message if present
437+
marker_idx = task.instruction.find(_DONE_GATE_MARKER)
438+
if marker_idx >= 0:
439+
task.instruction = task.instruction[:marker_idx]
440+
task.instruction = task.instruction + continuation_msg
441+
442+
# Get a fresh observation for the agent's next step
443+
# Use a no-op key press to trigger a new screenshot
444+
try:
445+
noop_action = BenchmarkAction(type="key", key="")
446+
obs, env_done, _info = adapter.step(noop_action)
447+
if env_done:
448+
logger.info(
449+
f"Step {steps}: Environment signaled done during "
450+
"done-gate screenshot refresh"
451+
)
452+
done = True
453+
break
454+
except Exception as e:
455+
logger.warning(
456+
f"Step {steps}: Failed to get fresh observation "
457+
f"after done-gate override: {e}. Using previous obs."
458+
)
459+
460+
steps += 1
461+
continue
370462
else:
371-
logger.info(f"Step {steps}: Agent signaled task completion")
372-
done = True
373-
break
463+
if config.done_gate and done_gate_overrides >= config.done_gate_max_overrides:
464+
logger.warning(
465+
f"Step {steps}: Done-gate max overrides reached "
466+
f"({config.done_gate_max_overrides}). Accepting 'done'."
467+
)
468+
done = True
469+
break
374470

375471
# Execute action
376472
try:

scripts/run_dc_eval.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,23 @@ def main() -> int:
256256
default=2,
257257
help="Max replans when using --controller (default: 2)",
258258
)
259+
parser.add_argument(
260+
"--done-gate",
261+
action="store_true",
262+
help="Verify task completion before accepting agent's 'done' signal",
263+
)
264+
parser.add_argument(
265+
"--done-gate-max-overrides",
266+
type=int,
267+
default=3,
268+
help="Max times to override premature 'done' (default: 3)",
269+
)
270+
parser.add_argument(
271+
"--done-gate-threshold",
272+
type=float,
273+
default=1.0,
274+
help="Minimum score to accept 'done' (default: 1.0)",
275+
)
259276
args = parser.parse_args()
260277

261278
demo_dir = Path(args.demo_dir)
@@ -359,6 +376,12 @@ def main() -> int:
359376
cmd.append("--force-tray-icons")
360377
if args.waa_image_version:
361378
cmd.extend(["--waa-image-version", args.waa_image_version])
379+
if args.done_gate:
380+
cmd.extend([
381+
"--done-gate",
382+
"--done-gate-max-overrides", str(args.done_gate_max_overrides),
383+
"--done-gate-threshold", str(args.done_gate_threshold),
384+
])
362385

363386
result = subprocess.run(cmd)
364387
elapsed = time.time() - task_start

0 commit comments

Comments
 (0)