feat: add done-gate to prevent agents from prematurely declaring task complete

abrichr · claude · abrichr · commit 4c267200824f · 2026-03-06T15:35:15.000-05:00
When enabled via --done-gate, the evaluation runner calls adapter.evaluate()
whenever the agent signals "done" to verify that the task is actually complete.
If the score is below the threshold (--done-gate-threshold, default 1.0), the
runner overrides the "done" signal, appends a continuation message to the task
instruction, and lets the agent continue. Overrides are limited to a
configurable maximum (--done-gate-max-overrides, default 3) to prevent infinite
loops; once that limit is reached, or if the gate evaluation itself fails, the
agent's "done" signal is accepted.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/openadapt_evals/benchmarks/cli.py b/openadapt_evals/benchmarks/cli.py
@@ -271,12 +271,18 @@ def cmd_mock(args: argparse.Namespace) -> int:
         return 1
 
     # Create config for trace collection
+    done_gate = getattr(args, "done_gate", False)
+    done_gate_max_overrides = getattr(args, "done_gate_max_overrides", 3)
+    done_gate_threshold = getattr(args, "done_gate_threshold", 1.0)
     config = None
-    if args.output:
+    if args.output or done_gate:
         config = EvaluationConfig(
-            save_execution_traces=True,
-            output_dir=args.output,
+            save_execution_traces=bool(args.output),
+            output_dir=args.output or "benchmark_results",
             run_name=args.run_name or "mock_eval",
+            done_gate=done_gate,
+            done_gate_max_overrides=done_gate_max_overrides,
+            done_gate_threshold=done_gate_threshold,
         )
 
     # Run evaluation
@@ -438,6 +444,9 @@ def cmd_run(args: argparse.Namespace) -> int:
         save_execution_traces=True,
         output_dir=args.output,
         run_name=args.run_name,
+        done_gate=getattr(args, "done_gate", False),
+        done_gate_max_overrides=getattr(args, "done_gate_max_overrides", 3),
+        done_gate_threshold=getattr(args, "done_gate_threshold", 1.0),
     )
 
     print(f"Running {len(task_ids)} task(s): {', '.join(task_ids)}")
@@ -652,12 +661,18 @@ def cmd_live(args: argparse.Namespace) -> int:
         return 1
 
     # Create config for trace collection
+    done_gate = getattr(args, "done_gate", False)
+    done_gate_max_overrides = getattr(args, "done_gate_max_overrides", 3)
+    done_gate_threshold = getattr(args, "done_gate_threshold", 1.0)
     eval_config = None
-    if args.output:
+    if args.output or done_gate:
         eval_config = EvaluationConfig(
-            save_execution_traces=True,
-            output_dir=args.output,
+            save_execution_traces=bool(args.output),
+            output_dir=args.output or "benchmark_results",
             run_name=args.run_name or "live_eval",
+            done_gate=done_gate,
+            done_gate_max_overrides=done_gate_max_overrides,
+            done_gate_threshold=done_gate_threshold,
         )
 
     # Load tasks
@@ -2345,6 +2360,12 @@ def main() -> int:
     mock_parser.add_argument("--use-a11y-tree", action="store_true", help="Enable accessibility tree grounding for Qwen3VL")
     mock_parser.add_argument("--output", type=str, help="Output directory for traces")
     mock_parser.add_argument("--run-name", type=str, help="Name for this evaluation run")
+    mock_parser.add_argument("--done-gate", action="store_true",
+                            help="Verify task completion before accepting agent's 'done' signal")
+    mock_parser.add_argument("--done-gate-max-overrides", type=int, default=3,
+                            help="Max times to override premature 'done' (default: 3)")
+    mock_parser.add_argument("--done-gate-threshold", type=float, default=1.0,
+                            help="Minimum score to accept 'done' (default: 1.0)")
 
     # Simplified run command (recommended for live evaluation)
     run_parser = subparsers.add_parser(
@@ -2387,6 +2408,12 @@ def main() -> int:
                            help="Force network/audio tray icons visible for stable click-coordinate tasks")
     run_parser.add_argument("--waa-image-version", type=str, default=None,
                            help="Pinned WAA image version label to record in run metadata")
+    run_parser.add_argument("--done-gate", action="store_true",
+                           help="Verify task completion before accepting agent's 'done' signal")
+    run_parser.add_argument("--done-gate-max-overrides", type=int, default=3,
+                           help="Max times to override premature 'done' (default: 3)")
+    run_parser.add_argument("--done-gate-threshold", type=float, default=1.0,
+                           help="Minimum score to accept 'done' (default: 1.0)")
 
     # Live evaluation (full control)
     live_parser = subparsers.add_parser("live", help="Run live evaluation against WAA server (full control)")
@@ -2415,6 +2442,12 @@ def main() -> int:
                             help="Force network/audio tray icons visible for stable click-coordinate tasks")
     live_parser.add_argument("--waa-image-version", type=str, default=None,
                             help="Pinned WAA image version label to record in run metadata")
+    live_parser.add_argument("--done-gate", action="store_true",
+                            help="Verify task completion before accepting agent's 'done' signal")
+    live_parser.add_argument("--done-gate-max-overrides", type=int, default=3,
+                            help="Max times to override premature 'done' (default: 3)")
+    live_parser.add_argument("--done-gate-threshold", type=float, default=1.0,
+                            help="Minimum score to accept 'done' (default: 1.0)")
 
     # Probe server
     probe_parser = subparsers.add_parser("probe", help="Check if WAA server is reachable")
diff --git a/openadapt_evals/benchmarks/runner.py b/openadapt_evals/benchmarks/runner.py
@@ -14,6 +14,7 @@
 
 from __future__ import annotations
 
+import copy
 import logging
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -58,6 +59,9 @@ class EvaluationConfig:
         run_name: Name for this evaluation run.
         enable_live_tracking: Whether to enable live evaluation progress tracking.
         live_tracking_file: Path to live tracking JSON file.
+        done_gate: Whether to verify task completion before accepting agent's "done".
+        done_gate_max_overrides: Max times to override a premature "done" (default 3).
+        done_gate_threshold: Minimum score to accept "done" (default 1.0).
     """
 
     max_steps: int = 50
@@ -72,6 +76,9 @@ class EvaluationConfig:
     run_name: str | None = None
     enable_live_tracking: bool = True
     live_tracking_file: str = "benchmark_live.json"
+    done_gate: bool = False
+    done_gate_max_overrides: int = 3
+    done_gate_threshold: float = 1.0
 
 
 def evaluate_agent_on_benchmark(
@@ -319,6 +326,7 @@ def _run_single_task(
         done = False
         steps = 0
         action = None
+        done_gate_overrides = 0
         max_steps = task.time_limit_steps or config.max_steps
 
         while not done and steps < max_steps:
@@ -367,10 +375,98 @@ def _run_single_task(
             if action.type in ("done", "error"):
                 if action.type == "error":
                     logger.error(f"Step {steps}: Agent error: {action.raw_action}")
+                    done = True
+                    break
+
+                # Agent says "done" — apply done-gate if enabled
+                logger.info(f"Step {steps}: Agent signaled task completion")
+
+                if (
+                    config.done_gate
+                    and done_gate_overrides < config.done_gate_max_overrides
+                ):
+                    logger.info(
+                        f"Step {steps}: Done-gate active — evaluating task "
+                        f"(override {done_gate_overrides + 1}/{config.done_gate_max_overrides})"
+                    )
+                    try:
+                        gate_result = adapter.evaluate(task)
+                        gate_score = gate_result.score
+                    except Exception as e:
+                        logger.warning(
+                            f"Step {steps}: Done-gate evaluation failed: {e}. "
+                            "Accepting 'done' to avoid infinite loop."
+                        )
+                        done = True
+                        break
+
+                    if gate_score >= config.done_gate_threshold:
+                        logger.info(
+                            f"Step {steps}: Done-gate PASSED "
+                            f"(score={gate_score:.2f} >= {config.done_gate_threshold:.2f})"
+                        )
+                        done = True
+                        break
+
+                    # Override the premature "done"
+                    done_gate_overrides += 1
+                    logger.warning(
+                        f"Step {steps}: Done-gate REJECTED premature 'done' "
+                        f"(score={gate_score:.2f} < {config.done_gate_threshold:.2f}, "
+                        f"override {done_gate_overrides}/{config.done_gate_max_overrides})"
+                    )
+
+                    # Modify the task instruction to tell the agent to continue.
+                    # Strip any previous done-gate message before appending the new one.
+                    _DONE_GATE_MARKER = "\n\n[SYSTEM: The task is NOT yet complete"
+                    continuation_msg = (
+                        "\n\n[SYSTEM: The task is NOT yet complete based on automated "
+                        "evaluation (score: {score:.0%}). Your previous 'done' signal "
+                        "was overridden ({n}/{max}). Please examine the current screen "
+                        "carefully and continue working on the task. Do NOT declare "
+                        "'done' unless the task is truly finished.]"
+                    ).format(
+                        score=gate_score,
+                        n=done_gate_overrides,
+                        max=config.done_gate_max_overrides,
+                    )
+
+                    # Create a modified task with continuation message
+                    task = copy.copy(task)
+                    # Remove previous done-gate message if present
+                    marker_idx = task.instruction.find(_DONE_GATE_MARKER)
+                    if marker_idx >= 0:
+                        task.instruction = task.instruction[:marker_idx]
+                    task.instruction = task.instruction + continuation_msg
+
+                    # Get a fresh observation for the agent's next step
+                    # Use a no-op key press to trigger a new screenshot
+                    try:
+                        noop_action = BenchmarkAction(type="key", key="")
+                        obs, env_done, _info = adapter.step(noop_action)
+                        if env_done:
+                            logger.info(
+                                f"Step {steps}: Environment signaled done during "
+                                "done-gate screenshot refresh"
+                            )
+                            done = True
+                            break
+                    except Exception as e:
+                        logger.warning(
+                            f"Step {steps}: Failed to get fresh observation "
+                            f"after done-gate override: {e}. Using previous obs."
+                        )
+
+                    steps += 1
+                    continue
                 else:
-                    logger.info(f"Step {steps}: Agent signaled task completion")
-                done = True
-                break
+                    if config.done_gate and done_gate_overrides >= config.done_gate_max_overrides:
+                        logger.warning(
+                            f"Step {steps}: Done-gate max overrides reached "
+                            f"({config.done_gate_max_overrides}). Accepting 'done'."
+                        )
+                    done = True
+                    break
 
             # Execute action
             try:
diff --git a/scripts/run_dc_eval.py b/scripts/run_dc_eval.py
@@ -256,6 +256,23 @@ def main() -> int:
         default=2,
         help="Max replans when using --controller (default: 2)",
     )
+    parser.add_argument(
+        "--done-gate",
+        action="store_true",
+        help="Verify task completion before accepting agent's 'done' signal",
+    )
+    parser.add_argument(
+        "--done-gate-max-overrides",
+        type=int,
+        default=3,
+        help="Max times to override premature 'done' (default: 3)",
+    )
+    parser.add_argument(
+        "--done-gate-threshold",
+        type=float,
+        default=1.0,
+        help="Minimum score to accept 'done' (default: 1.0)",
+    )
     args = parser.parse_args()
 
     demo_dir = Path(args.demo_dir)
@@ -359,6 +376,12 @@ def main() -> int:
             cmd.append("--force-tray-icons")
         if args.waa_image_version:
             cmd.extend(["--waa-image-version", args.waa_image_version])
+        if args.done_gate:
+            cmd.extend([
+                "--done-gate",
+                "--done-gate-max-overrides", str(args.done_gate_max_overrides),
+                "--done-gate-threshold", str(args.done_gate_threshold),
+            ])
 
         result = subprocess.run(cmd)
         elapsed = time.time() - task_start