Skip to content

Commit 2c15ca1

Browse files
abrichr and claude authored
feat: add --correction-library and --enable-correction-capture CLI flags (#130)
Wires the correction flywheel into the `live` subcommand:

- --correction-library PATH: load/store corrections for the flywheel
- --enable-correction-capture: prompt for human corrections on failure
- --controller, --max-retries, --max-replans: DemoController flags

When --controller is active with --correction-library, the controller checks stored corrections before replanning and captures new corrections when enabled.

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent abcafe8 commit 2c15ca1

1 file changed

Lines changed: 53 additions & 7 deletions

File tree

  • openadapt_evals/benchmarks

openadapt_evals/benchmarks/cli.py

Lines changed: 53 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -730,14 +730,50 @@ def cmd_live(args: argparse.Namespace) -> int:
730730
print("Example: --task-ids notepad_1,notepad_2,browser_1")
731731
return 1
732732

733+
# Check if controller mode is requested
734+
use_controller = getattr(args, "controller", False) and demo_text is not None
735+
if use_controller:
736+
print(f"Using DemoController (max_retries={args.max_retries}, max_replans={args.max_replans})")
737+
738+
# Set up correction store if requested
739+
correction_store = None
740+
enable_correction_capture = getattr(args, "enable_correction_capture", False)
741+
correction_library_path = getattr(args, "correction_library", None)
742+
if correction_library_path:
743+
from openadapt_evals.correction_store import CorrectionStore
744+
745+
correction_store = CorrectionStore(correction_library_path)
746+
print(f"Correction library: {correction_library_path}")
747+
if enable_correction_capture:
748+
print("Correction capture: ENABLED (will prompt for human corrections on failure)")
749+
733750
# Run evaluation
734-
results = evaluate_agent_on_benchmark(
735-
agent=agent,
736-
adapter=adapter,
737-
max_steps=args.max_steps,
738-
task_ids=task_ids,
739-
config=eval_config,
740-
)
751+
if use_controller:
752+
from openadapt_evals.demo_controller import run_with_controller
753+
754+
results = []
755+
for tid in task_ids:
756+
task = adapter.load_task(tid)
757+
result = run_with_controller(
758+
agent=agent,
759+
adapter=adapter,
760+
task=task,
761+
demo_text=demo_text,
762+
max_steps=args.max_steps,
763+
max_retries=args.max_retries,
764+
max_replans=args.max_replans,
765+
correction_store=correction_store,
766+
enable_correction_capture=enable_correction_capture,
767+
)
768+
results.append(result)
769+
else:
770+
results = evaluate_agent_on_benchmark(
771+
agent=agent,
772+
adapter=adapter,
773+
max_steps=args.max_steps,
774+
task_ids=task_ids,
775+
config=eval_config,
776+
)
741777

742778
# Compute and display metrics
743779
metrics = compute_metrics(results)
@@ -2517,6 +2553,16 @@ def main() -> int:
25172553
live_parser.add_argument("--focus-check-method", type=str, default="win32",
25182554
choices=["win32", "a11y", "both"],
25192555
help="Method for foreground window check: win32 (fast, default), a11y, or both")
2556+
live_parser.add_argument("--controller", action="store_true",
2557+
help="Use DemoController state machine for step-by-step plan execution with VLM verification")
2558+
live_parser.add_argument("--max-retries", type=int, default=2,
2559+
help="Max retries per step when using --controller (default: 2)")
2560+
live_parser.add_argument("--max-replans", type=int, default=2,
2561+
help="Max replans when using --controller (default: 2)")
2562+
live_parser.add_argument("--correction-library", type=str, default=None,
2563+
help="Path to correction library directory for the correction flywheel")
2564+
live_parser.add_argument("--enable-correction-capture", action="store_true",
2565+
help="Enable HITL correction capture when agent fails (requires --correction-library)")
25202566

25212567
# Probe server
25222568
probe_parser = subparsers.add_parser("probe", help="Check if WAA server is reachable")

0 commit comments

Comments
 (0)