
Commit 4a28653

abrichr and claude authored
fix: detect and dismiss Windows lock screen before each task (#117)
* feat: add correction flywheel (store, capture, parser, controller hooks)

  Implements the correction flywheel MVP:
  - correction_store.py: JSON-file-based correction library with save / find (fuzzy string matching via SequenceMatcher) / load_all
  - correction_capture.py: human correction capture using the openadapt-capture Recorder (primary) with a PIL screenshot fallback
  - correction_parser.py: VLM call to parse before/after screenshots into a PlanStep dict (think/action/expect)
  - demo_controller.py: added correction_store and enable_correction_capture params. On retry exhaustion: check correction store -> inject match, or capture human correction -> parse -> store -> advance
  - cli.py: added --correction-library and --enable-correction-capture flags

  The loop: agent fails at step N -> correction store checked -> if match, inject corrected step -> if no match and capture enabled, human completes step -> Recorder captures -> VLM parses -> correction stored -> next run retrieves it.

  17 tests added, all passing. 54 existing demo_controller tests unaffected.

* fix: mock _has_recorder in correction capture test

  The test was calling the real Recorder, which may not have wait_for_ready in the installed version. Mock it to use the simple fallback path, since this is a unit test.

* fix: detect and dismiss Windows lock screen before each task

  Add _dismiss_lock_screen() to run_dc_eval.py, which checks for the LogonUI.exe process and types the password to unlock if the screen is locked. Called from ensure_waa_ready() after each successful probe.

  This prevents eval failures when the Windows VM has been idle and the lock screen has engaged between tasks or between sessions.

* chore: sync beads state

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent db22f6b commit 4a28653
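The retrieval step of the flywheel hinges on fuzzy matching of failed-step descriptions (the commit message notes SequenceMatcher-based save/find in correction_store.py). A minimal sketch of that lookup follows; `MiniCorrectionStore` and its `threshold` are hypothetical in-memory stand-ins for the real JSON-file-backed store, not its actual API:

```python
# Hypothetical in-memory stand-in for correction_store.py; only the
# fuzzy find() via SequenceMatcher is modeled here.
from __future__ import annotations

from difflib import SequenceMatcher


class MiniCorrectionStore:
    def __init__(self, threshold: float = 0.8):
        self.threshold = threshold
        self._corrections: dict[str, dict] = {}

    def save(self, failed_action: str, corrected_step: dict) -> None:
        # Key corrections by the failed step's action text
        self._corrections[failed_action] = corrected_step

    def find(self, failed_action: str) -> dict | None:
        # Return the best fuzzy match above the threshold, else None
        best, best_ratio = None, 0.0
        for key, step in self._corrections.items():
            ratio = SequenceMatcher(None, failed_action, key).ratio()
            if ratio > best_ratio:
                best, best_ratio = step, ratio
        return best if best_ratio >= self.threshold else None


store = MiniCorrectionStore()
store.save(
    "Click the Display button",
    {"think": "Sidebar has a Display entry",
     "action": "Click Display in sidebar",
     "expect": "Display pane opens"},
)
match = store.find("Click the Display button.")  # near-identical wording
miss = store.find("Open Bluetooth settings")     # unrelated step
```

On a later run, a hit would be injected as the next plan step instead of exhausting retries; a miss (when capture is enabled) triggers the human-correction path.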

9 files changed: 1,151 additions & 1 deletion

.beads/beads.db

Binary file (0 bytes changed); contents not shown.

.beads/issues.jsonl

Lines changed: 1 addition & 1 deletion
@@ -13,5 +13,5 @@
 {"id":"openadapt-evals-hvm","title":"VL model fix PR #18 ready to merge","notes":"2026-02-08: openadapt-ml PR #18 was already merged on 2026-01-29. VL model fix is done.","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-29T16:17:03.491938-05:00","created_by":"Richard Abrich","updated_at":"2026-02-08T12:55:19.233249-05:00","closed_at":"2026-02-08T12:55:19.233249-05:00","close_reason":"PR #18 already merged 2026-01-29"}
 {"id":"openadapt-evals-mx8","title":"Analyze evaluation results and publish findings","description":"After demo-conditioned evaluation completes, analyze results: success rates, failure modes, demo impact. Create data-driven roadmap for improvements.","notes":"wright repo (OpenAdaptAI/wright) scaffolding underway. Herald + consilium repos transferred to OpenAdaptAI org. Wright will be the orchestration layer for eval pipeline.","status":"open","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:06.328838-05:00","created_by":"Richard Abrich","updated_at":"2026-03-02T00:08:08.422633-05:00"}
 {"id":"openadapt-evals-sz4","title":"RCA: Windows product key prompt recurring issue","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.266286-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.493102-05:00","closed_at":"2026-01-20T20:32:06.493102-05:00","close_reason":"RCA complete - root cause is VERSION mismatch (CLI=11, Dockerfile=11e). Fix documented in RECURRING_ISSUES.md and WINDOWS_PRODUCT_KEY_RCA.md"}
-{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"2026-03-06: Core4 Trial 1 launched with --controller --done-gate --max-steps 30 (first ever run with both features). Prior 7 trials showed DC=14% vs ZS=18% — no lift. Root causes: (1) --controller was NEVER used, (2) no done-gate existed. PRs merged this session: #107 (readiness), #109 (core4 lane), #110 (done-gate). Results will be in benchmark_results/repeat_core4_trial1_20260306_154155/","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-03-07T01:44:43.380289-05:00"}
+{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"2026-03-06: Core4 Trial 1 launched with --controller --done-gate --max-steps 30 (first ever run with both features). Prior 7 trials showed DC=14% vs ZS=18% — no lift. Root causes: (1) --controller was NEVER used, (2) no done-gate existed. PRs merged this session: #107 (readiness), #109 (core4 lane), #110 (done-gate). Results will be in benchmark_results/repeat_core4_trial1_20260306_154155/","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-03-08T12:32:50.259805-04:00"}
 {"id":"openadapt-evals-wis","title":"Add pre-flight check to detect Windows install issues","status":"closed","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.865052-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.757261-05:00","closed_at":"2026-01-20T20:32:06.757261-05:00","close_reason":"Duplicate of openadapt-evals-0dt"}

openadapt_evals/benchmarks/cli.py

Lines changed: 18 additions & 0 deletions
@@ -460,6 +460,18 @@ def cmd_run(args: argparse.Namespace) -> int:
     if use_controller:
         print(f"Using DemoController (max_retries={args.max_retries}, max_replans={args.max_replans})")

+        # Set up correction store if requested
+        correction_store = None
+        enable_correction_capture = getattr(args, "enable_correction_capture", False)
+        correction_library_path = getattr(args, "correction_library", None)
+        if correction_library_path:
+            from openadapt_evals.correction_store import CorrectionStore
+
+            correction_store = CorrectionStore(correction_library_path)
+            print(f"Correction library: {correction_library_path}")
+            if enable_correction_capture:
+                print("Correction capture: ENABLED (will prompt for human corrections on failure)")
+
     # Run evaluation
     if use_controller:
         from openadapt_evals.demo_controller import run_with_controller
@@ -475,6 +487,8 @@ def cmd_run(args: argparse.Namespace) -> int:
                 max_steps=args.max_steps,
                 max_retries=args.max_retries,
                 max_replans=args.max_replans,
+                correction_store=correction_store,
+                enable_correction_capture=enable_correction_capture,
             )
             results.append(result)
         else:
@@ -2432,6 +2446,10 @@ def main() -> int:
     run_parser.add_argument("--focus-check-method", type=str, default="win32",
                             choices=["win32", "a11y", "both"],
                             help="Method for foreground window check: win32 (fast, default), a11y, or both")
+    run_parser.add_argument("--correction-library", type=str, default=None,
+                            help="Path to correction library directory for the correction flywheel")
+    run_parser.add_argument("--enable-correction-capture", action="store_true",
+                            help="Enable HITL correction capture when agent fails (requires --correction-library)")

     # Live evaluation (full control)
     live_parser = subparsers.add_parser("live", help="Run live evaluation against WAA server (full control)")
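The diff reads the new flags via `getattr` with defaults, so a Namespace built before these flags existed still works in `cmd_run`. A standalone sketch of that pattern (the path value here is illustrative):

```python
import argparse

parser = argparse.ArgumentParser()
# Same flag shapes as the diff above; argparse maps dashes to underscores
parser.add_argument("--correction-library", type=str, default=None)
parser.add_argument("--enable-correction-capture", action="store_true")

args = parser.parse_args(
    ["--correction-library", "/tmp/corrections", "--enable-correction-capture"]
)

# getattr with a default keeps the call site backward-compatible with
# Namespace objects that lack these attributes entirely
enable_correction_capture = getattr(args, "enable_correction_capture", False)
correction_library_path = getattr(args, "correction_library", None)
```

With no flags on the command line, both `getattr` calls fall through to the argparse defaults (`False` and `None`), and the correction flywheel stays disabled.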
openadapt_evals/correction_capture.py (new file)

Lines changed: 238 additions & 0 deletions

"""Correction capture for the correction flywheel.

Captures a human correction using openadapt-capture's Recorder (primary path)
or falls back to simple periodic screenshots via PIL if openadapt-capture is
not available.

The Recorder provides full input event recording (mouse + keyboard) plus
action-gated screenshots, which gives the VLM parser much richer context
for understanding what the human did.
"""

from __future__ import annotations

import logging
import os
import time
from dataclasses import dataclass, field

logger = logging.getLogger(__name__)


@dataclass
class CorrectionResult:
    """Result of a correction capture session."""

    screenshots: list[str] = field(default_factory=list)  # paths
    capture_dir: str | None = None  # openadapt-capture directory (if used)
    duration_seconds: float = 0.0
    output_dir: str = ""


def _take_screenshot(output_path: str) -> str | None:
    """Take a screenshot and save to output_path. Returns path or None."""
    try:
        from PIL import ImageGrab

        img = ImageGrab.grab()
        img.save(output_path)
        return output_path
    except Exception as exc:
        logger.warning("Screenshot failed: %s", exc)
        return None


def _has_recorder() -> bool:
    """Check if openadapt-capture Recorder is available."""
    try:
        from openadapt_capture.recorder import Recorder  # noqa: F401

        return True
    except ImportError:
        return False


def _prompt_user(step_desc: str, explanation: str) -> None:
    """Print the correction prompt to the terminal."""
    print("\n" + "=" * 60)
    print("CORRECTION NEEDED")
    print("=" * 60)
    print(f"Failed step: {step_desc}")
    if explanation:
        print(f"Reason: {explanation}")
    print("\nPlease complete this step manually.")
    print("Press Enter when done...")
    print("=" * 60 + "\n")


def _wait_for_enter(timeout_seconds: int) -> None:
    """Block until user presses Enter or timeout expires."""
    try:
        import select
        import sys

        if hasattr(select, "select"):
            remaining = timeout_seconds
            while remaining > 0:
                ready, _, _ = select.select([sys.stdin], [], [], 1.0)
                if ready:
                    sys.stdin.readline()
                    break
                remaining -= 1.0
        else:
            input()
    except EOFError:
        logger.info("stdin closed, stopping capture after timeout")
        time.sleep(min(timeout_seconds, 10))


class CorrectionCapture:
    """Capture a human correction for a failed step."""

    def __init__(self, output_dir: str):
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    def capture_correction(
        self,
        failure_context: dict,
        timeout_seconds: int = 120,
        interval_seconds: float = 2.0,
    ) -> CorrectionResult:
        """Capture a human correction.

        Uses openadapt-capture Recorder if available (full input events +
        action-gated screenshots), otherwise falls back to periodic PIL
        screenshots.
        """
        # Save the failure screenshot as "before"
        before_path = os.path.join(self.output_dir, "before.png")
        before_screenshots = []
        if failure_context.get("screenshot_bytes"):
            with open(before_path, "wb") as f:
                f.write(failure_context["screenshot_bytes"])
            before_screenshots.append(before_path)
        elif failure_context.get("screenshot_path"):
            before_screenshots.append(failure_context["screenshot_path"])

        step_desc = failure_context.get("step_action", "this step")
        explanation = failure_context.get("explanation", "")

        _prompt_user(step_desc, explanation)

        if _has_recorder():
            return self._capture_with_recorder(
                before_screenshots, timeout_seconds
            )
        else:
            logger.info("openadapt-capture not available, using simple screenshot capture")
            return self._capture_simple(
                before_screenshots, timeout_seconds, interval_seconds
            )

    def _capture_with_recorder(
        self,
        before_screenshots: list[str],
        timeout_seconds: int,
    ) -> CorrectionResult:
        """Full capture using openadapt-capture Recorder."""
        from openadapt_capture.recorder import Recorder

        capture_dir = os.path.join(self.output_dir, "recording")
        start = time.monotonic()

        with Recorder(
            capture_dir,
            task_description="Human correction for failed agent step",
            capture_video=False,  # screenshots only, faster
            capture_audio=False,
        ) as recorder:
            recorder.wait_for_ready(timeout=30)
            _wait_for_enter(timeout_seconds)
            recorder.stop()

        duration = time.monotonic() - start

        # Extract screenshots from the capture
        screenshot_paths = list(before_screenshots)
        try:
            from openadapt_capture.capture import CaptureSession

            session = CaptureSession.load(capture_dir)
            for i, action in enumerate(session.actions()):
                if action.screenshot is not None:
                    path = os.path.join(self.output_dir, f"action_{i:04d}.png")
                    action.screenshot.save(path)
                    screenshot_paths.append(path)
        except Exception as exc:
            logger.warning("Failed to extract screenshots from capture: %s", exc)
            # Fall back to taking a final screenshot
            after_path = os.path.join(self.output_dir, "after.png")
            taken = _take_screenshot(after_path)
            if taken:
                screenshot_paths.append(taken)

        logger.info(
            "Recorder capture complete: %d screenshots in %.1fs",
            len(screenshot_paths),
            duration,
        )
        return CorrectionResult(
            screenshots=screenshot_paths,
            capture_dir=capture_dir,
            duration_seconds=duration,
            output_dir=self.output_dir,
        )

    def _capture_simple(
        self,
        before_screenshots: list[str],
        timeout_seconds: int,
        interval_seconds: float,
    ) -> CorrectionResult:
        """Fallback: periodic PIL screenshots."""
        import threading

        start = time.monotonic()
        stop_event = threading.Event()
        screenshot_paths: list[str] = []

        def _capture_loop():
            idx = 0
            while not stop_event.is_set():
                stop_event.wait(interval_seconds)
                if stop_event.is_set():
                    break
                path = os.path.join(self.output_dir, f"capture_{idx:04d}.png")
                taken = _take_screenshot(path)
                if taken:
                    screenshot_paths.append(taken)
                idx += 1

        capture_thread = threading.Thread(target=_capture_loop, daemon=True)
        capture_thread.start()

        _wait_for_enter(timeout_seconds)

        stop_event.set()
        capture_thread.join(timeout=5)

        # Final "after" screenshot
        after_path = os.path.join(self.output_dir, "after.png")
        taken = _take_screenshot(after_path)
        if taken:
            screenshot_paths.append(taken)

        all_screenshots = list(before_screenshots) + screenshot_paths
        duration = time.monotonic() - start

        logger.info(
            "Simple capture complete: %d screenshots in %.1fs",
            len(all_screenshots),
            duration,
        )
        return CorrectionResult(
            screenshots=all_screenshots,
            duration_seconds=duration,
            output_dir=self.output_dir,
        )
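The `_capture_simple` fallback above is a standard stop-event polling loop. The pattern can be exercised in isolation with the screenshot call stubbed out; `periodic_capture` and `take_shot` here are illustrative names, and the stub simply returns the path instead of saving an image:

```python
import threading
import time


def periodic_capture(take_shot, interval: float, stop_event: threading.Event) -> list[str]:
    """Collect screenshot paths every `interval` seconds until stop_event is set."""
    paths = []
    idx = 0
    while not stop_event.is_set():
        # Event.wait doubles as an interruptible sleep: it returns early
        # the moment stop_event is set, so shutdown is prompt
        stop_event.wait(interval)
        if stop_event.is_set():
            break
        paths.append(take_shot(f"capture_{idx:04d}.png"))
        idx += 1
    return paths


stop = threading.Event()
results: list[str] = []
worker = threading.Thread(
    target=lambda: results.extend(periodic_capture(lambda p: p, 0.05, stop)),
    daemon=True,
)
worker.start()
time.sleep(0.3)  # let the worker run for a few intervals
stop.set()
worker.join(timeout=1)
```

Using `Event.wait(interval)` instead of `time.sleep(interval)` is what lets the real capture thread stop within one tick of the user pressing Enter.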
openadapt_evals/correction_parser.py (new file)

Lines changed: 86 additions & 0 deletions

"""Parse a human correction capture into a PlanStep.

Uses a VLM call to compare before/after screenshots and describe what
the human did in the same format as a plan step (think/action/expect).
"""

from __future__ import annotations

import json
import logging
import os

from openadapt_evals.vlm import vlm_call

logger = logging.getLogger(__name__)

_PARSE_PROMPT = """\
The agent was trying to perform a step but failed. A human then completed the step manually.

Failed step description: {step_action}
Failure explanation: {failure_explanation}

Compare the BEFORE screenshot (when the agent failed) and the AFTER screenshot \
(after the human completed the step). Describe what the human did to complete the step.

Respond in this exact JSON format:
{{
  "think": "reasoning about what needed to happen and why the agent failed",
  "action": "concrete description of what the human did (e.g., 'Click the Display button in the left sidebar')",
  "expect": "what the screen looks like after the action"
}}

Respond with ONLY the JSON object, no other text."""


def parse_correction(
    step_action: str,
    failure_explanation: str,
    before_screenshot: bytes,
    after_screenshot: bytes,
    model: str = "gpt-4.1-mini",
    provider: str = "openai",
) -> dict:
    """Parse before/after screenshots into a PlanStep dict.

    Returns dict with keys: think, action, expect.
    """
    prompt = _PARSE_PROMPT.format(
        step_action=step_action,
        failure_explanation=failure_explanation,
    )

    response = vlm_call(
        prompt,
        images=[before_screenshot, after_screenshot],
        model=model,
        provider=provider,
        max_tokens=512,
    )

    # Extract JSON from response
    try:
        # Try direct parse first
        result = json.loads(response)
    except json.JSONDecodeError:
        # Try to find JSON in the response
        import re

        match = re.search(r"\{[^}]+\}", response, re.DOTALL)
        if match:
            result = json.loads(match.group())
        else:
            logger.error("Failed to parse VLM response as JSON: %s", response[:200])
            result = {
                "think": f"Human corrected the step: {step_action}",
                "action": step_action,
                "expect": "Step completed successfully",
            }

    # Ensure required keys exist
    for key in ("think", "action", "expect"):
        if key not in result:
            result[key] = ""

    logger.info("Parsed correction: action=%s", result["action"][:80])
    return result
