diff --git a/openadapt_evals/agents/demo_executor.py b/openadapt_evals/agents/demo_executor.py index 1e92157..df82414 100644 --- a/openadapt_evals/agents/demo_executor.py +++ b/openadapt_evals/agents/demo_executor.py @@ -4,9 +4,11 @@ interpret them. The planner is only consulted as a recovery mechanism when the expected screen state doesn't match. -Tier 1 (deterministic): keyboard shortcuts, typing — execute directly. -Tier 2 (grounder-only): clicks — grounder finds element by description. -Tier 3 (planner recovery): unexpected state — planner reasons about +Tier 1 (deterministic): keyboard shortcuts, typing -- execute directly. +Tier 1.5a (text anchoring): clicks -- OCR text matching finds element + by matching target description against on-screen text ($0, <100ms). +Tier 2 (grounder-only): clicks -- grounder finds element by description. +Tier 3 (planner recovery): unexpected state -- planner reasons about what to do when the demo doesn't match reality. Usage: @@ -29,6 +31,7 @@ from openadapt_evals.adapters.base import BenchmarkAction, BenchmarkObservation from openadapt_evals.demo_library import Demo, DemoStep +from openadapt_evals.grounding import GroundingTarget, ground_by_text, run_ocr try: from openadapt_evals.integrations.weave_integration import weave_op @@ -94,19 +97,23 @@ def run( screenshot_dir: Optional directory to save screenshots. Returns: - (score, screenshots) — score from evaluate_dense(). + (score, screenshots) -- score from evaluate_dense(). """ from openadapt_evals.adapters.rl_env import ResetConfig import time as _time + _t0 = _time.time() _tier1 = 0 + _tier15a = 0 _tier2 = 0 try: from openadapt_evals.telemetry import track_demo_execution + track_demo_execution( - phase="start", task_id=task_config.id, + phase="start", + task_id=task_config.id, num_steps=len(demo.steps), ) except Exception: @@ -121,8 +128,9 @@ def run( for i, step in enumerate(demo.steps): logger.info( - "Demo step %d/%d: %s %s — %s", - i + 1, len(demo.steps), + "Demo step %d/%d: %s %s -- %s", + i + 1, + len(demo.steps), step.action_type, step.action_value or "", step.description, @@ -137,6 +145,8 @@ def run( tier = (action.raw_action or {}).get("tier", 2) if tier == 1: _tier1 += 1 + elif tier == 1.5: + _tier15a += 1 else: _tier2 += 1 @@ -158,7 +168,9 @@ def run( if passed > 0: logger.info( "Step %d: milestones %d/%d (high-water)", - i + 1, passed, total, + i + 1, + passed, + total, ) time.sleep(self._step_delay) @@ -171,11 +183,16 @@ def run( try: from openadapt_evals.telemetry import track_demo_execution + track_demo_execution( - phase="completed", task_id=task_config.id, - num_steps=len(demo.steps), score=score, + phase="completed", + task_id=task_config.id, + num_steps=len(demo.steps), + score=score, duration_seconds=_time.time() - _t0, - tier1_count=_tier1, tier2_count=_tier2, + tier1_count=_tier1, + tier15a_count=_tier15a, + tier2_count=_tier2, ) except Exception: pass @@ -190,9 +207,10 @@ def _execute_step( ) -> BenchmarkAction | None: """Produce an action for a demo step using tiered intelligence. - Tier 1: keyboard/type → direct execution (no VLM). - Tier 2: click → grounder finds element by description. - Tier 3: recovery → planner reasons about unexpected state. + Tier 1: keyboard/type -> direct execution (no VLM). + Tier 1.5a: click -> OCR text anchoring ($0, <100ms). + Tier 2: click -> grounder finds element by description. + Tier 3: recovery -> planner reasons about unexpected state. """ if step.action_type == "key": # Tier 1: deterministic keyboard action @@ -212,28 +230,110 @@ def _execute_step( logger.info("Tier 1 (direct): type=%r", text) return BenchmarkAction(type="type", text=text, raw_action={"tier": 1}) - if step.action_type == "click": - # Tier 2: grounder finds element by description + if step.action_type in ("click", "double_click"): description = step.description or step.target_description if not description: description = step.action_description + + # Tier 1.5a: Text anchoring -- try OCR match before VLM + text_action = self._try_text_anchoring(step, obs, description) + if text_action is not None: + if step.action_type == "double_click" and text_action.type == "click": + return BenchmarkAction( + type="double_click", + x=text_action.x, + y=text_action.y, + raw_action=text_action.raw_action, + ) + return text_action + + # Fall through to Tier 2: VLM grounder + if step.action_type == "double_click": + logger.info("Tier 2 (grounder): double-click %s", description) + action = self._ground_click(obs, description) + if action and action.type == "click": + return BenchmarkAction( + type="double_click", + x=action.x, + y=action.y, + ) + return action + logger.info("Tier 2 (grounder): %s", description) return self._ground_click(obs, description) - if step.action_type == "double_click": - description = step.description or step.target_description - logger.info("Tier 2 (grounder): double-click %s", description) - action = self._ground_click(obs, description) - if action and action.type == "click": - return BenchmarkAction( - type="double_click", x=action.x, y=action.y, - ) - return action - - # Unknown action type — log and skip + # Unknown action type -- log and skip logger.warning("Unknown action type %r, skipping", step.action_type) return None + def _try_text_anchoring( + self, + step: DemoStep, + obs: BenchmarkObservation, + description: str, + ) -> BenchmarkAction | None: + """Tier 1.5a: try to ground a click via OCR text matching. + + Returns a BenchmarkAction if a high-confidence text match is + found (local_score > 0.85), otherwise returns None so the + caller falls through to the VLM grounder. + """ + if not obs.screenshot or not description: + return None + + # Build a GroundingTarget from the step + if hasattr(step, "grounding_target") and step.grounding_target is not None: + target = step.grounding_target + else: + target = GroundingTarget(description=description) + + ocr_results = run_ocr(obs.screenshot) + if not ocr_results: + return None + + candidates = ground_by_text(obs.screenshot, target, ocr_results) + if not candidates or candidates[0].local_score <= 0.85: + if candidates: + logger.debug( + "Tier 1.5a: best OCR match '%s' (score=%.2f) " + "below threshold, falling through to VLM", + candidates[0].matched_text, + candidates[0].local_score, + ) + return None + + best = candidates[0] + logger.info( + "Tier 1.5a: OCR text match '%s' (score=%.2f), skipping VLM", + best.matched_text, + best.local_score, + ) + + # Convert pixel coordinates to normalized fractions. + try: + import io as _io + + from PIL import Image + + img = Image.open(_io.BytesIO(obs.screenshot)) + sw, sh = img.size + except Exception: + logger.warning("Tier 1.5a: could not determine screenshot size") + return None + + x_frac = best.point[0] / sw + y_frac = best.point[1] / sh + + return BenchmarkAction( + type="click", + x=x_frac, + y=y_frac, + raw_action={ + "tier": 1.5, + "ocr_matched_text": best.matched_text, + }, + ) + def _ground_click( self, obs: BenchmarkObservation, @@ -267,6 +367,7 @@ def _ground_click_http( serving ``inclusionAI/UI-Venus-1.5-8B``. """ import base64 + import requests endpoint = self._grounder_endpoint.rstrip("/") @@ -286,18 +387,25 @@ def _ground_click_http( ] if obs.screenshot: b64 = base64.b64encode(obs.screenshot).decode() - content.insert(0, { - "type": "image_url", - "image_url": {"url": f"data:image/png;base64,{b64}"}, - }) + content.insert( + 0, + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{b64}"}, + }, + ) try: - resp = requests.post(url, json={ - "model": "UI-Venus-1.5-8B", - "messages": [{"role": "user", "content": content}], - "max_tokens": 128, - "temperature": 0.0, - }, timeout=60) + resp = requests.post( + url, + json={ + "model": "UI-Venus-1.5-8B", + "messages": [{"role": "user", "content": content}], + "max_tokens": 128, + "temperature": 0.0, + }, + timeout=60, + ) resp.raise_for_status() raw = resp.json()["choices"][0]["message"]["content"] except Exception as exc: @@ -306,10 +414,11 @@ def _ground_click_http( logger.info("HTTP grounder: %s", raw[:200]) - # Parse [x1,y1,x2,y2] bbox → center click + # Parse [x1,y1,x2,y2] bbox -> center click from openadapt_evals.agents.planner_grounder_agent import ( PlannerGrounderAgent, ) + return PlannerGrounderAgent._parse_bbox_to_action(raw) def _ground_click_vlm( @@ -341,11 +450,12 @@ def _ground_click_vlm( ) from openadapt_evals.training.trl_rollout import parse_action_json + action = parse_action_json(raw) if action.type == "done": logger.warning( - "Grounder could not find %r — returning click at center", + "Grounder could not find %r -- returning click at center", description, ) return BenchmarkAction(type="click", x=0.5, y=0.5) @@ -358,14 +468,16 @@ def _dispatch_action(self, env, action: BenchmarkAction): x, y = float(action.x), float(action.y) if 0 <= x <= 1 and 0 <= y <= 1: return env.pixel_action( - x_frac=x, y_frac=y, + x_frac=x, + y_frac=y, action_type=action.type, text=action.text, key=action.key, ) else: return env.pixel_action( - x=int(x), y=int(y), + x=int(x), + y=int(y), action_type=action.type, text=action.text, key=action.key, diff --git a/openadapt_evals/grounding.py b/openadapt_evals/grounding.py index 5e50cab..0cd5662 100644 --- a/openadapt_evals/grounding.py +++ b/openadapt_evals/grounding.py @@ -1,16 +1,26 @@ -"""Grounding data model for the DemoExecutor cascade. +"""Grounding data model and text anchoring for the DemoExecutor cascade. Defines GroundingTarget (stored per click step in demo) and GroundingCandidate (produced by each tier during grounding). +Also provides text anchoring (Phase 5 / Tier 1.5a): +- ``run_ocr``: extract text from a screenshot via pytesseract (optional dep). +- ``ground_by_text``: generate grounding candidates by matching OCR text + against a GroundingTarget description. + See docs/design/grounding_cascade_design_v3.md for the full architecture. """ from __future__ import annotations +import logging +import math +from collections import Counter from dataclasses import dataclass, field from typing import Any +logger = logging.getLogger(__name__) + @dataclass class GroundingTarget: @@ -64,7 +74,9 @@ def from_dict(cls, data: dict[str, Any]) -> GroundingTarget: if f in data: val = data[f] # Convert lists to tuples for bbox fields - if f in ("crop_bbox", "click_offset", "region_changed") and isinstance(val, list): + if f in ("crop_bbox", "click_offset", "region_changed") and isinstance( + val, list + ): val = tuple(val) kwargs[f] = val return cls(**kwargs) @@ -87,3 +99,212 @@ class GroundingCandidate: spatial_score: float | None = None # consistency with demo position visual_verify_score: float | None = None # crop resemblance to target accepted: bool = False + + +# --------------------------------------------------------------------------- +# Phase 5 / Tier 1.5a: Text anchoring -- OCR-based candidate generation +# --------------------------------------------------------------------------- + + +def run_ocr(screenshot: bytes) -> list[dict]: + """Run OCR on a screenshot, returning text with bounding boxes. + + Uses ``pytesseract`` if available (optional dependency). When + pytesseract is not installed, returns an empty list so that the + caller can gracefully fall back to VLM grounding. + + Args: + screenshot: PNG bytes of the current screen. + + Returns: + List of dicts, each with keys: + - ``"text"`` (str): detected text string. + - ``"bbox"`` (list[int]): ``[x1, y1, x2, y2]`` in pixels. + - ``"confidence"`` (float): 0.0 -- 1.0 OCR confidence. + """ + try: + import pytesseract # type: ignore[import-untyped] + from PIL import Image # type: ignore[import-untyped] + except ImportError: + logger.debug("pytesseract or Pillow not installed -- OCR unavailable") + return [] + + import io + + try: + img = Image.open(io.BytesIO(screenshot)) + data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT) + except Exception: + logger.warning("pytesseract OCR failed", exc_info=True) + return [] + + results: list[dict] = [] + for i, text in enumerate(data["text"]): + text = text.strip() + if not text or len(text) < 2: + continue + conf = data["conf"][i] + # pytesseract returns -1 for unreliable entries + if isinstance(conf, (int, float)) and conf < 0: + continue + x = data["left"][i] + y = data["top"][i] + w = data["width"][i] + h = data["height"][i] + results.append( + { + "text": text, + "bbox": [x, y, x + w, y + h], + "confidence": ( + float(conf) / 100.0 if isinstance(conf, (int, float)) else 0.0 + ), + } + ) + return results + + +def _char_overlap_ratio(a: str, b: str) -> float: + """Compute character-level overlap ratio between two strings. + + Returns the ratio of shared characters to the length of the shorter + string, giving a simple fuzzy-match score in [0.0, 1.0]. + """ + if not a or not b: + return 0.0 + a_lower = a.lower() + b_lower = b.lower() + counter_a = Counter(a_lower) + counter_b = Counter(b_lower) + shared = sum((counter_a & counter_b).values()) + return shared / min(len(a_lower), len(b_lower)) + + +def _bbox_center(bbox: list[int] | tuple[int, ...]) -> tuple[int, int]: + """Return the center point of a bounding box ``[x1, y1, x2, y2]``.""" + x1, y1, x2, y2 = bbox + return ((x1 + x2) // 2, (y1 + y2) // 2) + + +def _bbox_distance( + a: list[int] | tuple[int, ...], + b: list[int] | tuple[int, ...], +) -> float: + """Euclidean distance between the centers of two bounding boxes.""" + cx_a, cy_a = _bbox_center(a) + cx_b, cy_b = _bbox_center(b) + return math.sqrt((cx_a - cx_b) ** 2 + (cy_a - cy_b) ** 2) + + +_NEARBY_TEXT_BOOST = 0.05 +_NEARBY_TEXT_DISTANCE_PX = 100 +_MAX_CANDIDATES = 5 + + +def ground_by_text( + screenshot: bytes, + target: GroundingTarget, + ocr_results: list[dict] | None = None, +) -> list[GroundingCandidate]: + """Generate grounding candidates by matching target text against OCR. + + This is **Tier 1.5a** -- cheap ($0, <100 ms), runs *before* VLM + grounding. + + Matching tiers (highest to lowest): + - Exact match: ``local_score = 0.95`` + - Case-insensitive exact match: ``local_score = 0.90`` + - Substring (target description contains OCR text or vice versa): + ``local_score = 0.70`` + - Fuzzy match (>80 % character overlap): ``local_score = 0.50`` + + When ``target.nearby_text`` is set, candidates that have matching + nearby-text elements within ~100 px receive a small score boost for + spatial consistency. + + Args: + screenshot: Current screenshot PNG bytes (used only if + *ocr_results* is ``None`` to run OCR). + target: :class:`GroundingTarget` with ``description``, + ``nearby_text``, etc. + ocr_results: Pre-computed OCR results as + ``list[{"text": str, "bbox": [x1,y1,x2,y2]}]``. + If ``None``, :func:`run_ocr` is called on the screenshot. + + Returns: + Up to 5 :class:`GroundingCandidate` objects sorted by + ``local_score`` (highest first). + """ + if not target.description: + return [] + + if ocr_results is None: + ocr_results = run_ocr(screenshot) + + if not ocr_results: + return [] + + desc = target.description + candidates: list[GroundingCandidate] = [] + + for item in ocr_results: + ocr_text = item.get("text", "").strip() + if not ocr_text: + continue + bbox = item.get("bbox") + if not bbox or len(bbox) != 4: + continue + + score: float | None = None + + # 1. Exact match + if ocr_text == desc: + score = 0.95 + # 2. Case-insensitive exact match + elif ocr_text.lower() == desc.lower(): + score = 0.90 + # 3. Substring match (either direction) + elif desc.lower() in ocr_text.lower() or ocr_text.lower() in desc.lower(): + score = 0.70 + # 4. Fuzzy match (>80% character overlap) + else: + overlap = _char_overlap_ratio(ocr_text, desc) + if overlap > 0.80: + score = 0.50 + + if score is None: + continue + + center = _bbox_center(bbox) + candidates.append( + GroundingCandidate( + source="ocr", + point=center, + bbox=tuple(bbox), # type: ignore[arg-type] + local_score=score, + matched_text=ocr_text, + ) + ) + + # Nearby-text spatial consistency boost + if target.nearby_text and candidates: + for cand in candidates: + for nearby in target.nearby_text: + for item in ocr_results: + item_text = item.get("text", "").strip() + item_bbox = item.get("bbox") + if not item_text or not item_bbox or len(item_bbox) != 4: + continue + if nearby.lower() in item_text.lower(): + dist = _bbox_distance( + cand.bbox, # type: ignore[arg-type] + item_bbox, + ) + if dist <= _NEARBY_TEXT_DISTANCE_PX: + cand.local_score = min( + 1.0, cand.local_score + _NEARBY_TEXT_BOOST + ) + break + + # Sort by local_score descending, return top N + candidates.sort(key=lambda c: c.local_score, reverse=True) + return candidates[:_MAX_CANDIDATES] diff --git a/openadapt_evals/telemetry.py b/openadapt_evals/telemetry.py index 43a86fa..4392d72 100644 --- a/openadapt_evals/telemetry.py +++ b/openadapt_evals/telemetry.py @@ -157,13 +157,15 @@ def track_demo_execution( score: float | None = None, duration_seconds: float | None = None, tier1_count: int | None = None, + tier15a_count: int | None = None, tier2_count: int | None = None, ) -> bool: return capture_event("demo_execution", { "phase": phase, "task_id": task_id, "num_steps": num_steps, "score": score, "duration_seconds": duration_seconds, - "tier1_count": tier1_count, "tier2_count": tier2_count, + "tier1_count": tier1_count, "tier15a_count": tier15a_count, + "tier2_count": tier2_count, }) diff --git a/tests/test_text_anchoring.py b/tests/test_text_anchoring.py new file mode 100644 index 0000000..c3386c5 --- /dev/null +++ b/tests/test_text_anchoring.py @@ -0,0 +1,362 @@ +"""Tests for Tier 1.5a text anchoring (OCR-based grounding). + +All tests use pre-computed mock OCR results -- no pytesseract required. +""" + +from __future__ import annotations + +import sys +from unittest.mock import patch + +import pytest + +from openadapt_evals.grounding import ( + GroundingCandidate, + GroundingTarget, + ground_by_text, + run_ocr, +) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_ocr_results(*entries: tuple[str, list[int]]) -> list[dict]: + """Build mock OCR results from (text, bbox) pairs.""" + return [ + {"text": text, "bbox": bbox, "confidence": 0.95} + for text, bbox in entries + ] + + +DUMMY_SCREENSHOT = b"\x89PNG\r\n\x1a\n" + b"\x00" * 100 # fake PNG header + + +# --------------------------------------------------------------------------- +# ground_by_text tests +# --------------------------------------------------------------------------- + + +class TestGroundByTextExactMatch: + """Exact text match should produce local_score = 0.95.""" + + def test_exact_match(self): + target = GroundingTarget(description="Save") + ocr = _make_ocr_results(("Save", [100, 200, 160, 230])) + candidates = ground_by_text(DUMMY_SCREENSHOT, target, ocr_results=ocr) + + assert len(candidates) == 1 + assert candidates[0].local_score == 0.95 + assert candidates[0].matched_text == "Save" + assert candidates[0].source == "ocr" + # Center of [100, 200, 160, 230] = (130, 215) + assert candidates[0].point == (130, 215) + assert candidates[0].bbox == (100, 200, 160, 230) + + +class TestGroundByTextCaseInsensitive: + """Case-insensitive exact match should produce local_score = 0.90.""" + + def test_case_insensitive_match(self): + target = GroundingTarget(description="Save") + ocr = _make_ocr_results(("save", [100, 200, 160, 230])) + candidates = ground_by_text(DUMMY_SCREENSHOT, target, ocr_results=ocr) + + assert len(candidates) == 1 + assert candidates[0].local_score == 0.90 + assert candidates[0].matched_text == "save" + + def test_case_insensitive_match_upper(self): + target = GroundingTarget(description="save") + ocr = _make_ocr_results(("SAVE", [100, 200, 160, 230])) + candidates = ground_by_text(DUMMY_SCREENSHOT, target, ocr_results=ocr) + + assert len(candidates) == 1 + assert candidates[0].local_score == 0.90 + + +class TestGroundByTextSubstring: + """Substring match should produce local_score = 0.70.""" + + def test_description_contains_ocr_text(self): + target = GroundingTarget(description="Save button") + ocr = _make_ocr_results(("Save", [100, 200, 160, 230])) + candidates = ground_by_text(DUMMY_SCREENSHOT, target, ocr_results=ocr) + + assert len(candidates) == 1 + assert candidates[0].local_score == 0.70 + + def test_ocr_text_contains_description(self): + target = GroundingTarget(description="Save") + ocr = _make_ocr_results(("Save As...", [100, 200, 200, 230])) + candidates = ground_by_text(DUMMY_SCREENSHOT, target, ocr_results=ocr) + + assert len(candidates) == 1 + assert candidates[0].local_score == 0.70 + + +class TestGroundByTextNoMatch: + """No match should return empty list.""" + + def test_no_match(self): + target = GroundingTarget(description="Delete") + ocr = _make_ocr_results( + ("Save", [100, 200, 160, 230]), + ("Open", [200, 200, 260, 230]), + ) + candidates = ground_by_text(DUMMY_SCREENSHOT, target, ocr_results=ocr) + assert candidates == [] + + def test_empty_description(self): + target = GroundingTarget(description="") + ocr = _make_ocr_results(("Save", [100, 200, 160, 230])) + candidates = ground_by_text(DUMMY_SCREENSHOT, target, ocr_results=ocr) + assert candidates == [] + + def test_empty_ocr_results(self): + target = GroundingTarget(description="Save") + candidates = ground_by_text(DUMMY_SCREENSHOT, target, ocr_results=[]) + assert candidates == [] + + +class TestGroundByTextNearbyTextBoost: + """Candidates near expected text should get boosted score.""" + + def test_nearby_text_boost(self): + target = GroundingTarget( + description="OK", + nearby_text=["Cancel"], + ) + # "OK" at (130, 215), "Cancel" at (210, 215) -- within 100px + ocr = _make_ocr_results( + ("OK", [100, 200, 160, 230]), + ("Cancel", [170, 200, 250, 230]), + ) + candidates = ground_by_text(DUMMY_SCREENSHOT, target, ocr_results=ocr) + + # "OK" should get an exact-match score (0.95) + nearby boost (0.05) + ok_candidates = [c for c in candidates if c.matched_text == "OK"] + assert len(ok_candidates) == 1 + assert ok_candidates[0].local_score == 1.0 # 0.95 + 0.05 capped at 1.0 + + def test_no_nearby_text_no_boost(self): + target = GroundingTarget( + description="OK", + nearby_text=["Cancel"], + ) + # "OK" at (130, 215), "Cancel" far away at (820, 795) -- > 100px + ocr = _make_ocr_results( + ("OK", [100, 200, 160, 230]), + ("Cancel", [780, 780, 860, 810]), + ) + candidates = ground_by_text(DUMMY_SCREENSHOT, target, ocr_results=ocr) + + ok_candidates = [c for c in candidates if c.matched_text == "OK"] + assert len(ok_candidates) == 1 + # No boost -- distance too large + assert ok_candidates[0].local_score == 0.95 + + +class TestGroundByTextSorted: + """Candidates should be sorted by local_score descending.""" + + def test_sorted_by_score(self): + target = GroundingTarget(description="Save") + ocr = _make_ocr_results( + ("save", [100, 200, 160, 230]), # case-insensitive = 0.90 + ("Save", [300, 200, 360, 230]), # exact = 0.95 + ("Save button", [500, 200, 620, 230]), # substring = 0.70 + ) + candidates = ground_by_text(DUMMY_SCREENSHOT, target, ocr_results=ocr) + + assert len(candidates) == 3 + assert candidates[0].local_score == 0.95 # exact + assert candidates[1].local_score == 0.90 # case-insensitive + assert candidates[2].local_score == 0.70 # substring + + def test_max_5_candidates(self): + target = GroundingTarget(description="Item") + # 7 OCR results all containing "Item" (substring) + ocr = _make_ocr_results( + *[ + (f"Item {i}", [i * 100, 100, i * 100 + 80, 130]) + for i in range(7) + ] + ) + candidates = ground_by_text(DUMMY_SCREENSHOT, target, ocr_results=ocr) + assert len(candidates) <= 5 + + +class TestGroundByTextFuzzyMatch: + """Fuzzy match (>80% character overlap) should give 0.50.""" + + def test_fuzzy_match(self): + target = GroundingTarget(description="Settings") + # "Settingz" has 7/8 overlap = 87.5% > 80% + ocr = _make_ocr_results(("Settingz", [100, 200, 200, 230])) + candidates = ground_by_text(DUMMY_SCREENSHOT, target, ocr_results=ocr) + + assert len(candidates) == 1 + assert candidates[0].local_score == 0.50 + + def test_no_fuzzy_below_threshold(self): + target = GroundingTarget(description="Settings") + # "Abcdefgh" has very low overlap + ocr = _make_ocr_results(("Abcdefgh", [100, 200, 200, 230])) + candidates = ground_by_text(DUMMY_SCREENSHOT, target, ocr_results=ocr) + + assert candidates == [] + + +# --------------------------------------------------------------------------- +# run_ocr tests +# --------------------------------------------------------------------------- + + +class TestRunOcrNoPytesseract: + """run_ocr should return empty list when pytesseract is not installed.""" + + def test_returns_empty_without_pytesseract(self): + # Simulate pytesseract not being installed + with patch.dict(sys.modules, {"pytesseract": None}): + result = run_ocr(DUMMY_SCREENSHOT) + assert result == [] + + +class TestGroundByTextWithMockOcr: + """ground_by_text with pre-computed OCR results (no pytesseract needed).""" + + def test_full_pipeline_with_mock_ocr(self): + """Simulate a realistic UI with multiple text elements.""" + target = GroundingTarget( + description="Clear browsing data", + nearby_text=["Privacy", "Security"], + ) + ocr = _make_ocr_results( + ("Privacy", [50, 100, 150, 130]), + ("Security", [50, 140, 160, 170]), + ("Clear browsing data", [200, 300, 400, 330]), + ("Downloads", [200, 350, 350, 380]), + ("History", [200, 400, 300, 430]), + ) + + candidates = ground_by_text(DUMMY_SCREENSHOT, target, ocr_results=ocr) + + assert len(candidates) >= 1 + best = candidates[0] + assert best.matched_text == "Clear browsing data" + assert best.local_score >= 0.95 + assert best.source == "ocr" + + def test_none_ocr_results_calls_run_ocr(self): + """When ocr_results is None, ground_by_text should call run_ocr.""" + target = GroundingTarget(description="Save") + + with patch( + "openadapt_evals.grounding.run_ocr", + return_value=_make_ocr_results(("Save", [100, 200, 160, 230])), + ) as mock_ocr: + candidates = ground_by_text( + DUMMY_SCREENSHOT, target, ocr_results=None + ) + + mock_ocr.assert_called_once_with(DUMMY_SCREENSHOT) + assert len(candidates) == 1 + assert candidates[0].matched_text == "Save" + + +# --------------------------------------------------------------------------- +# DemoExecutor integration tests +# --------------------------------------------------------------------------- + + +class TestDemoExecutorTextAnchoring: + """Test that DemoExecutor tries Tier 1.5a before VLM grounder.""" + + def test_text_anchoring_skips_vlm_on_high_confidence(self): + """High-confidence OCR match should bypass VLM grounder.""" + from openadapt_evals.adapters.base import BenchmarkObservation + from openadapt_evals.agents.demo_executor import DemoExecutor + from openadapt_evals.demo_library import DemoStep + + executor = DemoExecutor(grounder_model="gpt-4.1-mini") + + # Create a 1x1 white PNG for screenshot dimension detection + import io + + from PIL import Image + + img = Image.new("RGB", (1920, 1080), color="white") + buf = io.BytesIO() + img.save(buf, format="PNG") + screenshot_bytes = buf.getvalue() + + obs = BenchmarkObservation(screenshot=screenshot_bytes) + step = DemoStep( + step_index=0, + screenshot_path="", + action_type="click", + action_description="click Save button", + target_description="Save", + action_value="", + description="Save", + ) + + # Mock run_ocr to return a high-confidence match + mock_ocr = _make_ocr_results(("Save", [100, 200, 160, 230])) + with patch( + "openadapt_evals.agents.demo_executor.run_ocr", + return_value=mock_ocr, + ): + action = executor._execute_step(step, obs) + + assert action is not None + assert action.type == "click" + # Should have used OCR coordinates, not VLM + raw = action.raw_action or {} + assert raw.get("tier") == 1.5 + assert raw.get("ocr_matched_text") == "Save" + + def test_text_anchoring_falls_through_on_low_confidence(self): + """Low-confidence OCR match should fall through to VLM grounder.""" + from openadapt_evals.adapters.base import ( + BenchmarkAction, + BenchmarkObservation, + ) + from openadapt_evals.agents.demo_executor import DemoExecutor + from openadapt_evals.demo_library import DemoStep + + executor = DemoExecutor(grounder_model="gpt-4.1-mini") + + obs = BenchmarkObservation(screenshot=DUMMY_SCREENSHOT) + step = DemoStep( + step_index=0, + screenshot_path="", + action_type="click", + action_description="click the submit form", + target_description="Submit", + action_value="", + description="Submit", + ) + + # Mock run_ocr with no matching text + with patch( + "openadapt_evals.agents.demo_executor.run_ocr", + return_value=[], + ): + # Mock the VLM grounder to avoid real API calls + with patch.object( + executor, + "_ground_click", + return_value=BenchmarkAction( + type="click", x=0.5, y=0.5 + ), + ) as mock_vlm: + action = executor._execute_step(step, obs) + + # Should have fallen through to VLM + mock_vlm.assert_called_once() + assert action is not None + assert action.type == "click"