diff --git a/openadapt_evals/grounding.py b/openadapt_evals/grounding.py index 19c3ced..852d067 100644 --- a/openadapt_evals/grounding.py +++ b/openadapt_evals/grounding.py @@ -349,8 +349,10 @@ def _bbox_distance( def run_ocr(screenshot: bytes) -> list[dict]: """Run OCR on a screenshot and return detected text regions. - Uses ``pytesseract`` when available. If it is not installed, returns - an empty list (graceful degradation — callers must handle ``[]``). + Tries backends in order: + 1. GLM-OCR (VLM-based, better accuracy on complex UIs, ``pip install glmocr``) + 2. pytesseract (traditional OCR, requires system Tesseract binary) + 3. Returns ``[]`` if neither is available (graceful degradation). Args: screenshot: PNG image bytes. @@ -359,10 +361,79 @@ def run_ocr(screenshot: bytes) -> list[dict]: List of dicts with keys ``"text"``, ``"bbox"`` (``[x1, y1, x2, y2]``), and ``"confidence"`` (``0.0``–``1.0``). """ + # --- Try GLM-OCR first (VLM-based, better accuracy) --- + results = _run_glm_ocr(screenshot) + if results: + return results + + # --- Fallback to pytesseract --- + results = _run_pytesseract(screenshot) + if results: + return results + + logger.debug("No OCR backend available (tried glmocr, pytesseract)") + return [] + + +def _run_glm_ocr(screenshot: bytes) -> list[dict]: + """Run GLM-OCR on a screenshot. + + GLM-OCR uses a VLM (CogViT + GLM-0.5B) for semantic text extraction. + Returns structured results with bounding boxes. + + Requires: ``pip install glmocr`` + """ + try: + from glmocr import parse # type: ignore[import-untyped] + except ImportError: + return [] + + try: + import io + import tempfile + from pathlib import Path + + # GLM-OCR expects a file path, not bytes + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: + f.write(screenshot) + tmp_path = f.name + + try: + result = parse(tmp_path) + finally: + Path(tmp_path).unlink(missing_ok=True) + + # Convert GLM-OCR output to our standard format + results: list[dict] = [] + # GLM-OCR returns structured blocks with text and coordinates + for block in getattr(result, "blocks", []): + text = getattr(block, "text", "").strip() + if not text or len(text) < 2: + continue + bbox = getattr(block, "bbox", None) + if bbox and len(bbox) >= 4: + results.append({ + "text": text, + "bbox": [int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])], + "confidence": getattr(block, "confidence", 0.9), + }) + if results: + logger.info("GLM-OCR extracted %d text regions", len(results)) + return results + + except Exception as exc: + logger.debug("GLM-OCR failed: %s", exc) + return [] + + +def _run_pytesseract(screenshot: bytes) -> list[dict]: + """Run pytesseract OCR on a screenshot. + + Requires: ``pip install pytesseract`` + system Tesseract binary. + """ try: import pytesseract # type: ignore[import-untyped] except ImportError: - logger.debug("pytesseract not installed — returning empty OCR results") return [] try: @@ -372,7 +443,7 @@ def run_ocr(screenshot: bytes) -> list[dict]: image = Image.open(io.BytesIO(screenshot)) data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT) except Exception as exc: - logger.warning("OCR failed: %s", exc) + logger.debug("pytesseract failed: %s", exc) return [] results: list[dict] = [] @@ -393,6 +464,8 @@ def run_ocr(screenshot: bytes) -> list[dict]: "bbox": [x, y, x + w, y + h], "confidence": conf / 100.0, }) + if results: + logger.info("pytesseract extracted %d text regions", len(results)) return results diff --git a/pyproject.toml b/pyproject.toml index 7f5e910..565b760 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,6 +80,11 @@ aws = [ # AWS EC2 management for VM pool operations "boto3>=1.34.0", ] +ocr = [ + # OCR for Tier 1.5a text anchoring in grounding cascade + # GLM-OCR (VLM-based, better accuracy): pip install glmocr + "glmocr>=0.1.0", +] retrieval = [ # For RetrievalAugmentedAgent with automatic demo selection "openadapt-retrieval>=0.1.0",