|
| 1 | +"""Annotation data classes, prompts, and utilities for WAA demo annotation. |
| 2 | +
|
| 3 | +Migrated from ``openadapt_ml.experiments.demo_prompt.annotate`` so that the |
| 4 | +eval workflow (record -> annotate -> demo -> agent -> evaluate) does not |
| 5 | +require a cross-repo dependency on ``openadapt-ml`` for annotation. |
| 6 | +
|
| 7 | +What stays in openadapt-ml: |
| 8 | + - ``coalesce_steps()`` — depends on Episode schema, used by training |
| 9 | + - ``annotate_episode()`` — for local capture annotation (not WAA) |
| 10 | + - ``render_click_marker()`` — used by ``annotate_episode()`` |
| 11 | + - CLI commands for local capture annotation |
| 12 | + - ``format_demo.py`` — used by multiple openadapt-ml modules |
| 13 | +""" |
| 14 | + |
| 15 | +from __future__ import annotations |
| 16 | + |
| 17 | +import json |
| 18 | +import logging |
| 19 | +import re |
| 20 | +from dataclasses import asdict, dataclass |
| 21 | +from pathlib import Path |
| 22 | +from typing import Any |
| 23 | + |
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Version tag for the annotated-demo format.
# NOTE(review): presumably the value written to ``AnnotatedDemo.schema_version``
# by the annotation pipeline — confirm against the caller.
SCHEMA_VERSION = "0.1"
| 27 | + |
| 28 | + |
| 29 | +# --------------------------------------------------------------------------- |
| 30 | +# Data model |
| 31 | +# --------------------------------------------------------------------------- |
| 32 | + |
| 33 | + |
@dataclass
class AnnotatedStep:
    """One annotated step of a demo trace.

    Field order is part of the positional constructor contract and of the
    key order produced by ``dataclasses.asdict``; do not reorder.
    """

    # Index of the step within the demo; rendered as ``step_index + 1`` when
    # the demo is formatted for a prompt.
    step_index: int
    # Timestamp of the step in milliseconds, or None when not available.
    timestamp_ms: int | None
    # Description of what is visible before the action.
    observation: str
    # Why the user performed the action.
    intent: str
    # Semantic description of the action (element names, not coordinates).
    action: str
    # The raw recorded action string, as shown to the annotator.
    action_raw: str
    # NOTE(review): presumably the pixel coordinates of the action, or None
    # when the action has no location — confirm against the recorder.
    action_px: list[int] | None
    # What actually changed after the action.
    result_observation: str
    # What the screen should look like after the action.
    expected_result: str
| 47 | + |
| 48 | + |
@dataclass
class AnnotatedDemo:
    """A complete annotated demo trace with JSON persistence helpers."""

    schema_version: str
    task_id: str | None
    instruction: str
    source: str  # "recorded"
    annotator: dict[str, str]
    recording_meta: dict[str, Any]
    steps: list[AnnotatedStep]

    def to_json(self, indent: int = 2) -> str:
        """Serialize the demo (including nested steps) to a JSON string.

        Values that ``json`` cannot encode natively are stringified via
        ``default=str``.
        """
        payload = asdict(self)
        return json.dumps(payload, indent=indent, default=str)

    def save(self, path: str | Path) -> None:
        """Write the demo as UTF-8 JSON to *path*, creating parent directories."""
        destination = Path(path)
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(self.to_json(), encoding="utf-8")
        logger.info(f"Saved annotated demo to {destination}")

    @classmethod
    def load(cls, path: str | Path) -> AnnotatedDemo:
        """Read a demo from the UTF-8 JSON file at *path*.

        The ``"steps"`` entry is required; each element is expanded into an
        ``AnnotatedStep`` and the remaining keys are passed to the
        constructor as keyword arguments.
        """
        raw_text = Path(path).read_text(encoding="utf-8")
        fields = json.loads(raw_text)
        step_records = fields.pop("steps")
        parsed_steps = [AnnotatedStep(**record) for record in step_records]
        return cls(steps=parsed_steps, **fields)
| 76 | + |
| 77 | + |
| 78 | +# --------------------------------------------------------------------------- |
| 79 | +# Prompts |
| 80 | +# --------------------------------------------------------------------------- |
| 81 | + |
# System-level instructions for the annotating model: be precise about UI
# elements and reply with JSON only.
ANNOTATION_SYSTEM_PROMPT = """\
You are annotating a human GUI demonstration for a task automation system.
Your annotations will be used to guide an AI agent performing the same task on a different screen.
Be precise about UI element names, labels, and visual landmarks.
Always respond with valid JSON only — no markdown, no extra text."""

# Per-step prompt template. Placeholders: instruction, step_num, total_steps,
# action_raw, previous_context, no_after_note. The doubled braces on the last
# line are escaped literals, i.e. the template is meant for ``str.format``.
# The JSON keys requested here match the keys of the placeholder dict returned
# by ``parse_annotation_response`` when parsing fails.
ANNOTATION_STEP_PROMPT = """\
Task: {instruction}
Step {step_num} of {total_steps}.

The user performed: {action_raw}
A red marker on the BEFORE image shows where the user clicked/interacted.

{previous_context}
Describe this step:

- OBSERVATION: Describe what is visible on the BEFORE image. Include:
 - Application/window name
 - Current panel, page, or dialog
 - 3-6 key visible UI elements with relative positions

- INTENT: Why is the user performing this action? (1 sentence)

- ACTION: Describe which element was interacted with. Name the element by its visible label/text, not by coordinates. Reference the red marker to identify the target.

- RESULT: Describe what actually changed between the BEFORE and AFTER images.{no_after_note}

- EXPECTED_RESULT: What should the screen look like after this action?

Respond with valid JSON only:
{{"observation": "...", "intent": "...", "action": "...", "result_observation": "...", "expected_result": "..."}}"""
| 113 | + |
| 114 | + |
| 115 | +# --------------------------------------------------------------------------- |
| 116 | +# Parsing |
| 117 | +# --------------------------------------------------------------------------- |
| 118 | + |
| 119 | + |
def parse_annotation_response(response: str) -> dict[str, str]:
    """Parse a VLM JSON response, tolerant of minor formatting issues.

    Parsing strategy, in order:
      1. Strip surrounding whitespace and, if the text starts with a markdown
         code fence, drop every fence line.
      2. Parse the whole text as JSON.
      3. Parse the span from the first ``{`` to the last ``}``.

    Only a JSON *object* is accepted. Valid JSON of another shape (a list,
    string, number, ...) is treated like a parse failure so callers always
    receive a dict, as the return annotation promises.

    Args:
        response: Raw text returned by the VLM.

    Returns:
        The parsed JSON object. If no object can be recovered, a placeholder
        dict whose ``observation`` holds the first 200 characters of the
        cleaned text and whose other annotation fields are empty strings.
    """
    text = response.strip()

    # Strip markdown code fences if present (all fence lines are removed).
    if text.startswith("```"):
        kept = [
            line for line in text.split("\n") if not line.strip().startswith("```")
        ]
        text = "\n".join(kept).strip()

    # Candidate payloads: the full text first, then the outermost {...} span.
    candidates = [text]
    start = text.find("{")
    end = text.rfind("}") + 1
    if start >= 0 and end > start and text[start:end] != text:
        candidates.append(text[start:end])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # json.loads happily returns lists/strings/numbers; only an object
        # satisfies the contract of this function.
        if isinstance(parsed, dict):
            return parsed

    logger.warning(f"Failed to parse VLM response as JSON: {text[:200]}")
    return {
        "observation": text[:200] if text else "",
        "intent": "",
        "action": "",
        "result_observation": "",
        "expected_result": "",
    }
| 151 | + |
| 152 | + |
| 153 | +# --------------------------------------------------------------------------- |
| 154 | +# Validation |
| 155 | +# --------------------------------------------------------------------------- |
| 156 | + |
| 157 | + |
def validate_annotations(demo: AnnotatedDemo) -> list[str]:
    """Check annotation quality and return a list of warning strings.

    Checks performed for every step:
      - ``observation``, ``intent`` and ``action`` are non-empty
      - at least one of ``result_observation`` / ``expected_result`` is set
      - ``result_observation`` is not just a copy of ``observation``
      - ``action`` does not contain raw normalized ``CLICK(0.x, 0.y)``
        coordinates (it should be semantic)
      - a Windows recording is not described with macOS terms

    Args:
        demo: The annotated demo to inspect. Only ``recording_meta`` and
            ``steps`` are read.

    Returns:
        Human-readable warnings, each prefixed with ``"Step <index>"``.
        An empty list means no issue was detected.
    """
    warnings: list[str] = []

    # The platform is a property of the recording, not of a step: resolve it
    # once. A missing or None value is treated as "unknown".
    platform = str((demo.recording_meta or {}).get("platform", "") or "").lower()
    # "darwin" contains the substring "win"; exclude it explicitly so macOS
    # recordings are not mistaken for Windows ones.
    is_windows = "win" in platform and "darwin" not in platform
    mac_terms = ("finder", "dock", "spotlight", "cmd+", "command+")

    for step in demo.steps:
        prefix = f"Step {step.step_index}"

        if not step.observation:
            warnings.append(f"{prefix}: empty observation")
        if not step.intent:
            warnings.append(f"{prefix}: empty intent")
        if not step.action:
            warnings.append(f"{prefix}: empty action")
        if not step.result_observation and not step.expected_result:
            warnings.append(f"{prefix}: no result_observation or expected_result")

        # A result that merely repeats the observation carries no information
        # about what the action changed.
        if (
            step.result_observation
            and step.result_observation.strip() == step.observation.strip()
        ):
            warnings.append(f"{prefix}: result_observation is identical to observation")

        # Check if action still contains raw coordinates
        if re.search(r"CLICK\(\s*0\.\d+\s*,\s*0\.\d+\s*\)", step.action):
            warnings.append(f"{prefix}: action contains raw coordinates: {step.action}")

        # Check for platform mismatches (Windows recording described with macOS terms)
        if is_windows:
            described = step.action.lower() + " " + step.observation.lower()
            warnings.extend(
                f"{prefix}: macOS term '{term}' in Windows recording"
                for term in mac_terms
                if term in described
            )

    return warnings
| 197 | + |
| 198 | + |
| 199 | +# --------------------------------------------------------------------------- |
| 200 | +# Formatting for prompt injection |
| 201 | +# --------------------------------------------------------------------------- |
| 202 | + |
| 203 | + |
def format_annotated_demo(demo: AnnotatedDemo, compact: bool = True) -> str:
    """Render an AnnotatedDemo as plain text for prompt injection.

    Each step is emitted as a ``Step N:`` header (``step_index + 1``) followed
    by bracketed ``Screen`` / ``Action`` / ``Result`` lines and a blank line.

    Args:
        demo: AnnotatedDemo to format.
        compact: When True (default) the screen line holds only the first
            sentence of the observation. When False the full observation is
            used and an ``Intent`` line is added.

    Returns:
        The formatted demo text.
    """
    out = ["DEMONSTRATION:", f"Goal: {demo.instruction}", ""]

    for step in demo.steps:
        out.append(f"Step {step.step_index + 1}:")

        if not compact:
            out.extend(
                [
                    f" [Screen: {step.observation}]",
                    f" [Intent: {step.intent}]",
                ]
            )
        else:
            # Compact mode: keep only the leading sentence of the observation.
            out.append(f" [Screen: {_first_sentence(step.observation)}]")

        out.append(f" [Action: {step.action}]")

        # The grounded result wins; the expected result is only a fallback.
        if step.result_observation:
            outcome = step.result_observation
        else:
            outcome = step.expected_result
        if outcome:
            out.append(f" [Result: {outcome}]")

        out.append("")

    return "\n".join(out)
| 244 | + |
| 245 | + |
| 246 | +def _first_sentence(text: str) -> str: |
| 247 | + """Extract first sentence from text.""" |
| 248 | + if not text: |
| 249 | + return "" |
| 250 | + # Split on period followed by space or end |
| 251 | + for i, ch in enumerate(text): |
| 252 | + if ch == "." and (i + 1 >= len(text) or text[i + 1] == " "): |
| 253 | + return text[: i + 1] |
| 254 | + return text |
0 commit comments