|
| 1 | +"""Prompt construction and VLM output parsing for GRPO training. |
| 2 | +
|
| 3 | +Copies SYSTEM_PROMPT from openadapt-ml next_action.py so GRPO |
| 4 | +operates in the same prompt distribution as SFT. NO openadapt-ml imports. |
| 5 | +""" |
| 6 | + |
| 7 | +from __future__ import annotations |
| 8 | + |
| 9 | +import json as _json |
| 10 | +import logging |
| 11 | +import re |
| 12 | +from dataclasses import dataclass |
| 13 | +from typing import Any |
| 14 | + |
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# (width, height) in pixels used when a caller does not pass a screen size.
DEFAULT_SCREEN_SIZE: tuple[int, int] = (1920, 1080)

# Copied from openadapt_ml.datasets.next_action.SYSTEM_PROMPT.
# Kept as one entry per prompt line; empty entries are the blank lines that
# separate the sections. The joined text has no trailing newline.
SYSTEM_PROMPT = "\n".join(
    [
        (
            "You are a GUI automation agent. Given a screenshot and a user goal, "
            "predict the single next action."
        ),
        "",
        "COORDINATE SYSTEM:",
        "- x=0.0 is the LEFT edge, x=1.0 is the RIGHT edge",
        "- y=0.0 is the TOP edge, y=1.0 is the BOTTOM edge",
        (
            "- To click the CENTER of an element, estimate its center position "
            "as a fraction of screen width/height"
        ),
        (
            "- Example: An element in the middle of the screen would be "
            "approximately x=0.5, y=0.5"
        ),
        "",
        "ALLOWED ACTIONS (use exactly this format):",
        "- CLICK(x=0.XX, y=0.XX) \u2192 click at normalized coordinates",
        '- TYPE(text="...") \u2192 type text into the currently focused field',
        "- WAIT() \u2192 wait for UI to update",
        "- DONE() \u2192 task is complete",
        "",
        "RESPONSE FORMAT (required):",
        "Thought: [Brief reasoning: what element to interact with and why]",
        "Action: [Exactly one action, e.g., CLICK(x=0.35, y=0.42)]",
        "",
        (
            "IMPORTANT: Output coordinates with 2 decimal places. "
            "Estimate the center of target elements."
        ),
    ]
)
| 40 | + |
| 41 | + |
@dataclass
class SimpleAction:
    """Minimal record of one agent action, free of any openadapt-ml types.

    Every field has a default, so ``SimpleAction()`` is a "done" action with
    no coordinates, text or key.
    """

    # Action name; the parser in this module emits "click", "type", "wait"
    # or "done".
    type: str = "done"
    # Click position; None when the action carries no coordinates.
    x: float | None = None
    y: float | None = None
    # Text payload for "type" actions.
    text: str | None = None
    # Optional key name; nothing in this module sets it.
    key: str | None = None
| 51 | + |
| 52 | + |
def build_agent_messages(
    instruction: str, *, include_image: bool = False, action_history: str = "",
) -> list[dict]:
    """Assemble the system + user chat messages in the SFT prompt layout.

    Args:
        instruction: The user goal, inserted after ``"Goal: "``.
        include_image: When true, the user content is a two-item list (an
            image placeholder followed by the text part); otherwise it is the
            plain prompt string.
        action_history: Optional history text placed between the goal and the
            instructions; omitted entirely when empty.

    Returns:
        A two-element list: the system message carrying ``SYSTEM_PROMPT`` and
        the user message.
    """
    pieces = [f"Goal: {instruction}\n\n"]
    if action_history:
        # History occupies its own line directly before the instructions.
        pieces.append(f"{action_history}\n")
    pieces.append(
        "Look at the screenshot and determine the NEXT action.\n\n"
        "Thought: [what element to interact with and why]\n"
        'Action: [CLICK(x=..., y=...) or TYPE(text="...") or WAIT() or DONE()]'
    )
    prompt = "".join(pieces)

    if include_image:
        user_payload = [{"type": "image"}, {"type": "text", "text": prompt}]
    else:
        user_payload = prompt

    system_message = {"role": "system", "content": SYSTEM_PROMPT}
    user_message = {"role": "user", "content": user_payload}
    return [system_message, user_message]
| 75 | + |
| 76 | + |
# Decimal literal accepted inside CLICK(...): "0.5", ".5", "5." or "-0.5".
# Malformed runs such as "0.5." or "." do not match, so float() cannot raise.
_NUMBER = r"-?(?:\d+\.?\d*|\.\d+)"
_CLICK_RE = re.compile(
    rf"CLICK\(\s*x\s*=\s*({_NUMBER})\s*,\s*y\s*=\s*({_NUMBER})\s*\)",
    re.IGNORECASE,
)
# The closing quote must equal the opening one (backreference \1), so the
# other quote character may appear unescaped inside the text.
_TYPE_RE = re.compile(
    r"""TYPE\(text=(["'])((?:\\.|(?!\1)[^\\])*)\1\)""", re.IGNORECASE
)
# Previous, more lenient pattern (tolerates mismatched quote characters);
# kept as a fallback so inputs that used to parse still do.
_LEGACY_TYPE_RE = re.compile(
    r"""TYPE\(text=["']([^"'\\]*(?:\\.[^"'\\]*)*)["']\)""", re.IGNORECASE
)
_ESCAPED_CHAR_RE = re.compile(r"""\\(["'\\])""")


def _unescape_type_text(raw: str) -> str:
    """Undo backslash escapes of quotes and backslashes in a single pass."""
    return _ESCAPED_CHAR_RE.sub(lambda hit: hit.group(1), raw)


def _parse_json_action(text: str, width: int, height: int) -> SimpleAction | None:
    """Parse a ``{"action_type": ...}`` object in *text*; None if unusable."""
    found = re.search(r'\{[^}]*"action_type"[^}]*\}', text)
    if not found:
        return None
    try:
        payload = _json.loads(found.group())
        kind = payload.get("action_type", "").lower()
        if kind == "click":
            coord = payload.get("coordinate", payload.get("coords", []))
            if len(coord) >= 2:
                xv, yv = float(coord[0]), float(coord[1])
                # Values that are both <= 1.0 are treated as normalized.
                if xv <= 1.0 and yv <= 1.0:
                    xv, yv = xv * width, yv * height
                # int() rejects NaN/Infinity (handled below); the result is
                # then clamped to the screen, like the CLICK(...) DSL path.
                px = max(0, min(width, int(xv)))
                py = max(0, min(height, int(yv)))
                return SimpleAction(type="click", x=px, y=py)
        elif kind == "type":
            return SimpleAction(type="type", text=payload.get("text", ""))
        elif kind in ("done", "wait"):
            return SimpleAction(type=kind)
    except (
        ValueError,  # invalid JSON, non-numeric or NaN coordinate
        TypeError,
        AttributeError,
        KeyError,
        IndexError,
        OverflowError,  # Infinity coordinate
    ) as exc:
        logger.debug("Ignoring malformed JSON action %r: %s", found.group(), exc)
    return None


def parse_vlm_output_to_action(
    text: str, screen_size: tuple[int, int] = DEFAULT_SCREEN_SIZE,
) -> SimpleAction:
    """Parse VLM output to SimpleAction. Supports Thought/Action, bare DSL, and JSON.

    Formats are tried in this order: a JSON object containing
    ``"action_type"``, ``CLICK(x=..., y=...)`` with normalized coordinates,
    ``TYPE(text="...")``, ``WAIT()``, ``DONE()``. Unparseable output is logged
    as a warning and yields a "done" action.

    Args:
        text: Raw model output.
        screen_size: ``(width, height)`` in pixels, used to convert normalized
            coordinates to pixel coordinates.

    Returns:
        The parsed action. Click coordinates are integer pixels clamped to
        ``[0, width]`` x ``[0, height]``.
    """
    text = text.strip()
    width, height = screen_size
    logger.debug("Parsing VLM output (%d chars): %.200s", len(text), text)

    # Prefer an "Action:" that starts a line, so the word "action:" inside the
    # Thought text is not mistaken for the action marker. Fall back to the
    # first occurrence anywhere for single-line outputs.
    action_match = re.search(
        r"^[ \t]*Action:\s*(.+)", text, re.IGNORECASE | re.MULTILINE
    ) or re.search(r"Action:\s*(.+)", text, re.IGNORECASE)
    if action_match:
        text = action_match.group(1).strip()

    # JSON: {"action_type": "click", "coordinate": [x, y]}
    json_action = _parse_json_action(text, width, height)
    if json_action is not None:
        return json_action

    # CLICK(x=..., y=...)
    m = _CLICK_RE.search(text)
    if m:
        xf = max(0.0, min(1.0, float(m.group(1))))
        yf = max(0.0, min(1.0, float(m.group(2))))
        return SimpleAction(type="click", x=int(xf * width), y=int(yf * height))

    # TYPE(text="...")
    m = _TYPE_RE.search(text)
    if m:
        return SimpleAction(type="type", text=_unescape_type_text(m.group(2)))
    m = _LEGACY_TYPE_RE.search(text)
    if m:
        return SimpleAction(type="type", text=_unescape_type_text(m.group(1)))

    if re.search(r"\bWAIT\s*\(\s*\)", text, re.IGNORECASE):
        return SimpleAction(type="wait")
    if re.search(r"\bDONE\s*\(\s*\)", text, re.IGNORECASE):
        return SimpleAction(type="done")

    logger.warning("Could not parse VLM output: %s. Defaulting to DONE.", text)
    return SimpleAction(type="done")
| 129 | + |
| 130 | + |
def format_action_as_text(
    action: SimpleAction, screen_size: tuple[int, int] = DEFAULT_SCREEN_SIZE,
) -> str:
    """Render a SimpleAction as DSL text for log-prob computation.

    Args:
        action: The action to render; only ``type``, ``x``, ``y`` and ``text``
            are read.
        screen_size: ``(width, height)`` in pixels, used to turn pixel click
            coordinates back into fractions of the screen.

    Returns:
        ``CLICK(x=0.XX, y=0.XX)``, ``TYPE(text="...")`` or ``WAIT()``; any
        other action type renders as ``DONE()``.
    """
    screen_w, screen_h = screen_size
    kind = action.type

    if kind == "wait":
        return "WAIT()"

    if kind == "type":
        raw = action.text or ""
        # Backslashes are doubled first so the quote escapes stay unambiguous.
        quoted = raw.replace("\\", "\\\\").replace('"', '\\"')
        return f'TYPE(text="{quoted}")'

    if kind != "click":
        return "DONE()"

    # A non-positive dimension yields 0.0 instead of dividing by it.
    frac_x = 0.0
    if screen_w > 0:
        frac_x = (action.x or 0) / screen_w
    frac_y = 0.0
    if screen_h > 0:
        frac_y = (action.y or 0) / screen_h
    return f"CLICK(x={frac_x:.2f}, y={frac_y:.2f})"
0 commit comments