OpenAdaptAI
diff --git a/openadapt_evals/training/standalone/__init__.py b/openadapt_evals/training/standalone/__init__.py
Lines changed: 9 additions & 0 deletions
diff --git a/openadapt_evals/training/standalone/config.py b/openadapt_evals/training/standalone/config.py
Lines changed: 30 additions & 0 deletions
diff --git a/openadapt_evals/training/standalone/model_loader.py b/openadapt_evals/training/standalone/model_loader.py
Lines changed: 51 additions & 0 deletions
diff --git a/openadapt_evals/training/standalone/prompt.py b/openadapt_evals/training/standalone/prompt.py
Lines changed: 145 additions & 0 deletions
diff --git a/openadapt_evals/training/standalone/reward.py b/openadapt_evals/training/standalone/reward.py
Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,9 @@
+"""Standalone GRPO trainer with direct WAA HTTP integration.
+
+No openadapt-ml dependency. Will migrate to openadapt-ml later.
+"""
+
+from openadapt_evals.training.standalone.config import TrainingConfig
+from openadapt_evals.training.standalone.trainer import GRPOTrainer
+
+# Names re-exported as this package's public API.
+__all__ = ["GRPOTrainer", "TrainingConfig"]
@@ -0,0 +1,30 @@
+"""Training configuration for standalone GRPO trainer."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+
+@dataclass
+class TrainingConfig:
+    """Configuration for standalone GRPO training.
+
+    Every field has a default, so ``TrainingConfig()`` is valid as-is.
+
+    NOTE(review): the trainer that consumes these fields is not defined in
+    this module. Field notes marked "presumably" are inferred from names and
+    defaults only and should be confirmed against the trainer.
+    """
+
+    # --- Model / LoRA ---
+    # Same names and defaults as the keyword arguments of
+    # model_loader.load_model_and_processor.
+    model_name: str = "Qwen/Qwen2.5-VL-7B-Instruct"
+    load_in_4bit: bool = True
+    lora_r: int = 16
+    lora_alpha: int = 32
+    lora_checkpoint: str | None = None
+
+    # --- Rollout / generation ---
+    # Presumably the number of rollouts collected per training step -- confirm.
+    num_rollouts_per_step: int = 8
+    # Presumably the cap on agent steps within one episode -- confirm.
+    max_steps_per_episode: int = 15
+    temperature: float = 0.7
+    max_new_tokens: int = 2048  # 100 truncates reasoning -- keep high
+
+    # --- Environment / tasks ---
+    # Presumably the base URL of the WAA HTTP server -- confirm.
+    server_url: str = "http://localhost:5001"
+    task_ids: list[str] = field(default_factory=list)
+    task_dir: str | None = None
+    # Same default as prompt.DEFAULT_SCREEN_SIZE, which is unpacked there as
+    # (width, height).
+    screen_size: tuple[int, int] = (1920, 1080)
+    # Presumably the number of recent steps inspected for stuck detection -- confirm.
+    stuck_window: int = 3
+
+    # --- Optimisation / checkpointing ---
+    learning_rate: float = 5e-6
+    num_training_steps: int = 1000
+    save_every_steps: int = 50
+    output_dir: str = "checkpoints/grpo"
+
+    # --- Reward evaluation ---
+    # Same default as the ``model`` argument of
+    # reward.evaluate_milestones_screenshot.
+    eval_model: str = "gpt-4.1-mini"
@@ -0,0 +1,51 @@
+"""HuggingFace + PEFT model loading for standalone GRPO. No openadapt-ml imports."""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+def load_model_and_processor(
+    model_name: str,
+    *,
+    load_in_4bit: bool = True,
+    lora_r: int = 16,
+    lora_alpha: int = 32,
+    lora_checkpoint: str | None = None,
+) -> tuple[Any, Any]:
+    """Load a vision-language model wrapped in a trainable LoRA adapter.
+
+    Args:
+        model_name: Identifier passed to ``from_pretrained`` for both the
+            processor and the model.
+        load_in_4bit: When true, load the base weights with NF4 4-bit
+            quantization and bfloat16 compute.
+        lora_r: LoRA rank for a freshly created adapter.
+        lora_alpha: LoRA alpha for a freshly created adapter.
+        lora_checkpoint: If given, load this existing adapter (trainable)
+            instead of creating a new one; ``lora_r`` / ``lora_alpha`` are
+            then unused.
+
+    Returns:
+        ``(model, processor)``.
+    """
+    # Heavy dependencies are imported inside the function, not at module import.
+    import torch
+    from peft import LoraConfig, PeftModel, get_peft_model
+    from transformers import AutoProcessor
+
+    # Fall back to the older auto-class name when the newer one is unavailable.
+    try:
+        from transformers import AutoModelForImageTextToText as AutoVLM
+    except ImportError:
+        from transformers import AutoModelForVision2Seq as AutoVLM
+
+    processor = AutoProcessor.from_pretrained(model_name)
+
+    from_pretrained_kwargs: dict[str, Any] = dict(
+        torch_dtype=torch.bfloat16,
+        device_map="auto",
+    )
+    if load_in_4bit:
+        from transformers import BitsAndBytesConfig
+
+        nf4_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.bfloat16,
+            bnb_4bit_quant_type="nf4",
+        )
+        from_pretrained_kwargs["quantization_config"] = nf4_config
+    base_model = AutoVLM.from_pretrained(model_name, **from_pretrained_kwargs)
+
+    if not lora_checkpoint:
+        # Fresh adapter on the attention projection layers.
+        adapter_config = LoraConfig(
+            r=lora_r,
+            lora_alpha=lora_alpha,
+            target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
+            task_type="CAUSAL_LM",
+        )
+        peft_model = get_peft_model(base_model, adapter_config)
+    else:
+        logger.info("Loading existing LoRA from %s", lora_checkpoint)
+        peft_model = PeftModel.from_pretrained(
+            base_model, lora_checkpoint, is_trainable=True,
+        )
+
+    peft_model.print_trainable_parameters()
+    return peft_model, processor
@@ -0,0 +1,145 @@
+"""Prompt construction and VLM output parsing for GRPO training.
+
+Copies SYSTEM_PROMPT from openadapt-ml next_action.py so GRPO
+operates in the same prompt distribution as SFT. NO openadapt-ml imports.
+"""
+
+from __future__ import annotations
+
+import json as _json
+import logging
+import re
+from dataclasses import dataclass
+from typing import Any
+
+logger = logging.getLogger(__name__)
+# Default (width, height) in pixels, used when callers do not pass a
+# screen size to the parsing/formatting helpers below.
+DEFAULT_SCREEN_SIZE: tuple[int, int] = (1920, 1080)
+
+# Copied from openadapt_ml.datasets.next_action.SYSTEM_PROMPT
+# It is duplicated here (not imported) so this module has no openadapt-ml
+# dependency. Keep the text in sync with the original so GRPO operates in
+# the same prompt distribution as SFT.
+SYSTEM_PROMPT = (
+    "You are a GUI automation agent. Given a screenshot and a user goal, "
+    "predict the single next action.\n\n"
+    "COORDINATE SYSTEM:\n"
+    "- x=0.0 is the LEFT edge, x=1.0 is the RIGHT edge\n"
+    "- y=0.0 is the TOP edge, y=1.0 is the BOTTOM edge\n"
+    "- To click the CENTER of an element, estimate its center position "
+    "as a fraction of screen width/height\n"
+    "- Example: An element in the middle of the screen would be "
+    "approximately x=0.5, y=0.5\n\n"
+    "ALLOWED ACTIONS (use exactly this format):\n"
+    "- CLICK(x=0.XX, y=0.XX)  \u2192 click at normalized coordinates\n"
+    '- TYPE(text="...")     \u2192 type text into the currently focused field\n'
+    "- WAIT()                 \u2192 wait for UI to update\n"
+    "- DONE()                 \u2192 task is complete\n\n"
+    "RESPONSE FORMAT (required):\n"
+    "Thought: [Brief reasoning: what element to interact with and why]\n"
+    "Action: [Exactly one action, e.g., CLICK(x=0.35, y=0.42)]\n\n"
+    "IMPORTANT: Output coordinates with 2 decimal places. "
+    "Estimate the center of target elements."
+)
+
+
+@dataclass
+class SimpleAction:
+    """Lightweight action (no openadapt-ml dependency).
+
+    Produced by ``parse_vlm_output_to_action`` and rendered back to DSL text
+    by ``format_action_as_text``.
+    """
+
+    # Action kind. The parser in this module emits "click", "type", "wait"
+    # or "done"; the formatter renders any other value as DONE().
+    type: str = "done"
+    # Click position. The parser stores pixel coordinates here (already
+    # scaled by the screen size, as ints), not normalized 0..1 fractions.
+    x: float | None = None
+    y: float | None = None
+    # Text payload of a "type" action.
+    text: str | None = None
+    # Neither set nor read by the parser/formatter in this module.
+    # NOTE(review): presumably a key name for key-press actions -- confirm.
+    key: str | None = None
+
+
+def build_agent_messages(
+    instruction: str, *, include_image: bool = False, action_history: str = "",
+) -> list[dict]:
+    """Assemble the system + user chat messages in the SFT prompt format.
+
+    Args:
+        instruction: Task goal shown to the model after ``Goal:``.
+        include_image: When true, the user content is a list holding an image
+            placeholder followed by the text part; otherwise it is the plain
+            text string.
+        action_history: Optional pre-rendered history, inserted between the
+            goal and the instructions when non-empty.
+
+    Returns:
+        A two-element list: the system message, then the user message.
+    """
+    pieces = [f"Goal: {instruction}\n\n"]
+    if action_history:
+        pieces.append(f"{action_history}\n")
+    pieces.append("Look at the screenshot and determine the NEXT action.\n\n")
+    pieces.append("Thought: [what element to interact with and why]\n")
+    pieces.append('Action: [CLICK(x=..., y=...) or TYPE(text="...") or WAIT() or DONE()]')
+    prompt_text = "".join(pieces)
+
+    user_message: dict[str, Any] = {"role": "user", "content": prompt_text}
+    if include_image:
+        # Image placeholder first, then the text part.
+        user_message["content"] = [
+            {"type": "image"},
+            {"type": "text", "text": prompt_text},
+        ]
+
+    system_message = {"role": "system", "content": SYSTEM_PROMPT}
+    return [system_message, user_message]
+
+
+def parse_vlm_output_to_action(
+    text: str, screen_size: tuple[int, int] = DEFAULT_SCREEN_SIZE,
+) -> SimpleAction:
+    """Parse VLM output to SimpleAction. Supports Thought/Action, bare DSL, and JSON.
+
+    Parsing is best-effort; malformed model output never raises. Steps, in order:
+
+    1. If an ``Action: ...`` line is present, only the rest of that line is
+       parsed.
+    2. A JSON object containing ``"action_type"`` (click / type / done /
+       wait). Click coordinates where both values are ``<= 1.0`` are treated
+       as normalized and scaled by the screen size; otherwise they are taken
+       as pixels.
+    3. The DSL forms ``CLICK(x=.., y=..)`` (normalized, clamped to [0, 1]),
+       ``TYPE(text="...")`` (single or double quotes), ``WAIT()``, ``DONE()``.
+
+    Args:
+        text: Raw model output.
+        screen_size: ``(width, height)`` in pixels used to scale normalized
+            coordinates.
+
+    Returns:
+        The parsed action, with click coordinates in pixels. If nothing can
+        be parsed, a warning is logged and a ``done`` action is returned.
+    """
+    text = text.strip()
+    width, height = screen_size
+    logger.debug("Parsing VLM output (%d chars): %.200s", len(text), text)
+
+    # Extract from "Action: ..." format ('.' stops at a newline, so only the
+    # remainder of that line is kept).
+    action_match = re.search(r"Action:\s*(.+)", text, re.IGNORECASE)
+    if action_match:
+        text = action_match.group(1).strip()
+
+    # JSON: {"action_type": "click", "coordinate": [x, y]}
+    json_match = re.search(r'\{[^}]*"action_type"[^}]*\}', text)
+    if json_match:
+        try:
+            d = _json.loads(json_match.group())
+            atype = d.get("action_type", "").lower()
+            coord = d.get("coordinate", d.get("coords", []))
+            if atype == "click" and len(coord) >= 2:
+                xv, yv = float(coord[0]), float(coord[1])
+                if xv <= 1.0 and yv <= 1.0:
+                    xv, yv = xv * width, yv * height
+                return SimpleAction(type="click", x=int(xv), y=int(yv))
+            if atype == "type":
+                return SimpleAction(type="type", text=d.get("text", ""))
+            if atype in ("done", "wait"):
+                return SimpleAction(type=atype)
+        except Exception:
+            # Deliberately broad: the JSON comes from free-form model output,
+            # so any failure here just means "not usable JSON" and we fall
+            # through to the DSL patterns. Log it so it is not lost silently.
+            logger.debug(
+                "Ignoring unusable JSON action: %.200s",
+                json_match.group(),
+                exc_info=True,
+            )
+
+    # CLICK(x=..., y=...)
+    # The number pattern accepts "5", "5.", ".5" and "0.35" but not strings
+    # such as "0.5." or "1.2.3", which float() would reject with ValueError.
+    num = r"-?(?:\d+\.?\d*|\.\d+)"
+    m = re.search(rf"CLICK\(x=({num}),\s*y=({num})\)", text, re.IGNORECASE)
+    if m:
+        xf = max(0.0, min(1.0, float(m.group(1))))
+        yf = max(0.0, min(1.0, float(m.group(2))))
+        return SimpleAction(type="click", x=int(xf * width), y=int(yf * height))
+
+    # TYPE(text="...")
+    # The closing quote must match the opening one (backreference), so the
+    # other quote character may appear unescaped inside the text, e.g.
+    # TYPE(text="it's").
+    m = re.search(
+        r"""TYPE\(text=(["'])((?:\\.|(?!\1)[^\\])*)\1\)""", text, re.IGNORECASE,
+    )
+    if m:
+        # Single pass, so an escaped backslash can never merge with the
+        # character after it to form a second escape.
+        t = re.sub(r"""\\([\\"'])""", r"\1", m.group(2))
+        return SimpleAction(type="type", text=t)
+
+    if re.search(r"\bWAIT\s*\(\s*\)", text, re.IGNORECASE):
+        return SimpleAction(type="wait")
+    if re.search(r"\bDONE\s*\(\s*\)", text, re.IGNORECASE):
+        return SimpleAction(type="done")
+
+    logger.warning("Could not parse VLM output: %s. Defaulting to DONE.", text)
+    return SimpleAction(type="done")
+
+
+def format_action_as_text(
+    action: SimpleAction, screen_size: tuple[int, int] = DEFAULT_SCREEN_SIZE,
+) -> str:
+    """Render a SimpleAction as DSL text for log-prob computation.
+
+    Click coordinates are divided by the screen size and printed with two
+    decimals. Typed text has backslashes and double quotes escaped. Any
+    action type other than click / type / wait is rendered as ``DONE()``.
+
+    Args:
+        action: Action to render; click ``x`` / ``y`` are read as pixels.
+        screen_size: ``(width, height)`` in pixels.
+
+    Returns:
+        One of ``CLICK(x=.., y=..)``, ``TYPE(text="..")``, ``WAIT()``, ``DONE()``.
+    """
+    width, height = screen_size
+    kind = action.type
+
+    if kind == "click":
+        # A missing coordinate counts as 0; a non-positive dimension yields 0.0.
+        norm_x = 0.0
+        if width > 0:
+            norm_x = (action.x or 0) / width
+        norm_y = 0.0
+        if height > 0:
+            norm_y = (action.y or 0) / height
+        return f"CLICK(x={norm_x:.2f}, y={norm_y:.2f})"
+
+    if kind == "type":
+        raw = action.text or ""
+        # Backslashes first, so the ones added for quotes are not doubled.
+        escaped = raw.replace("\\", "\\\\")
+        escaped = escaped.replace('"', '\\"')
+        return f'TYPE(text="{escaped}")'
+
+    return "WAIT()" if kind == "wait" else "DONE()"
@@ -0,0 +1,35 @@
+"""Reward: group-relative advantages + VLM milestone evaluation. No openadapt-ml imports."""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+def compute_group_advantages(rewards: list[float]) -> list[float]:
+    """Return GRPO group-relative advantages: ``(r - mean) / (std + eps)``.
+
+    The standard deviation is the population one (divides by ``n``) and
+    ``eps`` is ``1e-8``. An empty input gives an empty list. When the group
+    has effectively no spread (``std < 1e-8``), every advantage is ``0.0``.
+    """
+    count = len(rewards)
+    if count == 0:
+        return []
+
+    mean = sum(rewards) / count
+    deviations = [r - mean for r in rewards]
+    std = (sum(dev ** 2 for dev in deviations) / count) ** 0.5
+
+    if std < 1e-8:
+        # All rewards are (numerically) equal: there is no learning signal.
+        return [0.0] * count
+
+    denom = std + 1e-8
+    return [dev / denom for dev in deviations]
+
+
+def evaluate_milestones_screenshot(
+    task_config: Any, screenshot: bytes, *, model: str = "gpt-4.1-mini",
+) -> float:
+    """VLM screenshot-only milestone evaluation. Returns passed/total [0,1].
+
+    Only milestones whose ``check.check`` equals ``"screenshot"`` are judged;
+    each one is passed to ``vlm_judge`` with its ``check.description``.
+
+    Args:
+        task_config: Object with an optional ``milestones`` attribute.
+        screenshot: Screenshot bytes handed to the judge.
+        model: Model name forwarded to ``vlm_judge``.
+
+    Returns:
+        Fraction of screenshot milestones judged as passed, or ``0.0`` when
+        there are none (attribute missing, ``None``, empty, or no milestone
+        of the screenshot kind).
+    """
+    # A missing attribute and an explicit ``milestones = None`` both mean
+    # "no milestones"; previously None raised TypeError during iteration.
+    milestones = getattr(task_config, "milestones", None) or []
+    screenshot_milestones = [m for m in milestones if m.check.check == "screenshot"]
+    if not screenshot_milestones:
+        return 0.0
+
+    # Imported only once there is something to judge.
+    from openadapt_evals.vlm_evaluator import vlm_judge
+
+    passed = sum(
+        1
+        for m in screenshot_milestones
+        # The first element of the judge's result is used as the pass flag.
+        if vlm_judge(screenshot, m.check.description or "", model=model)[0]
+    )
+    return passed / len(screenshot_milestones)