OpenAdaptAI
diff --git a/‎openadapt_ml/training/grpo/__init__.py‎
Lines changed: 45 additions & 0 deletions b/‎openadapt_ml/training/grpo/__init__.py‎
Lines changed: 45 additions & 0 deletions
diff --git a/‎openadapt_ml/training/grpo/config.py‎
Lines changed: 65 additions & 0 deletions b/‎openadapt_ml/training/grpo/config.py‎
Lines changed: 65 additions & 0 deletions
diff --git a/‎openadapt_ml/training/grpo/cot_warmup.py‎
Lines changed: 212 additions & 0 deletions b/‎openadapt_ml/training/grpo/cot_warmup.py‎
Lines changed: 212 additions & 0 deletions
diff --git a/‎openadapt_ml/training/grpo/reward.py‎
Lines changed: 56 additions & 0 deletions b/‎openadapt_ml/training/grpo/reward.py‎
Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,45 @@
+"""GRPO (Group Relative Policy Optimization) training module.
+
+Provides online RL training for GUI agent VLMs using the GRPO algorithm.
+Connects to openadapt-evals RLEnvironment for rollout collection and
+task evaluation against live Windows Agent Arena VMs.
+
+Key components:
+    - GRPOConfig: Training configuration dataclass
+    - GRPOTrainer: Main training loop
+    - GRPORolloutCollector: Collects rollouts via RLEnvironment
+    - reward functions: Binary task success + group-relative advantages
+    - CoT warm-up: Chain-of-thought SFT before GRPO
+
+Example:
+    from openadapt_ml.training.grpo import GRPOConfig, GRPOTrainer
+
+    config = GRPOConfig(
+        task_ids=["notepad_1", "settings_1"],
+        num_training_steps=100,
+    )
+    trainer = GRPOTrainer(config)
+    trainer.train()
+"""
+
+from __future__ import annotations
+
+from openadapt_ml.training.grpo.config import GRPOConfig
+from openadapt_ml.training.grpo.reward import (
+    binary_task_success,
+    compute_group_advantages,
+)
+from openadapt_ml.training.grpo.rollout_collector import (
+    GRPORolloutCollector,
+    Rollout,
+)
+from openadapt_ml.training.grpo.trainer import GRPOTrainer
+
+# Public API of the package: every name listed here is imported above
+# and re-exported, and this list is what `from ... import *` exposes.
+__all__ = [
+    "GRPOConfig",
+    "GRPOTrainer",
+    "GRPORolloutCollector",
+    "Rollout",
+    "binary_task_success",
+    "compute_group_advantages",
+]
@@ -0,0 +1,65 @@
+"""GRPO training configuration.
+
+Follows the same pattern as TRLTrainingConfig in trl_trainer.py, with
+additional fields for GRPO-specific hyperparameters and environment setup.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+
+@dataclass
+class GRPOConfig:
+    """Configuration for GRPO (Group Relative Policy Optimization) training.
+
+    A plain dataclass of hyperparameters. Every field has a default, so
+    ``GRPOConfig()`` is constructible with no arguments; note that
+    ``task_ids`` is then an empty list. The class performs no validation
+    of the values it is given.
+
+    The model and LoRA defaults are intended to match those of
+    TRLTrainingConfig (see trl_trainer.py) for consistency.
+
+    Attributes:
+        model_name: HuggingFace model identifier.
+        load_in_4bit: Whether to use 4-bit quantization.
+        max_seq_length: Maximum sequence length for the model.
+        lora_r: LoRA rank.
+        lora_alpha: LoRA alpha scaling factor.
+        num_rollouts_per_step: Group size N for GRPO advantage computation.
+        max_steps_per_episode: Maximum actions per rollout episode.
+        temperature: Sampling temperature for action generation during rollouts.
+        kl_coef: KL divergence penalty coefficient against reference policy.
+        server_url: URL of the WAA server for live environment interaction.
+        task_ids: List of WAA task IDs to train on. Defaults to a fresh
+            empty list per instance.
+        learning_rate: Optimizer learning rate for LoRA parameter updates.
+        gradient_accumulation_steps: Number of gradient accumulation steps.
+        num_training_steps: Total number of GRPO training steps (outer loop).
+        save_every_steps: Checkpoint frequency, in training steps.
+        output_dir: Directory for saving checkpoints and logs.
+        stuck_window: Number of identical screenshots before early termination.
+    """
+
+    # Model (same defaults as TRLTrainingConfig)
+    model_name: str = "unsloth/Qwen2.5-VL-7B-Instruct"
+    load_in_4bit: bool = True
+    max_seq_length: int = 4096
+
+    # LoRA adapter shape
+    lora_r: int = 16
+    lora_alpha: int = 32
+
+    # GRPO-specific
+    num_rollouts_per_step: int = 8  # Group size N
+    max_steps_per_episode: int = 15
+    temperature: float = 0.7  # Sampling temperature for rollouts
+    kl_coef: float = 0.01  # KL divergence penalty
+
+    # Environment
+    server_url: str = "http://localhost:5001"
+    # default_factory gives each config its own list (no shared mutable default)
+    task_ids: list[str] = field(default_factory=list)
+
+    # Training
+    learning_rate: float = 5e-6
+    gradient_accumulation_steps: int = 8
+    num_training_steps: int = 1000
+    save_every_steps: int = 50
+    output_dir: str = "checkpoints/grpo"
+
+    # Stuck detection
+    stuck_window: int = 3
@@ -0,0 +1,212 @@
+"""Chain-of-thought warm-up for GRPO training.
+
+Provides utilities to annotate successful demonstration episodes with
+chain-of-thought reasoning, then convert them to SFT training format.
+This CoT SFT warm-up initializes the policy before GRPO online RL,
+giving the model a better starting point for action generation.
+
+The two-step process:
+    1. generate_cot_annotations(): Use a capable model to add reasoning
+       to each step of successful demonstrations.
+    2. build_cot_sft_samples(): Convert annotated episodes to the
+       TRL SFT format used by trl_trainer.py.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+def generate_cot_annotations(
+    episodes: list[Any],
+    annotator_model: str = "gpt-4o",
+) -> list[Any]:
+    """Add chain-of-thought reasoning to successful demonstrations.
+
+    For each step in a successful episode, uses the specified model to
+    generate reasoning that explains the action choice given the
+    screenshot context. This produces the text that is later wrapped in
+    <think>...</think> blocks to teach the model to reason before acting.
+
+    Steps are annotated IN PLACE: ``step.reasoning`` is assigned on the
+    step objects passed in, so the caller's episodes are modified.
+
+    A step is skipped (left untouched) when it already has a truthy
+    ``reasoning``, or when it has no screenshot path or no action.
+    A failure while annotating one step is logged as a warning and does
+    not stop the remaining steps from being processed.
+
+    Args:
+        episodes: List of Episode objects from openadapt_ml.schema.
+            Only successful episodes (episode.success == True) are
+            annotated; others are returned unchanged.
+        annotator_model: Model identifier for generating CoT annotations.
+            Must support vision (image inputs).
+
+    Returns:
+        A new list holding the same episode objects, in the same order.
+        If the API adapter module cannot be imported, an error is logged
+        and the input list itself is returned with nothing annotated.
+    """
+    # Deferred import for optional dependency
+    try:
+        from openadapt_ml.models.api_adapter import get_api_adapter
+    except ImportError:
+        logger.error(
+            "openadapt_ml.models.api_adapter not available. "
+            "Cannot generate CoT annotations."
+        )
+        return episodes
+
+    # The adapter depends only on annotator_model, so it is built once
+    # (on the first step that needs it) and reused, instead of being
+    # re-created for every step. Creation stays inside the per-step
+    # try block: if it fails, the failure is logged for that step and
+    # creation is retried on the next one, as before.
+    # NOTE(review): assumes the adapter is safe to reuse across calls.
+    adapter = None
+    annotated = []
+
+    for episode in episodes:
+        if not getattr(episode, "success", False):
+            annotated.append(episode)
+            continue
+
+        instruction = getattr(episode, "instruction", "")
+        steps = getattr(episode, "steps", [])
+
+        for step_idx, step in enumerate(steps):
+            # Skip if already annotated
+            if getattr(step, "reasoning", None):
+                continue
+
+            screenshot_path = getattr(
+                getattr(step, "observation", None),
+                "screenshot_path",
+                None,
+            )
+            action = getattr(step, "action", None)
+
+            # Nothing to explain without both the screen and the action
+            if not screenshot_path or not action:
+                continue
+
+            prompt = (
+                f"You are analyzing step {step_idx + 1} of {len(steps)} "
+                f"in a GUI automation task.\n\n"
+                f"Task instruction: {instruction}\n\n"
+                f"The action taken at this step was: {action}\n\n"
+                "Explain in 1-2 sentences WHY this action was taken. "
+                "Focus on what the agent sees on screen and how the "
+                "action moves toward completing the task. "
+                "Be concise and specific."
+            )
+
+            try:
+                if adapter is None:
+                    adapter = get_api_adapter(annotator_model)
+                reasoning = adapter.generate(
+                    {
+                        "images": [screenshot_path],
+                        "messages": [
+                            {"role": "user", "content": prompt},
+                        ],
+                    },
+                    max_new_tokens=150,
+                )
+                step.reasoning = reasoning.strip()
+                logger.debug(
+                    "Annotated step %d: %s",
+                    step_idx,
+                    step.reasoning[:80],
+                )
+            except Exception as e:
+                # Best-effort: one failed step must not abort the batch
+                logger.warning(
+                    "Failed to annotate step %d: %s", step_idx, e
+                )
+
+        annotated.append(episode)
+
+    logger.info(
+        "CoT annotation complete: %d episodes processed", len(annotated)
+    )
+    return annotated
+
+
+def build_cot_sft_samples(annotated_episodes: list[Any]) -> list[dict]:
+    """Convert CoT-annotated episodes to TRL SFT format.
+
+    Produces training samples where the assistant response includes a
+    <think> block before the action, teaching the model to reason
+    step-by-step during inference.
+
+    Only episodes with a truthy ``success`` attribute contribute
+    samples. Within those, a step yields a sample only when it has both
+    a screenshot path and an action. A step without reasoning still
+    yields a sample, but its assistant message contains the action only
+    (no <think> block).
+
+    Format:
+        User: <image>
+              Instruction: Open Notepad and type Hello
+              Previous actions: CLICK(x=0.05, y=0.95)
+
+        Assistant: <think>I see the Start menu is open. The task requires
+                   opening Notepad, so I need to search for it.</think>
+                   TYPE(text="notepad")
+
+    Args:
+        annotated_episodes: Episodes with reasoning fields on steps,
+            typically from generate_cot_annotations().
+
+    Returns:
+        List of SFT sample dicts compatible with trl_trainer.py:
+        {
+            "images": [path],
+            "messages": [system, user, assistant],
+        }
+    """
+    from openadapt_ml.datasets.next_action import (
+        SYSTEM_PROMPT,
+        format_action,
+    )
+
+    samples: list[dict] = []
+
+    for episode in annotated_episodes:
+        if not getattr(episode, "success", False):
+            continue
+
+        instruction = getattr(episode, "instruction", "")
+        steps = getattr(episode, "steps", [])
+
+        for step in steps:
+            screenshot_path = getattr(
+                getattr(step, "observation", None),
+                "screenshot_path",
+                None,
+            )
+            action = getattr(step, "action", None)
+            reasoning = getattr(step, "reasoning", None)
+
+            if not screenshot_path or not action:
+                continue
+
+            # Build action history: every step whose step_index is lower
+            # than this one's, in list order. Steps lacking an action are
+            # left out, mirroring the guard applied to the current step,
+            # so format_action() is never handed a missing/None action.
+            step_index = getattr(step, "step_index", 0)
+            prev_actions = []
+            for prev_step in steps:
+                prev_action = getattr(prev_step, "action", None)
+                if not prev_action:
+                    continue
+                if getattr(prev_step, "step_index", 0) < step_index:
+                    prev_actions.append(format_action(prev_action))
+
+            # Build user content
+            parts = [f"Instruction: {instruction}"]
+            if prev_actions:
+                parts.append(
+                    "Previous actions: "
+                    + " -> ".join(prev_actions)
+                )
+            user_content = "\n".join(parts)
+
+            # Build assistant content with CoT
+            action_text = format_action(action)
+            if reasoning:
+                assistant_content = f"<think>{reasoning}</think>\n{action_text}"
+            else:
+                assistant_content = action_text
+
+            sample = {
+                "images": [screenshot_path],
+                "messages": [
+                    {"role": "system", "content": SYSTEM_PROMPT},
+                    {"role": "user", "content": user_content},
+                    {"role": "assistant", "content": assistant_content},
+                ],
+            }
+            samples.append(sample)
+
+    logger.info("Built %d CoT SFT samples", len(samples))
+    return samples
@@ -0,0 +1,56 @@
+"""Reward functions for GRPO training.
+
+Provides binary task-success rewards and group-relative advantage
+computation following the GRPO algorithm (Shao et al., 2024).
+
+GRPO computes advantages relative to the group mean rather than using
+a learned value function, which is simpler and works well for sparse
+binary rewards (task success/failure).
+"""
+
+from __future__ import annotations
+
+
+def binary_task_success(score: float, threshold: float = 0.5) -> float:
+    """Collapse a raw evaluator score into a pass/fail reward.
+
+    Args:
+        score: Raw evaluator score (0.0-1.0) from WAA environment.
+        threshold: Minimum score that counts as success (inclusive).
+
+    Returns:
+        1.0 when score >= threshold, otherwise 0.0.
+    """
+    if score >= threshold:
+        return 1.0
+    return 0.0
+
+
+def compute_group_advantages(rewards: list[float]) -> list[float]:
+    """Normalize one group of rollout rewards into GRPO advantages.
+
+    Each reward is centered on the group mean and scaled by the group's
+    population standard deviation:
+        advantage[i] = (reward[i] - mean) / (std + eps)
+
+    When every reward in the group is (numerically) the same, the group
+    carries no learning signal, so all advantages are returned as zero
+    instead of dividing by a near-zero standard deviation.
+
+    Args:
+        rewards: One scalar reward per rollout in the group.
+
+    Returns:
+        Advantages in the same order and of the same length as rewards;
+        an empty list when rewards is empty.
+    """
+    group_size = len(rewards)
+    if group_size == 0:
+        return []
+
+    epsilon = 1e-8
+    group_mean = sum(rewards) / group_size
+    deviations = [reward - group_mean for reward in rewards]
+    spread = (sum(dev ** 2 for dev in deviations) / group_size) ** 0.5
+
+    # Identical outcomes across the group: nothing to prefer
+    if spread < epsilon:
+        return [0.0] * group_size
+
+    scale = spread + epsilon
+    return [dev / scale for dev in deviations]