Commit 4419b21

abrichr and claude authored
feat: add dual training backend support (standalone + verl-agent) (#51)
* feat: add dual training backend support (standalone + verl-agent)

  Add `backend` field to GRPOConfig ("standalone" or "verl") to support
  switching between training backends:
  - standalone: existing trainer.py (single-GPU, episode-level rewards)
  - verl: verl-agent/VAGEN integration (multi-GPU, GiGPO per-step credit)

  New verl_backend.py provides build_vagen_config() to map GRPOConfig to a
  VAGEN-compatible config, and train_with_verl() as the integration point
  (placeholder until full end-to-end is wired up).

  No existing function signatures or behavior modified.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* style: format verl_backend.py with ruff

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 99ae082 commit 4419b21

File tree

4 files changed: +167 -3 lines changed

openadapt_ml/training/grpo/__init__.py (27 additions, 3 deletions)

@@ -4,14 +4,21 @@
 Connects to openadapt-evals RLEnvironment for rollout collection and
 task evaluation against live Windows Agent Arena VMs.
 
+Supports two training backends (set via GRPOConfig.backend):
+- "standalone" (default): Built-in trainer using HuggingFace + PEFT.
+  Good for single-GPU prototyping and debugging. See trainer.py.
+- "verl": Integration with verl-agent/VAGEN for GiGPO and multi-GPU
+  distributed training. See verl_backend.py.
+
 Key components:
-- GRPOConfig: Training configuration dataclass
-- GRPOTrainer: Main training loop
+- GRPOConfig: Training configuration dataclass (includes backend field)
+- GRPOTrainer: Main training loop (standalone backend)
 - GRPORolloutCollector: Collects rollouts via RLEnvironment
 - reward functions: Binary task success + group-relative advantages
 - CoT warm-up: Chain-of-thought SFT before GRPO
+- verl_backend: verl-agent/VAGEN integration (verl backend)
 
-Example:
+Example (standalone):
     from openadapt_ml.training.grpo import GRPOConfig, GRPOTrainer
 
     config = GRPOConfig(
@@ -20,6 +27,17 @@
     )
     trainer = GRPOTrainer(config)
     trainer.train()
+
+Example (verl backend):
+    from openadapt_ml.training.grpo import GRPOConfig
+    from openadapt_ml.training.grpo.verl_backend import train_with_verl
+
+    config = GRPOConfig(
+        backend="verl",
+        task_ids=["notepad_1", "settings_1"],
+        num_training_steps=100,
+    )
+    train_with_verl(config)  # Prints instructions; raises NotImplementedError
 """
 
 from __future__ import annotations
@@ -44,6 +62,10 @@
     build_cot_sft_samples,
     generate_cot_annotations,
 )
+from openadapt_ml.training.grpo.verl_backend import (
+    build_vagen_config,
+    train_with_verl,
+)
 
 __all__ = [
     "GRPOConfig",
@@ -58,4 +80,6 @@
     "format_action_as_text",
     "build_cot_sft_samples",
     "generate_cot_annotations",
+    "build_vagen_config",
+    "train_with_verl",
 ]
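The docstring above describes selecting a backend via GRPOConfig.backend. A minimal sketch of how a caller might dispatch on that field (MiniConfig and select_backend are hypothetical stand-ins for illustration, not part of the commit):

```python
from dataclasses import dataclass, field


@dataclass
class MiniConfig:
    """Stripped-down stand-in for GRPOConfig (hypothetical)."""
    backend: str = "standalone"
    task_ids: list[str] = field(default_factory=list)


def select_backend(config: MiniConfig) -> str:
    """Dispatch on the backend field, mirroring how a caller might
    choose between GRPOTrainer and train_with_verl()."""
    if config.backend == "standalone":
        # Real code would run GRPOTrainer(config).train()
        return "GRPOTrainer"
    if config.backend == "verl":
        # Real code would call train_with_verl(config)
        return "train_with_verl"
    raise ValueError(f"Unknown backend: {config.backend!r}")


print(select_backend(MiniConfig(backend="verl", task_ids=["notepad_1"])))  # train_with_verl
```

Since no existing signatures changed in this commit, a dispatch like this can sit entirely in calling code.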

openadapt_ml/training/grpo/config.py (11 additions, 0 deletions)

@@ -2,6 +2,11 @@
 
 Follows the same pattern as TRLTrainingConfig in trl_trainer.py, with
 additional fields for GRPO-specific hyperparameters and environment setup.
+
+Supports two training backends:
+- "standalone" (default): Built-in GRPO trainer using HuggingFace + PEFT.
+- "verl": Integration point for verl-agent/VAGEN, which provides GiGPO
+  and multi-GPU support. See verl_backend.py for details.
 """
 
 from __future__ import annotations
@@ -16,6 +21,9 @@ class GRPOConfig:
     Groups model/LoRA defaults with TRLTrainingConfig for consistency.
 
     Attributes:
+        backend: Training backend to use. "standalone" for the built-in
+            HuggingFace + PEFT trainer, or "verl" for verl-agent/VAGEN
+            integration (requires separate installation).
         model_name: HuggingFace model identifier.
         load_in_4bit: Whether to use 4-bit quantization.
         lora_r: LoRA rank.
@@ -32,6 +40,9 @@ class GRPOConfig:
         stuck_window: Number of identical screenshots before early termination.
     """
 
+    # Backend: "standalone" (built-in HF+PEFT) or "verl" (verl-agent/VAGEN)
+    backend: str = "standalone"
+
     # Model
     model_name: str = "Qwen/Qwen2.5-VL-7B-Instruct"
     load_in_4bit: bool = True
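The new backend field is a plain str with no validation in this commit; a typo like "standalne" would only surface later. One way callers could guard against that is eager validation in a dataclass __post_init__ (a sketch; BackendConfig and VALID_BACKENDS are illustrative names, not part of the commit):

```python
from dataclasses import dataclass

# Assumption: the two values documented for GRPOConfig.backend
VALID_BACKENDS = ("standalone", "verl")


@dataclass
class BackendConfig:
    """Illustrative subset of GRPOConfig with eager backend validation."""
    backend: str = "standalone"

    def __post_init__(self) -> None:
        # Fail fast at construction time rather than deep in training setup.
        if self.backend not in VALID_BACKENDS:
            raise ValueError(
                f"backend must be one of {VALID_BACKENDS}, got {self.backend!r}"
            )
```

Dataclasses call __post_init__ automatically after field assignment, so misconfigured values are rejected before any model loading starts.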

openadapt_ml/training/grpo/trainer.py (4 additions, 0 deletions)

@@ -1,5 +1,9 @@
 """Minimal GRPO trainer bridging TRL/HuggingFace and openadapt-evals RLEnvironment.
 
+Note: This is the "standalone" backend. For the verl-agent backend (recommended
+for production training with GiGPO and multi-GPU support), see verl_backend.py
+or use the VAGEN training config in openadapt-evals/configs/train_waa_vagen.yaml.
+
 Uses REINFORCE with group-relative advantages (equivalent to single-epoch GRPO).
 The policy_gradient_loss function includes PPO-style clipping for future multi-epoch
 support, but with the current single-epoch design (old_logps == current_logps),
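The trainer docstring mentions group-relative advantages over binary task-success rewards. A minimal sketch of that baseline step (mean-only; GRPO implementations often also divide by the group's reward std, omitted here; the function name is illustrative):

```python
def group_relative_advantages(rewards: list[float]) -> list[float]:
    """Advantage of each rollout = its reward minus the group mean.

    With binary task-success rewards, successes in a mixed group get
    positive advantage and failures get negative advantage; a group
    that is all-success or all-failure yields zero advantage everywhere.
    """
    mean = sum(rewards) / len(rewards)
    return [r - mean for r in rewards]


print(group_relative_advantages([1.0, 0.0, 0.0, 1.0]))  # [0.5, -0.5, -0.5, 0.5]
```

This is why all-success and all-failure groups contribute no gradient signal under REINFORCE with this baseline: every advantage in the group is zero.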
openadapt_ml/training/grpo/verl_backend.py (new file, 125 additions)

"""verl-agent / VAGEN backend for GRPO training.

This module provides the integration point for training via verl-agent
(https://github.com/VAGEN), which offers:
- GiGPO (Generalized Group Relative Policy Optimization)
- Multi-GPU distributed training via veRL
- Desktop environment integration via WAADesktopEnv

The actual training loop is managed by verl-agent's own training script,
not by our GRPOTrainer. This module builds the VAGEN-compatible config
from our GRPOConfig and documents how to run training.

Usage:
    To train with the verl backend, set backend="verl" in GRPOConfig.
    The train_with_verl() function will print instructions and raise
    NotImplementedError until full integration is wired up.

    For now, training with verl-agent should be done via:
    1. Generate a VAGEN config: train_with_verl(config)
    2. Run verl-agent's training script with that config

See also:
    - openadapt-evals/configs/train_waa_vagen.yaml
    - docs/verl_agent_decision.md (if available)
"""

from __future__ import annotations

import logging
from typing import Any

from openadapt_ml.training.grpo.config import GRPOConfig

logger = logging.getLogger(__name__)

# Deferred import for openadapt-evals WAADesktopEnv (optional dependency)
try:
    from openadapt_evals.adapters.verl_env import WAADesktopEnv
except ImportError:
    WAADesktopEnv = None  # type: ignore[assignment, misc]


def build_vagen_config(config: GRPOConfig) -> dict[str, Any]:
    """Build a VAGEN-compatible config dict from GRPOConfig.

    Maps our config fields to the structure expected by verl-agent's
    training script. This dict can be serialized to YAML for use with
    VAGEN's CLI.

    Args:
        config: Our GRPO training configuration.

    Returns:
        Dict matching VAGEN's expected config structure.
    """
    return {
        "model": {
            "name": config.model_name,
            "load_in_4bit": config.load_in_4bit,
            "lora_r": config.lora_r,
            "lora_alpha": config.lora_alpha,
        },
        "training": {
            "learning_rate": config.learning_rate,
            "num_training_steps": config.num_training_steps,
            "save_every_steps": config.save_every_steps,
            "output_dir": config.output_dir,
            "num_rollouts_per_step": config.num_rollouts_per_step,
            "temperature": config.temperature,
        },
        "environment": {
            "type": "waa_desktop",
            "server_url": config.server_url,
            "task_ids": config.task_ids,
            "max_steps_per_episode": config.max_steps_per_episode,
            "screen_size": list(config.screen_size),
            "stuck_window": config.stuck_window,
        },
    }


def train_with_verl(config: GRPOConfig) -> None:
    """Entry point for verl-agent backend training.

    Currently a placeholder that documents the integration point.
    The actual training happens via verl-agent's own CLI/training script,
    not through this function.

    Args:
        config: GRPO training configuration with backend="verl".

    Raises:
        NotImplementedError: Always, until full verl-agent integration
            is wired up. The error message includes instructions for
            running training via verl-agent directly.
    """
    vagen_config = build_vagen_config(config)

    if WAADesktopEnv is not None:
        logger.info(
            "WAADesktopEnv is available. verl-agent can use it for "
            "desktop environment interaction."
        )
    else:
        logger.warning(
            "WAADesktopEnv not found. Install openadapt-evals to enable "
            "desktop environment support: uv add openadapt-evals"
        )

    logger.info("VAGEN config built from GRPOConfig:")
    logger.info("  Model: %s", vagen_config["model"]["name"])
    logger.info("  Tasks: %s", vagen_config["environment"]["task_ids"])
    logger.info("  Steps: %d", vagen_config["training"]["num_training_steps"])
    logger.info("")
    logger.info(
        "To train with verl-agent, use the VAGEN training script with "
        "a config derived from the above. Example:"
    )
    logger.info("  python -m vagen.train --config configs/train_waa_vagen.yaml")

    raise NotImplementedError(
        "verl-agent training requires running via VAGEN's training script. "
        "See docs/verl_agent_decision.md for setup instructions. "
        "Use build_vagen_config() to generate a compatible config dict."
    )
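build_vagen_config() returns a nested dict of plain JSON/YAML-serializable values, which is what makes the "serialize to YAML for VAGEN's CLI" step in its docstring straightforward. A sketch of that round trip using only the stdlib (the dict below is hand-written with illustrative values, not real output; yaml.safe_dump would be the usual choice when PyYAML is installed):

```python
import json

# Hypothetical shape of a build_vagen_config() result; values are
# illustrative defaults, not taken from a real GRPOConfig instance.
vagen_config = {
    "model": {
        "name": "Qwen/Qwen2.5-VL-7B-Instruct",
        "load_in_4bit": True,
        "lora_r": 16,
        "lora_alpha": 32,
    },
    "training": {"learning_rate": 1e-5, "num_training_steps": 100},
    "environment": {"type": "waa_desktop", "task_ids": ["notepad_1"]},
}

# Because every leaf is a str/bool/int/float/list, the dict survives a
# full serialize/deserialize round trip unchanged.
serialized = json.dumps(vagen_config, indent=2)
roundtrip = json.loads(serialized)
assert roundtrip == vagen_config
print(roundtrip["environment"]["type"])  # waa_desktop
```

Writing the serialized form to a file would give verl-agent's training script a config to consume, per the instructions train_with_verl() logs.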
