From 23e6eeb22a0d4ab48d757ff596051851200a95b4 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Mon, 16 Mar 2026 22:08:42 -0400 Subject: [PATCH 1/4] feat: add GiGPO anchor state computation to WAADesktopEnv Add compute_anchor_state() function that produces a state key for GiGPO cross-rollout grouping. Uses a11y tree SHA256 hash (primary) with screenshot MD5 fallback. The state_key is included in the info dict from both reset() and step() so VAGEN/verl can use it for O(1) anchor grouping instead of recomputing perceptual hashes. Co-Authored-By: Claude Opus 4.6 (1M context) --- openadapt_evals/adapters/verl_env.py | 44 ++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/openadapt_evals/adapters/verl_env.py b/openadapt_evals/adapters/verl_env.py index d6ae31c..796c3f7 100644 --- a/openadapt_evals/adapters/verl_env.py +++ b/openadapt_evals/adapters/verl_env.py @@ -50,6 +50,7 @@ from __future__ import annotations import asyncio +import hashlib import io import logging import re @@ -168,6 +169,45 @@ def _build_obs_dict( return result +# --- Anchor state computation for GiGPO --- + + +def compute_anchor_state(obs: BenchmarkObservation) -> str: + """Compute a state key for GiGPO anchor grouping. + + GiGPO groups identical intermediate states across rollouts to assign + per-action advantages. This function produces a hash that identifies + "the same state" so the training framework can efficiently find groups. + + Strategy: + 1. Accessibility tree hash (primary): Structural identity of UI state. + Two screenshots showing the same dialog with the same elements + will hash identically even if pixel rendering differs slightly. + 2. Screenshot MD5 (fallback): Exact pixel match when a11y tree is + unavailable. + + The returned key is included in the ``info`` dict from ``WAADesktopEnv.step()`` + as ``state_key``. VAGEN/verl can use this for O(1) grouping instead of + recomputing perceptual hashes across all rollout steps. 
+ + Args: + obs: A BenchmarkObservation with screenshot and/or accessibility_tree. + + Returns: + Hex digest string suitable as a dict key. + """ + # Primary: hash the a11y tree (structural state identity) + if obs.accessibility_tree: + tree_str = str(obs.accessibility_tree) + return hashlib.sha256(tree_str.encode()).hexdigest()[:16] + + # Fallback: hash raw screenshot bytes (exact pixel match) + if obs.screenshot: + return hashlib.md5(obs.screenshot).hexdigest()[:16] + + return "empty" + + # --- System prompt (matches openadapt-ml trainer) --- SYSTEM_PROMPT = ( @@ -306,6 +346,7 @@ async def reset(self, seed: int) -> tuple[dict[str, Any], dict[str, Any]]: info: dict[str, Any] = { "task_id": self._task_id, "screen_size": env.screen_size, + "state_key": compute_anchor_state(obs), } return obs_dict, info @@ -371,6 +412,9 @@ async def step( prefix=f"After action (step {self._step_count}):", ) + # GiGPO anchor: include state key for efficient cross-rollout grouping + info["state_key"] = compute_anchor_state(rollout_step.observation) + return obs_dict, reward, done, info From d173c559db73ac667d51a5cf04d55adf0e182df4 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Tue, 17 Mar 2026 12:04:55 -0400 Subject: [PATCH 2/4] docs: clarify VAGEN vs verl-agent distinction in decision doc Add dated addendum (2026-03-16) correcting the earlier conflation of VAGEN and verl-agent as a single project. Key findings: VAGEN-Lite dropped Bi-Level GAE (only vanilla GRPO/PPO), GiGPO lives exclusively in verl-agent which uses its own env_base.py interface (not GymImageEnv), and our train_verl_e2e.py targets the wrong entry point. Outlines a corrected two-phase path: standalone GRPO first, then direct verl-agent integration if per-step credit is needed. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/verl_agent_decision.md | 106 ++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) diff --git a/docs/verl_agent_decision.md b/docs/verl_agent_decision.md index be2222b..c7bc129 100644 --- a/docs/verl_agent_decision.md +++ b/docs/verl_agent_decision.md @@ -358,6 +358,112 @@ To validate the verl-agent integration provides real value over standalone: --- +## Addendum: VAGEN vs verl-agent Clarification (2026-03-16) + +**Status**: Corrects earlier conflation of VAGEN and verl-agent as a single project. + +### Finding: VAGEN and verl-agent Are Separate Projects + +Earlier sections of this document (and our training scripts) treated "verl-agent/VAGEN" +as a single system. Research in March 2026 revealed they are **separate projects** +with different interfaces, capabilities, and trajectories: + +| Aspect | VAGEN (mll-lab-nu/VAGEN) | verl-agent (langfengQ/verl-agent) | +|--------|--------------------------|-----------------------------------| +| **Focus** | Environment framework | Training framework | +| **Key abstraction** | `GymImageEnv` protocol | `env_base.py` (own env interface) | +| **Algorithms** | GRPO, PPO only (on main/Lite) | GiGPO, GRPO, PPO, RLOO, DAPO, GSPO, REINFORCE++ | +| **Credit assignment** | Bi-Level GAE (legacy branch only) | GiGPO (active, on main) | +| **Entry point** | `vagen.main_ppo` | Own entry point + Hydra config system | +| **Current state** | Migrated to "VAGEN-Lite" (Feb 2026) | Actively maintained with GiGPO | + +**VAGEN-Lite** (the current main branch) dropped Bi-Level GAE and only supports +standard GRPO/PPO. The original features survive only on a legacy branch. This means +VAGEN-Lite provides no advantage over our standalone trainer for per-step credit +assignment. + +**verl-agent** is the project that actually implements GiGPO. It uses its **own** +`env_base.py` environment interface, NOT `GymImageEnv`. 
Our `train_verl_e2e.py` +script targets `vagen.main_ppo`, which is the wrong entry point for GiGPO training. + +### Impact on Our Integration + +1. **`train_verl_e2e.py` targets the wrong entry point** — It calls `vagen.main_ppo` + but GiGPO lives in verl-agent, which has a different entry point and Hydra config + system. This script will need retargeting. + +2. **`configs/train_waa_vagen.yaml` needs updating** — The YAML is structured for + VAGEN's config system, not verl-agent's. + +3. **VAGEN-Lite is not useful for us** — It only provides vanilla GRPO/PPO, which + our standalone trainer (`openadapt_ml/training/grpo/trainer.py`) already does. + There is no advantage to adding the VAGEN dependency for equivalent functionality. + +4. **GymImageEnv vendoring remains correct** — Our vendored `GymImageEnv` in + `openadapt_evals/adapters/_vendored/` is a clean, stable environment interface + regardless of which training framework consumes it. It stays. + +### Corrected Path Forward + +- **Phase 1 (now)**: Use the standalone GRPO trainer in openadapt-ml for initial + validation. It works today, is well-tested (56 unit + 5 E2E tests), and has no + external framework dependencies. + +- **Phase 2 (if GiGPO per-step credit is needed)**: Integrate with **verl-agent + directly** (not VAGEN-Lite). This requires writing a thin adapter from + `WAADesktopEnv` to verl-agent's `env_base.py` interface. The `compute_anchor_state()` + function (a11y-tree hashing, with a screenshot-hash fallback, for state grouping) is directly applicable to GiGPO's + state-matching requirement. + +- **Keep vendored GymImageEnv** as our stable environment interface. It decouples + environment definition from training framework choice. + +### What Remains Valid From the Original Decision + +The core reasoning in this document still holds: + +- **GiGPO is still the right algorithm** for per-step credit assignment in 15+ step + desktop automation tasks. 
The algorithm itself is sound; only the project that + implements it was misidentified. + +- **WAADesktopEnv adapter is solid and reusable** — ~250 lines of well-tested glue + code that translates between our `RLEnvironment` and any Gym-like training interface. + +- **`compute_anchor_state()` is directly applicable** to GiGPO's state grouping + mechanism. Observations with the same accessibility tree (or, as a fallback, byte-identical screenshots) hash to the same anchor, enabling + cross-rollout step-level advantage computation. + +- **TRL still cannot do multi-turn VLM RL** — Issues + [#5119](https://github.com/huggingface/trl/issues/5119) and + [#5120](https://github.com/huggingface/trl/issues/5120) remain OPEN as of + March 2026. + +- **"The environment is the moat, not the training math"** — This principle is + reinforced by this correction. Training frameworks come and go (VAGEN already + pivoted to Lite); our WAA RL environment is the durable asset. + +### Updated Architecture Diagram + +``` +Phase 1 (Current) Phase 2 (If GiGPO needed) +┌─────────────────────┐ ┌─────────────────────┐ +│ Standalone GRPO │ │ verl-agent │ +│ (openadapt-ml) │ │ GiGPO trainer │ +│ ↓ │ │ ↓ │ +│ trainer.py │ │ env_base.py adapter │ +│ ↓ │ │ (NEW, ~100 lines) │ +│ WAADesktopEnv │ │ ↓ │ +│ (GymImageEnv) │ │ WAADesktopEnv │ +│ ↓ │ │ (GymImageEnv) │ +│ RLEnvironment │ │ ↓ │ +│ ↓ │ │ RLEnvironment │ +│ WAA Flask Server │ │ ↓ │ +└─────────────────────┘ │ WAA Flask Server │ + └─────────────────────┘ +``` + +--- + ## References - [VAGEN](https://github.com/RAGEN-AI/VAGEN) — Multi-turn VLM agent training (GiGPO) From f161746e92379ab6dffbf7ea3b78acf5d1a72ef5 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Tue, 17 Mar 2026 12:17:55 -0400 Subject: [PATCH 3/4] docs: add comprehensive GRPO training research report Covers desktop RL landscape (30+ projects), per-step credit assignment alternatives (HCAPO recommended over GiGPO), scaling architectures (ComputerRL, DART-GUI), and synthetic environment feasibility (GUI-Genesis). 
Includes revised architecture recommendation: standalone GRPO + HCAPO first, then dense rewards + API-GUI hybrid, then async scaling. Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/grpo_training_research_2026_03_17.md | 253 ++++++++++++++++++++++ 1 file changed, 253 insertions(+) create mode 100644 docs/grpo_training_research_2026_03_17.md diff --git a/docs/grpo_training_research_2026_03_17.md b/docs/grpo_training_research_2026_03_17.md new file mode 100644 index 0000000..ac57d15 --- /dev/null +++ b/docs/grpo_training_research_2026_03_17.md @@ -0,0 +1,253 @@ +# GRPO Training Infrastructure: Comprehensive Research Report + +**Date**: 2026-03-17 +**Scope**: Desktop RL training landscape, per-step credit assignment alternatives, scaling architectures, synthetic environment feasibility + +--- + +## 1. Executive Summary + +We evaluated 30+ open-source projects for desktop GUI RL training. Key findings: + +1. **HCAPO is the recommended per-step credit method** — ~80-120 lines to add to our standalone GRPO trainer, no extra model, 8.3% compute overhead, no anchor states needed. Beats GiGPO on ALFWorld (91.4% vs 90.8%) but trails it slightly on WebShop (73.8% vs 75.2%); see Section 3.1. +2. **VAGEN and verl-agent are separate projects** — our `train_verl_e2e.py` targets the wrong entry point. GiGPO lives in verl-agent, not VAGEN-Lite. +3. **ComputerRL's API-GUI hybrid** and **DART-GUI's entropy filtering** are the highest-impact ideas to incorporate, without adopting either framework. +4. **GUI-Genesis is not applicable** to Windows desktop tasks (mobile web only), but the dense reward concept is valuable. +5. **Our WAADesktopEnv is the only OSS project wrapping WAA for RL training.** + +--- + +## 2. VAGEN vs verl-agent Distinction + +**Critical finding**: Our decision doc conflated two separate projects. 
+ +| | VAGEN (mll-lab-nu/VAGEN) | verl-agent (langfengQ/verl-agent) | +|---|---|---| +| **Purpose** | Environment framework | Training framework | +| **Credit assignment** | Dropped Bi-Level GAE in Lite | **GiGPO** (what we want) | +| **Env interface** | `GymImageEnv` (what we built) | Own `env_base.py` (different) | +| **Desktop GUI env** | No | No | +| **Entry point** | `vagen.main_ppo` | Own training scripts | +| **Stars** | 431 | 1,700 | + +**Impact**: Our `train_verl_e2e.py` targets `vagen.main_ppo` — wrong entry point for GiGPO. VAGEN-Lite only provides vanilla GRPO/PPO, which our standalone trainer already does. + +**Corrected path**: +- Phase 1: Standalone GRPO trainer (shipped in PR #55) +- Phase 2: If per-step credit needed, consider HCAPO (standalone) before verl-agent (complex) + +--- + +## 3. Per-Step Credit Assignment Methods + +### 3.1 Comparison Table + +| Criterion | GiGPO | **HCAPO** | iStar | HiPER | +|-----------|-------|-----------|-------|-------| +| **Can add to standalone trainer?** | No (needs verl-agent) | **YES** | Technically, but 2x memory | No (needs verl-agent) | +| **Lines of code** | ~250 adapter + verl stack | **~80-120** | ~150-200 + 2nd model | 500+ + critic | +| **Extra model in memory?** | No | **No** | YES (full policy copy) | YES (critic heads) | +| **Extra compute per step** | Anchor hashing | **8.3% (1 fwd pass/traj)** | ~100% (PRM update) | ~50% (critic + GAE) | +| **Anchor state grouping?** | YES (core limitation) | **No** | No | No | +| **Works with VLMs?** | Yes | Untested (likely yes) | Yes (tested VL-7B) | Untested | +| **Works with binary rewards?** | Yes | **Yes** | Yes | Yes | +| **ALFWorld** | 90.8% | **91.4% (96.9% w/ smoothing)** | N/A | 97.4% | +| **WebShop** | 75.2% | 73.8% | **86.5%** | 83.3% | +| **Public code?** | Yes (verl-agent) | No | No | Yes (verl-agent fork) | + +### 3.2 HCAPO: Recommended Approach + +**How it works**: Uses the policy LLM itself as a post-hoc critic. 
After collecting a rollout, it scores each intermediate action conditioned on the final state (hindsight probability). This produces an importance weight for credit assignment. + +**Key advantages over GiGPO for desktop tasks**: +- **No anchor state problem**: GiGPO needs identical intermediate states across rollouts. Real desktop screenshots are almost never pixel-identical (mouse position, timing, anti-aliasing). GiGPO's step-level signal would likely be near-zero for WAA. +- **No extra model**: Uses the same policy LLM — one extra non-autoregressive forward pass per trajectory. +- **Do-no-harm mask**: Never penalizes actions in successful trajectories. +- **Temporal smoothing**: Distributes credit to preparatory actions. + +**Implementation plan** (~80-120 lines in openadapt-ml): +1. `compute_hindsight_probabilities(model, processor, rollout, final_screenshot)` — ~40 lines +2. `compute_hcapo_advantages(rollouts, hindsight_probs)` — ~30 lines +3. Modifications to `_training_step()` — ~10 lines +4. Config fields: `hcapo_enabled`, `hcapo_omega`, `hcapo_clip_min/max`, `hcapo_temp`, `hcapo_smoothing_alpha` + +**Hyperparameters** (robust across benchmarks per authors): omega=1.0, C_min=0.8, C_max=1.2, T_temp=5.0, alpha=0.5. + +### 3.3 iStar: Second Choice + +Best absolute results (86.5% WebShop) and only method tested with VLMs on visual tasks. But requires **2x GPU memory** (separate PRM model), which is a dealbreaker on A10G (24GB). Only viable on larger GPUs (L40S 48GB+). + +### 3.4 GiGPO: Deprioritized + +The anchor state matching problem is fundamental for desktop GUI. Screenshots vary in mouse position, animation frames, and rendering artifacts even when the UI state is semantically identical. Our `compute_anchor_state()` (a11y tree hash) partially addresses this, but a11y trees are unreliable on WAA (UIA backend can't always find window elements during transitions). + +--- + +## 4. 
Scaling Architectures: ComputerRL and DART-GUI + +### 4.1 ComputerRL (Zhipu AI) + +**Score**: 48.9% OSWorld (open-source SOTA) +**Architecture**: gRPC-based distributed cluster, 1000+ concurrent qemu-in-docker Ubuntu VMs +**License**: Apache 2.0 + +**Most valuable ideas**: + +1. **API-GUI Hybrid Paradigm** — Agent can issue either GUI actions OR programmatic API calls. 103 auto-constructed APIs across LibreOffice Calc (27), Impress (22), Writer (19), Chrome (11), VS Code (12), VLC (12). Yielded **134% improvement** over GUI-only and 3-3.6x faster execution. **Directly applicable to WAA** via `/execute_windows` endpoint with `win32com`/PowerShell commands. + +2. **Entropulse** — Alternating RL and SFT phases to prevent entropy collapse: + - RL Phase 1 (180 steps): 31.9% → 42.0% + - SFT on successful rollouts: entropy restored + - RL Phase 2 (180 steps): → 45.8% + - Simple to implement (~100 lines wrapper around training loop) + +3. **OfficeWorld Benchmark** — 180 tasks across LibreOffice Calc/Writer/Impress, directly relevant to our Core4 tasks. + +**Not recommended to adopt as platform**: Tightly coupled to OSWorld/Ubuntu and GLM-4 models. Only 8 git commits, sparse docs. + +### 4.2 DART-GUI + +**Score**: 42.13% OSWorld +**Architecture**: Fully async 4-module design (env cluster, rollout service, data manager, trainer) +**License**: Apache 2.0 + +**Most valuable ideas**: + +1. **Step-wise GRPO with Entropy Filtering** — Train only on top 80% of steps by token entropy. Skips steps where the model is already confident; focuses compute on decision points. Complements (not replaces) per-step credit methods like HCAPO. **Effort: 1-2 weeks.** + +2. **Adaptive Data Curation** (4 levels): + - Task: Experience pool for hard tasks (pre-collect successful trajectories) + - Trajectory: Dynamic rollout count (fewer for easy tasks) + - Step: High-entropy filtering (see above) + - Token: Truncated importance sampling (for async) + +3. 
**Throughput gains**: Environment utilization 12.2% → 67.7% (5.5x) via rollout-wise scheduling. Even with single VM, avoid batch synchronization barriers. + +4. **Async architecture pattern** — Critical when scaling to 10+ VMs. Not needed for single-VM validation. + +**Not recommended to adopt as platform**: Tightly coupled to OSWorld/Ubuntu, UI-TARS model, Kubernetes, MySQL. Chinese Docker registry (Aliyun). + +### 4.3 Integration Roadmap + +| Priority | Idea | Source | Effort | Impact | +|----------|------|--------|--------|--------| +| **P0** | HCAPO per-step credit | Paper | 1-2 weeks | Per-step signal without verl-agent | +| **P0** | Entropy-based step filtering | DART-GUI | 1-2 weeks | Focus compute on decision points | +| **P0** | Entropulse (RL/SFT alternation) | ComputerRL | 1 week | Prevent entropy collapse | +| **P1** | API-GUI hybrid actions | ComputerRL | 2-3 weeks | 134% improvement potential on Office tasks | +| **P1** | Experience pool for hard tasks | DART-GUI | 1 week | Supplement sparse success signal | +| **P1** | Dense partial-credit rewards | GUI-Genesis concept | 1-2 weeks | Replace binary with continuous rewards | +| **P2** | Async rollout architecture | DART-GUI | 4-6 weeks | Critical for 10+ VM scaling | +| **P2** | Auto-API construction | ComputerRL | 2 weeks/app | Generalize API-GUI to new apps | + +--- + +## 5. GUI-Genesis: Synthetic Environments + +**Paper**: arXiv:2602.14093 (Feb 2026) +**Core idea**: Auto-synthesize Flask web apps that mirror real tasks, with code-native verifiable rewards. + +### Assessment for OpenAdapt: **Not applicable in current form.** + +- Generates mobile web apps (375x812), not desktop environments +- No desktop benchmark results; transfer gap to Windows is unknown and likely large +- Depends on Kimi k2 (proprietary LLM), no code released +- Average trajectory length ~5.63 steps (ours: 15-30) + +### What IS Useful + +1. 
**Dense code-native rewards** — Instead of binary 0/1 from WAA `/evaluate`, define programmatic assertions that check intermediate state: + - LibreOffice Calc: check cell values via UNO API + - Writer: check font properties via document inspection + - VS Code: check settings.json values + - This gives continuous [0,1] rewards, enabling better credit assignment + +2. **PC Agent-E** (arXiv:2505.13909) is more relevant — takes 312 real Windows trajectories, uses Claude to synthesize 9 alternative actions per step, creating trajectory trees. 141% relative improvement on WAA-V2. Closer to our use case. + +--- + +## 6. Competitive Landscape + +### Desktop RL Training Systems + +| Project | Platform | RL Method | Best Score | Key Innovation | +|---------|----------|-----------|------------|----------------| +| **ComputerRL** | Ubuntu/OSWorld | GRPO + Entropulse | 48.9% OSWorld | API-GUI hybrid, 1000+ VMs | +| **UI-TARS-2** | Win+Linux+Android | Multi-turn RLVR | **50.6% WAA** | Multi-platform (not fully open) | +| **DART-GUI** | Ubuntu/OSWorld | Step-wise GRPO | 42.1% OSWorld | Async 4-module, 1.9x throughput | +| **ARPO** | Ubuntu/OSWorld | GRPO + replay | 29.9% OSWorld | Experience replay buffer | +| **ZeroGUI** | Ubuntu+Android | Online RL | +14% over base | Auto task gen + reward estimation | +| **GUI-Genesis** | Synthetic Flask | GRPO | N/A (mobile) | Synthetic envs, code-native rewards | +| **OpenAdapt** | **Windows/WAA** | **GRPO (+ HCAPO planned)** | Not yet | **Only OSS WAA RL training** | + +### Key RL Training Frameworks + +| Framework | Stars | GiGPO? | Multi-turn VLM? | Desktop Env? | +|-----------|-------|--------|-----------------|--------------| +| verl | 20k | No | Yes | No | +| Agent Lightning | 15.5k | No | Yes | No | +| OpenRLHF | 9.2k | No | Yes (OpenRLHF-M) | No | +| verl-agent | 1.7k | **Yes** | Yes | No | +| Agent-R1 | 1.3k | No | Yes | No | +| AgentGym-RL | 639 | No | Yes | No | +| VAGEN | 431 | No (dropped) | Yes | No | + +--- + +## 7. 
Revised Architecture Recommendation + +### Phase 1: Standalone GRPO + HCAPO (Current → 2 weeks) +- Standalone GRPO trainer validates RL training works with WAA at all (PR #55 shipped) +- Add HCAPO per-step credit (~80-120 lines, no dependencies) +- Add entropy-based step filtering (~50 lines) +- Add Entropulse wrapper (~100 lines) +- **Goal**: First successful RL training run on Core4 tasks + +### Phase 2: Dense Rewards + API-GUI (2-4 weeks) +- Enhance WAA `/evaluate` for dense partial-credit rewards +- Prototype API-GUI hybrid actions for LibreOffice via `win32com` +- Test PC Agent-E trajectory augmentation on our 7 WAA tasks +- **Goal**: Improve sample efficiency enough for practical training + +### Phase 3: Scaling (4-8 weeks, when needed) +- Adopt DART-GUI async architecture pattern (reimplemented against our pool infra) +- Scale to 10+ parallel WAA VMs +- Only consider verl-agent if HCAPO proves insufficient +- **Goal**: Production-grade training at scale + +### What to Drop +- **VAGEN-Lite as training backend** — vanilla GRPO only, no advantage over standalone +- **GiGPO anchor state approach** — unreliable for pixel-based desktop screenshots +- **GUI-Genesis integration** — mobile web only, no code, proprietary LLM dependency +- **HiPER** — requires verl-agent anyway, adds hierarchical complexity + +--- + +## 8. 
Sources + +### Papers +- ComputerRL: [arxiv.org/abs/2508.14040](https://arxiv.org/abs/2508.14040) +- DART-GUI: [arxiv.org/abs/2509.23866](https://arxiv.org/abs/2509.23866) +- GUI-Genesis: [arxiv.org/abs/2602.14093](https://arxiv.org/abs/2602.14093) +- HCAPO: [arxiv.org/abs/2603.08754](https://arxiv.org/abs/2603.08754) +- iStar: [arxiv.org/abs/2509.19199](https://arxiv.org/abs/2509.19199) +- GiGPO: [arxiv.org/abs/2505.10978](https://arxiv.org/abs/2505.10978) +- HiPER: [arxiv.org/abs/2602.16165](https://arxiv.org/abs/2602.16165) +- VAGEN: [arxiv.org/abs/2510.16907](https://arxiv.org/abs/2510.16907) +- PC Agent-E: [arxiv.org/abs/2505.13909](https://arxiv.org/abs/2505.13909) +- UI-TARS-2: [arxiv.org/abs/2509.02544](https://arxiv.org/abs/2509.02544) +- DigiRL: [arxiv.org/abs/2406.11896](https://arxiv.org/abs/2406.11896) + +### Repositories +- ComputerRL: [github.com/thudm/ComputerRL](https://github.com/thudm/ComputerRL) +- DART-GUI: [github.com/Computer-use-agents/dart-gui](https://github.com/Computer-use-agents/dart-gui) +- ARPO: [github.com/dvlab-research/ARPO](https://github.com/dvlab-research/ARPO) +- ZeroGUI: [github.com/OpenGVLab/ZeroGUI](https://github.com/OpenGVLab/ZeroGUI) +- verl: [github.com/verl-project/verl](https://github.com/verl-project/verl) +- verl-agent: [github.com/langfengQ/verl-agent](https://github.com/langfengQ/verl-agent) +- VAGEN: [github.com/mll-lab-nu/VAGEN](https://github.com/mll-lab-nu/VAGEN) +- Agent Lightning: [github.com/microsoft/agent-lightning](https://github.com/microsoft/agent-lightning) +- BrowserGym: [github.com/ServiceNow/BrowserGym](https://github.com/ServiceNow/BrowserGym) +- OSWorld: [github.com/xlang-ai/OSWorld](https://github.com/xlang-ai/OSWorld) +- PC Agent-E: [github.com/GAIR-NLP/PC-Agent-E](https://github.com/GAIR-NLP/PC-Agent-E) +- HiPER: [github.com/JonP07/HiPER-agent](https://github.com/JonP07/HiPER-agent) From 659e3c269bb4c46d541463bdcb9bdcabce12046e Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Tue, 17 Mar 
2026 12:32:23 -0400 Subject: [PATCH 4/4] =?UTF-8?q?docs:=20correct=20prioritization=20?= =?UTF-8?q?=E2=80=94=20validate=20GRPO=20before=20optimizing=20training=20?= =?UTF-8?q?math?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HCAPO and per-step credit are Phase 3 optimizations, not Phase 1. The bottleneck is rollout success rate (getting non-zero rewards), not loss computation. Dense partial-credit rewards and API-GUI hybrid actions directly increase gradient signal and should come first. Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/grpo_training_research_2026_03_17.md | 88 ++++++++++++++++------- 1 file changed, 63 insertions(+), 25 deletions(-) diff --git a/docs/grpo_training_research_2026_03_17.md b/docs/grpo_training_research_2026_03_17.md index ac57d15..9997967 100644 --- a/docs/grpo_training_research_2026_03_17.md +++ b/docs/grpo_training_research_2026_03_17.md @@ -129,16 +129,17 @@ The anchor state matching problem is fundamental for desktop GUI. 
Screenshots va ### 4.3 Integration Roadmap -| Priority | Idea | Source | Effort | Impact | -|----------|------|--------|--------|--------| -| **P0** | HCAPO per-step credit | Paper | 1-2 weeks | Per-step signal without verl-agent | -| **P0** | Entropy-based step filtering | DART-GUI | 1-2 weeks | Focus compute on decision points | -| **P0** | Entropulse (RL/SFT alternation) | ComputerRL | 1 week | Prevent entropy collapse | -| **P1** | API-GUI hybrid actions | ComputerRL | 2-3 weeks | 134% improvement potential on Office tasks | -| **P1** | Experience pool for hard tasks | DART-GUI | 1 week | Supplement sparse success signal | -| **P1** | Dense partial-credit rewards | GUI-Genesis concept | 1-2 weeks | Replace binary with continuous rewards | -| **P2** | Async rollout architecture | DART-GUI | 4-6 weeks | Critical for 10+ VM scaling | -| **P2** | Auto-API construction | ComputerRL | 2 weeks/app | Generalize API-GUI to new apps | +| Priority | Idea | Source | Effort | Impact | Prerequisite | +|----------|------|--------|--------|--------|--------------| +| **P0** | Validate GRPO end-to-end on WAA | PR #55 | 1-2 weeks | Proves the pipeline works at all | None | +| **P0** | Dense partial-credit rewards | GUI-Genesis concept | 1-2 weeks | Turns binary 0/1 into continuous 0-1 | GRPO working | +| **P1** | API-GUI hybrid actions | ComputerRL | 2-3 weeks | 134% improvement, more rollouts succeed | GRPO working | +| **P1** | Experience pool for hard tasks | DART-GUI | 1 week | Supplement sparse success signal | GRPO working | +| **P2** | HCAPO per-step credit | Paper | 1-2 weeks | 7-14% over GRPO (untested on VLMs) | >20% rollout success rate | +| **P2** | Entropy-based step filtering | DART-GUI | 1-2 weeks | Focus compute on decision points | Multiple training steps | +| **P2** | Entropulse (RL/SFT alternation) | ComputerRL | 1 week | Prevent entropy collapse | Extended training runs | +| **P3** | Async rollout architecture | DART-GUI | 4-6 weeks | Critical for 10+ VM 
scaling | Multi-VM pool | +| **P3** | Auto-API construction | ComputerRL | 2 weeks/app | Generalize API-GUI to new apps | API-GUI validated | --- @@ -196,31 +197,68 @@ The anchor state matching problem is fundamental for desktop GUI. Screenshots va ## 7. Revised Architecture Recommendation -### Phase 1: Standalone GRPO + HCAPO (Current → 2 weeks) -- Standalone GRPO trainer validates RL training works with WAA at all (PR #55 shipped) -- Add HCAPO per-step credit (~80-120 lines, no dependencies) -- Add entropy-based step filtering (~50 lines) -- Add Entropulse wrapper (~100 lines) -- **Goal**: First successful RL training run on Core4 tasks - -### Phase 2: Dense Rewards + API-GUI (2-4 weeks) -- Enhance WAA `/evaluate` for dense partial-credit rewards -- Prototype API-GUI hybrid actions for LibreOffice via `win32com` -- Test PC Agent-E trajectory augmentation on our 7 WAA tasks -- **Goal**: Improve sample efficiency enough for practical training - -### Phase 3: Scaling (4-8 weeks, when needed) +> **Principle**: Don't optimize the training math before validating that training works at all. +> The bottleneck is rollout collection (2-10 min/episode on real VMs), not loss computation +> (seconds). Per-step credit only matters when there are successful rollouts to learn from. + +### Phase 1: Get GRPO Working End-to-End (Current → 1-2 weeks) +- Run `validate_grpo_waa.py` phases 1-5 against real WAA VM (PR #55) +- Get non-zero rewards: use easiest tasks, pre-trained model, short episodes (max_steps=5) +- Fix whatever breaks (infra failures, OOM, reward always 0) +- **Goal**: At least one training step with non-zero loss on a real WAA task +- **Success criteria**: Checkpoint saved, loss non-zero, at least one rollout with reward > 0 + +### Phase 2: Make Rewards Less Sparse (1-3 weeks) +- **Dense partial-credit rewards** — enhance WAA `/evaluate` to return continuous [0,1] scores + via programmatic state checks (cell values, font properties, settings.json). 
This directly + helps GRPO by giving more gradient signal — turns "all 0 vs all 0" groups into meaningful + advantage estimates. **This is the single highest-impact change.** +- **API-GUI hybrid actions** — let the agent issue programmatic `win32com`/PowerShell commands + alongside GUI clicks (ComputerRL's 134% improvement). Reduces task difficulty, meaning more + rollouts succeed, meaning GRPO has actual gradient signal to work with. +- **Goal**: >20% of rollouts achieving non-zero reward on Core4 tasks + +### Phase 3: Improve Training Efficiency (2-4 weeks, after Phase 2) +- **HCAPO per-step credit** (~80-120 lines) — only valuable once we have a mix of successful + and failed rollouts. HCAPO's 7.7-13.8% improvement over GRPO is meaningful only when GRPO + itself is working. Note: untested with VLMs/screenshots — we'd be the first. +- **Entropy-based step filtering** (DART-GUI) — train on top 80% of steps by token entropy, + focusing compute on decision points. Cheap to add (~50 lines). +- **Entropulse** (ComputerRL) — alternate RL/SFT phases to prevent entropy collapse during + extended training. Only relevant after multiple RL phases (~100 lines). +- **Experience pool** (DART-GUI) — pre-collect successful trajectories for hard tasks to + supplement sparse online success signal. 
+- **Goal**: Measurable improvement in sample efficiency over Phase 2 baseline + +### Phase 4: Scaling (4-8 weeks, when needed) - Adopt DART-GUI async architecture pattern (reimplemented against our pool infra) - Scale to 10+ parallel WAA VMs -- Only consider verl-agent if HCAPO proves insufficient +- Only consider verl-agent if HCAPO proves insufficient and GiGPO anchors can be made reliable - **Goal**: Production-grade training at scale ### What to Drop - **VAGEN-Lite as training backend** — vanilla GRPO only, no advantage over standalone - **GiGPO anchor state approach** — unreliable for pixel-based desktop screenshots + (mouse position, animation frames, anti-aliasing differ across rollouts even for + semantically identical states; a11y tree hashing is unreliable during UI transitions) - **GUI-Genesis integration** — mobile web only, no code, proprietary LLM dependency - **HiPER** — requires verl-agent anyway, adds hierarchical complexity +### Prioritization Rationale + +The research identified HCAPO as the best per-step credit method, but **per-step credit +is a Phase 3 optimization, not a Phase 1 prerequisite**. Here's why: + +1. If all 8 rollouts score 0, GRPO has zero gradient signal. HCAPO can't fix that — + it redistributes credit *within* a trajectory, but the episode-level advantage is + still zero when all rewards are equal. +2. The published 7.7% WebShop improvement is relative to a working GRPO baseline on + text environments with 5-10 step episodes. Transfer to 15-30 step visual desktop + tasks is unproven. +3. Our bottleneck is rollout success rate, not training math. Dense rewards and + API-GUI actions directly increase the fraction of rollouts with non-zero reward, + which is the prerequisite for any training algorithm to learn. + --- ## 8. Sources