Skip to content

Commit f9e5804

Browse files
abrichr and claude committed
feat: add GPU training automation for verl-agent E2E workflow
- Add GPU_VM_SIZE_FALLBACKS to azure_vm.py (NC48ads_A100_v4, NC24ads_A100_v4, NC12s_v3)
- Add GPU_INSTANCE_TYPE_FALLBACKS to aws_vm.py (p3.8xlarge, g5.12xlarge, p3.2xlarge)
- Update find_available_size_and_region(gpu=True) on both providers + protocol
- Add scripts/setup_gpu_training.sh: installs conda, vLLM, flash-attn, verl-agent
- Add scripts/train_verl_e2e.py: provisions GPU VM, uploads setup, launches training
- Add oa-vm gpu-setup and gpu-train CLI commands

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 9071fca commit f9e5804

6 files changed

Lines changed: 721 additions & 5 deletions

File tree

openadapt_evals/benchmarks/vm_cli.py

Lines changed: 241 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7784,6 +7784,181 @@ def cmd_resources(args):
77847784
return 0
77857785

77867786

7787+
# =============================================================================
7788+
# GPU Training Commands
7789+
# =============================================================================
7790+
7791+
7792+
def _get_gpu_vm_manager(cloud: str):
7793+
"""Get VM manager for GPU training."""
7794+
if cloud == "azure":
7795+
from openadapt_evals.infrastructure.azure_vm import AzureVMManager
7796+
return AzureVMManager()
7797+
elif cloud == "aws":
7798+
from openadapt_evals.infrastructure.aws_vm import AWSVMManager
7799+
return AWSVMManager()
7800+
raise ValueError(f"Unknown cloud: {cloud}")
7801+
7802+
7803+
def cmd_gpu_setup(args):
    """Provision a GPU VM and install verl-agent for RL training.

    When ``args.gpu_ip`` is set, that machine is reused. Otherwise a GPU
    size/region is chosen via ``find_available_size_and_region(gpu=True)``,
    a VM named ``verl-train-00`` is created with a 6-hour auto-shutdown, and
    the function polls until SSH answers. In both cases
    ``scripts/setup_gpu_training.sh`` is then copied to ``/tmp`` on the VM
    and executed over SSH.

    Args:
        args: Parsed CLI namespace. Reads ``cloud`` (falls back to
            ``"azure"`` when absent), ``gpu_ip`` and ``dry_run``.

    Returns:
        0 on success or after a dry run, 1 on any failure (missing setup
        script, unresolved IP, SSH timeout, upload failure, setup failure).
    """
    import subprocess
    import time
    from pathlib import Path

    from openadapt_evals.infrastructure.azure_vm import SSH_OPTS, ssh_run

    cloud = getattr(args, "cloud", "azure")
    vm = _get_gpu_vm_manager(cloud)
    username = vm.ssh_username
    gpu_vm_name = "verl-train-00"
    setup_script = Path(__file__).parent.parent.parent / "scripts" / "setup_gpu_training.sh"

    if args.gpu_ip:
        print(f"Using existing GPU VM: {args.gpu_ip}")
        # Dry run must also be honoured for an existing VM; otherwise the
        # setup script would really be uploaded and executed.
        if args.dry_run:
            print(f"[DRY RUN] Would upload and run {setup_script.name} on {args.gpu_ip}")
            return 0
    else:
        # NOTE(review): the Azure implementation of this probe is documented
        # as creating (and cleaning up) a test VM, so a dry run on Azure may
        # still touch cloud resources -- confirm whether that is acceptable.
        print("Finding available GPU VM size...")
        vm_size, region, cost = vm.find_available_size_and_region(gpu=True)
        print(f"Selected: {vm_size} (${cost:.2f}/hr) in {region}")

        if args.dry_run:
            print(f"[DRY RUN] Would provision {vm_size} in {region}")
            return 0

    # Check the local prerequisite before creating a billable VM.
    if not setup_script.exists():
        print(f"ERROR: Setup script not found: {setup_script}")
        return 1

    if args.gpu_ip:
        ip = args.gpu_ip
    else:
        print(f"Creating GPU VM '{gpu_vm_name}'...")
        info = vm.create_vm(name=gpu_vm_name, region=region, size=vm_size)
        ip = info.get("publicIpAddress") or vm.get_vm_ip(gpu_vm_name)
        vm.set_auto_shutdown(gpu_vm_name, hours=6)

        if not ip:
            print(f"ERROR: Could not determine public IP for '{gpu_vm_name}'")
            return 1

        # Poll up to 30 times, 10 s apart. Connection errors are expected
        # while the VM boots, so they are deliberately ignored here.
        print("Waiting for SSH...")
        for _ in range(30):
            try:
                result = ssh_run(ip, "echo ready", username=username, stream=False)
                if result.returncode == 0:
                    break
            except Exception:
                pass
            time.sleep(10)
        else:
            print(f"ERROR: SSH not ready after 5 minutes: {ip}")
            return 1

    print("Uploading setup script...")
    try:
        subprocess.run(
            ["scp", *SSH_OPTS, str(setup_script), f"{username}@{ip}:/tmp/setup_gpu_training.sh"],
            check=True,
        )
    except subprocess.CalledProcessError as exc:
        # Report upload failures with an exit code, like every other failure
        # path in this command, instead of an unhandled traceback.
        print(f"ERROR: Failed to upload setup script (scp exit code {exc.returncode})")
        return 1

    print("Running setup (this may take 15-30 minutes)...")
    result = ssh_run(ip, "bash /tmp/setup_gpu_training.sh", username=username, stream=True)
    if result.returncode != 0:
        print(f"ERROR: Setup failed with exit code {result.returncode}")
        return 1

    print(f"\nGPU VM ready at: {ip}")
    print(f"SSH: ssh {username}@{ip}")
    return 0
7870+
7871+
7872+
def cmd_gpu_train(args):
    """Launch verl-agent training on a GPU VM.

    When ``args.gpu_ip`` is set, that machine is reused. Otherwise a GPU VM
    named ``verl-train-00`` is provisioned with a 6-hour auto-shutdown and
    the function waits for SSH. Unless ``args.skip_setup`` is set,
    ``scripts/setup_gpu_training.sh`` is uploaded and run first. Finally
    ``verl.trainer.main_ppo`` is started over SSH with Hydra-style overrides
    built from the CLI arguments.

    Args:
        args: Parsed CLI namespace. Reads ``cloud`` (falls back to
            ``"azure"`` when absent), ``gpu_ip``, ``skip_setup``,
            ``algorithm``, ``model``, ``n_gpus``, ``waa_server``,
            ``task_id``, ``epochs`` and ``cleanup``.

    Returns:
        The exit code of the remote training command, or 1 if the VM IP
        could not be resolved, SSH never became ready, or setup failed.
    """
    import shlex
    import subprocess
    import time
    from pathlib import Path

    from openadapt_evals.infrastructure.azure_vm import SSH_OPTS, ssh_run

    cloud = getattr(args, "cloud", "azure")
    vm = _get_gpu_vm_manager(cloud)
    username = vm.ssh_username
    gpu_vm_name = "verl-train-00"

    ip = args.gpu_ip
    provisioned = not ip  # only VMs created here are ever deallocated

    if provisioned:
        # NOTE(review): the GPU fallback lists include single-GPU sizes while
        # --n-gpus defaults to 2; the selected size is not checked against
        # args.n_gpus here -- confirm whether that mismatch can occur.
        print("Finding available GPU VM size...")
        vm_size, region, cost = vm.find_available_size_and_region(gpu=True)
        print(f"Selected: {vm_size} (${cost:.2f}/hr) in {region}")
        print(f"Creating GPU VM '{gpu_vm_name}'...")
        info = vm.create_vm(name=gpu_vm_name, region=region, size=vm_size)
    else:
        print(f"Using existing GPU VM: {ip}")

    # From here on a provisioned VM exists, so every exit path (including
    # SSH timeout and setup failure) goes through the cleanup in `finally`.
    try:
        if provisioned:
            ip = info.get("publicIpAddress") or vm.get_vm_ip(gpu_vm_name)
            vm.set_auto_shutdown(gpu_vm_name, hours=6)

            if not ip:
                print(f"ERROR: Could not determine public IP for '{gpu_vm_name}'")
                return 1

            # Poll up to 30 times, 10 s apart. Connection errors are expected
            # while the VM boots, so they are deliberately ignored here.
            print("Waiting for SSH...")
            for _ in range(30):
                try:
                    result = ssh_run(ip, "echo ready", username=username, stream=False)
                    if result.returncode == 0:
                        break
                except Exception:
                    pass
                time.sleep(10)
            else:
                # Without this branch training would be attempted against a
                # VM that never accepted SSH.
                print(f"ERROR: SSH not ready after 5 minutes: {ip}")
                return 1

        if not args.skip_setup:
            setup_script = Path(__file__).parent.parent.parent / "scripts" / "setup_gpu_training.sh"
            if setup_script.exists():
                try:
                    subprocess.run(
                        ["scp", *SSH_OPTS, str(setup_script), f"{username}@{ip}:/tmp/setup_gpu_training.sh"],
                        check=True,
                    )
                except subprocess.CalledProcessError as exc:
                    print(f"ERROR: Failed to upload setup script (scp exit code {exc.returncode})")
                    return 1
                result = ssh_run(ip, "bash /tmp/setup_gpu_training.sh", username=username, stream=True)
                if result.returncode != 0:
                    print(f"ERROR: Setup failed with exit code {result.returncode}")
                    return 1
            else:
                # Best effort, as before: continue without setup, but say so.
                print(f"WARNING: Setup script not found, skipping setup: {setup_script}")

        # User-supplied values are shell-quoted because the command string is
        # interpreted by the remote shell; values made only of safe
        # characters (all the CLI defaults) are left unchanged by quote().
        overrides = [
            f"algorithm.adv_estimator={args.algorithm}",
            f"actor_rollout_ref.model.path={shlex.quote(str(args.model))}",
            "actor_rollout_ref.rollout.name=vllm",
            f"actor_rollout_ref.rollout.tensor_model_parallel_size={args.n_gpus}",
            "env.env_name=openadapt_evals.adapters.verl_env.WAADesktopEnv",
            f"env.env_kwargs.server_url={shlex.quote(str(args.waa_server))}",
            f"env.env_kwargs.task_id={shlex.quote(str(args.task_id))}",
            "env.env_kwargs.max_steps=15",
            "env.max_steps=15",
            "env.rollout.n=8",
            "data.train_batch_size=8",
            "data.max_prompt_length=2048",
            "data.max_response_length=512",
            "data.return_raw_chat=True",
            f"trainer.n_gpus_per_node={args.n_gpus}",
            "trainer.nnodes=1",
            f"trainer.total_epochs={args.epochs}",
            "trainer.logger=['console','wandb']",
            "trainer.project_name=openadapt-waa-rl",
        ]
        # NOTE(review): `conda activate` normally needs conda's shell hook,
        # which non-interactive SSH sessions often do not load -- confirm
        # that ssh_run / the setup script make this work.
        train_cmd = (
            "cd ~/verl-agent && "
            "conda activate verl-agent && "
            "python3 -m verl.trainer.main_ppo " + " ".join(overrides)
        )

        print(f"Launching {args.algorithm} training on {args.n_gpus} GPU(s)...")
        print(f"Model: {args.model}")
        print(f"WAA server: {args.waa_server}")
        print(f"Task: {args.task_id}")

        result = ssh_run(ip, train_cmd, username=username, stream=True)
        return result.returncode
    finally:
        # Never deallocate a VM the user supplied via --gpu-ip.
        if args.cleanup and provisioned:
            print(f"Deallocating GPU VM '{gpu_vm_name}'...")
            vm.deallocate_vm(gpu_vm_name)
7960+
7961+
77877962
# =============================================================================
77887963
# Main
77897964
# =============================================================================
@@ -8898,6 +9073,72 @@ def main():
88989073
)
88999074
p_view_pool.set_defaults(func=cmd_view_pool)
89009075

9076+
# --- GPU Training Commands ---
9077+
9078+
p_gpu_setup = subparsers.add_parser(
9079+
"gpu-setup",
9080+
help="Provision a GPU VM and install verl-agent for RL training",
9081+
)
9082+
p_gpu_setup.add_argument(
9083+
"--cloud", choices=["azure", "aws"], default="azure",
9084+
help="Cloud provider (default: azure)",
9085+
)
9086+
p_gpu_setup.add_argument(
9087+
"--gpu-ip", type=str, default=None,
9088+
help="Use an existing GPU VM (skip provisioning)",
9089+
)
9090+
p_gpu_setup.add_argument(
9091+
"--dry-run", action="store_true",
9092+
help="Show what would happen without doing it",
9093+
)
9094+
p_gpu_setup.set_defaults(func=cmd_gpu_setup)
9095+
9096+
p_gpu_train = subparsers.add_parser(
9097+
"gpu-train",
9098+
help="Launch verl-agent training on a GPU VM",
9099+
)
9100+
p_gpu_train.add_argument(
9101+
"--cloud", choices=["azure", "aws"], default="azure",
9102+
help="Cloud provider (default: azure)",
9103+
)
9104+
p_gpu_train.add_argument(
9105+
"--gpu-ip", type=str, default=None,
9106+
help="Use an existing GPU VM (skip provisioning)",
9107+
)
9108+
p_gpu_train.add_argument(
9109+
"--waa-server", type=str, default="http://localhost:5001",
9110+
help="WAA server URL accessible from GPU VM",
9111+
)
9112+
p_gpu_train.add_argument(
9113+
"--task-id", type=str, required=True,
9114+
help="WAA task UUID to train on",
9115+
)
9116+
p_gpu_train.add_argument(
9117+
"--algorithm", choices=["gigpo", "grpo", "ppo"], default="gigpo",
9118+
help="RL algorithm (default: gigpo)",
9119+
)
9120+
p_gpu_train.add_argument(
9121+
"--model", type=str, default="Qwen/Qwen2.5-VL-3B-Instruct",
9122+
help="Model to train",
9123+
)
9124+
p_gpu_train.add_argument(
9125+
"--n-gpus", type=int, default=2,
9126+
help="Number of GPUs (default: 2)",
9127+
)
9128+
p_gpu_train.add_argument(
9129+
"--epochs", type=int, default=100,
9130+
help="Training epochs (default: 100)",
9131+
)
9132+
p_gpu_train.add_argument(
9133+
"--skip-setup", action="store_true",
9134+
help="Skip setup (VM already configured)",
9135+
)
9136+
p_gpu_train.add_argument(
9137+
"--cleanup", action="store_true",
9138+
help="Deallocate GPU VM after training",
9139+
)
9140+
p_gpu_train.set_defaults(func=cmd_gpu_train)
9141+
89019142
args = parser.parse_args()
89029143

89039144
# Allow --resource-group to override the module-level constant

openadapt_evals/infrastructure/aws_vm.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,18 @@
4545
("c5.metal", 4.080),
4646
("m5a.xlarge", 0.172), # Non-KVM fallback (won't run QEMU, for testing only)
4747
]
48+
49+
# EC2 GPU instance types for verl-agent RL training, as
# (instance_type, cost_per_hour) pairs.
# verl-agent requires 2+ GPUs for distributed VLM training, so the
# single-GPU entry comes last as a baseline.
GPU_INSTANCE_TYPE_FALLBACKS = [
    # 4x V100 16GB NVLink -- recommended for Qwen2.5-VL-3B training.
    ("p3.8xlarge", 12.24),
    # 4x A10G 24GB -- budget option (no NVLink).
    ("g5.12xlarge", 7.48),
    # 1x V100 16GB -- single-GPU baseline (tight for 3B).
    ("p3.2xlarge", 3.06),
]
59+
4860
# Regions to try in order of preference
4961
AWS_REGIONS = ["us-east-1", "us-west-2", "us-east-2", "eu-west-1"]
5062

@@ -526,11 +538,17 @@ def set_auto_shutdown(self, name: str, hours: int = 4) -> bool:
526538
logger.warning(f"Failed to set auto-shutdown for {name}: {e}")
527539
return False
528540

529-
def find_available_size_and_region(self) -> tuple[str, str, float]:
541+
def find_available_size_and_region(
542+
self, gpu: bool = False,
543+
) -> tuple[str, str, float]:
530544
"""Find a working EC2 instance type and region.
531545
532546
Checks instance type availability in each region.
533547
548+
Args:
549+
gpu: If True, try GPU instances (for verl-agent training).
550+
Otherwise try CPU/metal instances (for WAA evaluation).
551+
534552
Returns:
535553
Tuple of (instance_type, region, cost_per_hour).
536554
@@ -539,7 +557,8 @@ def find_available_size_and_region(self) -> tuple[str, str, float]:
539557
"""
540558
import boto3
541559

542-
for instance_type, cost in INSTANCE_TYPE_FALLBACKS:
560+
fallbacks = GPU_INSTANCE_TYPE_FALLBACKS if gpu else INSTANCE_TYPE_FALLBACKS
561+
for instance_type, cost in fallbacks:
543562
for region in AWS_REGIONS:
544563
try:
545564
ec2 = boto3.client("ec2", region_name=region)

openadapt_evals/infrastructure/azure_vm.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,18 @@
6565
("Standard_D8ds_v4", 0.38),
6666
("Standard_D8as_v5", 0.34),
6767
]
68+
69+
# Azure GPU VM sizes for verl-agent RL training, as
# (vm_size, cost_per_hour) pairs.
# verl-agent requires 2+ GPUs for distributed VLM training, so the
# single-GPU entries come after the 2-GPU size.
GPU_VM_SIZE_FALLBACKS = [
    # 2x A100 80GB -- recommended for Qwen2.5-VL-3B/7B.
    ("Standard_NC48ads_A100_v4", 11.04),
    # 1x A100 80GB -- single-GPU baseline.
    ("Standard_NC24ads_A100_v4", 5.52),
    # 1x V100 16GB -- budget option (tight for 3B).
    ("Standard_NC12s_v3", 1.50),
]
79+
6880
VM_REGIONS = ["centralus", "eastus", "westus2", "eastus2"]
6981

7082
# Ubuntu 22.04 LTS image reference for Azure SDK
@@ -522,19 +534,25 @@ def _cli_set_auto_shutdown(self, name: str, hours: int) -> bool:
522534
)
523535
return result.returncode == 0
524536

525-
def find_available_size_and_region(self) -> tuple[str, str, float]:
537+
def find_available_size_and_region(
538+
self, gpu: bool = False,
539+
) -> tuple[str, str, float]:
526540
"""Find a working VM size and region by creating a test VM.
527541
528542
Tries size/region combinations until one succeeds, then cleans up
529543
the test VM.
530544
545+
Args:
546+
gpu: If True, try GPU sizes (for verl-agent training).
547+
Otherwise try CPU sizes (for WAA evaluation).
548+
531549
Returns:
532550
Tuple of (vm_size, region, cost_per_hour).
533551
534552
Raises:
535553
RuntimeError: If no available size/region found.
536554
"""
537-
sizes_to_try = VM_SIZE_FALLBACKS
555+
sizes_to_try = GPU_VM_SIZE_FALLBACKS if gpu else VM_SIZE_FALLBACKS
538556

539557
test_vm_to_cleanup = None
540558
try:

openadapt_evals/infrastructure/vm_provider.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,9 +74,14 @@ def set_auto_shutdown(self, name: str, hours: int = 4) -> bool:
7474
"""Set auto-shutdown policy on a VM."""
7575
...
7676

77-
def find_available_size_and_region(self) -> tuple[str, str, float]:
77+
def find_available_size_and_region(
78+
self, gpu: bool = False,
79+
) -> tuple[str, str, float]:
7880
"""Find a working VM size and region.
7981
82+
Args:
83+
gpu: If True, try GPU sizes for RL training.
84+
8085
Returns:
8186
Tuple of (vm_size, region, cost_per_hour).
8287
"""

0 commit comments

Comments
 (0)