@@ -7784,6 +7784,181 @@ def cmd_resources(args):
77847784 return 0
77857785
77867786
7787+ # =============================================================================
7788+ # GPU Training Commands
7789+ # =============================================================================
7790+
7791+
def _get_gpu_vm_manager(cloud: str):
    """Instantiate the VM manager matching the requested cloud provider.

    The provider module is imported lazily, so only the selected cloud's
    module is loaded.

    Args:
        cloud: Provider key; "azure" and "aws" are recognised.

    Returns:
        A freshly constructed ``AzureVMManager`` or ``AWSVMManager``.

    Raises:
        ValueError: If ``cloud`` is neither "azure" nor "aws".
    """
    # Reject unknown providers up front so no provider module is imported.
    if cloud not in ("azure", "aws"):
        raise ValueError(f"Unknown cloud: { cloud } ")

    if cloud == "aws":
        from openadapt_evals.infrastructure.aws_vm import AWSVMManager as manager_cls
    else:
        from openadapt_evals.infrastructure.azure_vm import AzureVMManager as manager_cls
    return manager_cls()
7801+
7802+
def cmd_gpu_setup(args):
    """Provision a GPU VM and install verl-agent for RL training.

    With ``args.gpu_ip`` set, that host is used as-is; otherwise a VM named
    ``verl-train-00`` is created via the cloud's VM manager and auto-shutdown
    is requested with ``hours=6``. The local ``scripts/setup_gpu_training.sh``
    is then copied to ``/tmp`` on the host with ``scp`` and run with ``bash``.

    Args:
        args: Parsed CLI namespace. Reads ``cloud`` (defaults to "azure"),
            ``gpu_ip`` and ``dry_run``.

    Returns:
        0 on success or after a dry run; 1 if the setup script is missing,
        the VM has no IP, SSH never becomes reachable, the upload fails, or
        the remote setup script exits non-zero.
    """
    import subprocess
    import time
    from pathlib import Path

    from openadapt_evals.infrastructure.azure_vm import SSH_OPTS, ssh_run

    cloud = getattr(args, "cloud", "azure")
    vm = _get_gpu_vm_manager(cloud)
    username = vm.ssh_username
    gpu_vm_name = "verl-train-00"
    setup_script = Path(__file__).parent.parent.parent / "scripts" / "setup_gpu_training.sh"
    remote_script = "/tmp/setup_gpu_training.sh"

    if args.gpu_ip:
        ip = args.gpu_ip
        print(f"Using existing GPU VM: {ip} ")
        # Honour --dry-run for existing VMs too; previously the script was
        # uploaded and executed even though a dry run was requested.
        if args.dry_run:
            print(f"[DRY RUN] Would upload and run {setup_script.name} on {ip}")
            return 0
    else:
        print("Finding available GPU VM size...")
        vm_size, region, cost = vm.find_available_size_and_region(gpu=True)
        print(f"Selected: {vm_size} (${cost:.2f} /hr) in {region} ")

        if args.dry_run:
            print(f"[DRY RUN] Would provision {vm_size} in {region} ")
            return 0

    # Fail fast: check for the script before creating a billable VM.
    if not setup_script.exists():
        print(f"ERROR: Setup script not found: {setup_script} ")
        return 1

    if not args.gpu_ip:
        print(f"Creating GPU VM '{gpu_vm_name} '...")
        info = vm.create_vm(name=gpu_vm_name, region=region, size=vm_size)
        ip = info.get("publicIpAddress") or vm.get_vm_ip(gpu_vm_name)
        vm.set_auto_shutdown(gpu_vm_name, hours=6)
        if not ip:
            print(f"ERROR: Could not determine IP address of GPU VM '{gpu_vm_name}'")
            return 1

    # Poll up to 30 times with a 10 s pause between attempts.
    print("Waiting for SSH...")
    for _ in range(30):
        try:
            probe = ssh_run(ip, "echo ready", username=username, stream=False)
            if probe.returncode == 0:
                break
        except Exception:
            # Deliberate best-effort: any probe failure just means "retry".
            pass
        time.sleep(10)
    else:
        print(f"ERROR: SSH not ready after 5 minutes: {ip} ")
        return 1

    print("Uploading setup script...")
    try:
        subprocess.run(
            ["scp", *SSH_OPTS, str(setup_script), f"{username}@{ip}:{remote_script}"],
            check=True,
        )
    except (subprocess.CalledProcessError, OSError) as exc:
        # Report upload failures like every other failure path here
        # instead of letting a traceback escape.
        print(f"ERROR: Failed to upload setup script: {exc}")
        return 1

    print("Running setup (this may take 15-30 minutes)...")
    result = ssh_run(ip, f"bash {remote_script}", username=username, stream=True)
    if result.returncode != 0:
        print(f"ERROR: Setup failed with exit code {result.returncode} ")
        return 1

    print(f"\n GPU VM ready at: {ip} ")
    print(f"SSH: ssh {username}@{ip}")
    return 0
7870+
7871+
def _build_verl_train_command(args):
    """Build the remote shell command line that starts verl-agent training.

    Args:
        args: Parsed CLI namespace providing ``algorithm``, ``model``,
            ``n_gpus``, ``waa_server``, ``task_id`` and ``epochs``.

    Returns:
        One shell string that changes into ``~/verl-agent``, activates the
        ``verl-agent`` conda environment and runs ``verl.trainer.main_ppo``
        with one shell-quoted ``key=value`` override per setting.
    """
    import shlex

    overrides = [
        f"algorithm.adv_estimator={args.algorithm}",
        f"actor_rollout_ref.model.path={args.model}",
        "actor_rollout_ref.rollout.name=vllm",
        f"actor_rollout_ref.rollout.tensor_model_parallel_size={args.n_gpus}",
        "env.env_name=openadapt_evals.adapters.verl_env.WAADesktopEnv",
        f"env.env_kwargs.server_url={args.waa_server}",
        f"env.env_kwargs.task_id={args.task_id}",
        "env.env_kwargs.max_steps=15",
        "env.max_steps=15",
        "env.rollout.n=8",
        "data.train_batch_size=8",
        "data.max_prompt_length=2048",
        "data.max_response_length=512",
        "data.return_raw_chat=True",
        f"trainer.n_gpus_per_node={args.n_gpus}",
        "trainer.nnodes=1",
        f"trainer.total_epochs={args.epochs}",
        # The unquoted form ['console','wandb'] reached the program as
        # [console,wandb] once the remote shell stripped the quotes; pass
        # that same token, now quoted so the brackets cannot glob.
        "trainer.logger=[console,wandb]",
        "trainer.project_name=openadapt-waa-rl",
    ]
    # Quote each token so user-supplied values cannot inject shell syntax
    # or be split into several arguments by the remote shell.
    launch = " ".join(
        shlex.quote(token)
        for token in ("python3", "-m", "verl.trainer.main_ppo", *overrides)
    )
    # NOTE(review): `conda activate` presumably requires conda to be
    # initialised in the non-interactive SSH shell -- confirm on the VM.
    return f"cd ~/verl-agent && conda activate verl-agent && {launch}"


def cmd_gpu_train(args):
    """Launch verl-agent training on a GPU VM.

    With ``args.gpu_ip`` set, that host is used; otherwise a VM named
    ``verl-train-00`` is created and auto-shutdown is requested with
    ``hours=6``. Unless ``args.skip_setup`` is set, the local
    ``scripts/setup_gpu_training.sh`` is uploaded and run first. Training is
    then started over SSH with output streamed.

    Args:
        args: Parsed CLI namespace. Reads ``cloud`` (defaults to "azure"),
            ``gpu_ip``, ``skip_setup``, ``cleanup`` and the training
            settings ``algorithm``, ``model``, ``n_gpus``, ``waa_server``,
            ``task_id`` and ``epochs``.

    Returns:
        The exit code of the remote training command, or 1 if the VM has no
        IP, SSH never becomes reachable, or the setup step fails.
    """
    import subprocess
    import time
    from pathlib import Path

    from openadapt_evals.infrastructure.azure_vm import SSH_OPTS, ssh_run

    cloud = getattr(args, "cloud", "azure")
    vm = _get_gpu_vm_manager(cloud)
    username = vm.ssh_username
    gpu_vm_name = "verl-train-00"
    # Only a VM created by this invocation may be deallocated afterwards.
    provisioned = not args.gpu_ip

    if provisioned:
        print("Finding available GPU VM size...")
        vm_size, region, cost = vm.find_available_size_and_region(gpu=True)
        print(f"Selected: {vm_size} (${cost:.2f} /hr) in {region} ")
        print(f"Creating GPU VM '{gpu_vm_name} '...")
        info = vm.create_vm(name=gpu_vm_name, region=region, size=vm_size)
    else:
        ip = args.gpu_ip
        print(f"Using existing GPU VM: {ip} ")

    # Everything after VM creation runs under the cleanup guard, so
    # --cleanup also applies when setup fails or raises.
    try:
        if provisioned:
            ip = info.get("publicIpAddress") or vm.get_vm_ip(gpu_vm_name)
            vm.set_auto_shutdown(gpu_vm_name, hours=6)
            if not ip:
                print(f"ERROR: Could not determine IP address of GPU VM '{gpu_vm_name}'")
                return 1

        # Poll up to 30 times with a 10 s pause; abort instead of running
        # setup and training against a host that never answered.
        print("Waiting for SSH...")
        for _ in range(30):
            try:
                probe = ssh_run(ip, "echo ready", username=username, stream=False)
                if probe.returncode == 0:
                    break
            except Exception:
                # Deliberate best-effort: any probe failure means "retry".
                pass
            time.sleep(10)
        else:
            print(f"ERROR: SSH not ready after 5 minutes: {ip}")
            return 1

        if not args.skip_setup:
            setup_script = (
                Path(__file__).parent.parent.parent / "scripts" / "setup_gpu_training.sh"
            )
            if setup_script.exists():
                try:
                    subprocess.run(
                        [
                            "scp",
                            *SSH_OPTS,
                            str(setup_script),
                            f"{username}@{ip}:/tmp/setup_gpu_training.sh",
                        ],
                        check=True,
                    )
                except (subprocess.CalledProcessError, OSError) as exc:
                    print(f"ERROR: Failed to upload setup script: {exc}")
                    return 1
                result = ssh_run(
                    ip, "bash /tmp/setup_gpu_training.sh", username=username, stream=True
                )
                if result.returncode != 0:
                    print(f"ERROR: Setup failed with exit code {result.returncode}")
                    return 1
            else:
                # Still skipped (the VM may already be configured), but no
                # longer silently.
                print(f"WARNING: Setup script not found, skipping setup: {setup_script}")

        train_cmd = _build_verl_train_command(args)

        print(f"Launching {args.algorithm} training on {args.n_gpus} GPU(s)...")
        print(f"Model: {args.model} ")
        print(f"WAA server: {args.waa_server} ")
        print(f"Task: {args.task_id} ")

        result = ssh_run(ip, train_cmd, username=username, stream=True)
        return result.returncode
    finally:
        if args.cleanup and provisioned:
            print(f"Deallocating GPU VM '{gpu_vm_name} '...")
            vm.deallocate_vm(gpu_vm_name)
7960+
7961+
77877962# =============================================================================
77887963# Main
77897964# =============================================================================
@@ -8898,6 +9073,72 @@ def main():
88989073 )
88999074 p_view_pool .set_defaults (func = cmd_view_pool )
89009075
9076+ # --- GPU Training Commands ---
9077+
9078+ p_gpu_setup = subparsers .add_parser (
9079+ "gpu-setup" ,
9080+ help = "Provision a GPU VM and install verl-agent for RL training" ,
9081+ )
9082+ p_gpu_setup .add_argument (
9083+ "--cloud" , choices = ["azure" , "aws" ], default = "azure" ,
9084+ help = "Cloud provider (default: azure)" ,
9085+ )
9086+ p_gpu_setup .add_argument (
9087+ "--gpu-ip" , type = str , default = None ,
9088+ help = "Use an existing GPU VM (skip provisioning)" ,
9089+ )
9090+ p_gpu_setup .add_argument (
9091+ "--dry-run" , action = "store_true" ,
9092+ help = "Show what would happen without doing it" ,
9093+ )
9094+ p_gpu_setup .set_defaults (func = cmd_gpu_setup )
9095+
9096+ p_gpu_train = subparsers .add_parser (
9097+ "gpu-train" ,
9098+ help = "Launch verl-agent training on a GPU VM" ,
9099+ )
9100+ p_gpu_train .add_argument (
9101+ "--cloud" , choices = ["azure" , "aws" ], default = "azure" ,
9102+ help = "Cloud provider (default: azure)" ,
9103+ )
9104+ p_gpu_train .add_argument (
9105+ "--gpu-ip" , type = str , default = None ,
9106+ help = "Use an existing GPU VM (skip provisioning)" ,
9107+ )
9108+ p_gpu_train .add_argument (
9109+ "--waa-server" , type = str , default = "http://localhost:5001" ,
9110+ help = "WAA server URL accessible from GPU VM" ,
9111+ )
9112+ p_gpu_train .add_argument (
9113+ "--task-id" , type = str , required = True ,
9114+ help = "WAA task UUID to train on" ,
9115+ )
9116+ p_gpu_train .add_argument (
9117+ "--algorithm" , choices = ["gigpo" , "grpo" , "ppo" ], default = "gigpo" ,
9118+ help = "RL algorithm (default: gigpo)" ,
9119+ )
9120+ p_gpu_train .add_argument (
9121+ "--model" , type = str , default = "Qwen/Qwen2.5-VL-3B-Instruct" ,
9122+ help = "Model to train" ,
9123+ )
9124+ p_gpu_train .add_argument (
9125+ "--n-gpus" , type = int , default = 2 ,
9126+ help = "Number of GPUs (default: 2)" ,
9127+ )
9128+ p_gpu_train .add_argument (
9129+ "--epochs" , type = int , default = 100 ,
9130+ help = "Training epochs (default: 100)" ,
9131+ )
9132+ p_gpu_train .add_argument (
9133+ "--skip-setup" , action = "store_true" ,
9134+ help = "Skip setup (VM already configured)" ,
9135+ )
9136+ p_gpu_train .add_argument (
9137+ "--cleanup" , action = "store_true" ,
9138+ help = "Deallocate GPU VM after training" ,
9139+ )
9140+ p_gpu_train .set_defaults (func = cmd_gpu_train )
9141+
89019142 args = parser .parse_args ()
89029143
89039144 # Allow --resource-group to override the module-level constant
0 commit comments