@@ -782,9 +782,19 @@ def cmd_pool_create(args):
782782 working_size = vm_size
783783 working_region = region
784784 working_cost = cost
785- # Delete the test VM
785+ # Delete the test VM and wait for completion
786+ log ("POOL" , f" Found working combo, cleaning up test VM..." )
786787 subprocess .run (
787- ["az" , "vm" , "delete" , "-g" , RESOURCE_GROUP , "-n" , test_name , "--yes" , "--force-deletion" , "true" , "--no-wait" ],
788+ ["az" , "vm" , "delete" , "-g" , RESOURCE_GROUP , "-n" , test_name , "--yes" , "--force-deletion" , "true" ],
789+ capture_output = True ,
790+ )
791+ # Also clean up associated resources
792+ subprocess .run (
793+ ["az" , "network" , "nic" , "delete" , "-g" , RESOURCE_GROUP , "-n" , f"{ test_name } VMNic" ],
794+ capture_output = True ,
795+ )
796+ subprocess .run (
797+ ["az" , "network" , "public-ip" , "delete" , "-g" , RESOURCE_GROUP , "-n" , f"{ test_name } PublicIP" ],
788798 capture_output = True ,
789799 )
790800 break
@@ -833,7 +843,15 @@ def create_worker(worker_idx: int) -> tuple[str, str | None, str | None]:
833843 )
834844
835845 if result .returncode != 0 :
836- return (name , None , result .stderr [:200 ] if result .stderr else "unknown error" )
846+ # Parse error for better message
847+ error_msg = result .stderr or "unknown error"
848+ try :
849+ error_json = json .loads (error_msg )
850+ if "error" in error_json :
851+ error_msg = error_json ["error" ].get ("message" , error_msg )[:500 ]
852+ except json .JSONDecodeError :
853+ error_msg = error_msg [:500 ]
854+ return (name , None , error_msg )
837855
838856 try :
839857 vm_info = json .loads (result .stdout )
@@ -892,21 +910,22 @@ def create_worker(worker_idx: int) -> tuple[str, str | None, str | None]:
892910sudo bash -c 'echo "{\\ "data-root\\ ": \\ "/mnt/docker\\ "}" > /etc/docker/daemon.json'
893911sudo systemctl start docker
894912
895- # Pull WAA image
896- docker pull windowsarena/winarena:latest
913+ # Pull WAA image (use sudo since usermod hasn't taken effect yet)
914+ sudo docker pull windowsarena/winarena:latest
897915"""
898916
899- def setup_docker (name_ip : tuple [str , str ]) -> tuple [str , bool ]:
917+ def setup_docker (name_ip : tuple [str , str ]) -> tuple [str , bool , str ]:
900918 name , ip = name_ip
901919 result = ssh_run (ip , docker_setup , stream = False , step = "DOCKER" )
902- return (name , result .returncode == 0 )
920+ error = result .stderr [:200 ] if result .stderr else ""
921+ return (name , result .returncode == 0 , error )
903922
904923 with ThreadPoolExecutor (max_workers = min (len (workers_ready ), 5 )) as executor :
905924 futures = {executor .submit (setup_docker , w ): w [0 ] for w in workers_ready }
906925 workers_docker_ok = []
907926 for future in as_completed (futures ):
908- name , success = future .result ()
909- status = "Docker ready" if success else "Docker FAILED"
927+ name , success , error = future .result ()
928+ status = "Docker ready" if success else f "Docker FAILED: { error [: 100 ] } "
910929 log ("POOL" , f" { name } : { status } " )
911930 if success :
912931 workers_docker_ok .append ((name , dict (workers_ready )[name ]))
@@ -1185,6 +1204,102 @@ def run_on_worker(worker, task_indices: list[int]) -> tuple[str, int, int, str]:
11851204 return 0 if total_failed == 0 else 1
11861205
11871206
def cmd_pool_cleanup(args):
    """Clean up orphaned pool resources (VMs, NICs, IPs, disks).

    Use this after failed pool operations to clean up resources that
    weren't properly deleted. Resources are discovered by listing each
    resource type in ``RESOURCE_GROUP`` and keeping the names that contain
    ``waa-pool``.

    Args:
        args: Parsed CLI arguments. If ``args.yes`` is truthy the
            interactive confirmation prompt is skipped.

    Returns:
        0 if nothing needed deleting, the user aborted, or every deletion
        succeeded; 1 if any ``az`` list or delete command failed.
    """
    init_logging()

    log("POOL-CLEANUP", "Searching for orphaned pool resources...")

    name_filter = "[?contains(name, 'waa-pool')].name"

    # (summary label, per-item label, az command group, extra delete flags)
    # Order matters for deletion: VMs go first because deleting a VM
    # releases its NIC, which in turn must go before its public IP.
    resource_kinds = [
        ("VMs", "VM", ["az", "vm"], ["--yes", "--force-deletion", "true"]),
        ("NICs", "NIC", ["az", "network", "nic"], []),
        ("Public IPs", "IP", ["az", "network", "public-ip"], []),
        ("Disks", "disk", ["az", "disk"], ["--yes"]),
    ]

    # Discover matching resources of every kind.
    discovered = []
    list_failed = False
    for summary_label, item_label, az_group, delete_flags in resource_kinds:
        result = subprocess.run(
            [*az_group, "list", "-g", RESOURCE_GROUP, "--query", name_filter, "-o", "tsv"],
            capture_output=True, text=True,
        )
        if result.returncode != 0:
            # A failed listing must not be mistaken for "nothing to clean".
            list_failed = True
            detail = (result.stderr or "").strip()[:200] or "unknown error"
            log("POOL-CLEANUP", f"Failed to list {summary_label}: {detail}")
            names = []
        else:
            names = [line.strip() for line in result.stdout.splitlines() if line.strip()]
        discovered.append((summary_label, item_label, az_group, delete_flags, names))

    total = sum(len(entry[-1]) for entry in discovered)

    if total == 0:
        if list_failed:
            log("POOL-CLEANUP", "Could not list all resource types; unable to confirm nothing is orphaned.")
            return 1
        log("POOL-CLEANUP", "No orphaned resources found.")
        return 0

    log("POOL-CLEANUP", f"Found {total} orphaned resources:")
    for summary_label, _, _, _, names in discovered:
        if names:
            log("POOL-CLEANUP", f" {summary_label}: {len(names)}")

    if not getattr(args, "yes", False):
        try:
            confirm = input("\nDelete these resources? [y/N]: ")
        except EOFError:
            # Non-interactive stdin: treat as "no" rather than crashing.
            confirm = ""
        if confirm.strip().lower() != "y":
            log("POOL-CLEANUP", "Aborted.")
            return 0

    # Delete in table order, continuing past individual failures so one
    # stuck resource does not block cleanup of the rest.
    delete_failures = 0
    for _, item_label, az_group, delete_flags, names in discovered:
        for name in names:
            log("POOL-CLEANUP", f" Deleting {item_label}: {name}")
            result = subprocess.run(
                [*az_group, "delete", "-g", RESOURCE_GROUP, "-n", name, *delete_flags],
                capture_output=True, text=True,
            )
            if result.returncode != 0:
                delete_failures += 1
                detail = (result.stderr or "").strip()[:200] or "unknown error"
                log("POOL-CLEANUP", f" FAILED to delete {item_label} {name}: {detail}")

    if delete_failures or list_failed:
        log("POOL-CLEANUP", f"Cleanup finished with errors ({delete_failures} deletion(s) failed).")
        return 1

    log("POOL-CLEANUP", "Cleanup complete.")
    return 0
1301+
1302+
11881303def cmd_status (args ):
11891304 """Show VM status."""
11901305 ip = get_vm_ip ()
@@ -7056,6 +7171,16 @@ def main():
70567171 )
70577172 p_pool_run .set_defaults (func = cmd_pool_run )
70587173
7174+ # pool-cleanup
7175+ p_pool_cleanup = subparsers .add_parser (
7176+ "pool-cleanup" , help = "Clean up orphaned pool resources (VMs, NICs, IPs, disks)"
7177+ )
7178+ p_pool_cleanup .add_argument (
7179+ "-y" , "--yes" , action = "store_true" ,
7180+ help = "Skip confirmation"
7181+ )
7182+ p_pool_cleanup .set_defaults (func = cmd_pool_cleanup )
7183+
70597184 # status
70607185 p_status = subparsers .add_parser ("status" , help = "Show VM status" )
70617186 p_status .set_defaults (func = cmd_status )
0 commit comments