Skip to content

Commit 6ead5ff

Browse files
abrichr and claude committed
fix(cli): improve pool-create reliability and error handling
- Properly clean up test VM and associated resources during quota check
- Use sudo for docker pull (usermod not effective in same session)
- Add pool-cleanup command for orphaned resources
- Show full error messages in pool creation failures

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent d988e56 commit 6ead5ff

1 file changed

Lines changed: 134 additions & 9 deletions

File tree

openadapt_ml/benchmarks/cli.py

Lines changed: 134 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -782,9 +782,19 @@ def cmd_pool_create(args):
782782
working_size = vm_size
783783
working_region = region
784784
working_cost = cost
785-
# Delete the test VM
785+
# Delete the test VM and wait for completion
786+
log("POOL", f" Found working combo, cleaning up test VM...")
786787
subprocess.run(
787-
["az", "vm", "delete", "-g", RESOURCE_GROUP, "-n", test_name, "--yes", "--force-deletion", "true", "--no-wait"],
788+
["az", "vm", "delete", "-g", RESOURCE_GROUP, "-n", test_name, "--yes", "--force-deletion", "true"],
789+
capture_output=True,
790+
)
791+
# Also clean up associated resources
792+
subprocess.run(
793+
["az", "network", "nic", "delete", "-g", RESOURCE_GROUP, "-n", f"{test_name}VMNic"],
794+
capture_output=True,
795+
)
796+
subprocess.run(
797+
["az", "network", "public-ip", "delete", "-g", RESOURCE_GROUP, "-n", f"{test_name}PublicIP"],
788798
capture_output=True,
789799
)
790800
break
@@ -833,7 +843,15 @@ def create_worker(worker_idx: int) -> tuple[str, str | None, str | None]:
833843
)
834844

835845
if result.returncode != 0:
836-
return (name, None, result.stderr[:200] if result.stderr else "unknown error")
846+
# Parse error for better message
847+
error_msg = result.stderr or "unknown error"
848+
try:
849+
error_json = json.loads(error_msg)
850+
if "error" in error_json:
851+
error_msg = error_json["error"].get("message", error_msg)[:500]
852+
except json.JSONDecodeError:
853+
error_msg = error_msg[:500]
854+
return (name, None, error_msg)
837855

838856
try:
839857
vm_info = json.loads(result.stdout)
@@ -892,21 +910,22 @@ def create_worker(worker_idx: int) -> tuple[str, str | None, str | None]:
892910
sudo bash -c 'echo "{\\"data-root\\": \\"/mnt/docker\\"}" > /etc/docker/daemon.json'
893911
sudo systemctl start docker
894912
895-
# Pull WAA image
896-
docker pull windowsarena/winarena:latest
913+
# Pull WAA image (use sudo since usermod hasn't taken effect yet)
914+
sudo docker pull windowsarena/winarena:latest
897915
"""
898916

899-
def setup_docker(name_ip: tuple[str, str]) -> tuple[str, bool]:
917+
def setup_docker(name_ip: tuple[str, str]) -> tuple[str, bool, str]:
900918
name, ip = name_ip
901919
result = ssh_run(ip, docker_setup, stream=False, step="DOCKER")
902-
return (name, result.returncode == 0)
920+
error = result.stderr[:200] if result.stderr else ""
921+
return (name, result.returncode == 0, error)
903922

904923
with ThreadPoolExecutor(max_workers=min(len(workers_ready), 5)) as executor:
905924
futures = {executor.submit(setup_docker, w): w[0] for w in workers_ready}
906925
workers_docker_ok = []
907926
for future in as_completed(futures):
908-
name, success = future.result()
909-
status = "Docker ready" if success else "Docker FAILED"
927+
name, success, error = future.result()
928+
status = "Docker ready" if success else f"Docker FAILED: {error[:100]}"
910929
log("POOL", f" {name}: {status}")
911930
if success:
912931
workers_docker_ok.append((name, dict(workers_ready)[name]))
@@ -1185,6 +1204,102 @@ def run_on_worker(worker, task_indices: list[int]) -> tuple[str, int, int, str]:
11851204
return 0 if total_failed == 0 else 1
11861205

11871206

1207+
def _list_pool_resources(group):
    """List the names of pool resources of one Azure resource type.

    Runs ``az <group...> list`` against RESOURCE_GROUP and keeps only the
    resources whose name contains ``waa-pool``.

    Args:
        group: Azure CLI command group as a sequence of words,
            e.g. ``("network", "nic")``.

    Returns:
        A ``(names, error)`` tuple. ``error`` is ``None`` when the command
        exited with status 0; otherwise ``names`` is empty and ``error``
        holds the stderr text, truncated to 500 characters.
    """
    result = subprocess.run(
        [
            "az", *group, "list", "-g", RESOURCE_GROUP,
            "--query", "[?contains(name, 'waa-pool')].name", "-o", "tsv",
        ],
        capture_output=True, text=True,
    )
    if result.returncode != 0:
        error = (result.stderr or "").strip() or "unknown error"
        return [], error[:500]
    names = [line.strip() for line in result.stdout.splitlines() if line.strip()]
    return names, None


def _delete_pool_resource(group, name, extra_flags):
    """Delete one named resource from RESOURCE_GROUP.

    Args:
        group: Azure CLI command group, e.g. ``("vm",)``.
        name: Name of the resource to delete.
        extra_flags: Additional flags appended to ``az <group...> delete``.

    Returns:
        ``None`` on success, otherwise the stderr text truncated to
        500 characters.
    """
    result = subprocess.run(
        ["az", *group, "delete", "-g", RESOURCE_GROUP, "-n", name, *extra_flags],
        capture_output=True, text=True,
    )
    if result.returncode == 0:
        return None
    error = (result.stderr or "").strip() or "unknown error"
    return error[:500]


def cmd_pool_cleanup(args):
    """Clean up orphaned pool resources (VMs, NICs, IPs, disks).

    Use this after failed pool operations to clean up resources that
    weren't properly deleted.

    Args:
        args: Parsed CLI arguments. If ``args.yes`` is truthy the
            interactive confirmation prompt is skipped.

    Returns:
        0 when nothing needed cleaning, the user declined, or every
        resource was deleted; 1 when any list or delete command failed, so
        that callers and scripts can tell a partial cleanup from a full one.
    """
    init_logging()

    log("POOL-CLEANUP", "Searching for orphaned pool resources...")

    # (summary label, per-item label, az command group, extra delete flags).
    # Order matters: VMs must go first because deleting them releases the
    # NICs, which in turn hold the public IPs.
    kinds = (
        ("VMs", "VM", ("vm",), ("--yes", "--force-deletion", "true")),
        ("NICs", "NIC", ("network", "nic"), ()),
        ("Public IPs", "IP", ("network", "public-ip"), ()),
        ("Disks", "disk", ("disk",), ("--yes",)),
    )

    found = []
    list_failed = False
    for summary_label, item_label, group, delete_flags in kinds:
        names, error = _list_pool_resources(group)
        if error is not None:
            # A failed listing must not be mistaken for "nothing to clean".
            list_failed = True
            log("POOL-CLEANUP", f" Failed to list {summary_label}: {error}")
        found.append((summary_label, item_label, group, delete_flags, names))

    total = sum(len(entry[4]) for entry in found)

    if total == 0:
        if list_failed:
            log("POOL-CLEANUP", "Could not list all resource types; nothing deleted.")
            return 1
        log("POOL-CLEANUP", "No orphaned resources found.")
        return 0

    log("POOL-CLEANUP", f"Found {total} orphaned resources:")
    for summary_label, _, _, _, names in found:
        if names:
            log("POOL-CLEANUP", f" {summary_label}: {len(names)}")

    if not getattr(args, "yes", False):
        try:
            confirm = input("\nDelete these resources? [y/N]: ")
        except EOFError:
            # Non-interactive stdin: fall back to the safe default ("no").
            confirm = ""
        if confirm.strip().lower() != "y":
            log("POOL-CLEANUP", "Aborted.")
            return 0

    failures = 0
    for _, item_label, group, delete_flags, names in found:
        for name in names:
            log("POOL-CLEANUP", f" Deleting {item_label}: {name}")
            error = _delete_pool_resource(group, name, delete_flags)
            if error is not None:
                failures += 1
                log("POOL-CLEANUP", f" FAILED to delete {item_label} {name}: {error}")

    if failures or list_failed:
        log(
            "POOL-CLEANUP",
            f"Cleanup incomplete: {failures} deletion(s) failed; re-run pool-cleanup.",
        )
        return 1

    log("POOL-CLEANUP", "Cleanup complete.")
    return 0
1301+
1302+
11881303
def cmd_status(args):
11891304
"""Show VM status."""
11901305
ip = get_vm_ip()
@@ -7056,6 +7171,16 @@ def main():
70567171
)
70577172
p_pool_run.set_defaults(func=cmd_pool_run)
70587173

7174+
# pool-cleanup
7175+
p_pool_cleanup = subparsers.add_parser(
7176+
"pool-cleanup", help="Clean up orphaned pool resources (VMs, NICs, IPs, disks)"
7177+
)
7178+
p_pool_cleanup.add_argument(
7179+
"-y", "--yes", action="store_true",
7180+
help="Skip confirmation"
7181+
)
7182+
p_pool_cleanup.set_defaults(func=cmd_pool_cleanup)
7183+
70597184
# status
70607185
p_status = subparsers.add_parser("status", help="Show VM status")
70617186
p_status.set_defaults(func=cmd_status)

0 commit comments

Comments
 (0)