Skip to content

Commit b0e09e9

Browse files
abrichr and claude authored
fix(cli): fix --task flag concatenation bug and three other issues (#31)
* fix(cli): fix --task flag concatenation bug and three other issues

  Bug 1 (Critical): --task flag produced `find_task.pycd` due to missing `&&` separator between pre_cmd and `cd /client`. Every `run --task` invocation since v0.4.2 silently failed. Fixed by adding `&&`.

  Bug 2: --num-tasks defaulted to 1, silently limiting runs. Changed default to None (all tasks).

  Bug 3: probe --wait timeout of 1200s was too short for first boot (OOBE takes 18-22 min). Increased to 1800s.

  Bug 4: Default VM size (D4ds_v4, 16GB) OOMs with navi agent's GroundingDINO + SoM models. Changed default to D8ds_v5 (32GB). Added warning when standard mode is used explicitly.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* refactor: remove --fast flag, standardize on D8ds_v5 (32GB) VM

  D4ds_v4 (16GB) OOMs with navi agent's GroundingDINO + SoM models. Standardize on D8ds_v5 across all commands — no more --fast/--standard flags.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 12b6189 commit b0e09e9

3 files changed

Lines changed: 36 additions & 106 deletions

File tree

openadapt_evals/benchmarks/vm_cli.py

Lines changed: 29 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -46,19 +46,16 @@
4646
# Constants (single source of truth)
4747
# =============================================================================
4848

49-
# VM sizes with nested virtualization support
50-
# Standard: $0.19/hr, 4 vCPU, 16GB RAM - baseline
51-
# Fast: $0.38/hr, 8 vCPU, 32GB RAM - ~30% faster install, ~40% faster eval
52-
VM_SIZE_STANDARD = "Standard_D4ds_v4"
53-
VM_SIZE_FAST = "Standard_D8ds_v5"
54-
VM_SIZE = VM_SIZE_STANDARD # Default, can be overridden by --fast flag
55-
56-
# Fallback sizes for --fast mode (in order of preference)
49+
# VM size: D8ds_v5 ($0.38/hr, 8 vCPU, 32GB RAM)
50+
# D4ds_v4 (16GB) OOMs with navi agent's GroundingDINO + SoM models — do not use.
51+
VM_SIZE = "Standard_D8ds_v5"
52+
53+
# Fallback VM sizes (in order of preference, all 8 vCPU / 32GB)
5754
# D8ds_v5: First choice (v5 with local SSD)
5855
# D8s_v5: v5 without local SSD
5956
# D8ds_v4: v4 with local SSD
6057
# D8as_v5: AMD version
61-
VM_SIZE_FAST_FALLBACKS = [
58+
VM_SIZE_FALLBACKS = [
6259
("Standard_D8ds_v5", 0.38),
6360
("Standard_D8s_v5", 0.36),
6461
("Standard_D8ds_v4", 0.38),
@@ -258,19 +255,12 @@ def cmd_create(args):
258255
log("CREATE", "Use 'delete' first if you want to recreate")
259256
return 0
260257

261-
# Determine which sizes to try
262-
use_fast = getattr(args, "fast", False)
263-
if use_fast:
264-
# Try multiple fast sizes with fallbacks
265-
sizes_to_try = VM_SIZE_FAST_FALLBACKS
266-
log(
267-
"CREATE",
268-
f"Creating VM '{VM_NAME}' with --fast (trying multiple D8 sizes)...",
269-
)
270-
else:
271-
# Standard mode: single size
272-
sizes_to_try = [(VM_SIZE_STANDARD, 0.19)]
273-
log("CREATE", f"Creating VM '{VM_NAME}' ({VM_SIZE_STANDARD}, $0.19/hr)...")
258+
# Try multiple D8 sizes with fallbacks (all 32GB, required for navi agent)
259+
sizes_to_try = VM_SIZE_FALLBACKS
260+
log(
261+
"CREATE",
262+
f"Creating VM '{VM_NAME}' (trying D8 sizes with fallbacks)...",
263+
)
274264

275265
# Try size+region combinations until one works
276266
vm_created = False
@@ -627,7 +617,6 @@ def cmd_pool_create(args):
627617
from openadapt_evals.infrastructure.pool import PoolManager
628618

629619
num_workers = getattr(args, "workers", 3)
630-
use_standard = getattr(args, "standard", False)
631620
auto_shutdown_hours = getattr(args, "auto_shutdown_hours", 4)
632621

633622
vm_manager = AzureVMManager(resource_group=RESOURCE_GROUP)
@@ -636,7 +625,6 @@ def cmd_pool_create(args):
636625
try:
637626
manager.create(
638627
workers=num_workers,
639-
fast=not use_standard,
640628
auto_shutdown_hours=auto_shutdown_hours,
641629
)
642630
return 0
@@ -1428,18 +1416,10 @@ def cmd_start(args):
14281416
# - Downloads Windows 11 Enterprise if not present
14291417
# - Boots QEMU VM
14301418
# - Runs WAA server automatically via FirstLogonCommands
1431-
# QEMU resource allocation (--fast uses more resources on D8ds_v5)
1432-
if getattr(args, "fast", False):
1433-
ram_size = "16G"
1434-
cpu_cores = 6
1435-
log(
1436-
"START",
1437-
"Starting container with VERSION=11e (FAST mode: 6 cores, 16GB RAM)...",
1438-
)
1439-
else:
1440-
ram_size = "8G"
1441-
cpu_cores = 4
1442-
log("START", "Starting container with VERSION=11e...")
1419+
# QEMU resource allocation — always use 16G/6 cores (D8ds_v5, 32GB host)
1420+
ram_size = "16G"
1421+
cpu_cores = 6
1422+
log("START", "Starting container with VERSION=11e (6 cores, 16GB RAM)...")
14431423

14441424
# Get agent and model from args (defaults match WAA defaults)
14451425
getattr(args, "agent", "navi")
@@ -1568,11 +1548,10 @@ def cmd_test_golden_image(args):
15681548
# Start container from golden image (NOT fresh)
15691549
log("TEST", "Starting container from golden image...")
15701550

1571-
# Use fast mode for quicker boot
1572-
ram_size = "16G" if args.fast else "8G"
1573-
cpu_cores = 6 if args.fast else 4
1574-
mode_str = "FAST mode" if args.fast else "standard mode"
1575-
log("TEST", f" Using {mode_str}: {cpu_cores} cores, {ram_size} RAM")
1551+
# 16GB RAM / 6 cores for D8ds_v5 VM
1552+
ram_size = "16G"
1553+
cpu_cores = 6
1554+
log("TEST", f" Using {cpu_cores} cores, {ram_size} RAM")
15761555

15771556
docker_cmd = f"""docker run -d \\
15781557
--name winarena \\
@@ -1923,7 +1902,6 @@ class FakeArgs:
19231902
log("TEST-ALL", "-" * 30)
19241903

19251904
class FakeArgs2:
1926-
fast = getattr(args, "fast", False)
19271905
timeout = 120
19281906

19291907
results["golden_image"] = cmd_test_golden_image(FakeArgs2()) == 0
@@ -2091,8 +2069,10 @@ def cmd_run(args):
20912069
task_info.append(f"task={task}")
20922070
elif domain != "all":
20932071
task_info.append(f"domain={domain}")
2094-
else:
2072+
elif args.num_tasks:
20952073
task_info.append(f"{args.num_tasks} task(s)")
2074+
else:
2075+
task_info.append("all tasks")
20962076

20972077
log("RUN", f"Starting benchmark: {', '.join(task_info)}, model={model}")
20982078

@@ -2127,7 +2107,7 @@ def cmd_run(args):
21272107
print(f"ERROR: Task {{task_id}} not found in test_all.json")
21282108
sys.exit(1)
21292109
FINDEOF
2130-
python3 /tmp/find_task.py"""
2110+
python3 /tmp/find_task.py && """
21312111
run_args.append("--test_all_meta_path evaluation_examples_windows/test_custom.json")
21322112
pre_cmd = create_custom_test_cmd
21332113
elif args.num_tasks and args.num_tasks < 154:
@@ -4341,13 +4321,11 @@ def cmd_run_azure_ml_auto(args):
43414321
probe_timeout = getattr(args, "probe_timeout", 1800) # 30 min for WAA server
43424322
skip_upload = getattr(args, "skip_upload", False)
43434323
skip_benchmark = getattr(args, "skip_benchmark", False)
4344-
fast_vm = getattr(args, "fast", False)
43454324

43464325
log("AUTO", "Configuration:")
43474326
log("AUTO", f" Workers: {num_workers}")
43484327
log("AUTO", f" Setup timeout: {timeout_minutes} min")
43494328
log("AUTO", f" Probe timeout: {probe_timeout} sec")
4350-
log("AUTO", f" Fast VM: {fast_vm}")
43514329
log("AUTO", "")
43524330

43534331
# =========================================================================
@@ -4390,7 +4368,6 @@ def cmd_run_azure_ml_auto(args):
43904368

43914369
# Build args for cmd_create
43924370
class CreateArgs:
4393-
fast = fast_vm
43944371
workers = 1
43954372

43964373
result = cmd_create(CreateArgs())
@@ -4447,8 +4424,8 @@ class CreateArgs:
44474424
ssh_run(ip, "docker stop winarena 2>/dev/null; docker rm -f winarena 2>/dev/null")
44484425

44494426
# Start container with VERSION=11e
4450-
ram_size = "16G" if fast_vm else "8G"
4451-
cpu_cores = 6 if fast_vm else 4
4427+
ram_size = "16G"
4428+
cpu_cores = 6
44524429

44534430
docker_cmd = f"""docker run -d \\
44544431
--name winarena \\
@@ -7340,11 +7317,6 @@ def main():
73407317

73417318
# create
73427319
p_create = subparsers.add_parser("create", help="Create Azure VM")
7343-
p_create.add_argument(
7344-
"--fast",
7345-
action="store_true",
7346-
help="Use larger VM (D8ds_v5, $0.38/hr) for ~30%% faster install, ~40%% faster eval",
7347-
)
73487320
p_create.add_argument(
73497321
"--workers",
73507322
type=int,
@@ -7390,15 +7362,6 @@ def main():
73907362
default=3,
73917363
help="Number of worker VMs to create (default: 3)",
73927364
)
7393-
p_pool_create.add_argument(
7394-
"--fast",
7395-
action="store_true",
7396-
default=True,
7397-
help="Use D8 (8 vCPU) VMs for faster evaluation (default: True)",
7398-
)
7399-
p_pool_create.add_argument(
7400-
"--standard", action="store_true", help="Use D4 (4 vCPU) VMs to save costs"
7401-
)
74027365
p_pool_create.add_argument(
74037366
"--auto-shutdown-hours",
74047367
type=int,
@@ -7527,11 +7490,6 @@ def main():
75277490
"--fresh", action="store_true", help="Clean storage for fresh Windows install"
75287491
)
75297492
p_start.add_argument("--no-vnc", action="store_true", help="Don't auto-launch VNC viewer")
7530-
p_start.add_argument(
7531-
"--fast",
7532-
action="store_true",
7533-
help="Allocate more CPU/RAM to QEMU (use with D8ds_v5 VM)",
7534-
)
75357493
p_start.set_defaults(func=cmd_start)
75367494

75377495
# stop
@@ -7543,7 +7501,7 @@ def main():
75437501
p_probe = subparsers.add_parser("probe", help="Check if WAA server is ready")
75447502
p_probe.add_argument("--wait", action="store_true", help="Wait until ready")
75457503
p_probe.add_argument(
7546-
"--timeout", type=int, default=1200, help="Timeout in seconds (default: 1200)"
7504+
"--timeout", type=int, default=1800, help="Timeout in seconds (default: 1800)"
75477505
)
75487506
p_probe.set_defaults(func=cmd_probe)
75497507

@@ -7557,11 +7515,6 @@ def main():
75577515
default=180,
75587516
help="Max wait time in seconds (default: 180)",
75597517
)
7560-
p_test_golden.add_argument(
7561-
"--fast",
7562-
action="store_true",
7563-
help="Use more CPU/RAM for faster boot (requires D8ds_v5 VM)",
7564-
)
75657518
p_test_golden.set_defaults(func=cmd_test_golden_image)
75667519

75677520
# test-blob-access
@@ -7584,20 +7537,15 @@ def main():
75847537
"test-all", help="Run all pre-flight tests before Azure ML benchmark"
75857538
)
75867539
p_test_all.add_argument("--api-key", help="OpenAI API key (or set OPENAI_API_KEY in .env)")
7587-
p_test_all.add_argument(
7588-
"--fast",
7589-
action="store_true",
7590-
help="Use more CPU/RAM for faster boot (requires D8ds_v5 VM)",
7591-
)
75927540
p_test_all.set_defaults(func=cmd_test_all)
75937541

75947542
# run
75957543
p_run = subparsers.add_parser("run", help="Run benchmark tasks (uses vanilla WAA navi agent)")
75967544
p_run.add_argument(
75977545
"--num-tasks",
75987546
type=int,
7599-
default=1,
7600-
help="Number of tasks to run (ignored if --task specified)",
7547+
default=None,
7548+
help="Number of tasks to run (default: all; ignored if --task specified)",
76017549
)
76027550
p_run.add_argument("--task", help="Specific task ID to run")
76037551
p_run.add_argument(
@@ -7812,11 +7760,6 @@ def main():
78127760
default=1800,
78137761
help="WAA server probe timeout in seconds (default: 1800 = 30 min)",
78147762
)
7815-
p_azure_ml_auto.add_argument(
7816-
"--fast",
7817-
action="store_true",
7818-
help="Use larger VM (D8ds_v5, $0.38/hr) for faster setup",
7819-
)
78207763
p_azure_ml_auto.add_argument(
78217764
"--skip-upload",
78227765
action="store_true",

openadapt_evals/infrastructure/azure_vm.py

Lines changed: 6 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -56,10 +56,10 @@
5656
"ServerAliveCountMax=10",
5757
]
5858

59-
# VM size constants
60-
VM_SIZE_STANDARD = "Standard_D4ds_v4"
61-
VM_SIZE_FAST = "Standard_D8ds_v5"
62-
VM_SIZE_FAST_FALLBACKS = [
59+
# VM size: D8ds_v5 ($0.38/hr, 8 vCPU, 32GB RAM)
60+
# D4ds_v4 (16GB) OOMs with navi agent's GroundingDINO + SoM models — do not use.
61+
VM_SIZE = "Standard_D8ds_v5"
62+
VM_SIZE_FALLBACKS = [
6363
("Standard_D8ds_v5", 0.38),
6464
("Standard_D8s_v5", 0.36),
6565
("Standard_D8ds_v4", 0.38),
@@ -367,28 +367,19 @@ def _cli_set_auto_shutdown(self, name: str, hours: int) -> bool:
367367
)
368368
return result.returncode == 0
369369

370-
def find_available_size_and_region(
371-
self,
372-
fast: bool = True,
373-
) -> tuple[str, str, float]:
370+
def find_available_size_and_region(self) -> tuple[str, str, float]:
374371
"""Find a working VM size and region by creating a test VM.
375372
376373
Tries size/region combinations until one succeeds, then cleans up
377374
the test VM.
378375
379-
Args:
380-
fast: If True, try D8 sizes first. If False, use standard D4.
381-
382376
Returns:
383377
Tuple of (vm_size, region, cost_per_hour).
384378
385379
Raises:
386380
RuntimeError: If no available size/region found.
387381
"""
388-
if fast:
389-
sizes_to_try = VM_SIZE_FAST_FALLBACKS
390-
else:
391-
sizes_to_try = [(VM_SIZE_STANDARD, 0.19)]
382+
sizes_to_try = VM_SIZE_FALLBACKS
392383

393384
test_vm_to_cleanup = None
394385
try:

openadapt_evals/infrastructure/pool.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,6 @@ def _log(self, step: str, message: str, end: str = "\n") -> None:
173173
def create(
174174
self,
175175
workers: int = 3,
176-
fast: bool = True,
177176
auto_shutdown_hours: int = 4,
178177
) -> VMPool:
179178
"""Create a pool of VMs for parallel WAA evaluation.
@@ -182,7 +181,6 @@ def create(
182181
183182
Args:
184183
workers: Number of worker VMs to create.
185-
fast: If True, use D8 VM sizes. If False, use standard D4.
186184
auto_shutdown_hours: Hours until auto-shutdown (safety net).
187185
188186
Returns:
@@ -199,9 +197,7 @@ def create(
199197

200198
# Find available size/region
201199
self._log("POOL", "Finding available region and VM size...")
202-
vm_size, region, cost = self.vm_manager.find_available_size_and_region(
203-
fast=fast,
204-
)
200+
vm_size, region, cost = self.vm_manager.find_available_size_and_region()
205201
self._log("POOL", f"Using {vm_size} (${cost:.2f}/hr) in {region}")
206202

207203
if auto_shutdown_hours > 0:

0 commit comments

Comments
 (0)