|
| 1 | +"""QEMU Monitor Reset Manager for Windows VMs. |
| 2 | +
|
| 3 | +Provides reliable Windows restart inside QEMU (dockur/windows Docker image) |
| 4 | +by sending ``system_reset`` via the QEMU monitor telnet interface on port 7100. |
| 5 | +
|
| 6 | +The WAA Flask server running inside Windows dies when you send |
| 7 | +``shutdown /r /t 0`` through the ``/execute`` endpoint, making that approach |
| 8 | +unreliable. Sending ``system_reset`` through the QEMU monitor is a hard |
| 9 | +reset that works regardless of the guest OS state. |
| 10 | +
|
| 11 | +Architecture:: |
| 12 | +
|
| 13 | + Local machine |
| 14 | + --> SSH --> Azure VM (Ubuntu host) |
| 15 | + --> docker exec winarena |
| 16 | + --> echo "system_reset" | nc -q1 localhost 7100 |
| 17 | + (QEMU monitor telnet on port 7100) |
| 18 | +
|
| 19 | +After reset, the container's ``entry_setup.sh`` automatically polls |
| 20 | +``172.30.0.2:5000/probe`` and the WAA Flask server comes back up in |
| 21 | +~90 seconds. |
| 22 | +
|
| 23 | +Usage:: |
| 24 | +
|
| 25 | + from openadapt_evals.infrastructure.qemu_reset import QEMUResetManager |
| 26 | +
|
| 27 | + mgr = QEMUResetManager(vm_ip="172.173.66.131") |
| 28 | +
|
| 29 | + # Full restart: send reset + wait for WAA server |
| 30 | + success, message = mgr.restart_windows() |
| 31 | +
|
| 32 | + # Or do each step separately |
| 33 | + mgr.reset_windows() |
| 34 | + mgr.wait_for_waa_ready() |
| 35 | +""" |
| 36 | + |
| 37 | +from __future__ import annotations |
| 38 | + |
| 39 | +import logging |
| 40 | +import subprocess |
| 41 | +import time |
| 42 | + |
| 43 | +import requests |
| 44 | + |
| 45 | +logger = logging.getLogger(__name__) |
| 46 | + |
| 47 | +# SSH options consistent with the rest of the codebase |
| 48 | +_SSH_OPTS = [ |
| 49 | + "-o", "StrictHostKeyChecking=no", |
| 50 | + "-o", "UserKnownHostsFile=/dev/null", |
| 51 | + "-o", "LogLevel=ERROR", |
| 52 | + "-o", "ConnectTimeout=10", |
| 53 | +] |
| 54 | + |
| 55 | + |
| 56 | +class QEMUResetManager: |
| 57 | + """Manage Windows restarts via QEMU monitor inside a Docker container. |
| 58 | +
|
| 59 | + Attributes: |
| 60 | + vm_ip: IP address of the Azure Ubuntu VM hosting the Docker container. |
| 61 | + ssh_user: SSH user for the VM (default ``azureuser``). |
| 62 | + qemu_monitor_port: QEMU monitor telnet port inside the container (default 7100). |
| 63 | + container_name: Docker container name (default ``winarena``). |
| 64 | + timeout_seconds: Maximum seconds to wait for the WAA server after reset. |
| 65 | + """ |
| 66 | + |
| 67 | + def __init__( |
| 68 | + self, |
| 69 | + vm_ip: str, |
| 70 | + ssh_user: str = "azureuser", |
| 71 | + qemu_monitor_port: int = 7100, |
| 72 | + container_name: str = "winarena", |
| 73 | + timeout_seconds: int = 300, |
| 74 | + ) -> None: |
| 75 | + self.vm_ip = vm_ip |
| 76 | + self.ssh_user = ssh_user |
| 77 | + self.qemu_monitor_port = qemu_monitor_port |
| 78 | + self.container_name = container_name |
| 79 | + self.timeout_seconds = timeout_seconds |
| 80 | + |
| 81 | + def reset_windows(self) -> bool: |
| 82 | + """Send ``system_reset`` via the QEMU monitor over SSH. |
| 83 | +
|
| 84 | + Executes:: |
| 85 | +
|
| 86 | + ssh {user}@{ip} "docker exec {container} bash -c |
| 87 | + 'echo system_reset | nc -q1 localhost {port}'" |
| 88 | +
|
| 89 | + Returns: |
| 90 | + True if the SSH + docker exec command succeeded (exit code 0). |
| 91 | + """ |
| 92 | + docker_cmd = ( |
| 93 | + f"docker exec {self.container_name} bash -c " |
| 94 | + f"'echo system_reset | nc -q1 localhost {self.qemu_monitor_port}'" |
| 95 | + ) |
| 96 | + ssh_cmd = [ |
| 97 | + "ssh", |
| 98 | + *_SSH_OPTS, |
| 99 | + f"{self.ssh_user}@{self.vm_ip}", |
| 100 | + docker_cmd, |
| 101 | + ] |
| 102 | + |
| 103 | + logger.info( |
| 104 | + "Sending system_reset via QEMU monitor (port %d) on %s", |
| 105 | + self.qemu_monitor_port, |
| 106 | + self.vm_ip, |
| 107 | + ) |
| 108 | + |
| 109 | + try: |
| 110 | + result = subprocess.run( |
| 111 | + ssh_cmd, |
| 112 | + capture_output=True, |
| 113 | + timeout=30, |
| 114 | + ) |
| 115 | + except subprocess.TimeoutExpired: |
| 116 | + logger.error("SSH command timed out sending system_reset") |
| 117 | + return False |
| 118 | + |
| 119 | + if result.returncode != 0: |
| 120 | + stderr = result.stderr.decode("utf-8", errors="replace").strip() |
| 121 | + logger.error( |
| 122 | + "QEMU monitor reset failed (rc=%d): %s", |
| 123 | + result.returncode, |
| 124 | + stderr, |
| 125 | + ) |
| 126 | + return False |
| 127 | + |
| 128 | + logger.info("QEMU system_reset sent successfully") |
| 129 | + return True |
| 130 | + |
| 131 | + def wait_for_waa_ready( |
| 132 | + self, |
| 133 | + server_url: str = "http://localhost:5001", |
| 134 | + check_interval: int = 10, |
| 135 | + ) -> bool: |
| 136 | + """Poll the WAA ``/probe`` endpoint until it responds or timeout. |
| 137 | +
|
| 138 | + Args: |
| 139 | + server_url: Base URL of the WAA server (through SSH tunnel). |
| 140 | + check_interval: Seconds between probe attempts. |
| 141 | +
|
| 142 | + Returns: |
| 143 | + True if the server responded within ``timeout_seconds``, False on timeout. |
| 144 | + """ |
| 145 | + probe_url = f"{server_url}/probe" |
| 146 | + deadline = time.time() + self.timeout_seconds |
| 147 | + start = time.time() |
| 148 | + |
| 149 | + logger.info( |
| 150 | + "Waiting up to %ds for WAA server at %s", |
| 151 | + self.timeout_seconds, |
| 152 | + probe_url, |
| 153 | + ) |
| 154 | + |
| 155 | + while time.time() < deadline: |
| 156 | + elapsed = int(time.time() - start) |
| 157 | + try: |
| 158 | + resp = requests.get(probe_url, timeout=check_interval) |
| 159 | + if resp.ok: |
| 160 | + logger.info("WAA server ready after %ds", elapsed) |
| 161 | + return True |
| 162 | + except (requests.ConnectionError, requests.Timeout): |
| 163 | + pass |
| 164 | + |
| 165 | + remaining = int(deadline - time.time()) |
| 166 | + if remaining > 0: |
| 167 | + logger.info( |
| 168 | + "[%ds] WAA not ready yet, retrying in %ds (%ds remaining)...", |
| 169 | + elapsed, |
| 170 | + check_interval, |
| 171 | + remaining, |
| 172 | + ) |
| 173 | + time.sleep(check_interval) |
| 174 | + |
| 175 | + elapsed = int(time.time() - start) |
| 176 | + logger.error("WAA server did not become ready within %ds", elapsed) |
| 177 | + return False |
| 178 | + |
| 179 | + def restart_windows( |
| 180 | + self, |
| 181 | + server_url: str = "http://localhost:5001", |
| 182 | + ) -> tuple[bool, str]: |
| 183 | + """Full restart cycle: send QEMU reset then wait for WAA readiness. |
| 184 | +
|
| 185 | + Args: |
| 186 | + server_url: Base URL of the WAA server (through SSH tunnel). |
| 187 | +
|
| 188 | + Returns: |
| 189 | + Tuple of (success, message) where *success* is True if the |
| 190 | + server came back within the timeout. |
| 191 | + """ |
| 192 | + if not self.reset_windows(): |
| 193 | + return False, "Failed to send system_reset via QEMU monitor" |
| 194 | + |
| 195 | + logger.info("Reset sent, waiting for WAA server to come back...") |
| 196 | + |
| 197 | + if self.wait_for_waa_ready(server_url=server_url): |
| 198 | + return True, "Windows restarted and WAA server is ready" |
| 199 | + |
| 200 | + return False, f"WAA server did not come back within {self.timeout_seconds}s" |
| 201 | + |
| 202 | + def is_qemu_monitor_reachable(self) -> bool: |
| 203 | + """Check whether the QEMU monitor telnet port is reachable inside the container. |
| 204 | +
|
| 205 | + This can be used to decide whether to fall back to ``docker restart``. |
| 206 | +
|
| 207 | + Returns: |
| 208 | + True if the QEMU monitor responds. |
| 209 | + """ |
| 210 | + docker_cmd = ( |
| 211 | + f"docker exec {self.container_name} bash -c " |
| 212 | + f"'echo info version | nc -q1 localhost {self.qemu_monitor_port}'" |
| 213 | + ) |
| 214 | + ssh_cmd = [ |
| 215 | + "ssh", |
| 216 | + *_SSH_OPTS, |
| 217 | + f"{self.ssh_user}@{self.vm_ip}", |
| 218 | + docker_cmd, |
| 219 | + ] |
| 220 | + |
| 221 | + try: |
| 222 | + result = subprocess.run( |
| 223 | + ssh_cmd, |
| 224 | + capture_output=True, |
| 225 | + timeout=15, |
| 226 | + ) |
| 227 | + stdout = result.stdout.decode("utf-8", errors="replace") |
| 228 | + reachable = result.returncode == 0 and "QEMU" in stdout |
| 229 | + logger.debug( |
| 230 | + "QEMU monitor reachable: %s (stdout: %s)", |
| 231 | + reachable, |
| 232 | + stdout.strip()[:100], |
| 233 | + ) |
| 234 | + return reachable |
| 235 | + except subprocess.TimeoutExpired: |
| 236 | + logger.debug("QEMU monitor reachability check timed out") |
| 237 | + return False |
0 commit comments