| 1 | +import asyncio
1 | 2 | import os
2 | 3 | import time
3 | 4 | from typing import Any
|
11 | 12 |     HTTP_TIMEOUT_SECONDS,
12 | 13 |     MAX_CONCURRENT_PER_REPO,
13 | 14 |     RUNNER_GROUP_ID,
   | 15 | +    SANDBOX_MAX_RETRIES,
   | 16 | +    SANDBOX_RETRY_BACKOFF,
14 | 17 |     get_gpu_config,
15 | 18 | )
16 | 19 | from runner.core.github import call_github_api |
@@ -272,12 +275,30 @@ async def handle_queued_job(
272 | 275 |         repo_url, repo_full_name, job_id, runner_labels
273 | 276 |     )
274 | 277 |
275 |     | -    # Spawn sandbox
    | 278 | +    # Spawn sandbox with retry logic for transient failures
276 | 279 |     logger.info(
277 | 280 |         "Spawning sandbox",
278 | 281 |         extra={"job_id": job_id, "repo": repo_full_name, "gpu": gpu_config is not None},
279 | 282 |     )
280 |     | -    await spawn_sandbox(app, jit_config, job_id, gpu_config)
    | 283 | +    for attempt in range(1, SANDBOX_MAX_RETRIES + 2):  # +1 for the initial attempt
    | 284 | +        try:
    | 285 | +            await spawn_sandbox(app, jit_config, job_id, gpu_config)
    | 286 | +            break
    | 287 | +        except Exception as e:
    | 288 | +            if attempt > SANDBOX_MAX_RETRIES:
    | 289 | +                raise  # retries exhausted; surface the original error
    | 290 | +            delay = SANDBOX_RETRY_BACKOFF[min(attempt - 1, len(SANDBOX_RETRY_BACKOFF) - 1)]
    | 291 | +            logger.warning(
    | 292 | +                "Sandbox spawn failed, retrying",
    | 293 | +                extra={
    | 294 | +                    "job_id": job_id,
    | 295 | +                    "repo": repo_full_name,
    | 296 | +                    "attempt": attempt,
    | 297 | +                    "delay": delay,
    | 298 | +                    "error": type(e).__name__,
    | 299 | +                },
    | 300 | +            )
    | 301 | +            await asyncio.sleep(delay)
281 | 302 |
282 | 303 |     # Only mark as processed after successful sandbox provisioning
283 | 304 |     mark_job_processed(job_id)
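For context, the retry loop leans on two constants imported from the runner's config module. Here is a minimal sketch of what they might look like, assuming a list-of-seconds backoff schedule; the names come from this diff, but the values below are illustrative, not taken from the actual `runner.core.config`:

```python
# Illustrative values only -- the real SANDBOX_MAX_RETRIES and
# SANDBOX_RETRY_BACKOFF live in runner.core.config and are not shown here.
SANDBOX_MAX_RETRIES = 3             # retries after the initial attempt (4 tries total)
SANDBOX_RETRY_BACKOFF = [1, 5, 15]  # seconds to sleep before retry 1, 2, 3, ...
```

With these values the handler would call `spawn_sandbox` up to four times, sleeping 1 s, 5 s, and 15 s between failures. The `min(attempt - 1, len(SANDBOX_RETRY_BACKOFF) - 1)` index clamps to the last entry, so a retry budget longer than the backoff list keeps reusing the final delay, and the final failed attempt re-raises the original exception so the caller sees the real traceback rather than a generic wrapper error.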
|