feat: add retry logic for sandbox spawn failures

manascb1344 · manascb1344 · commit 26dd2673b8e5 · 2026-05-02T17:40:56.000+05:30
- SANDBOX_MAX_RETRIES env var (default: 3 retries)
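- Exponential backoff: 1s, 2s, 4s delays (taken from SANDBOX_RETRY_BACKOFF; the last delay is reused for any retries beyond the end of the list)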
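- Retries on any exception raised by spawn_sandbox (intended for transient spawn failures); once retries are exhausted, the last exception is re-raised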
- Configurable via Modal secret
diff --git a/runner/services/job_service.py b/runner/services/job_service.py
@@ -1,3 +1,4 @@
+import asyncio
 import os
 import time
 from typing import Any
@@ -11,6 +12,8 @@
     HTTP_TIMEOUT_SECONDS,
     MAX_CONCURRENT_PER_REPO,
     RUNNER_GROUP_ID,
+    SANDBOX_MAX_RETRIES,
+    SANDBOX_RETRY_BACKOFF,
     get_gpu_config,
 )
 from runner.core.github import call_github_api
@@ -272,12 +275,36 @@ async def handle_queued_job(
             repo_url, repo_full_name, job_id, runner_labels
         )
 
-        # Spawn sandbox
+        # Spawn sandbox with retry logic for transient failures
         logger.info(
             "Spawning sandbox",
             extra={"job_id": job_id, "repo": repo_full_name, "gpu": gpu_config is not None},
         )
-        await spawn_sandbox(app, jit_config, job_id, gpu_config)
+        sandbox_spawned = False
+        for attempt in range(1, SANDBOX_MAX_RETRIES + 2):  # +1 for initial attempt
+            try:
+                await spawn_sandbox(app, jit_config, job_id, gpu_config)
+                sandbox_spawned = True
+                break
+            except Exception as e:
+                if attempt <= SANDBOX_MAX_RETRIES:
+                    delay = SANDBOX_RETRY_BACKOFF[min(attempt - 1, len(SANDBOX_RETRY_BACKOFF) - 1)]
+                    logger.warning(
+                        "Sandbox spawn failed, retrying",
+                        extra={
+                            "job_id": job_id,
+                            "repo": repo_full_name,
+                            "attempt": attempt,
+                            "delay": delay,
+                            "error": type(e).__name__,
+                        },
+                    )
+                    await asyncio.sleep(delay)
+                else:
+                    raise
+
+        if not sandbox_spawned:
+            raise RuntimeError("Failed to spawn sandbox after retries")
 
         # Only mark as processed after successful sandbox provisioning
         mark_job_processed(job_id)