Skip to content

Commit 26dd267

Browse files
committed
feat: add retry logic for sandbox spawn failures
- SANDBOX_MAX_RETRIES env var (default: 3 retries)
- Exponential backoff: 1s, 2s, 4s delays
- Retries on transient sandbox spawn failures
- Configurable via Modal secret
1 parent be2e246 commit 26dd267

1 file changed

Lines changed: 29 additions & 2 deletions

File tree

runner/services/job_service.py

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import asyncio
12
import os
23
import time
34
from typing import Any
@@ -11,6 +12,8 @@
1112
HTTP_TIMEOUT_SECONDS,
1213
MAX_CONCURRENT_PER_REPO,
1314
RUNNER_GROUP_ID,
15+
SANDBOX_MAX_RETRIES,
16+
SANDBOX_RETRY_BACKOFF,
1417
get_gpu_config,
1518
)
1619
from runner.core.github import call_github_api
@@ -272,12 +275,36 @@ async def handle_queued_job(
272275
repo_url, repo_full_name, job_id, runner_labels
273276
)
274277

275-
# Spawn sandbox
278+
# Spawn sandbox with retry logic for transient failures
276279
logger.info(
277280
"Spawning sandbox",
278281
extra={"job_id": job_id, "repo": repo_full_name, "gpu": gpu_config is not None},
279282
)
280-
await spawn_sandbox(app, jit_config, job_id, gpu_config)
283+
sandbox_spawned = False
284+
for attempt in range(1, SANDBOX_MAX_RETRIES + 2): # +1 for initial attempt
285+
try:
286+
await spawn_sandbox(app, jit_config, job_id, gpu_config)
287+
sandbox_spawned = True
288+
break
289+
except Exception as e:
290+
if attempt <= SANDBOX_MAX_RETRIES:
291+
delay = SANDBOX_RETRY_BACKOFF[min(attempt - 1, len(SANDBOX_RETRY_BACKOFF) - 1)]
292+
logger.warning(
293+
"Sandbox spawn failed, retrying",
294+
extra={
295+
"job_id": job_id,
296+
"repo": repo_full_name,
297+
"attempt": attempt,
298+
"delay": delay,
299+
"error": type(e).__name__,
300+
},
301+
)
302+
await asyncio.sleep(delay)
303+
else:
304+
raise
305+
306+
if not sandbox_spawned:
307+
raise RuntimeError("Failed to spawn sandbox after retries")
281308

282309
# Only mark as processed after successful sandbox provisioning
283310
mark_job_processed(job_id)

0 commit comments

Comments
 (0)