added systemd oom handling and tests

mishushakov · mishushakov · commit 2f588c615fee · 2026-03-18T16:43:19.000+01:00
diff --git a/js/tests/systemd.test.ts b/js/tests/systemd.test.ts
@@ -0,0 +1,67 @@
+import { expect } from 'vitest'
+import { sandboxTest, wait } from './setup'
+
+// Polls the /health endpoint on port 49999 from inside the sandbox.
+// Resolves true on the first HTTP 200, or false once maxRetries polls have failed.
+async function waitForHealth(sandbox: any, maxRetries = 10, intervalMs = 100) {
+  const probe =
+    'curl -s -o /dev/null -w "%{http_code}" http://0.0.0.0:49999/health'
+  for (let attempt = 0; attempt < maxRetries; attempt += 1) {
+    try {
+      const { stdout } = await sandbox.commands.run(probe)
+      if (stdout.trim() === '200') return true
+    } catch {
+      // The command failed (e.g. connection refused); poll again
+    }
+    await wait(intervalMs)
+  }
+  return false
+}
+
+// Scenario: SIGKILL the Jupyter server, then check that the sandbox becomes
+// healthy again and can still execute code.
+sandboxTest('restart after jupyter kill', async ({ sandbox }) => {
+  // Precondition: the health endpoint answers before anything is killed
+  const healthyAtStart = await waitForHealth(sandbox)
+  expect(healthyAtStart).toBe(true)
+
+  // Run the kill as root. The call may reject: killing jupyter cascades to
+  // code-interpreter and can take the command handle down with it.
+  const killJupyter = "kill -9 $(pgrep -f 'jupyter server')"
+  try {
+    await sandbox.commands.run(killJupyter, { user: 'root' })
+  } catch {
+    // Tolerated — the kill cascade may terminate the command handle
+  }
+
+  // Poll (60 tries, 500 ms apart) while systemd restarts both services
+  const healthyAgain = await waitForHealth(sandbox, 60, 500)
+  expect(healthyAgain).toBe(true)
+
+  // A trivial cell must evaluate correctly after recovery
+  const execution = await sandbox.runCode('x = 1; x')
+  expect(execution.text).toEqual('1')
+})
+
+// Scenario: SIGKILL code-interpreter (uvicorn) and expect it to recover.
+sandboxTest('restart after code-interpreter kill', async ({ sandbox }) => {
+  // Precondition: the health endpoint answers before anything is killed
+  const healthyAtStart = await waitForHealth(sandbox)
+  expect(healthyAtStart).toBe(true)
+
+  // Run the kill as root; the call may reject if the command handle dies too
+  const killServer = "kill -9 $(pgrep -f 'uvicorn main:app')"
+  try {
+    await sandbox.commands.run(killServer, { user: 'root' })
+  } catch {
+    // Tolerated — killing code-interpreter may terminate the command handle
+  }
+
+  // Poll (60 tries, 500 ms apart) while systemd restarts the service
+  const healthyAgain = await waitForHealth(sandbox, 60, 500)
+  expect(healthyAgain).toBe(true)
+
+  // A trivial cell must evaluate correctly after recovery
+  const execution = await sandbox.runCode('x = 1; x')
+  expect(execution.text).toEqual('1')
+})
diff --git a/python/tests/async/test_async_systemd.py b/python/tests/async/test_async_systemd.py
@@ -0,0 +1,59 @@
+import asyncio
+
+from e2b_code_interpreter.code_interpreter_async import AsyncSandbox
+
+
+async def wait_for_health(sandbox: AsyncSandbox, max_retries=10, interval_ms=100):
+    for _ in range(max_retries):
+        try:
+            result = await sandbox.commands.run(
+                'curl -s -o /dev/null -w "%{http_code}" http://0.0.0.0:49999/health'
+            )
+            if result.stdout.strip() == "200":
+                return True
+        except Exception:
+            pass
+        await asyncio.sleep(interval_ms / 1000)
+    return False
+
+
+async def test_restart_after_jupyter_kill(async_sandbox: AsyncSandbox):
+    # Verify health is up initially
+    assert await wait_for_health(async_sandbox)
+
+    # Kill the jupyter process as root
+    # The command handle may get killed too (killing jupyter cascades to code-interpreter),
+    # so we catch the error.
+    try:
+        await async_sandbox.commands.run(
+            "kill -9 $(pgrep -f 'jupyter server')", user="root"
+        )
+    except Exception:
+        pass
+
+    # Wait for systemd to restart both services
+    assert await wait_for_health(async_sandbox, 60, 500)
+
+    # Verify code execution works after recovery
+    result = await async_sandbox.run_code("x = 1; x")
+    assert result.text == "1"
+
+
+async def test_restart_after_code_interpreter_kill(async_sandbox: AsyncSandbox):
+    # Verify health is up initially
+    assert await wait_for_health(async_sandbox)
+
+    # Kill the code-interpreter process as root
+    try:
+        await async_sandbox.commands.run(
+            "kill -9 $(pgrep -f 'uvicorn main:app')", user="root"
+        )
+    except Exception:
+        pass
+
+    # Wait for systemd to restart it and health to come back
+    assert await wait_for_health(async_sandbox, 60, 500)
+
+    # Verify code execution works after recovery
+    result = await async_sandbox.run_code("x = 1; x")
+    assert result.text == "1"
diff --git a/python/tests/sync/test_systemd.py b/python/tests/sync/test_systemd.py
@@ -0,0 +1,57 @@
+import time
+
+from e2b_code_interpreter.code_interpreter_sync import Sandbox
+
+
+def wait_for_health(sandbox: Sandbox, max_retries=10, interval_ms=100):
+    for _ in range(max_retries):
+        try:
+            result = sandbox.commands.run(
+                'curl -s -o /dev/null -w "%{http_code}" http://0.0.0.0:49999/health'
+            )
+            if result.stdout.strip() == "200":
+                return True
+        except Exception:
+            pass
+        time.sleep(interval_ms / 1000)
+    return False
+
+
+def test_restart_after_jupyter_kill(sandbox: Sandbox):
+    # Verify health is up initially
+    assert wait_for_health(sandbox)
+
+    # Kill the jupyter process as root
+    # The command handle may get killed too (killing jupyter cascades to code-interpreter),
+    # so we catch the error.
+    try:
+        sandbox.commands.run("kill -9 $(pgrep -f 'jupyter server')", user="root")
+    except Exception:
+        pass
+
+    # Wait for systemd to restart both services
+    assert wait_for_health(sandbox, 60, 500)
+
+    # Verify code execution works after recovery
+    result = sandbox.run_code("x = 1; x")
+    assert result.text == "1"
+
+
+def test_restart_after_code_interpreter_kill(sandbox: Sandbox):
+    # Verify health is up initially
+    assert wait_for_health(sandbox)
+
+    # Kill the code-interpreter process as root
+    try:
+        sandbox.commands.run(
+            "kill -9 $(pgrep -f 'uvicorn main:app')", user="root"
+        )
+    except Exception:
+        pass
+
+    # Wait for systemd to restart it and health to come back
+    assert wait_for_health(sandbox, 60, 500)
+
+    # Verify code execution works after recovery
+    result = sandbox.run_code("x = 1; x")
+    assert result.text == "1"
diff --git a/template/jupyter-healthcheck.sh b/template/jupyter-healthcheck.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Health check for Jupyter Server: polls /api/status until it returns HTTP 200.
+# Exits 0 once healthy, 1 after MAX_RETRIES unsuccessful probes.
+
+MAX_RETRIES=50
+RETRY_INTERVAL=0.2
+
+for i in $(seq 1 "$MAX_RETRIES"); do
+    # --max-time bounds each probe so an accepted-but-stalled connection cannot hang the loop
+    status_code=$(curl -s -o /dev/null --max-time 2 -w "%{http_code}" "http://localhost:8888/api/status")
+    # String comparison: an empty or unexpected status just counts as "not healthy yet"
+    if [ "$status_code" = "200" ]; then
+        echo "Jupyter Server is healthy"
+        exit 0
+    fi
+    if [ $((i % 10)) -eq 0 ]; then
+        echo "Waiting for Jupyter Server to become healthy... (attempt $i/$MAX_RETRIES)"
+    fi
+    sleep "$RETRY_INTERVAL"
+done
+
+echo "Jupyter Server health check failed after $MAX_RETRIES attempts"
+exit 1
diff --git a/template/start-up.sh b/template/start-up.sh
@@ -1,22 +1,16 @@
 #!/bin/bash
 
-function start_jupyter_server() {
-	counter=0
-	response=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8888/api/status")
-	while [[ ${response} -ne 200 ]]; do
-		let counter++
-		if ((counter % 20 == 0)); then
-			echo "Waiting for Jupyter Server to start..."
-			sleep 0.1
-		fi
-
-		response=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8888/api/status")
-	done
+function start_code_interpreter() {
+	# Gate uvicorn on Jupyter's health check. This function is launched with `&`, so exit ends only its own subshell.
+	if ! /root/.jupyter/jupyter-healthcheck.sh; then
+		echo "Jupyter Server failed to start, aborting."
+		exit 1
+	fi
 
 	cd /root/.server/
 	.venv/bin/uvicorn main:app --host 0.0.0.0 --port 49999 --workers 1 --no-access-log --no-use-colors --timeout-keep-alive 640
 }
 
 echo "Starting Code Interpreter server..."
-start_jupyter_server &
+start_code_interpreter &
 MATPLOTLIBRC=/root/.config/matplotlib/.matplotlibrc jupyter server --IdentityProvider.token="" >/dev/null 2>&1
diff --git a/template/systemd/code-interpreter.service b/template/systemd/code-interpreter.service
@@ -0,0 +1,13 @@
+[Unit]
+Description=Code Interpreter Server
+Documentation=https://github.com/e2b-dev/code-interpreter
+Requires=jupyter.service
+After=jupyter.service
+BindsTo=jupyter.service
+# Runs uvicorn on port 49999; systemd restarts it 2s after any exit. BindsTo= also stops it whenever jupyter.service stops.
+[Service]
+Type=simple
+WorkingDirectory=/root/.server
+ExecStart=/root/.server/.venv/bin/uvicorn main:app --host 0.0.0.0 --port 49999 --workers 1 --no-access-log --no-use-colors --timeout-keep-alive 640
+Restart=always
+RestartSec=2
diff --git a/template/systemd/jupyter.service b/template/systemd/jupyter.service
@@ -0,0 +1,15 @@
+[Unit]
+Description=Jupyter Server
+Documentation=https://jupyter-server.readthedocs.io
+# code-interpreter.service is ordered After= this unit, so ExecStartPost only queues it (--no-block); a blocking start would wait on this unit's own start job.
+[Service]
+Type=simple
+Environment=MATPLOTLIBRC=/root/.config/matplotlib/.matplotlibrc
+Environment=JUPYTER_CONFIG_PATH=/root/.jupyter
+ExecStart=/usr/local/bin/jupyter server --IdentityProvider.token=""
+ExecStartPost=/root/.jupyter/jupyter-healthcheck.sh
+ExecStartPost=-/usr/bin/systemctl start --no-block code-interpreter
+Restart=on-failure
+RestartSec=2
+StandardOutput=null
+StandardError=journal
diff --git a/template/template.py b/template/template.py
@@ -110,14 +110,30 @@ def make_template(
     # Copy configuration files
     template = (
         template.copy("matplotlibrc", ".config/matplotlib/.matplotlibrc")
-        .copy("start-up.sh", ".jupyter/start-up.sh")
-        .run_cmd("chmod +x .jupyter/start-up.sh")
+        .copy("jupyter-healthcheck.sh", ".jupyter/jupyter-healthcheck.sh")
+        .run_cmd("chmod +x .jupyter/jupyter-healthcheck.sh")
         .copy("jupyter_server_config.py", ".jupyter/")
         .make_dir(".ipython/profile_default/startup")
         .copy("ipython_kernel_config.py", ".ipython/profile_default/")
         .copy("startup_scripts", ".ipython/profile_default/startup")
     )
 
+    if not is_docker:
+        template = (
+            template.copy(
+                "systemd/jupyter.service", "/etc/systemd/system/jupyter.service"
+            )
+            .copy(
+                "systemd/code-interpreter.service",
+                "/etc/systemd/system/code-interpreter.service",
+            )
+            .run_cmd("systemctl daemon-reload")
+        )
+    else:
+        template = template.copy("start-up.sh", ".jupyter/start-up.sh").run_cmd(
+            "chmod +x .jupyter/start-up.sh"
+        )
+
     if is_docker:
         # create user user and /home/user
         template = template.run_cmd("useradd -m user")
@@ -130,6 +146,11 @@ def make_template(
 
     template = template.set_user("user").set_workdir("/home/user")
 
+    if is_docker:
+        start_cmd = "sudo /root/.jupyter/start-up.sh"
+    else:
+        start_cmd = "systemctl start jupyter"
+
     return template.set_start_cmd(
-        "sudo /root/.jupyter/start-up.sh", wait_for_url("http://localhost:49999/health")
+        start_cmd, wait_for_url("http://localhost:49999/health")
     )