Skip to content

Commit 2f588c6

Browse files
committed
added systemd oom handling and tests
1 parent 5cb8bd5 commit 2f588c6

File tree

8 files changed

+265
-16
lines changed

8 files changed

+265
-16
lines changed

js/tests/systemd.test.ts

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
import { expect } from 'vitest'
2+
import { sandboxTest, wait } from './setup'
3+
4+
/**
 * Polls the health endpoint on port 49999 from inside the sandbox.
 *
 * Each attempt runs `curl` through `sandbox.commands.run` and succeeds when
 * the printed HTTP status code is exactly `200`. Any error thrown by the
 * command is treated as a failed attempt and retried.
 *
 * @param sandbox - sandbox whose `commands.run` is used to execute curl
 * @param maxRetries - maximum number of probes before giving up
 * @param intervalMs - pause between two consecutive probes, in milliseconds
 * @returns `true` as soon as a probe reports 200, `false` once all probes failed
 */
async function waitForHealth(sandbox: any, maxRetries = 10, intervalMs = 100) {
  for (let attempt = 0; attempt < maxRetries; attempt++) {
    // Pause only between attempts, so giving up after the last probe is not
    // delayed by one more pointless wait.
    if (attempt > 0) {
      await wait(intervalMs)
    }

    try {
      const result = await sandbox.commands.run(
        'curl -s -o /dev/null -w "%{http_code}" http://0.0.0.0:49999/health'
      )
      if (result.stdout.trim() === '200') {
        return true
      }
    } catch {
      // Connection refused or other error, retry
    }
  }
  return false
}
20+
21+
sandboxTest('restart after jupyter kill', async ({ sandbox }) => {
  // The service has to be healthy before we break anything.
  const healthyAtStart = await waitForHealth(sandbox)
  expect(healthyAtStart).toBe(true)

  // SIGKILL the jupyter process as root.
  // Killing jupyter cascades to code-interpreter, which may take the command
  // handle down with it, so a failure of this call is tolerated.
  const killJupyter = "kill -9 $(pgrep -f 'jupyter server')"
  try {
    await sandbox.commands.run(killJupyter, { user: 'root' })
  } catch {
    // Expected — the kill cascade may terminate the command handle
  }

  // Give systemd time to bring both services back up.
  const healthyAgain = await waitForHealth(sandbox, 60, 500)
  expect(healthyAgain).toBe(true)

  // Code execution must work again once the services recovered.
  const execution = await sandbox.runCode('x = 1; x')
  expect(execution.text).toEqual('1')
})
45+
46+
sandboxTest('restart after code-interpreter kill', async ({ sandbox }) => {
  // The service has to be healthy before we break anything.
  const healthyAtStart = await waitForHealth(sandbox)
  expect(healthyAtStart).toBe(true)

  // SIGKILL the code-interpreter (uvicorn) process as root.
  const killCodeInterpreter = "kill -9 $(pgrep -f 'uvicorn main:app')"
  try {
    await sandbox.commands.run(killCodeInterpreter, { user: 'root' })
  } catch {
    // Expected — killing code-interpreter may terminate the command handle
  }

  // Give systemd time to restart it and wait for health to come back.
  const healthyAgain = await waitForHealth(sandbox, 60, 500)
  expect(healthyAgain).toBe(true)

  // Code execution must work again once the service recovered.
  const execution = await sandbox.runCode('x = 1; x')
  expect(execution.text).toEqual('1')
})
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import asyncio
2+
3+
from e2b_code_interpreter.code_interpreter_async import AsyncSandbox
4+
5+
6+
async def wait_for_health(sandbox: AsyncSandbox, max_retries=10, interval_ms=100):
    """Poll the health endpoint on port 49999 from inside the sandbox.

    Each attempt runs ``curl`` through ``sandbox.commands.run`` and succeeds
    when the printed HTTP status code is exactly ``200``.

    Args:
        sandbox: Sandbox whose ``commands.run`` is used to execute curl.
        max_retries: Maximum number of probes before giving up.
        interval_ms: Pause between two consecutive probes, in milliseconds.

    Returns:
        True as soon as one probe reports 200, False once all probes failed.
    """
    for attempt in range(max_retries):
        # Pause only between attempts, so giving up after the last probe is
        # not delayed by one more pointless sleep.
        if attempt:
            await asyncio.sleep(interval_ms / 1000)

        try:
            result = await sandbox.commands.run(
                'curl -s -o /dev/null -w "%{http_code}" http://0.0.0.0:49999/health'
            )
        except Exception:
            # Deliberate best effort: any failure of the probe (e.g. the
            # service is not listening yet) just counts as a failed attempt.
            continue

        if result.stdout.strip() == "200":
            return True
    return False
18+
19+
20+
async def test_restart_after_jupyter_kill(async_sandbox: AsyncSandbox):
    """Health and code execution come back after the jupyter process is SIGKILLed."""
    # The service has to be healthy before we break anything.
    healthy_at_start = await wait_for_health(async_sandbox)
    assert healthy_at_start

    # SIGKILL the jupyter process as root.
    # Killing jupyter cascades to code-interpreter, which may take the command
    # handle down with it, so a failure of this call is tolerated.
    kill_jupyter = "kill -9 $(pgrep -f 'jupyter server')"
    try:
        await async_sandbox.commands.run(kill_jupyter, user="root")
    except Exception:
        pass

    # Give systemd time to bring both services back up.
    recovered = await wait_for_health(async_sandbox, 60, 500)
    assert recovered

    # Code execution must work again once the services recovered.
    execution = await async_sandbox.run_code("x = 1; x")
    assert execution.text == "1"
40+
41+
42+
async def test_restart_after_code_interpreter_kill(async_sandbox: AsyncSandbox):
    """Health and code execution come back after the uvicorn process is SIGKILLed."""
    # The service has to be healthy before we break anything.
    healthy_at_start = await wait_for_health(async_sandbox)
    assert healthy_at_start

    # SIGKILL the code-interpreter (uvicorn) process as root; the command
    # itself may fail as a result, which is tolerated.
    kill_code_interpreter = "kill -9 $(pgrep -f 'uvicorn main:app')"
    try:
        await async_sandbox.commands.run(kill_code_interpreter, user="root")
    except Exception:
        pass

    # Give systemd time to restart it and wait for health to come back.
    recovered = await wait_for_health(async_sandbox, 60, 500)
    assert recovered

    # Code execution must work again once the service recovered.
    execution = await async_sandbox.run_code("x = 1; x")
    assert execution.text == "1"

python/tests/sync/test_systemd.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import time
2+
3+
from e2b_code_interpreter.code_interpreter_sync import Sandbox
4+
5+
6+
def wait_for_health(sandbox: Sandbox, max_retries=10, interval_ms=100):
    """Poll the health endpoint on port 49999 from inside the sandbox.

    Each attempt runs ``curl`` through ``sandbox.commands.run`` and succeeds
    when the printed HTTP status code is exactly ``200``.

    Args:
        sandbox: Sandbox whose ``commands.run`` is used to execute curl.
        max_retries: Maximum number of probes before giving up.
        interval_ms: Pause between two consecutive probes, in milliseconds.

    Returns:
        True as soon as one probe reports 200, False once all probes failed.
    """
    for attempt in range(max_retries):
        # Pause only between attempts, so giving up after the last probe is
        # not delayed by one more pointless sleep.
        if attempt:
            time.sleep(interval_ms / 1000)

        try:
            result = sandbox.commands.run(
                'curl -s -o /dev/null -w "%{http_code}" http://0.0.0.0:49999/health'
            )
        except Exception:
            # Deliberate best effort: any failure of the probe (e.g. the
            # service is not listening yet) just counts as a failed attempt.
            continue

        if result.stdout.strip() == "200":
            return True
    return False
18+
19+
20+
def test_restart_after_jupyter_kill(sandbox: Sandbox):
    """Health and code execution come back after the jupyter process is SIGKILLed."""
    # The service has to be healthy before we break anything.
    healthy_at_start = wait_for_health(sandbox)
    assert healthy_at_start

    # SIGKILL the jupyter process as root.
    # Killing jupyter cascades to code-interpreter, which may take the command
    # handle down with it, so a failure of this call is tolerated.
    kill_jupyter = "kill -9 $(pgrep -f 'jupyter server')"
    try:
        sandbox.commands.run(kill_jupyter, user="root")
    except Exception:
        pass

    # Give systemd time to bring both services back up.
    recovered = wait_for_health(sandbox, 60, 500)
    assert recovered

    # Code execution must work again once the services recovered.
    execution = sandbox.run_code("x = 1; x")
    assert execution.text == "1"
38+
39+
40+
def test_restart_after_code_interpreter_kill(sandbox: Sandbox):
    """Health and code execution come back after the uvicorn process is SIGKILLed."""
    # The service has to be healthy before we break anything.
    healthy_at_start = wait_for_health(sandbox)
    assert healthy_at_start

    # SIGKILL the code-interpreter (uvicorn) process as root; the command
    # itself may fail as a result, which is tolerated.
    kill_code_interpreter = "kill -9 $(pgrep -f 'uvicorn main:app')"
    try:
        sandbox.commands.run(kill_code_interpreter, user="root")
    except Exception:
        pass

    # Give systemd time to restart it and wait for health to come back.
    recovered = wait_for_health(sandbox, 60, 500)
    assert recovered

    # Code execution must work again once the service recovered.
    execution = sandbox.run_code("x = 1; x")
    assert execution.text == "1"

template/jupyter-healthcheck.sh

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#!/bin/bash
# Custom health check for Jupyter Server
# Verifies the server is responsive via the /api/status endpoint.
#
# Exit status: 0 as soon as the endpoint answers HTTP 200,
#              1 if it never does within MAX_RETRIES attempts.

MAX_RETRIES=50
RETRY_INTERVAL=0.2
# Upper bound (seconds) for a single probe, so a server that accepts the
# connection but never answers cannot stall the whole check.
# NOTE(review): worst case is MAX_RETRIES * (REQUEST_TIMEOUT + RETRY_INTERVAL);
# keep that below the start timeout of whatever invokes this script.
REQUEST_TIMEOUT=1

for i in $(seq 1 "$MAX_RETRIES"); do
    status_code=$(curl -s -o /dev/null --max-time "$REQUEST_TIMEOUT" -w "%{http_code}" "http://localhost:8888/api/status")

    # Compare as a string: if curl produced no output at all, a numeric
    # test would print an "integer expression expected" error on every pass.
    if [ "$status_code" = "200" ]; then
        echo "Jupyter Server is healthy"
        exit 0
    fi

    # Log progress only every 10th attempt to keep the output short.
    if [ $((i % 10)) -eq 0 ]; then
        echo "Waiting for Jupyter Server to become healthy... (attempt $i/$MAX_RETRIES)"
    fi
    sleep "$RETRY_INTERVAL"
done

echo "Jupyter Server health check failed after $MAX_RETRIES attempts"
exit 1

template/start-up.sh

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,16 @@
11
#!/bin/bash
22

3-
function start_jupyter_server() {
4-
counter=0
5-
response=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8888/api/status")
6-
while [[ ${response} -ne 200 ]]; do
7-
let counter++
8-
if ((counter % 20 == 0)); then
9-
echo "Waiting for Jupyter Server to start..."
10-
sleep 0.1
11-
fi
12-
13-
response=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8888/api/status")
14-
done
3+
function start_code_interpreter() {
4+
/root/.jupyter/jupyter-healthcheck.sh
5+
if [ $? -ne 0 ]; then
6+
echo "Jupyter Server failed to start, aborting."
7+
exit 1
8+
fi
159

1610
cd /root/.server/
1711
.venv/bin/uvicorn main:app --host 0.0.0.0 --port 49999 --workers 1 --no-access-log --no-use-colors --timeout-keep-alive 640
1812
}
1913

2014
echo "Starting Code Interpreter server..."
21-
start_jupyter_server &
15+
start_code_interpreter &
2216
MATPLOTLIBRC=/root/.config/matplotlib/.matplotlibrc jupyter server --IdentityProvider.token="" >/dev/null 2>&1
template/systemd/code-interpreter.service

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# systemd unit for the Code Interpreter HTTP server (uvicorn on port 49999).
[Unit]
Description=Code Interpreter Server
Documentation=https://github.com/e2b-dev/code-interpreter
# Ordering: start only once jupyter.service has finished starting.
After=jupyter.service
# Lifecycle coupling: starting this unit pulls in jupyter.service, and this
# unit is stopped whenever jupyter.service stops.
Requires=jupyter.service
BindsTo=jupyter.service

[Service]
Type=simple
WorkingDirectory=/root/.server
ExecStart=/root/.server/.venv/bin/uvicorn main:app --host 0.0.0.0 --port 49999 --workers 1 --no-access-log --no-use-colors --timeout-keep-alive 640
# Bring the server back two seconds after it exits, whatever the exit reason.
Restart=always
RestartSec=2

template/systemd/jupyter.service

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# systemd unit for the Jupyter Server that backs the code interpreter.
[Unit]
Description=Jupyter Server
Documentation=https://jupyter-server.readthedocs.io

[Service]
Type=simple
Environment=MATPLOTLIBRC=/root/.config/matplotlib/.matplotlibrc
Environment=JUPYTER_CONFIG_PATH=/root/.jupyter
ExecStart=/usr/local/bin/jupyter server --IdentityProvider.token=""
# Start-up only counts as finished once the health check script exits 0.
ExecStartPost=/root/.jupyter/jupyter-healthcheck.sh
# Kick off code-interpreter after every (re)start of Jupyter.
# --no-block is required: code-interpreter.service is ordered
# After=jupyter.service, and this unit is still "activating" while its
# ExecStartPost commands run. A blocking `systemctl start` would therefore
# wait for a job that itself waits for this command to return.
# The leading "-" keeps a failure of this command from failing the unit.
ExecStartPost=-/usr/bin/systemctl start --no-block code-interpreter
# Restart two seconds after an unclean exit (non-zero status or fatal signal).
Restart=on-failure
RestartSec=2
StandardOutput=null
StandardError=journal

template/template.py

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -110,14 +110,30 @@ def make_template(
110110
# Copy configuration files
111111
template = (
112112
template.copy("matplotlibrc", ".config/matplotlib/.matplotlibrc")
113-
.copy("start-up.sh", ".jupyter/start-up.sh")
114-
.run_cmd("chmod +x .jupyter/start-up.sh")
113+
.copy("jupyter-healthcheck.sh", ".jupyter/jupyter-healthcheck.sh")
114+
.run_cmd("chmod +x .jupyter/jupyter-healthcheck.sh")
115115
.copy("jupyter_server_config.py", ".jupyter/")
116116
.make_dir(".ipython/profile_default/startup")
117117
.copy("ipython_kernel_config.py", ".ipython/profile_default/")
118118
.copy("startup_scripts", ".ipython/profile_default/startup")
119119
)
120120

121+
if not is_docker:
122+
template = (
123+
template.copy(
124+
"systemd/jupyter.service", "/etc/systemd/system/jupyter.service"
125+
)
126+
.copy(
127+
"systemd/code-interpreter.service",
128+
"/etc/systemd/system/code-interpreter.service",
129+
)
130+
.run_cmd("systemctl daemon-reload")
131+
)
132+
else:
133+
template = template.copy("start-up.sh", ".jupyter/start-up.sh").run_cmd(
134+
"chmod +x .jupyter/start-up.sh"
135+
)
136+
121137
if is_docker:
122138
# create user user and /home/user
123139
template = template.run_cmd("useradd -m user")
@@ -130,6 +146,11 @@ def make_template(
130146

131147
template = template.set_user("user").set_workdir("/home/user")
132148

149+
if is_docker:
150+
start_cmd = "sudo /root/.jupyter/start-up.sh"
151+
else:
152+
start_cmd = "systemctl start jupyter"
153+
133154
return template.set_start_cmd(
134-
"sudo /root/.jupyter/start-up.sh", wait_for_url("http://localhost:49999/health")
155+
start_cmd, wait_for_url("http://localhost:49999/health")
135156
)

0 commit comments

Comments
 (0)