Skip to content

Commit e587cf1

Browse files
authored
Merge pull request #15 from onyx-dot-app/health
feat: improve health endpoint
2 parents d124e6a + 5d88a02 commit e587cf1

7 files changed

Lines changed: 229 additions & 3 deletions

File tree

code-interpreter/app/main.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111

1212
from app.api.routes import router as api_router
1313
from app.app_configs import EXECUTOR_BACKEND, HOST, PORT, PYTHON_EXECUTOR_DOCKER_IMAGE
14+
from app.models.schemas import HealthResponse
15+
from app.services.executor_factory import get_executor
1416

1517
# Configure logging
1618
logging.basicConfig(
@@ -101,8 +103,10 @@ def create_app() -> FastAPI:
101103
)
102104

103105
@app.get("/health")
104-
def health() -> dict[str, str]: # sync + strictly typed
105-
return {"status": "ok"}
106+
def health() -> HealthResponse:
107+
"""Health check that verifies the executor backend is operational."""
108+
result = get_executor().check_health()
109+
return HealthResponse(status=result.status, message=result.message)
106110

107111
app.include_router(api_router, prefix="/v1")
108112
return app

code-interpreter/app/models/schemas.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,3 +115,8 @@ class ListFilesResponse(BaseModel):
115115
default_factory=list,
116116
description="List of all stored files with their metadata.",
117117
)
118+
119+
120+
class HealthResponse(BaseModel):
    """Response body returned by the ``/health`` endpoint."""

    status: Literal["ok", "error"] = Field(
        ...,
        description="Health of the executor backend: 'ok' or 'error'.",
    )
    message: StrictStr | None = Field(
        default=None,
        description="Optional human-readable detail about the reported status.",
    )

code-interpreter/app/services/executor_base.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,14 @@ class StreamResult:
9898
StreamEvent = StreamChunk | StreamResult
9999

100100

101+
@dataclass(frozen=True, slots=True)
102+
class HealthCheck:
103+
"""Result of an executor health check."""
104+
105+
status: Literal["ok", "error"]
106+
message: str | None = None
107+
108+
101109
class ExecutorProtocol(Protocol):
102110
def execute_python(
103111
self,
@@ -114,6 +122,13 @@ def execute_python(
114122

115123

116124
class BaseExecutor(ABC):
125+
def check_health(self) -> HealthCheck:
    """Report whether the executor backend is operational.

    This default performs no probing and always reports ``ok``.
    Override it to add backend-specific checks.
    """
    default_result = HealthCheck(status="ok")
    return default_result
131+
117132
@abstractmethod
118133
def execute_python(
119134
self,

code-interpreter/app/services/executor_docker.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
BaseExecutor,
2525
EntryKind,
2626
ExecutionResult,
27+
HealthCheck,
2728
StreamChunk,
2829
StreamEvent,
2930
StreamResult,
@@ -49,6 +50,51 @@ def __init__(self) -> None:
4950
self.image = PYTHON_EXECUTOR_DOCKER_IMAGE
5051
self.run_args = PYTHON_EXECUTOR_DOCKER_RUN_ARGS
5152

53+
def check_health(self) -> HealthCheck:
    """Verify Docker daemon is reachable and the executor image is available.

    Runs two short ``docker`` CLI probes, each limited to 5 seconds:

    1. ``docker version`` formatted to the server version, to confirm the
       daemon answers.
    2. ``docker image inspect`` on the executor image, to confirm the image
       is present locally.

    Returns:
        ``HealthCheck(status="ok")`` when both probes succeed; otherwise a
        ``HealthCheck`` with ``status="error"`` and a message describing the
        probe that failed. Failures are reported through the return value
        rather than raised.
    """
    # Probe 1: daemon connectivity.
    try:
        daemon_probe = subprocess.run(
            [self.docker_binary, "version", "--format", "{{.Server.Version}}"],
            capture_output=True,
            timeout=5,
            check=False,
        )
    except FileNotFoundError:
        return HealthCheck(status="error", message="Docker binary not found")
    except subprocess.TimeoutExpired:
        return HealthCheck(status="error", message="Docker daemon not responding")
    except OSError as exc:
        # e.g. the binary exists but is not executable; report it instead of
        # letting the health endpoint fail with an unhandled exception.
        return HealthCheck(
            status="error",
            message=f"Docker binary could not be executed: {exc}",
        )

    if daemon_probe.returncode != 0:
        stderr = daemon_probe.stderr.decode("utf-8", errors="replace").strip()
        return HealthCheck(
            status="error",
            message=f"Docker daemon not reachable: {stderr}",
        )

    # Probe 2: executor image is available locally.
    # The configured image may already carry a tag ("repo/img:1.2") or a
    # digest ("repo/img@sha256:..."); appending ":latest" to those would
    # build an invalid reference. A colon only counts as a tag separator in
    # the last path component, so a registry port ("host:5000/img") is not
    # mistaken for a tag.
    last_component = self.image.rsplit("/", 1)[-1]
    if "@" in self.image or ":" in last_component:
        image_with_tag = self.image
    else:
        image_with_tag = f"{self.image}:latest"

    try:
        image_probe = subprocess.run(
            [self.docker_binary, "image", "inspect", image_with_tag],
            capture_output=True,
            timeout=5,
            check=False,
        )
    except subprocess.TimeoutExpired:
        return HealthCheck(
            status="error",
            message=f"Timeout checking image {image_with_tag}",
        )
    except OSError as exc:
        # Covers FileNotFoundError too: the binary may vanish between probes.
        return HealthCheck(
            status="error",
            message=f"Docker binary could not be executed: {exc}",
        )

    if image_probe.returncode != 0:
        return HealthCheck(
            status="error",
            message=f"Executor image {image_with_tag} not available locally",
        )

    return HealthCheck(status="ok")
97+
5298
def _resolve_docker_binary(self) -> str:
5399
candidate = PYTHON_EXECUTOR_DOCKER_BIN
54100
docker_path = which(candidate)

code-interpreter/app/services/executor_kubernetes.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
BaseExecutor,
3232
EntryKind,
3333
ExecutionResult,
34+
HealthCheck,
3435
WorkspaceEntry,
3536
wrap_last_line_interactive,
3637
)
@@ -76,6 +77,22 @@ def __init__(self) -> None:
7677
self.image = KUBERNETES_EXECUTOR_IMAGE
7778
self.service_account = KUBERNETES_EXECUTOR_SERVICE_ACCOUNT
7879

80+
def check_health(self) -> HealthCheck:
    """Verify Kubernetes API is reachable and the namespace is accessible.

    Reads the configured namespace through ``self.v1``. Any failure is
    reported through the returned ``HealthCheck`` rather than raised.

    Returns:
        ``HealthCheck(status="ok")`` when the namespace read succeeds;
        otherwise ``status="error"`` with a message describing the failure.
    """
    try:
        # Bound the request so an unresponsive API server cannot stall the
        # health endpoint; 5 seconds matches the Docker backend's probes.
        self.v1.read_namespace(name=self.namespace, _request_timeout=5)
    except ApiException as e:
        # The API answered but rejected the request (for example, not found
        # or forbidden); surface the reason it gave.
        return HealthCheck(
            status="error",
            message=f"Kubernetes API error (namespace={self.namespace}): {e.reason}",
        )
    except Exception as e:
        # Anything else (connection failure, timeout, ...) is reported as the
        # API being unreachable rather than propagated to the caller.
        return HealthCheck(
            status="error",
            message=f"Kubernetes API not reachable: {e}",
        )
    return HealthCheck(status="ok")
95+
7996
def _create_pod_manifest(
8097
self,
8198
pod_name: str,

code-interpreter/tests/e2e/test_basic_flow.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ def test_execute_endpoint_basic_flow() -> None:
1818
pytest.fail(f"Failed to reach Code Interpreter service at {BASE_URL}: {exc!s}")
1919

2020
assert health_response.status_code == 200, health_response.text
21-
assert health_response.json() == {"status": "ok"}
21+
assert health_response.json()["status"] == "ok"
2222

2323
execute_payload: dict[str, Any] = {
2424
"code": "print('hello from e2e')",
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
from __future__ import annotations
2+
3+
import subprocess
4+
from collections.abc import Generator
5+
from unittest.mock import patch
6+
7+
import pytest
8+
from fastapi.testclient import TestClient
9+
10+
from app.main import create_app
11+
from app.services.executor_base import HealthCheck
12+
from app.services.executor_docker import DockerExecutor
13+
from app.services.executor_factory import get_executor
14+
15+
16+
@pytest.fixture(autouse=True)
def _clear_executor_cache() -> Generator[None, None, None]:
    """Empty the ``get_executor`` cache around every test.

    The factory caches its executor, so without a reset an instance built in
    one test would be reused in the next and patches would not take effect.
    """
    reset_cache = get_executor.cache_clear
    reset_cache()  # start each test with an empty cache
    yield
    reset_cache()  # do not leak this test's executor into the next one
22+
23+
24+
def test_health_returns_ok_when_backend_healthy() -> None:
    """``/health`` answers 200 with status ``ok`` and no message."""
    # NOTE(review): nothing is patched here, so this appears to rely on the
    # real executor backend being healthy in the test environment — confirm.
    app = create_app()
    response = TestClient(app).get("/health")

    assert response.status_code == 200
    payload = response.json()
    assert payload["status"] == "ok"
    assert payload["message"] is None
32+
33+
34+
def test_health_returns_error_when_backend_unhealthy() -> None:
    """``/health`` relays an executor failure in the body, still with HTTP 200."""
    failing_check = HealthCheck(status="error", message="daemon down")
    docker_patch = patch.object(DockerExecutor, "check_health", return_value=failing_check)

    with docker_patch:
        response = TestClient(create_app()).get("/health")

    assert response.status_code == 200
    payload = response.json()
    assert payload["status"] == "error"
    assert payload["message"] == "daemon down"
45+
46+
47+
def _make_completed(returncode: int, stderr: bytes = b"") -> subprocess.CompletedProcess[bytes]:
48+
return subprocess.CompletedProcess(args=[], returncode=returncode, stdout=b"", stderr=stderr)
49+
50+
51+
def test_docker_health_ok() -> None:
    """Every docker command exits 0, so the check reports ``ok``."""
    all_succeed = _make_completed(0)

    with patch("app.services.executor_docker.subprocess.run", return_value=all_succeed):
        outcome = DockerExecutor().check_health()

    assert outcome.status == "ok"
    assert outcome.message is None
59+
60+
61+
def test_docker_health_daemon_unreachable() -> None:
    """A non-zero exit from the daemon probe is reported as unreachable."""
    daemon_down = _make_completed(1, stderr=b"Cannot connect to the Docker daemon")

    with patch("app.services.executor_docker.subprocess.run", return_value=daemon_down):
        outcome = DockerExecutor().check_health()

    assert outcome.status == "error"
    detail = outcome.message or ""
    assert "Docker daemon not reachable" in detail
72+
73+
74+
def test_docker_health_daemon_timeout() -> None:
    """A timed-out daemon probe is reported as the daemon not responding."""
    timed_out = subprocess.TimeoutExpired(cmd="docker", timeout=5)

    with patch("app.services.executor_docker.subprocess.run", side_effect=timed_out):
        outcome = DockerExecutor().check_health()

    assert outcome.status == "error"
    detail = outcome.message or ""
    assert "not responding" in detail
85+
86+
87+
def test_docker_health_binary_not_found() -> None:
    """A missing docker executable is reported as 'not found'."""
    run_patch = patch(
        "app.services.executor_docker.subprocess.run",
        side_effect=FileNotFoundError,
    )

    with run_patch:
        outcome = DockerExecutor().check_health()

    assert outcome.status == "error"
    detail = outcome.message or ""
    assert "not found" in detail
98+
99+
100+
def test_docker_health_image_missing() -> None:
    """Docker daemon answers, but the executor image is not present locally."""
    version_ok = _make_completed(0)
    inspect_failed = _make_completed(1)

    # First subprocess call (docker version) succeeds; every later call
    # (docker image inspect) receives the failing result.
    first_reply = iter([version_ok])

    def _fake_run(*_args: object, **_kwargs: object) -> subprocess.CompletedProcess[bytes]:
        return next(first_reply, inspect_failed)

    with patch("app.services.executor_docker.subprocess.run", side_effect=_fake_run):
        outcome = DockerExecutor().check_health()

    assert outcome.status == "error"
    detail = outcome.message or ""
    assert "not available locally" in detail
120+
121+
122+
def test_docker_health_image_check_timeout() -> None:
    """Docker daemon answers, but inspecting the image times out."""
    version_ok = _make_completed(0)

    # First subprocess call (docker version) succeeds; every later call
    # (docker image inspect) raises a timeout.
    first_reply = iter([version_ok])

    def _fake_run(*_args: object, **_kwargs: object) -> subprocess.CompletedProcess[bytes]:
        reply = next(first_reply, None)
        if reply is None:
            raise subprocess.TimeoutExpired(cmd="docker", timeout=5)
        return reply

    with patch("app.services.executor_docker.subprocess.run", side_effect=_fake_run):
        outcome = DockerExecutor().check_health()

    assert outcome.status == "error"
    detail = outcome.message or ""
    assert "Timeout checking image" in detail

0 commit comments

Comments
 (0)