Skip to content

Commit d1270d0

Browse files
committed
fix(deployments): address CodeRabbit review on Docker backend (AIRCORE-756)
- Exclude already-assigned host ports during multi-port allocation - Use label constants; fix HTTP/TCP probe host/port resolution - Remove TYPE_CHECKING-only docker imports - Deduplicate Docker availability check via docker_availability module - Harden port unit tests; add type hints and cleanup improvements Signed-off-by: Tyler Bray <tbray@nvidia.com>
1 parent f612b6a commit d1270d0

12 files changed

Lines changed: 187 additions & 86 deletions

File tree

plugins/nemo-deployments/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,6 @@ dev = ["pytest>=8.3.4", "pytest-asyncio>=0.25.3", "httpx>=0.27", "fastapi>=0.115
3838
[tool.pytest.ini_options]
3939
testpaths = ["tests"]
4040
asyncio_mode = "auto"
41-
pythonpath = ["src", "tests/unit"]
41+
pythonpath = ["src", "tests/unit", "tests/integration"]
4242

4343
[tool.nemo.openapi]

plugins/nemo-deployments/src/nemo_deployments_plugin/backends/docker/backend.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,11 @@
3232
)
3333
from nemo_deployments_plugin.backends.docker.gpu import GPUAllocationError, get_shared_gpu_pool
3434
from nemo_deployments_plugin.backends.docker.labels import (
35+
BACKOFF_LIMIT_LABEL,
3536
CONFIG_NAME_LABEL,
3637
DEPLOYMENT_NAME_LABEL,
3738
DEPLOYMENT_WORKSPACE_LABEL,
39+
MANAGED_BY_KEY,
3840
RESTART_POLICY_LABEL,
3941
container_name,
4042
deployment_identity_labels,
@@ -147,7 +149,11 @@ async def create_deployment(
147149

148150
host_ports: dict[int, int] = {}
149151
for port_spec in container_spec.ports:
150-
host_port = await find_available_port(self._client, docker_cfg)
152+
host_port = await find_available_port(
153+
self._client,
154+
docker_cfg,
155+
exclude_ports=set(host_ports.values()),
156+
)
151157
if host_port is None:
152158
if gpu_ids:
153159
self._gpu_pool.release_gpu(dep_key) # type: ignore[union-attr]
@@ -289,7 +295,7 @@ async def read_status(self, *, workspace: str, name: str) -> BackendStatusUpdate
289295
)
290296
if restart_policy == "OnFailure":
291297
restart_count = int(container.attrs.get("RestartCount", 0))
292-
backoff_limit = int(labels.get("nmp.nvidia.com/backoff-limit", "6"))
298+
backoff_limit = int(labels.get(BACKOFF_LIMIT_LABEL, "6"))
293299
if restart_count < backoff_limit:
294300
return BackendStatusUpdate(
295301
status="STARTING",
@@ -350,7 +356,7 @@ async def list_managed_deployment_names(self) -> list[str]:
350356
seen: set[str] = set()
351357
for container in containers:
352358
container_labels = container.labels or {}
353-
if container_labels.get("managed-by") != MANAGED_BY_LABEL:
359+
if container_labels.get(MANAGED_BY_KEY) != MANAGED_BY_LABEL:
354360
continue
355361
ws = container_labels.get(DEPLOYMENT_WORKSPACE_LABEL)
356362
dep_name = container_labels.get(DEPLOYMENT_NAME_LABEL)
@@ -417,7 +423,7 @@ def _container_matches_deployment(
417423
labels.get(DEPLOYMENT_WORKSPACE_LABEL) == workspace
418424
and labels.get(DEPLOYMENT_NAME_LABEL) == name
419425
and labels.get(CONFIG_NAME_LABEL) == config_name
420-
and labels.get("managed-by") == MANAGED_BY_LABEL
426+
and labels.get(MANAGED_BY_KEY) == MANAGED_BY_LABEL
421427
)
422428

423429
async def _resolve_restart_policy(self, workspace: str, name: str) -> RestartPolicy:

plugins/nemo-deployments/src/nemo_deployments_plugin/backends/docker/ports.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,12 @@
99
import logging
1010
import os
1111
import socket
12-
from typing import TYPE_CHECKING, Any
12+
from typing import Any
1313

1414
from nemo_deployments_plugin.backends.docker.labels import managed_by_filter
1515
from nemo_deployments_plugin.entities import DockerDeploymentConfig
1616

17-
if TYPE_CHECKING:
18-
import docker
17+
import docker
1918

2019
logger = logging.getLogger(__name__)
2120

@@ -53,7 +52,12 @@ def collect_used_host_ports(containers: list[Any]) -> set[int]:
5352
return used
5453

5554

56-
async def find_available_port(client: docker.DockerClient, docker_cfg: DockerDeploymentConfig) -> int | None:
55+
async def find_available_port(
56+
client: docker.DockerClient,
57+
docker_cfg: DockerDeploymentConfig,
58+
*,
59+
exclude_ports: set[int] | None = None,
60+
) -> int | None:
5761
try:
5862
containers = await asyncio.to_thread(
5963
client.containers.list,
@@ -65,6 +69,8 @@ async def find_available_port(client: docker.DockerClient, docker_cfg: DockerDep
6569
return None
6670

6771
used_ports = collect_used_host_ports(containers)
72+
if exclude_ports:
73+
used_ports = used_ports | exclude_ports
6874
for port in range(docker_cfg.port_range_start, docker_cfg.port_range_end + 1):
6975
if port not in used_ports and is_port_free(port):
7076
return port

plugins/nemo-deployments/src/nemo_deployments_plugin/backends/docker/probes.py

Lines changed: 41 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,12 @@
88
import asyncio
99
import logging
1010
import socket
11-
from typing import TYPE_CHECKING
12-
from urllib.parse import urljoin
11+
from urllib.parse import urljoin, urlparse
1312

1413
import httpx
14+
from docker.models.containers import Container as DockerContainer
1515
from nemo_deployments_plugin.entities import Probe
1616

17-
if TYPE_CHECKING:
18-
from docker.models.containers import Container as DockerContainer
19-
2017
logger = logging.getLogger(__name__)
2118

2219

@@ -39,7 +36,7 @@ async def check_readiness_probe(
3936
return await _check_http_probe(host_url, probe, host_ports=host_ports, named_ports=named_ports)
4037

4138
if probe.tcp_socket is not None and host_url is not None:
42-
return await _check_tcp_probe(host_url, probe)
39+
return await _check_tcp_probe(host_url, probe, host_ports=host_ports, named_ports=named_ports)
4340

4441
return True, "probe type not implemented; treating as ready"
4542

@@ -75,6 +72,27 @@ def _run() -> tuple[int, str]:
7572
return False, f"exec probe exit {exit_code}: {output[:200]}"
7673

7774

75+
def _probe_host(host_url: str) -> str:
76+
return urlparse(host_url).hostname or "127.0.0.1"
77+
78+
79+
def _resolve_probe_port(
80+
port: int | str,
81+
*,
82+
host_ports: dict[int, int] | None,
83+
named_ports: dict[str, int] | None,
84+
) -> int | None:
85+
if isinstance(port, int):
86+
return host_ports.get(port, port) if host_ports else port
87+
if named_ports and port in named_ports:
88+
return named_ports[port]
89+
if host_ports:
90+
for container_port, host_port in host_ports.items():
91+
if str(container_port) == port:
92+
return host_port
93+
return None
94+
95+
7896
async def _check_http_probe(
7997
host_url: str,
8098
probe: Probe,
@@ -86,16 +104,11 @@ async def _check_http_probe(
86104
port = probe.http_get.port
87105
path = probe.http_get.path
88106
scheme = probe.http_get.scheme.lower()
89-
base = host_url
90-
if isinstance(port, int):
91-
base = f"{scheme}://127.0.0.1:{port}"
92-
elif isinstance(port, str) and named_ports and port in named_ports:
93-
base = f"{scheme}://127.0.0.1:{named_ports[port]}"
94-
elif isinstance(port, str) and host_ports:
95-
for container_port, host_port in host_ports.items():
96-
if str(container_port) == port:
97-
base = f"{scheme}://127.0.0.1:{host_port}"
98-
break
107+
host = _probe_host(host_url)
108+
resolved_port = _resolve_probe_port(port, host_ports=host_ports, named_ports=named_ports)
109+
if resolved_port is None:
110+
return False, f"http probe port not mapped: {port}"
111+
base = f"{scheme}://{host}:{resolved_port}"
99112
url = urljoin(f"{base.rstrip('/')}/", path.lstrip("/"))
100113
timeout = probe.timeout_seconds
101114
try:
@@ -108,16 +121,23 @@ async def _check_http_probe(
108121
return False, f"http probe failed: {exc}"
109122

110123

111-
async def _check_tcp_probe(host_url: str, probe: Probe) -> tuple[bool, str]:
124+
async def _check_tcp_probe(
125+
host_url: str,
126+
probe: Probe,
127+
*,
128+
host_ports: dict[int, int] | None = None,
129+
named_ports: dict[str, int] | None = None,
130+
) -> tuple[bool, str]:
112131
assert probe.tcp_socket is not None
113132
port_value = probe.tcp_socket.port
114-
if not isinstance(port_value, int):
115-
return False, "tcp probe requires numeric port"
116-
host = "127.0.0.1"
133+
host = _probe_host(host_url)
134+
target_port = _resolve_probe_port(port_value, host_ports=host_ports, named_ports=named_ports)
135+
if target_port is None:
136+
return False, f"tcp probe port not mapped: {port_value}"
117137
timeout = probe.timeout_seconds
118138

119139
def _connect() -> None:
120-
with socket.create_connection((host, port_value), timeout=timeout):
140+
with socket.create_connection((host, target_port), timeout=timeout):
121141
return
122142

123143
try:

plugins/nemo-deployments/src/nemo_deployments_plugin/backends/docker/status.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,14 @@ def format_duration(seconds: float) -> str:
2626

2727

2828
def map_exited_status(exit_code: int, restart_policy: RestartPolicy) -> DeploymentStatus:
29+
"""Map a stopped container's exit code to a terminal deployment status.
30+
31+
Restart-policy-specific retry handling happens in the backend before this
32+
helper is called for non-zero exits.
33+
"""
34+
del restart_policy
2935
if exit_code in SUCCESSFUL_EXIT_CODES:
3036
return "SUCCEEDED"
31-
if restart_policy == "Always":
32-
return "FAILED"
3337
return "FAILED"
3438

3539

plugins/nemo-deployments/src/nemo_deployments_plugin/backends/docker/volumes.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,12 @@
77

88
import asyncio
99
import logging
10-
from typing import TYPE_CHECKING, Any
10+
from typing import Any
1111

1212
from nemo_deployments_plugin.backends.base import VolumeStatusUpdate
1313
from nemo_deployments_plugin.backends.docker.labels import docker_volume_name, volume_identity_labels
1414

15-
if TYPE_CHECKING:
16-
import docker
15+
import docker
1716

1817
logger = logging.getLogger(__name__)
1918

plugins/nemo-deployments/tests/integration/backends/docker/test_docker_backend.py

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,9 @@
1010

1111
import pytest
1212
from docker.errors import NotFound
13+
from docker_availability import skip_without_docker
1314
from nemo_deployments_plugin.backends.docker.backend import DockerDeploymentBackend
14-
from nemo_deployments_plugin.backends.docker.labels import container_name, docker_volume_name
15+
from nemo_deployments_plugin.backends.docker.labels import container_name
1516
from nemo_deployments_plugin.backends.registry import BACKEND_CLASSES
1617
from nemo_deployments_plugin.constants import MANAGED_BY_LABEL
1718
from nemo_deployments_plugin.entities import (
@@ -23,17 +24,11 @@
2324
DockerDeploymentConfig,
2425
)
2526

26-
try:
27-
import docker
28-
29-
docker.from_env().ping()
30-
_DOCKER_AVAILABLE = True
31-
except Exception:
32-
_DOCKER_AVAILABLE = False
27+
import docker
3328

3429
pytestmark = [
3530
pytest.mark.skipif("docker" not in BACKEND_CLASSES, reason="Docker backend not registered"),
36-
pytest.mark.skipif(not _DOCKER_AVAILABLE, reason="Docker daemon not available"),
31+
skip_without_docker,
3732
]
3833

3934

@@ -101,6 +96,7 @@ async def test_never_deployment_succeeds(docker_backend: DockerDeploymentBackend
10196
config = _never_config()
10297
docker_backend._entities.get.return_value = config # type: ignore[attr-defined]
10398
c_name = container_name("itest", "echo-job")
99+
client = docker.from_env()
104100

105101
try:
106102
created = await docker_backend.create_deployment(
@@ -123,7 +119,7 @@ async def test_never_deployment_succeeds(docker_backend: DockerDeploymentBackend
123119
finally:
124120
await docker_backend.delete_deployment("itest", "echo-job")
125121
try:
126-
docker_backend._client.containers.get(c_name).remove(force=True)
122+
client.containers.get(c_name).remove(force=True)
127123
except NotFound:
128124
pass
129125

@@ -140,6 +136,7 @@ async def get_side_effect(entity_type, name, workspace=None):
140136

141137
docker_backend._entities.get.side_effect = get_side_effect # type: ignore[attr-defined]
142138
c_name = container_name("itest", "lost-srv")
139+
client = docker.from_env()
143140

144141
try:
145142
created = await docker_backend.create_deployment(
@@ -151,14 +148,14 @@ async def get_side_effect(entity_type, name, workspace=None):
151148
)
152149
assert created.status == "STARTING"
153150

154-
container = docker_backend._client.containers.get(c_name)
151+
container = client.containers.get(c_name)
155152
container.remove(force=True)
156153

157154
status = await docker_backend.read_status(workspace="itest", name="lost-srv")
158155
assert status.status == "LOST"
159156
finally:
160157
await docker_backend.delete_deployment("itest", "lost-srv")
161158
try:
162-
docker_backend._client.volumes.remove(docker_volume_name("itest", "data"))
163-
except Exception:
159+
client.containers.get(c_name).remove(force=True)
160+
except NotFound:
164161
pass

plugins/nemo-deployments/tests/integration/conftest.py

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,6 @@
55

66
from __future__ import annotations
77

8-
import pytest
8+
from docker_availability import DOCKER_AVAILABLE, skip_without_docker
99

10-
try:
11-
import docker
12-
13-
docker.from_env().ping()
14-
DOCKER_AVAILABLE = True
15-
except Exception:
16-
DOCKER_AVAILABLE = False
17-
18-
skip_without_docker = pytest.mark.skipif(not DOCKER_AVAILABLE, reason="Docker daemon not available")
10+
__all__ = ["DOCKER_AVAILABLE", "skip_without_docker"]
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""Shared Docker daemon availability check for integration tests."""
5+
6+
from __future__ import annotations
7+
8+
import pytest
9+
10+
try:
11+
import docker
12+
13+
docker.from_env().ping()
14+
DOCKER_AVAILABLE: bool = True
15+
except Exception:
16+
DOCKER_AVAILABLE = False
17+
18+
skip_without_docker: pytest.MarkDecorator = pytest.mark.skipif(
19+
not DOCKER_AVAILABLE, reason="Docker daemon not available"
20+
)

0 commit comments

Comments
 (0)