Skip to content

Commit ea63568

Browse files
committed
fix(snapshot): harden Kubernetes public snapshot runtime
1 parent 64edfc3 commit ea63568

10 files changed

Lines changed: 134 additions & 25 deletions

File tree

kubernetes/charts/opensandbox-server/values.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,10 +91,10 @@ configToml: |
9191
informer_enabled = true
9292
informer_resync_seconds = 300
9393
informer_watch_timeout_seconds = 60
94+
snapshot_create_timeout_seconds = 900
9495
workload_provider = "batchsandbox"
9596
batchsandbox_template_file = "/etc/opensandbox/example.batchsandbox-template.yaml"
9697
9798
[egress]
9899
image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/egress:v1.0.10"
99100
mode = "dns+nft"
100-

server/configuration.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ If `runtime.type = "kubernetes"` and the `[kubernetes]` table is absent, the ser
120120
| `image_pull_policy` | string \| omitted | `"IfNotPresent"` | Image pull policy for the BatchSandbox main container. Values: **`Always`**, **`IfNotPresent`**, **`Never`**. |
121121
| `sandbox_create_timeout_seconds` | integer | `60` | Max time to wait for a new sandbox to become ready (e.g. IP assigned), in seconds. |
122122
| `sandbox_create_poll_interval_seconds` | float | `1.0` | Poll interval while waiting for readiness. |
123+
| `snapshot_create_timeout_seconds` | integer | `900` | Max time to wait for a Kubernetes public snapshot to become ready, in seconds. Set this greater than the controller snapshot `commitJobTimeout` / `--commit-job-timeout`. |
123124
| `informer_enabled` | boolean | `true` | **[Beta]** Use informer/watch cache for reads to reduce API load. |
124125
| `informer_resync_seconds` | integer | `300` | **[Beta]** Full resync period for the informer cache. |
125126
| `informer_watch_timeout_seconds` | integer | `60` | **[Beta]** Watch stream restart interval. |

server/opensandbox_server/config.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -558,6 +558,14 @@ class KubernetesRuntimeConfig(BaseModel):
558558
gt=0,
559559
description="Polling interval in seconds when waiting for a sandbox to become ready after creation.",
560560
)
561+
snapshot_create_timeout_seconds: int = Field(
562+
default=15 * 60,
563+
ge=1,
564+
description=(
565+
"Timeout in seconds to wait for a Kubernetes public snapshot to become ready. "
566+
"Set this greater than the controller snapshot commit-job-timeout."
567+
),
568+
)
561569
execd_init_resources: Optional["ExecdInitResources"] = Field(
562570
default=None,
563571
description=(

server/opensandbox_server/examples/example.config.k8s.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,10 @@ workload_provider = "batchsandbox"
6464
# Values: "Always", "IfNotPresent", "Never".
6565
image_pull_policy = "IfNotPresent"
6666

67+
# Public snapshot wait timeout. Keep this greater than the Kubernetes
68+
# controller snapshot commit-job-timeout.
69+
snapshot_create_timeout_seconds = 900
70+
6771
# Path to the BatchSandbox template file
6872
batchsandbox_template_file = "~/batchsandbox-template.yaml"
6973

server/opensandbox_server/examples/example.config.k8s.zh.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,10 @@ workload_provider = "batchsandbox"
6464
# 可选值:"Always"、"IfNotPresent"、"Never"。
6565
image_pull_policy = "IfNotPresent"
6666

67+
# public snapshot 等待超时时间。应大于 Kubernetes controller 的
68+
# snapshot commit-job-timeout。
69+
snapshot_create_timeout_seconds = 900
70+
6771
# Path to the BatchSandbox template file
6872
# Replace with your path
6973
batchsandbox_template_file = "~/batchsandbox-template.yaml"

server/opensandbox_server/services/k8s/snapshot_runtime.py

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ def create_snapshot(
8888
) -> Optional[SnapshotRuntimeStatus]:
8989
snapshot_name = build_public_snapshot_name(snapshot_id)
9090
body = self._build_snapshot_body(snapshot_id, sandbox_id, snapshot_name)
91+
should_validate_existing_source = False
9192

9293
try:
9394
self._k8s_client.create_custom_object(
@@ -105,6 +106,7 @@ def create_snapshot(
105106
message=f"Failed to create Kubernetes SandboxSnapshot {snapshot_name}: {exc}",
106107
)
107108
logger.info("Kubernetes SandboxSnapshot %s already exists; continuing", snapshot_name)
109+
should_validate_existing_source = True
108110
except Exception as exc: # noqa: BLE001
109111
logger.exception("Failed to create Kubernetes SandboxSnapshot %s: %s", snapshot_name, exc)
110112
return SnapshotRuntimeStatus(
@@ -113,10 +115,19 @@ def create_snapshot(
113115
message=f"Failed to create Kubernetes SandboxSnapshot {snapshot_name}: {exc}",
114116
)
115117

116-
current = self._get_snapshot_cr(snapshot_name)
117-
conflict = self._validate_existing_source(current, sandbox_id)
118-
if conflict is not None:
119-
return conflict
118+
if should_validate_existing_source:
119+
try:
120+
current = self._get_snapshot_cr(snapshot_name)
121+
except Exception as exc: # noqa: BLE001
122+
logger.warning(
123+
"Failed to inspect existing Kubernetes SandboxSnapshot %s after create conflict: %s",
124+
snapshot_name,
125+
exc,
126+
)
127+
else:
128+
conflict = self._validate_existing_source(current, sandbox_id)
129+
if conflict is not None:
130+
return conflict
120131

121132
return self._wait_for_terminal_snapshot(snapshot_id)
122133

@@ -144,10 +155,10 @@ def inspect_snapshot(self, snapshot_id: str, image: Optional[str] = None) -> Sna
144155
try:
145156
snapshot = self._get_snapshot_cr(snapshot_name)
146157
except Exception as exc: # noqa: BLE001
147-
logger.exception("Failed to inspect Kubernetes SandboxSnapshot %s: %s", snapshot_name, exc)
158+
logger.warning("Failed to inspect Kubernetes SandboxSnapshot %s: %s", snapshot_name, exc)
148159
return SnapshotRuntimeStatus(
149-
state=SnapshotState.FAILED,
150-
reason="snapshot_recovery_inspect_failed",
160+
state=SnapshotState.CREATING,
161+
reason="snapshot_runtime_inspect_failed",
151162
message=f"Failed to inspect Kubernetes SandboxSnapshot {snapshot_name}: {exc}",
152163
)
153164

server/opensandbox_server/services/snapshot_runtime_factory.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121
from typing import Optional
2222

23-
from opensandbox_server.config import AppConfig, get_config
23+
from opensandbox_server.config import AppConfig, KubernetesRuntimeConfig, get_config
2424
from opensandbox_server.services.snapshot_runtime import SnapshotRuntime
2525

2626

@@ -44,14 +44,16 @@ def create_snapshot_runtime(
4444
from opensandbox_server.services.k8s.client import K8sClient
4545
from opensandbox_server.services.k8s.snapshot_runtime import KubernetesSnapshotRuntime
4646

47-
kubernetes_config = getattr(active_config, "kubernetes", None)
47+
kubernetes_config = getattr(active_config, "kubernetes", None) or KubernetesRuntimeConfig()
4848
if k8s_client is None:
49-
if kubernetes_config is None:
50-
raise ValueError("kubernetes config is required when runtime.type = 'kubernetes'.")
5149
k8s_client = K8sClient(kubernetes_config)
5250

53-
namespace = getattr(kubernetes_config, "namespace", None) or "default"
54-
return KubernetesSnapshotRuntime(k8s_client, namespace=namespace)
51+
namespace = kubernetes_config.namespace or "default"
52+
return KubernetesSnapshotRuntime(
53+
k8s_client,
54+
namespace=namespace,
55+
wait_timeout_seconds=kubernetes_config.snapshot_create_timeout_seconds,
56+
)
5557

5658
raise ValueError(f"Unsupported snapshot runtime type: {runtime_type}")
5759

server/tests/test_k8s_snapshot_runtime.py

Lines changed: 66 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,9 @@ def __init__(self) -> None:
4040
def create_custom_object(self, *, group: str, version: str, namespace: str, plural: str, body: dict):
4141
self.created.append(deepcopy(body))
4242
name = body["metadata"]["name"]
43-
stored = deepcopy(body)
4443
if name in self.objects:
45-
stored = self.objects[name]
44+
raise ApiException(status=409, reason="Already Exists")
45+
stored = deepcopy(body)
4646
self.objects[name] = stored
4747
return stored
4848

@@ -57,6 +57,44 @@ def delete_custom_object(self, *, group: str, version: str, namespace: str, plur
5757
del self.objects[name]
5858

5959

60+
class TransientGetK8sClient(FakeK8sClient):
61+
def __init__(self, *, failures: int) -> None:
62+
super().__init__()
63+
self.failures = failures
64+
65+
def get_custom_object(self, *, group: str, version: str, namespace: str, plural: str, name: str):
66+
if self.failures > 0:
67+
self.failures -= 1
68+
raise ApiException(status=500, reason="temporary apiserver error")
69+
return super().get_custom_object(
70+
group=group,
71+
version=version,
72+
namespace=namespace,
73+
plural=plural,
74+
name=name,
75+
)
76+
77+
78+
class TransientThenReadyK8sClient(TransientGetK8sClient):
79+
def get_custom_object(self, *, group: str, version: str, namespace: str, plural: str, name: str):
80+
obj = super().get_custom_object(
81+
group=group,
82+
version=version,
83+
namespace=namespace,
84+
plural=plural,
85+
name=name,
86+
)
87+
if obj is not None:
88+
obj["status"] = {
89+
"phase": "Succeed",
90+
"containers": [
91+
{"containerName": "sandbox", "imageUri": "registry/sandbox:snap"},
92+
],
93+
}
94+
self.objects[name] = deepcopy(obj)
95+
return obj
96+
97+
6098
def _snapshot_cr(*, phase: str, containers: list[dict] | None = None, sandbox_id: str = SANDBOX_ID) -> dict:
6199
name = build_public_snapshot_name(SNAPSHOT_ID)
62100
return {
@@ -160,6 +198,32 @@ def test_inspect_snapshot_maps_failed_condition() -> None:
160198
assert status.message == "commit job failed"
161199

162200

201+
def test_inspect_snapshot_keeps_transient_read_error_creating() -> None:
202+
k8s_client = TransientGetK8sClient(failures=1)
203+
runtime = KubernetesSnapshotRuntime(k8s_client, namespace="default")
204+
205+
status = runtime.inspect_snapshot(SNAPSHOT_ID)
206+
207+
assert status.state == SnapshotState.CREATING
208+
assert status.reason == "snapshot_runtime_inspect_failed"
209+
assert "temporary apiserver error" in (status.message or "")
210+
211+
212+
def test_create_snapshot_retries_transient_inspect_error_until_controller_ready() -> None:
213+
k8s_client = TransientThenReadyK8sClient(failures=1)
214+
runtime = KubernetesSnapshotRuntime(
215+
k8s_client,
216+
namespace="default",
217+
wait_timeout_seconds=1,
218+
poll_interval_seconds=0,
219+
)
220+
221+
status = runtime.create_snapshot(SNAPSHOT_ID, SANDBOX_ID)
222+
223+
assert status.state == SnapshotState.READY
224+
assert status.image == "registry/sandbox:snap"
225+
226+
163227
def test_delete_snapshot_deletes_cr_and_ignores_missing_cr() -> None:
164228
k8s_client = FakeK8sClient()
165229
runtime = KubernetesSnapshotRuntime(k8s_client, namespace="default")

server/tests/test_routes_snapshots.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ def test_snapshot_routes_can_use_persisted_service(
207207
class StubSandboxService:
208208
@staticmethod
209209
def get_sandbox(sandbox_id: str):
210-
return {"id": sandbox_id}
210+
return {"id": sandbox_id, "status": {"state": "Running"}}
211211

212212
class StubSnapshotRuntime:
213213
@staticmethod
@@ -254,7 +254,7 @@ def test_create_snapshot_returns_501_when_runtime_is_not_supported(
254254
class StubSandboxService:
255255
@staticmethod
256256
def get_sandbox(sandbox_id: str):
257-
return {"id": sandbox_id}
257+
return {"id": sandbox_id, "status": {"state": "Running"}}
258258

259259
service = PersistedSnapshotService(
260260
SQLiteSnapshotRepository(tmp_path / "snapshots.db"),

server/tests/test_snapshot_runtime_factory.py

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,17 +14,16 @@
1414

1515
from __future__ import annotations
1616

17-
from types import SimpleNamespace
18-
1917
import pytest
2018

19+
from opensandbox_server.config import AppConfig, KubernetesRuntimeConfig, RuntimeConfig
2120
from opensandbox_server.services.docker.snapshot_runtime import DockerSnapshotRuntime
2221
from opensandbox_server.services.k8s.snapshot_runtime import KubernetesSnapshotRuntime
2322
from opensandbox_server.services.snapshot_runtime_factory import create_snapshot_runtime
2423

2524

2625
def test_create_snapshot_runtime_selects_docker_runtime() -> None:
27-
config = SimpleNamespace(runtime=SimpleNamespace(type="docker"))
26+
config = AppConfig(runtime=RuntimeConfig(type="docker", execd_image="opensandbox/execd:test"))
2827
docker_client = object()
2928

3029
runtime = create_snapshot_runtime(config, docker_client=docker_client)
@@ -33,19 +32,35 @@ def test_create_snapshot_runtime_selects_docker_runtime() -> None:
3332

3433

3534
def test_create_snapshot_runtime_selects_kubernetes_runtime() -> None:
36-
config = SimpleNamespace(
37-
runtime=SimpleNamespace(type="kubernetes"),
38-
kubernetes=SimpleNamespace(namespace="default"),
35+
config = AppConfig(
36+
runtime=RuntimeConfig(type="kubernetes", execd_image="opensandbox/execd:test"),
37+
kubernetes=KubernetesRuntimeConfig(namespace="default"),
38+
)
39+
k8s_client = object()
40+
41+
runtime = create_snapshot_runtime(config, k8s_client=k8s_client)
42+
43+
assert isinstance(runtime, KubernetesSnapshotRuntime)
44+
45+
46+
def test_create_snapshot_runtime_uses_kubernetes_snapshot_create_timeout() -> None:
47+
config = AppConfig(
48+
runtime=RuntimeConfig(type="kubernetes", execd_image="opensandbox/execd:test"),
49+
kubernetes=KubernetesRuntimeConfig(
50+
namespace="default",
51+
snapshot_create_timeout_seconds=1234,
52+
),
3953
)
4054
k8s_client = object()
4155

4256
runtime = create_snapshot_runtime(config, k8s_client=k8s_client)
4357

4458
assert isinstance(runtime, KubernetesSnapshotRuntime)
59+
assert runtime._wait_timeout_seconds == 1234
4560

4661

4762
def test_create_snapshot_runtime_requires_docker_client_for_docker() -> None:
48-
config = SimpleNamespace(runtime=SimpleNamespace(type="docker"))
63+
config = AppConfig(runtime=RuntimeConfig(type="docker", execd_image="opensandbox/execd:test"))
4964

5065
with pytest.raises(ValueError, match="docker_client is required"):
5166
create_snapshot_runtime(config)

0 commit comments

Comments
 (0)