Skip to content

Commit 4d70daf

Browse files
authored
Merge pull request opensandbox-group#837 from fengcone/feature/public-k8s-snapshot
feat(snapshot): support public snapshots api on Kubernetes runtime
2 parents 33d386b + ea63568 commit 4d70daf

18 files changed

Lines changed: 939 additions & 25 deletions

docs/pause-resume.md

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -276,7 +276,7 @@ The OpenSandbox controller requires the following RBAC permissions for pause/res
276276

277277
### Snapshot image naming
278278

279-
Snapshot images are named:
279+
Internal pause/resume snapshot images are named:
280280
```
281281
<snapshot-registry>/<sandboxName>-<containerName>:snap-gen<N>
282282
```
@@ -286,6 +286,17 @@ For example, with `--snapshot-registry=registry.example.com/sandboxes`, sandbox
286286
registry.example.com/sandboxes/my-sandbox-sandbox:snap-gen1
287287
```
288288

289+
Server-managed public snapshots use the same repository layout but a stable
290+
snapshot-id-derived tag:
291+
```
292+
<snapshot-registry>/<sandboxName>-<containerName>:snap-<snapshotIdHex>
293+
```
294+
295+
The controller distinguishes the two modes by owner reference. Pause/resume
296+
snapshots are created by the `BatchSandbox` controller and have a controller
297+
ownerReference to the owning `BatchSandbox`; public snapshots are created by the
298+
Lifecycle server and do not use that ownerReference.
299+
289300
### Commit Job
290301

291302
The controller creates a short-lived Kubernetes `Job` for each pause:

docs/public/api/spec-inline.js

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

kubernetes/charts/opensandbox-server/values.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,10 +91,10 @@ configToml: |
9191
informer_enabled = true
9292
informer_resync_seconds = 300
9393
informer_watch_timeout_seconds = 60
94+
snapshot_create_timeout_seconds = 900
9495
workload_provider = "batchsandbox"
9596
batchsandbox_template_file = "/etc/opensandbox/example.batchsandbox-template.yaml"
9697
9798
[egress]
9899
image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/egress:v1.0.10"
99100
mode = "dns+nft"
100-

kubernetes/internal/controller/sandboxsnapshot_controller_test.go

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,7 @@ func TestSandboxSnapshotHandlePending_UsesSourcePodContainersWhenTemplateMissing
317317
Name: "test-bs",
318318
Namespace: "default",
319319
Generation: 2,
320+
UID: types.UID("test-bs-uid"),
320321
},
321322
Spec: sandboxv1alpha1.BatchSandboxSpec{
322323
PoolRef: "test-pool",
@@ -342,6 +343,16 @@ func TestSandboxSnapshotHandlePending_UsesSourcePodContainersWhenTemplateMissing
342343
ObjectMeta: metav1.ObjectMeta{
343344
Name: "test-snapshot",
344345
Namespace: "default",
346+
OwnerReferences: []metav1.OwnerReference{
347+
{
348+
APIVersion: "sandbox.opensandbox.io/v1alpha1",
349+
Kind: "BatchSandbox",
350+
Name: "test-bs",
351+
UID: types.UID("test-bs-uid"),
352+
Controller: ptrToBool(true),
353+
BlockOwnerDeletion: ptrToBool(true),
354+
},
355+
},
345356
},
346357
Spec: sandboxv1alpha1.SandboxSnapshotSpec{
347358
SandboxName: "test-bs",
@@ -371,6 +382,71 @@ func TestSandboxSnapshotHandlePending_UsesSourcePodContainersWhenTemplateMissing
371382
require.NoError(t, r.Get(context.Background(), types.NamespacedName{Name: "test-snapshot-commit", Namespace: "default"}, job))
372383
}
373384

385+
func TestSandboxSnapshotHandlePending_PublicSnapshotUsesSnapshotIDTag(t *testing.T) {
386+
bs := &sandboxv1alpha1.BatchSandbox{
387+
ObjectMeta: metav1.ObjectMeta{
388+
Name: "test-bs",
389+
Namespace: "default",
390+
Generation: 7,
391+
},
392+
Spec: sandboxv1alpha1.BatchSandboxSpec{
393+
Template: &corev1.PodTemplateSpec{
394+
Spec: corev1.PodSpec{
395+
Containers: []corev1.Container{
396+
{Name: "sandbox", Image: "python:3.11"},
397+
},
398+
},
399+
},
400+
},
401+
}
402+
pod := &corev1.Pod{
403+
ObjectMeta: metav1.ObjectMeta{
404+
Name: "test-bs-0",
405+
Namespace: "default",
406+
Labels: map[string]string{
407+
LabelBatchSandboxNameKey: "test-bs",
408+
},
409+
},
410+
Spec: corev1.PodSpec{
411+
NodeName: "node-a",
412+
Containers: []corev1.Container{
413+
{Name: "sandbox", Image: "python:3.11"},
414+
},
415+
},
416+
Status: corev1.PodStatus{
417+
Phase: corev1.PodRunning,
418+
},
419+
}
420+
snapshot := &sandboxv1alpha1.SandboxSnapshot{
421+
ObjectMeta: metav1.ObjectMeta{
422+
Name: "osb-snap-11111111222243338444555555555555",
423+
Namespace: "default",
424+
},
425+
Spec: sandboxv1alpha1.SandboxSnapshotSpec{
426+
SandboxName: "test-bs",
427+
},
428+
Status: sandboxv1alpha1.SandboxSnapshotStatus{
429+
Phase: sandboxv1alpha1.SandboxSnapshotPhasePending,
430+
},
431+
}
432+
433+
r := newTestSnapshotReconciler(bs, pod, snapshot)
434+
r.SnapshotRegistry = "registry.default.svc.cluster.local:5000"
435+
436+
result, err := r.handlePending(context.Background(), snapshot)
437+
require.NoError(t, err)
438+
assert.Equal(t, time.Second, result.RequeueAfter)
439+
440+
updated := &sandboxv1alpha1.SandboxSnapshot{}
441+
require.NoError(t, r.Get(context.Background(), types.NamespacedName{Name: snapshot.Name, Namespace: "default"}, updated))
442+
require.Len(t, updated.Status.Containers, 1)
443+
assert.Equal(
444+
t,
445+
"registry.default.svc.cluster.local:5000/test-bs-sandbox:snap-11111111222243338444555555555555",
446+
updated.Status.Containers[0].ImageURI,
447+
)
448+
}
449+
374450
func TestBuildCommitJob_SetsBoundedBackoffLimit(t *testing.T) {
375451
snapshot := &sandboxv1alpha1.SandboxSnapshot{
376452
ObjectMeta: metav1.ObjectMeta{

kubernetes/internal/controller/sandboxsnapshot_lifecycle.go

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ package controller
1616

1717
import (
1818
"context"
19+
"crypto/sha256"
1920
"encoding/json"
2021
"fmt"
2122
"strings"
@@ -75,7 +76,7 @@ func (r *SandboxSnapshotReconciler) handlePending(ctx context.Context, snapshot
7576

7677
var containers []sandboxv1alpha1.ContainerSnapshot
7778
for _, c := range sourceContainers {
78-
imageURI := fmt.Sprintf("%s/%s-%s:snap-gen%d", r.SnapshotRegistry, bs.Name, c.Name, bs.Generation)
79+
imageURI := r.snapshotImageURI(snapshot, bs, c.Name)
7980
containers = append(containers, sandboxv1alpha1.ContainerSnapshot{
8081
ContainerName: c.Name,
8182
ImageURI: imageURI,
@@ -241,6 +242,62 @@ func (r *SandboxSnapshotReconciler) findPodForSandbox(ctx context.Context, bs *s
241242
return nil, fmt.Errorf("no running pod found for BatchSandbox %s", bs.Name)
242243
}
243244

245+
func (r *SandboxSnapshotReconciler) snapshotImageURI(
246+
snapshot *sandboxv1alpha1.SandboxSnapshot,
247+
bs *sandboxv1alpha1.BatchSandbox,
248+
containerName string,
249+
) string {
250+
return fmt.Sprintf(
251+
"%s/%s-%s:%s",
252+
r.SnapshotRegistry,
253+
bs.Name,
254+
containerName,
255+
snapshotImageTag(snapshot, bs),
256+
)
257+
}
258+
259+
func snapshotImageTag(snapshot *sandboxv1alpha1.SandboxSnapshot, bs *sandboxv1alpha1.BatchSandbox) string {
260+
if hasBatchSandboxControllerOwner(snapshot) {
261+
return fmt.Sprintf("snap-gen%d", bs.Generation)
262+
}
263+
return publicSnapshotImageTag(snapshot.Name)
264+
}
265+
266+
func hasBatchSandboxControllerOwner(snapshot *sandboxv1alpha1.SandboxSnapshot) bool {
267+
for _, owner := range snapshot.OwnerReferences {
268+
if owner.Kind != "BatchSandbox" {
269+
continue
270+
}
271+
if owner.Controller != nil && *owner.Controller {
272+
return true
273+
}
274+
}
275+
return false
276+
}
277+
278+
func publicSnapshotImageTag(snapshotName string) string {
279+
const publicSnapshotNamePrefix = "osb-snap-"
280+
if strings.HasPrefix(snapshotName, publicSnapshotNamePrefix) {
281+
suffix := strings.TrimPrefix(snapshotName, publicSnapshotNamePrefix)
282+
if isLowerHex(suffix) && len(suffix) == 32 {
283+
return "snap-" + suffix
284+
}
285+
}
286+
287+
sum := sha256.Sum256([]byte(snapshotName))
288+
return fmt.Sprintf("snap-%x", sum)[:37]
289+
}
290+
291+
// isLowerHex reports whether value consists solely of the characters 0-9 and
// a-f. The empty string yields true, since there is no offending character.
func isLowerHex(value string) bool {
	// Byte-wise scan: any byte of a multi-byte UTF-8 sequence is >= 0x80 and
	// therefore falls outside both accepted ranges.
	for i := 0; i < len(value); i++ {
		b := value[i]
		isDigit := b >= '0' && b <= '9'
		isLowerAToF := b >= 'a' && b <= 'f'
		if !isDigit && !isLowerAToF {
			return false
		}
	}
	return true
}
300+
244301
func (r *SandboxSnapshotReconciler) imageCommitterImage() string {
245302
if r.ImageCommitterImage != "" {
246303
return r.ImageCommitterImage

server/configuration.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ If `runtime.type = "kubernetes"` and the `[kubernetes]` table is absent, the ser
120120
| `image_pull_policy` | string \| omitted | `"IfNotPresent"` | Image pull policy for the BatchSandbox main container. Values: **`Always`**, **`IfNotPresent`**, **`Never`**. |
121121
| `sandbox_create_timeout_seconds` | integer | `60` | Max time to wait for a new sandbox to become ready (e.g. IP assigned), in seconds. |
122122
| `sandbox_create_poll_interval_seconds` | float | `1.0` | Poll interval while waiting for readiness. |
123+
| `snapshot_create_timeout_seconds` | integer | `900` | Max time to wait for a Kubernetes public snapshot to become ready, in seconds. Set this greater than the controller snapshot `commitJobTimeout` / `--commit-job-timeout`. |
123124
| `informer_enabled` | boolean | `true` | **[Beta]** Use informer/watch cache for reads to reduce API load. |
124125
| `informer_resync_seconds` | integer | `300` | **[Beta]** Full resync period for the informer cache. |
125126
| `informer_watch_timeout_seconds` | integer | `60` | **[Beta]** Watch stream restart interval. |

server/opensandbox_server/config.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -558,6 +558,14 @@ class KubernetesRuntimeConfig(BaseModel):
558558
gt=0,
559559
description="Polling interval in seconds when waiting for a sandbox to become ready after creation.",
560560
)
561+
snapshot_create_timeout_seconds: int = Field(
562+
default=15 * 60,
563+
ge=1,
564+
description=(
565+
"Timeout in seconds to wait for a Kubernetes public snapshot to become ready. "
566+
"Set this greater than the controller snapshot commit-job-timeout."
567+
),
568+
)
561569
execd_init_resources: Optional["ExecdInitResources"] = Field(
562570
default=None,
563571
description=(

server/opensandbox_server/examples/example.config.k8s.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,10 @@ workload_provider = "batchsandbox"
6464
# Values: "Always", "IfNotPresent", "Never".
6565
image_pull_policy = "IfNotPresent"
6666

67+
# Public snapshot wait timeout. Keep this greater than the Kubernetes
68+
# controller snapshot commit-job-timeout.
69+
snapshot_create_timeout_seconds = 900
70+
6771
# Path to the BatchSandbox template file
6872
batchsandbox_template_file = "~/batchsandbox-template.yaml"
6973

server/opensandbox_server/examples/example.config.k8s.zh.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,10 @@ workload_provider = "batchsandbox"
6464
# 可选值:"Always"、"IfNotPresent"、"Never"。
6565
image_pull_policy = "IfNotPresent"
6666

67+
# public snapshot 等待超时时间。应大于 Kubernetes controller 的
68+
# snapshot commit-job-timeout。
69+
snapshot_create_timeout_seconds = 900
70+
6771
# Path to the BatchSandbox template file
6872
# Replace with your path
6973
batchsandbox_template_file = "~/batchsandbox-template.yaml"

server/opensandbox_server/services/constants.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,12 @@ class SandboxErrorCodes:
114114
INVALID_STATE = "KUBERNETES::INVALID_STATE"
115115

116116

117+
class SnapshotErrorCodes:
    """Error code strings used by snapshot service operations."""

    # NOTE(review): presumably returned when the source sandbox is in a state
    # that does not allow a snapshot to be taken — confirm against callers.
    INVALID_SOURCE_STATE = "SNAPSHOT::INVALID_SOURCE_STATE"
121+
122+
117123
__all__ = [
118124
"RESERVED_LABEL_PREFIX",
119125
"SANDBOX_ID_LABEL",
@@ -135,4 +141,5 @@ class SandboxErrorCodes:
135141
"EGRESS_MODE_ENV",
136142
"OPENSANDBOX_EGRESS_TOKEN",
137143
"SandboxErrorCodes",
144+
"SnapshotErrorCodes",
138145
]

0 commit comments

Comments
 (0)