Skip to content

Commit 453a039

Browse files
committed
refactor(kubernetes): optimize pod recycling logic and cleanup tests
- Add support for customizing restart timeout via annotation in docs
- Replace PodsToRecycle slice with RecyclingPods set for efficient lookup
- Refactor allocator to skip pods currently recycling in allocation logic
- Remove deprecated canAllocate function and simplify recycling checks
- Update pool reconciler to collect recycling pods and handle recycling in batch
- Refactor handlePodRecycle to process multiple pods and aggregate errors
- Enhance restart tracker logging with timeout and elapsed time details
- Remove outdated controller unit tests and reduce e2e test scope for simplicity
1 parent 8aa745a commit 453a039

11 files changed

Lines changed: 84 additions & 1468 deletions

File tree

kubernetes/README-ZH.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ Pool 自定义资源维护一个预热的计算资源池,以实现快速沙箱
2929
- **池容量限制**:通过池范围的最小和最大限制来控制总体资源消耗。
3030
- **回收策略 (Recycle Policies)**:支持不同的 Pod 回收策略:
3131
- **Delete (默认)**:Pod 在返回池时会被删除并根据模板重新创建,确保环境绝对纯净。
32-
- **Restart**:通过向所有容器的 PID 1 发送 SIGTERM 信号优雅终止进程,并依赖 Kubernetes 的 `restartPolicy` 触发重启。这种方式比 `Delete` 更快,但要求 `PodTemplateSpec` 中的 `restartPolicy` 设置为 `Always`
32+
- **Restart**:通过向所有容器的 PID 1 发送 SIGTERM 信号优雅终止进程,并依赖 Kubernetes 的 `restartPolicy` 触发重启。这种方式比 `Delete` 更快,但要求 `PodTemplateSpec` 中的 `restartPolicy` 设置为 `Always`。可通过 annotation `pool.opensandbox.io/recycle-timeout-sec` 自定义重启超时时间(默认 90 秒)。
3333
- **自动扩展**:基于当前需求和缓冲区设置进行动态资源分配和释放。
3434
- **实时状态监控**:显示总数、已分配、可用以及正在重启中的 Pod 数量。
3535

kubernetes/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ The Pool custom resource maintains a pool of pre-warmed compute resources to ena
2929
- **Pool Capacity Limits**: Overall resource consumption control with pool-wide minimum and maximum limits.
3030
- **Recycle Policies**: Support for different pod recycling strategies:
3131
- **Delete (Default)**: Pods are deleted and recreated from the template when returned to the pool, ensuring a completely clean environment.
32-
- **Restart**: PID 1 in all containers is gracefully terminated (SIGTERM), and the Kubernetes `restartPolicy` triggers a restart. This is faster than `Delete` but requires the `restartPolicy` in `PodTemplateSpec` to be set to `Always`.
32+
- **Restart**: PID 1 in all containers is gracefully terminated (SIGTERM), and the Kubernetes `restartPolicy` triggers a restart. This is faster than `Delete` but requires the `restartPolicy` in `PodTemplateSpec` to be set to `Always`. The restart timeout can be customized per-pool via the annotation `pool.opensandbox.io/recycle-timeout-sec` (default: 90s).
3333
- **Automatic Scaling**: Dynamic resource allocation and deallocation based on current demand and buffer settings.
3434
- **Real-time Status**: Monitoring of total, allocated, available, and restarting pods.
3535

kubernetes/internal/controller/allocator.go

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"strconv"
2424

2525
corev1 "k8s.io/api/core/v1"
26+
"k8s.io/apimachinery/pkg/util/sets"
2627
"sigs.k8s.io/controller-runtime/pkg/client"
2728
logf "sigs.k8s.io/controller-runtime/pkg/log"
2829

@@ -177,15 +178,15 @@ type AllocSpec struct {
177178
Pool *sandboxv1alpha1.Pool
178179
// all pods of pool
179180
Pods []*corev1.Pod
181+
182+
RecyclingPods sets.Set[string]
180183
}
181184

182185
type AllocStatus struct {
183186
// PodAllocation maps pod name to sandbox name for currently allocated pods.
184187
PodAllocation map[string]string
185188
// PodSupplement is the number of additional pods needed to meet sandbox demands.
186189
PodSupplement int32
187-
// PoolReconciler handles the actual recycle logic (delete or restart).
188-
PodsToRecycle []string
189190
}
190191

191192
type SandboxSyncInfo struct {
@@ -223,7 +224,11 @@ func (allocator *defaultAllocator) Schedule(ctx context.Context, spec *AllocSpec
223224
if _, ok := status.PodAllocation[pod.Name]; ok {
224225
continue
225226
}
226-
if !canAllocate(pod) {
227+
if spec.RecyclingPods.Has(pod.Name) {
228+
continue
229+
}
230+
// Exclude pods that are still recycling (have deallocated-from label or recycle-meta annotation)
231+
if isRecycling(pod) {
227232
continue
228233
}
229234
if pod.Status.Phase != corev1.PodRunning {
@@ -271,7 +276,6 @@ func (allocator *defaultAllocator) initAllocation(ctx context.Context, spec *All
271276
var err error
272277
status := &AllocStatus{
273278
PodAllocation: make(map[string]string),
274-
PodsToRecycle: make([]string, 0),
275279
}
276280
status.PodAllocation, err = allocator.getPodAllocation(ctx, spec.Pool)
277281
if err != nil {
@@ -391,7 +395,6 @@ func (allocator *defaultAllocator) deallocate(ctx context.Context, status *Alloc
391395
for _, pod := range pods {
392396
delete(status.PodAllocation, pod)
393397
poolDeallocate = true
394-
status.PodsToRecycle = append(status.PodsToRecycle, pod)
395398
}
396399
delete(sandboxToPods, name)
397400
}
@@ -413,7 +416,6 @@ func (allocator *defaultAllocator) doDeallocate(ctx context.Context, status *All
413416
if _, ok := status.PodAllocation[pod]; ok {
414417
delete(status.PodAllocation, pod)
415418
deallocate = true
416-
status.PodsToRecycle = append(status.PodsToRecycle, pod)
417419
}
418420
}
419421
pods := make([]string, 0)

kubernetes/internal/controller/allocator_test.go

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,6 @@ func TestAllocatorSchedule(t *testing.T) {
109109
"pod2": "sbx2",
110110
},
111111
PodSupplement: 0,
112-
PodsToRecycle: []string{},
113112
},
114113
},
115114
{
@@ -173,7 +172,6 @@ func TestAllocatorSchedule(t *testing.T) {
173172
"pod1": "sbx1",
174173
},
175174
PodSupplement: 1,
176-
PodsToRecycle: []string{},
177175
},
178176
},
179177
{
@@ -241,7 +239,6 @@ func TestAllocatorSchedule(t *testing.T) {
241239
"pod2": "sbx1",
242240
},
243241
PodSupplement: 0,
244-
PodsToRecycle: []string{},
245242
},
246243
},
247244
{
@@ -296,7 +293,6 @@ func TestAllocatorSchedule(t *testing.T) {
296293
wantStatus: &AllocStatus{
297294
PodAllocation: map[string]string{},
298295
PodSupplement: 0,
299-
PodsToRecycle: []string{"pod1"},
300296
},
301297
},
302298
{
@@ -363,7 +359,6 @@ func TestAllocatorSchedule(t *testing.T) {
363359
"pod-normal": "sbx1",
364360
},
365361
PodSupplement: 1, // sbx2 needs a pod but only normal pod available
366-
PodsToRecycle: []string{},
367362
},
368363
},
369364
}

kubernetes/internal/controller/apis.go

Lines changed: 4 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -87,38 +87,12 @@ func setPodRecycleMeta(obj metav1.Object, meta *PodRecycleMeta) {
8787
obj.GetAnnotations()[AnnoPodRecycleMeta] = utils.DumpJSON(meta)
8888
}
8989

90-
// canAllocate checks if a pod is eligible for allocation.
91-
// A pod can be allocated if:
92-
// 1. No deallocated-from label (normal pod), OR
93-
// 2. Has recycle-confirmed label AND no recycle-meta annotation (recycling completed)
94-
func canAllocate(pod *corev1.Pod) bool {
95-
deallocatedFrom := pod.Labels[LabelPodDeallocatedFrom]
96-
if deallocatedFrom == "" {
97-
return true // Normal pod, no deallocation marker
98-
}
99-
100-
// Has deallocated-from, check if recycling is confirmed and completed
101-
recycleConfirmed := pod.Labels[LabelPodRecycleConfirmed]
102-
meta := pod.Annotations[AnnoPodRecycleMeta]
103-
104-
// Can allocate only if recycling is confirmed AND not in restarting state
105-
return recycleConfirmed != "" && meta == ""
90+
func isRestarting(pod *corev1.Pod) bool {
91+
return pod.Annotations[AnnoPodRecycleMeta] != ""
10692
}
10793

108-
func isRestarting(pod *corev1.Pod) bool {
109-
// - recycle-confirmed is set when restart starts
110-
// - recycle-confirmed is KEPT as a receipt after restart completes
111-
// - recycle-meta is cleared when restart completes
112-
meta := pod.Annotations[AnnoPodRecycleMeta]
113-
if meta == "" {
114-
return false
115-
}
116-
// Parse to verify it's in Restarting state (not just stale data)
117-
var recycleMeta PodRecycleMeta
118-
if err := json.Unmarshal([]byte(meta), &recycleMeta); err != nil {
119-
return false
120-
}
121-
return recycleMeta.State == RecycleStateRestarting
94+
func isRecycling(pod *corev1.Pod) bool {
95+
return pod.Labels[LabelPodDeallocatedFrom] != "" || pod.Annotations[AnnoPodRecycleMeta] != ""
12296
}
12397

12498
// AnnotationSandboxEndpoints Use the exported constant from pkg/utils

kubernetes/internal/controller/apis_test.go

Lines changed: 0 additions & 101 deletions
This file was deleted.

0 commit comments

Comments
 (0)