Skip to content

Commit 53be88c

Browse files
author
longsuizhi
committed
fix(allocator): address CR feedback - exclude released and terminal pods from live allocation
- P1: Released pods no longer count toward liveAllocated, preventing stale released entries from blocking subsequent re-allocations. - P2: Only Running+Ready pods are added to livePodSet, so terminal pods (Failed/Evicted) that still have their object present also trigger re-allocation.
1 parent d4b8e95 commit 53be88c

2 files changed

Lines changed: 67 additions & 10 deletions

File tree

kubernetes/internal/controller/allocator.go

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -460,10 +460,14 @@ func (allocator *defaultAllocator) Schedule(ctx context.Context, spec *AllocSpec
460460
return nil, err
461461
}
462462

463-
// Build a set of live pool pod names for dead-pod detection during allocation requests.
463+
// Build a set of schedulable pool pod names for dead-pod detection during allocation requests.
464+
// Only Running+Ready pods are considered live; terminal pods (Failed/Evicted) that still have
465+
// their object present should not count as live allocations so the sandbox can be rebound.
464466
livePodSet := make(map[string]struct{}, len(spec.Pods))
465467
for _, p := range spec.Pods {
466-
livePodSet[p.Name] = struct{}{}
468+
if p.Status.Phase == corev1.PodRunning && utils.IsPodReady(p) {
469+
livePodSet[p.Name] = struct{}{}
470+
}
467471
}
468472

469473
// Fetch pool allocation once and reuse it for both stale-sandbox cleanup and available-pod filtering.
@@ -545,21 +549,26 @@ func (allocator *defaultAllocator) getSandboxRequest(ctx context.Context, sandbo
545549
releasedSet[r] = struct{}{}
546550
}
547551

548-
// Filter out pods that no longer exist in the pool (e.g. externally deleted).
549-
// Dead pods are treated as released so the sandbox can receive replacement allocations.
552+
// Filter out pods that no longer exist in the pool (e.g. externally deleted) and
553+
// pods that have been released AND are no longer live. Only unreleased live pods
554+
// should count toward the effective allocation when computing supplement.
555+
// Released pods that are still alive keep their slot to avoid premature supplement
556+
// (the recycle flow will eventually remove them from alloc-status).
550557
liveAllocated := make([]string, 0, len(allocated))
551558
deadPods := make([]string, 0)
552559
for _, p := range allocated {
553-
if _, exists := releasedSet[p]; exists {
554-
// Already released, keep in allocated for bookkeeping consistency.
555-
liveAllocated = append(liveAllocated, p)
560+
_, isReleased := releasedSet[p]
561+
_, isAlive := livePodSet[p]
562+
if isReleased && !isAlive {
563+
// Released and gone — no longer occupies a slot.
556564
continue
557565
}
558-
if _, alive := livePodSet[p]; alive {
559-
liveAllocated = append(liveAllocated, p)
560-
} else {
566+
if !isReleased && !isAlive {
567+
// Not released but gone — externally deleted, needs re-allocation.
561568
deadPods = append(deadPods, p)
569+
continue
562570
}
571+
liveAllocated = append(liveAllocated, p)
563572
}
564573
if len(deadPods) > 0 {
565574
log.Info("Detected dead allocated pods, queuing for release to trigger re-allocation",

kubernetes/internal/controller/allocator_test.go

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,54 @@ func TestSchedule(t *testing.T) {
259259
PodSupplement: 0,
260260
},
261261
},
262+
{
263+
name: "released pod does not block second re-allocation",
264+
spec: &AllocSpec{
265+
Pods: []*corev1.Pod{
266+
// pod1 was previously allocated and released (recycled).
267+
// pod2 was the replacement but is now also deleted.
268+
// pod3 is a new available pod.
269+
{ObjectMeta: metav1.ObjectMeta{Name: "pod3"}, Status: corev1.PodStatus{Phase: corev1.PodRunning, Conditions: []corev1.PodCondition{{Type: corev1.PodReady, Status: corev1.ConditionTrue}}}},
270+
},
271+
Pool: &sandboxv1alpha1.Pool{ObjectMeta: metav1.ObjectMeta{Name: "pool1"}},
272+
Sandboxes: []*sandboxv1alpha1.BatchSandbox{
273+
{ObjectMeta: metav1.ObjectMeta{Name: "sbx1"}, Spec: sandboxv1alpha1.BatchSandboxSpec{Replicas: &replica1}},
274+
},
275+
},
276+
poolAlloc: &PoolAllocation{PodAllocation: map[string]string{"pod1": "sbx1", "pod2": "sbx1"}},
277+
sandboxAllocs: map[string]*SandboxAllocation{"sbx1": {Pods: []string{"pod1", "pod2"}}},
278+
releases: map[string]*AllocationRelease{"sbx1": {Pods: []string{}}},
279+
released: map[string]*AllocationReleased{"sbx1": {Pods: []string{"pod1"}}},
280+
wantAction: &algorithm.AllocAction{
281+
ToAllocate: map[string][]string{"sbx1": {"pod3"}},
282+
ToRelease: map[string][]string{"sbx1": {"pod2"}},
283+
PodSupplement: 0,
284+
},
285+
},
286+
{
287+
name: "terminal pod (Failed) triggers re-allocation",
288+
spec: &AllocSpec{
289+
Pods: []*corev1.Pod{
290+
// pod1 exists but is in Failed state — should not count as live.
291+
{ObjectMeta: metav1.ObjectMeta{Name: "pod1"}, Status: corev1.PodStatus{Phase: corev1.PodFailed}},
292+
// pod2 is ready and available.
293+
{ObjectMeta: metav1.ObjectMeta{Name: "pod2"}, Status: corev1.PodStatus{Phase: corev1.PodRunning, Conditions: []corev1.PodCondition{{Type: corev1.PodReady, Status: corev1.ConditionTrue}}}},
294+
},
295+
Pool: &sandboxv1alpha1.Pool{ObjectMeta: metav1.ObjectMeta{Name: "pool1"}},
296+
Sandboxes: []*sandboxv1alpha1.BatchSandbox{
297+
{ObjectMeta: metav1.ObjectMeta{Name: "sbx1"}, Spec: sandboxv1alpha1.BatchSandboxSpec{Replicas: &replica1}},
298+
},
299+
},
300+
poolAlloc: &PoolAllocation{PodAllocation: map[string]string{"pod1": "sbx1"}},
301+
sandboxAllocs: map[string]*SandboxAllocation{"sbx1": {Pods: []string{"pod1"}}},
302+
releases: map[string]*AllocationRelease{"sbx1": {Pods: []string{}}},
303+
released: map[string]*AllocationReleased{"sbx1": {Pods: []string{}}},
304+
wantAction: &algorithm.AllocAction{
305+
ToAllocate: map[string][]string{"sbx1": {"pod2"}},
306+
ToRelease: map[string][]string{"sbx1": {"pod1"}},
307+
PodSupplement: 0,
308+
},
309+
},
262310
{
263311
name: "orphan sandbox - pods in store but sandbox no longer in spec",
264312
spec: &AllocSpec{

0 commit comments

Comments
 (0)