Skip to content

Commit ecdd9ea

Browse files
author
longsuizhi
committed
fix(allocator): address CR feedback - exclude released and terminal pods from live allocation
- P1: Released pods no longer count toward liveAllocated, preventing stale released entries from blocking subsequent re-allocations. - P2: Only Running+Ready pods are added to livePodSet, so terminal pods (Failed/Evicted) that still have their object present also trigger re-allocation.
1 parent d4b8e95 commit ecdd9ea

2 files changed

Lines changed: 95 additions & 15 deletions

File tree

kubernetes/internal/controller/allocator.go

Lines changed: 45 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -461,9 +461,19 @@ func (allocator *defaultAllocator) Schedule(ctx context.Context, spec *AllocSpec
461461
}
462462

463463
// Build a set of live pool pod names for dead-pod detection during allocation requests.
464+
// Running, Pending, and Unknown pods are considered live to avoid premature replacement.
465+
// Unknown phase (e.g. node partition) should not trigger re-allocation until Kubernetes
466+
// declares the pod terminal. Only truly terminal pods (Failed/Succeeded) or deleted pods
467+
// should trigger re-allocation.
464468
livePodSet := make(map[string]struct{}, len(spec.Pods))
469+
// allPodSet tracks all pods that still have an object (including terminal ones).
470+
// Used to distinguish "deleted" (object gone) from "terminal" (object exists but Failed/Succeeded).
471+
allPodSet := make(map[string]struct{}, len(spec.Pods))
465472
for _, p := range spec.Pods {
466-
livePodSet[p.Name] = struct{}{}
473+
allPodSet[p.Name] = struct{}{}
474+
if p.Status.Phase != corev1.PodFailed && p.Status.Phase != corev1.PodSucceeded {
475+
livePodSet[p.Name] = struct{}{}
476+
}
467477
}
468478

469479
// Fetch pool allocation once and reuse it for both stale-sandbox cleanup and available-pod filtering.
@@ -479,7 +489,7 @@ func (allocator *defaultAllocator) Schedule(ctx context.Context, spec *AllocSpec
479489
// handles them without any special-casing outside this function.
480490
// Terminating sandboxes are handled inside getSandboxRequest: they receive no new supplement and
481491
// all unreleased pods are queued for release.
482-
allRequest, err := allocator.getAllRequest(ctx, spec.Sandboxes, podAllocation, livePodSet)
492+
allRequest, err := allocator.getAllRequest(ctx, spec.Sandboxes, podAllocation, livePodSet, allPodSet)
483493
if err != nil {
484494
return nil, err
485495
}
@@ -500,13 +510,13 @@ func (allocator *defaultAllocator) Schedule(ctx context.Context, spec *AllocSpec
500510
// orphan entries for pods in podAllocation whose sandbox is no longer in the sandboxes list
501511
// (e.g. force-deleted). Orphan entries carry PodSupplement=0 and ToRelease set to the orphan
502512
// pods so the normal recycle path handles them without special-casing in the caller.
503-
func (allocator *defaultAllocator) getAllRequest(ctx context.Context, sandboxes []*sandboxv1alpha1.BatchSandbox, podAllocation map[string]string, livePodSet map[string]struct{}) ([]*algorithm.SandboxRequest, error) {
513+
func (allocator *defaultAllocator) getAllRequest(ctx context.Context, sandboxes []*sandboxv1alpha1.BatchSandbox, podAllocation map[string]string, livePodSet map[string]struct{}, allPodSet map[string]struct{}) ([]*algorithm.SandboxRequest, error) {
504514
log := logf.FromContext(ctx)
505515
existingSandboxes := make(map[string]struct{}, len(sandboxes))
506516
allRequest := make([]*algorithm.SandboxRequest, 0, len(sandboxes))
507517
for _, sandbox := range sandboxes {
508518
existingSandboxes[sandbox.Name] = struct{}{}
509-
request, err := allocator.getSandboxRequest(ctx, sandbox, livePodSet)
519+
request, err := allocator.getSandboxRequest(ctx, sandbox, livePodSet, allPodSet)
510520
if err != nil {
511521
return nil, err
512522
}
@@ -529,7 +539,7 @@ func (allocator *defaultAllocator) getAllRequest(ctx context.Context, sandboxes
529539
return allRequest, nil
530540
}
531541

532-
func (allocator *defaultAllocator) getSandboxRequest(ctx context.Context, sandbox *sandboxv1alpha1.BatchSandbox, livePodSet map[string]struct{}) (*algorithm.SandboxRequest, error) {
542+
func (allocator *defaultAllocator) getSandboxRequest(ctx context.Context, sandbox *sandboxv1alpha1.BatchSandbox, livePodSet map[string]struct{}, allPodSet map[string]struct{}) (*algorithm.SandboxRequest, error) {
533543
log := logf.FromContext(ctx)
534544
allocated, err := allocator.GetSandboxAllocation(ctx, sandbox)
535545
if err != nil {
@@ -545,21 +555,34 @@ func (allocator *defaultAllocator) getSandboxRequest(ctx context.Context, sandbo
545555
releasedSet[r] = struct{}{}
546556
}
547557

548-
// Filter out pods that no longer exist in the pool (e.g. externally deleted).
549-
// Dead pods are treated as released so the sandbox can receive replacement allocations.
558+
// Filter out pods that no longer exist in the pool (e.g. externally deleted) and
559+
// pods that have been released AND are no longer live. Only unreleased live pods
560+
// should count toward the effective allocation when computing supplement.
561+
// Released pods that are still alive keep their slot to avoid premature supplement
562+
// (the recycle flow will eventually remove them from alloc-status).
563+
// Terminal pods (Failed/Succeeded) that still exist are excluded from liveAllocated
564+
// but NOT added to deadPods/ToRelease — they should not go through the recycle path
565+
// (e.g. RestartRecycler rejects non-Running pods). Pool scaleDown handles their cleanup.
550566
liveAllocated := make([]string, 0, len(allocated))
551567
deadPods := make([]string, 0)
552568
for _, p := range allocated {
553-
if _, exists := releasedSet[p]; exists {
554-
// Already released, keep in allocated for bookkeeping consistency.
555-
liveAllocated = append(liveAllocated, p)
569+
_, isReleased := releasedSet[p]
570+
_, isAlive := livePodSet[p]
571+
_, exists := allPodSet[p]
572+
if isReleased && !isAlive {
573+
// Released and gone/terminal — no longer occupies a slot.
556574
continue
557575
}
558-
if _, alive := livePodSet[p]; alive {
559-
liveAllocated = append(liveAllocated, p)
560-
} else {
561-
deadPods = append(deadPods, p)
576+
if !isReleased && !isAlive {
577+
if !exists {
578+
// Pod object is completely gone — externally deleted, queue for release.
579+
deadPods = append(deadPods, p)
580+
}
581+
// If exists but terminal (Failed/Succeeded): don't count as live,
582+
// don't queue for release. Pool cleanup will handle the terminal pod.
583+
continue
562584
}
585+
liveAllocated = append(liveAllocated, p)
563586
}
564587
if len(deadPods) > 0 {
565588
log.Info("Detected dead allocated pods, queuing for release to trigger re-allocation",
@@ -594,13 +617,20 @@ func (allocator *defaultAllocator) getSandboxRequest(ctx context.Context, sandbo
594617
}
595618

596619
toRelease := make([]string, 0)
620+
toReleaseSet := make(map[string]struct{})
597621
for _, r := range release {
598622
if _, exists := releasedSet[r]; !exists {
599623
toRelease = append(toRelease, r)
624+
toReleaseSet[r] = struct{}{}
600625
}
601626
}
602627
// Also queue dead pods for release so their allocation records are cleaned up.
603-
toRelease = append(toRelease, deadPods...)
628+
// Deduplicate against pods already in alloc-release to avoid duplicate recycle operations.
629+
for _, dp := range deadPods {
630+
if _, exists := toReleaseSet[dp]; !exists {
631+
toRelease = append(toRelease, dp)
632+
}
633+
}
604634

605635
replica := int32(0)
606636
if sandbox.Spec.Replicas != nil {

kubernetes/internal/controller/allocator_test.go

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,56 @@ func TestSchedule(t *testing.T) {
259259
PodSupplement: 0,
260260
},
261261
},
262+
{
263+
name: "released pod does not block second re-allocation",
264+
spec: &AllocSpec{
265+
Pods: []*corev1.Pod{
266+
// pod1 was previously allocated and released (recycled).
267+
// pod2 was the replacement but is now also deleted.
268+
// pod3 is a new available pod.
269+
{ObjectMeta: metav1.ObjectMeta{Name: "pod3"}, Status: corev1.PodStatus{Phase: corev1.PodRunning, Conditions: []corev1.PodCondition{{Type: corev1.PodReady, Status: corev1.ConditionTrue}}}},
270+
},
271+
Pool: &sandboxv1alpha1.Pool{ObjectMeta: metav1.ObjectMeta{Name: "pool1"}},
272+
Sandboxes: []*sandboxv1alpha1.BatchSandbox{
273+
{ObjectMeta: metav1.ObjectMeta{Name: "sbx1"}, Spec: sandboxv1alpha1.BatchSandboxSpec{Replicas: &replica1}},
274+
},
275+
},
276+
poolAlloc: &PoolAllocation{PodAllocation: map[string]string{"pod1": "sbx1", "pod2": "sbx1"}},
277+
sandboxAllocs: map[string]*SandboxAllocation{"sbx1": {Pods: []string{"pod1", "pod2"}}},
278+
releases: map[string]*AllocationRelease{"sbx1": {Pods: []string{}}},
279+
released: map[string]*AllocationReleased{"sbx1": {Pods: []string{"pod1"}}},
280+
wantAction: &algorithm.AllocAction{
281+
ToAllocate: map[string][]string{"sbx1": {"pod3"}},
282+
ToRelease: map[string][]string{"sbx1": {"pod2"}},
283+
PodSupplement: 0,
284+
},
285+
},
286+
{
287+
name: "terminal pod (Failed) triggers re-allocation",
288+
spec: &AllocSpec{
289+
Pods: []*corev1.Pod{
290+
// pod1 exists but is in Failed state — should not count as live.
291+
{ObjectMeta: metav1.ObjectMeta{Name: "pod1"}, Status: corev1.PodStatus{Phase: corev1.PodFailed}},
292+
// pod2 is ready and available.
293+
{ObjectMeta: metav1.ObjectMeta{Name: "pod2"}, Status: corev1.PodStatus{Phase: corev1.PodRunning, Conditions: []corev1.PodCondition{{Type: corev1.PodReady, Status: corev1.ConditionTrue}}}},
294+
},
295+
Pool: &sandboxv1alpha1.Pool{ObjectMeta: metav1.ObjectMeta{Name: "pool1"}},
296+
Sandboxes: []*sandboxv1alpha1.BatchSandbox{
297+
{ObjectMeta: metav1.ObjectMeta{Name: "sbx1"}, Spec: sandboxv1alpha1.BatchSandboxSpec{Replicas: &replica1}},
298+
},
299+
},
300+
poolAlloc: &PoolAllocation{PodAllocation: map[string]string{"pod1": "sbx1"}},
301+
sandboxAllocs: map[string]*SandboxAllocation{"sbx1": {Pods: []string{"pod1"}}},
302+
releases: map[string]*AllocationRelease{"sbx1": {Pods: []string{}}},
303+
released: map[string]*AllocationReleased{"sbx1": {Pods: []string{}}},
304+
wantAction: &algorithm.AllocAction{
305+
// Terminal pod does not go through ToRelease (avoids restart recycle failure).
306+
// Pool scaleDown handles terminal pod cleanup separately.
307+
ToAllocate: map[string][]string{"sbx1": {"pod2"}},
308+
ToRelease: map[string][]string{},
309+
PodSupplement: 0,
310+
},
311+
},
262312
{
263313
name: "orphan sandbox - pods in store but sandbox no longer in spec",
264314
spec: &AllocSpec{

0 commit comments

Comments
 (0)