Skip to content

Commit 4d30611

Browse files
author
玖宇
committed
fix comments
1 parent 9ce36fd commit 4d30611

5 files changed

Lines changed: 296 additions & 379 deletions

File tree

doc/features/failure-handling.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,9 @@ spec:
5858
image: nginx:latest
5959
```
6060
61-
## Excluding Components from Restart Policy Trigger (CustomComponentsPattern)
61+
## Excluding Components from Restart Policy Trigger
6262
63-
When using `customComponentsPattern`, you can prevent specific components from triggering the role's restart policy by setting an annotation on the component's pod template. This is useful for auxiliary components (e.g., monitoring, logging sidecars) whose failures should not affect the main workload.
63+
You can prevent specific pods from triggering the role's restart policy by setting an annotation on the pod template. This is useful for auxiliary components (e.g., monitoring, logging sidecars) whose failures should not affect the main workload. The annotation works with any deployment pattern (standalonePattern, leaderWorkerPattern, or customComponentsPattern).
6464
6565
Set the annotation `rbg.workloads.x-k8s.io/restart-trigger-policy: "Ignore"` in the component's `template.metadata.annotations`:
6666

@@ -109,9 +109,9 @@ When this annotation is set to "Ignore" on a component's pod template:
109109
- **RecreateRBGOnPodRestart**: Gateway/router roles that require all downstream services to be healthy.
110110
- **RecreateRoleInstanceOnPodRestart**: Worker roles that can tolerate individual instance failures.
111111
- **None**: Monitoring/logging sidecars that don't affect the main workload.
112-
- **ignoreRestartPolicy**: Auxiliary components within customComponentsPattern that should be isolated from restart policy actions.
112+
- **restart-trigger-policy: "Ignore"** (annotation): Auxiliary components whose pod failures should not trigger the role's restart policy actions.
113113

114114
## Examples
115115

116116
- [Restart Policy Examples](../../examples/basic/rbg/restart-policy/restart-policy.yaml)
117-
- [CustomComponentsPattern with ignoreRestartPolicy](../../examples/basic/rbg/patterns/custom-components-pattern.yaml)
117+
- [CustomComponentsPattern with restart-trigger-policy annotation](../../examples/basic/rbg/patterns/custom-components-pattern.yaml)

examples/basic/rbg/patterns/custom-components-pattern.yaml

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,11 @@
55
#
66
# Annotation rbg.workloads.x-k8s.io/restart-trigger-policy:
77
# Valid values are "Inherit" (or empty) and "Ignore". When set to "Ignore" on a
8-
# component's pod template, this component's pod restart/delete events will NOT
9-
# trigger the role's restart policy (RecreateRoleInstanceOnPodRestart or
10-
# RecreateRBGOnPodRestart). This is useful for auxiliary components (e.g.,
11-
# monitoring, logging sidecars) whose failures should not affect the main workload.
8+
# pod template, pod restart/delete events from that pod will NOT trigger the
9+
# role's restart policy (RecreateRoleInstanceOnPodRestart or RecreateRBGOnPodRestart).
10+
# This annotation works with any deployment pattern and is useful for auxiliary
11+
# components (e.g., monitoring, logging sidecars) whose failures should not
12+
# affect the main workload.
1213
apiVersion: workloads.x-k8s.io/v1alpha2
1314
kind: RoleBasedGroup
1415
metadata:

internal/controller/workloads/pod_controller_test.go

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -632,6 +632,61 @@ func TestPodReconciler_podToRBG_EdgeCases(t *testing.T) {
632632
).Obj(),
633633
want: []reconcile.Request{},
634634
},
635+
{
636+
name: "Pod with restart-trigger-policy Ignore annotation should not trigger restart",
637+
args: args{
638+
ctx: context.TODO(),
639+
pod: func() *corev1.Pod {
640+
pod := wrappers.BuildDeletingPod().WithLabels(
641+
map[string]string{
642+
constants.GroupNameLabelKey: "test-rbg",
643+
constants.RoleNameLabelKey: "test-role",
644+
},
645+
).Obj()
646+
pod.Annotations = map[string]string{
647+
constants.RestartTriggerPolicyAnnotationKey: constants.RestartTriggerPolicyIgnore,
648+
}
649+
return pod
650+
}(),
651+
},
652+
setupRBG: wrappersv2.BuildBasicRoleBasedGroup("test-rbg", "default").
653+
WithRoles(
654+
[]workloadsv1alpha2.RoleSpec{
655+
wrappersv2.BuildStandaloneRole("test-role").
656+
WithRestartPolicy(workloadsv1alpha2.RecreateRBGOnPodRestart).
657+
Obj(),
658+
},
659+
).Obj(),
660+
want: []reconcile.Request{},
661+
},
662+
{
663+
name: "Pod without Ignore annotation triggers RecreateRBGOnPodRestart",
664+
args: args{
665+
ctx: context.TODO(),
666+
pod: wrappers.BuildDeletingPod().WithLabels(
667+
map[string]string{
668+
constants.GroupNameLabelKey: "test-rbg",
669+
constants.RoleNameLabelKey: "test-role",
670+
},
671+
).Obj(),
672+
},
673+
setupRBG: wrappersv2.BuildBasicRoleBasedGroup("test-rbg", "default").
674+
WithRoles(
675+
[]workloadsv1alpha2.RoleSpec{
676+
wrappersv2.BuildStandaloneRole("test-role").
677+
WithRestartPolicy(workloadsv1alpha2.RecreateRBGOnPodRestart).
678+
Obj(),
679+
},
680+
).Obj(),
681+
want: []reconcile.Request{
682+
{
683+
NamespacedName: types.NamespacedName{
684+
Name: "test-rbg",
685+
Namespace: "default",
686+
},
687+
},
688+
},
689+
},
635690
}
636691

637692
for _, tt := range tests {

pkg/reconciler/roleinstance/sync/instance_scale.go

Lines changed: 58 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
"sigs.k8s.io/controller-runtime/pkg/client"
2929
"sigs.k8s.io/rbgs/api/workloads/constants"
3030
portallocator "sigs.k8s.io/rbgs/pkg/port-allocator"
31+
"sigs.k8s.io/rbgs/pkg/utils"
3132

3233
workloadsv1alpha2 "sigs.k8s.io/rbgs/api/workloads/v1alpha2"
3334
podinplace "sigs.k8s.io/rbgs/pkg/inplace/pod"
@@ -213,22 +214,18 @@ func (c *realControl) createOnePod(ctx context.Context, instance *workloadsv1alp
213214
return nil
214215
}
215216

216-
// shouldRecreateInstance checks if the instance should be recreated (all pods deleted then recreated).
217-
// This applies when:
218-
// 1. restartPolicy = RecreateRoleInstanceOnPodRestart AND Pod Failed
219-
// - Instead of replacement Pod, recreate entire Instance for consistency
217+
// shouldRecreateInstance checks if the instance should be recreated based on RestartPolicy.
218+
// When RestartPolicy is RecreateInstanceOnPodRestart, the instance should be recreated if:
219+
// 1. Any container has restarted (RestartCount > 0)
220+
// 2. Any pod has been deleted (only when Instance was previously Ready)
220221
//
221-
// Note: This function does NOT handle container restart (Dimension 1).
222-
// Container restart with RecreateRoleInstanceOnPodRestart is handled by LWS controller,
223-
// RBG controller does nothing in that case.
222+
// Pods with the restart-trigger-policy annotation set to "Ignore" are excluded from
223+
// both checks — their restarts and deletions will not trigger instance recreation.
224224
//
225-
// For restartPolicy=None or RecreateRBGOnPodRestart, Pod Failed triggers replacement Pod
226-
// creation through normal reconciliation (GetActiveAndInactivePods → createPods).
227-
//
228-
// Per KEP Non-Goals: Succeeded pods are NOT handled here - they represent normal completion
229-
// and should not trigger Instance recreation.
225+
// Note: We use Instance's Ready condition to distinguish between initial creation/scaling
226+
// and pod deletion. This avoids infinite recreation loops.
230227
func shouldRecreateInstance(instance *workloadsv1alpha2.RoleInstance, pods []*v1.Pod) bool {
231-
// Only apply when restartPolicy is RecreateRoleInstanceOnPodRestart
228+
// Only check when RestartPolicy is RecreateInstanceOnPodRestart
232229
if instance.Spec.RestartPolicy != workloadsv1alpha2.RoleInstanceRestartPolicyRecreateOnPodRestart {
233230
return false
234231
}
@@ -238,27 +235,63 @@ func shouldRecreateInstance(instance *workloadsv1alpha2.RoleInstance, pods []*v1
238235
return false
239236
}
240237

241-
// Check if any Pod has become Failed (Dimension 2)
242-
// With RecreateRoleInstanceOnPodRestart policy, Pod Failed triggers Instance recreation
243-
// (instead of just replacement Pod)
244-
//
245-
// Per KEP Non-Goals: Succeeded pods are explicitly excluded - only Failed pods trigger recreation.
246-
// Pod being deleted (with DeletionTimestamp) is also excluded as it's handled separately.
238+
// Check each pod for container restart, skipping pods with Ignore annotation
239+
for _, pod := range pods {
240+
if hasTriggerPolicyIgnore(pod) {
241+
continue
242+
}
243+
if utils.ContainerRestarted(pod) {
244+
return true
245+
}
246+
}
247+
248+
// Check if any pod has been deleted:
249+
// Only trigger recreate if Instance was previously Ready (stable state)
250+
// and spec is not being changed (Generation == ObservedGeneration).
251+
// This avoids triggering recreate during initial creation or scaling up.
247252
if wasInstanceReady(instance) && instance.Generation == instance.Status.ObservedGeneration {
248-
// Check if any Pod is Failed (excluding Succeeded and pods being deleted)
253+
expectedPodCount := getExpectedPodCount(instance)
254+
ignoredExpectedCount := getIgnoredExpectedPodCount(instance)
255+
// Calculate active pods (exclude those being deleted and those with Ignore annotation).
256+
// Pods with DeletionTimestamp != nil are in Terminating state and should not
257+
// be counted as active, otherwise the controller may fail to detect pod deletion.
258+
activeCount := 0
249259
for _, p := range pods {
250-
// Only Failed pods trigger Instance recreation
251-
// Succeeded pods (normal completion) are explicitly excluded per KEP Non-Goals
252-
// Pods being deleted are excluded as they're handled through normal deletion flow
253-
if p.Status.Phase == v1.PodFailed && p.DeletionTimestamp == nil {
254-
return true
260+
if hasTriggerPolicyIgnore(p) {
261+
continue
255262
}
263+
if p.DeletionTimestamp == nil {
264+
activeCount++
265+
}
266+
}
267+
if activeCount < expectedPodCount-ignoredExpectedCount {
268+
return true
256269
}
257270
}
258271

259272
return false
260273
}
261274

275+
// hasTriggerPolicyIgnore returns true if the pod has the restart-trigger-policy
276+
// annotation set to "Ignore".
277+
func hasTriggerPolicyIgnore(pod *v1.Pod) bool {
278+
return pod.Annotations[constants.RestartTriggerPolicyAnnotationKey] == constants.RestartTriggerPolicyIgnore
279+
}
280+
281+
// getIgnoredExpectedPodCount calculates the expected pod count from components
282+
// whose templates have the restart-trigger-policy annotation set to "Ignore".
283+
func getIgnoredExpectedPodCount(instance *workloadsv1alpha2.RoleInstance) int {
284+
count := 0
285+
for _, component := range instance.Spec.Components {
286+
if component.Template.Annotations[constants.RestartTriggerPolicyAnnotationKey] == constants.RestartTriggerPolicyIgnore {
287+
if component.Size != nil {
288+
count += int(*component.Size)
289+
}
290+
}
291+
}
292+
return count
293+
}
294+
262295
// wasInstanceReady checks if the Instance was previously in Ready state
263296
func wasInstanceReady(instance *workloadsv1alpha2.RoleInstance) bool {
264297
for _, cond := range instance.Status.Conditions {

0 commit comments

Comments
 (0)