Skip to content

Commit d555e30

Browse files
authored
chore: Backport v20260428 (#1299)
Merging this PR to unblock progress. If there are any concerns, please let me know. Thanks 🙏
2 parents ece0d07 + 33edc67 commit d555e30

25 files changed

Lines changed: 1110 additions & 341 deletions

.github/workflows/codespell.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ jobs:
1212
runs-on: ubuntu-latest
1313
steps:
1414
- name: Harden Runner
15-
uses: step-security/harden-runner@6c3c2f2c1c457b00c10c4848d6f5491db3b629df # v2.18.0
15+
uses: step-security/harden-runner@8d3c67de8e2fe68ef647c8db1e6a09f647780f40 # v2.19.0
1616
with:
1717
egress-policy: audit
1818

.github/workflows/trivy.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ jobs:
6464
TAG: ${{ env.IMAGE_VERSION }}
6565

6666
- name: Scan ${{ env.REGISTRY }}/${{ env.HUB_AGENT_IMAGE_NAME }}:${{ env.IMAGE_VERSION }}
67-
uses: aquasecurity/trivy-action@57a97c7e7821a5776cebc9bb87c984fa69cba8f1 # v0.35.0
67+
uses: aquasecurity/trivy-action@ed142fd0673e97e23eac54620cfb913e5ce36c25 # v0.36.0
6868
with:
6969
image-ref: ${{ env.REGISTRY }}/${{ env.HUB_AGENT_IMAGE_NAME }}:${{ env.IMAGE_VERSION }}
7070
format: 'table'
@@ -80,7 +80,7 @@ jobs:
8080

8181

8282
- name: Scan ${{ env.REGISTRY }}/${{ env.MEMBER_AGENT_IMAGE_NAME }}:${{ env.IMAGE_VERSION }}
83-
uses: aquasecurity/trivy-action@57a97c7e7821a5776cebc9bb87c984fa69cba8f1 # v0.35.0
83+
uses: aquasecurity/trivy-action@ed142fd0673e97e23eac54620cfb913e5ce36c25 # v0.36.0
8484
with:
8585
image-ref: ${{ env.REGISTRY }}/${{ env.MEMBER_AGENT_IMAGE_NAME }}:${{ env.IMAGE_VERSION }}
8686
format: 'table'
@@ -95,7 +95,7 @@ jobs:
9595
TRIVY_DB_REPOSITORY: mcr.microsoft.com/mirror/ghcr/aquasecurity/trivy-db
9696

9797
- name: Scan ${{ env.REGISTRY }}/${{ env.REFRESH_TOKEN_IMAGE_NAME }}:${{ env.IMAGE_VERSION }}
98-
uses: aquasecurity/trivy-action@57a97c7e7821a5776cebc9bb87c984fa69cba8f1 # v0.35.0
98+
uses: aquasecurity/trivy-action@ed142fd0673e97e23eac54620cfb913e5ce36c25 # v0.36.0
9999
with:
100100
image-ref: ${{ env.REGISTRY }}/${{ env.REFRESH_TOKEN_IMAGE_NAME }}:${{ env.IMAGE_VERSION }}
101101
format: 'table'

pkg/controllers/updaterun/controller.go

Lines changed: 29 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -109,8 +109,12 @@ func (r *Reconciler) Reconcile(ctx context.Context, req runtime.Request) (runtim
109109
return runtime.Result{}, err
110110
}
111111

112+
// Track errors for metrics emission. The error is used to determine the failure type
113+
// (user_error vs internal_error) in the emitted metrics.
114+
var reconcileErr error
112115
// Emit the update run status metric based on status conditions in the updateRun.
113-
defer emitUpdateRunStatusMetric(updateRun)
116+
// Use a closure to capture reconcileErr by reference, so it reflects any updates made during reconciliation.
117+
defer func() { emitUpdateRunStatusMetric(updateRun, reconcileErr) }()
114118

115119
state := updateRun.GetUpdateRunSpec().State
116120

@@ -126,14 +130,13 @@ func (r *Reconciler) Reconcile(ctx context.Context, req runtime.Request) (runtim
126130
}
127131

128132
// Initialize the updateRun.
129-
var initErr error
130-
if toBeUpdatedBindings, toBeDeletedBindings, initErr = r.initialize(ctx, updateRun); initErr != nil {
131-
klog.ErrorS(initErr, "Failed to initialize the updateRun", "updateRun", runObjRef)
133+
if toBeUpdatedBindings, toBeDeletedBindings, reconcileErr = r.initialize(ctx, updateRun); reconcileErr != nil {
134+
klog.ErrorS(reconcileErr, "Failed to initialize the updateRun", "updateRun", runObjRef)
132135
// errStagedUpdatedAborted cannot be retried.
133-
if errors.Is(initErr, errStagedUpdatedAborted) {
134-
return runtime.Result{}, r.recordInitializationFailed(ctx, updateRun, initErr.Error())
136+
if errors.Is(reconcileErr, errStagedUpdatedAborted) {
137+
return runtime.Result{}, r.recordInitializationFailed(ctx, updateRun, reconcileErr.Error())
135138
}
136-
return runtime.Result{}, initErr
139+
return runtime.Result{}, reconcileErr
137140
}
138141
updatingStageIndex = 0 // start from the first stage (typically for Initialize or Run states).
139142
klog.V(2).InfoS("Initialized the updateRun", "state", state, "updateRun", runObjRef)
@@ -145,14 +148,14 @@ func (r *Reconciler) Reconcile(ctx context.Context, req runtime.Request) (runtim
145148
klog.V(2).InfoS("The updateRun is finished", "finishedSuccessfully", finishedCond.Status, "updateRun", runObjRef)
146149
return runtime.Result{}, nil
147150
}
148-
var validateErr error
149151
// Validate the updateRun status to ensure the update can be continued and get the updating stage index and cluster indices.
150-
if updatingStageIndex, toBeUpdatedBindings, toBeDeletedBindings, validateErr = r.validate(ctx, updateRun); validateErr != nil {
152+
if updatingStageIndex, toBeUpdatedBindings, toBeDeletedBindings, reconcileErr = r.validate(ctx, updateRun); reconcileErr != nil {
153+
klog.ErrorS(reconcileErr, "Failed to validate the updateRun", "updateRun", runObjRef)
151154
// errStagedUpdatedAborted cannot be retried.
152-
if errors.Is(validateErr, errStagedUpdatedAborted) {
153-
return runtime.Result{}, r.recordUpdateRunFailed(ctx, updateRun, validateErr.Error())
155+
if errors.Is(reconcileErr, errStagedUpdatedAborted) {
156+
return runtime.Result{}, r.recordUpdateRunFailed(ctx, updateRun, reconcileErr.Error())
154157
}
155-
return runtime.Result{}, validateErr
158+
return runtime.Result{}, reconcileErr
156159
}
157160
klog.V(2).InfoS("The updateRun is validated", "updateRun", runObjRef)
158161
}
@@ -163,45 +166,48 @@ func (r *Reconciler) Reconcile(ctx context.Context, req runtime.Request) (runtim
163166
return runtime.Result{}, r.recordUpdateRunSucceeded(ctx, updateRun)
164167
}
165168

169+
var finished bool
170+
var waitTime time.Duration
166171
switch state {
167172
case placementv1beta1.StateInitialize:
168173
klog.V(2).InfoS("The updateRun is initialized but not executed, waiting to execute", "state", state, "updateRun", runObjRef)
169174
case placementv1beta1.StateRun:
170175
// Execute the updateRun.
171176
klog.V(2).InfoS("Continue to execute the updateRun", "updatingStageIndex", updatingStageIndex, "updateRun", runObjRef)
172-
finished, waitTime, execErr := r.execute(ctx, updateRun, updatingStageIndex, toBeUpdatedBindings, toBeDeletedBindings)
173-
if errors.Is(execErr, errStagedUpdatedAborted) {
177+
finished, waitTime, reconcileErr = r.execute(ctx, updateRun, updatingStageIndex, toBeUpdatedBindings, toBeDeletedBindings)
178+
if errors.Is(reconcileErr, errStagedUpdatedAborted) {
174179
// errStagedUpdatedAborted cannot be retried.
175-
return runtime.Result{}, r.recordUpdateRunFailed(ctx, updateRun, execErr.Error())
180+
return runtime.Result{}, r.recordUpdateRunFailed(ctx, updateRun, reconcileErr.Error())
176181
}
177182

178183
if finished {
179184
klog.V(2).InfoS("The updateRun is completed", "updateRun", runObjRef)
180185
return runtime.Result{}, r.recordUpdateRunSucceeded(ctx, updateRun)
181186
}
182187

183-
return r.handleIncompleteUpdateRun(ctx, updateRun, waitTime, execErr, state, runObjRef)
188+
return r.handleIncompleteUpdateRun(ctx, updateRun, waitTime, reconcileErr, state, runObjRef)
184189
case placementv1beta1.StateStop:
185190
// Stop the updateRun.
186191
klog.V(2).InfoS("Stopping the updateRun", "state", state, "updatingStageIndex", updatingStageIndex, "updateRun", runObjRef)
187-
finished, waitTime, stopErr := r.stop(updateRun, updatingStageIndex, toBeUpdatedBindings, toBeDeletedBindings)
188-
if errors.Is(stopErr, errStagedUpdatedAborted) {
192+
finished, waitTime, reconcileErr = r.stop(updateRun, updatingStageIndex, toBeUpdatedBindings, toBeDeletedBindings)
193+
if errors.Is(reconcileErr, errStagedUpdatedAborted) {
189194
// errStagedUpdatedAborted cannot be retried.
190-
return runtime.Result{}, r.recordUpdateRunFailed(ctx, updateRun, stopErr.Error())
195+
return runtime.Result{}, r.recordUpdateRunFailed(ctx, updateRun, reconcileErr.Error())
191196
}
192197

193198
if finished {
194199
klog.V(2).InfoS("The updateRun is stopped", "updateRun", runObjRef)
195200
return runtime.Result{}, r.recordUpdateRunStopped(ctx, updateRun)
196201
}
197202

198-
return r.handleIncompleteUpdateRun(ctx, updateRun, waitTime, stopErr, state, runObjRef)
203+
return r.handleIncompleteUpdateRun(ctx, updateRun, waitTime, reconcileErr, state, runObjRef)
199204

200205
default:
201206
// Initialize, Run, and Stop are the only supported states.
202-
unexpectedErr := controller.NewUnexpectedBehaviorError(fmt.Errorf("found unsupported updateRun state: %s", state))
203-
klog.ErrorS(unexpectedErr, "Invalid updateRun state", "state", state, "updateRun", runObjRef)
204-
return runtime.Result{}, r.recordUpdateRunFailed(ctx, updateRun, unexpectedErr.Error())
207+
reconcileErr = controller.NewUnexpectedBehaviorError(fmt.Errorf("found unsupported updateRun state: %s", state))
208+
klog.ErrorS(reconcileErr, "Invalid updateRun state", "state", state, "updateRun", runObjRef)
209+
// This is an internal error - unsupported state should not happen
210+
return runtime.Result{}, r.recordUpdateRunFailed(ctx, updateRun, reconcileErr.Error())
205211
}
206212
return runtime.Result{}, nil
207213
}

pkg/controllers/updaterun/controller_integration_test.go

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -377,7 +377,7 @@ func generateApprovalStageTaskMetric(
377377
// the current updateRun state if the updateRun has transitioned since then.
378378
func generateMetricsLabels(
379379
updateRun *placementv1beta1.ClusterStagedUpdateRun,
380-
state, condition, status, reason string,
380+
state, condition, status, reason, failureType string,
381381
) []*prometheusclientmodel.LabelPair {
382382
return []*prometheusclientmodel.LabelPair{
383383
{Name: ptr.To("namespace"), Value: &updateRun.Namespace},
@@ -386,23 +386,24 @@ func generateMetricsLabels(
386386
{Name: ptr.To("condition"), Value: ptr.To(condition)},
387387
{Name: ptr.To("status"), Value: ptr.To(status)},
388388
{Name: ptr.To("reason"), Value: ptr.To(reason)},
389+
{Name: ptr.To("failureType"), Value: ptr.To(failureType)},
389390
}
390391
}
391392

392393
func generateInitializationSucceededMetric(state placementv1beta1.State, updateRun *placementv1beta1.ClusterStagedUpdateRun) *prometheusclientmodel.Metric {
393394
return &prometheusclientmodel.Metric{
394395
Label: generateMetricsLabels(updateRun, string(state), string(placementv1beta1.StagedUpdateRunConditionInitialized),
395-
string(metav1.ConditionTrue), condition.UpdateRunInitializeSucceededReason),
396+
string(metav1.ConditionTrue), condition.UpdateRunInitializeSucceededReason, string(hubmetrics.UpdateRunFailureTypeNone)),
396397
Gauge: &prometheusclientmodel.Gauge{
397398
Value: ptr.To(float64(time.Now().UnixNano()) / 1e9),
398399
},
399400
}
400401
}
401402

402-
func generateInitializationFailedMetric(state placementv1beta1.State, updateRun *placementv1beta1.ClusterStagedUpdateRun) *prometheusclientmodel.Metric {
403+
func generateInitializationFailedMetric(state placementv1beta1.State, updateRun *placementv1beta1.ClusterStagedUpdateRun, failureType string) *prometheusclientmodel.Metric {
403404
return &prometheusclientmodel.Metric{
404405
Label: generateMetricsLabels(updateRun, string(state), string(placementv1beta1.StagedUpdateRunConditionInitialized),
405-
string(metav1.ConditionFalse), condition.UpdateRunInitializeFailedReason),
406+
string(metav1.ConditionFalse), condition.UpdateRunInitializeFailedReason, failureType),
406407
Gauge: &prometheusclientmodel.Gauge{
407408
Value: ptr.To(float64(time.Now().UnixNano()) / 1e9),
408409
},
@@ -412,7 +413,7 @@ func generateInitializationFailedMetric(state placementv1beta1.State, updateRun
412413
func generateProgressingMetric(state placementv1beta1.State, updateRun *placementv1beta1.ClusterStagedUpdateRun) *prometheusclientmodel.Metric {
413414
return &prometheusclientmodel.Metric{
414415
Label: generateMetricsLabels(updateRun, string(state), string(placementv1beta1.StagedUpdateRunConditionProgressing),
415-
string(metav1.ConditionTrue), condition.UpdateRunProgressingReason),
416+
string(metav1.ConditionTrue), condition.UpdateRunProgressingReason, string(hubmetrics.UpdateRunFailureTypeNone)),
416417
Gauge: &prometheusclientmodel.Gauge{
417418
Value: ptr.To(float64(time.Now().UnixNano()) / 1e9),
418419
},
@@ -422,7 +423,7 @@ func generateProgressingMetric(state placementv1beta1.State, updateRun *placemen
422423
func generateWaitingMetric(state placementv1beta1.State, updateRun *placementv1beta1.ClusterStagedUpdateRun) *prometheusclientmodel.Metric {
423424
return &prometheusclientmodel.Metric{
424425
Label: generateMetricsLabels(updateRun, string(state), string(placementv1beta1.StagedUpdateRunConditionProgressing),
425-
string(metav1.ConditionFalse), condition.UpdateRunWaitingReason),
426+
string(metav1.ConditionFalse), condition.UpdateRunWaitingReason, string(hubmetrics.UpdateRunFailureTypeNone)),
426427
Gauge: &prometheusclientmodel.Gauge{
427428
Value: ptr.To(float64(time.Now().UnixNano()) / 1e9),
428429
},
@@ -432,17 +433,17 @@ func generateWaitingMetric(state placementv1beta1.State, updateRun *placementv1b
432433
func generateStuckMetric(state placementv1beta1.State, updateRun *placementv1beta1.ClusterStagedUpdateRun) *prometheusclientmodel.Metric {
433434
return &prometheusclientmodel.Metric{
434435
Label: generateMetricsLabels(updateRun, string(state), string(placementv1beta1.StagedUpdateRunConditionProgressing),
435-
string(metav1.ConditionFalse), condition.UpdateRunStuckReason),
436+
string(metav1.ConditionFalse), condition.UpdateRunStuckReason, string(hubmetrics.UpdateRunFailureTypeInternalError)),
436437
Gauge: &prometheusclientmodel.Gauge{
437438
Value: ptr.To(float64(time.Now().UnixNano()) / 1e9),
438439
},
439440
}
440441
}
441442

442-
func generateFailedMetric(state placementv1beta1.State, updateRun *placementv1beta1.ClusterStagedUpdateRun) *prometheusclientmodel.Metric {
443+
func generateFailedMetric(state placementv1beta1.State, updateRun *placementv1beta1.ClusterStagedUpdateRun, failureType string) *prometheusclientmodel.Metric {
443444
return &prometheusclientmodel.Metric{
444445
Label: generateMetricsLabels(updateRun, string(state), string(placementv1beta1.StagedUpdateRunConditionSucceeded),
445-
string(metav1.ConditionFalse), condition.UpdateRunFailedReason),
446+
string(metav1.ConditionFalse), condition.UpdateRunFailedReason, failureType),
446447
Gauge: &prometheusclientmodel.Gauge{
447448
Value: ptr.To(float64(time.Now().UnixNano()) / 1e9),
448449
},
@@ -452,7 +453,7 @@ func generateFailedMetric(state placementv1beta1.State, updateRun *placementv1be
452453
func generateStoppingMetric(state placementv1beta1.State, updateRun *placementv1beta1.ClusterStagedUpdateRun) *prometheusclientmodel.Metric {
453454
return &prometheusclientmodel.Metric{
454455
Label: generateMetricsLabels(updateRun, string(state), string(placementv1beta1.StagedUpdateRunConditionProgressing),
455-
string(metav1.ConditionUnknown), condition.UpdateRunStoppingReason),
456+
string(metav1.ConditionUnknown), condition.UpdateRunStoppingReason, string(hubmetrics.UpdateRunFailureTypeNone)),
456457
Gauge: &prometheusclientmodel.Gauge{
457458
Value: ptr.To(float64(time.Now().UnixNano()) / 1e9),
458459
},
@@ -462,7 +463,7 @@ func generateStoppingMetric(state placementv1beta1.State, updateRun *placementv1
462463
func generateStoppedMetric(state placementv1beta1.State, updateRun *placementv1beta1.ClusterStagedUpdateRun) *prometheusclientmodel.Metric {
463464
return &prometheusclientmodel.Metric{
464465
Label: generateMetricsLabels(updateRun, string(state), string(placementv1beta1.StagedUpdateRunConditionProgressing),
465-
string(metav1.ConditionFalse), condition.UpdateRunStoppedReason),
466+
string(metav1.ConditionFalse), condition.UpdateRunStoppedReason, string(hubmetrics.UpdateRunFailureTypeNone)),
466467
Gauge: &prometheusclientmodel.Gauge{
467468
Value: ptr.To(float64(time.Now().UnixNano()) / 1e9),
468469
},
@@ -472,7 +473,7 @@ func generateStoppedMetric(state placementv1beta1.State, updateRun *placementv1b
472473
func generateSucceededMetric(state placementv1beta1.State, updateRun *placementv1beta1.ClusterStagedUpdateRun) *prometheusclientmodel.Metric {
473474
return &prometheusclientmodel.Metric{
474475
Label: generateMetricsLabels(updateRun, string(state), string(placementv1beta1.StagedUpdateRunConditionSucceeded),
475-
string(metav1.ConditionTrue), condition.UpdateRunSucceededReason),
476+
string(metav1.ConditionTrue), condition.UpdateRunSucceededReason, string(hubmetrics.UpdateRunFailureTypeNone)),
476477
Gauge: &prometheusclientmodel.Gauge{
477478
Value: ptr.To(float64(time.Now().UnixNano()) / 1e9),
478479
},
@@ -963,16 +964,16 @@ func generateFalseCondition(obj client.Object, condType any) metav1.Condition {
963964
}
964965
}
965966

966-
func generateFalseProgressingCondition(obj client.Object, condType any, reason string) metav1.Condition {
967+
func generateFalseConditionWithReason(obj client.Object, condType any, reason string) metav1.Condition {
967968
falseCond := generateFalseCondition(obj, condType)
968969
falseCond.Reason = reason
969970
return falseCond
970971
}
971972

972-
func generateFalseConditionWithReason(obj client.Object, condType any, reason string) metav1.Condition {
973-
falseCond := generateFalseCondition(obj, condType)
974-
falseCond.Reason = reason
975-
return falseCond
973+
func generateTrueConditionWithReason(obj client.Object, condType any, reason string) metav1.Condition {
974+
trueCond := generateTrueCondition(obj, condType)
975+
trueCond.Reason = reason
976+
return trueCond
976977
}
977978

978979
func generateProgressingUnknownConditionWithReason(obj client.Object, reason string) metav1.Condition {

pkg/controllers/updaterun/execution.go

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,13 @@ func (r *Reconciler) execute(
7676
markUpdateRunProgressingIfNotWaitingOrStuck(updateRun)
7777
if updatingStageIndex < len(updateRunStatus.StagesStatus) {
7878
updatingStageStatus = &updateRunStatus.StagesStatus[updatingStageIndex]
79+
// Skip the entire stage when there are 0 clusters.
80+
if len(updatingStageStatus.Clusters) == 0 {
81+
klog.V(2).InfoS("The stage has 0 clusters, skipping the entire stage", "stage", updatingStageStatus.StageName, "updateRun", klog.KObj(updateRun))
82+
markStageUpdatingSkippedNoClusters(updatingStageStatus, updateRun.GetGeneration(), "Stage skipped because it has no clusters")
83+
// No need to wait to get to the next stage.
84+
return false, 0, nil
85+
}
7986
approved, err := r.checkBeforeStageTasksStatus(ctx, updatingStageIndex, updateRun)
8087
if err != nil {
8188
return false, 0, err
@@ -249,7 +256,7 @@ func (r *Reconciler) executeUpdatingStage(
249256
"bindingSpecInSync", inSync, "bindingState", bindingSpec.State,
250257
"bindingRolloutStarted", rolloutStarted, "binding", klog.KObj(binding), "updateRun", updateRunRef)
251258
markClusterUpdatingFailed(clusterStatus, updateRun.GetGeneration(), preemptedErr.Error())
252-
clusterUpdateErrors = append(clusterUpdateErrors, fmt.Errorf("%w: %s", errStagedUpdatedAborted, preemptedErr.Error()))
259+
clusterUpdateErrors = append(clusterUpdateErrors, fmt.Errorf("%w: %w", errStagedUpdatedAborted, preemptedErr))
253260
continue
254261
}
255262

@@ -773,6 +780,30 @@ func markStageUpdatingSucceeded(stageUpdatingStatus *placementv1beta1.StageUpdat
773780
})
774781
}
775782

783+
// markStageUpdatingSkippedNoClusters marks, in memory, the stage updating status as skipped because the stage has no clusters.
784+
func markStageUpdatingSkippedNoClusters(stageUpdatingStatus *placementv1beta1.StageUpdatingStatus, generation int64, message string) {
785+
if stageUpdatingStatus.StartTime == nil {
786+
stageUpdatingStatus.StartTime = &metav1.Time{Time: time.Now()}
787+
}
788+
if stageUpdatingStatus.EndTime == nil {
789+
stageUpdatingStatus.EndTime = &metav1.Time{Time: time.Now()}
790+
}
791+
meta.SetStatusCondition(&stageUpdatingStatus.Conditions, metav1.Condition{
792+
Type: string(placementv1beta1.StageUpdatingConditionProgressing),
793+
Status: metav1.ConditionFalse,
794+
ObservedGeneration: generation,
795+
Reason: condition.StageUpdatingSkippedNoClustersReason,
796+
Message: message,
797+
})
798+
meta.SetStatusCondition(&stageUpdatingStatus.Conditions, metav1.Condition{
799+
Type: string(placementv1beta1.StageUpdatingConditionSucceeded),
800+
Status: metav1.ConditionTrue,
801+
ObservedGeneration: generation,
802+
Reason: condition.StageUpdatingSkippedNoClustersReason,
803+
Message: message,
804+
})
805+
}
806+
776807
// markStageUpdatingFailed marks the stage updating status as failed in memory.
777808
func markStageUpdatingFailed(stageUpdatingStatus *placementv1beta1.StageUpdatingStatus, generation int64, message string) {
778809
if stageUpdatingStatus.EndTime == nil {

0 commit comments

Comments
 (0)