Skip to content

Commit b7ac94f

Browse files
authored
webhosting-operator: fix edge cases with missing status updates (#640)
* webhosting-operator: fix edge cases with missing status updates * webhosting-operator: drop special handling for conflict errors Rely on the usual exponential backoff and accept the noisy logs. * e2e: allow skipping cleanup on failure
1 parent c53a1f4 commit b7ac94f

3 files changed

Lines changed: 51 additions & 45 deletions

File tree

webhosting-operator/pkg/controllers/webhosting/common.go

Lines changed: 0 additions & 40 deletions
This file was deleted.

webhosting-operator/pkg/controllers/webhosting/website_controller.go

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -480,9 +480,13 @@ func (r *WebsiteReconciler) SetupWithManager(mgr manager.Manager, enableSharding
480480
}
481481

482482
var (
483-
// trigger on spec change and annotation changes (manual trigger for testing purposes)
484-
websitePredicate = predicate.Or(predicate.GenerationChangedPredicate{}, predicate.AnnotationChangedPredicate{})
485-
reconciler = SilenceConflicts(reconcile.AsReconciler[*webhostingv1alpha1.Website](r.Client, r))
483+
// trigger on spec, status, and annotation changes (manual trigger for testing purposes)
484+
websitePredicate = predicate.Or(
485+
predicate.GenerationChangedPredicate{},
486+
predicate.AnnotationChangedPredicate{},
487+
WebsiteStatusChanged,
488+
)
489+
reconciler = reconcile.AsReconciler[*webhostingv1alpha1.Website](r.Client, r)
486490
)
487491

488492
if enableSharding {
@@ -551,6 +555,31 @@ func (r *WebsiteReconciler) MapThemeToWebsites(ctx context.Context, theme client
551555
return requests
552556
}
553557

558+
// WebsiteStatusChanged is a predicate that triggers when the Website status changes.
559+
// The controller skips updating the status if it didn't change the cached object. In fast consecutive retries, this
560+
// can lead to a Website in Error state, where the controller doesn't observe its own status update, and skips the
561+
// transition to Ready afterward because the cached object was still in Ready state.
562+
// To fix this, we trigger the controller one more time when observing its own status updates to ensure a correct
563+
// status. This shouldn't hurt the standard case, because the controller doesn't cause any API calls if not needed.
564+
var WebsiteStatusChanged = predicate.Funcs{
565+
UpdateFunc: func(e event.UpdateEvent) bool {
566+
if e.ObjectOld == nil || e.ObjectNew == nil {
567+
return false
568+
}
569+
570+
oldWebsite, ok := e.ObjectOld.(*webhostingv1alpha1.Website)
571+
if !ok {
572+
return false
573+
}
574+
newWebsite, ok := e.ObjectNew.(*webhostingv1alpha1.Website)
575+
if !ok {
576+
return false
577+
}
578+
579+
return !apiequality.Semantic.DeepEqual(oldWebsite.Status, newWebsite.Status)
580+
},
581+
}
582+
554583
// DeploymentAvailabilityChanged is a predicate for filtering relevant Deployment events.
555584
var DeploymentAvailabilityChanged = predicate.Funcs{
556585
UpdateFunc: func(e event.UpdateEvent) bool {

webhosting-operator/test/e2e/e2e_suite_test.go

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package e2e
1818

1919
import (
2020
"context"
21+
"flag"
2122
"maps"
2223
"testing"
2324
"time"
@@ -42,6 +43,14 @@ import (
4243
webhostingv1alpha1 "github.com/timebertt/kubernetes-controller-sharding/webhosting-operator/pkg/apis/webhosting/v1alpha1"
4344
)
4445

46+
var skipCleanup bool
47+
48+
func TestMain(m *testing.M) {
49+
flag.BoolVar(&skipCleanup, "skip-cleanup", false, "Skip cleanup after test failure")
50+
flag.Parse()
51+
m.Run()
52+
}
53+
4554
func TestE2E(t *testing.T) {
4655
RegisterFailHandler(Fail)
4756
RunSpecs(t, "Webhosting Operator E2E Test Suite")
@@ -112,6 +121,10 @@ var _ = BeforeEach(func(ctx SpecContext) {
112121
log.Info("Created test Namespace", "namespace", namespace.Name)
113122

114123
DeferCleanup(func(ctx SpecContext) {
124+
if ctx.SpecReport().Failed() && skipCleanup {
125+
Skip("Leaving state of test failure because --skip-cleanup is set to true")
126+
}
127+
115128
By("Delete all Websites in test Namespace")
116129
Expect(testClient.DeleteAllOf(ctx, &webhostingv1alpha1.Website{}, client.InNamespace(namespace.Name))).To(Succeed())
117130

@@ -124,12 +137,16 @@ var _ = BeforeEach(func(ctx SpecContext) {
124137
By("Set up test ControllerRing")
125138
controllerRing = &shardingv1alpha1.ControllerRing{ObjectMeta: metav1.ObjectMeta{Name: webhostingv1alpha1.WebhostingOperatorName}}
126139

127-
By("Scaling checksum-controller")
140+
By("Scaling webhosting-operator")
128141
controllerDeployment = &appsv1.Deployment{ObjectMeta: metav1.ObjectMeta{Name: webhostingv1alpha1.WebhostingOperatorName, Namespace: webhostingv1alpha1.NamespaceSystem}}
129142
scaleController(ctx, 3)
130143

131144
DeferCleanup(func(ctx SpecContext) {
132-
By("Scaling checksum-controller")
145+
if ctx.SpecReport().Failed() && skipCleanup {
146+
Skip("Leaving state of test failure because --skip-cleanup is set to true")
147+
}
148+
149+
By("Scaling webhosting-operator")
133150
scaleController(ctx, 3)
134151
}, NodeTimeout(ShortTimeout))
135152

0 commit comments

Comments
 (0)