Skip to content

Commit e269dd2

Browse files
committed
fix: resolve readiness WaitReady blocking for 5 minutes on startup
Backport fixes from upstream apache/apisix-ingress-controller#2663.

Root cause: readiness.Start() is asynchronous. If a controller's reconcile loop calls Done() before Start() has finished registering resources, Done() finds no state entry and returns early. Once Start() then registers the resource, nothing removes it from state again, so WaitReady blocks until the 5-minute timeout.

Changes:
- Done() now waits for Start() to complete (<-r.started) before operating on state, eliminating the race condition
- WaitReady() returns false on timeout instead of true (semantic fix: timed-out != ready)
- Remove unnecessary mutex in registerState() since Done() is now guaranteed to run after Start() closes r.started
- Add log statements for easier debugging of readiness lifecycle
1 parent b02e842 commit e269dd2

1 file changed

Lines changed: 7 additions & 4 deletions

File tree

internal/manager/readiness/manager.go

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,8 @@ func (r *readinessManager) Start(ctx context.Context) error {
125125
})
126126
}
127127
if len(expected) > 0 {
128-
r.log.V(1).Info("registering readiness state", "gvk", gvk, "expected", expected)
128+
r.log.Info("registering readiness state", "gvk", gvk, "registered_count", len(expected))
129+
r.log.V(1).Info("registered resources for readiness", "gvk", gvk, "resources", expected)
129130
r.registerState(gvk, expected)
130131
}
131132
}
@@ -135,13 +136,12 @@ func (r *readinessManager) Start(ctx context.Context) error {
135136
r.isReady.Store(true)
136137
close(r.done)
137138
}
139+
r.log.Info("readiness manager started")
138140
})
139141
return err
140142
}
141143

142144
func (r *readinessManager) registerState(gvk schema.GroupVersionKind, list []k8stypes.NamespacedName) {
143-
r.mu.Lock()
144-
defer r.mu.Unlock()
145145
if _, ok := r.state[gvk]; !ok {
146146
r.state[gvk] = make(map[k8stypes.NamespacedName]struct{})
147147
}
@@ -155,9 +155,12 @@ func (r *readinessManager) Done(obj client.Object, nn k8stypes.NamespacedName) {
155155
if r.IsReady() {
156156
return
157157
}
158+
<-r.started
159+
158160
r.mu.Lock()
159161
defer r.mu.Unlock()
160162
gvk := types.GvkOf(obj)
163+
r.log.Info("marking resource as done", "gvk", gvk, "name", nn, "state_count", len(r.state[gvk]))
161164
if _, ok := r.state[gvk]; !ok {
162165
return
163166
}
@@ -191,7 +194,7 @@ func (r *readinessManager) WaitReady(ctx context.Context, timeout time.Duration)
191194
case <-ctx.Done():
192195
return false
193196
case <-time.After(timeout):
194-
return true
197+
return false
195198
case <-r.done:
196199
return true
197200
}

0 commit comments

Comments
 (0)