Skip to content

Commit b8de389

Browse files
committed
Fix clean up of old attributes when containers are not restarting
When a pod crashes while in the process of starting, the operator cleans up outdated attributes in the galera CR status. The operator wrongly assumes that it can probe a container's state as soon as it gets a pod object from the API server, which is not always true (e.g. when the pod is in the "Pending" state). Fix the attribute cleanup by always checking the state of the pod's container before inspecting its container ID. Jira: OSPRH-9411
1 parent 61d230f commit b8de389

1 file changed

Lines changed: 13 additions & 3 deletions

File tree

controllers/galera_controller.go

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,16 @@ func getRunningPodsMissingGcomm(ctx context.Context, pods []corev1.Pod, instance
197197
return
198198
}
199199

200+
// getGaleraContainerID retrieves the ContainerID of the galera container running in a pod.
//
// It walks pod.Status.ContainerStatuses looking for the entry whose Name is
// "galera". When such an entry exists it returns (true, <that entry's
// ContainerID>); the ID is returned as-is, whatever value the status holds.
// When no entry matches, including when ContainerStatuses is empty, it
// returns (false, "").
201+
func getGaleraContainerID(pod *corev1.Pod) (found bool, CID string) {
202+
for _, container := range pod.Status.ContainerStatuses {
203+
if container.Name == "galera" {
204+
return true, container.ContainerID
205+
}
206+
}
207+
return false, ""
208+
}
209+
200210
// isGaleraContainerStartedAndWaiting checks whether the galera container is waiting for a gcomm_uri file
201211
func isGaleraContainerStartedAndWaiting(ctx context.Context, pod *corev1.Pod, instance *mariadbv1.Galera, h *helper.Helper, config *rest.Config) bool {
202212
waiting := false
@@ -282,14 +292,14 @@ func assertPodsAttributesValidity(helper *helper.Helper, instance *mariadbv1.Gal
282292
// A node can have various attributes depending on its known state.
283293
// A ContainerID attribute is only present if the node is being started.
284294
attrCID := instance.Status.Attributes[pod.Name].ContainerID
285-
podCID := pod.Status.ContainerStatuses[0].ContainerID
286-
if attrCID != "" && attrCID != podCID {
295+
containerFound, podCID := getGaleraContainerID(&pod)
296+
if !containerFound || (attrCID != "" && attrCID != podCID) {
287297
// This gcomm URI was pushed in a pod which was restarted
288298
// before the attribute got cleared, which means the pod
289299
// failed to start galera. Clear the attribute here, and
290300
// reprobe the pod's state in the next reconcile loop
291301
clearPodAttributes(instance, pod.Name)
292-
util.LogForObject(helper, "Pod restarted while galera was starting", instance, "pod", pod.Name, "current pod ID", podCID, "recorded ID", attrCID)
302+
util.LogForObject(helper, "Pod restarted while galera was starting", instance, "pod", pod.Name, "recorded ID", attrCID)
293303
}
294304
}
295305
}

0 commit comments

Comments
 (0)