Skip to content

Commit fa16d25

Browse files
Merge pull request #318 from devtron-labs/hotfix-pre-post-cd-progress-status-fix
fix: handle force-deleted pods: add detection and handling logic for nodes associated with force-deleted pods and update related messaging
2 parents cb75c2a + 4fb0048 commit fa16d25

3 files changed

Lines changed: 46 additions & 15 deletions

File tree

kubewatch/pkg/informer/bean/bean.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ const (
3030

3131
ExitCode143Error = "Error (exit code 143)"
3232
NodeNoLongerExists = "PodGC: node no longer exists"
33+
NodeForceDeleted = "Pod was force deleted"
3334
UpdateEvent = "update_event"
3435
DeleteEvent = "delete_event"
3536

kubewatch/pkg/informer/cluster/systemExec/helper.go

Lines changed: 36 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,8 @@ func (impl *InformerImpl) getStopChannel(informerFactory kubeinformers.SharedInf
6060
func (impl *InformerImpl) checkIfPodDeletedAndUpdateMessage(podName, namespace string,
6161
nodeStatus v1alpha1.NodeStatus, config *rest.Config) (v1alpha1.NodeStatus, bool) {
6262

63-
if (nodeStatus.Phase == v1alpha1.NodeFailed || nodeStatus.Phase == v1alpha1.NodeError) && (nodeStatus.Message == bean.ExitCode143Error || nodeStatus.Message == bean.NodeNoLongerExists) {
63+
if (nodeStatus.Phase == v1alpha1.NodeFailed || nodeStatus.Phase == v1alpha1.NodeError) && (nodeStatus.Message == bean.ExitCode143Error || nodeStatus.Message == bean.NodeNoLongerExists ||
64+
nodeStatus.Message == bean.NodeForceDeleted) {
6465
clusterClient, k8sErr := impl.k8sUtil.GetK8sClientForConfig(config)
6566
if k8sErr != nil {
6667
return nodeStatus, false
@@ -84,20 +85,40 @@ func (impl *InformerImpl) checkIfPodDeletedAndUpdateMessage(podName, namespace s
8485

8586
func (impl *InformerImpl) assessNodeStatus(eventType string, pod *coreV1.Pod) v1alpha1.NodeStatus {
8687
nodeStatus := v1alpha1.NodeStatus{}
87-
switch pod.Status.Phase {
88-
case coreV1.PodPending:
89-
nodeStatus.Phase = v1alpha1.NodePending
90-
nodeStatus.Message = getPendingReason(pod)
91-
case coreV1.PodSucceeded:
92-
nodeStatus.Phase = v1alpha1.NodeSucceeded
93-
case coreV1.PodFailed:
94-
nodeStatus.Phase, nodeStatus.Message = impl.inferFailedReason(eventType, pod)
95-
impl.logger.Infof("Pod %s failed: %s", pod.Name, nodeStatus.Message)
96-
case coreV1.PodRunning:
97-
nodeStatus.Phase = v1alpha1.NodeRunning
98-
default:
99-
nodeStatus.Phase = v1alpha1.NodeError
100-
nodeStatus.Message = fmt.Sprintf("Unexpected pod phase for %s: %s", pod.ObjectMeta.Name, pod.Status.Phase)
88+
89+
/*
90+
Special handling for delete events in force-delete scenarios: Kubernetes does NOT guarantee that the pod status
91+
will be updated to "Failed" during force delete. Sometimes the pod phase can be "Running" even after force delete.
92+
Force deletion immediately removes the Pod object from the Kubernetes API server without waiting for
93+
the kubelet on the node to confirm termination.
94+
95+
If the application within the pod on the node is still running and hasn't received a termination signal or processed
96+
it yet, the container processes might continue to exist on the node even after the Pod object is gone from the API
97+
server. This can lead to a state where the pod effectively exists on the node, but Kubernetes no longer tracks it,
98+
and its container processes might still show up as running if you were to inspect the node's process list.
99+
*/
100+
if eventType == bean.DeleteEvent && isPodForceDeletedWhileRunning(pod) {
101+
// Force delete detected - treat as failed regardless of current phase
102+
impl.logger.Infow("Force delete detected for pod", "podName", pod.Name, "currentPhase", pod.Status.Phase, "deletionGracePeriod", *pod.DeletionGracePeriodSeconds)
103+
nodeStatus.Phase = v1alpha1.NodeFailed
104+
nodeStatus.Message = bean.NodeForceDeleted
105+
return nodeStatus
106+
} else {
107+
switch pod.Status.Phase {
108+
case coreV1.PodPending:
109+
nodeStatus.Phase = v1alpha1.NodePending
110+
nodeStatus.Message = getPendingReason(pod)
111+
case coreV1.PodSucceeded:
112+
nodeStatus.Phase = v1alpha1.NodeSucceeded
113+
case coreV1.PodFailed:
114+
nodeStatus.Phase, nodeStatus.Message = impl.inferFailedReason(eventType, pod)
115+
impl.logger.Infof("Pod %s failed: %s", pod.Name, nodeStatus.Message)
116+
case coreV1.PodRunning:
117+
nodeStatus.Phase = v1alpha1.NodeRunning
118+
default:
119+
nodeStatus.Phase = v1alpha1.NodeError
120+
nodeStatus.Message = fmt.Sprintf("Unexpected pod phase for %s: %s", pod.ObjectMeta.Name, pod.Status.Phase)
121+
}
101122
}
102123

103124
// only update Pod IP for daemoned nodes to reduce number of updates

kubewatch/pkg/informer/cluster/systemExec/util.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,15 @@ func isResourceNotFoundErr(err error) bool {
8282
return false
8383
}
8484

85+
// isPodForceDeletedWhileRunning checks if a pod was force deleted based on deletion metadata
86+
func isPodForceDeletedWhileRunning(pod *coreV1.Pod) bool {
87+
//For reference all pod phases that exists :- https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
88+
if (pod.Status.Phase == coreV1.PodRunning || pod.Status.Phase == coreV1.PodPending) && pod.DeletionTimestamp != nil && pod.DeletionGracePeriodSeconds != nil && *pod.DeletionGracePeriodSeconds == 0 {
89+
return true
90+
}
91+
return false
92+
}
93+
8594
func getWorkflowStatus(podObj *coreV1.Pod, nodeStatus v1alpha1.NodeStatus, templateName string) *informerBean.CiCdStatus {
8695
workflowStatus := &informerBean.CiCdStatus{
8796
WorkflowStatus: &v1alpha1.WorkflowStatus{},

0 commit comments

Comments
 (0)