Merge pull request #318 from devtron-labs/hotfix-pre-post-cd-progress-status-fix

prakash100198 · web-flow · commit fa16d25960c3 · 2025-08-01T13:23:44.000+05:30
fix: handle force-deleted pods: add detection and handling logic for nodes associated with force-deleted pods and update related messaging
diff --git a/kubewatch/pkg/informer/bean/bean.go b/kubewatch/pkg/informer/bean/bean.go
@@ -30,6 +30,7 @@ const (
 
 	ExitCode143Error   = "Error (exit code 143)"
 	NodeNoLongerExists = "PodGC: node no longer exists"
+	NodeForceDeleted   = "Pod was force deleted"
 	UpdateEvent        = "update_event"
 	DeleteEvent        = "delete_event"
 
diff --git a/kubewatch/pkg/informer/cluster/systemExec/helper.go b/kubewatch/pkg/informer/cluster/systemExec/helper.go
@@ -60,7 +60,8 @@ func (impl *InformerImpl) getStopChannel(informerFactory kubeinformers.SharedInf
 func (impl *InformerImpl) checkIfPodDeletedAndUpdateMessage(podName, namespace string,
 	nodeStatus v1alpha1.NodeStatus, config *rest.Config) (v1alpha1.NodeStatus, bool) {
 
-	if (nodeStatus.Phase == v1alpha1.NodeFailed || nodeStatus.Phase == v1alpha1.NodeError) && (nodeStatus.Message == bean.ExitCode143Error || nodeStatus.Message == bean.NodeNoLongerExists) {
+	if (nodeStatus.Phase == v1alpha1.NodeFailed || nodeStatus.Phase == v1alpha1.NodeError) && (nodeStatus.Message == bean.ExitCode143Error || nodeStatus.Message == bean.NodeNoLongerExists ||
+		nodeStatus.Message == bean.NodeForceDeleted) {
 		clusterClient, k8sErr := impl.k8sUtil.GetK8sClientForConfig(config)
 		if k8sErr != nil {
 			return nodeStatus, false
@@ -84,20 +85,40 @@ func (impl *InformerImpl) checkIfPodDeletedAndUpdateMessage(podName, namespace s
 
 func (impl *InformerImpl) assessNodeStatus(eventType string, pod *coreV1.Pod) v1alpha1.NodeStatus {
 	nodeStatus := v1alpha1.NodeStatus{}
-	switch pod.Status.Phase {
-	case coreV1.PodPending:
-		nodeStatus.Phase = v1alpha1.NodePending
-		nodeStatus.Message = getPendingReason(pod)
-	case coreV1.PodSucceeded:
-		nodeStatus.Phase = v1alpha1.NodeSucceeded
-	case coreV1.PodFailed:
-		nodeStatus.Phase, nodeStatus.Message = impl.inferFailedReason(eventType, pod)
-		impl.logger.Infof("Pod %s failed: %s", pod.Name, nodeStatus.Message)
-	case coreV1.PodRunning:
-		nodeStatus.Phase = v1alpha1.NodeRunning
-	default:
-		nodeStatus.Phase = v1alpha1.NodeError
-		nodeStatus.Message = fmt.Sprintf("Unexpected pod phase for %s: %s", pod.ObjectMeta.Name, pod.Status.Phase)
+
+	/*
+		Special handling for delete events in force-delete scenarios: Kubernetes does NOT guarantee that the pod status
+		will be updated to "Failed" during a force delete. Sometimes the pod phase is still "Running" even after a force delete.
+		Force deletion immediately removes the Pod object from the Kubernetes API server without waiting for
+		the kubelet on the node to confirm termination.
+
+		If the application within the pod on the node is still running and hasn't received a termination signal or processed
+		it yet, the container processes might continue to exist on the node even after the Pod object is gone from the API
+		server. This can lead to a state where the pod effectively exists on the node, but Kubernetes no longer tracks it,
+		and it might appear as Running if you were to inspect the node's process list.
+	*/
+	if eventType == bean.DeleteEvent && isPodForceDeletedWhileRunning(pod) {
+		// Force delete detected - treat as failed regardless of current phase
+		impl.logger.Infow("Force delete detected for pod", "podName", pod.Name, "currentPhase", pod.Status.Phase, "deletionGracePeriod", *pod.DeletionGracePeriodSeconds)
+		nodeStatus.Phase = v1alpha1.NodeFailed
+		nodeStatus.Message = bean.NodeForceDeleted
+		return nodeStatus
+	} else {
+		switch pod.Status.Phase {
+		case coreV1.PodPending:
+			nodeStatus.Phase = v1alpha1.NodePending
+			nodeStatus.Message = getPendingReason(pod)
+		case coreV1.PodSucceeded:
+			nodeStatus.Phase = v1alpha1.NodeSucceeded
+		case coreV1.PodFailed:
+			nodeStatus.Phase, nodeStatus.Message = impl.inferFailedReason(eventType, pod)
+			impl.logger.Infof("Pod %s failed: %s", pod.Name, nodeStatus.Message)
+		case coreV1.PodRunning:
+			nodeStatus.Phase = v1alpha1.NodeRunning
+		default:
+			nodeStatus.Phase = v1alpha1.NodeError
+			nodeStatus.Message = fmt.Sprintf("Unexpected pod phase for %s: %s", pod.ObjectMeta.Name, pod.Status.Phase)
+		}
 	}
 
 	// only update Pod IP for daemoned nodes to reduce number of updates
diff --git a/kubewatch/pkg/informer/cluster/systemExec/util.go b/kubewatch/pkg/informer/cluster/systemExec/util.go
@@ -82,6 +82,15 @@ func isResourceNotFoundErr(err error) bool {
 	return false
 }
 
+// isPodForceDeletedWhileRunning reports whether the pod is still in the Running or Pending phase
+// while its metadata carries a deletion timestamp together with a deletion grace period of zero seconds.
+// Reference for all existing pod phases: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
+func isPodForceDeletedWhileRunning(pod *coreV1.Pod) bool {
+	stillActive := pod.Status.Phase == coreV1.PodRunning || pod.Status.Phase == coreV1.PodPending
+	gracePeriod := pod.DeletionGracePeriodSeconds
+	return stillActive && pod.DeletionTimestamp != nil && gracePeriod != nil && *gracePeriod == 0
+}
+
 func getWorkflowStatus(podObj *coreV1.Pod, nodeStatus v1alpha1.NodeStatus, templateName string) *informerBean.CiCdStatus {
 	workflowStatus := &informerBean.CiCdStatus{
 		WorkflowStatus: &v1alpha1.WorkflowStatus{},