From 2dddcb32763e97d2a4b5cffc94ef774e1f6b8fcb Mon Sep 17 00:00:00 2001 From: Shivam-nagar23 Date: Mon, 28 Apr 2025 11:41:40 +0530 Subject: [PATCH 1/7] failed update event mark as succeeded --- kubewatch/pkg/informer/cluster/systemExec/helper.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kubewatch/pkg/informer/cluster/systemExec/helper.go b/kubewatch/pkg/informer/cluster/systemExec/helper.go index 7ca088360..4509fc2ee 100644 --- a/kubewatch/pkg/informer/cluster/systemExec/helper.go +++ b/kubewatch/pkg/informer/cluster/systemExec/helper.go @@ -217,6 +217,11 @@ func (impl *InformerImpl) inferFailedReason(eventType string, pod *coreV1.Pod) ( } } + if pod.Status.Phase == coreV1.PodFailed { + // Don't mark as succeeded if the pod itself is failed, even if we couldn't determine why + return v1alpha1.NodeFailed, "Pod reported failed status" + } + // If we get here, we have detected that the main/wait containers succeed but the sidecar(s) // were SIGKILL'd. The executor may have had to forcefully terminate the sidecar (kill -9), // resulting in a 137 exit code (which we had ignored earlier). 
If failMessages is empty, it From 1ec688eedb8e1ee4cbecee3b9e4ccd6ee9abd20b Mon Sep 17 00:00:00 2001 From: Shivam-nagar23 Date: Mon, 28 Apr 2025 12:03:50 +0530 Subject: [PATCH 2/7] temp checks --- .../pkg/informer/cluster/systemExec/helper.go | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/kubewatch/pkg/informer/cluster/systemExec/helper.go b/kubewatch/pkg/informer/cluster/systemExec/helper.go index 4509fc2ee..90a1276f6 100644 --- a/kubewatch/pkg/informer/cluster/systemExec/helper.go +++ b/kubewatch/pkg/informer/cluster/systemExec/helper.go @@ -217,10 +217,10 @@ func (impl *InformerImpl) inferFailedReason(eventType string, pod *coreV1.Pod) ( } } - if pod.Status.Phase == coreV1.PodFailed { - // Don't mark as succeeded if the pod itself is failed, even if we couldn't determine why - return v1alpha1.NodeFailed, "Pod reported failed status" - } + //if pod.Status.Phase == coreV1.PodFailed { + // // Don't mark as succeeded if the pod itself is failed, even if we couldn't determine why + // return v1alpha1.NodeFailed, "Pod reported failed status" + //} // If we get here, we have detected that the main/wait containers succeed but the sidecar(s) // were SIGKILL'd. 
The executor may have had to forcefully terminate the sidecar (kill -9), @@ -243,16 +243,17 @@ func foundAnyUpdateInPodStatus(from *coreV1.Pod, to *coreV1.Pod) bool { if from == nil || to == nil { return true } - return isAnyChangeInPodStatus(&from.Status, &to.Status) + return isAnyChangeInPodStatus(from, to) } -func isAnyChangeInPodStatus(from *coreV1.PodStatus, to *coreV1.PodStatus) bool { - return from.Phase != to.Phase || - from.Message != to.Message || - from.PodIP != to.PodIP || - isAnyChangeInContainersStatus(from.ContainerStatuses, to.ContainerStatuses) || - isAnyChangeInContainersStatus(from.InitContainerStatuses, to.InitContainerStatuses) || - isAnyChangeInPodConditions(from.Conditions, to.Conditions) +func isAnyChangeInPodStatus(from *coreV1.Pod, to *coreV1.Pod) bool { + return from.Status.Phase != to.Status.Phase || + from.Status.Message != to.Status.Message || + from.Status.PodIP != to.Status.PodIP || + from.GetDeletionTimestamp() != to.GetDeletionTimestamp() || + isAnyChangeInContainersStatus(from.Status.ContainerStatuses, to.Status.ContainerStatuses) || + isAnyChangeInContainersStatus(from.Status.InitContainerStatuses, to.Status.InitContainerStatuses) || + isAnyChangeInPodConditions(from.Status.Conditions, to.Status.Conditions) } func isAnyChangeInContainersStatus(from []coreV1.ContainerStatus, to []coreV1.ContainerStatus) bool { From bd6e99c4930d79e302eb76f11db86229e4e04945 Mon Sep 17 00:00:00 2001 From: Shivam-nagar23 Date: Mon, 28 Apr 2025 12:22:10 +0530 Subject: [PATCH 3/7] Revert "temp checks" This reverts commit 1ec688eedb8e1ee4cbecee3b9e4ccd6ee9abd20b. 
--- .../pkg/informer/cluster/systemExec/helper.go | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/kubewatch/pkg/informer/cluster/systemExec/helper.go b/kubewatch/pkg/informer/cluster/systemExec/helper.go index 90a1276f6..4509fc2ee 100644 --- a/kubewatch/pkg/informer/cluster/systemExec/helper.go +++ b/kubewatch/pkg/informer/cluster/systemExec/helper.go @@ -217,10 +217,10 @@ func (impl *InformerImpl) inferFailedReason(eventType string, pod *coreV1.Pod) ( } } - //if pod.Status.Phase == coreV1.PodFailed { - // // Don't mark as succeeded if the pod itself is failed, even if we couldn't determine why - // return v1alpha1.NodeFailed, "Pod reported failed status" - //} + if pod.Status.Phase == coreV1.PodFailed { + // Don't mark as succeeded if the pod itself is failed, even if we couldn't determine why + return v1alpha1.NodeFailed, "Pod reported failed status" + } // If we get here, we have detected that the main/wait containers succeed but the sidecar(s) // were SIGKILL'd. 
The executor may have had to forcefully terminate the sidecar (kill -9), @@ -243,17 +243,16 @@ func foundAnyUpdateInPodStatus(from *coreV1.Pod, to *coreV1.Pod) bool { if from == nil || to == nil { return true } - return isAnyChangeInPodStatus(from, to) + return isAnyChangeInPodStatus(&from.Status, &to.Status) } -func isAnyChangeInPodStatus(from *coreV1.Pod, to *coreV1.Pod) bool { - return from.Status.Phase != to.Status.Phase || - from.Status.Message != to.Status.Message || - from.Status.PodIP != to.Status.PodIP || - from.GetDeletionTimestamp() != to.GetDeletionTimestamp() || - isAnyChangeInContainersStatus(from.Status.ContainerStatuses, to.Status.ContainerStatuses) || - isAnyChangeInContainersStatus(from.Status.InitContainerStatuses, to.Status.InitContainerStatuses) || - isAnyChangeInPodConditions(from.Status.Conditions, to.Status.Conditions) +func isAnyChangeInPodStatus(from *coreV1.PodStatus, to *coreV1.PodStatus) bool { + return from.Phase != to.Phase || + from.Message != to.Message || + from.PodIP != to.PodIP || + isAnyChangeInContainersStatus(from.ContainerStatuses, to.ContainerStatuses) || + isAnyChangeInContainersStatus(from.InitContainerStatuses, to.InitContainerStatuses) || + isAnyChangeInPodConditions(from.Conditions, to.Conditions) } func isAnyChangeInContainersStatus(from []coreV1.ContainerStatus, to []coreV1.ContainerStatus) bool { From f5948d000deb4f756acffca6a796217e029b35d1 Mon Sep 17 00:00:00 2001 From: Shivam-nagar23 Date: Mon, 28 Apr 2025 13:09:59 +0530 Subject: [PATCH 4/7] pod deleted --- kubewatch/pkg/informer/cluster/systemExec/helper.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kubewatch/pkg/informer/cluster/systemExec/helper.go b/kubewatch/pkg/informer/cluster/systemExec/helper.go index 4509fc2ee..9f3d9b9bb 100644 --- a/kubewatch/pkg/informer/cluster/systemExec/helper.go +++ b/kubewatch/pkg/informer/cluster/systemExec/helper.go @@ -217,9 +217,10 @@ func (impl *InformerImpl) inferFailedReason(eventType string, pod 
*coreV1.Pod) ( } } - if pod.Status.Phase == coreV1.PodFailed { + if pod.Status.Phase == coreV1.PodFailed && pod.DeletionTimestamp != nil { + impl.logger.Debugw("Pod is deleted", "podName", pod.Name, "deletionTimestamp", pod.DeletionTimestamp) // Don't mark as succeeded if the pod itself is failed, even if we couldn't determine why - return v1alpha1.NodeFailed, "Pod reported failed status" + return v1alpha1.NodeFailed, informerBean.PodDeletedMessage } // If we get here, we have detected that the main/wait containers succeed but the sidecar(s) From 1147b139831775b9f16e08420a255bcc28fbd684 Mon Sep 17 00:00:00 2001 From: Shivam-nagar23 Date: Mon, 28 Apr 2025 16:26:52 +0530 Subject: [PATCH 5/7] ci failed --- kubewatch/pkg/informer/cluster/systemExec/helper.go | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/kubewatch/pkg/informer/cluster/systemExec/helper.go b/kubewatch/pkg/informer/cluster/systemExec/helper.go index 9f3d9b9bb..2104a8977 100644 --- a/kubewatch/pkg/informer/cluster/systemExec/helper.go +++ b/kubewatch/pkg/informer/cluster/systemExec/helper.go @@ -217,17 +217,15 @@ func (impl *InformerImpl) inferFailedReason(eventType string, pod *coreV1.Pod) ( } } - if pod.Status.Phase == coreV1.PodFailed && pod.DeletionTimestamp != nil { - impl.logger.Debugw("Pod is deleted", "podName", pod.Name, "deletionTimestamp", pod.DeletionTimestamp) - // Don't mark as succeeded if the pod itself is failed, even if we couldn't determine why - return v1alpha1.NodeFailed, informerBean.PodDeletedMessage - } - // If we get here, we have detected that the main/wait containers succeed but the sidecar(s) // were SIGKILL'd. The executor may have had to forcefully terminate the sidecar (kill -9), // resulting in a 137 exit code (which we had ignored earlier). If failMessages is empty, it // indicates that this is the case and we return Success instead of Failure. 
- return v1alpha1.NodeSucceeded, "" + + // Update diff from argo workflow here as we only have one main container. + // Handling this for case of spot interruption where containers are in running state (no termination state found), in that case + // it was marking it successful, doing this as it will be skipped at upper level, and delete event will handle it. + return v1alpha1.NodeFailed, "" } func getFailedReasonFromPodConditions(conditions []coreV1.PodCondition) string { From d75776886d6caf41a38c6df40cbd06a55c05254b Mon Sep 17 00:00:00 2001 From: Shivam-nagar23 Date: Tue, 29 Apr 2025 08:24:11 +0530 Subject: [PATCH 6/7] comments --- kubewatch/pkg/informer/cluster/systemExec/helper.go | 1 + 1 file changed, 1 insertion(+) diff --git a/kubewatch/pkg/informer/cluster/systemExec/helper.go b/kubewatch/pkg/informer/cluster/systemExec/helper.go index 2104a8977..e39802b28 100644 --- a/kubewatch/pkg/informer/cluster/systemExec/helper.go +++ b/kubewatch/pkg/informer/cluster/systemExec/helper.go @@ -225,6 +225,7 @@ func (impl *InformerImpl) inferFailedReason(eventType string, pod *coreV1.Pod) ( // Update diff from argo workflow here as we only have one main container. // Handling this for case of spot interruption where containers are in running state (no termination state found), in that case // it was marking it successful, doing this as it will be skipped at upper level, and delete event will handle it. 
+ // ticket - you can find debug logs/details here - https://github.com/devtron-labs/sprint-tasks/issues/2092 return v1alpha1.NodeFailed, "" } From 54a9a815ce3826c4e310c2165a0c72824144c0e2 Mon Sep 17 00:00:00 2001 From: Shivam-nagar23 Date: Tue, 29 Apr 2025 11:06:54 +0530 Subject: [PATCH 7/7] log added --- kubewatch/pkg/informer/cluster/systemExec/helper.go | 1 + 1 file changed, 1 insertion(+) diff --git a/kubewatch/pkg/informer/cluster/systemExec/helper.go b/kubewatch/pkg/informer/cluster/systemExec/helper.go index e39802b28..b5f0d2bbc 100644 --- a/kubewatch/pkg/informer/cluster/systemExec/helper.go +++ b/kubewatch/pkg/informer/cluster/systemExec/helper.go @@ -226,6 +226,7 @@ func (impl *InformerImpl) inferFailedReason(eventType string, pod *coreV1.Pod) ( // Handling this for case of spot interruption where containers are in running state (no termination state found), in that case // it was marking it successful, doing this as it will be skipped at upper level, and delete event will handle it. // ticket - you can find debug logs/details here - https://github.com/devtron-labs/sprint-tasks/issues/2092 + impl.logger.Infow("Pod phase was Failed but no container had terminated state, marking it as failed now", "podName", pod.Name, "status", pod.Status) return v1alpha1.NodeFailed, "" }