Skip to content

Commit 1dc39f7

Browse files
Merge pull request #242 from devtron-labs/fix-debug-ci
fix: spot node termination case, failed workflow marked as succeeded
1 parent 6d3037d commit 1dc39f7

1 file changed

Lines changed: 7 additions & 1 deletion

File tree

  • kubewatch/pkg/informer/cluster/systemExec

kubewatch/pkg/informer/cluster/systemExec/helper.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,13 @@ func (impl *InformerImpl) inferFailedReason(eventType string, pod *coreV1.Pod) (
220220
// were SIGKILL'd. The executor may have had to forcefully terminate the sidecar (kill -9),
221221
// resulting in a 137 exit code (which we had ignored earlier). If failMessages is empty, it
222222
// indicates that this is the case and we return Success instead of Failure.
223-
return v1alpha1.NodeSucceeded, ""
223+
224+
// This differs from the upstream Argo Workflows code here, as we only have one main container.
225+
// It handles the spot-interruption case, where containers are still in the running state (no terminated state is found); in that case
226+
// the pod was previously being marked successful. We return Failed instead, as it will be skipped at the upper level and the delete event will handle it.
227+
// Ticket with debug logs/details: https://github.com/devtron-labs/sprint-tasks/issues/2092
228+
impl.logger.Infow("Pod phase was Failed but no container had terminated state, marking it as failed now", "podName", pod.Name, "status", pod.Status)
229+
return v1alpha1.NodeFailed, ""
224230
}
225231

226232
func getFailedReasonFromPodConditions(conditions []coreV1.PodCondition) string {

0 commit comments

Comments
 (0)