Skip to content

Commit 80bd805

Browse files
Merge pull request #242 from devtron-labs/fix-debug-ci
fix: spot node termination case, failed workflow marked as succeeded
1 parent 2c8e67c commit 80bd805

1 file changed

Lines changed: 7 additions & 1 deletion

File tree

  • kubewatch/pkg/informer/cluster/systemExec

kubewatch/pkg/informer/cluster/systemExec/helper.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,13 @@ func (impl *InformerImpl) inferFailedReason(eventType string, pod *coreV1.Pod) (
221221
// were SIGKILL'd. The executor may have had to forcefully terminate the sidecar (kill -9),
222222
// resulting in a 137 exit code (which we had ignored earlier). If failMessages is empty, it
223223
// indicates that this is the case and we return Success instead of Failure.
224-
return v1alpha1.NodeSucceeded, ""
224+
225+
// This intentionally differs from the upstream Argo Workflows behaviour, as we only have one main container.
226+
// This handles the spot-interruption case, where the containers are still in the running state (no terminated state found).
227+
// Previously such a pod was marked successful; we now mark it failed, as it will be skipped at the upper level and the delete event will handle it.
228+
// Ticket with debug logs and details: https://github.com/devtron-labs/sprint-tasks/issues/2092
229+
impl.logger.Infow("Pod phase was Failed but no container had terminated state, marking it as failed now", "podName", pod.Name, "status", pod.Status)
230+
return v1alpha1.NodeFailed, ""
225231
}
226232

227233
func getFailedReasonFromPodConditions(conditions []coreV1.PodCondition) string {

0 commit comments

Comments
 (0)