Skip to content

Commit 5f61b02

Browse files
authored
Resolve a concurrency problem that caused the taint to be skipped, by surfacing the error and retrying via SQS (#1279)
Co-authored-by: mcornea <mcornea@redhat.com>
1 parent 64d4f57 commit 5f61b02

3 files changed

Lines changed: 15 additions & 3 deletions

File tree

pkg/interruptionevent/draincordon/handler.go

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,18 @@ func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) error {
8181
}
8282

8383
if drainEvent.PreDrainTask != nil {
84-
h.commonHandler.RunPreDrainTask(nodeName, drainEvent)
84+
if err := h.commonHandler.RunPreDrainTask(nodeName, drainEvent); err != nil {
85+
log.Err(err).Str("nodeName", nodeName).Msg("Pre-drain task failed; aborting to allow SQS retry")
86+
h.commonHandler.InterruptionEventStore.CancelInterruptionEvent(drainEvent.EventID)
87+
88+
// If the node is missing and the user opted for DeleteSqsMsgIfNodeNotFound then delete the SQS message
89+
if !nodeFound && h.commonHandler.NthConfig.DeleteSqsMsgIfNodeNotFound && drainEvent.PostDrainTask != nil {
90+
h.commonHandler.RunPostDrainTask(nodeName, drainEvent)
91+
return nil
92+
}
93+
94+
return err
95+
}
8596
}
8697

8798
podNameList, err := h.commonHandler.Node.FetchPodNameList(nodeName)

pkg/interruptionevent/internal/common/handler.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ func (h *Handler) GetNodeName(drainEvent *monitor.InterruptionEvent) (string, er
4444
return nodeName, nil
4545
}
4646

47-
func (h *Handler) RunPreDrainTask(nodeName string, drainEvent *monitor.InterruptionEvent) {
47+
func (h *Handler) RunPreDrainTask(nodeName string, drainEvent *monitor.InterruptionEvent) error {
4848
err := drainEvent.PreDrainTask(*drainEvent, h.Node)
4949
if err != nil {
5050
log.Err(err).Msg("There was a problem executing the pre-drain task")
@@ -53,6 +53,7 @@ func (h *Handler) RunPreDrainTask(nodeName string, drainEvent *monitor.Interrupt
5353
h.Recorder.Emit(nodeName, observability.Normal, observability.PreDrainReason, observability.PreDrainMsg)
5454
}
5555
h.Metrics.NodeActionsInc("pre-drain", nodeName, drainEvent.EventID, err)
56+
return err
5657
}
5758

5859
func (h *Handler) RunCancelDrainTask(nodeName string, drainEvent *monitor.InterruptionEvent) {

pkg/monitor/sqsevent/spot-itn-event.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ func (m SQSMonitor) spotITNTerminationToInterruptionEvent(event *EventBridgeEven
9696
if err != nil {
9797
log.Err(err).Msgf("Unable to taint node with taint %s:%s", node.SpotInterruptionTaint, interruptionEvent.EventID)
9898
}
99-
return nil
99+
return err
100100
}
101101
return &interruptionEvent, nil
102102
}

0 commit comments

Comments
 (0)