@@ -39,6 +39,10 @@ type BaseTriggerMetrics interface {
3939 IncInboxFull (triggerID string )
4040 EmitUndeliveredWarning (triggerID , eventID string )
4141 EmitUndeliveredCritical (triggerID , eventID string )
42+ // IncAckError counts ACK paths that return an error (e.g. store delete failure). reason is a stable identifier for dashboards.
43+ IncAckError (reason string )
44+ // IncAckMemoryOutcome records how an ACK relates to the in-memory pending map: hit, miss_no_trigger_bucket, miss_no_event, miss_nil_record, or skipped_retransmit_disabled (ACK received while retransmit is disabled).
45+ IncAckMemoryOutcome (outcome string )
4246}
4347
4448type undeliveredState struct {
@@ -192,8 +196,12 @@ func (b *BaseTriggerCapability[T]) DeliverEvent(
192196 }
193197
194198 if err := b .store .Insert (ctx , rec ); err != nil {
199+ b .lggr .Errorw ("base trigger failed to persist pending event" ,
200+ "capabilityID" , b .capabilityId , "triggerID" , triggerID , "eventID" , te .ID , "err" , err )
195201 return err
196202 }
203+ b .lggr .Infow ("base trigger persisted pending event for ACK tracking" ,
204+ "capabilityID" , b .capabilityId , "triggerID" , triggerID , "eventID" , te .ID )
197205
198206 b .mu .Lock ()
199207 if b .pending [triggerID ] == nil {
@@ -236,27 +244,45 @@ func (b *BaseTriggerCapability[T]) sendToInbox(triggerID, eventID string, payloa
236244func (b * BaseTriggerCapability [T ]) AckEvent (ctx context.Context , triggerId string , eventId string ) error {
237245 b .lggr .Infow ("Event ACK" , "triggerID" , triggerId , "eventID" , eventId )
238246 if ! b .retransmitEnabled () {
247+ b .lggr .Debugw ("base trigger ACK skipped (retransmit disabled, no persistence/ACK tracking)" ,
248+ "capabilityID" , b .capabilityId , "triggerID" , triggerId , "eventID" , eventId )
249+ b .metrics .IncAckMemoryOutcome ("skipped_retransmit_disabled" )
239250 return nil
240251 }
241252
242253 var (
243- attempts int
244- firstAt time.Time
245- found bool
254+ attempts int
255+ firstAt time.Time
256+ found bool
257+ hadTriggerBucket bool
258+ hadEventKey bool
259+ hadNilPendingRecord bool
246260 )
247261
248262 b .mu .Lock ()
249- if eventsForTrigger , ok := b .pending [triggerId ]; ok && eventsForTrigger != nil {
250- if rec , recOk := eventsForTrigger [eventId ]; recOk && rec != nil {
263+ eventsForTrigger , ok := b .pending [triggerId ]
264+ hadTriggerBucket = ok && eventsForTrigger != nil
265+ if hadTriggerBucket {
266+ rec , recOk := eventsForTrigger [eventId ]
267+ hadEventKey = recOk
268+ switch {
269+ case recOk && rec != nil :
251270 attempts = rec .Attempts
252271 firstAt = rec .FirstAt
253272 found = true
273+ case recOk && rec == nil :
274+ hadNilPendingRecord = true
275+ b .metrics .IncAckMemoryOutcome ("miss_nil_record" )
276+ default :
277+ b .metrics .IncAckMemoryOutcome ("miss_no_event" )
254278 }
255279
256280 delete (eventsForTrigger , eventId )
257281 if len (eventsForTrigger ) == 0 {
258282 delete (b .pending , triggerId )
259283 }
284+ } else {
285+ b .metrics .IncAckMemoryOutcome ("miss_no_trigger_bucket" )
260286 }
261287
262288 if m , ok := b .undeliveredAlertStates [triggerId ]; ok {
@@ -267,12 +293,40 @@ func (b *BaseTriggerCapability[T]) AckEvent(ctx context.Context, triggerId strin
267293 }
268294 b .mu .Unlock ()
269295
270- if found {
296+ switch {
297+ case found :
298+ b .lggr .Infow ("base trigger ACK matched in-memory pending event" ,
299+ "capabilityID" , b .capabilityId , "triggerID" , triggerId , "eventID" , eventId ,
300+ "attempts" , attempts , "firstAt" , firstAt )
301+ b .metrics .IncAckMemoryOutcome ("hit" )
271302 b .metrics .IncAck (triggerId , eventId )
272303 b .metrics .ObserveTimeToAck (triggerId , eventId , time .Since (firstAt ), attempts )
304+ case hadNilPendingRecord :
305+ b .lggr .Warnw ("base trigger ACK: pending map had nil record for event (treating as miss; reconciling store)" ,
306+ "capabilityID" , b .capabilityId , "triggerID" , triggerId , "eventID" , eventId )
307+ case hadTriggerBucket && ! hadEventKey :
308+ b .lggr .Infow ("base trigger ACK: event id not in in-memory pending map for trigger (may exist only in store; reconciling)" ,
309+ "capabilityID" , b .capabilityId , "triggerID" , triggerId , "eventID" , eventId )
310+ case ! hadTriggerBucket :
311+ b .lggr .Infow ("base trigger ACK: no in-memory pending bucket for trigger (not pending here; still deleting from store if row exists)" ,
312+ "capabilityID" , b .capabilityId , "triggerID" , triggerId , "eventID" , eventId )
273313 }
274314
275- return b .store .DeleteEvent (ctx , triggerId , eventId )
315+ if err := b .store .DeleteEvent (ctx , triggerId , eventId ); err != nil {
316+ b .lggr .Errorw ("base trigger ACK failed to delete event from store" ,
317+ "capabilityID" , b .capabilityId , "triggerID" , triggerId , "eventID" , eventId ,
318+ "foundInMemory" , found , "err" , err )
319+ b .metrics .IncAckError ("store_delete_failed" )
320+ return err
321+ }
322+ if found {
323+ b .lggr .Debugw ("base trigger ACK store delete succeeded" ,
324+ "capabilityID" , b .capabilityId , "triggerID" , triggerId , "eventID" , eventId )
325+ } else {
326+ b .lggr .Infow ("base trigger ACK store delete succeeded (memory miss path; store row removed if present)" ,
327+ "capabilityID" , b .capabilityId , "triggerID" , triggerId , "eventID" , eventId )
328+ }
329+ return nil
276330}
277331
278332func (b * BaseTriggerCapability [T ]) retransmitLoop () {
0 commit comments