@@ -45,6 +45,13 @@ type BaseTriggerMetrics interface {
4545 IncAckError (reason string )
4646 // IncAckMemoryOutcome records how an ACK related to the in-memory pending map: hit, miss_no_trigger_bucket, miss_no_event, miss_nil_record.
4747 IncAckMemoryOutcome (outcome string )
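48+ // AddPendingEvents adjusts the live gauge of events awaiting ACK. The delta is positive when events become pending (DeliverEvent, or the records counted in Start) and negative when an event is ACKed or its trigger is unregistered.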
49+ AddPendingEvents (delta int64 )
50+ // IncStuckEvent increments the live gauge of events stuck past the critical undelivered threshold.
51+ // Keyed by (capability_id, trigger_id, event_id) so you can see exactly which events are stuck.
52+ IncStuckEvent (triggerID , eventID string )
53+ // DecStuckEvent decrements the stuck-event gauge when a previously-critical event is ACKed or its trigger is unregistered.
54+ DecStuckEvent (triggerID , eventID string )
4855}
4956
5057type undeliveredState struct {
@@ -185,6 +192,10 @@ func (b *BaseTriggerCapability[T]) Start(ctx context.Context) error {
185192 }
186193 b .mu .Unlock ()
187194
195+ if n := int64 (len (recs )); n > 0 {
196+ b .metrics .AddPendingEvents (n )
197+ }
198+
188199 b .wg .Add (1 )
189200 go func () {
190201 defer b .wg .Done ()
@@ -212,14 +223,32 @@ func (b *BaseTriggerCapability[T]) RegisterTrigger(triggerID string, sendCh chan
212223func (b * BaseTriggerCapability [T ]) UnregisterTrigger (triggerID string ) {
213224 b .mu .Lock ()
214225 _ , existed := b .inboxes [triggerID ]
226+ pendingCount := int64 (len (b .pending [triggerID ]))
227+
228+ var criticalEvents []string
229+ if m , ok := b .undeliveredAlertStates [triggerID ]; ok {
230+ for eventID , s := range m {
231+ if s != nil && s .emittedCritical {
232+ criticalEvents = append (criticalEvents , eventID )
233+ }
234+ }
235+ }
236+
215237 delete (b .inboxes , triggerID )
216238 delete (b .pending , triggerID )
217239 delete (b .undeliveredAlertStates , triggerID )
218240 b .mu .Unlock ()
219241
242+ for _ , eventID := range criticalEvents {
243+ b .metrics .DecStuckEvent (triggerID , eventID )
244+ }
245+
220246 if existed {
221247 b .metrics .DecActiveTriggers ()
222248 }
249+ if pendingCount > 0 {
250+ b .metrics .AddPendingEvents (- pendingCount )
251+ }
223252
224253 if err := b .store .DeleteEventsForTrigger (b .ctx , triggerID ); err != nil {
225254 b .lggr .Errorf ("Failed to delete events for trigger (TriggerID=%s): %v" , triggerID , err )
@@ -258,6 +287,7 @@ func (b *BaseTriggerCapability[T]) DeliverEvent(
258287 b.pending [triggerID ][te.ID ] = & rec
259288 b .mu .Unlock ()
260289
290+ b .metrics .AddPendingEvents (1 )
261291 b .trySend (rec )
262292 return nil
263293}
@@ -327,14 +357,22 @@ func (b *BaseTriggerCapability[T]) AckEvent(ctx context.Context, triggerId strin
327357 b .metrics .IncAckMemoryOutcome ("miss_no_trigger_bucket" )
328358 }
329359
360+ var wasCritical bool
330361 if m , ok := b .undeliveredAlertStates [triggerId ]; ok {
362+ if s , exists := m [eventId ]; exists && s != nil && s .emittedCritical {
363+ wasCritical = true
364+ }
331365 delete (m , eventId )
332366 if len (m ) == 0 {
333367 delete (b .undeliveredAlertStates , triggerId )
334368 }
335369 }
336370 b .mu .Unlock ()
337371
372+ if wasCritical {
373+ b .metrics .DecStuckEvent (triggerId , eventId )
374+ }
375+
338376 switch {
339377 case found :
340378 b .lggr .Infow ("base trigger ACK matched in-memory pending event" ,
@@ -343,6 +381,7 @@ func (b *BaseTriggerCapability[T]) AckEvent(ctx context.Context, triggerId strin
343381 b .metrics .IncAckMemoryOutcome ("hit" )
344382 b .metrics .IncAck (triggerId , eventId )
345383 b .metrics .ObserveTimeToAck (triggerId , eventId , time .Since (firstAt ), attempts )
384+ b .metrics .AddPendingEvents (- 1 )
346385 case hadNilPendingRecord :
347386 b .lggr .Warnw ("base trigger ACK: pending map had nil record for event (treating as miss; reconciling store)" ,
348387 "capabilityID" , b .capabilityId , "triggerID" , triggerId , "eventID" , eventId )
@@ -396,8 +435,8 @@ func (b *BaseTriggerCapability[T]) scanPending() {
396435 return
397436 }
398437
399- warnThreshold := 5 * interval
400- critThreshold := 20 * interval
438+ warnThreshold := 1 * interval
439+ critThreshold := 3 * interval
401440
402441 b .mu .Lock ()
403442 toResend := make ([]PendingEvent , 0 , len (b .pending ))
@@ -433,6 +472,7 @@ func (b *BaseTriggerCapability[T]) scanPending() {
433472
434473 if critThreshold > 0 && ! state .emittedCritical && age >= critThreshold {
435474 b .metrics .EmitUndeliveredCritical (triggerID , eventID )
475+ b .metrics .IncStuckEvent (triggerID , eventID )
436476 state .emittedCritical = true
437477 }
438478 }
0 commit comments