@@ -11,6 +11,7 @@ module V1
     #
     # Gap coverage (FAILURE_MODE_ANALYSIS.md):
     # Gap 1 — Sidekiq queue depth alert
+    # Gap 2 — Scheduled job heartbeat monitoring (stale job detection)
     # Gap 4 — Dead queue monitoring
     #
     # @example Check Sidekiq health
@@ -22,14 +23,23 @@ class MonitoringController < BaseController
       QUEUE_DEPTH_ALERT_THRESHOLD = ENV.fetch('SIDEKIQ_QUEUE_ALERT_THRESHOLD', 100).to_i
       DEAD_QUEUE_ALERT_THRESHOLD = ENV.fetch('SIDEKIQ_DEAD_ALERT_THRESHOLD', 10).to_i

+      # Scheduled jobs to monitor. Each entry defines the job class name,
+      # expected run interval, and the threshold after which it is considered stale.
+      SCHEDULED_JOBS = [
+        { name: 'RefreshMetadataViewsJob', interval_hours: 2, alert_after_hours: 3 },
+        { name: 'CleanupExpiredTokensJob', interval_hours: 24, alert_after_hours: 25 }
+      ].freeze
+
       # GET /api/v1/monitoring/sidekiq
       #
       # Returns a snapshot of Sidekiq operational state including queue depths,
-      # process count, scheduled and dead job counts.
+      # process count, scheduled and dead job counts, and heartbeat status of
+      # cron jobs (gap 2 — detects if a scheduled job has not run in its window).
       #
       # Healthy thresholds (logged as alerts when exceeded):
-      # - queue_depth > 100 jobs for more than 5 min → Sidekiq may be down
-      # - dead_count > 10 jobs → jobs are failing silently
+      # - queue_depth > 100 jobs → Sidekiq may be down
+      # - dead_count > 10 jobs → jobs are failing silently
+      # - job stale → scheduled job has not run within expected interval
       #
       # @return [JSON] Sidekiq stats with health indicators
       def sidekiq
@@ -45,12 +55,15 @@ def sidekiq
         stats = Sidekiq::Stats.new
         processes = Sidekiq::ProcessSet.new.to_a

-        queue_depths = build_queue_depths
-        total_depth = queue_depths.values.sum
-        dead_count = stats.dead_size
+        queue_depths   = build_queue_depths
+        total_depth    = queue_depths.values.sum
+        dead_count     = stats.dead_size
+        job_heartbeats = build_job_heartbeats
+        any_stale      = job_heartbeats.values.any? { |j| j[:stale] }

-        health_status = determine_health(total_depth, dead_count, processes.size)
+        health_status = determine_health(total_depth, dead_count, processes.size, any_stale: any_stale)
         emit_alerts(total_depth, dead_count, processes.size)
+        emit_stale_job_alerts(job_heartbeats)

         render json: {
           status: health_status,
@@ -70,12 +83,14 @@ def sidekiq
             retry: stats.retry_size,
             dead: dead_count
           },
+          scheduled_jobs: job_heartbeats,
           alerts: {
             queue_depth_threshold: QUEUE_DEPTH_ALERT_THRESHOLD,
             dead_queue_threshold: DEAD_QUEUE_ALERT_THRESHOLD,
             queue_depth_exceeded: total_depth > QUEUE_DEPTH_ALERT_THRESHOLD,
             dead_queue_exceeded: dead_count > DEAD_QUEUE_ALERT_THRESHOLD,
-            no_workers: processes.empty?
+            no_workers: processes.empty?,
+            stale_jobs: any_stale
           }
         }, status: health_status == 'ok' ? :ok : :service_unavailable
       end
@@ -98,10 +113,37 @@ def build_queue_depths
         end
       end

-      def determine_health(total_depth, dead_count, process_count)
+      # Reads last-run timestamps from Redis for each scheduled job and returns
+      # a hash with staleness status. Jobs that have never run return stale: true.
+      def build_job_heartbeats
+        Sidekiq.redis do |redis|
+          SCHEDULED_JOBS.each_with_object({}) do |config, hash|
+            hash[config[:name]] = build_heartbeat_entry(redis, config)
+          end
+        end
+      rescue StandardError => e
+        Rails.logger.warn(event: 'monitoring_heartbeat_read_error', error: e.message)
+        {}
+      end
+
+      def build_heartbeat_entry(redis, config)
+        raw = redis.call('GET', "prostaff:job_heartbeat:#{config[:name]}")
+        last_run = raw ? Time.zone.parse(raw) : nil
+        stale = last_run.nil? || last_run < config[:alert_after_hours].hours.ago
+
+        {
+          last_run_at: last_run&.iso8601,
+          expected_interval_hours: config[:interval_hours],
+          alert_after_hours: config[:alert_after_hours],
+          stale: stale
+        }
+      end
+
+      def determine_health(total_depth, dead_count, process_count, any_stale: false)
         return 'critical' if process_count.zero?
         return 'degraded' if dead_count > DEAD_QUEUE_ALERT_THRESHOLD
         return 'degraded' if total_depth > QUEUE_DEPTH_ALERT_THRESHOLD
+        return 'degraded' if any_stale

         'ok'
       end
@@ -135,6 +177,21 @@ def emit_alerts(total_depth, dead_count, process_count)
           threshold: DEAD_QUEUE_ALERT_THRESHOLD
         )
       end
+
+      def emit_stale_job_alerts(heartbeats)
+        heartbeats.each do |job_name, data|
+          next unless data[:stale]
+
+          Rails.logger.error(
+            event: 'scheduled_job_stale',
+            level: 'ALERT',
+            message: 'Scheduled job has not run within expected interval',
+            job: job_name,
+            last_run_at: data[:last_run_at],
+            alert_after_hours: data[:alert_after_hours]
+          )
+        end
+      end
     end
   end
 end
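
The diff only covers the reader side of the heartbeat mechanism; the jobs that write the `prostaff:job_heartbeat:<ClassName>` keys are not part of this commit. For context, a minimal sketch of what the writer side might look like, assuming each monitored job records its own heartbeat after a successful run (the `JobHeartbeat` module and the `perform` body here are hypothetical, not from this commit):

```ruby
# Hypothetical writer-side sketch (not part of this commit). Assumes each
# monitored job records a heartbeat after finishing, using the same key
# format that build_heartbeat_entry reads back.
module JobHeartbeat
  def self.record(job_class_name)
    Sidekiq.redis do |redis|
      # Store the completion time as ISO 8601 so Time.zone.parse can decode it.
      redis.call('SET', "prostaff:job_heartbeat:#{job_class_name}", Time.current.iso8601)
    end
  end
end

class RefreshMetadataViewsJob
  include Sidekiq::Job

  def perform
    # ... actual view-refresh work ...
    JobHeartbeat.record(self.class.name)
  end
end
```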
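To exercise the staleness path end to end, one can backdate a heartbeat from a Rails console (illustrative only; assumes the key format above):

```ruby
# Backdate CleanupExpiredTokensJob's heartbeat past its 25-hour alert window.
Sidekiq.redis do |redis|
  redis.call('SET', 'prostaff:job_heartbeat:CleanupExpiredTokensJob', 26.hours.ago.iso8601)
end

# GET /api/v1/monitoring/sidekiq should now report the entry as stale:
# 26.hours.ago is earlier than 25.hours.ago, so build_heartbeat_entry flags it,
# any_stale turns determine_health 'degraded', the endpoint answers 503, and
# emit_stale_job_alerts logs a 'scheduled_job_stale' ALERT.
```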