
Commit 99a3504

feat: add scheduled job tracking

Job heartbeat tracking. Retry storm mitigation.

1 parent db1e2bb commit 99a3504

5 files changed: 97 additions & 13 deletions
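At a high level, the heartbeat mechanism added here is a write/read round-trip against Sidekiq's Redis. A minimal sketch of the flow, assuming the prostaff:job_heartbeat:<ClassName> key format and the 3-hour alert window used for RefreshMetadataViewsJob in the diffs below (the snippet itself is illustrative, not part of the committed code):

    # Illustrative sketch only — key format and thresholds are taken from the diffs below.

    # 1. A job records "I ran" after a successful perform:
    Sidekiq.redis do |r|
      r.call('SET', 'prostaff:job_heartbeat:RefreshMetadataViewsJob',
             Time.current.iso8601, 'EX', 7 * 24 * 3600) # 7-day TTL
    end

    # 2. The monitoring endpoint later reads the key and flags the job as stale
    #    when the timestamp is missing or older than the alert window:
    raw      = Sidekiq.redis { |r| r.call('GET', 'prostaff:job_heartbeat:RefreshMetadataViewsJob') }
    last_run = raw ? Time.zone.parse(raw) : nil
    stale    = last_run.nil? || last_run < 3.hours.ago # alert_after_hours: 3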


app/controllers/api/v1/monitoring_controller.rb

Lines changed: 66 additions & 9 deletions
@@ -11,6 +11,7 @@ module V1
     #
     # Gap coverage (FAILURE_MODE_ANALYSIS.md):
     #   Gap 1 — Sidekiq queue depth alert
+    #   Gap 2 — Scheduled job heartbeat monitoring (stale job detection)
     #   Gap 4 — Dead queue monitoring
     #
     # @example Check Sidekiq health
@@ -22,14 +23,23 @@ class MonitoringController < BaseController
       QUEUE_DEPTH_ALERT_THRESHOLD = ENV.fetch('SIDEKIQ_QUEUE_ALERT_THRESHOLD', 100).to_i
       DEAD_QUEUE_ALERT_THRESHOLD = ENV.fetch('SIDEKIQ_DEAD_ALERT_THRESHOLD', 10).to_i

+      # Scheduled jobs to monitor. Each entry defines the job class name,
+      # expected run interval, and the threshold after which it is considered stale.
+      SCHEDULED_JOBS = [
+        { name: 'RefreshMetadataViewsJob', interval_hours: 2, alert_after_hours: 3 },
+        { name: 'CleanupExpiredTokensJob', interval_hours: 24, alert_after_hours: 25 }
+      ].freeze
+
       # GET /api/v1/monitoring/sidekiq
       #
       # Returns a snapshot of Sidekiq operational state including queue depths,
-      # process count, scheduled and dead job counts.
+      # process count, scheduled and dead job counts, and heartbeat status of
+      # cron jobs (gap 2 — detects if a scheduled job has not run in its window).
       #
       # Healthy thresholds (logged as alerts when exceeded):
-      # - queue_depth > 100 jobs for more than 5 min → Sidekiq may be down
-      # - dead_count > 10 jobs → jobs are failing silently
+      # - queue_depth > 100 jobs → Sidekiq may be down
+      # - dead_count > 10 jobs → jobs are failing silently
+      # - job stale → scheduled job has not run within expected interval
       #
       # @return [JSON] Sidekiq stats with health indicators
       def sidekiq
@@ -45,12 +55,15 @@ def sidekiq
         stats = Sidekiq::Stats.new
         processes = Sidekiq::ProcessSet.new.to_a

-        queue_depths = build_queue_depths
-        total_depth = queue_depths.values.sum
-        dead_count = stats.dead_size
+        queue_depths = build_queue_depths
+        total_depth = queue_depths.values.sum
+        dead_count = stats.dead_size
+        job_heartbeats = build_job_heartbeats
+        any_stale = job_heartbeats.values.any? { |j| j[:stale] }

-        health_status = determine_health(total_depth, dead_count, processes.size)
+        health_status = determine_health(total_depth, dead_count, processes.size, any_stale: any_stale)
         emit_alerts(total_depth, dead_count, processes.size)
+        emit_stale_job_alerts(job_heartbeats)

         render json: {
           status: health_status,
@@ -70,12 +83,14 @@ def sidekiq
             retry: stats.retry_size,
             dead: dead_count
           },
+          scheduled_jobs: job_heartbeats,
           alerts: {
             queue_depth_threshold: QUEUE_DEPTH_ALERT_THRESHOLD,
             dead_queue_threshold: DEAD_QUEUE_ALERT_THRESHOLD,
             queue_depth_exceeded: total_depth > QUEUE_DEPTH_ALERT_THRESHOLD,
             dead_queue_exceeded: dead_count > DEAD_QUEUE_ALERT_THRESHOLD,
-            no_workers: processes.empty?
+            no_workers: processes.empty?,
+            stale_jobs: any_stale
           }
         }, status: health_status == 'ok' ? :ok : :service_unavailable
       end
@@ -98,10 +113,37 @@ def build_queue_depths
         end
       end

-      def determine_health(total_depth, dead_count, process_count)
+      # Reads last-run timestamps from Redis for each scheduled job and returns
+      # a hash with staleness status. Jobs that have never run return stale: true.
+      def build_job_heartbeats
+        Sidekiq.redis do |redis|
+          SCHEDULED_JOBS.each_with_object({}) do |config, hash|
+            hash[config[:name]] = build_heartbeat_entry(redis, config)
+          end
+        end
+      rescue StandardError => e
+        Rails.logger.warn(event: 'monitoring_heartbeat_read_error', error: e.message)
+        {}
+      end
+
+      def build_heartbeat_entry(redis, config)
+        raw = redis.call('GET', "prostaff:job_heartbeat:#{config[:name]}")
+        last_run = raw ? Time.zone.parse(raw) : nil
+        stale = last_run.nil? || last_run < config[:alert_after_hours].hours.ago
+
+        {
+          last_run_at: last_run&.iso8601,
+          expected_interval_hours: config[:interval_hours],
+          alert_after_hours: config[:alert_after_hours],
+          stale: stale
+        }
+      end
+
+      def determine_health(total_depth, dead_count, process_count, any_stale: false)
         return 'critical' if process_count.zero?
         return 'degraded' if dead_count > DEAD_QUEUE_ALERT_THRESHOLD
         return 'degraded' if total_depth > QUEUE_DEPTH_ALERT_THRESHOLD
+        return 'degraded' if any_stale

         'ok'
       end
@@ -135,6 +177,21 @@ def emit_alerts(total_depth, dead_count, process_count)
           threshold: DEAD_QUEUE_ALERT_THRESHOLD
         )
       end
+
+      def emit_stale_job_alerts(heartbeats)
+        heartbeats.each do |job_name, data|
+          next unless data[:stale]
+
+          Rails.logger.error(
+            event: 'scheduled_job_stale',
+            level: 'ALERT',
+            message: 'Scheduled job has not run within expected interval',
+            job: job_name,
+            last_run_at: data[:last_run_at],
+            alert_after_hours: data[:alert_after_hours]
+          )
+        end
+      end
     end
   end
 end
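With this change the endpoint's payload gains a scheduled_jobs section and a stale_jobs alert flag. As a rough illustration only (values invented, unrelated keys such as queues and counts omitted), a response with one stale job might look like:

    {
      "status": "degraded",
      "scheduled_jobs": {
        "RefreshMetadataViewsJob": {
          "last_run_at": "2024-01-15T03:12:45Z",
          "expected_interval_hours": 2,
          "alert_after_hours": 3,
          "stale": true
        },
        "CleanupExpiredTokensJob": {
          "last_run_at": "2024-01-15T05:00:02Z",
          "expected_interval_hours": 24,
          "alert_after_hours": 25,
          "stale": false
        }
      },
      "alerts": {
        "queue_depth_threshold": 100,
        "dead_queue_threshold": 10,
        "queue_depth_exceeded": false,
        "dead_queue_exceeded": false,
        "no_workers": false,
        "stale_jobs": true
      }
    }

Per determine_health, any stale job downgrades the overall status to "degraded", which also switches the HTTP status from 200 to 503 (:service_unavailable).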

app/jobs/application_job.rb

Lines changed: 26 additions & 2 deletions
@@ -1,9 +1,33 @@
 # frozen_string_literal: true

 class ApplicationJob < ActiveJob::Base
+  # Discard jobs whose associated record was deleted before the job ran.
+  # Without this, DeserializationError causes Sidekiq to retry up to 25 times.
+  discard_on ActiveJob::DeserializationError
+
   # Automatically retry jobs that encountered a deadlock
   # retry_on ActiveRecord::Deadlocked

-  # Most jobs are safe to ignore if the underlying records are no longer available
-  # discard_on ActiveJob::DeserializationError
+  protected
+
+  # Writes a "last ran at" timestamp to Sidekiq Redis so MonitoringController
+  # can detect when a scheduled job has not executed within its expected interval.
+  #
+  # Call this at the end of a successful #perform, before the rescue block.
+  # Safe to call even if Redis is unavailable — failures are warned and swallowed.
+  #
+  # Key format: prostaff:job_heartbeat:<ClassName>
+  # TTL: 7 days (survives a weekend without the job running)
+  def record_job_heartbeat
+    return unless defined?(Sidekiq)
+
+    key = "prostaff:job_heartbeat:#{self.class.name}"
+    Sidekiq.redis { |r| r.call('SET', key, Time.current.iso8601, 'EX', 7 * 24 * 3600) }
+  rescue StandardError => e
+    Rails.logger.warn(
+      event: 'job_heartbeat_error',
+      job: self.class.name,
+      error: e.message
+    )
+  end
 end
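The placement guidance in the docstring (record the heartbeat at the end of a successful #perform, before the rescue) is what the two job diffs below follow. Purely as an illustrative sketch, a new scheduled job would wire it up like this (SyncExampleDataJob and ExampleData are hypothetical names, not part of this commit):

    # Hypothetical job, for illustration only.
    class SyncExampleDataJob < ApplicationJob
      queue_as :default

      def perform
        ExampleData.sync! # illustrative work; replace with real logic

        record_job_heartbeat # only reached on success, so a stale heartbeat
                             # means the job failed or never ran
      rescue StandardError => e
        Rails.logger.error(event: 'sync_example_data_failed', error: e.message)
        raise
      end
    end

For the monitoring side to pick it up, the class name would also need an entry in MonitoringController::SCHEDULED_JOBS with its interval_hours and alert_after_hours.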

app/jobs/cleanup_expired_tokens_job.rb

Lines changed: 2 additions & 0 deletions
@@ -33,6 +33,8 @@ def perform
       password_reset_deleted: password_reset_deleted,
       blacklist_deleted: blacklist_deleted
     )
+
+    record_job_heartbeat
   rescue StandardError => e
     duration_ms = ((Time.current - start_time) * 1000).round


app/jobs/refresh_metadata_views_job.rb

Lines changed: 1 addition & 0 deletions
@@ -47,6 +47,7 @@ def perform
       duration_ms: duration_ms
     )

+    record_job_heartbeat
     duration_ms
   ensure
     release_lock

config/environments/production.rb

Lines changed: 2 additions & 2 deletions
@@ -2,7 +2,7 @@

 require 'active_support/core_ext/integer/time'

-Rails.application.configure do
+Rails.application.configure do # rubocop:disable Metrics/BlockLength
   config.cache_classes = true

   config.eager_load = true
@@ -60,7 +60,7 @@
     {
       url: ENV['REDIS_URL'],
       reconnect_attempts: 3,
-      error_handler: lambda { |method:, returning:, exception:|
+      error_handler: lambda { |_method:, _returning:, exception:|
         Rails.logger.warn "Rails cache Redis error: #{exception.message}"
       }
     }

0 commit comments