@@ -11,6 +11,7 @@ module V1
     #
     # Gap coverage (FAILURE_MODE_ANALYSIS.md):
     # Gap 1 — Sidekiq queue depth alert
+    # Gap 2 — Scheduled job heartbeat monitoring (stale job detection)
     # Gap 4 — Dead queue monitoring
     #
     # @example Check Sidekiq health
@@ -22,14 +23,23 @@ class MonitoringController < BaseController
       QUEUE_DEPTH_ALERT_THRESHOLD = ENV.fetch('SIDEKIQ_QUEUE_ALERT_THRESHOLD', 100).to_i
       DEAD_QUEUE_ALERT_THRESHOLD = ENV.fetch('SIDEKIQ_DEAD_ALERT_THRESHOLD', 10).to_i

+      # Scheduled jobs to monitor. Each entry defines the job class name,
+      # expected run interval, and the threshold after which it is considered stale.
+      SCHEDULED_JOBS = [
+        { name: 'RefreshMetadataViewsJob', interval_hours: 2, alert_after_hours: 3 },
+        { name: 'CleanupExpiredTokensJob', interval_hours: 24, alert_after_hours: 25 }
+      ].freeze
+
       # GET /api/v1/monitoring/sidekiq
       #
       # Returns a snapshot of Sidekiq operational state including queue depths,
-      # process count, scheduled and dead job counts.
+      # process count, scheduled and dead job counts, and heartbeat status of
+      # cron jobs (gap 2 — detects if a scheduled job has not run in its window).
       #
       # Healthy thresholds (logged as alerts when exceeded):
-      # - queue_depth > 100 jobs for more than 5 min → Sidekiq may be down
-      # - dead_count > 10 jobs → jobs are failing silently
+      # - queue_depth > 100 jobs → Sidekiq may be down
+      # - dead_count > 10 jobs → jobs are failing silently
+      # - job stale → scheduled job has not run within expected interval
       #
       # @return [JSON] Sidekiq stats with health indicators
       def sidekiq
@@ -45,12 +55,15 @@ def sidekiq
         stats = Sidekiq::Stats.new
         processes = Sidekiq::ProcessSet.new.to_a

-        queue_depths = build_queue_depths
-        total_depth = queue_depths.values.sum
-        dead_count = stats.dead_size
+        queue_depths   = build_queue_depths
+        total_depth    = queue_depths.values.sum
+        dead_count     = stats.dead_size
+        job_heartbeats = build_job_heartbeats
+        any_stale      = job_heartbeats.values.any? { |j| j[:stale] }

-        health_status = determine_health(total_depth, dead_count, processes.size)
+        health_status = determine_health(total_depth, dead_count, processes.size, any_stale: any_stale)
         emit_alerts(total_depth, dead_count, processes.size)
+        emit_stale_job_alerts(job_heartbeats)

         render json: {
           status: health_status,
@@ -70,12 +83,14 @@ def sidekiq
             retry: stats.retry_size,
             dead: dead_count
           },
+          scheduled_jobs: job_heartbeats,
           alerts: {
             queue_depth_threshold: QUEUE_DEPTH_ALERT_THRESHOLD,
             dead_queue_threshold: DEAD_QUEUE_ALERT_THRESHOLD,
             queue_depth_exceeded: total_depth > QUEUE_DEPTH_ALERT_THRESHOLD,
             dead_queue_exceeded: dead_count > DEAD_QUEUE_ALERT_THRESHOLD,
-            no_workers: processes.empty?
+            no_workers: processes.empty?,
+            stale_jobs: any_stale
           }
         }, status: health_status == 'ok' ? :ok : :service_unavailable
       end
@@ -98,10 +113,37 @@ def build_queue_depths
         end
       end

-      def determine_health(total_depth, dead_count, process_count)
+      # Reads last-run timestamps from Redis for each scheduled job and returns
+      # a hash with staleness status. Jobs that have never run return stale: true.
+      def build_job_heartbeats
+        Sidekiq.redis do |redis|
+          SCHEDULED_JOBS.each_with_object({}) do |config, hash|
+            hash[config[:name]] = build_heartbeat_entry(redis, config)
+          end
+        end
+      rescue StandardError => e
+        Rails.logger.warn(event: 'monitoring_heartbeat_read_error', error: e.message)
+        {}
+      end
+
+      def build_heartbeat_entry(redis, config)
+        raw = redis.call('GET', "prostaff:job_heartbeat:#{config[:name]}")
+        last_run = raw ? Time.zone.parse(raw) : nil
+        stale = last_run.nil? || last_run < config[:alert_after_hours].hours.ago
+
+        {
+          last_run_at: last_run&.iso8601,
+          expected_interval_hours: config[:interval_hours],
+          alert_after_hours: config[:alert_after_hours],
+          stale: stale
+        }
+      end
+
+      def determine_health(total_depth, dead_count, process_count, any_stale: false)
         return 'critical' if process_count.zero?
         return 'degraded' if dead_count > DEAD_QUEUE_ALERT_THRESHOLD
         return 'degraded' if total_depth > QUEUE_DEPTH_ALERT_THRESHOLD
+        return 'degraded' if any_stale

         'ok'
       end
@@ -135,6 +177,21 @@ def emit_alerts(total_depth, dead_count, process_count)
           threshold: DEAD_QUEUE_ALERT_THRESHOLD
         )
       end
+
+      def emit_stale_job_alerts(heartbeats)
+        heartbeats.each do |job_name, data|
+          next unless data[:stale]
+
+          Rails.logger.error(
+            event: 'scheduled_job_stale',
+            level: 'ALERT',
+            message: 'Scheduled job has not run within expected interval',
+            job: job_name,
+            last_run_at: data[:last_run_at],
+            alert_after_hours: data[:alert_after_hours]
+          )
+        end
+      end
     end
   end
 end
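
The diff only covers the reader side of the heartbeat mechanism; the jobs that write the `prostaff:job_heartbeat:<ClassName>` keys are not part of this commit. For context, a minimal sketch of what the writer side might look like, assuming each monitored job records its own heartbeat after a successful run (the `JobHeartbeat` module and the `perform` body here are hypothetical, not from this commit):

```ruby
# Hypothetical writer-side sketch (not part of this commit). Assumes each
# monitored job records a heartbeat after finishing, using the same key
# format that build_heartbeat_entry reads back.
module JobHeartbeat
  def self.record(job_class_name)
    Sidekiq.redis do |redis|
      # Store the completion time as ISO 8601 so Time.zone.parse can decode it.
      redis.call('SET', "prostaff:job_heartbeat:#{job_class_name}", Time.current.iso8601)
    end
  end
end

class RefreshMetadataViewsJob
  include Sidekiq::Job

  def perform
    # ... actual view-refresh work ...
    JobHeartbeat.record(self.class.name)
  end
end
```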
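To exercise the staleness path end to end, one can backdate a heartbeat from a Rails console (illustrative only; assumes the key format above):

```ruby
# Backdate CleanupExpiredTokensJob's heartbeat past its 25-hour alert window.
Sidekiq.redis do |redis|
  redis.call('SET', 'prostaff:job_heartbeat:CleanupExpiredTokensJob', 26.hours.ago.iso8601)
end

# GET /api/v1/monitoring/sidekiq should now report the entry as stale:
# 26.hours.ago is earlier than 25.hours.ago, so build_heartbeat_entry flags it,
# any_stale turns determine_health 'degraded', the endpoint answers 503, and
# emit_stale_job_alerts logs a 'scheduled_job_stale' ALERT.
```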