fix: replace N+1 lookup with single join query in DelayedJobsRecover

serdarozerr · serdarozerr · commit df047e329aec · 2026-05-05T17:20:40.000+02:00
The previous implementation queried dead delayed_jobs and then performed
separate lookups per row to find the pollable job, the entity, and its last
operation state. Replace this with a single 4-table join across
service_instance_operations, service_instances, jobs, and delayed_jobs,
applying all filter conditions in one query.
diff --git a/app/jobs/runtime/delayed_jobs_recover.rb b/app/jobs/runtime/delayed_jobs_recover.rb
@@ -2,10 +2,6 @@ module VCAP::CloudController
   module Jobs
     module Runtime
       class DelayedJobsRecover < VCAP::CloudController::Jobs::CCJob
-        RECOVERABLE_OPERATIONS = %w[
-          service_instance.create
-        ].freeze
-
         def perform
           logger.info('Recover halted delayed jobs')
           recover
@@ -18,48 +14,52 @@ def max_attempts
         private
 
         def recover
-          # find delayed jobs where failed_at is set (permanently failed)
-          # and still within the max polling duration (not expired)
+          # Find stuck service instance create operations where the broker is still working
+          # but CC's polling job has permanently failed due to a transient error (e.g. brief db connection flip).
+          # Join path: service_instance_operations → service_instances → jobs → delayed_jobs.
+          #
+          # Filters:
+          #   - service_instance_operations.state='in progress': the broker has not yet reported a final state
+          #     (succeeded or failed) that CC could successfully persist; if CC had received and saved a final
+          #     state from the broker, this column would already be 'succeeded' or 'failed' — not 'in progress'
+          #   - service_instance_operations.type='create': scope to create operations only
+          #   - service_instance_operations.created_at > cutoff: operations beyond the max async polling window
+          #     are intentionally excluded — the broker has given up on them too, so re-enqueuing is pointless
+          #   - jobs.state IN (POLLING, FAILED): the pollable job has not reached a terminal success state;
+          #     POLLING covers the case where the failure hook could not persist FAILED during the db connection flip
+          #   - jobs.operation='service_instance.create': prevents matching update/delete jobs for the same
+          #     service instance that happen to share the same resource_guid
+          #   - delayed_jobs.failed_at IS NOT NULL: the delayed job permanently failed (exhausted max_attempts);
+          #     jobs still alive or locked have failed_at=NULL and must not be touched
           cutoff_time = Time.now - default_maximum_duration_seconds
-          dead_delayed_jobs = Delayed::Job.
-                              exclude(failed_at: nil).
-                              where { created_at > cutoff_time }.
-                              order(:created_at).
-                              limit(batch_size)
-
-          dead_delayed_jobs.each do |delayed|
-            # pollable job state can be POLLING or FAILED depending on whether the failure
-            # hook managed to persist before the db connection was lost
-            pollable = PollableJobModel.where(delayed_job_guid: delayed.guid).
-                       where(state: [PollableJobModel::POLLING_STATE, PollableJobModel::FAILED_STATE]).
-                       first
-            next unless pollable
-            next unless RECOVERABLE_OPERATIONS.include?(pollable.operation)
-
-            # last_operation.state must be 'in progress'. This confirms the broker is still
-            # working on the operation and CC is the one that gave up, not the broker
-            entity = find_entity(pollable)
-            next unless entity
-            next unless entity.last_operation&.state == 'in progress'
+          stuck = ServiceInstanceOperation.
+                  join(:service_instances, id: Sequel[:service_instance_operations][:service_instance_id]).
+                  join(:jobs, resource_guid: Sequel[:service_instances][:guid]).
+                  join(:delayed_jobs, guid: Sequel[:jobs][:delayed_job_guid]).
+                  where(Sequel[:service_instance_operations][:state] => 'in progress').
+                  where(Sequel[:service_instance_operations][:type] => 'create').
+                  where { Sequel[:service_instance_operations][:created_at] > cutoff_time }.
+                  where(Sequel[:jobs][:state] => [PollableJobModel::POLLING_STATE, PollableJobModel::FAILED_STATE]).
+                  where(Sequel[:jobs][:operation] => 'service_instance.create').
+                  exclude(Sequel[:delayed_jobs][:failed_at] => nil).
+                  select(Sequel[:jobs][:guid].as(:pollable_guid), Sequel[:delayed_jobs][:guid].as(:dj_guid)).
+                  order(Sequel[:service_instance_operations][:created_at]).
+                  limit(batch_size)
 
-            reenqueue(pollable, delayed)
-          end
-        end
+          stuck.each do |row|
+            delayed = Delayed::Job.first(guid: row[:dj_guid])
+            next unless delayed
 
-        def find_entity(pollable)
-          # TODO: resource_type field can be used
-          case pollable.operation
-          when 'service_instance.create'
-            ManagedServiceInstance.first(guid: pollable.resource_guid)
+            reenqueue(row[:pollable_guid], delayed)
           end
         end
 
-        def reenqueue(pollable, delayed)
+        def reenqueue(pollable_guid, delayed)
           # re-verify atomically that the pollable job still points to this dead delayed_job.
           # if another process already re-enqueued a new job, pollable.delayed_job_guid was
           # updated to the new delayed_job's guid, so where clause returns nil and we skip safely.
           PollableJobModel.db.transaction do
-            pjob = PollableJobModel.where(guid: pollable.guid,
+            pjob = PollableJobModel.where(guid: pollable_guid,
                                           delayed_job_guid: delayed.guid,
                                           state: [PollableJobModel::POLLING_STATE, PollableJobModel::FAILED_STATE]).
                    for_update.first
@@ -68,7 +68,7 @@ def reenqueue(pollable, delayed)
             # bring the pollable job into the clean polling state
             pjob.update(cf_api_error: nil, state: PollableJobModel::POLLING_STATE)
 
-            # unwrap the serialized handler and re-enqueue via the reoccurring job
+            # unwrap the serialized handler and re-enqueue via the reoccurring job's enqueue_next_job method
             inner_job = Jobs::Enqueuer.unwrap_job(delayed.payload_object)
             inner_job.send(:enqueue_next_job, pjob)
           end
diff --git a/db/migrations/20260505071445_add_jobs_operation_state_index.rb b/db/migrations/20260505071445_add_jobs_operation_state_index.rb
@@ -0,0 +1,38 @@
+Sequel.migration do
+  no_transaction # required for concurrently option on postgres
+
+  up do
+    if database_type == :postgres
+      VCAP::Migration.with_concurrent_timeout(self) do
+        add_index :jobs, %i[operation state],
+                  name: :jobs_operation_state_index,
+                  where: "state IN ('POLLING', 'FAILED')",
+                  if_not_exists: true,
+                  concurrently: true
+      end
+    elsif database_type == :mysql
+      alter_table(:jobs) do
+        # rubocop:disable Sequel/ConcurrentIndex -- MySQL does not support concurrent index operations
+        add_index %i[operation state], name: :jobs_operation_state_index unless @db.indexes(:jobs).key?(:jobs_operation_state_index)
+        # rubocop:enable Sequel/ConcurrentIndex
+      end
+    end
+  end
+
+  down do
+    if database_type == :postgres
+      VCAP::Migration.with_concurrent_timeout(self) do
+        drop_index :jobs, %i[operation state],
+                   name: :jobs_operation_state_index,
+                   if_exists: true,
+                   concurrently: true
+      end
+    elsif database_type == :mysql
+      alter_table(:jobs) do
+        # rubocop:disable Sequel/ConcurrentIndex
+        drop_index %i[operation state], name: :jobs_operation_state_index if @db.indexes(:jobs).key?(:jobs_operation_state_index)
+        # rubocop:enable Sequel/ConcurrentIndex
+      end
+    end
+  end
+end
diff --git a/spec/migrations/20260505071445_add_jobs_operation_state_index_spec.rb b/spec/migrations/20260505071445_add_jobs_operation_state_index_spec.rb
@@ -0,0 +1,63 @@
+# rubocop:disable Migration/TooManyMigrationRuns
+require 'spec_helper'
+require 'migrations/helpers/migration_shared_context'
+
+def operation_state_partial_index_present
+  # Partial indexes are not returned by `db.indexes`, so we query the pg_indexes catalog view directly instead.
+  partial_indexes = db.fetch("SELECT * FROM pg_indexes WHERE tablename = 'jobs' AND indexname = 'jobs_operation_state_index';")
+
+  index_present = false
+  partial_indexes.each do |_index|
+    index_present = true
+  end
+
+  index_present
+end
+
+RSpec.describe 'migration to add operation_state_index on jobs table', isolation: :truncation, type: :migration do
+  include_context 'migration' do
+    let(:migration_filename) { '20260505071445_add_jobs_operation_state_index.rb' }
+  end
+
+  describe 'jobs table' do
+    it 'adds index and handles idempotency gracefully' do
+      if db.database_type == :postgres
+        # Test up migration
+        expect(operation_state_partial_index_present).to be_falsey
+        expect { Sequel::Migrator.run(db, migrations_path, target: current_migration_index, allow_missing_migration_files: true) }.not_to raise_error
+        expect(operation_state_partial_index_present).to be_truthy
+
+        # Test up migration idempotency
+        expect { Sequel::Migrator.run(db, migrations_path, target: current_migration_index, allow_missing_migration_files: true) }.not_to raise_error
+        expect(operation_state_partial_index_present).to be_truthy
+
+        # Test down migration
+        expect { Sequel::Migrator.run(db, migrations_path, target: current_migration_index - 1, allow_missing_migration_files: true) }.not_to raise_error
+        expect(operation_state_partial_index_present).to be_falsey
+
+        # Test down migration idempotency
+        expect { Sequel::Migrator.run(db, migrations_path, target: current_migration_index - 1, allow_missing_migration_files: true) }.not_to raise_error
+        expect(operation_state_partial_index_present).to be_falsey
+
+      elsif db.database_type == :mysql
+        # Test up migration
+        expect(db.indexes(:jobs)).not_to include(:jobs_operation_state_index)
+        expect { Sequel::Migrator.run(db, migrations_path, target: current_migration_index, allow_missing_migration_files: true) }.not_to raise_error
+        expect(db.indexes(:jobs)).to include(:jobs_operation_state_index)
+
+        # Test up migration idempotency
+        expect { Sequel::Migrator.run(db, migrations_path, target: current_migration_index, allow_missing_migration_files: true) }.not_to raise_error
+        expect(db.indexes(:jobs)).to include(:jobs_operation_state_index)
+
+        # Test down migration
+        expect { Sequel::Migrator.run(db, migrations_path, target: current_migration_index - 1, allow_missing_migration_files: true) }.not_to raise_error
+        expect(db.indexes(:jobs)).not_to include(:jobs_operation_state_index)
+
+        # Test down migration idempotency
+        expect { Sequel::Migrator.run(db, migrations_path, target: current_migration_index - 1, allow_missing_migration_files: true) }.not_to raise_error
+        expect(db.indexes(:jobs)).not_to include(:jobs_operation_state_index)
+      end
+    end
+  end
+end
+# rubocop:enable Migration/TooManyMigrationRuns
diff --git a/spec/unit/jobs/runtime/delayed_jobs_recover_spec.rb b/spec/unit/jobs/runtime/delayed_jobs_recover_spec.rb