Skip to content

Commit 07c8edf

Browse files
Fix race condition in ProcessesSync causing SSH key mismatch (#5033)
The sync could update an LRP with stale SSH route data from a previous version when the process version changed between the initial scan and the batch execution. As a result, the LRP has a different RSA key pair for the diego-sshd daemon and the diego-ssh route. "cf ssh" is broken until the app is manually restarted. Fix: Before executing desire/update, verify the process version still matches what was recorded during the initial scan. Skip if mismatched - the next sync cycle will handle it correctly. Co-authored-by: Philipp Thun <philipp.thun@sap.com>
1 parent 5bc8f7d commit 07c8edf

File tree

2 files changed

+42
-4
lines changed

2 files changed

+42
-4
lines changed

lib/cloud_controller/diego/processes_sync.rb

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,15 @@ def sync
2121
@bump_freshness = true
2222
diego_lrps = bbs_apps_client.fetch_scheduling_infos.index_by { |d| d.desired_lrp_key.process_guid }
2323
logger.info('fetched-scheduling-infos')
24-
to_desire = []
24+
to_desire = {}
2525
to_update = {}
2626
batched_processes_for_sync do |processes|
2727
processes.each do |process|
2828
process_guid = ProcessGuid.from_process(process)
2929
diego_lrp = diego_lrps.delete(process_guid)
3030

3131
if diego_lrp.nil?
32-
to_desire.append(process.id)
32+
to_desire[process.id] = process_guid
3333
elsif process.updated_at.to_f.to_s != diego_lrp.annotation
3434
to_update[process.id] = diego_lrp
3535
end
@@ -87,7 +87,10 @@ def process_workpool_exceptions(exceptions)
8787
def update_lrps(to_update)
8888
batched_processes(to_update.keys) do |processes|
8989
processes.each do |process|
90-
workpool.submit(process, to_update[process.id]) do |p, l|
90+
existing_lrp = to_update[process.id]
91+
next if ProcessGuid.from_process(process) != existing_lrp.desired_lrp_key.process_guid
92+
93+
workpool.submit(process, existing_lrp) do |p, l|
9194
logger.info('updating-lrp', process_guid: p.guid, app_guid: p.app_guid)
9295
bbs_apps_client.update_app(p, l)
9396
logger.info('update-lrp', process_guid: p.guid)
@@ -97,8 +100,11 @@ def update_lrps(to_update)
97100
end
98101

99102
def desire_lrps(to_desire)
100-
batched_processes(to_desire) do |processes|
103+
batched_processes(to_desire.keys) do |processes|
101104
processes.each do |process|
105+
desired_process_guid = to_desire[process.id]
106+
next if ProcessGuid.from_process(process) != desired_process_guid
107+
102108
workpool.submit(process) do |p|
103109
logger.info('desiring-lrp', process_guid: p.guid, app_guid: p.app_guid)
104110
bbs_apps_client.desire_app(p)

spec/isolated_specs/processes_sync_spec.rb

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,22 @@ module Diego
121121
expect(bbs_apps_client).to have_received(:bump_freshness).once
122122
end
123123

124+
context 'when the stale process gets a new version between fetching for sync (only guid + version + created_at) and fetching for actual update (full model)' do
125+
before do
126+
allow(subject).to receive(:update_lrps).and_wrap_original do |method, *args|
127+
stale_process.set_new_version
128+
stale_process.save
129+
method.call(*args)
130+
end
131+
end
132+
133+
it 'skips the (obsolete) update' do
134+
allow(bbs_apps_client).to receive(:update_app)
135+
subject.sync
136+
expect(bbs_apps_client).not_to have_received(:update_app)
137+
end
138+
end
139+
124140
context 'when the process is a cnb app' do
125141
let(:app) { AppModel.make(:cnb, droplet: DropletModel.make(:cnb)) }
126142
let!(:stale_process) { ProcessModel.make(:cnb, state: 'STARTED', app: app) }
@@ -253,6 +269,22 @@ module Diego
253269
expect(bbs_apps_client).to have_received(:bump_freshness).once
254270
end
255271

272+
context 'when the missing process gets a new version between fetching for sync (only guid + version + created_at) and fetching for actual creation (full model)' do
273+
before do
274+
allow(subject).to receive(:desire_lrps).and_wrap_original do |method, *args|
275+
missing_process.set_new_version
276+
missing_process.save
277+
method.call(*args)
278+
end
279+
end
280+
281+
it 'skips the (obsolete) creation' do
282+
allow(bbs_apps_client).to receive(:desire_app)
283+
subject.sync
284+
expect(bbs_apps_client).not_to have_received(:desire_app)
285+
end
286+
end
287+
256288
context 'when desiring app fails' do
257289
# bbs_apps_client will raise ApiErrors as of right now, we should think about factoring that out so that
258290
# the background job doesn't have to deal with API concerns

0 commit comments

Comments
 (0)