Skip to content

Commit 604b735

Browse files
RUBY-3778 Server deprioritizing in replica sets (#2993)
1 parent fa600a3 commit 604b735

File tree

5 files changed

+114
-2
lines changed

5 files changed

+114
-2
lines changed

lib/mongo/retryable.rb

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,16 +46,36 @@ module Retryable
4646
# @api private
4747
#
4848
# @return [ Mongo::Server ] A server matching the server preference.
49-
def select_server(cluster, server_selector, session, failed_server = nil, timeout: nil)
49+
def select_server(cluster, server_selector, session, failed_server = nil, error: nil, timeout: nil)
50+
deprioritized = if failed_server && deprioritize_server?(cluster, error)
51+
[failed_server]
52+
else
53+
[]
54+
end
5055
server_selector.select_server(
5156
cluster,
5257
nil,
5358
session,
54-
deprioritized: [failed_server].compact,
59+
deprioritized: deprioritized,
5560
timeout: timeout
5661
)
5762
end
5863

64+
private
65+
66+
# Whether the failed server should be deprioritized during server
67+
# selection for a retry attempt. For sharded and load-balanced
68+
# topologies, servers are always deprioritized on any retryable error.
69+
# For replica sets, servers are only deprioritized when the error
70+
# carries the SystemOverloadedError label.
71+
def deprioritize_server?(cluster, error)
72+
return true if cluster.sharded? || cluster.load_balanced?
73+
74+
error.respond_to?(:label?) && error.label?('SystemOverloadedError')
75+
end
76+
77+
public
78+
5979
# Returns the read worker for handling retryable reads.
6080
#
6181
# @api private

lib/mongo/retryable/read_worker.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,7 @@ def select_server_for_retry(original_error, session, server_selector, context, f
327327
server_selector,
328328
session,
329329
failed_server,
330+
error: original_error,
330331
timeout: context&.remaining_timeout_sec
331332
)
332333
rescue Error, Error::AuthError => e

lib/mongo/retryable/write_worker.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,7 @@ def modern_write_with_retry(session, server, context, &block)
272272
#
273273
# @return [ Result ] The result of the operation.
274274
def retry_write(original_error, txn_num, context:, failed_server: nil, &block)
275+
failed_error = failed_error || original_error
275276
context&.check_timeout!
276277

277278
session = context.session
@@ -286,6 +287,7 @@ def retry_write(original_error, txn_num, context:, failed_server: nil, &block)
286287
ServerSelector.primary,
287288
session,
288289
failed_server,
290+
error: failed_error,
289291
timeout: context.remaining_timeout_sec
290292
)
291293

@@ -311,10 +313,12 @@ def retry_write(original_error, txn_num, context:, failed_server: nil, &block)
311313
rescue *retryable_exceptions, Error::PoolError => e
312314
maybe_fail_on_retryable(e, original_error, context, attempt)
313315
failed_server = server
316+
failed_error = e
314317
retry
315318
rescue Error::OperationFailure::Family => e
316319
maybe_fail_on_operation_failure(e, original_error, context, attempt)
317320
failed_server = server
321+
failed_error = e
318322
retry
319323
rescue Mongo::Error::TimeoutError
320324
raise

spec/integration/retryable_reads_errors_spec.rb

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,4 +271,89 @@
271271
end
272272
end
273273
end
274+
275+
context 'Retries in a replica set' do
276+
require_topology :replica_set
277+
min_server_version '4.4'
278+
279+
let(:subscriber) { Mrss::EventSubscriber.new }
280+
281+
let(:find_failed_events) do
282+
subscriber.failed_events.select { |e| e.command_name == 'find' }
283+
end
284+
285+
let(:find_succeeded_events) do
286+
subscriber.succeeded_events.select { |e| e.command_name == 'find' }
287+
end
288+
289+
let(:options) { {} }
290+
291+
after do
292+
authorized_client.use(:admin).command(
293+
configureFailPoint: 'failCommand',
294+
mode: 'off'
295+
)
296+
client.close
297+
end
298+
299+
context 'when error has SystemOverloadedError label' do
300+
let(:client) do
301+
authorized_client.with(
302+
retry_reads: true,
303+
read: { mode: :primary_preferred }
304+
)
305+
end
306+
307+
before do
308+
authorized_client.use(:admin).command(
309+
configureFailPoint: 'failCommand',
310+
mode: { times: 1 },
311+
data: {
312+
failCommands: %w(find),
313+
errorCode: 6,
314+
errorLabels: %w(RetryableError SystemOverloadedError)
315+
}
316+
)
317+
end
318+
319+
it 'retries on a different server' do
320+
client.subscribe(Mongo::Monitoring::COMMAND, subscriber)
321+
subscriber.clear_events!
322+
expect { collection.find.first }.not_to raise_error
323+
expect(find_failed_events.length).to eq(1)
324+
expect(find_succeeded_events.length).to eq(1)
325+
expect(find_failed_events.first.address).not_to eq(find_succeeded_events.first.address)
326+
end
327+
end
328+
329+
context 'when error does not have SystemOverloadedError label' do
330+
let(:client) do
331+
authorized_client.with(
332+
retry_reads: true,
333+
read: { mode: :primary_preferred }
334+
)
335+
end
336+
337+
before do
338+
authorized_client.use(:admin).command(
339+
configureFailPoint: 'failCommand',
340+
mode: { times: 1 },
341+
data: {
342+
failCommands: %w(find),
343+
errorCode: 6,
344+
errorLabels: %w(RetryableError)
345+
}
346+
)
347+
end
348+
349+
it 'retries on the same server' do
350+
client.subscribe(Mongo::Monitoring::COMMAND, subscriber)
351+
subscriber.clear_events!
352+
expect { collection.find.first }.not_to raise_error
353+
expect(find_failed_events.length).to eq(1)
354+
expect(find_succeeded_events.length).to eq(1)
355+
expect(find_failed_events.first.address).to eq(find_succeeded_events.first.address)
356+
end
357+
end
358+
end
274359
end

spec/mongo/retryable_spec.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,8 @@ def retry_write_allowed?(*args)
115115
let(:cluster) do
116116
double('cluster', next_primary: server).tap do |cluster|
117117
allow(cluster).to receive(:replica_set?).and_return(true)
118+
allow(cluster).to receive(:sharded?).and_return(false)
119+
allow(cluster).to receive(:load_balanced?).and_return(false)
118120
allow(cluster).to receive(:addresses).and_return(['x'])
119121
end
120122
end

0 commit comments

Comments
 (0)