Skip to content

Commit 2f62bde

Browse files
RUBY-3700 Do not clear pool on retryable errors (#2990)
1 parent 7e7e15d commit 2f62bde

13 files changed

+592
-20
lines changed

lib/mongo/server.rb

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -507,11 +507,13 @@ def with_connection(connection_global_id: nil, context: nil, &block)
507507
def handle_handshake_failure!
508508
yield
509509
rescue Mongo::Error::SocketError, Mongo::Error::SocketTimeoutError => e
510-
unknown!(
511-
generation: e.generation,
512-
service_id: e.service_id,
513-
stop_push_monitor: true,
514-
)
510+
unless e.label?('SystemOverloadedError')
511+
unknown!(
512+
generation: e.generation,
513+
service_id: e.service_id,
514+
stop_push_monitor: true,
515+
)
516+
end
515517
raise
516518
end
517519

lib/mongo/server/connection_pool.rb

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -995,17 +995,26 @@ def maybe_raise_pool_cleared!(connection, e)
995995
def connect_connection(connection, context = nil)
996996
begin
997997
connection.connect!(context)
998-
rescue Exception
998+
rescue Exception => exc
999+
# When a connection encounters an error during creation
1000+
# (including handshake and authentication), mark the server
1001+
# as Unknown so the pool is cleared (per CMAP spec).
1002+
# The unknown! call must happen before disconnect! so that
1003+
# the ConnectionPoolCleared event is published before the
1004+
# ConnectionClosed event.
1005+
# Errors with the SystemOverloadedError label (network errors
1006+
# during handshake) are excluded per the SDAM spec — those
1007+
# must not change the server description.
1008+
if exc.is_a?(Mongo::Error) && !exc.label?('SystemOverloadedError')
1009+
@server.unknown!(
1010+
generation: exc.generation,
1011+
service_id: exc.service_id,
1012+
stop_push_monitor: true,
1013+
)
1014+
end
9991015
connection.disconnect!(reason: :error)
10001016
raise
10011017
end
1002-
rescue Error::SocketError, Error::SocketTimeoutError => exc
1003-
@server.unknown!(
1004-
generation: exc.generation,
1005-
service_id: exc.service_id,
1006-
stop_push_monitor: true,
1007-
)
1008-
raise
10091018
end
10101019

10111020
def check_invariants

lib/mongo/server/pending_connection.rb

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,19 @@ def handshake!(speculative_auth_doc: nil)
160160
log_prefix: options[:log_prefix],
161161
bg_error_backtrace: options[:bg_error_backtrace],
162162
)
163+
if exc.is_a?(::Mongo::Error)
164+
# The SystemOverloadedError label marks errors as back-pressure
165+
# signals that should not cause the server to be marked Unknown.
166+
# Per the SDAM spec, this label only applies to network errors
167+
# during the handshake. Command errors (e.g. OperationFailure
168+
# with "node is shutting down" codes) must still flow through
169+
# normal SDAM error handling so the server is marked Unknown
170+
# and the pool is cleared.
171+
if exc.is_a?(Error::SocketError) || exc.is_a?(Error::SocketTimeoutError)
172+
exc.add_label('SystemOverloadedError')
173+
exc.add_label('RetryableError')
174+
end
175+
end
163176
raise
164177
end
165178
end

lib/mongo/session.rb

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -456,13 +456,26 @@ def with_transaction(options = nil)
456456
Utils.monotonic_time + 120
457457
end
458458
transaction_in_progress = false
459+
transaction_attempt = 0
460+
last_error = nil
461+
459462
loop do
463+
if transaction_attempt > 0
464+
backoff = backoff_seconds_for_retry(transaction_attempt)
465+
if backoff_would_exceed_deadline?(deadline, backoff)
466+
raise(last_error)
467+
end
468+
sleep(backoff)
469+
end
470+
460471
commit_options = {}
461472
if options
462473
commit_options[:write_concern] = options[:write_concern]
463474
end
464475
start_transaction(options)
465476
transaction_in_progress = true
477+
transaction_attempt += 1
478+
466479
begin
467480
rv = yield self
468481
rescue Exception => e
@@ -479,6 +492,7 @@ def with_transaction(options = nil)
479492
end
480493

481494
if e.is_a?(Mongo::Error) && e.label?('TransientTransactionError')
495+
last_error = e
482496
next
483497
end
484498

@@ -495,7 +509,7 @@ def with_transaction(options = nil)
495509
return rv
496510
rescue Mongo::Error => e
497511
if e.label?('UnknownTransactionCommitResult')
498-
if deadline_expired?(deadline) ||
512+
if deadline_expired?(deadline) ||
499513
e.is_a?(Error::OperationFailure::Family) && e.max_time_ms_expired?
500514
then
501515
transaction_in_progress = false
@@ -516,6 +530,7 @@ def with_transaction(options = nil)
516530
transaction_in_progress = false
517531
raise
518532
end
533+
last_error = e
519534
@state = NO_TRANSACTION_STATE
520535
next
521536
else
@@ -1312,5 +1327,20 @@ def deadline_expired?(deadline)
13121327
Utils.monotonic_time >= deadline
13131328
end
13141329
end
1330+
1331+
# Exponential backoff settings for with_transaction retries.
1332+
BACKOFF_INITIAL = 0.005
1333+
BACKOFF_MAX = 0.5
1334+
1335+
def backoff_seconds_for_retry(transaction_attempt)
1336+
exponential = BACKOFF_INITIAL * (1.5 ** (transaction_attempt - 1))
1337+
Random.rand * [exponential, BACKOFF_MAX].min
1338+
end
1339+
1340+
def backoff_would_exceed_deadline?(deadline, backoff_seconds)
1341+
return false if deadline.zero?
1342+
1343+
Utils.monotonic_time + backoff_seconds >= deadline
1344+
end
13151345
end
13161346
end

spec/mongo/server/connection_spec.rb

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,6 @@ class ConnectionSpecTestException < Exception; end
257257
end
258258

259259
it_behaves_like 'failing connection with server diagnostics'
260-
it_behaves_like 'marks server unknown'
261260
it_behaves_like 'logs a warning'
262261
it_behaves_like 'adds server diagnostics'
263262

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# frozen_string_literal: true
2+
# rubocop:todo all
3+
4+
require 'spec_helper'
5+
6+
describe Mongo::Session do
7+
require_topology :replica_set
8+
9+
describe 'transactions convenient API prose tests' do
10+
let(:client) { authorized_client }
11+
let(:admin_client) { authorized_client.use('admin') }
12+
let(:collection) { client['session-transaction-prose-test'] }
13+
14+
before do
15+
collection.delete_many
16+
end
17+
18+
after do
19+
disable_fail_command
20+
end
21+
22+
# Prose test from:
23+
# specifications/source/transactions-convenient-api/tests/README.md
24+
# ### Retry Backoff is Enforced
25+
it 'adds measurable delay when jitter is enabled' do
26+
skip 'failCommand fail point is not available' unless fail_command_available?
27+
28+
no_backoff_time = with_fixed_jitter(0) do
29+
with_commit_failures(13) do
30+
measure_with_transaction_time do |session|
31+
collection.insert_one({}, session: session)
32+
end
33+
end
34+
end
35+
36+
with_backoff_time = with_fixed_jitter(1) do
37+
with_commit_failures(13) do
38+
measure_with_transaction_time do |session|
39+
collection.insert_one({}, session: session)
40+
end
41+
end
42+
end
43+
44+
# Sum of 13 backoffs per spec is approximately 1.8 seconds.
45+
expect(with_backoff_time).to be_within(0.5).of(no_backoff_time + 1.8)
46+
end
47+
48+
private
49+
50+
def measure_with_transaction_time
51+
start_time = Mongo::Utils.monotonic_time
52+
client.start_session do |session|
53+
session.with_transaction do
54+
yield(session)
55+
end
56+
end
57+
Mongo::Utils.monotonic_time - start_time
58+
end
59+
60+
def with_fixed_jitter(value)
61+
allow(Random).to receive(:rand).and_return(value)
62+
yield
63+
end
64+
65+
def with_commit_failures(times)
66+
admin_client.command(
67+
configureFailPoint: 'failCommand',
68+
mode: { times: times },
69+
data: {
70+
failCommands: ['commitTransaction'],
71+
errorCode: 251,
72+
},
73+
)
74+
yield
75+
ensure
76+
disable_fail_command
77+
end
78+
79+
def disable_fail_command
80+
admin_client.command(configureFailPoint: 'failCommand', mode: 'off')
81+
rescue Mongo::Error
82+
# Ignore cleanup failures.
83+
end
84+
85+
def fail_command_available?
86+
admin_client.command(configureFailPoint: 'failCommand', mode: 'off')
87+
true
88+
rescue Mongo::Error
89+
false
90+
end
91+
end
92+
end

spec/mongo/session_transaction_spec.rb

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,5 +264,45 @@ class SessionTransactionSpecError < StandardError; end
264264
end
265265
end
266266
end
267+
268+
context 'backoff calculation' do
269+
require_topology :replica_set
270+
271+
it 'calculates exponential backoff correctly' do
272+
# Test backoff formula: jitter * min(BACKOFF_INITIAL * 1.5^(attempt-1), BACKOFF_MAX)
273+
backoff_initial = Mongo::Session::BACKOFF_INITIAL
274+
backoff_max = Mongo::Session::BACKOFF_MAX
275+
276+
# Test attempt 1: 1.5^0 = 1
277+
expected_attempt_1 = backoff_initial * (1.5 ** 0)
278+
expect(expected_attempt_1).to eq(0.005)
279+
280+
# Test attempt 2: 1.5^1 = 1.5
281+
expected_attempt_2 = backoff_initial * (1.5 ** 1)
282+
expect(expected_attempt_2).to eq(0.0075)
283+
284+
# Test attempt 3: 1.5^2 = 2.25
285+
expected_attempt_3 = backoff_initial * (1.5 ** 2)
286+
expect(expected_attempt_3).to eq(0.01125)
287+
288+
# Test cap at BACKOFF_MAX
289+
expected_attempt_large = [backoff_initial * (1.5 ** 20), backoff_max].min
290+
expect(expected_attempt_large).to eq(backoff_max)
291+
end
292+
293+
it 'applies jitter to backoff' do
294+
# Jitter should be a random value between 0 and 1
295+
# When multiplied with backoff, it should reduce the actual sleep time
296+
backoff = 0.100 # 100ms
297+
jitter_min = 0
298+
jitter_max = 1
299+
300+
actual_min = jitter_min * backoff
301+
actual_max = jitter_max * backoff
302+
303+
expect(actual_min).to eq(0)
304+
expect(actual_max).to eq(0.100)
305+
end
306+
end
267307
end
268308
end

spec/spec_tests/data/cmap/pool-create-min-size-error.yml

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,17 @@ version: 1
22
style: integration
33
description: error during minPoolSize population clears pool
44
runOn:
5-
-
6-
# required for appName in fail point
5+
- # required for appName in fail point
76
minServerVersion: "4.9.0"
87
# Remove the topology runOn requirement when cmap specs are adjusted for lbs
9-
- topology: [ "single", "replicaset", "sharded" ]
8+
- topology: ["single", "replicaset", "sharded"]
109
failPoint:
1110
configureFailPoint: failCommand
1211
# high amount to ensure not interfered with by monitor checks.
1312
mode: { times: 50 }
1413
data:
15-
failCommands: ["isMaster","hello"]
16-
closeConnection: true
14+
failCommands: ["isMaster", "hello"]
15+
errorCode: 91
1716
appName: "poolCreateMinSizeErrorTest"
1817
poolOptions:
1918
minPoolSize: 1
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
description: backpressure-network-error-fail-replicaset
2+
schemaVersion: "1.17"
3+
runOnRequirements:
4+
- minServerVersion: "4.4"
5+
serverless: forbid
6+
topologies:
7+
- replicaset
8+
createEntities:
9+
- client:
10+
id: setupClient
11+
useMultipleMongoses: false
12+
initialData:
13+
- collectionName: backpressure-network-error-fail
14+
databaseName: sdam-tests
15+
documents:
16+
- _id: 1
17+
- _id: 2
18+
tests:
19+
- description: apply backpressure on network connection errors during connection establishment
20+
operations:
21+
- name: createEntities
22+
object: testRunner
23+
arguments:
24+
entities:
25+
- client:
26+
id: client
27+
useMultipleMongoses: false
28+
observeEvents:
29+
- serverDescriptionChangedEvent
30+
- poolClearedEvent
31+
uriOptions:
32+
retryWrites: false
33+
heartbeatFrequencyMS: 1000000
34+
serverMonitoringMode: poll
35+
appname: backpressureNetworkErrorFailTest
36+
- database:
37+
id: database
38+
client: client
39+
databaseName: sdam-tests
40+
- collection:
41+
id: collection
42+
database: database
43+
collectionName: backpressure-network-error-fail
44+
- name: waitForEvent
45+
object: testRunner
46+
arguments:
47+
client: client
48+
event:
49+
serverDescriptionChangedEvent:
50+
newDescription:
51+
type: RSPrimary
52+
count: 1
53+
- name: failPoint
54+
object: testRunner
55+
arguments:
56+
client: setupClient
57+
failPoint:
58+
configureFailPoint: failCommand
59+
mode: alwaysOn
60+
data:
61+
failCommands:
62+
- isMaster
63+
- hello
64+
appName: backpressureNetworkErrorFailTest
65+
closeConnection: true
66+
- name: insertMany
67+
object: collection
68+
arguments:
69+
documents:
70+
- _id: 3
71+
- _id: 4
72+
expectError:
73+
isError: true
74+
errorLabelsContain:
75+
- SystemOverloadedError
76+
- RetryableError
77+
expectEvents:
78+
- client: client
79+
eventType: cmap
80+
events: []

0 commit comments

Comments
 (0)