Skip to content

Commit cc37229

Browse files
RUBY-3700 Do not clear pool on retryable errors (#2990)
1 parent 7e7e15d commit cc37229

10 files changed

+429
-19
lines changed

lib/mongo/server.rb

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -507,11 +507,13 @@ def with_connection(connection_global_id: nil, context: nil, &block)
507507
def handle_handshake_failure!
508508
yield
509509
rescue Mongo::Error::SocketError, Mongo::Error::SocketTimeoutError => e
510-
unknown!(
511-
generation: e.generation,
512-
service_id: e.service_id,
513-
stop_push_monitor: true,
514-
)
510+
unless e.label?('SystemOverloadedError')
511+
unknown!(
512+
generation: e.generation,
513+
service_id: e.service_id,
514+
stop_push_monitor: true,
515+
)
516+
end
515517
raise
516518
end
517519

lib/mongo/server/connection_pool.rb

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -995,17 +995,26 @@ def maybe_raise_pool_cleared!(connection, e)
995995
def connect_connection(connection, context = nil)
996996
begin
997997
connection.connect!(context)
998-
rescue Exception
998+
rescue Exception => exc
999+
# When a connection encounters an error during creation
1000+
# (including handshake and authentication), mark the server
1001+
# as Unknown so the pool is cleared (per CMAP spec).
1002+
# The unknown! call must happen before disconnect! so that
1003+
# the ConnectionPoolCleared event is published before the
1004+
# ConnectionClosed event.
1005+
# Errors with the SystemOverloadedError label (network errors
1006+
# during handshake) are excluded per the SDAM spec — those
1007+
# must not change the server description.
1008+
if exc.is_a?(Mongo::Error) && !exc.label?('SystemOverloadedError')
1009+
@server.unknown!(
1010+
generation: exc.generation,
1011+
service_id: exc.service_id,
1012+
stop_push_monitor: true,
1013+
)
1014+
end
9991015
connection.disconnect!(reason: :error)
10001016
raise
10011017
end
1002-
rescue Error::SocketError, Error::SocketTimeoutError => exc
1003-
@server.unknown!(
1004-
generation: exc.generation,
1005-
service_id: exc.service_id,
1006-
stop_push_monitor: true,
1007-
)
1008-
raise
10091018
end
10101019

10111020
def check_invariants

lib/mongo/server/pending_connection.rb

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,19 @@ def handshake!(speculative_auth_doc: nil)
160160
log_prefix: options[:log_prefix],
161161
bg_error_backtrace: options[:bg_error_backtrace],
162162
)
163+
if exc.is_a?(::Mongo::Error)
164+
# The SystemOverloadedError label marks errors as back-pressure
165+
# signals that should not cause the server to be marked Unknown.
166+
# Per the SDAM spec, this label only applies to network errors
167+
# during the handshake. Command errors (e.g. OperationFailure
168+
# with "node is shutting down" codes) must still flow through
169+
# normal SDAM error handling so the server is marked Unknown
170+
# and the pool is cleared.
171+
if exc.is_a?(Error::SocketError) || exc.is_a?(Error::SocketTimeoutError)
172+
exc.add_label('SystemOverloadedError')
173+
exc.add_label('RetryableError')
174+
end
175+
end
163176
raise
164177
end
165178
end

spec/mongo/server/connection_spec.rb

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,6 @@ class ConnectionSpecTestException < Exception; end
257257
end
258258

259259
it_behaves_like 'failing connection with server diagnostics'
260-
it_behaves_like 'marks server unknown'
261260
it_behaves_like 'logs a warning'
262261
it_behaves_like 'adds server diagnostics'
263262

spec/spec_tests/data/cmap/pool-create-min-size-error.yml

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,17 @@ version: 1
22
style: integration
33
description: error during minPoolSize population clears pool
44
runOn:
5-
-
6-
# required for appName in fail point
5+
- # required for appName in fail point
76
minServerVersion: "4.9.0"
87
# Remove the topology runOn requirement when cmap specs are adjusted for lbs
9-
- topology: [ "single", "replicaset", "sharded" ]
8+
- topology: ["single", "replicaset", "sharded"]
109
failPoint:
1110
configureFailPoint: failCommand
1211
# high amount to ensure not interfered with by monitor checks.
1312
mode: { times: 50 }
1413
data:
15-
failCommands: ["isMaster","hello"]
16-
closeConnection: true
14+
failCommands: ["isMaster", "hello"]
15+
errorCode: 91
1716
appName: "poolCreateMinSizeErrorTest"
1817
poolOptions:
1918
minPoolSize: 1
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
description: backpressure-network-error-fail-replicaset
2+
schemaVersion: "1.17"
3+
runOnRequirements:
4+
- minServerVersion: "4.4"
5+
serverless: forbid
6+
topologies:
7+
- replicaset
8+
createEntities:
9+
- client:
10+
id: setupClient
11+
useMultipleMongoses: false
12+
initialData:
13+
- collectionName: backpressure-network-error-fail
14+
databaseName: sdam-tests
15+
documents:
16+
- _id: 1
17+
- _id: 2
18+
tests:
19+
- description: apply backpressure on network connection errors during connection establishment
20+
operations:
21+
- name: createEntities
22+
object: testRunner
23+
arguments:
24+
entities:
25+
- client:
26+
id: client
27+
useMultipleMongoses: false
28+
observeEvents:
29+
- serverDescriptionChangedEvent
30+
- poolClearedEvent
31+
uriOptions:
32+
retryWrites: false
33+
heartbeatFrequencyMS: 1000000
34+
serverMonitoringMode: poll
35+
appname: backpressureNetworkErrorFailTest
36+
- database:
37+
id: database
38+
client: client
39+
databaseName: sdam-tests
40+
- collection:
41+
id: collection
42+
database: database
43+
collectionName: backpressure-network-error-fail
44+
- name: waitForEvent
45+
object: testRunner
46+
arguments:
47+
client: client
48+
event:
49+
serverDescriptionChangedEvent:
50+
newDescription:
51+
type: RSPrimary
52+
count: 1
53+
- name: failPoint
54+
object: testRunner
55+
arguments:
56+
client: setupClient
57+
failPoint:
58+
configureFailPoint: failCommand
59+
mode: alwaysOn
60+
data:
61+
failCommands:
62+
- isMaster
63+
- hello
64+
appName: backpressureNetworkErrorFailTest
65+
closeConnection: true
66+
- name: insertMany
67+
object: collection
68+
arguments:
69+
documents:
70+
- _id: 3
71+
- _id: 4
72+
expectError:
73+
isError: true
74+
errorLabelsContain:
75+
- SystemOverloadedError
76+
- RetryableError
77+
expectEvents:
78+
- client: client
79+
eventType: cmap
80+
events: []
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
description: backpressure-network-error-fail-single
2+
schemaVersion: "1.17"
3+
runOnRequirements:
4+
- minServerVersion: "4.4"
5+
serverless: forbid
6+
topologies:
7+
- single
8+
createEntities:
9+
- client:
10+
id: setupClient
11+
useMultipleMongoses: false
12+
initialData:
13+
- collectionName: backpressure-network-error-fail
14+
databaseName: sdam-tests
15+
documents:
16+
- _id: 1
17+
- _id: 2
18+
tests:
19+
- description: apply backpressure on network connection errors during connection establishment
20+
operations:
21+
- name: createEntities
22+
object: testRunner
23+
arguments:
24+
entities:
25+
- client:
26+
id: client
27+
useMultipleMongoses: false
28+
observeEvents:
29+
- serverDescriptionChangedEvent
30+
- poolClearedEvent
31+
uriOptions:
32+
retryWrites: false
33+
heartbeatFrequencyMS: 1000000
34+
serverMonitoringMode: poll
35+
appname: backpressureNetworkErrorFailTest
36+
- database:
37+
id: database
38+
client: client
39+
databaseName: sdam-tests
40+
- collection:
41+
id: collection
42+
database: database
43+
collectionName: backpressure-network-error-fail
44+
- name: waitForEvent
45+
object: testRunner
46+
arguments:
47+
client: client
48+
event:
49+
serverDescriptionChangedEvent:
50+
newDescription:
51+
type: Standalone
52+
count: 1
53+
- name: failPoint
54+
object: testRunner
55+
arguments:
56+
client: setupClient
57+
failPoint:
58+
configureFailPoint: failCommand
59+
mode: alwaysOn
60+
data:
61+
failCommands:
62+
- isMaster
63+
- hello
64+
appName: backpressureNetworkErrorFailTest
65+
closeConnection: true
66+
- name: insertMany
67+
object: collection
68+
arguments:
69+
documents:
70+
- _id: 3
71+
- _id: 4
72+
expectError:
73+
isError: true
74+
errorLabelsContain:
75+
- SystemOverloadedError
76+
- RetryableError
77+
expectEvents:
78+
- client: client
79+
eventType: cmap
80+
events: []
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
description: backpressure-network-timeout-error-replicaset
2+
schemaVersion: "1.17"
3+
runOnRequirements:
4+
- minServerVersion: "4.4"
5+
serverless: forbid
6+
topologies:
7+
- replicaset
8+
createEntities:
9+
- client:
10+
id: setupClient
11+
useMultipleMongoses: false
12+
initialData:
13+
- collectionName: backpressure-network-timeout-error
14+
databaseName: sdam-tests
15+
documents:
16+
- _id: 1
17+
- _id: 2
18+
tests:
19+
- description: apply backpressure on network timeout error during connection establishment
20+
operations:
21+
- name: createEntities
22+
object: testRunner
23+
arguments:
24+
entities:
25+
- client:
26+
id: client
27+
useMultipleMongoses: false
28+
observeEvents:
29+
- serverDescriptionChangedEvent
30+
- poolClearedEvent
31+
uriOptions:
32+
retryWrites: false
33+
heartbeatFrequencyMS: 1000000
34+
appname: backpressureNetworkTimeoutErrorTest
35+
serverMonitoringMode: poll
36+
connectTimeoutMS: 250
37+
socketTimeoutMS: 250
38+
- database:
39+
id: database
40+
client: client
41+
databaseName: sdam-tests
42+
- collection:
43+
id: collection
44+
database: database
45+
collectionName: backpressure-network-timeout-error
46+
- name: waitForEvent
47+
object: testRunner
48+
arguments:
49+
client: client
50+
event:
51+
serverDescriptionChangedEvent:
52+
newDescription:
53+
type: RSPrimary
54+
count: 1
55+
- name: failPoint
56+
object: testRunner
57+
arguments:
58+
client: setupClient
59+
failPoint:
60+
configureFailPoint: failCommand
61+
mode: alwaysOn
62+
data:
63+
failCommands:
64+
- isMaster
65+
- hello
66+
blockConnection: true
67+
blockTimeMS: 500
68+
appName: backpressureNetworkTimeoutErrorTest
69+
- name: insertMany
70+
object: collection
71+
arguments:
72+
documents:
73+
- _id: 3
74+
- _id: 4
75+
expectError:
76+
isError: true
77+
errorLabelsContain:
78+
- SystemOverloadedError
79+
- RetryableError
80+
expectEvents:
81+
- client: client
82+
eventType: cmap
83+
events: []

0 commit comments

Comments
 (0)