Skip to content

Commit f8b7ed5

Browse files
RUBY-3786 Retry inside txns on overload errors (#2999)
1 parent 6391c2a commit f8b7ed5

File tree

10 files changed

+922
-10
lines changed

10 files changed

+922
-10
lines changed

lib/mongo/operation/context.rb

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,11 @@ def retry?
131131
!!@is_retry
132132
end
133133

134+
# Whether every retry so far has been due to overload only.
135+
def overload_only_retry?
136+
!!@overload_only_retry
137+
end
138+
134139
# Returns a new context with the parameters changed as per the
135140
# provided arguments.
136141
#

lib/mongo/retryable/read_worker.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ def modern_read_with_retry(session, server_selector, context, &block)
207207
result
208208
rescue *retryable_exceptions, Error::OperationFailure::Family, Auth::Unauthorized, Error::PoolError => e
209209
e.add_notes('modern retry', 'attempt 1')
210-
raise e if session.in_transaction?
210+
raise e if session.in_transaction? && !retryable_overload_error?(e)
211211

212212
if retryable_overload_error?(e)
213213
overload_read_retry(e, session, server_selector, context, server, error_count: 1, &block)

lib/mongo/retryable/write_worker.rb

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,7 @@ def legacy_write_with_retry(server = nil, context:)
257257
def modern_write_with_retry(session, server, context, &block)
258258
txn_num = nil
259259
connection_succeeded = false
260+
was_starting = false
260261

261262
result = server.with_connection(
262263
connection_global_id: context.connection_global_id,
@@ -266,6 +267,7 @@ def modern_write_with_retry(session, server, context, &block)
266267

267268
session.materialize_if_needed
268269
txn_num = session.in_transaction? ? session.txn_num : session.next_txn_num
270+
was_starting = session.starting_transaction?
269271

270272
# The context needs to be duplicated here because we will be using
271273
# it later for the retry as well.
@@ -288,7 +290,11 @@ def modern_write_with_retry(session, server, context, &block)
288290
retry_context = context.with(is_retry: true)
289291

290292
if is_overload
291-
overload_write_retry(e, session, txn_num, context: retry_context, failed_server: server, error_count: 1, &block)
293+
overload_write_retry(e, session, txn_num,
294+
context: retry_context.with(overload_only_retry: true),
295+
failed_server: server, error_count: 1,
296+
was_starting_transaction: was_starting,
297+
&block)
292298
else
293299
# Context#with creates a new context, which is not necessary here
294300
# but the API is less prone to misuse this way.
@@ -351,7 +357,7 @@ def retry_write(original_error, txn_num, context:, failed_server: nil, &block)
351357
rescue *retryable_exceptions, Error::PoolError => e
352358
if retryable_overload_error?(e)
353359
e.add_notes('modern retry', "attempt #{attempt}")
354-
return overload_write_retry(e, context.session, txn_num, context: context, failed_server: server, error_count: attempt, &block)
360+
return overload_write_retry(e, context.session, txn_num, context: context, failed_server: server, error_count: attempt, was_starting_transaction: false, &block)
355361
end
356362
maybe_fail_on_retryable(e, original_error, context, attempt)
357363
failed_server = server
@@ -360,7 +366,7 @@ def retry_write(original_error, txn_num, context:, failed_server: nil, &block)
360366
rescue Error::OperationFailure::Family => e
361367
if retryable_overload_error?(e)
362368
e.add_notes('modern retry', "attempt #{attempt}")
363-
return overload_write_retry(e, context.session, txn_num, context: context, failed_server: server, error_count: attempt, &block)
369+
return overload_write_retry(e, context.session, txn_num, context: context, failed_server: server, error_count: attempt, was_starting_transaction: false, &block)
364370
end
365371
maybe_fail_on_operation_failure(e, original_error, context, attempt)
366372
failed_server = server
@@ -375,7 +381,8 @@ def retry_write(original_error, txn_num, context:, failed_server: nil, &block)
375381
end
376382

377383
# Retry loop for overload write errors with exponential backoff.
378-
def overload_write_retry(last_error, session, txn_num, context:, failed_server:, error_count:, &block)
384+
def overload_write_retry(last_error, session, txn_num, context:, failed_server:, error_count:,
385+
was_starting_transaction: false, &block)
379386
loop do
380387
delay = retry_policy.backoff_delay(error_count)
381388
unless retry_policy.should_retry_overload?(error_count, delay, context: context)
@@ -401,6 +408,7 @@ def overload_write_retry(last_error, session, txn_num, context:, failed_server:,
401408
end
402409

403410
begin
411+
session.revert_to_starting_transaction! if was_starting_transaction
404412
context.check_timeout!
405413
result = server.with_connection(connection_global_id: context.connection_global_id) do |connection|
406414
yield connection, txn_num, context
@@ -423,6 +431,7 @@ def overload_write_retry(last_error, session, txn_num, context:, failed_server:,
423431
end
424432
end
425433
retry_policy.record_non_overload_retry_failure unless is_overload
434+
context = context.with(overload_only_retry: false) unless is_overload
426435
failed_server = server
427436
last_error = e
428437
rescue Error, Error::AuthError => e

lib/mongo/session.rb

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -750,7 +750,7 @@ def commit_transaction(options=nil)
750750
write_with_retry(write_concern, ending_transaction: true,
751751
context: context,
752752
) do |connection, txn_num, context|
753-
if context.retry?
753+
if context.retry? && !context.overload_only_retry?
754754
if write_concern
755755
wco = write_concern.options.merge(w: :majority)
756756
wco[:wtimeout] ||= 10000
@@ -1126,6 +1126,16 @@ def validate_read_preference!(command)
11261126
end
11271127
end
11281128

1129+
# Reverts the session state to STARTING_TRANSACTION_STATE.
1130+
# Called before retrying the first command in a transaction so that
1131+
# startTransaction: true is preserved on the retry.
1132+
# @api private
1133+
def revert_to_starting_transaction!
1134+
if within_states?(TRANSACTION_IN_PROGRESS_STATE)
1135+
@state = STARTING_TRANSACTION_STATE
1136+
end
1137+
end
1138+
11291139
# Update the state of the session due to a (non-commit and non-abort) operation being run.
11301140
#
11311141
# @since 2.6.0

spec/mongo/retryable/write_worker_overload_spec.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
instance_double(Mongo::Session).tap do |s|
2626
allow(s).to receive(:retry_writes?).and_return(true)
2727
allow(s).to receive(:in_transaction?).and_return(false)
28+
allow(s).to receive(:starting_transaction?).and_return(false)
2829
allow(s).to receive(:materialize_if_needed)
2930
allow(s).to receive(:txn_num).and_return(1)
3031
allow(s).to receive(:next_txn_num).and_return(2)

spec/runners/unified/support_operations.rb

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,13 @@ def fail_point(op)
3434
client = entities.get(:client, args.use!('client'))
3535
client.command(fp = args.use('failPoint'))
3636

37+
# Use the client's actual primary address rather than the cached
38+
# ClusterConfig value, which can become stale after a replSetStepDown.
39+
primary_server = client.cluster.servers.find(&:primary?)
40+
address = primary_server&.address || ClusterConfig.instance.primary_address
41+
3742
$disable_fail_points ||= []
38-
$disable_fail_points << [
39-
fp,
40-
ClusterConfig.instance.primary_address,
41-
]
43+
$disable_fail_points << [fp, address]
4244
end
4345
end
4446

Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
description: backpressure-retryable-abort
2+
schemaVersion: "1.3"
3+
runOnRequirements:
4+
- minServerVersion: "4.4"
5+
topologies:
6+
- replicaset
7+
- sharded
8+
- load-balanced
9+
createEntities:
10+
-
11+
client:
12+
id: &client0 client0
13+
useMultipleMongoses: false
14+
observeEvents:
15+
- commandStartedEvent
16+
-
17+
database:
18+
id: &database0 database0
19+
client: *client0
20+
databaseName: &database_name transaction-tests
21+
-
22+
collection:
23+
id: &collection0 collection0
24+
database: *database0
25+
collectionName: &collection_name test
26+
-
27+
session:
28+
id: &session0 session0
29+
client: *client0
30+
31+
initialData:
32+
-
33+
collectionName: *collection_name
34+
databaseName: *database_name
35+
documents: []
36+
tests:
37+
- description: abortTransaction retries if backpressure labels are added
38+
operations:
39+
- object: testRunner
40+
name: failPoint
41+
arguments:
42+
client: *client0
43+
failPoint:
44+
configureFailPoint: failCommand
45+
mode:
46+
times: 2
47+
data:
48+
failCommands:
49+
- abortTransaction
50+
errorLabels:
51+
- RetryableError
52+
- SystemOverloadedError
53+
errorCode: 112
54+
- object: *session0
55+
name: startTransaction
56+
- object: *collection0
57+
name: insertOne
58+
arguments:
59+
session: *session0
60+
document:
61+
_id: 1
62+
expectResult:
63+
$$unsetOrMatches:
64+
insertedId:
65+
$$unsetOrMatches: 1
66+
- object: *session0
67+
name: abortTransaction
68+
expectEvents:
69+
- client: *client0
70+
events:
71+
- commandStartedEvent:
72+
command:
73+
insert: test
74+
documents:
75+
- _id: 1
76+
ordered: true
77+
readConcern:
78+
$$exists: false
79+
lsid:
80+
$$sessionLsid: *session0
81+
txnNumber:
82+
$numberLong: "1"
83+
startTransaction: true
84+
autocommit: false
85+
writeConcern:
86+
$$exists: false
87+
commandName: insert
88+
databaseName: *database_name
89+
- commandStartedEvent:
90+
command:
91+
abortTransaction: 1
92+
lsid:
93+
$$sessionLsid: *session0
94+
txnNumber:
95+
$numberLong: "1"
96+
startTransaction:
97+
$$exists: false
98+
autocommit: false
99+
writeConcern:
100+
$$exists: false
101+
commandName: abortTransaction
102+
databaseName: admin
103+
- commandStartedEvent:
104+
command:
105+
abortTransaction: 1
106+
lsid:
107+
$$sessionLsid: *session0
108+
txnNumber:
109+
$numberLong: "1"
110+
startTransaction:
111+
$$exists: false
112+
autocommit: false
113+
writeConcern:
114+
$$exists: false
115+
commandName: abortTransaction
116+
databaseName: admin
117+
- commandStartedEvent:
118+
command:
119+
abortTransaction: 1
120+
lsid:
121+
$$sessionLsid: *session0
122+
txnNumber:
123+
$numberLong: "1"
124+
startTransaction:
125+
$$exists: false
126+
autocommit: false
127+
writeConcern:
128+
$$exists: false
129+
commandName: abortTransaction
130+
databaseName: admin
131+
outcome:
132+
- collectionName: *collection_name
133+
databaseName: *database_name
134+
documents: []
135+
- description: abortTransaction is retried maxAttempts=5 times if backpressure labels are added
136+
operations:
137+
- object: testRunner
138+
name: failPoint
139+
arguments:
140+
client: *client0
141+
failPoint:
142+
configureFailPoint: failCommand
143+
mode: alwaysOn
144+
data:
145+
failCommands:
146+
- abortTransaction
147+
errorLabels:
148+
- RetryableError
149+
- SystemOverloadedError
150+
errorCode: 112
151+
- object: *session0
152+
name: startTransaction
153+
- object: *collection0
154+
name: insertOne
155+
arguments:
156+
session: *session0
157+
document:
158+
_id: 1
159+
expectResult:
160+
$$unsetOrMatches:
161+
insertedId:
162+
$$unsetOrMatches: 1
163+
- object: *session0
164+
name: abortTransaction
165+
expectEvents:
166+
- client: *client0
167+
events:
168+
- commandStartedEvent:
169+
command:
170+
insert: test
171+
documents:
172+
- _id: 1
173+
ordered: true
174+
readConcern:
175+
$$exists: false
176+
lsid:
177+
$$sessionLsid: *session0
178+
txnNumber:
179+
$numberLong: "1"
180+
startTransaction: true
181+
autocommit: false
182+
writeConcern:
183+
$$exists: false
184+
commandName: insert
185+
databaseName: *database_name
186+
- commandStartedEvent:
187+
command:
188+
abortTransaction: 1
189+
lsid:
190+
$$sessionLsid: *session0
191+
txnNumber:
192+
$numberLong: "1"
193+
startTransaction:
194+
$$exists: false
195+
autocommit: false
196+
writeConcern:
197+
$$exists: false
198+
commandName: abortTransaction
199+
databaseName: admin
200+
- commandStartedEvent:
201+
commandName: abortTransaction
202+
- commandStartedEvent:
203+
commandName: abortTransaction
204+
- commandStartedEvent:
205+
commandName: abortTransaction
206+
- commandStartedEvent:
207+
commandName: abortTransaction
208+
- commandStartedEvent:
209+
commandName: abortTransaction
210+
outcome:
211+
- collectionName: *collection_name
212+
databaseName: *database_name
213+
documents: []

0 commit comments

Comments
 (0)