Skip to content

Commit 3083438

Browse files
authored
HDDS-15569. Speed up TestClientRetryTimeout by tightening configurations (#10527)
1 parent 0e1b03c commit 3083438

2 files changed

Lines changed: 145 additions & 69 deletions

File tree

hadoop-hdds/common/src/test/java/org/apache/hadoop/hdds/ratis/conf/TestRatisClientConfig.java

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
package org.apache.hadoop.hdds.ratis.conf;
1919

20+
import static org.assertj.core.api.Assertions.assertThat;
2021
import static org.junit.jupiter.api.Assertions.assertEquals;
2122

2223
import java.time.Duration;
@@ -69,4 +70,73 @@ void setAndGet() {
6970
assertEquals(maxRetry, subject.getExponentialPolicyMaxRetries());
7071
}
7172

73+
/**
74+
* Regression guard for HDDS-15444: the production defaults must keep the
75+
* worst-case wall-clock of a single Ratis-client retry cycle bounded.
76+
* Per cycle = write-rpc + max-retries × (backoff-sleep + write-rpc) +
77+
* watch-rpc. With the post-HDDS-15444 defaults this is ~213 s; we assert
78+
* it stays under 4 minutes so a future revert of any one knob is caught
79+
* in a unit test rather than in a multi-minute integration test.
80+
*/
81+
@Test
82+
void defaultsBoundSingleCycleWallClock() {
83+
RatisClientConfig subject = new OzoneConfiguration()
84+
.getObject(RatisClientConfig.class);
85+
RatisClientConfig.RaftConfig raftSubject = new OzoneConfiguration()
86+
.getObject(RatisClientConfig.RaftConfig.class);
87+
88+
Duration writeRpc = raftSubject.getRpcRequestTimeout();
89+
Duration watchRpc = raftSubject.getRpcWatchRequestTimeout();
90+
int maxRetries = subject.getExponentialPolicyMaxRetries();
91+
Duration maxBackoff = subject.getExponentialPolicyMaxSleep();
92+
93+
Duration perCycle = writeRpc
94+
.plus(maxBackoff.plus(writeRpc).multipliedBy(maxRetries))
95+
.plus(watchRpc);
96+
97+
assertThat(perCycle)
98+
.as("Single Ratis-client retry cycle worst-case wall-clock with "
99+
+ "production defaults (writeRpc=%s, watchRpc=%s, maxRetries=%d, "
100+
+ "maxBackoff=%s) must stay bounded; a regression here means "
101+
+ "client writes against a dead pipeline can hang for minutes.",
102+
writeRpc, watchRpc, maxRetries, maxBackoff)
103+
.isLessThan(Duration.ofMinutes(4));
104+
}
105+
106+
/**
107+
* Regression guard for HDDS-15444: the bounded exponential backoff is
108+
* what stops the Ratis client from retrying indefinitely. If this is
109+
* ever set back to {@code Integer.MAX_VALUE} (the pre-HDDS-15444
110+
* behaviour) write failures revert to multi-minute hangs.
111+
*/
112+
@Test
113+
void defaultsCapExponentialMaxRetries() {
114+
RatisClientConfig subject = new OzoneConfiguration()
115+
.getObject(RatisClientConfig.class);
116+
117+
assertThat(subject.getExponentialPolicyMaxRetries())
118+
.as("hdds.ratis.client.exponential.backoff.max.retries must remain "
119+
+ "bounded; unbounded retries reintroduce the HDDS-15444 hang.")
120+
.isPositive()
121+
.isLessThanOrEqualTo(5);
122+
}
123+
124+
/**
125+
* Regression guard for HDDS-15444: the client-side watch RPC timeout
126+
* must align with the server-side watch timeout (30 s by default).
127+
* If the client waits longer than the server is willing to honour, the
128+
* client hangs past the server-side abort.
129+
*/
130+
@Test
131+
void defaultsAlignWatchTimeoutWithServer() {
132+
RatisClientConfig.RaftConfig raftSubject = new OzoneConfiguration()
133+
.getObject(RatisClientConfig.RaftConfig.class);
134+
135+
assertThat(raftSubject.getRpcWatchRequestTimeout())
136+
.as("hdds.ratis.raft.client.rpc.watch.request.timeout should be "
137+
+ "close to the server-side watch timeout (30 s); a much larger "
138+
+ "value lets the client hang past the server's abort.")
139+
.isLessThanOrEqualTo(Duration.ofSeconds(60));
140+
}
141+
72142
}

hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestClientRetryTimeout.java

Lines changed: 75 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
package org.apache.hadoop.ozone.client.rpc;
1919

2020
import static java.nio.charset.StandardCharsets.UTF_8;
21+
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_HEARTBEAT_INTERVAL;
22+
import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_DEADNODE_INTERVAL;
2123
import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL;
2224
import static org.assertj.core.api.Assertions.assertThat;
2325
import static org.junit.jupiter.api.Assertions.assertInstanceOf;
@@ -35,6 +37,7 @@
3537
import org.apache.hadoop.hdds.conf.StorageUnit;
3638
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
3739
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
40+
import org.apache.hadoop.hdds.ratis.conf.RatisClientConfig;
3841
import org.apache.hadoop.hdds.scm.OzoneClientConfig;
3942
import org.apache.hadoop.hdds.scm.ScmConfigKeys;
4043
import org.apache.hadoop.hdds.scm.XceiverClientRatis;
@@ -63,9 +66,20 @@
6366
import org.slf4j.LoggerFactory;
6467

6568
/**
66-
* Verifies that client write operations
67-
* fail within acceptable time bounds when pipelines/datanodes are down.
68-
*
69+
* Verifies that client write operations fail within acceptable time bounds
70+
* when pipelines/datanodes are down.
71+
* <p>
72+
* This class covers the <em>plumbing</em>: that the layered retry path
73+
* (Ratis-client retries × {@code ozone.client.max.retries}) terminates in
74+
* bounded time when the pipeline is unusable. To keep the suite under the
75+
* per-test wall-clock budget the cluster is started with compressed retry
76+
* and timeout values; the assertions are scaled accordingly. A regression
77+
* that re-introduces unbounded retries here will still trip the bound.
78+
* <p>
79+
* The companion regression check on the <em>production defaults</em> lives
80+
* in {@code TestRatisClientConfig} (hdds-common). That unit test is what
81+
* catches a future revert of any of the HDDS-15444 default values without
82+
* needing a mini-cluster.
6983
*/
7084
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
7185
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
@@ -83,37 +97,21 @@ public class TestClientRetryTimeout {
8397
/**
8498
* Maximum acceptable duration for a SINGLE retry cycle (write + watch)
8599
* when the pipeline is completely dead.
86-
* <p>
87-
* With the proposed config fixes:
88-
* Write: RPC(60s) + retry1(1s+60s) + retry2(2s+60s) ≈ 183s
89-
* Watch: RPC(30s)
90-
* Total single cycle: ~213s
91-
* <p>
92-
* We use 4 minutes (240s) as the upper bound with some margin.
93-
* The OLD defaults would take ~8 minutes per cycle.
94100
*/
95101
private static final Duration MAX_SINGLE_CYCLE_DURATION =
96-
Duration.ofMinutes(4);
102+
Duration.ofSeconds(90);
97103

98104
/**
99105
* Maximum acceptable duration for the watch-for-commit operation alone.
100-
* <p>
101-
* With the proposed fix: watch RPC timeout is 30s, but the write timeout
102-
* (70s) may also be hit if majority is lost. We use 120s to accommodate
103-
* write(70s) + watch(30s) + overhead.
104-
* The OLD default would take 180s+ (3 minutes watch alone).
105106
*/
106-
private static final Duration MAX_WATCH_DURATION = Duration.ofSeconds(120);
107+
private static final Duration MAX_WATCH_DURATION = Duration.ofSeconds(75);
107108

108109
/**
109110
* Maximum acceptable duration for the end-to-end write failure with
110-
* Ozone-level retries (ozone.client.max.retries = 5).
111-
* <p>
112-
* With proposed fixes: 5 × ~93s ≈ 465s ≈ 8 min.
113-
* The OLD defaults would take ~40 minutes.
111+
* Ozone-level retries.
114112
*/
115113
private static final Duration MAX_TOTAL_WRITE_DURATION =
116-
Duration.ofMinutes(10);
114+
Duration.ofSeconds(180);
117115

118116
private MiniOzoneCluster cluster;
119117
private OzoneClient client;
@@ -135,15 +133,43 @@ public void init() throws Exception {
135133

136134
OzoneClientConfig clientConfig = conf.getObject(OzoneClientConfig.class);
137135
clientConfig.setStreamBufferFlushDelay(false);
136+
// Cap ozone-level retries at 1 (i.e. 2 attempts total): enough to
137+
// exercise the compound retry multiplier without bloating wall-clock.
138+
clientConfig.setMaxRetryCount(1);
138139
conf.setFromObject(clientConfig);
139140

141+
// Compress Ratis client retry/timeout knobs for tests only. The test
142+
// verifies that retry plumbing terminates in bounded time; the ratio
143+
// holds at any scale, so we run with smaller absolute values. Values
144+
// are kept conservative enough that a healthy round-trip on a loaded
145+
// CI runner (multi-second GC pauses, JVM stalls) doesn't spuriously
146+
// trip a per-RPC timeout.
147+
RatisClientConfig ratisClient = conf.getObject(RatisClientConfig.class);
148+
ratisClient.setWriteRequestTimeout(Duration.ofSeconds(15));
149+
ratisClient.setWatchRequestTimeout(Duration.ofSeconds(10));
150+
ratisClient.setExponentialPolicyBaseSleep(Duration.ofMillis(500));
151+
ratisClient.setExponentialPolicyMaxSleep(Duration.ofSeconds(1));
152+
ratisClient.setExponentialPolicyMaxRetries(1);
153+
conf.setFromObject(ratisClient);
154+
155+
RatisClientConfig.RaftConfig raftClient =
156+
conf.getObject(RatisClientConfig.RaftConfig.class);
157+
raftClient.setRpcRequestTimeout(Duration.ofSeconds(10));
158+
raftClient.setRpcWatchRequestTimeout(Duration.ofSeconds(10));
159+
conf.setFromObject(raftClient);
160+
140161
// Fast leader election so new leader can be chosen quickly
141162
conf.setTimeDuration(
142163
OzoneConfigKeys.HDDS_RATIS_LEADER_ELECTION_MINIMUM_TIMEOUT_DURATION_KEY,
143164
1, TimeUnit.SECONDS);
144165

145-
// Prevent SCM from removing dead datanodes during the test
146-
conf.setTimeDuration(OZONE_SCM_STALENODE_INTERVAL, 300, TimeUnit.SECONDS);
166+
// Fast heartbeats so SCM detects restarted DNs quickly. Stale=60s and
167+
// dead=300s are sized so that SCM does not transition killed DNs to
168+
// DEAD inside a single test (which would change the failure mode from
169+
// "RPC timeout" to "pipeline removed"); STALE during a test is fine.
170+
conf.setTimeDuration(HDDS_HEARTBEAT_INTERVAL, 1, TimeUnit.SECONDS);
171+
conf.setTimeDuration(OZONE_SCM_STALENODE_INTERVAL, 60, TimeUnit.SECONDS);
172+
conf.setTimeDuration(OZONE_SCM_DEADNODE_INTERVAL, 300, TimeUnit.SECONDS);
147173

148174
// Allow multiple pipelines per datanode to accommodate all tests
149175
conf.setInt(ScmConfigKeys.OZONE_DATANODE_PIPELINE_LIMIT, 5);
@@ -175,15 +201,9 @@ public void shutdown() {
175201
/**
176202
* Test 1: Write to a pipeline where ALL datanodes are dead.
177203
* <p>
178-
* Demonstrates the problem:
179-
* - Client sends WriteChunk to dead leader
180-
* - TimeoutIOException triggers exponential backoff
181-
* - OLD config: retries with unlimited max retries for up to 5 minutes
182-
* - FIXED config: retries at most 2 times, fails in ~63s
183-
* <p>
184-
* This test asserts that the write failure is detected within
185-
* MAX_SINGLE_CYCLE_DURATION (4 minutes), which would fail with the old
186-
* 5-minute write timeout + 3-minute watch timeout = 8 minutes.
204+
* Verifies that when WriteChunk is sent to a dead leader, exponential
205+
* backoff terminates after the configured retry count and surfaces the
206+
* failure within {@link #MAX_SINGLE_CYCLE_DURATION}.
187207
*/
188208
@Test
189209
@Order(1)
@@ -239,24 +259,23 @@ public void testWriteToDeadPipelineFailsFast() throws Exception {
239259
MAX_SINGLE_CYCLE_DURATION, elapsed)
240260
.isLessThan(MAX_SINGLE_CYCLE_DURATION);
241261

242-
// Restart the datanodes for subsequent tests
262+
// Restart the datanodes and wait for SCM to have a writable pipeline
263+
// before the next test runs. We don't need full cluster recovery
264+
// (all 7 DNs HEALTHY), only one OPEN factor-THREE pipeline.
243265
for (DatanodeDetails dn : nodes) {
244266
cluster.restartHddsDatanode(dn, false);
245267
}
246-
cluster.waitForClusterToBeReady();
268+
cluster.waitForPipelineTobeReady(HddsProtos.ReplicationFactor.THREE,
269+
60000);
247270
}
248271

249272
/**
250273
* Test 2: Watch-for-commit when follower datanodes are dead.
251274
* <p>
252-
* Demonstrates the problem:
253-
* - Client writes data, leader commits but followers are dead
254-
* - Watch-ALL_COMMITTED is issued
255-
* - OLD config: watch RPC timeout is 180s, client waits 3 minutes
256-
* - FIXED config: watch RPC timeout is 30s, matches server timeout
257-
* <p>
258-
* This test asserts that the watch failure is detected within
259-
* MAX_WATCH_DURATION (60s), which would fail with the old 180s timeout.
275+
* Verifies that {@code watch(ALL_COMMITTED)} terminates within
276+
* {@link #MAX_WATCH_DURATION} when followers cannot ack — the client
277+
* watch RPC timeout must align with the server-side watch timeout
278+
* rather than running well past it.
260279
*/
261280
@Test
262281
@Order(2)
@@ -328,21 +347,16 @@ public void testWatchForCommitWithDeadFollowersFailsFast() throws Exception {
328347
} catch (Exception e) {
329348
// May already be running
330349
}
331-
cluster.waitForClusterToBeReady();
350+
cluster.waitForPipelineTobeReady(HddsProtos.ReplicationFactor.THREE,
351+
60000);
332352
}
333353

334354
/**
335355
* Test 3: Write failure when the Raft leader is specifically killed.
336356
* <p>
337-
* Demonstrates the problem:
338-
* - Client is writing to a pipeline
339-
* - The Raft leader is killed mid-write
340-
* - RaftClient still holds stale connection to dead leader
341-
* - OLD config: exponential backoff retries indefinitely for 5 min
342-
* - FIXED config: limited to 2 retries, fails in ~63s
343-
* <p>
344-
* This test asserts the entire operation (write + close) completes
345-
* within MAX_SINGLE_CYCLE_DURATION.
357+
* Verifies that with the leader killed mid-write, the RaftClient does
358+
* not retry forever against the stale leader; the write either recovers
359+
* via re-election or fails within {@link #MAX_SINGLE_CYCLE_DURATION}.
346360
*/
347361
@Test
348362
@Order(3)
@@ -404,21 +418,19 @@ public void testWriteWithLeaderFailureFailsFast() throws Exception {
404418

405419
// Restart the leader
406420
cluster.restartHddsDatanode(leader.getDatanodeDetails(), false);
407-
cluster.waitForClusterToBeReady();
421+
cluster.waitForPipelineTobeReady(HddsProtos.ReplicationFactor.THREE,
422+
60000);
408423
}
409424

410425
/**
411426
* Test 4: End-to-end write with ALL datanodes killed, verifying the
412427
* total time including Ozone-level retries.
413428
* <p>
414-
* Demonstrates the compound problem:
415-
* - Ozone retries up to 5 times (ozone.client.max.retries)
416-
* - Each retry allocates a new pipeline, but ALL datanodes are dead
417-
* - OLD config: 5 × (5min write + 3min watch) = 40 minutes
418-
* - FIXED config: 5 × (~63s write + 30s watch) ≈ 7.7 minutes
419-
* <p>
420-
* This test asserts the TOTAL time is within MAX_TOTAL_WRITE_DURATION
421-
* (10 minutes), which would fail with the old 40-minute worst case.
429+
* Verifies that the compound retry path
430+
* ({@code ozone.client.max.retries} × per-cycle Ratis retries) does not
431+
* multiply into an unbounded wait. This is the test that catches the
432+
* interaction between the two retry layers; tests 1–3 only exercise
433+
* each layer in isolation.
422434
*/
423435
@Test
424436
@Order(4)
@@ -464,12 +476,6 @@ public void testEndToEndWriteWithAllDatanodesDownFailsFast()
464476
+ "is causing the client to hang.",
465477
MAX_TOTAL_WRITE_DURATION, elapsed)
466478
.isLessThan(MAX_TOTAL_WRITE_DURATION);
467-
468-
// Restart all datanodes
469-
for (HddsDatanodeService dn : allDatanodes) {
470-
cluster.restartHddsDatanode(dn.getDatanodeDetails(), false);
471-
}
472-
cluster.waitForClusterToBeReady();
473479
}
474480

475481
private String getKeyName() {

0 commit comments

Comments
 (0)