1818package org .apache .hadoop .ozone .client .rpc ;
1919
2020import static java .nio .charset .StandardCharsets .UTF_8 ;
21+ import static org .apache .hadoop .hdds .HddsConfigKeys .HDDS_HEARTBEAT_INTERVAL ;
22+ import static org .apache .hadoop .hdds .scm .ScmConfigKeys .OZONE_SCM_DEADNODE_INTERVAL ;
2123import static org .apache .hadoop .hdds .scm .ScmConfigKeys .OZONE_SCM_STALENODE_INTERVAL ;
2224import static org .assertj .core .api .Assertions .assertThat ;
2325import static org .junit .jupiter .api .Assertions .assertInstanceOf ;
3537import org .apache .hadoop .hdds .conf .StorageUnit ;
3638import org .apache .hadoop .hdds .protocol .DatanodeDetails ;
3739import org .apache .hadoop .hdds .protocol .proto .HddsProtos ;
40+ import org .apache .hadoop .hdds .ratis .conf .RatisClientConfig ;
3841import org .apache .hadoop .hdds .scm .OzoneClientConfig ;
3942import org .apache .hadoop .hdds .scm .ScmConfigKeys ;
4043import org .apache .hadoop .hdds .scm .XceiverClientRatis ;
6366import org .slf4j .LoggerFactory ;
6467
6568/**
66- * Verifies that client write operations
67- * fail within acceptable time bounds when pipelines/datanodes are down.
68- *
69+ * Verifies that client write operations fail within acceptable time bounds
70+ * when pipelines/datanodes are down.
71+ * <p>
72+ * This class covers the <em>plumbing</em>: that the layered retry path
73+ * (Ratis-client retries × {@code ozone.client.max.retries}) terminates in
74+ * bounded time when the pipeline is unusable. To keep the suite under the
75+ * per-test wall-clock budget the cluster is started with compressed retry
76+ * and timeout values; the assertions are scaled accordingly. A regression
77+ * that re-introduces unbounded retries here will still trip the bound.
78+ * <p>
79+ * The companion regression check on the <em>production defaults</em> lives
80+ * in {@code TestRatisClientConfig} (hdds-common). That unit test is what
81+ * catches a future revert of any of the HDDS-15444 default values without
82+ * needing a mini-cluster.
6983 */
7084@ TestInstance (TestInstance .Lifecycle .PER_CLASS )
7185@ TestMethodOrder (MethodOrderer .OrderAnnotation .class )
@@ -83,37 +97,21 @@ public class TestClientRetryTimeout {
8397 /**
8498 * Maximum acceptable duration for a SINGLE retry cycle (write + watch)
8599 * when the pipeline is completely dead.
86- * <p>
87- * With the proposed config fixes:
88- * Write: RPC(60s) + retry1(1s+60s) + retry2(2s+60s) ≈ 183s
89- * Watch: RPC(30s)
90- * Total single cycle: ~213s
91- * <p>
92- * We use 4 minutes (240s) as the upper bound with some margin.
93- * The OLD defaults would take ~8 minutes per cycle.
94100 */
95101 private static final Duration MAX_SINGLE_CYCLE_DURATION =
96- Duration .ofMinutes ( 4 );
102+ Duration .ofSeconds ( 90 );
97103
98104 /**
99105 * Maximum acceptable duration for the watch-for-commit operation alone.
100- * <p>
101- * With the proposed fix: watch RPC timeout is 30s, but the write timeout
102- * (70s) may also be hit if majority is lost. We use 120s to accommodate
103- * write(70s) + watch(30s) + overhead.
104- * The OLD default would take 180s+ (3 minutes watch alone).
105106 */
106- private static final Duration MAX_WATCH_DURATION = Duration .ofSeconds (120 );
107+ private static final Duration MAX_WATCH_DURATION = Duration .ofSeconds (75 );
107108
108109 /**
109110 * Maximum acceptable duration for the end-to-end write failure with
110- * Ozone-level retries (ozone.client.max.retries = 5).
111- * <p>
112- * With proposed fixes: 5 × ~93s ≈ 465s ≈ 8 min.
113- * The OLD defaults would take ~40 minutes.
111+ * Ozone-level retries.
114112 */
115113 private static final Duration MAX_TOTAL_WRITE_DURATION =
116- Duration .ofMinutes ( 10 );
114+ Duration .ofSeconds ( 180 );
117115
118116 private MiniOzoneCluster cluster ;
119117 private OzoneClient client ;
@@ -135,15 +133,43 @@ public void init() throws Exception {
135133
136134 OzoneClientConfig clientConfig = conf .getObject (OzoneClientConfig .class );
137135 clientConfig .setStreamBufferFlushDelay (false );
136+ // Cap ozone-level retries at 1 (i.e. 2 attempts total): enough to
137+ // exercise the compound retry multiplier without bloating wall-clock.
138+ clientConfig .setMaxRetryCount (1 );
138139 conf .setFromObject (clientConfig );
139140
141+ // Compress Ratis client retry/timeout knobs for tests only. The test
142+ // verifies that retry plumbing terminates in bounded time; the ratio
143+ // holds at any scale, so we run with smaller absolute values. Values
144+ // are kept conservative enough that a healthy round-trip on a loaded
145+ // CI runner (multi-second GC pauses, JVM stalls) doesn't spuriously
146+ // trip a per-RPC timeout.
147+ RatisClientConfig ratisClient = conf .getObject (RatisClientConfig .class );
148+ ratisClient .setWriteRequestTimeout (Duration .ofSeconds (15 ));
149+ ratisClient .setWatchRequestTimeout (Duration .ofSeconds (10 ));
150+ ratisClient .setExponentialPolicyBaseSleep (Duration .ofMillis (500 ));
151+ ratisClient .setExponentialPolicyMaxSleep (Duration .ofSeconds (1 ));
152+ ratisClient .setExponentialPolicyMaxRetries (1 );
153+ conf .setFromObject (ratisClient );
154+
155+ RatisClientConfig .RaftConfig raftClient =
156+ conf .getObject (RatisClientConfig .RaftConfig .class );
157+ raftClient .setRpcRequestTimeout (Duration .ofSeconds (10 ));
158+ raftClient .setRpcWatchRequestTimeout (Duration .ofSeconds (10 ));
159+ conf .setFromObject (raftClient );
160+
140161 // Fast leader election so new leader can be chosen quickly
141162 conf .setTimeDuration (
142163 OzoneConfigKeys .HDDS_RATIS_LEADER_ELECTION_MINIMUM_TIMEOUT_DURATION_KEY ,
143164 1 , TimeUnit .SECONDS );
144165
145- // Prevent SCM from removing dead datanodes during the test
146- conf .setTimeDuration (OZONE_SCM_STALENODE_INTERVAL , 300 , TimeUnit .SECONDS );
166+ // Fast heartbeats so SCM detects restarted DNs quickly. Stale=60s and
167+ // dead=300s are sized so that SCM does not transition killed DNs to
168+ // DEAD inside a single test (which would change the failure mode from
169+ // "RPC timeout" to "pipeline removed"); STALE during a test is fine.
170+ conf .setTimeDuration (HDDS_HEARTBEAT_INTERVAL , 1 , TimeUnit .SECONDS );
171+ conf .setTimeDuration (OZONE_SCM_STALENODE_INTERVAL , 60 , TimeUnit .SECONDS );
172+ conf .setTimeDuration (OZONE_SCM_DEADNODE_INTERVAL , 300 , TimeUnit .SECONDS );
147173
148174 // Allow multiple pipelines per datanode to accommodate all tests
149175 conf .setInt (ScmConfigKeys .OZONE_DATANODE_PIPELINE_LIMIT , 5 );
@@ -175,15 +201,9 @@ public void shutdown() {
175201 /**
176202 * Test 1: Write to a pipeline where ALL datanodes are dead.
177203 * <p>
178- * Demonstrates the problem:
179- * - Client sends WriteChunk to dead leader
180- * - TimeoutIOException triggers exponential backoff
181- * - OLD config: retries with unlimited max retries for up to 5 minutes
182- * - FIXED config: retries at most 2 times, fails in ~63s
183- * <p>
184- * This test asserts that the write failure is detected within
185- * MAX_SINGLE_CYCLE_DURATION (4 minutes), which would fail with the old
186- * 5-minute write timeout + 3-minute watch timeout = 8 minutes.
204+ * Verifies that when WriteChunk is sent to a dead leader, exponential
205+ * backoff terminates after the configured retry count and surfaces the
206+ * failure within {@link #MAX_SINGLE_CYCLE_DURATION}.
187207 */
188208 @ Test
189209 @ Order (1 )
@@ -239,24 +259,23 @@ public void testWriteToDeadPipelineFailsFast() throws Exception {
239259 MAX_SINGLE_CYCLE_DURATION , elapsed )
240260 .isLessThan (MAX_SINGLE_CYCLE_DURATION );
241261
242- // Restart the datanodes for subsequent tests
262+ // Restart the datanodes and wait for SCM to have a writable pipeline
263+ // before the next test runs. We don't need full cluster recovery
264+ // (all 7 DNs HEALTHY), only one OPEN factor-THREE pipeline.
243265 for (DatanodeDetails dn : nodes ) {
244266 cluster .restartHddsDatanode (dn , false );
245267 }
246- cluster .waitForClusterToBeReady ();
268+ cluster .waitForPipelineTobeReady (HddsProtos .ReplicationFactor .THREE ,
269+ 60000 );
247270 }
248271
249272 /**
250273 * Test 2: Watch-for-commit when follower datanodes are dead.
251274 * <p>
252- * Demonstrates the problem:
253- * - Client writes data, leader commits but followers are dead
254- * - Watch-ALL_COMMITTED is issued
255- * - OLD config: watch RPC timeout is 180s, client waits 3 minutes
256- * - FIXED config: watch RPC timeout is 30s, matches server timeout
257- * <p>
258- * This test asserts that the watch failure is detected within
259- * MAX_WATCH_DURATION (60s), which would fail with the old 180s timeout.
275+ * Verifies that {@code watch(ALL_COMMITTED)} terminates within
276+ * {@link #MAX_WATCH_DURATION} when followers cannot ack — the client
277+ * watch RPC timeout must align with the server-side watch timeout
278+ * rather than running well past it.
260279 */
261280 @ Test
262281 @ Order (2 )
@@ -328,21 +347,16 @@ public void testWatchForCommitWithDeadFollowersFailsFast() throws Exception {
328347 } catch (Exception e ) {
329348 // May already be running
330349 }
331- cluster .waitForClusterToBeReady ();
350+ cluster .waitForPipelineTobeReady (HddsProtos .ReplicationFactor .THREE ,
351+ 60000 );
332352 }
333353
334354 /**
335355 * Test 3: Write failure when the Raft leader is specifically killed.
336356 * <p>
337- * Demonstrates the problem:
338- * - Client is writing to a pipeline
339- * - The Raft leader is killed mid-write
340- * - RaftClient still holds stale connection to dead leader
341- * - OLD config: exponential backoff retries indefinitely for 5 min
342- * - FIXED config: limited to 2 retries, fails in ~63s
343- * <p>
344- * This test asserts the entire operation (write + close) completes
345- * within MAX_SINGLE_CYCLE_DURATION.
357+ * Verifies that with the leader killed mid-write, the RaftClient does
358+ * not retry forever against the stale leader; the write either recovers
359+ * via re-election or fails within {@link #MAX_SINGLE_CYCLE_DURATION}.
346360 */
347361 @ Test
348362 @ Order (3 )
@@ -404,21 +418,19 @@ public void testWriteWithLeaderFailureFailsFast() throws Exception {
404418
405419 // Restart the leader
406420 cluster .restartHddsDatanode (leader .getDatanodeDetails (), false );
407- cluster .waitForClusterToBeReady ();
421+ cluster .waitForPipelineTobeReady (HddsProtos .ReplicationFactor .THREE ,
422+ 60000 );
408423 }
409424
410425 /**
411426 * Test 4: End-to-end write with ALL datanodes killed, verifying the
412427 * total time including Ozone-level retries.
413428 * <p>
414- * Demonstrates the compound problem:
415- * - Ozone retries up to 5 times (ozone.client.max.retries)
416- * - Each retry allocates a new pipeline, but ALL datanodes are dead
417- * - OLD config: 5 × (5min write + 3min watch) = 40 minutes
418- * - FIXED config: 5 × (~63s write + 30s watch) ≈ 7.7 minutes
419- * <p>
420- * This test asserts the TOTAL time is within MAX_TOTAL_WRITE_DURATION
421- * (10 minutes), which would fail with the old 40-minute worst case.
429+ * Verifies that the compound retry path
430+ * ({@code ozone.client.max.retries} × per-cycle Ratis retries) does not
431+ * multiply into an unbounded wait. This is the test that catches the
432+ * interaction between the two retry layers; tests 1–3 only exercise
433+ * each layer in isolation.
422434 */
423435 @ Test
424436 @ Order (4 )
@@ -464,12 +476,6 @@ public void testEndToEndWriteWithAllDatanodesDownFailsFast()
464476 + "is causing the client to hang." ,
465477 MAX_TOTAL_WRITE_DURATION , elapsed )
466478 .isLessThan (MAX_TOTAL_WRITE_DURATION );
467-
468- // Restart all datanodes
469- for (HddsDatanodeService dn : allDatanodes ) {
470- cluster .restartHddsDatanode (dn .getDatanodeDetails (), false );
471- }
472- cluster .waitForClusterToBeReady ();
473479 }
474480
475481 private String getKeyName () {
0 commit comments