HDDS-15569. Speed up TestClientRetryTimeout by tightening configurations (#10527)

ptlrs · web-flow · commit 3083438b186a · 2026-06-17T07:35:39.000+02:00
diff --git a/hadoop-hdds/common/src/test/java/org/apache/hadoop/hdds/ratis/conf/TestRatisClientConfig.java b/hadoop-hdds/common/src/test/java/org/apache/hadoop/hdds/ratis/conf/TestRatisClientConfig.java
@@ -17,6 +17,7 @@
 
 package org.apache.hadoop.hdds.ratis.conf;
 
+import static org.assertj.core.api.Assertions.assertThat;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 
 import java.time.Duration;
@@ -69,4 +70,73 @@ void setAndGet() {
     assertEquals(maxRetry, subject.getExponentialPolicyMaxRetries());
   }
 
+  /**
+   * Regression guard for HDDS-15444: the production defaults must keep the
+   * worst-case wall-clock of a single Ratis-client retry cycle bounded.
+   * Per cycle = write-rpc + max-retries × (backoff-sleep + write-rpc) +
+   * watch-rpc. With the post-HDDS-15444 defaults this is ~213 s; we assert
+   * it stays under 4 minutes so a future revert of any one knob is caught
+   * in a unit test rather than in a multi-minute integration test.
+   */
+  @Test
+  void defaultsBoundSingleCycleWallClock() {
+    RatisClientConfig subject = new OzoneConfiguration()
+        .getObject(RatisClientConfig.class);
+    RatisClientConfig.RaftConfig raftSubject = new OzoneConfiguration()
+        .getObject(RatisClientConfig.RaftConfig.class);
+
+    Duration writeRpc = raftSubject.getRpcRequestTimeout();
+    Duration watchRpc = raftSubject.getRpcWatchRequestTimeout();
+    int maxRetries = subject.getExponentialPolicyMaxRetries();
+    Duration maxBackoff = subject.getExponentialPolicyMaxSleep();
+
+    Duration perCycle = writeRpc
+        .plus(maxBackoff.plus(writeRpc).multipliedBy(maxRetries))
+        .plus(watchRpc);
+
+    assertThat(perCycle)
+        .as("Single Ratis-client retry cycle worst-case wall-clock with "
+                + "production defaults (writeRpc=%s, watchRpc=%s, maxRetries=%d, "
+                + "maxBackoff=%s) must stay bounded; a regression here means "
+                + "client writes against a dead pipeline can hang for minutes.",
+            writeRpc, watchRpc, maxRetries, maxBackoff)
+        .isLessThan(Duration.ofMinutes(4));
+  }
+
+  /**
+   * Regression guard for HDDS-15444: the bounded exponential backoff is
+   * what stops the Ratis client from retrying indefinitely. If this is
+   * ever set back to {@code Integer.MAX_VALUE} (the pre-HDDS-15444
+   * behaviour) write failures revert to multi-minute hangs.
+   */
+  @Test
+  void defaultsCapExponentialMaxRetries() {
+    RatisClientConfig subject = new OzoneConfiguration()
+        .getObject(RatisClientConfig.class);
+
+    assertThat(subject.getExponentialPolicyMaxRetries())
+        .as("hdds.ratis.client.exponential.backoff.max.retries must remain "
+            + "bounded; unbounded retries reintroduce the HDDS-15444 hang.")
+        .isPositive()
+        .isLessThanOrEqualTo(5);
+  }
+
+  /**
+   * Regression guard for HDDS-15444: the client-side watch RPC timeout
+   * must align with the server-side watch timeout (30 s by default).
+   * If the client waits longer than the server is willing to honour, the
+   * client hangs past the server-side abort.
+   */
+  @Test
+  void defaultsAlignWatchTimeoutWithServer() {
+    RatisClientConfig.RaftConfig raftSubject = new OzoneConfiguration()
+        .getObject(RatisClientConfig.RaftConfig.class);
+
+    assertThat(raftSubject.getRpcWatchRequestTimeout())
+        .as("hdds.ratis.raft.client.rpc.watch.request.timeout should be "
+            + "close to the server-side watch timeout (30 s); a much larger "
+            + "value lets the client hang past the server's abort.")
+        .isLessThanOrEqualTo(Duration.ofSeconds(60));
+  }
+
 }
diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestClientRetryTimeout.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestClientRetryTimeout.java
@@ -18,6 +18,8 @@
 package org.apache.hadoop.ozone.client.rpc;
 
 import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_HEARTBEAT_INTERVAL;
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_DEADNODE_INTERVAL;
 import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL;
 import static org.assertj.core.api.Assertions.assertThat;
 import static org.junit.jupiter.api.Assertions.assertInstanceOf;
@@ -35,6 +37,7 @@
 import org.apache.hadoop.hdds.conf.StorageUnit;
 import org.apache.hadoop.hdds.protocol.DatanodeDetails;
 import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
+import org.apache.hadoop.hdds.ratis.conf.RatisClientConfig;
 import org.apache.hadoop.hdds.scm.OzoneClientConfig;
 import org.apache.hadoop.hdds.scm.ScmConfigKeys;
 import org.apache.hadoop.hdds.scm.XceiverClientRatis;
@@ -63,9 +66,20 @@
 import org.slf4j.LoggerFactory;
 
 /**
- * Verifies that client write operations
- * fail within acceptable time bounds when pipelines/datanodes are down.
- *
+ * Verifies that client write operations fail within acceptable time bounds
+ * when pipelines/datanodes are down.
+ * <p>
+ * This class covers the <em>plumbing</em>: that the layered retry path
+ * (Ratis-client retries × {@code ozone.client.max.retries}) terminates in
+ * bounded time when the pipeline is unusable. To keep the suite under the
+ * per-test wall-clock budget the cluster is started with compressed retry
+ * and timeout values; the assertions are scaled accordingly. A regression
+ * that re-introduces unbounded retries here will still trip the bound.
+ * <p>
+ * The companion regression check on the <em>production defaults</em> lives
+ * in {@code TestRatisClientConfig} (hdds-common). That unit test is what
+ * catches a future revert of any of the HDDS-15444 default values without
+ * needing a mini-cluster.
  */
 @TestInstance(TestInstance.Lifecycle.PER_CLASS)
 @TestMethodOrder(MethodOrderer.OrderAnnotation.class)
@@ -83,37 +97,21 @@ public class TestClientRetryTimeout {
   /**
    * Maximum acceptable duration for a SINGLE retry cycle (write + watch)
    * when the pipeline is completely dead.
-   * <p>
-   * With the proposed config fixes:
-   * Write: RPC(60s) + retry1(1s+60s) + retry2(2s+60s) ≈ 183s
-   * Watch: RPC(30s)
-   * Total single cycle: ~213s
-   * <p>
-   * We use 4 minutes (240s) as the upper bound with some margin.
-   * The OLD defaults would take ~8 minutes per cycle.
    */
   private static final Duration MAX_SINGLE_CYCLE_DURATION =
-      Duration.ofMinutes(4);
+      Duration.ofSeconds(90);
 
   /**
    * Maximum acceptable duration for the watch-for-commit operation alone.
-   * <p>
-   * With the proposed fix: watch RPC timeout is 30s, but the write timeout
-   * (70s) may also be hit if majority is lost. We use 120s to accommodate
-   * write(70s) + watch(30s) + overhead.
-   * The OLD default would take 180s+ (3 minutes watch alone).
    */
-  private static final Duration MAX_WATCH_DURATION = Duration.ofSeconds(120);
+  private static final Duration MAX_WATCH_DURATION = Duration.ofSeconds(75);
 
   /**
    * Maximum acceptable duration for the end-to-end write failure with
-   * Ozone-level retries (ozone.client.max.retries = 5).
-   * <p>
-   * With proposed fixes: 5 × ~93s ≈ 465s ≈ 8 min.
-   * The OLD defaults would take ~40 minutes.
+   * Ozone-level retries.
    */
   private static final Duration MAX_TOTAL_WRITE_DURATION =
-      Duration.ofMinutes(10);
+      Duration.ofSeconds(180);
 
   private MiniOzoneCluster cluster;
   private OzoneClient client;
@@ -135,15 +133,43 @@ public void init() throws Exception {
 
     OzoneClientConfig clientConfig = conf.getObject(OzoneClientConfig.class);
     clientConfig.setStreamBufferFlushDelay(false);
+    // Cap ozone-level retries at 1 (i.e. 2 attempts total): enough to
+    // exercise the compound retry multiplier without bloating wall-clock.
+    clientConfig.setMaxRetryCount(1);
     conf.setFromObject(clientConfig);
 
+    // Compress Ratis client retry/timeout knobs for tests only. The test
+    // verifies that retry plumbing terminates in bounded time; the ratio
+    // holds at any scale, so we run with smaller absolute values. Values
+    // are kept conservative enough that a healthy round-trip on a loaded
+    // CI runner (multi-second GC pauses, JVM stalls) doesn't spuriously
+    // trip a per-RPC timeout.
+    RatisClientConfig ratisClient = conf.getObject(RatisClientConfig.class);
+    ratisClient.setWriteRequestTimeout(Duration.ofSeconds(15));
+    ratisClient.setWatchRequestTimeout(Duration.ofSeconds(10));
+    ratisClient.setExponentialPolicyBaseSleep(Duration.ofMillis(500));
+    ratisClient.setExponentialPolicyMaxSleep(Duration.ofSeconds(1));
+    ratisClient.setExponentialPolicyMaxRetries(1);
+    conf.setFromObject(ratisClient);
+
+    RatisClientConfig.RaftConfig raftClient =
+        conf.getObject(RatisClientConfig.RaftConfig.class);
+    raftClient.setRpcRequestTimeout(Duration.ofSeconds(10));
+    raftClient.setRpcWatchRequestTimeout(Duration.ofSeconds(10));
+    conf.setFromObject(raftClient);
+
     // Fast leader election so new leader can be chosen quickly
     conf.setTimeDuration(
         OzoneConfigKeys.HDDS_RATIS_LEADER_ELECTION_MINIMUM_TIMEOUT_DURATION_KEY,
         1, TimeUnit.SECONDS);
 
-    // Prevent SCM from removing dead datanodes during the test
-    conf.setTimeDuration(OZONE_SCM_STALENODE_INTERVAL, 300, TimeUnit.SECONDS);
+    // Fast heartbeats so SCM detects restarted DNs quickly. Stale=60s and
+    // dead=300s are sized so that SCM does not transition killed DNs to
+    // DEAD inside a single test (which would change the failure mode from
+    // "RPC timeout" to "pipeline removed"); STALE during a test is fine.
+    conf.setTimeDuration(HDDS_HEARTBEAT_INTERVAL, 1, TimeUnit.SECONDS);
+    conf.setTimeDuration(OZONE_SCM_STALENODE_INTERVAL, 60, TimeUnit.SECONDS);
+    conf.setTimeDuration(OZONE_SCM_DEADNODE_INTERVAL, 300, TimeUnit.SECONDS);
 
     // Allow multiple pipelines per datanode to accommodate all tests
     conf.setInt(ScmConfigKeys.OZONE_DATANODE_PIPELINE_LIMIT, 5);
@@ -175,15 +201,9 @@ public void shutdown() {
   /**
    * Test 1: Write to a pipeline where ALL datanodes are dead.
    * <p>
-   * Demonstrates the problem:
-   * - Client sends WriteChunk to dead leader
-   * - TimeoutIOException triggers exponential backoff
-   * - OLD config: retries with unlimited max retries for up to 5 minutes
-   * - FIXED config: retries at most 2 times, fails in ~63s
-   * <p>
-   * This test asserts that the write failure is detected within
-   * MAX_SINGLE_CYCLE_DURATION (4 minutes), which would fail with the old
-   * 5-minute write timeout + 3-minute watch timeout = 8 minutes.
+   * Verifies that when WriteChunk is sent to a dead leader, exponential
+   * backoff terminates after the configured retry count and surfaces the
+   * failure within {@link #MAX_SINGLE_CYCLE_DURATION}.
    */
   @Test
   @Order(1)
@@ -239,24 +259,23 @@ public void testWriteToDeadPipelineFailsFast() throws Exception {
             MAX_SINGLE_CYCLE_DURATION, elapsed)
         .isLessThan(MAX_SINGLE_CYCLE_DURATION);
 
-    // Restart the datanodes for subsequent tests
+    // Restart the datanodes and wait for SCM to have a writable pipeline
+    // before the next test runs. We don't need full cluster recovery
+    // (all 7 DNs HEALTHY), only one OPEN factor-THREE pipeline.
     for (DatanodeDetails dn : nodes) {
       cluster.restartHddsDatanode(dn, false);
     }
-    cluster.waitForClusterToBeReady();
+    cluster.waitForPipelineTobeReady(HddsProtos.ReplicationFactor.THREE,
+        60000);
   }
 
   /**
    * Test 2: Watch-for-commit when follower datanodes are dead.
    * <p>
-   * Demonstrates the problem:
-   * - Client writes data, leader commits but followers are dead
-   * - Watch-ALL_COMMITTED is issued
-   * - OLD config: watch RPC timeout is 180s, client waits 3 minutes
-   * - FIXED config: watch RPC timeout is 30s, matches server timeout
-   * <p>
-   * This test asserts that the watch failure is detected within
-   * MAX_WATCH_DURATION (60s), which would fail with the old 180s timeout.
+   * Verifies that {@code watch(ALL_COMMITTED)} terminates within
+   * {@link #MAX_WATCH_DURATION} when followers cannot ack — the client
+   * watch RPC timeout must align with the server-side watch timeout
+   * rather than running well past it.
    */
   @Test
   @Order(2)
@@ -328,21 +347,16 @@ public void testWatchForCommitWithDeadFollowersFailsFast() throws Exception {
     } catch (Exception e) {
       // May already be running
     }
-    cluster.waitForClusterToBeReady();
+    cluster.waitForPipelineTobeReady(HddsProtos.ReplicationFactor.THREE,
+        60000);
   }
 
   /**
    * Test 3: Write failure when the Raft leader is specifically killed.
    * <p>
-   * Demonstrates the problem:
-   * - Client is writing to a pipeline
-   * - The Raft leader is killed mid-write
-   * - RaftClient still holds stale connection to dead leader
-   * - OLD config: exponential backoff retries indefinitely for 5 min
-   * - FIXED config: limited to 2 retries, fails in ~63s
-   * <p>
-   * This test asserts the entire operation (write + close) completes
-   * within MAX_SINGLE_CYCLE_DURATION.
+   * Verifies that with the leader killed mid-write, the RaftClient does
+   * not retry forever against the stale leader; the write either recovers
+   * via re-election or fails within {@link #MAX_SINGLE_CYCLE_DURATION}.
    */
   @Test
   @Order(3)
@@ -404,21 +418,19 @@ public void testWriteWithLeaderFailureFailsFast() throws Exception {
 
     // Restart the leader
     cluster.restartHddsDatanode(leader.getDatanodeDetails(), false);
-    cluster.waitForClusterToBeReady();
+    cluster.waitForPipelineTobeReady(HddsProtos.ReplicationFactor.THREE,
+        60000);
   }
 
   /**
    * Test 4: End-to-end write with ALL datanodes killed, verifying the
    * total time including Ozone-level retries.
    * <p>
-   * Demonstrates the compound problem:
-   * - Ozone retries up to 5 times (ozone.client.max.retries)
-   * - Each retry allocates a new pipeline, but ALL datanodes are dead
-   * - OLD config: 5 × (5min write + 3min watch) = 40 minutes
-   * - FIXED config: 5 × (~63s write + 30s watch) ≈ 7.7 minutes
-   * <p>
-   * This test asserts the TOTAL time is within MAX_TOTAL_WRITE_DURATION
-   * (10 minutes), which would fail with the old 40-minute worst case.
+   * Verifies that the compound retry path
+   * ({@code ozone.client.max.retries} × per-cycle Ratis retries) does not
+   * multiply into an unbounded wait. This is the test that catches the
+   * interaction between the two retry layers; tests 1–3 only exercise
+   * each layer in isolation.
    */
   @Test
   @Order(4)
@@ -464,12 +476,6 @@ public void testEndToEndWriteWithAllDatanodesDownFailsFast()
                 + "is causing the client to hang.",
             MAX_TOTAL_WRITE_DURATION, elapsed)
         .isLessThan(MAX_TOTAL_WRITE_DURATION);
-
-    // Restart all datanodes
-    for (HddsDatanodeService dn : allDatanodes) {
-      cluster.restartHddsDatanode(dn.getDatanodeDetails(), false);
-    }
-    cluster.waitForClusterToBeReady();
   }
 
   private String getKeyName() {