apache
diff --git a/‎hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java‎
Lines changed: 8 additions & 0 deletions b/‎hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/utils/ConnectionFailureUtils.java‎
Lines changed: 101 additions & 0 deletions b/‎hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/utils/ConnectionFailureUtils.java‎
Lines changed: 101 additions & 0 deletions
diff --git a/‎hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/OzoneConfigKeys.java‎
Lines changed: 13 additions & 0 deletions b/‎hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/OzoneConfigKeys.java‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎hadoop-hdds/common/src/main/resources/ozone-default.xml‎
Lines changed: 40 additions & 0 deletions b/‎hadoop-hdds/common/src/main/resources/ozone-default.xml‎
Lines changed: 40 additions & 0 deletions
diff --git a/‎hadoop-hdds/common/src/test/java/org/apache/hadoop/hdds/ratis/conf/TestRatisClientConfig.java‎
Lines changed: 70 additions & 0 deletions b/‎hadoop-hdds/common/src/test/java/org/apache/hadoop/hdds/ratis/conf/TestRatisClientConfig.java‎
Lines changed: 70 additions & 0 deletions
@@ -116,6 +116,14 @@ public final class HddsConfigKeys {
       "hdds.scm.safemode.log.interval";
   public static final String HDDS_SCM_SAFEMODE_LOG_INTERVAL_DEFAULT = "1m";
 
+  /**
+   * Interval between background refreshes of SCM safemode rules; 0 disables the background thread.
+   */
+  public static final String HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL =
+      "hdds.scm.safemode.rule.refresh.interval";
+  public static final String
+      HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL_DEFAULT = "5s";
+
   // This configuration setting is used as a fallback location by all
   // Ozone/HDDS services for their metadata. It is useful as a single
   // config point for test/PoC clusters.
 
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hdds.utils;
+
+import java.io.EOFException;
+import java.net.ConnectException;
+import java.net.NoRouteToHostException;
+import java.net.SocketException;
+import java.net.SocketTimeoutException;
+import java.net.UnknownHostException;
+
+/**
+ * Shared classifier for exceptions where the cached peer IP is no longer
+ * reachable and DNS re-resolution is the only plausible recovery path.
+ * <p>
+ * Used by both {@code SCMFailoverProxyProviderBase} and
+ * {@code OMFailoverProxyProviderBase} to gate the DNS-refresh-on-failure
+ * code path so that application-level errors (NotLeader, AccessControl,
+ * OMException, RetryAction) do not trigger spurious DNS lookups.
+ * <p>
+ * The classifier must match the failure shapes seen in production
+ * Kubernetes deployments where the peer pod has been rescheduled to a
+ * new IP under a stable hostname:
+ * <ul>
+ *   <li>{@link ConnectException} -- the TCP SYN was refused. Seen on
+ *       OpenStack / fast-RST environments. </li>
+ *   <li>{@link SocketTimeoutException} (and its IPC subclass
+ *       {@code ConnectTimeoutException}) -- the SYN was dropped silently.
+ *       This is the dominant failure shape on AWS EC2 / EKS where the
+ *       network silently drops packets to a defunct pod IP. It is the
+ *       primary case this helper was introduced for (HDDS-15514), so it
+ *       must remain in the filter. </li>
+ *   <li>{@link NoRouteToHostException} -- routing table no longer
+ *       reaches the cached IP. </li>
+ *   <li>{@link UnknownHostException} -- the hostname itself failed to
+ *       resolve at the time the IPC layer reconstructed the address. </li>
+ *   <li>{@link EOFException} -- a load balancer or iptables RST closed
+ *       the half-open connection cleanly. Common in Kubernetes when an
+ *       IP is reassigned to an unrelated pod that rejects the RPC
+ *       handshake. </li>
+ *   <li>{@link SocketException} (e.g. "Connection reset") -- the peer
+ *       sent RST mid-stream. </li>
+ * </ul>
+ * The walk is bounded to {@value #MAX_CAUSE_DEPTH} levels to defend
+ * against cause chains that have been constructed (in violation of
+ * {@code Throwable.initCause}'s contract) into a cycle of length &gt; 1.
+ */
+public final class ConnectionFailureUtils {
+
+  /**
+   * Maximum number of links in the {@code Throwable.getCause()} chain
+   * (the throwable itself included) inspected before giving up. Matches
+   * Hadoop's own walkers in {@code RemoteException} handling.
+   */
+  static final int MAX_CAUSE_DEPTH = 16;
+
+  private ConnectionFailureUtils() { // static helpers only; never instantiated
+  }
+
+  /**
+   * Returns true when {@code t} or one of its causes is a connection-class
+   * exception; at most {@link #MAX_CAUSE_DEPTH} throwables are inspected.
+   *
+   * @param t the throwable to classify. {@code null} returns false.
+   * @return true if a connection-class exception was found, false otherwise.
+   */
+  public static boolean isConnectionFailure(Throwable t) {
+    Throwable cause = t;
+    for (int depth = 0; cause != null && depth < MAX_CAUSE_DEPTH; depth++) {
+      if (cause instanceof ConnectException // already a SocketException subclass; explicit entry is redundant but harmless
+          || cause instanceof SocketTimeoutException
+          || cause instanceof NoRouteToHostException // already a SocketException subclass; explicit entry is redundant but harmless
+          || cause instanceof UnknownHostException
+          || cause instanceof EOFException
+          || cause instanceof SocketException) { // also covers e.g. "Connection reset"
+        return true;
+      }
+      Throwable next = cause.getCause();
+      if (next == cause) { // self-referential cause: stop rather than re-check the same throwable
+        break;
+      }
+      cause = next; // a null cause ends the walk via the loop condition
+    }
+    return false; // chain exhausted or depth limit reached without a match
+  }
+}
@@ -604,6 +604,19 @@ public final class OzoneConfigKeys {
   public static final boolean OZONE_JVM_NETWORK_ADDRESS_CACHE_ENABLED_DEFAULT =
           true;
 
+  /**
+   * When true, RPC clients (DN heartbeat, OM client, SCM client) re-resolve
+   * cached hostnames on connection failure and rebuild the proxy if the
+   * resolved IP changed; meant for setups like Kubernetes where pod IPs change
+   * under stable DNS names. Default false preserves pre-fix behavior. Secure
+   * clusters must also set hadoop.security.token.service.use_ip=false (see
+   * ozone-default.xml). Mirrors the intent of HADOOP-17068 / HDFS-14118.
+   */
+  public static final String OZONE_CLIENT_FAILOVER_RESOLVE_NEEDED_KEY =
+          "ozone.client.failover.resolve-needed";
+  public static final boolean OZONE_CLIENT_FAILOVER_RESOLVE_NEEDED_DEFAULT =
+          false;
+
   public static final String OZONE_CLIENT_REQUIRED_OM_VERSION_MIN_KEY =
       "ozone.client.required.om.version.min";
 
 
@@ -1717,6 +1717,13 @@
       reported replica before SCM comes out of safe mode.
     </description>
   </property>
+  <property>
+    <name>hdds.scm.safemode.rule.refresh.interval</name>
+    <value>5s</value>
+    <tag>HDDS,SCM,OPERATION</tag>
+    <description>Interval for the background refresh of SCM safemode rules.
+      A value of 0 disables the background refresh thread.</description>
+  </property>
 
   <property>
     <name>hdds.scm.wait.time.after.safemode.exit</name>
@@ -3913,6 +3920,39 @@
     </description>
   </property>
 
+  <property>
+    <name>ozone.client.failover.resolve-needed</name>
+    <value>false</value>
+    <tag>OZONE, CLIENT, OM, SCM, HA</tag>
+    <description>When true, RPC clients (DN heartbeat, OM client, SCM
+      client) re-resolve cached hostnames on connection-class failures
+      (ConnectException, SocketTimeoutException, NoRouteToHostException,
+      UnknownHostException, EOFException, SocketException) and rebuild
+      the proxy if the resolved IP has changed. Set to true in
+      environments where server pod IPs may change while DNS names
+      remain stable, such as Kubernetes. Default false preserves
+      pre-fix behaviour. Mirrors the design intent of HADOOP-17068 /
+      HDFS-14118.
+
+      Required co-config for SECURE clusters: when this flag is true,
+      operators must ALSO set hadoop.security.token.service.use_ip=false
+      (in core-site.xml). Reason: the Hadoop delegation-token service
+      identifier defaults to an IP:port string. After a refresh, the
+      per-OM service identifier built from the new IP no longer matches
+      the IP-based service captured on long-lived tokens, and token
+      selection (OzoneDelegationTokenSelector) silently fails for the
+      refreshed peer. With use_ip=false the service identifier is the
+      stable hostname:port, which survives any IP change. Insecure
+      clusters do not need the co-config.
+
+      Note: ozone.network.jvm.address.cache.enabled controls a related
+      but distinct concern -- the JVM-level positive DNS cache TTL.
+      That setting only affects future name lookups; this setting
+      additionally rebuilds long-lived RPC proxies whose
+      InetSocketAddress was frozen at process start.
+    </description>
+  </property>
+
   <property>
     <name>ozone.directory.deleting.service.interval</name>
     <value>1m</value>
 
@@ -17,6 +17,7 @@
 
 package org.apache.hadoop.hdds.ratis.conf;
 
+import static org.assertj.core.api.Assertions.assertThat;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 
 import java.time.Duration;
@@ -69,4 +70,73 @@ void setAndGet() {
     assertEquals(maxRetry, subject.getExponentialPolicyMaxRetries());
   }
 
+  /**
+   * Regression guard for HDDS-15444: the production defaults must keep the
+   * worst-case wall-clock of a single Ratis-client retry cycle bounded.
+   * Per cycle = write-rpc + max-retries * (max-backoff-sleep + write-rpc) +
+   * watch-rpc. With the post-HDDS-15444 defaults this is ~213 s; we assert
+   * it stays under 4 minutes so a future revert of any one knob is caught
+   * in a unit test rather than in a multi-minute integration test.
+   */
+  @Test
+  void defaultsBoundSingleCycleWallClock() {
+    RatisClientConfig subject = new OzoneConfiguration()
+        .getObject(RatisClientConfig.class);
+    RatisClientConfig.RaftConfig raftSubject = new OzoneConfiguration()
+        .getObject(RatisClientConfig.RaftConfig.class);
+    // Read the four defaults that feed the worst-case formula.
+    Duration writeRpc = raftSubject.getRpcRequestTimeout();
+    Duration watchRpc = raftSubject.getRpcWatchRequestTimeout();
+    int maxRetries = subject.getExponentialPolicyMaxRetries();
+    Duration maxBackoff = subject.getExponentialPolicyMaxSleep();
+    // perCycle = writeRpc + maxRetries * (maxBackoff + writeRpc) + watchRpc
+    Duration perCycle = writeRpc
+        .plus(maxBackoff.plus(writeRpc).multipliedBy(maxRetries))
+        .plus(watchRpc);
+
+    assertThat(perCycle)
+        .as("Single Ratis-client retry cycle worst-case wall-clock with "
+                + "production defaults (writeRpc=%s, watchRpc=%s, maxRetries=%d, "
+                + "maxBackoff=%s) must stay bounded; a regression here means "
+                + "client writes against a dead pipeline can hang for minutes.",
+            writeRpc, watchRpc, maxRetries, maxBackoff)
+        .isLessThan(Duration.ofMinutes(4));
+  }
+
+  /**
+   * Regression guard for HDDS-15444: the bounded exponential backoff is
+   * what stops the Ratis client from retrying indefinitely. If the default
+   * max-retries is ever set back to {@code Integer.MAX_VALUE} (the
+   * pre-HDDS-15444 behaviour) write failures revert to multi-minute hangs.
+   */
+  @Test
+  void defaultsCapExponentialMaxRetries() {
+    RatisClientConfig subject = new OzoneConfiguration()
+        .getObject(RatisClientConfig.class);
+    // The default must be a positive retry count no greater than 5.
+    assertThat(subject.getExponentialPolicyMaxRetries())
+        .as("hdds.ratis.client.exponential.backoff.max.retries must remain "
+            + "bounded; unbounded retries reintroduce the HDDS-15444 hang.")
+        .isPositive()
+        .isLessThanOrEqualTo(5);
+  }
+
+  /**
+   * Regression guard for HDDS-15444: the client-side watch RPC timeout must
+   * stay close to the server-side watch timeout (30 s by default); the check
+   * tolerates up to 60 s. If the client waits longer than the server is
+   * willing to honour, the client hangs past the server-side abort.
+   */
+  @Test
+  void defaultsAlignWatchTimeoutWithServer() {
+    RatisClientConfig.RaftConfig raftSubject = new OzoneConfiguration()
+        .getObject(RatisClientConfig.RaftConfig.class);
+    // The 60 s ceiling leaves slack above the 30 s server-side default.
+    assertThat(raftSubject.getRpcWatchRequestTimeout())
+        .as("hdds.ratis.raft.client.rpc.watch.request.timeout should be "
+            + "close to the server-side watch timeout (30 s); a much larger "
+            + "value lets the client hang past the server's abort.")
+        .isLessThanOrEqualTo(Duration.ofSeconds(60));
+  }
+
 }