Skip to content

Commit 18c63b8

Browse files
committed
Merge remote-tracking branch 'origin/master' into HDDS-7956
2 parents 8596a9b + 5a9d5ed commit 18c63b8

67 files changed

Lines changed: 5020 additions & 140 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,14 @@ public final class HddsConfigKeys {
116116
"hdds.scm.safemode.log.interval";
117117
public static final String HDDS_SCM_SAFEMODE_LOG_INTERVAL_DEFAULT = "1m";
118118

119+
/**
120+
* Interval for background refresh of safeMode rules. 0 disables the background thread.
121+
*/
122+
public static final String HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL =
123+
"hdds.scm.safemode.rule.refresh.interval";
124+
public static final String
125+
HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL_DEFAULT = "5s";
126+
119127
// This configuration setting is used as a fallback location by all
120128
// Ozone/HDDS services for their metadata. It is useful as a single
121129
// config point for test/PoC clusters.
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.hadoop.hdds.utils;
19+
20+
import java.io.EOFException;
21+
import java.net.ConnectException;
22+
import java.net.NoRouteToHostException;
23+
import java.net.SocketException;
24+
import java.net.SocketTimeoutException;
25+
import java.net.UnknownHostException;
26+
27+
/**
28+
* Shared classifier for exceptions where the cached peer IP is no longer
29+
* reachable and DNS re-resolution is the only plausible recovery path.
30+
* <p>
31+
* Used by both {@code SCMFailoverProxyProviderBase} and
32+
* {@code OMFailoverProxyProviderBase} to gate the DNS-refresh-on-failure
33+
* code path so that application-level errors (NotLeader, AccessControl,
34+
* OMException, RetryAction) do not trigger spurious DNS lookups.
35+
* <p>
36+
* The classifier must match the failure shapes seen in production
37+
* Kubernetes deployments where the peer pod has been rescheduled to a
38+
* new IP under a stable hostname:
39+
* <ul>
40+
* <li>{@link ConnectException} -- the TCP SYN was refused. Seen on
41+
* OpenStack / fast-RST environments. </li>
42+
* <li>{@link SocketTimeoutException} (and its IPC subclass
43+
* {@code ConnectTimeoutException}) -- the SYN was dropped silently.
44+
* This is the dominant failure shape on AWS EC2 / EKS where the
45+
* network silently drops packets to a defunct pod IP. This is the
46+
* primary case that motivated this helper (HDDS-15514), so it
47+
* must be covered by the filter. </li>
48+
* <li>{@link NoRouteToHostException} -- routing table no longer
49+
* reaches the cached IP. </li>
50+
* <li>{@link UnknownHostException} -- the hostname itself failed to
51+
* resolve at the time the IPC layer reconstructed the address. </li>
52+
* <li>{@link EOFException} -- a load balancer or iptables RST closed
53+
* the half-open connection cleanly. Common in Kubernetes when an
54+
* IP is reassigned to an unrelated pod that rejects the RPC
55+
* handshake. </li>
56+
* <li>{@link SocketException} (e.g. "Connection reset") -- the peer
57+
* sent RST mid-stream. </li>
58+
* </ul>
59+
* The walk is bounded to {@value #MAX_CAUSE_DEPTH} levels to defend
60+
* against cause chains that have been constructed (in violation of
61+
* {@code Throwable.initCause}'s contract) into a cycle of length &gt; 1.
62+
*/
63+
public final class ConnectionFailureUtils {

  /**
   * Upper bound on how many links of a {@code Throwable.getCause()} chain
   * are inspected before the search is abandoned. The cap guarantees
   * termination even when a cause chain has been wired into a cycle.
   * NOTE(review): the value was originally described as matching Hadoop's
   * own cause-chain walkers used for {@code RemoteException} handling;
   * that cannot be confirmed from this file.
   */
  static final int MAX_CAUSE_DEPTH = 16;

  /**
   * Exception types that classify a failure as connection-class. Some
   * entries are subclasses of others (for example {@link ConnectException}
   * extends {@link SocketException}); they are listed individually so the
   * table mirrors the documented set one-to-one.
   */
  private static final Class<?>[] CONNECTION_FAILURE_TYPES = {
      ConnectException.class,
      SocketTimeoutException.class,
      NoRouteToHostException.class,
      UnknownHostException.class,
      EOFException.class,
      SocketException.class
  };

  private ConnectionFailureUtils() {
    // Static utility holder; never instantiated.
  }

  /**
   * Reports whether {@code t}, or any throwable reachable through its
   * cause chain within the first {@link #MAX_CAUSE_DEPTH} links, is an
   * instance of {@link ConnectException}, {@link SocketTimeoutException},
   * {@link NoRouteToHostException}, {@link UnknownHostException},
   * {@link EOFException} or {@link SocketException}.
   *
   * @param t the throwable to classify; may be {@code null}
   * @return {@code true} if a connection-class exception is found within
   *         the inspected links, {@code false} otherwise (including when
   *         {@code t} is {@code null})
   */
  public static boolean isConnectionFailure(Throwable t) {
    Throwable current = t;
    int inspected = 0;
    while (current != null && inspected < MAX_CAUSE_DEPTH) {
      if (isConnectionClass(current)) {
        return true;
      }
      Throwable parent = current.getCause();
      if (parent == current) {
        // A throwable reporting itself as its own cause: nothing further
        // to discover, so stop instead of spinning until the depth cap.
        return false;
      }
      current = parent;
      inspected++;
    }
    return false;
  }

  /**
   * Tests a single throwable (not its causes) against the table of
   * connection-class exception types.
   */
  private static boolean isConnectionClass(Throwable link) {
    for (Class<?> type : CONNECTION_FAILURE_TYPES) {
      if (type.isInstance(link)) {
        return true;
      }
    }
    return false;
  }
}

hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/OzoneConfigKeys.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -604,6 +604,19 @@ public final class OzoneConfigKeys {
604604
public static final boolean OZONE_JVM_NETWORK_ADDRESS_CACHE_ENABLED_DEFAULT =
605605
true;
606606

607+
/**
608+
* When true, RPC clients (DN heartbeat, OM client, SCM client) re-resolve
609+
* cached hostnames on connection failure and rebuild the proxy if the
610+
* resolved IP has changed. Set to true in environments where server pod
611+
* IPs may change while DNS names remain stable, such as Kubernetes.
612+
* Default false preserves pre-fix behavior. Mirrors the design intent of
613+
* HADOOP-17068 / HDFS-14118.
614+
*/
615+
public static final String OZONE_CLIENT_FAILOVER_RESOLVE_NEEDED_KEY =
616+
"ozone.client.failover.resolve-needed";
617+
public static final boolean OZONE_CLIENT_FAILOVER_RESOLVE_NEEDED_DEFAULT =
618+
false;
619+
607620
public static final String OZONE_CLIENT_REQUIRED_OM_VERSION_MIN_KEY =
608621
"ozone.client.required.om.version.min";
609622

hadoop-hdds/common/src/main/resources/ozone-default.xml

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1717,6 +1717,13 @@
17171717
reported replica before SCM comes out of safe mode.
17181718
</description>
17191719
</property>
1720+
<property>
1721+
<name>hdds.scm.safemode.rule.refresh.interval</name>
1722+
<value>5s</value>
1723+
<tag>HDDS,SCM,OPERATION</tag>
1724+
<description>Interval for the background refresh of SCM safemode rules. A value of 0 disables the background refresh thread.
1725+
</description>
1726+
</property>
17201727

17211728
<property>
17221729
<name>hdds.scm.wait.time.after.safemode.exit</name>
@@ -3913,6 +3920,39 @@
39133920
</description>
39143921
</property>
39153922

3923+
<property>
3924+
<name>ozone.client.failover.resolve-needed</name>
3925+
<value>false</value>
3926+
<tag>OZONE, CLIENT, OM, SCM, HA</tag>
3927+
<description>When true, RPC clients (DN heartbeat, OM client, SCM
3928+
client) re-resolve cached hostnames on connection-class failures
3929+
(ConnectException, SocketTimeoutException, NoRouteToHostException,
3930+
UnknownHostException, EOFException, SocketException) and rebuild
3931+
the proxy if the resolved IP has changed. Set to true in
3932+
environments where server pod IPs may change while DNS names
3933+
remain stable, such as Kubernetes. Default false preserves
3934+
pre-fix behaviour. Mirrors the design intent of HADOOP-17068 /
3935+
HDFS-14118.
3936+
3937+
Required co-config for SECURE clusters: when this flag is true,
3938+
operators must ALSO set hadoop.security.token.service.use_ip=false
3939+
(in core-site.xml). Reason: the Hadoop delegation-token service
3940+
identifier defaults to an IP:port string. After a refresh, the
3941+
per-OM service identifier built from the new IP no longer matches
3942+
the IP-based service captured on long-lived tokens, and token
3943+
selection (OzoneDelegationTokenSelector) silently fails for the
3944+
refreshed peer. With use_ip=false the service identifier is the
3945+
stable hostname:port, which survives any IP change. Insecure
3946+
clusters do not need the co-config.
3947+
3948+
Note: ozone.network.jvm.address.cache.enabled controls a related
3949+
but distinct concern -- the JVM-level positive DNS cache TTL.
3950+
That setting only affects future name lookups; this setting
3951+
additionally rebuilds long-lived RPC proxies whose
3952+
InetSocketAddress was frozen at process start.
3953+
</description>
3954+
</property>
3955+
39163956
<property>
39173957
<name>ozone.directory.deleting.service.interval</name>
39183958
<value>1m</value>

hadoop-hdds/common/src/test/java/org/apache/hadoop/hdds/ratis/conf/TestRatisClientConfig.java

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
package org.apache.hadoop.hdds.ratis.conf;
1919

20+
import static org.assertj.core.api.Assertions.assertThat;
2021
import static org.junit.jupiter.api.Assertions.assertEquals;
2122

2223
import java.time.Duration;
@@ -69,4 +70,73 @@ void setAndGet() {
6970
assertEquals(maxRetry, subject.getExponentialPolicyMaxRetries());
7071
}
7172

73+
/**
74+
* Regression guard for HDDS-15444: the production defaults must keep the
75+
* worst-case wall-clock of a single Ratis-client retry cycle bounded.
76+
* Per cycle = write-rpc + max-retries × (backoff-sleep + write-rpc) +
77+
* watch-rpc. With the post-HDDS-15444 defaults this is ~213 s; we assert
78+
* it stays under 4 minutes so a future revert of any one knob is caught
79+
* in a unit test rather than in a multi-minute integration test.
80+
*/
81+
@Test
82+
void defaultsBoundSingleCycleWallClock() {
83+
RatisClientConfig subject = new OzoneConfiguration()
84+
.getObject(RatisClientConfig.class);
85+
RatisClientConfig.RaftConfig raftSubject = new OzoneConfiguration()
86+
.getObject(RatisClientConfig.RaftConfig.class);
87+
88+
Duration writeRpc = raftSubject.getRpcRequestTimeout();
89+
Duration watchRpc = raftSubject.getRpcWatchRequestTimeout();
90+
int maxRetries = subject.getExponentialPolicyMaxRetries();
91+
Duration maxBackoff = subject.getExponentialPolicyMaxSleep();
92+
93+
Duration perCycle = writeRpc
94+
.plus(maxBackoff.plus(writeRpc).multipliedBy(maxRetries))
95+
.plus(watchRpc);
96+
97+
assertThat(perCycle)
98+
.as("Single Ratis-client retry cycle worst-case wall-clock with "
99+
+ "production defaults (writeRpc=%s, watchRpc=%s, maxRetries=%d, "
100+
+ "maxBackoff=%s) must stay bounded; a regression here means "
101+
+ "client writes against a dead pipeline can hang for minutes.",
102+
writeRpc, watchRpc, maxRetries, maxBackoff)
103+
.isLessThan(Duration.ofMinutes(4));
104+
}
105+
106+
/**
107+
* Regression guard for HDDS-15444: the bounded exponential backoff is
108+
* what stops the Ratis client from retrying indefinitely. If this is
109+
* ever set back to {@code Integer.MAX_VALUE} (the pre-HDDS-15444
110+
* behaviour) write failures revert to multi-minute hangs.
111+
*/
112+
@Test
113+
void defaultsCapExponentialMaxRetries() {
114+
RatisClientConfig subject = new OzoneConfiguration()
115+
.getObject(RatisClientConfig.class);
116+
117+
assertThat(subject.getExponentialPolicyMaxRetries())
118+
.as("hdds.ratis.client.exponential.backoff.max.retries must remain "
119+
+ "bounded; unbounded retries reintroduce the HDDS-15444 hang.")
120+
.isPositive()
121+
.isLessThanOrEqualTo(5);
122+
}
123+
124+
/**
125+
* Regression guard for HDDS-15444: the client-side watch RPC timeout
126+
* must align with the server-side watch timeout (30 s by default).
127+
* If the client waits longer than the server is willing to honour, the
128+
* client hangs past the server-side abort.
129+
*/
130+
@Test
131+
void defaultsAlignWatchTimeoutWithServer() {
132+
RatisClientConfig.RaftConfig raftSubject = new OzoneConfiguration()
133+
.getObject(RatisClientConfig.RaftConfig.class);
134+
135+
assertThat(raftSubject.getRpcWatchRequestTimeout())
136+
.as("hdds.ratis.raft.client.rpc.watch.request.timeout should be "
137+
+ "close to the server-side watch timeout (30 s); a much larger "
138+
+ "value lets the client hang past the server's abort.")
139+
.isLessThanOrEqualTo(Duration.ofSeconds(60));
140+
}
141+
72142
}

0 commit comments

Comments
 (0)