1818 */
1919package com .arcadedb .server .ha .raft ;
2020
21+ import com .arcadedb .GlobalConfiguration ;
2122import com .arcadedb .log .LogManager ;
2223import org .apache .ratis .thirdparty .io .grpc .Attributes ;
2324import org .apache .ratis .thirdparty .io .grpc .Grpc ;
2930import java .net .UnknownHostException ;
3031import java .util .ArrayList ;
3132import java .util .Collections ;
33+ import java .util .HashMap ;
3234import java .util .HashSet ;
3335import java .util .List ;
36+ import java .util .Map ;
3437import java .util .Set ;
3538import java .util .concurrent .atomic .AtomicReference ;
39+ import java .util .function .LongSupplier ;
3640import java .util .logging .Level ;
3741
3842/**
4448 * address arrives, rate-limited by {@code refreshIntervalMs}, so that Kubernetes pod-IP churn
4549 * on restart does not permanently lock out a restarted peer.
4650 * <p>
51+ * To avoid a self-inflicted partition during the window where peer DNS is not yet usable
52+ * (issue #4471 - on Kubernetes a headless-service A record is only published once a pod is
53+ * Ready, so peers come up before each other's names resolve), the filter is hardened in three
54+ * ways:
55+ * <ul>
56+ * <li><b>Bypass the rate limit while the allowlist is incomplete.</b> Until every peer host has
57+ * resolved at least once, a miss re-resolves on a short floor instead of waiting the full
58+ * {@code refreshIntervalMs}, so the allowlist converges quickly at startup.</li>
59+ * <li><b>Sticky last-known-good IPs.</b> When a host that resolved before fails to resolve now
60+ * (transient DNS outage, pod-IP churn mid-restart), its previous IPs are retained for
61+ * {@code stickyTtlMs} rather than being evicted immediately.</li>
62+ * <li><b>Startup fail-open grace.</b> Until the first time all peer hosts have resolved, and for
63+ * at most {@code startupGraceMs} from creation, an unmatched address is accepted with a
64+ * warning instead of rejected. After the allowlist is once complete, or the window elapses,
65+ * the filter enforces normally.</li>
66+ * </ul>
67+ * <p>
4768 * This is NOT a substitute for mTLS: it does not authenticate peer identity and does not
48- * encrypt the traffic. See GitHub issue #3890.
69+ * encrypt the traffic. See GitHub issue #3890. The bounded startup fail-open is an acceptable
70+ * trade-off for that reason; set {@code startupGraceMs=0} to disable it.
4971 */
5072final class PeerAddressAllowlistFilter extends ServerTransportFilter {
5173
/**
 * Strategy for turning a peer host name into its addresses. Abstracted behind an interface so
 * tests can supply deterministic results instead of depending on real DNS.
 */
@FunctionalInterface
interface HostResolver {
  /**
   * Resolves {@code host} to its addresses.
   *
   * @param host the peer host name (or literal address) to resolve
   * @return the addresses the host maps to
   * @throws UnknownHostException if the host cannot be resolved
   */
  InetAddress[] resolve(String host) throws UnknownHostException;
}
79+
5280 private static final Set <String > LOOPBACK_IPS = Set .of ("127.0.0.1" , "0:0:0:0:0:0:0:1" , "::1" );
81+ // Minimum spacing between re-resolutions while the allowlist is still incomplete. Bounds DNS load
82+ // under a connection flood at startup while still letting the allowlist converge within ~1s.
83+ private static final long INCOMPLETE_RESOLVE_FLOOR_MS = 1_000L ;
5384
5485 private final List <String > peerHosts ;
5586 private final long refreshIntervalMs ;
87+ private final long startupGraceMs ;
88+ private final long stickyTtlMs ;
89+ private final long createdMs ;
90+ private final LongSupplier clock ;
91+ private final HostResolver resolver ;
5692 private final AtomicReference <Set <String >> allowedIps = new AtomicReference <>(Collections .emptySet ());
93+ // Per-host last successfully-resolved IPs and the time they were resolved, for sticky retention.
94+ // Guarded by the instance monitor: mutated only inside the synchronized doResolve(); any other access must hold the same lock.
95+ private final Map <String , Set <String >> lastKnownIps = new HashMap <>();
96+ private final Map <String , Long > lastKnownMs = new HashMap <>();
5797 private volatile long lastResolveMs ;
98+ // Latches true the first time every peer host is covered by the allowlist; gates the fail-open grace.
99+ private volatile boolean everCompletelyResolved ;
58100
/**
 * Creates a filter whose startup grace window and sticky TTL are read from
 * {@link GlobalConfiguration#HA_PEER_ALLOWLIST_STARTUP_GRACE_MS} and
 * {@link GlobalConfiguration#HA_PEER_ALLOWLIST_STICKY_TTL_MS}.
 *
 * @param peerHosts         host names of the cluster peers
 * @param refreshIntervalMs minimum spacing between re-resolutions, in milliseconds
 */
PeerAddressAllowlistFilter(final List<String> peerHosts, final long refreshIntervalMs) {
  this(
      peerHosts,
      refreshIntervalMs,
      GlobalConfiguration.HA_PEER_ALLOWLIST_STARTUP_GRACE_MS.getValueAsLong(),
      GlobalConfiguration.HA_PEER_ALLOWLIST_STICKY_TTL_MS.getValueAsLong());
}
106+
/**
 * Creates a filter with explicit grace and sticky settings, backed by the wall clock
 * ({@code System::currentTimeMillis}) and the JDK resolver ({@code InetAddress::getAllByName}).
 *
 * @param peerHosts         host names of the cluster peers
 * @param refreshIntervalMs minimum spacing between re-resolutions, in milliseconds
 * @param startupGraceMs    length of the startup fail-open window, in milliseconds
 * @param stickyTtlMs       how long last-known-good IPs are retained after a failed resolution
 */
PeerAddressAllowlistFilter(final List<String> peerHosts, final long refreshIntervalMs, final long startupGraceMs,
    final long stickyTtlMs) {
  this(
      peerHosts,
      refreshIntervalMs,
      startupGraceMs,
      stickyTtlMs,
      System::currentTimeMillis,
      InetAddress::getAllByName);
}
111+
112+ /** Full constructor; the {@code clock} and {@code resolver} hooks make resolution deterministic in tests. */
113+ PeerAddressAllowlistFilter (final List <String > peerHosts , final long refreshIntervalMs , final long startupGraceMs ,
114+ final long stickyTtlMs , final LongSupplier clock , final HostResolver resolver ) {
60115 if (peerHosts == null || peerHosts .isEmpty ())
61116 throw new IllegalArgumentException ("Peer allowlist requires at least one host" );
62117 this .peerHosts = List .copyOf (peerHosts );
63118 this .refreshIntervalMs = Math .max (0L , refreshIntervalMs );
64- resolveNow ();
119+ this .startupGraceMs = Math .max (0L , startupGraceMs );
120+ this .stickyTtlMs = Math .max (0L , stickyTtlMs );
121+ this .clock = clock ;
122+ this .resolver = resolver ;
123+ this .createdMs = clock .getAsLong ();
124+ doResolve ();
65125 }
66126
67127 @ Override
@@ -77,47 +137,109 @@ public Attributes transportReady(final Attributes attrs) {
77137 if (address .isLoopbackAddress ())
78138 return attrs ;
79139
80- final String ip = address .getHostAddress ();
81- if (allowedIps .get ().contains (ip ))
140+ if (isAllowed (address .getHostAddress ()))
82141 return attrs ;
83142
84- // Miss: re-resolve (rate-limited) to pick up restarted peers with new IPs.
85- if (System .currentTimeMillis () - lastResolveMs >= refreshIntervalMs ) {
86- resolveNow ();
87- if (allowedIps .get ().contains (ip ))
88- return attrs ;
143+ throw new SecurityException ("Remote address '" + address .getHostAddress () + "' is not in the cluster peer allowlist" );
144+ }
145+
146+ /**
147+ * Decides whether {@code ip} may connect, applying the incomplete-allowlist bypass and the
148+ * startup fail-open grace (issue #4471). Package-private so it can be unit-tested without
149+ * constructing gRPC transport objects. Logs a warning on both fail-open and reject.
150+ */
151+ boolean isAllowed (final String ip ) {
152+ if (ip == null )
153+ return true ; // address not available; cannot evaluate, leave to other layers
154+ if (allowedIps .get ().contains (ip ))
155+ return true ;
156+
157+ // Miss: re-resolve to pick up restarted peers with new IPs. While the allowlist has never been
158+ // complete (startup), use a short floor so it converges fast; once complete, respect refreshIntervalMs.
159+ final long floor = everCompletelyResolved ? refreshIntervalMs : Math .min (refreshIntervalMs , INCOMPLETE_RESOLVE_FLOOR_MS );
160+ resolveIfStale (floor );
161+ if (allowedIps .get ().contains (ip ))
162+ return true ;
163+
164+ // Startup fail-open: we have never seen the full peer set resolve and are still within the grace
165+ // window. Accept rather than partition the cluster against itself while DNS catches up.
166+ final long now = clock .getAsLong ();
167+ if (!everCompletelyResolved && startupGraceMs > 0 && now - createdMs < startupGraceMs ) {
168+ LogManager .instance ().log (this , Level .WARNING ,
169+ "Accepting Raft gRPC connection from %s during startup grace: peer allowlist not yet complete "
170+ + "(resolved %d/%d hosts, allowed=%s). Will enforce once all peers resolve or after %dms." ,
171+ ip , lastKnownIps .size (), peerHosts .size (), allowedIps .get (), startupGraceMs );
172+ return true ;
89173 }
90174
91175 LogManager .instance ().log (this , Level .WARNING ,
92176 "Rejecting Raft gRPC connection from non-peer address: %s (allowed=%s)" , ip , allowedIps .get ());
93- throw new SecurityException ( "Remote address '" + ip + "' is not in the cluster peer allowlist" ) ;
177+ return false ;
94178 }
95179
96180 /** Returns an immutable snapshot of the currently allowed IPs. Exposed for testing. */
97181 Set <String > getAllowedIps () {
98182 return allowedIps .get ();
99183 }
100184
185+ /** True once every peer host has been covered by the allowlist at least once. Exposed for testing. */
186+ boolean isEverCompletelyResolved () {
187+ return everCompletelyResolved ;
188+ }
189+
101190 /** Triggers an immediate DNS re-resolution. Exposed for testing. */
102191 void refresh () {
103- resolveNow ();
192+ doResolve ();
104193 }
105194
106- private void resolveNow () {
107- final Set <String > resolved = new HashSet <>();
108- resolved .addAll (LOOPBACK_IPS );
195+ /** Re-resolves only if at least {@code floor} ms have elapsed since the last resolution. */
196+ private synchronized void resolveIfStale (final long floor ) {
197+ if (clock .getAsLong () - lastResolveMs < floor )
198+ return ; // another thread re-resolved recently; avoid a thundering herd under a connection flood
199+ doResolve ();
200+ }
201+
202+ private synchronized void doResolve () {
203+ final long now = clock .getAsLong ();
204+ final Set <String > effective = new HashSet <>(LOOPBACK_IPS );
205+ int covered = 0 ;
109206 for (final String host : peerHosts ) {
207+ Set <String > fresh = null ;
110208 try {
111- final InetAddress [] addrs = InetAddress .getAllByName (host );
112- for (final InetAddress a : addrs )
113- resolved .add (a .getHostAddress ());
209+ final InetAddress [] addrs = resolver .resolve (host );
210+ if (addrs != null && addrs .length > 0 ) {
211+ fresh = new HashSet <>();
212+ for (final InetAddress a : addrs )
213+ fresh .add (a .getHostAddress ());
214+ }
114215 } catch (final UnknownHostException e ) {
115216 LogManager .instance ().log (this , Level .WARNING ,
116217 "Cannot resolve cluster peer host '%s' for Raft gRPC allowlist: %s" , host , e .getMessage ());
117218 }
219+
220+ if (fresh != null && !fresh .isEmpty ()) {
221+ lastKnownIps .put (host , fresh );
222+ lastKnownMs .put (host , now );
223+ effective .addAll (fresh );
224+ covered ++;
225+ } else {
226+ // Resolution failed: keep the last-known-good IPs for a bounded time (sticky) so a transient
227+ // DNS outage or pod-IP churn does not evict a peer that resolved moments ago.
228+ final Set <String > prev = lastKnownIps .get (host );
229+ final Long prevMs = lastKnownMs .get (host );
230+ if (prev != null && prevMs != null && stickyTtlMs > 0 && now - prevMs <= stickyTtlMs ) {
231+ effective .addAll (prev );
232+ covered ++;
233+ } else {
234+ lastKnownIps .remove (host );
235+ lastKnownMs .remove (host );
236+ }
237+ }
118238 }
119- allowedIps .set (Collections .unmodifiableSet (resolved ));
120- lastResolveMs = System .currentTimeMillis ();
239+ allowedIps .set (Collections .unmodifiableSet (effective ));
240+ lastResolveMs = now ;
241+ if (covered == peerHosts .size ())
242+ everCompletelyResolved = true ;
121243 }
122244
123245 /**
0 commit comments