Skip to content

Commit 9f77595

Browse files
committed
fix: [ha] PeerAddressAllowlistFilter rejects legitimate peers during k8s DNS-resolution race (incomplete allowlist on startup/restart)
Fixed issue ArcadeData#4471
1 parent a857e57 commit 9f77595

4 files changed

Lines changed: 304 additions & 20 deletions

File tree

engine/src/main/java/com/arcadedb/GlobalConfiguration.java

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -949,6 +949,25 @@ producing an entry too large for the Raft transport. Kept below the Raft message
949949
"Rate-limiting interval in milliseconds for DNS re-resolution in the gRPC peer address allowlist filter.",
950950
Long.class, 30_000L),
951951

952+
HA_PEER_ALLOWLIST_STARTUP_GRACE_MS("arcadedb.ha.peerAllowlistStartupGraceMs", SCOPE.SERVER,
953+
"""
954+
Startup grace window in milliseconds during which the gRPC peer allowlist filter fails OPEN (accepts and logs a \
955+
warning) for an inbound address it cannot yet match, as long as it has never resolved every host in \
956+
arcadedb.ha.serverList at least once. This prevents a self-inflicted partition on Kubernetes, where a peer's \
957+
headless-service DNS record is only published once its pod is Ready, so a legitimately-restarting peer connects \
958+
before its own name resolves. Measured from filter creation. Once all peer hosts have resolved at least once, or \
959+
the window elapses, the filter enforces normally. Set to 0 to disable fail-open (strict from the first connection); \
960+
the filter is not an mTLS substitute (see issue #3890), so a bounded fail-open window is the safer default.""",
961+
Long.class, 60_000L),
962+
963+
HA_PEER_ALLOWLIST_STICKY_TTL_MS("arcadedb.ha.peerAllowlistStickyTtlMs", SCOPE.SERVER,
964+
"""
965+
How long in milliseconds the gRPC peer allowlist filter keeps the last successfully-resolved IPs of a peer host \
966+
when a later DNS re-resolution of that host fails. Bridges transient DNS outages and pod-IP churn so a peer that \
967+
resolved moments ago is not evicted from the allowlist by a momentary lookup failure. Set to 0 to disable \
968+
stickiness (drop a host from the allowlist as soon as it stops resolving).""",
969+
Long.class, 300_000L),
970+
952971
// POSTGRES
953972
POSTGRES_PORT("arcadedb.postgres.port", SCOPE.SERVER,
954973
"TCP/IP port number used for incoming connections for Postgres plugin. Default is 5432", Integer.class, 5432),

ha-raft/src/main/java/com/arcadedb/server/ha/raft/PeerAddressAllowlistFilter.java

Lines changed: 141 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
*/
1919
package com.arcadedb.server.ha.raft;
2020

21+
import com.arcadedb.GlobalConfiguration;
2122
import com.arcadedb.log.LogManager;
2223
import org.apache.ratis.thirdparty.io.grpc.Attributes;
2324
import org.apache.ratis.thirdparty.io.grpc.Grpc;
@@ -29,10 +30,13 @@
2930
import java.net.UnknownHostException;
3031
import java.util.ArrayList;
3132
import java.util.Collections;
33+
import java.util.HashMap;
3234
import java.util.HashSet;
3335
import java.util.List;
36+
import java.util.Map;
3437
import java.util.Set;
3538
import java.util.concurrent.atomic.AtomicReference;
39+
import java.util.function.LongSupplier;
3640
import java.util.logging.Level;
3741

3842
/**
@@ -44,24 +48,80 @@
4448
* address arrives, rate-limited by {@code refreshIntervalMs}, so that Kubernetes pod-IP churn
4549
* on restart does not permanently lock out a restarted peer.
4650
* <p>
51+
* To avoid a self-inflicted partition during the window where peer DNS is not yet usable
52+
* (issue #4471 - on Kubernetes a headless-service A record is only published once a pod is
53+
* Ready, so peers come up before each other's names resolve), the filter is hardened in three
54+
* ways:
55+
* <ul>
56+
* <li><b>Bypass the rate limit while the allowlist is incomplete.</b> Until every peer host has
57+
* resolved at least once, a miss re-resolves on a short floor instead of waiting the full
58+
* {@code refreshIntervalMs}, so the allowlist converges quickly at startup.</li>
59+
* <li><b>Sticky last-known-good IPs.</b> When a host that resolved before fails to resolve now
60+
* (transient DNS outage, pod-IP churn mid-restart), its previous IPs are retained for
61+
* {@code stickyTtlMs} rather than being evicted immediately.</li>
62+
* <li><b>Startup fail-open grace.</b> Until the first time all peer hosts have resolved, and for
63+
* at most {@code startupGraceMs} from creation, an unmatched address is accepted with a
64+
* warning instead of rejected. After the allowlist is once complete, or the window elapses,
65+
* the filter enforces normally.</li>
66+
* </ul>
67+
* <p>
4768
* This is NOT a substitute for mTLS: it does not authenticate peer identity and does not
48-
* encrypt the traffic. See GitHub issue #3890.
69+
* encrypt the traffic. See GitHub issue #3890. The bounded startup fail-open is an acceptable
70+
* trade-off for that reason; set {@code startupGraceMs=0} to disable it.
4971
*/
5072
final class PeerAddressAllowlistFilter extends ServerTransportFilter {
5173

74+
/**
 * Strategy for turning a peer host name into its IP addresses.
 * <p>
 * Production code plugs in real DNS; tests can supply a deterministic implementation so that
 * resolution outcomes (success, failure, changing IPs) are fully under their control.
 */
@FunctionalInterface
interface HostResolver {
  /**
   * Resolves {@code host} to its addresses.
   *
   * @param host the host name (or literal IP) to resolve
   * @return the addresses the host currently maps to
   * @throws UnknownHostException if the host cannot be resolved
   */
  InetAddress[] resolve(String host) throws UnknownHostException;
}
79+
5280
private static final Set<String> LOOPBACK_IPS = Set.of("127.0.0.1", "0:0:0:0:0:0:0:1", "::1");
81+
// Minimum spacing between re-resolutions while the allowlist is still incomplete. Bounds DNS load
82+
// under a connection flood at startup while still letting the allowlist converge within ~1s.
83+
private static final long INCOMPLETE_RESOLVE_FLOOR_MS = 1_000L;
5384

5485
private final List<String> peerHosts;
5586
private final long refreshIntervalMs;
87+
private final long startupGraceMs;
88+
private final long stickyTtlMs;
89+
private final long createdMs;
90+
private final LongSupplier clock;
91+
private final HostResolver resolver;
5692
private final AtomicReference<Set<String>> allowedIps = new AtomicReference<>(Collections.emptySet());
93+
// Per-host last successfully-resolved IPs and the time they were resolved, for sticky retention.
94+
// Only mutated inside the synchronized doResolve(); never read outside it.
95+
private final Map<String, Set<String>> lastKnownIps = new HashMap<>();
96+
private final Map<String, Long> lastKnownMs = new HashMap<>();
5797
private volatile long lastResolveMs;
98+
// Latches true the first time every peer host is covered by the allowlist; gates the fail-open grace.
99+
private volatile boolean everCompletelyResolved;
58100

59101
/**
 * Convenience constructor that takes the startup grace window and the sticky TTL from the global
 * configuration defaults ({@code HA_PEER_ALLOWLIST_STARTUP_GRACE_MS} and
 * {@code HA_PEER_ALLOWLIST_STICKY_TTL_MS}).
 *
 * @param peerHosts         peer host names to allow; must contain at least one entry
 * @param refreshIntervalMs minimum spacing in milliseconds between DNS re-resolutions on a miss
 */
PeerAddressAllowlistFilter(final List<String> peerHosts, final long refreshIntervalMs) {
  this(
      peerHosts,
      refreshIntervalMs,
      GlobalConfiguration.HA_PEER_ALLOWLIST_STARTUP_GRACE_MS.getValueAsLong(),
      GlobalConfiguration.HA_PEER_ALLOWLIST_STICKY_TTL_MS.getValueAsLong());
}
106+
107+
/**
 * Production constructor: uses the wall clock ({@link System#currentTimeMillis()}) and real DNS
 * ({@link InetAddress#getAllByName(String)}).
 *
 * @param peerHosts         peer host names to allow; must contain at least one entry
 * @param refreshIntervalMs minimum spacing in milliseconds between DNS re-resolutions on a miss
 * @param startupGraceMs    length of the startup fail-open window in milliseconds; 0 disables it
 * @param stickyTtlMs       how long last-known-good IPs are kept after a failed lookup; 0 disables it
 */
PeerAddressAllowlistFilter(final List<String> peerHosts, final long refreshIntervalMs, final long startupGraceMs,
    final long stickyTtlMs) {
  this(peerHosts, refreshIntervalMs, startupGraceMs, stickyTtlMs,
      () -> System.currentTimeMillis(),
      host -> InetAddress.getAllByName(host));
}
111+
112+
/** Full constructor; the {@code clock} and {@code resolver} hooks make resolution deterministic in tests. */
113+
PeerAddressAllowlistFilter(final List<String> peerHosts, final long refreshIntervalMs, final long startupGraceMs,
114+
final long stickyTtlMs, final LongSupplier clock, final HostResolver resolver) {
60115
if (peerHosts == null || peerHosts.isEmpty())
61116
throw new IllegalArgumentException("Peer allowlist requires at least one host");
62117
this.peerHosts = List.copyOf(peerHosts);
63118
this.refreshIntervalMs = Math.max(0L, refreshIntervalMs);
64-
resolveNow();
119+
this.startupGraceMs = Math.max(0L, startupGraceMs);
120+
this.stickyTtlMs = Math.max(0L, stickyTtlMs);
121+
this.clock = clock;
122+
this.resolver = resolver;
123+
this.createdMs = clock.getAsLong();
124+
doResolve();
65125
}
66126

67127
@Override
@@ -77,47 +137,109 @@ public Attributes transportReady(final Attributes attrs) {
77137
if (address.isLoopbackAddress())
78138
return attrs;
79139

80-
final String ip = address.getHostAddress();
81-
if (allowedIps.get().contains(ip))
140+
if (isAllowed(address.getHostAddress()))
82141
return attrs;
83142

84-
// Miss: re-resolve (rate-limited) to pick up restarted peers with new IPs.
85-
if (System.currentTimeMillis() - lastResolveMs >= refreshIntervalMs) {
86-
resolveNow();
87-
if (allowedIps.get().contains(ip))
88-
return attrs;
143+
throw new SecurityException("Remote address '" + address.getHostAddress() + "' is not in the cluster peer allowlist");
144+
}
145+
146+
/**
147+
* Decides whether {@code ip} may connect, applying the incomplete-allowlist bypass and the
148+
* startup fail-open grace (issue #4471). Package-private so it can be unit-tested without
149+
* constructing gRPC transport objects. Logs a warning on both fail-open and reject.
150+
*/
151+
boolean isAllowed(final String ip) {
152+
if (ip == null)
153+
return true; // address not available; cannot evaluate, leave to other layers
154+
if (allowedIps.get().contains(ip))
155+
return true;
156+
157+
// Miss: re-resolve to pick up restarted peers with new IPs. While the allowlist has never been
158+
// complete (startup), use a short floor so it converges fast; once complete, respect refreshIntervalMs.
159+
final long floor = everCompletelyResolved ? refreshIntervalMs : Math.min(refreshIntervalMs, INCOMPLETE_RESOLVE_FLOOR_MS);
160+
resolveIfStale(floor);
161+
if (allowedIps.get().contains(ip))
162+
return true;
163+
164+
// Startup fail-open: we have never seen the full peer set resolve and are still within the grace
165+
// window. Accept rather than partition the cluster against itself while DNS catches up.
166+
final long now = clock.getAsLong();
167+
if (!everCompletelyResolved && startupGraceMs > 0 && now - createdMs < startupGraceMs) {
168+
LogManager.instance().log(this, Level.WARNING,
169+
"Accepting Raft gRPC connection from %s during startup grace: peer allowlist not yet complete "
170+
+ "(resolved %d/%d hosts, allowed=%s). Will enforce once all peers resolve or after %dms.",
171+
ip, lastKnownIps.size(), peerHosts.size(), allowedIps.get(), startupGraceMs);
172+
return true;
89173
}
90174

91175
LogManager.instance().log(this, Level.WARNING,
92176
"Rejecting Raft gRPC connection from non-peer address: %s (allowed=%s)", ip, allowedIps.get());
93-
throw new SecurityException("Remote address '" + ip + "' is not in the cluster peer allowlist");
177+
return false;
94178
}
95179

96180
/** Returns an immutable snapshot of the currently allowed IPs. Exposed for testing. */
97181
Set<String> getAllowedIps() {
98182
return allowedIps.get();
99183
}
100184

185+
/** True once every peer host has been covered by the allowlist at least once. Exposed for testing. */
186+
boolean isEverCompletelyResolved() {
187+
return everCompletelyResolved;
188+
}
189+
101190
/** Triggers an immediate DNS re-resolution. Exposed for testing. */
102191
void refresh() {
103-
resolveNow();
192+
doResolve();
104193
}
105194

106-
private void resolveNow() {
107-
final Set<String> resolved = new HashSet<>();
108-
resolved.addAll(LOOPBACK_IPS);
195+
/** Re-resolves only if at least {@code floor} ms have elapsed since the last resolution. */
196+
private synchronized void resolveIfStale(final long floor) {
197+
if (clock.getAsLong() - lastResolveMs < floor)
198+
return; // another thread re-resolved recently; avoid a thundering herd under a connection flood
199+
doResolve();
200+
}
201+
202+
private synchronized void doResolve() {
203+
final long now = clock.getAsLong();
204+
final Set<String> effective = new HashSet<>(LOOPBACK_IPS);
205+
int covered = 0;
109206
for (final String host : peerHosts) {
207+
Set<String> fresh = null;
110208
try {
111-
final InetAddress[] addrs = InetAddress.getAllByName(host);
112-
for (final InetAddress a : addrs)
113-
resolved.add(a.getHostAddress());
209+
final InetAddress[] addrs = resolver.resolve(host);
210+
if (addrs != null && addrs.length > 0) {
211+
fresh = new HashSet<>();
212+
for (final InetAddress a : addrs)
213+
fresh.add(a.getHostAddress());
214+
}
114215
} catch (final UnknownHostException e) {
115216
LogManager.instance().log(this, Level.WARNING,
116217
"Cannot resolve cluster peer host '%s' for Raft gRPC allowlist: %s", host, e.getMessage());
117218
}
219+
220+
if (fresh != null && !fresh.isEmpty()) {
221+
lastKnownIps.put(host, fresh);
222+
lastKnownMs.put(host, now);
223+
effective.addAll(fresh);
224+
covered++;
225+
} else {
226+
// Resolution failed: keep the last-known-good IPs for a bounded time (sticky) so a transient
227+
// DNS outage or pod-IP churn does not evict a peer that resolved moments ago.
228+
final Set<String> prev = lastKnownIps.get(host);
229+
final Long prevMs = lastKnownMs.get(host);
230+
if (prev != null && prevMs != null && stickyTtlMs > 0 && now - prevMs <= stickyTtlMs) {
231+
effective.addAll(prev);
232+
covered++;
233+
} else {
234+
lastKnownIps.remove(host);
235+
lastKnownMs.remove(host);
236+
}
237+
}
118238
}
119-
allowedIps.set(Collections.unmodifiableSet(resolved));
120-
lastResolveMs = System.currentTimeMillis();
239+
allowedIps.set(Collections.unmodifiableSet(effective));
240+
lastResolveMs = now;
241+
if (covered == peerHosts.size())
242+
everCompletelyResolved = true;
121243
}
122244

123245
/**

ha-raft/src/main/java/com/arcadedb/server/ha/raft/RaftHAServer.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1229,13 +1229,16 @@ private static Parameters buildParameters(final ContextConfiguration configurati
12291229

12301230
final String serverList = configuration.getValueAsString(GlobalConfiguration.HA_SERVER_LIST);
12311231
final long refreshMs = configuration.getValueAsLong(GlobalConfiguration.HA_GRPC_ALLOWLIST_REFRESH_MS);
1232+
final long startupGraceMs = configuration.getValueAsLong(GlobalConfiguration.HA_PEER_ALLOWLIST_STARTUP_GRACE_MS);
1233+
final long stickyTtlMs = configuration.getValueAsLong(GlobalConfiguration.HA_PEER_ALLOWLIST_STICKY_TTL_MS);
12321234
final List<String> peerHosts = PeerAddressAllowlistFilter.extractPeerHosts(serverList);
12331235
if (peerHosts.isEmpty()) {
12341236
LogManager.instance().log(RaftHAServer.class, Level.WARNING,
12351237
"arcadedb.ha.peerAllowlist.enabled=true but arcadedb.ha.serverList is empty; allowlist not installed");
12361238
return parameters;
12371239
}
1238-
final PeerAddressAllowlistFilter allowlistFilter = new PeerAddressAllowlistFilter(peerHosts, refreshMs);
1240+
final PeerAddressAllowlistFilter allowlistFilter = new PeerAddressAllowlistFilter(peerHosts, refreshMs, startupGraceMs,
1241+
stickyTtlMs);
12391242
GrpcConfigKeys.Server.setServicesCustomizer(parameters, new RaftGrpcServicesCustomizer(allowlistFilter));
12401243
return parameters;
12411244
}

0 commit comments

Comments
 (0)