2020import java .util .ArrayList ;
2121import java .util .Collection ;
2222import java .util .List ;
23+ import java .util .concurrent .atomic .AtomicBoolean ;
2324
2425import org .apache .hugegraph .backend .BackendException ;
2526import org .apache .hugegraph .backend .store .BackendSession .AbstractBackendSession ;
@@ -53,10 +54,21 @@ public class CassandraSessionPool extends BackendSessionPool {
5354 private static final String HEALTH_CHECK_CQL =
5455 "SELECT now() FROM system.local" ;
5556
57+ /**
58+ * Guards the one-time JVM-wide warning about {@code commitAsync()} not
59+ * being covered by query-time retries. {@link CassandraSessionPool} is
60+ * instantiated once per backend store per graph, so without this guard
61+ * the warning would fire many times on startup for a structural
62+ * limitation that does not change between instances.
63+ */
64+ private static final AtomicBoolean ASYNC_RETRY_WARNING_LOGGED =
65+ new AtomicBoolean (false );
66+
5667 private Cluster cluster ;
5768 private final String keyspace ;
5869 private final int maxRetries ;
5970 private final long retryInterval ;
71+ private final long retryBaseDelay ;
6072 private final long retryMaxDelay ;
6173
6274 public CassandraSessionPool (HugeConfig config ,
@@ -78,7 +90,14 @@ public CassandraSessionPool(HugeConfig config,
7890 reconnectMax ,
7991 CassandraOptions .CASSANDRA_RECONNECT_BASE_DELAY .name (),
8092 reconnectBase );
93+ this .retryBaseDelay = reconnectBase ;
8194 this .retryMaxDelay = reconnectMax ;
95+
96+ if (this .maxRetries > 0 &&
97+ ASYNC_RETRY_WARNING_LOGGED .compareAndSet (false , true )) {
98+ LOG .warn ("cassandra.reconnect_max_retries={} applies to sync commit()" +
99+ " only. commitAsync() has no retry protection." , this .maxRetries );
100+ }
82101 }
83102
84103 @ Override
@@ -117,10 +136,8 @@ public synchronized void open() {
117136
118137 // Reconnection policy: let driver keep retrying nodes in background
119138 // with exponential backoff after they go down (see issue #2740).
120- long reconnectBase = config .get (
121- CassandraOptions .CASSANDRA_RECONNECT_BASE_DELAY );
122139 builder .withReconnectionPolicy (
123- new ExponentialReconnectionPolicy (reconnectBase ,
140+ new ExponentialReconnectionPolicy (this . retryBaseDelay ,
124141 this .retryMaxDelay ));
125142
126143 // Credential options
@@ -211,7 +228,11 @@ public void commitAsync() {
211228 int processors = Math .min (statements .size (), 1023 );
212229 List <ResultSetFuture > results = new ArrayList <>(processors + 1 );
213230 for (Statement s : statements ) {
214- // TODO: commitAsync is not retried (async retry semantics are complex)
231+ // TODO(issue #2740): commitAsync() bypasses executeWithRetry().
232+ // During a Cassandra restart, async writes may fail with
233+ // NoHostAvailableException even when maxRetries > 0. Callers
234+ // must handle ResultSetFuture failures. A follow-up will
235+ // wrap each future with retry semantics.
215236 ResultSetFuture future = this .session .executeAsync (s );
216237 results .add (future );
217238
@@ -253,13 +274,19 @@ public ResultSet execute(String statement, Object... args) {
253274 * reconnection policy, so once Cassandra comes back online, a
254275 * subsequent attempt here will succeed without restarting the server.
255276 *
277+ * <p>If the driver session has been discarded (e.g. by
278+ * {@link #reconnectIfNeeded()} after a failed health-check) it is
279+ * lazily reopened at the start of each attempt. After a transient
280+ * failure the session is {@linkplain #reset() reset} so the next
281+ * iteration gets a fresh driver session.
282+ *
256283 * <p><b>Blocking note:</b> retries block the calling thread via
257284 * {@link Thread#sleep(long)}. Worst-case a single call blocks for
258285 * {@code maxRetries * retryMaxDelay} ms. Under high-throughput
259286 * workloads concurrent threads may pile up in {@code sleep()} during
260287 * a Cassandra outage. For such deployments lower
261- * {@code cassandra.reconnect_max_retries} (default 10 ) and
262- * {@code cassandra.reconnect_max_delay} (default 60000ms ) so the
288+ * {@code cassandra.reconnect_max_retries} (default 3 ) and
289+ * {@code cassandra.reconnect_max_delay} (default 10000ms ) so the
263290 * request fails fast and pressure is released back to the caller.
264291 */
265292 private ResultSet executeWithRetry (Statement statement ) {
@@ -269,9 +296,18 @@ private ResultSet executeWithRetry(Statement statement) {
269296 DriverException lastError = null ;
270297 for (int attempt = 0 ; attempt <= retries ; attempt ++) {
271298 try {
299+ if (this .session == null ) {
300+ // Lazy reopen: may itself throw NHAE while
301+ // Cassandra is still unreachable; the catch below
302+ // treats that as a transient failure.
303+ this .open ();
304+ }
272305 return this .session .execute (statement );
273306 } catch (NoHostAvailableException | OperationTimedOutException e ) {
274307 lastError = e ;
308+ // Discard the (possibly broken) driver session so the
309+ // next iteration reopens cleanly.
310+ this .reset ();
275311 if (attempt >= retries ) {
276312 break ;
277313 }
@@ -359,9 +395,10 @@ public boolean hasChanges() {
359395 * Periodic liveness probe invoked by {@link BackendSessionPool} to
360396 * recover thread-local sessions after Cassandra has been restarted.
361397 * Reopens the driver session if it was closed and pings the cluster
362- * with a lightweight query. Any failure here is swallowed so the
363- * caller can still issue the real query, which will drive retries via
364- * {@link #executeWithRetry(Statement)}.
398+ * with a lightweight query. On failure the session reference is
399+ * cleared (set to {@code null}) so the next call to
400+ * {@link #executeWithRetry(Statement)} reopens it; a
401+ * {@code DriverException} here is swallowed so the caller can still issue the real query.
365402 */
366403 @ Override
367404 public void reconnectIfNeeded () {
@@ -377,15 +414,9 @@ public void reconnectIfNeeded() {
377414 this .session .execute (new SimpleStatement (HEALTH_CHECK_CQL ));
378415 }
379416 } catch (DriverException e ) {
380- LOG .debug ("Cassandra health-check failed, " +
381- "will retry on next query: {}" , e .getMessage ());
382- } finally {
383- // Keep opened flag consistent with session: if tryOpen()
384- // failed to reopen, clear opened so the next execute() does
385- // not NPE before executeWithRetry() can intercept.
386- if (this .session == null ) {
387- this .opened = false ;
388- }
417+ LOG .debug ("Cassandra health-check failed, resetting session: {}" ,
418+ e .getMessage ());
419+ this .session = null ;
389420 }
390421 }
391422
0 commit comments