From a15cabf8f7529fd06061bae990dc0b71252ac58b Mon Sep 17 00:00:00 2001
From: fabrizzio-dotCMS Connectivity is verified with {@code client.info()}, which round-trips to the cluster
+ * and propagates any transport / TLS / auth failure. This is deliberate:
+ * {@link #getClusterStats()} catches every exception and returns an empty result, so a retry
+ * loop built on it can never observe a failure — the gate would always pass and the real
+ * error would only surface much later, deep inside {@code createContentIndex} (the opaque
+ * late crash this gate exists to prevent). These tests exercise the retry-exhausted branch with an
+ * {@link OSClientProvider} whose {@code getClient()} always throws, simulating an
+ * unreachable / misconfigured OpenSearch cluster. The connection attempts and retry sleep are
+ * forced to their minimum so the gate exhausts immediately without a real cluster. These tests exercise the retry-exhausted branch with an
@@ -20,18 +22,29 @@
* unreachable / misconfigured OpenSearch cluster. The connection attempts and retry sleep are
* forced to their minimum so the gate exhausts immediately without a real cluster. Registered in {@link com.dotcms.OpenSearchUpgradeSuite}. Run with:
+ * Active probe — not the swallowing {@code getClusterStats()}
+ * Phase-aware outcome on exhaustion
+ *
+ *
+ *
+ * @return {@code true} when OS is reachable; {@code false} when OS was unreachable in a
+ * shadow phase and the migration was halted (ES-only fallback). In Phase 3 this method
+ * never returns {@code false} — it aborts the JVM instead.
+ */
@Override
public boolean waitUtilIndexReady() {
- ClusterStats stats = null;
- final int attempts = IndexConfigHelper.getInt(OSIndexProperty.CONNECTION_ATTEMPTS, 24);
+ // Always probe at least once: a zero or negative CONNECTION_ATTEMPTS would otherwise skip
+ // the loop entirely and report the cluster as unreachable without ever contacting it.
+ final int attempts =
+ Math.max(1, IndexConfigHelper.getInt(OSIndexProperty.CONNECTION_ATTEMPTS, 24));
+ // Resolve the back-off once; it does not change between attempts.
+ final long sleepMs =
+ IndexConfigHelper.getInt(OSIndexProperty.CONNECTION_RETRY_SLEEP_SECONDS, 5) * 1000L;
+ // Kept so the exhaustion handler can report why the final attempt failed.
+ Exception lastError = null;
for (int i = 0; i < attempts; i++) {
try {
- stats = getClusterStats();
- break;
+ // Active probe: info() round-trips to the cluster and throws on any failure.
+ clientProvider.getClient().info();
+ return true;
} catch (Exception e) {
- Logger.error(this.getClass(),
- "OpenSearch Connection Attempt #" + (i + 1) + ": " + e.getMessage());
+ lastError = e;
+ Logger.error(this.getClass(), "OpenSearch Connection Attempt #" + (i + 1)
+ + " of " + attempts + ": " + e.getMessage());
}
- DateUtil.sleep(IndexConfigHelper.getInt(OSIndexProperty.CONNECTION_RETRY_SLEEP_SECONDS, 5) * 1000L);
+ // Back off only between attempts: sleeping after the final failure would just delay the
+ // exhaustion handling without giving the cluster another chance.
+ if (i < attempts - 1) {
+ DateUtil.sleep(sleepMs);
+ }
}
- if (stats == null) {
- Logger.fatal(this.getClass(), "Cannot connect to OpenSearch, giving up.");
- com.dotcms.shutdown.SystemExitManager.immediateExit(1, "OpenSearch connection failed");
- }
- return true;
+ // Every attempt failed: delegate the phase-dependent outcome (ES fallback or JVM exit).
+ return handleConnectionExhausted(attempts, lastError);
+ }
+
+ /**
+ * Phase-aware handler invoked once the OS connection retries are exhausted.
+ *
+ * <p>Builds one diagnostic line (attempt count, current phase, resolved endpoints and the last
+ * failure cause) and then branches on {@link MigrationPhase#isMigrationComplete()}:
+ * when the migration is complete it logs at fatal level and requests an immediate JVM exit;
+ * otherwise it logs at error level, calls {@code IndexConfigHelper.haltMigration()} and
+ * returns {@code false}.
+ *
+ * @param attempts the number of attempts that were made (for the log message)
+ * @param lastError the last connection error observed, or {@code null}
+ * @return {@code false} after halting the migration in a shadow phase; never returns in Phase 3
+ * (the JVM is terminated).
+ */
+ private boolean handleConnectionExhausted(final int attempts, final Exception lastError) {
+ final MigrationPhase phase = MigrationPhase.current();
+ // Throwable#getMessage() may legitimately be null (e.g. a bare NullPointerException).
+ // Fall back to the exception class name so the log never shows an unhelpful "cause=null".
+ final String cause;
+ if (lastError == null) {
+ cause = "unknown";
+ } else if (lastError.getMessage() != null) {
+ cause = lastError.getMessage();
+ } else {
+ cause = lastError.getClass().getName();
+ }
+ final String detail = "OpenSearch is not reachable after " + attempts + " attempt(s)."
+ + " phase=" + phase.name()
+ + ", endpoints=" + resolveEndpointsForLogging()
+ + ", cause=" + cause;
+
+ if (phase.isMigrationComplete()) {
+ // Phase 3: OS is primary and ES is decommissioned — no fallback is possible.
+ Logger.fatal(this.getClass(), detail
+ + " — OS is the primary store in " + phase.name() + "; cannot fall back to ES."
+ + " Verify OS_ENDPOINTS, OS_PROTOCOL/OS_TLS_ENABLED (scheme must match the"
+ + " server), and credentials, then restart dotCMS.");
+ com.dotcms.shutdown.SystemExitManager.immediateExit(1,
+ "OpenSearch connection failed in PHASE_3_OPENSEARCH_ONLY");
+ return false; // unreachable — immediateExit terminates the JVM
+ }
+
+ // Phase 1 / 2 (shadow): ES still holds the authoritative state. Fall back to ES-only
+ // instead of killing the server.
+ Logger.error(this.getClass(), detail
+ + " — OS is a shadow store in " + phase.name() + "; falling back to ES-only"
+ + " (resetting FEATURE_FLAG_OPEN_SEARCH_PHASE to 0 via haltMigration)."
+ + " Fix OS connectivity and re-enable the migration phase when ready.");
+ IndexConfigHelper.haltMigration();
+ return false;
+ }
+
+ /**
+ * Builds the endpoint description used in the connection-failure log message.
+ *
+ * <p>If the {@code OS_ENDPOINTS} array property is set and non-empty, it is rendered with
+ * {@link Arrays#toString(Object[])}. Otherwise a single {@code protocol://host:port} string is
+ * assembled from the {@code PROTOCOL}, {@code HOSTNAME} and {@code PORT} index properties,
+ * defaulting to {@code https}, {@code localhost} and {@code 9200}.
+ *
+ * @return the endpoints as a printable string
+ */
+ private static String resolveEndpointsForLogging() {
+ final String[] configured = Config.getStringArrayProperty("OS_ENDPOINTS", null);
+ final boolean noExplicitEndpoints = configured == null || configured.length == 0;
+ if (noExplicitEndpoints) {
+ // Nothing listed explicitly: describe the single endpoint implied by the
+ // protocol / hostname / port properties, read in that order.
+ return new StringBuilder()
+ .append(IndexConfigHelper.getString(OSIndexProperty.PROTOCOL, "https"))
+ .append("://")
+ .append(IndexConfigHelper.getString(OSIndexProperty.HOSTNAME, "localhost"))
+ .append(":")
+ .append(IndexConfigHelper.getInt(OSIndexProperty.PORT, 9200))
+ .toString();
+ }
+ return Arrays.toString(configured);
}
diff --git a/dotCMS/src/main/java/com/liferay/portal/servlet/MainServlet.java b/dotCMS/src/main/java/com/liferay/portal/servlet/MainServlet.java
index df38e7854566..01898fd7e403 100644
--- a/dotCMS/src/main/java/com/liferay/portal/servlet/MainServlet.java
+++ b/dotCMS/src/main/java/com/liferay/portal/servlet/MainServlet.java
@@ -109,8 +109,15 @@ public void init(ServletConfig config) throws ServletException {
Logger.debug(InitServlet.class, "IOException: " + e1.getMessage(), e1);
}
- // Make sure elasticseach is up
- APILocator.getESIndexAPI().waitUtilIndexReady();
+ // Make sure the primary search store for the current migration phase is up before running
+ // the startup tasks (issue #36244). getESIndexAPI() returns the phase-aware router
+ // (IndexAPIImpl), so waitUtilIndexReady() waits on the primary store for the phase: ES in
+ // phases 0–1, OS in phases 2–3 — not ES-hardcoded. If OS was the primary and a shadow-phase
+ // (1/2) fallback to ES occurred, waitUtilIndexReady() returns false after halting the
+ // migration to Phase 0; wait again so the new primary (ES) is gated too.
+ if (!APILocator.getESIndexAPI().waitUtilIndexReady()) {
+ APILocator.getESIndexAPI().waitUtilIndexReady();
+ }
diff --git a/dotCMS/src/test/java/com/dotcms/content/index/opensearch/OSIndexAPIImplWaitReadyTest.java b/dotCMS/src/test/java/com/dotcms/content/index/opensearch/OSIndexAPIImplWaitReadyTest.java
new file mode 100644
index 000000000000..50772d46146e
--- /dev/null
+++ b/dotCMS/src/test/java/com/dotcms/content/index/opensearch/OSIndexAPIImplWaitReadyTest.java
@@ -0,0 +1,97 @@
+package com.dotcms.content.index.opensearch;
+
+import static com.dotcms.content.index.IndexConfigHelper.MigrationPhase.FLAG_KEY;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+
+import com.dotcms.content.index.IndexConfigHelper.MigrationPhase;
+import com.dotmarketing.util.Config;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+import org.opensearch.client.opensearch.OpenSearchClient;
+
+/**
+ * Unit tests for the phase-aware OpenSearch startup connection gate
+ * {@link OSIndexAPIImpl#waitUtilIndexReady()} (issue #36244).
+ *
+ * <p><b>Covered:</b> the retry-exhausted outcome when the migration flag is set to phase 1 or
+ * phase 2 and the {@link OSClientProvider} always throws. Each test asserts that the gate
+ * returns {@code false} and that {@link MigrationPhase#current()} reads back as
+ * {@link MigrationPhase#PHASE_0_MIGRATION_NOT_STARTED}.
+ *
+ * <p><b>Not covered:</b> a successful probe and the phase 3 outcome — neither is exercised by
+ * the tests in this class.
+ */
+public class OSIndexAPIImplWaitReadyTest {
+
+ /** Stand-in {@link OSClientProvider} whose client lookup always throws (unreachable cluster). */
+ private static final class FailingClientProvider implements OSClientProvider {
+ @Override
+ public OpenSearchClient getClient() {
+ throw new RuntimeException("simulated OpenSearch connection failure (test)");
+ }
+ }
+
+ /** Configures one attempt and a zero-second retry sleep so the gate exhausts at once. */
+ @Before
+ public void fastRetries() {
+ Config.setProperty(OSIndexProperty.CONNECTION_RETRY_SLEEP_SECONDS.osKey, "0");
+ Config.setProperty(OSIndexProperty.CONNECTION_ATTEMPTS.osKey, "1");
+ }
+
+ /** Clears every property a test may have set: the retry tuning and the phase flag. */
+ @After
+ public void clearProps() {
+ Config.setProperty(OSIndexProperty.CONNECTION_RETRY_SLEEP_SECONDS.osKey, null);
+ Config.setProperty(OSIndexProperty.CONNECTION_ATTEMPTS.osKey, null);
+ Config.setProperty(FLAG_KEY, null);
+ }
+
+ /**
+ * Shared scenario for the shadow phases: sets the phase flag, runs the gate against a
+ * provider that always fails, and asserts the ES-only fallback outcome.
+ *
+ * @param phaseOrdinal the value written to the migration phase flag (1 or 2 in these tests)
+ */
+ private static void assertShadowPhaseFallsBackToEs(final int phaseOrdinal) {
+ Config.setProperty(FLAG_KEY, String.valueOf(phaseOrdinal));
+ final OSIndexAPIImpl gate = new OSIndexAPIImpl(new FailingClientProvider());
+
+ final boolean reachable = gate.waitUtilIndexReady();
+
+ assertFalse("Phase " + phaseOrdinal
+ + " must NOT abort: the gate returns false (ES-only fallback)", reachable);
+ assertEquals("Migration phase must be reset to PHASE_0 after the ES fallback",
+ MigrationPhase.PHASE_0_MIGRATION_NOT_STARTED, MigrationPhase.current());
+ }
+
+ /**
+ * Given : Phase 1 (dual-write, ES reads) and an unreachable OpenSearch cluster.
+ * When : waitUtilIndexReady() exhausts its retries.
+ * Then : the gate returns {@code false} instead of aborting, and the migration phase reads
+ * back as Phase 0, i.e. dotCMS falls back to ES-only.
+ */
+ @Test
+ public void test_phase1_osUnreachable_fallsBackToEs_noExit() {
+ assertShadowPhaseFallsBackToEs(1);
+ }
+
+ /**
+ * Given : Phase 2 (dual-write, OS reads) and an unreachable OpenSearch cluster.
+ * When : waitUtilIndexReady() exhausts its retries.
+ * Then : same shadow-phase outcome as Phase 1 — the gate returns {@code false} and the
+ * migration phase reads back as Phase 0.
+ */
+ @Test
+ public void test_phase2_osUnreachable_fallsBackToEs_noExit() {
+ assertShadowPhaseFallsBackToEs(2);
+ }
+}
From 949d49b94431b5be40ef6b0520d0bbaccc4f63ce Mon Sep 17 00:00:00 2001
From: fabrizzio-dotCMS
+ * ./mvnw verify -pl :dotcms-integration \
+ * -Dcoreit.test.skip=false \
+ * -Dopensearch.upgrade.test=true
+ *
+ *