diff --git a/dotCMS/src/main/java/com/dotcms/content/elasticsearch/business/ContentletIndexAPIImpl.java b/dotCMS/src/main/java/com/dotcms/content/elasticsearch/business/ContentletIndexAPIImpl.java index 0e713aee9e9a..86b2a3cd1f03 100644 --- a/dotCMS/src/main/java/com/dotcms/content/elasticsearch/business/ContentletIndexAPIImpl.java +++ b/dotCMS/src/main/java/com/dotcms/content/elasticsearch/business/ContentletIndexAPIImpl.java @@ -885,6 +885,27 @@ private void bootstrapAndPointES(final String workingName, final String liveName */ private void bootstrapAndPointOS(final String workingName, final String liveName) throws DotDataException { + + // Connection gate (issue #36244): verify OS reachability BEFORE creating OS indices. + // This is the single chokepoint for all OS index creation (fresh-install bootstrap and + // migration catchup), so both startup paths — populated-DB (InitServlet) and empty-DB + // (Task00004LoadStarter) — pass through the same phase-aware gate instead of failing + // late and opaquely with a transport exception deep inside createContentIndex. + // + // operationsOS.indexAPI() is the OS-specific IndexAPI, so the gate always probes OS + // regardless of the current read provider (in Phase 1 the read provider is ES). The + // phase-aware outcome lives in OSIndexAPIImpl.waitUtilIndexReady(): Phase 3 aborts the + // JVM with an actionable message; Phase 1/2 halts the migration (ES-only fallback) and + // returns false — in which case we must NOT create OS indices. + if (!operationsOS.indexAPI().waitUtilIndexReady()) { + Logger.warn(this.getClass(), + "Skipping OpenSearch index bootstrap (working=" + workingName + + ", live=" + liveName + "): OS was unreachable and the migration was halted" + + " (now ES-only). OS indices will be created on a later restart once OS is" + + " reachable and the migration phase is re-enabled."); + return; + } + boolean result; try { // Targeted: executed directly against this provider only. No phase fan-out here. 
diff --git a/dotCMS/src/main/java/com/dotcms/content/index/opensearch/OSIndexAPIImpl.java b/dotCMS/src/main/java/com/dotcms/content/index/opensearch/OSIndexAPIImpl.java index 3e53bf79267d..316ae9d7a7e9 100644 --- a/dotCMS/src/main/java/com/dotcms/content/index/opensearch/OSIndexAPIImpl.java +++ b/dotCMS/src/main/java/com/dotcms/content/index/opensearch/OSIndexAPIImpl.java @@ -18,6 +18,8 @@ import com.dotmarketing.exception.DotDataException; import com.dotmarketing.util.AdminLogger; import com.dotcms.content.index.IndexConfigHelper; +import com.dotcms.content.index.IndexConfigHelper.MigrationPhase; +import com.dotmarketing.util.Config; import com.dotmarketing.util.DateUtil; import com.dotmarketing.util.Logger; import com.dotmarketing.util.UtilMethods; @@ -508,25 +510,110 @@ public ClusterStats getClusterStats() { } } + /** + * Waits for the OpenSearch cluster to become reachable, retrying up to + * {@code OS_CONNECTION_ATTEMPTS} times (falling back to {@code ES_CONNECTION_ATTEMPTS}, + * default 24) with a {@code OS_CONNECTION_RETRY_SLEEP_SECONDS} (default 5s) pause between + * attempts. Used as the OpenSearch startup connection gate (issue #36244). + * + *

<h3>Active probe — not the swallowing {@code getClusterStats()}</h3>

+ *

<p>Connectivity is verified with {@code client.info()}, which round-trips to the cluster + * and propagates any transport / TLS / auth failure. This is deliberate: + * {@link #getClusterStats()} catches every exception and returns an empty result, so a retry + * loop built on it can never observe a failure — the gate would always pass and the real + * error would only surface much later, deep inside {@code createContentIndex} (the opaque + * late crash this gate exists to prevent).</p>

+ * + *

<h3>Phase-aware outcome on exhaustion</h3>

+     * <ul>
+     *   <li><b>Phase 3</b> (migration complete): no ES fallback exists, so the JVM is aborted
+     *       with an actionable message.</li>
+     *   <li><b>Phase 1 / 2</b> (shadow): the migration is halted (ES-only fallback) and
+     *       {@code false} is returned.</li>
+     * </ul>
+     *
+     * <p>At least one probe is always made, and the retry pause is applied only between
+     * attempts — never after the final failure.</p>
+     *
+     * @return {@code true} when OS is reachable; {@code false} when OS was unreachable in a
+     *     shadow phase and the migration was halted (ES-only fallback). In Phase 3 this method
+     *     never returns {@code false} — it aborts the JVM instead.
+     */
     @Override
     public boolean waitUtilIndexReady() {
-        ClusterStats stats = null;
-        final int attempts = IndexConfigHelper.getInt(OSIndexProperty.CONNECTION_ATTEMPTS, 24);
+        // Clamp to one attempt: a configured value <= 0 would otherwise skip the probe and
+        // report the cluster as unreachable (a JVM exit in Phase 3) without ever trying it.
+        final int attempts =
+                Math.max(1, IndexConfigHelper.getInt(OSIndexProperty.CONNECTION_ATTEMPTS, 24));
+        final long sleepMs =
+                IndexConfigHelper.getInt(OSIndexProperty.CONNECTION_RETRY_SLEEP_SECONDS, 5) * 1000L;
+        Exception lastError = null;
         for (int i = 0; i < attempts; i++) {
             try {
-                stats = getClusterStats();
-                break;
+                // Active probe: info() round-trips to the cluster and throws on any failure.
+                clientProvider.getClient().info();
+                return true;
             } catch (Exception e) {
-                Logger.error(this.getClass(),
-                        "OpenSearch Connection Attempt #" + (i + 1) + ": " + e.getMessage());
+                lastError = e;
+                Logger.error(this.getClass(), "OpenSearch Connection Attempt #" + (i + 1)
+                        + " of " + attempts + ": " + e.getMessage());
             }
-            DateUtil.sleep(IndexConfigHelper.getInt(OSIndexProperty.CONNECTION_RETRY_SLEEP_SECONDS, 5) * 1000L);
+            // Pause only between attempts; sleeping after the last failure merely delays the
+            // phase-aware outcome below.
+            if (i < attempts - 1) {
+                DateUtil.sleep(sleepMs);
+            }
         }
-        if (stats == null) {
-            Logger.fatal(this.getClass(), "Cannot connect to OpenSearch, giving up.");
-            com.dotcms.shutdown.SystemExitManager.immediateExit(1, "OpenSearch connection failed");
-        }
-        return true;
+        return handleConnectionExhausted(attempts, lastError);
+    }
+
+    /**
+     * Phase-aware handler invoked once the OS connection retries are exhausted.
+     *
+     * @param attempts  the number of attempts that were made (for the log message)
+     * @param lastError the last connection error observed, or {@code null}
+     * @return {@code false} after halting the migration in a shadow phase; never returns in Phase 3
+     *     (the JVM is terminated).
+     */
+    private boolean handleConnectionExhausted(final int attempts, final Exception lastError) {
+        final MigrationPhase phase = MigrationPhase.current();
+        // Throwable#toString() keeps the exception type and tolerates a null message, so the
+        // log line never degrades to an uninformative "cause=null".
+        final String cause = lastError != null ? lastError.toString() : "unknown";
+        final String detail = "OpenSearch is not reachable after " + attempts + " attempt(s)."
+                + " phase=" + phase.name()
+                + ", endpoints=" + resolveEndpointsForLogging()
+                + ", cause=" + cause;
+
+        if (phase.isMigrationComplete()) {
+            // Phase 3: OS is primary and ES is decommissioned — no fallback is possible.
+            Logger.fatal(this.getClass(), detail
+                    + " — OS is the primary store in " + phase.name() + "; cannot fall back to ES."
+                    + " Verify OS_ENDPOINTS, OS_PROTOCOL/OS_TLS_ENABLED (scheme must match the"
+                    + " server), and credentials, then restart dotCMS.");
+            // Report the phase that was actually evaluated rather than a hard-coded name, so
+            // the exit reason always agrees with the fatal log line above.
+            com.dotcms.shutdown.SystemExitManager.immediateExit(1,
+                    "OpenSearch connection failed in " + phase.name());
+            return false; // unreachable — immediateExit terminates the JVM
+        }
+
+        // Phase 1 / 2 (shadow): ES still holds the authoritative state. Fall back to ES-only
+        // instead of killing the server.
+        Logger.error(this.getClass(), detail
+                + " — OS is a shadow store in " + phase.name() + "; falling back to ES-only"
+                + " (resetting FEATURE_FLAG_OPEN_SEARCH_PHASE to 0 via haltMigration)."
+                + " Fix OS connectivity and re-enable the migration phase when ready.");
+        IndexConfigHelper.haltMigration();
+        return false;
+    }
+
+    /**
+     * Resolves the configured OpenSearch endpoints for an actionable log message, mirroring
+     * {@code ConfigurableOpenSearchProvider} resolution: the explicit {@code OS_ENDPOINTS} array
+     * when set, otherwise a single {@code protocol://host:port} synthesised from the OS connection
+     * properties (with ES fallback).
+     */
+    private static String resolveEndpointsForLogging() {
+        final String[] explicit = Config.getStringArrayProperty("OS_ENDPOINTS", null);
+        if (explicit == null || explicit.length == 0) {
+            // No explicit list configured: describe the single endpoint built from its parts.
+            final String scheme = IndexConfigHelper.getString(OSIndexProperty.PROTOCOL, "https");
+            final String host = IndexConfigHelper.getString(OSIndexProperty.HOSTNAME, "localhost");
+            final int portNumber = IndexConfigHelper.getInt(OSIndexProperty.PORT, 9200);
+            return scheme + "://" + host + ":" + portNumber;
+        }
+        // Bracketed, comma-separated rendering: "[a, b, c]".
+        return "[" + String.join(", ", explicit) + "]";
     }
diff --git a/dotCMS/src/main/java/com/liferay/portal/servlet/MainServlet.java b/dotCMS/src/main/java/com/liferay/portal/servlet/MainServlet.java
index df38e7854566..01898fd7e403 100644
--- a/dotCMS/src/main/java/com/liferay/portal/servlet/MainServlet.java
+++ b/dotCMS/src/main/java/com/liferay/portal/servlet/MainServlet.java
@@ -109,8 +109,15 @@ public void init(ServletConfig config) throws ServletException {
             Logger.debug(InitServlet.class, "IOException: " + e1.getMessage(), e1);
         }
-        // Make sure elasticseach is up
-        APILocator.getESIndexAPI().waitUtilIndexReady();
+        // Make sure the primary search store for the current migration phase is up before running
+        // the startup tasks (issue #36244). getESIndexAPI() returns the phase-aware router
+        // (IndexAPIImpl), so waitUtilIndexReady() waits on the primary store for the phase: ES in
+        // phases 0–1, OS in phases 2–3 — not ES-hardcoded. If OS was the primary and a shadow-phase
+        // (1/2) fallback to ES occurred, waitUtilIndexReady() returns false after halting the
+        // migration to Phase 0; wait again so the new primary (ES) is gated too.
+ if (!APILocator.getESIndexAPI().waitUtilIndexReady()) { + APILocator.getESIndexAPI().waitUtilIndexReady(); + } diff --git a/dotcms-integration/src/test/java/com/dotcms/OpenSearchUpgradeSuite.java b/dotcms-integration/src/test/java/com/dotcms/OpenSearchUpgradeSuite.java index 28a7d54051ae..722e82865a7c 100644 --- a/dotcms-integration/src/test/java/com/dotcms/OpenSearchUpgradeSuite.java +++ b/dotcms-integration/src/test/java/com/dotcms/OpenSearchUpgradeSuite.java @@ -8,6 +8,7 @@ import com.dotcms.content.index.opensearch.OSMappingAPIImplIntegrationTest; import com.dotcms.content.index.VersionedIndicesAPITest; import com.dotcms.content.index.opensearch.OSIndexAPIImplIntegrationTest; +import com.dotcms.content.index.opensearch.OSIndexAPIImplWaitReadyIT; import com.dotcms.content.index.opensearch.OSClientConfigTest; import com.dotcms.content.index.opensearch.OSClientProviderIntegrationTest; import com.dotcms.content.index.opensearch.OSSearchAPIImplIntegrationTest; @@ -36,6 +37,7 @@ @SuiteClasses({ VersionedIndicesAPITest.class, OSIndexAPIImplIntegrationTest.class, + OSIndexAPIImplWaitReadyIT.class, OSMappingAPIImplIntegrationTest.class, ContentletIndexOperationsOSIntegrationTest.class, OSCreateContentIndexIntegrationTest.class, diff --git a/dotcms-integration/src/test/java/com/dotcms/content/index/opensearch/OSIndexAPIImplWaitReadyIT.java b/dotcms-integration/src/test/java/com/dotcms/content/index/opensearch/OSIndexAPIImplWaitReadyIT.java new file mode 100644 index 000000000000..7ee87c2b1cf6 --- /dev/null +++ b/dotcms-integration/src/test/java/com/dotcms/content/index/opensearch/OSIndexAPIImplWaitReadyIT.java @@ -0,0 +1,115 @@ +package com.dotcms.content.index.opensearch; + +import static com.dotcms.content.index.IndexConfigHelper.MigrationPhase.FLAG_KEY; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; + +import com.dotcms.content.index.IndexConfigHelper.MigrationPhase; +import com.dotcms.util.IntegrationTestInitService; 
+import com.dotmarketing.util.Config; +import org.junit.After; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; +import org.opensearch.client.opensearch.OpenSearchClient; + +/** + * Integration tests for the phase-aware OpenSearch startup connection gate + * {@link OSIndexAPIImpl#waitUtilIndexReady()} (issue #36244). + * + *

<p>These tests exercise the retry-exhausted branch with an + * {@link OSClientProvider} whose {@code getClient()} always throws, simulating an + * unreachable / misconfigured OpenSearch cluster. The connection attempts and retry sleep are + * forced to their minimum so the gate exhausts immediately without a real cluster.</p>

+ * + *

<p>Registered in {@link com.dotcms.OpenSearchUpgradeSuite}. Run with:</p> + * <pre>
+ *   ./mvnw verify -pl :dotcms-integration \
+ *       -Dcoreit.test.skip=false \
+ *       -Dopensearch.upgrade.test=true
+ * </pre>
+ *

+ * + *

<h3>What is and isn't covered here</h3>

+ * <ul>
+ *   <li>Covered: the shadow phases (1 and 2) — the gate returns {@code false} and the
+ *       migration phase is reset to Phase 0.</li>
+ *   <li>Not covered: the Phase 3 outcome; no test in this class sets Phase 3.
+ *       NOTE(review): presumably because that path terminates the JVM — confirm.</li>
+ * </ul>
+ *
+ * @author Fabrizzio Araya
+ */
+public class OSIndexAPIImplWaitReadyIT {
+
+    /** Client provider stub whose {@code getClient()} always throws. */
+    private static final class FailingClientProvider implements OSClientProvider {
+        @Override
+        public OpenSearchClient getClient() {
+            throw new RuntimeException("simulated OpenSearch connection failure (test)");
+        }
+    }
+
+    @BeforeClass
+    public static void prepare() throws Exception {
+        IntegrationTestInitService.getInstance().init();
+    }
+
+    @Before
+    public void fastRetries() {
+        // One attempt and a zero-second pause, so the gate runs out of retries right away.
+        Config.setProperty(OSIndexProperty.CONNECTION_RETRY_SLEEP_SECONDS.osKey, "0");
+        Config.setProperty(OSIndexProperty.CONNECTION_ATTEMPTS.osKey, "1");
+    }
+
+    @After
+    public void clearProps() {
+        // Drop every override set by the tests: retry tuning first, then the phase flag.
+        Config.setProperty(OSIndexProperty.CONNECTION_RETRY_SLEEP_SECONDS.osKey, null);
+        Config.setProperty(OSIndexProperty.CONNECTION_ATTEMPTS.osKey, null);
+        Config.setProperty(FLAG_KEY, null);
+    }
+
+    /**
+     * Shared given/when/then for the shadow phases: sets the phase flag, runs the gate against
+     * the always-failing provider, and checks that it reports {@code false} and that the
+     * current migration phase is Phase 0 afterwards.
+     *
+     * @param phaseOrdinal the value written to the migration phase flag before the gate runs
+     */
+    private static void assertFallsBackToEs(final int phaseOrdinal) {
+        Config.setProperty(FLAG_KEY, String.valueOf(phaseOrdinal));
+
+        final boolean reachable =
+                new OSIndexAPIImpl(new FailingClientProvider()).waitUtilIndexReady();
+
+        assertFalse("Phase " + phaseOrdinal
+                + " must NOT abort: the gate returns false (ES-only fallback)", reachable);
+        assertEquals("Migration phase must be reset to PHASE_0 after the ES fallback",
+                MigrationPhase.PHASE_0_MIGRATION_NOT_STARTED, MigrationPhase.current());
+    }
+
+    /**
+     * Given : the phase flag is 1 and the client provider always throws.
+     * When  : waitUtilIndexReady() runs out of retries.
+     * Then  : it returns {@code false} instead of aborting, and the migration phase reads
+     *         Phase 0 afterwards (ES-only fallback).
+     */
+    @Test
+    public void test_phase1_osUnreachable_fallsBackToEs_noExit() {
+        assertFallsBackToEs(1);
+    }
+
+    /**
+     * Given : the phase flag is 2 and the client provider always throws.
+     * When  : waitUtilIndexReady() runs out of retries.
+     * Then  : same outcome as Phase 1 — {@code false} is returned, nothing aborts, and the
+     *         migration phase reads Phase 0 afterwards.
+     */
+    @Test
+    public void test_phase2_osUnreachable_fallsBackToEs_noExit() {
+        assertFallsBackToEs(2);
+    }
+}