Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -885,6 +885,27 @@ private void bootstrapAndPointES(final String workingName, final String liveName
*/
private void bootstrapAndPointOS(final String workingName, final String liveName)
throws DotDataException {

// Connection gate (issue #36244): verify OS reachability BEFORE creating OS indices.
// This is the single chokepoint for all OS index creation (fresh-install bootstrap and
// migration catchup), so both startup paths — populated-DB (InitServlet) and empty-DB
// (Task00004LoadStarter) — pass through the same phase-aware gate instead of failing
// late and opaquely with a transport exception deep inside createContentIndex.
//
// operationsOS.indexAPI() is the OS-specific IndexAPI, so the gate always probes OS
// regardless of the current read provider (in Phase 1 the read provider is ES). The
// phase-aware outcome lives in OSIndexAPIImpl.waitUtilIndexReady(): Phase 3 aborts the
// JVM with an actionable message; Phase 1/2 halts the migration (ES-only fallback) and
// returns false — in which case we must NOT create OS indices.
if (!operationsOS.indexAPI().waitUtilIndexReady()) {
Logger.warn(this.getClass(),
"Skipping OpenSearch index bootstrap (working=" + workingName
+ ", live=" + liveName + "): OS was unreachable and the migration was halted"
+ " (now ES-only). OS indices will be created on a later restart once OS is"
+ " reachable and the migration phase is re-enabled.");
return;
}

boolean result;
try {
// Targeted: executed directly against this provider only. No phase fan-out here.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
import com.dotmarketing.exception.DotDataException;
import com.dotmarketing.util.AdminLogger;
import com.dotcms.content.index.IndexConfigHelper;
import com.dotcms.content.index.IndexConfigHelper.MigrationPhase;
import com.dotmarketing.util.Config;
import com.dotmarketing.util.DateUtil;
import com.dotmarketing.util.Logger;
import com.dotmarketing.util.UtilMethods;
Expand Down Expand Up @@ -508,25 +510,110 @@ public ClusterStats getClusterStats() {
}
}

/**
 * Startup connection gate for OpenSearch (issue #36244): blocks until the cluster answers
 * or the configured number of attempts is used up.
 *
 * <p>Tuning comes from {@code OSIndexProperty.CONNECTION_ATTEMPTS} (default 24) and
 * {@code OSIndexProperty.CONNECTION_RETRY_SLEEP_SECONDS} (default 5), both read through
 * {@link IndexConfigHelper}. At least one probe is always made, and the pause is applied
 * only <em>between</em> attempts — never after the last one.</p>
 *
 * <h2>Active probe</h2>
 * <p>Reachability is checked with {@code client.info()}, whose failure (or a failure to
 * obtain the client at all) is caught by the retry loop and counted as a failed attempt.
 * {@link #getClusterStats()} is deliberately not used as the probe: per the original
 * author's note it swallows exceptions, so a retry loop built on it could never observe a
 * failure.</p>
 *
 * <h2>Outcome when every attempt fails</h2>
 * <p>Delegated to {@code handleConnectionExhausted}:</p>
 * <ul>
 *   <li>migration complete (Phase 3, OS only) — a FATAL message is logged and the JVM is
 *       terminated through {@code SystemExitManager.immediateExit};</li>
 *   <li>any other phase — the migration is halted via
 *       {@link IndexConfigHelper#haltMigration()}, an ERROR is logged and {@code false}
 *       is returned.</li>
 * </ul>
 *
 * @return {@code true} as soon as a probe succeeds; {@code false} when all attempts failed
 *         and the migration was halted. When the migration is complete the failure path
 *         terminates the JVM instead of returning.
 */
@Override
public boolean waitUtilIndexReady() {
    // Guard against a misconfigured value (0 or negative): the gate must probe at least
    // once before it is allowed to halt the migration or abort the JVM.
    final int attempts =
            Math.max(1, IndexConfigHelper.getInt(OSIndexProperty.CONNECTION_ATTEMPTS, 24));
    // A negative sleep makes no sense; treat it as "no pause".
    final long sleepMs = Math.max(0L,
            IndexConfigHelper.getInt(OSIndexProperty.CONNECTION_RETRY_SLEEP_SECONDS, 5) * 1000L);

    Exception lastError = null;
    for (int attempt = 1; attempt <= attempts; attempt++) {
        try {
            // Active probe: info() round-trips to the cluster and throws on any failure.
            clientProvider.getClient().info();
            return true;
        } catch (Exception e) {
            // Broad catch on purpose: this is the startup boundary, and both checked
            // transport errors and unchecked client/provider errors count as "not ready".
            lastError = e;
            Logger.error(this.getClass(), "OpenSearch Connection Attempt #" + attempt
                    + " of " + attempts + ": " + e.getMessage());
        }
        // Pause only when another attempt follows; sleeping after the final failure would
        // just delay the phase-aware fallback / abort.
        if (attempt < attempts) {
            DateUtil.sleep(sleepMs);
        }
    }
    return handleConnectionExhausted(attempts, lastError);
}

/**
 * Phase-aware handler invoked once the OS connection retries are exhausted.
 *
 * <p>Builds one diagnostic line (attempt count, current migration phase, resolved
 * endpoints, cause) and then branches on {@code MigrationPhase.isMigrationComplete()}:</p>
 * <ul>
 *   <li>complete — logs at FATAL and calls {@code SystemExitManager.immediateExit(1, ...)};</li>
 *   <li>otherwise — logs at ERROR, calls {@link IndexConfigHelper#haltMigration()} and
 *       returns {@code false}.</li>
 * </ul>
 *
 * @param attempts  the number of attempts that were made (for the log message)
 * @param lastError the last connection error observed, or {@code null}
 * @return {@code false} after halting the migration; the migration-complete branch is
 *         expected never to return because the JVM is terminated
 */
private boolean handleConnectionExhausted(final int attempts, final Exception lastError) {
    final MigrationPhase phase = MigrationPhase.current();
    final String detail = "OpenSearch is not reachable after " + attempts + " attempt(s)."
            + " phase=" + phase.name()
            + ", endpoints=" + resolveEndpointsForLogging()
            + ", cause=" + describeCause(lastError);

    if (lastError != null) {
        // The per-attempt logs carry only the message; keep the full stack trace of the
        // final failure available at debug level for diagnosis.
        Logger.debug(this.getClass(),
                "Last OpenSearch connection failure: " + describeCause(lastError), lastError);
    }

    if (phase.isMigrationComplete()) {
        // Phase 3: OS is primary and ES is decommissioned — no fallback is possible.
        Logger.fatal(this.getClass(), detail
                + " — OS is the primary store in " + phase.name() + "; cannot fall back to ES."
                + " Verify OS_ENDPOINTS, OS_PROTOCOL/OS_TLS_ENABLED (scheme must match the"
                + " server), and credentials, then restart dotCMS.");
        com.dotcms.shutdown.SystemExitManager.immediateExit(1,
                "OpenSearch connection failed in PHASE_3_OPENSEARCH_ONLY");
        return false; // unreachable — immediateExit terminates the JVM
    }

    // Phase 1 / 2 (shadow): ES still holds the authoritative state. Fall back to ES-only
    // instead of killing the server.
    Logger.error(this.getClass(), detail
            + " — OS is a shadow store in " + phase.name() + "; falling back to ES-only"
            + " (resetting FEATURE_FLAG_OPEN_SEARCH_PHASE to 0 via haltMigration)."
            + " Fix OS connectivity and re-enable the migration phase when ready.");
    IndexConfigHelper.haltMigration();
    return false;
}

/**
 * Produces a non-empty, human-readable cause for the diagnostic log line.
 *
 * @param error the last connection error, or {@code null} when none was captured
 * @return {@code "unknown"} for a {@code null} error; the exception message when it has
 *         one; otherwise the exception's class name (many exceptions carry no message, and
 *         logging {@code cause=null} is not actionable)
 */
private static String describeCause(final Exception error) {
    if (error == null) {
        return "unknown";
    }
    final String message = error.getMessage();
    if (message == null || message.trim().isEmpty()) {
        return error.getClass().getName();
    }
    return message;
}

/**
 * Builds the endpoint description shown in connection-failure log messages.
 *
 * <p>If the {@code OS_ENDPOINTS} array property holds at least one entry, that array is
 * rendered with {@code Arrays.toString}. Otherwise a single {@code protocol://host:port}
 * string is assembled from the {@code PROTOCOL}, {@code HOSTNAME} and {@code PORT}
 * OpenSearch index properties, defaulting to {@code https}, {@code localhost} and
 * {@code 9200} respectively.</p>
 *
 * @return the endpoints as text, intended for logging only
 */
private static String resolveEndpointsForLogging() {
    final String[] configured = Config.getStringArrayProperty("OS_ENDPOINTS", null);
    final boolean hasExplicitEndpoints = configured != null && configured.length > 0;
    if (!hasExplicitEndpoints) {
        // No explicit list: synthesise one endpoint from the individual properties.
        return IndexConfigHelper.getString(OSIndexProperty.PROTOCOL, "https")
                + "://"
                + IndexConfigHelper.getString(OSIndexProperty.HOSTNAME, "localhost")
                + ":"
                + IndexConfigHelper.getInt(OSIndexProperty.PORT, 9200);
    }
    return Arrays.toString(configured);
}


Expand Down
11 changes: 9 additions & 2 deletions dotCMS/src/main/java/com/liferay/portal/servlet/MainServlet.java
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,15 @@ public void init(ServletConfig config) throws ServletException {
Logger.debug(InitServlet.class, "IOException: " + e1.getMessage(), e1);
}

// Make sure elasticseach is up
APILocator.getESIndexAPI().waitUtilIndexReady();
// Make sure the primary search store for the current migration phase is up before running
// the startup tasks (issue #36244). getESIndexAPI() returns the phase-aware router
// (IndexAPIImpl), so waitUtilIndexReady() waits on the primary store for the phase: ES in
// phases 0–1, OS in phases 2–3 — not ES-hardcoded. If OS was the primary and a shadow-phase
// (1/2) fallback to ES occurred, waitUtilIndexReady() returns false after halting the
// migration to Phase 0; wait again so the new primary (ES) is gated too.
if (!APILocator.getESIndexAPI().waitUtilIndexReady()) {
APILocator.getESIndexAPI().waitUtilIndexReady();
}



Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import com.dotcms.content.index.opensearch.OSMappingAPIImplIntegrationTest;
import com.dotcms.content.index.VersionedIndicesAPITest;
import com.dotcms.content.index.opensearch.OSIndexAPIImplIntegrationTest;
import com.dotcms.content.index.opensearch.OSIndexAPIImplWaitReadyIT;
import com.dotcms.content.index.opensearch.OSClientConfigTest;
import com.dotcms.content.index.opensearch.OSClientProviderIntegrationTest;
import com.dotcms.content.index.opensearch.OSSearchAPIImplIntegrationTest;
Expand Down Expand Up @@ -36,6 +37,7 @@
@SuiteClasses({
VersionedIndicesAPITest.class,
OSIndexAPIImplIntegrationTest.class,
OSIndexAPIImplWaitReadyIT.class,
OSMappingAPIImplIntegrationTest.class,
ContentletIndexOperationsOSIntegrationTest.class,
OSCreateContentIndexIntegrationTest.class,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
package com.dotcms.content.index.opensearch;

import static com.dotcms.content.index.IndexConfigHelper.MigrationPhase.FLAG_KEY;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;

import com.dotcms.content.index.IndexConfigHelper.MigrationPhase;
import com.dotcms.util.IntegrationTestInitService;
import com.dotmarketing.util.Config;
import org.junit.After;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import org.opensearch.client.opensearch.OpenSearchClient;

/**
 * Integration tests for the retry-exhausted branch of the phase-aware OpenSearch startup
 * connection gate, {@link OSIndexAPIImpl#waitUtilIndexReady()} (issue #36244).
 *
 * <p>An {@link OSClientProvider} stub whose {@code getClient()} always throws stands in for
 * an unreachable or misconfigured cluster. Before each test the attempt count is set to
 * {@code 1} and the retry sleep to {@code 0}, so the gate gives up immediately and no real
 * cluster is needed.</p>
 *
 * <p>Registered in {@link com.dotcms.OpenSearchUpgradeSuite}. Run with:</p>
 * <pre>
 * ./mvnw verify -pl :dotcms-integration \
 *     -Dcoreit.test.skip=false \
 *     -Dopensearch.upgrade.test=true
 * </pre>
 *
 * <h2>Coverage</h2>
 * <ul>
 *   <li><strong>Phase 1 and Phase 2</strong> — covered: the gate must return {@code false}
 *       and leave the migration phase at {@code PHASE_0_MIGRATION_NOT_STARTED}.</li>
 *   <li><strong>Phase 3</strong> — deliberately not covered: on exhaustion the gate calls
 *       {@code SystemExitManager.immediateExit}, which would take the test JVM down with
 *       it. That path is left to manual QA.</li>
 *   <li><strong>Success path</strong> — left to the other OpenSearch integration tests,
 *       which run against a live cluster.</li>
 * </ul>
 *
 * @author Fabrizzio Araya
 */
public class OSIndexAPIImplWaitReadyIT {

    /** Client provider stub that never yields a client, mimicking an unreachable cluster. */
    private static final class FailingClientProvider implements OSClientProvider {
        @Override
        public OpenSearchClient getClient() {
            throw new RuntimeException("simulated OpenSearch connection failure (test)");
        }
    }

    /** Boots the integration-test environment once for the whole class. */
    @BeforeClass
    public static void prepare() throws Exception {
        IntegrationTestInitService.getInstance().init();
    }

    /** Forces one attempt and a zero-second pause so exhaustion is immediate. */
    @Before
    public void fastRetries() {
        Config.setProperty(OSIndexProperty.CONNECTION_RETRY_SLEEP_SECONDS.osKey, "0");
        Config.setProperty(OSIndexProperty.CONNECTION_ATTEMPTS.osKey, "1");
    }

    /** Clears every property a test may have set (retry tuning and the phase flag). */
    @After
    public void clearProps() {
        Config.setProperty(OSIndexProperty.CONNECTION_RETRY_SLEEP_SECONDS.osKey, null);
        Config.setProperty(OSIndexProperty.CONNECTION_ATTEMPTS.osKey, null);
        Config.setProperty(FLAG_KEY, null);
    }

    /**
     * Given : Phase 1 and an OpenSearch client provider that always fails.
     * When  : waitUtilIndexReady() runs out of attempts.
     * Then  : the gate returns {@code false} and the migration phase reads back as
     *         PHASE_0 — the server is not terminated.
     */
    @Test
    public void test_phase1_osUnreachable_fallsBackToEs_noExit() {
        assertShadowPhaseFallsBackToEs(1);
    }

    /**
     * Given : Phase 2 and an OpenSearch client provider that always fails.
     * When  : waitUtilIndexReady() runs out of attempts.
     * Then  : identical outcome to Phase 1 — {@code false} is returned and the migration
     *         phase reads back as PHASE_0.
     */
    @Test
    public void test_phase2_osUnreachable_fallsBackToEs_noExit() {
        assertShadowPhaseFallsBackToEs(2);
    }

    /**
     * Shared scenario: select the given phase, run the gate against the failing provider
     * and verify the ES-only fallback.
     *
     * @param phaseOrdinal value written to the migration phase flag before the gate runs
     */
    private static void assertShadowPhaseFallsBackToEs(final int phaseOrdinal) {
        Config.setProperty(FLAG_KEY, String.valueOf(phaseOrdinal));

        final boolean gateOpen =
                new OSIndexAPIImpl(new FailingClientProvider()).waitUtilIndexReady();

        assertFalse("Phase " + phaseOrdinal
                + " must NOT abort: the gate returns false (ES-only fallback)", gateOpen);
        assertEquals("Migration phase must be reset to PHASE_0 after the ES fallback",
                MigrationPhase.PHASE_0_MIGRATION_NOT_STARTED, MigrationPhase.current());
    }
}
Loading