diff --git a/fdbserver/datadistributor/DataDistribution.cpp b/fdbserver/datadistributor/DataDistribution.cpp index b251cae0d51..b8e22b0d43e 100644 --- a/fdbserver/datadistributor/DataDistribution.cpp +++ b/fdbserver/datadistributor/DataDistribution.cpp @@ -299,6 +299,14 @@ Future StorageWiggler::finishWiggle() { }); } +/* Completes once db->get().recoveryState is no longer below RecoveryState::ACCEPTING_COMMITS. Logs DDWaitForAcceptingCommitsStart once on entry; while the state is still below that threshold, each pass logs DDWaitForAcceptingCommits with the current state cast to int and then awaits db->onChange() before re-checking. NOTE(review): template argument lists appear to have been stripped from this diff text (the parameter reads as Reference const> and the return type as a bare Future) - confirm the exact signature against the original patch. */ Future waitForAcceptingCommits(Reference const> db) { + TraceEvent("DDWaitForAcceptingCommitsStart").log(); + while (db->get().recoveryState < RecoveryState::ACCEPTING_COMMITS) { + TraceEvent("DDWaitForAcceptingCommits").detail("RecoveryState", (int)db->get().recoveryState); + co_await db->onChange(); + } +} + Future remoteRecovered(Reference const> db) { TraceEvent("DDTrackerStarting").log(); while (db->get().recoveryState < RecoveryState::ALL_LOGS_RECRUITED) { @@ -2900,7 +2908,11 @@ Future dataDistribution(Reference self, self->configuration, self->remoteDcIds, Optional>>(), - self->initialized.getFuture() && remoteRecovered(self->dbInfo), + // In multi-region configurations DD only needs the cluster to + // reach ACCEPTING_COMMITS before continuing startup. Waiting for + // ALL_LOGS_RECRUITED can hang during degraded failover scenarios + // such as when the primary DC is down. + self->initialized.getFuture() && waitForAcceptingCommits(self->dbInfo), zeroHealthyTeams[1], IsPrimary::False, processingUnhealthy,