From 4a22692b433888355366261b07c31fcf9c4f58aa Mon Sep 17 00:00:00 2001 From: Mark Shabanov Date: Thu, 28 May 2026 12:06:57 +0300 Subject: [PATCH 1/2] prevent infinite wait for ALL_LOGS_RECRUITED on remote DC TC when there are no tlog processes in it but usableRegions=2 --- fdbserver/datadistributor/DataDistribution.cpp | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fdbserver/datadistributor/DataDistribution.cpp b/fdbserver/datadistributor/DataDistribution.cpp index b251cae0d51..f6aa29b94e9 100644 --- a/fdbserver/datadistributor/DataDistribution.cpp +++ b/fdbserver/datadistributor/DataDistribution.cpp @@ -299,6 +299,14 @@ Future<Void> StorageWiggler::finishWiggle() { }); } +Future<Void> waitForAcceptingCommits(Reference<AsyncVar<ServerDBInfo> const> db) { + TraceEvent("DDWaitForAcceptingCommitsStart").log(); + while (db->get().recoveryState < RecoveryState::ACCEPTING_COMMITS) { + TraceEvent("DDWaitForAcceptingCommits").detail("RecoveryState", (int)db->get().recoveryState); + co_await db->onChange(); + } +} + Future<Void> remoteRecovered(Reference<AsyncVar<ServerDBInfo> const> db) { TraceEvent("DDTrackerStarting").log(); while (db->get().recoveryState < RecoveryState::ALL_LOGS_RECRUITED) { @@ -2900,7 +2908,12 @@ Future<Void> dataDistribution(Reference<DataDistributor> self, self->configuration, self->remoteDcIds, Optional<std::vector<Optional<Key>>>(), - self->initialized.getFuture() && remoteRecovered(self->dbInfo), + // In multi-region configurations DD only needs the cluster to + // reach ACCEPTING_COMMITS before continuing startup. Waiting for + // ALL_LOGS_RECRUITED can hang during degraded failover scenarios + // such as when the primary DC is down. 
+ self->initialized.getFuture() && + waitForAcceptingCommits(self->dbInfo), zeroHealthyTeams[1], IsPrimary::False, processingUnhealthy, From b363e116727ca924b0febb97be667661e778b9cf Mon Sep 17 00:00:00 2001 From: Mark Shabanov Date: Fri, 29 May 2026 10:44:37 +0300 Subject: [PATCH 2/2] apply clang-format to DataDistribution.cpp --- fdbserver/datadistributor/DataDistribution.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fdbserver/datadistributor/DataDistribution.cpp b/fdbserver/datadistributor/DataDistribution.cpp index f6aa29b94e9..b8e22b0d43e 100644 --- a/fdbserver/datadistributor/DataDistribution.cpp +++ b/fdbserver/datadistributor/DataDistribution.cpp @@ -2912,8 +2912,7 @@ Future<Void> dataDistribution(Reference<DataDistributor> self, // reach ACCEPTING_COMMITS before continuing startup. Waiting for // ALL_LOGS_RECRUITED can hang during degraded failover scenarios // such as when the primary DC is down. - self->initialized.getFuture() && - waitForAcceptingCommits(self->dbInfo), + self->initialized.getFuture() && waitForAcceptingCommits(self->dbInfo), zeroHealthyTeams[1], IsPrimary::False, processingUnhealthy,