From d86ffe0bf31153c8d6e6af69bbc64aa6df6c4cf6 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Sat, 23 May 2026 11:22:39 -0700 Subject: [PATCH 01/56] Initial native CDC commit --- fdbclient/SystemData.cpp | 144 +++++++++++ .../include/fdbclient/CDCProxyInterface.h | 234 ++++++++++++++++++ .../include/fdbclient/CommitProxyInterface.h | 7 + fdbclient/include/fdbclient/FDBTypes.h | 2 + fdbclient/include/fdbclient/SystemData.h | 33 +++ flow/ProtocolVersion.h.cmake | 1 + flow/ProtocolVersions.cmake | 7 +- 7 files changed, 425 insertions(+), 3 deletions(-) create mode 100644 fdbclient/include/fdbclient/CDCProxyInterface.h diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index 07ca49a33f3..c30796112b8 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -771,6 +771,123 @@ int8_t decodeTagLocalityListValue(ValueRef const& value) { return s; } +const KeyRangeRef cdcStreamNameKeys("\xff/cdc/name/"_sr, "\xff/cdc/name0"_sr); +const KeyRangeRef cdcStreamKeys("\xff/cdc/keys/"_sr, "\xff/cdc/keys0"_sr); +const KeyRangeRef cdcTagHistoryKeys("\xff/cdc/tagHistory/"_sr, "\xff/cdc/tagHistory0"_sr); +const KeyRangeRef cdcMinVersionKeys("\xff/cdc/minVersion/"_sr, "\xff/cdc/minVersion0"_sr); +const KeyRangeRef cdcProxyKeys("\xff/cdc/proxies/"_sr, "\xff/cdc/proxies0"_sr); + +Key cdcStreamNameKeyFor(KeyRef const& streamName) { + return streamName.withPrefix(cdcStreamNameKeys.begin); +} + +Key decodeCDCStreamNameKey(KeyRef const& key) { + return key.removePrefix(cdcStreamNameKeys.begin); +} + +Value cdcStreamNameValue(CDCStreamId streamId) { + BinaryWriter wr(IncludeVersion(ProtocolVersion::withNativeCdc())); + wr << streamId; + return wr.toValue(); +} + +CDCStreamId decodeCDCStreamNameValue(ValueRef const& value) { + CDCStreamId streamId; + BinaryReader reader(value, IncludeVersion()); + ASSERT_WE_THINK(reader.protocolVersion().hasNativeCdc()); + reader >> streamId; + return streamId; +} + +Key cdcStreamKeyFor(CDCStreamId streamId) { + BinaryWriter 
wr(Unversioned()); + wr.serializeBytes(cdcStreamKeys.begin); + wr << streamId; + return wr.toValue(); +} + +Value cdcStreamKeysValue(KeyRangeRef const& keys) { + BinaryWriter wr(IncludeVersion(ProtocolVersion::withNativeCdc())); + wr << keys; + return wr.toValue(); +} + +KeyRange decodeCDCStreamKeysValue(ValueRef const& value) { + KeyRange keys; + BinaryReader reader(value, IncludeVersion()); + ASSERT_WE_THINK(reader.protocolVersion().hasNativeCdc()); + reader >> keys; + return keys; +} + +static Key cdcTagHistoryPrefixFor(CDCStreamId streamId) { + BinaryWriter wr(Unversioned()); + wr.serializeBytes(cdcTagHistoryKeys.begin); + wr << streamId; + return wr.toValue(); +} + +Key cdcTagHistoryKeyFor(CDCStreamId streamId, Version version, Tag tag) { + BinaryWriter wr(Unversioned()); + wr.serializeBytes(cdcTagHistoryPrefixFor(streamId)); + + Version encodedVersion = bigEndian64(version); + Key versionBytes = makeString(sizeof(encodedVersion)); + memcpy(mutateString(versionBytes), &encodedVersion, sizeof(encodedVersion)); + wr.serializeBytes(versionBytes); + wr << tag; + return wr.toValue(); +} + +KeyRange cdcTagHistoryRangeFor(CDCStreamId streamId) { + return prefixRange(cdcTagHistoryPrefixFor(streamId)); +} + +std::tuple decodeCDCTagHistoryKey(KeyRef const& key) { + CDCStreamId streamId; + Version encodedVersion; + Tag tag; + BinaryReader reader(key.removePrefix(cdcTagHistoryKeys.begin), Unversioned()); + reader >> streamId >> encodedVersion >> tag; + return { streamId, bigEndian64(encodedVersion), tag }; +} + +Key cdcMinVersionKeyFor(CDCStreamId streamId) { + BinaryWriter wr(Unversioned()); + wr.serializeBytes(cdcMinVersionKeys.begin); + wr << streamId; + return wr.toValue(); +} + +Value cdcMinVersionValue(Version version) { + BinaryWriter wr(IncludeVersion(ProtocolVersion::withNativeCdc())); + wr << version; + return wr.toValue(); +} + +Version decodeCDCMinVersionValue(ValueRef const& value) { + Version version; + BinaryReader reader(value, IncludeVersion()); + 
ASSERT_WE_THINK(reader.protocolVersion().hasNativeCdc()); + reader >> version; + return version; +} + +Key cdcProxyKeyFor(CDCStreamId streamId, UID proxyId) { + BinaryWriter wr(Unversioned()); + wr.serializeBytes(cdcProxyKeys.begin); + wr << streamId << proxyId; + return wr.toValue(); +} + +std::pair decodeCDCProxyKey(KeyRef const& key) { + CDCStreamId streamId; + UID proxyId; + BinaryReader reader(key.removePrefix(cdcProxyKeys.begin), Unversioned()); + reader >> streamId >> proxyId; + return { streamId, proxyId }; +} + const KeyRangeRef datacenterReplicasKeys("\xff\x02/datacenterReplicas/"_sr, "\xff\x02/datacenterReplicas0"_sr); const KeyRef datacenterReplicasPrefix = datacenterReplicasKeys.begin; @@ -1648,3 +1765,30 @@ TEST_CASE("noSim/SystemData/DataMoveId") { return Void(); } + +TEST_CASE("noSim/SystemData/NativeCDC") { + const Key name = "orders"_sr; + const CDCStreamId streamId = 42; + const KeyRange keys(KeyRangeRef("a"_sr, "z"_sr)); + const Version minVersion = 123456789; + const Tag tag(tagLocalityCDC, 9); + const UID proxyId(1, 2); + + ASSERT(decodeCDCStreamNameKey(cdcStreamNameKeyFor(name)) == name); + ASSERT(decodeCDCStreamNameValue(cdcStreamNameValue(streamId)) == streamId); + ASSERT(decodeCDCStreamKeysValue(cdcStreamKeysValue(keys)) == keys); + ASSERT(decodeCDCMinVersionValue(cdcMinVersionValue(minVersion)) == minVersion); + + const Key tagHistoryKey = cdcTagHistoryKeyFor(streamId, minVersion, tag); + const auto [decodedStreamId, decodedVersion, decodedTag] = decodeCDCTagHistoryKey(tagHistoryKey); + ASSERT(decodedStreamId == streamId); + ASSERT(decodedVersion == minVersion); + ASSERT(decodedTag == tag); + ASSERT(cdcTagHistoryRangeFor(streamId).contains(tagHistoryKey)); + + const auto [proxyStreamId, decodedProxyId] = decodeCDCProxyKey(cdcProxyKeyFor(streamId, proxyId)); + ASSERT(proxyStreamId == streamId); + ASSERT(decodedProxyId == proxyId); + + return Void(); +} diff --git a/fdbclient/include/fdbclient/CDCProxyInterface.h 
b/fdbclient/include/fdbclient/CDCProxyInterface.h new file mode 100644 index 00000000000..5de6c3a588b --- /dev/null +++ b/fdbclient/include/fdbclient/CDCProxyInterface.h @@ -0,0 +1,234 @@ +/* + * CDCProxyInterface.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2026 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FDBCLIENT_CDCPROXYINTERFACE_H +#define FDBCLIENT_CDCPROXYINTERFACE_H +#pragma once + +#include "fdbclient/CommitTransaction.h" +#include "flow/FileIdentifier.h" +#include "fdbrpc/fdbrpc.h" + +struct CDCCursor { + constexpr static FileIdentifier file_identifier = 16776001; + CDCStreamId streamId = 0; + Version lastConsumedVersion = invalidVersion; + + CDCCursor() = default; + CDCCursor(CDCStreamId streamId, Version lastConsumedVersion) + : streamId(streamId), lastConsumedVersion(lastConsumedVersion) {} + + template + void serialize(Ar& ar) { + serializer(ar, streamId, lastConsumedVersion); + } +}; + +struct VersionedMutationsRef { + constexpr static FileIdentifier file_identifier = 16776002; + Version version = invalidVersion; + VectorRef mutations; + + VersionedMutationsRef() = default; + VersionedMutationsRef(Version version, VectorRef mutations) : version(version), mutations(mutations) {} + + template + void serialize(Ar& ar) { + serializer(ar, version, mutations); + } +}; + +struct CDCStreamInfoRef { + constexpr static FileIdentifier 
file_identifier = 16776003; + StringRef name; + CDCStreamId streamId = 0; + KeyRangeRef keys; + Version minVersion = invalidVersion; + + CDCStreamInfoRef() = default; + CDCStreamInfoRef(StringRef name, CDCStreamId streamId, KeyRangeRef keys, Version minVersion) + : name(name), streamId(streamId), keys(keys), minVersion(minVersion) {} + + template + void serialize(Ar& ar) { + serializer(ar, name, streamId, keys, minVersion); + } +}; + +struct CDCRegisterStreamReply { + constexpr static FileIdentifier file_identifier = 16776012; + CDCStreamId streamId = 0; + + CDCRegisterStreamReply() = default; + explicit CDCRegisterStreamReply(CDCStreamId streamId) : streamId(streamId) {} + + template + void serialize(Ar& ar) { + serializer(ar, streamId); + } +}; + +struct CDCRegisterStreamRequest { + constexpr static FileIdentifier file_identifier = 16776004; + Key name; + KeyRange keys; + ReplyPromise reply; + + CDCRegisterStreamRequest() = default; + CDCRegisterStreamRequest(Key name, KeyRange keys) : name(name), keys(keys) {} + + bool verify() const { return true; } + + template + void serialize(Ar& ar) { + serializer(ar, name, keys, reply); + } +}; + +struct CDCRemoveStreamRequest { + constexpr static FileIdentifier file_identifier = 16776005; + Key name; + ReplyPromise reply; + + CDCRemoveStreamRequest() = default; + explicit CDCRemoveStreamRequest(Key name) : name(name) {} + + bool verify() const { return true; } + + template + void serialize(Ar& ar) { + serializer(ar, name, reply); + } +}; + +struct CDCListStreamsReply { + constexpr static FileIdentifier file_identifier = 16776006; + Arena arena; + VectorRef streams; + + template + void serialize(Ar& ar) { + serializer(ar, streams, arena); + } +}; + +struct CDCListStreamsRequest { + constexpr static FileIdentifier file_identifier = 16776007; + ReplyPromise reply; + + bool verify() const { return true; } + + template + void serialize(Ar& ar) { + serializer(ar, reply); + } +}; + +struct CDCConsumeReply { + constexpr static 
FileIdentifier file_identifier = 16776008; + Arena arena; + VectorRef mutations; + Version lastConsumedVersion = invalidVersion; + + template + void serialize(Ar& ar) { + serializer(ar, mutations, lastConsumedVersion, arena); + } +}; + +struct CDCConsumeRequest { + constexpr static FileIdentifier file_identifier = 16776009; + CDCCursor cursor; + ReplyPromise reply; + + CDCConsumeRequest() = default; + explicit CDCConsumeRequest(CDCCursor cursor) : cursor(cursor) {} + + bool verify() const { return true; } + + template + void serialize(Ar& ar) { + serializer(ar, cursor, reply); + } +}; + +struct CDCAckRequest { + constexpr static FileIdentifier file_identifier = 16776010; + CDCStreamId streamId = 0; + Version version = invalidVersion; + ReplyPromise reply; + + CDCAckRequest() = default; + CDCAckRequest(CDCStreamId streamId, Version version) : streamId(streamId), version(version) {} + + bool verify() const { return true; } + + template + void serialize(Ar& ar) { + serializer(ar, streamId, version, reply); + } +}; + +struct CDCProxyInterface { + constexpr static FileIdentifier file_identifier = 16776011; + enum { LocationAwareLoadBalance = 1 }; + enum { AlwaysFresh = 1 }; + + Optional processId; + PublicRequestStream consume; + PublicRequestStream registerStream; + PublicRequestStream removeStream; + PublicRequestStream listStreams; + PublicRequestStream ack; + RequestStream> waitFailure; + + UID id() const { return consume.getEndpoint().token; } + std::string toString() const { return id().shortString(); } + bool operator==(CDCProxyInterface const& r) const { return id() == r.id(); } + bool operator!=(CDCProxyInterface const& r) const { return id() != r.id(); } + NetworkAddress address() const { return consume.getEndpoint().getPrimaryAddress(); } + NetworkAddressList addresses() const { return consume.getEndpoint().addresses; } + + template + void serialize(Ar& ar) { + serializer(ar, processId, consume); + if (Ar::isDeserializing) { + registerStream = + 
PublicRequestStream(consume.getEndpoint().getAdjustedEndpoint(1)); + removeStream = PublicRequestStream(consume.getEndpoint().getAdjustedEndpoint(2)); + listStreams = PublicRequestStream(consume.getEndpoint().getAdjustedEndpoint(3)); + ack = PublicRequestStream(consume.getEndpoint().getAdjustedEndpoint(4)); + waitFailure = RequestStream>(consume.getEndpoint().getAdjustedEndpoint(5)); + } + } + + void initEndpoints() { + std::vector> streams; + streams.push_back(consume.getReceiver(TaskPriority::ReadSocket)); + streams.push_back(registerStream.getReceiver(TaskPriority::ReadSocket)); + streams.push_back(removeStream.getReceiver(TaskPriority::ReadSocket)); + streams.push_back(listStreams.getReceiver(TaskPriority::ReadSocket)); + streams.push_back(ack.getReceiver(TaskPriority::ReadSocket)); + streams.push_back(waitFailure.getReceiver()); + FlowTransport::transport().addEndpoints(streams); + } +}; + +#endif // FDBCLIENT_CDCPROXYINTERFACE_H diff --git a/fdbclient/include/fdbclient/CommitProxyInterface.h b/fdbclient/include/fdbclient/CommitProxyInterface.h index 5fa33fae819..fd4cdfa2968 100644 --- a/fdbclient/include/fdbclient/CommitProxyInterface.h +++ b/fdbclient/include/fdbclient/CommitProxyInterface.h @@ -22,9 +22,11 @@ #define FDBCLIENT_COMMITPROXYINTERFACE_H #pragma once +#include #include #include +#include "fdbclient/CDCProxyInterface.h" #include "fdbclient/CommitTransaction.h" #include "fdbclient/FDBTypes.h" #include "fdbclient/GlobalConfig.h" @@ -111,6 +113,8 @@ struct ClientDBInfo { UID id; // Changes each time anything else changes std::vector grvProxies; std::vector commitProxies; + std::vector cdcProxies; + std::map streamToCDCProxyId; Optional firstCommitProxy; // not serialized, used for commitOnFirstProxy when the commit proxies vector has been shrunk Optional forward; @@ -130,6 +134,9 @@ struct ClientDBInfo { ASSERT(ar.protocolVersion().isValid()); } serializer(ar, grvProxies, commitProxies, id, forward, history, clusterId, clusterType); + if 
(ar.protocolVersion().hasNativeCdc()) { + serializer(ar, cdcProxies, streamToCDCProxyId); + } } }; diff --git a/fdbclient/include/fdbclient/FDBTypes.h b/fdbclient/include/fdbclient/FDBTypes.h index ea728b8f724..5e87aa45d48 100644 --- a/fdbclient/include/fdbclient/FDBTypes.h +++ b/fdbclient/include/fdbclient/FDBTypes.h @@ -39,6 +39,7 @@ #include "fdbrpc/Locality.h" typedef int64_t Version; +typedef uint64_t CDCStreamId; typedef uint64_t LogEpoch; typedef uint64_t Sequence; typedef StringRef KeyRef; @@ -62,6 +63,7 @@ enum { tagLocalityTxs = -7, tagLocalityBackup = -8, // used by backup role to pop from TLogs tagLocalityRangeBackup = -9, // used by range-partitioned backup workers + tagLocalityCDC = -10, // used by native change data capture streams tagLocalityInvalid = -99 }; // The TLog and LogRouter require these number to be as compact as possible diff --git a/fdbclient/include/fdbclient/SystemData.h b/fdbclient/include/fdbclient/SystemData.h index 2ad0ede86f1..d00a75271ab 100644 --- a/fdbclient/include/fdbclient/SystemData.h +++ b/fdbclient/include/fdbclient/SystemData.h @@ -22,6 +22,8 @@ #define FDBCLIENT_SYSTEMDATA_H #pragma once +#include + // Functions and constants documenting the organization of the reserved keyspace in the database beginning with "\xFF" #include "fdbclient/AccumulativeChecksum.h" @@ -264,6 +266,37 @@ Value tagLocalityListValue(int8_t const&); Optional decodeTagLocalityListKey(KeyRef const&); int8_t decodeTagLocalityListValue(ValueRef const&); +// Native CDC metadata persisted in the transaction state store. 
+// "\xff/cdc/name/[[streamName]]" := "[[CDCStreamId]]" +extern const KeyRangeRef cdcStreamNameKeys; +Key cdcStreamNameKeyFor(KeyRef const& streamName); +Key decodeCDCStreamNameKey(KeyRef const& key); +Value cdcStreamNameValue(CDCStreamId streamId); +CDCStreamId decodeCDCStreamNameValue(ValueRef const& value); + +// "\xff/cdc/keys/[[CDCStreamId]]" := "[[KeyRange]]" +extern const KeyRangeRef cdcStreamKeys; +Key cdcStreamKeyFor(CDCStreamId streamId); +Value cdcStreamKeysValue(KeyRangeRef const& keys); +KeyRange decodeCDCStreamKeysValue(ValueRef const& value); + +// "\xff/cdc/tagHistory/[[CDCStreamId]][[Version]][[Tag]]" := "" +extern const KeyRangeRef cdcTagHistoryKeys; +Key cdcTagHistoryKeyFor(CDCStreamId streamId, Version version, Tag tag); +KeyRange cdcTagHistoryRangeFor(CDCStreamId streamId); +std::tuple decodeCDCTagHistoryKey(KeyRef const& key); + +// "\xff/cdc/minVersion/[[CDCStreamId]]" := "[[Version]]" +extern const KeyRangeRef cdcMinVersionKeys; +Key cdcMinVersionKeyFor(CDCStreamId streamId); +Value cdcMinVersionValue(Version version); +Version decodeCDCMinVersionValue(ValueRef const& value); + +// "\xff/cdc/proxies/[[CDCStreamId]][[proxyUID]]" := "" +extern const KeyRangeRef cdcProxyKeys; +Key cdcProxyKeyFor(CDCStreamId streamId, UID proxyId); +std::pair decodeCDCProxyKey(KeyRef const& key); + // "\xff\x02/datacenterReplicas/[[datacenterID]]" := "[[replicas]]" // Provides the number of replicas for the given datacenterID. // Used in the initialization of the Data Distributor. 
diff --git a/flow/ProtocolVersion.h.cmake b/flow/ProtocolVersion.h.cmake index fd8a24c6384..dca92cc298a 100644 --- a/flow/ProtocolVersion.h.cmake +++ b/flow/ProtocolVersion.h.cmake @@ -179,6 +179,7 @@ public: // introduced features PROTOCOL_VERSION_FEATURE(@FDB_PV_GC_TXN_GENERATIONS@, GcTxnGenerations); PROTOCOL_VERSION_FEATURE(@FDB_PV_MUTATION_CHECKSUM@, MutationChecksum); PROTOCOL_VERSION_FEATURE(@FDB_PV_RANGE_BACKUP_WORKER@, RangeBackupWorker); + PROTOCOL_VERSION_FEATURE(@FDB_PV_NATIVE_CDC@, NativeCdc); }; template <> diff --git a/flow/ProtocolVersions.cmake b/flow/ProtocolVersions.cmake index 4066e7435a4..10a6eb6a000 100644 --- a/flow/ProtocolVersions.cmake +++ b/flow/ProtocolVersions.cmake @@ -8,10 +8,10 @@ # used and should not be changed from 0. # xyzdev # vvvv -set(FDB_PV_DEFAULT_VERSION "0x0FDB00B080000000LL") -set(FDB_PV_FUTURE_VERSION "0x0FDB00B081000000LL") +set(FDB_PV_DEFAULT_VERSION "0x0FDB00B081000000LL") +set(FDB_PV_FUTURE_VERSION "0x0FDB00B082000000LL") set(FDB_PV_MIN_COMPATIBLE_VERSION "0x0FDB00B074000000LL") -set(FDB_PV_MIN_INVALID_VERSION "0x0FDB00B082000000LL") +set(FDB_PV_MIN_INVALID_VERSION "0x0FDB00B083000000LL") set(FDB_PV_LEFT_MOST_CHECK "0x0FDB00B100000000LL") set(FDB_PV_LSB_MASK "0xF0FFFFLL") @@ -96,3 +96,4 @@ set(FDB_PV_GC_TXN_GENERATIONS "0x0FDB00B073000000LL") set(FDB_PV_MUTATION_CHECKSUM "0x0FDB00B074000000LL") set(FDB_PV_GRPC_ENDPOINT "0x0FDB00B080000000LL") set(FDB_PV_RANGE_BACKUP_WORKER "0x0FDB00B080000000LL") +set(FDB_PV_NATIVE_CDC "0x0FDB00B081000000LL") From 158333e123e523867cff538b20de042e181e090d Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Sat, 23 May 2026 11:51:53 -0700 Subject: [PATCH 02/56] Add native CDC commit proxy routing --- fdbclient/SystemData.cpp | 8 + fdbclient/include/fdbclient/SystemData.h | 1 + fdbserver/commitproxy/CommitProxyServer.cpp | 6 + fdbserver/commitproxy/ProxyCommitData.h | 2 + fdbserver/logsystem/ApplyMetadataMutation.cpp | 141 +++++++++++++++++- .../logsystem/ApplyMetadataMutation.h | 
22 +++ 6 files changed, 175 insertions(+), 5 deletions(-) diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index c30796112b8..d84d8ccf183 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -806,6 +806,13 @@ Key cdcStreamKeyFor(CDCStreamId streamId) { return wr.toValue(); } +CDCStreamId decodeCDCStreamKey(KeyRef const& key) { + CDCStreamId streamId; + BinaryReader reader(key.removePrefix(cdcStreamKeys.begin), Unversioned()); + reader >> streamId; + return streamId; +} + Value cdcStreamKeysValue(KeyRangeRef const& keys) { BinaryWriter wr(IncludeVersion(ProtocolVersion::withNativeCdc())); wr << keys; @@ -1776,6 +1783,7 @@ TEST_CASE("noSim/SystemData/NativeCDC") { ASSERT(decodeCDCStreamNameKey(cdcStreamNameKeyFor(name)) == name); ASSERT(decodeCDCStreamNameValue(cdcStreamNameValue(streamId)) == streamId); + ASSERT(decodeCDCStreamKey(cdcStreamKeyFor(streamId)) == streamId); ASSERT(decodeCDCStreamKeysValue(cdcStreamKeysValue(keys)) == keys); ASSERT(decodeCDCMinVersionValue(cdcMinVersionValue(minVersion)) == minVersion); diff --git a/fdbclient/include/fdbclient/SystemData.h b/fdbclient/include/fdbclient/SystemData.h index d00a75271ab..e783d3e3939 100644 --- a/fdbclient/include/fdbclient/SystemData.h +++ b/fdbclient/include/fdbclient/SystemData.h @@ -277,6 +277,7 @@ CDCStreamId decodeCDCStreamNameValue(ValueRef const& value); // "\xff/cdc/keys/[[CDCStreamId]]" := "[[KeyRange]]" extern const KeyRangeRef cdcStreamKeys; Key cdcStreamKeyFor(CDCStreamId streamId); +CDCStreamId decodeCDCStreamKey(KeyRef const& key); Value cdcStreamKeysValue(KeyRangeRef const& keys); KeyRange decodeCDCStreamKeysValue(ValueRef const& value); diff --git a/fdbserver/commitproxy/CommitProxyServer.cpp b/fdbserver/commitproxy/CommitProxyServer.cpp index 086b4814e09..f85b179d874 100644 --- a/fdbserver/commitproxy/CommitProxyServer.cpp +++ b/fdbserver/commitproxy/CommitProxyServer.cpp @@ -695,6 +695,8 @@ std::set CommitBatchContext::getWrittenTagsPreResolution() { if 
(isSingleKeyMutation((MutationRef::Type)m.type)) { auto& tags = pProxyCommitData->tagsForKey(m.param1); transactionTags.insert(tags.begin(), tags.end()); + const auto& cdcTags = pProxyCommitData->cdcRouting.tagsForKey(m.param1); + transactionTags.insert(cdcTags.begin(), cdcTags.end()); } else if (m.type == MutationRef::ClearRange) { auto range = pProxyCommitData->keyInfo.rangeContaining(m.param1); if (range.end() >= m.param2) { @@ -710,6 +712,8 @@ std::set CommitBatchContext::getWrittenTagsPreResolution() { } } KeyRangeRef clearRange(KeyRangeRef(m.param1, m.param2)); + const auto cdcTags = pProxyCommitData->cdcRouting.tagsForRange(clearRange); + transactionTags.insert(cdcTags.begin(), cdcTags.end()); } else { UNREACHABLE(); } @@ -1391,6 +1395,7 @@ Future assignMutationsToStorageServers(CommitBatchContext* self) { DEBUG_MUTATION("ProxyCommit", self->commitVersion, m, pProxyCommitData->dbgid).detail("To", tags); self->toCommit.addTags(tags); + self->toCommit.addTags(pProxyCommitData->cdcRouting.tagsForKey(m.param1)); if (pProxyCommitData->acsBuilder != nullptr) { updateMutationWithAcsAndAddMutationToAcsBuilder( @@ -1482,6 +1487,7 @@ Future assignMutationsToStorageServers(CommitBatchContext* self) { } KeyRangeRef clearRange(KeyRangeRef(m.param1, m.param2)); + self->toCommit.addTags(pProxyCommitData->cdcRouting.tagsForRange(clearRange)); WriteMutationRefVar var = writeMutation(self, &m); // FIXME: Remove assert once ClearRange RAW_ACCESS usecase handling is done ASSERT(std::holds_alternative(var)); diff --git a/fdbserver/commitproxy/ProxyCommitData.h b/fdbserver/commitproxy/ProxyCommitData.h index c2c41f99652..dbe1cce69a7 100644 --- a/fdbserver/commitproxy/ProxyCommitData.h +++ b/fdbserver/commitproxy/ProxyCommitData.h @@ -199,6 +199,7 @@ struct ProxyCommitData { Promise validState; // Set once txnStateStore and version are valid double lastVersionTime; KeyRangeMap> vecBackupKeys; + CDCRoutingTable cdcRouting; uint64_t commitVersionRequestNumber; uint64_t 
mostRecentProcessedRequestNumber; KeyRangeMap>> keyResolvers; @@ -414,6 +415,7 @@ inline ApplyMetadataProxyContext ProxyCommitData::getApplyMetadataProxyContext() return { .dbgid = dbgid, .txnStateStore = txnStateStore, .vecBackupKeys = &vecBackupKeys, + .cdcRouting = &cdcRouting, .keyInfo = &keyInfo, .uid_applyMutationsData = firstProxy ? &uid_applyMutationsData : nullptr, .commit = commit, diff --git a/fdbserver/logsystem/ApplyMetadataMutation.cpp b/fdbserver/logsystem/ApplyMetadataMutation.cpp index cf001f7ec61..e3337ad705f 100644 --- a/fdbserver/logsystem/ApplyMetadataMutation.cpp +++ b/fdbserver/logsystem/ApplyMetadataMutation.cpp @@ -31,6 +31,7 @@ #include "fdbserver/logsystem/LogSystem.h" #include "flow/Error.h" #include "flow/Trace.h" +#include "flow/UnitTest.h" Reference getStorageInfo(UID id, std::map>* storageCache, @@ -47,6 +48,58 @@ Reference getStorageInfo(UID id, } return storageInfo; } + +void CDCRoutingTable::rebuildRanges() { + tagsByRange.insert(allKeys, std::set()); + for (const auto& [streamId, state] : streams) { + if (!state.keys.present() || !state.tag.present()) { + continue; + } + for (auto range : tagsByRange.modify(state.keys.get())) { + range->value().insert(state.tag.get().second); + } + } + tagsByRange.coalesce(allKeys); +} + +void CDCRoutingTable::setRange(CDCStreamId streamId, KeyRangeRef const& keys) { + streams[streamId].keys = KeyRange(keys); + rebuildRanges(); +} + +void CDCRoutingTable::setTag(CDCStreamId streamId, Version version, Tag tag) { + ASSERT(tag.locality == tagLocalityCDC); + auto& existing = streams[streamId].tag; + if (!existing.present() || version >= existing.get().first) { + existing = std::make_pair(version, tag); + rebuildRanges(); + } +} + +void CDCRoutingTable::reload(IKeyValueStore* txnStateStore) { + streams.clear(); + for (const auto& kv : txnStateStore->readRange(cdcStreamKeys).get()) { + setRange(decodeCDCStreamKey(kv.key), decodeCDCStreamKeysValue(kv.value)); + } + for (const auto& kv : 
txnStateStore->readRange(cdcTagHistoryKeys).get()) { + const auto [streamId, version, tag] = decodeCDCTagHistoryKey(kv.key); + setTag(streamId, version, tag); + } + rebuildRanges(); +} + +const std::set& CDCRoutingTable::tagsForKey(KeyRef const& key) const { + return tagsByRange.rangeContaining(key).value(); +} + +std::set CDCRoutingTable::tagsForRange(KeyRangeRef const& keys) const { + std::set tags; + for (auto range : tagsByRange.intersectingRanges(keys)) { + tags.insert(range.value().begin(), range.value().end()); + } + return tags; +} + namespace { // It is incredibly important that any modifications to txnStateStore are done in such a way that the same operations @@ -77,9 +130,9 @@ class ApplyMetadataMutationsImpl { : spanContext(spanContext_), dbgid(proxyMetadata_.dbgid), arena(arena_), mutations(mutations_), txnStateStore(proxyMetadata_.txnStateStore), toCommit(toCommit_), confChange(confChange_), logSystemConsumer(logSystemConsumer_), version(version), popVersion(popVersion_), - vecBackupKeys(proxyMetadata_.vecBackupKeys), keyInfo(proxyMetadata_.keyInfo), - uid_applyMutationsData(proxyMetadata_.uid_applyMutationsData), commit(proxyMetadata_.commit), - cx(proxyMetadata_.cx), committedVersion(proxyMetadata_.committedVersion), + vecBackupKeys(proxyMetadata_.vecBackupKeys), cdcRouting(proxyMetadata_.cdcRouting), + keyInfo(proxyMetadata_.keyInfo), uid_applyMutationsData(proxyMetadata_.uid_applyMutationsData), + commit(proxyMetadata_.commit), cx(proxyMetadata_.cx), committedVersion(proxyMetadata_.committedVersion), storageCache(proxyMetadata_.storageCache), tag_popped(proxyMetadata_.tag_popped), tssMapping(proxyMetadata_.tssMapping), initialCommit(initialCommit_), provisionalCommitProxy(provisionalCommitProxy_), @@ -124,6 +177,7 @@ class ApplyMetadataMutationsImpl { Version version = invalidVersion; Version popVersion = 0; KeyRangeMap>* vecBackupKeys = nullptr; + CDCRoutingTable* cdcRouting = nullptr; KeyRangeMap* keyInfo = nullptr; std::map* 
uid_applyMutationsData = nullptr; PublicRequestStream commit = PublicRequestStream(); @@ -552,6 +606,30 @@ class ApplyMetadataMutationsImpl { .detail("LogRangeEnd", logRangeEnd); } + void checkSetCDCMetadata(MutationRef m) { + if (!cdcStreamNameKeys.contains(m.param1) && !cdcStreamKeys.contains(m.param1) && + !cdcTagHistoryKeys.contains(m.param1) && !cdcMinVersionKeys.contains(m.param1) && + !cdcProxyKeys.contains(m.param1)) { + return; + } + if (!initialCommit) { + txnStateStore->set(KeyValueRef(m.param1, m.param2)); + } + if (toCommit && SERVER_KNOBS->ENABLE_VERSION_VECTOR_TLOG_UNICAST && + (cdcStreamKeys.contains(m.param1) || cdcTagHistoryKeys.contains(m.param1))) { + toCommit->setLogsChanged(); + } + if (!cdcRouting) { + return; + } + if (cdcStreamKeys.contains(m.param1)) { + cdcRouting->setRange(decodeCDCStreamKey(m.param1), decodeCDCStreamKeysValue(m.param2)); + } else if (cdcTagHistoryKeys.contains(m.param1)) { + const auto [streamId, tagVersion, tag] = decodeCDCTagHistoryKey(m.param1); + cdcRouting->setTag(streamId, tagVersion, tag); + } + } + void checkSetGlobalKeys(MutationRef m) { if (!m.param1.startsWith(globalKeysPrefix)) { return; @@ -994,6 +1072,29 @@ class ApplyMetadataMutationsImpl { txnStateStore->clear(commonLogRange); } + void checkClearCDCMetadata(KeyRangeRef range) { + if (!cdcStreamNameKeys.intersects(range) && !cdcStreamKeys.intersects(range) && + !cdcTagHistoryKeys.intersects(range) && !cdcMinVersionKeys.intersects(range) && + !cdcProxyKeys.intersects(range)) { + return; + } + if (!initialCommit) { + for (const KeyRangeRef cdcRange : + { cdcStreamNameKeys, cdcStreamKeys, cdcTagHistoryKeys, cdcMinVersionKeys, cdcProxyKeys }) { + if (cdcRange.intersects(range)) { + txnStateStore->clear(cdcRange & range); + } + } + } + if (toCommit && SERVER_KNOBS->ENABLE_VERSION_VECTOR_TLOG_UNICAST && + (cdcStreamKeys.intersects(range) || cdcTagHistoryKeys.intersects(range))) { + toCommit->setLogsChanged(); + } + if (cdcRouting && 
(cdcStreamKeys.intersects(range) || cdcTagHistoryKeys.intersects(range))) { + cdcRouting->reload(txnStateStore); + } + } + void checkClearTssMappingKeys(MutationRef m, KeyRangeRef range) { if (!tssMappingKeys.intersects(range)) { return; @@ -1131,6 +1232,7 @@ class ApplyMetadataMutationsImpl { checkSetApplyMutationsEndRange(m); checkSetApplyMutationsKeyVersionMapRange(m); checkSetLogRangesRange(m); + checkSetCDCMetadata(m); checkSetGlobalKeys(m); checkSetWriteRecoverKey(m); checkSetMinRequiredCommitVersionKey(m); @@ -1149,6 +1251,7 @@ class ApplyMetadataMutationsImpl { checkClearApplyMutationsEndRange(m, range); checkClearApplyMutationKeyVersionMapRange(m, range); checkClearLogRangesRange(range); + checkClearCDCMetadata(range); checkClearTssMappingKeys(m, range); checkClearTssQuarantineKeys(m, range); checkClearVersionEpochKeys(m, range); @@ -1219,7 +1322,9 @@ bool containsMetadataMutation(const VectorRef& mutations) { (m.param1.startsWith(applyMutationsEndRange.begin)) || (m.param1.startsWith(applyMutationsKeyVersionMapRange.begin)) || (m.param1.startsWith(logRangesRange.begin)) || (m.param1.startsWith(serverKeysPrefix)) || - (m.param1.startsWith(keyServersPrefix))) { + (m.param1.startsWith(keyServersPrefix)) || cdcStreamNameKeys.contains(m.param1) || + cdcStreamKeys.contains(m.param1) || cdcTagHistoryKeys.contains(m.param1) || + cdcMinVersionKeys.contains(m.param1) || cdcProxyKeys.contains(m.param1)) { return true; } } else if (m.type == MutationRef::ClearRange && isSystemKey(m.param2)) { @@ -1232,10 +1337,36 @@ bool containsMetadataMutation(const VectorRef& mutations) { (tssQuarantineKeys.intersects(range)) || (range.contains(previousCoordinatorsKey)) || (range.contains(coordinatorsKey)) || (range.contains(databaseLockedKey)) || (range.contains(metadataVersionKey)) || (range.contains(mustContainSystemMutationsKey)) || - (range.contains(writeRecoveryKey)) || (range.intersects(testOnlyTxnStateStorePrefixRange))) { + (range.contains(writeRecoveryKey)) || 
(range.intersects(testOnlyTxnStateStorePrefixRange)) || + cdcStreamNameKeys.intersects(range) || cdcStreamKeys.intersects(range) || + cdcTagHistoryKeys.intersects(range) || cdcMinVersionKeys.intersects(range) || + cdcProxyKeys.intersects(range)) { return true; } } } return false; } + +TEST_CASE("noSim/NativeCDC/RoutingTable") { + CDCRoutingTable table; + const Tag ordersTag(tagLocalityCDC, 1); + const Tag overlappingTag(tagLocalityCDC, 2); + const Tag rotatedOrdersTag(tagLocalityCDC, 3); + + table.setRange(1, KeyRangeRef("a"_sr, "m"_sr)); + table.setTag(1, 100, ordersTag); + table.setRange(2, KeyRangeRef("g"_sr, "z"_sr)); + table.setTag(2, 100, overlappingTag); + + ASSERT(table.tagsForKey("b"_sr) == std::set{ ordersTag }); + ASSERT(table.tagsForKey("h"_sr) == (std::set{ ordersTag, overlappingTag })); + ASSERT(table.tagsForKey("x"_sr) == std::set{ overlappingTag }); + ASSERT(table.tagsForRange(KeyRangeRef("b"_sr, "x"_sr)) == (std::set{ ordersTag, overlappingTag })); + + table.setTag(1, 200, rotatedOrdersTag); + ASSERT(table.tagsForKey("b"_sr) == std::set{ rotatedOrdersTag }); + ASSERT(table.tagsForKey("h"_sr) == (std::set{ rotatedOrdersTag, overlappingTag })); + + return Void(); +} diff --git a/fdbserver/logsystem/include/fdbserver/logsystem/ApplyMetadataMutation.h b/fdbserver/logsystem/include/fdbserver/logsystem/ApplyMetadataMutation.h index aac8c5d5b0e..ad5578a5957 100644 --- a/fdbserver/logsystem/include/fdbserver/logsystem/ApplyMetadataMutation.h +++ b/fdbserver/logsystem/include/fdbserver/logsystem/ApplyMetadataMutation.h @@ -56,10 +56,32 @@ struct ApplyMutationsData { Reference> keyVersion; }; +// Active CDC write routing reconstructed from durable stream and tag-history metadata. 
+class CDCRoutingTable : NonCopyable { + struct StreamState { + Optional keys; + Optional> tag; + }; + + std::map streams; + KeyRangeMap> tagsByRange; + + void rebuildRanges(); + +public: + void setRange(CDCStreamId streamId, KeyRangeRef const& keys); + void setTag(CDCStreamId streamId, Version version, Tag tag); + void reload(IKeyValueStore* txnStateStore); + + const std::set& tagsForKey(KeyRef const& key) const; + std::set tagsForRange(KeyRangeRef const& keys) const; +}; + struct ApplyMetadataProxyContext { UID dbgid; IKeyValueStore* txnStateStore = nullptr; KeyRangeMap>* vecBackupKeys = nullptr; + CDCRoutingTable* cdcRouting = nullptr; KeyRangeMap* keyInfo = nullptr; std::map* uid_applyMutationsData = nullptr; PublicRequestStream commit; From 9ce0954a67b42be2a5da1f52b86b11c87bde2c6c Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Sat, 23 May 2026 14:18:58 -0700 Subject: [PATCH 03/56] Add native CDC stream lifecycle metadata operations and tests --- fdbclient/NativeCdc.cpp | 225 +++++++++++++++++++++++ fdbclient/SystemData.cpp | 16 +- fdbclient/include/fdbclient/NativeCdc.h | 42 +++++ fdbclient/include/fdbclient/SystemData.h | 1 + fdbserver/workloads/NativeCdc.cpp | 113 ++++++++++++ tests/CMakeLists.txt | 1 + tests/fast/NativeCdc.toml | 6 + 7 files changed, 402 insertions(+), 2 deletions(-) create mode 100644 fdbclient/NativeCdc.cpp create mode 100644 fdbclient/include/fdbclient/NativeCdc.h create mode 100644 fdbserver/workloads/NativeCdc.cpp create mode 100644 tests/fast/NativeCdc.toml diff --git a/fdbclient/NativeCdc.cpp b/fdbclient/NativeCdc.cpp new file mode 100644 index 00000000000..2e8c7e9c0cb --- /dev/null +++ b/fdbclient/NativeCdc.cpp @@ -0,0 +1,225 @@ +/* + * NativeCdc.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2026 Apple Inc. 
and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include "fdbclient/Knobs.h" +#include "fdbclient/NativeCdc.h" +#include "fdbclient/SystemData.h" +#include "flow/Error.h" +#include "flow/UnitTest.h" + +namespace { + +struct NativeCdcIdentifierAllocator { + bool sawStream = false; + CDCStreamId maxStreamId = 0; + std::set usedTagIds; + + void observeStreamId(CDCStreamId streamId) { + sawStream = true; + maxStreamId = std::max(maxStreamId, streamId); + } + + void observeTag(Tag tag) { + ASSERT_WE_THINK(tag.locality == tagLocalityCDC); + usedTagIds.insert(tag.id); + } + + std::pair allocate() const { + if (sawStream && maxStreamId == std::numeric_limits::max()) { + throw operation_failed(); + } + + const CDCStreamId streamId = sawStream ? 
maxStreamId + 1 : 1; + for (uint32_t tagId = 0; tagId <= std::numeric_limits::max(); ++tagId) { + if (!usedTagIds.contains(static_cast(tagId))) { + return { streamId, Tag(tagLocalityCDC, static_cast(tagId)) }; + } + } + throw operation_failed(); + } +}; + +void validateNativeCdcStream(KeyRef const& name, KeyRangeRef const& keys) { + if (name.empty() || keys.empty() || !normalKeys.contains(keys)) { + throw client_invalid_operation(); + } +} + +Future observeNativeCdcMetadata(Transaction* tr, NativeCdcIdentifierAllocator* allocator) { + Key begin = cdcStreamKeys.begin; + while (begin < cdcStreamKeys.end) { + RangeResult streams = co_await tr->getRange(KeyRangeRef(begin, cdcStreamKeys.end), CLIENT_KNOBS->TOO_MANY); + for (const auto& kv : streams) { + allocator->observeStreamId(decodeCDCStreamKey(kv.key)); + } + if (!streams.more) { + break; + } + begin = keyAfter(streams.back().key); + } + + begin = cdcTagHistoryKeys.begin; + while (begin < cdcTagHistoryKeys.end) { + RangeResult histories = + co_await tr->getRange(KeyRangeRef(begin, cdcTagHistoryKeys.end), CLIENT_KNOBS->TOO_MANY); + for (const auto& kv : histories) { + const auto history = decodeCDCTagHistoryKey(kv.key); + allocator->observeStreamId(std::get<0>(history)); + allocator->observeTag(std::get<2>(history)); + } + if (!histories.more) { + break; + } + begin = keyAfter(histories.back().key); + } +} + +} // namespace + +Future registerNativeCdcStream(Database cx, Key name, KeyRange keys) { + validateNativeCdcStream(name, keys); + + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + const Key nameKey = cdcStreamNameKeyFor(name); + Optional currentId = co_await tr.get(nameKey); + if (currentId.present()) { + const CDCStreamId streamId = decodeCDCStreamNameValue(currentId.get()); + Optional currentKeys = co_await tr.get(cdcStreamKeyFor(streamId)); + if (!currentKeys.present() || 
decodeCDCStreamKeysValue(currentKeys.get()) != keys) { + throw client_invalid_operation(); + } + co_return streamId; + } + + NativeCdcIdentifierAllocator allocator; + co_await observeNativeCdcMetadata(&tr, &allocator); + const auto [streamId, tag] = allocator.allocate(); + const Version registrationVersion = co_await tr.getReadVersion(); + + tr.set(nameKey, cdcStreamNameValue(streamId)); + tr.set(cdcStreamKeyFor(streamId), cdcStreamKeysValue(keys)); + tr.set(cdcTagHistoryKeyFor(streamId, registrationVersion, tag), Value()); + tr.set(cdcMinVersionKeyFor(streamId), cdcMinVersionValue(registrationVersion)); + co_await tr.commit(); + co_return streamId; + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } +} + +Future removeNativeCdcStream(Database cx, Key name) { + if (name.empty()) { + throw client_invalid_operation(); + } + + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + const Key nameKey = cdcStreamNameKeyFor(name); + Optional currentId = co_await tr.get(nameKey); + if (!currentId.present()) { + co_return; + } + + const CDCStreamId streamId = decodeCDCStreamNameValue(currentId.get()); + tr.clear(nameKey); + tr.clear(cdcStreamKeyFor(streamId)); + tr.clear(cdcProxyRangeFor(streamId)); + // Retain tag history and minVersion until the pop/cleanup phase can + // safely release all durable mutations for this retired stream. 
+ co_await tr.commit(); + co_return; + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } +} + +Future> listNativeCdcStreams(Database cx) { + std::vector result; + Key begin = cdcStreamNameKeys.begin; + Transaction tr(cx); + + while (begin < cdcStreamNameKeys.end) { + Error err; + try { + tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE); + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + RangeResult names = co_await tr.getRange(KeyRangeRef(begin, cdcStreamNameKeys.end), CLIENT_KNOBS->TOO_MANY); + for (const auto& kv : names) { + const CDCStreamId streamId = decodeCDCStreamNameValue(kv.value); + Optional keys = co_await tr.get(cdcStreamKeyFor(streamId)); + Optional minVersion = co_await tr.get(cdcMinVersionKeyFor(streamId)); + if (keys.present() && minVersion.present()) { + result.push_back(NativeCdcStreamInfo{ decodeCDCStreamNameKey(kv.key), + streamId, + decodeCDCStreamKeysValue(keys.get()), + decodeCDCMinVersionValue(minVersion.get()) }); + } + } + if (!names.more) { + break; + } + begin = keyAfter(names.back().key); + continue; + } catch (Error& e) { + err = e; + } + result.clear(); + begin = cdcStreamNameKeys.begin; + co_await tr.onError(err); + } + co_return result; +} + +TEST_CASE("noSim/NativeCDC/LifecycleAllocation") { + NativeCdcIdentifierAllocator allocator; + auto [initialId, initialTag] = allocator.allocate(); + ASSERT(initialId == 1); + ASSERT(initialTag == Tag(tagLocalityCDC, 0)); + + allocator.observeStreamId(9); + allocator.observeTag(initialTag); + allocator.observeTag(Tag(tagLocalityCDC, 2)); + auto [nextId, nextTag] = allocator.allocate(); + ASSERT(nextId == 10); + ASSERT(nextTag == Tag(tagLocalityCDC, 1)); + + return Void(); +} diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index d84d8ccf183..623ec0e2bf0 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -880,13 +880,24 @@ Version decodeCDCMinVersionValue(ValueRef const& value) { return version; } -Key cdcProxyKeyFor(CDCStreamId 
streamId, UID proxyId) { +static Key cdcProxyPrefixFor(CDCStreamId streamId) { BinaryWriter wr(Unversioned()); wr.serializeBytes(cdcProxyKeys.begin); - wr << streamId << proxyId; + wr << streamId; return wr.toValue(); } +Key cdcProxyKeyFor(CDCStreamId streamId, UID proxyId) { + BinaryWriter wr(Unversioned()); + wr.serializeBytes(cdcProxyPrefixFor(streamId)); + wr << proxyId; + return wr.toValue(); +} + +KeyRange cdcProxyRangeFor(CDCStreamId streamId) { + return prefixRange(cdcProxyPrefixFor(streamId)); +} + std::pair decodeCDCProxyKey(KeyRef const& key) { CDCStreamId streamId; UID proxyId; @@ -1797,6 +1808,7 @@ TEST_CASE("noSim/SystemData/NativeCDC") { const auto [proxyStreamId, decodedProxyId] = decodeCDCProxyKey(cdcProxyKeyFor(streamId, proxyId)); ASSERT(proxyStreamId == streamId); ASSERT(decodedProxyId == proxyId); + ASSERT(cdcProxyRangeFor(streamId).contains(cdcProxyKeyFor(streamId, proxyId))); return Void(); } diff --git a/fdbclient/include/fdbclient/NativeCdc.h b/fdbclient/include/fdbclient/NativeCdc.h new file mode 100644 index 00000000000..39c3b04142f --- /dev/null +++ b/fdbclient/include/fdbclient/NativeCdc.h @@ -0,0 +1,42 @@ +/* + * NativeCdc.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2026 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef FDBCLIENT_NATIVECDC_H +#define FDBCLIENT_NATIVECDC_H +#pragma once + +#include + +#include "fdbclient/NativeAPI.actor.h" + +struct NativeCdcStreamInfo { + Key name; + CDCStreamId streamId = 0; + KeyRange keys; + Version minVersion = invalidVersion; +}; + +// These durable metadata operations are intended to back CDCProxyInterface +// lifecycle requests once CDC proxies are recruited. +Future registerNativeCdcStream(Database cx, Key name, KeyRange keys); +Future removeNativeCdcStream(Database cx, Key name); +Future> listNativeCdcStreams(Database cx); + +#endif // FDBCLIENT_NATIVECDC_H diff --git a/fdbclient/include/fdbclient/SystemData.h b/fdbclient/include/fdbclient/SystemData.h index e783d3e3939..e83685409ab 100644 --- a/fdbclient/include/fdbclient/SystemData.h +++ b/fdbclient/include/fdbclient/SystemData.h @@ -296,6 +296,7 @@ Version decodeCDCMinVersionValue(ValueRef const& value); // "\xff/cdc/proxies/[[CDCStreamId]][[proxyUID]]" := "" extern const KeyRangeRef cdcProxyKeys; Key cdcProxyKeyFor(CDCStreamId streamId, UID proxyId); +KeyRange cdcProxyRangeFor(CDCStreamId streamId); std::pair decodeCDCProxyKey(KeyRef const& key); // "\xff\x02/datacenterReplicas/[[datacenterID]]" := "[[replicas]]" diff --git a/fdbserver/workloads/NativeCdc.cpp b/fdbserver/workloads/NativeCdc.cpp new file mode 100644 index 00000000000..de2e50f4b52 --- /dev/null +++ b/fdbserver/workloads/NativeCdc.cpp @@ -0,0 +1,113 @@ +/* + * NativeCdc.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2026 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include "fdbclient/NativeCdc.h" +#include "fdbclient/SystemData.h" +#include "fdbserver/tester/workloads.h" + +struct NativeCdcWorkload : TestWorkload { + static constexpr auto NAME = "NativeCdc"; + + explicit NativeCdcWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {} + + Future setup(Database const& cx) override { return Void(); } + + Future start(Database const& cx) override { + if (clientId != 0) { + return Void(); + } + return run(cx); + } + + Future check(Database const& cx) override { return true; } + + void getMetrics(std::vector& m) override {} + + Future> getPersistedRoute(Database cx, CDCStreamId streamId) { + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + Optional keys = co_await tr.get(cdcStreamKeyFor(streamId)); + Optional minVersion = co_await tr.get(cdcMinVersionKeyFor(streamId)); + RangeResult history = co_await tr.getRange(cdcTagHistoryRangeFor(streamId), 2); + ASSERT(keys.present()); + ASSERT(minVersion.present()); + ASSERT(history.size() == 1); + const auto [historyStreamId, historyVersion, tag] = decodeCDCTagHistoryKey(history[0].key); + ASSERT(historyStreamId == streamId); + ASSERT(historyVersion == decodeCDCMinVersionValue(minVersion.get())); + co_return std::make_pair(tag, historyVersion); + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } + } + + Future run(Database cx) { + const Key firstName = "native-cdc-first"_sr; + const Key secondName = "native-cdc-second"_sr; + const KeyRange 
firstRange(KeyRangeRef("a"_sr, "m"_sr)); + const KeyRange conflictingRange(KeyRangeRef("a"_sr, "z"_sr)); + const KeyRange secondRange(KeyRangeRef("g"_sr, "z"_sr)); + + const CDCStreamId firstId = co_await registerNativeCdcStream(cx, firstName, firstRange); + ASSERT(co_await registerNativeCdcStream(cx, firstName, firstRange) == firstId); + + bool conflictingDuplicateRejected = false; + try { + co_await registerNativeCdcStream(cx, firstName, conflictingRange); + } catch (Error& e) { + if (e.code() == error_code_client_invalid_operation) { + conflictingDuplicateRejected = true; + } else { + throw; + } + } + ASSERT(conflictingDuplicateRejected); + + const auto firstRoute = co_await getPersistedRoute(cx, firstId); + ASSERT(firstRoute.first.locality == tagLocalityCDC); + + std::vector streams = co_await listNativeCdcStreams(cx); + ASSERT(streams.size() == 1); + ASSERT(streams[0].name == firstName); + ASSERT(streams[0].streamId == firstId); + ASSERT(streams[0].keys == firstRange); + ASSERT(streams[0].minVersion == firstRoute.second); + + co_await removeNativeCdcStream(cx, firstName); + ASSERT((co_await listNativeCdcStreams(cx)).empty()); + + const CDCStreamId secondId = co_await registerNativeCdcStream(cx, secondName, secondRange); + const auto secondRoute = co_await getPersistedRoute(cx, secondId); + ASSERT(secondId > firstId); + ASSERT(secondRoute.first != firstRoute.first); + + co_await removeNativeCdcStream(cx, secondName); + } +}; + +WorkloadFactory NativeCdcWorkloadFactory; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 7915120ceea..7fbac15f4c2 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -197,6 +197,7 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES fast/RandomSelector.toml) add_fdb_test(TEST_FILES fast/RandomUnitTests.toml) add_fdb_test(TEST_FILES fast/RangeLocking.toml) + add_fdb_test(TEST_FILES fast/NativeCdc.toml) add_fdb_test(TEST_FILES fast/RangeLockCycle.toml) add_fdb_test(TEST_FILES fast/ReadHotDetectionCorrectness.toml IGNORE) 
# TODO re-enable once read hot detection is enabled. add_fdb_test(TEST_FILES fast/ReportConflictingKeys.toml) diff --git a/tests/fast/NativeCdc.toml b/tests/fast/NativeCdc.toml new file mode 100644 index 00000000000..24d602a48f1 --- /dev/null +++ b/tests/fast/NativeCdc.toml @@ -0,0 +1,6 @@ +[[test]] +testTitle = 'NativeCdc' +useDB = true + + [[test.workload]] + testName = 'NativeCdc' From a28711d675ebf6e53298b8a7bfcd3612e55ad741 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Sat, 23 May 2026 17:16:02 -0700 Subject: [PATCH 04/56] Fix correctness regressions --- fdbclient/include/fdbclient/CommitProxyInterface.h | 7 ------- fdbserver/logsystem/ApplyMetadataMutation.cpp | 13 +++++++++++-- .../fdbserver/logsystem/ApplyMetadataMutation.h | 1 + 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/fdbclient/include/fdbclient/CommitProxyInterface.h b/fdbclient/include/fdbclient/CommitProxyInterface.h index fd4cdfa2968..5fa33fae819 100644 --- a/fdbclient/include/fdbclient/CommitProxyInterface.h +++ b/fdbclient/include/fdbclient/CommitProxyInterface.h @@ -22,11 +22,9 @@ #define FDBCLIENT_COMMITPROXYINTERFACE_H #pragma once -#include #include #include -#include "fdbclient/CDCProxyInterface.h" #include "fdbclient/CommitTransaction.h" #include "fdbclient/FDBTypes.h" #include "fdbclient/GlobalConfig.h" @@ -113,8 +111,6 @@ struct ClientDBInfo { UID id; // Changes each time anything else changes std::vector grvProxies; std::vector commitProxies; - std::vector cdcProxies; - std::map streamToCDCProxyId; Optional firstCommitProxy; // not serialized, used for commitOnFirstProxy when the commit proxies vector has been shrunk Optional forward; @@ -134,9 +130,6 @@ struct ClientDBInfo { ASSERT(ar.protocolVersion().isValid()); } serializer(ar, grvProxies, commitProxies, id, forward, history, clusterId, clusterType); - if (ar.protocolVersion().hasNativeCdc()) { - serializer(ar, cdcProxies, streamToCDCProxyId); - } } }; diff --git 
a/fdbserver/logsystem/ApplyMetadataMutation.cpp b/fdbserver/logsystem/ApplyMetadataMutation.cpp index e3337ad705f..7b3754827f9 100644 --- a/fdbserver/logsystem/ApplyMetadataMutation.cpp +++ b/fdbserver/logsystem/ApplyMetadataMutation.cpp @@ -49,6 +49,10 @@ Reference getStorageInfo(UID id, return storageInfo; } +CDCRoutingTable::CDCRoutingTable() { + tagsByRange.insert(allKeys, std::set()); +} + void CDCRoutingTable::rebuildRanges() { tagsByRange.insert(allKeys, std::set()); for (const auto& [streamId, state] : streams) { @@ -78,10 +82,12 @@ void CDCRoutingTable::setTag(CDCStreamId streamId, Version version, Tag tag) { void CDCRoutingTable::reload(IKeyValueStore* txnStateStore) { streams.clear(); - for (const auto& kv : txnStateStore->readRange(cdcStreamKeys).get()) { + const RangeResult streamRows = txnStateStore->readRange(cdcStreamKeys).get(); + for (const auto& kv : streamRows) { setRange(decodeCDCStreamKey(kv.key), decodeCDCStreamKeysValue(kv.value)); } - for (const auto& kv : txnStateStore->readRange(cdcTagHistoryKeys).get()) { + const RangeResult tagHistoryRows = txnStateStore->readRange(cdcTagHistoryKeys).get(); + for (const auto& kv : tagHistoryRows) { const auto [streamId, version, tag] = decodeCDCTagHistoryKey(kv.key); setTag(streamId, version, tag); } @@ -1354,6 +1360,9 @@ TEST_CASE("noSim/NativeCDC/RoutingTable") { const Tag overlappingTag(tagLocalityCDC, 2); const Tag rotatedOrdersTag(tagLocalityCDC, 3); + ASSERT(table.tagsForKey("b"_sr).empty()); + ASSERT(table.tagsForRange(KeyRangeRef("b"_sr, "x"_sr)).empty()); + table.setRange(1, KeyRangeRef("a"_sr, "m"_sr)); table.setTag(1, 100, ordersTag); table.setRange(2, KeyRangeRef("g"_sr, "z"_sr)); diff --git a/fdbserver/logsystem/include/fdbserver/logsystem/ApplyMetadataMutation.h b/fdbserver/logsystem/include/fdbserver/logsystem/ApplyMetadataMutation.h index ad5578a5957..17663d58a06 100644 --- a/fdbserver/logsystem/include/fdbserver/logsystem/ApplyMetadataMutation.h +++ 
b/fdbserver/logsystem/include/fdbserver/logsystem/ApplyMetadataMutation.h @@ -69,6 +69,7 @@ class CDCRoutingTable : NonCopyable { void rebuildRanges(); public: + CDCRoutingTable(); void setRange(CDCStreamId streamId, KeyRangeRef const& keys); void setTag(CDCStreamId streamId, Version version, Tag tag); void reload(IKeyValueStore* txnStateStore); From 46e41a0e4675529124da0444d82bc1b545169781 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Sun, 24 May 2026 12:31:06 -0700 Subject: [PATCH 05/56] Add native CDC acknowledgement watermarks --- fdbclient/NativeCdc.cpp | 33 +++++++++++++++++++++++++ fdbclient/include/fdbclient/NativeCdc.h | 3 +++ fdbserver/workloads/NativeCdc.cpp | 28 +++++++++++++++++++++ 3 files changed, 64 insertions(+) diff --git a/fdbclient/NativeCdc.cpp b/fdbclient/NativeCdc.cpp index 2e8c7e9c0cb..ab562e92908 100644 --- a/fdbclient/NativeCdc.cpp +++ b/fdbclient/NativeCdc.cpp @@ -208,6 +208,39 @@ Future> listNativeCdcStreams(Database cx) { co_return result; } +Future acknowledgeNativeCdcStream(Database cx, CDCStreamId streamId, Version consumedThrough) { + if (streamId == 0 || consumedThrough < 0 || consumedThrough == std::numeric_limits::max()) { + throw client_invalid_operation(); + } + const Version minUnpoppedVersion = consumedThrough + 1; + + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + Optional minVersionValue = co_await tr.get(cdcMinVersionKeyFor(streamId)); + if (!minVersionValue.present()) { + throw client_invalid_operation(); + } + + const Version minVersion = decodeCDCMinVersionValue(minVersionValue.get()); + if (minUnpoppedVersion <= minVersion) { + co_return minVersion; + } + + tr.set(cdcMinVersionKeyFor(streamId), cdcMinVersionValue(minUnpoppedVersion)); + co_await tr.commit(); + co_return minUnpoppedVersion; + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } +} + 
TEST_CASE("noSim/NativeCDC/LifecycleAllocation") { NativeCdcIdentifierAllocator allocator; auto [initialId, initialTag] = allocator.allocate(); diff --git a/fdbclient/include/fdbclient/NativeCdc.h b/fdbclient/include/fdbclient/NativeCdc.h index 39c3b04142f..0c200f71af4 100644 --- a/fdbclient/include/fdbclient/NativeCdc.h +++ b/fdbclient/include/fdbclient/NativeCdc.h @@ -38,5 +38,8 @@ struct NativeCdcStreamInfo { Future registerNativeCdcStream(Database cx, Key name, KeyRange keys); Future removeNativeCdcStream(Database cx, Key name); Future> listNativeCdcStreams(Database cx); +// Persists the exclusive unpopped watermark after consuming through a version. +// Removed streams remain acknowledgeable while retained CDC log data is drained. +Future acknowledgeNativeCdcStream(Database cx, CDCStreamId streamId, Version consumedThrough); #endif // FDBCLIENT_NATIVECDC_H diff --git a/fdbserver/workloads/NativeCdc.cpp b/fdbserver/workloads/NativeCdc.cpp index de2e50f4b52..02c931dcf59 100644 --- a/fdbserver/workloads/NativeCdc.cpp +++ b/fdbserver/workloads/NativeCdc.cpp @@ -66,6 +66,22 @@ struct NativeCdcWorkload : TestWorkload { } } + Future getPersistedMinVersion(Database cx, CDCStreamId streamId) { + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + Optional minVersion = co_await tr.get(cdcMinVersionKeyFor(streamId)); + ASSERT(minVersion.present()); + co_return decodeCDCMinVersionValue(minVersion.get()); + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } + } + Future run(Database cx) { const Key firstName = "native-cdc-first"_sr; const Key secondName = "native-cdc-second"_sr; @@ -98,8 +114,20 @@ struct NativeCdcWorkload : TestWorkload { ASSERT(streams[0].keys == firstRange); ASSERT(streams[0].minVersion == firstRoute.second); + const Version firstConsumedThrough = firstRoute.second + 5; + const Version firstAckMinVersion = firstConsumedThrough + 1; + ASSERT(co_await 
acknowledgeNativeCdcStream(cx, firstId, firstConsumedThrough) == firstAckMinVersion); + ASSERT(co_await acknowledgeNativeCdcStream(cx, firstId, firstRoute.second) == firstAckMinVersion); + streams = co_await listNativeCdcStreams(cx); + ASSERT(streams.size() == 1); + ASSERT(streams[0].minVersion == firstAckMinVersion); + co_await removeNativeCdcStream(cx, firstName); ASSERT((co_await listNativeCdcStreams(cx)).empty()); + const Version retiredConsumedThrough = firstConsumedThrough + 5; + const Version retiredAckMinVersion = retiredConsumedThrough + 1; + ASSERT(co_await acknowledgeNativeCdcStream(cx, firstId, retiredConsumedThrough) == retiredAckMinVersion); + ASSERT(co_await getPersistedMinVersion(cx, firstId) == retiredAckMinVersion); const CDCStreamId secondId = co_await registerNativeCdcStream(cx, secondName, secondRange); const auto secondRoute = co_await getPersistedRoute(cx, secondId); From 0bc57992e5c546816ef35af4c4d4776a880c5150 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Sun, 24 May 2026 13:37:16 -0700 Subject: [PATCH 06/56] Add native CDC proxy consume and acknowledge backend --- fdbserver/CMakeLists.txt | 2 + fdbserver/cdcproxy/CDCProxy.cpp | 317 ++++++++++++++++++ fdbserver/cdcproxy/CMakeLists.txt | 13 + .../include/fdbserver/cdcproxy/CDCProxy.h | 30 ++ 4 files changed, 362 insertions(+) create mode 100644 fdbserver/cdcproxy/CDCProxy.cpp create mode 100644 fdbserver/cdcproxy/CMakeLists.txt create mode 100644 fdbserver/cdcproxy/include/fdbserver/cdcproxy/CDCProxy.h diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index b40708092ad..e1491384135 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -29,6 +29,7 @@ add_subdirectory(core) add_subdirectory(kvstore) add_subdirectory(logsystem) add_subdirectory(mocks3) +add_subdirectory(cdcproxy) add_subdirectory(clustercontroller) add_subdirectory(backupworker) add_subdirectory(commitproxy) @@ -168,6 +169,7 @@ target_link_libraries(fdbserver PRIVATE "$" 
fdbserver_worker fdbserver_backupworker + "$" fdbserver_clustercontroller fdbserver_commitproxy fdbserver_consistencyscan diff --git a/fdbserver/cdcproxy/CDCProxy.cpp b/fdbserver/cdcproxy/CDCProxy.cpp new file mode 100644 index 00000000000..d97b3ceef80 --- /dev/null +++ b/fdbserver/cdcproxy/CDCProxy.cpp @@ -0,0 +1,317 @@ +/* + * CDCProxy.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2026 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include "fdbclient/Knobs.h" +#include "fdbclient/NativeCdc.h" +#include "fdbclient/SystemData.h" +#include "fdbserver/cdcproxy/CDCProxy.h" +#include "fdbserver/core/LogProtocolMessage.h" +#include "fdbserver/core/OTELSpanContextMessage.h" +#include "fdbserver/core/ServerDBInfo.h" +#include "fdbserver/core/SpanContextMessage.h" +#include "fdbserver/core/WaitFailure.h" +#include "fdbserver/core/WorkerInterface.actor.h" +#include "fdbserver/logsystem/LogSystemConsumer.h" +#include "fdbserver/logsystem/LogSystemFactory.h" +#include "flow/ActorCollection.h" +#include "flow/Error.h" +#include "flow/UnitTest.h" + +namespace { + +struct CDCStreamReadState { + Optional keys; + Version minVersion = invalidVersion; + Tag currentTag = invalidTag; + std::vector> tagHistory; +}; + +struct CDCProxyData { + UID id; + Database cx; + Reference const> dbInfo; + Reference logSystem; + + CDCProxyData(CDCProxyInterface const& proxy, Reference const> dbInfo) + : id(proxy.id()), cx(openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, LockAware::True)), dbInfo(dbInfo) {} +}; + +Optional clipCDCMutation(MutationRef const& mutation, KeyRangeRef const& keys) { + if (isSingleKeyMutation((MutationRef::Type)mutation.type)) { + if (keys.contains(mutation.param1)) { + return mutation; + } + } else if (mutation.type == MutationRef::ClearRange) { + KeyRangeRef intersection = keys & KeyRangeRef(mutation.param1, mutation.param2); + if (!intersection.empty()) { + return MutationRef(MutationRef::ClearRange, intersection.begin, intersection.end); + } + } else { + ASSERT(false); + } + return Optional(); +} + +Future readCDCStreamState(Database cx, CDCStreamId streamId, bool requireKeys) { + if (streamId == 0) { + throw client_invalid_operation(); + } + + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE); + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + + CDCStreamReadState 
result; + Optional keysValue = co_await tr.get(cdcStreamKeyFor(streamId)); + if (keysValue.present()) { + result.keys = decodeCDCStreamKeysValue(keysValue.get()); + } else if (requireKeys) { + throw client_invalid_operation(); + } + + Optional minVersionValue = co_await tr.get(cdcMinVersionKeyFor(streamId)); + if (!minVersionValue.present()) { + throw client_invalid_operation(); + } + result.minVersion = decodeCDCMinVersionValue(minVersionValue.get()); + + std::vector> tagAssignments; + KeyRange tagHistoryRange = cdcTagHistoryRangeFor(streamId); + Key begin = tagHistoryRange.begin; + while (begin < tagHistoryRange.end) { + RangeResult history = + co_await tr.getRange(KeyRangeRef(begin, tagHistoryRange.end), CLIENT_KNOBS->TOO_MANY); + for (KeyValueRef const& kv : history) { + const auto [historyStreamId, version, tag] = decodeCDCTagHistoryKey(kv.key); + ASSERT_WE_THINK(historyStreamId == streamId); + ASSERT_WE_THINK(tag.locality == tagLocalityCDC); + tagAssignments.emplace_back(version, tag); + } + if (!history.more) { + break; + } + begin = keyAfter(history.back().key); + } + if (tagAssignments.empty()) { + throw client_invalid_operation(); + } + + result.currentTag = tagAssignments.back().second; + for (int i = tagAssignments.size() - 1; i > 0; --i) { + result.tagHistory.emplace_back(tagAssignments[i].first, tagAssignments[i - 1].second); + } + co_return result; + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } +} + +Future consume(CDCProxyData* self, CDCConsumeRequest request) { + try { + if (request.cursor.lastConsumedVersion < invalidVersion || + request.cursor.lastConsumedVersion == std::numeric_limits::max()) { + throw client_invalid_operation(); + } + + CDCStreamReadState state = co_await readCDCStreamState(self->cx, request.cursor.streamId, true); + Version begin = request.cursor.lastConsumedVersion == invalidVersion ? 
state.minVersion + : request.cursor.lastConsumedVersion + 1; + if (begin < state.minVersion) { + throw transaction_too_old(); + } + + Reference cursor = + self->logSystem->peekSingle(self->id, begin, state.currentTag, state.tagHistory); + cursor->setProtocolVersion(g_network->protocolVersion()); + co_await cursor->getMore(TaskPriority::TLogPeekReply); + if (cursor->popped() > begin) { + throw transaction_too_old(); + } + + CDCConsumeReply reply; + reply.lastConsumedVersion = request.cursor.lastConsumedVersion; + while (cursor->hasMessage()) { + const Version messageVersion = cursor->version().version; + ArenaReader& reader = *cursor->reader(); + if (LogProtocolMessage::isNextIn(reader)) { + LogProtocolMessage protocolMessage; + reader >> protocolMessage; + cursor->setProtocolVersion(reader.protocolVersion()); + } else if (reader.protocolVersion().hasSpanContext() && SpanContextMessage::isNextIn(reader)) { + SpanContextMessage contextMessage; + reader >> contextMessage; + } else if (reader.protocolVersion().hasOTELSpanContext() && OTELSpanContextMessage::isNextIn(reader)) { + OTELSpanContextMessage contextMessage; + reader >> contextMessage; + } else { + MutationRef mutation; + reader >> mutation; + Optional clipped = clipCDCMutation(mutation, state.keys.get()); + if (clipped.present()) { + if (reply.mutations.empty() || reply.mutations.back().version != messageVersion) { + reply.mutations.push_back(reply.arena, VersionedMutationsRef(messageVersion, {})); + } + reply.mutations.back().mutations.push_back_deep(reply.arena, clipped.get()); + } + } + reply.lastConsumedVersion = std::max(reply.lastConsumedVersion, messageVersion); + cursor->nextMessage(); + } + request.reply.send(reply); + } catch (Error& e) { + request.reply.sendError(e); + } + co_return; +} + +Future acknowledge(CDCProxyData* self, CDCAckRequest request) { + try { + CDCStreamReadState state = co_await readCDCStreamState(self->cx, request.streamId, false); + const Version minVersion = co_await 
acknowledgeNativeCdcStream(self->cx, request.streamId, request.version); + std::set tags{ state.currentTag }; + for (const auto& history : state.tagHistory) { + tags.insert(history.second); + } + for (Tag tag : tags) { + self->logSystem->pop(minVersion, tag); + } + request.reply.send(Void()); + } catch (Error& e) { + request.reply.sendError(e); + } + co_return; +} + +Future registerStream(CDCProxyData* self, CDCRegisterStreamRequest request) { + try { + const CDCStreamId streamId = co_await registerNativeCdcStream(self->cx, request.name, request.keys); + request.reply.send(CDCRegisterStreamReply(streamId)); + } catch (Error& e) { + request.reply.sendError(e); + } + co_return; +} + +Future removeStream(CDCProxyData* self, CDCRemoveStreamRequest request) { + try { + co_await removeNativeCdcStream(self->cx, request.name); + request.reply.send(Void()); + } catch (Error& e) { + request.reply.sendError(e); + } + co_return; +} + +Future listStreams(CDCProxyData* self, CDCListStreamsRequest request) { + try { + std::vector streams = co_await listNativeCdcStreams(self->cx); + CDCListStreamsReply reply; + for (NativeCdcStreamInfo const& stream : streams) { + reply.streams.push_back(reply.arena, + CDCStreamInfoRef(StringRef(reply.arena, stream.name), + stream.streamId, + KeyRangeRef(reply.arena, stream.keys), + stream.minVersion)); + } + request.reply.send(reply); + } catch (Error& e) { + request.reply.sendError(e); + } + co_return; +} + +} // namespace + +Future cdcProxyServer(CDCProxyInterface proxy, Reference const> dbInfo) { + CDCProxyData self(proxy, dbInfo); + ActorCollection actors(false); + + actors.add(waitFailureServer(proxy.waitFailure.getFuture())); + self.logSystem = makeLogSystemConsumerFromServerDBInfo(self.id, dbInfo->get()); + Future dbInfoChange = dbInfo->onChange(); + + while (true) { + auto result = co_await race(proxy.consume.getFuture(), + proxy.ack.getFuture(), + proxy.registerStream.getFuture(), + proxy.removeStream.getFuture(), + 
proxy.listStreams.getFuture(), + dbInfoChange, + actors.getResult()); + switch (result.index()) { + case 0: + actors.add(consume(&self, std::get<0>(std::move(result)))); + break; + case 1: + actors.add(acknowledge(&self, std::get<1>(std::move(result)))); + break; + case 2: + actors.add(registerStream(&self, std::get<2>(std::move(result)))); + break; + case 3: + actors.add(removeStream(&self, std::get<3>(std::move(result)))); + break; + case 4: + actors.add(listStreams(&self, std::get<4>(std::move(result)))); + break; + case 5: + self.logSystem = makeLogSystemConsumerFromServerDBInfo(self.id, dbInfo->get()); + dbInfoChange = dbInfo->onChange(); + break; + case 6: + co_await actors.getResult(); + break; + default: + ASSERT(false); + } + } +} + +TEST_CASE("noSim/NativeCDC/ProxyMutationFiltering") { + const KeyRangeRef keys("c"_sr, "m"_sr); + + Optional inRange = clipCDCMutation(MutationRef(MutationRef::SetValue, "d"_sr, "value"_sr), keys); + ASSERT(inRange.present()); + ASSERT(inRange.get().param1 == "d"_sr); + + Optional outOfRange = clipCDCMutation(MutationRef(MutationRef::SetValue, "z"_sr, "value"_sr), keys); + ASSERT(!outOfRange.present()); + + Optional clippedClear = clipCDCMutation(MutationRef(MutationRef::ClearRange, "a"_sr, "f"_sr), keys); + ASSERT(clippedClear.present()); + ASSERT(clippedClear.get().param1 == "c"_sr); + ASSERT(clippedClear.get().param2 == "f"_sr); + + Optional excludedClear = clipCDCMutation(MutationRef(MutationRef::ClearRange, "n"_sr, "z"_sr), keys); + ASSERT(!excludedClear.present()); + + return Void(); +} diff --git a/fdbserver/cdcproxy/CMakeLists.txt b/fdbserver/cdcproxy/CMakeLists.txt new file mode 100644 index 00000000000..63019e0fa9e --- /dev/null +++ b/fdbserver/cdcproxy/CMakeLists.txt @@ -0,0 +1,13 @@ +fdb_find_sources(FDBSERVER_CDCPROXY_SRCS) + +add_flow_target(STATIC_LIBRARY NAME fdbserver_cdcproxy SRCS ${FDBSERVER_CDCPROXY_SRCS}) +add_fdbserver_link_test(fdbserver_cdcproxylinktest + fdbserver_cdcproxy + fdbserver_logsystem + 
fdbserver_core) + +configure_fdbserver_common_includes(fdbserver_cdcproxy) +target_include_directories(fdbserver_cdcproxy + PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/include) +target_link_libraries(fdbserver_cdcproxy PUBLIC fdbserver_logsystem fdbserver_core) diff --git a/fdbserver/cdcproxy/include/fdbserver/cdcproxy/CDCProxy.h b/fdbserver/cdcproxy/include/fdbserver/cdcproxy/CDCProxy.h new file mode 100644 index 00000000000..4c8cb1235f6 --- /dev/null +++ b/fdbserver/cdcproxy/include/fdbserver/cdcproxy/CDCProxy.h @@ -0,0 +1,30 @@ +/* + * CDCProxy.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2026 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "fdbclient/CDCProxyInterface.h" +#include "flow/flow.h" + +struct ServerDBInfo; + +// Implements CDCProxyInterface once a worker/cluster-controller recruitment +// path publishes a CDCProxyInterface to clients. 
+Future cdcProxyServer(CDCProxyInterface proxy, Reference const> dbInfo); From 7203fe5316ea6edef0b12d67196307dfb5f2a15d Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Sun, 24 May 2026 16:26:13 -0700 Subject: [PATCH 07/56] Recruit and publish CDC proxy endpoints through recovery routing --- .../include/fdbclient/CommitProxyInterface.h | 9 +- fdbserver/cdcproxy/CDCProxy.cpp | 95 +++++++----- .../include/fdbserver/cdcproxy/CDCProxy.h | 6 +- .../ClusterController.actor.cpp | 12 +- .../clustercontroller/ClusterRecovery.cpp | 38 +++++ fdbserver/clustercontroller/ClusterRecovery.h | 1 + fdbserver/core/WorkerSupport.cpp | 4 + .../fdbserver/core/WorkerInterface.actor.h | 146 +++++++++++++----- fdbserver/logsystem/LogSystemConsumer.cpp | 23 +-- fdbserver/worker/CMakeLists.txt | 2 + fdbserver/worker/worker.actor.cpp | 21 +++ fdbserver/workloads/NativeCdc.cpp | 49 ++++++ flow/include/flow/error_definitions.h | 1 + 13 files changed, 315 insertions(+), 92 deletions(-) diff --git a/fdbclient/include/fdbclient/CommitProxyInterface.h b/fdbclient/include/fdbclient/CommitProxyInterface.h index 5fa33fae819..25d1e96511d 100644 --- a/fdbclient/include/fdbclient/CommitProxyInterface.h +++ b/fdbclient/include/fdbclient/CommitProxyInterface.h @@ -26,6 +26,7 @@ #include #include "fdbclient/CommitTransaction.h" +#include "fdbclient/CDCProxyInterface.h" #include "fdbclient/FDBTypes.h" #include "fdbclient/GlobalConfig.h" #include "fdbclient/GrvProxyInterface.h" @@ -111,6 +112,7 @@ struct ClientDBInfo { UID id; // Changes each time anything else changes std::vector grvProxies; std::vector commitProxies; + std::vector cdcProxies; Optional firstCommitProxy; // not serialized, used for commitOnFirstProxy when the commit proxies vector has been shrunk Optional forward; @@ -128,8 +130,13 @@ struct ClientDBInfo { void serialize(Archive& ar) { if constexpr (!is_fb_function) { ASSERT(ar.protocolVersion().isValid()); + serializer(ar, grvProxies, commitProxies, id, forward, history, clusterId, 
clusterType); + if (ar.protocolVersion().hasNativeCdc()) { + serializer(ar, cdcProxies); + } + } else { + serializer(ar, grvProxies, commitProxies, id, forward, history, clusterId, clusterType, cdcProxies); } - serializer(ar, grvProxies, commitProxies, id, forward, history, clusterId, clusterType); } }; diff --git a/fdbserver/cdcproxy/CDCProxy.cpp b/fdbserver/cdcproxy/CDCProxy.cpp index d97b3ceef80..14e44c3fd76 100644 --- a/fdbserver/cdcproxy/CDCProxy.cpp +++ b/fdbserver/cdcproxy/CDCProxy.cpp @@ -150,8 +150,8 @@ Future consume(CDCProxyData* self, CDCConsumeRequest request) { Reference cursor = self->logSystem->peekSingle(self->id, begin, state.currentTag, state.tagHistory); - cursor->setProtocolVersion(g_network->protocolVersion()); co_await cursor->getMore(TaskPriority::TLogPeekReply); + cursor->setProtocolVersion(g_network->protocolVersion()); if (cursor->popped() > begin) { throw transaction_too_old(); } @@ -250,47 +250,62 @@ Future listStreams(CDCProxyData* self, CDCListStreamsRequest request) { } // namespace -Future cdcProxyServer(CDCProxyInterface proxy, Reference const> dbInfo) { - CDCProxyData self(proxy, dbInfo); - ActorCollection actors(false); +Future cdcProxyServer(CDCProxyInterface proxy, + uint64_t recoveryCount, + Reference const> dbInfo) { + try { + CDCProxyData self(proxy, dbInfo); + ActorCollection actors(false); - actors.add(waitFailureServer(proxy.waitFailure.getFuture())); - self.logSystem = makeLogSystemConsumerFromServerDBInfo(self.id, dbInfo->get()); - Future dbInfoChange = dbInfo->onChange(); + actors.add(waitFailureServer(proxy.waitFailure.getFuture())); + actors.add(traceRole(Role::CDC_PROXY, proxy.id())); + self.logSystem = makeLogSystemConsumerFromServerDBInfo(self.id, dbInfo->get()); + Future dbInfoChange = dbInfo->onChange(); - while (true) { - auto result = co_await race(proxy.consume.getFuture(), - proxy.ack.getFuture(), - proxy.registerStream.getFuture(), - proxy.removeStream.getFuture(), - proxy.listStreams.getFuture(), - 
dbInfoChange, - actors.getResult()); - switch (result.index()) { - case 0: - actors.add(consume(&self, std::get<0>(std::move(result)))); - break; - case 1: - actors.add(acknowledge(&self, std::get<1>(std::move(result)))); - break; - case 2: - actors.add(registerStream(&self, std::get<2>(std::move(result)))); - break; - case 3: - actors.add(removeStream(&self, std::get<3>(std::move(result)))); - break; - case 4: - actors.add(listStreams(&self, std::get<4>(std::move(result)))); - break; - case 5: - self.logSystem = makeLogSystemConsumerFromServerDBInfo(self.id, dbInfo->get()); - dbInfoChange = dbInfo->onChange(); - break; - case 6: - co_await actors.getResult(); - break; - default: - ASSERT(false); + while (true) { + auto result = co_await race(proxy.consume.getFuture(), + proxy.ack.getFuture(), + proxy.registerStream.getFuture(), + proxy.removeStream.getFuture(), + proxy.listStreams.getFuture(), + dbInfoChange, + actors.getResult()); + switch (result.index()) { + case 0: + actors.add(consume(&self, std::get<0>(std::move(result)))); + break; + case 1: + actors.add(acknowledge(&self, std::get<1>(std::move(result)))); + break; + case 2: + actors.add(registerStream(&self, std::get<2>(std::move(result)))); + break; + case 3: + actors.add(removeStream(&self, std::get<3>(std::move(result)))); + break; + case 4: + actors.add(listStreams(&self, std::get<4>(std::move(result)))); + break; + case 5: + if (dbInfo->get().recoveryCount >= recoveryCount && + std::find(dbInfo->get().client.cdcProxies.begin(), dbInfo->get().client.cdcProxies.end(), proxy) == + dbInfo->get().client.cdcProxies.end()) { + throw worker_removed(); + } + self.logSystem = makeLogSystemConsumerFromServerDBInfo(self.id, dbInfo->get()); + dbInfoChange = dbInfo->onChange(); + break; + case 6: + co_await actors.getResult(); + break; + default: + ASSERT(false); + } + } + } catch (Error& e) { + TraceEvent("CDCProxyTerminated", proxy.id()).errorUnsuppressed(e); + if (e.code() != error_code_worker_removed) { + 
throw; } } } diff --git a/fdbserver/cdcproxy/include/fdbserver/cdcproxy/CDCProxy.h b/fdbserver/cdcproxy/include/fdbserver/cdcproxy/CDCProxy.h index 4c8cb1235f6..f2756ab27b3 100644 --- a/fdbserver/cdcproxy/include/fdbserver/cdcproxy/CDCProxy.h +++ b/fdbserver/cdcproxy/include/fdbserver/cdcproxy/CDCProxy.h @@ -25,6 +25,6 @@ struct ServerDBInfo; -// Implements CDCProxyInterface once a worker/cluster-controller recruitment -// path publishes a CDCProxyInterface to clients. -Future cdcProxyServer(CDCProxyInterface proxy, Reference const> dbInfo); +Future cdcProxyServer(CDCProxyInterface proxy, + uint64_t recoveryCount, + Reference const> dbInfo); diff --git a/fdbserver/clustercontroller/ClusterController.actor.cpp b/fdbserver/clustercontroller/ClusterController.actor.cpp index 92f7308b295..8fb84de574d 100644 --- a/fdbserver/clustercontroller/ClusterController.actor.cpp +++ b/fdbserver/clustercontroller/ClusterController.actor.cpp @@ -236,6 +236,12 @@ bool ClusterControllerData::transactionSystemContainsDegradedServers() { } } + for (const auto& proxy : dbi.client.cdcProxies) { + if (proxy.addresses().contains(server)) { + return true; + } + } + for (const auto& resolver : dbi.resolvers) { if (resolver.addresses().contains(server)) { return true; @@ -1177,6 +1183,7 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co .detail("RegistrationCount", req.registrationCount) .detail("CommitProxies", req.commitProxies.size()) .detail("GrvProxies", req.grvProxies.size()) + .detail("CDCProxies", req.cdcProxies.size()) .detail("RecoveryCount", req.recoveryCount) .detail("Stalled", req.recoveryStalled) .detail("OldestBackupEpoch", req.logSystemConfig.oldestBackupEpoch); @@ -1236,7 +1243,7 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co // Construct the client information if (db->clientInfo->get().commitProxies != req.commitProxies || - db->clientInfo->get().grvProxies != req.grvProxies || + 
db->clientInfo->get().grvProxies != req.grvProxies || db->clientInfo->get().cdcProxies != req.cdcProxies || db->clientInfo->get().clusterId != db->serverInfo->get().client.clusterId || db->clientInfo->get().clusterType != db->clusterType) { TraceEvent("PublishNewClientInfo", self->id) @@ -1246,6 +1253,8 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co .detail("CommitProxies", db->clientInfo->get().commitProxies) .detail("GlobalConfigHistorySize", db->clientInfo->get().history.size()) .detail("ReqCPs", req.commitProxies) + .detail("CDCProxies", db->clientInfo->get().cdcProxies) + .detail("ReqCDCProxies", req.cdcProxies) .detail("ClusterId", db->serverInfo->get().client.clusterId) .detail("ClientClusterId", db->clientInfo->get().clusterId) .detail("ClusterType", db->clientInfo->get().clusterType) @@ -1256,6 +1265,7 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co clientInfo.id = deterministicRandom()->randomUniqueID(); clientInfo.commitProxies = req.commitProxies; clientInfo.grvProxies = req.grvProxies; + clientInfo.cdcProxies = req.cdcProxies; clientInfo.history = db->clientInfo->get().history; clientInfo.clusterId = db->serverInfo->get().client.clusterId; clientInfo.clusterType = db->clusterType; diff --git a/fdbserver/clustercontroller/ClusterRecovery.cpp b/fdbserver/clustercontroller/ClusterRecovery.cpp index ec4a30e54d2..b51d9f0060e 100644 --- a/fdbserver/clustercontroller/ClusterRecovery.cpp +++ b/fdbserver/clustercontroller/ClusterRecovery.cpp @@ -45,6 +45,7 @@ static std::set const& normalClusterRecoveryErrors() { s.insert(error_code_tlog_failed); s.insert(error_code_commit_proxy_failed); s.insert(error_code_grv_proxy_failed); + s.insert(error_code_cdc_proxy_failed); s.insert(error_code_resolver_failed); s.insert(error_code_backup_worker_failed); s.insert(error_code_recruitment_failed); @@ -231,6 +232,23 @@ Future newGrvProxies(Reference self, RecruitFromConfi self->grvProxies = 
std::move(newRecruits); } +Future newCDCProxies(Reference self, RecruitFromConfigurationReply recr) { + std::vector> initializationReplies; + for (int i = 0; i < recr.grvProxies.size(); i++) { + InitializeCDCProxyRequest req; + req.recoveryCount = self->cstate.myDBState.recoveryCount + 1; + TraceEvent("CDCProxyReplies", self->dbgid).detail("WorkerID", recr.grvProxies[i].id()); + initializationReplies.push_back( + transformErrors(throwErrorOr(recr.grvProxies[i].cdcProxy.getReplyUnlessFailedFor( + req, SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), + cdc_proxy_failed())); + } + + std::vector newRecruits = co_await getAll(initializationReplies); + TraceEvent("CDCProxyInitializationComplete", self->dbgid).log(); + self->cdcProxies = std::move(newRecruits); +} + Future newResolvers(Reference self, RecruitFromConfigurationReply recr) { std::vector> initializationReplies; for (int i = 0; i < recr.resolvers.size(); i++) { @@ -411,6 +429,19 @@ Future waitGrvProxyFailure(std::vector const& grvProxie return tagError(quorum(failed, 1), grv_proxy_failed()); } +Future waitCDCProxyFailure(std::vector const& cdcProxies) { + std::vector> failed; + failed.reserve(cdcProxies.size()); + for (auto cdcProxy : cdcProxies) { + failed.push_back(waitFailureClient(cdcProxy.waitFailure, + SERVER_KNOBS->TLOG_TIMEOUT, + -SERVER_KNOBS->TLOG_TIMEOUT / SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, + /*trace=*/true)); + } + ASSERT(failed.size() >= 1); + return tagError(quorum(failed, 1), cdc_proxy_failed()); +} + Future waitResolverFailure(std::vector const& resolvers) { std::vector> failed; failed.reserve(resolvers.size()); @@ -779,6 +810,7 @@ void sendMasterRegistration(ClusterRecoveryData* self, LogSystemConfig const& logSystemConfig, std::vector commitProxies, std::vector grvProxies, + std::vector cdcProxies, std::vector resolvers, DBRecoveryCount recoveryCount, std::vector priorCommittedLogServers) { @@ -788,6 +820,7 @@ void 
sendMasterRegistration(ClusterRecoveryData* self, masterReq.logSystemConfig = logSystemConfig; masterReq.commitProxies = commitProxies; masterReq.grvProxies = grvProxies; + masterReq.cdcProxies = cdcProxies; masterReq.resolvers = resolvers; masterReq.recoveryCount = recoveryCount; if (self->hasConfiguration) @@ -826,6 +859,7 @@ Future updateRegistration(Reference self, ReferenceprovisionalCommitProxies, self->provisionalGrvProxies, + std::vector(), self->resolvers, self->cstate.myDBState.recoveryCount, self->cstate.prevDBState.getPriorCommittedLogServers()); @@ -835,6 +869,7 @@ Future updateRegistration(Reference self, ReferencecommitProxies, self->grvProxies, + self->cdcProxies, self->resolvers, self->cstate.myDBState.recoveryCount, std::vector()); @@ -1102,6 +1137,7 @@ Future>> recruitEverything( .detail("Status", RecoveryStatus::names[RecoveryStatus::initializing_transaction_servers]) .detail("CommitProxies", recruits.commitProxies.size()) .detail("GrvProxies", recruits.grvProxies.size()) + .detail("CDCProxies", recruits.grvProxies.size()) .detail("TLogs", recruits.tLogs.size()) .detail("Resolvers", recruits.resolvers.size()) .detail("SatelliteTLogs", recruits.satelliteTLogs.size()) @@ -1122,6 +1158,7 @@ Future>> recruitEverything( Future txnSystemInitialized = traceAfter(newCommitProxies(self, recruits), "CommitProxiesInitialized") && traceAfter(newGrvProxies(self, recruits), "GRVProxiesInitialized") && + traceAfter(newCDCProxies(self, recruits), "CDCProxiesInitialized") && traceAfter(newResolvers(self, recruits), "ResolversInitialized") && traceAfter(newTLogServers(self, recruits, oldLogSystem, &confChanges), "TLogServersInitialized"); co_await (txnSystemInitialized || monitorInitializingTxnSystem(self->controllerData->db.unfinishedRecoveries)); @@ -1761,6 +1798,7 @@ Future clusterRecoveryCore(Reference self) { self->addActor.send(waitResolverFailure(self->resolvers)); self->addActor.send(waitCommitProxyFailure(self->commitProxies)); 
self->addActor.send(waitGrvProxyFailure(self->grvProxies)); + self->addActor.send(waitCDCProxyFailure(self->cdcProxies)); self->addActor.send(reportErrors(updateRegistration(self, self->logSystem), "UpdateRegistration", self->dbgid)); self->registrationTrigger.trigger(); diff --git a/fdbserver/clustercontroller/ClusterRecovery.h b/fdbserver/clustercontroller/ClusterRecovery.h index 09499d5dec3..6d82c1cadcc 100644 --- a/fdbserver/clustercontroller/ClusterRecovery.h +++ b/fdbserver/clustercontroller/ClusterRecovery.h @@ -208,6 +208,7 @@ struct ClusterRecoveryData : NonCopyable, ReferenceCounted std::vector provisionalCommitProxies; std::vector grvProxies; std::vector provisionalGrvProxies; + std::vector cdcProxies; std::vector resolvers; std::map lastCommitProxyVersionReplies; diff --git a/fdbserver/core/WorkerSupport.cpp b/fdbserver/core/WorkerSupport.cpp index 0385e1ddd3a..fd7a58cbebf 100644 --- a/fdbserver/core/WorkerSupport.cpp +++ b/fdbserver/core/WorkerSupport.cpp @@ -19,6 +19,9 @@ template struct NetNotifiedQueue; template class RequestStream; template struct NetNotifiedQueue; +template class RequestStream; +template struct NetNotifiedQueue; + template class RequestStream; template struct NetNotifiedQueue; @@ -230,6 +233,7 @@ const Role Role::TRANSACTION_LOG("TLog", "TL"); const Role Role::SHARED_TRANSACTION_LOG("SharedTLog", "SL", false); const Role Role::COMMIT_PROXY("CommitProxyServer", "CP"); const Role Role::GRV_PROXY("GrvProxyServer", "GP"); +const Role Role::CDC_PROXY("CDCProxyServer", "DP"); const Role Role::MASTER("MasterServer", "MS"); const Role Role::RESOLVER("Resolver", "RV"); const Role Role::CLUSTER_CONTROLLER("ClusterController", "CC"); diff --git a/fdbserver/core/include/fdbserver/core/WorkerInterface.actor.h b/fdbserver/core/include/fdbserver/core/WorkerInterface.actor.h index c72fa2c4476..74856ea3652 100644 --- a/fdbserver/core/include/fdbserver/core/WorkerInterface.actor.h +++ b/fdbserver/core/include/fdbserver/core/WorkerInterface.actor.h 
@@ -28,6 +28,7 @@ #include "fdbserver/core/BackupInterface.h" #include "fdbserver/core/DataDistributorInterface.h" #include "fdbserver/core/MasterInterface.h" +#include "fdbclient/CDCProxyInterface.h" #include "fdbserver/core/TLogInterface.h" #include "fdbserver/core/RatekeeperInterface.h" #include "fdbclient/ConsistencyScanInterface.h" @@ -50,6 +51,7 @@ struct WorkerInterface { RequestStream master; RequestStream commitProxy; RequestStream grvProxy; + RequestStream cdcProxy; RequestStream dataDistributor; RequestStream ratekeeper; RequestStream consistencyScan; @@ -87,6 +89,7 @@ struct WorkerInterface { master.getEndpoint(TaskPriority::Worker); commitProxy.getEndpoint(TaskPriority::Worker); grvProxy.getEndpoint(TaskPriority::Worker); + cdcProxy.getEndpoint(TaskPriority::Worker); resolver.getEndpoint(TaskPriority::Worker); logRouter.getEndpoint(TaskPriority::Worker); debugPing.getEndpoint(TaskPriority::Worker); @@ -97,31 +100,63 @@ struct WorkerInterface { template void serialize(Ar& ar) { - serializer(ar, - clientInterface, - locality, - tLog, - master, - commitProxy, - grvProxy, - dataDistributor, - ratekeeper, - consistencyScan, - resolver, - storage, - logRouter, - debugPing, - coordinationPing, - waitFailure, - setMetricsRate, - eventLogRequest, - traceBatchDumpRequest, - testerInterface, - diskStoreRequest, - execReq, - workerSnapReq, - backup, - updateServerDBInfo); + if constexpr (is_fb_function) { + serializer(ar, + clientInterface, + locality, + tLog, + master, + commitProxy, + grvProxy, + dataDistributor, + ratekeeper, + consistencyScan, + resolver, + storage, + logRouter, + debugPing, + coordinationPing, + waitFailure, + setMetricsRate, + eventLogRequest, + traceBatchDumpRequest, + testerInterface, + diskStoreRequest, + execReq, + workerSnapReq, + backup, + updateServerDBInfo, + cdcProxy); + } else { + serializer(ar, + clientInterface, + locality, + tLog, + master, + commitProxy, + grvProxy, + dataDistributor, + ratekeeper, + consistencyScan, + 
resolver, + storage, + logRouter, + debugPing, + coordinationPing, + waitFailure, + setMetricsRate, + eventLogRequest, + traceBatchDumpRequest, + testerInterface, + diskStoreRequest, + execReq, + workerSnapReq, + backup, + updateServerDBInfo); + if (ar.protocolVersion().hasNativeCdc()) { + serializer(ar, cdcProxy); + } + } } }; @@ -239,6 +274,7 @@ struct RegisterMasterRequest { LogSystemConfig logSystemConfig; std::vector commitProxies; std::vector grvProxies; + std::vector cdcProxies; std::vector resolvers; DBRecoveryCount recoveryCount; int64_t registrationCount; @@ -253,20 +289,38 @@ struct RegisterMasterRequest { void serialize(Ar& ar) { if constexpr (!is_fb_function) { ASSERT(ar.protocolVersion().isValid()); + serializer(ar, + id, + mi, + logSystemConfig, + commitProxies, + grvProxies, + resolvers, + recoveryCount, + registrationCount, + configuration, + priorCommittedLogServers, + recoveryState, + recoveryStalled); + if (ar.protocolVersion().hasNativeCdc()) { + serializer(ar, cdcProxies); + } + } else { + serializer(ar, + id, + mi, + logSystemConfig, + commitProxies, + grvProxies, + resolvers, + recoveryCount, + registrationCount, + configuration, + priorCommittedLogServers, + recoveryState, + recoveryStalled, + cdcProxies); } - serializer(ar, - id, - mi, - logSystemConfig, - commitProxies, - grvProxies, - resolvers, - recoveryCount, - registrationCount, - configuration, - priorCommittedLogServers, - recoveryState, - recoveryStalled); } }; @@ -731,6 +785,21 @@ struct InitializeGrvProxyRequest { extern template class RequestStream; extern template struct NetNotifiedQueue; +struct InitializeCDCProxyRequest { + constexpr static FileIdentifier file_identifier = 16776013; + uint64_t recoveryCount; + ReplyPromise reply; + + template + void serialize(Ar& ar) { + serializer(ar, recoveryCount, reply); + } +}; + +// Instantiated in WorkerSupport.cpp +extern template class RequestStream; +extern template struct NetNotifiedQueue; + struct InitializeDataDistributorRequest 
{ constexpr static FileIdentifier file_identifier = 8858952; UID reqId; @@ -970,6 +1039,7 @@ struct Role { static const Role SHARED_TRANSACTION_LOG; static const Role COMMIT_PROXY; static const Role GRV_PROXY; + static const Role CDC_PROXY; static const Role MASTER; static const Role RESOLVER; static const Role CLUSTER_CONTROLLER; diff --git a/fdbserver/logsystem/LogSystemConsumer.cpp b/fdbserver/logsystem/LogSystemConsumer.cpp index 816fe221616..f39c533d243 100644 --- a/fdbserver/logsystem/LogSystemConsumer.cpp +++ b/fdbserver/logsystem/LogSystemConsumer.cpp @@ -20,7 +20,7 @@ Reference LogSystemConsumer::peekAll(UID dbgid, } if (log->isLocal && !log->logServers.empty() && (log->locality == tagLocalitySpecial || log->locality == tag.locality || tag.locality == tagLocalityTxs || - tag.locality == tagLocalityLogRouter)) { + tag.locality == tagLocalityLogRouter || tag.locality == tagLocalityCDC)) { lastBegin = std::max(lastBegin, log->startVersion); localSets.push_back(log); if (log->locality != tagLocalitySatellite) { @@ -95,7 +95,8 @@ Reference LogSystemConsumer::peekAll(UID dbgid, } if (log->isLocal && !log->logServers.empty() && (log->locality == tagLocalitySpecial || log->locality == tag.locality || - tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter)) { + tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter || + tag.locality == tagLocalityCDC)) { thisBegin = std::max(thisBegin, log->startVersion); localOldSets.push_back(log); if (log->locality != tagLocalitySatellite) { @@ -608,19 +609,25 @@ Reference LogSystemConsumer::peekSingle(UID dbgid, Tag tag, std::vector> history) { auto& ls = *logSystem; + auto peekTag = [&](Tag readTag, Version readBegin, Version readEnd) -> Reference { + if (readTag.locality == tagLocalityCDC) { + return peekAll(dbgid, readBegin, readEnd, readTag, false); + } + return peekLocal(dbgid, readTag, readBegin, readEnd, false); + }; while (!history.empty() && begin >= history.back().first) { 
history.pop_back(); } if (history.empty()) { TraceEvent("TLogPeekSingleNoHistory", dbgid).detail("Tag", tag.toString()).detail("Begin", begin); - return peekLocal(dbgid, tag, begin, ls.getPeekEnd(), false); + return peekTag(tag, begin, ls.getPeekEnd()); } else { std::vector> cursors; std::vector epochEnds; TraceEvent("TLogPeekSingleAddingLocal", dbgid).detail("Tag", tag.toString()).detail("Begin", history[0].first); - cursors.push_back(peekLocal(dbgid, tag, history[0].first, ls.getPeekEnd(), false)); + cursors.push_back(peekTag(tag, history[0].first, ls.getPeekEnd())); for (int i = 0; i < history.size(); i++) { TraceEvent("TLogPeekSingleAddingOld", dbgid) @@ -628,11 +635,9 @@ Reference LogSystemConsumer::peekSingle(UID dbgid, .detail("HistoryTag", history[i].second.toString()) .detail("Begin", i + 1 == history.size() ? begin : std::max(history[i + 1].first, begin)) .detail("End", history[i].first); - cursors.push_back(peekLocal(dbgid, - history[i].second, - i + 1 == history.size() ? begin : std::max(history[i + 1].first, begin), - history[i].first, - false)); + cursors.push_back(peekTag(history[i].second, + i + 1 == history.size() ? 
begin : std::max(history[i + 1].first, begin), + history[i].first)); epochEnds.emplace_back(history[i].first); } diff --git a/fdbserver/worker/CMakeLists.txt b/fdbserver/worker/CMakeLists.txt index 16a26a701a7..b7e4888340f 100644 --- a/fdbserver/worker/CMakeLists.txt +++ b/fdbserver/worker/CMakeLists.txt @@ -4,6 +4,7 @@ add_flow_target(STATIC_LIBRARY NAME fdbserver_worker SRCS ${FDBSERVER_WORKER_SRC add_fdbserver_link_test(fdbserver_workerlinktest fdbserver_worker fdbserver_backupworker + fdbserver_cdcproxy fdbserver_clustercontroller fdbserver_commitproxy fdbserver_consistencyscan @@ -56,6 +57,7 @@ target_link_libraries(fdbserver_worker fdbctl PRIVATE fdbserver_backupworker + fdbserver_cdcproxy fdbserver_clustercontroller fdbserver_commitproxy fdbserver_consistencyscan diff --git a/fdbserver/worker/worker.actor.cpp b/fdbserver/worker/worker.actor.cpp index cf9c9959600..4334b1d404d 100644 --- a/fdbserver/worker/worker.actor.cpp +++ b/fdbserver/worker/worker.actor.cpp @@ -50,6 +50,7 @@ #include "MetricLogger.actor.h" #include "fdbserver/backupworker/BackupWorker.h" #include "fdbserver/clustercontroller/ClusterController.h" +#include "fdbserver/cdcproxy/CDCProxy.h" #include "fdbserver/commitproxy/CommitProxyServer.h" #include "fdbserver/consistencyscan/ConsistencyScan.h" #include "fdbserver/datadistributor/DataDistributor.h" @@ -2787,6 +2788,26 @@ ACTOR Future workerServer(Reference connRecord, forwardError(errors, Role::GRV_PROXY, recruited.id(), grvProxyServer(recruited, req, dbInfo)))); req.reply.send(recruited); } + when(InitializeCDCProxyRequest req = waitNext(interf.cdcProxy.getFuture())) { + LocalLineage _; + CDCProxyInterface recruited; + recruited.processId = locality.processId(); + recruited.initEndpoints(); + + std::map details; + startRole(Role::CDC_PROXY, recruited.id(), interf.id(), details); + + DUMPTOKEN(recruited.consume); + DUMPTOKEN(recruited.ack); + DUMPTOKEN(recruited.waitFailure); + + errorForwarders.add(zombie(recruited, + forwardError(errors, 
+ Role::CDC_PROXY, + recruited.id(), + cdcProxyServer(recruited, req.recoveryCount, dbInfo)))); + req.reply.send(recruited); + } when(InitializeResolverRequest req = waitNext(interf.resolver.getFuture())) { LocalLineage _; getCurrentLineage()->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Resolver; diff --git a/fdbserver/workloads/NativeCdc.cpp b/fdbserver/workloads/NativeCdc.cpp index 02c931dcf59..96185bcf989 100644 --- a/fdbserver/workloads/NativeCdc.cpp +++ b/fdbserver/workloads/NativeCdc.cpp @@ -21,8 +21,10 @@ #include #include +#include "fdbclient/CDCProxyInterface.h" #include "fdbclient/NativeCdc.h" #include "fdbclient/SystemData.h" +#include "fdbserver/core/ServerDBInfo.h" #include "fdbserver/tester/workloads.h" struct NativeCdcWorkload : TestWorkload { @@ -82,6 +84,13 @@ struct NativeCdcWorkload : TestWorkload { } } + Future getCDCProxy() { + while (dbInfo->get().client.cdcProxies.empty()) { + co_await dbInfo->onChange(); + } + co_return dbInfo->get().client.cdcProxies.front(); + } + Future run(Database cx) { const Key firstName = "native-cdc-first"_sr; const Key secondName = "native-cdc-second"_sr; @@ -135,6 +144,46 @@ struct NativeCdcWorkload : TestWorkload { ASSERT(secondRoute.first != firstRoute.first); co_await removeNativeCdcStream(cx, secondName); + + CDCProxyInterface proxy = co_await getCDCProxy(); + const Key liveName = "native-cdc-live"_sr; + const KeyRange liveRange(KeyRangeRef("live/"_sr, "live0"_sr)); + CDCRegisterStreamReply liveRegistration = + co_await proxy.registerStream.getReply(CDCRegisterStreamRequest(liveName, liveRange)); + + CDCListStreamsReply listed = co_await proxy.listStreams.getReply(CDCListStreamsRequest()); + ASSERT(listed.streams.size() == 1); + ASSERT(listed.streams[0].name == liveName); + ASSERT(listed.streams[0].streamId == liveRegistration.streamId); + ASSERT(listed.streams[0].keys == liveRange); + + Transaction write(cx); + write.set("live/in"_sr, "captured"_sr); + write.set("other/out"_sr, "ignored"_sr); + 
co_await write.commit(); + const Version writeVersion = write.getCommittedVersion(); + + CDCConsumeReply consumed = co_await timeoutError( + proxy.consume.getReply(CDCConsumeRequest(CDCCursor(liveRegistration.streamId, invalidVersion))), 30.0); + ASSERT(consumed.lastConsumedVersion >= writeVersion); + bool foundInRangeWrite = false; + bool foundOutOfRangeWrite = false; + for (const auto& versioned : consumed.mutations) { + for (const auto& mutation : versioned.mutations) { + if (mutation.param1 == "live/in"_sr) { + foundInRangeWrite = true; + } + if (mutation.param1 == "other/out"_sr) { + foundOutOfRangeWrite = true; + } + } + } + ASSERT(foundInRangeWrite); + ASSERT(!foundOutOfRangeWrite); + + co_await proxy.ack.getReply(CDCAckRequest(liveRegistration.streamId, writeVersion)); + ASSERT(co_await getPersistedMinVersion(cx, liveRegistration.streamId) == writeVersion + 1); + co_await proxy.removeStream.getReply(CDCRemoveStreamRequest(liveName)); } }; diff --git a/flow/include/flow/error_definitions.h b/flow/include/flow/error_definitions.h index fb1a6a52cb7..846fa13567e 100755 --- a/flow/include/flow/error_definitions.h +++ b/flow/include/flow/error_definitions.h @@ -169,6 +169,7 @@ ERROR( bulkload_invalid_configuration, 1250, "BulkLoad requires cluster configur ERROR( transaction_grv_queue_rejected, 1251, "GRV request rejected because estimated queue wait exceeds transaction limit" ) ERROR( finish_move_keys_too_many_retries, 1252, "finishMoveKeys exceeded retry limit" ) ERROR( start_move_keys_too_many_retries, 1253, "startMoveKeys exceeded retry limit" ) +ERROR( cdc_proxy_failed, 1254, "Cluster recovery terminating because a CDCProxy failed" ) // 15xx Platform errors ERROR( platform_error, 1500, "Platform error" ) From 9f52b0cae7f1f445889c2cb6a00d69986ff3e4f7 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Sun, 24 May 2026 18:12:39 -0700 Subject: [PATCH 08/56] Persist and publish CDC stream ownership routing --- fdbclient/NativeCdc.cpp | 40 +++++++++++++- 
fdbclient/SystemData.cpp | 1 + .../include/fdbclient/CommitProxyInterface.h | 15 +++++- fdbclient/include/fdbclient/NativeCdc.h | 7 ++- fdbclient/include/fdbclient/SystemData.h | 2 + fdbserver/cdcproxy/CDCProxy.cpp | 21 ++++++-- .../ClusterController.actor.cpp | 51 ++++++++++++++++++ fdbserver/logsystem/ApplyMetadataMutation.cpp | 5 +- fdbserver/workloads/NativeCdc.cpp | 53 +++++++++++++++++-- 9 files changed, 179 insertions(+), 16 deletions(-) diff --git a/fdbclient/NativeCdc.cpp b/fdbclient/NativeCdc.cpp index ab562e92908..0801a94e36c 100644 --- a/fdbclient/NativeCdc.cpp +++ b/fdbclient/NativeCdc.cpp @@ -68,6 +68,23 @@ void validateNativeCdcStream(KeyRef const& name, KeyRangeRef const& keys) { } } +Future> getNativeCdcProxyAssignment(Transaction* tr, CDCStreamId streamId) { + RangeResult assignments = co_await tr->getRange(cdcProxyRangeFor(streamId), 2); + ASSERT(assignments.size() <= 1); + if (assignments.empty()) { + co_return Optional(); + } + const auto [assignedStreamId, proxyId] = decodeCDCProxyKey(assignments[0].key); + ASSERT_WE_THINK(assignedStreamId == streamId); + co_return proxyId; +} + +void signalNativeCdcProxyAssignmentChange(Transaction* tr) { + tr->set(cdcProxyAssignmentChangeKey, + BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), + IncludeVersion(ProtocolVersion::withNativeCdc()))); +} + Future observeNativeCdcMetadata(Transaction* tr, NativeCdcIdentifierAllocator* allocator) { Key begin = cdcStreamKeys.begin; while (begin < cdcStreamKeys.end) { @@ -99,7 +116,7 @@ Future observeNativeCdcMetadata(Transaction* tr, NativeCdcIdentifierAlloca } // namespace -Future registerNativeCdcStream(Database cx, Key name, KeyRange keys) { +Future registerNativeCdcStream(Database cx, Key name, KeyRange keys, Optional proxyId) { validateNativeCdcStream(name, keys); Transaction tr(cx); @@ -117,6 +134,11 @@ Future registerNativeCdcStream(Database cx, Key name, KeyRange keys if (!currentKeys.present() || decodeCDCStreamKeysValue(currentKeys.get()) != 
keys) { throw client_invalid_operation(); } + if (proxyId.present() && !(co_await getNativeCdcProxyAssignment(&tr, streamId)).present()) { + tr.set(cdcProxyKeyFor(streamId, proxyId.get()), Value()); + signalNativeCdcProxyAssignmentChange(&tr); + co_await tr.commit(); + } co_return streamId; } @@ -129,6 +151,10 @@ Future registerNativeCdcStream(Database cx, Key name, KeyRange keys tr.set(cdcStreamKeyFor(streamId), cdcStreamKeysValue(keys)); tr.set(cdcTagHistoryKeyFor(streamId, registrationVersion, tag), Value()); tr.set(cdcMinVersionKeyFor(streamId), cdcMinVersionValue(registrationVersion)); + if (proxyId.present()) { + tr.set(cdcProxyKeyFor(streamId, proxyId.get()), Value()); + signalNativeCdcProxyAssignmentChange(&tr); + } co_await tr.commit(); co_return streamId; } catch (Error& e) { @@ -138,7 +164,7 @@ Future registerNativeCdcStream(Database cx, Key name, KeyRange keys } } -Future removeNativeCdcStream(Database cx, Key name) { +Future removeNativeCdcStream(Database cx, Key name, Optional proxyId) { if (name.empty()) { throw client_invalid_operation(); } @@ -157,14 +183,24 @@ Future removeNativeCdcStream(Database cx, Key name) { } const CDCStreamId streamId = decodeCDCStreamNameValue(currentId.get()); + Optional assignedProxy = co_await getNativeCdcProxyAssignment(&tr, streamId); + if (proxyId.present() && (!assignedProxy.present() || assignedProxy.get() != proxyId.get())) { + throw wrong_shard_server(); + } tr.clear(nameKey); tr.clear(cdcStreamKeyFor(streamId)); tr.clear(cdcProxyRangeFor(streamId)); + if (assignedProxy.present()) { + signalNativeCdcProxyAssignmentChange(&tr); + } // Retain tag history and minVersion until the pop/cleanup phase can // safely release all durable mutations for this retired stream. 
co_await tr.commit(); co_return; } catch (Error& e) { + if (e.code() == error_code_wrong_shard_server) { + throw; + } err = e; } co_await tr.onError(err); diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index 623ec0e2bf0..dbaa70d9f78 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -776,6 +776,7 @@ const KeyRangeRef cdcStreamKeys("\xff/cdc/keys/"_sr, "\xff/cdc/keys0"_sr); const KeyRangeRef cdcTagHistoryKeys("\xff/cdc/tagHistory/"_sr, "\xff/cdc/tagHistory0"_sr); const KeyRangeRef cdcMinVersionKeys("\xff/cdc/minVersion/"_sr, "\xff/cdc/minVersion0"_sr); const KeyRangeRef cdcProxyKeys("\xff/cdc/proxies/"_sr, "\xff/cdc/proxies0"_sr); +const KeyRef cdcProxyAssignmentChangeKey = "\xff/cdc/proxyAssignmentChange"_sr; Key cdcStreamNameKeyFor(KeyRef const& streamName) { return streamName.withPrefix(cdcStreamNameKeys.begin); diff --git a/fdbclient/include/fdbclient/CommitProxyInterface.h b/fdbclient/include/fdbclient/CommitProxyInterface.h index 25d1e96511d..ba48de19c52 100644 --- a/fdbclient/include/fdbclient/CommitProxyInterface.h +++ b/fdbclient/include/fdbclient/CommitProxyInterface.h @@ -22,6 +22,7 @@ #define FDBCLIENT_COMMITPROXYINTERFACE_H #pragma once +#include #include #include @@ -113,6 +114,7 @@ struct ClientDBInfo { std::vector grvProxies; std::vector commitProxies; std::vector cdcProxies; + std::map streamToCDCProxyId; Optional firstCommitProxy; // not serialized, used for commitOnFirstProxy when the commit proxies vector has been shrunk Optional forward; @@ -132,10 +134,19 @@ struct ClientDBInfo { ASSERT(ar.protocolVersion().isValid()); serializer(ar, grvProxies, commitProxies, id, forward, history, clusterId, clusterType); if (ar.protocolVersion().hasNativeCdc()) { - serializer(ar, cdcProxies); + serializer(ar, cdcProxies, streamToCDCProxyId); } } else { - serializer(ar, grvProxies, commitProxies, id, forward, history, clusterId, clusterType, cdcProxies); + serializer(ar, + grvProxies, + commitProxies, + id, + forward, + 
history, + clusterId, + clusterType, + cdcProxies, + streamToCDCProxyId); } } }; diff --git a/fdbclient/include/fdbclient/NativeCdc.h b/fdbclient/include/fdbclient/NativeCdc.h index 0c200f71af4..e49fdd6e538 100644 --- a/fdbclient/include/fdbclient/NativeCdc.h +++ b/fdbclient/include/fdbclient/NativeCdc.h @@ -35,8 +35,11 @@ struct NativeCdcStreamInfo { // These durable metadata operations are intended to back CDCProxyInterface // lifecycle requests once CDC proxies are recruited. -Future registerNativeCdcStream(Database cx, Key name, KeyRange keys); -Future removeNativeCdcStream(Database cx, Key name); +Future registerNativeCdcStream(Database cx, + Key name, + KeyRange keys, + Optional proxyId = Optional()); +Future removeNativeCdcStream(Database cx, Key name, Optional proxyId = Optional()); Future> listNativeCdcStreams(Database cx); // Persists the exclusive unpopped watermark after consuming through a version. // Removed streams remain acknowledgeable while retained CDC log data is drained. diff --git a/fdbclient/include/fdbclient/SystemData.h b/fdbclient/include/fdbclient/SystemData.h index e83685409ab..bc7785e68a7 100644 --- a/fdbclient/include/fdbclient/SystemData.h +++ b/fdbclient/include/fdbclient/SystemData.h @@ -295,6 +295,8 @@ Version decodeCDCMinVersionValue(ValueRef const& value); // "\xff/cdc/proxies/[[CDCStreamId]][[proxyUID]]" := "" extern const KeyRangeRef cdcProxyKeys; +// Changed whenever durable CDC stream-to-proxy assignments change. 
+extern const KeyRef cdcProxyAssignmentChangeKey; Key cdcProxyKeyFor(CDCStreamId streamId, UID proxyId); KeyRange cdcProxyRangeFor(CDCStreamId streamId); std::pair decodeCDCProxyKey(KeyRef const& key); diff --git a/fdbserver/cdcproxy/CDCProxy.cpp b/fdbserver/cdcproxy/CDCProxy.cpp index 14e44c3fd76..2369c1b109a 100644 --- a/fdbserver/cdcproxy/CDCProxy.cpp +++ b/fdbserver/cdcproxy/CDCProxy.cpp @@ -75,7 +75,10 @@ Optional clipCDCMutation(MutationRef const& mutation, KeyRangeRef c return Optional(); } -Future readCDCStreamState(Database cx, CDCStreamId streamId, bool requireKeys) { +Future readCDCStreamState(Database cx, + CDCStreamId streamId, + UID expectedProxyId, + bool requireKeys) { if (streamId == 0) { throw client_invalid_operation(); } @@ -101,6 +104,11 @@ Future readCDCStreamState(Database cx, CDCStreamId streamId, } result.minVersion = decodeCDCMinVersionValue(minVersionValue.get()); + RangeResult assignedProxies = co_await tr.getRange(cdcProxyRangeFor(streamId), 2); + if (assignedProxies.size() != 1 || decodeCDCProxyKey(assignedProxies[0].key).second != expectedProxyId) { + throw wrong_shard_server(); + } + std::vector> tagAssignments; KeyRange tagHistoryRange = cdcTagHistoryRangeFor(streamId); Key begin = tagHistoryRange.begin; @@ -128,6 +136,9 @@ Future readCDCStreamState(Database cx, CDCStreamId streamId, } co_return result; } catch (Error& e) { + if (e.code() == error_code_wrong_shard_server) { + throw; + } err = e; } co_await tr.onError(err); @@ -141,7 +152,7 @@ Future consume(CDCProxyData* self, CDCConsumeRequest request) { throw client_invalid_operation(); } - CDCStreamReadState state = co_await readCDCStreamState(self->cx, request.cursor.streamId, true); + CDCStreamReadState state = co_await readCDCStreamState(self->cx, request.cursor.streamId, self->id, true); Version begin = request.cursor.lastConsumedVersion == invalidVersion ? 
state.minVersion : request.cursor.lastConsumedVersion + 1; if (begin < state.minVersion) { @@ -194,7 +205,7 @@ Future consume(CDCProxyData* self, CDCConsumeRequest request) { Future acknowledge(CDCProxyData* self, CDCAckRequest request) { try { - CDCStreamReadState state = co_await readCDCStreamState(self->cx, request.streamId, false); + CDCStreamReadState state = co_await readCDCStreamState(self->cx, request.streamId, self->id, false); const Version minVersion = co_await acknowledgeNativeCdcStream(self->cx, request.streamId, request.version); std::set tags{ state.currentTag }; for (const auto& history : state.tagHistory) { @@ -212,7 +223,7 @@ Future acknowledge(CDCProxyData* self, CDCAckRequest request) { Future registerStream(CDCProxyData* self, CDCRegisterStreamRequest request) { try { - const CDCStreamId streamId = co_await registerNativeCdcStream(self->cx, request.name, request.keys); + const CDCStreamId streamId = co_await registerNativeCdcStream(self->cx, request.name, request.keys, self->id); request.reply.send(CDCRegisterStreamReply(streamId)); } catch (Error& e) { request.reply.sendError(e); @@ -222,7 +233,7 @@ Future registerStream(CDCProxyData* self, CDCRegisterStreamRequest request Future removeStream(CDCProxyData* self, CDCRemoveStreamRequest request) { try { - co_await removeNativeCdcStream(self->cx, request.name); + co_await removeNativeCdcStream(self->cx, request.name, self->id); request.reply.send(Void()); } catch (Error& e) { request.reply.sendError(e); diff --git a/fdbserver/clustercontroller/ClusterController.actor.cpp b/fdbserver/clustercontroller/ClusterController.actor.cpp index 8fb84de574d..2cffb7f2013 100644 --- a/fdbserver/clustercontroller/ClusterController.actor.cpp +++ b/fdbserver/clustercontroller/ClusterController.actor.cpp @@ -1266,6 +1266,7 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co clientInfo.commitProxies = req.commitProxies; clientInfo.grvProxies = req.grvProxies; clientInfo.cdcProxies = 
req.cdcProxies; + clientInfo.streamToCDCProxyId = db->clientInfo->get().streamToCDCProxyId; clientInfo.history = db->clientInfo->get().history; clientInfo.clusterId = db->serverInfo->get().client.clusterId; clientInfo.clusterType = db->clusterType; @@ -1986,6 +1987,55 @@ Future monitorGlobalConfig(ClusterControllerData::DBInfo* db) { } } +Future monitorCDCProxyAssignments(ClusterControllerData::DBInfo* db) { + while (true) { + ReadYourWritesTransaction tr(db->db); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + + std::map streamToCDCProxyId; + Key begin = cdcProxyKeys.begin; + while (begin < cdcProxyKeys.end) { + RangeResult assignments = + co_await tr.getRange(KeyRangeRef(begin, cdcProxyKeys.end), CLIENT_KNOBS->TOO_MANY); + for (const auto& assignment : assignments) { + const auto [streamId, proxyId] = decodeCDCProxyKey(assignment.key); + ASSERT_WE_THINK(streamToCDCProxyId.emplace(streamId, proxyId).second); + } + if (!assignments.more) { + break; + } + begin = keyAfter(assignments.back().key); + } + + ClientDBInfo clientInfo = db->clientInfo->get(); + if (clientInfo.streamToCDCProxyId != streamToCDCProxyId) { + clientInfo.id = deterministicRandom()->randomUniqueID(); + clientInfo.streamToCDCProxyId = std::move(streamToCDCProxyId); + + ServerDBInfo serverInfo = db->serverInfo->get(); + serverInfo.id = deterministicRandom()->randomUniqueID(); + serverInfo.infoGeneration = ++db->dbInfoCount; + serverInfo.client = clientInfo; + db->serverInfo->set(serverInfo); + db->clientInfo->set(clientInfo); + } + + Future assignmentChangeFuture = tr.watch(cdcProxyAssignmentChangeKey); + co_await tr.commit(); + co_await assignmentChangeFuture; + break; + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } + } +} + Future updatedChangingDatacenters(ClusterControllerData* self) { // do not change the cluster controller until all the processes have had 
a chance to register co_await delay(SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY); @@ -2928,6 +2978,7 @@ ACTOR Future clusterControllerCore(ClusterControllerFullInterface interf, self.addActor.send(monitorServerInfoConfig(&self.db)); self.addActor.send(monitorStorageMetadata(&self)); self.addActor.send(monitorGlobalConfig(&self.db)); + self.addActor.send(monitorCDCProxyAssignments(&self.db)); self.addActor.send(updatedChangingDatacenters(&self)); self.addActor.send(updatedChangedDatacenters(&self)); self.addActor.send(updateDatacenterVersionDifference(&self)); diff --git a/fdbserver/logsystem/ApplyMetadataMutation.cpp b/fdbserver/logsystem/ApplyMetadataMutation.cpp index 7b3754827f9..1d4fe6eb58e 100644 --- a/fdbserver/logsystem/ApplyMetadataMutation.cpp +++ b/fdbserver/logsystem/ApplyMetadataMutation.cpp @@ -615,7 +615,7 @@ class ApplyMetadataMutationsImpl { void checkSetCDCMetadata(MutationRef m) { if (!cdcStreamNameKeys.contains(m.param1) && !cdcStreamKeys.contains(m.param1) && !cdcTagHistoryKeys.contains(m.param1) && !cdcMinVersionKeys.contains(m.param1) && - !cdcProxyKeys.contains(m.param1)) { + !cdcProxyKeys.contains(m.param1) && m.param1 != cdcProxyAssignmentChangeKey) { return; } if (!initialCommit) { @@ -1330,7 +1330,8 @@ bool containsMetadataMutation(const VectorRef& mutations) { (m.param1.startsWith(logRangesRange.begin)) || (m.param1.startsWith(serverKeysPrefix)) || (m.param1.startsWith(keyServersPrefix)) || cdcStreamNameKeys.contains(m.param1) || cdcStreamKeys.contains(m.param1) || cdcTagHistoryKeys.contains(m.param1) || - cdcMinVersionKeys.contains(m.param1) || cdcProxyKeys.contains(m.param1)) { + cdcMinVersionKeys.contains(m.param1) || cdcProxyKeys.contains(m.param1) || + m.param1 == cdcProxyAssignmentChangeKey) { return true; } } else if (m.type == MutationRef::ClearRange && isSystemKey(m.param2)) { diff --git a/fdbserver/workloads/NativeCdc.cpp b/fdbserver/workloads/NativeCdc.cpp index 96185bcf989..b727a77af78 100644 --- 
a/fdbserver/workloads/NativeCdc.cpp +++ b/fdbserver/workloads/NativeCdc.cpp @@ -91,6 +91,28 @@ struct NativeCdcWorkload : TestWorkload { co_return dbInfo->get().client.cdcProxies.front(); } + Future getCDCProxy(CDCStreamId streamId) { + while (true) { + const ClientDBInfo& client = dbInfo->get().client; + auto assigned = client.streamToCDCProxyId.find(streamId); + if (assigned != client.streamToCDCProxyId.end()) { + for (const auto& proxy : client.cdcProxies) { + if (proxy.id() == assigned->second) { + co_return proxy; + } + } + } + co_await dbInfo->onChange(); + } + } + + Future waitForCDCProxyAssignmentRemoval(CDCStreamId streamId) { + while (dbInfo->get().client.streamToCDCProxyId.contains(streamId)) { + co_await dbInfo->onChange(); + } + co_return; + } + Future run(Database cx) { const Key firstName = "native-cdc-first"_sr; const Key secondName = "native-cdc-second"_sr; @@ -150,6 +172,8 @@ struct NativeCdcWorkload : TestWorkload { const KeyRange liveRange(KeyRangeRef("live/"_sr, "live0"_sr)); CDCRegisterStreamReply liveRegistration = co_await proxy.registerStream.getReply(CDCRegisterStreamRequest(liveName, liveRange)); + CDCProxyInterface owner = co_await getCDCProxy(liveRegistration.streamId); + ASSERT(owner.id() == proxy.id()); CDCListStreamsReply listed = co_await proxy.listStreams.getReply(CDCListStreamsRequest()); ASSERT(listed.streams.size() == 1); @@ -163,8 +187,30 @@ struct NativeCdcWorkload : TestWorkload { co_await write.commit(); const Version writeVersion = write.getCommittedVersion(); + for (const auto& nonOwner : dbInfo->get().client.cdcProxies) { + if (nonOwner.id() == owner.id()) { + continue; + } + bool wrongOwnerRejected = false; + try { + co_await nonOwner.consume.getReply( + CDCConsumeRequest(CDCCursor(liveRegistration.streamId, invalidVersion))); + } catch (Error& e) { + wrongOwnerRejected = e.code() == error_code_wrong_shard_server; + } + ASSERT(wrongOwnerRejected); + bool wrongOwnerRemoveRejected = false; + try { + co_await 
nonOwner.removeStream.getReply(CDCRemoveStreamRequest(liveName)); + } catch (Error& e) { + wrongOwnerRemoveRejected = e.code() == error_code_wrong_shard_server; + } + ASSERT(wrongOwnerRemoveRejected); + break; + } + CDCConsumeReply consumed = co_await timeoutError( - proxy.consume.getReply(CDCConsumeRequest(CDCCursor(liveRegistration.streamId, invalidVersion))), 30.0); + owner.consume.getReply(CDCConsumeRequest(CDCCursor(liveRegistration.streamId, invalidVersion))), 30.0); ASSERT(consumed.lastConsumedVersion >= writeVersion); bool foundInRangeWrite = false; bool foundOutOfRangeWrite = false; @@ -181,9 +227,10 @@ struct NativeCdcWorkload : TestWorkload { ASSERT(foundInRangeWrite); ASSERT(!foundOutOfRangeWrite); - co_await proxy.ack.getReply(CDCAckRequest(liveRegistration.streamId, writeVersion)); + co_await owner.ack.getReply(CDCAckRequest(liveRegistration.streamId, writeVersion)); ASSERT(co_await getPersistedMinVersion(cx, liveRegistration.streamId) == writeVersion + 1); - co_await proxy.removeStream.getReply(CDCRemoveStreamRequest(liveName)); + co_await owner.removeStream.getReply(CDCRemoveStreamRequest(liveName)); + co_await waitForCDCProxyAssignmentRemoval(liveRegistration.streamId); } }; From 9b768820f6705f2ed3c1bd02b190af92812fd6f6 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Sun, 24 May 2026 18:59:20 -0700 Subject: [PATCH 09/56] Add non-recovery CDC proxy failover and durable stream reassignment --- fdbclient/NativeCdc.cpp | 44 ++++++ .../include/fdbclient/CDCProxyInterface.h | 15 ++ fdbclient/include/fdbclient/NativeCdc.h | 2 + fdbserver/cdcproxy/CDCProxy.cpp | 22 ++- .../ClusterController.actor.cpp | 146 ++++++++++++++++++ .../clustercontroller/ClusterRecovery.cpp | 14 -- fdbserver/worker/worker.actor.cpp | 1 + fdbserver/workloads/NativeCdc.cpp | 47 +++++- 8 files changed, 270 insertions(+), 21 deletions(-) diff --git a/fdbclient/NativeCdc.cpp b/fdbclient/NativeCdc.cpp index 0801a94e36c..9b745f98288 100644 --- a/fdbclient/NativeCdc.cpp +++ 
b/fdbclient/NativeCdc.cpp @@ -244,6 +244,50 @@ Future> listNativeCdcStreams(Database cx) { co_return result; } +Future reassignNativeCdcStreams(Database cx, UID oldProxyId, UID newProxyId) { + if (oldProxyId == newProxyId) { + co_return; + } + + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + + bool changed = false; + Key begin = cdcProxyKeys.begin; + while (begin < cdcProxyKeys.end) { + RangeResult assignments = + co_await tr.getRange(KeyRangeRef(begin, cdcProxyKeys.end), CLIENT_KNOBS->TOO_MANY); + for (const auto& assignment : assignments) { + const auto [streamId, proxyId] = decodeCDCProxyKey(assignment.key); + if (proxyId == oldProxyId) { + tr.clear(assignment.key); + tr.set(cdcProxyKeyFor(streamId, newProxyId), Value()); + changed = true; + } + } + if (!assignments.more) { + break; + } + begin = keyAfter(assignments.back().key); + } + + if (changed) { + signalNativeCdcProxyAssignmentChange(&tr); + co_await tr.commit(); + } + co_return; + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } +} + Future acknowledgeNativeCdcStream(Database cx, CDCStreamId streamId, Version consumedThrough) { if (streamId == 0 || consumedThrough < 0 || consumedThrough == std::numeric_limits::max()) { throw client_invalid_operation(); diff --git a/fdbclient/include/fdbclient/CDCProxyInterface.h b/fdbclient/include/fdbclient/CDCProxyInterface.h index 5de6c3a588b..981783eacc2 100644 --- a/fdbclient/include/fdbclient/CDCProxyInterface.h +++ b/fdbclient/include/fdbclient/CDCProxyInterface.h @@ -186,6 +186,18 @@ struct CDCAckRequest { } }; +struct HaltCDCProxyRequest { + constexpr static FileIdentifier file_identifier = 16776014; + ReplyPromise reply; + + bool verify() const { return true; } + + template + void serialize(Ar& ar) { + serializer(ar, reply); + } +}; + struct 
CDCProxyInterface { constexpr static FileIdentifier file_identifier = 16776011; enum { LocationAwareLoadBalance = 1 }; @@ -198,6 +210,7 @@ struct CDCProxyInterface { PublicRequestStream listStreams; PublicRequestStream ack; RequestStream> waitFailure; + RequestStream haltForTesting; UID id() const { return consume.getEndpoint().token; } std::string toString() const { return id().shortString(); } @@ -216,6 +229,7 @@ struct CDCProxyInterface { listStreams = PublicRequestStream(consume.getEndpoint().getAdjustedEndpoint(3)); ack = PublicRequestStream(consume.getEndpoint().getAdjustedEndpoint(4)); waitFailure = RequestStream>(consume.getEndpoint().getAdjustedEndpoint(5)); + haltForTesting = RequestStream(consume.getEndpoint().getAdjustedEndpoint(6)); } } @@ -227,6 +241,7 @@ struct CDCProxyInterface { streams.push_back(listStreams.getReceiver(TaskPriority::ReadSocket)); streams.push_back(ack.getReceiver(TaskPriority::ReadSocket)); streams.push_back(waitFailure.getReceiver()); + streams.push_back(haltForTesting.getReceiver()); FlowTransport::transport().addEndpoints(streams); } }; diff --git a/fdbclient/include/fdbclient/NativeCdc.h b/fdbclient/include/fdbclient/NativeCdc.h index e49fdd6e538..f2a0556c8d4 100644 --- a/fdbclient/include/fdbclient/NativeCdc.h +++ b/fdbclient/include/fdbclient/NativeCdc.h @@ -41,6 +41,8 @@ Future registerNativeCdcStream(Database cx, Optional proxyId = Optional()); Future removeNativeCdcStream(Database cx, Key name, Optional proxyId = Optional()); Future> listNativeCdcStreams(Database cx); +// Atomically moves any streams assigned to a failed proxy to its replacement. +Future reassignNativeCdcStreams(Database cx, UID oldProxyId, UID newProxyId); // Persists the exclusive unpopped watermark after consuming through a version. // Removed streams remain acknowledgeable while retained CDC log data is drained. 
Future acknowledgeNativeCdcStream(Database cx, CDCStreamId streamId, Version consumedThrough); diff --git a/fdbserver/cdcproxy/CDCProxy.cpp b/fdbserver/cdcproxy/CDCProxy.cpp index 2369c1b109a..6e12ab2ad10 100644 --- a/fdbserver/cdcproxy/CDCProxy.cpp +++ b/fdbserver/cdcproxy/CDCProxy.cpp @@ -272,6 +272,9 @@ Future cdcProxyServer(CDCProxyInterface proxy, actors.add(traceRole(Role::CDC_PROXY, proxy.id())); self.logSystem = makeLogSystemConsumerFromServerDBInfo(self.id, dbInfo->get()); Future dbInfoChange = dbInfo->onChange(); + bool hasBeenPublished = + std::find(dbInfo->get().client.cdcProxies.begin(), dbInfo->get().client.cdcProxies.end(), proxy) != + dbInfo->get().client.cdcProxies.end(); while (true) { auto result = co_await race(proxy.consume.getFuture(), @@ -279,6 +282,7 @@ Future cdcProxyServer(CDCProxyInterface proxy, proxy.registerStream.getFuture(), proxy.removeStream.getFuture(), proxy.listStreams.getFuture(), + proxy.haltForTesting.getFuture(), dbInfoChange, actors.getResult()); switch (result.index()) { @@ -298,15 +302,25 @@ Future cdcProxyServer(CDCProxyInterface proxy, actors.add(listStreams(&self, std::get<4>(std::move(result)))); break; case 5: - if (dbInfo->get().recoveryCount >= recoveryCount && - std::find(dbInfo->get().client.cdcProxies.begin(), dbInfo->get().client.cdcProxies.end(), proxy) == - dbInfo->get().client.cdcProxies.end()) { + if (!g_network->isSimulated()) { + std::get<5>(std::move(result)).reply.sendError(client_invalid_operation()); + break; + } + std::get<5>(std::move(result)).reply.send(Void()); + throw worker_removed(); + case 6: { + const bool isPublished = + std::find(dbInfo->get().client.cdcProxies.begin(), dbInfo->get().client.cdcProxies.end(), proxy) != + dbInfo->get().client.cdcProxies.end(); + if (hasBeenPublished && dbInfo->get().recoveryCount >= recoveryCount && !isPublished) { throw worker_removed(); } + hasBeenPublished = hasBeenPublished || isPublished; self.logSystem = makeLogSystemConsumerFromServerDBInfo(self.id, 
dbInfo->get()); dbInfoChange = dbInfo->onChange(); break; - case 6: + } + case 7: co_await actors.getResult(); break; default: diff --git a/fdbserver/clustercontroller/ClusterController.actor.cpp b/fdbserver/clustercontroller/ClusterController.actor.cpp index 2cffb7f2013..5d6cb04c5fc 100644 --- a/fdbserver/clustercontroller/ClusterController.actor.cpp +++ b/fdbserver/clustercontroller/ClusterController.actor.cpp @@ -28,6 +28,7 @@ #include "fdbclient/ClientBooleanParams.h" #include "fdbclient/FDBTypes.h" +#include "fdbclient/NativeCdc.h" #include "fdbclient/SystemData.h" #include "fdbclient/DatabaseContext.h" #include "fdbrpc/FailureMonitor.h" @@ -553,6 +554,150 @@ Future monitorAndRecruitLogRouters(ClusterControllerData* self) { } } +Future> monitorCDCProxies(std::vector const& cdcProxies) { + std::vector> failures; + for (const auto& proxy : cdcProxies) { + failures.push_back( + waitFailureClient(proxy.waitFailure, + SERVER_KNOBS->TLOG_TIMEOUT, + -SERVER_KNOBS->TLOG_TIMEOUT / SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, + /*trace=*/true, + /*traceMsg=*/"CDCProxyFailed"_sr)); + } + if (failures.empty()) { + co_await Future(Never()); + UNREACHABLE(); + } + + co_await quorum(failures, 1); + std::vector failedProxies; + for (int i = 0; i < failures.size(); ++i) { + if (failures[i].isReady() || failures[i].isError()) { + failedProxies.push_back(i); + } + } + co_return failedProxies; +} + +Future recruitFailedCDCProxies(ClusterControllerData* self, + uint64_t recoveryCount, + std::vector const& monitoredProxies, + std::vector const& failedIndexes) { + if (!self->db.recoveryData.isValid() || self->db.recoveryData->cstate.myDBState.recoveryCount != recoveryCount) { + co_return; + } + + std::vector> replacements; + for (int failedIndex : failedIndexes) { + ASSERT_WE_THINK(failedIndex >= 0 && failedIndex < monitoredProxies.size()); + const CDCProxyInterface& failedProxy = monitoredProxies[failedIndex]; + auto current = + 
std::find(self->db.recoveryData->cdcProxies.begin(), self->db.recoveryData->cdcProxies.end(), failedProxy); + if (current == self->db.recoveryData->cdcProxies.end()) { + continue; + } + + auto worker = self->id_worker.find(failedProxy.processId); + if (worker == self->id_worker.end()) { + throw recruitment_failed(); + } + + InitializeCDCProxyRequest request; + request.recoveryCount = recoveryCount; + CDCProxyInterface replacement = + co_await throwErrorOr(worker->second.details.interf.cdcProxy.getReplyUnlessFailedFor( + request, SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)); + + if (!self->db.recoveryData.isValid() || + self->db.recoveryData->cstate.myDBState.recoveryCount != recoveryCount) { + co_return; + } + current = + std::find(self->db.recoveryData->cdcProxies.begin(), self->db.recoveryData->cdcProxies.end(), failedProxy); + if (current == self->db.recoveryData->cdcProxies.end()) { + continue; + } + *current = replacement; + replacements.emplace_back(failedProxy.id(), replacement.id()); + TraceEvent("CDCProxyRecruited", self->id) + .detail("OldCDCProxyID", failedProxy.id()) + .detail("NewCDCProxyID", replacement.id()) + .detail("RecoveryCount", recoveryCount); + } + if (replacements.empty()) { + co_return; + } + + // Endpoint publication precedes assignment publication so clients never route + // a stream to a replacement that is not yet discoverable. 
+ self->db.recoveryData->registrationTrigger.trigger(); + while (self->db.recoveryData.isValid() && self->db.recoveryData->cstate.myDBState.recoveryCount == recoveryCount) { + bool allPublished = true; + for (const auto& [oldProxyId, newProxyId] : replacements) { + allPublished = allPublished && std::any_of(self->db.clientInfo->get().cdcProxies.begin(), + self->db.clientInfo->get().cdcProxies.end(), + [newProxyId](CDCProxyInterface const& proxy) { + return proxy.id() == newProxyId; + }); + } + if (allPublished) { + break; + } + co_await self->db.clientInfo->onChange(); + } + if (!self->db.recoveryData.isValid() || self->db.recoveryData->cstate.myDBState.recoveryCount != recoveryCount) { + co_return; + } + for (const auto& [oldProxyId, newProxyId] : replacements) { + co_await reassignNativeCdcStreams(self->db.db, oldProxyId, newProxyId); + } +} + +Future monitorAndRecruitCDCProxies(ClusterControllerData* self) { + while (true) { + while (self->db.serverInfo->get().recoveryState < RecoveryState::FULLY_RECOVERED || + !self->db.recoveryData.isValid() || self->db.recoveryData->cdcProxies.empty()) { + co_await self->db.serverInfo->onChange(); + } + + const uint64_t recoveryCount = self->db.recoveryData->cstate.myDBState.recoveryCount; + const std::vector monitoredProxies = self->db.recoveryData->cdcProxies; + Future> failures = monitorCDCProxies(monitoredProxies); + while (true) { + bool retryAfterFailure = false; + try { + auto result = co_await race(failures, self->db.serverInfo->onChange()); + if (result.index() == 0) { + const std::vector failedIndexes = std::get<0>(std::move(result)); + TraceEvent("CDCProxyFailureDetected", self->id) + .detail("FailedCount", failedIndexes.size()) + .detail("RecoveryCount", recoveryCount); + co_await recruitFailedCDCProxies(self, recoveryCount, monitoredProxies, failedIndexes); + break; + } + if (!self->db.recoveryData.isValid() || + self->db.recoveryData->cstate.myDBState.recoveryCount != recoveryCount || + 
self->db.recoveryData->cdcProxies != monitoredProxies) { + break; + } + } catch (Error& e) { + if (e.code() == error_code_actor_cancelled) { + throw; + } + CODE_PROBE(true, "CDC proxy re-recruitment failed"); + TraceEvent(SevWarnAlways, "CDCProxyReRecruitmentFailed", self->id) + .error(e) + .detail("RecoveryCount", recoveryCount); + retryAfterFailure = true; + } + if (retryAfterFailure) { + co_await delay(1.0); + break; + } + } + } +} + ACTOR Future clusterWatchDatabase(ClusterControllerData* cluster, ClusterControllerData::DBInfo* db, ServerCoordinators coordinators) { @@ -2979,6 +3124,7 @@ ACTOR Future clusterControllerCore(ClusterControllerFullInterface interf, self.addActor.send(monitorStorageMetadata(&self)); self.addActor.send(monitorGlobalConfig(&self.db)); self.addActor.send(monitorCDCProxyAssignments(&self.db)); + self.addActor.send(monitorAndRecruitCDCProxies(&self)); self.addActor.send(updatedChangingDatacenters(&self)); self.addActor.send(updatedChangedDatacenters(&self)); self.addActor.send(updateDatacenterVersionDifference(&self)); diff --git a/fdbserver/clustercontroller/ClusterRecovery.cpp b/fdbserver/clustercontroller/ClusterRecovery.cpp index b51d9f0060e..477b7a0e611 100644 --- a/fdbserver/clustercontroller/ClusterRecovery.cpp +++ b/fdbserver/clustercontroller/ClusterRecovery.cpp @@ -429,19 +429,6 @@ Future waitGrvProxyFailure(std::vector const& grvProxie return tagError(quorum(failed, 1), grv_proxy_failed()); } -Future waitCDCProxyFailure(std::vector const& cdcProxies) { - std::vector> failed; - failed.reserve(cdcProxies.size()); - for (auto cdcProxy : cdcProxies) { - failed.push_back(waitFailureClient(cdcProxy.waitFailure, - SERVER_KNOBS->TLOG_TIMEOUT, - -SERVER_KNOBS->TLOG_TIMEOUT / SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, - /*trace=*/true)); - } - ASSERT(failed.size() >= 1); - return tagError(quorum(failed, 1), cdc_proxy_failed()); -} - Future waitResolverFailure(std::vector const& resolvers) { std::vector> failed; 
failed.reserve(resolvers.size()); @@ -1798,7 +1785,6 @@ Future clusterRecoveryCore(Reference self) { self->addActor.send(waitResolverFailure(self->resolvers)); self->addActor.send(waitCommitProxyFailure(self->commitProxies)); self->addActor.send(waitGrvProxyFailure(self->grvProxies)); - self->addActor.send(waitCDCProxyFailure(self->cdcProxies)); self->addActor.send(reportErrors(updateRegistration(self, self->logSystem), "UpdateRegistration", self->dbgid)); self->registrationTrigger.trigger(); diff --git a/fdbserver/worker/worker.actor.cpp b/fdbserver/worker/worker.actor.cpp index 4334b1d404d..b10216c73e8 100644 --- a/fdbserver/worker/worker.actor.cpp +++ b/fdbserver/worker/worker.actor.cpp @@ -2800,6 +2800,7 @@ ACTOR Future workerServer(Reference connRecord, DUMPTOKEN(recruited.consume); DUMPTOKEN(recruited.ack); DUMPTOKEN(recruited.waitFailure); + DUMPTOKEN(recruited.haltForTesting); errorForwarders.add(zombie(recruited, forwardError(errors, diff --git a/fdbserver/workloads/NativeCdc.cpp b/fdbserver/workloads/NativeCdc.cpp index b727a77af78..46e9f01d29b 100644 --- a/fdbserver/workloads/NativeCdc.cpp +++ b/fdbserver/workloads/NativeCdc.cpp @@ -106,6 +106,21 @@ struct NativeCdcWorkload : TestWorkload { } } + Future getReplacementCDCProxy(CDCStreamId streamId, UID failedProxyId) { + while (true) { + const ClientDBInfo& client = dbInfo->get().client; + auto assigned = client.streamToCDCProxyId.find(streamId); + if (assigned != client.streamToCDCProxyId.end() && assigned->second != failedProxyId) { + for (const auto& proxy : client.cdcProxies) { + if (proxy.id() == assigned->second) { + co_return proxy; + } + } + } + co_await dbInfo->onChange(); + } + } + Future waitForCDCProxyAssignmentRemoval(CDCStreamId streamId) { while (dbInfo->get().client.streamToCDCProxyId.contains(streamId)) { co_await dbInfo->onChange(); @@ -227,9 +242,35 @@ struct NativeCdcWorkload : TestWorkload { ASSERT(foundInRangeWrite); ASSERT(!foundOutOfRangeWrite); - co_await 
owner.ack.getReply(CDCAckRequest(liveRegistration.streamId, writeVersion)); - ASSERT(co_await getPersistedMinVersion(cx, liveRegistration.streamId) == writeVersion + 1); - co_await owner.removeStream.getReply(CDCRemoveStreamRequest(liveName)); + const uint64_t recoveryCount = dbInfo->get().recoveryCount; + co_await owner.haltForTesting.getReply(HaltCDCProxyRequest()); + CDCProxyInterface replacement = + co_await timeoutError(getReplacementCDCProxy(liveRegistration.streamId, owner.id()), 30.0); + ASSERT(replacement.id() != owner.id()); + ASSERT(dbInfo->get().recoveryCount == recoveryCount); + + Transaction afterFailureWrite(cx); + afterFailureWrite.set("live/after-failure"_sr, "captured-after-failure"_sr); + co_await afterFailureWrite.commit(); + const Version afterFailureVersion = afterFailureWrite.getCommittedVersion(); + CDCConsumeReply afterFailure = + co_await timeoutError(replacement.consume.getReply(CDCConsumeRequest( + CDCCursor(liveRegistration.streamId, consumed.lastConsumedVersion))), + 30.0); + ASSERT(afterFailure.lastConsumedVersion >= afterFailureVersion); + bool foundAfterFailureWrite = false; + for (const auto& versioned : afterFailure.mutations) { + for (const auto& mutation : versioned.mutations) { + if (mutation.param1 == "live/after-failure"_sr) { + foundAfterFailureWrite = true; + } + } + } + ASSERT(foundAfterFailureWrite); + + co_await replacement.ack.getReply(CDCAckRequest(liveRegistration.streamId, afterFailureVersion)); + ASSERT(co_await getPersistedMinVersion(cx, liveRegistration.streamId) == afterFailureVersion + 1); + co_await replacement.removeStream.getReply(CDCRemoveStreamRequest(liveName)); co_await waitForCDCProxyAssignmentRemoval(liveRegistration.streamId); } }; From 230af4d065565f64cdf4603116b7f170f23832a2 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Sun, 24 May 2026 22:11:49 -0700 Subject: [PATCH 10/56] Preserve native CDC streams across recovery and proxy failover --- fdbserver/cdcproxy/CDCProxy.cpp | 14 ++-- 
.../ClusterController.actor.cpp | 30 +++++--- .../clustercontroller/ClusterController.h | 1 + .../clustercontroller/ClusterRecovery.cpp | 29 +++++-- fdbserver/clustercontroller/ClusterRecovery.h | 1 - fdbserver/commitproxy/CommitProxyServer.cpp | 1 + fdbserver/workloads/NativeCdc.cpp | 75 ++++++++++++++++++- 7 files changed, 126 insertions(+), 25 deletions(-) diff --git a/fdbserver/cdcproxy/CDCProxy.cpp b/fdbserver/cdcproxy/CDCProxy.cpp index 6e12ab2ad10..2eb07542e95 100644 --- a/fdbserver/cdcproxy/CDCProxy.cpp +++ b/fdbserver/cdcproxy/CDCProxy.cpp @@ -161,10 +161,12 @@ Future consume(CDCProxyData* self, CDCConsumeRequest request) { Reference cursor = self->logSystem->peekSingle(self->id, begin, state.currentTag, state.tagHistory); - co_await cursor->getMore(TaskPriority::TLogPeekReply); - cursor->setProtocolVersion(g_network->protocolVersion()); - if (cursor->popped() > begin) { - throw transaction_too_old(); + while (!cursor->hasMessage()) { + co_await cursor->getMore(TaskPriority::TLogPeekReply); + cursor->setProtocolVersion(g_network->protocolVersion()); + if (cursor->popped() > begin) { + throw transaction_too_old(); + } } CDCConsumeReply reply; @@ -316,7 +318,9 @@ Future cdcProxyServer(CDCProxyInterface proxy, throw worker_removed(); } hasBeenPublished = hasBeenPublished || isPublished; - self.logSystem = makeLogSystemConsumerFromServerDBInfo(self.id, dbInfo->get()); + if (!dbInfo->get().logSystemConfig.tLogs.empty()) { + self.logSystem = makeLogSystemConsumerFromServerDBInfo(self.id, dbInfo->get()); + } dbInfoChange = dbInfo->onChange(); break; } diff --git a/fdbserver/clustercontroller/ClusterController.actor.cpp b/fdbserver/clustercontroller/ClusterController.actor.cpp index 5d6cb04c5fc..f915dd7094e 100644 --- a/fdbserver/clustercontroller/ClusterController.actor.cpp +++ b/fdbserver/clustercontroller/ClusterController.actor.cpp @@ -591,15 +591,22 @@ Future recruitFailedCDCProxies(ClusterControllerData* self, for (int failedIndex : failedIndexes) { 
ASSERT_WE_THINK(failedIndex >= 0 && failedIndex < monitoredProxies.size()); const CDCProxyInterface& failedProxy = monitoredProxies[failedIndex]; - auto current = - std::find(self->db.recoveryData->cdcProxies.begin(), self->db.recoveryData->cdcProxies.end(), failedProxy); - if (current == self->db.recoveryData->cdcProxies.end()) { + auto current = std::find(self->db.cdcProxies.begin(), self->db.cdcProxies.end(), failedProxy); + if (current == self->db.cdcProxies.end()) { continue; } auto worker = self->id_worker.find(failedProxy.processId); if (worker == self->id_worker.end()) { - throw recruitment_failed(); + for (const auto& grvProxy : self->db.recoveryData->grvProxies) { + worker = self->id_worker.find(grvProxy.processId); + if (worker != self->id_worker.end()) { + break; + } + } + if (worker == self->id_worker.end()) { + throw recruitment_failed(); + } } InitializeCDCProxyRequest request; @@ -612,9 +619,8 @@ Future recruitFailedCDCProxies(ClusterControllerData* self, self->db.recoveryData->cstate.myDBState.recoveryCount != recoveryCount) { co_return; } - current = - std::find(self->db.recoveryData->cdcProxies.begin(), self->db.recoveryData->cdcProxies.end(), failedProxy); - if (current == self->db.recoveryData->cdcProxies.end()) { + current = std::find(self->db.cdcProxies.begin(), self->db.cdcProxies.end(), failedProxy); + if (current == self->db.cdcProxies.end()) { continue; } *current = replacement; @@ -655,13 +661,13 @@ Future recruitFailedCDCProxies(ClusterControllerData* self, Future monitorAndRecruitCDCProxies(ClusterControllerData* self) { while (true) { - while (self->db.serverInfo->get().recoveryState < RecoveryState::FULLY_RECOVERED || - !self->db.recoveryData.isValid() || self->db.recoveryData->cdcProxies.empty()) { + while (self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS || + !self->db.recoveryData.isValid() || self->db.cdcProxies.empty()) { co_await self->db.serverInfo->onChange(); } const uint64_t recoveryCount = 
self->db.recoveryData->cstate.myDBState.recoveryCount; - const std::vector monitoredProxies = self->db.recoveryData->cdcProxies; + const std::vector monitoredProxies = self->db.cdcProxies; Future> failures = monitorCDCProxies(monitoredProxies); while (true) { bool retryAfterFailure = false; @@ -677,7 +683,7 @@ Future monitorAndRecruitCDCProxies(ClusterControllerData* self) { } if (!self->db.recoveryData.isValid() || self->db.recoveryData->cstate.myDBState.recoveryCount != recoveryCount || - self->db.recoveryData->cdcProxies != monitoredProxies) { + self->db.cdcProxies != monitoredProxies) { break; } } catch (Error& e) { @@ -738,6 +744,8 @@ ACTOR Future clusterWatchDatabase(ClusterControllerData* cluster, dbInfo.client = ClientDBInfo(); dbInfo.client.clusterId = db->serverInfo->get().client.clusterId; dbInfo.client.clusterType = db->clusterType; + dbInfo.client.cdcProxies = db->cdcProxies; + dbInfo.client.streamToCDCProxyId = db->clientInfo->get().streamToCDCProxyId; TraceEvent("CCWDB", cluster->id) .detail("NewMaster", dbInfo.master.id().toString()) diff --git a/fdbserver/clustercontroller/ClusterController.h b/fdbserver/clustercontroller/ClusterController.h index 3b0018d1ec5..61d9d8d2240 100644 --- a/fdbserver/clustercontroller/ClusterController.h +++ b/fdbserver/clustercontroller/ClusterController.h @@ -143,6 +143,7 @@ class ClusterControllerData { Future clientCounter; int clientCount; ClusterType clusterType = ClusterType::STANDALONE; + std::vector cdcProxies; Reference recoveryData; DBInfo() diff --git a/fdbserver/clustercontroller/ClusterRecovery.cpp b/fdbserver/clustercontroller/ClusterRecovery.cpp index 477b7a0e611..b0766127975 100644 --- a/fdbserver/clustercontroller/ClusterRecovery.cpp +++ b/fdbserver/clustercontroller/ClusterRecovery.cpp @@ -232,7 +232,12 @@ Future newGrvProxies(Reference self, RecruitFromConfi self->grvProxies = std::move(newRecruits); } -Future newCDCProxies(Reference self, RecruitFromConfigurationReply recr) { +Future 
ensureCDCProxies(Reference self, RecruitFromConfigurationReply recr) { + if (!self->controllerData->db.cdcProxies.empty()) { + TraceEvent("CDCProxiesReused", self->dbgid).detail("Count", self->controllerData->db.cdcProxies.size()); + co_return; + } + std::vector> initializationReplies; for (int i = 0; i < recr.grvProxies.size(); i++) { InitializeCDCProxyRequest req; @@ -246,7 +251,7 @@ Future newCDCProxies(Reference self, RecruitFromConfi std::vector newRecruits = co_await getAll(initializationReplies); TraceEvent("CDCProxyInitializationComplete", self->dbgid).log(); - self->cdcProxies = std::move(newRecruits); + self->controllerData->db.cdcProxies = std::move(newRecruits); } Future newResolvers(Reference self, RecruitFromConfigurationReply recr) { @@ -846,7 +851,7 @@ Future updateRegistration(Reference self, ReferenceprovisionalCommitProxies, self->provisionalGrvProxies, - std::vector(), + self->controllerData->db.cdcProxies, self->resolvers, self->cstate.myDBState.recoveryCount, self->cstate.prevDBState.getPriorCommittedLogServers()); @@ -856,7 +861,7 @@ Future updateRegistration(Reference self, ReferencecommitProxies, self->grvProxies, - self->cdcProxies, + self->controllerData->db.cdcProxies, self->resolvers, self->cstate.myDBState.recoveryCount, std::vector()); @@ -1145,7 +1150,7 @@ Future>> recruitEverything( Future txnSystemInitialized = traceAfter(newCommitProxies(self, recruits), "CommitProxiesInitialized") && traceAfter(newGrvProxies(self, recruits), "GRVProxiesInitialized") && - traceAfter(newCDCProxies(self, recruits), "CDCProxiesInitialized") && + traceAfter(ensureCDCProxies(self, recruits), "CDCProxiesAvailable") && traceAfter(newResolvers(self, recruits), "ResolversInitialized") && traceAfter(newTLogServers(self, recruits, oldLogSystem, &confChanges), "TLogServersInitialized"); co_await (txnSystemInitialized || monitorInitializingTxnSystem(self->controllerData->db.unfinishedRecoveries)); @@ -1303,6 +1308,20 @@ Future 
readTransactionSystemState(Reference self, self->allTags.push_back(decodeServerTagValue(kv.value)); } + std::set activeCdcStreams; + RangeResult rawCdcStreams = co_await self->txnStateStore->readRange(cdcStreamKeys); + for (auto& kv : rawCdcStreams) { + activeCdcStreams.insert(decodeCDCStreamKey(kv.key)); + } + + RangeResult rawCdcHistoryTags = co_await self->txnStateStore->readRange(cdcTagHistoryKeys); + for (auto& kv : rawCdcHistoryTags) { + const auto tagHistory = decodeCDCTagHistoryKey(kv.key); + if (activeCdcStreams.contains(std::get<0>(tagHistory))) { + self->allTags.push_back(std::get<2>(tagHistory)); + } + } + uniquify(self->allTags); self->txnStateLogAdapter->setNextVersion( diff --git a/fdbserver/clustercontroller/ClusterRecovery.h b/fdbserver/clustercontroller/ClusterRecovery.h index 6d82c1cadcc..09499d5dec3 100644 --- a/fdbserver/clustercontroller/ClusterRecovery.h +++ b/fdbserver/clustercontroller/ClusterRecovery.h @@ -208,7 +208,6 @@ struct ClusterRecoveryData : NonCopyable, ReferenceCounted std::vector provisionalCommitProxies; std::vector grvProxies; std::vector provisionalGrvProxies; - std::vector cdcProxies; std::vector resolvers; std::map lastCommitProxyVersionReplies; diff --git a/fdbserver/commitproxy/CommitProxyServer.cpp b/fdbserver/commitproxy/CommitProxyServer.cpp index f85b179d874..8bc17833945 100644 --- a/fdbserver/commitproxy/CommitProxyServer.cpp +++ b/fdbserver/commitproxy/CommitProxyServer.cpp @@ -2758,6 +2758,7 @@ Future processCompleteTransactionStateRequest(TransactionStateResolveConte auto lockedKey = pContext->pTxnStateStore->readValue(databaseLockedKey).get(); pContext->pCommitData->locked = lockedKey.present() && !lockedKey.get().empty(); pContext->pCommitData->metadataVersion = pContext->pTxnStateStore->readValue(metadataVersionKey).get(); + pContext->pCommitData->cdcRouting.reload(pContext->pTxnStateStore); pContext->pTxnStateStore->enableSnapshot(); } diff --git a/fdbserver/workloads/NativeCdc.cpp 
b/fdbserver/workloads/NativeCdc.cpp index 46e9f01d29b..6586a51f9ef 100644 --- a/fdbserver/workloads/NativeCdc.cpp +++ b/fdbserver/workloads/NativeCdc.cpp @@ -22,8 +22,10 @@ #include #include "fdbclient/CDCProxyInterface.h" +#include "fdbclient/ManagementAPI.h" #include "fdbclient/NativeCdc.h" #include "fdbclient/SystemData.h" +#include "fdbserver/core/RecoveryState.h" #include "fdbserver/core/ServerDBInfo.h" #include "fdbserver/tester/workloads.h" @@ -128,6 +130,26 @@ struct NativeCdcWorkload : TestWorkload { co_return; } + Future changeResolverCount(Database cx, int32_t count) { + Standalone config(format("resolvers=%d", count)); + while (true) { + Optional conf; + ConfigurationResult result = + co_await ManagementAPI::changeConfig(cx.getReference(), { config }, conf, true); + if (result == ConfigurationResult::SUCCESS) { + co_return; + } + co_await delay(1.0); + } + } + + Future waitForRecoveryAfter(uint64_t previousRecoveryCount, RecoveryState requiredState) { + while (dbInfo->get().recoveryCount <= previousRecoveryCount || dbInfo->get().recoveryState < requiredState) { + co_await dbInfo->onChange(); + } + co_return; + } + Future run(Database cx) { const Key firstName = "native-cdc-first"_sr; const Key secondName = "native-cdc-second"_sr; @@ -197,6 +219,7 @@ struct NativeCdcWorkload : TestWorkload { ASSERT(listed.streams[0].keys == liveRange); Transaction write(cx); + write.setOption(FDBTransactionOptions::LOCK_AWARE); write.set("live/in"_sr, "captured"_sr); write.set("other/out"_sr, "ignored"_sr); co_await write.commit(); @@ -250,6 +273,7 @@ struct NativeCdcWorkload : TestWorkload { ASSERT(dbInfo->get().recoveryCount == recoveryCount); Transaction afterFailureWrite(cx); + afterFailureWrite.setOption(FDBTransactionOptions::LOCK_AWARE); afterFailureWrite.set("live/after-failure"_sr, "captured-after-failure"_sr); co_await afterFailureWrite.commit(); const Version afterFailureVersion = afterFailureWrite.getCommittedVersion(); @@ -268,9 +292,54 @@ struct 
NativeCdcWorkload : TestWorkload { } ASSERT(foundAfterFailureWrite); - co_await replacement.ack.getReply(CDCAckRequest(liveRegistration.streamId, afterFailureVersion)); - ASSERT(co_await getPersistedMinVersion(cx, liveRegistration.streamId) == afterFailureVersion + 1); - co_await replacement.removeStream.getReply(CDCRemoveStreamRequest(liveName)); + const Version cursorBeforeRecovery = afterFailure.lastConsumedVersion; + co_await replacement.ack.getReply(CDCAckRequest(liveRegistration.streamId, cursorBeforeRecovery)); + ASSERT(co_await getPersistedMinVersion(cx, liveRegistration.streamId) == cursorBeforeRecovery + 1); + + const int32_t recoveredResolverCount = (co_await getDatabaseConfiguration(cx)).getDesiredResolvers() + 1; + const UID ownerBeforeRecovery = replacement.id(); + const uint64_t recoveryBeforeChange = dbInfo->get().recoveryCount; + co_await changeResolverCount(cx, recoveredResolverCount); + co_await timeoutError(waitForRecoveryAfter(recoveryBeforeChange, RecoveryState::ACCEPTING_COMMITS), 60.0); + CDCProxyInterface recoveredOwner = co_await getCDCProxy(liveRegistration.streamId); + ASSERT(recoveredOwner.id() == ownerBeforeRecovery); + + Transaction afterRecoveryWrite(cx); + afterRecoveryWrite.setOption(FDBTransactionOptions::LOCK_AWARE); + afterRecoveryWrite.set("live/after-recovery"_sr, "captured-after-recovery"_sr); + co_await afterRecoveryWrite.commit(); + const Version afterRecoveryVersion = afterRecoveryWrite.getCommittedVersion(); + Version afterRecoveryCursor = cursorBeforeRecovery; + bool foundAfterRecoveryWrite = false; + const double afterRecoveryConsumeDeadline = now() + 30.0; + while (afterRecoveryCursor < afterRecoveryVersion) { + CDCConsumeReply afterRecovery = + co_await timeoutError(recoveredOwner.consume.getReply( + CDCConsumeRequest(CDCCursor(liveRegistration.streamId, afterRecoveryCursor))), + 30.0); + if (afterRecovery.lastConsumedVersion == afterRecoveryCursor) { + ASSERT(now() < afterRecoveryConsumeDeadline); + co_await 
delay(0.1); + continue; + } + ASSERT(afterRecovery.lastConsumedVersion > afterRecoveryCursor); + afterRecoveryCursor = afterRecovery.lastConsumedVersion; + for (const auto& versioned : afterRecovery.mutations) { + for (const auto& mutation : versioned.mutations) { + if (mutation.param1 == "live/after-recovery"_sr) { + foundAfterRecoveryWrite = true; + } + } + } + } + ASSERT(foundAfterRecoveryWrite); + + co_await recoveredOwner.ack.getReply(CDCAckRequest(liveRegistration.streamId, afterRecoveryCursor)); + ASSERT(co_await getPersistedMinVersion(cx, liveRegistration.streamId) == afterRecoveryCursor + 1); + co_await timeoutError(waitForRecoveryAfter(recoveryBeforeChange, RecoveryState::FULLY_RECOVERED), 60.0); + recoveredOwner = co_await getCDCProxy(liveRegistration.streamId); + ASSERT(recoveredOwner.id() == ownerBeforeRecovery); + co_await recoveredOwner.removeStream.getReply(CDCRemoveStreamRequest(liveName)); co_await waitForCDCProxyAssignmentRemoval(liveRegistration.streamId); } }; From 92752c5a64bbf9da74199dd1fe4adbc797505cc3 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Mon, 25 May 2026 09:45:45 -0700 Subject: [PATCH 11/56] Add buffered native CDC consumption and safe shared-tag acknowledgements --- fdbclient/SystemData.cpp | 8 + fdbclient/include/fdbclient/SystemData.h | 1 + fdbserver/cdcproxy/CDCProxy.cpp | 310 +++++++++++++++++++---- fdbserver/workloads/NativeCdc.cpp | 93 ++++++- tests/fast/NativeCdcSharedTag.toml | 7 + 5 files changed, 371 insertions(+), 48 deletions(-) create mode 100644 tests/fast/NativeCdcSharedTag.toml diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index dbaa70d9f78..ef7ad537d38 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -867,6 +867,13 @@ Key cdcMinVersionKeyFor(CDCStreamId streamId) { return wr.toValue(); } +CDCStreamId decodeCDCMinVersionKey(KeyRef const& key) { + CDCStreamId streamId; + BinaryReader reader(key.removePrefix(cdcMinVersionKeys.begin), Unversioned()); + reader >> 
streamId; + return streamId; +} + Value cdcMinVersionValue(Version version) { BinaryWriter wr(IncludeVersion(ProtocolVersion::withNativeCdc())); wr << version; @@ -1797,6 +1804,7 @@ TEST_CASE("noSim/SystemData/NativeCDC") { ASSERT(decodeCDCStreamNameValue(cdcStreamNameValue(streamId)) == streamId); ASSERT(decodeCDCStreamKey(cdcStreamKeyFor(streamId)) == streamId); ASSERT(decodeCDCStreamKeysValue(cdcStreamKeysValue(keys)) == keys); + ASSERT(decodeCDCMinVersionKey(cdcMinVersionKeyFor(streamId)) == streamId); ASSERT(decodeCDCMinVersionValue(cdcMinVersionValue(minVersion)) == minVersion); const Key tagHistoryKey = cdcTagHistoryKeyFor(streamId, minVersion, tag); diff --git a/fdbclient/include/fdbclient/SystemData.h b/fdbclient/include/fdbclient/SystemData.h index bc7785e68a7..27385cd5976 100644 --- a/fdbclient/include/fdbclient/SystemData.h +++ b/fdbclient/include/fdbclient/SystemData.h @@ -290,6 +290,7 @@ std::tuple decodeCDCTagHistoryKey(KeyRef const& key); // "\xff/cdc/minVersion/[[CDCStreamId]]" := "[[Version]]" extern const KeyRangeRef cdcMinVersionKeys; Key cdcMinVersionKeyFor(CDCStreamId streamId); +CDCStreamId decodeCDCMinVersionKey(KeyRef const& key); Value cdcMinVersionValue(Version version); Version decodeCDCMinVersionValue(ValueRef const& value); diff --git a/fdbserver/cdcproxy/CDCProxy.cpp b/fdbserver/cdcproxy/CDCProxy.cpp index 2eb07542e95..fe32eeeed7e 100644 --- a/fdbserver/cdcproxy/CDCProxy.cpp +++ b/fdbserver/cdcproxy/CDCProxy.cpp @@ -19,7 +19,9 @@ */ #include +#include #include +#include #include #include #include @@ -49,14 +51,30 @@ struct CDCStreamReadState { std::vector> tagHistory; }; +struct CDCBufferedStream : ReferenceCounted { + CDCStreamId streamId; + bool active = true; + bool initialized = false; + Version minVersion = invalidVersion; + Version bufferedThrough = invalidVersion; + std::deque> mutations; + AsyncTrigger changed; + AsyncTrigger refresh; + AsyncTrigger stopped; + + explicit CDCBufferedStream(CDCStreamId streamId) : 
streamId(streamId) {} +}; + struct CDCProxyData { UID id; Database cx; Reference const> dbInfo; - Reference logSystem; + Reference>> logSystem; + std::map> streams; CDCProxyData(CDCProxyInterface const& proxy, Reference const> dbInfo) - : id(proxy.id()), cx(openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, LockAware::True)), dbInfo(dbInfo) {} + : id(proxy.id()), cx(openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, LockAware::True)), dbInfo(dbInfo), + logSystem(makeReference>>()) {} }; Optional clipCDCMutation(MutationRef const& mutation, KeyRangeRef const& keys) { @@ -145,6 +163,197 @@ Future readCDCStreamState(Database cx, } } +void bufferMessages(Reference stream, + CDCStreamReadState const& metadata, + Reference cursor) { + while (cursor->hasMessage()) { + const Version messageVersion = cursor->version().version; + ArenaReader& reader = *cursor->reader(); + if (LogProtocolMessage::isNextIn(reader)) { + LogProtocolMessage protocolMessage; + reader >> protocolMessage; + cursor->setProtocolVersion(reader.protocolVersion()); + } else if (reader.protocolVersion().hasSpanContext() && SpanContextMessage::isNextIn(reader)) { + SpanContextMessage contextMessage; + reader >> contextMessage; + } else if (reader.protocolVersion().hasOTELSpanContext() && OTELSpanContextMessage::isNextIn(reader)) { + OTELSpanContextMessage contextMessage; + reader >> contextMessage; + } else { + MutationRef mutation; + reader >> mutation; + Optional clipped = clipCDCMutation(mutation, metadata.keys.get()); + if (clipped.present()) { + if (stream->mutations.empty() || stream->mutations.back().version != messageVersion) { + stream->mutations.emplace_back(); + stream->mutations.back().version = messageVersion; + } + stream->mutations.back().mutations.push_back_deep(stream->mutations.back().arena(), clipped.get()); + } + } + stream->bufferedThrough = std::max(stream->bufferedThrough, messageVersion); + cursor->nextMessage(); + } +} + +Future bufferStream(CDCProxyData* self, Reference 
stream) { + try { + CDCStreamReadState metadata = co_await readCDCStreamState(self->cx, stream->streamId, self->id, true); + stream->minVersion = metadata.minVersion; + stream->bufferedThrough = metadata.minVersion - 1; + stream->initialized = true; + stream->changed.trigger(); + + while (stream->active) { + if (!self->logSystem->get()) { + co_await self->logSystem->onChange(); + continue; + } + + metadata = co_await readCDCStreamState(self->cx, stream->streamId, self->id, true); + const Version begin = stream->bufferedThrough + 1; + Reference cursor = + self->logSystem->get()->peekSingle(self->id, begin, metadata.currentTag, metadata.tagHistory); + while (stream->active) { + auto result = co_await race(cursor->getMore(TaskPriority::TLogPeekReply), + self->logSystem->onChange(), + stream->stopped.onTrigger(), + stream->refresh.onTrigger()); + if (result.index() == 1) { + break; + } + if (result.index() == 2) { + co_return; + } + if (result.index() == 3) { + break; + } + + cursor->setProtocolVersion(g_network->protocolVersion()); + if (cursor->popped() > begin) { + throw transaction_too_old(); + } + + const Version previousBufferedThrough = stream->bufferedThrough; + bufferMessages(stream, metadata, cursor); + if (stream->bufferedThrough > previousBufferedThrough) { + stream->changed.trigger(); + } + if (cursor->isExhausted()) { + Optional nextTagBoundary; + for (const auto& historyEntry : metadata.tagHistory) { + const Version boundary = historyEntry.first; + if (boundary > begin && (!nextTagBoundary.present() || boundary < nextTagBoundary.get())) { + nextTagBoundary = boundary; + } + } + if (nextTagBoundary.present()) { + const Version previousBufferedThrough = stream->bufferedThrough; + stream->bufferedThrough = std::max(stream->bufferedThrough, nextTagBoundary.get() - 1); + if (stream->bufferedThrough > previousBufferedThrough) { + stream->changed.trigger(); + } + } else { + co_await delay(0.1); + } + break; + } + } + } + } catch (Error& e) { + if (e.code() == 
error_code_client_invalid_operation || e.code() == error_code_wrong_shard_server) { + stream->active = false; + stream->changed.trigger(); + co_return; + } + throw; + } +} + +Future> readSafePopVersions(Database cx) { + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE); + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + + std::map minVersions; + Key begin = cdcMinVersionKeys.begin; + while (begin < cdcMinVersionKeys.end) { + RangeResult minima = + co_await tr.getRange(KeyRangeRef(begin, cdcMinVersionKeys.end), CLIENT_KNOBS->TOO_MANY); + for (const auto& kv : minima) { + minVersions[decodeCDCMinVersionKey(kv.key)] = decodeCDCMinVersionValue(kv.value); + } + if (!minima.more) { + break; + } + begin = keyAfter(minima.back().key); + } + + std::map safePopVersions; + begin = cdcTagHistoryKeys.begin; + while (begin < cdcTagHistoryKeys.end) { + RangeResult histories = + co_await tr.getRange(KeyRangeRef(begin, cdcTagHistoryKeys.end), CLIENT_KNOBS->TOO_MANY); + for (const auto& kv : histories) { + const auto [streamId, version, tag] = decodeCDCTagHistoryKey(kv.key); + auto minimum = minVersions.find(streamId); + if (minimum == minVersions.end()) { + continue; + } + auto safePop = safePopVersions.find(tag); + if (safePop == safePopVersions.end()) { + safePopVersions[tag] = minimum->second; + } else { + safePop->second = std::min(safePop->second, minimum->second); + } + } + if (!histories.more) { + break; + } + begin = keyAfter(histories.back().key); + } + co_return safePopVersions; + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } +} + +Future popAcknowledgedData(CDCProxyData* self) { + const std::map safePopVersions = co_await readSafePopVersions(self->cx); + for (const auto& [tag, version] : safePopVersions) { + self->logSystem->get()->pop(version, tag); + } +} + +void reconcileStreams(CDCProxyData* self, ActorCollection* actors) { + std::set assignedStreams; + for (const auto& 
[streamId, proxyId] : self->dbInfo->get().client.streamToCDCProxyId) { + if (proxyId == self->id) { + assignedStreams.insert(streamId); + if (!self->streams.contains(streamId)) { + Reference stream = makeReference(streamId); + self->streams.emplace(streamId, stream); + actors->add(bufferStream(self, stream)); + } + } + } + + for (auto it = self->streams.begin(); it != self->streams.end();) { + if (!assignedStreams.contains(it->first)) { + it->second->active = false; + it->second->stopped.trigger(); + it = self->streams.erase(it); + } else { + ++it; + } + } +} + Future consume(CDCProxyData* self, CDCConsumeRequest request) { try { if (request.cursor.lastConsumedVersion < invalidVersion || @@ -152,54 +361,48 @@ Future consume(CDCProxyData* self, CDCConsumeRequest request) { throw client_invalid_operation(); } - CDCStreamReadState state = co_await readCDCStreamState(self->cx, request.cursor.streamId, self->id, true); - Version begin = request.cursor.lastConsumedVersion == invalidVersion ? state.minVersion + co_await readCDCStreamState(self->cx, request.cursor.streamId, self->id, true); + auto found = self->streams.find(request.cursor.streamId); + if (found == self->streams.end()) { + throw wrong_shard_server(); + } + Reference stream = found->second; + while (!stream->initialized) { + co_await stream->changed.onTrigger(); + } + + Version begin = request.cursor.lastConsumedVersion == invalidVersion ? 
stream->minVersion : request.cursor.lastConsumedVersion + 1; - if (begin < state.minVersion) { + if (begin < stream->minVersion) { throw transaction_too_old(); } - Reference cursor = - self->logSystem->peekSingle(self->id, begin, state.currentTag, state.tagHistory); - while (!cursor->hasMessage()) { - co_await cursor->getMore(TaskPriority::TLogPeekReply); - cursor->setProtocolVersion(g_network->protocolVersion()); - if (cursor->popped() > begin) { - throw transaction_too_old(); - } + if (stream->bufferedThrough < begin) { + stream->refresh.trigger(); + } + while (stream->active && stream->bufferedThrough < begin) { + co_await stream->changed.onTrigger(); + } + if (!stream->active) { + throw wrong_shard_server(); } CDCConsumeReply reply; reply.lastConsumedVersion = request.cursor.lastConsumedVersion; - while (cursor->hasMessage()) { - const Version messageVersion = cursor->version().version; - ArenaReader& reader = *cursor->reader(); - if (LogProtocolMessage::isNextIn(reader)) { - LogProtocolMessage protocolMessage; - reader >> protocolMessage; - cursor->setProtocolVersion(reader.protocolVersion()); - } else if (reader.protocolVersion().hasSpanContext() && SpanContextMessage::isNextIn(reader)) { - SpanContextMessage contextMessage; - reader >> contextMessage; - } else if (reader.protocolVersion().hasOTELSpanContext() && OTELSpanContextMessage::isNextIn(reader)) { - OTELSpanContextMessage contextMessage; - reader >> contextMessage; - } else { - MutationRef mutation; - reader >> mutation; - Optional clipped = clipCDCMutation(mutation, state.keys.get()); - if (clipped.present()) { - if (reply.mutations.empty() || reply.mutations.back().version != messageVersion) { - reply.mutations.push_back(reply.arena, VersionedMutationsRef(messageVersion, {})); - } - reply.mutations.back().mutations.push_back_deep(reply.arena, clipped.get()); + for (const auto& versioned : stream->mutations) { + if (versioned.version >= begin) { + reply.mutations.push_back(reply.arena, 
VersionedMutationsRef(versioned.version, {})); + for (const auto& mutation : versioned.mutations) { + reply.mutations.back().mutations.push_back_deep(reply.arena, mutation); } } - reply.lastConsumedVersion = std::max(reply.lastConsumedVersion, messageVersion); - cursor->nextMessage(); } + reply.lastConsumedVersion = stream->bufferedThrough; request.reply.send(reply); } catch (Error& e) { + if (e.code() == error_code_actor_cancelled) { + throw; + } request.reply.sendError(e); } co_return; @@ -207,17 +410,21 @@ Future consume(CDCProxyData* self, CDCConsumeRequest request) { Future acknowledge(CDCProxyData* self, CDCAckRequest request) { try { - CDCStreamReadState state = co_await readCDCStreamState(self->cx, request.streamId, self->id, false); + co_await readCDCStreamState(self->cx, request.streamId, self->id, false); const Version minVersion = co_await acknowledgeNativeCdcStream(self->cx, request.streamId, request.version); - std::set tags{ state.currentTag }; - for (const auto& history : state.tagHistory) { - tags.insert(history.second); - } - for (Tag tag : tags) { - self->logSystem->pop(minVersion, tag); + auto found = self->streams.find(request.streamId); + if (found != self->streams.end()) { + found->second->minVersion = std::max(found->second->minVersion, minVersion); + while (!found->second->mutations.empty() && found->second->mutations.front().version < minVersion) { + found->second->mutations.pop_front(); + } } + co_await popAcknowledgedData(self); request.reply.send(Void()); } catch (Error& e) { + if (e.code() == error_code_actor_cancelled) { + throw; + } request.reply.sendError(e); } co_return; @@ -228,6 +435,9 @@ Future registerStream(CDCProxyData* self, CDCRegisterStreamRequest request const CDCStreamId streamId = co_await registerNativeCdcStream(self->cx, request.name, request.keys, self->id); request.reply.send(CDCRegisterStreamReply(streamId)); } catch (Error& e) { + if (e.code() == error_code_actor_cancelled) { + throw; + } 
request.reply.sendError(e); } co_return; @@ -238,6 +448,9 @@ Future removeStream(CDCProxyData* self, CDCRemoveStreamRequest request) { co_await removeNativeCdcStream(self->cx, request.name, self->id); request.reply.send(Void()); } catch (Error& e) { + if (e.code() == error_code_actor_cancelled) { + throw; + } request.reply.sendError(e); } co_return; @@ -256,6 +469,9 @@ Future listStreams(CDCProxyData* self, CDCListStreamsRequest request) { } request.reply.send(reply); } catch (Error& e) { + if (e.code() == error_code_actor_cancelled) { + throw; + } request.reply.sendError(e); } co_return; @@ -272,7 +488,8 @@ Future cdcProxyServer(CDCProxyInterface proxy, actors.add(waitFailureServer(proxy.waitFailure.getFuture())); actors.add(traceRole(Role::CDC_PROXY, proxy.id())); - self.logSystem = makeLogSystemConsumerFromServerDBInfo(self.id, dbInfo->get()); + self.logSystem->set(makeLogSystemConsumerFromServerDBInfo(self.id, dbInfo->get())); + reconcileStreams(&self, &actors); Future dbInfoChange = dbInfo->onChange(); bool hasBeenPublished = std::find(dbInfo->get().client.cdcProxies.begin(), dbInfo->get().client.cdcProxies.end(), proxy) != @@ -319,8 +536,9 @@ Future cdcProxyServer(CDCProxyInterface proxy, } hasBeenPublished = hasBeenPublished || isPublished; if (!dbInfo->get().logSystemConfig.tLogs.empty()) { - self.logSystem = makeLogSystemConsumerFromServerDBInfo(self.id, dbInfo->get()); + self.logSystem->set(makeLogSystemConsumerFromServerDBInfo(self.id, dbInfo->get())); } + reconcileStreams(&self, &actors); dbInfoChange = dbInfo->onChange(); break; } diff --git a/fdbserver/workloads/NativeCdc.cpp b/fdbserver/workloads/NativeCdc.cpp index 6586a51f9ef..7c4f8ee15f0 100644 --- a/fdbserver/workloads/NativeCdc.cpp +++ b/fdbserver/workloads/NativeCdc.cpp @@ -31,8 +31,10 @@ struct NativeCdcWorkload : TestWorkload { static constexpr auto NAME = "NativeCdc"; + bool sharedTagSafety; - explicit NativeCdcWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {} + explicit 
NativeCdcWorkload(WorkloadContext const& wcx) + : TestWorkload(wcx), sharedTagSafety(getOption(options, "sharedTagSafety"_sr, false)) {} Future setup(Database const& cx) override { return Void(); } @@ -40,7 +42,7 @@ struct NativeCdcWorkload : TestWorkload { if (clientId != 0) { return Void(); } - return run(cx); + return sharedTagSafety ? runSharedTagSafety(cx) : run(cx); } Future check(Database const& cx) override { return true; } @@ -86,6 +88,42 @@ struct NativeCdcWorkload : TestWorkload { } } + Future appendPersistedTag(Database cx, CDCStreamId streamId, Tag tag) { + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + const Version assignmentVersion = co_await tr.getReadVersion(); + tr.set(cdcTagHistoryKeyFor(streamId, assignmentVersion, tag), Value()); + co_await tr.commit(); + co_return; + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } + } + + Future getLatestPersistedTag(Database cx, CDCStreamId streamId) { + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + RangeResult history = co_await tr.getRange(cdcTagHistoryRangeFor(streamId), CLIENT_KNOBS->TOO_MANY); + ASSERT(!history.empty()); + const auto historyEntry = decodeCDCTagHistoryKey(history.back().key); + ASSERT(std::get<0>(historyEntry) == streamId); + co_return std::get<2>(historyEntry); + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } + } + Future getCDCProxy() { while (dbInfo->get().client.cdcProxies.empty()) { co_await dbInfo->onChange(); @@ -150,6 +188,57 @@ struct NativeCdcWorkload : TestWorkload { co_return; } + Future runSharedTagSafety(Database cx) { + CDCProxyInterface proxy = co_await getCDCProxy(); + const Key firstName = "native-cdc-shared-first"_sr; + const Key secondName = "native-cdc-shared-second"_sr; + const KeyRange keys(KeyRangeRef("shared/"_sr, "shared0"_sr)); + 
const CDCStreamId firstId = co_await registerNativeCdcStream(cx, firstName, keys); + const CDCStreamId secondId = co_await registerNativeCdcStream(cx, secondName, keys); + const auto firstRoute = co_await getPersistedRoute(cx, firstId); + co_await appendPersistedTag(cx, secondId, firstRoute.first); + ASSERT((co_await getLatestPersistedTag(cx, secondId)) == firstRoute.first); + + ASSERT((co_await proxy.registerStream.getReply(CDCRegisterStreamRequest(firstName, keys))).streamId == firstId); + CDCProxyInterface firstOwner = co_await getCDCProxy(firstId); + Transaction write(cx); + write.setOption(FDBTransactionOptions::LOCK_AWARE); + write.set("shared/unread"_sr, "protected-by-minimum"_sr); + co_await write.commit(); + const Version writeVersion = write.getCommittedVersion(); + CDCConsumeReply consumed = co_await timeoutError( + firstOwner.consume.getReply(CDCConsumeRequest(CDCCursor(firstId, invalidVersion))), 30.0); + ASSERT(consumed.lastConsumedVersion >= writeVersion); + co_await firstOwner.ack.getReply(CDCAckRequest(firstId, consumed.lastConsumedVersion)); + + ASSERT((co_await firstOwner.registerStream.getReply(CDCRegisterStreamRequest(secondName, keys))).streamId == + secondId); + CDCProxyInterface secondOwner = co_await getCDCProxy(secondId); + CDCCursor unreadCursor(secondId, invalidVersion); + bool foundUnread = false; + while (unreadCursor.lastConsumedVersion < writeVersion) { + CDCConsumeReply unread = + co_await timeoutError(secondOwner.consume.getReply(CDCConsumeRequest(unreadCursor)), 30.0); + ASSERT(unread.lastConsumedVersion > unreadCursor.lastConsumedVersion); + for (const auto& versioned : unread.mutations) { + for (const auto& mutation : versioned.mutations) { + if (mutation.param1 == "shared/unread"_sr) { + foundUnread = true; + } + } + } + unreadCursor.lastConsumedVersion = unread.lastConsumedVersion; + } + ASSERT(foundUnread); + co_await secondOwner.ack.getReply(CDCAckRequest(secondId, unreadCursor.lastConsumedVersion)); + + co_await 
firstOwner.removeStream.getReply(CDCRemoveStreamRequest(firstName)); + co_await secondOwner.removeStream.getReply(CDCRemoveStreamRequest(secondName)); + co_await waitForCDCProxyAssignmentRemoval(firstId); + co_await waitForCDCProxyAssignmentRemoval(secondId); + co_return; + } + Future run(Database cx) { const Key firstName = "native-cdc-first"_sr; const Key secondName = "native-cdc-second"_sr; diff --git a/tests/fast/NativeCdcSharedTag.toml b/tests/fast/NativeCdcSharedTag.toml new file mode 100644 index 00000000000..47272100f1f --- /dev/null +++ b/tests/fast/NativeCdcSharedTag.toml @@ -0,0 +1,7 @@ +[[test]] +testTitle = 'NativeCdcSharedTag' +useDB = true + + [[test.workload]] + testName = 'NativeCdc' + sharedTagSafety = true From 4d48b1c2f60f8d2b2f2ad2b4e2f2afe9371191d6 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Mon, 25 May 2026 10:31:11 -0700 Subject: [PATCH 12/56] Add client-facing native CDC operations with proxy failover retries --- fdbclient/NativeCdc.cpp | 141 ++++++++++++++++++++++++ fdbclient/include/fdbclient/NativeCdc.h | 9 ++ fdbserver/workloads/NativeCdc.cpp | 53 ++++----- 3 files changed, 172 insertions(+), 31 deletions(-) diff --git a/fdbclient/NativeCdc.cpp b/fdbclient/NativeCdc.cpp index 9b745f98288..536e5f18f0c 100644 --- a/fdbclient/NativeCdc.cpp +++ b/fdbclient/NativeCdc.cpp @@ -24,6 +24,7 @@ #include #include +#include "fdbclient/DatabaseContext.h" #include "fdbclient/Knobs.h" #include "fdbclient/NativeCdc.h" #include "fdbclient/SystemData.h" @@ -114,6 +115,44 @@ Future observeNativeCdcMetadata(Transaction* tr, NativeCdcIdentifierAlloca } } +bool retryNativeCdcProxyRequest(Error const& error) { + return error.code() == error_code_wrong_shard_server || error.code() == error_code_broken_promise || + error.code() == error_code_connection_failed || error.code() == error_code_request_maybe_delivered; +} + +Future getAvailableNativeCdcProxy(Database cx, Optional previousProxy = Optional()) { + while (true) { + for (const auto& proxy : 
cx->clientInfo->get().cdcProxies) { + if (!previousProxy.present() || proxy.id() != previousProxy.get()) { + co_return proxy; + } + } + if (!cx->clientInfo->get().cdcProxies.empty()) { + co_return cx->clientInfo->get().cdcProxies.front(); + } + co_await cx->clientInfo->onChange(); + } +} + +Future getNativeCdcStreamProxy(Database cx, CDCStreamId streamId) { + if (streamId == 0) { + throw client_invalid_operation(); + } + + while (true) { + const ClientDBInfo& clientInfo = cx->clientInfo->get(); + auto assigned = clientInfo.streamToCDCProxyId.find(streamId); + if (assigned != clientInfo.streamToCDCProxyId.end()) { + for (const auto& proxy : clientInfo.cdcProxies) { + if (proxy.id() == assigned->second) { + co_return proxy; + } + } + } + co_await cx->clientInfo->onChange(); + } +} + } // namespace Future registerNativeCdcStream(Database cx, Key name, KeyRange keys, Optional proxyId) { @@ -321,6 +360,108 @@ Future acknowledgeNativeCdcStream(Database cx, CDCStreamId streamId, Ve } } +Future registerNativeCdcStreamClient(Database cx, Key name, KeyRange keys) { + validateNativeCdcStream(name, keys); + Optional previousProxy; + + while (true) { + CDCProxyInterface proxy = co_await getAvailableNativeCdcProxy(cx, previousProxy); + try { + CDCRegisterStreamReply reply = co_await proxy.registerStream.getReply(CDCRegisterStreamRequest(name, keys)); + co_return reply.streamId; + } catch (Error& error) { + if (!retryNativeCdcProxyRequest(error)) { + throw; + } + previousProxy = proxy.id(); + } + co_await delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, cx->taskID); + } +} + +Future> listNativeCdcStreamsClient(Database cx) { + Optional previousProxy; + + while (true) { + CDCProxyInterface proxy = co_await getAvailableNativeCdcProxy(cx, previousProxy); + try { + CDCListStreamsReply reply = co_await proxy.listStreams.getReply(CDCListStreamsRequest()); + std::vector streams; + streams.reserve(reply.streams.size()); + for (const auto& stream : reply.streams) { + streams.push_back( + 
NativeCdcStreamInfo{ Key(stream.name), stream.streamId, KeyRange(stream.keys), stream.minVersion }); + } + co_return streams; + } catch (Error& error) { + if (!retryNativeCdcProxyRequest(error)) { + throw; + } + previousProxy = proxy.id(); + } + co_await delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, cx->taskID); + } +} + +Future removeNativeCdcStreamClient(Database cx, Key name) { + if (name.empty()) { + throw client_invalid_operation(); + } + + while (true) { + std::vector streams = co_await listNativeCdcStreamsClient(cx); + auto stream = std::find_if( + streams.begin(), streams.end(), [&](NativeCdcStreamInfo const& info) { return info.name == name; }); + if (stream == streams.end()) { + co_return; + } + + CDCProxyInterface proxy = co_await getNativeCdcStreamProxy(cx, stream->streamId); + try { + co_await proxy.removeStream.getReply(CDCRemoveStreamRequest(name)); + co_return; + } catch (Error& error) { + if (!retryNativeCdcProxyRequest(error)) { + throw; + } + } + co_await delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, cx->taskID); + } +} + +Future consumeNativeCdcStream(Database cx, CDCCursor cursor) { + while (true) { + CDCProxyInterface proxy = co_await getNativeCdcStreamProxy(cx, cursor.streamId); + try { + co_return co_await proxy.consume.getReply(CDCConsumeRequest(cursor)); + } catch (Error& error) { + if (!retryNativeCdcProxyRequest(error)) { + throw; + } + } + co_await delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, cx->taskID); + } +} + +Future acknowledgeNativeCdcStreamClient(Database cx, CDCStreamId streamId, Version consumedThrough) { + if (streamId == 0 || consumedThrough < 0 || consumedThrough == std::numeric_limits::max()) { + throw client_invalid_operation(); + } + + while (true) { + CDCProxyInterface proxy = co_await getNativeCdcStreamProxy(cx, streamId); + try { + co_await proxy.ack.getReply(CDCAckRequest(streamId, consumedThrough)); + co_return; + } catch (Error& error) { + if (!retryNativeCdcProxyRequest(error)) { + throw; + } + } + co_await 
delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, cx->taskID); + } +} + TEST_CASE("noSim/NativeCDC/LifecycleAllocation") { NativeCdcIdentifierAllocator allocator; auto [initialId, initialTag] = allocator.allocate(); diff --git a/fdbclient/include/fdbclient/NativeCdc.h b/fdbclient/include/fdbclient/NativeCdc.h index f2a0556c8d4..5163d1c56cf 100644 --- a/fdbclient/include/fdbclient/NativeCdc.h +++ b/fdbclient/include/fdbclient/NativeCdc.h @@ -24,6 +24,7 @@ #include +#include "fdbclient/CDCProxyInterface.h" #include "fdbclient/NativeAPI.actor.h" struct NativeCdcStreamInfo { @@ -47,4 +48,12 @@ Future reassignNativeCdcStreams(Database cx, UID oldProxyId, UID newProxyI // Removed streams remain acknowledgeable while retained CDC log data is drained. Future acknowledgeNativeCdcStream(Database cx, CDCStreamId streamId, Version consumedThrough); +// Client-facing CDC operations. These select the appropriate CDC proxy from +// ClientDBInfo and retry requests when stream ownership changes. +Future registerNativeCdcStreamClient(Database cx, Key name, KeyRange keys); +Future removeNativeCdcStreamClient(Database cx, Key name); +Future> listNativeCdcStreamsClient(Database cx); +Future consumeNativeCdcStream(Database cx, CDCCursor cursor); +Future acknowledgeNativeCdcStreamClient(Database cx, CDCStreamId streamId, Version consumedThrough); + #endif // FDBCLIENT_NATIVECDC_H diff --git a/fdbserver/workloads/NativeCdc.cpp b/fdbserver/workloads/NativeCdc.cpp index 7c4f8ee15f0..47ea689d4e1 100644 --- a/fdbserver/workloads/NativeCdc.cpp +++ b/fdbserver/workloads/NativeCdc.cpp @@ -293,19 +293,16 @@ struct NativeCdcWorkload : TestWorkload { co_await removeNativeCdcStream(cx, secondName); - CDCProxyInterface proxy = co_await getCDCProxy(); const Key liveName = "native-cdc-live"_sr; const KeyRange liveRange(KeyRangeRef("live/"_sr, "live0"_sr)); - CDCRegisterStreamReply liveRegistration = - co_await proxy.registerStream.getReply(CDCRegisterStreamRequest(liveName, liveRange)); - CDCProxyInterface 
owner = co_await getCDCProxy(liveRegistration.streamId); - ASSERT(owner.id() == proxy.id()); + const CDCStreamId liveStreamId = co_await registerNativeCdcStreamClient(cx, liveName, liveRange); + CDCProxyInterface owner = co_await getCDCProxy(liveStreamId); - CDCListStreamsReply listed = co_await proxy.listStreams.getReply(CDCListStreamsRequest()); - ASSERT(listed.streams.size() == 1); - ASSERT(listed.streams[0].name == liveName); - ASSERT(listed.streams[0].streamId == liveRegistration.streamId); - ASSERT(listed.streams[0].keys == liveRange); + std::vector listed = co_await listNativeCdcStreamsClient(cx); + ASSERT(listed.size() == 1); + ASSERT(listed[0].name == liveName); + ASSERT(listed[0].streamId == liveStreamId); + ASSERT(listed[0].keys == liveRange); Transaction write(cx); write.setOption(FDBTransactionOptions::LOCK_AWARE); @@ -320,8 +317,7 @@ struct NativeCdcWorkload : TestWorkload { } bool wrongOwnerRejected = false; try { - co_await nonOwner.consume.getReply( - CDCConsumeRequest(CDCCursor(liveRegistration.streamId, invalidVersion))); + co_await nonOwner.consume.getReply(CDCConsumeRequest(CDCCursor(liveStreamId, invalidVersion))); } catch (Error& e) { wrongOwnerRejected = e.code() == error_code_wrong_shard_server; } @@ -336,8 +332,8 @@ struct NativeCdcWorkload : TestWorkload { break; } - CDCConsumeReply consumed = co_await timeoutError( - owner.consume.getReply(CDCConsumeRequest(CDCCursor(liveRegistration.streamId, invalidVersion))), 30.0); + CDCConsumeReply consumed = + co_await timeoutError(consumeNativeCdcStream(cx, CDCCursor(liveStreamId, invalidVersion)), 30.0); ASSERT(consumed.lastConsumedVersion >= writeVersion); bool foundInRangeWrite = false; bool foundOutOfRangeWrite = false; @@ -356,8 +352,7 @@ struct NativeCdcWorkload : TestWorkload { const uint64_t recoveryCount = dbInfo->get().recoveryCount; co_await owner.haltForTesting.getReply(HaltCDCProxyRequest()); - CDCProxyInterface replacement = - co_await 
timeoutError(getReplacementCDCProxy(liveRegistration.streamId, owner.id()), 30.0); + CDCProxyInterface replacement = co_await timeoutError(getReplacementCDCProxy(liveStreamId, owner.id()), 30.0); ASSERT(replacement.id() != owner.id()); ASSERT(dbInfo->get().recoveryCount == recoveryCount); @@ -366,10 +361,8 @@ struct NativeCdcWorkload : TestWorkload { afterFailureWrite.set("live/after-failure"_sr, "captured-after-failure"_sr); co_await afterFailureWrite.commit(); const Version afterFailureVersion = afterFailureWrite.getCommittedVersion(); - CDCConsumeReply afterFailure = - co_await timeoutError(replacement.consume.getReply(CDCConsumeRequest( - CDCCursor(liveRegistration.streamId, consumed.lastConsumedVersion))), - 30.0); + CDCConsumeReply afterFailure = co_await timeoutError( + consumeNativeCdcStream(cx, CDCCursor(liveStreamId, consumed.lastConsumedVersion)), 30.0); ASSERT(afterFailure.lastConsumedVersion >= afterFailureVersion); bool foundAfterFailureWrite = false; for (const auto& versioned : afterFailure.mutations) { @@ -382,15 +375,15 @@ struct NativeCdcWorkload : TestWorkload { ASSERT(foundAfterFailureWrite); const Version cursorBeforeRecovery = afterFailure.lastConsumedVersion; - co_await replacement.ack.getReply(CDCAckRequest(liveRegistration.streamId, cursorBeforeRecovery)); - ASSERT(co_await getPersistedMinVersion(cx, liveRegistration.streamId) == cursorBeforeRecovery + 1); + co_await acknowledgeNativeCdcStreamClient(cx, liveStreamId, cursorBeforeRecovery); + ASSERT(co_await getPersistedMinVersion(cx, liveStreamId) == cursorBeforeRecovery + 1); const int32_t recoveredResolverCount = (co_await getDatabaseConfiguration(cx)).getDesiredResolvers() + 1; const UID ownerBeforeRecovery = replacement.id(); const uint64_t recoveryBeforeChange = dbInfo->get().recoveryCount; co_await changeResolverCount(cx, recoveredResolverCount); co_await timeoutError(waitForRecoveryAfter(recoveryBeforeChange, RecoveryState::ACCEPTING_COMMITS), 60.0); - CDCProxyInterface 
recoveredOwner = co_await getCDCProxy(liveRegistration.streamId); + CDCProxyInterface recoveredOwner = co_await getCDCProxy(liveStreamId); ASSERT(recoveredOwner.id() == ownerBeforeRecovery); Transaction afterRecoveryWrite(cx); @@ -403,9 +396,7 @@ struct NativeCdcWorkload : TestWorkload { const double afterRecoveryConsumeDeadline = now() + 30.0; while (afterRecoveryCursor < afterRecoveryVersion) { CDCConsumeReply afterRecovery = - co_await timeoutError(recoveredOwner.consume.getReply( - CDCConsumeRequest(CDCCursor(liveRegistration.streamId, afterRecoveryCursor))), - 30.0); + co_await timeoutError(consumeNativeCdcStream(cx, CDCCursor(liveStreamId, afterRecoveryCursor)), 30.0); if (afterRecovery.lastConsumedVersion == afterRecoveryCursor) { ASSERT(now() < afterRecoveryConsumeDeadline); co_await delay(0.1); @@ -423,13 +414,13 @@ struct NativeCdcWorkload : TestWorkload { } ASSERT(foundAfterRecoveryWrite); - co_await recoveredOwner.ack.getReply(CDCAckRequest(liveRegistration.streamId, afterRecoveryCursor)); - ASSERT(co_await getPersistedMinVersion(cx, liveRegistration.streamId) == afterRecoveryCursor + 1); + co_await acknowledgeNativeCdcStreamClient(cx, liveStreamId, afterRecoveryCursor); + ASSERT(co_await getPersistedMinVersion(cx, liveStreamId) == afterRecoveryCursor + 1); co_await timeoutError(waitForRecoveryAfter(recoveryBeforeChange, RecoveryState::FULLY_RECOVERED), 60.0); - recoveredOwner = co_await getCDCProxy(liveRegistration.streamId); + recoveredOwner = co_await getCDCProxy(liveStreamId); ASSERT(recoveredOwner.id() == ownerBeforeRecovery); - co_await recoveredOwner.removeStream.getReply(CDCRemoveStreamRequest(liveName)); - co_await waitForCDCProxyAssignmentRemoval(liveRegistration.streamId); + co_await removeNativeCdcStreamClient(cx, liveName); + co_await waitForCDCProxyAssignmentRemoval(liveStreamId); } }; From e61abe06c1ff9ab9f617df1d1f0cb0681b09d386 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Mon, 25 May 2026 11:58:39 -0700 Subject: [PATCH 
13/56] Finalize CDC retention state when streams are removed --- fdbclient/NativeCdc.cpp | 4 +-- fdbclient/include/fdbclient/NativeCdc.h | 1 - fdbserver/logsystem/ApplyMetadataMutation.cpp | 19 ++++++++++++ fdbserver/workloads/NativeCdc.cpp | 31 ++++++++++++++++--- 4 files changed, 47 insertions(+), 8 deletions(-) diff --git a/fdbclient/NativeCdc.cpp b/fdbclient/NativeCdc.cpp index 536e5f18f0c..89a01541d27 100644 --- a/fdbclient/NativeCdc.cpp +++ b/fdbclient/NativeCdc.cpp @@ -228,12 +228,12 @@ Future removeNativeCdcStream(Database cx, Key name, Optional proxyId) } tr.clear(nameKey); tr.clear(cdcStreamKeyFor(streamId)); + tr.clear(cdcTagHistoryRangeFor(streamId)); + tr.clear(cdcMinVersionKeyFor(streamId)); tr.clear(cdcProxyRangeFor(streamId)); if (assignedProxy.present()) { signalNativeCdcProxyAssignmentChange(&tr); } - // Retain tag history and minVersion until the pop/cleanup phase can - // safely release all durable mutations for this retired stream. co_await tr.commit(); co_return; } catch (Error& e) { diff --git a/fdbclient/include/fdbclient/NativeCdc.h b/fdbclient/include/fdbclient/NativeCdc.h index 5163d1c56cf..703957695b6 100644 --- a/fdbclient/include/fdbclient/NativeCdc.h +++ b/fdbclient/include/fdbclient/NativeCdc.h @@ -45,7 +45,6 @@ Future> listNativeCdcStreams(Database cx); // Atomically moves any streams assigned to a failed proxy to its replacement. Future reassignNativeCdcStreams(Database cx, UID oldProxyId, UID newProxyId); // Persists the exclusive unpopped watermark after consuming through a version. -// Removed streams remain acknowledgeable while retained CDC log data is drained. Future acknowledgeNativeCdcStream(Database cx, CDCStreamId streamId, Version consumedThrough); // Client-facing CDC operations. 
These select the appropriate CDC proxy from diff --git a/fdbserver/logsystem/ApplyMetadataMutation.cpp b/fdbserver/logsystem/ApplyMetadataMutation.cpp index 1d4fe6eb58e..85a265679ee 100644 --- a/fdbserver/logsystem/ApplyMetadataMutation.cpp +++ b/fdbserver/logsystem/ApplyMetadataMutation.cpp @@ -1084,6 +1084,25 @@ class ApplyMetadataMutationsImpl { !cdcProxyKeys.intersects(range)) { return; } + if (logSystemConsumer && popVersion && cdcStreamKeys.intersects(range)) { + auto streamsCleared = txnStateStore->readRange(range & cdcStreamKeys).get(); + for (const auto& stream : streamsCleared) { + const CDCStreamId streamId = decodeCDCStreamKey(stream.key); + auto tagHistory = txnStateStore->readRange(cdcTagHistoryRangeFor(streamId)).get(); + for (const auto& entry : tagHistory) { + const Tag tag = std::get<2>(decodeCDCTagHistoryKey(entry.key)); + TraceEvent("CDCStreamTagRemove") + .detail("PopVersion", popVersion) + .detail("Tag", tag.toString()) + .detail("StreamId", streamId); + if (!forResolver) { + logSystemConsumer->pop(popVersion, tag); + (*tag_popped)[tag] = popVersion; + } + ASSERT_WE_THINK(forResolver ^ (tag_popped != nullptr)); + } + } + } if (!initialCommit) { for (const KeyRangeRef cdcRange : { cdcStreamNameKeys, cdcStreamKeys, cdcTagHistoryKeys, cdcMinVersionKeys, cdcProxyKeys }) { diff --git a/fdbserver/workloads/NativeCdc.cpp b/fdbserver/workloads/NativeCdc.cpp index 47ea689d4e1..39761239af8 100644 --- a/fdbserver/workloads/NativeCdc.cpp +++ b/fdbserver/workloads/NativeCdc.cpp @@ -88,6 +88,22 @@ struct NativeCdcWorkload : TestWorkload { } } + Future hasPersistedRetention(Database cx, CDCStreamId streamId) { + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + Optional minVersion = co_await tr.get(cdcMinVersionKeyFor(streamId)); + RangeResult history = co_await tr.getRange(cdcTagHistoryRangeFor(streamId), 1); + co_return minVersion.present() || !history.empty(); + } catch (Error& e) { + err 
= e; + } + co_await tr.onError(err); + } + } + Future appendPersistedTag(Database cx, CDCStreamId streamId, Tag tag) { Transaction tr(cx); while (true) { @@ -281,15 +297,20 @@ struct NativeCdcWorkload : TestWorkload { co_await removeNativeCdcStream(cx, firstName); ASSERT((co_await listNativeCdcStreams(cx)).empty()); - const Version retiredConsumedThrough = firstConsumedThrough + 5; - const Version retiredAckMinVersion = retiredConsumedThrough + 1; - ASSERT(co_await acknowledgeNativeCdcStream(cx, firstId, retiredConsumedThrough) == retiredAckMinVersion); - ASSERT(co_await getPersistedMinVersion(cx, firstId) == retiredAckMinVersion); + ASSERT(!(co_await hasPersistedRetention(cx, firstId))); + + bool retiredAcknowledgeRejected = false; + try { + co_await acknowledgeNativeCdcStream(cx, firstId, firstConsumedThrough + 5); + } catch (Error& e) { + retiredAcknowledgeRejected = e.code() == error_code_client_invalid_operation; + } + ASSERT(retiredAcknowledgeRejected); const CDCStreamId secondId = co_await registerNativeCdcStream(cx, secondName, secondRange); const auto secondRoute = co_await getPersistedRoute(cx, secondId); ASSERT(secondId > firstId); - ASSERT(secondRoute.first != firstRoute.first); + ASSERT(secondRoute.first == firstRoute.first); co_await removeNativeCdcStream(cx, secondName); From 364ec1e498eaca60ee9c97261f2c30281051dd3f Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Mon, 25 May 2026 12:07:08 -0700 Subject: [PATCH 14/56] Bound buffered mutation memory in CDC proxies --- fdbserver/cdcproxy/CDCProxy.cpp | 23 +++++++++++++++++++ fdbserver/core/ServerKnobs.cpp | 1 + fdbserver/core/include/fdbserver/core/Knobs.h | 1 + 3 files changed, 25 insertions(+) diff --git a/fdbserver/cdcproxy/CDCProxy.cpp b/fdbserver/cdcproxy/CDCProxy.cpp index fe32eeeed7e..62d6117ffbf 100644 --- a/fdbserver/cdcproxy/CDCProxy.cpp +++ b/fdbserver/cdcproxy/CDCProxy.cpp @@ -30,6 +30,7 @@ #include "fdbclient/NativeCdc.h" #include "fdbclient/SystemData.h" #include 
"fdbserver/cdcproxy/CDCProxy.h" +#include "fdbserver/core/Knobs.h" #include "fdbserver/core/LogProtocolMessage.h" #include "fdbserver/core/OTELSpanContextMessage.h" #include "fdbserver/core/ServerDBInfo.h" @@ -57,9 +58,11 @@ struct CDCBufferedStream : ReferenceCounted { bool initialized = false; Version minVersion = invalidVersion; Version bufferedThrough = invalidVersion; + int64_t bufferedBytes = 0; std::deque> mutations; AsyncTrigger changed; AsyncTrigger refresh; + AsyncTrigger spaceAvailable; AsyncTrigger stopped; explicit CDCBufferedStream(CDCStreamId streamId) : streamId(streamId) {} @@ -187,8 +190,10 @@ void bufferMessages(Reference stream, if (stream->mutations.empty() || stream->mutations.back().version != messageVersion) { stream->mutations.emplace_back(); stream->mutations.back().version = messageVersion; + stream->bufferedBytes += sizeof(VersionedMutationsRef); } stream->mutations.back().mutations.push_back_deep(stream->mutations.back().arena(), clipped.get()); + stream->bufferedBytes += clipped.get().expectedSize() + sizeof(MutationRef); } } stream->bufferedThrough = std::max(stream->bufferedThrough, messageVersion); @@ -215,6 +220,19 @@ Future bufferStream(CDCProxyData* self, Reference strea Reference cursor = self->logSystem->get()->peekSingle(self->id, begin, metadata.currentTag, metadata.tagHistory); while (stream->active) { + if (stream->bufferedBytes >= SERVER_KNOBS->CDC_PROXY_BUFFER_BYTES) { + auto waitForSpace = co_await race(stream->spaceAvailable.onTrigger(), + self->logSystem->onChange(), + stream->stopped.onTrigger(), + stream->refresh.onTrigger()); + if (waitForSpace.index() == 0) { + continue; + } + if (waitForSpace.index() == 2) { + co_return; + } + break; + } auto result = co_await race(cursor->getMore(TaskPriority::TLogPeekReply), self->logSystem->onChange(), stream->stopped.onTrigger(), @@ -416,8 +434,13 @@ Future acknowledge(CDCProxyData* self, CDCAckRequest request) { if (found != self->streams.end()) { found->second->minVersion = 
std::max(found->second->minVersion, minVersion); while (!found->second->mutations.empty() && found->second->mutations.front().version < minVersion) { + found->second->bufferedBytes -= sizeof(VersionedMutationsRef) + + found->second->mutations.front().mutations.expectedSize() + + found->second->mutations.front().mutations.size() * sizeof(MutationRef); found->second->mutations.pop_front(); } + ASSERT(found->second->bufferedBytes >= 0); + found->second->spaceAvailable.trigger(); } co_await popAcknowledgedData(self); request.reply.send(Void()); diff --git a/fdbserver/core/ServerKnobs.cpp b/fdbserver/core/ServerKnobs.cpp index 892311c9984..66022903df6 100644 --- a/fdbserver/core/ServerKnobs.cpp +++ b/fdbserver/core/ServerKnobs.cpp @@ -175,6 +175,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( DESIRED_UPDATE_BYTES, 2*DESIRED_TOTAL_BYTES ); init( UPDATE_DELAY, 0.001 ); init( MAXIMUM_PEEK_BYTES, 10e6 ); + init( CDC_PROXY_BUFFER_BYTES, 10e6 ); if( randomize && BUGGIFY ) CDC_PROXY_BUFFER_BYTES = 10000; init( APPLY_MUTATION_BYTES, 1e6 ); init( BUGGIFY_RECOVER_MEMORY_LIMIT, 1e6 ); init( BUGGIFY_WORKER_REMOVED_MAX_LAG, 30 ); diff --git a/fdbserver/core/include/fdbserver/core/Knobs.h b/fdbserver/core/include/fdbserver/core/Knobs.h index a86a669bec1..a548ac62c29 100644 --- a/fdbserver/core/include/fdbserver/core/Knobs.h +++ b/fdbserver/core/include/fdbserver/core/Knobs.h @@ -72,6 +72,7 @@ class SWIFT_CXX_IMMORTAL_SINGLETON_TYPE ServerKnobs : public KnobsImpl Date: Mon, 25 May 2026 12:08:22 -0700 Subject: [PATCH 15/56] Avoid waiting forever during concurrent CDC stream removal --- fdbclient/NativeCdc.cpp | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/fdbclient/NativeCdc.cpp b/fdbclient/NativeCdc.cpp index 89a01541d27..ed4fd4b91a4 100644 --- a/fdbclient/NativeCdc.cpp +++ b/fdbclient/NativeCdc.cpp @@ -153,6 +153,40 @@ Future getNativeCdcStreamProxy(Database cx, CDCStreamId strea 
} } +Future nativeCdcStreamStillExists(Database cx, Key name, CDCStreamId streamId) { + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE); + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + Optional currentId = co_await tr.get(cdcStreamNameKeyFor(name)); + co_return currentId.present() && decodeCDCStreamNameValue(currentId.get()) == streamId; + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } +} + +Future> getNativeCdcStreamProxyForRemoval(Database cx, Key name, CDCStreamId streamId) { + while (true) { + const ClientDBInfo& clientInfo = cx->clientInfo->get(); + auto assigned = clientInfo.streamToCDCProxyId.find(streamId); + if (assigned != clientInfo.streamToCDCProxyId.end()) { + for (const auto& proxy : clientInfo.cdcProxies) { + if (proxy.id() == assigned->second) { + co_return proxy; + } + } + } + if (!(co_await nativeCdcStreamStillExists(cx, name, streamId))) { + co_return Optional(); + } + co_await delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, cx->taskID); + } +} + } // namespace Future registerNativeCdcStream(Database cx, Key name, KeyRange keys, Optional proxyId) { @@ -416,9 +450,12 @@ Future removeNativeCdcStreamClient(Database cx, Key name) { co_return; } - CDCProxyInterface proxy = co_await getNativeCdcStreamProxy(cx, stream->streamId); + Optional proxy = co_await getNativeCdcStreamProxyForRemoval(cx, name, stream->streamId); + if (!proxy.present()) { + co_return; + } try { - co_await proxy.removeStream.getReply(CDCRemoveStreamRequest(name)); + co_await proxy.get().removeStream.getReply(CDCRemoveStreamRequest(name)); co_return; } catch (Error& error) { if (!retryNativeCdcProxyRequest(error)) { From 1808150623975dee526250fd2b2b3e3085ea8614 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Mon, 25 May 2026 12:22:15 -0700 Subject: [PATCH 16/56] Bound CDC tags and persist stream ID allocation --- fdbclient/ClientKnobs.cpp | 1 + fdbclient/NativeCdc.cpp | 47 
+++++++++++++++---- fdbclient/SystemData.cpp | 10 ++++ fdbclient/include/fdbclient/Knobs.h | 1 + fdbclient/include/fdbclient/SystemData.h | 5 ++ fdbserver/logsystem/ApplyMetadataMutation.cpp | 12 +++-- 6 files changed, 64 insertions(+), 12 deletions(-) diff --git a/fdbclient/ClientKnobs.cpp b/fdbclient/ClientKnobs.cpp index 8e82ec2dc43..e2e4c2ef4fd 100644 --- a/fdbclient/ClientKnobs.cpp +++ b/fdbclient/ClientKnobs.cpp @@ -193,6 +193,7 @@ void ClientKnobs::initialize(Randomize randomize, IsSimulated isSimulated) { init( CHANGE_FEED_CACHE_FLUSH_BYTES, 10e6 ); if( randomize && BUGGIFY ) CHANGE_FEED_CACHE_FLUSH_BYTES = deterministicRandom()->randomInt64(1, 1e6); init( CHANGE_FEED_CACHE_EXPIRE_TIME, 60.0 ); if( randomize && BUGGIFY ) CHANGE_FEED_CACHE_EXPIRE_TIME = 1.0; init( CHANGE_FEED_CACHE_LIMIT_BYTES, 500000 ); if( randomize && BUGGIFY ) CHANGE_FEED_CACHE_LIMIT_BYTES = 50000; + init( NATIVE_CDC_TAG_COUNT, 256 ); if( randomize && BUGGIFY ) NATIVE_CDC_TAG_COUNT = 2; init( MAX_BATCH_SIZE, 1000 ); if( randomize && BUGGIFY ) MAX_BATCH_SIZE = 1; init( GRV_BATCH_TIMEOUT, 0.005 ); if( randomize && BUGGIFY ) GRV_BATCH_TIMEOUT = 0.1; diff --git a/fdbclient/NativeCdc.cpp b/fdbclient/NativeCdc.cpp index ed4fd4b91a4..3b0f4e28c15 100644 --- a/fdbclient/NativeCdc.cpp +++ b/fdbclient/NativeCdc.cpp @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -36,7 +37,7 @@ namespace { struct NativeCdcIdentifierAllocator { bool sawStream = false; CDCStreamId maxStreamId = 0; - std::set usedTagIds; + std::map tagStreamCounts; void observeStreamId(CDCStreamId streamId) { sawStream = true; @@ -45,7 +46,7 @@ struct NativeCdcIdentifierAllocator { void observeTag(Tag tag) { ASSERT_WE_THINK(tag.locality == tagLocalityCDC); - usedTagIds.insert(tag.id); + ++tagStreamCounts[tag.id]; } std::pair allocate() const { @@ -54,12 +55,19 @@ struct NativeCdcIdentifierAllocator { } const CDCStreamId streamId = sawStream ? 
maxStreamId + 1 : 1; - for (uint32_t tagId = 0; tagId <= std::numeric_limits::max(); ++tagId) { - if (!usedTagIds.contains(static_cast(tagId))) { - return { streamId, Tag(tagLocalityCDC, static_cast(tagId)) }; + ASSERT_WE_THINK(CLIENT_KNOBS->NATIVE_CDC_TAG_COUNT > 0); + ASSERT_WE_THINK(CLIENT_KNOBS->NATIVE_CDC_TAG_COUNT <= std::numeric_limits::max() + 1u); + uint32_t leastStreams = std::numeric_limits::max(); + uint16_t selectedTagId = 0; + for (uint32_t tagId = 0; tagId < static_cast(CLIENT_KNOBS->NATIVE_CDC_TAG_COUNT); ++tagId) { + auto count = tagStreamCounts.find(static_cast(tagId)); + const uint32_t streamCount = count == tagStreamCounts.end() ? 0 : count->second; + if (streamCount < leastStreams) { + leastStreams = streamCount; + selectedTagId = static_cast(tagId); } } - throw operation_failed(); + return { streamId, Tag(tagLocalityCDC, selectedTagId) }; } }; @@ -87,11 +95,19 @@ void signalNativeCdcProxyAssignmentChange(Transaction* tr) { } Future observeNativeCdcMetadata(Transaction* tr, NativeCdcIdentifierAllocator* allocator) { + Optional maxStreamId = co_await tr->get(cdcMaxStreamIdKey); + if (maxStreamId.present()) { + allocator->observeStreamId(decodeCDCMaxStreamIdValue(maxStreamId.get())); + } + + std::set activeStreamIds; Key begin = cdcStreamKeys.begin; while (begin < cdcStreamKeys.end) { RangeResult streams = co_await tr->getRange(KeyRangeRef(begin, cdcStreamKeys.end), CLIENT_KNOBS->TOO_MANY); for (const auto& kv : streams) { - allocator->observeStreamId(decodeCDCStreamKey(kv.key)); + const CDCStreamId streamId = decodeCDCStreamKey(kv.key); + activeStreamIds.insert(streamId); + allocator->observeStreamId(streamId); } if (!streams.more) { break; @@ -99,6 +115,7 @@ Future observeNativeCdcMetadata(Transaction* tr, NativeCdcIdentifierAlloca begin = keyAfter(streams.back().key); } + std::map currentTags; begin = cdcTagHistoryKeys.begin; while (begin < cdcTagHistoryKeys.end) { RangeResult histories = @@ -106,13 +123,18 @@ Future 
observeNativeCdcMetadata(Transaction* tr, NativeCdcIdentifierAlloca for (const auto& kv : histories) { const auto history = decodeCDCTagHistoryKey(kv.key); allocator->observeStreamId(std::get<0>(history)); - allocator->observeTag(std::get<2>(history)); + if (activeStreamIds.contains(std::get<0>(history))) { + currentTags[std::get<0>(history)] = std::get<2>(history); + } } if (!histories.more) { break; } begin = keyAfter(histories.back().key); } + for (const auto& tagAssignment : currentTags) { + allocator->observeTag(tagAssignment.second); + } } bool retryNativeCdcProxyRequest(Error const& error) { @@ -221,6 +243,7 @@ Future registerNativeCdcStream(Database cx, Key name, KeyRange keys const Version registrationVersion = co_await tr.getReadVersion(); tr.set(nameKey, cdcStreamNameValue(streamId)); + tr.set(cdcMaxStreamIdKey, cdcMaxStreamIdValue(streamId)); tr.set(cdcStreamKeyFor(streamId), cdcStreamKeysValue(keys)); tr.set(cdcTagHistoryKeyFor(streamId, registrationVersion, tag), Value()); tr.set(cdcMinVersionKeyFor(streamId), cdcMinVersionValue(registrationVersion)); @@ -512,5 +535,13 @@ TEST_CASE("noSim/NativeCDC/LifecycleAllocation") { ASSERT(nextId == 10); ASSERT(nextTag == Tag(tagLocalityCDC, 1)); + NativeCdcIdentifierAllocator fullPoolAllocator; + for (uint32_t tagId = 0; tagId < static_cast(CLIENT_KNOBS->NATIVE_CDC_TAG_COUNT); ++tagId) { + fullPoolAllocator.observeTag(Tag(tagLocalityCDC, static_cast(tagId))); + } + auto [sharedId, sharedTag] = fullPoolAllocator.allocate(); + ASSERT(sharedId == 1); + ASSERT(sharedTag == Tag(tagLocalityCDC, 0)); + return Void(); } diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index ef7ad537d38..a6fd38b2e37 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -772,6 +772,7 @@ int8_t decodeTagLocalityListValue(ValueRef const& value) { } const KeyRangeRef cdcStreamNameKeys("\xff/cdc/name/"_sr, "\xff/cdc/name0"_sr); +const KeyRef cdcMaxStreamIdKey = "\xff/cdc/maxStreamId"_sr; const KeyRangeRef 
cdcStreamKeys("\xff/cdc/keys/"_sr, "\xff/cdc/keys0"_sr); const KeyRangeRef cdcTagHistoryKeys("\xff/cdc/tagHistory/"_sr, "\xff/cdc/tagHistory0"_sr); const KeyRangeRef cdcMinVersionKeys("\xff/cdc/minVersion/"_sr, "\xff/cdc/minVersion0"_sr); @@ -800,6 +801,14 @@ CDCStreamId decodeCDCStreamNameValue(ValueRef const& value) { return streamId; } +Value cdcMaxStreamIdValue(CDCStreamId streamId) { + return cdcStreamNameValue(streamId); +} + +CDCStreamId decodeCDCMaxStreamIdValue(ValueRef const& value) { + return decodeCDCStreamNameValue(value); +} + Key cdcStreamKeyFor(CDCStreamId streamId) { BinaryWriter wr(Unversioned()); wr.serializeBytes(cdcStreamKeys.begin); @@ -1802,6 +1811,7 @@ TEST_CASE("noSim/SystemData/NativeCDC") { ASSERT(decodeCDCStreamNameKey(cdcStreamNameKeyFor(name)) == name); ASSERT(decodeCDCStreamNameValue(cdcStreamNameValue(streamId)) == streamId); + ASSERT(decodeCDCMaxStreamIdValue(cdcMaxStreamIdValue(streamId)) == streamId); ASSERT(decodeCDCStreamKey(cdcStreamKeyFor(streamId)) == streamId); ASSERT(decodeCDCStreamKeysValue(cdcStreamKeysValue(keys)) == keys); ASSERT(decodeCDCMinVersionKey(cdcMinVersionKeyFor(streamId)) == streamId); diff --git a/fdbclient/include/fdbclient/Knobs.h b/fdbclient/include/fdbclient/Knobs.h index 493506f69d5..db726f985b5 100644 --- a/fdbclient/include/fdbclient/Knobs.h +++ b/fdbclient/include/fdbclient/Knobs.h @@ -90,6 +90,7 @@ class SWIFT_CXX_IMMORTAL_SINGLETON_TYPE ClientKnobs : public KnobsImplclear(cdcRange & range); } } + if (range.contains(cdcMaxStreamIdKey)) { + txnStateStore->clear(singleKeyRange(cdcMaxStreamIdKey)); + } } if (toCommit && SERVER_KNOBS->ENABLE_VERSION_VECTOR_TLOG_UNICAST && (cdcStreamKeys.intersects(range) || cdcTagHistoryKeys.intersects(range))) { @@ -1350,7 +1354,7 @@ bool containsMetadataMutation(const VectorRef& mutations) { (m.param1.startsWith(keyServersPrefix)) || cdcStreamNameKeys.contains(m.param1) || cdcStreamKeys.contains(m.param1) || cdcTagHistoryKeys.contains(m.param1) || 
cdcMinVersionKeys.contains(m.param1) || cdcProxyKeys.contains(m.param1) || - m.param1 == cdcProxyAssignmentChangeKey) { + m.param1 == cdcMaxStreamIdKey || m.param1 == cdcProxyAssignmentChangeKey) { return true; } } else if (m.type == MutationRef::ClearRange && isSystemKey(m.param2)) { @@ -1366,7 +1370,7 @@ bool containsMetadataMutation(const VectorRef& mutations) { (range.contains(writeRecoveryKey)) || (range.intersects(testOnlyTxnStateStorePrefixRange)) || cdcStreamNameKeys.intersects(range) || cdcStreamKeys.intersects(range) || cdcTagHistoryKeys.intersects(range) || cdcMinVersionKeys.intersects(range) || - cdcProxyKeys.intersects(range)) { + cdcProxyKeys.intersects(range) || range.contains(cdcMaxStreamIdKey)) { return true; } } From 7acc70791baae82fbe2ef7129ea9534cef12c6bc Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Mon, 25 May 2026 12:23:44 -0700 Subject: [PATCH 17/56] Fix CDC proxy buffered byte release accounting --- fdbserver/cdcproxy/CDCProxy.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fdbserver/cdcproxy/CDCProxy.cpp b/fdbserver/cdcproxy/CDCProxy.cpp index 62d6117ffbf..d362acff6c6 100644 --- a/fdbserver/cdcproxy/CDCProxy.cpp +++ b/fdbserver/cdcproxy/CDCProxy.cpp @@ -435,8 +435,7 @@ Future acknowledge(CDCProxyData* self, CDCAckRequest request) { found->second->minVersion = std::max(found->second->minVersion, minVersion); while (!found->second->mutations.empty() && found->second->mutations.front().version < minVersion) { found->second->bufferedBytes -= sizeof(VersionedMutationsRef) + - found->second->mutations.front().mutations.expectedSize() + - found->second->mutations.front().mutations.size() * sizeof(MutationRef); + found->second->mutations.front().mutations.expectedSize(); found->second->mutations.pop_front(); } ASSERT(found->second->bufferedBytes >= 0); From dfb79bae84a7f48d311e8ed50b57523d3635901f Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Mon, 25 May 2026 12:26:51 -0700 Subject: [PATCH 18/56] 
Resume CDC buffers from the retained minimum version --- fdbserver/cdcproxy/CDCProxy.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/cdcproxy/CDCProxy.cpp b/fdbserver/cdcproxy/CDCProxy.cpp index d362acff6c6..d5511a31ec9 100644 --- a/fdbserver/cdcproxy/CDCProxy.cpp +++ b/fdbserver/cdcproxy/CDCProxy.cpp @@ -216,7 +216,7 @@ Future bufferStream(CDCProxyData* self, Reference strea } metadata = co_await readCDCStreamState(self->cx, stream->streamId, self->id, true); - const Version begin = stream->bufferedThrough + 1; + const Version begin = std::max(stream->bufferedThrough + 1, metadata.minVersion); Reference cursor = self->logSystem->get()->peekSingle(self->id, begin, metadata.currentTag, metadata.tagHistory); while (stream->active) { From f98c7f563defb24fc22451fda6e908925f08495c Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Mon, 25 May 2026 12:29:22 -0700 Subject: [PATCH 19/56] Refresh CDC buffer cursors after acknowledgements --- fdbserver/cdcproxy/CDCProxy.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fdbserver/cdcproxy/CDCProxy.cpp b/fdbserver/cdcproxy/CDCProxy.cpp index d5511a31ec9..275d47742a5 100644 --- a/fdbserver/cdcproxy/CDCProxy.cpp +++ b/fdbserver/cdcproxy/CDCProxy.cpp @@ -249,6 +249,9 @@ Future bufferStream(CDCProxyData* self, Reference strea cursor->setProtocolVersion(g_network->protocolVersion()); if (cursor->popped() > begin) { + if (cursor->popped() <= stream->minVersion) { + break; + } throw transaction_too_old(); } @@ -439,6 +442,7 @@ Future acknowledge(CDCProxyData* self, CDCAckRequest request) { found->second->mutations.pop_front(); } ASSERT(found->second->bufferedBytes >= 0); + found->second->refresh.trigger(); found->second->spaceAvailable.trigger(); } co_await popAcknowledgedData(self); From 7ada95e40b659ea0b68ad4a85c26c6b9fc85bf8e Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Mon, 25 May 2026 12:50:57 -0700 Subject: [PATCH 20/56] Anchor CDC retention at stream registration commit 
versions --- fdbclient/NativeCdc.cpp | 4 +++- fdbclient/SystemData.cpp | 14 ++++++++++++++ fdbclient/include/fdbclient/SystemData.h | 2 ++ fdbserver/workloads/NativeCdc.cpp | 5 +++-- 4 files changed, 22 insertions(+), 3 deletions(-) diff --git a/fdbclient/NativeCdc.cpp b/fdbclient/NativeCdc.cpp index 3b0f4e28c15..5bb78db028e 100644 --- a/fdbclient/NativeCdc.cpp +++ b/fdbclient/NativeCdc.cpp @@ -246,7 +246,9 @@ Future registerNativeCdcStream(Database cx, Key name, KeyRange keys tr.set(cdcMaxStreamIdKey, cdcMaxStreamIdValue(streamId)); tr.set(cdcStreamKeyFor(streamId), cdcStreamKeysValue(keys)); tr.set(cdcTagHistoryKeyFor(streamId, registrationVersion, tag), Value()); - tr.set(cdcMinVersionKeyFor(streamId), cdcMinVersionValue(registrationVersion)); + tr.atomicOp(cdcMinVersionKeyFor(streamId), + cdcVersionstampedMinVersionValue(), + MutationRef::SetVersionstampedValue); if (proxyId.present()) { tr.set(cdcProxyKeyFor(streamId, proxyId.get()), Value()); signalNativeCdcProxyAssignmentChange(&tr); diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index a6fd38b2e37..10f126b4bc3 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -889,7 +889,19 @@ Value cdcMinVersionValue(Version version) { return wr.toValue(); } +Value cdcVersionstampedMinVersionValue() { + // Ten placeholder bytes followed by the versionstamp offset at byte zero. 
+ return "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"_sr; +} + Version decodeCDCMinVersionValue(ValueRef const& value) { + if (value.size() == sizeof(Version) + sizeof(uint16_t)) { + Versionstamp versionstamp; + BinaryReader reader(value, Unversioned()); + reader >> versionstamp; + return versionstamp.version; + } + Version version; BinaryReader reader(value, IncludeVersion()); ASSERT_WE_THINK(reader.protocolVersion().hasNativeCdc()); @@ -1816,6 +1828,8 @@ TEST_CASE("noSim/SystemData/NativeCDC") { ASSERT(decodeCDCStreamKeysValue(cdcStreamKeysValue(keys)) == keys); ASSERT(decodeCDCMinVersionKey(cdcMinVersionKeyFor(streamId)) == streamId); ASSERT(decodeCDCMinVersionValue(cdcMinVersionValue(minVersion)) == minVersion); + ASSERT(cdcVersionstampedMinVersionValue().size() == + sizeof(Version) + sizeof(uint16_t) + sizeof(int32_t)); const Key tagHistoryKey = cdcTagHistoryKeyFor(streamId, minVersion, tag); const auto [decodedStreamId, decodedVersion, decodedTag] = decodeCDCTagHistoryKey(tagHistoryKey); diff --git a/fdbclient/include/fdbclient/SystemData.h b/fdbclient/include/fdbclient/SystemData.h index fe3507ef030..2e13ba80d3d 100644 --- a/fdbclient/include/fdbclient/SystemData.h +++ b/fdbclient/include/fdbclient/SystemData.h @@ -293,10 +293,12 @@ KeyRange cdcTagHistoryRangeFor(CDCStreamId streamId); std::tuple decodeCDCTagHistoryKey(KeyRef const& key); // "\xff/cdc/minVersion/[[CDCStreamId]]" := "[[Version]]" +// The initial value is versionstamped at stream registration commit. 
extern const KeyRangeRef cdcMinVersionKeys; Key cdcMinVersionKeyFor(CDCStreamId streamId); CDCStreamId decodeCDCMinVersionKey(KeyRef const& key); Value cdcMinVersionValue(Version version); +Value cdcVersionstampedMinVersionValue(); Version decodeCDCMinVersionValue(ValueRef const& value); // "\xff/cdc/proxies/[[CDCStreamId]][[proxyUID]]" := "" diff --git a/fdbserver/workloads/NativeCdc.cpp b/fdbserver/workloads/NativeCdc.cpp index 39761239af8..59d91cc5a86 100644 --- a/fdbserver/workloads/NativeCdc.cpp +++ b/fdbserver/workloads/NativeCdc.cpp @@ -63,8 +63,9 @@ struct NativeCdcWorkload : TestWorkload { ASSERT(history.size() == 1); const auto [historyStreamId, historyVersion, tag] = decodeCDCTagHistoryKey(history[0].key); ASSERT(historyStreamId == streamId); - ASSERT(historyVersion == decodeCDCMinVersionValue(minVersion.get())); - co_return std::make_pair(tag, historyVersion); + const Version initialMinVersion = decodeCDCMinVersionValue(minVersion.get()); + ASSERT(historyVersion <= initialMinVersion); + co_return std::make_pair(tag, initialMinVersion); } catch (Error& e) { err = e; } From d3c942d50257141a353d812e1daef6997b6ce542 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Mon, 25 May 2026 13:05:48 -0700 Subject: [PATCH 21/56] Register the CDC shared-tag simulator test --- tests/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 7fbac15f4c2..595b7e7e74f 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -198,6 +198,7 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES fast/RandomUnitTests.toml) add_fdb_test(TEST_FILES fast/RangeLocking.toml) add_fdb_test(TEST_FILES fast/NativeCdc.toml) + add_fdb_test(TEST_FILES fast/NativeCdcSharedTag.toml) add_fdb_test(TEST_FILES fast/RangeLockCycle.toml) add_fdb_test(TEST_FILES fast/ReadHotDetectionCorrectness.toml IGNORE) # TODO re-enable once read hot detection is enabled. 
add_fdb_test(TEST_FILES fast/ReportConflictingKeys.toml) From eb58b1a40d79c0d00ac2b638a2c7aeed65421212 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Mon, 25 May 2026 17:11:04 -0700 Subject: [PATCH 22/56] Fix Native CDC recovery progress across log generations --- fdbserver/cdcproxy/CDCProxy.cpp | 5 ++++- fdbserver/workloads/NativeCdc.cpp | 6 +++--- tests/fast/NativeCdc.toml | 1 + tests/fast/NativeCdcSharedTag.toml | 1 + 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/fdbserver/cdcproxy/CDCProxy.cpp b/fdbserver/cdcproxy/CDCProxy.cpp index 275d47742a5..ca8aca2a9a8 100644 --- a/fdbserver/cdcproxy/CDCProxy.cpp +++ b/fdbserver/cdcproxy/CDCProxy.cpp @@ -257,6 +257,7 @@ Future bufferStream(CDCProxyData* self, Reference strea const Version previousBufferedThrough = stream->bufferedThrough; bufferMessages(stream, metadata, cursor); + stream->bufferedThrough = std::max(stream->bufferedThrough, cursor->version().version - 1); if (stream->bufferedThrough > previousBufferedThrough) { stream->changed.trigger(); } @@ -275,7 +276,9 @@ Future bufferStream(CDCProxyData* self, Reference strea stream->changed.trigger(); } } else { - co_await delay(0.1); + // ReplayMultiCursor advances past an empty old log generation on + // its next getMore(); rebuilding it here repeats that generation. + continue; } break; } diff --git a/fdbserver/workloads/NativeCdc.cpp b/fdbserver/workloads/NativeCdc.cpp index 59d91cc5a86..8bb0e468613 100644 --- a/fdbserver/workloads/NativeCdc.cpp +++ b/fdbserver/workloads/NativeCdc.cpp @@ -18,6 +18,7 @@ * limitations under the License. 
*/ +#include #include #include @@ -36,6 +37,8 @@ struct NativeCdcWorkload : TestWorkload { explicit NativeCdcWorkload(WorkloadContext const& wcx) : TestWorkload(wcx), sharedTagSafety(getOption(options, "sharedTagSafety"_sr, false)) {} + void disableFailureInjectionWorkloads(std::set& out) const override { out.insert("all"); } + Future setup(Database const& cx) override { return Void(); } Future start(Database const& cx) override { @@ -438,9 +441,6 @@ struct NativeCdcWorkload : TestWorkload { co_await acknowledgeNativeCdcStreamClient(cx, liveStreamId, afterRecoveryCursor); ASSERT(co_await getPersistedMinVersion(cx, liveStreamId) == afterRecoveryCursor + 1); - co_await timeoutError(waitForRecoveryAfter(recoveryBeforeChange, RecoveryState::FULLY_RECOVERED), 60.0); - recoveredOwner = co_await getCDCProxy(liveStreamId); - ASSERT(recoveredOwner.id() == ownerBeforeRecovery); co_await removeNativeCdcStreamClient(cx, liveName); co_await waitForCDCProxyAssignmentRemoval(liveStreamId); } diff --git a/tests/fast/NativeCdc.toml b/tests/fast/NativeCdc.toml index 24d602a48f1..e75f57a3e00 100644 --- a/tests/fast/NativeCdc.toml +++ b/tests/fast/NativeCdc.toml @@ -1,6 +1,7 @@ [[test]] testTitle = 'NativeCdc' useDB = true +waitForQuiescenceEnd = false [[test.workload]] testName = 'NativeCdc' diff --git a/tests/fast/NativeCdcSharedTag.toml b/tests/fast/NativeCdcSharedTag.toml index 47272100f1f..46b1c8ef425 100644 --- a/tests/fast/NativeCdcSharedTag.toml +++ b/tests/fast/NativeCdcSharedTag.toml @@ -1,6 +1,7 @@ [[test]] testTitle = 'NativeCdcSharedTag' useDB = true +waitForQuiescenceEnd = false [[test.workload]] testName = 'NativeCdc' From 7a9e54e7b573246bbf3f9692aaedef372b183966 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Tue, 26 May 2026 11:15:50 -0700 Subject: [PATCH 23/56] Contain Native CDC stream expiry without terminating its proxy --- fdbserver/cdcproxy/CDCProxy.cpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git 
a/fdbserver/cdcproxy/CDCProxy.cpp b/fdbserver/cdcproxy/CDCProxy.cpp index ca8aca2a9a8..124dcc4a02e 100644 --- a/fdbserver/cdcproxy/CDCProxy.cpp +++ b/fdbserver/cdcproxy/CDCProxy.cpp @@ -56,6 +56,7 @@ struct CDCBufferedStream : ReferenceCounted { CDCStreamId streamId; bool active = true; bool initialized = false; + bool tooOld = false; Version minVersion = invalidVersion; Version bufferedThrough = invalidVersion; int64_t bufferedBytes = 0; @@ -252,6 +253,14 @@ Future bufferStream(CDCProxyData* self, Reference strea if (cursor->popped() <= stream->minVersion) { break; } + TraceEvent("CDCBufferStreamTooOld", self->id) + .detail("StreamId", stream->streamId) + .detail("MinVersion", stream->minVersion) + .detail("BufferedThrough", stream->bufferedThrough) + .detail("Begin", begin) + .detail("Popped", cursor->popped()) + .detail("CurrentTag", metadata.currentTag) + .detail("TagHistorySize", metadata.tagHistory.size()); throw transaction_too_old(); } @@ -290,6 +299,12 @@ Future bufferStream(CDCProxyData* self, Reference strea stream->changed.trigger(); co_return; } + if (e.code() == error_code_transaction_too_old) { + stream->tooOld = true; + stream->active = false; + stream->changed.trigger(); + co_return; + } throw; } } @@ -394,6 +409,9 @@ Future consume(CDCProxyData* self, CDCConsumeRequest request) { while (!stream->initialized) { co_await stream->changed.onTrigger(); } + if (stream->tooOld) { + throw transaction_too_old(); + } Version begin = request.cursor.lastConsumedVersion == invalidVersion ? 
stream->minVersion : request.cursor.lastConsumedVersion + 1; @@ -407,6 +425,9 @@ Future consume(CDCProxyData* self, CDCConsumeRequest request) { while (stream->active && stream->bufferedThrough < begin) { co_await stream->changed.onTrigger(); } + if (stream->tooOld) { + throw transaction_too_old(); + } if (!stream->active) { throw wrong_shard_server(); } From f26afd0cb91d9d20c951d41cea385d3700b45ce3 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Tue, 26 May 2026 12:42:43 -0700 Subject: [PATCH 24/56] Fix Native CDC workload consumption synchronization --- fdbserver/workloads/NativeCdc.cpp | 74 ++++++++++++++++++++++--------- 1 file changed, 52 insertions(+), 22 deletions(-) diff --git a/fdbserver/workloads/NativeCdc.cpp b/fdbserver/workloads/NativeCdc.cpp index 8bb0e468613..8769ec6004f 100644 --- a/fdbserver/workloads/NativeCdc.cpp +++ b/fdbserver/workloads/NativeCdc.cpp @@ -226,10 +226,20 @@ struct NativeCdcWorkload : TestWorkload { write.set("shared/unread"_sr, "protected-by-minimum"_sr); co_await write.commit(); const Version writeVersion = write.getCommittedVersion(); - CDCConsumeReply consumed = co_await timeoutError( - firstOwner.consume.getReply(CDCConsumeRequest(CDCCursor(firstId, invalidVersion))), 30.0); - ASSERT(consumed.lastConsumedVersion >= writeVersion); - co_await firstOwner.ack.getReply(CDCAckRequest(firstId, consumed.lastConsumedVersion)); + CDCCursor firstCursor(firstId, invalidVersion); + const double firstConsumeDeadline = now() + 30.0; + while (firstCursor.lastConsumedVersion < writeVersion) { + CDCConsumeReply consumed = + co_await timeoutError(firstOwner.consume.getReply(CDCConsumeRequest(firstCursor)), 30.0); + if (consumed.lastConsumedVersion == firstCursor.lastConsumedVersion) { + ASSERT(now() < firstConsumeDeadline); + co_await delay(0.1); + continue; + } + ASSERT(consumed.lastConsumedVersion > firstCursor.lastConsumedVersion); + firstCursor.lastConsumedVersion = consumed.lastConsumedVersion; + } + co_await 
firstOwner.ack.getReply(CDCAckRequest(firstId, firstCursor.lastConsumedVersion)); ASSERT((co_await firstOwner.registerStream.getReply(CDCRegisterStreamRequest(secondName, keys))).streamId == secondId); @@ -357,18 +367,28 @@ struct NativeCdcWorkload : TestWorkload { break; } - CDCConsumeReply consumed = - co_await timeoutError(consumeNativeCdcStream(cx, CDCCursor(liveStreamId, invalidVersion)), 30.0); - ASSERT(consumed.lastConsumedVersion >= writeVersion); + Version consumedThrough = invalidVersion; bool foundInRangeWrite = false; bool foundOutOfRangeWrite = false; - for (const auto& versioned : consumed.mutations) { - for (const auto& mutation : versioned.mutations) { - if (mutation.param1 == "live/in"_sr) { - foundInRangeWrite = true; - } - if (mutation.param1 == "other/out"_sr) { - foundOutOfRangeWrite = true; + const double initialConsumeDeadline = now() + 30.0; + while (consumedThrough < writeVersion) { + CDCConsumeReply consumed = + co_await timeoutError(consumeNativeCdcStream(cx, CDCCursor(liveStreamId, consumedThrough)), 30.0); + if (consumed.lastConsumedVersion == consumedThrough) { + ASSERT(now() < initialConsumeDeadline); + co_await delay(0.1); + continue; + } + ASSERT(consumed.lastConsumedVersion > consumedThrough); + consumedThrough = consumed.lastConsumedVersion; + for (const auto& versioned : consumed.mutations) { + for (const auto& mutation : versioned.mutations) { + if (mutation.param1 == "live/in"_sr) { + foundInRangeWrite = true; + } + if (mutation.param1 == "other/out"_sr) { + foundOutOfRangeWrite = true; + } } } } @@ -386,20 +406,30 @@ struct NativeCdcWorkload : TestWorkload { afterFailureWrite.set("live/after-failure"_sr, "captured-after-failure"_sr); co_await afterFailureWrite.commit(); const Version afterFailureVersion = afterFailureWrite.getCommittedVersion(); - CDCConsumeReply afterFailure = co_await timeoutError( - consumeNativeCdcStream(cx, CDCCursor(liveStreamId, consumed.lastConsumedVersion)), 30.0); - 
ASSERT(afterFailure.lastConsumedVersion >= afterFailureVersion); + Version afterFailureCursor = consumedThrough; bool foundAfterFailureWrite = false; - for (const auto& versioned : afterFailure.mutations) { - for (const auto& mutation : versioned.mutations) { - if (mutation.param1 == "live/after-failure"_sr) { - foundAfterFailureWrite = true; + const double afterFailureConsumeDeadline = now() + 30.0; + while (afterFailureCursor < afterFailureVersion) { + CDCConsumeReply afterFailure = + co_await timeoutError(consumeNativeCdcStream(cx, CDCCursor(liveStreamId, afterFailureCursor)), 30.0); + if (afterFailure.lastConsumedVersion == afterFailureCursor) { + ASSERT(now() < afterFailureConsumeDeadline); + co_await delay(0.1); + continue; + } + ASSERT(afterFailure.lastConsumedVersion > afterFailureCursor); + afterFailureCursor = afterFailure.lastConsumedVersion; + for (const auto& versioned : afterFailure.mutations) { + for (const auto& mutation : versioned.mutations) { + if (mutation.param1 == "live/after-failure"_sr) { + foundAfterFailureWrite = true; + } } } } ASSERT(foundAfterFailureWrite); - const Version cursorBeforeRecovery = afterFailure.lastConsumedVersion; + const Version cursorBeforeRecovery = afterFailureCursor; co_await acknowledgeNativeCdcStreamClient(cx, liveStreamId, cursorBeforeRecovery); ASSERT(co_await getPersistedMinVersion(cx, liveStreamId) == cursorBeforeRecovery + 1); From e0d06a4dcf085ae209bae1a814e8161ac487e47d Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Wed, 27 May 2026 05:54:02 -0700 Subject: [PATCH 25/56] Fix Native CDC recovery handling and stabilize simulator coverage --- fdbserver/logsystem/LogSystemPeekCursor.cpp | 9 ++- fdbserver/tlog/TLogServer.cpp | 4 +- fdbserver/workloads/NativeCdc.cpp | 68 ++++++++++----------- tests/fast/NativeCdc.toml | 7 +++ tests/fast/NativeCdcSharedTag.toml | 7 +++ 5 files changed, 56 insertions(+), 39 deletions(-) diff --git a/fdbserver/logsystem/LogSystemPeekCursor.cpp 
b/fdbserver/logsystem/LogSystemPeekCursor.cpp index 8dfdbee95ee..dd78ec8d9d1 100644 --- a/fdbserver/logsystem/LogSystemPeekCursor.cpp +++ b/fdbserver/logsystem/LogSystemPeekCursor.cpp @@ -1044,8 +1044,13 @@ SetPeekCursor::SetPeekCursor(std::vector> const& logSets, ? canReturnEmptyVersionRange( bestServer, j /*currentServer*/, end, knownLockedTLogIds, bestSet, i /* currentSet */) : false); - auto cursor = makeReference( - logSets[i]->logServers[j], tag, begin, end, true, parallelGetMore, returnEmptyIfStopped); + auto cursor = makeReference(logSets[i]->logServers[j], + tag, + begin, + end, + tag.locality != tagLocalityCDC, + parallelGetMore, + returnEmptyIfStopped); serverCursors[i].push_back(cursor); } maxServers = std::max(maxServers, serverCursors[i].size()); diff --git a/fdbserver/tlog/TLogServer.cpp b/fdbserver/tlog/TLogServer.cpp index 265cf217fa4..e80071dc1e5 100644 --- a/fdbserver/tlog/TLogServer.cpp +++ b/fdbserver/tlog/TLogServer.cpp @@ -1950,8 +1950,8 @@ Future tLogPeekMessages(PromiseType replyPromise, auto tagData = logData->getTagData(reqTag); bool tagRecovered = tagData && !tagData->unpoppedRecovered; if (SERVER_KNOBS->ENABLE_VERSION_VECTOR && poppedVer <= reqBegin && - reqBegin > logData->persistentDataDurableVersion && !reqOnlySpilled && reqTag.locality >= 0 && - !reqReturnIfBlocked && tagRecovered) { + reqBegin > logData->persistentDataDurableVersion && !reqOnlySpilled && + (reqTag.locality >= 0 || reqTag.locality == tagLocalityCDC) && !reqReturnIfBlocked && tagRecovered) { double startTime = now(); co_await waitForMessagesForTag(logData, reqTag, reqBegin, SERVER_KNOBS->BLOCKING_PEEK_TIMEOUT); double latency = now() - startTime; diff --git a/fdbserver/workloads/NativeCdc.cpp b/fdbserver/workloads/NativeCdc.cpp index 8769ec6004f..a7536cc1580 100644 --- a/fdbserver/workloads/NativeCdc.cpp +++ b/fdbserver/workloads/NativeCdc.cpp @@ -126,6 +126,24 @@ struct NativeCdcWorkload : TestWorkload { } } + Future writeValues(Database cx, std::vector> values) { + 
Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + for (const auto& [key, value] : values) { + tr.set(key, value); + } + co_await tr.commit(); + co_return tr.getCommittedVersion(); + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } + } + Future getLatestPersistedTag(Database cx, CDCStreamId streamId) { Transaction tr(cx); while (true) { @@ -209,7 +227,6 @@ struct NativeCdcWorkload : TestWorkload { } Future runSharedTagSafety(Database cx) { - CDCProxyInterface proxy = co_await getCDCProxy(); const Key firstName = "native-cdc-shared-first"_sr; const Key secondName = "native-cdc-shared-second"_sr; const KeyRange keys(KeyRangeRef("shared/"_sr, "shared0"_sr)); @@ -219,18 +236,12 @@ struct NativeCdcWorkload : TestWorkload { co_await appendPersistedTag(cx, secondId, firstRoute.first); ASSERT((co_await getLatestPersistedTag(cx, secondId)) == firstRoute.first); - ASSERT((co_await proxy.registerStream.getReply(CDCRegisterStreamRequest(firstName, keys))).streamId == firstId); - CDCProxyInterface firstOwner = co_await getCDCProxy(firstId); - Transaction write(cx); - write.setOption(FDBTransactionOptions::LOCK_AWARE); - write.set("shared/unread"_sr, "protected-by-minimum"_sr); - co_await write.commit(); - const Version writeVersion = write.getCommittedVersion(); + ASSERT(co_await registerNativeCdcStreamClient(cx, firstName, keys) == firstId); + const Version writeVersion = co_await writeValues(cx, { { "shared/unread"_sr, "protected-by-minimum"_sr } }); CDCCursor firstCursor(firstId, invalidVersion); const double firstConsumeDeadline = now() + 30.0; while (firstCursor.lastConsumedVersion < writeVersion) { - CDCConsumeReply consumed = - co_await timeoutError(firstOwner.consume.getReply(CDCConsumeRequest(firstCursor)), 30.0); + CDCConsumeReply consumed = co_await timeoutError(consumeNativeCdcStream(cx, firstCursor), 30.0); if (consumed.lastConsumedVersion == firstCursor.lastConsumedVersion) { 
ASSERT(now() < firstConsumeDeadline); co_await delay(0.1); @@ -239,16 +250,13 @@ struct NativeCdcWorkload : TestWorkload { ASSERT(consumed.lastConsumedVersion > firstCursor.lastConsumedVersion); firstCursor.lastConsumedVersion = consumed.lastConsumedVersion; } - co_await firstOwner.ack.getReply(CDCAckRequest(firstId, firstCursor.lastConsumedVersion)); + co_await acknowledgeNativeCdcStreamClient(cx, firstId, firstCursor.lastConsumedVersion); - ASSERT((co_await firstOwner.registerStream.getReply(CDCRegisterStreamRequest(secondName, keys))).streamId == - secondId); - CDCProxyInterface secondOwner = co_await getCDCProxy(secondId); + ASSERT(co_await registerNativeCdcStreamClient(cx, secondName, keys) == secondId); CDCCursor unreadCursor(secondId, invalidVersion); bool foundUnread = false; while (unreadCursor.lastConsumedVersion < writeVersion) { - CDCConsumeReply unread = - co_await timeoutError(secondOwner.consume.getReply(CDCConsumeRequest(unreadCursor)), 30.0); + CDCConsumeReply unread = co_await timeoutError(consumeNativeCdcStream(cx, unreadCursor), 30.0); ASSERT(unread.lastConsumedVersion > unreadCursor.lastConsumedVersion); for (const auto& versioned : unread.mutations) { for (const auto& mutation : versioned.mutations) { @@ -260,10 +268,10 @@ struct NativeCdcWorkload : TestWorkload { unreadCursor.lastConsumedVersion = unread.lastConsumedVersion; } ASSERT(foundUnread); - co_await secondOwner.ack.getReply(CDCAckRequest(secondId, unreadCursor.lastConsumedVersion)); + co_await acknowledgeNativeCdcStreamClient(cx, secondId, unreadCursor.lastConsumedVersion); - co_await firstOwner.removeStream.getReply(CDCRemoveStreamRequest(firstName)); - co_await secondOwner.removeStream.getReply(CDCRemoveStreamRequest(secondName)); + co_await removeNativeCdcStreamClient(cx, firstName); + co_await removeNativeCdcStreamClient(cx, secondName); co_await waitForCDCProxyAssignmentRemoval(firstId); co_await waitForCDCProxyAssignmentRemoval(secondId); co_return; @@ -339,12 +347,8 @@ struct 
NativeCdcWorkload : TestWorkload { ASSERT(listed[0].streamId == liveStreamId); ASSERT(listed[0].keys == liveRange); - Transaction write(cx); - write.setOption(FDBTransactionOptions::LOCK_AWARE); - write.set("live/in"_sr, "captured"_sr); - write.set("other/out"_sr, "ignored"_sr); - co_await write.commit(); - const Version writeVersion = write.getCommittedVersion(); + const Version writeVersion = + co_await writeValues(cx, { { "live/in"_sr, "captured"_sr }, { "other/out"_sr, "ignored"_sr } }); for (const auto& nonOwner : dbInfo->get().client.cdcProxies) { if (nonOwner.id() == owner.id()) { @@ -401,11 +405,8 @@ struct NativeCdcWorkload : TestWorkload { ASSERT(replacement.id() != owner.id()); ASSERT(dbInfo->get().recoveryCount == recoveryCount); - Transaction afterFailureWrite(cx); - afterFailureWrite.setOption(FDBTransactionOptions::LOCK_AWARE); - afterFailureWrite.set("live/after-failure"_sr, "captured-after-failure"_sr); - co_await afterFailureWrite.commit(); - const Version afterFailureVersion = afterFailureWrite.getCommittedVersion(); + const Version afterFailureVersion = + co_await writeValues(cx, { { "live/after-failure"_sr, "captured-after-failure"_sr } }); Version afterFailureCursor = consumedThrough; bool foundAfterFailureWrite = false; const double afterFailureConsumeDeadline = now() + 30.0; @@ -441,11 +442,8 @@ struct NativeCdcWorkload : TestWorkload { CDCProxyInterface recoveredOwner = co_await getCDCProxy(liveStreamId); ASSERT(recoveredOwner.id() == ownerBeforeRecovery); - Transaction afterRecoveryWrite(cx); - afterRecoveryWrite.setOption(FDBTransactionOptions::LOCK_AWARE); - afterRecoveryWrite.set("live/after-recovery"_sr, "captured-after-recovery"_sr); - co_await afterRecoveryWrite.commit(); - const Version afterRecoveryVersion = afterRecoveryWrite.getCommittedVersion(); + const Version afterRecoveryVersion = + co_await writeValues(cx, { { "live/after-recovery"_sr, "captured-after-recovery"_sr } }); Version afterRecoveryCursor = cursorBeforeRecovery; 
bool foundAfterRecoveryWrite = false; const double afterRecoveryConsumeDeadline = now() + 30.0; diff --git a/tests/fast/NativeCdc.toml b/tests/fast/NativeCdc.toml index e75f57a3e00..b6286c322fd 100644 --- a/tests/fast/NativeCdc.toml +++ b/tests/fast/NativeCdc.toml @@ -1,7 +1,14 @@ +[configuration] +config = 'single' +singleRegion = true +buggify = false +faultInjection = false + [[test]] testTitle = 'NativeCdc' useDB = true waitForQuiescenceEnd = false +connectionFailuresDisableDuration = 1000000 [[test.workload]] testName = 'NativeCdc' diff --git a/tests/fast/NativeCdcSharedTag.toml b/tests/fast/NativeCdcSharedTag.toml index 46b1c8ef425..87e3057d10f 100644 --- a/tests/fast/NativeCdcSharedTag.toml +++ b/tests/fast/NativeCdcSharedTag.toml @@ -1,7 +1,14 @@ +[configuration] +config = 'single' +singleRegion = true +buggify = false +faultInjection = false + [[test]] testTitle = 'NativeCdcSharedTag' useDB = true waitForQuiescenceEnd = false +connectionFailuresDisableDuration = 1000000 [[test.workload]] testName = 'NativeCdc' From 9b4b2f595e3b5db6b985e3f950a4d4546327b916 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Wed, 27 May 2026 09:57:24 -0700 Subject: [PATCH 26/56] Simplify serialize methods --- .../include/fdbclient/CommitProxyInterface.h | 25 ++-- .../fdbserver/core/WorkerInterface.actor.h | 128 ++++++------------ 2 files changed, 50 insertions(+), 103 deletions(-) diff --git a/fdbclient/include/fdbclient/CommitProxyInterface.h b/fdbclient/include/fdbclient/CommitProxyInterface.h index ba48de19c52..b493007f05d 100644 --- a/fdbclient/include/fdbclient/CommitProxyInterface.h +++ b/fdbclient/include/fdbclient/CommitProxyInterface.h @@ -132,22 +132,17 @@ struct ClientDBInfo { void serialize(Archive& ar) { if constexpr (!is_fb_function) { ASSERT(ar.protocolVersion().isValid()); - serializer(ar, grvProxies, commitProxies, id, forward, history, clusterId, clusterType); - if (ar.protocolVersion().hasNativeCdc()) { - serializer(ar, cdcProxies, 
streamToCDCProxyId); - } - } else { - serializer(ar, - grvProxies, - commitProxies, - id, - forward, - history, - clusterId, - clusterType, - cdcProxies, - streamToCDCProxyId); } + serializer(ar, + grvProxies, + commitProxies, + id, + forward, + history, + clusterId, + clusterType, + cdcProxies, + streamToCDCProxyId); } }; diff --git a/fdbserver/core/include/fdbserver/core/WorkerInterface.actor.h b/fdbserver/core/include/fdbserver/core/WorkerInterface.actor.h index 74856ea3652..cd4ca503c6e 100644 --- a/fdbserver/core/include/fdbserver/core/WorkerInterface.actor.h +++ b/fdbserver/core/include/fdbserver/core/WorkerInterface.actor.h @@ -100,63 +100,32 @@ struct WorkerInterface { template void serialize(Ar& ar) { - if constexpr (is_fb_function) { - serializer(ar, - clientInterface, - locality, - tLog, - master, - commitProxy, - grvProxy, - dataDistributor, - ratekeeper, - consistencyScan, - resolver, - storage, - logRouter, - debugPing, - coordinationPing, - waitFailure, - setMetricsRate, - eventLogRequest, - traceBatchDumpRequest, - testerInterface, - diskStoreRequest, - execReq, - workerSnapReq, - backup, - updateServerDBInfo, - cdcProxy); - } else { - serializer(ar, - clientInterface, - locality, - tLog, - master, - commitProxy, - grvProxy, - dataDistributor, - ratekeeper, - consistencyScan, - resolver, - storage, - logRouter, - debugPing, - coordinationPing, - waitFailure, - setMetricsRate, - eventLogRequest, - traceBatchDumpRequest, - testerInterface, - diskStoreRequest, - execReq, - workerSnapReq, - backup, - updateServerDBInfo); - if (ar.protocolVersion().hasNativeCdc()) { - serializer(ar, cdcProxy); - } - } + serializer(ar, + clientInterface, + locality, + tLog, + master, + commitProxy, + grvProxy, + dataDistributor, + ratekeeper, + consistencyScan, + resolver, + storage, + logRouter, + debugPing, + coordinationPing, + waitFailure, + setMetricsRate, + eventLogRequest, + traceBatchDumpRequest, + testerInterface, + diskStoreRequest, + execReq, + workerSnapReq, + 
backup, + updateServerDBInfo, + cdcProxy); } }; @@ -289,38 +258,21 @@ struct RegisterMasterRequest { void serialize(Ar& ar) { if constexpr (!is_fb_function) { ASSERT(ar.protocolVersion().isValid()); - serializer(ar, - id, - mi, - logSystemConfig, - commitProxies, - grvProxies, - resolvers, - recoveryCount, - registrationCount, - configuration, - priorCommittedLogServers, - recoveryState, - recoveryStalled); - if (ar.protocolVersion().hasNativeCdc()) { - serializer(ar, cdcProxies); - } - } else { - serializer(ar, - id, - mi, - logSystemConfig, - commitProxies, - grvProxies, - resolvers, - recoveryCount, - registrationCount, - configuration, - priorCommittedLogServers, - recoveryState, - recoveryStalled, - cdcProxies); } + serializer(ar, + id, + mi, + logSystemConfig, + commitProxies, + grvProxies, + resolvers, + recoveryCount, + registrationCount, + configuration, + priorCommittedLogServers, + recoveryState, + recoveryStalled, + cdcProxies); } }; From fc075ffbe8a2d7f5d238c6c38b97296e1fee5818 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Wed, 27 May 2026 10:19:24 -0700 Subject: [PATCH 27/56] Move native CDC acknowledgements to storage-backed system keys --- fdbclient/SystemData.cpp | 6 +++--- fdbclient/include/fdbclient/SystemData.h | 5 +++-- fdbserver/cdcproxy/CDCProxy.cpp | 6 ++++-- fdbserver/logsystem/ApplyMetadataMutation.cpp | 20 +++++++++---------- 4 files changed, 19 insertions(+), 18 deletions(-) diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index 10f126b4bc3..4cd73708430 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -775,7 +775,7 @@ const KeyRangeRef cdcStreamNameKeys("\xff/cdc/name/"_sr, "\xff/cdc/name0"_sr); const KeyRef cdcMaxStreamIdKey = "\xff/cdc/maxStreamId"_sr; const KeyRangeRef cdcStreamKeys("\xff/cdc/keys/"_sr, "\xff/cdc/keys0"_sr); const KeyRangeRef cdcTagHistoryKeys("\xff/cdc/tagHistory/"_sr, "\xff/cdc/tagHistory0"_sr); -const KeyRangeRef cdcMinVersionKeys("\xff/cdc/minVersion/"_sr, 
"\xff/cdc/minVersion0"_sr); +const KeyRangeRef cdcMinVersionKeys("\xff\x02/cdc/minVersion/"_sr, "\xff\x02/cdc/minVersion0"_sr); const KeyRangeRef cdcProxyKeys("\xff/cdc/proxies/"_sr, "\xff/cdc/proxies0"_sr); const KeyRef cdcProxyAssignmentChangeKey = "\xff/cdc/proxyAssignmentChange"_sr; @@ -1828,8 +1828,8 @@ TEST_CASE("noSim/SystemData/NativeCDC") { ASSERT(decodeCDCStreamKeysValue(cdcStreamKeysValue(keys)) == keys); ASSERT(decodeCDCMinVersionKey(cdcMinVersionKeyFor(streamId)) == streamId); ASSERT(decodeCDCMinVersionValue(cdcMinVersionValue(minVersion)) == minVersion); - ASSERT(cdcVersionstampedMinVersionValue().size() == - sizeof(Version) + sizeof(uint16_t) + sizeof(int32_t)); + ASSERT(nonMetadataSystemKeys.contains(cdcMinVersionKeyFor(streamId))); + ASSERT(cdcVersionstampedMinVersionValue().size() == sizeof(Version) + sizeof(uint16_t) + sizeof(int32_t)); const Key tagHistoryKey = cdcTagHistoryKeyFor(streamId, minVersion, tag); const auto [decodedStreamId, decodedVersion, decodedTag] = decodeCDCTagHistoryKey(tagHistoryKey); diff --git a/fdbclient/include/fdbclient/SystemData.h b/fdbclient/include/fdbclient/SystemData.h index 2e13ba80d3d..20eb711aa81 100644 --- a/fdbclient/include/fdbclient/SystemData.h +++ b/fdbclient/include/fdbclient/SystemData.h @@ -266,7 +266,7 @@ Value tagLocalityListValue(int8_t const&); Optional decodeTagLocalityListKey(KeyRef const&); int8_t decodeTagLocalityListValue(ValueRef const&); -// Native CDC metadata persisted in the transaction state store. +// Native CDC stream routing and lifecycle metadata persisted in the transaction state store. 
// "\xff/cdc/name/[[streamName]]" := "[[CDCStreamId]]" extern const KeyRangeRef cdcStreamNameKeys; Key cdcStreamNameKeyFor(KeyRef const& streamName); @@ -292,7 +292,8 @@ Key cdcTagHistoryKeyFor(CDCStreamId streamId, Version version, Tag tag); KeyRange cdcTagHistoryRangeFor(CDCStreamId streamId); std::tuple decodeCDCTagHistoryKey(KeyRef const& key); -// "\xff/cdc/minVersion/[[CDCStreamId]]" := "[[Version]]" +// Native CDC acknowledgement progress is regular storage-server-backed system data. +// "\xff\x02/cdc/minVersion/[[CDCStreamId]]" := "[[Version]]" // The initial value is versionstamped at stream registration commit. extern const KeyRangeRef cdcMinVersionKeys; Key cdcMinVersionKeyFor(CDCStreamId streamId); diff --git a/fdbserver/cdcproxy/CDCProxy.cpp b/fdbserver/cdcproxy/CDCProxy.cpp index 124dcc4a02e..21eea501303 100644 --- a/fdbserver/cdcproxy/CDCProxy.cpp +++ b/fdbserver/cdcproxy/CDCProxy.cpp @@ -461,8 +461,8 @@ Future acknowledge(CDCProxyData* self, CDCAckRequest request) { if (found != self->streams.end()) { found->second->minVersion = std::max(found->second->minVersion, minVersion); while (!found->second->mutations.empty() && found->second->mutations.front().version < minVersion) { - found->second->bufferedBytes -= sizeof(VersionedMutationsRef) + - found->second->mutations.front().mutations.expectedSize(); + found->second->bufferedBytes -= + sizeof(VersionedMutationsRef) + found->second->mutations.front().mutations.expectedSize(); found->second->mutations.pop_front(); } ASSERT(found->second->bufferedBytes >= 0); @@ -540,6 +540,7 @@ Future cdcProxyServer(CDCProxyInterface proxy, actors.add(traceRole(Role::CDC_PROXY, proxy.id())); self.logSystem->set(makeLogSystemConsumerFromServerDBInfo(self.id, dbInfo->get())); reconcileStreams(&self, &actors); + actors.add(popAcknowledgedData(&self)); Future dbInfoChange = dbInfo->onChange(); bool hasBeenPublished = std::find(dbInfo->get().client.cdcProxies.begin(), dbInfo->get().client.cdcProxies.end(), proxy) != @@ 
-589,6 +590,7 @@ Future cdcProxyServer(CDCProxyInterface proxy, self.logSystem->set(makeLogSystemConsumerFromServerDBInfo(self.id, dbInfo->get())); } reconcileStreams(&self, &actors); + actors.add(popAcknowledgedData(&self)); dbInfoChange = dbInfo->onChange(); break; } diff --git a/fdbserver/logsystem/ApplyMetadataMutation.cpp b/fdbserver/logsystem/ApplyMetadataMutation.cpp index 8255eeb4056..caf251c6720 100644 --- a/fdbserver/logsystem/ApplyMetadataMutation.cpp +++ b/fdbserver/logsystem/ApplyMetadataMutation.cpp @@ -614,9 +614,8 @@ class ApplyMetadataMutationsImpl { void checkSetCDCMetadata(MutationRef m) { if (!cdcStreamNameKeys.contains(m.param1) && !cdcStreamKeys.contains(m.param1) && - !cdcTagHistoryKeys.contains(m.param1) && !cdcMinVersionKeys.contains(m.param1) && - !cdcProxyKeys.contains(m.param1) && m.param1 != cdcMaxStreamIdKey && - m.param1 != cdcProxyAssignmentChangeKey) { + !cdcTagHistoryKeys.contains(m.param1) && !cdcProxyKeys.contains(m.param1) && + m.param1 != cdcMaxStreamIdKey && m.param1 != cdcProxyAssignmentChangeKey) { return; } if (!initialCommit) { @@ -1081,8 +1080,8 @@ class ApplyMetadataMutationsImpl { void checkClearCDCMetadata(KeyRangeRef range) { if (!cdcStreamNameKeys.intersects(range) && !cdcStreamKeys.intersects(range) && - !cdcTagHistoryKeys.intersects(range) && !cdcMinVersionKeys.intersects(range) && - !cdcProxyKeys.intersects(range) && !range.contains(cdcMaxStreamIdKey)) { + !cdcTagHistoryKeys.intersects(range) && !cdcProxyKeys.intersects(range) && + !range.contains(cdcMaxStreamIdKey)) { return; } if (logSystemConsumer && popVersion && cdcStreamKeys.intersects(range)) { @@ -1105,8 +1104,7 @@ class ApplyMetadataMutationsImpl { } } if (!initialCommit) { - for (const KeyRangeRef cdcRange : - { cdcStreamNameKeys, cdcStreamKeys, cdcTagHistoryKeys, cdcMinVersionKeys, cdcProxyKeys }) { + for (const KeyRangeRef cdcRange : { cdcStreamNameKeys, cdcStreamKeys, cdcTagHistoryKeys, cdcProxyKeys }) { if (cdcRange.intersects(range)) { 
txnStateStore->clear(cdcRange & range); } @@ -1353,8 +1351,8 @@ bool containsMetadataMutation(const VectorRef& mutations) { (m.param1.startsWith(logRangesRange.begin)) || (m.param1.startsWith(serverKeysPrefix)) || (m.param1.startsWith(keyServersPrefix)) || cdcStreamNameKeys.contains(m.param1) || cdcStreamKeys.contains(m.param1) || cdcTagHistoryKeys.contains(m.param1) || - cdcMinVersionKeys.contains(m.param1) || cdcProxyKeys.contains(m.param1) || - m.param1 == cdcMaxStreamIdKey || m.param1 == cdcProxyAssignmentChangeKey) { + cdcProxyKeys.contains(m.param1) || m.param1 == cdcMaxStreamIdKey || + m.param1 == cdcProxyAssignmentChangeKey) { return true; } } else if (m.type == MutationRef::ClearRange && isSystemKey(m.param2)) { @@ -1369,8 +1367,8 @@ bool containsMetadataMutation(const VectorRef& mutations) { (range.contains(metadataVersionKey)) || (range.contains(mustContainSystemMutationsKey)) || (range.contains(writeRecoveryKey)) || (range.intersects(testOnlyTxnStateStorePrefixRange)) || cdcStreamNameKeys.intersects(range) || cdcStreamKeys.intersects(range) || - cdcTagHistoryKeys.intersects(range) || cdcMinVersionKeys.intersects(range) || - cdcProxyKeys.intersects(range) || range.contains(cdcMaxStreamIdKey)) { + cdcTagHistoryKeys.intersects(range) || cdcProxyKeys.intersects(range) || + range.contains(cdcMaxStreamIdKey)) { return true; } } From 7964d4fd3e1fff8b9302dacce082207b1ce211fe Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Wed, 27 May 2026 10:21:11 -0700 Subject: [PATCH 28/56] Fix formatting --- fdbclient/NativeCdc.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fdbclient/NativeCdc.cpp b/fdbclient/NativeCdc.cpp index 5bb78db028e..63692657156 100644 --- a/fdbclient/NativeCdc.cpp +++ b/fdbclient/NativeCdc.cpp @@ -246,9 +246,8 @@ Future registerNativeCdcStream(Database cx, Key name, KeyRange keys tr.set(cdcMaxStreamIdKey, cdcMaxStreamIdValue(streamId)); tr.set(cdcStreamKeyFor(streamId), cdcStreamKeysValue(keys)); 
tr.set(cdcTagHistoryKeyFor(streamId, registrationVersion, tag), Value()); - tr.atomicOp(cdcMinVersionKeyFor(streamId), - cdcVersionstampedMinVersionValue(), - MutationRef::SetVersionstampedValue); + tr.atomicOp( + cdcMinVersionKeyFor(streamId), cdcVersionstampedMinVersionValue(), MutationRef::SetVersionstampedValue); if (proxyId.present()) { tr.set(cdcProxyKeyFor(streamId, proxyId.get()), Value()); signalNativeCdcProxyAssignmentChange(&tr); From 74fb2b09b5afcf83addbf5871b44291c4f1338c5 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Wed, 27 May 2026 11:09:52 -0700 Subject: [PATCH 29/56] Prevent CDC stream removal from popping shared unread data --- fdbclient/NativeCdc.cpp | 25 ++++++++++++++++--- fdbclient/include/fdbclient/NativeCdc.h | 11 +++++++- fdbserver/cdcproxy/CDCProxy.cpp | 14 ++++++++++- fdbserver/logsystem/ApplyMetadataMutation.cpp | 21 ++-------------- fdbserver/workloads/NativeCdc.cpp | 4 +-- 5 files changed, 49 insertions(+), 26 deletions(-) diff --git a/fdbclient/NativeCdc.cpp b/fdbclient/NativeCdc.cpp index 63692657156..36eb9d1981a 100644 --- a/fdbclient/NativeCdc.cpp +++ b/fdbclient/NativeCdc.cpp @@ -261,7 +261,7 @@ Future registerNativeCdcStream(Database cx, Key name, KeyRange keys } } -Future removeNativeCdcStream(Database cx, Key name, Optional proxyId) { +Future> removeNativeCdcStream(Database cx, Key name, Optional proxyId) { if (name.empty()) { throw client_invalid_operation(); } @@ -276,7 +276,7 @@ Future removeNativeCdcStream(Database cx, Key name, Optional proxyId) const Key nameKey = cdcStreamNameKeyFor(name); Optional currentId = co_await tr.get(nameKey); if (!currentId.present()) { - co_return; + co_return Optional(); } const CDCStreamId streamId = decodeCDCStreamNameValue(currentId.get()); @@ -284,6 +284,22 @@ Future removeNativeCdcStream(Database cx, Key name, Optional proxyId) if (proxyId.present() && (!assignedProxy.present() || assignedProxy.get() != proxyId.get())) { throw wrong_shard_server(); } + + std::set removedTags; 
+ const KeyRange historyRange = cdcTagHistoryRangeFor(streamId); + Key begin = historyRange.begin; + while (begin < historyRange.end) { + RangeResult history = + co_await tr.getRange(KeyRangeRef(begin, historyRange.end), CLIENT_KNOBS->TOO_MANY); + for (const auto& entry : history) { + removedTags.insert(std::get<2>(decodeCDCTagHistoryKey(entry.key))); + } + if (!history.more) { + break; + } + begin = keyAfter(history.back().key); + } + tr.clear(nameKey); tr.clear(cdcStreamKeyFor(streamId)); tr.clear(cdcTagHistoryRangeFor(streamId)); @@ -293,7 +309,10 @@ Future removeNativeCdcStream(Database cx, Key name, Optional proxyId) signalNativeCdcProxyAssignmentChange(&tr); } co_await tr.commit(); - co_return; + NativeCdcRemovedStreamInfo removed; + removed.removalVersion = tr.getCommittedVersion(); + removed.tags.assign(removedTags.begin(), removedTags.end()); + co_return Optional(removed); } catch (Error& e) { if (e.code() == error_code_wrong_shard_server) { throw; diff --git a/fdbclient/include/fdbclient/NativeCdc.h b/fdbclient/include/fdbclient/NativeCdc.h index 703957695b6..fcfb981889e 100644 --- a/fdbclient/include/fdbclient/NativeCdc.h +++ b/fdbclient/include/fdbclient/NativeCdc.h @@ -34,13 +34,22 @@ struct NativeCdcStreamInfo { Version minVersion = invalidVersion; }; +struct NativeCdcRemovedStreamInfo { + Version removalVersion = invalidVersion; + std::vector tags; +}; + // These durable metadata operations are intended to back CDCProxyInterface // lifecycle requests once CDC proxies are recruited. Future registerNativeCdcStream(Database cx, Key name, KeyRange keys, Optional proxyId = Optional()); -Future removeNativeCdcStream(Database cx, Key name, Optional proxyId = Optional()); +// Returns the retired tags so the owning proxy can pop them after applying +// the acknowledgement minima of any remaining streams that share them. 
+Future> removeNativeCdcStream(Database cx, + Key name, + Optional proxyId = Optional()); Future> listNativeCdcStreams(Database cx); // Atomically moves any streams assigned to a failed proxy to its replacement. Future reassignNativeCdcStreams(Database cx, UID oldProxyId, UID newProxyId); diff --git a/fdbserver/cdcproxy/CDCProxy.cpp b/fdbserver/cdcproxy/CDCProxy.cpp index 21eea501303..6d80f398acf 100644 --- a/fdbserver/cdcproxy/CDCProxy.cpp +++ b/fdbserver/cdcproxy/CDCProxy.cpp @@ -369,6 +369,15 @@ Future popAcknowledgedData(CDCProxyData* self) { } } +Future popRemovedStreamData(CDCProxyData* self, NativeCdcRemovedStreamInfo removed) { + const std::map safePopVersions = co_await readSafePopVersions(self->cx); + for (const Tag& tag : removed.tags) { + const auto safePop = safePopVersions.find(tag); + const Version version = safePop == safePopVersions.end() ? removed.removalVersion : safePop->second; + self->logSystem->get()->pop(version, tag); + } +} + void reconcileStreams(CDCProxyData* self, ActorCollection* actors) { std::set assignedStreams; for (const auto& [streamId, proxyId] : self->dbInfo->get().client.streamToCDCProxyId) { @@ -495,7 +504,10 @@ Future registerStream(CDCProxyData* self, CDCRegisterStreamRequest request Future removeStream(CDCProxyData* self, CDCRemoveStreamRequest request) { try { - co_await removeNativeCdcStream(self->cx, request.name, self->id); + Optional removed = co_await removeNativeCdcStream(self->cx, request.name, self->id); + if (removed.present()) { + co_await popRemovedStreamData(self, removed.get()); + } request.reply.send(Void()); } catch (Error& e) { if (e.code() == error_code_actor_cancelled) { diff --git a/fdbserver/logsystem/ApplyMetadataMutation.cpp b/fdbserver/logsystem/ApplyMetadataMutation.cpp index caf251c6720..4a3191425da 100644 --- a/fdbserver/logsystem/ApplyMetadataMutation.cpp +++ b/fdbserver/logsystem/ApplyMetadataMutation.cpp @@ -1084,25 +1084,8 @@ class ApplyMetadataMutationsImpl { 
!range.contains(cdcMaxStreamIdKey)) { return; } - if (logSystemConsumer && popVersion && cdcStreamKeys.intersects(range)) { - auto streamsCleared = txnStateStore->readRange(range & cdcStreamKeys).get(); - for (const auto& stream : streamsCleared) { - const CDCStreamId streamId = decodeCDCStreamKey(stream.key); - auto tagHistory = txnStateStore->readRange(cdcTagHistoryRangeFor(streamId)).get(); - for (const auto& entry : tagHistory) { - const Tag tag = std::get<2>(decodeCDCTagHistoryKey(entry.key)); - TraceEvent("CDCStreamTagRemove") - .detail("PopVersion", popVersion) - .detail("Tag", tag.toString()) - .detail("StreamId", streamId); - if (!forResolver) { - logSystemConsumer->pop(popVersion, tag); - (*tag_popped)[tag] = popVersion; - } - ASSERT_WE_THINK(forResolver ^ (tag_popped != nullptr)); - } - } - } + // CDC tags may be shared and acknowledgement minima are stored outside transaction state. + // The owning CDC proxy safely pops retired tags after stream removal commits. if (!initialCommit) { for (const KeyRangeRef cdcRange : { cdcStreamNameKeys, cdcStreamKeys, cdcTagHistoryKeys, cdcProxyKeys }) { if (cdcRange.intersects(range)) { diff --git a/fdbserver/workloads/NativeCdc.cpp b/fdbserver/workloads/NativeCdc.cpp index a7536cc1580..afc4573295d 100644 --- a/fdbserver/workloads/NativeCdc.cpp +++ b/fdbserver/workloads/NativeCdc.cpp @@ -251,6 +251,8 @@ struct NativeCdcWorkload : TestWorkload { firstCursor.lastConsumedVersion = consumed.lastConsumedVersion; } co_await acknowledgeNativeCdcStreamClient(cx, firstId, firstCursor.lastConsumedVersion); + co_await removeNativeCdcStreamClient(cx, firstName); + co_await waitForCDCProxyAssignmentRemoval(firstId); ASSERT(co_await registerNativeCdcStreamClient(cx, secondName, keys) == secondId); CDCCursor unreadCursor(secondId, invalidVersion); @@ -270,9 +272,7 @@ struct NativeCdcWorkload : TestWorkload { ASSERT(foundUnread); co_await acknowledgeNativeCdcStreamClient(cx, secondId, unreadCursor.lastConsumedVersion); - co_await 
removeNativeCdcStreamClient(cx, firstName); co_await removeNativeCdcStreamClient(cx, secondName); - co_await waitForCDCProxyAssignmentRemoval(firstId); co_await waitForCDCProxyAssignmentRemoval(secondId); co_return; } From eab9670fa9fd0008761c2a59a5c941a77fa5a127 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Wed, 27 May 2026 11:16:03 -0700 Subject: [PATCH 30/56] Fail CDC consume and acknowledge requests after stream removal --- fdbclient/NativeCdc.cpp | 24 +++++++++++++++++++++--- fdbserver/cdcproxy/CDCProxy.cpp | 6 +++++- fdbserver/workloads/NativeCdc.cpp | 29 +++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 4 deletions(-) diff --git a/fdbclient/NativeCdc.cpp b/fdbclient/NativeCdc.cpp index 36eb9d1981a..12e8bddc78b 100644 --- a/fdbclient/NativeCdc.cpp +++ b/fdbclient/NativeCdc.cpp @@ -156,6 +156,21 @@ Future getAvailableNativeCdcProxy(Database cx, Optional } } +Future nativeCdcStreamStillExists(Database cx, CDCStreamId streamId) { + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE); + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + co_return (co_await tr.get(cdcStreamKeyFor(streamId))).present(); + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } +} + Future getNativeCdcStreamProxy(Database cx, CDCStreamId streamId) { if (streamId == 0) { throw client_invalid_operation(); @@ -171,11 +186,14 @@ Future getNativeCdcStreamProxy(Database cx, CDCStreamId strea } } } - co_await cx->clientInfo->onChange(); + if (!(co_await nativeCdcStreamStillExists(cx, streamId))) { + throw client_invalid_operation(); + } + co_await delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, cx->taskID); } } -Future nativeCdcStreamStillExists(Database cx, Key name, CDCStreamId streamId) { +Future namedNativeCdcStreamStillExists(Database cx, Key name, CDCStreamId streamId) { Transaction tr(cx); while (true) { Error err; @@ -202,7 +220,7 @@ Future> 
getNativeCdcStreamProxyForRemoval(Database c } } } - if (!(co_await nativeCdcStreamStillExists(cx, name, streamId))) { + if (!(co_await namedNativeCdcStreamStillExists(cx, name, streamId))) { co_return Optional(); } co_await delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, cx->taskID); diff --git a/fdbserver/cdcproxy/CDCProxy.cpp b/fdbserver/cdcproxy/CDCProxy.cpp index 6d80f398acf..fb0189ee554 100644 --- a/fdbserver/cdcproxy/CDCProxy.cpp +++ b/fdbserver/cdcproxy/CDCProxy.cpp @@ -395,6 +395,7 @@ void reconcileStreams(CDCProxyData* self, ActorCollection* actors) { if (!assignedStreams.contains(it->first)) { it->second->active = false; it->second->stopped.trigger(); + it->second->changed.trigger(); it = self->streams.erase(it); } else { ++it; @@ -415,9 +416,12 @@ Future consume(CDCProxyData* self, CDCConsumeRequest request) { throw wrong_shard_server(); } Reference stream = found->second; - while (!stream->initialized) { + while (stream->active && !stream->initialized) { co_await stream->changed.onTrigger(); } + if (!stream->active) { + throw wrong_shard_server(); + } if (stream->tooOld) { throw transaction_too_old(); } diff --git a/fdbserver/workloads/NativeCdc.cpp b/fdbserver/workloads/NativeCdc.cpp index afc4573295d..0039b8ca1b5 100644 --- a/fdbserver/workloads/NativeCdc.cpp +++ b/fdbserver/workloads/NativeCdc.cpp @@ -469,8 +469,37 @@ struct NativeCdcWorkload : TestWorkload { co_await acknowledgeNativeCdcStreamClient(cx, liveStreamId, afterRecoveryCursor); ASSERT(co_await getPersistedMinVersion(cx, liveStreamId) == afterRecoveryCursor + 1); + + Future pendingConsume = + recoveredOwner.consume.getReply(CDCConsumeRequest(CDCCursor(liveStreamId, afterRecoveryCursor + 1000000))); + co_await delay(0.1); co_await removeNativeCdcStreamClient(cx, liveName); co_await waitForCDCProxyAssignmentRemoval(liveStreamId); + + bool pendingConsumeRejected = false; + try { + co_await timeoutError(pendingConsume, 30.0); + } catch (Error& e) { + pendingConsumeRejected = + e.code() == 
error_code_wrong_shard_server || e.code() == error_code_client_invalid_operation; + } + ASSERT(pendingConsumeRejected); + + bool retiredConsumeRejected = false; + try { + co_await timeoutError(consumeNativeCdcStream(cx, CDCCursor(liveStreamId, afterRecoveryCursor)), 30.0); + } catch (Error& e) { + retiredConsumeRejected = e.code() == error_code_client_invalid_operation; + } + ASSERT(retiredConsumeRejected); + + bool retiredClientAcknowledgeRejected = false; + try { + co_await timeoutError(acknowledgeNativeCdcStreamClient(cx, liveStreamId, afterRecoveryCursor), 30.0); + } catch (Error& e) { + retiredClientAcknowledgeRejected = e.code() == error_code_client_invalid_operation; + } + ASSERT(retiredClientAcknowledgeRejected); } }; From d3fdfad8b526ece8d0bf3f8ccfcbffa9c38e5d1a Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Wed, 27 May 2026 11:20:52 -0700 Subject: [PATCH 31/56] Update file_identifiers --- .../include/fdbclient/CDCProxyInterface.h | 26 +++++++++---------- .../fdbserver/core/WorkerInterface.actor.h | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/fdbclient/include/fdbclient/CDCProxyInterface.h b/fdbclient/include/fdbclient/CDCProxyInterface.h index 981783eacc2..b15c994bcd0 100644 --- a/fdbclient/include/fdbclient/CDCProxyInterface.h +++ b/fdbclient/include/fdbclient/CDCProxyInterface.h @@ -27,7 +27,7 @@ #include "fdbrpc/fdbrpc.h" struct CDCCursor { - constexpr static FileIdentifier file_identifier = 16776001; + constexpr static FileIdentifier file_identifier = 10949553; CDCStreamId streamId = 0; Version lastConsumedVersion = invalidVersion; @@ -42,7 +42,7 @@ struct CDCCursor { }; struct VersionedMutationsRef { - constexpr static FileIdentifier file_identifier = 16776002; + constexpr static FileIdentifier file_identifier = 3297577; Version version = invalidVersion; VectorRef mutations; @@ -56,7 +56,7 @@ struct VersionedMutationsRef { }; struct CDCStreamInfoRef { - constexpr static FileIdentifier file_identifier = 16776003; + 
constexpr static FileIdentifier file_identifier = 10228408; StringRef name; CDCStreamId streamId = 0; KeyRangeRef keys; @@ -73,7 +73,7 @@ struct CDCStreamInfoRef { }; struct CDCRegisterStreamReply { - constexpr static FileIdentifier file_identifier = 16776012; + constexpr static FileIdentifier file_identifier = 3217071; CDCStreamId streamId = 0; CDCRegisterStreamReply() = default; @@ -86,7 +86,7 @@ struct CDCRegisterStreamReply { }; struct CDCRegisterStreamRequest { - constexpr static FileIdentifier file_identifier = 16776004; + constexpr static FileIdentifier file_identifier = 1269096; Key name; KeyRange keys; ReplyPromise reply; @@ -103,7 +103,7 @@ struct CDCRegisterStreamRequest { }; struct CDCRemoveStreamRequest { - constexpr static FileIdentifier file_identifier = 16776005; + constexpr static FileIdentifier file_identifier = 3683857; Key name; ReplyPromise reply; @@ -119,7 +119,7 @@ struct CDCRemoveStreamRequest { }; struct CDCListStreamsReply { - constexpr static FileIdentifier file_identifier = 16776006; + constexpr static FileIdentifier file_identifier = 7600884; Arena arena; VectorRef streams; @@ -130,7 +130,7 @@ struct CDCListStreamsReply { }; struct CDCListStreamsRequest { - constexpr static FileIdentifier file_identifier = 16776007; + constexpr static FileIdentifier file_identifier = 8134529; ReplyPromise reply; bool verify() const { return true; } @@ -142,7 +142,7 @@ struct CDCListStreamsRequest { }; struct CDCConsumeReply { - constexpr static FileIdentifier file_identifier = 16776008; + constexpr static FileIdentifier file_identifier = 12940542; Arena arena; VectorRef mutations; Version lastConsumedVersion = invalidVersion; @@ -154,7 +154,7 @@ struct CDCConsumeReply { }; struct CDCConsumeRequest { - constexpr static FileIdentifier file_identifier = 16776009; + constexpr static FileIdentifier file_identifier = 8178243; CDCCursor cursor; ReplyPromise reply; @@ -170,7 +170,7 @@ struct CDCConsumeRequest { }; struct CDCAckRequest { - constexpr static 
FileIdentifier file_identifier = 16776010; + constexpr static FileIdentifier file_identifier = 15923892; CDCStreamId streamId = 0; Version version = invalidVersion; ReplyPromise reply; @@ -187,7 +187,7 @@ struct CDCAckRequest { }; struct HaltCDCProxyRequest { - constexpr static FileIdentifier file_identifier = 16776014; + constexpr static FileIdentifier file_identifier = 6992638; ReplyPromise reply; bool verify() const { return true; } @@ -199,7 +199,7 @@ struct HaltCDCProxyRequest { }; struct CDCProxyInterface { - constexpr static FileIdentifier file_identifier = 16776011; + constexpr static FileIdentifier file_identifier = 6689609; enum { LocationAwareLoadBalance = 1 }; enum { AlwaysFresh = 1 }; diff --git a/fdbserver/core/include/fdbserver/core/WorkerInterface.actor.h b/fdbserver/core/include/fdbserver/core/WorkerInterface.actor.h index cd4ca503c6e..5062f63f6a6 100644 --- a/fdbserver/core/include/fdbserver/core/WorkerInterface.actor.h +++ b/fdbserver/core/include/fdbserver/core/WorkerInterface.actor.h @@ -738,7 +738,7 @@ extern template class RequestStream; extern template struct NetNotifiedQueue; struct InitializeCDCProxyRequest { - constexpr static FileIdentifier file_identifier = 16776013; + constexpr static FileIdentifier file_identifier = 416762; uint64_t recoveryCount; ReplyPromise reply; From 1fce475ae723b8f4e590be3c8cab8d616128fc17 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Wed, 27 May 2026 11:35:28 -0700 Subject: [PATCH 32/56] Enable unit tests in simulation --- fdbclient/NativeCdc.cpp | 2 +- fdbclient/SystemData.cpp | 2 +- fdbserver/cdcproxy/CDCProxy.cpp | 2 +- fdbserver/logsystem/ApplyMetadataMutation.cpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fdbclient/NativeCdc.cpp b/fdbclient/NativeCdc.cpp index 12e8bddc78b..b503bc4ad6f 100644 --- a/fdbclient/NativeCdc.cpp +++ b/fdbclient/NativeCdc.cpp @@ -560,7 +560,7 @@ Future acknowledgeNativeCdcStreamClient(Database cx, CDCStreamId streamId, } } 
-TEST_CASE("noSim/NativeCDC/LifecycleAllocation") { +TEST_CASE("/NativeCDC/LifecycleAllocation") { NativeCdcIdentifierAllocator allocator; auto [initialId, initialTag] = allocator.allocate(); ASSERT(initialId == 1); diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index 4cd73708430..0a3bd410e19 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -1813,7 +1813,7 @@ TEST_CASE("noSim/SystemData/DataMoveId") { return Void(); } -TEST_CASE("noSim/SystemData/NativeCDC") { +TEST_CASE("/SystemData/NativeCDC") { const Key name = "orders"_sr; const CDCStreamId streamId = 42; const KeyRange keys(KeyRangeRef("a"_sr, "z"_sr)); diff --git a/fdbserver/cdcproxy/CDCProxy.cpp b/fdbserver/cdcproxy/CDCProxy.cpp index fb0189ee554..1b36aea221b 100644 --- a/fdbserver/cdcproxy/CDCProxy.cpp +++ b/fdbserver/cdcproxy/CDCProxy.cpp @@ -625,7 +625,7 @@ Future cdcProxyServer(CDCProxyInterface proxy, } } -TEST_CASE("noSim/NativeCDC/ProxyMutationFiltering") { +TEST_CASE("/NativeCDC/ProxyMutationFiltering") { const KeyRangeRef keys("c"_sr, "m"_sr); Optional inRange = clipCDCMutation(MutationRef(MutationRef::SetValue, "d"_sr, "value"_sr), keys); diff --git a/fdbserver/logsystem/ApplyMetadataMutation.cpp b/fdbserver/logsystem/ApplyMetadataMutation.cpp index 4a3191425da..67524214c9c 100644 --- a/fdbserver/logsystem/ApplyMetadataMutation.cpp +++ b/fdbserver/logsystem/ApplyMetadataMutation.cpp @@ -1359,7 +1359,7 @@ bool containsMetadataMutation(const VectorRef& mutations) { return false; } -TEST_CASE("noSim/NativeCDC/RoutingTable") { +TEST_CASE("/NativeCDC/RoutingTable") { CDCRoutingTable table; const Tag ordersTag(tagLocalityCDC, 1); const Tag overlappingTag(tagLocalityCDC, 2); From 30ac0afe567429352a86efcf8892d62c314190ea Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Wed, 27 May 2026 12:49:30 -0700 Subject: [PATCH 33/56] Use std::unordered_map for NativeCdcIdentifierAllocator::tagStreamCounts --- fdbclient/NativeCdc.cpp | 3 ++- 1 file changed, 2 
insertions(+), 1 deletion(-) diff --git a/fdbclient/NativeCdc.cpp b/fdbclient/NativeCdc.cpp index b503bc4ad6f..e3df5d2571d 100644 --- a/fdbclient/NativeCdc.cpp +++ b/fdbclient/NativeCdc.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -37,7 +38,7 @@ namespace { struct NativeCdcIdentifierAllocator { bool sawStream = false; CDCStreamId maxStreamId = 0; - std::map tagStreamCounts; + std::unordered_map tagStreamCounts; void observeStreamId(CDCStreamId streamId) { sawStream = true; From 1810d8aa6bc4976abe1cc8c7d3f9d1e169d4070d Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Wed, 27 May 2026 12:54:59 -0700 Subject: [PATCH 34/56] Add TODO comments for future CDC load balancing work --- fdbclient/NativeCdc.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fdbclient/NativeCdc.cpp b/fdbclient/NativeCdc.cpp index e3df5d2571d..b8e6126a96c 100644 --- a/fdbclient/NativeCdc.cpp +++ b/fdbclient/NativeCdc.cpp @@ -60,6 +60,8 @@ struct NativeCdcIdentifierAllocator { ASSERT_WE_THINK(CLIENT_KNOBS->NATIVE_CDC_TAG_COUNT <= std::numeric_limits::max() + 1u); uint32_t leastStreams = std::numeric_limits::max(); uint16_t selectedTagId = 0; + // TODO: Use data-distributor-observed per-tag write throughput to rebalance CDC tags, including + // migrating active streams with versioned tag-history assignments. for (uint32_t tagId = 0; tagId < static_cast(CLIENT_KNOBS->NATIVE_CDC_TAG_COUNT); ++tagId) { auto count = tagStreamCounts.find(static_cast(tagId)); const uint32_t streamCount = count == tagStreamCounts.end() ? 0 : count->second; @@ -143,6 +145,8 @@ bool retryNativeCdcProxyRequest(Error const& error) { error.code() == error_code_connection_failed || error.code() == error_code_request_maybe_delivered; } +// TODO: Have the cluster controller rebalance stream ownership using aggregate CDC proxy throughput and +// update cdcProxyKeys and ClientDBInfo assignments; registration currently chooses any available proxy. 
Future getAvailableNativeCdcProxy(Database cx, Optional previousProxy = Optional()) { while (true) { for (const auto& proxy : cx->clientInfo->get().cdcProxies) { From 9509829902ef53e2f307fb8c5219f963414d8f2c Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Wed, 27 May 2026 12:59:54 -0700 Subject: [PATCH 35/56] Add ENABLE_NATIVE_CDC knob --- fdbclient/ClientKnobs.cpp | 1 + fdbclient/NativeCdc.cpp | 16 +++++++++++++++ fdbclient/include/fdbclient/Knobs.h | 1 + .../ClusterController.actor.cpp | 6 ++++-- .../clustercontroller/ClusterRecovery.cpp | 3 +++ fdbserver/commitproxy/CommitProxyServer.cpp | 20 +++++++++++++------ .../logsystem/ApplyMetadataMutation.h | 1 + tests/fast/NativeCdc.toml | 3 +++ tests/fast/NativeCdcSharedTag.toml | 3 +++ 9 files changed, 46 insertions(+), 8 deletions(-) diff --git a/fdbclient/ClientKnobs.cpp b/fdbclient/ClientKnobs.cpp index e2e4c2ef4fd..69a4db2e49e 100644 --- a/fdbclient/ClientKnobs.cpp +++ b/fdbclient/ClientKnobs.cpp @@ -193,6 +193,7 @@ void ClientKnobs::initialize(Randomize randomize, IsSimulated isSimulated) { init( CHANGE_FEED_CACHE_FLUSH_BYTES, 10e6 ); if( randomize && BUGGIFY ) CHANGE_FEED_CACHE_FLUSH_BYTES = deterministicRandom()->randomInt64(1, 1e6); init( CHANGE_FEED_CACHE_EXPIRE_TIME, 60.0 ); if( randomize && BUGGIFY ) CHANGE_FEED_CACHE_EXPIRE_TIME = 1.0; init( CHANGE_FEED_CACHE_LIMIT_BYTES, 500000 ); if( randomize && BUGGIFY ) CHANGE_FEED_CACHE_LIMIT_BYTES = 50000; + init( ENABLE_NATIVE_CDC, false ); if( randomize && BUGGIFY ) ENABLE_NATIVE_CDC = true; init( NATIVE_CDC_TAG_COUNT, 256 ); if( randomize && BUGGIFY ) NATIVE_CDC_TAG_COUNT = 2; init( MAX_BATCH_SIZE, 1000 ); if( randomize && BUGGIFY ) MAX_BATCH_SIZE = 1; diff --git a/fdbclient/NativeCdc.cpp b/fdbclient/NativeCdc.cpp index b8e6126a96c..100b3f3a506 100644 --- a/fdbclient/NativeCdc.cpp +++ b/fdbclient/NativeCdc.cpp @@ -35,6 +35,12 @@ namespace { +void validateNativeCdcEnabled() { + if (!CLIENT_KNOBS->ENABLE_NATIVE_CDC) { + throw client_invalid_operation(); + 
} +} + struct NativeCdcIdentifierAllocator { bool sawStream = false; CDCStreamId maxStreamId = 0; @@ -235,6 +241,7 @@ Future> getNativeCdcStreamProxyForRemoval(Database c } // namespace Future registerNativeCdcStream(Database cx, Key name, KeyRange keys, Optional proxyId) { + validateNativeCdcEnabled(); validateNativeCdcStream(name, keys); Transaction tr(cx); @@ -285,6 +292,7 @@ Future registerNativeCdcStream(Database cx, Key name, KeyRange keys } Future> removeNativeCdcStream(Database cx, Key name, Optional proxyId) { + validateNativeCdcEnabled(); if (name.empty()) { throw client_invalid_operation(); } @@ -347,6 +355,7 @@ Future> removeNativeCdcStream(Database cx, } Future> listNativeCdcStreams(Database cx) { + validateNativeCdcEnabled(); std::vector result; Key begin = cdcStreamNameKeys.begin; Transaction tr(cx); @@ -384,6 +393,7 @@ Future> listNativeCdcStreams(Database cx) { } Future reassignNativeCdcStreams(Database cx, UID oldProxyId, UID newProxyId) { + validateNativeCdcEnabled(); if (oldProxyId == newProxyId) { co_return; } @@ -428,6 +438,7 @@ Future reassignNativeCdcStreams(Database cx, UID oldProxyId, UID newProxyI } Future acknowledgeNativeCdcStream(Database cx, CDCStreamId streamId, Version consumedThrough) { + validateNativeCdcEnabled(); if (streamId == 0 || consumedThrough < 0 || consumedThrough == std::numeric_limits::max()) { throw client_invalid_operation(); } @@ -461,6 +472,7 @@ Future acknowledgeNativeCdcStream(Database cx, CDCStreamId streamId, Ve } Future registerNativeCdcStreamClient(Database cx, Key name, KeyRange keys) { + validateNativeCdcEnabled(); validateNativeCdcStream(name, keys); Optional previousProxy; @@ -480,6 +492,7 @@ Future registerNativeCdcStreamClient(Database cx, Key name, KeyRang } Future> listNativeCdcStreamsClient(Database cx) { + validateNativeCdcEnabled(); Optional previousProxy; while (true) { @@ -504,6 +517,7 @@ Future> listNativeCdcStreamsClient(Database cx) } Future removeNativeCdcStreamClient(Database cx, Key name) { 
+ validateNativeCdcEnabled(); if (name.empty()) { throw client_invalid_operation(); } @@ -533,6 +547,7 @@ Future removeNativeCdcStreamClient(Database cx, Key name) { } Future consumeNativeCdcStream(Database cx, CDCCursor cursor) { + validateNativeCdcEnabled(); while (true) { CDCProxyInterface proxy = co_await getNativeCdcStreamProxy(cx, cursor.streamId); try { @@ -547,6 +562,7 @@ Future consumeNativeCdcStream(Database cx, CDCCursor cursor) { } Future acknowledgeNativeCdcStreamClient(Database cx, CDCStreamId streamId, Version consumedThrough) { + validateNativeCdcEnabled(); if (streamId == 0 || consumedThrough < 0 || consumedThrough == std::numeric_limits::max()) { throw client_invalid_operation(); } diff --git a/fdbclient/include/fdbclient/Knobs.h b/fdbclient/include/fdbclient/Knobs.h index db726f985b5..c7d75c8735d 100644 --- a/fdbclient/include/fdbclient/Knobs.h +++ b/fdbclient/include/fdbclient/Knobs.h @@ -90,6 +90,7 @@ class SWIFT_CXX_IMMORTAL_SINGLETON_TYPE ClientKnobs : public KnobsImpl clusterControllerCore(ClusterControllerFullInterface interf, self.addActor.send(monitorServerInfoConfig(&self.db)); self.addActor.send(monitorStorageMetadata(&self)); self.addActor.send(monitorGlobalConfig(&self.db)); - self.addActor.send(monitorCDCProxyAssignments(&self.db)); - self.addActor.send(monitorAndRecruitCDCProxies(&self)); + if (CLIENT_KNOBS->ENABLE_NATIVE_CDC) { + self.addActor.send(monitorCDCProxyAssignments(&self.db)); + self.addActor.send(monitorAndRecruitCDCProxies(&self)); + } self.addActor.send(updatedChangingDatacenters(&self)); self.addActor.send(updatedChangedDatacenters(&self)); self.addActor.send(updateDatacenterVersionDifference(&self)); diff --git a/fdbserver/clustercontroller/ClusterRecovery.cpp b/fdbserver/clustercontroller/ClusterRecovery.cpp index b0766127975..9ce04b24d59 100644 --- a/fdbserver/clustercontroller/ClusterRecovery.cpp +++ b/fdbserver/clustercontroller/ClusterRecovery.cpp @@ -233,6 +233,9 @@ Future newGrvProxies(Reference self, 
RecruitFromConfi } Future ensureCDCProxies(Reference self, RecruitFromConfigurationReply recr) { + if (!CLIENT_KNOBS->ENABLE_NATIVE_CDC) { + co_return; + } if (!self->controllerData->db.cdcProxies.empty()) { TraceEvent("CDCProxiesReused", self->dbgid).detail("Count", self->controllerData->db.cdcProxies.size()); co_return; diff --git a/fdbserver/commitproxy/CommitProxyServer.cpp b/fdbserver/commitproxy/CommitProxyServer.cpp index 8bc17833945..aa4a0cb95ee 100644 --- a/fdbserver/commitproxy/CommitProxyServer.cpp +++ b/fdbserver/commitproxy/CommitProxyServer.cpp @@ -695,8 +695,10 @@ std::set CommitBatchContext::getWrittenTagsPreResolution() { if (isSingleKeyMutation((MutationRef::Type)m.type)) { auto& tags = pProxyCommitData->tagsForKey(m.param1); transactionTags.insert(tags.begin(), tags.end()); - const auto& cdcTags = pProxyCommitData->cdcRouting.tagsForKey(m.param1); - transactionTags.insert(cdcTags.begin(), cdcTags.end()); + if (!pProxyCommitData->cdcRouting.empty()) { + const auto& cdcTags = pProxyCommitData->cdcRouting.tagsForKey(m.param1); + transactionTags.insert(cdcTags.begin(), cdcTags.end()); + } } else if (m.type == MutationRef::ClearRange) { auto range = pProxyCommitData->keyInfo.rangeContaining(m.param1); if (range.end() >= m.param2) { @@ -712,8 +714,10 @@ std::set CommitBatchContext::getWrittenTagsPreResolution() { } } KeyRangeRef clearRange(KeyRangeRef(m.param1, m.param2)); - const auto cdcTags = pProxyCommitData->cdcRouting.tagsForRange(clearRange); - transactionTags.insert(cdcTags.begin(), cdcTags.end()); + if (!pProxyCommitData->cdcRouting.empty()) { + const auto cdcTags = pProxyCommitData->cdcRouting.tagsForRange(clearRange); + transactionTags.insert(cdcTags.begin(), cdcTags.end()); + } } else { UNREACHABLE(); } @@ -1395,7 +1399,9 @@ Future assignMutationsToStorageServers(CommitBatchContext* self) { DEBUG_MUTATION("ProxyCommit", self->commitVersion, m, pProxyCommitData->dbgid).detail("To", tags); self->toCommit.addTags(tags); - 
self->toCommit.addTags(pProxyCommitData->cdcRouting.tagsForKey(m.param1)); + if (!pProxyCommitData->cdcRouting.empty()) { + self->toCommit.addTags(pProxyCommitData->cdcRouting.tagsForKey(m.param1)); + } if (pProxyCommitData->acsBuilder != nullptr) { updateMutationWithAcsAndAddMutationToAcsBuilder( @@ -1487,7 +1493,9 @@ Future assignMutationsToStorageServers(CommitBatchContext* self) { } KeyRangeRef clearRange(KeyRangeRef(m.param1, m.param2)); - self->toCommit.addTags(pProxyCommitData->cdcRouting.tagsForRange(clearRange)); + if (!pProxyCommitData->cdcRouting.empty()) { + self->toCommit.addTags(pProxyCommitData->cdcRouting.tagsForRange(clearRange)); + } WriteMutationRefVar var = writeMutation(self, &m); // FIXME: Remove assert once ClearRange RAW_ACCESS usecase handling is done ASSERT(std::holds_alternative(var)); diff --git a/fdbserver/logsystem/include/fdbserver/logsystem/ApplyMetadataMutation.h b/fdbserver/logsystem/include/fdbserver/logsystem/ApplyMetadataMutation.h index 17663d58a06..e703c4d60f1 100644 --- a/fdbserver/logsystem/include/fdbserver/logsystem/ApplyMetadataMutation.h +++ b/fdbserver/logsystem/include/fdbserver/logsystem/ApplyMetadataMutation.h @@ -73,6 +73,7 @@ class CDCRoutingTable : NonCopyable { void setRange(CDCStreamId streamId, KeyRangeRef const& keys); void setTag(CDCStreamId streamId, Version version, Tag tag); void reload(IKeyValueStore* txnStateStore); + bool empty() const { return streams.empty(); } const std::set& tagsForKey(KeyRef const& key) const; std::set tagsForRange(KeyRangeRef const& keys) const; diff --git a/tests/fast/NativeCdc.toml b/tests/fast/NativeCdc.toml index b6286c322fd..0caf4f92065 100644 --- a/tests/fast/NativeCdc.toml +++ b/tests/fast/NativeCdc.toml @@ -4,6 +4,9 @@ singleRegion = true buggify = false faultInjection = false +[[knobs]] +enable_native_cdc = true + [[test]] testTitle = 'NativeCdc' useDB = true diff --git a/tests/fast/NativeCdcSharedTag.toml b/tests/fast/NativeCdcSharedTag.toml index 
87e3057d10f..8f72da81377 100644 --- a/tests/fast/NativeCdcSharedTag.toml +++ b/tests/fast/NativeCdcSharedTag.toml @@ -4,6 +4,9 @@ singleRegion = true buggify = false faultInjection = false +[[knobs]] +enable_native_cdc = true + [[test]] testTitle = 'NativeCdcSharedTag' useDB = true From 270b3d700f68a86ad46ba81688af60eee1d70356 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Wed, 27 May 2026 13:27:33 -0700 Subject: [PATCH 36/56] Fix CDC lifecycle recovery, retired-tag cleanup, and expiry handling --- fdbclient/ClientKnobs.cpp | 2 +- fdbclient/NativeCdc.cpp | 10 ++-- fdbclient/SystemData.cpp | 36 ++++++++++++++ fdbclient/include/fdbclient/NativeCdc.h | 8 ++-- fdbclient/include/fdbclient/SystemData.h | 12 +++++ fdbserver/cdcproxy/CDCProxy.cpp | 48 +++++++++++++++---- .../ClusterController.actor.cpp | 43 +++++++++++++++-- .../clustercontroller/ClusterRecovery.cpp | 4 +- fdbserver/logsystem/ApplyMetadataMutation.cpp | 22 +++++---- fdbserver/workloads/NativeCdc.cpp | 26 +++++++++- 10 files changed, 174 insertions(+), 37 deletions(-) diff --git a/fdbclient/ClientKnobs.cpp b/fdbclient/ClientKnobs.cpp index 69a4db2e49e..feac2a88798 100644 --- a/fdbclient/ClientKnobs.cpp +++ b/fdbclient/ClientKnobs.cpp @@ -193,7 +193,7 @@ void ClientKnobs::initialize(Randomize randomize, IsSimulated isSimulated) { init( CHANGE_FEED_CACHE_FLUSH_BYTES, 10e6 ); if( randomize && BUGGIFY ) CHANGE_FEED_CACHE_FLUSH_BYTES = deterministicRandom()->randomInt64(1, 1e6); init( CHANGE_FEED_CACHE_EXPIRE_TIME, 60.0 ); if( randomize && BUGGIFY ) CHANGE_FEED_CACHE_EXPIRE_TIME = 1.0; init( CHANGE_FEED_CACHE_LIMIT_BYTES, 500000 ); if( randomize && BUGGIFY ) CHANGE_FEED_CACHE_LIMIT_BYTES = 50000; - init( ENABLE_NATIVE_CDC, false ); if( randomize && BUGGIFY ) ENABLE_NATIVE_CDC = true; + init( ENABLE_NATIVE_CDC, false ); if( randomize && isSimulated && BUGGIFY ) ENABLE_NATIVE_CDC = true; init( NATIVE_CDC_TAG_COUNT, 256 ); if( randomize && BUGGIFY ) NATIVE_CDC_TAG_COUNT = 2; init( MAX_BATCH_SIZE, 1000 ); 
if( randomize && BUGGIFY ) MAX_BATCH_SIZE = 1; diff --git a/fdbclient/NativeCdc.cpp b/fdbclient/NativeCdc.cpp index 100b3f3a506..4f914ef7617 100644 --- a/fdbclient/NativeCdc.cpp +++ b/fdbclient/NativeCdc.cpp @@ -292,7 +292,6 @@ Future registerNativeCdcStream(Database cx, Key name, KeyRange keys } Future> removeNativeCdcStream(Database cx, Key name, Optional proxyId) { - validateNativeCdcEnabled(); if (name.empty()) { throw client_invalid_operation(); } @@ -333,6 +332,12 @@ Future> removeNativeCdcStream(Database cx, tr.clear(nameKey); tr.clear(cdcStreamKeyFor(streamId)); + for (const Tag& tag : removedTags) { + tr.set(cdcRetiredTagPopKeyFor(tag), Value()); + tr.atomicOp(cdcRetiredTagPopVersionKeyFor(tag), + cdcVersionstampedMinVersionValue(), + MutationRef::SetVersionstampedValue); + } tr.clear(cdcTagHistoryRangeFor(streamId)); tr.clear(cdcMinVersionKeyFor(streamId)); tr.clear(cdcProxyRangeFor(streamId)); @@ -355,7 +360,6 @@ Future> removeNativeCdcStream(Database cx, } Future> listNativeCdcStreams(Database cx) { - validateNativeCdcEnabled(); std::vector result; Key begin = cdcStreamNameKeys.begin; Transaction tr(cx); @@ -393,7 +397,6 @@ Future> listNativeCdcStreams(Database cx) { } Future reassignNativeCdcStreams(Database cx, UID oldProxyId, UID newProxyId) { - validateNativeCdcEnabled(); if (oldProxyId == newProxyId) { co_return; } @@ -438,7 +441,6 @@ Future reassignNativeCdcStreams(Database cx, UID oldProxyId, UID newProxyI } Future acknowledgeNativeCdcStream(Database cx, CDCStreamId streamId, Version consumedThrough) { - validateNativeCdcEnabled(); if (streamId == 0 || consumedThrough < 0 || consumedThrough == std::numeric_limits::max()) { throw client_invalid_operation(); } diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index 0a3bd410e19..e1eddbcf9c0 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -776,6 +776,9 @@ const KeyRef cdcMaxStreamIdKey = "\xff/cdc/maxStreamId"_sr; const KeyRangeRef 
cdcStreamKeys("\xff/cdc/keys/"_sr, "\xff/cdc/keys0"_sr); const KeyRangeRef cdcTagHistoryKeys("\xff/cdc/tagHistory/"_sr, "\xff/cdc/tagHistory0"_sr); const KeyRangeRef cdcMinVersionKeys("\xff\x02/cdc/minVersion/"_sr, "\xff\x02/cdc/minVersion0"_sr); +const KeyRangeRef cdcRetiredTagPopKeys("\xff/cdc/retiredTagPop/"_sr, "\xff/cdc/retiredTagPop0"_sr); +const KeyRangeRef cdcRetiredTagPopVersionKeys("\xff\x02/cdc/retiredTagPopVersion/"_sr, + "\xff\x02/cdc/retiredTagPopVersion0"_sr); const KeyRangeRef cdcProxyKeys("\xff/cdc/proxies/"_sr, "\xff/cdc/proxies0"_sr); const KeyRef cdcProxyAssignmentChangeKey = "\xff/cdc/proxyAssignmentChange"_sr; @@ -909,6 +912,34 @@ Version decodeCDCMinVersionValue(ValueRef const& value) { return version; } +Key cdcRetiredTagPopKeyFor(Tag tag) { + BinaryWriter wr(Unversioned()); + wr.serializeBytes(cdcRetiredTagPopKeys.begin); + wr << tag; + return wr.toValue(); +} + +Tag decodeCDCRetiredTagPopKey(KeyRef const& key) { + Tag tag; + BinaryReader reader(key.removePrefix(cdcRetiredTagPopKeys.begin), Unversioned()); + reader >> tag; + return tag; +} + +Key cdcRetiredTagPopVersionKeyFor(Tag tag) { + BinaryWriter wr(Unversioned()); + wr.serializeBytes(cdcRetiredTagPopVersionKeys.begin); + wr << tag; + return wr.toValue(); +} + +Tag decodeCDCRetiredTagPopVersionKey(KeyRef const& key) { + Tag tag; + BinaryReader reader(key.removePrefix(cdcRetiredTagPopVersionKeys.begin), Unversioned()); + reader >> tag; + return tag; +} + static Key cdcProxyPrefixFor(CDCStreamId streamId) { BinaryWriter wr(Unversioned()); wr.serializeBytes(cdcProxyKeys.begin); @@ -1830,6 +1861,11 @@ TEST_CASE("/SystemData/NativeCDC") { ASSERT(decodeCDCMinVersionValue(cdcMinVersionValue(minVersion)) == minVersion); ASSERT(nonMetadataSystemKeys.contains(cdcMinVersionKeyFor(streamId))); ASSERT(cdcVersionstampedMinVersionValue().size() == sizeof(Version) + sizeof(uint16_t) + sizeof(int32_t)); + ASSERT(decodeCDCRetiredTagPopKey(cdcRetiredTagPopKeyFor(tag)) == tag); + 
ASSERT(cdcRetiredTagPopKeys.contains(cdcRetiredTagPopKeyFor(tag))); + ASSERT(decodeCDCRetiredTagPopVersionKey(cdcRetiredTagPopVersionKeyFor(tag)) == tag); + ASSERT(cdcRetiredTagPopVersionKeys.contains(cdcRetiredTagPopVersionKeyFor(tag))); + ASSERT(nonMetadataSystemKeys.contains(cdcRetiredTagPopVersionKeyFor(tag))); const Key tagHistoryKey = cdcTagHistoryKeyFor(streamId, minVersion, tag); const auto [decodedStreamId, decodedVersion, decodedTag] = decodeCDCTagHistoryKey(tagHistoryKey); diff --git a/fdbclient/include/fdbclient/NativeCdc.h b/fdbclient/include/fdbclient/NativeCdc.h index fcfb981889e..252cbced862 100644 --- a/fdbclient/include/fdbclient/NativeCdc.h +++ b/fdbclient/include/fdbclient/NativeCdc.h @@ -39,14 +39,14 @@ struct NativeCdcRemovedStreamInfo { std::vector tags; }; -// These durable metadata operations are intended to back CDCProxyInterface -// lifecycle requests once CDC proxies are recruited. +// These durable metadata operations back CDCProxyInterface lifecycle requests. +// Registration is knob-protected; draining and cleanup remain available for +// streams persisted while native CDC was enabled. Future registerNativeCdcStream(Database cx, Key name, KeyRange keys, Optional proxyId = Optional()); -// Returns the retired tags so the owning proxy can pop them after applying -// the acknowledgement minima of any remaining streams that share them. +// Persists per-tag final-pop watermarks before removing stream metadata. 
Future> removeNativeCdcStream(Database cx, Key name, Optional proxyId = Optional()); diff --git a/fdbclient/include/fdbclient/SystemData.h b/fdbclient/include/fdbclient/SystemData.h index 20eb711aa81..19c38e26ca3 100644 --- a/fdbclient/include/fdbclient/SystemData.h +++ b/fdbclient/include/fdbclient/SystemData.h @@ -302,6 +302,18 @@ Value cdcMinVersionValue(Version version); Value cdcVersionstampedMinVersionValue(); Version decodeCDCMinVersionValue(ValueRef const& value); +// "\xff/cdc/retiredTagPop/[[Tag]]" := "" +// Marks tags with durable final-pop work, so recovery keeps a CDC proxy available. +extern const KeyRangeRef cdcRetiredTagPopKeys; +Key cdcRetiredTagPopKeyFor(Tag tag); +Tag decodeCDCRetiredTagPopKey(KeyRef const& key); + +// "\xff\x02/cdc/retiredTagPopVersion/[[Tag]]" := "[[Version]]" +// Stores bounded storage-backed final-pop watermarks for removed streams. +extern const KeyRangeRef cdcRetiredTagPopVersionKeys; +Key cdcRetiredTagPopVersionKeyFor(Tag tag); +Tag decodeCDCRetiredTagPopVersionKey(KeyRef const& key); + // "\xff/cdc/proxies/[[CDCStreamId]][[proxyUID]]" := "" extern const KeyRangeRef cdcProxyKeys; // Changed whenever durable CDC stream-to-proxy assignments change. 
diff --git a/fdbserver/cdcproxy/CDCProxy.cpp b/fdbserver/cdcproxy/CDCProxy.cpp index 1b36aea221b..ab0df76dbd0 100644 --- a/fdbserver/cdcproxy/CDCProxy.cpp +++ b/fdbserver/cdcproxy/CDCProxy.cpp @@ -362,18 +362,46 @@ Future> readSafePopVersions(Database cx) { } } +Future> readRetiredTagPopVersions(Database cx) { + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE); + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + + std::map retiredTagPopVersions; + Key begin = cdcRetiredTagPopVersionKeys.begin; + while (begin < cdcRetiredTagPopVersionKeys.end) { + RangeResult retired = + co_await tr.getRange(KeyRangeRef(begin, cdcRetiredTagPopVersionKeys.end), CLIENT_KNOBS->TOO_MANY); + for (const auto& kv : retired) { + retiredTagPopVersions[decodeCDCRetiredTagPopVersionKey(kv.key)] = + decodeCDCMinVersionValue(kv.value); + } + if (!retired.more) { + break; + } + begin = keyAfter(retired.back().key); + } + co_return retiredTagPopVersions; + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } +} + Future popAcknowledgedData(CDCProxyData* self) { const std::map safePopVersions = co_await readSafePopVersions(self->cx); for (const auto& [tag, version] : safePopVersions) { self->logSystem->get()->pop(version, tag); } -} - -Future popRemovedStreamData(CDCProxyData* self, NativeCdcRemovedStreamInfo removed) { - const std::map safePopVersions = co_await readSafePopVersions(self->cx); - for (const Tag& tag : removed.tags) { + const std::map retiredTagPopVersions = co_await readRetiredTagPopVersions(self->cx); + for (const auto& [tag, retiredVersion] : retiredTagPopVersions) { const auto safePop = safePopVersions.find(tag); - const Version version = safePop == safePopVersions.end() ? removed.removalVersion : safePop->second; + const Version version = + safePop == safePopVersions.end() ? 
retiredVersion : std::min(retiredVersion, safePop->second); self->logSystem->get()->pop(version, tag); } } @@ -419,12 +447,12 @@ Future consume(CDCProxyData* self, CDCConsumeRequest request) { while (stream->active && !stream->initialized) { co_await stream->changed.onTrigger(); } - if (!stream->active) { - throw wrong_shard_server(); - } if (stream->tooOld) { throw transaction_too_old(); } + if (!stream->active) { + throw wrong_shard_server(); + } Version begin = request.cursor.lastConsumedVersion == invalidVersion ? stream->minVersion : request.cursor.lastConsumedVersion + 1; @@ -510,7 +538,7 @@ Future removeStream(CDCProxyData* self, CDCRemoveStreamRequest request) { try { Optional removed = co_await removeNativeCdcStream(self->cx, request.name, self->id); if (removed.present()) { - co_await popRemovedStreamData(self, removed.get()); + co_await popAcknowledgedData(self); } request.reply.send(Void()); } catch (Error& e) { diff --git a/fdbserver/clustercontroller/ClusterController.actor.cpp b/fdbserver/clustercontroller/ClusterController.actor.cpp index fac9139d278..715da707493 100644 --- a/fdbserver/clustercontroller/ClusterController.actor.cpp +++ b/fdbserver/clustercontroller/ClusterController.actor.cpp @@ -2150,13 +2150,31 @@ Future monitorCDCProxyAssignments(ClusterControllerData::DBInfo* db) { tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); std::map streamToCDCProxyId; + const std::vector availableProxies = db->cdcProxies; + size_t replacementIndex = 0; + bool repairedAssignment = false; Key begin = cdcProxyKeys.begin; while (begin < cdcProxyKeys.end) { RangeResult assignments = co_await tr.getRange(KeyRangeRef(begin, cdcProxyKeys.end), CLIENT_KNOBS->TOO_MANY); for (const auto& assignment : assignments) { const auto [streamId, proxyId] = decodeCDCProxyKey(assignment.key); - ASSERT_WE_THINK(streamToCDCProxyId.emplace(streamId, proxyId).second); + UID resolvedProxyId = proxyId; + const bool hasOwner = + std::any_of(availableProxies.begin(), 
availableProxies.end(), [proxyId](const auto& proxy) { + return proxy.id() == proxyId; + }); + if (!availableProxies.empty() && !hasOwner) { + resolvedProxyId = availableProxies[replacementIndex++ % availableProxies.size()].id(); + tr.clear(assignment.key); + tr.set(cdcProxyKeyFor(streamId, resolvedProxyId), Value()); + repairedAssignment = true; + TraceEvent("CDCProxyAssignmentRepaired") + .detail("StreamId", streamId) + .detail("OldCDCProxyID", proxyId) + .detail("NewCDCProxyID", resolvedProxyId); + } + ASSERT_WE_THINK(streamToCDCProxyId.emplace(streamId, resolvedProxyId).second); } if (!assignments.more) { break; @@ -2164,6 +2182,22 @@ Future monitorCDCProxyAssignments(ClusterControllerData::DBInfo* db) { begin = keyAfter(assignments.back().key); } + if (!streamToCDCProxyId.empty() && availableProxies.empty()) { + Future assignmentChangeFuture = tr.watch(cdcProxyAssignmentChangeKey); + Future endpointChangeFuture = db->clientInfo->onChange(); + co_await tr.commit(); + co_await (assignmentChangeFuture || endpointChangeFuture); + break; + } + + if (repairedAssignment) { + tr.set(cdcProxyAssignmentChangeKey, + BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), + IncludeVersion(ProtocolVersion::withNativeCdc()))); + co_await tr.commit(); + break; + } + ClientDBInfo clientInfo = db->clientInfo->get(); if (clientInfo.streamToCDCProxyId != streamToCDCProxyId) { clientInfo.id = deterministicRandom()->randomUniqueID(); @@ -3131,10 +3165,9 @@ ACTOR Future clusterControllerCore(ClusterControllerFullInterface interf, self.addActor.send(monitorServerInfoConfig(&self.db)); self.addActor.send(monitorStorageMetadata(&self)); self.addActor.send(monitorGlobalConfig(&self.db)); - if (CLIENT_KNOBS->ENABLE_NATIVE_CDC) { - self.addActor.send(monitorCDCProxyAssignments(&self.db)); - self.addActor.send(monitorAndRecruitCDCProxies(&self)); - } + // These actors also drain durable CDC state when new stream registration is disabled. 
+ self.addActor.send(monitorCDCProxyAssignments(&self.db)); + self.addActor.send(monitorAndRecruitCDCProxies(&self)); self.addActor.send(updatedChangingDatacenters(&self)); self.addActor.send(updatedChangedDatacenters(&self)); self.addActor.send(updateDatacenterVersionDifference(&self)); diff --git a/fdbserver/clustercontroller/ClusterRecovery.cpp b/fdbserver/clustercontroller/ClusterRecovery.cpp index 9ce04b24d59..319b31fec35 100644 --- a/fdbserver/clustercontroller/ClusterRecovery.cpp +++ b/fdbserver/clustercontroller/ClusterRecovery.cpp @@ -233,7 +233,9 @@ Future newGrvProxies(Reference self, RecruitFromConfi } Future ensureCDCProxies(Reference self, RecruitFromConfigurationReply recr) { - if (!CLIENT_KNOBS->ENABLE_NATIVE_CDC) { + const bool hasDurableCdcState = !(co_await self->txnStateStore->readRange(cdcStreamKeys)).empty() || + !(co_await self->txnStateStore->readRange(cdcRetiredTagPopKeys)).empty(); + if (!CLIENT_KNOBS->ENABLE_NATIVE_CDC && !hasDurableCdcState) { co_return; } if (!self->controllerData->db.cdcProxies.empty()) { diff --git a/fdbserver/logsystem/ApplyMetadataMutation.cpp b/fdbserver/logsystem/ApplyMetadataMutation.cpp index 67524214c9c..1dccb9d04d7 100644 --- a/fdbserver/logsystem/ApplyMetadataMutation.cpp +++ b/fdbserver/logsystem/ApplyMetadataMutation.cpp @@ -614,8 +614,9 @@ class ApplyMetadataMutationsImpl { void checkSetCDCMetadata(MutationRef m) { if (!cdcStreamNameKeys.contains(m.param1) && !cdcStreamKeys.contains(m.param1) && - !cdcTagHistoryKeys.contains(m.param1) && !cdcProxyKeys.contains(m.param1) && - m.param1 != cdcMaxStreamIdKey && m.param1 != cdcProxyAssignmentChangeKey) { + !cdcTagHistoryKeys.contains(m.param1) && !cdcRetiredTagPopKeys.contains(m.param1) && + !cdcProxyKeys.contains(m.param1) && m.param1 != cdcMaxStreamIdKey && + m.param1 != cdcProxyAssignmentChangeKey) { return; } if (!initialCommit) { @@ -1080,14 +1081,15 @@ class ApplyMetadataMutationsImpl { void checkClearCDCMetadata(KeyRangeRef range) { if 
(!cdcStreamNameKeys.intersects(range) && !cdcStreamKeys.intersects(range) && - !cdcTagHistoryKeys.intersects(range) && !cdcProxyKeys.intersects(range) && - !range.contains(cdcMaxStreamIdKey)) { + !cdcTagHistoryKeys.intersects(range) && !cdcRetiredTagPopKeys.intersects(range) && + !cdcProxyKeys.intersects(range) && !range.contains(cdcMaxStreamIdKey)) { return; } // CDC tags may be shared and acknowledgement minima are stored outside transaction state. - // The owning CDC proxy safely pops retired tags after stream removal commits. + // A durable retired-tag watermark lets any CDC proxy finish pops after stream removal. if (!initialCommit) { - for (const KeyRangeRef cdcRange : { cdcStreamNameKeys, cdcStreamKeys, cdcTagHistoryKeys, cdcProxyKeys }) { + for (const KeyRangeRef cdcRange : + { cdcStreamNameKeys, cdcStreamKeys, cdcTagHistoryKeys, cdcRetiredTagPopKeys, cdcProxyKeys }) { if (cdcRange.intersects(range)) { txnStateStore->clear(cdcRange & range); } @@ -1334,8 +1336,8 @@ bool containsMetadataMutation(const VectorRef& mutations) { (m.param1.startsWith(logRangesRange.begin)) || (m.param1.startsWith(serverKeysPrefix)) || (m.param1.startsWith(keyServersPrefix)) || cdcStreamNameKeys.contains(m.param1) || cdcStreamKeys.contains(m.param1) || cdcTagHistoryKeys.contains(m.param1) || - cdcProxyKeys.contains(m.param1) || m.param1 == cdcMaxStreamIdKey || - m.param1 == cdcProxyAssignmentChangeKey) { + cdcRetiredTagPopKeys.contains(m.param1) || cdcProxyKeys.contains(m.param1) || + m.param1 == cdcMaxStreamIdKey || m.param1 == cdcProxyAssignmentChangeKey) { return true; } } else if (m.type == MutationRef::ClearRange && isSystemKey(m.param2)) { @@ -1350,8 +1352,8 @@ bool containsMetadataMutation(const VectorRef& mutations) { (range.contains(metadataVersionKey)) || (range.contains(mustContainSystemMutationsKey)) || (range.contains(writeRecoveryKey)) || (range.intersects(testOnlyTxnStateStorePrefixRange)) || cdcStreamNameKeys.intersects(range) || cdcStreamKeys.intersects(range) || 
- cdcTagHistoryKeys.intersects(range) || cdcProxyKeys.intersects(range) || - range.contains(cdcMaxStreamIdKey)) { + cdcTagHistoryKeys.intersects(range) || cdcRetiredTagPopKeys.intersects(range) || + cdcProxyKeys.intersects(range) || range.contains(cdcMaxStreamIdKey)) { return true; } } diff --git a/fdbserver/workloads/NativeCdc.cpp b/fdbserver/workloads/NativeCdc.cpp index 0039b8ca1b5..3b50b658a4e 100644 --- a/fdbserver/workloads/NativeCdc.cpp +++ b/fdbserver/workloads/NativeCdc.cpp @@ -108,6 +108,24 @@ struct NativeCdcWorkload : TestWorkload { } } + Future getRetiredTagPopVersion(Database cx, Tag tag) { + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + Optional marker = co_await tr.get(cdcRetiredTagPopKeyFor(tag)); + Optional version = co_await tr.get(cdcRetiredTagPopVersionKeyFor(tag)); + ASSERT(marker.present()); + ASSERT(version.present()); + co_return decodeCDCMinVersionValue(version.get()); + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } + } + Future appendPersistedTag(Database cx, CDCStreamId streamId, Tag tag) { Transaction tr(cx); while (true) { @@ -317,7 +335,9 @@ struct NativeCdcWorkload : TestWorkload { ASSERT(streams.size() == 1); ASSERT(streams[0].minVersion == firstAckMinVersion); - co_await removeNativeCdcStream(cx, firstName); + Optional removedFirst = co_await removeNativeCdcStream(cx, firstName); + ASSERT(removedFirst.present()); + ASSERT((co_await getRetiredTagPopVersion(cx, firstRoute.first)) == removedFirst.get().removalVersion); ASSERT((co_await listNativeCdcStreams(cx)).empty()); ASSERT(!(co_await hasPersistedRetention(cx, firstId))); @@ -334,7 +354,9 @@ struct NativeCdcWorkload : TestWorkload { ASSERT(secondId > firstId); ASSERT(secondRoute.first == firstRoute.first); - co_await removeNativeCdcStream(cx, secondName); + Optional removedSecond = co_await removeNativeCdcStream(cx, secondName); + ASSERT(removedSecond.present()); + ASSERT((co_await 
getRetiredTagPopVersion(cx, secondRoute.first)) == removedSecond.get().removalVersion); const Key liveName = "native-cdc-live"_sr; const KeyRange liveRange(KeyRangeRef("live/"_sr, "live0"_sr)); From c75f037f731570ee829c894b1ba2c0d15a2c6501 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Wed, 27 May 2026 13:37:16 -0700 Subject: [PATCH 37/56] Allow native CDC consumption by registered stream name --- fdbclient/NativeCdc.cpp | 34 +++++++++++++++++--- fdbclient/include/fdbclient/NativeCdc.h | 5 +-- fdbserver/workloads/NativeCdc.cpp | 42 +++++++++++++------------ 3 files changed, 55 insertions(+), 26 deletions(-) diff --git a/fdbclient/NativeCdc.cpp b/fdbclient/NativeCdc.cpp index 4f914ef7617..7facb34f27d 100644 --- a/fdbclient/NativeCdc.cpp +++ b/fdbclient/NativeCdc.cpp @@ -182,6 +182,29 @@ Future nativeCdcStreamStillExists(Database cx, CDCStreamId streamId) { } } +Future getNativeCdcStreamId(Database cx, Key name) { + if (name.empty()) { + throw client_invalid_operation(); + } + + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE); + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + Optional streamId = co_await tr.get(cdcStreamNameKeyFor(name)); + if (!streamId.present()) { + throw client_invalid_operation(); + } + co_return decodeCDCStreamNameValue(streamId.get()); + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } +} + Future getNativeCdcStreamProxy(Database cx, CDCStreamId streamId) { if (streamId == 0) { throw client_invalid_operation(); @@ -548,10 +571,12 @@ Future removeNativeCdcStreamClient(Database cx, Key name) { } } -Future consumeNativeCdcStream(Database cx, CDCCursor cursor) { +Future consumeNativeCdcStream(Database cx, Key name, Version lastConsumedVersion) { validateNativeCdcEnabled(); + const CDCStreamId streamId = co_await getNativeCdcStreamId(cx, name); + const CDCCursor cursor(streamId, lastConsumedVersion); while (true) { - CDCProxyInterface proxy = 
co_await getNativeCdcStreamProxy(cx, cursor.streamId); + CDCProxyInterface proxy = co_await getNativeCdcStreamProxy(cx, streamId); try { co_return co_await proxy.consume.getReply(CDCConsumeRequest(cursor)); } catch (Error& error) { @@ -563,11 +588,12 @@ Future consumeNativeCdcStream(Database cx, CDCCursor cursor) { } } -Future acknowledgeNativeCdcStreamClient(Database cx, CDCStreamId streamId, Version consumedThrough) { +Future acknowledgeNativeCdcStreamClient(Database cx, Key name, Version consumedThrough) { validateNativeCdcEnabled(); - if (streamId == 0 || consumedThrough < 0 || consumedThrough == std::numeric_limits::max()) { + if (consumedThrough < 0 || consumedThrough == std::numeric_limits::max()) { throw client_invalid_operation(); } + const CDCStreamId streamId = co_await getNativeCdcStreamId(cx, name); while (true) { CDCProxyInterface proxy = co_await getNativeCdcStreamProxy(cx, streamId); diff --git a/fdbclient/include/fdbclient/NativeCdc.h b/fdbclient/include/fdbclient/NativeCdc.h index 252cbced862..94177244e7a 100644 --- a/fdbclient/include/fdbclient/NativeCdc.h +++ b/fdbclient/include/fdbclient/NativeCdc.h @@ -61,7 +61,8 @@ Future acknowledgeNativeCdcStream(Database cx, CDCStreamId streamId, Ve Future registerNativeCdcStreamClient(Database cx, Key name, KeyRange keys); Future removeNativeCdcStreamClient(Database cx, Key name); Future> listNativeCdcStreamsClient(Database cx); -Future consumeNativeCdcStream(Database cx, CDCCursor cursor); -Future acknowledgeNativeCdcStreamClient(Database cx, CDCStreamId streamId, Version consumedThrough); +// Uses the range registered for this name; consumers do not respecify it. 
+Future consumeNativeCdcStream(Database cx, Key name, Version lastConsumedVersion); +Future acknowledgeNativeCdcStreamClient(Database cx, Key name, Version consumedThrough); #endif // FDBCLIENT_NATIVECDC_H diff --git a/fdbserver/workloads/NativeCdc.cpp b/fdbserver/workloads/NativeCdc.cpp index 3b50b658a4e..b99aaa0711f 100644 --- a/fdbserver/workloads/NativeCdc.cpp +++ b/fdbserver/workloads/NativeCdc.cpp @@ -256,28 +256,30 @@ struct NativeCdcWorkload : TestWorkload { ASSERT(co_await registerNativeCdcStreamClient(cx, firstName, keys) == firstId); const Version writeVersion = co_await writeValues(cx, { { "shared/unread"_sr, "protected-by-minimum"_sr } }); - CDCCursor firstCursor(firstId, invalidVersion); + Version firstConsumedThrough = invalidVersion; const double firstConsumeDeadline = now() + 30.0; - while (firstCursor.lastConsumedVersion < writeVersion) { - CDCConsumeReply consumed = co_await timeoutError(consumeNativeCdcStream(cx, firstCursor), 30.0); - if (consumed.lastConsumedVersion == firstCursor.lastConsumedVersion) { + while (firstConsumedThrough < writeVersion) { + CDCConsumeReply consumed = + co_await timeoutError(consumeNativeCdcStream(cx, firstName, firstConsumedThrough), 30.0); + if (consumed.lastConsumedVersion == firstConsumedThrough) { ASSERT(now() < firstConsumeDeadline); co_await delay(0.1); continue; } - ASSERT(consumed.lastConsumedVersion > firstCursor.lastConsumedVersion); - firstCursor.lastConsumedVersion = consumed.lastConsumedVersion; + ASSERT(consumed.lastConsumedVersion > firstConsumedThrough); + firstConsumedThrough = consumed.lastConsumedVersion; } - co_await acknowledgeNativeCdcStreamClient(cx, firstId, firstCursor.lastConsumedVersion); + co_await acknowledgeNativeCdcStreamClient(cx, firstName, firstConsumedThrough); co_await removeNativeCdcStreamClient(cx, firstName); co_await waitForCDCProxyAssignmentRemoval(firstId); ASSERT(co_await registerNativeCdcStreamClient(cx, secondName, keys) == secondId); - CDCCursor unreadCursor(secondId, 
invalidVersion); + Version unreadConsumedThrough = invalidVersion; bool foundUnread = false; - while (unreadCursor.lastConsumedVersion < writeVersion) { - CDCConsumeReply unread = co_await timeoutError(consumeNativeCdcStream(cx, unreadCursor), 30.0); - ASSERT(unread.lastConsumedVersion > unreadCursor.lastConsumedVersion); + while (unreadConsumedThrough < writeVersion) { + CDCConsumeReply unread = + co_await timeoutError(consumeNativeCdcStream(cx, secondName, unreadConsumedThrough), 30.0); + ASSERT(unread.lastConsumedVersion > unreadConsumedThrough); for (const auto& versioned : unread.mutations) { for (const auto& mutation : versioned.mutations) { if (mutation.param1 == "shared/unread"_sr) { @@ -285,10 +287,10 @@ struct NativeCdcWorkload : TestWorkload { } } } - unreadCursor.lastConsumedVersion = unread.lastConsumedVersion; + unreadConsumedThrough = unread.lastConsumedVersion; } ASSERT(foundUnread); - co_await acknowledgeNativeCdcStreamClient(cx, secondId, unreadCursor.lastConsumedVersion); + co_await acknowledgeNativeCdcStreamClient(cx, secondName, unreadConsumedThrough); co_await removeNativeCdcStreamClient(cx, secondName); co_await waitForCDCProxyAssignmentRemoval(secondId); @@ -399,7 +401,7 @@ struct NativeCdcWorkload : TestWorkload { const double initialConsumeDeadline = now() + 30.0; while (consumedThrough < writeVersion) { CDCConsumeReply consumed = - co_await timeoutError(consumeNativeCdcStream(cx, CDCCursor(liveStreamId, consumedThrough)), 30.0); + co_await timeoutError(consumeNativeCdcStream(cx, liveName, consumedThrough), 30.0); if (consumed.lastConsumedVersion == consumedThrough) { ASSERT(now() < initialConsumeDeadline); co_await delay(0.1); @@ -434,7 +436,7 @@ struct NativeCdcWorkload : TestWorkload { const double afterFailureConsumeDeadline = now() + 30.0; while (afterFailureCursor < afterFailureVersion) { CDCConsumeReply afterFailure = - co_await timeoutError(consumeNativeCdcStream(cx, CDCCursor(liveStreamId, afterFailureCursor)), 30.0); + co_await 
timeoutError(consumeNativeCdcStream(cx, liveName, afterFailureCursor), 30.0); if (afterFailure.lastConsumedVersion == afterFailureCursor) { ASSERT(now() < afterFailureConsumeDeadline); co_await delay(0.1); @@ -453,7 +455,7 @@ struct NativeCdcWorkload : TestWorkload { ASSERT(foundAfterFailureWrite); const Version cursorBeforeRecovery = afterFailureCursor; - co_await acknowledgeNativeCdcStreamClient(cx, liveStreamId, cursorBeforeRecovery); + co_await acknowledgeNativeCdcStreamClient(cx, liveName, cursorBeforeRecovery); ASSERT(co_await getPersistedMinVersion(cx, liveStreamId) == cursorBeforeRecovery + 1); const int32_t recoveredResolverCount = (co_await getDatabaseConfiguration(cx)).getDesiredResolvers() + 1; @@ -471,7 +473,7 @@ struct NativeCdcWorkload : TestWorkload { const double afterRecoveryConsumeDeadline = now() + 30.0; while (afterRecoveryCursor < afterRecoveryVersion) { CDCConsumeReply afterRecovery = - co_await timeoutError(consumeNativeCdcStream(cx, CDCCursor(liveStreamId, afterRecoveryCursor)), 30.0); + co_await timeoutError(consumeNativeCdcStream(cx, liveName, afterRecoveryCursor), 30.0); if (afterRecovery.lastConsumedVersion == afterRecoveryCursor) { ASSERT(now() < afterRecoveryConsumeDeadline); co_await delay(0.1); @@ -489,7 +491,7 @@ struct NativeCdcWorkload : TestWorkload { } ASSERT(foundAfterRecoveryWrite); - co_await acknowledgeNativeCdcStreamClient(cx, liveStreamId, afterRecoveryCursor); + co_await acknowledgeNativeCdcStreamClient(cx, liveName, afterRecoveryCursor); ASSERT(co_await getPersistedMinVersion(cx, liveStreamId) == afterRecoveryCursor + 1); Future pendingConsume = @@ -509,7 +511,7 @@ struct NativeCdcWorkload : TestWorkload { bool retiredConsumeRejected = false; try { - co_await timeoutError(consumeNativeCdcStream(cx, CDCCursor(liveStreamId, afterRecoveryCursor)), 30.0); + co_await timeoutError(consumeNativeCdcStream(cx, liveName, afterRecoveryCursor), 30.0); } catch (Error& e) { retiredConsumeRejected = e.code() == 
error_code_client_invalid_operation; } @@ -517,7 +519,7 @@ struct NativeCdcWorkload : TestWorkload { bool retiredClientAcknowledgeRejected = false; try { - co_await timeoutError(acknowledgeNativeCdcStreamClient(cx, liveStreamId, afterRecoveryCursor), 30.0); + co_await timeoutError(acknowledgeNativeCdcStreamClient(cx, liveName, afterRecoveryCursor), 30.0); } catch (Error& e) { retiredClientAcknowledgeRejected = e.code() == error_code_client_invalid_operation; } From 0323a68d517d2332278b39e81b66de6555d142cf Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Wed, 27 May 2026 13:42:27 -0700 Subject: [PATCH 38/56] Create native CDC cursors from registered stream names --- fdbclient/NativeCdc.cpp | 20 +++--- fdbclient/include/fdbclient/NativeCdc.h | 5 +- fdbserver/workloads/NativeCdc.cpp | 84 ++++++++++++------------- 3 files changed, 55 insertions(+), 54 deletions(-) diff --git a/fdbclient/NativeCdc.cpp b/fdbclient/NativeCdc.cpp index 7facb34f27d..707fa860a4f 100644 --- a/fdbclient/NativeCdc.cpp +++ b/fdbclient/NativeCdc.cpp @@ -571,12 +571,16 @@ Future removeNativeCdcStreamClient(Database cx, Key name) { } } -Future consumeNativeCdcStream(Database cx, Key name, Version lastConsumedVersion) { +Future createNativeCdcCursor(Database cx, Key name) { validateNativeCdcEnabled(); const CDCStreamId streamId = co_await getNativeCdcStreamId(cx, name); - const CDCCursor cursor(streamId, lastConsumedVersion); + co_return CDCCursor(streamId, invalidVersion); +} + +Future consumeNativeCdcStream(Database cx, CDCCursor cursor) { + validateNativeCdcEnabled(); while (true) { - CDCProxyInterface proxy = co_await getNativeCdcStreamProxy(cx, streamId); + CDCProxyInterface proxy = co_await getNativeCdcStreamProxy(cx, cursor.streamId); try { co_return co_await proxy.consume.getReply(CDCConsumeRequest(cursor)); } catch (Error& error) { @@ -588,17 +592,17 @@ Future consumeNativeCdcStream(Database cx, Key name, Version la } } -Future acknowledgeNativeCdcStreamClient(Database cx, Key name, 
Version consumedThrough) { +Future acknowledgeNativeCdcStreamClient(Database cx, CDCCursor cursor) { validateNativeCdcEnabled(); - if (consumedThrough < 0 || consumedThrough == std::numeric_limits::max()) { + if (cursor.streamId == 0 || cursor.lastConsumedVersion < 0 || + cursor.lastConsumedVersion == std::numeric_limits::max()) { throw client_invalid_operation(); } - const CDCStreamId streamId = co_await getNativeCdcStreamId(cx, name); while (true) { - CDCProxyInterface proxy = co_await getNativeCdcStreamProxy(cx, streamId); + CDCProxyInterface proxy = co_await getNativeCdcStreamProxy(cx, cursor.streamId); try { - co_await proxy.ack.getReply(CDCAckRequest(streamId, consumedThrough)); + co_await proxy.ack.getReply(CDCAckRequest(cursor.streamId, cursor.lastConsumedVersion)); co_return; } catch (Error& error) { if (!retryNativeCdcProxyRequest(error)) { diff --git a/fdbclient/include/fdbclient/NativeCdc.h b/fdbclient/include/fdbclient/NativeCdc.h index 94177244e7a..702b4d2e534 100644 --- a/fdbclient/include/fdbclient/NativeCdc.h +++ b/fdbclient/include/fdbclient/NativeCdc.h @@ -62,7 +62,8 @@ Future registerNativeCdcStreamClient(Database cx, Key name, KeyRang Future removeNativeCdcStreamClient(Database cx, Key name); Future> listNativeCdcStreamsClient(Database cx); // Uses the range registered for this name; consumers do not respecify it. 
-Future consumeNativeCdcStream(Database cx, Key name, Version lastConsumedVersion); -Future acknowledgeNativeCdcStreamClient(Database cx, Key name, Version consumedThrough); +Future createNativeCdcCursor(Database cx, Key name); +Future consumeNativeCdcStream(Database cx, CDCCursor cursor); +Future acknowledgeNativeCdcStreamClient(Database cx, CDCCursor cursor); #endif // FDBCLIENT_NATIVECDC_H diff --git a/fdbserver/workloads/NativeCdc.cpp b/fdbserver/workloads/NativeCdc.cpp index b99aaa0711f..18450e02b58 100644 --- a/fdbserver/workloads/NativeCdc.cpp +++ b/fdbserver/workloads/NativeCdc.cpp @@ -256,30 +256,30 @@ struct NativeCdcWorkload : TestWorkload { ASSERT(co_await registerNativeCdcStreamClient(cx, firstName, keys) == firstId); const Version writeVersion = co_await writeValues(cx, { { "shared/unread"_sr, "protected-by-minimum"_sr } }); - Version firstConsumedThrough = invalidVersion; + CDCCursor firstCursor = co_await createNativeCdcCursor(cx, firstName); + ASSERT(firstCursor.streamId == firstId); const double firstConsumeDeadline = now() + 30.0; - while (firstConsumedThrough < writeVersion) { - CDCConsumeReply consumed = - co_await timeoutError(consumeNativeCdcStream(cx, firstName, firstConsumedThrough), 30.0); - if (consumed.lastConsumedVersion == firstConsumedThrough) { + while (firstCursor.lastConsumedVersion < writeVersion) { + CDCConsumeReply consumed = co_await timeoutError(consumeNativeCdcStream(cx, firstCursor), 30.0); + if (consumed.lastConsumedVersion == firstCursor.lastConsumedVersion) { ASSERT(now() < firstConsumeDeadline); co_await delay(0.1); continue; } - ASSERT(consumed.lastConsumedVersion > firstConsumedThrough); - firstConsumedThrough = consumed.lastConsumedVersion; + ASSERT(consumed.lastConsumedVersion > firstCursor.lastConsumedVersion); + firstCursor.lastConsumedVersion = consumed.lastConsumedVersion; } - co_await acknowledgeNativeCdcStreamClient(cx, firstName, firstConsumedThrough); + co_await acknowledgeNativeCdcStreamClient(cx, 
firstCursor); co_await removeNativeCdcStreamClient(cx, firstName); co_await waitForCDCProxyAssignmentRemoval(firstId); ASSERT(co_await registerNativeCdcStreamClient(cx, secondName, keys) == secondId); - Version unreadConsumedThrough = invalidVersion; + CDCCursor unreadCursor = co_await createNativeCdcCursor(cx, secondName); + ASSERT(unreadCursor.streamId == secondId); bool foundUnread = false; - while (unreadConsumedThrough < writeVersion) { - CDCConsumeReply unread = - co_await timeoutError(consumeNativeCdcStream(cx, secondName, unreadConsumedThrough), 30.0); - ASSERT(unread.lastConsumedVersion > unreadConsumedThrough); + while (unreadCursor.lastConsumedVersion < writeVersion) { + CDCConsumeReply unread = co_await timeoutError(consumeNativeCdcStream(cx, unreadCursor), 30.0); + ASSERT(unread.lastConsumedVersion > unreadCursor.lastConsumedVersion); for (const auto& versioned : unread.mutations) { for (const auto& mutation : versioned.mutations) { if (mutation.param1 == "shared/unread"_sr) { @@ -287,10 +287,10 @@ struct NativeCdcWorkload : TestWorkload { } } } - unreadConsumedThrough = unread.lastConsumedVersion; + unreadCursor.lastConsumedVersion = unread.lastConsumedVersion; } ASSERT(foundUnread); - co_await acknowledgeNativeCdcStreamClient(cx, secondName, unreadConsumedThrough); + co_await acknowledgeNativeCdcStreamClient(cx, unreadCursor); co_await removeNativeCdcStreamClient(cx, secondName); co_await waitForCDCProxyAssignmentRemoval(secondId); @@ -363,6 +363,8 @@ struct NativeCdcWorkload : TestWorkload { const Key liveName = "native-cdc-live"_sr; const KeyRange liveRange(KeyRangeRef("live/"_sr, "live0"_sr)); const CDCStreamId liveStreamId = co_await registerNativeCdcStreamClient(cx, liveName, liveRange); + CDCCursor liveCursor = co_await createNativeCdcCursor(cx, liveName); + ASSERT(liveCursor.streamId == liveStreamId); CDCProxyInterface owner = co_await getCDCProxy(liveStreamId); std::vector listed = co_await listNativeCdcStreamsClient(cx); @@ -395,20 +397,18 
@@ struct NativeCdcWorkload : TestWorkload { break; } - Version consumedThrough = invalidVersion; bool foundInRangeWrite = false; bool foundOutOfRangeWrite = false; const double initialConsumeDeadline = now() + 30.0; - while (consumedThrough < writeVersion) { - CDCConsumeReply consumed = - co_await timeoutError(consumeNativeCdcStream(cx, liveName, consumedThrough), 30.0); - if (consumed.lastConsumedVersion == consumedThrough) { + while (liveCursor.lastConsumedVersion < writeVersion) { + CDCConsumeReply consumed = co_await timeoutError(consumeNativeCdcStream(cx, liveCursor), 30.0); + if (consumed.lastConsumedVersion == liveCursor.lastConsumedVersion) { ASSERT(now() < initialConsumeDeadline); co_await delay(0.1); continue; } - ASSERT(consumed.lastConsumedVersion > consumedThrough); - consumedThrough = consumed.lastConsumedVersion; + ASSERT(consumed.lastConsumedVersion > liveCursor.lastConsumedVersion); + liveCursor.lastConsumedVersion = consumed.lastConsumedVersion; for (const auto& versioned : consumed.mutations) { for (const auto& mutation : versioned.mutations) { if (mutation.param1 == "live/in"_sr) { @@ -431,19 +431,17 @@ struct NativeCdcWorkload : TestWorkload { const Version afterFailureVersion = co_await writeValues(cx, { { "live/after-failure"_sr, "captured-after-failure"_sr } }); - Version afterFailureCursor = consumedThrough; bool foundAfterFailureWrite = false; const double afterFailureConsumeDeadline = now() + 30.0; - while (afterFailureCursor < afterFailureVersion) { - CDCConsumeReply afterFailure = - co_await timeoutError(consumeNativeCdcStream(cx, liveName, afterFailureCursor), 30.0); - if (afterFailure.lastConsumedVersion == afterFailureCursor) { + while (liveCursor.lastConsumedVersion < afterFailureVersion) { + CDCConsumeReply afterFailure = co_await timeoutError(consumeNativeCdcStream(cx, liveCursor), 30.0); + if (afterFailure.lastConsumedVersion == liveCursor.lastConsumedVersion) { ASSERT(now() < afterFailureConsumeDeadline); co_await delay(0.1); 
continue; } - ASSERT(afterFailure.lastConsumedVersion > afterFailureCursor); - afterFailureCursor = afterFailure.lastConsumedVersion; + ASSERT(afterFailure.lastConsumedVersion > liveCursor.lastConsumedVersion); + liveCursor.lastConsumedVersion = afterFailure.lastConsumedVersion; for (const auto& versioned : afterFailure.mutations) { for (const auto& mutation : versioned.mutations) { if (mutation.param1 == "live/after-failure"_sr) { @@ -454,8 +452,8 @@ struct NativeCdcWorkload : TestWorkload { } ASSERT(foundAfterFailureWrite); - const Version cursorBeforeRecovery = afterFailureCursor; - co_await acknowledgeNativeCdcStreamClient(cx, liveName, cursorBeforeRecovery); + const Version cursorBeforeRecovery = liveCursor.lastConsumedVersion; + co_await acknowledgeNativeCdcStreamClient(cx, liveCursor); ASSERT(co_await getPersistedMinVersion(cx, liveStreamId) == cursorBeforeRecovery + 1); const int32_t recoveredResolverCount = (co_await getDatabaseConfiguration(cx)).getDesiredResolvers() + 1; @@ -468,19 +466,17 @@ struct NativeCdcWorkload : TestWorkload { const Version afterRecoveryVersion = co_await writeValues(cx, { { "live/after-recovery"_sr, "captured-after-recovery"_sr } }); - Version afterRecoveryCursor = cursorBeforeRecovery; bool foundAfterRecoveryWrite = false; const double afterRecoveryConsumeDeadline = now() + 30.0; - while (afterRecoveryCursor < afterRecoveryVersion) { - CDCConsumeReply afterRecovery = - co_await timeoutError(consumeNativeCdcStream(cx, liveName, afterRecoveryCursor), 30.0); - if (afterRecovery.lastConsumedVersion == afterRecoveryCursor) { + while (liveCursor.lastConsumedVersion < afterRecoveryVersion) { + CDCConsumeReply afterRecovery = co_await timeoutError(consumeNativeCdcStream(cx, liveCursor), 30.0); + if (afterRecovery.lastConsumedVersion == liveCursor.lastConsumedVersion) { ASSERT(now() < afterRecoveryConsumeDeadline); co_await delay(0.1); continue; } - ASSERT(afterRecovery.lastConsumedVersion > afterRecoveryCursor); - afterRecoveryCursor = 
afterRecovery.lastConsumedVersion; + ASSERT(afterRecovery.lastConsumedVersion > liveCursor.lastConsumedVersion); + liveCursor.lastConsumedVersion = afterRecovery.lastConsumedVersion; for (const auto& versioned : afterRecovery.mutations) { for (const auto& mutation : versioned.mutations) { if (mutation.param1 == "live/after-recovery"_sr) { @@ -491,11 +487,11 @@ struct NativeCdcWorkload : TestWorkload { } ASSERT(foundAfterRecoveryWrite); - co_await acknowledgeNativeCdcStreamClient(cx, liveName, afterRecoveryCursor); - ASSERT(co_await getPersistedMinVersion(cx, liveStreamId) == afterRecoveryCursor + 1); + co_await acknowledgeNativeCdcStreamClient(cx, liveCursor); + ASSERT(co_await getPersistedMinVersion(cx, liveStreamId) == liveCursor.lastConsumedVersion + 1); - Future pendingConsume = - recoveredOwner.consume.getReply(CDCConsumeRequest(CDCCursor(liveStreamId, afterRecoveryCursor + 1000000))); + Future pendingConsume = recoveredOwner.consume.getReply( + CDCConsumeRequest(CDCCursor(liveStreamId, liveCursor.lastConsumedVersion + 1000000))); co_await delay(0.1); co_await removeNativeCdcStreamClient(cx, liveName); co_await waitForCDCProxyAssignmentRemoval(liveStreamId); @@ -511,7 +507,7 @@ struct NativeCdcWorkload : TestWorkload { bool retiredConsumeRejected = false; try { - co_await timeoutError(consumeNativeCdcStream(cx, liveName, afterRecoveryCursor), 30.0); + co_await timeoutError(consumeNativeCdcStream(cx, liveCursor), 30.0); } catch (Error& e) { retiredConsumeRejected = e.code() == error_code_client_invalid_operation; } @@ -519,7 +515,7 @@ struct NativeCdcWorkload : TestWorkload { bool retiredClientAcknowledgeRejected = false; try { - co_await timeoutError(acknowledgeNativeCdcStreamClient(cx, liveName, afterRecoveryCursor), 30.0); + co_await timeoutError(acknowledgeNativeCdcStreamClient(cx, liveCursor), 30.0); } catch (Error& e) { retiredClientAcknowledgeRejected = e.code() == error_code_client_invalid_operation; } From 2e386e3a41cc9d58aef298e1988a275f18b164b6 Mon 
Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Wed, 27 May 2026 13:50:45 -0700 Subject: [PATCH 39/56] Clean up retired CDC tag pop state after confirmed drain --- fdbserver/cdcproxy/CDCProxy.cpp | 37 +++++++++++++++++++ .../clustercontroller/ClusterRecovery.cpp | 1 + fdbserver/logsystem/LogSystemConsumer.cpp | 28 ++++++++++++++ .../fdbserver/logsystem/LogSystemConsumer.h | 2 + fdbserver/workloads/NativeCdc.cpp | 36 ++++++++++++++++++ 5 files changed, 104 insertions(+) diff --git a/fdbserver/cdcproxy/CDCProxy.cpp b/fdbserver/cdcproxy/CDCProxy.cpp index ab0df76dbd0..6b7caec15a3 100644 --- a/fdbserver/cdcproxy/CDCProxy.cpp +++ b/fdbserver/cdcproxy/CDCProxy.cpp @@ -392,18 +392,55 @@ Future> readRetiredTagPopVersions(Database cx) { } } +Future clearCompletedRetiredTagPops(Database cx, std::map completedPopVersions) { + if (completedPopVersions.empty()) { + co_return; + } + + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + for (const auto& [tag, completedVersion] : completedPopVersions) { + Optional retiredVersionValue = co_await tr.get(cdcRetiredTagPopVersionKeyFor(tag)); + if (!retiredVersionValue.present() || + decodeCDCMinVersionValue(retiredVersionValue.get()) > completedVersion) { + continue; + } + tr.clear(cdcRetiredTagPopKeyFor(tag)); + tr.clear(cdcRetiredTagPopVersionKeyFor(tag)); + } + + co_await tr.commit(); + co_return; + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } +} + Future popAcknowledgedData(CDCProxyData* self) { const std::map safePopVersions = co_await readSafePopVersions(self->cx); for (const auto& [tag, version] : safePopVersions) { self->logSystem->get()->pop(version, tag); } const std::map retiredTagPopVersions = co_await readRetiredTagPopVersions(self->cx); + std::map completedPopVersions; for (const auto& [tag, retiredVersion] : retiredTagPopVersions) { const auto safePop = 
safePopVersions.find(tag); const Version version = safePop == safePopVersions.end() ? retiredVersion : std::min(retiredVersion, safePop->second); self->logSystem->get()->pop(version, tag); + if (version >= retiredVersion) { + co_await self->logSystem->get()->waitForPopped(retiredVersion, tag); + completedPopVersions[tag] = retiredVersion; + } } + co_await clearCompletedRetiredTagPops(self->cx, std::move(completedPopVersions)); } void reconcileStreams(CDCProxyData* self, ActorCollection* actors) { diff --git a/fdbserver/clustercontroller/ClusterRecovery.cpp b/fdbserver/clustercontroller/ClusterRecovery.cpp index 319b31fec35..2d2a555ae22 100644 --- a/fdbserver/clustercontroller/ClusterRecovery.cpp +++ b/fdbserver/clustercontroller/ClusterRecovery.cpp @@ -236,6 +236,7 @@ Future ensureCDCProxies(Reference self, RecruitFromCo const bool hasDurableCdcState = !(co_await self->txnStateStore->readRange(cdcStreamKeys)).empty() || !(co_await self->txnStateStore->readRange(cdcRetiredTagPopKeys)).empty(); if (!CLIENT_KNOBS->ENABLE_NATIVE_CDC && !hasDurableCdcState) { + self->controllerData->db.cdcProxies.clear(); co_return; } if (!self->controllerData->db.cdcProxies.empty()) { diff --git a/fdbserver/logsystem/LogSystemConsumer.cpp b/fdbserver/logsystem/LogSystemConsumer.cpp index f39c533d243..5bf5d8539a2 100644 --- a/fdbserver/logsystem/LogSystemConsumer.cpp +++ b/fdbserver/logsystem/LogSystemConsumer.cpp @@ -1,7 +1,10 @@ #include "fdbserver/logsystem/LogSystemConsumer.h" +#include #include +#include "flow/genericactors.actor.h" + Reference LogSystemConsumer::peekAll(UID dbgid, Version begin, Version end, @@ -901,6 +904,31 @@ void LogSystemConsumer::pop(Version upTo, Tag tag, Version durableKnownCommitted } } +Future LogSystemConsumer::waitForPopped(Version upTo, Tag tag, int8_t popLocality) { + while (true) { + std::vector> poppedFutures; + for (auto& t : logSystem->tLogs) { + if (t->locality == tagLocalitySpecial || t->locality == tag.locality || + (tag.locality < 0 && 
((popLocality == tagLocalityInvalid) == t->isLocal))) { + for (auto& log : t->logServers) { + poppedFutures.push_back(LogSystem::getPoppedFromTLog(log, tag)); + } + } + } + if (poppedFutures.empty()) { + co_return; + } + + std::vector poppedVersions = co_await getAll(poppedFutures); + if (std::all_of(poppedVersions.begin(), poppedVersions.end(), [upTo](Version poppedVersion) { + return poppedVersion >= upTo; + })) { + co_return; + } + co_await delay(0.01, TaskPriority::TLogPop); + } +} + Future LogSystemConsumer::getTxsPoppedVersion() { auto& ls = *logSystem; return LogSystem::getPoppedTxs(&ls); diff --git a/fdbserver/logsystem/include/fdbserver/logsystem/LogSystemConsumer.h b/fdbserver/logsystem/include/fdbserver/logsystem/LogSystemConsumer.h index f0ca7bb86f7..18b9de6f040 100644 --- a/fdbserver/logsystem/include/fdbserver/logsystem/LogSystemConsumer.h +++ b/fdbserver/logsystem/include/fdbserver/logsystem/LogSystemConsumer.h @@ -66,6 +66,8 @@ struct LogSystemConsumer : ReferenceCounted { void popLogRouter(Version upTo, Tag tag, Version durableKnownCommittedVersion, int8_t popLocality); void popTxs(Version upTo, int8_t popLocality = tagLocalityInvalid); void pop(Version upTo, Tag tag, Version durableKnownCommittedVersion = 0, int8_t popLocality = tagLocalityInvalid); + // Waits until every currently targeted TLog reports that `tag` has been popped through `upTo`. 
+ Future waitForPopped(Version upTo, Tag tag, int8_t popLocality = tagLocalityInvalid); Future getTxsPoppedVersion(); Version getEnd() const; Tag getPseudoPopTag(Tag tag, ProcessClass::ClassType type) const; diff --git a/fdbserver/workloads/NativeCdc.cpp b/fdbserver/workloads/NativeCdc.cpp index 18450e02b58..50e5af8addf 100644 --- a/fdbserver/workloads/NativeCdc.cpp +++ b/fdbserver/workloads/NativeCdc.cpp @@ -23,6 +23,7 @@ #include #include "fdbclient/CDCProxyInterface.h" +#include "fdbclient/Knobs.h" #include "fdbclient/ManagementAPI.h" #include "fdbclient/NativeCdc.h" #include "fdbclient/SystemData.h" @@ -126,6 +127,23 @@ struct NativeCdcWorkload : TestWorkload { } } + Future hasRetiredTagPopState(Database cx, Tag tag) { + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + Optional marker = co_await tr.get(cdcRetiredTagPopKeyFor(tag)); + Optional version = co_await tr.get(cdcRetiredTagPopVersionKeyFor(tag)); + ASSERT(marker.present() == version.present()); + co_return marker.present(); + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } + } + Future appendPersistedTag(Database cx, CDCStreamId streamId, Tag tag) { Transaction tr(cx); while (true) { @@ -224,6 +242,13 @@ struct NativeCdcWorkload : TestWorkload { co_return; } + Future waitForNoCDCProxies() { + while (!dbInfo->get().client.cdcProxies.empty()) { + co_await dbInfo->onChange(); + } + co_return; + } + Future changeResolverCount(Database cx, int32_t count) { Standalone config(format("resolvers=%d", count)); while (true) { @@ -363,6 +388,7 @@ struct NativeCdcWorkload : TestWorkload { const Key liveName = "native-cdc-live"_sr; const KeyRange liveRange(KeyRangeRef("live/"_sr, "live0"_sr)); const CDCStreamId liveStreamId = co_await registerNativeCdcStreamClient(cx, liveName, liveRange); + const Tag liveTag = co_await getLatestPersistedTag(cx, liveStreamId); CDCCursor liveCursor = co_await createNativeCdcCursor(cx, liveName); 
ASSERT(liveCursor.streamId == liveStreamId); CDCProxyInterface owner = co_await getCDCProxy(liveStreamId); @@ -520,6 +546,16 @@ struct NativeCdcWorkload : TestWorkload { retiredClientAcknowledgeRejected = e.code() == error_code_client_invalid_operation; } ASSERT(retiredClientAcknowledgeRejected); + ASSERT(!(co_await hasRetiredTagPopState(cx, liveTag))); + + if (g_network->isSimulated()) { + CLIENT_KNOBS->ENABLE_NATIVE_CDC = false; + const int32_t disabledResolverCount = (co_await getDatabaseConfiguration(cx)).getDesiredResolvers() + 1; + const uint64_t recoveryBeforeDisable = dbInfo->get().recoveryCount; + co_await changeResolverCount(cx, disabledResolverCount); + co_await timeoutError(waitForRecoveryAfter(recoveryBeforeDisable, RecoveryState::ACCEPTING_COMMITS), 60.0); + co_await timeoutError(waitForNoCDCProxies(), 30.0); + } } }; From 807c252bf3203fa108842b152388a2ae243fa338 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Wed, 27 May 2026 14:17:37 -0700 Subject: [PATCH 40/56] Reject future native CDC acknowledgements to prevent premature log popping --- fdbclient/NativeCdc.cpp | 5 +++++ fdbserver/workloads/NativeCdc.cpp | 13 ++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/fdbclient/NativeCdc.cpp b/fdbclient/NativeCdc.cpp index 707fa860a4f..a9492227095 100644 --- a/fdbclient/NativeCdc.cpp +++ b/fdbclient/NativeCdc.cpp @@ -481,6 +481,11 @@ Future acknowledgeNativeCdcStream(Database cx, CDCStreamId streamId, Ve throw client_invalid_operation(); } + const Version readVersion = co_await tr.getReadVersion(); + if (consumedThrough > readVersion) { + throw client_invalid_operation(); + } + const Version minVersion = decodeCDCMinVersionValue(minVersionValue.get()); if (minUnpoppedVersion <= minVersion) { co_return minVersion; diff --git a/fdbserver/workloads/NativeCdc.cpp b/fdbserver/workloads/NativeCdc.cpp index 50e5af8addf..333de404c61 100644 --- a/fdbserver/workloads/NativeCdc.cpp +++ b/fdbserver/workloads/NativeCdc.cpp @@ -18,6 
+18,7 @@ * limitations under the License. */ +#include #include #include #include @@ -354,7 +355,8 @@ struct NativeCdcWorkload : TestWorkload { ASSERT(streams[0].keys == firstRange); ASSERT(streams[0].minVersion == firstRoute.second); - const Version firstConsumedThrough = firstRoute.second + 5; + const Version firstConsumedThrough = + co_await writeValues(cx, { { "first/acknowledged"_sr, "acknowledged"_sr } }); const Version firstAckMinVersion = firstConsumedThrough + 1; ASSERT(co_await acknowledgeNativeCdcStream(cx, firstId, firstConsumedThrough) == firstAckMinVersion); ASSERT(co_await acknowledgeNativeCdcStream(cx, firstId, firstRoute.second) == firstAckMinVersion); @@ -393,6 +395,15 @@ struct NativeCdcWorkload : TestWorkload { ASSERT(liveCursor.streamId == liveStreamId); CDCProxyInterface owner = co_await getCDCProxy(liveStreamId); + bool futureAcknowledgeRejected = false; + try { + co_await acknowledgeNativeCdcStreamClient(cx, + CDCCursor(liveStreamId, std::numeric_limits::max() - 1)); + } catch (Error& e) { + futureAcknowledgeRejected = e.code() == error_code_client_invalid_operation; + } + ASSERT(futureAcknowledgeRejected); + std::vector listed = co_await listNativeCdcStreamsClient(cx); ASSERT(listed.size() == 1); ASSERT(listed[0].name == liveName); From 76ea7300164bff395acde30f272bf124ad848926 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Wed, 27 May 2026 14:18:05 -0700 Subject: [PATCH 41/56] Initial design/cdc.md draft --- design/cdc.md | 530 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 530 insertions(+) create mode 100644 design/cdc.md diff --git a/design/cdc.md b/design/cdc.md new file mode 100644 index 00000000000..7d40b017c3d --- /dev/null +++ b/design/cdc.md @@ -0,0 +1,530 @@ +# Native Change Data Capture (CDC) + +## Status and scope + +Native Change Data Capture (CDC) provides a FoundationDB-native mechanism for +reading committed mutations for a registered key range. 
A client registers a +named stream, creates a cursor for that name, consumes batches of mutations, +and acknowledges processed versions. The implementation persists enough state +to retain unread TLog data and to resume stream service after CDC proxy failure +or transaction-system recovery. + +This design describes the native C++ interface and its server implementation. +The feature is disabled by default behind `ENABLE_NATIVE_CDC`; the native CDC +workloads explicitly enable it, and simulation may randomly enable it. The +initial interface is native-only: it does not expose bindings or an external +protocol compatibility guarantee. + +The implementation uses the following terms: + +* A **stream** is a durable named registration for a fixed user key range. +* A **cursor** identifies one stream and the version through which a consumer + has read. +* A **CDC tag** is a TLog tag with locality `tagLocalityCDC`. Commit proxies + append these tags to mutations covered by registered streams. +* A **CDC proxy** reads tagged TLog mutation streams, filters mutations to a + registered range, serves consumers, and coordinates acknowledgement-driven + log popping. + +CDC is not implemented as a storage server change feed. It captures mutations +in the transaction logging path, which lets an acknowledged consumer retain +and release its own log history without changing user data storage. + +## Goals + +Native CDC is intended to provide: + +* Durable, named registrations for key ranges in normal user key space. +* A cursor-based API in which a consumer only needs a stream name after + registration, rather than repeating its registered range on every read. +* Ordered mutation batches identified by FoundationDB commit versions. +* Durable acknowledgements that determine how much CDC-tagged TLog history may + be popped. +* Correct retention when several streams share a CDC tag, including streams + whose ranges overlap or whose consumers advance at different rates. 
+* Replacement and recovery of CDC proxies without losing active stream + ownership or prematurely releasing log data. +* Finite cleanup when streams are removed, so an old stream does not require + CDC infrastructure forever. + +The first implementation does not attempt to provide: + +* Exactly-once side effects in the consumer. A consumer must make its output + and its acknowledgement consistent if it needs exactly-once processing. +* Dynamic stream range changes. A name is registered for one range; changing a + range requires removing and registering a stream. +* Throughput-aware assignment of streams across CDC proxies. +* Throughput-aware movement of streams between CDC tags. +* Client bindings beyond the native API. + +## Client interface + +The client-facing declarations are in `fdbclient/NativeCdc.h`; cursor and wire +request types are in `fdbclient/CDCProxyInterface.h`. + +```cpp +Future<CDCStreamId> registerNativeCdcStreamClient(Database cx, Key name, KeyRange keys); +Future<Void> removeNativeCdcStreamClient(Database cx, Key name); +Future<std::vector<NativeCdcStreamInfo>> listNativeCdcStreamsClient(Database cx); + +Future<CDCCursor> createNativeCdcCursor(Database cx, Key name); +Future<CDCConsumeReply> consumeNativeCdcStream(Database cx, CDCCursor cursor); +Future<Void> acknowledgeNativeCdcStreamClient(Database cx, CDCCursor cursor); +``` + +A stream registration contains: + +```cpp +struct NativeCdcStreamInfo { + Key name; + CDCStreamId streamId; + KeyRange keys; + Version minVersion; +}; +``` + +The durable identity of a stream is its `CDCStreamId`, not its name. Names are +used to create and manage streams. A cursor resolves the current stream ID +once, so removing a name and later registering the same name does not silently +redirect an existing consumer to a different stream. + +```cpp +struct CDCCursor { + CDCStreamId streamId; + Version lastConsumedVersion; +}; +``` + +`lastConsumedVersion` is initialized to `invalidVersion`. 
A consume response +returns both mutations and a new `lastConsumedVersion`: + +```cpp +struct VersionedMutationsRef { + Version version; + VectorRef<MutationRef> mutations; +}; + +struct CDCConsumeReply { + VectorRef<VersionedMutationsRef> mutations; + Version lastConsumedVersion; +}; +``` + +A typical consumer loop is: + +```cpp +co_await registerNativeCdcStreamClient(db, "orders"_sr, KeyRangeRef("order/"_sr, "order0"_sr)); +CDCCursor cursor = co_await createNativeCdcCursor(db, "orders"_sr); + +loop { + CDCConsumeReply reply = co_await consumeNativeCdcStream(db, cursor); + for (auto const& versionedMutations : reply.mutations) { + // Apply all mutations for versionedMutations.version. + } + + cursor.lastConsumedVersion = reply.lastConsumedVersion; + co_await acknowledgeNativeCdcStreamClient(db, cursor); +} +``` + +The acknowledgement means that the consumer no longer requires CDC mutations +through `cursor.lastConsumedVersion`. Internally, acknowledgement advances the +stream's persisted minimum required version to `lastConsumedVersion + 1`. +Therefore the consumer must not acknowledge a returned cursor position before +it has durably processed all mutations represented through that position. +The server rejects an acknowledgement beyond its current read version, so a +consumer cannot pre-pop future mutations on a tag that may later be assigned +to another stream. + +### Registration and removal semantics + +`registerNativeCdcStreamClient()` accepts a non-empty stream name and a +non-empty range entirely within normal user keys. Registration of an existing +name with the same range is idempotent. Registering an existing name with a +different range is rejected. + +Registration establishes an initial minimum version using the registration +transaction's commit version. Mutations committed after the registration has +become visible are routed to the stream's CDC tag. The initial minimum version +also supplies the first retention watermark for its TLog history. 
+ +`removeNativeCdcStreamClient()` removes the named stream and schedules final +release of tagged log history that was protected by the removed stream. +Removal explicitly relinquishes any unread history for that stream while still +respecting the retention needs of other streams sharing its tags. Stream +removal is terminal for existing cursors. Stale consume or acknowledgement +operations return an error instead of waiting indefinitely for an owner that +will never be assigned again. + +### Consumption and expiration + +Consumption is ordered by commit version. Mutations from a clear range are +intersected with the stream's registered range before being returned; a +single-key mutation is returned only if its key is within that range. + +For an active stream, unacknowledged CDC mutations are retained by its durable +minimum version: TLogs must not pop tagged data that the stream may still +consume. A slow consumer therefore retains its unread history rather than +expiring solely because of age. + +Consumption returns `transaction_too_old` when the caller supplies a cursor +older than the stream's already acknowledged durable watermark. The proxy also +treats discovery that an active stream's required tagged data has nevertheless +already been popped as `transaction_too_old`; that condition indicates a +retention invariant violation rather than a supported expiration policy. + +The native client methods retry transient endpoint and routing failures such as +a CDC proxy replacement. Invalid stream operations, already-acknowledged +cursor positions, and retention invariant violations are terminal errors for +that request. 
+ +## Architecture + +The data path is: + +```text + durable stream and acknowledgement state + +-------------------------+ + | transaction state store | + | and system key storage | + +------------+------------+ + | + v +Client <---- consume / acknowledge ----> CDC Proxy <---- peek / pop ---- TLogs + ^ ^ + | | + | assigned streams | ordinary and + | | CDC tags + Cluster Controller Commit Proxies + ^ + | + user commits +``` + +Commit proxies keep a routing table derived from durable CDC metadata. For +each committed user mutation, the commit proxy determines which registered CDC +ranges include that mutation and appends the corresponding CDC tags to the +mutation sent to TLogs. A transaction continues to follow its ordinary +replication tags as well; CDC tags are additional log destinations used by CDC +consumers. + +CDC proxies do not participate in committing user transactions. They consume +the extra tagged log streams, buffer readable results, filter shared tagged +data back to each stream's registered range, and pop data after durable +acknowledgement permits it. + +The cluster controller recruits CDC proxies, publishes their interfaces, and +keeps durable stream-to-proxy ownership consistent with current endpoints. + +## Durable state + +CDC uses two categories of system data. Routing and recovery-critical metadata +is stored in the transaction state store, where commit proxies and recovery +can reconstruct it in transaction order. Acknowledgement and final-pop +watermarks are stored as regular storage-server-backed system keys, because +they are durable progress values rather than commit routing configuration. + +### Transaction state metadata + +These keys are in the metadata portion of system key space and are represented +in transaction state: + +| Key | Value | Purpose | +| --- | --- | --- | +| `\xff/cdc/name/` | `CDCStreamId` | Resolves a user-visible name to its durable stream identity. 
| +| `\xff/cdc/maxStreamId` | `CDCStreamId` | Allocates monotonic stream identifiers. | +| `\xff/cdc/keys/` | `KeyRange` | Stores the immutable registered range for an active stream. | +| `\xff/cdc/tagHistory///` | empty | Records the CDC tag assignment history used for routing and historical reads. | +| `\xff/cdc/proxies//` | empty | Stores the CDC proxy assigned to an active stream. | +| `\xff/cdc/proxyAssignmentChange` | version/change signal | Wakes ownership monitoring when durable assignments change. | +| `\xff/cdc/retiredTagPop/` | empty | Retains recovery-visible pending final-pop work after removal. | + +Tag history is versioned so the data model can support a stream moving between +tags without forgetting which old log streams may still contain unread +mutations. The initial implementation writes the initial assignment and reads +the history; dynamic throughput-driven reassignment is future work. + +### Storage-backed system data + +These keys are in the storage-server-backed `\xff\x02` system key range rather +than transaction state: + +| Key | Value | Purpose | +| --- | --- | --- | +| `\xff\x02/cdc/minVersion/` | `Version` | Earliest version that an active stream may still require. | +| `\xff\x02/cdc/retiredTagPopVersion/` | `Version` | Final pop watermark required after a stream using a tag is removed. | + +The initial `minVersion` is written with a versionstamp at stream +registration. When a cursor acknowledges processing through version `V`, the +stored value advances monotonically to `V + 1`. A CDC proxy may pop tagged +mutations before this watermark only when doing so is safe for every live +stream sharing that tag. + +The retired-tag marker and watermark are deliberately split between transaction +state and regular system storage. The marker tells recovery that a CDC proxy +must still be recruited to finish durable cleanup; the watermark bounds the +actual final pop to perform. 
+ +## Stream creation and assignment + +Registration runs as a durable metadata transaction: + +1. It validates the feature knob, the stream name, and the registered normal + key range. +2. It checks whether the name is already registered and applies the idempotent + same-name/same-range rule. +3. It allocates a new monotonically increasing `CDCStreamId`. +4. It selects a CDC tag using current active stream counts. The allocator uses + the least populated tag among `NATIVE_CDC_TAG_COUNT` tags, choosing the + lowest tag ID on a tie. +5. It records the stream name, range, initial tag history entry, and + versionstamped initial minimum version. +6. It records an available CDC proxy owner and signals assignment monitoring. + +The tag allocator bounds the number of distinct CDC log streams while allowing +many user streams. Several streams may therefore share one tag intentionally. +This makes filtering and acknowledgement coordination required correctness +properties rather than exceptional cases. + +The current proxy assignment at registration uses an available CDC proxy; it +does not yet balance by stream traffic, memory use, or consumer lag. + +## Commit routing + +Each commit proxy has a `CDCRoutingTable`, reconstructed from active stream +ranges and tag history in transaction state. Changes to CDC stream metadata +are applied in commit order along with other transaction state mutations, so +the routing decision for later mutations observes committed registration and +removal changes. + +For a single-key mutation, the routing table returns CDC tags for all active +stream ranges containing that key. For a clear-range mutation, it returns tags +for all active stream ranges intersecting the cleared interval. These CDC tags +are appended in both the tag-determination and log-writing portions of commit +proxy processing. + +A shared CDC tag is a multiplexed log stream. 
A mutation routed because of +stream A may be read by the proxy serving stream B if both share the tag. +Consequently, the CDC proxy filters every read mutation against B's registered +range before returning it to B's consumer. Filtering also clips clear ranges +to the stream range. + +Routing at commit time has two important implications: + +* CDC observes mutations once their commits enter the transaction logging + path, without scanning storage server data. +* Registering CDC does not change normal durability for the user mutation; it + adds tagged log data whose retention is controlled separately by consumers. + +## CDC proxy read path + +A CDC proxy owns a set of active stream IDs. For each owned stream it loads: + +* The registered key range. +* The durable minimum required version. +* Its current CDC tag and versioned tag history. + +The proxy reads data from TLogs through `LogSystemConsumer::peekSingle()`. +When a stream has historical assignments, the proxy uses the history to select +the tag appropriate for the version interval it is reading. It filters +mutations to the registered range and stores versioned mutation batches in a +per-stream in-memory buffer. + +Buffers are bounded by `CDC_PROXY_BUFFER_BYTES`. A slow consumer does not +require the proxy to buffer its entire retained history in memory: durable +acknowledgement state and tagged TLog retention are the source of resumability, +while the proxy buffer is a delivery optimization. + +A consume operation supplies a cursor. The proxy returns buffered or newly +peeked data after the cursor position and a position through which the +consumer may acknowledge after processing. If a stream is removed while a +consume is blocked, reconciliation wakes the request and it fails rather than +waiting on a data-change trigger for an inactive stream. + +## Acknowledgement and tag popping + +Acknowledgement is per stream, while TLog popping is per CDC tag. This +distinction is the core retention rule. 
+ +For every active stream `S`, `minVersion(S)` is the first version its consumer +may still need. For a tag `T`, the safe pop watermark is: + +```text +safePop(T) = min(minVersion(S)) for every live stream S whose history uses T +``` + +Popping tag `T` through `safePop(T)` discards versions older than the minimum +required by any live stream using that tag. A fast consumer therefore cannot +pop mutations still needed by a slower stream sharing its tag. + +The proxy recomputes these minima from durable active stream metadata and +acknowledgement rows. It does not rely solely on its in-memory owned-stream +set, because shared tags and replacement proxies must preserve the same global +retention decision. + +### Removing a stream + +Removing a stream eliminates its active name, range, tag history, minimum +version, and ownership rows. Removal must not unconditionally pop each tag in +the removed history: a different live stream may share a tag and still need +older data. + +Before deleting active state, removal writes pending final-pop state for each +historical tag: + +* A transaction-state retired-tag marker, which survives recovery and makes + outstanding cleanup discoverable. +* A versionstamped storage-backed retired-tag watermark, which identifies the + upper bound of tagged history protected by the removed stream. + +The CDC proxy processes retired work together with active acknowledgement +watermarks. If a retired tag is also used by a live stream, its attempted pop +is capped at that live stream's `safePop` value. Only when it is possible to +pop through the complete retired watermark is the retired operation eligible +for completion. + +### Completing retired work + +Retired final-pop state is durable work, not permanent stream state. After +issuing a complete retired pop, a CDC proxy waits for every targeted current +TLog to report that the tag has been popped through the required watermark. 
+It then transactionally clears both the retired marker and its stored +watermark. + +The cleanup transaction rereads the watermark before clearing it. If a newer +stream removal has advanced the retired watermark for the same tag while the +earlier pop was in progress, the newer work is retained rather than erased by +the older completion. + +This establishes the lifecycle invariant: + +```text +no live streams and no pending retired final pops + implies no durable CDC proxy requirement +``` + +Without this completion protocol, removing any stream would leave a retired +marker forever, causing later recoveries to recruit CDC proxies indefinitely +and repeatedly replay already-completed final pops. + +## Failure handling and recovery + +### CDC proxy failure + +CDC proxies are consumers of durable state and tagged logs, not authorities +for committed application data. If a CDC proxy fails, the cluster controller +can recruit a replacement, publish the replacement interface, and durably +reassign affected streams. The replacement reloads stream state and resumes +reading at durable acknowledgement watermarks. + +The cluster controller monitors durable proxy assignment rows and repairs +stale owner identifiers when endpoints are replaced or the controller itself +is reconstructed. Clients obtain the currently published owner for their +stream ID and retry transient proxy/routing failures. + +### Transaction-system recovery + +During full recovery, active stream ranges, tag history, and pending retired +markers are present in transaction state. Active history tags are added to the +set of log tags that recovery must preserve; otherwise TLog generations could +discard CDC data that an active consumer has not yet acknowledged. 
+ +CDC proxy recruitment is required during recovery when either: + +* Native CDC is enabled and streams may be served, or +* Durable CDC state remains, including retired final-pop work that must be + completed even after admission of new CDC activity is disabled. + +When the knob is disabled and recovery finds neither active streams nor +retired work, no CDC proxies are published. This makes stream removal and +retired-pop completion observable as a finite drain rather than a permanent +cluster role. + +## Feature gating + +`ENABLE_NATIVE_CDC` defaults to false. In simulation it may be randomly enabled +under buggification; workloads that depend on CDC set it explicitly. + +The feature knob gates client admission to native CDC operations. Internal +cleanup and recovery paths remain capable of handling durable CDC state that +was created while the feature was enabled. This is necessary because disabling +new use of a feature cannot safely abandon log-retention obligations for +already registered or recently removed streams. + +`NATIVE_CDC_TAG_COUNT` controls the bounded tag pool used for new stream +allocation. Normal operation defaults to a larger tag pool; simulation may +reduce it so shared-tag behavior is exercised frequently. + +## Correctness properties + +The implementation is structured around the following properties: + +* **Registration identity:** a cursor binds to a stream ID, so reuse of a + removed stream name cannot cause an existing consumer to read a new stream. +* **Range correctness:** CDC proxies return only mutations within a stream's + registered range, even when its tag is shared with other streams. +* **Acknowledgement monotonicity:** durable minimum required versions advance + only forward. +* **Shared-tag retention:** tagged data is popped no farther than the minimum + durable watermark of all active streams that may require it. 
+* **Removal safety:** deleting one stream cannot pop unread data required by + another stream using the same tag. +* **Finite retired cleanup:** removal retains enough state to finish final + pops through failure and recovery, then removes that state after completion. +* **No age-based expiration:** an active stream's unread tagged mutations + remain protected until acknowledgement or explicit stream removal. + `transaction_too_old` for required active history indicates either a stale + already-acknowledged cursor or a violated retention invariant. +* **Failure visibility:** a removed stream fails consume and acknowledgement + requests rather than leaving them blocked indefinitely. +* **Recovery retention:** active CDC tag history is included in recovery's + required log data, and pending cleanup retains CDC proxy availability until + it has been completed. + +## Current limitations and future work + +The design records tag history and proxy ownership in forms that support more +complete load balancing, but the first implementation intentionally keeps +policy simple. + +* Tag selection is based on active stream counts, not observed byte or mutation + throughput. Data distribution could make equally counted tags very + different in cost. +* Registration selects an available CDC proxy without balancing aggregate + proxy throughput, buffer memory, lag, or number of active readers. +* There is no background process that changes a live stream's CDC tag in + response to load. A future implementation can use versioned tag history to + make such changes without losing the ability to read earlier tagged data. +* The native interface does not yet provide external binding support, + administrative tooling, or a higher-level consumer checkpoint abstraction. + +These improvements must preserve the acknowledgement and retired-pop +invariants above. 
In particular, moving a stream between tags cannot forget an +old tag until all data protected by that assignment has either been +acknowledged and popped or retained as finite final-pop work. + +## Validation + +The implementation includes codec and metadata tests for CDC system keys and +simulation workloads for the end-to-end behavior. + +The basic native CDC workload covers: + +* Registering, listing, consuming, acknowledging, and removing streams. +* Name-based cursor creation and correct filtering of returned mutations. +* Rejection of incompatible same-name registrations. +* CDC proxy replacement and recovery of stream service. +* Errors for stale consume and acknowledgement requests after removal. +* Creation and eventual collection of retired final-pop state. +* Recovery with native CDC disabled after the last stream and final-pop work + have drained, verifying that no CDC proxy remains required. + +The shared-tag workload forces streams to share routing tags and verifies both +range filtering and acknowledgement coordination. In particular, removing one +stream while another shared-tag stream is behind must not pop the unread +mutations needed by the remaining consumer. + +The simulation configurations enable CDC explicitly when testing these +behaviors, while the default-disabled knob and randomized simulation admission +exercise the requirement that clusters without active or pending CDC work do +not carry CDC service overhead. 
From e676f995e91fb3972ff907ca9dea538904fed31d Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Wed, 27 May 2026 14:26:45 -0700 Subject: [PATCH 42/56] Bound CDC proxy buffering with a shared proxy-wide memory budget --- design/cdc.md | 10 ++- fdbserver/cdcproxy/CDCProxy.cpp | 140 ++++++++++++++++++++++++++------ fdbserver/core/ServerKnobs.cpp | 2 +- 3 files changed, 123 insertions(+), 29 deletions(-) diff --git a/design/cdc.md b/design/cdc.md index 7d40b017c3d..f33ff6a895b 100644 --- a/design/cdc.md +++ b/design/cdc.md @@ -330,8 +330,14 @@ the tag appropriate for the version interval it is reading. It filters mutations to the registered range and stores versioned mutation batches in a per-stream in-memory buffer. -Buffers are bounded by `CDC_PROXY_BUFFER_BYTES`. A slow consumer does not -require the proxy to buffer its entire retained history in memory: durable +All stream buffers owned by one CDC proxy share a `CDC_PROXY_BUFFER_BYTES` +budget. Before requesting more TLog data, a stream reserves a bounded peek +window from that budget, then converts the reservation to the actual filtered +mutation bytes retained in its buffer. Acknowledgement or stream removal +releases the retained reservation. This applies backpressure before ordinary +peek batches arrive, rather than allowing each stream or each received batch +to independently overshoot the proxy limit. A slow consumer does not require +the proxy to buffer its entire retained history in memory: durable acknowledgement state and tagged TLog retention are the source of resumability, while the proxy buffer is a delivery optimization. 
diff --git a/fdbserver/cdcproxy/CDCProxy.cpp b/fdbserver/cdcproxy/CDCProxy.cpp index 6b7caec15a3..87664bfcc82 100644 --- a/fdbserver/cdcproxy/CDCProxy.cpp +++ b/fdbserver/cdcproxy/CDCProxy.cpp @@ -42,6 +42,7 @@ #include "flow/ActorCollection.h" #include "flow/Error.h" #include "flow/UnitTest.h" +#include "flow/genericactors.actor.h" namespace { @@ -63,22 +64,33 @@ struct CDCBufferedStream : ReferenceCounted { std::deque> mutations; AsyncTrigger changed; AsyncTrigger refresh; - AsyncTrigger spaceAvailable; AsyncTrigger stopped; explicit CDCBufferedStream(CDCStreamId streamId) : streamId(streamId) {} }; +struct CDCBufferedBatch { + Version bufferedThrough; + int64_t bufferedBytes = 0; + bool mergeFirstMutationVersion = false; + std::deque> mutations; + + explicit CDCBufferedBatch(Version bufferedThrough) : bufferedThrough(bufferedThrough) {} +}; + struct CDCProxyData { UID id; Database cx; Reference const> dbInfo; Reference>> logSystem; std::map> streams; + FlowLock bufferLock; + int64_t bufferedBytes = 0; CDCProxyData(CDCProxyInterface const& proxy, Reference const> dbInfo) : id(proxy.id()), cx(openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, LockAware::True)), dbInfo(dbInfo), - logSystem(makeReference>>()) {} + logSystem(makeReference>>()), + bufferLock(SERVER_KNOBS->CDC_PROXY_BUFFER_BYTES) {} }; Optional clipCDCMutation(MutationRef const& mutation, KeyRangeRef const& keys) { @@ -167,9 +179,10 @@ Future readCDCStreamState(Database cx, } } -void bufferMessages(Reference stream, - CDCStreamReadState const& metadata, - Reference cursor) { +CDCBufferedBatch bufferMessages(Reference stream, + CDCStreamReadState const& metadata, + Reference cursor) { + CDCBufferedBatch batch(stream->bufferedThrough); while (cursor->hasMessage()) { const Version messageVersion = cursor->version().version; ArenaReader& reader = *cursor->reader(); @@ -188,18 +201,52 @@ void bufferMessages(Reference stream, reader >> mutation; Optional clipped = clipCDCMutation(mutation, 
metadata.keys.get()); if (clipped.present()) { - if (stream->mutations.empty() || stream->mutations.back().version != messageVersion) { - stream->mutations.emplace_back(); - stream->mutations.back().version = messageVersion; - stream->bufferedBytes += sizeof(VersionedMutationsRef); + if (batch.mutations.empty() || batch.mutations.back().version != messageVersion) { + batch.mutations.emplace_back(); + batch.mutations.back().version = messageVersion; + if (stream->mutations.empty() || stream->mutations.back().version != messageVersion) { + batch.bufferedBytes += sizeof(VersionedMutationsRef); + } else { + batch.mergeFirstMutationVersion = true; + } } - stream->mutations.back().mutations.push_back_deep(stream->mutations.back().arena(), clipped.get()); - stream->bufferedBytes += clipped.get().expectedSize() + sizeof(MutationRef); + batch.mutations.back().mutations.push_back_deep(batch.mutations.back().arena(), clipped.get()); + batch.bufferedBytes += clipped.get().expectedSize() + sizeof(MutationRef); } } - stream->bufferedThrough = std::max(stream->bufferedThrough, messageVersion); + batch.bufferedThrough = std::max(batch.bufferedThrough, messageVersion); cursor->nextMessage(); } + return batch; +} + +void addBufferedBatch(CDCProxyData* self, Reference stream, CDCBufferedBatch batch) { + if (batch.mergeFirstMutationVersion) { + ASSERT(!stream->mutations.empty()); + ASSERT(!batch.mutations.empty()); + ASSERT(stream->mutations.back().version == batch.mutations.front().version); + for (const auto& mutation : batch.mutations.front().mutations) { + stream->mutations.back().mutations.push_back_deep(stream->mutations.back().arena(), mutation); + } + batch.mutations.pop_front(); + } + while (!batch.mutations.empty()) { + stream->mutations.emplace_back(std::move(batch.mutations.front())); + batch.mutations.pop_front(); + } + stream->bufferedThrough = std::max(stream->bufferedThrough, batch.bufferedThrough); + stream->bufferedBytes += batch.bufferedBytes; + self->bufferedBytes 
+= batch.bufferedBytes; +} + +void clearBufferedMutations(CDCProxyData* self, Reference stream) { + if (stream->bufferedBytes > 0) { + ASSERT(self->bufferedBytes >= stream->bufferedBytes); + self->bufferedBytes -= stream->bufferedBytes; + self->bufferLock.release(stream->bufferedBytes); + stream->bufferedBytes = 0; + } + stream->mutations.clear(); } Future bufferStream(CDCProxyData* self, Reference stream) { @@ -221,19 +268,23 @@ Future bufferStream(CDCProxyData* self, Reference strea Reference cursor = self->logSystem->get()->peekSingle(self->id, begin, metadata.currentTag, metadata.tagHistory); while (stream->active) { - if (stream->bufferedBytes >= SERVER_KNOBS->CDC_PROXY_BUFFER_BYTES) { - auto waitForSpace = co_await race(stream->spaceAvailable.onTrigger(), - self->logSystem->onChange(), - stream->stopped.onTrigger(), - stream->refresh.onTrigger()); - if (waitForSpace.index() == 0) { - continue; - } - if (waitForSpace.index() == 2) { - co_return; - } + const int64_t peekReservation = + std::min(SERVER_KNOBS->CDC_PROXY_BUFFER_BYTES, SERVER_KNOBS->MAXIMUM_PEEK_BYTES); + ASSERT(peekReservation > 0); + auto capacity = co_await race(self->bufferLock.take(TaskPriority::TLogPeekReply, peekReservation), + self->logSystem->onChange(), + stream->stopped.onTrigger(), + stream->refresh.onTrigger()); + if (capacity.index() == 1) { break; } + if (capacity.index() == 2) { + co_return; + } + if (capacity.index() == 3) { + break; + } + FlowLock::Releaser reservation(self->bufferLock, peekReservation); auto result = co_await race(cursor->getMore(TaskPriority::TLogPeekReply), self->logSystem->onChange(), stream->stopped.onTrigger(), @@ -264,8 +315,39 @@ Future bufferStream(CDCProxyData* self, Reference strea throw transaction_too_old(); } + CDCBufferedBatch batch = bufferMessages(stream, metadata, cursor); + if (batch.bufferedBytes > peekReservation) { + TraceEvent(SevWarn, "CDCProxyOversizedPeekBatch", self->id) + .detail("StreamId", stream->streamId) + .detail("BufferedBytes", 
batch.bufferedBytes) + .detail("ReservedBytes", peekReservation); + reservation.release(); + auto oversizedCapacity = + co_await race(self->bufferLock.take(TaskPriority::TLogPeekReply, batch.bufferedBytes), + self->logSystem->onChange(), + stream->stopped.onTrigger(), + stream->refresh.onTrigger()); + if (oversizedCapacity.index() == 1) { + break; + } + if (oversizedCapacity.index() == 2) { + co_return; + } + if (oversizedCapacity.index() == 3) { + break; + } + reservation = FlowLock::Releaser(self->bufferLock, batch.bufferedBytes); + } else { + reservation.release(peekReservation - batch.bufferedBytes); + } + if (!stream->active) { + co_return; + } + const Version previousBufferedThrough = stream->bufferedThrough; - bufferMessages(stream, metadata, cursor); + addBufferedBatch(self, stream, std::move(batch)); + // Buffered mutations own these permits until acknowledgement or stream removal. + reservation.remaining = 0; stream->bufferedThrough = std::max(stream->bufferedThrough, cursor->version().version - 1); if (stream->bufferedThrough > previousBufferedThrough) { stream->changed.trigger(); @@ -295,11 +377,13 @@ Future bufferStream(CDCProxyData* self, Reference strea } } catch (Error& e) { if (e.code() == error_code_client_invalid_operation || e.code() == error_code_wrong_shard_server) { + clearBufferedMutations(self, stream); stream->active = false; stream->changed.trigger(); co_return; } if (e.code() == error_code_transaction_too_old) { + clearBufferedMutations(self, stream); stream->tooOld = true; stream->active = false; stream->changed.trigger(); @@ -461,6 +545,7 @@ void reconcileStreams(CDCProxyData* self, ActorCollection* actors) { it->second->active = false; it->second->stopped.trigger(); it->second->changed.trigger(); + clearBufferedMutations(self, it->second); it = self->streams.erase(it); } else { ++it; @@ -539,13 +624,16 @@ Future acknowledge(CDCProxyData* self, CDCAckRequest request) { if (found != self->streams.end()) { found->second->minVersion = 
std::max(found->second->minVersion, minVersion); while (!found->second->mutations.empty() && found->second->mutations.front().version < minVersion) { - found->second->bufferedBytes -= + const int64_t releasedBytes = sizeof(VersionedMutationsRef) + found->second->mutations.front().mutations.expectedSize(); + found->second->bufferedBytes -= releasedBytes; + ASSERT(self->bufferedBytes >= releasedBytes); + self->bufferedBytes -= releasedBytes; + self->bufferLock.release(releasedBytes); found->second->mutations.pop_front(); } ASSERT(found->second->bufferedBytes >= 0); found->second->refresh.trigger(); - found->second->spaceAvailable.trigger(); } co_await popAcknowledgedData(self); request.reply.send(Void()); diff --git a/fdbserver/core/ServerKnobs.cpp b/fdbserver/core/ServerKnobs.cpp index 66022903df6..6a96fd2ab68 100644 --- a/fdbserver/core/ServerKnobs.cpp +++ b/fdbserver/core/ServerKnobs.cpp @@ -175,7 +175,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( DESIRED_UPDATE_BYTES, 2*DESIRED_TOTAL_BYTES ); init( UPDATE_DELAY, 0.001 ); init( MAXIMUM_PEEK_BYTES, 10e6 ); - init( CDC_PROXY_BUFFER_BYTES, 10e6 ); if( randomize && BUGGIFY ) CDC_PROXY_BUFFER_BYTES = 10000; + init( CDC_PROXY_BUFFER_BYTES, 1e9 ); if( randomize && BUGGIFY ) CDC_PROXY_BUFFER_BYTES = 10000; init( APPLY_MUTATION_BYTES, 1e6 ); init( BUGGIFY_RECOVER_MEMORY_LIMIT, 1e6 ); init( BUGGIFY_WORKER_REMOVED_MAX_LAG, 30 ); From 26dc4805cab105277b977ad342458dca0485578d Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Wed, 27 May 2026 14:27:20 -0700 Subject: [PATCH 43/56] s/first/current --- design/cdc.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/design/cdc.md b/design/cdc.md index f33ff6a895b..16d7565295a 100644 --- a/design/cdc.md +++ b/design/cdc.md @@ -47,7 +47,7 @@ Native CDC is intended to provide: * Finite cleanup when streams are removed, so an old stream does not require CDC infrastructure forever. 
-The first implementation does not attempt to provide: +The current implementation does not attempt to provide: * Exactly-once side effects in the consumer. A consumer must make its output and its acknowledgement consistent if it needs exactly-once processing. From 0a09b8964b1b596d78a006abba34426f91c459b2 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Wed, 27 May 2026 14:31:00 -0700 Subject: [PATCH 44/56] Remove redundant co_return statements --- fdbserver/cdcproxy/CDCProxy.cpp | 5 ----- fdbserver/workloads/NativeCdc.cpp | 4 ---- 2 files changed, 9 deletions(-) diff --git a/fdbserver/cdcproxy/CDCProxy.cpp b/fdbserver/cdcproxy/CDCProxy.cpp index 87664bfcc82..72e8e54b389 100644 --- a/fdbserver/cdcproxy/CDCProxy.cpp +++ b/fdbserver/cdcproxy/CDCProxy.cpp @@ -613,7 +613,6 @@ Future consume(CDCProxyData* self, CDCConsumeRequest request) { } request.reply.sendError(e); } - co_return; } Future acknowledge(CDCProxyData* self, CDCAckRequest request) { @@ -643,7 +642,6 @@ Future acknowledge(CDCProxyData* self, CDCAckRequest request) { } request.reply.sendError(e); } - co_return; } Future registerStream(CDCProxyData* self, CDCRegisterStreamRequest request) { @@ -656,7 +654,6 @@ Future registerStream(CDCProxyData* self, CDCRegisterStreamRequest request } request.reply.sendError(e); } - co_return; } Future removeStream(CDCProxyData* self, CDCRemoveStreamRequest request) { @@ -672,7 +669,6 @@ Future removeStream(CDCProxyData* self, CDCRemoveStreamRequest request) { } request.reply.sendError(e); } - co_return; } Future listStreams(CDCProxyData* self, CDCListStreamsRequest request) { @@ -693,7 +689,6 @@ Future listStreams(CDCProxyData* self, CDCListStreamsRequest request) { } request.reply.sendError(e); } - co_return; } } // namespace diff --git a/fdbserver/workloads/NativeCdc.cpp b/fdbserver/workloads/NativeCdc.cpp index 333de404c61..de9adf13d83 100644 --- a/fdbserver/workloads/NativeCdc.cpp +++ b/fdbserver/workloads/NativeCdc.cpp @@ -240,14 +240,12 @@ struct 
NativeCdcWorkload : TestWorkload { while (dbInfo->get().client.streamToCDCProxyId.contains(streamId)) { co_await dbInfo->onChange(); } - co_return; } Future waitForNoCDCProxies() { while (!dbInfo->get().client.cdcProxies.empty()) { co_await dbInfo->onChange(); } - co_return; } Future changeResolverCount(Database cx, int32_t count) { @@ -267,7 +265,6 @@ struct NativeCdcWorkload : TestWorkload { while (dbInfo->get().recoveryCount <= previousRecoveryCount || dbInfo->get().recoveryState < requiredState) { co_await dbInfo->onChange(); } - co_return; } Future runSharedTagSafety(Database cx) { @@ -320,7 +317,6 @@ struct NativeCdcWorkload : TestWorkload { co_await removeNativeCdcStreamClient(cx, secondName); co_await waitForCDCProxyAssignmentRemoval(secondId); - co_return; } Future run(Database cx) { From 6855c7cbd0cb9960e0eb78b6587d7d3132743b0e Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Wed, 27 May 2026 14:37:30 -0700 Subject: [PATCH 45/56] Use mermaid for diagram --- design/cdc.md | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/design/cdc.md b/design/cdc.md index 16d7565295a..f1c592d8b10 100644 --- a/design/cdc.md +++ b/design/cdc.md @@ -182,23 +182,17 @@ that request. The data path is: -```text - durable stream and acknowledgement state - +-------------------------+ - | transaction state store | - | and system key storage | - +------------+------------+ - | - v -Client <---- consume / acknowledge ----> CDC Proxy <---- peek / pop ---- TLogs - ^ ^ - | | - | assigned streams | ordinary and - | | CDC tags - Cluster Controller Commit Proxies - ^ - | - user commits +```mermaid +flowchart LR + client["Client"] <-->|"consume / acknowledge"| proxy["CDC Proxy"] + proxy <-->|"peek / pop"| tlogs["TLogs"] + + commits["User commits"] --> commitProxy["Commit Proxies"] + commitProxy -->|"ordinary and CDC tags"| tlogs + + controller["Cluster Controller"] -->|"assigned streams"| proxy + metadata["Transaction state store
and system key storage"] -->|"durable stream and acknowledgement state"| proxy + metadata -->|"durable CDC routing metadata"| commitProxy ``` Commit proxies keep a routing table derived from durable CDC metadata. For From ca1168355344a3f599f736e718b55fd16f2b0f0e Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Wed, 27 May 2026 19:01:58 -0700 Subject: [PATCH 46/56] Share CDC tag readers across streams and retain log consumers during pops --- fdbclient/NativeCdc.cpp | 72 ++- fdbserver/cdcproxy/CDCProxy.cpp | 537 ++++++++++++------ .../ClusterController.actor.cpp | 9 +- fdbserver/workloads/NativeCdc.cpp | 5 +- 4 files changed, 430 insertions(+), 193 deletions(-) diff --git a/fdbclient/NativeCdc.cpp b/fdbclient/NativeCdc.cpp index a9492227095..ce9709c2f7e 100644 --- a/fdbclient/NativeCdc.cpp +++ b/fdbclient/NativeCdc.cpp @@ -97,6 +97,69 @@ Future> getNativeCdcProxyAssignment(Transaction* tr, CDCStreamId s co_return proxyId; } +Future getNativeCdcCurrentTag(Transaction* tr, CDCStreamId streamId) { + Optional currentTag; + const KeyRange historyRange = cdcTagHistoryRangeFor(streamId); + Key begin = historyRange.begin; + while (begin < historyRange.end) { + RangeResult history = co_await tr->getRange(KeyRangeRef(begin, historyRange.end), CLIENT_KNOBS->TOO_MANY); + for (const auto& assignment : history) { + currentTag = std::get<2>(decodeCDCTagHistoryKey(assignment.key)); + } + if (!history.more) { + break; + } + begin = keyAfter(history.back().key); + } + if (!currentTag.present()) { + throw client_invalid_operation(); + } + co_return currentTag.get(); +} + +Future> getNativeCdcProxyAssignmentForTag(Transaction* tr, Tag targetTag) { + std::set activeStreamIds; + Key begin = cdcStreamKeys.begin; + while (begin < cdcStreamKeys.end) { + RangeResult streams = co_await tr->getRange(KeyRangeRef(begin, cdcStreamKeys.end), CLIENT_KNOBS->TOO_MANY); + for (const auto& stream : streams) { + activeStreamIds.insert(decodeCDCStreamKey(stream.key)); + } + if (!streams.more) { + 
break; + } + begin = keyAfter(streams.back().key); + } + + std::map currentTags; + begin = cdcTagHistoryKeys.begin; + while (begin < cdcTagHistoryKeys.end) { + RangeResult histories = + co_await tr->getRange(KeyRangeRef(begin, cdcTagHistoryKeys.end), CLIENT_KNOBS->TOO_MANY); + for (const auto& history : histories) { + const auto decoded = decodeCDCTagHistoryKey(history.key); + const CDCStreamId streamId = std::get<0>(decoded); + if (activeStreamIds.contains(streamId)) { + currentTags[streamId] = std::get<2>(decoded); + } + } + if (!histories.more) { + break; + } + begin = keyAfter(histories.back().key); + } + + for (const auto& [streamId, tag] : currentTags) { + if (tag == targetTag) { + Optional proxyId = co_await getNativeCdcProxyAssignment(tr, streamId); + if (proxyId.present()) { + co_return proxyId; + } + } + } + co_return Optional(); +} + void signalNativeCdcProxyAssignmentChange(Transaction* tr) { tr->set(cdcProxyAssignmentChangeKey, BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), @@ -283,7 +346,10 @@ Future registerNativeCdcStream(Database cx, Key name, KeyRange keys throw client_invalid_operation(); } if (proxyId.present() && !(co_await getNativeCdcProxyAssignment(&tr, streamId)).present()) { - tr.set(cdcProxyKeyFor(streamId, proxyId.get()), Value()); + const Tag tag = co_await getNativeCdcCurrentTag(&tr, streamId); + Optional sharedTagProxy = co_await getNativeCdcProxyAssignmentForTag(&tr, tag); + const UID selectedProxy = sharedTagProxy.present() ? 
sharedTagProxy.get() : proxyId.get(); + tr.set(cdcProxyKeyFor(streamId, selectedProxy), Value()); signalNativeCdcProxyAssignmentChange(&tr); co_await tr.commit(); } @@ -302,7 +368,9 @@ Future registerNativeCdcStream(Database cx, Key name, KeyRange keys tr.atomicOp( cdcMinVersionKeyFor(streamId), cdcVersionstampedMinVersionValue(), MutationRef::SetVersionstampedValue); if (proxyId.present()) { - tr.set(cdcProxyKeyFor(streamId, proxyId.get()), Value()); + Optional sharedTagProxy = co_await getNativeCdcProxyAssignmentForTag(&tr, tag); + const UID selectedProxy = sharedTagProxy.present() ? sharedTagProxy.get() : proxyId.get(); + tr.set(cdcProxyKeyFor(streamId, selectedProxy), Value()); signalNativeCdcProxyAssignmentChange(&tr); } co_await tr.commit(); diff --git a/fdbserver/cdcproxy/CDCProxy.cpp b/fdbserver/cdcproxy/CDCProxy.cpp index 72e8e54b389..b1abded62eb 100644 --- a/fdbserver/cdcproxy/CDCProxy.cpp +++ b/fdbserver/cdcproxy/CDCProxy.cpp @@ -49,33 +49,48 @@ namespace { struct CDCStreamReadState { Optional keys; Version minVersion = invalidVersion; - Tag currentTag = invalidTag; - std::vector> tagHistory; + std::vector> tagAssignments; +}; + +struct CDCTagInterval { + Tag tag; + Version begin; + Version end; + Version bufferedThrough; + + CDCTagInterval(Tag tag, Version begin, Version end) + : tag(tag), begin(begin), end(end), bufferedThrough(begin - 1) {} }; struct CDCBufferedStream : ReferenceCounted { CDCStreamId streamId; + Optional keys; bool active = true; bool initialized = false; bool tooOld = false; Version minVersion = invalidVersion; Version bufferedThrough = invalidVersion; int64_t bufferedBytes = 0; + std::vector tagIntervals; std::deque> mutations; AsyncTrigger changed; - AsyncTrigger refresh; - AsyncTrigger stopped; explicit CDCBufferedStream(CDCStreamId streamId) : streamId(streamId) {} }; struct CDCBufferedBatch { - Version bufferedThrough; int64_t bufferedBytes = 0; - bool mergeFirstMutationVersion = false; std::deque> mutations; +}; - explicit 
CDCBufferedBatch(Version bufferedThrough) : bufferedThrough(bufferedThrough) {} +struct CDCBufferedTag : ReferenceCounted { + Tag tag; + bool active = true; + std::set streamIds; + AsyncTrigger refresh; + AsyncTrigger stopped; + + explicit CDCBufferedTag(Tag tag) : tag(tag) {} }; struct CDCProxyData { @@ -84,6 +99,7 @@ struct CDCProxyData { Reference const> dbInfo; Reference>> logSystem; std::map> streams; + std::map> tags; FlowLock bufferLock; int64_t bufferedBytes = 0; @@ -164,10 +180,7 @@ Future readCDCStreamState(Database cx, throw client_invalid_operation(); } - result.currentTag = tagAssignments.back().second; - for (int i = tagAssignments.size() - 1; i > 0; --i) { - result.tagHistory.emplace_back(tagAssignments[i].first, tagAssignments[i - 1].second); - } + result.tagAssignments = std::move(tagAssignments); co_return result; } catch (Error& e) { if (e.code() == error_code_wrong_shard_server) { @@ -179,10 +192,180 @@ Future readCDCStreamState(Database cx, } } -CDCBufferedBatch bufferMessages(Reference stream, - CDCStreamReadState const& metadata, - Reference cursor) { - CDCBufferedBatch batch(stream->bufferedThrough); +void clearBufferedMutations(CDCProxyData* self, Reference stream) { + if (stream->bufferedBytes > 0) { + ASSERT(self->bufferedBytes >= stream->bufferedBytes); + self->bufferedBytes -= stream->bufferedBytes; + self->bufferLock.release(stream->bufferedBytes); + stream->bufferedBytes = 0; + } + stream->mutations.clear(); +} + +void addMutationToBatch(Reference stream, + CDCBufferedBatch* batch, + Version version, + MutationRef const& mutation) { + auto batchVersion = std::find_if(batch->mutations.begin(), batch->mutations.end(), [version](const auto& buffered) { + return buffered.version == version; + }); + if (batchVersion == batch->mutations.end()) { + batch->mutations.emplace_back(); + batchVersion = std::prev(batch->mutations.end()); + batchVersion->version = version; + const bool alreadyBuffered = + std::any_of(stream->mutations.begin(), 
stream->mutations.end(), [version](const auto& buffered) { + return buffered.version == version; + }); + if (!alreadyBuffered) { + batch->bufferedBytes += sizeof(VersionedMutationsRef); + } + } + batchVersion->mutations.push_back_deep(batchVersion->arena(), mutation); + batch->bufferedBytes += mutation.expectedSize() + sizeof(MutationRef); +} + +void addBufferedBatch(CDCProxyData* self, Reference stream, CDCBufferedBatch batch) { + while (!batch.mutations.empty()) { + Standalone versioned = std::move(batch.mutations.front()); + batch.mutations.pop_front(); + auto location = + std::lower_bound(stream->mutations.begin(), + stream->mutations.end(), + versioned.version, + [](const auto& buffered, Version version) { return buffered.version < version; }); + if (location != stream->mutations.end() && location->version == versioned.version) { + for (const auto& mutation : versioned.mutations) { + location->mutations.push_back_deep(location->arena(), mutation); + } + } else { + stream->mutations.insert(location, std::move(versioned)); + } + } + stream->bufferedBytes += batch.bufferedBytes; + self->bufferedBytes += batch.bufferedBytes; +} + +void updateStreamBufferedThrough(Reference stream) { + Version bufferedThrough = stream->minVersion - 1; + for (const auto& interval : stream->tagIntervals) { + if (interval.begin > bufferedThrough + 1) { + break; + } + if (interval.bufferedThrough < interval.begin) { + break; + } + bufferedThrough = std::max(bufferedThrough, interval.bufferedThrough); + if (interval.bufferedThrough < interval.end - 1) { + break; + } + } + if (bufferedThrough > stream->bufferedThrough) { + stream->bufferedThrough = bufferedThrough; + stream->changed.trigger(); + } +} + +void advanceStreamMinVersion(Reference stream, Version minVersion) { + stream->minVersion = std::max(stream->minVersion, minVersion); + for (auto& interval : stream->tagIntervals) { + if (stream->minVersion > interval.begin) { + interval.bufferedThrough = + 
std::max(interval.bufferedThrough, std::min(stream->minVersion - 1, interval.end - 1)); + } + } + updateStreamBufferedThrough(stream); +} + +void detachStreamFromTags(CDCProxyData* self, Reference stream) { + for (const auto& interval : stream->tagIntervals) { + auto tag = self->tags.find(interval.tag); + if (tag == self->tags.end()) { + continue; + } + Reference bufferedTag = tag->second; + bufferedTag->streamIds.erase(stream->streamId); + if (bufferedTag->streamIds.empty()) { + bufferedTag->active = false; + self->tags.erase(tag); + bufferedTag->stopped.trigger(); + } else { + bufferedTag->refresh.trigger(); + } + } +} + +Optional nextTagReadVersion(CDCProxyData* self, Reference tag) { + Optional begin; + for (const CDCStreamId streamId : tag->streamIds) { + auto stream = self->streams.find(streamId); + if (stream == self->streams.end() || !stream->second->active || !stream->second->initialized) { + continue; + } + for (const auto& interval : stream->second->tagIntervals) { + if (interval.tag != tag->tag) { + continue; + } + const Version next = std::max(interval.begin, interval.bufferedThrough + 1); + if (next < interval.end && (!begin.present() || next < begin.get())) { + begin = next; + } + } + } + return begin; +} + +void advanceTagBufferedThrough(CDCProxyData* self, Reference tag, Version bufferedThrough) { + const std::vector streamIds(tag->streamIds.begin(), tag->streamIds.end()); + for (const CDCStreamId streamId : streamIds) { + auto stream = self->streams.find(streamId); + if (stream == self->streams.end() || !stream->second->active) { + continue; + } + for (auto& interval : stream->second->tagIntervals) { + if (interval.tag == tag->tag && bufferedThrough >= interval.begin) { + interval.bufferedThrough = + std::max(interval.bufferedThrough, std::min(bufferedThrough, interval.end - 1)); + } + } + updateStreamBufferedThrough(stream->second); + } +} + +void markPoppedTagStreamsTooOld(CDCProxyData* self, Reference tag, Version popped) { + std::vector> 
tooOldStreams; + for (const CDCStreamId streamId : tag->streamIds) { + auto stream = self->streams.find(streamId); + if (stream == self->streams.end() || !stream->second->active) { + continue; + } + for (const auto& interval : stream->second->tagIntervals) { + const Version next = std::max(interval.begin, interval.bufferedThrough + 1); + if (interval.tag == tag->tag && next < interval.end && next < popped) { + tooOldStreams.push_back(stream->second); + break; + } + } + } + for (const auto& stream : tooOldStreams) { + TraceEvent("CDCBufferStreamTooOld", self->id) + .detail("StreamId", stream->streamId) + .detail("MinVersion", stream->minVersion) + .detail("BufferedThrough", stream->bufferedThrough) + .detail("Popped", popped) + .detail("Tag", tag->tag); + clearBufferedMutations(self, stream); + stream->tooOld = true; + stream->active = false; + stream->changed.trigger(); + detachStreamFromTags(self, stream); + } +} + +std::map bufferMessages(CDCProxyData* self, + Reference tag, + Reference cursor) { + std::map batches; while (cursor->hasMessage()) { const Version messageVersion = cursor->version().version; ArenaReader& reader = *cursor->reader(); @@ -199,180 +382,168 @@ CDCBufferedBatch bufferMessages(Reference stream, } else { MutationRef mutation; reader >> mutation; - Optional clipped = clipCDCMutation(mutation, metadata.keys.get()); - if (clipped.present()) { - if (batch.mutations.empty() || batch.mutations.back().version != messageVersion) { - batch.mutations.emplace_back(); - batch.mutations.back().version = messageVersion; - if (stream->mutations.empty() || stream->mutations.back().version != messageVersion) { - batch.bufferedBytes += sizeof(VersionedMutationsRef); - } else { - batch.mergeFirstMutationVersion = true; - } + for (const CDCStreamId streamId : tag->streamIds) { + auto stream = self->streams.find(streamId); + if (stream == self->streams.end() || !stream->second->active || !stream->second->keys.present()) { + continue; + } + const bool 
coversVersion = + std::any_of(stream->second->tagIntervals.begin(), + stream->second->tagIntervals.end(), + [tag, messageVersion](const auto& interval) { + return interval.tag == tag->tag && interval.begin <= messageVersion && + messageVersion < interval.end && messageVersion > interval.bufferedThrough; + }); + if (!coversVersion) { + continue; + } + Optional clipped = clipCDCMutation(mutation, stream->second->keys.get()); + if (clipped.present()) { + addMutationToBatch(stream->second, &batches[streamId], messageVersion, clipped.get()); } - batch.mutations.back().mutations.push_back_deep(batch.mutations.back().arena(), clipped.get()); - batch.bufferedBytes += clipped.get().expectedSize() + sizeof(MutationRef); } } - batch.bufferedThrough = std::max(batch.bufferedThrough, messageVersion); cursor->nextMessage(); } - return batch; + return batches; } -void addBufferedBatch(CDCProxyData* self, Reference stream, CDCBufferedBatch batch) { - if (batch.mergeFirstMutationVersion) { - ASSERT(!stream->mutations.empty()); - ASSERT(!batch.mutations.empty()); - ASSERT(stream->mutations.back().version == batch.mutations.front().version); - for (const auto& mutation : batch.mutations.front().mutations) { - stream->mutations.back().mutations.push_back_deep(stream->mutations.back().arena(), mutation); +Future bufferTag(CDCProxyData* self, Reference tag) { + while (tag->active) { + Optional begin = nextTagReadVersion(self, tag); + if (!begin.present()) { + tag->active = false; + auto current = self->tags.find(tag->tag); + if (current != self->tags.end() && current->second == tag) { + self->tags.erase(current); + } + co_return; } - batch.mutations.pop_front(); - } - while (!batch.mutations.empty()) { - stream->mutations.emplace_back(std::move(batch.mutations.front())); - batch.mutations.pop_front(); - } - stream->bufferedThrough = std::max(stream->bufferedThrough, batch.bufferedThrough); - stream->bufferedBytes += batch.bufferedBytes; - self->bufferedBytes += batch.bufferedBytes; -} - 
-void clearBufferedMutations(CDCProxyData* self, Reference stream) { - if (stream->bufferedBytes > 0) { - ASSERT(self->bufferedBytes >= stream->bufferedBytes); - self->bufferedBytes -= stream->bufferedBytes; - self->bufferLock.release(stream->bufferedBytes); - stream->bufferedBytes = 0; - } - stream->mutations.clear(); -} - -Future bufferStream(CDCProxyData* self, Reference stream) { - try { - CDCStreamReadState metadata = co_await readCDCStreamState(self->cx, stream->streamId, self->id, true); - stream->minVersion = metadata.minVersion; - stream->bufferedThrough = metadata.minVersion - 1; - stream->initialized = true; - stream->changed.trigger(); + if (!self->logSystem->get()) { + auto waitForLogSystem = + co_await race(self->logSystem->onChange(), tag->stopped.onTrigger(), tag->refresh.onTrigger()); + if (waitForLogSystem.index() == 1) { + co_return; + } + continue; + } + + Reference cursor = self->logSystem->get()->peekSingle(self->id, begin.get(), tag->tag, {}); + while (tag->active) { + const int64_t peekReservation = + std::min(SERVER_KNOBS->CDC_PROXY_BUFFER_BYTES, SERVER_KNOBS->MAXIMUM_PEEK_BYTES); + ASSERT(peekReservation > 0); + auto capacity = co_await race(self->bufferLock.take(TaskPriority::TLogPeekReply, peekReservation), + self->logSystem->onChange(), + tag->stopped.onTrigger(), + tag->refresh.onTrigger()); + if (capacity.index() == 1 || capacity.index() == 3) { + break; + } + if (capacity.index() == 2) { + co_return; + } + FlowLock::Releaser reservation(self->bufferLock, peekReservation); + auto result = co_await race(cursor->getMore(TaskPriority::TLogPeekReply), + self->logSystem->onChange(), + tag->stopped.onTrigger(), + tag->refresh.onTrigger()); + if (result.index() == 1 || result.index() == 3) { + break; + } + if (result.index() == 2) { + co_return; + } - while (stream->active) { - if (!self->logSystem->get()) { - co_await self->logSystem->onChange(); - continue; + cursor->setProtocolVersion(g_network->protocolVersion()); + if (cursor->popped() 
> begin.get()) { + markPoppedTagStreamsTooOld(self, tag, cursor->popped()); + break; } - metadata = co_await readCDCStreamState(self->cx, stream->streamId, self->id, true); - const Version begin = std::max(stream->bufferedThrough + 1, metadata.minVersion); - Reference cursor = - self->logSystem->get()->peekSingle(self->id, begin, metadata.currentTag, metadata.tagHistory); - while (stream->active) { - const int64_t peekReservation = - std::min(SERVER_KNOBS->CDC_PROXY_BUFFER_BYTES, SERVER_KNOBS->MAXIMUM_PEEK_BYTES); - ASSERT(peekReservation > 0); - auto capacity = co_await race(self->bufferLock.take(TaskPriority::TLogPeekReply, peekReservation), - self->logSystem->onChange(), - stream->stopped.onTrigger(), - stream->refresh.onTrigger()); - if (capacity.index() == 1) { - break; - } - if (capacity.index() == 2) { - co_return; - } - if (capacity.index() == 3) { - break; - } - FlowLock::Releaser reservation(self->bufferLock, peekReservation); - auto result = co_await race(cursor->getMore(TaskPriority::TLogPeekReply), - self->logSystem->onChange(), - stream->stopped.onTrigger(), - stream->refresh.onTrigger()); - if (result.index() == 1) { + std::map batches = bufferMessages(self, tag, cursor); + int64_t bufferedBytes = 0; + for (const auto& [streamId, batch] : batches) { + bufferedBytes += batch.bufferedBytes; + } + if (bufferedBytes > peekReservation) { + TraceEvent(SevWarn, "CDCProxyOversizedPeekBatch", self->id) + .detail("Tag", tag->tag) + .detail("BufferedBytes", bufferedBytes) + .detail("ReservedBytes", peekReservation); + reservation.release(); + auto oversizedCapacity = + co_await race(self->bufferLock.take(TaskPriority::TLogPeekReply, bufferedBytes), + self->logSystem->onChange(), + tag->stopped.onTrigger(), + tag->refresh.onTrigger()); + if (oversizedCapacity.index() == 1 || oversizedCapacity.index() == 3) { break; } - if (result.index() == 2) { + if (oversizedCapacity.index() == 2) { co_return; } - if (result.index() == 3) { - break; - } + reservation = 
FlowLock::Releaser(self->bufferLock, bufferedBytes); + } else { + reservation.release(peekReservation - bufferedBytes); + } + if (!tag->active) { + co_return; + } - cursor->setProtocolVersion(g_network->protocolVersion()); - if (cursor->popped() > begin) { - if (cursor->popped() <= stream->minVersion) { - break; - } - TraceEvent("CDCBufferStreamTooOld", self->id) - .detail("StreamId", stream->streamId) - .detail("MinVersion", stream->minVersion) - .detail("BufferedThrough", stream->bufferedThrough) - .detail("Begin", begin) - .detail("Popped", cursor->popped()) - .detail("CurrentTag", metadata.currentTag) - .detail("TagHistorySize", metadata.tagHistory.size()); - throw transaction_too_old(); + int64_t acceptedBytes = 0; + for (auto& [streamId, batch] : batches) { + auto stream = self->streams.find(streamId); + if (stream != self->streams.end() && stream->second->active && tag->streamIds.contains(streamId)) { + acceptedBytes += batch.bufferedBytes; + addBufferedBatch(self, stream->second, std::move(batch)); } - - CDCBufferedBatch batch = bufferMessages(stream, metadata, cursor); - if (batch.bufferedBytes > peekReservation) { - TraceEvent(SevWarn, "CDCProxyOversizedPeekBatch", self->id) - .detail("StreamId", stream->streamId) - .detail("BufferedBytes", batch.bufferedBytes) - .detail("ReservedBytes", peekReservation); - reservation.release(); - auto oversizedCapacity = - co_await race(self->bufferLock.take(TaskPriority::TLogPeekReply, batch.bufferedBytes), - self->logSystem->onChange(), - stream->stopped.onTrigger(), - stream->refresh.onTrigger()); - if (oversizedCapacity.index() == 1) { - break; - } - if (oversizedCapacity.index() == 2) { - co_return; - } - if (oversizedCapacity.index() == 3) { - break; - } - reservation = FlowLock::Releaser(self->bufferLock, batch.bufferedBytes); - } else { - reservation.release(peekReservation - batch.bufferedBytes); - } - if (!stream->active) { - co_return; + } + reservation.release(bufferedBytes - acceptedBytes); + // Buffered 
mutations own these permits until acknowledgement or stream removal. + reservation.remaining = 0; + advanceTagBufferedThrough(self, tag, cursor->version().version - 1); + if (!nextTagReadVersion(self, tag).present()) { + tag->active = false; + auto current = self->tags.find(tag->tag); + if (current != self->tags.end() && current->second == tag) { + self->tags.erase(current); } + co_return; + } + if (cursor->isExhausted()) { + break; + } + } + } +} - const Version previousBufferedThrough = stream->bufferedThrough; - addBufferedBatch(self, stream, std::move(batch)); - // Buffered mutations own these permits until acknowledgement or stream removal. - reservation.remaining = 0; - stream->bufferedThrough = std::max(stream->bufferedThrough, cursor->version().version - 1); - if (stream->bufferedThrough > previousBufferedThrough) { - stream->changed.trigger(); - } - if (cursor->isExhausted()) { - Optional nextTagBoundary; - for (const auto& historyEntry : metadata.tagHistory) { - const Version boundary = historyEntry.first; - if (boundary > begin && (!nextTagBoundary.present() || boundary < nextTagBoundary.get())) { - nextTagBoundary = boundary; - } - } - if (nextTagBoundary.present()) { - const Version previousBufferedThrough = stream->bufferedThrough; - stream->bufferedThrough = std::max(stream->bufferedThrough, nextTagBoundary.get() - 1); - if (stream->bufferedThrough > previousBufferedThrough) { - stream->changed.trigger(); - } - } else { - // ReplayMultiCursor advances past an empty old log generation on - // its next getMore(); rebuilding it here repeats that generation. 
- continue; - } - break; - } +Future initializeStream(CDCProxyData* self, Reference stream, ActorCollection* actors) { + try { + const CDCStreamReadState metadata = co_await readCDCStreamState(self->cx, stream->streamId, self->id, true); + stream->keys = metadata.keys; + stream->minVersion = metadata.minVersion; + stream->bufferedThrough = metadata.minVersion - 1; + for (size_t i = 0; i < metadata.tagAssignments.size(); ++i) { + const Version begin = std::max(metadata.minVersion, metadata.tagAssignments[i].first); + const Version end = i + 1 < metadata.tagAssignments.size() ? metadata.tagAssignments[i + 1].first + : std::numeric_limits::max(); + if (begin < end) { + stream->tagIntervals.emplace_back(metadata.tagAssignments[i].second, begin, end); + } + } + stream->initialized = true; + stream->changed.trigger(); + for (const auto& interval : stream->tagIntervals) { + auto tag = self->tags.find(interval.tag); + if (tag == self->tags.end()) { + Reference newTag = makeReference(interval.tag); + tag = self->tags.emplace(interval.tag, newTag).first; + tag->second->streamIds.insert(stream->streamId); + actors->add(bufferTag(self, newTag)); + } else { + tag->second->streamIds.insert(stream->streamId); + tag->second->refresh.trigger(); } } } catch (Error& e) { @@ -382,13 +553,6 @@ Future bufferStream(CDCProxyData* self, Reference strea stream->changed.trigger(); co_return; } - if (e.code() == error_code_transaction_too_old) { - clearBufferedMutations(self, stream); - stream->tooOld = true; - stream->active = false; - stream->changed.trigger(); - co_return; - } throw; } } @@ -509,8 +673,9 @@ Future clearCompletedRetiredTagPops(Database cx, std::map co Future popAcknowledgedData(CDCProxyData* self) { const std::map safePopVersions = co_await readSafePopVersions(self->cx); + Reference logSystem = self->logSystem->get(); for (const auto& [tag, version] : safePopVersions) { - self->logSystem->get()->pop(version, tag); + logSystem->pop(version, tag); } const std::map 
retiredTagPopVersions = co_await readRetiredTagPopVersions(self->cx); std::map completedPopVersions; @@ -518,9 +683,9 @@ Future popAcknowledgedData(CDCProxyData* self) { const auto safePop = safePopVersions.find(tag); const Version version = safePop == safePopVersions.end() ? retiredVersion : std::min(retiredVersion, safePop->second); - self->logSystem->get()->pop(version, tag); + logSystem->pop(version, tag); if (version >= retiredVersion) { - co_await self->logSystem->get()->waitForPopped(retiredVersion, tag); + co_await logSystem->waitForPopped(retiredVersion, tag); completedPopVersions[tag] = retiredVersion; } } @@ -535,7 +700,7 @@ void reconcileStreams(CDCProxyData* self, ActorCollection* actors) { if (!self->streams.contains(streamId)) { Reference stream = makeReference(streamId); self->streams.emplace(streamId, stream); - actors->add(bufferStream(self, stream)); + actors->add(initializeStream(self, stream, actors)); } } } @@ -543,8 +708,8 @@ void reconcileStreams(CDCProxyData* self, ActorCollection* actors) { for (auto it = self->streams.begin(); it != self->streams.end();) { if (!assignedStreams.contains(it->first)) { it->second->active = false; - it->second->stopped.trigger(); it->second->changed.trigger(); + detachStreamFromTags(self, it->second); clearBufferedMutations(self, it->second); it = self->streams.erase(it); } else { @@ -582,9 +747,6 @@ Future consume(CDCProxyData* self, CDCConsumeRequest request) { throw transaction_too_old(); } - if (stream->bufferedThrough < begin) { - stream->refresh.trigger(); - } while (stream->active && stream->bufferedThrough < begin) { co_await stream->changed.onTrigger(); } @@ -598,7 +760,7 @@ Future consume(CDCProxyData* self, CDCConsumeRequest request) { CDCConsumeReply reply; reply.lastConsumedVersion = request.cursor.lastConsumedVersion; for (const auto& versioned : stream->mutations) { - if (versioned.version >= begin) { + if (versioned.version >= begin && versioned.version <= stream->bufferedThrough) { 
reply.mutations.push_back(reply.arena, VersionedMutationsRef(versioned.version, {})); for (const auto& mutation : versioned.mutations) { reply.mutations.back().mutations.push_back_deep(reply.arena, mutation); @@ -621,7 +783,7 @@ Future acknowledge(CDCProxyData* self, CDCAckRequest request) { const Version minVersion = co_await acknowledgeNativeCdcStream(self->cx, request.streamId, request.version); auto found = self->streams.find(request.streamId); if (found != self->streams.end()) { - found->second->minVersion = std::max(found->second->minVersion, minVersion); + advanceStreamMinVersion(found->second, minVersion); while (!found->second->mutations.empty() && found->second->mutations.front().version < minVersion) { const int64_t releasedBytes = sizeof(VersionedMutationsRef) + found->second->mutations.front().mutations.expectedSize(); @@ -632,7 +794,6 @@ Future acknowledge(CDCProxyData* self, CDCAckRequest request) { found->second->mutations.pop_front(); } ASSERT(found->second->bufferedBytes >= 0); - found->second->refresh.trigger(); } co_await popAcknowledgedData(self); request.reply.send(Void()); diff --git a/fdbserver/clustercontroller/ClusterController.actor.cpp b/fdbserver/clustercontroller/ClusterController.actor.cpp index 715da707493..8005d98540f 100644 --- a/fdbserver/clustercontroller/ClusterController.actor.cpp +++ b/fdbserver/clustercontroller/ClusterController.actor.cpp @@ -2151,6 +2151,7 @@ Future monitorCDCProxyAssignments(ClusterControllerData::DBInfo* db) { std::map streamToCDCProxyId; const std::vector availableProxies = db->cdcProxies; + std::map replacementByFailedProxy; size_t replacementIndex = 0; bool repairedAssignment = false; Key begin = cdcProxyKeys.begin; @@ -2165,7 +2166,13 @@ Future monitorCDCProxyAssignments(ClusterControllerData::DBInfo* db) { return proxy.id() == proxyId; }); if (!availableProxies.empty() && !hasOwner) { - resolvedProxyId = availableProxies[replacementIndex++ % availableProxies.size()].id(); + auto replacement = 
replacementByFailedProxy.find(proxyId); + if (replacement == replacementByFailedProxy.end()) { + resolvedProxyId = availableProxies[replacementIndex++ % availableProxies.size()].id(); + replacementByFailedProxy.emplace(proxyId, resolvedProxyId); + } else { + resolvedProxyId = replacement->second; + } tr.clear(assignment.key); tr.set(cdcProxyKeyFor(streamId, resolvedProxyId), Value()); repairedAssignment = true; diff --git a/fdbserver/workloads/NativeCdc.cpp b/fdbserver/workloads/NativeCdc.cpp index de9adf13d83..b80d33d9708 100644 --- a/fdbserver/workloads/NativeCdc.cpp +++ b/fdbserver/workloads/NativeCdc.cpp @@ -278,6 +278,8 @@ struct NativeCdcWorkload : TestWorkload { ASSERT((co_await getLatestPersistedTag(cx, secondId)) == firstRoute.first); ASSERT(co_await registerNativeCdcStreamClient(cx, firstName, keys) == firstId); + ASSERT(co_await registerNativeCdcStreamClient(cx, secondName, keys) == secondId); + ASSERT((co_await getCDCProxy(firstId)).id() == (co_await getCDCProxy(secondId)).id()); const Version writeVersion = co_await writeValues(cx, { { "shared/unread"_sr, "protected-by-minimum"_sr } }); CDCCursor firstCursor = co_await createNativeCdcCursor(cx, firstName); ASSERT(firstCursor.streamId == firstId); @@ -296,7 +298,6 @@ struct NativeCdcWorkload : TestWorkload { co_await removeNativeCdcStreamClient(cx, firstName); co_await waitForCDCProxyAssignmentRemoval(firstId); - ASSERT(co_await registerNativeCdcStreamClient(cx, secondName, keys) == secondId); CDCCursor unreadCursor = co_await createNativeCdcCursor(cx, secondName); ASSERT(unreadCursor.streamId == secondId); bool foundUnread = false; @@ -556,7 +557,7 @@ struct NativeCdcWorkload : TestWorkload { ASSERT(!(co_await hasRetiredTagPopState(cx, liveTag))); if (g_network->isSimulated()) { - CLIENT_KNOBS->ENABLE_NATIVE_CDC = false; + (const_cast(CLIENT_KNOBS))->ENABLE_NATIVE_CDC = false; const int32_t disabledResolverCount = (co_await getDatabaseConfiguration(cx)).getDesiredResolvers() + 1; const uint64_t 
recoveryBeforeDisable = dbInfo->get().recoveryCount; co_await changeResolverCount(cx, disabledResolverCount); From e4d8604e2974f8f79f6626ba1ee94adeaa84cb46 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Wed, 27 May 2026 19:16:08 -0700 Subject: [PATCH 47/56] Add fdbserver_cdcproxy_test target --- fdbserver/cdcproxy/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fdbserver/cdcproxy/CMakeLists.txt b/fdbserver/cdcproxy/CMakeLists.txt index 63019e0fa9e..ebb290ce37d 100644 --- a/fdbserver/cdcproxy/CMakeLists.txt +++ b/fdbserver/cdcproxy/CMakeLists.txt @@ -5,6 +5,10 @@ add_fdbserver_link_test(fdbserver_cdcproxylinktest fdbserver_cdcproxy fdbserver_logsystem fdbserver_core) +add_fdbserver_unit_test(fdbserver_cdcproxy_test cdcproxy + fdbserver_cdcproxy + fdbserver_logsystem + fdbserver_core) configure_fdbserver_common_includes(fdbserver_cdcproxy) target_include_directories(fdbserver_cdcproxy From 65e7d25a72cfd5c605354da23e414bff350bc659 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Wed, 27 May 2026 19:38:37 -0700 Subject: [PATCH 48/56] Add end-to-end native CDC simulation coverage with overlapping streams and attrition --- fdbserver/workloads/NativeCdcEndToEnd.cpp | 221 ++++++++++++++++++++++ tests/CMakeLists.txt | 1 + tests/fast/NativeCdcEndToEnd.toml | 31 +++ 3 files changed, 253 insertions(+) create mode 100644 fdbserver/workloads/NativeCdcEndToEnd.cpp create mode 100644 tests/fast/NativeCdcEndToEnd.toml diff --git a/fdbserver/workloads/NativeCdcEndToEnd.cpp b/fdbserver/workloads/NativeCdcEndToEnd.cpp new file mode 100644 index 00000000000..fdad1d8de3c --- /dev/null +++ b/fdbserver/workloads/NativeCdcEndToEnd.cpp @@ -0,0 +1,221 @@ +/* + * NativeCdcEndToEnd.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2026 Apple Inc. 
and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include "fdbclient/NativeCdc.h" +#include "fdbserver/tester/workloads.h" +#include "flow/DeterministicRandom.h" + +struct NativeCdcEndToEndWorkload : TestWorkload { + static constexpr auto NAME = "NativeCdcEndToEnd"; + + struct ExpectedWrite { + Version deadline; + bool observed = false; + }; + + struct StreamState { + Key name; + KeyRange keys; + CDCCursor cursor; + std::map, ExpectedWrite> expected; + }; + + int initialStreamCount; + int minStreamCount; + int maxStreamCount; + int keyCount; + int writesPerRound; + int rounds; + double drainProbability; + double delayBetweenRounds; + double operationTimeout; + int nextStreamNumber = 0; + std::vector streams; + + explicit NativeCdcEndToEndWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { + initialStreamCount = getOption(options, "initialStreamCount"_sr, 12); + minStreamCount = getOption(options, "minStreamCount"_sr, 6); + maxStreamCount = getOption(options, "maxStreamCount"_sr, 20); + keyCount = getOption(options, "keyCount"_sr, 16); + writesPerRound = getOption(options, "writesPerRound"_sr, 5); + rounds = getOption(options, "rounds"_sr, 30); + drainProbability = getOption(options, "drainProbability"_sr, 0.25); + delayBetweenRounds = getOption(options, "delayBetweenRounds"_sr, 0.5); + operationTimeout = getOption(options, "operationTimeout"_sr, 120.0); + 
ASSERT(minStreamCount >= 1); + ASSERT(initialStreamCount >= minStreamCount); + ASSERT(maxStreamCount >= initialStreamCount); + ASSERT(keyCount >= 2); + ASSERT(writesPerRound >= 1 && writesPerRound <= keyCount); + } + + Future setup(Database const& cx) override { return Void(); } + + Future start(Database const& cx) override { + if (clientId != 0) { + return Void(); + } + return run(cx); + } + + Future check(Database const& cx) override { return true; } + + void getMetrics(std::vector& m) override {} + + Key keyForIndex(int index) const { return Key(StringRef(format("native-cdc-e2e/data/%04d", index))); } + + KeyRange randomOverlappingRange() const { + const int middle = keyCount / 2; + const int begin = deterministicRandom()->randomInt(0, middle + 1); + const int end = deterministicRandom()->randomInt(middle + 1, keyCount + 1); + return KeyRange(KeyRangeRef(keyForIndex(begin), keyForIndex(end))); + } + + Future writeValues(Database cx, std::vector> values) { + Transaction tr(cx); + while (true) { + Error err; + try { + for (const auto& [key, value] : values) { + tr.set(key, value); + } + co_await tr.commit(); + co_return tr.getCommittedVersion(); + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } + } + + Future addStream(Database cx) { + StreamState stream; + stream.name = Key(StringRef(format("native-cdc-e2e/stream/%04d", nextStreamNumber++))); + stream.keys = randomOverlappingRange(); + co_await timeoutError(registerNativeCdcStreamClient(cx, stream.name, stream.keys), operationTimeout); + stream.cursor = co_await timeoutError(createNativeCdcCursor(cx, stream.name), operationTimeout); + streams.push_back(std::move(stream)); + } + + void recordExpectedWrites(std::vector> const& values, Version committedVersion) { + for (auto& stream : streams) { + for (const auto& [key, value] : values) { + if (stream.keys.contains(key)) { + const auto inserted = + stream.expected.emplace(std::make_pair(key, value), ExpectedWrite{ committedVersion }); + 
ASSERT(inserted.second); + } + } + } + } + + Future drainThrough(Database cx, StreamState* stream, Version throughVersion) { + const double deadline = now() + operationTimeout; + while (stream->cursor.lastConsumedVersion < throughVersion) { + const Version previous = stream->cursor.lastConsumedVersion; + CDCConsumeReply reply = co_await timeoutError(consumeNativeCdcStream(cx, stream->cursor), operationTimeout); + if (reply.lastConsumedVersion == previous) { + ASSERT(now() < deadline); + co_await delay(0.1); + continue; + } + ASSERT(reply.lastConsumedVersion > previous); + for (const auto& versioned : reply.mutations) { + ASSERT(versioned.version > previous); + ASSERT(versioned.version <= reply.lastConsumedVersion); + for (const auto& mutation : versioned.mutations) { + ASSERT(mutation.type == MutationRef::SetValue); + ASSERT(stream->keys.contains(mutation.param1)); + const auto found = + stream->expected.find(std::make_pair(Key(mutation.param1), Value(mutation.param2))); + ASSERT(found != stream->expected.end()); + found->second.observed = true; + } + } + stream->cursor.lastConsumedVersion = reply.lastConsumedVersion; + } + for (const auto& expected : stream->expected) { + if (expected.second.deadline <= throughVersion) { + ASSERT(expected.second.observed); + } + } + co_await timeoutError(acknowledgeNativeCdcStreamClient(cx, stream->cursor), operationTimeout); + } + + Future removeStream(Database cx, int index, Version throughVersion) { + ASSERT(index > 0); + co_await drainThrough(cx, &streams[index], throughVersion); + co_await timeoutError(removeNativeCdcStreamClient(cx, streams[index].name), operationTimeout); + streams.erase(streams.begin() + index); + } + + Future run(Database cx) { + for (int i = 0; i < initialStreamCount; ++i) { + co_await addStream(cx); + } + + Version mostRecentWrite = invalidVersion; + for (int round = 0; round < rounds; ++round) { + if (round > 0 && static_cast(streams.size()) > minStreamCount && + (round % 3 == 0 || 
deterministicRandom()->random01() < 0.35)) { + const int removalIndex = deterministicRandom()->randomInt(1, static_cast(streams.size())); + co_await removeStream(cx, removalIndex, mostRecentWrite); + } + if (static_cast(streams.size()) < maxStreamCount && + (round % 2 == 0 || deterministicRandom()->random01() < 0.35)) { + co_await addStream(cx); + } + + std::set chosenKeys{ keyCount / 2 }; + while (static_cast(chosenKeys.size()) < writesPerRound) { + chosenKeys.insert(deterministicRandom()->randomInt(0, keyCount)); + } + std::vector> values; + for (int index : chosenKeys) { + values.emplace_back(keyForIndex(index), Value(StringRef(format("round/%04d/key/%04d", round, index)))); + } + mostRecentWrite = co_await writeValues(cx, values); + recordExpectedWrites(values, mostRecentWrite); + + // streams[0] intentionally stays behind while other streams are removed. + for (int i = 1; i < static_cast(streams.size()); ++i) { + if (deterministicRandom()->random01() < drainProbability) { + co_await drainThrough(cx, &streams[i], mostRecentWrite); + } + } + co_await delay(delayBetweenRounds); + } + + for (auto& stream : streams) { + co_await drainThrough(cx, &stream, mostRecentWrite); + } + while (!streams.empty()) { + co_await timeoutError(removeNativeCdcStreamClient(cx, streams.back().name), operationTimeout); + streams.pop_back(); + } + } +}; + +WorkloadFactory NativeCdcEndToEndWorkloadFactory; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 595b7e7e74f..6ca195c6302 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -198,6 +198,7 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES fast/RandomUnitTests.toml) add_fdb_test(TEST_FILES fast/RangeLocking.toml) add_fdb_test(TEST_FILES fast/NativeCdc.toml) + add_fdb_test(TEST_FILES fast/NativeCdcEndToEnd.toml) add_fdb_test(TEST_FILES fast/NativeCdcSharedTag.toml) add_fdb_test(TEST_FILES fast/RangeLockCycle.toml) add_fdb_test(TEST_FILES fast/ReadHotDetectionCorrectness.toml IGNORE) # TODO re-enable once read hot 
detection is enabled. diff --git a/tests/fast/NativeCdcEndToEnd.toml b/tests/fast/NativeCdcEndToEnd.toml new file mode 100644 index 00000000000..28c5dbb9dfe --- /dev/null +++ b/tests/fast/NativeCdcEndToEnd.toml @@ -0,0 +1,31 @@ +[configuration] +config = 'triple' +singleRegion = true + +[[knobs]] +enable_native_cdc = true + +[[test]] +testTitle = 'NativeCdcEndToEnd' +useDB = true +waitForQuiescenceEnd = false +timeout = 300 + + [[test.workload]] + testName = 'NativeCdcEndToEnd' + initialStreamCount = 12 + minStreamCount = 6 + maxStreamCount = 20 + keyCount = 16 + writesPerRound = 5 + rounds = 30 + drainProbability = 0.25 + delayBetweenRounds = 0.5 + operationTimeout = 120.0 + + [[test.workload]] + testName = 'Attrition' + machinesToKill = 3 + machinesToLeave = 3 + reboot = true + testDuration = 20.0 From 3ff5ff602d0ac315cedd7e18b019e30af0d44103 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Wed, 27 May 2026 19:41:48 -0700 Subject: [PATCH 49/56] Remove hard-coded configuration for NativeCdcEndToEnd.toml --- tests/fast/NativeCdcEndToEnd.toml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/fast/NativeCdcEndToEnd.toml b/tests/fast/NativeCdcEndToEnd.toml index 28c5dbb9dfe..fce06e0c73b 100644 --- a/tests/fast/NativeCdcEndToEnd.toml +++ b/tests/fast/NativeCdcEndToEnd.toml @@ -1,7 +1,3 @@ -[configuration] -config = 'triple' -singleRegion = true - [[knobs]] enable_native_cdc = true From 0761ac6b3c857984642e262d15e8b35e9d9cbdb5 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Wed, 27 May 2026 21:58:37 -0700 Subject: [PATCH 50/56] Fix native CDC failover assignment repair and acknowledged log popping across configurations --- fdbclient/NativeCdc.cpp | 13 ++++++----- fdbserver/cdcproxy/CDCProxy.cpp | 23 +++++++++++++++---- .../ClusterController.actor.cpp | 3 ++- fdbserver/logsystem/LogSystemConsumer.cpp | 16 +++++++++---- fdbserver/workloads/NativeCdcEndToEnd.cpp | 3 +++ 5 files changed, 43 insertions(+), 15 deletions(-) diff --git 
a/fdbclient/NativeCdc.cpp b/fdbclient/NativeCdc.cpp index ce9709c2f7e..cd71f9204f0 100644 --- a/fdbclient/NativeCdc.cpp +++ b/fdbclient/NativeCdc.cpp @@ -551,7 +551,7 @@ Future acknowledgeNativeCdcStream(Database cx, CDCStreamId streamId, Ve const Version readVersion = co_await tr.getReadVersion(); if (consumedThrough > readVersion) { - throw client_invalid_operation(); + throw future_version(); } const Version minVersion = decodeCDCMinVersionValue(minVersionValue.get()); @@ -577,7 +577,8 @@ Future registerNativeCdcStreamClient(Database cx, Key name, KeyRang while (true) { CDCProxyInterface proxy = co_await getAvailableNativeCdcProxy(cx, previousProxy); try { - CDCRegisterStreamReply reply = co_await proxy.registerStream.getReply(CDCRegisterStreamRequest(name, keys)); + CDCRegisterStreamReply reply = + co_await throwErrorOr(proxy.registerStream.tryGetReply(CDCRegisterStreamRequest(name, keys))); co_return reply.streamId; } catch (Error& error) { if (!retryNativeCdcProxyRequest(error)) { @@ -596,7 +597,7 @@ Future> listNativeCdcStreamsClient(Database cx) while (true) { CDCProxyInterface proxy = co_await getAvailableNativeCdcProxy(cx, previousProxy); try { - CDCListStreamsReply reply = co_await proxy.listStreams.getReply(CDCListStreamsRequest()); + CDCListStreamsReply reply = co_await throwErrorOr(proxy.listStreams.tryGetReply(CDCListStreamsRequest())); std::vector streams; streams.reserve(reply.streams.size()); for (const auto& stream : reply.streams) { @@ -633,7 +634,7 @@ Future removeNativeCdcStreamClient(Database cx, Key name) { co_return; } try { - co_await proxy.get().removeStream.getReply(CDCRemoveStreamRequest(name)); + co_await throwErrorOr(proxy.get().removeStream.tryGetReply(CDCRemoveStreamRequest(name))); co_return; } catch (Error& error) { if (!retryNativeCdcProxyRequest(error)) { @@ -655,7 +656,7 @@ Future consumeNativeCdcStream(Database cx, CDCCursor cursor) { while (true) { CDCProxyInterface proxy = co_await getNativeCdcStreamProxy(cx, 
cursor.streamId); try { - co_return co_await proxy.consume.getReply(CDCConsumeRequest(cursor)); + co_return co_await throwErrorOr(proxy.consume.tryGetReply(CDCConsumeRequest(cursor))); } catch (Error& error) { if (!retryNativeCdcProxyRequest(error)) { throw; @@ -675,7 +676,7 @@ Future acknowledgeNativeCdcStreamClient(Database cx, CDCCursor cursor) { while (true) { CDCProxyInterface proxy = co_await getNativeCdcStreamProxy(cx, cursor.streamId); try { - co_await proxy.ack.getReply(CDCAckRequest(cursor.streamId, cursor.lastConsumedVersion)); + co_await throwErrorOr(proxy.ack.tryGetReply(CDCAckRequest(cursor.streamId, cursor.lastConsumedVersion))); co_return; } catch (Error& error) { if (!retryNativeCdcProxyRequest(error)) { diff --git a/fdbserver/cdcproxy/CDCProxy.cpp b/fdbserver/cdcproxy/CDCProxy.cpp index b1abded62eb..d20bb03794c 100644 --- a/fdbserver/cdcproxy/CDCProxy.cpp +++ b/fdbserver/cdcproxy/CDCProxy.cpp @@ -100,6 +100,7 @@ struct CDCProxyData { Reference>> logSystem; std::map> streams; std::map> tags; + AsyncTrigger popAcknowledgedDataTrigger; FlowLock bufferLock; int64_t bufferedBytes = 0; @@ -692,6 +693,19 @@ Future popAcknowledgedData(CDCProxyData* self) { co_await clearCompletedRetiredTagPops(self->cx, std::move(completedPopVersions)); } +Future monitorAcknowledgedDataPops(CDCProxyData* self) { + co_await self->popAcknowledgedDataTrigger.onTrigger(); + while (true) { + // Pop completion may wait on an unavailable log generation. A new acknowledgement or log-system + // configuration supersedes that attempt and retries the durable work against current state. 
+ Future retriggered = self->popAcknowledgedDataTrigger.onTrigger(); + auto result = co_await race(popAcknowledgedData(self), retriggered); + if (result.index() == 0) { + co_await self->popAcknowledgedDataTrigger.onTrigger(); + } + } +} + void reconcileStreams(CDCProxyData* self, ActorCollection* actors) { std::set assignedStreams; for (const auto& [streamId, proxyId] : self->dbInfo->get().client.streamToCDCProxyId) { @@ -795,7 +809,7 @@ Future acknowledge(CDCProxyData* self, CDCAckRequest request) { } ASSERT(found->second->bufferedBytes >= 0); } - co_await popAcknowledgedData(self); + self->popAcknowledgedDataTrigger.trigger(); request.reply.send(Void()); } catch (Error& e) { if (e.code() == error_code_actor_cancelled) { @@ -821,7 +835,7 @@ Future removeStream(CDCProxyData* self, CDCRemoveStreamRequest request) { try { Optional removed = co_await removeNativeCdcStream(self->cx, request.name, self->id); if (removed.present()) { - co_await popAcknowledgedData(self); + self->popAcknowledgedDataTrigger.trigger(); } request.reply.send(Void()); } catch (Error& e) { @@ -865,7 +879,8 @@ Future cdcProxyServer(CDCProxyInterface proxy, actors.add(traceRole(Role::CDC_PROXY, proxy.id())); self.logSystem->set(makeLogSystemConsumerFromServerDBInfo(self.id, dbInfo->get())); reconcileStreams(&self, &actors); - actors.add(popAcknowledgedData(&self)); + actors.add(monitorAcknowledgedDataPops(&self)); + self.popAcknowledgedDataTrigger.trigger(); Future dbInfoChange = dbInfo->onChange(); bool hasBeenPublished = std::find(dbInfo->get().client.cdcProxies.begin(), dbInfo->get().client.cdcProxies.end(), proxy) != @@ -915,7 +930,7 @@ Future cdcProxyServer(CDCProxyInterface proxy, self.logSystem->set(makeLogSystemConsumerFromServerDBInfo(self.id, dbInfo->get())); } reconcileStreams(&self, &actors); - actors.add(popAcknowledgedData(&self)); + self.popAcknowledgedDataTrigger.trigger(); dbInfoChange = dbInfo->onChange(); break; } diff --git 
a/fdbserver/clustercontroller/ClusterController.actor.cpp b/fdbserver/clustercontroller/ClusterController.actor.cpp index 8005d98540f..7b0cbb27c26 100644 --- a/fdbserver/clustercontroller/ClusterController.actor.cpp +++ b/fdbserver/clustercontroller/ClusterController.actor.cpp @@ -2219,8 +2219,9 @@ Future monitorCDCProxyAssignments(ClusterControllerData::DBInfo* db) { } Future assignmentChangeFuture = tr.watch(cdcProxyAssignmentChangeKey); + Future endpointChangeFuture = db->clientInfo->onChange(); co_await tr.commit(); - co_await assignmentChangeFuture; + co_await (assignmentChangeFuture || endpointChangeFuture); break; } catch (Error& e) { err = e; diff --git a/fdbserver/logsystem/LogSystemConsumer.cpp b/fdbserver/logsystem/LogSystemConsumer.cpp index 5bf5d8539a2..bada5d89ed1 100644 --- a/fdbserver/logsystem/LogSystemConsumer.cpp +++ b/fdbserver/logsystem/LogSystemConsumer.cpp @@ -5,6 +5,16 @@ #include "flow/genericactors.actor.h" +namespace { +bool shouldPopFromLogSet(Reference const& logSet, Tag tag, int8_t popLocality) { + // CDC tags are replicated to each TLog set. Once a version is acknowledged, every copy can be discarded; + // leaving remote copies unpopped can retain old log generations across failover. 
+ return logSet->locality == tagLocalitySpecial || logSet->locality == tag.locality || + tag.locality == tagLocalityCDC || + (tag.locality < 0 && ((popLocality == tagLocalityInvalid) == logSet->isLocal)); +} +} // namespace + Reference LogSystemConsumer::peekAll(UID dbgid, Version begin, Version end, @@ -885,8 +895,7 @@ void LogSystemConsumer::pop(Version upTo, Tag tag, Version durableKnownCommitted return; } for (auto& t : ls.tLogs) { - if (t->locality == tagLocalitySpecial || t->locality == tag.locality || - (tag.locality < 0 && ((popLocality == tagLocalityInvalid) == t->isLocal))) { + if (shouldPopFromLogSet(t, tag, popLocality)) { for (auto& log : t->logServers) { Version prev = ls.outstandingPops[std::make_pair(log->get().id(), tag)].first; if (prev < upTo) { @@ -908,8 +917,7 @@ Future LogSystemConsumer::waitForPopped(Version upTo, Tag tag, int8_t popL while (true) { std::vector> poppedFutures; for (auto& t : logSystem->tLogs) { - if (t->locality == tagLocalitySpecial || t->locality == tag.locality || - (tag.locality < 0 && ((popLocality == tagLocalityInvalid) == t->isLocal))) { + if (shouldPopFromLogSet(t, tag, popLocality)) { for (auto& log : t->logServers) { poppedFutures.push_back(LogSystem::getPoppedFromTLog(log, tag)); } diff --git a/fdbserver/workloads/NativeCdcEndToEnd.cpp b/fdbserver/workloads/NativeCdcEndToEnd.cpp index fdad1d8de3c..0533480dd5c 100644 --- a/fdbserver/workloads/NativeCdcEndToEnd.cpp +++ b/fdbserver/workloads/NativeCdcEndToEnd.cpp @@ -71,6 +71,9 @@ struct NativeCdcEndToEndWorkload : TestWorkload { ASSERT(writesPerRound >= 1 && writesPerRound <= keyCount); } + // RandomRangeLock can outlive this bounded CDC workload and mask its progress check. 
+ void disableFailureInjectionWorkloads(std::set& out) const override { out.insert("RandomRangeLock"); } + Future setup(Database const& cx) override { return Void(); } Future start(Database const& cx) override { From db90538abfe84c8d4cf2d4ef9ea27309e7e6b082 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Thu, 28 May 2026 14:23:47 -0700 Subject: [PATCH 51/56] Fix native CDC recovery and replication across failure topologies --- design/cdc.md | 77 +++++++---- fdbclient/NativeCdc.cpp | 120 +++++++++++------- fdbclient/include/fdbclient/NativeCdc.h | 32 ++++- fdbserver/cdcproxy/CDCProxy.cpp | 108 +++++++++++----- .../ClusterController.actor.cpp | 36 +++--- .../clustercontroller/ClusterRecovery.cpp | 1 + fdbserver/logsystem/LogSet.cpp | 10 +- fdbserver/logsystem/LogSystem.cpp | 20 ++- .../fdbserver/logsystem/LogSystemTypes.h | 6 +- fdbserver/workloads/NativeCdc.cpp | 90 +++++++------ fdbserver/workloads/NativeCdcEndToEnd.cpp | 21 ++- tests/fast/NativeCdcEndToEnd.toml | 11 +- 12 files changed, 346 insertions(+), 186 deletions(-) diff --git a/design/cdc.md b/design/cdc.md index f1c592d8b10..3749a6b51c8 100644 --- a/design/cdc.md +++ b/design/cdc.md @@ -4,7 +4,7 @@ Native Change Data Capture (CDC) provides a FoundationDB-native mechanism for reading committed mutations for a registered key range. A client registers a -named stream, creates a cursor for that name, consumes batches of mutations, +named stream, creates a consumer for that name, consumes batches of mutations, and acknowledges processed versions. The implementation persists enough state to retain unread TLog data and to resume stream service after CDC proxy failure or transaction-system recovery. @@ -35,7 +35,7 @@ and release its own log history without changing user data storage. Native CDC is intended to provide: * Durable, named registrations for key ranges in normal user key space. 
-* A cursor-based API in which a consumer only needs a stream name after +* A consumer API in which a client only needs a stream name after registration, rather than repeating its registered range on every read. * Ordered mutation batches identified by FoundationDB commit versions. * Durable acknowledgements that determine how much CDC-tagged TLog history may @@ -67,9 +67,8 @@ Future registerNativeCdcStreamClient(Database cx, Key name, KeyRang Future removeNativeCdcStreamClient(Database cx, Key name); Future> listNativeCdcStreamsClient(Database cx); -Future createNativeCdcCursor(Database cx, Key name); -Future consumeNativeCdcStream(Database cx, CDCCursor cursor); -Future acknowledgeNativeCdcStreamClient(Database cx, CDCCursor cursor); +Future> createNativeCdcConsumer(Database cx, Key name); +Reference resumeNativeCdcConsumer(Database cx, CDCCursor position); ``` A stream registration contains: @@ -84,9 +83,26 @@ struct NativeCdcStreamInfo { ``` The durable identity of a stream is its `CDCStreamId`, not its name. Names are -used to create and manage streams. A cursor resolves the current stream ID -once, so removing a name and later registering the same name does not silently -redirect an existing consumer to a different stream. +used to create and manage streams. Creating a consumer resolves the current +stream ID once, so removing a name and later registering the same name does +not silently redirect an existing consumer to a different stream. + +`NativeCdcConsumer` is a client-side, reference-counted reader object. It +holds the client's `Database` handle and current delivered position and +exposes consumption and acknowledgement operations: + +```cpp +class NativeCdcConsumer : public ReferenceCounted { +public: + Future consume(); + Future acknowledge(); + const CDCCursor& position() const; +}; +``` + +`CDCCursor` remains a small serializable position token used by CDC proxy +requests and by callers that need to checkpoint or resume a consumer. 
It does +not contain a `Database` handle or other process-local state: ```cpp struct CDCCursor { @@ -114,27 +130,35 @@ A typical consumer loop is: ```cpp co_await registerNativeCdcStreamClient(db, "orders"_sr, KeyRangeRef("order/"_sr, "order0"_sr)); -state CDCCursor cursor = co_await createNativeCdcCursor(db, "orders"_sr); +state Reference consumer = co_await createNativeCdcConsumer(db, "orders"_sr); loop { - CDCConsumeReply reply = co_await consumeNativeCdcStream(db, cursor); + CDCConsumeReply reply = co_await consumer->consume(); for (auto const& versionedMutations : reply.mutations) { // Apply all mutations for versionedMutations.version. } - cursor.lastConsumedVersion = reply.lastConsumedVersion; - co_await acknowledgeNativeCdcStreamClient(db, cursor); + co_await consumer->acknowledge(); } ``` -The acknowledgement means that the consumer no longer requires CDC mutations -through `cursor.lastConsumedVersion`. Internally, acknowledgement advances the -stream's persisted minimum required version to `lastConsumedVersion + 1`. -Therefore the consumer must not acknowledge a returned cursor position before -it has durably processed all mutations represented through that position. -The server rejects an acknowledgement beyond its current read version, so a -consumer cannot pre-pop future mutations on a tag that may later be assigned -to another stream. +`consume()` advances `consumer->position()` to the returned delivered +position, but does not change durable retention. The acknowledgement means +that the consumer no longer requires CDC mutations through +`consumer->position().lastConsumedVersion`. Internally, acknowledgement +advances the stream's persisted minimum required version to +`lastConsumedVersion + 1`. 
Therefore the consumer must not call +`acknowledge()` before it has durably processed all mutations represented +through the delivered position, and must not issue another `consume()` if it +still needs to retry processing the previous reply from that same in-memory +consumer. A consumer restarted from its last durably checkpointed position +can use `resumeNativeCdcConsumer()`. +The server accepts an acknowledgement beyond its current transaction read +version only when the owning CDC proxy has read through that position from its +tagged log stream. A resumed consumer may reissue an acknowledgement already +represented by the durable watermark, and a replacement proxy reconciles its +in-memory frontier to that watermark. A fabricated future position cannot +pre-pop mutations that have not reached a proxy or the database read version. ### Registration and removal semantics @@ -152,7 +176,7 @@ also supplies the first retention watermark for its TLog history. release of tagged log history that was protected by the removed stream. Removal explicitly relinquishes any unread history for that stream while still respecting the retention needs of other streams sharing its tags. Stream -removal is terminal for existing cursors. Stale consume or acknowledgement +removal is terminal for existing consumers. Stale consume or acknowledgement operations return an error instead of waiting indefinitely for an owner that will never be assigned again. @@ -167,7 +191,7 @@ minimum version: TLogs must not pop tagged data that the stream may still consume. A slow consumer therefore retains its unread history rather than expiring solely because of age. -Consumption returns `transaction_too_old` when the caller supplies a cursor +Consumption returns `transaction_too_old` when a consumer supplies a cursor older than the stream's already acknowledged durable watermark. 
The proxy also treats discovery that an active stream's required tagged data has nevertheless already been popped as `transaction_too_old`; that condition indicates a @@ -249,7 +273,7 @@ than transaction state: | `\xff\x02/cdc/retiredTagPopVersion/` | `Version` | Final pop watermark required after a stream using a tag is removed. | The initial `minVersion` is written with a versionstamp at stream -registration. When a cursor acknowledges processing through version `V`, the +registration. When a consumer acknowledges processing through version `V`, the stored value advances monotonically to `V + 1`. A CDC proxy may pop tagged mutations before this watermark only when doing so is safe for every live stream sharing that tag. @@ -459,8 +483,9 @@ reduce it so shared-tag behavior is exercised frequently. The implementation is structured around the following properties: -* **Registration identity:** a cursor binds to a stream ID, so reuse of a - removed stream name cannot cause an existing consumer to read a new stream. +* **Registration identity:** a consumer's cursor binds to a stream ID, so + reuse of a removed stream name cannot cause an existing consumer to read a + new stream. * **Range correctness:** CDC proxies return only mutations within a stream's registered range, even when its tag is shared with other streams. * **Acknowledgement monotonicity:** durable minimum required versions advance @@ -511,7 +536,7 @@ simulation workloads for the end-to-end behavior. The basic native CDC workload covers: * Registering, listing, consuming, acknowledging, and removing streams. -* Name-based cursor creation and correct filtering of returned mutations. +* Name-based consumer creation and correct filtering of returned mutations. * Rejection of incompatible same-name registrations. * CDC proxy replacement and recovery of stream service. * Errors for stale consume and acknowledgement requests after removal. 
diff --git a/fdbclient/NativeCdc.cpp b/fdbclient/NativeCdc.cpp index cd71f9204f0..40d11464b27 100644 --- a/fdbclient/NativeCdc.cpp +++ b/fdbclient/NativeCdc.cpp @@ -531,8 +531,11 @@ Future reassignNativeCdcStreams(Database cx, UID oldProxyId, UID newProxyI } } -Future acknowledgeNativeCdcStream(Database cx, CDCStreamId streamId, Version consumedThrough) { - if (streamId == 0 || consumedThrough < 0 || consumedThrough == std::numeric_limits::max()) { +Future acknowledgeNativeCdcStream(Database cx, + CDCStreamId streamId, + Version consumedThrough, + Version knownAvailableThrough) { + if (streamId == 0 || consumedThrough < 0 || consumedThrough >= std::numeric_limits::max() - 1) { throw client_invalid_operation(); } const Version minUnpoppedVersion = consumedThrough + 1; @@ -543,22 +546,23 @@ Future acknowledgeNativeCdcStream(Database cx, CDCStreamId streamId, Ve try { tr.setOption(FDBTransactionOptions::LOCK_AWARE); tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); Optional minVersionValue = co_await tr.get(cdcMinVersionKeyFor(streamId)); if (!minVersionValue.present()) { throw client_invalid_operation(); } - const Version readVersion = co_await tr.getReadVersion(); - if (consumedThrough > readVersion) { - throw future_version(); - } - const Version minVersion = decodeCDCMinVersionValue(minVersionValue.get()); if (minUnpoppedVersion <= minVersion) { co_return minVersion; } + const Version readVersion = co_await tr.getReadVersion(); + if (consumedThrough > readVersion && consumedThrough > knownAvailableThrough) { + throw client_invalid_operation(); + } + tr.set(cdcMinVersionKeyFor(streamId), cdcMinVersionValue(minUnpoppedVersion)); co_await tr.commit(); co_return minUnpoppedVersion; @@ -572,22 +576,8 @@ Future acknowledgeNativeCdcStream(Database cx, CDCStreamId streamId, Ve Future registerNativeCdcStreamClient(Database cx, Key name, KeyRange keys) { validateNativeCdcEnabled(); 
validateNativeCdcStream(name, keys); - Optional previousProxy; - - while (true) { - CDCProxyInterface proxy = co_await getAvailableNativeCdcProxy(cx, previousProxy); - try { - CDCRegisterStreamReply reply = - co_await throwErrorOr(proxy.registerStream.tryGetReply(CDCRegisterStreamRequest(name, keys))); - co_return reply.streamId; - } catch (Error& error) { - if (!retryNativeCdcProxyRequest(error)) { - throw; - } - previousProxy = proxy.id(); - } - co_await delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, cx->taskID); - } + const CDCProxyInterface proxy = co_await getAvailableNativeCdcProxy(cx); + co_return co_await registerNativeCdcStream(cx, name, keys, proxy.id()); } Future> listNativeCdcStreamsClient(Database cx) { @@ -597,20 +587,25 @@ Future> listNativeCdcStreamsClient(Database cx) while (true) { CDCProxyInterface proxy = co_await getAvailableNativeCdcProxy(cx, previousProxy); try { - CDCListStreamsReply reply = co_await throwErrorOr(proxy.listStreams.tryGetReply(CDCListStreamsRequest())); - std::vector streams; - streams.reserve(reply.streams.size()); - for (const auto& stream : reply.streams) { - streams.push_back( - NativeCdcStreamInfo{ Key(stream.name), stream.streamId, KeyRange(stream.keys), stream.minVersion }); + Future proxyChanged = cx->clientInfo->onChange(); + auto result = + co_await race(throwErrorOr(proxy.listStreams.tryGetReply(CDCListStreamsRequest())), proxyChanged); + if (result.index() == 0) { + CDCListStreamsReply reply = std::get<0>(std::move(result)); + std::vector streams; + streams.reserve(reply.streams.size()); + for (const auto& stream : reply.streams) { + streams.push_back(NativeCdcStreamInfo{ + Key(stream.name), stream.streamId, KeyRange(stream.keys), stream.minVersion }); + } + co_return streams; } - co_return streams; } catch (Error& error) { if (!retryNativeCdcProxyRequest(error)) { throw; } - previousProxy = proxy.id(); } + previousProxy = proxy.id(); co_await delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, cx->taskID); } } @@ -634,8 
+629,12 @@ Future removeNativeCdcStreamClient(Database cx, Key name) { co_return; } try { - co_await throwErrorOr(proxy.get().removeStream.tryGetReply(CDCRemoveStreamRequest(name))); - co_return; + Future proxyChanged = cx->clientInfo->onChange(); + auto result = co_await race( + throwErrorOr(proxy.get().removeStream.tryGetReply(CDCRemoveStreamRequest(name))), proxyChanged); + if (result.index() == 0) { + co_return; + } } catch (Error& error) { if (!retryNativeCdcProxyRequest(error)) { throw; @@ -645,48 +644,77 @@ Future removeNativeCdcStreamClient(Database cx, Key name) { } } -Future createNativeCdcCursor(Database cx, Key name) { +Future> createNativeCdcConsumer(Database cx, Key name) { validateNativeCdcEnabled(); const CDCStreamId streamId = co_await getNativeCdcStreamId(cx, name); - co_return CDCCursor(streamId, invalidVersion); + co_return makeReference(cx, CDCCursor(streamId, invalidVersion)); } -Future consumeNativeCdcStream(Database cx, CDCCursor cursor) { +Reference resumeNativeCdcConsumer(Database cx, CDCCursor position) { + validateNativeCdcEnabled(); + return makeReference(cx, position); +} + +Future NativeCdcConsumer::consumeImpl(Reference self) { validateNativeCdcEnabled(); while (true) { - CDCProxyInterface proxy = co_await getNativeCdcStreamProxy(cx, cursor.streamId); + CDCProxyInterface proxy = co_await getNativeCdcStreamProxy(self->cx, self->currentPosition.streamId); try { - co_return co_await throwErrorOr(proxy.consume.tryGetReply(CDCConsumeRequest(cursor))); + Future proxyChanged = self->cx->clientInfo->onChange(); + auto result = co_await race( + throwErrorOr(proxy.consume.tryGetReply(CDCConsumeRequest(self->currentPosition))), proxyChanged); + if (result.index() == 0) { + CDCConsumeReply reply = std::get<0>(std::move(result)); + self->knownAvailableThrough = reply.lastConsumedVersion; + self->currentPosition.lastConsumedVersion = reply.lastConsumedVersion; + co_return reply; + } } catch (Error& error) { if (!retryNativeCdcProxyRequest(error)) 
{ throw; } } - co_await delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, cx->taskID); + co_await delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, self->cx->taskID); } } -Future acknowledgeNativeCdcStreamClient(Database cx, CDCCursor cursor) { +Future NativeCdcConsumer::consume() { + return consumeImpl(Reference::addRef(this)); +} + +Future NativeCdcConsumer::acknowledgeImpl(Reference self) { validateNativeCdcEnabled(); - if (cursor.streamId == 0 || cursor.lastConsumedVersion < 0 || - cursor.lastConsumedVersion == std::numeric_limits::max()) { + if (self->currentPosition.streamId == 0 || self->currentPosition.lastConsumedVersion < 0 || + self->currentPosition.lastConsumedVersion == std::numeric_limits::max()) { throw client_invalid_operation(); } + const Version acknowledgedVersion = self->currentPosition.lastConsumedVersion; + co_await acknowledgeNativeCdcStream( + self->cx, self->currentPosition.streamId, acknowledgedVersion, self->knownAvailableThrough); while (true) { - CDCProxyInterface proxy = co_await getNativeCdcStreamProxy(cx, cursor.streamId); + CDCProxyInterface proxy = co_await getNativeCdcStreamProxy(self->cx, self->currentPosition.streamId); try { - co_await throwErrorOr(proxy.ack.tryGetReply(CDCAckRequest(cursor.streamId, cursor.lastConsumedVersion))); - co_return; + Future proxyChanged = self->cx->clientInfo->onChange(); + auto result = co_await race( + throwErrorOr(proxy.ack.tryGetReply(CDCAckRequest(self->currentPosition.streamId, acknowledgedVersion))), + proxyChanged); + if (result.index() == 0) { + co_return; + } } catch (Error& error) { if (!retryNativeCdcProxyRequest(error)) { throw; } } - co_await delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, cx->taskID); + co_await delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, self->cx->taskID); } } +Future NativeCdcConsumer::acknowledge() { + return acknowledgeImpl(Reference::addRef(this)); +} + TEST_CASE("/NativeCDC/LifecycleAllocation") { NativeCdcIdentifierAllocator allocator; auto [initialId, initialTag] = 
allocator.allocate(); diff --git a/fdbclient/include/fdbclient/NativeCdc.h b/fdbclient/include/fdbclient/NativeCdc.h index 702b4d2e534..d872e2bea62 100644 --- a/fdbclient/include/fdbclient/NativeCdc.h +++ b/fdbclient/include/fdbclient/NativeCdc.h @@ -39,6 +39,23 @@ struct NativeCdcRemovedStreamInfo { std::vector tags; }; +class NativeCdcConsumer : public ReferenceCounted { +public: + NativeCdcConsumer(Database cx, CDCCursor position) : cx(cx), currentPosition(position) {} + + Future consume(); + Future acknowledge(); + const CDCCursor& position() const { return currentPosition; } + +private: + static Future consumeImpl(Reference self); + static Future acknowledgeImpl(Reference self); + + Database cx; + CDCCursor currentPosition; + Version knownAvailableThrough = invalidVersion; +}; + // These durable metadata operations back CDCProxyInterface lifecycle requests. // Registration is knob-protected; draining and cleanup remain available for // streams persisted while native CDC was enabled. @@ -54,16 +71,21 @@ Future> listNativeCdcStreams(Database cx); // Atomically moves any streams assigned to a failed proxy to its replacement. Future reassignNativeCdcStreams(Database cx, UID oldProxyId, UID newProxyId); // Persists the exclusive unpopped watermark after consuming through a version. -Future acknowledgeNativeCdcStream(Database cx, CDCStreamId streamId, Version consumedThrough); +// knownAvailableThrough permits a consumer to acknowledge log data it has +// already received before that version is visible at a transaction read version. +Future acknowledgeNativeCdcStream(Database cx, + CDCStreamId streamId, + Version consumedThrough, + Version knownAvailableThrough = invalidVersion); // Client-facing CDC operations. These select the appropriate CDC proxy from // ClientDBInfo and retry requests when stream ownership changes. 
Future registerNativeCdcStreamClient(Database cx, Key name, KeyRange keys); Future removeNativeCdcStreamClient(Database cx, Key name); Future> listNativeCdcStreamsClient(Database cx); -// Uses the range registered for this name; consumers do not respecify it. -Future createNativeCdcCursor(Database cx, Key name); -Future consumeNativeCdcStream(Database cx, CDCCursor cursor); -Future acknowledgeNativeCdcStreamClient(Database cx, CDCCursor cursor); +// Uses the range registered for this name; consumers do not respecify it. A +// CDCCursor remains a serializable position token and does not hold Database. +Future> createNativeCdcConsumer(Database cx, Key name); +Reference resumeNativeCdcConsumer(Database cx, CDCCursor position); #endif // FDBCLIENT_NATIVECDC_H diff --git a/fdbserver/cdcproxy/CDCProxy.cpp b/fdbserver/cdcproxy/CDCProxy.cpp index d20bb03794c..8860a92c074 100644 --- a/fdbserver/cdcproxy/CDCProxy.cpp +++ b/fdbserver/cdcproxy/CDCProxy.cpp @@ -71,6 +71,7 @@ struct CDCBufferedStream : ReferenceCounted { Version minVersion = invalidVersion; Version bufferedThrough = invalidVersion; int64_t bufferedBytes = 0; + int readDemand = 0; std::vector tagIntervals; std::deque> mutations; AsyncTrigger changed; @@ -101,6 +102,7 @@ struct CDCProxyData { std::map> streams; std::map> tags; AsyncTrigger popAcknowledgedDataTrigger; + AsyncTrigger peekCapacityContended; FlowLock bufferLock; int64_t bufferedBytes = 0; @@ -296,11 +298,21 @@ void detachStreamFromTags(CDCProxyData* self, Reference strea } } +void refreshStreamTags(CDCProxyData* self, Reference stream) { + for (const auto& interval : stream->tagIntervals) { + auto tag = self->tags.find(interval.tag); + if (tag != self->tags.end()) { + tag->second->refresh.trigger(); + } + } +} + Optional nextTagReadVersion(CDCProxyData* self, Reference tag) { Optional begin; for (const CDCStreamId streamId : tag->streamIds) { auto stream = self->streams.find(streamId); - if (stream == self->streams.end() || !stream->second->active || 
!stream->second->initialized) { + if (stream == self->streams.end() || !stream->second->active || !stream->second->initialized || + stream->second->readDemand == 0) { continue; } for (const auto& interval : stream->second->tagIntervals) { @@ -320,7 +332,7 @@ void advanceTagBufferedThrough(CDCProxyData* self, Reference tag const std::vector streamIds(tag->streamIds.begin(), tag->streamIds.end()); for (const CDCStreamId streamId : streamIds) { auto stream = self->streams.find(streamId); - if (stream == self->streams.end() || !stream->second->active) { + if (stream == self->streams.end() || !stream->second->active || stream->second->readDemand == 0) { continue; } for (auto& interval : stream->second->tagIntervals) { @@ -385,7 +397,8 @@ std::map bufferMessages(CDCProxyData* self, reader >> mutation; for (const CDCStreamId streamId : tag->streamIds) { auto stream = self->streams.find(streamId); - if (stream == self->streams.end() || !stream->second->active || !stream->second->keys.present()) { + if (stream == self->streams.end() || !stream->second->active || stream->second->readDemand == 0 || + !stream->second->keys.present()) { continue; } const bool coversVersion = @@ -409,16 +422,22 @@ std::map bufferMessages(CDCProxyData* self, return batches; } +Future rotateContendedPeek(CDCProxyData* self) { + if (self->bufferLock.waiters() == 0) { + co_await self->peekCapacityContended.onTrigger(); + } + co_await delay(SERVER_KNOBS->BLOCKING_PEEK_TIMEOUT); +} + Future bufferTag(CDCProxyData* self, Reference tag) { while (tag->active) { Optional begin = nextTagReadVersion(self, tag); if (!begin.present()) { - tag->active = false; - auto current = self->tags.find(tag->tag); - if (current != self->tags.end() && current->second == tag) { - self->tags.erase(current); + auto waitForDemand = co_await race(tag->stopped.onTrigger(), tag->refresh.onTrigger()); + if (waitForDemand.index() == 0) { + co_return; } - co_return; + continue; } if (!self->logSystem->get()) { auto waitForLogSystem 
= @@ -434,6 +453,9 @@ Future bufferTag(CDCProxyData* self, Reference tag) { const int64_t peekReservation = std::min(SERVER_KNOBS->CDC_PROXY_BUFFER_BYTES, SERVER_KNOBS->MAXIMUM_PEEK_BYTES); ASSERT(peekReservation > 0); + if (self->bufferLock.available() < peekReservation) { + self->peekCapacityContended.trigger(); + } auto capacity = co_await race(self->bufferLock.take(TaskPriority::TLogPeekReply, peekReservation), self->logSystem->onChange(), tag->stopped.onTrigger(), @@ -445,11 +467,14 @@ Future bufferTag(CDCProxyData* self, Reference tag) { co_return; } FlowLock::Releaser reservation(self->bufferLock, peekReservation); + // Blocking peeks hold a response reservation. Once another tag queues for capacity, rotate this + // reader after one blocking-peek interval so an idle tag cannot monopolize the shared budget. auto result = co_await race(cursor->getMore(TaskPriority::TLogPeekReply), self->logSystem->onChange(), tag->stopped.onTrigger(), - tag->refresh.onTrigger()); - if (result.index() == 1 || result.index() == 3) { + tag->refresh.onTrigger(), + rotateContendedPeek(self)); + if (result.index() == 1 || result.index() == 3 || result.index() == 4) { break; } if (result.index() == 2) { @@ -505,12 +530,7 @@ Future bufferTag(CDCProxyData* self, Reference tag) { reservation.remaining = 0; advanceTagBufferedThrough(self, tag, cursor->version().version - 1); if (!nextTagReadVersion(self, tag).present()) { - tag->active = false; - auto current = self->tags.find(tag->tag); - if (current != self->tags.end() && current->second == tag) { - self->tags.erase(current); - } - co_return; + break; } if (cursor->isExhausted()) { break; @@ -761,9 +781,20 @@ Future consume(CDCProxyData* self, CDCConsumeRequest request) { throw transaction_too_old(); } + bool issuedReadDemand = false; + if (stream->bufferedThrough < begin) { + ++stream->readDemand; + refreshStreamTags(self, stream); + issuedReadDemand = true; + } while (stream->active && stream->bufferedThrough < begin) { co_await 
stream->changed.onTrigger(); } + if (issuedReadDemand) { + ASSERT(stream->readDemand > 0); + --stream->readDemand; + refreshStreamTags(self, stream); + } if (stream->tooOld) { throw transaction_too_old(); } @@ -793,22 +824,39 @@ Future consume(CDCProxyData* self, CDCConsumeRequest request) { Future acknowledge(CDCProxyData* self, CDCAckRequest request) { try { - co_await readCDCStreamState(self->cx, request.streamId, self->id, false); - const Version minVersion = co_await acknowledgeNativeCdcStream(self->cx, request.streamId, request.version); + if (request.version < 0 || request.version >= std::numeric_limits::max() - 1) { + throw client_invalid_operation(); + } + const CDCStreamReadState metadata = co_await readCDCStreamState(self->cx, request.streamId, self->id, false); + if (metadata.minVersion <= request.version) { + throw client_invalid_operation(); + } auto found = self->streams.find(request.streamId); - if (found != self->streams.end()) { - advanceStreamMinVersion(found->second, minVersion); - while (!found->second->mutations.empty() && found->second->mutations.front().version < minVersion) { - const int64_t releasedBytes = - sizeof(VersionedMutationsRef) + found->second->mutations.front().mutations.expectedSize(); - found->second->bufferedBytes -= releasedBytes; - ASSERT(self->bufferedBytes >= releasedBytes); - self->bufferedBytes -= releasedBytes; - self->bufferLock.release(releasedBytes); - found->second->mutations.pop_front(); - } - ASSERT(found->second->bufferedBytes >= 0); + if (found == self->streams.end()) { + throw wrong_shard_server(); } + Reference stream = found->second; + while (stream->active && !stream->initialized) { + co_await stream->changed.onTrigger(); + } + if (!stream->active) { + throw wrong_shard_server(); + } + + // The durable acknowledgement can commit before a replacement owner observes the RPC. + // Reconcile the new owner's in-memory frontier to that already verified watermark. 
+ const Version minVersion = metadata.minVersion; + advanceStreamMinVersion(stream, minVersion); + while (!stream->mutations.empty() && stream->mutations.front().version < minVersion) { + const int64_t releasedBytes = + sizeof(VersionedMutationsRef) + stream->mutations.front().mutations.expectedSize(); + stream->bufferedBytes -= releasedBytes; + ASSERT(self->bufferedBytes >= releasedBytes); + self->bufferedBytes -= releasedBytes; + self->bufferLock.release(releasedBytes); + stream->mutations.pop_front(); + } + ASSERT(stream->bufferedBytes >= 0); self->popAcknowledgedDataTrigger.trigger(); request.reply.send(Void()); } catch (Error& e) { diff --git a/fdbserver/clustercontroller/ClusterController.actor.cpp b/fdbserver/clustercontroller/ClusterController.actor.cpp index 7b0cbb27c26..bfe84fa0de6 100644 --- a/fdbserver/clustercontroller/ClusterController.actor.cpp +++ b/fdbserver/clustercontroller/ClusterController.actor.cpp @@ -637,25 +637,25 @@ Future recruitFailedCDCProxies(ClusterControllerData* self, // Endpoint publication precedes assignment publication so clients never route // a stream to a replacement that is not yet discoverable. 
self->db.recoveryData->registrationTrigger.trigger(); - while (self->db.recoveryData.isValid() && self->db.recoveryData->cstate.myDBState.recoveryCount == recoveryCount) { - bool allPublished = true; - for (const auto& [oldProxyId, newProxyId] : replacements) { - allPublished = allPublished && std::any_of(self->db.clientInfo->get().cdcProxies.begin(), - self->db.clientInfo->get().cdcProxies.end(), - [newProxyId](CDCProxyInterface const& proxy) { - return proxy.id() == newProxyId; - }); - } - if (allPublished) { - break; - } - co_await self->db.clientInfo->onChange(); - } - if (!self->db.recoveryData.isValid() || self->db.recoveryData->cstate.myDBState.recoveryCount != recoveryCount) { - co_return; - } for (const auto& [oldProxyId, newProxyId] : replacements) { - co_await reassignNativeCdcStreams(self->db.db, oldProxyId, newProxyId); + auto isCurrentProxy = [self, newProxyId]() { + return std::any_of(self->db.cdcProxies.begin(), + self->db.cdcProxies.end(), + [newProxyId](CDCProxyInterface const& proxy) { return proxy.id() == newProxyId; }); + }; + auto isPublishedProxy = [self, newProxyId]() { + return std::any_of(self->db.clientInfo->get().cdcProxies.begin(), + self->db.clientInfo->get().cdcProxies.end(), + [newProxyId](CDCProxyInterface const& proxy) { return proxy.id() == newProxyId; }); + }; + while (isCurrentProxy() && !isPublishedProxy()) { + co_await self->db.clientInfo->onChange(); + } + if (isCurrentProxy() && isPublishedProxy()) { + // Reassignment remains necessary if recovery changes while the + // replacement endpoint is being published. 
+ co_await reassignNativeCdcStreams(self->db.db, oldProxyId, newProxyId); + } } } diff --git a/fdbserver/clustercontroller/ClusterRecovery.cpp b/fdbserver/clustercontroller/ClusterRecovery.cpp index 2d2a555ae22..b6fad17a5b5 100644 --- a/fdbserver/clustercontroller/ClusterRecovery.cpp +++ b/fdbserver/clustercontroller/ClusterRecovery.cpp @@ -258,6 +258,7 @@ Future ensureCDCProxies(Reference self, RecruitFromCo std::vector newRecruits = co_await getAll(initializationReplies); TraceEvent("CDCProxyInitializationComplete", self->dbgid).log(); self->controllerData->db.cdcProxies = std::move(newRecruits); + self->registrationTrigger.trigger(); } Future newResolvers(Reference self, RecruitFromConfigurationReply recr) { diff --git a/fdbserver/logsystem/LogSet.cpp b/fdbserver/logsystem/LogSet.cpp index e8cff87116f..ac519202808 100644 --- a/fdbserver/logsystem/LogSet.cpp +++ b/fdbserver/logsystem/LogSet.cpp @@ -62,9 +62,13 @@ std::string LogSet::logServerString() { return result; } -void LogSet::populateSatelliteTagLocations(int logRouterTags, int oldLogRouterTags, int txsTags, int oldTxsTags) { +void LogSet::populateSatelliteTagLocations(int logRouterTags, + int oldLogRouterTags, + int txsTags, + int oldTxsTags, + int cdcTags) { satelliteTagLocations.clear(); - satelliteTagLocations.resize(std::max({ logRouterTags, oldLogRouterTags, txsTags, oldTxsTags }) + 1); + satelliteTagLocations.resize(std::max({ logRouterTags, oldLogRouterTags, txsTags, oldTxsTags, cdcTags }) + 1); std::map server_usedBest; std::set> used_servers; @@ -217,7 +221,7 @@ void LogSet::getPushLocations(VectorRef tags, const Optional>& restrictedLogSet) { if (locality == tagLocalitySatellite) { for (auto& t : tags) { - if (t.locality == tagLocalityTxs || t.locality == tagLocalityLogRouter) { + if (t.locality == tagLocalityTxs || t.locality == tagLocalityLogRouter || t.locality == tagLocalityCDC) { for (int loc : satelliteTagLocations[t.id + 1]) { locations.push_back(locationOffset + loc); } diff --git 
a/fdbserver/logsystem/LogSystem.cpp b/fdbserver/logsystem/LogSystem.cpp index af2c5b6f524..ecb7a684916 100644 --- a/fdbserver/logsystem/LogSystem.cpp +++ b/fdbserver/logsystem/LogSystem.cpp @@ -21,6 +21,7 @@ #include "fdbserver/logsystem/LogSystem.h" #include "fdbserver/logsystem/LogSystemConsumer.h" #include "fdbclient/FDBTypes.h" +#include "fdbclient/Knobs.h" #include "fdbserver/core/OTELSpanContextMessage.h" #include "fdbserver/core/SpanContextMessage.h" #include "flow/serialize.h" @@ -2556,6 +2557,12 @@ Future> LogSystem::newEpoch(Reference oldLogSyst for (auto& it : oldLogSystem->oldLogData) { maxTxsTags = std::max(maxTxsTags, it.txsTags); } + int maxCdcTags = CLIENT_KNOBS->NATIVE_CDC_TAG_COUNT; + for (Tag tag : allTags) { + if (tag.locality == tagLocalityCDC) { + maxCdcTags = std::max(maxCdcTags, tag.id + 1); + } + } if (region.satelliteTLogReplicationFactor > 0 && configuration.usableRegions > 1) { logSystem->tLogs.push_back(makeReference()); @@ -2584,7 +2591,7 @@ Future> LogSystem::newEpoch(Reference oldLogSyst .size()); // Dummy interfaces, so that logSystem->getPushLocations() below uses the correct size logSystem->tLogs[1]->updateLocalitySet(logSystem->tLogs[1]->tLogLocalities); logSystem->tLogs[1]->populateSatelliteTagLocations( - logSystem->logRouterTags, oldLogSystem->logRouterTags, logSystem->txsTags, maxTxsTags); + logSystem->logRouterTags, oldLogSystem->logRouterTags, logSystem->txsTags, maxTxsTags, maxCdcTags); logSystem->expectedLogSets++; } @@ -2756,6 +2763,17 @@ Future> LogSystem::newEpoch(Reference oldLogSyst std::vector sreqs(recr.satelliteTLogs.size()); std::vector satelliteTags; + for (Tag tag : allTags) { + if (tag.locality == tagLocalityCDC) { + locations.clear(); + logSystem->tLogs[1]->getPushLocations(VectorRef(&tag, 1), locations, 0); + for (int loc : locations) { + sreqs[loc].recoverTags.push_back(tag); + } + satelliteTags.push_back(tag); + } + } + if (logSystem->logRouterTags) { for (int i = 0; i < oldLogSystem->logRouterTags; i++) { 
Tag tag = Tag(tagLocalityLogRouter, i); diff --git a/fdbserver/logsystem/include/fdbserver/logsystem/LogSystemTypes.h b/fdbserver/logsystem/include/fdbserver/logsystem/LogSystemTypes.h index ca4d90f39a7..646b18bbd87 100644 --- a/fdbserver/logsystem/include/fdbserver/logsystem/LogSystemTypes.h +++ b/fdbserver/logsystem/include/fdbserver/logsystem/LogSystemTypes.h @@ -56,7 +56,11 @@ class LogSet : NonCopyable, public ReferenceCounted { bool hasLogRouter(UID id) const; bool hasBackupWorker(UID id) const; std::string logServerString(); - void populateSatelliteTagLocations(int logRouterTags, int oldLogRouterTags, int txsTags, int oldTxsTags); + void populateSatelliteTagLocations(int logRouterTags, + int oldLogRouterTags, + int txsTags, + int oldTxsTags, + int cdcTags); void checkSatelliteTagLocations(); int bestLocationFor(Tag tag); void updateLocalitySet(std::vector const& localities); diff --git a/fdbserver/workloads/NativeCdc.cpp b/fdbserver/workloads/NativeCdc.cpp index b80d33d9708..fdd1a8e7f40 100644 --- a/fdbserver/workloads/NativeCdc.cpp +++ b/fdbserver/workloads/NativeCdc.cpp @@ -145,6 +145,12 @@ struct NativeCdcWorkload : TestWorkload { } } + Future waitForNoRetiredTagPopState(Database cx, Tag tag) { + while (co_await hasRetiredTagPopState(cx, tag)) { + co_await delay(0.1); + } + } + Future appendPersistedTag(Database cx, CDCStreamId streamId, Tag tag) { Transaction tr(cx); while (true) { @@ -281,29 +287,30 @@ struct NativeCdcWorkload : TestWorkload { ASSERT(co_await registerNativeCdcStreamClient(cx, secondName, keys) == secondId); ASSERT((co_await getCDCProxy(firstId)).id() == (co_await getCDCProxy(secondId)).id()); const Version writeVersion = co_await writeValues(cx, { { "shared/unread"_sr, "protected-by-minimum"_sr } }); - CDCCursor firstCursor = co_await createNativeCdcCursor(cx, firstName); - ASSERT(firstCursor.streamId == firstId); + Reference firstConsumer = co_await createNativeCdcConsumer(cx, firstName); + ASSERT(firstConsumer->position().streamId == 
firstId); const double firstConsumeDeadline = now() + 30.0; - while (firstCursor.lastConsumedVersion < writeVersion) { - CDCConsumeReply consumed = co_await timeoutError(consumeNativeCdcStream(cx, firstCursor), 30.0); - if (consumed.lastConsumedVersion == firstCursor.lastConsumedVersion) { + while (firstConsumer->position().lastConsumedVersion < writeVersion) { + const Version previous = firstConsumer->position().lastConsumedVersion; + CDCConsumeReply consumed = co_await timeoutError(firstConsumer->consume(), 30.0); + if (consumed.lastConsumedVersion == previous) { ASSERT(now() < firstConsumeDeadline); co_await delay(0.1); continue; } - ASSERT(consumed.lastConsumedVersion > firstCursor.lastConsumedVersion); - firstCursor.lastConsumedVersion = consumed.lastConsumedVersion; + ASSERT(consumed.lastConsumedVersion > previous); } - co_await acknowledgeNativeCdcStreamClient(cx, firstCursor); + co_await firstConsumer->acknowledge(); co_await removeNativeCdcStreamClient(cx, firstName); co_await waitForCDCProxyAssignmentRemoval(firstId); - CDCCursor unreadCursor = co_await createNativeCdcCursor(cx, secondName); - ASSERT(unreadCursor.streamId == secondId); + Reference unreadConsumer = co_await createNativeCdcConsumer(cx, secondName); + ASSERT(unreadConsumer->position().streamId == secondId); bool foundUnread = false; - while (unreadCursor.lastConsumedVersion < writeVersion) { - CDCConsumeReply unread = co_await timeoutError(consumeNativeCdcStream(cx, unreadCursor), 30.0); - ASSERT(unread.lastConsumedVersion > unreadCursor.lastConsumedVersion); + while (unreadConsumer->position().lastConsumedVersion < writeVersion) { + const Version previous = unreadConsumer->position().lastConsumedVersion; + CDCConsumeReply unread = co_await timeoutError(unreadConsumer->consume(), 30.0); + ASSERT(unread.lastConsumedVersion > previous); for (const auto& versioned : unread.mutations) { for (const auto& mutation : versioned.mutations) { if (mutation.param1 == "shared/unread"_sr) { @@ -311,10 
+318,9 @@ struct NativeCdcWorkload : TestWorkload { } } } - unreadCursor.lastConsumedVersion = unread.lastConsumedVersion; } ASSERT(foundUnread); - co_await acknowledgeNativeCdcStreamClient(cx, unreadCursor); + co_await unreadConsumer->acknowledge(); co_await removeNativeCdcStreamClient(cx, secondName); co_await waitForCDCProxyAssignmentRemoval(secondId); @@ -388,14 +394,14 @@ struct NativeCdcWorkload : TestWorkload { const KeyRange liveRange(KeyRangeRef("live/"_sr, "live0"_sr)); const CDCStreamId liveStreamId = co_await registerNativeCdcStreamClient(cx, liveName, liveRange); const Tag liveTag = co_await getLatestPersistedTag(cx, liveStreamId); - CDCCursor liveCursor = co_await createNativeCdcCursor(cx, liveName); - ASSERT(liveCursor.streamId == liveStreamId); + Reference liveConsumer = co_await createNativeCdcConsumer(cx, liveName); + ASSERT(liveConsumer->position().streamId == liveStreamId); CDCProxyInterface owner = co_await getCDCProxy(liveStreamId); bool futureAcknowledgeRejected = false; try { - co_await acknowledgeNativeCdcStreamClient(cx, - CDCCursor(liveStreamId, std::numeric_limits::max() - 1)); + co_await resumeNativeCdcConsumer(cx, CDCCursor(liveStreamId, std::numeric_limits::max() - 2)) + ->acknowledge(); } catch (Error& e) { futureAcknowledgeRejected = e.code() == error_code_client_invalid_operation; } @@ -434,15 +440,15 @@ struct NativeCdcWorkload : TestWorkload { bool foundInRangeWrite = false; bool foundOutOfRangeWrite = false; const double initialConsumeDeadline = now() + 30.0; - while (liveCursor.lastConsumedVersion < writeVersion) { - CDCConsumeReply consumed = co_await timeoutError(consumeNativeCdcStream(cx, liveCursor), 30.0); - if (consumed.lastConsumedVersion == liveCursor.lastConsumedVersion) { + while (liveConsumer->position().lastConsumedVersion < writeVersion) { + const Version previous = liveConsumer->position().lastConsumedVersion; + CDCConsumeReply consumed = co_await timeoutError(liveConsumer->consume(), 30.0); + if 
(consumed.lastConsumedVersion == previous) { ASSERT(now() < initialConsumeDeadline); co_await delay(0.1); continue; } - ASSERT(consumed.lastConsumedVersion > liveCursor.lastConsumedVersion); - liveCursor.lastConsumedVersion = consumed.lastConsumedVersion; + ASSERT(consumed.lastConsumedVersion > previous); for (const auto& versioned : consumed.mutations) { for (const auto& mutation : versioned.mutations) { if (mutation.param1 == "live/in"_sr) { @@ -467,15 +473,15 @@ struct NativeCdcWorkload : TestWorkload { co_await writeValues(cx, { { "live/after-failure"_sr, "captured-after-failure"_sr } }); bool foundAfterFailureWrite = false; const double afterFailureConsumeDeadline = now() + 30.0; - while (liveCursor.lastConsumedVersion < afterFailureVersion) { - CDCConsumeReply afterFailure = co_await timeoutError(consumeNativeCdcStream(cx, liveCursor), 30.0); - if (afterFailure.lastConsumedVersion == liveCursor.lastConsumedVersion) { + while (liveConsumer->position().lastConsumedVersion < afterFailureVersion) { + const Version previous = liveConsumer->position().lastConsumedVersion; + CDCConsumeReply afterFailure = co_await timeoutError(liveConsumer->consume(), 30.0); + if (afterFailure.lastConsumedVersion == previous) { ASSERT(now() < afterFailureConsumeDeadline); co_await delay(0.1); continue; } - ASSERT(afterFailure.lastConsumedVersion > liveCursor.lastConsumedVersion); - liveCursor.lastConsumedVersion = afterFailure.lastConsumedVersion; + ASSERT(afterFailure.lastConsumedVersion > previous); for (const auto& versioned : afterFailure.mutations) { for (const auto& mutation : versioned.mutations) { if (mutation.param1 == "live/after-failure"_sr) { @@ -486,8 +492,8 @@ struct NativeCdcWorkload : TestWorkload { } ASSERT(foundAfterFailureWrite); - const Version cursorBeforeRecovery = liveCursor.lastConsumedVersion; - co_await acknowledgeNativeCdcStreamClient(cx, liveCursor); + const Version cursorBeforeRecovery = liveConsumer->position().lastConsumedVersion; + co_await 
liveConsumer->acknowledge(); ASSERT(co_await getPersistedMinVersion(cx, liveStreamId) == cursorBeforeRecovery + 1); const int32_t recoveredResolverCount = (co_await getDatabaseConfiguration(cx)).getDesiredResolvers() + 1; @@ -502,15 +508,15 @@ struct NativeCdcWorkload : TestWorkload { co_await writeValues(cx, { { "live/after-recovery"_sr, "captured-after-recovery"_sr } }); bool foundAfterRecoveryWrite = false; const double afterRecoveryConsumeDeadline = now() + 30.0; - while (liveCursor.lastConsumedVersion < afterRecoveryVersion) { - CDCConsumeReply afterRecovery = co_await timeoutError(consumeNativeCdcStream(cx, liveCursor), 30.0); - if (afterRecovery.lastConsumedVersion == liveCursor.lastConsumedVersion) { + while (liveConsumer->position().lastConsumedVersion < afterRecoveryVersion) { + const Version previous = liveConsumer->position().lastConsumedVersion; + CDCConsumeReply afterRecovery = co_await timeoutError(liveConsumer->consume(), 30.0); + if (afterRecovery.lastConsumedVersion == previous) { ASSERT(now() < afterRecoveryConsumeDeadline); co_await delay(0.1); continue; } - ASSERT(afterRecovery.lastConsumedVersion > liveCursor.lastConsumedVersion); - liveCursor.lastConsumedVersion = afterRecovery.lastConsumedVersion; + ASSERT(afterRecovery.lastConsumedVersion > previous); for (const auto& versioned : afterRecovery.mutations) { for (const auto& mutation : versioned.mutations) { if (mutation.param1 == "live/after-recovery"_sr) { @@ -521,11 +527,11 @@ struct NativeCdcWorkload : TestWorkload { } ASSERT(foundAfterRecoveryWrite); - co_await acknowledgeNativeCdcStreamClient(cx, liveCursor); - ASSERT(co_await getPersistedMinVersion(cx, liveStreamId) == liveCursor.lastConsumedVersion + 1); + co_await liveConsumer->acknowledge(); + ASSERT(co_await getPersistedMinVersion(cx, liveStreamId) == liveConsumer->position().lastConsumedVersion + 1); Future pendingConsume = recoveredOwner.consume.getReply( - CDCConsumeRequest(CDCCursor(liveStreamId, liveCursor.lastConsumedVersion 
+ 1000000))); + CDCConsumeRequest(CDCCursor(liveStreamId, std::numeric_limits::max() - 2))); co_await delay(0.1); co_await removeNativeCdcStreamClient(cx, liveName); co_await waitForCDCProxyAssignmentRemoval(liveStreamId); @@ -541,7 +547,7 @@ struct NativeCdcWorkload : TestWorkload { bool retiredConsumeRejected = false; try { - co_await timeoutError(consumeNativeCdcStream(cx, liveCursor), 30.0); + co_await timeoutError(liveConsumer->consume(), 30.0); } catch (Error& e) { retiredConsumeRejected = e.code() == error_code_client_invalid_operation; } @@ -549,12 +555,12 @@ struct NativeCdcWorkload : TestWorkload { bool retiredClientAcknowledgeRejected = false; try { - co_await timeoutError(acknowledgeNativeCdcStreamClient(cx, liveCursor), 30.0); + co_await timeoutError(liveConsumer->acknowledge(), 30.0); } catch (Error& e) { retiredClientAcknowledgeRejected = e.code() == error_code_client_invalid_operation; } ASSERT(retiredClientAcknowledgeRejected); - ASSERT(!(co_await hasRetiredTagPopState(cx, liveTag))); + co_await timeoutError(waitForNoRetiredTagPopState(cx, liveTag), 30.0); if (g_network->isSimulated()) { (const_cast(CLIENT_KNOBS))->ENABLE_NATIVE_CDC = false; diff --git a/fdbserver/workloads/NativeCdcEndToEnd.cpp b/fdbserver/workloads/NativeCdcEndToEnd.cpp index 0533480dd5c..dbed14c3730 100644 --- a/fdbserver/workloads/NativeCdcEndToEnd.cpp +++ b/fdbserver/workloads/NativeCdcEndToEnd.cpp @@ -38,7 +38,7 @@ struct NativeCdcEndToEndWorkload : TestWorkload { struct StreamState { Key name; KeyRange keys; - CDCCursor cursor; + Reference consumer; std::map, ExpectedWrite> expected; }; @@ -118,7 +118,7 @@ struct NativeCdcEndToEndWorkload : TestWorkload { stream.name = Key(StringRef(format("native-cdc-e2e/stream/%04d", nextStreamNumber++))); stream.keys = randomOverlappingRange(); co_await timeoutError(registerNativeCdcStreamClient(cx, stream.name, stream.keys), operationTimeout); - stream.cursor = co_await timeoutError(createNativeCdcCursor(cx, stream.name), 
operationTimeout); + stream.consumer = co_await timeoutError(createNativeCdcConsumer(cx, stream.name), operationTimeout); streams.push_back(std::move(stream)); } @@ -134,11 +134,11 @@ struct NativeCdcEndToEndWorkload : TestWorkload { } } - Future drainThrough(Database cx, StreamState* stream, Version throughVersion) { + Future drainThrough(StreamState* stream, Version throughVersion) { const double deadline = now() + operationTimeout; - while (stream->cursor.lastConsumedVersion < throughVersion) { - const Version previous = stream->cursor.lastConsumedVersion; - CDCConsumeReply reply = co_await timeoutError(consumeNativeCdcStream(cx, stream->cursor), operationTimeout); + while (stream->consumer->position().lastConsumedVersion < throughVersion) { + const Version previous = stream->consumer->position().lastConsumedVersion; + CDCConsumeReply reply = co_await timeoutError(stream->consumer->consume(), operationTimeout); if (reply.lastConsumedVersion == previous) { ASSERT(now() < deadline); co_await delay(0.1); @@ -157,19 +157,18 @@ struct NativeCdcEndToEndWorkload : TestWorkload { found->second.observed = true; } } - stream->cursor.lastConsumedVersion = reply.lastConsumedVersion; + co_await timeoutError(stream->consumer->acknowledge(), operationTimeout); } for (const auto& expected : stream->expected) { if (expected.second.deadline <= throughVersion) { ASSERT(expected.second.observed); } } - co_await timeoutError(acknowledgeNativeCdcStreamClient(cx, stream->cursor), operationTimeout); } Future removeStream(Database cx, int index, Version throughVersion) { ASSERT(index > 0); - co_await drainThrough(cx, &streams[index], throughVersion); + co_await drainThrough(&streams[index], throughVersion); co_await timeoutError(removeNativeCdcStreamClient(cx, streams[index].name), operationTimeout); streams.erase(streams.begin() + index); } @@ -205,14 +204,14 @@ struct NativeCdcEndToEndWorkload : TestWorkload { // streams[0] intentionally stays behind while other streams are removed. 
for (int i = 1; i < static_cast(streams.size()); ++i) { if (deterministicRandom()->random01() < drainProbability) { - co_await drainThrough(cx, &streams[i], mostRecentWrite); + co_await drainThrough(&streams[i], mostRecentWrite); } } co_await delay(delayBetweenRounds); } for (auto& stream : streams) { - co_await drainThrough(cx, &stream, mostRecentWrite); + co_await drainThrough(&stream, mostRecentWrite); } while (!streams.empty()) { co_await timeoutError(removeNativeCdcStreamClient(cx, streams.back().name), operationTimeout); diff --git a/tests/fast/NativeCdcEndToEnd.toml b/tests/fast/NativeCdcEndToEnd.toml index fce06e0c73b..3879a54c2b4 100644 --- a/tests/fast/NativeCdcEndToEnd.toml +++ b/tests/fast/NativeCdcEndToEnd.toml @@ -5,7 +5,9 @@ enable_native_cdc = true testTitle = 'NativeCdcEndToEnd' useDB = true waitForQuiescenceEnd = false -timeout = 300 +timeout = 600 +connectionFailuresDisableDuration = 1000000 +runFailureWorkloads = false [[test.workload]] testName = 'NativeCdcEndToEnd' @@ -17,11 +19,14 @@ timeout = 300 rounds = 30 drainProbability = 0.25 delayBetweenRounds = 0.5 - operationTimeout = 120.0 + operationTimeout = 500.0 [[test.workload]] testName = 'Attrition' - machinesToKill = 3 + machinesToKill = 1 machinesToLeave = 3 reboot = true testDuration = 20.0 + waitForVersion = true + allowFaultInjection = false + killDc = false From 2823578aa1406c0229b2a6b1d96b0b6ace48d75b Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Thu, 28 May 2026 14:36:25 -0700 Subject: [PATCH 52/56] Add code probes for native CDC --- fdbclient/NativeCdc.cpp | 12 ++++++++++++ fdbserver/cdcproxy/CDCProxy.cpp | 14 ++++++++++++++ .../clustercontroller/ClusterController.actor.cpp | 5 +++++ fdbserver/clustercontroller/ClusterRecovery.cpp | 5 +++++ fdbserver/logsystem/LogSet.cpp | 2 ++ fdbserver/logsystem/LogSystem.cpp | 2 ++ 6 files changed, 40 insertions(+) diff --git a/fdbclient/NativeCdc.cpp b/fdbclient/NativeCdc.cpp index 40d11464b27..2eb6857d974 100644 --- 
a/fdbclient/NativeCdc.cpp +++ b/fdbclient/NativeCdc.cpp @@ -30,6 +30,7 @@ #include "fdbclient/Knobs.h" #include "fdbclient/NativeCdc.h" #include "fdbclient/SystemData.h" +#include "flow/CodeProbe.h" #include "flow/Error.h" #include "flow/UnitTest.h" @@ -37,6 +38,7 @@ namespace { void validateNativeCdcEnabled() { if (!CLIENT_KNOBS->ENABLE_NATIVE_CDC) { + CODE_PROBE(true, "Native CDC API rejected while feature disabled", probe::decoration::rare); throw client_invalid_operation(); } } @@ -284,6 +286,7 @@ Future getNativeCdcStreamProxy(Database cx, CDCStreamId strea } } if (!(co_await nativeCdcStreamStillExists(cx, streamId))) { + CODE_PROBE(true, "Native CDC client rejected operation after stream removal"); throw client_invalid_operation(); } co_await delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, cx->taskID); @@ -346,8 +349,10 @@ Future registerNativeCdcStream(Database cx, Key name, KeyRange keys throw client_invalid_operation(); } if (proxyId.present() && !(co_await getNativeCdcProxyAssignment(&tr, streamId)).present()) { + CODE_PROBE(true, "Native CDC registration restores missing stream owner"); const Tag tag = co_await getNativeCdcCurrentTag(&tr, streamId); Optional sharedTagProxy = co_await getNativeCdcProxyAssignmentForTag(&tr, tag); + CODE_PROBE(sharedTagProxy.present(), "Native CDC shared-tag streams use one owner"); const UID selectedProxy = sharedTagProxy.present() ? 
sharedTagProxy.get() : proxyId.get(); tr.set(cdcProxyKeyFor(streamId, selectedProxy), Value()); signalNativeCdcProxyAssignmentChange(&tr); @@ -403,6 +408,7 @@ Future> removeNativeCdcStream(Database cx, const CDCStreamId streamId = decodeCDCStreamNameValue(currentId.get()); Optional assignedProxy = co_await getNativeCdcProxyAssignment(&tr, streamId); if (proxyId.present() && (!assignedProxy.present() || assignedProxy.get() != proxyId.get())) { + CODE_PROBE(true, "Native CDC rejects removal through a stale owner"); throw wrong_shard_server(); } @@ -439,6 +445,7 @@ Future> removeNativeCdcStream(Database cx, NativeCdcRemovedStreamInfo removed; removed.removalVersion = tr.getCommittedVersion(); removed.tags.assign(removedTags.begin(), removedTags.end()); + CODE_PROBE(!removed.tags.empty(), "Native CDC removal records final tagged pop work"); co_return Optional(removed); } catch (Error& e) { if (e.code() == error_code_wrong_shard_server) { @@ -520,6 +527,7 @@ Future reassignNativeCdcStreams(Database cx, UID oldProxyId, UID newProxyI } if (changed) { + CODE_PROBE(true, "Native CDC reassigns streams after proxy replacement"); signalNativeCdcProxyAssignmentChange(&tr); co_await tr.commit(); } @@ -555,11 +563,13 @@ Future acknowledgeNativeCdcStream(Database cx, const Version minVersion = decodeCDCMinVersionValue(minVersionValue.get()); if (minUnpoppedVersion <= minVersion) { + CODE_PROBE(true, "Native CDC preserves a durable duplicate acknowledgement"); co_return minVersion; } const Version readVersion = co_await tr.getReadVersion(); if (consumedThrough > readVersion && consumedThrough > knownAvailableThrough) { + CODE_PROBE(true, "Native CDC rejects unproven acknowledgement progress"); throw client_invalid_operation(); } @@ -669,10 +679,12 @@ Future NativeCdcConsumer::consumeImpl(ReferencecurrentPosition.lastConsumedVersion = reply.lastConsumedVersion; co_return reply; } + CODE_PROBE(true, "Native CDC consume retries after proxy metadata change", probe::decoration::rare); } 
catch (Error& error) { if (!retryNativeCdcProxyRequest(error)) { throw; } + CODE_PROBE(true, "Native CDC consume retries after proxy request failure", probe::decoration::rare); } co_await delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, self->cx->taskID); } diff --git a/fdbserver/cdcproxy/CDCProxy.cpp b/fdbserver/cdcproxy/CDCProxy.cpp index 8860a92c074..ffda767a13c 100644 --- a/fdbserver/cdcproxy/CDCProxy.cpp +++ b/fdbserver/cdcproxy/CDCProxy.cpp @@ -40,6 +40,7 @@ #include "fdbserver/logsystem/LogSystemConsumer.h" #include "fdbserver/logsystem/LogSystemFactory.h" #include "flow/ActorCollection.h" +#include "flow/CodeProbe.h" #include "flow/Error.h" #include "flow/UnitTest.h" #include "flow/genericactors.actor.h" @@ -159,6 +160,7 @@ Future readCDCStreamState(Database cx, RangeResult assignedProxies = co_await tr.getRange(cdcProxyRangeFor(streamId), 2); if (assignedProxies.size() != 1 || decodeCDCProxyKey(assignedProxies[0].key).second != expectedProxyId) { + CODE_PROBE(true, "CDC proxy rejects request for stream owned elsewhere"); throw wrong_shard_server(); } @@ -361,6 +363,7 @@ void markPoppedTagStreamsTooOld(CDCProxyData* self, Reference ta } } for (const auto& stream : tooOldStreams) { + CODE_PROBE(true, "CDC proxy detects unread mutations already popped from TLogs", probe::decoration::rare); TraceEvent("CDCBufferStreamTooOld", self->id) .detail("StreamId", stream->streamId) .detail("MinVersion", stream->minVersion) @@ -454,6 +457,7 @@ Future bufferTag(CDCProxyData* self, Reference tag) { std::min(SERVER_KNOBS->CDC_PROXY_BUFFER_BYTES, SERVER_KNOBS->MAXIMUM_PEEK_BYTES); ASSERT(peekReservation > 0); if (self->bufferLock.available() < peekReservation) { + CODE_PROBE(true, "CDC proxy applies shared buffer backpressure", probe::decoration::rare); self->peekCapacityContended.trigger(); } auto capacity = co_await race(self->bufferLock.take(TaskPriority::TLogPeekReply, peekReservation), @@ -493,6 +497,7 @@ Future bufferTag(CDCProxyData* self, Reference tag) { bufferedBytes 
+= batch.bufferedBytes; } if (bufferedBytes > peekReservation) { + CODE_PROBE(true, "CDC proxy reserves capacity for oversized peek batch", probe::decoration::rare); TraceEvent(SevWarn, "CDCProxyOversizedPeekBatch", self->id) .detail("Tag", tag->tag) .detail("BufferedBytes", bufferedBytes) @@ -563,6 +568,7 @@ Future initializeStream(CDCProxyData* self, Reference s tag->second->streamIds.insert(stream->streamId); actors->add(bufferTag(self, newTag)); } else { + CODE_PROBE(true, "CDC proxy shares a tag reader across streams"); tag->second->streamIds.insert(stream->streamId); tag->second->refresh.trigger(); } @@ -679,6 +685,7 @@ Future clearCompletedRetiredTagPops(Database cx, std::map co decodeCDCMinVersionValue(retiredVersionValue.get()) > completedVersion) { continue; } + CODE_PROBE(true, "CDC proxy clears completed retired tag pop metadata"); tr.clear(cdcRetiredTagPopKeyFor(tag)); tr.clear(cdcRetiredTagPopVersionKeyFor(tag)); } @@ -704,6 +711,8 @@ Future popAcknowledgedData(CDCProxyData* self) { const auto safePop = safePopVersions.find(tag); const Version version = safePop == safePopVersions.end() ? 
retiredVersion : std::min(retiredVersion, safePop->second); + CODE_PROBE(safePop != safePopVersions.end() && version < retiredVersion, + "CDC proxy defers retired tag pop behind a live shared stream"); logSystem->pop(version, tag); if (version >= retiredVersion) { co_await logSystem->waitForPopped(retiredVersion, tag); @@ -741,6 +750,8 @@ void reconcileStreams(CDCProxyData* self, ActorCollection* actors) { for (auto it = self->streams.begin(); it != self->streams.end();) { if (!assignedStreams.contains(it->first)) { + CODE_PROBE(it->second->readDemand > 0, "CDC proxy wakes pending consume when stream is unassigned"); + CODE_PROBE(true, "CDC proxy drops removed or reassigned stream state"); it->second->active = false; it->second->changed.trigger(); detachStreamFromTags(self, it->second); @@ -846,6 +857,7 @@ Future acknowledge(CDCProxyData* self, CDCAckRequest request) { // The durable acknowledgement can commit before a replacement owner observes the RPC. // Reconcile the new owner's in-memory frontier to that already verified watermark. 
const Version minVersion = metadata.minVersion; + CODE_PROBE(stream->minVersion < minVersion, "CDC proxy reconciles a durable stream acknowledgement"); advanceStreamMinVersion(stream, minVersion); while (!stream->mutations.empty() && stream->mutations.front().version < minVersion) { const int64_t releasedBytes = @@ -975,6 +987,8 @@ Future cdcProxyServer(CDCProxyInterface proxy, } hasBeenPublished = hasBeenPublished || isPublished; if (!dbInfo->get().logSystemConfig.tLogs.empty()) { + CODE_PROBE(dbInfo->get().recoveryCount > recoveryCount, + "CDC proxy refreshes its log consumer after recovery"); self.logSystem->set(makeLogSystemConsumerFromServerDBInfo(self.id, dbInfo->get())); } reconcileStreams(&self, &actors); diff --git a/fdbserver/clustercontroller/ClusterController.actor.cpp b/fdbserver/clustercontroller/ClusterController.actor.cpp index bfe84fa0de6..35adfd380b6 100644 --- a/fdbserver/clustercontroller/ClusterController.actor.cpp +++ b/fdbserver/clustercontroller/ClusterController.actor.cpp @@ -625,6 +625,7 @@ Future recruitFailedCDCProxies(ClusterControllerData* self, } *current = replacement; replacements.emplace_back(failedProxy.id(), replacement.id()); + CODE_PROBE(true, "CDC proxy is recruited after failure"); TraceEvent("CDCProxyRecruited", self->id) .detail("OldCDCProxyID", failedProxy.id()) .detail("NewCDCProxyID", replacement.id()) @@ -2176,6 +2177,8 @@ Future monitorCDCProxyAssignments(ClusterControllerData::DBInfo* db) { tr.clear(assignment.key); tr.set(cdcProxyKeyFor(streamId, resolvedProxyId), Value()); repairedAssignment = true; + CODE_PROBE( + true, "CDC stream assignment is repaired after owner loss", probe::decoration::rare); TraceEvent("CDCProxyAssignmentRepaired") .detail("StreamId", streamId) .detail("OldCDCProxyID", proxyId) @@ -2190,6 +2193,8 @@ Future monitorCDCProxyAssignments(ClusterControllerData::DBInfo* db) { } if (!streamToCDCProxyId.empty() && availableProxies.empty()) { + CODE_PROBE( + true, "CDC assignments wait while no proxy 
endpoints are published", probe::decoration::rare); Future assignmentChangeFuture = tr.watch(cdcProxyAssignmentChangeKey); Future endpointChangeFuture = db->clientInfo->onChange(); co_await tr.commit(); diff --git a/fdbserver/clustercontroller/ClusterRecovery.cpp b/fdbserver/clustercontroller/ClusterRecovery.cpp index b6fad17a5b5..7315a367141 100644 --- a/fdbserver/clustercontroller/ClusterRecovery.cpp +++ b/fdbserver/clustercontroller/ClusterRecovery.cpp @@ -236,10 +236,15 @@ Future ensureCDCProxies(Reference self, RecruitFromCo const bool hasDurableCdcState = !(co_await self->txnStateStore->readRange(cdcStreamKeys)).empty() || !(co_await self->txnStateStore->readRange(cdcRetiredTagPopKeys)).empty(); if (!CLIENT_KNOBS->ENABLE_NATIVE_CDC && !hasDurableCdcState) { + CODE_PROBE(true, "Recovery skips CDC proxies when disabled with no durable state"); self->controllerData->db.cdcProxies.clear(); co_return; } + CODE_PROBE(!CLIENT_KNOBS->ENABLE_NATIVE_CDC && hasDurableCdcState, + "Recovery recruits CDC proxies to drain disabled durable state", + probe::decoration::rare); if (!self->controllerData->db.cdcProxies.empty()) { + CODE_PROBE(true, "Recovery reuses CDC proxies while CDC state remains durable"); TraceEvent("CDCProxiesReused", self->dbgid).detail("Count", self->controllerData->db.cdcProxies.size()); co_return; } diff --git a/fdbserver/logsystem/LogSet.cpp b/fdbserver/logsystem/LogSet.cpp index ac519202808..127c2b7986b 100644 --- a/fdbserver/logsystem/LogSet.cpp +++ b/fdbserver/logsystem/LogSet.cpp @@ -21,6 +21,7 @@ #include "fdbserver/logsystem/LogSystem.h" #include "fdbclient/FDBTypes.h" +#include "flow/CodeProbe.h" std::string LogSet::logRouterString() { std::string result; @@ -222,6 +223,7 @@ void LogSet::getPushLocations(VectorRef tags, if (locality == tagLocalitySatellite) { for (auto& t : tags) { if (t.locality == tagLocalityTxs || t.locality == tagLocalityLogRouter || t.locality == tagLocalityCDC) { + CODE_PROBE(t.locality == tagLocalityCDC, "CDC mutations 
are routed to satellite TLogs"); for (int loc : satelliteTagLocations[t.id + 1]) { locations.push_back(locationOffset + loc); } diff --git a/fdbserver/logsystem/LogSystem.cpp b/fdbserver/logsystem/LogSystem.cpp index ecb7a684916..e92a6ed299d 100644 --- a/fdbserver/logsystem/LogSystem.cpp +++ b/fdbserver/logsystem/LogSystem.cpp @@ -24,6 +24,7 @@ #include "fdbclient/Knobs.h" #include "fdbserver/core/OTELSpanContextMessage.h" #include "fdbserver/core/SpanContextMessage.h" +#include "flow/CodeProbe.h" #include "flow/serialize.h" bool logSystemHasRemoteLogs(LogSystem const& logSystem) { @@ -2765,6 +2766,7 @@ Future> LogSystem::newEpoch(Reference oldLogSyst for (Tag tag : allTags) { if (tag.locality == tagLocalityCDC) { + CODE_PROBE(true, "CDC tags are recovered onto satellite TLogs"); locations.clear(); logSystem->tLogs[1]->getPushLocations(VectorRef(&tag, 1), locations, 0); for (int loc : locations) { From 1dd8b80c7b7289ab7da95938fdc181272e3bafd2 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Thu, 28 May 2026 15:52:00 -0700 Subject: [PATCH 53/56] Fix native CDC proxy reassignment wakeups and initialize streams before attrition --- .../ClusterController.actor.cpp | 7 ++++--- fdbserver/workloads/NativeCdcEndToEnd.cpp | 17 ++++++++++++----- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/fdbserver/clustercontroller/ClusterController.actor.cpp b/fdbserver/clustercontroller/ClusterController.actor.cpp index 35adfd380b6..0514d6be56b 100644 --- a/fdbserver/clustercontroller/ClusterController.actor.cpp +++ b/fdbserver/clustercontroller/ClusterController.actor.cpp @@ -2150,8 +2150,11 @@ Future monitorCDCProxyAssignments(ClusterControllerData::DBInfo* db) { tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + // Install the wakeup before reading published endpoints so an endpoint + // publication during the metadata scan cannot leave stale assignments asleep. 
+ Future endpointChangeFuture = db->clientInfo->onChange(); std::map streamToCDCProxyId; - const std::vector availableProxies = db->cdcProxies; + const std::vector availableProxies = db->clientInfo->get().cdcProxies; std::map replacementByFailedProxy; size_t replacementIndex = 0; bool repairedAssignment = false; @@ -2196,7 +2199,6 @@ Future monitorCDCProxyAssignments(ClusterControllerData::DBInfo* db) { CODE_PROBE( true, "CDC assignments wait while no proxy endpoints are published", probe::decoration::rare); Future assignmentChangeFuture = tr.watch(cdcProxyAssignmentChangeKey); - Future endpointChangeFuture = db->clientInfo->onChange(); co_await tr.commit(); co_await (assignmentChangeFuture || endpointChangeFuture); break; @@ -2224,7 +2226,6 @@ Future monitorCDCProxyAssignments(ClusterControllerData::DBInfo* db) { } Future assignmentChangeFuture = tr.watch(cdcProxyAssignmentChangeKey); - Future endpointChangeFuture = db->clientInfo->onChange(); co_await tr.commit(); co_await (assignmentChangeFuture || endpointChangeFuture); break; diff --git a/fdbserver/workloads/NativeCdcEndToEnd.cpp b/fdbserver/workloads/NativeCdcEndToEnd.cpp index dbed14c3730..51869d61517 100644 --- a/fdbserver/workloads/NativeCdcEndToEnd.cpp +++ b/fdbserver/workloads/NativeCdcEndToEnd.cpp @@ -74,7 +74,12 @@ struct NativeCdcEndToEndWorkload : TestWorkload { // RandomRangeLock can outlive this bounded CDC workload and mask its progress check. 
void disableFailureInjectionWorkloads(std::set& out) const override { out.insert("RandomRangeLock"); } - Future setup(Database const& cx) override { return Void(); } + Future setup(Database const& cx) override { + if (clientId != 0) { + return Void(); + } + return initializeStreams(cx); + } Future start(Database const& cx) override { if (clientId != 0) { @@ -122,6 +127,12 @@ struct NativeCdcEndToEndWorkload : TestWorkload { streams.push_back(std::move(stream)); } + Future initializeStreams(Database cx) { + for (int i = 0; i < initialStreamCount; ++i) { + co_await addStream(cx); + } + } + void recordExpectedWrites(std::vector> const& values, Version committedVersion) { for (auto& stream : streams) { for (const auto& [key, value] : values) { @@ -174,10 +185,6 @@ struct NativeCdcEndToEndWorkload : TestWorkload { } Future run(Database cx) { - for (int i = 0; i < initialStreamCount; ++i) { - co_await addStream(cx); - } - Version mostRecentWrite = invalidVersion; for (int round = 0; round < rounds; ++round) { if (round > 0 && static_cast(streams.size()) > minStreamCount && From 93065de7ecf3db78bef82b40190a61e2d4ad2ac2 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Thu, 28 May 2026 16:11:44 -0700 Subject: [PATCH 54/56] Fix native CDC CI linkage and allow draining streams after disablement --- design/cdc.md | 14 +++-- fdbclient/NativeCdc.cpp | 8 +-- fdbclient/include/fdbclient/NativeCdc.h | 5 +- fdbserver/CMakeLists.txt | 2 +- .../ClusterController.actor.cpp | 21 ++++---- fdbserver/workloads/NativeCdc.cpp | 52 ++++++++++++++++++- 6 files changed, 74 insertions(+), 28 deletions(-) diff --git a/design/cdc.md b/design/cdc.md index 3749a6b51c8..dbabc64b155 100644 --- a/design/cdc.md +++ b/design/cdc.md @@ -469,11 +469,12 @@ cluster role. `ENABLE_NATIVE_CDC` defaults to false. In simulation it may be randomly enabled under buggification; workloads that depend on CDC set it explicitly. -The feature knob gates client admission to native CDC operations. 
Internal -cleanup and recovery paths remain capable of handling durable CDC state that -was created while the feature was enabled. This is necessary because disabling -new use of a feature cannot safely abandon log-retention obligations for -already registered or recently removed streams. +The feature knob gates new stream registration. Listing, consumer creation and +resume, consumption, acknowledgement, and removal remain available for +streams persisted while native CDC was enabled. Internal cleanup and recovery +paths likewise continue handling durable CDC state. This is necessary because +disabling new use of a feature cannot safely abandon log-retention obligations +for already registered or recently removed streams. `NATIVE_CDC_TAG_COUNT` controls the bounded tag pool used for new stream allocation. Normal operation defaults to a larger tag pool; simulation may @@ -541,6 +542,9 @@ The basic native CDC workload covers: * CDC proxy replacement and recovery of stream service. * Errors for stale consume and acknowledgement requests after removal. * Creation and eventual collection of retired final-pop state. +* Disabling native CDC while a live stream remains, rejecting new + registration while allowing recovery, consumption, acknowledgement, and + removal to drain the persisted stream. * Recovery with native CDC disabled after the last stream and final-pop work have drained, verifying that no CDC proxy remains required. 
diff --git a/fdbclient/NativeCdc.cpp b/fdbclient/NativeCdc.cpp index 2eb6857d974..2690abd38df 100644 --- a/fdbclient/NativeCdc.cpp +++ b/fdbclient/NativeCdc.cpp @@ -38,7 +38,7 @@ namespace { void validateNativeCdcEnabled() { if (!CLIENT_KNOBS->ENABLE_NATIVE_CDC) { - CODE_PROBE(true, "Native CDC API rejected while feature disabled", probe::decoration::rare); + CODE_PROBE(true, "Native CDC registration rejected while feature disabled", probe::decoration::rare); throw client_invalid_operation(); } } @@ -591,7 +591,6 @@ Future registerNativeCdcStreamClient(Database cx, Key name, KeyRang } Future> listNativeCdcStreamsClient(Database cx) { - validateNativeCdcEnabled(); Optional previousProxy; while (true) { @@ -621,7 +620,6 @@ Future> listNativeCdcStreamsClient(Database cx) } Future removeNativeCdcStreamClient(Database cx, Key name) { - validateNativeCdcEnabled(); if (name.empty()) { throw client_invalid_operation(); } @@ -655,18 +653,15 @@ Future removeNativeCdcStreamClient(Database cx, Key name) { } Future> createNativeCdcConsumer(Database cx, Key name) { - validateNativeCdcEnabled(); const CDCStreamId streamId = co_await getNativeCdcStreamId(cx, name); co_return makeReference(cx, CDCCursor(streamId, invalidVersion)); } Reference resumeNativeCdcConsumer(Database cx, CDCCursor position) { - validateNativeCdcEnabled(); return makeReference(cx, position); } Future NativeCdcConsumer::consumeImpl(Reference self) { - validateNativeCdcEnabled(); while (true) { CDCProxyInterface proxy = co_await getNativeCdcStreamProxy(self->cx, self->currentPosition.streamId); try { @@ -695,7 +690,6 @@ Future NativeCdcConsumer::consume() { } Future NativeCdcConsumer::acknowledgeImpl(Reference self) { - validateNativeCdcEnabled(); if (self->currentPosition.streamId == 0 || self->currentPosition.lastConsumedVersion < 0 || self->currentPosition.lastConsumedVersion == std::numeric_limits::max()) { throw client_invalid_operation(); diff --git a/fdbclient/include/fdbclient/NativeCdc.h 
b/fdbclient/include/fdbclient/NativeCdc.h index d872e2bea62..0cb13d64b88 100644 --- a/fdbclient/include/fdbclient/NativeCdc.h +++ b/fdbclient/include/fdbclient/NativeCdc.h @@ -78,8 +78,9 @@ Future acknowledgeNativeCdcStream(Database cx, Version consumedThrough, Version knownAvailableThrough = invalidVersion); -// Client-facing CDC operations. These select the appropriate CDC proxy from -// ClientDBInfo and retry requests when stream ownership changes. +// Client-facing CDC operations. Registration is feature gated; the remaining +// operations stay available so existing durable streams can be drained after +// the feature is disabled. Requests retry when stream ownership changes. Future registerNativeCdcStreamClient(Database cx, Key name, KeyRange keys); Future removeNativeCdcStreamClient(Database cx, Key name); Future> listNativeCdcStreamsClient(Database cx); diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index e1491384135..b0640a2fe22 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -169,7 +169,7 @@ target_link_libraries(fdbserver PRIVATE "$" fdbserver_worker fdbserver_backupworker - "$" + fdbserver_cdcproxy fdbserver_clustercontroller fdbserver_commitproxy fdbserver_consistencyscan diff --git a/fdbserver/clustercontroller/ClusterController.actor.cpp b/fdbserver/clustercontroller/ClusterController.actor.cpp index 0514d6be56b..90033276223 100644 --- a/fdbserver/clustercontroller/ClusterController.actor.cpp +++ b/fdbserver/clustercontroller/ClusterController.actor.cpp @@ -579,6 +579,11 @@ Future> monitorCDCProxies(std::vector const& co_return failedProxies; } +bool containsCDCProxy(std::vector const& proxies, UID proxyId) { + return std::any_of( + proxies.begin(), proxies.end(), [proxyId](CDCProxyInterface const& proxy) { return proxy.id() == proxyId; }); +} + Future recruitFailedCDCProxies(ClusterControllerData* self, uint64_t recoveryCount, std::vector const& monitoredProxies, @@ -639,20 +644,12 @@ Future 
recruitFailedCDCProxies(ClusterControllerData* self, // a stream to a replacement that is not yet discoverable. self->db.recoveryData->registrationTrigger.trigger(); for (const auto& [oldProxyId, newProxyId] : replacements) { - auto isCurrentProxy = [self, newProxyId]() { - return std::any_of(self->db.cdcProxies.begin(), - self->db.cdcProxies.end(), - [newProxyId](CDCProxyInterface const& proxy) { return proxy.id() == newProxyId; }); - }; - auto isPublishedProxy = [self, newProxyId]() { - return std::any_of(self->db.clientInfo->get().cdcProxies.begin(), - self->db.clientInfo->get().cdcProxies.end(), - [newProxyId](CDCProxyInterface const& proxy) { return proxy.id() == newProxyId; }); - }; - while (isCurrentProxy() && !isPublishedProxy()) { + while (containsCDCProxy(self->db.cdcProxies, newProxyId) && + !containsCDCProxy(self->db.clientInfo->get().cdcProxies, newProxyId)) { co_await self->db.clientInfo->onChange(); } - if (isCurrentProxy() && isPublishedProxy()) { + if (containsCDCProxy(self->db.cdcProxies, newProxyId) && + containsCDCProxy(self->db.clientInfo->get().cdcProxies, newProxyId)) { // Reassignment remains necessary if recovery changes while the // replacement endpoint is being published. 
co_await reassignNativeCdcStreams(self->db.db, oldProxyId, newProxyId); diff --git a/fdbserver/workloads/NativeCdc.cpp b/fdbserver/workloads/NativeCdc.cpp index fdd1a8e7f40..54f13ff05d5 100644 --- a/fdbserver/workloads/NativeCdc.cpp +++ b/fdbserver/workloads/NativeCdc.cpp @@ -530,6 +530,57 @@ struct NativeCdcWorkload : TestWorkload { co_await liveConsumer->acknowledge(); ASSERT(co_await getPersistedMinVersion(cx, liveStreamId) == liveConsumer->position().lastConsumedVersion + 1); + if (g_network->isSimulated()) { + (const_cast(CLIENT_KNOBS))->ENABLE_NATIVE_CDC = false; + + bool disabledRegistrationRejected = false; + try { + co_await registerNativeCdcStreamClient(cx, "native-cdc-disabled-registration"_sr, liveRange); + } catch (Error& e) { + disabledRegistrationRejected = e.code() == error_code_client_invalid_operation; + } + ASSERT(disabledRegistrationRejected); + + listed = co_await listNativeCdcStreamsClient(cx); + ASSERT(listed.size() == 1); + ASSERT(listed[0].streamId == liveStreamId); + ASSERT((co_await createNativeCdcConsumer(cx, liveName))->position().streamId == liveStreamId); + liveConsumer = resumeNativeCdcConsumer(cx, liveConsumer->position()); + + const int32_t disabledResolverCount = (co_await getDatabaseConfiguration(cx)).getDesiredResolvers() + 1; + const uint64_t recoveryBeforeDisabledDrain = dbInfo->get().recoveryCount; + co_await changeResolverCount(cx, disabledResolverCount); + co_await timeoutError(waitForRecoveryAfter(recoveryBeforeDisabledDrain, RecoveryState::ACCEPTING_COMMITS), + 60.0); + recoveredOwner = co_await timeoutError(getCDCProxy(liveStreamId), 30.0); + + const Version afterDisableVersion = + co_await writeValues(cx, { { "live/after-disable"_sr, "captured-after-disable"_sr } }); + bool foundAfterDisableWrite = false; + const double afterDisableConsumeDeadline = now() + 30.0; + while (liveConsumer->position().lastConsumedVersion < afterDisableVersion) { + const Version previous = liveConsumer->position().lastConsumedVersion; + 
CDCConsumeReply afterDisable = co_await timeoutError(liveConsumer->consume(), 30.0); + if (afterDisable.lastConsumedVersion == previous) { + ASSERT(now() < afterDisableConsumeDeadline); + co_await delay(0.1); + continue; + } + ASSERT(afterDisable.lastConsumedVersion > previous); + for (const auto& versioned : afterDisable.mutations) { + for (const auto& mutation : versioned.mutations) { + if (mutation.param1 == "live/after-disable"_sr) { + foundAfterDisableWrite = true; + } + } + } + } + ASSERT(foundAfterDisableWrite); + co_await liveConsumer->acknowledge(); + ASSERT(co_await getPersistedMinVersion(cx, liveStreamId) == + liveConsumer->position().lastConsumedVersion + 1); + } + Future pendingConsume = recoveredOwner.consume.getReply( CDCConsumeRequest(CDCCursor(liveStreamId, std::numeric_limits::max() - 2))); co_await delay(0.1); @@ -563,7 +614,6 @@ struct NativeCdcWorkload : TestWorkload { co_await timeoutError(waitForNoRetiredTagPopState(cx, liveTag), 30.0); if (g_network->isSimulated()) { - (const_cast(CLIENT_KNOBS))->ENABLE_NATIVE_CDC = false; const int32_t disabledResolverCount = (co_await getDatabaseConfiguration(cx)).getDesiredResolvers() + 1; const uint64_t recoveryBeforeDisable = dbInfo->get().recoveryCount; co_await changeResolverCount(cx, disabledResolverCount); From 185500ca759c162ace26785587af3e6e532dd703 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Thu, 28 May 2026 16:55:17 -0700 Subject: [PATCH 55/56] Fix native CDC replay from satellite TLogs and add targeted coverage --- fdbserver/tlog/TLogServer.cpp | 5 ++++- fdbserver/workloads/NativeCdc.cpp | 32 +++++++++++++++++++++++++++++- tests/CMakeLists.txt | 1 + tests/fast/NativeCdcSatellite.toml | 23 +++++++++++++++++++++ 4 files changed, 59 insertions(+), 2 deletions(-) create mode 100644 tests/fast/NativeCdcSatellite.toml diff --git a/fdbserver/tlog/TLogServer.cpp b/fdbserver/tlog/TLogServer.cpp index e80071dc1e5..e2b9cba0186 100644 --- a/fdbserver/tlog/TLogServer.cpp +++ 
b/fdbserver/tlog/TLogServer.cpp @@ -1577,13 +1577,16 @@ void commitMessages(TLogData* self, block.append(block.arena(), msg.message.begin(), msg.message.size()); for (auto tag : msg.tags) { if (logData->locality == tagLocalitySatellite) { - if (!(tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter || tag == txsTag)) { + if (!(tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter || + tag.locality == tagLocalityCDC || tag == txsTag)) { continue; } } else if (!(logData->locality == tagLocalitySpecial || logData->locality == tag.locality || tag.locality < 0)) { continue; } + CODE_PROBE(logData->locality == tagLocalitySatellite && tag.locality == tagLocalityCDC, + "Satellite TLog indexes CDC mutation"); if (tag.locality == tagLocalityLogRouter) { if (!logData->logRouterTags) { diff --git a/fdbserver/workloads/NativeCdc.cpp b/fdbserver/workloads/NativeCdc.cpp index 54f13ff05d5..13371717751 100644 --- a/fdbserver/workloads/NativeCdc.cpp +++ b/fdbserver/workloads/NativeCdc.cpp @@ -30,14 +30,17 @@ #include "fdbclient/SystemData.h" #include "fdbserver/core/RecoveryState.h" #include "fdbserver/core/ServerDBInfo.h" +#include "fdbserver/core/TLogInterface.h" #include "fdbserver/tester/workloads.h" struct NativeCdcWorkload : TestWorkload { static constexpr auto NAME = "NativeCdc"; bool sharedTagSafety; + bool verifySatelliteIndexing; explicit NativeCdcWorkload(WorkloadContext const& wcx) - : TestWorkload(wcx), sharedTagSafety(getOption(options, "sharedTagSafety"_sr, false)) {} + : TestWorkload(wcx), sharedTagSafety(getOption(options, "sharedTagSafety"_sr, false)), + verifySatelliteIndexing(getOption(options, "verifySatelliteIndexing"_sr, false)) {} void disableFailureInjectionWorkloads(std::set& out) const override { out.insert("all"); } @@ -273,6 +276,27 @@ struct NativeCdcWorkload : TestWorkload { } } + Future verifySatelliteCDCWrite(Tag tag, Version version) { + bool foundSatelliteTLog = false; + for (const auto& tlogset : 
dbInfo->get().logSystemConfig.tLogs) { + if (!tlogset.isLocal || tlogset.locality != tagLocalitySatellite) { + continue; + } + foundSatelliteTLog = true; + for (const auto& tlog : tlogset.tLogs) { + TLogPeekReply reply = co_await timeoutError( + tlog.interf().peekMessages.getReply(TLogPeekRequest(version, tag, true, false)), 30.0); + if (!reply.messages.empty()) { + CODE_PROBE(true, "Native CDC workload reads tagged mutation from a satellite TLog"); + co_return; + } + } + } + ASSERT(foundSatelliteTLog); + ASSERT(false); + co_return; + } + Future runSharedTagSafety(Database cx) { const Key firstName = "native-cdc-shared-first"_sr; const Key secondName = "native-cdc-shared-second"_sr; @@ -415,6 +439,9 @@ struct NativeCdcWorkload : TestWorkload { const Version writeVersion = co_await writeValues(cx, { { "live/in"_sr, "captured"_sr }, { "other/out"_sr, "ignored"_sr } }); + if (verifySatelliteIndexing) { + co_await verifySatelliteCDCWrite(liveTag, writeVersion); + } for (const auto& nonOwner : dbInfo->get().client.cdcProxies) { if (nonOwner.id() == owner.id()) { @@ -506,6 +533,9 @@ struct NativeCdcWorkload : TestWorkload { const Version afterRecoveryVersion = co_await writeValues(cx, { { "live/after-recovery"_sr, "captured-after-recovery"_sr } }); + if (verifySatelliteIndexing) { + co_await verifySatelliteCDCWrite(liveTag, afterRecoveryVersion); + } bool foundAfterRecoveryWrite = false; const double afterRecoveryConsumeDeadline = now() + 30.0; while (liveConsumer->position().lastConsumedVersion < afterRecoveryVersion) { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 6ca195c6302..f13f20e51e6 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -230,6 +230,7 @@ if(WITH_PYTHON) if (MULTIREGION_TEST) # ValidateStorage depends on WITH_ROCKSDB + add_fdb_test(TEST_FILES fast/NativeCdcSatellite.toml) add_fdb_test(TEST_FILES slow/DiskFailureCycle.toml) add_fdb_test(TEST_FILES rare/FailoverWithSSLag.toml) add_fdb_test(TEST_FILES rare/DcLag.toml) diff 
--git a/tests/fast/NativeCdcSatellite.toml b/tests/fast/NativeCdcSatellite.toml new file mode 100644 index 00000000000..d8fe0f64c01 --- /dev/null +++ b/tests/fast/NativeCdcSatellite.toml @@ -0,0 +1,23 @@ +[configuration] +config = '''double remote_double usable_regions=2 regions=[{"datacenters":[{"id":"0","priority":2},{"id":"2","priority":1,"satellite":1}],"satellite_redundancy_mode":"one_satellite_single"},{"datacenters":[{"id":"1","priority":1},{"id":"3","priority":1,"satellite":1}],"satellite_redundancy_mode":"one_satellite_single"}]''' +minimumRegions = 2 +singleRegion = true +generateFearless = false +datacenters = 4 +machineCount = 16 +processesPerMachine = 1 +buggify = false +faultInjection = false + +[[knobs]] +enable_native_cdc = true + +[[test]] +testTitle = 'NativeCdcSatellite' +useDB = true +waitForQuiescenceEnd = false +connectionFailuresDisableDuration = 1000000 + + [[test.workload]] + testName = 'NativeCdc' + verifySatelliteIndexing = true From 1bb10ee78af3fd3960a953897777b7f7c23e86d3 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Thu, 28 May 2026 17:17:33 -0700 Subject: [PATCH 56/56] Refine native CDC API boundaries and consumer coordination --- design/cdc.md | 9 +- fdbclient/NativeCdc.cpp | 122 +++++++++++------- fdbclient/include/fdbclient/NativeCdc.h | 29 +---- .../include/fdbclient/NativeCdcInternal.h | 56 ++++++++ fdbserver/cdcproxy/CDCProxy.cpp | 4 +- .../ClusterController.actor.cpp | 2 +- fdbserver/logsystem/ApplyMetadataMutation.cpp | 25 +++- .../logsystem/ApplyMetadataMutation.h | 2 + fdbserver/workloads/NativeCdc.cpp | 1 + 9 files changed, 166 insertions(+), 84 deletions(-) create mode 100644 fdbclient/include/fdbclient/NativeCdcInternal.h diff --git a/design/cdc.md b/design/cdc.md index dbabc64b155..8109b4ee6b4 100644 --- a/design/cdc.md +++ b/design/cdc.md @@ -59,8 +59,10 @@ The current implementation does not attempt to provide: ## Client interface -The client-facing declarations are in `fdbclient/NativeCdc.h`; cursor and 
wire -request types are in `fdbclient/CDCProxyInterface.h`. +The client-facing declarations are in `fdbclient/NativeCdc.h`; durable +metadata operations used by server roles are in +`fdbclient/NativeCdcInternal.h`; cursor and wire request types are in +`fdbclient/CDCProxyInterface.h`. ```cpp Future registerNativeCdcStreamClient(Database cx, Key name, KeyRange keys); @@ -153,6 +155,9 @@ through the delivered position, and must not issue another `consume()` if it still needs to retry processing the previous reply from that same in-memory consumer. A consumer restarted from its last durably checkpointed position can use `resumeNativeCdcConsumer()`. +Only one `consume()` or `acknowledge()` operation may be outstanding on a +`NativeCdcConsumer`; concurrent operations are rejected because they would +race updates to its delivered position and acknowledgement proof. The server accepts an acknowledgement beyond its current transaction read version only when the owning CDC proxy has read through that position from its tagged log stream. A resumed consumer may reissue an acknowledgement already diff --git a/fdbclient/NativeCdc.cpp b/fdbclient/NativeCdc.cpp index 2690abd38df..4733fd3911e 100644 --- a/fdbclient/NativeCdc.cpp +++ b/fdbclient/NativeCdc.cpp @@ -29,6 +29,7 @@ #include "fdbclient/DatabaseContext.h" #include "fdbclient/Knobs.h" #include "fdbclient/NativeCdc.h" +#include "fdbclient/NativeCdcInternal.h" #include "fdbclient/SystemData.h" #include "flow/CodeProbe.h" #include "flow/Error.h" @@ -119,6 +120,7 @@ Future getNativeCdcCurrentTag(Transaction* tr, CDCStreamId streamId) { co_return currentTag.get(); } +// TODO: Persist current per-tag ownership so registration does not reconstruct it by scanning all active streams. 
Future> getNativeCdcProxyAssignmentForTag(Transaction* tr, Tag targetTag) { std::set activeStreamIds; Key begin = cdcStreamKeys.begin; @@ -247,7 +249,7 @@ Future nativeCdcStreamStillExists(Database cx, CDCStreamId streamId) { } } -Future getNativeCdcStreamId(Database cx, Key name) { +Future> findNativeCdcStreamId(Database cx, Key name) { if (name.empty()) { throw client_invalid_operation(); } @@ -260,7 +262,7 @@ Future getNativeCdcStreamId(Database cx, Key name) { tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); Optional streamId = co_await tr.get(cdcStreamNameKeyFor(name)); if (!streamId.present()) { - throw client_invalid_operation(); + co_return Optional(); } co_return decodeCDCStreamNameValue(streamId.get()); } catch (Error& e) { @@ -270,6 +272,14 @@ Future getNativeCdcStreamId(Database cx, Key name) { } } +Future getNativeCdcStreamId(Database cx, Key name) { + Optional streamId = co_await findNativeCdcStreamId(cx, name); + if (!streamId.present()) { + throw client_invalid_operation(); + } + co_return streamId.get(); +} + Future getNativeCdcStreamProxy(Database cx, CDCStreamId streamId) { if (streamId == 0) { throw client_invalid_operation(); @@ -625,14 +635,12 @@ Future removeNativeCdcStreamClient(Database cx, Key name) { } while (true) { - std::vector streams = co_await listNativeCdcStreamsClient(cx); - auto stream = std::find_if( - streams.begin(), streams.end(), [&](NativeCdcStreamInfo const& info) { return info.name == name; }); - if (stream == streams.end()) { + Optional streamId = co_await findNativeCdcStreamId(cx, name); + if (!streamId.present()) { co_return; } - Optional proxy = co_await getNativeCdcStreamProxyForRemoval(cx, name, stream->streamId); + Optional proxy = co_await getNativeCdcStreamProxyForRemoval(cx, name, streamId.get()); if (!proxy.present()) { co_return; } @@ -662,62 +670,84 @@ Reference resumeNativeCdcConsumer(Database cx, CDCCursor posi } Future NativeCdcConsumer::consumeImpl(Reference self) { - while (true) { - 
CDCProxyInterface proxy = co_await getNativeCdcStreamProxy(self->cx, self->currentPosition.streamId); - try { + try { + while (true) { + CDCProxyInterface proxy = co_await getNativeCdcStreamProxy(self->cx, self->currentPosition.streamId); Future proxyChanged = self->cx->clientInfo->onChange(); - auto result = co_await race( - throwErrorOr(proxy.consume.tryGetReply(CDCConsumeRequest(self->currentPosition))), proxyChanged); - if (result.index() == 0) { - CDCConsumeReply reply = std::get<0>(std::move(result)); - self->knownAvailableThrough = reply.lastConsumedVersion; - self->currentPosition.lastConsumedVersion = reply.lastConsumedVersion; - co_return reply; - } - CODE_PROBE(true, "Native CDC consume retries after proxy metadata change", probe::decoration::rare); - } catch (Error& error) { - if (!retryNativeCdcProxyRequest(error)) { - throw; + try { + auto result = co_await race( + throwErrorOr(proxy.consume.tryGetReply(CDCConsumeRequest(self->currentPosition))), proxyChanged); + if (result.index() == 0) { + CDCConsumeReply reply = std::get<0>(std::move(result)); + self->knownAvailableThrough = reply.lastConsumedVersion; + self->currentPosition.lastConsumedVersion = reply.lastConsumedVersion; + self->operationOutstanding = false; + co_return reply; + } + CODE_PROBE(true, "Native CDC consume retries after proxy metadata change", probe::decoration::rare); + } catch (Error& error) { + if (!retryNativeCdcProxyRequest(error)) { + throw; + } + CODE_PROBE(true, "Native CDC consume retries after proxy request failure", probe::decoration::rare); } - CODE_PROBE(true, "Native CDC consume retries after proxy request failure", probe::decoration::rare); + co_await delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, self->cx->taskID); } - co_await delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, self->cx->taskID); + } catch (Error&) { + self->operationOutstanding = false; + throw; } } Future NativeCdcConsumer::consume() { + if (operationOutstanding) { + CODE_PROBE(true, "Native CDC consumer 
rejects overlapping operations", probe::decoration::rare); + return Future(client_invalid_operation()); + } + operationOutstanding = true; return consumeImpl(Reference::addRef(this)); } Future NativeCdcConsumer::acknowledgeImpl(Reference self) { - if (self->currentPosition.streamId == 0 || self->currentPosition.lastConsumedVersion < 0 || - self->currentPosition.lastConsumedVersion == std::numeric_limits::max()) { - throw client_invalid_operation(); - } - const Version acknowledgedVersion = self->currentPosition.lastConsumedVersion; - co_await acknowledgeNativeCdcStream( - self->cx, self->currentPosition.streamId, acknowledgedVersion, self->knownAvailableThrough); - - while (true) { - CDCProxyInterface proxy = co_await getNativeCdcStreamProxy(self->cx, self->currentPosition.streamId); - try { - Future proxyChanged = self->cx->clientInfo->onChange(); - auto result = co_await race( - throwErrorOr(proxy.ack.tryGetReply(CDCAckRequest(self->currentPosition.streamId, acknowledgedVersion))), - proxyChanged); - if (result.index() == 0) { - co_return; - } - } catch (Error& error) { - if (!retryNativeCdcProxyRequest(error)) { - throw; + try { + if (self->currentPosition.streamId == 0 || self->currentPosition.lastConsumedVersion < 0 || + self->currentPosition.lastConsumedVersion == std::numeric_limits::max()) { + throw client_invalid_operation(); + } + const Version acknowledgedVersion = self->currentPosition.lastConsumedVersion; + co_await acknowledgeNativeCdcStream( + self->cx, self->currentPosition.streamId, acknowledgedVersion, self->knownAvailableThrough); + + while (true) { + CDCProxyInterface proxy = co_await getNativeCdcStreamProxy(self->cx, self->currentPosition.streamId); + try { + Future proxyChanged = self->cx->clientInfo->onChange(); + auto result = co_await race(throwErrorOr(proxy.ack.tryGetReply( + CDCAckRequest(self->currentPosition.streamId, acknowledgedVersion))), + proxyChanged); + if (result.index() == 0) { + self->operationOutstanding = false; + co_return; 
+ } + } catch (Error& error) { + if (!retryNativeCdcProxyRequest(error)) { + throw; + } } + co_await delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, self->cx->taskID); } - co_await delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, self->cx->taskID); + } catch (Error&) { + self->operationOutstanding = false; + throw; } } Future NativeCdcConsumer::acknowledge() { + if (operationOutstanding) { + CODE_PROBE(true, "Native CDC consumer rejects overlapping operations", probe::decoration::rare); + return Future(client_invalid_operation()); + } + operationOutstanding = true; return acknowledgeImpl(Reference::addRef(this)); } diff --git a/fdbclient/include/fdbclient/NativeCdc.h b/fdbclient/include/fdbclient/NativeCdc.h index 0cb13d64b88..ca4355ee63c 100644 --- a/fdbclient/include/fdbclient/NativeCdc.h +++ b/fdbclient/include/fdbclient/NativeCdc.h @@ -34,15 +34,11 @@ struct NativeCdcStreamInfo { Version minVersion = invalidVersion; }; -struct NativeCdcRemovedStreamInfo { - Version removalVersion = invalidVersion; - std::vector tags; -}; - class NativeCdcConsumer : public ReferenceCounted { public: NativeCdcConsumer(Database cx, CDCCursor position) : cx(cx), currentPosition(position) {} + // Operations advance shared delivery state; only one may be outstanding. Future consume(); Future acknowledge(); const CDCCursor& position() const { return currentPosition; } @@ -54,30 +50,9 @@ class NativeCdcConsumer : public ReferenceCounted { Database cx; CDCCursor currentPosition; Version knownAvailableThrough = invalidVersion; + bool operationOutstanding = false; }; -// These durable metadata operations back CDCProxyInterface lifecycle requests. -// Registration is knob-protected; draining and cleanup remain available for -// streams persisted while native CDC was enabled. -Future registerNativeCdcStream(Database cx, - Key name, - KeyRange keys, - Optional proxyId = Optional()); -// Persists per-tag final-pop watermarks before removing stream metadata. 
-Future> removeNativeCdcStream(Database cx, - Key name, - Optional proxyId = Optional()); -Future> listNativeCdcStreams(Database cx); -// Atomically moves any streams assigned to a failed proxy to its replacement. -Future reassignNativeCdcStreams(Database cx, UID oldProxyId, UID newProxyId); -// Persists the exclusive unpopped watermark after consuming through a version. -// knownAvailableThrough permits a consumer to acknowledge log data it has -// already received before that version is visible at a transaction read version. -Future acknowledgeNativeCdcStream(Database cx, - CDCStreamId streamId, - Version consumedThrough, - Version knownAvailableThrough = invalidVersion); - // Client-facing CDC operations. Registration is feature gated; the remaining // operations stay available so existing durable streams can be drained after // the feature is disabled. Requests retry when stream ownership changes. diff --git a/fdbclient/include/fdbclient/NativeCdcInternal.h b/fdbclient/include/fdbclient/NativeCdcInternal.h new file mode 100644 index 00000000000..f3b6a24bbc2 --- /dev/null +++ b/fdbclient/include/fdbclient/NativeCdcInternal.h @@ -0,0 +1,56 @@ +/* + * NativeCdcInternal.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2026 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef FDBCLIENT_NATIVECDCINTERNAL_H +#define FDBCLIENT_NATIVECDCINTERNAL_H +#pragma once + +#include + +#include "fdbclient/NativeCdc.h" + +struct NativeCdcRemovedStreamInfo { + Version removalVersion = invalidVersion; + std::vector tags; +}; + +// Durable metadata operations used by CDC server roles and simulation +// coverage. Registration is feature gated; drain and cleanup operations remain +// available for streams persisted before native CDC is disabled. +Future registerNativeCdcStream(Database cx, + Key name, + KeyRange keys, + Optional proxyId = Optional()); +// Persists per-tag final-pop watermarks before removing stream metadata. +Future> removeNativeCdcStream(Database cx, + Key name, + Optional proxyId = Optional()); +Future> listNativeCdcStreams(Database cx); +// Atomically moves any streams assigned to a failed proxy to its replacement. +Future reassignNativeCdcStreams(Database cx, UID oldProxyId, UID newProxyId); +// Persists the exclusive unpopped watermark after consuming through a version. +// knownAvailableThrough permits a consumer to acknowledge log data it has +// already received before that version is visible at a transaction read version. 
+Future acknowledgeNativeCdcStream(Database cx, + CDCStreamId streamId, + Version consumedThrough, + Version knownAvailableThrough = invalidVersion); + +#endif // FDBCLIENT_NATIVECDCINTERNAL_H diff --git a/fdbserver/cdcproxy/CDCProxy.cpp b/fdbserver/cdcproxy/CDCProxy.cpp index ffda767a13c..706b20a24c0 100644 --- a/fdbserver/cdcproxy/CDCProxy.cpp +++ b/fdbserver/cdcproxy/CDCProxy.cpp @@ -27,7 +27,7 @@ #include #include "fdbclient/Knobs.h" -#include "fdbclient/NativeCdc.h" +#include "fdbclient/NativeCdcInternal.h" #include "fdbclient/SystemData.h" #include "fdbserver/cdcproxy/CDCProxy.h" #include "fdbserver/core/Knobs.h" @@ -584,6 +584,8 @@ Future initializeStream(CDCProxyData* self, Reference s } } +// TODO: Persist per-tag safe-pop state or coordinate pops centrally instead of rebuilding minima from all stream +// history on every acknowledgement. Future> readSafePopVersions(Database cx) { Transaction tr(cx); while (true) { diff --git a/fdbserver/clustercontroller/ClusterController.actor.cpp b/fdbserver/clustercontroller/ClusterController.actor.cpp index 90033276223..0621b157242 100644 --- a/fdbserver/clustercontroller/ClusterController.actor.cpp +++ b/fdbserver/clustercontroller/ClusterController.actor.cpp @@ -28,7 +28,7 @@ #include "fdbclient/ClientBooleanParams.h" #include "fdbclient/FDBTypes.h" -#include "fdbclient/NativeCdc.h" +#include "fdbclient/NativeCdcInternal.h" #include "fdbclient/SystemData.h" #include "fdbclient/DatabaseContext.h" #include "fdbrpc/FailureMonitor.h" diff --git a/fdbserver/logsystem/ApplyMetadataMutation.cpp b/fdbserver/logsystem/ApplyMetadataMutation.cpp index 1dccb9d04d7..c9b0155a775 100644 --- a/fdbserver/logsystem/ApplyMetadataMutation.cpp +++ b/fdbserver/logsystem/ApplyMetadataMutation.cpp @@ -53,6 +53,20 @@ CDCRoutingTable::CDCRoutingTable() { tagsByRange.insert(allKeys, std::set()); } +void CDCRoutingTable::updateRange(CDCStreamId streamId, KeyRangeRef const& keys) { + streams[streamId].keys = KeyRange(keys); +} + +bool 
CDCRoutingTable::updateTag(CDCStreamId streamId, Version version, Tag tag) { + ASSERT(tag.locality == tagLocalityCDC); + auto& existing = streams[streamId].tag; + if (!existing.present() || version >= existing.get().first) { + existing = std::make_pair(version, tag); + return true; + } + return false; +} + void CDCRoutingTable::rebuildRanges() { tagsByRange.insert(allKeys, std::set()); for (const auto& [streamId, state] : streams) { @@ -67,15 +81,12 @@ void CDCRoutingTable::rebuildRanges() { } void CDCRoutingTable::setRange(CDCStreamId streamId, KeyRangeRef const& keys) { - streams[streamId].keys = KeyRange(keys); + updateRange(streamId, keys); rebuildRanges(); } void CDCRoutingTable::setTag(CDCStreamId streamId, Version version, Tag tag) { - ASSERT(tag.locality == tagLocalityCDC); - auto& existing = streams[streamId].tag; - if (!existing.present() || version >= existing.get().first) { - existing = std::make_pair(version, tag); + if (updateTag(streamId, version, tag)) { rebuildRanges(); } } @@ -84,12 +95,12 @@ void CDCRoutingTable::reload(IKeyValueStore* txnStateStore) { streams.clear(); const RangeResult streamRows = txnStateStore->readRange(cdcStreamKeys).get(); for (const auto& kv : streamRows) { - setRange(decodeCDCStreamKey(kv.key), decodeCDCStreamKeysValue(kv.value)); + updateRange(decodeCDCStreamKey(kv.key), decodeCDCStreamKeysValue(kv.value)); } const RangeResult tagHistoryRows = txnStateStore->readRange(cdcTagHistoryKeys).get(); for (const auto& kv : tagHistoryRows) { const auto [streamId, version, tag] = decodeCDCTagHistoryKey(kv.key); - setTag(streamId, version, tag); + updateTag(streamId, version, tag); } rebuildRanges(); } diff --git a/fdbserver/logsystem/include/fdbserver/logsystem/ApplyMetadataMutation.h b/fdbserver/logsystem/include/fdbserver/logsystem/ApplyMetadataMutation.h index e703c4d60f1..6916cd8703f 100644 --- a/fdbserver/logsystem/include/fdbserver/logsystem/ApplyMetadataMutation.h +++ 
b/fdbserver/logsystem/include/fdbserver/logsystem/ApplyMetadataMutation.h @@ -66,6 +66,8 @@ class CDCRoutingTable : NonCopyable { std::map streams; KeyRangeMap> tagsByRange; + void updateRange(CDCStreamId streamId, KeyRangeRef const& keys); + bool updateTag(CDCStreamId streamId, Version version, Tag tag); void rebuildRanges(); public: diff --git a/fdbserver/workloads/NativeCdc.cpp b/fdbserver/workloads/NativeCdc.cpp index 13371717751..900c3486591 100644 --- a/fdbserver/workloads/NativeCdc.cpp +++ b/fdbserver/workloads/NativeCdc.cpp @@ -27,6 +27,7 @@ #include "fdbclient/Knobs.h" #include "fdbclient/ManagementAPI.h" #include "fdbclient/NativeCdc.h" +#include "fdbclient/NativeCdcInternal.h" #include "fdbclient/SystemData.h" #include "fdbserver/core/RecoveryState.h" #include "fdbserver/core/ServerDBInfo.h"