diff --git a/design/cdc.md b/design/cdc.md new file mode 100644 index 00000000000..8109b4ee6b4 --- /dev/null +++ b/design/cdc.md @@ -0,0 +1,564 @@ +# Native Change Data Capture (CDC) + +## Status and scope + +Native Change Data Capture (CDC) provides a FoundationDB-native mechanism for +reading committed mutations for a registered key range. A client registers a +named stream, creates a consumer for that name, consumes batches of mutations, +and acknowledges processed versions. The implementation persists enough state +to retain unread TLog data and to resume stream service after CDC proxy failure +or transaction-system recovery. + +This design describes the native C++ interface and its server implementation. +The feature is disabled by default behind `ENABLE_NATIVE_CDC`; the native CDC +workloads explicitly enable it, and simulation may randomly enable it. The +initial interface is native-only: it does not expose bindings or an external +protocol compatibility guarantee. + +The implementation uses the following terms: + +* A **stream** is a durable named registration for a fixed user key range. +* A **cursor** identifies one stream and the version through which a consumer + has read. +* A **CDC tag** is a TLog tag with locality `tagLocalityCDC`. Commit proxies + append these tags to mutations covered by registered streams. +* A **CDC proxy** reads tagged TLog mutation streams, filters mutations to a + registered range, serves consumers, and coordinates acknowledgement-driven + log popping. + +CDC is not implemented as a storage server change feed. It captures mutations +in the transaction logging path, which lets an acknowledged consumer retain +and release its own log history without changing user data storage. + +## Goals + +Native CDC is intended to provide: + +* Durable, named registrations for key ranges in normal user key space. 
+* A consumer API in which a client only needs a stream name after + registration, rather than repeating its registered range on every read. +* Ordered mutation batches identified by FoundationDB commit versions. +* Durable acknowledgements that determine how much CDC-tagged TLog history may + be popped. +* Correct retention when several streams share a CDC tag, including streams + whose ranges overlap or whose consumers advance at different rates. +* Replacement and recovery of CDC proxies without losing active stream + ownership or prematurely releasing log data. +* Finite cleanup when streams are removed, so an old stream does not require + CDC infrastructure forever. + +The current implementation does not attempt to provide: + +* Exactly-once side effects in the consumer. A consumer must make its output + and its acknowledgement consistent if it needs exactly-once processing. +* Dynamic stream range changes. A name is registered for one range; changing a + range requires removing and registering a stream. +* Throughput-aware assignment of streams across CDC proxies. +* Throughput-aware movement of streams between CDC tags. +* Client bindings beyond the native API. + +## Client interface + +The client-facing declarations are in `fdbclient/NativeCdc.h`; durable +metadata operations used by server roles are in +`fdbclient/NativeCdcInternal.h`; cursor and wire request types are in +`fdbclient/CDCProxyInterface.h`. + +```cpp +Future<Void> registerNativeCdcStreamClient(Database cx, Key name, KeyRange keys); +Future<Void> removeNativeCdcStreamClient(Database cx, Key name); +Future<std::vector<NativeCdcStreamInfo>> listNativeCdcStreamsClient(Database cx); + +Future<Reference<NativeCdcConsumer>> createNativeCdcConsumer(Database cx, Key name); +Reference<NativeCdcConsumer> resumeNativeCdcConsumer(Database cx, CDCCursor position); +``` + +A stream registration contains: + +```cpp +struct NativeCdcStreamInfo { + Key name; + CDCStreamId streamId; + KeyRange keys; + Version minVersion; +}; +``` + +The durable identity of a stream is its `CDCStreamId`, not its name. 
Names are +used to create and manage streams. Creating a consumer resolves the current +stream ID once, so removing a name and later registering the same name does +not silently redirect an existing consumer to a different stream. + +`NativeCdcConsumer` is a client-side, reference-counted reader object. It +holds the client's `Database` handle and current delivered position and +exposes consumption and acknowledgement operations: + +```cpp +class NativeCdcConsumer : public ReferenceCounted<NativeCdcConsumer> { +public: + Future<CDCConsumeReply> consume(); + Future<Void> acknowledge(); + const CDCCursor& position() const; +}; +``` + +`CDCCursor` remains a small serializable position token used by CDC proxy +requests and by callers that need to checkpoint or resume a consumer. It does +not contain a `Database` handle or other process-local state: + +```cpp +struct CDCCursor { + CDCStreamId streamId; + Version lastConsumedVersion; +}; +``` + +`lastConsumedVersion` is initialized to `invalidVersion`. A consume response +returns both mutations and a new `lastConsumedVersion`: + +```cpp +struct VersionedMutationsRef { + Version version; + VectorRef<MutationRef> mutations; +}; + +struct CDCConsumeReply { + VectorRef<VersionedMutationsRef> mutations; + Version lastConsumedVersion; +}; +``` + +A typical consumer loop is: + +```cpp +co_await registerNativeCdcStreamClient(db, "orders"_sr, KeyRangeRef("order/"_sr, "order0"_sr)); +Reference<NativeCdcConsumer> consumer = co_await createNativeCdcConsumer(db, "orders"_sr); + +loop { + CDCConsumeReply reply = co_await consumer->consume(); + for (auto const& versionedMutations : reply.mutations) { + // Apply all mutations for versionedMutations.version. + } + + co_await consumer->acknowledge(); +} +``` + +`consume()` advances `consumer->position()` to the returned delivered +position, but does not change durable retention. The acknowledgement means +that the consumer no longer requires CDC mutations through +`consumer->position().lastConsumedVersion`. 
Internally, acknowledgement +advances the stream's persisted minimum required version to +`lastConsumedVersion + 1`. Therefore the consumer must not call +`acknowledge()` before it has durably processed all mutations represented +through the delivered position, and must not issue another `consume()` if it +still needs to retry processing the previous reply from that same in-memory +consumer. A consumer restarted from its last durably checkpointed position +can use `resumeNativeCdcConsumer()`. +Only one `consume()` or `acknowledge()` operation may be outstanding on a +`NativeCdcConsumer`; concurrent operations are rejected because they would +race updates to its delivered position and acknowledgement proof. +The server accepts an acknowledgement beyond its current transaction read +version only when the owning CDC proxy has read through that position from its +tagged log stream. A resumed consumer may reissue an acknowledgement already +represented by the durable watermark, and a replacement proxy reconciles its +in-memory frontier to that watermark. A fabricated future position cannot +pre-pop mutations that have not reached a proxy or the database read version. + +### Registration and removal semantics + +`registerNativeCdcStreamClient()` accepts a non-empty stream name and a +non-empty range entirely within normal user keys. Registration of an existing +name with the same range is idempotent. Registering an existing name with a +different range is rejected. + +Registration establishes an initial minimum version using the registration +transaction's commit version. Mutations committed after the registration has +become visible are routed to the stream's CDC tag. The initial minimum version +also supplies the first retention watermark for its TLog history. + +`removeNativeCdcStreamClient()` removes the named stream and schedules final +release of tagged log history that was protected by the removed stream. 
+Removal explicitly relinquishes any unread history for that stream while still +respecting the retention needs of other streams sharing its tags. Stream +removal is terminal for existing consumers. Stale consume or acknowledgement +operations return an error instead of waiting indefinitely for an owner that +will never be assigned again. + +### Consumption and expiration + +Consumption is ordered by commit version. Mutations from a clear range are +intersected with the stream's registered range before being returned; a +single-key mutation is returned only if its key is within that range. + +For an active stream, unacknowledged CDC mutations are retained by its durable +minimum version: TLogs must not pop tagged data that the stream may still +consume. A slow consumer therefore retains its unread history rather than +expiring solely because of age. + +Consumption returns `transaction_too_old` when a consumer supplies a cursor +older than the stream's already acknowledged durable watermark. The proxy also +treats discovery that an active stream's required tagged data has nevertheless +already been popped as `transaction_too_old`; that condition indicates a +retention invariant violation rather than a supported expiration policy. + +The native client methods retry transient endpoint and routing failures such as +a CDC proxy replacement. Invalid stream operations, already-acknowledged +cursor positions, and retention invariant violations are terminal errors for +that request. + +## Architecture + +The data path is: + +```mermaid +flowchart LR + client["Client"] <-->|"consume / acknowledge"| proxy["CDC Proxy"] + proxy <-->|"peek / pop"| tlogs["TLogs"] + + commits["User commits"] --> commitProxy["Commit Proxies"] + commitProxy -->|"ordinary and CDC tags"| tlogs + + controller["Cluster Controller"] -->|"assigned streams"| proxy + metadata["Transaction state store<br/>and system key storage"] -->|"durable stream and acknowledgement state"| proxy + metadata -->|"durable CDC routing metadata"| commitProxy +``` + +Commit proxies keep a routing table derived from durable CDC metadata. For +each committed user mutation, the commit proxy determines which registered CDC +ranges include that mutation and appends the corresponding CDC tags to the +mutation sent to TLogs. A transaction continues to follow its ordinary +replication tags as well; CDC tags are additional log destinations used by CDC +consumers. + +CDC proxies do not participate in committing user transactions. They consume +the extra tagged log streams, buffer readable results, filter shared tagged +data back to each stream's registered range, and pop data after durable +acknowledgement permits it. + +The cluster controller recruits CDC proxies, publishes their interfaces, and +keeps durable stream-to-proxy ownership consistent with current endpoints. + +## Durable state + +CDC uses two categories of system data. Routing and recovery-critical metadata +is stored in the transaction state store, where commit proxies and recovery +can reconstruct it in transaction order. Acknowledgement and final-pop +watermarks are stored as regular storage-server-backed system keys, because +they are durable progress values rather than commit routing configuration. + +### Transaction state metadata + +These keys are in the metadata portion of system key space and are represented +in transaction state: + +| Key | Value | Purpose | +| --- | --- | --- | +| `\xff/cdc/name/<name>` | `CDCStreamId` | Resolves a user-visible name to its durable stream identity. | +| `\xff/cdc/maxStreamId` | `CDCStreamId` | Allocates monotonic stream identifiers. | +| `\xff/cdc/keys/<streamId>` | `KeyRange` | Stores the immutable registered range for an active stream. | +| `\xff/cdc/tagHistory/<streamId>/<version>/<tag>` | empty | Records the CDC tag assignment history used for routing and historical reads. 
| +| `\xff/cdc/proxies/<streamId>/<proxyId>` | empty | Stores the CDC proxy assigned to an active stream. | +| `\xff/cdc/proxyAssignmentChange` | version/change signal | Wakes ownership monitoring when durable assignments change. | +| `\xff/cdc/retiredTagPop/<tag>` | empty | Retains recovery-visible pending final-pop work after removal. | + +Tag history is versioned so the data model can support a stream moving between +tags without forgetting which old log streams may still contain unread +mutations. The initial implementation writes the initial assignment and reads +the history; dynamic throughput-driven reassignment is future work. + +### Storage-backed system data + +These keys are in the storage-server-backed `\xff\x02` system key range rather +than transaction state: + +| Key | Value | Purpose | +| --- | --- | --- | +| `\xff\x02/cdc/minVersion/<streamId>` | `Version` | Earliest version that an active stream may still require. | +| `\xff\x02/cdc/retiredTagPopVersion/<tag>` | `Version` | Final pop watermark required after a stream using a tag is removed. | + +The initial `minVersion` is written with a versionstamp at stream +registration. When a consumer acknowledges processing through version `V`, the +stored value advances monotonically to `V + 1`. A CDC proxy may pop tagged +mutations before this watermark only when doing so is safe for every live +stream sharing that tag. + +The retired-tag marker and watermark are deliberately split between transaction +state and regular system storage. The marker tells recovery that a CDC proxy +must still be recruited to finish durable cleanup; the watermark bounds the +actual final pop to perform. + +## Stream creation and assignment + +Registration runs as a durable metadata transaction: + +1. It validates the feature knob, the stream name, and the registered normal + key range. +2. It checks whether the name is already registered and applies the idempotent + same-name/same-range rule. +3. It allocates a new monotonically increasing `CDCStreamId`. +4. 
It selects a CDC tag using current active stream counts. The allocator uses + the least populated tag among `NATIVE_CDC_TAG_COUNT` tags, choosing the + lowest tag ID on a tie. +5. It records the stream name, range, initial tag history entry, and + versionstamped initial minimum version. +6. It records an available CDC proxy owner and signals assignment monitoring. + +The tag allocator bounds the number of distinct CDC log streams while allowing +many user streams. Several streams may therefore share one tag intentionally. +This makes filtering and acknowledgement coordination required correctness +properties rather than exceptional cases. + +The current proxy assignment at registration uses an available CDC proxy; it +does not yet balance by stream traffic, memory use, or consumer lag. + +## Commit routing + +Each commit proxy has a `CDCRoutingTable`, reconstructed from active stream +ranges and tag history in transaction state. Changes to CDC stream metadata +are applied in commit order along with other transaction state mutations, so +the routing decision for later mutations observes committed registration and +removal changes. + +For a single-key mutation, the routing table returns CDC tags for all active +stream ranges containing that key. For a clear-range mutation, it returns tags +for all active stream ranges intersecting the cleared interval. These CDC tags +are appended in both the tag-determination and log-writing portions of commit +proxy processing. + +A shared CDC tag is a multiplexed log stream. A mutation routed because of +stream A may be read by the proxy serving stream B if both share the tag. +Consequently, the CDC proxy filters every read mutation against B's registered +range before returning it to B's consumer. Filtering also clips clear ranges +to the stream range. + +Routing at commit time has two important implications: + +* CDC observes mutations once their commits enter the transaction logging + path, without scanning storage server data. 
+* Registering CDC does not change normal durability for the user mutation; it + adds tagged log data whose retention is controlled separately by consumers. + +## CDC proxy read path + +A CDC proxy owns a set of active stream IDs. For each owned stream it loads: + +* The registered key range. +* The durable minimum required version. +* Its current CDC tag and versioned tag history. + +The proxy reads data from TLogs through `LogSystemConsumer::peekSingle()`. +When a stream has historical assignments, the proxy uses the history to select +the tag appropriate for the version interval it is reading. It filters +mutations to the registered range and stores versioned mutation batches in a +per-stream in-memory buffer. + +All stream buffers owned by one CDC proxy share a `CDC_PROXY_BUFFER_BYTES` +budget. Before requesting more TLog data, a stream reserves a bounded peek +window from that budget, then converts the reservation to the actual filtered +mutation bytes retained in its buffer. Acknowledgement or stream removal +releases the retained reservation. This applies backpressure before ordinary +peek batches arrive, rather than allowing each stream or each received batch +to independently overshoot the proxy limit. A slow consumer does not require +the proxy to buffer its entire retained history in memory: durable +acknowledgement state and tagged TLog retention are the source of resumability, +while the proxy buffer is a delivery optimization. + +A consume operation supplies a cursor. The proxy returns buffered or newly +peeked data after the cursor position and a position through which the +consumer may acknowledge after processing. If a stream is removed while a +consume is blocked, reconciliation wakes the request and it fails rather than +waiting on a data-change trigger for an inactive stream. + +## Acknowledgement and tag popping + +Acknowledgement is per stream, while TLog popping is per CDC tag. This +distinction is the core retention rule. 
+ +For every active stream `S`, `minVersion(S)` is the first version its consumer +may still need. For a tag `T`, the safe pop watermark is: + +```text +safePop(T) = min(minVersion(S)) for every live stream S whose history uses T +``` + +Popping tag `T` through `safePop(T)` discards versions older than the minimum +required by any live stream using that tag. A fast consumer therefore cannot +pop mutations still needed by a slower stream sharing its tag. + +The proxy recomputes these minima from durable active stream metadata and +acknowledgement rows. It does not rely solely on its in-memory owned-stream +set, because shared tags and replacement proxies must preserve the same global +retention decision. + +### Removing a stream + +Removing a stream eliminates its active name, range, tag history, minimum +version, and ownership rows. Removal must not unconditionally pop each tag in +the removed history: a different live stream may share a tag and still need +older data. + +Before deleting active state, removal writes pending final-pop state for each +historical tag: + +* A transaction-state retired-tag marker, which survives recovery and makes + outstanding cleanup discoverable. +* A versionstamped storage-backed retired-tag watermark, which identifies the + upper bound of tagged history protected by the removed stream. + +The CDC proxy processes retired work together with active acknowledgement +watermarks. If a retired tag is also used by a live stream, its attempted pop +is capped at that live stream's `safePop` value. Only when it is possible to +pop through the complete retired watermark is the retired operation eligible +for completion. + +### Completing retired work + +Retired final-pop state is durable work, not permanent stream state. After +issuing a complete retired pop, a CDC proxy waits for every targeted current +TLog to report that the tag has been popped through the required watermark. 
+It then transactionally clears both the retired marker and its stored +watermark. + +The cleanup transaction rereads the watermark before clearing it. If a newer +stream removal has advanced the retired watermark for the same tag while the +earlier pop was in progress, the newer work is retained rather than erased by +the older completion. + +This establishes the lifecycle invariant: + +```text +no live streams and no pending retired final pops + implies no durable CDC proxy requirement +``` + +Without this completion protocol, removing any stream would leave a retired +marker forever, causing later recoveries to recruit CDC proxies indefinitely +and repeatedly replay already-completed final pops. + +## Failure handling and recovery + +### CDC proxy failure + +CDC proxies are consumers of durable state and tagged logs, not authorities +for committed application data. If a CDC proxy fails, the cluster controller +can recruit a replacement, publish the replacement interface, and durably +reassign affected streams. The replacement reloads stream state and resumes +reading at durable acknowledgement watermarks. + +The cluster controller monitors durable proxy assignment rows and repairs +stale owner identifiers when endpoints are replaced or the controller itself +is reconstructed. Clients obtain the currently published owner for their +stream ID and retry transient proxy/routing failures. + +### Transaction-system recovery + +During full recovery, active stream ranges, tag history, and pending retired +markers are present in transaction state. Active history tags are added to the +set of log tags that recovery must preserve; otherwise TLog generations could +discard CDC data that an active consumer has not yet acknowledged. 
+ +CDC proxy recruitment is required during recovery when either: + +* Native CDC is enabled and streams may be served, or +* Durable CDC state remains, including retired final-pop work that must be + completed even after admission of new CDC activity is disabled. + +When the knob is disabled and recovery finds neither active streams nor +retired work, no CDC proxies are published. This makes stream removal and +retired-pop completion observable as a finite drain rather than a permanent +cluster role. + +## Feature gating + +`ENABLE_NATIVE_CDC` defaults to false. In simulation it may be randomly enabled +under buggification; workloads that depend on CDC set it explicitly. + +The feature knob gates new stream registration. Listing, consumer creation and +resume, consumption, acknowledgement, and removal remain available for +streams persisted while native CDC was enabled. Internal cleanup and recovery +paths likewise continue handling durable CDC state. This is necessary because +disabling new use of a feature cannot safely abandon log-retention obligations +for already registered or recently removed streams. + +`NATIVE_CDC_TAG_COUNT` controls the bounded tag pool used for new stream +allocation. Normal operation defaults to a larger tag pool; simulation may +reduce it so shared-tag behavior is exercised frequently. + +## Correctness properties + +The implementation is structured around the following properties: + +* **Registration identity:** a consumer's cursor binds to a stream ID, so + reuse of a removed stream name cannot cause an existing consumer to read a + new stream. +* **Range correctness:** CDC proxies return only mutations within a stream's + registered range, even when its tag is shared with other streams. +* **Acknowledgement monotonicity:** durable minimum required versions advance + only forward. +* **Shared-tag retention:** tagged data is popped no farther than the minimum + durable watermark of all active streams that may require it. 
+* **Removal safety:** deleting one stream cannot pop unread data required by + another stream using the same tag. +* **Finite retired cleanup:** removal retains enough state to finish final + pops through failure and recovery, then removes that state after completion. +* **No age-based expiration:** an active stream's unread tagged mutations + remain protected until acknowledgement or explicit stream removal. + `transaction_too_old` for required active history indicates either a stale + already-acknowledged cursor or a violated retention invariant. +* **Failure visibility:** a removed stream fails consume and acknowledgement + requests rather than leaving them blocked indefinitely. +* **Recovery retention:** active CDC tag history is included in recovery's + required log data, and pending cleanup retains CDC proxy availability until + it has been completed. + +## Current limitations and future work + +The design records tag history and proxy ownership in forms that support more +complete load balancing, but the first implementation intentionally keeps +policy simple. + +* Tag selection is based on active stream counts, not observed byte or mutation + throughput. Data distribution could make equally counted tags very + different in cost. +* Registration selects an available CDC proxy without balancing aggregate + proxy throughput, buffer memory, lag, or number of active readers. +* There is no background process that changes a live stream's CDC tag in + response to load. A future implementation can use versioned tag history to + make such changes without losing the ability to read earlier tagged data. +* The native interface does not yet provide external binding support, + administrative tooling, or a higher-level consumer checkpoint abstraction. + +These improvements must preserve the acknowledgement and retired-pop +invariants above. 
In particular, moving a stream between tags cannot forget an +old tag until all data protected by that assignment has either been +acknowledged and popped or retained as finite final-pop work. + +## Validation + +The implementation includes codec and metadata tests for CDC system keys and +simulation workloads for the end-to-end behavior. + +The basic native CDC workload covers: + +* Registering, listing, consuming, acknowledging, and removing streams. +* Name-based consumer creation and correct filtering of returned mutations. +* Rejection of incompatible same-name registrations. +* CDC proxy replacement and recovery of stream service. +* Errors for stale consume and acknowledgement requests after removal. +* Creation and eventual collection of retired final-pop state. +* Disabling native CDC while a live stream remains, rejecting new + registration while allowing recovery, consumption, acknowledgement, and + removal to drain the persisted stream. +* Recovery with native CDC disabled after the last stream and final-pop work + have drained, verifying that no CDC proxy remains required. + +The shared-tag workload forces streams to share routing tags and verifies both +range filtering and acknowledgement coordination. In particular, removing one +stream while another shared-tag stream is behind must not pop the unread +mutations needed by the remaining consumer. + +The simulation configurations enable CDC explicitly when testing these +behaviors, while the default-disabled knob and randomized simulation admission +exercise the requirement that clusters without active or pending CDC work do +not carry CDC service overhead. 
diff --git a/fdbclient/ClientKnobs.cpp b/fdbclient/ClientKnobs.cpp index 8e82ec2dc43..feac2a88798 100644 --- a/fdbclient/ClientKnobs.cpp +++ b/fdbclient/ClientKnobs.cpp @@ -193,6 +193,8 @@ void ClientKnobs::initialize(Randomize randomize, IsSimulated isSimulated) { init( CHANGE_FEED_CACHE_FLUSH_BYTES, 10e6 ); if( randomize && BUGGIFY ) CHANGE_FEED_CACHE_FLUSH_BYTES = deterministicRandom()->randomInt64(1, 1e6); init( CHANGE_FEED_CACHE_EXPIRE_TIME, 60.0 ); if( randomize && BUGGIFY ) CHANGE_FEED_CACHE_EXPIRE_TIME = 1.0; init( CHANGE_FEED_CACHE_LIMIT_BYTES, 500000 ); if( randomize && BUGGIFY ) CHANGE_FEED_CACHE_LIMIT_BYTES = 50000; + init( ENABLE_NATIVE_CDC, false ); if( randomize && isSimulated && BUGGIFY ) ENABLE_NATIVE_CDC = true; + init( NATIVE_CDC_TAG_COUNT, 256 ); if( randomize && BUGGIFY ) NATIVE_CDC_TAG_COUNT = 2; init( MAX_BATCH_SIZE, 1000 ); if( randomize && BUGGIFY ) MAX_BATCH_SIZE = 1; init( GRV_BATCH_TIMEOUT, 0.005 ); if( randomize && BUGGIFY ) GRV_BATCH_TIMEOUT = 0.1; diff --git a/fdbclient/NativeCdc.cpp b/fdbclient/NativeCdc.cpp new file mode 100644 index 00000000000..4733fd3911e --- /dev/null +++ b/fdbclient/NativeCdc.cpp @@ -0,0 +1,776 @@ +/* + * NativeCdc.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2026 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
 + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "fdbclient/DatabaseContext.h" +#include "fdbclient/Knobs.h" +#include "fdbclient/NativeCdc.h" +#include "fdbclient/NativeCdcInternal.h" +#include "fdbclient/SystemData.h" +#include "flow/CodeProbe.h" +#include "flow/Error.h" +#include "flow/UnitTest.h" + +namespace { + +void validateNativeCdcEnabled() { /* Throws client_invalid_operation when the ENABLE_NATIVE_CDC client knob is false. */ + if (!CLIENT_KNOBS->ENABLE_NATIVE_CDC) { + CODE_PROBE(true, "Native CDC registration rejected while feature disabled", probe::decoration::rare); + throw client_invalid_operation(); + } +} + +struct NativeCdcIdentifierAllocator { /* Tracks the highest stream ID observed and how many observed streams use each CDC tag ID, then picks the next stream ID and a tag for a new stream. */ + bool sawStream = false; + CDCStreamId maxStreamId = 0; + std::unordered_map tagStreamCounts; + + void observeStreamId(CDCStreamId streamId) { /* Records that a stream ID was seen and raises maxStreamId when streamId is larger. */ + sawStream = true; + maxStreamId = std::max(maxStreamId, streamId); + } + + void observeTag(Tag tag) { /* Counts one more stream on tag.id; the tag is expected to have CDC locality. */ + ASSERT_WE_THINK(tag.locality == tagLocalityCDC); + ++tagStreamCounts[tag.id]; + } + + std::pair allocate() const { /* Returns the next stream ID (maxStreamId + 1, or 1 if no stream ID was observed) paired with the least-populated tag ID below NATIVE_CDC_TAG_COUNT; throws operation_failed when maxStreamId has reached the numeric_limits maximum checked below. */ + if (sawStream && maxStreamId == std::numeric_limits::max()) { + throw operation_failed(); + } + + const CDCStreamId streamId = sawStream ? maxStreamId + 1 : 1; + ASSERT_WE_THINK(CLIENT_KNOBS->NATIVE_CDC_TAG_COUNT > 0); + ASSERT_WE_THINK(CLIENT_KNOBS->NATIVE_CDC_TAG_COUNT <= std::numeric_limits::max() + 1u); + uint32_t leastStreams = std::numeric_limits::max(); + uint16_t selectedTagId = 0; + // TODO: Use data-distributor-observed per-tag write throughput to rebalance CDC tags, including + // migrating active streams with versioned tag-history assignments. + for (uint32_t tagId = 0; tagId < static_cast(CLIENT_KNOBS->NATIVE_CDC_TAG_COUNT); ++tagId) { + auto count = tagStreamCounts.find(static_cast(tagId)); + const uint32_t streamCount = count == tagStreamCounts.end() ? 
0 : count->second; + if (streamCount < leastStreams) { /* strict comparison while scanning IDs in ascending order, so the lowest tag ID wins a tie */ + leastStreams = streamCount; + selectedTagId = static_cast(tagId); + } + } + return { streamId, Tag(tagLocalityCDC, selectedTagId) }; + } +}; + +void validateNativeCdcStream(KeyRef const& name, KeyRangeRef const& keys) { /* Throws client_invalid_operation unless name is non-empty and keys is a non-empty range contained in normalKeys. */ + if (name.empty() || keys.empty() || !normalKeys.contains(keys)) { + throw client_invalid_operation(); + } +} + +Future> getNativeCdcProxyAssignment(Transaction* tr, CDCStreamId streamId) { /* Returns the proxy ID decoded from the assignment row under cdcProxyRangeFor(streamId), or an empty Optional when no row exists. */ + RangeResult assignments = co_await tr->getRange(cdcProxyRangeFor(streamId), 2); /* limit of 2 lets the ASSERT below detect more than one assignment row */ + ASSERT(assignments.size() <= 1); + if (assignments.empty()) { + co_return Optional(); + } + const auto [assignedStreamId, proxyId] = decodeCDCProxyKey(assignments[0].key); + ASSERT_WE_THINK(assignedStreamId == streamId); + co_return proxyId; +} + +Future getNativeCdcCurrentTag(Transaction* tr, CDCStreamId streamId) { /* Pages through cdcTagHistoryRangeFor(streamId) and returns the tag decoded from the last history key read; throws client_invalid_operation if the range holds no rows. */ + Optional currentTag; + const KeyRange historyRange = cdcTagHistoryRangeFor(streamId); + Key begin = historyRange.begin; + while (begin < historyRange.end) { + RangeResult history = co_await tr->getRange(KeyRangeRef(begin, historyRange.end), CLIENT_KNOBS->TOO_MANY); + for (const auto& assignment : history) { + currentTag = std::get<2>(decodeCDCTagHistoryKey(assignment.key)); /* each row overwrites the previous one, so the last row read determines the result */ + } + if (!history.more) { + break; + } + begin = keyAfter(history.back().key); + } + if (!currentTag.present()) { + throw client_invalid_operation(); + } + co_return currentTag.get(); +} + +// TODO: Persist current per-tag ownership so registration does not reconstruct it by scanning all active streams. 
+Future> getNativeCdcProxyAssignmentForTag(Transaction* tr, Tag targetTag) { /* Scans all active streams and returns the first recorded proxy assignment (in currentTags iteration order) among streams whose last-read tag-history entry equals targetTag; returns an empty Optional if none has one. */ + std::set activeStreamIds; + Key begin = cdcStreamKeys.begin; + while (begin < cdcStreamKeys.end) { + RangeResult streams = co_await tr->getRange(KeyRangeRef(begin, cdcStreamKeys.end), CLIENT_KNOBS->TOO_MANY); + for (const auto& stream : streams) { + activeStreamIds.insert(decodeCDCStreamKey(stream.key)); + } + if (!streams.more) { + break; + } + begin = keyAfter(streams.back().key); + } + + std::map currentTags; + begin = cdcTagHistoryKeys.begin; + while (begin < cdcTagHistoryKeys.end) { + RangeResult histories = + co_await tr->getRange(KeyRangeRef(begin, cdcTagHistoryKeys.end), CLIENT_KNOBS->TOO_MANY); + for (const auto& history : histories) { + const auto decoded = decodeCDCTagHistoryKey(history.key); + const CDCStreamId streamId = std::get<0>(decoded); + if (activeStreamIds.contains(streamId)) { /* history rows of streams absent from cdcStreamKeys are ignored */ + currentTags[streamId] = std::get<2>(decoded); + } + } + if (!histories.more) { + break; + } + begin = keyAfter(histories.back().key); + } + + for (const auto& [streamId, tag] : currentTags) { + if (tag == targetTag) { + Optional proxyId = co_await getNativeCdcProxyAssignment(tr, streamId); + if (proxyId.present()) { + co_return proxyId; + } + } + } + co_return Optional(); +} + +void signalNativeCdcProxyAssignmentChange(Transaction* tr) { /* Overwrites cdcProxyAssignmentChangeKey with a newly generated random unique ID, serialized with IncludeVersion(ProtocolVersion::withNativeCdc()). */ + tr->set(cdcProxyAssignmentChangeKey, + BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), + IncludeVersion(ProtocolVersion::withNativeCdc()))); +} + +Future observeNativeCdcMetadata(Transaction* tr, NativeCdcIdentifierAllocator* allocator) { /* Feeds allocator with the value stored at cdcMaxStreamIdKey (if present), every stream ID found under cdcStreamKeys and cdcTagHistoryKeys, and one observeTag() call per active stream. */ + Optional maxStreamId = co_await tr->get(cdcMaxStreamIdKey); + if (maxStreamId.present()) { + allocator->observeStreamId(decodeCDCMaxStreamIdValue(maxStreamId.get())); + } + + std::set activeStreamIds; + Key begin = cdcStreamKeys.begin; + while (begin < cdcStreamKeys.end) { + RangeResult streams = co_await tr->getRange(KeyRangeRef(begin, cdcStreamKeys.end), CLIENT_KNOBS->TOO_MANY); + for (const auto& kv : 
streams) { + const CDCStreamId streamId = decodeCDCStreamKey(kv.key); + activeStreamIds.insert(streamId); + allocator->observeStreamId(streamId); + } + if (!streams.more) { + break; + } + begin = keyAfter(streams.back().key); + } + + std::map currentTags; + begin = cdcTagHistoryKeys.begin; + while (begin < cdcTagHistoryKeys.end) { + RangeResult histories = + co_await tr->getRange(KeyRangeRef(begin, cdcTagHistoryKeys.end), CLIENT_KNOBS->TOO_MANY); + for (const auto& kv : histories) { + const auto history = decodeCDCTagHistoryKey(kv.key); + allocator->observeStreamId(std::get<0>(history)); /* every history row counts toward the max stream ID, whether or not the stream is active */ + if (activeStreamIds.contains(std::get<0>(history))) { + currentTags[std::get<0>(history)] = std::get<2>(history); /* later rows overwrite earlier ones, leaving the last-read tag per active stream */ + } + } + if (!histories.more) { + break; + } + begin = keyAfter(histories.back().key); + } + for (const auto& tagAssignment : currentTags) { + allocator->observeTag(tagAssignment.second); + } +} + +bool retryNativeCdcProxyRequest(Error const& error) { /* Returns true when error is wrong_shard_server, broken_promise, connection_failed, or request_maybe_delivered. */ + return error.code() == error_code_wrong_shard_server || error.code() == error_code_broken_promise || + error.code() == error_code_connection_failed || error.code() == error_code_request_maybe_delivered; +} + +// TODO: Have the cluster controller rebalance stream ownership using aggregate CDC proxy throughput and +// update cdcProxyKeys and ClientDBInfo assignments; registration currently chooses any available proxy. 
+Future getAvailableNativeCdcProxy(Database cx, Optional previousProxy = Optional()) { + while (true) { + for (const auto& proxy : cx->clientInfo->get().cdcProxies) { + if (!previousProxy.present() || proxy.id() != previousProxy.get()) { + co_return proxy; + } + } + if (!cx->clientInfo->get().cdcProxies.empty()) { + co_return cx->clientInfo->get().cdcProxies.front(); + } + co_await cx->clientInfo->onChange(); + } +} + +Future nativeCdcStreamStillExists(Database cx, CDCStreamId streamId) { + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE); + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + co_return (co_await tr.get(cdcStreamKeyFor(streamId))).present(); + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } +} + +Future> findNativeCdcStreamId(Database cx, Key name) { + if (name.empty()) { + throw client_invalid_operation(); + } + + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE); + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + Optional streamId = co_await tr.get(cdcStreamNameKeyFor(name)); + if (!streamId.present()) { + co_return Optional(); + } + co_return decodeCDCStreamNameValue(streamId.get()); + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } +} + +Future getNativeCdcStreamId(Database cx, Key name) { + Optional streamId = co_await findNativeCdcStreamId(cx, name); + if (!streamId.present()) { + throw client_invalid_operation(); + } + co_return streamId.get(); +} + +Future getNativeCdcStreamProxy(Database cx, CDCStreamId streamId) { + if (streamId == 0) { + throw client_invalid_operation(); + } + + while (true) { + const ClientDBInfo& clientInfo = cx->clientInfo->get(); + auto assigned = clientInfo.streamToCDCProxyId.find(streamId); + if (assigned != clientInfo.streamToCDCProxyId.end()) { + for (const auto& proxy : clientInfo.cdcProxies) { + if (proxy.id() == assigned->second) { + 
co_return proxy; + } + } + } + if (!(co_await nativeCdcStreamStillExists(cx, streamId))) { + CODE_PROBE(true, "Native CDC client rejected operation after stream removal"); + throw client_invalid_operation(); + } + co_await delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, cx->taskID); + } +} + +Future namedNativeCdcStreamStillExists(Database cx, Key name, CDCStreamId streamId) { + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE); + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + Optional currentId = co_await tr.get(cdcStreamNameKeyFor(name)); + co_return currentId.present() && decodeCDCStreamNameValue(currentId.get()) == streamId; + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } +} + +Future> getNativeCdcStreamProxyForRemoval(Database cx, Key name, CDCStreamId streamId) { + while (true) { + const ClientDBInfo& clientInfo = cx->clientInfo->get(); + auto assigned = clientInfo.streamToCDCProxyId.find(streamId); + if (assigned != clientInfo.streamToCDCProxyId.end()) { + for (const auto& proxy : clientInfo.cdcProxies) { + if (proxy.id() == assigned->second) { + co_return proxy; + } + } + } + if (!(co_await namedNativeCdcStreamStillExists(cx, name, streamId))) { + co_return Optional(); + } + co_await delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, cx->taskID); + } +} + +} // namespace + +Future registerNativeCdcStream(Database cx, Key name, KeyRange keys, Optional proxyId) { + validateNativeCdcEnabled(); + validateNativeCdcStream(name, keys); + + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + const Key nameKey = cdcStreamNameKeyFor(name); + Optional currentId = co_await tr.get(nameKey); + if (currentId.present()) { + const CDCStreamId streamId = decodeCDCStreamNameValue(currentId.get()); + Optional currentKeys = co_await tr.get(cdcStreamKeyFor(streamId)); + if 
(!currentKeys.present() || decodeCDCStreamKeysValue(currentKeys.get()) != keys) { + throw client_invalid_operation(); + } + if (proxyId.present() && !(co_await getNativeCdcProxyAssignment(&tr, streamId)).present()) { + CODE_PROBE(true, "Native CDC registration restores missing stream owner"); + const Tag tag = co_await getNativeCdcCurrentTag(&tr, streamId); + Optional sharedTagProxy = co_await getNativeCdcProxyAssignmentForTag(&tr, tag); + CODE_PROBE(sharedTagProxy.present(), "Native CDC shared-tag streams use one owner"); + const UID selectedProxy = sharedTagProxy.present() ? sharedTagProxy.get() : proxyId.get(); + tr.set(cdcProxyKeyFor(streamId, selectedProxy), Value()); + signalNativeCdcProxyAssignmentChange(&tr); + co_await tr.commit(); + } + co_return streamId; + } + + NativeCdcIdentifierAllocator allocator; + co_await observeNativeCdcMetadata(&tr, &allocator); + const auto [streamId, tag] = allocator.allocate(); + const Version registrationVersion = co_await tr.getReadVersion(); + + tr.set(nameKey, cdcStreamNameValue(streamId)); + tr.set(cdcMaxStreamIdKey, cdcMaxStreamIdValue(streamId)); + tr.set(cdcStreamKeyFor(streamId), cdcStreamKeysValue(keys)); + tr.set(cdcTagHistoryKeyFor(streamId, registrationVersion, tag), Value()); + tr.atomicOp( + cdcMinVersionKeyFor(streamId), cdcVersionstampedMinVersionValue(), MutationRef::SetVersionstampedValue); + if (proxyId.present()) { + Optional sharedTagProxy = co_await getNativeCdcProxyAssignmentForTag(&tr, tag); + const UID selectedProxy = sharedTagProxy.present() ? 
sharedTagProxy.get() : proxyId.get(); + tr.set(cdcProxyKeyFor(streamId, selectedProxy), Value()); + signalNativeCdcProxyAssignmentChange(&tr); + } + co_await tr.commit(); + co_return streamId; + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } +} + +Future> removeNativeCdcStream(Database cx, Key name, Optional proxyId) { + if (name.empty()) { + throw client_invalid_operation(); + } + + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + const Key nameKey = cdcStreamNameKeyFor(name); + Optional currentId = co_await tr.get(nameKey); + if (!currentId.present()) { + co_return Optional(); + } + + const CDCStreamId streamId = decodeCDCStreamNameValue(currentId.get()); + Optional assignedProxy = co_await getNativeCdcProxyAssignment(&tr, streamId); + if (proxyId.present() && (!assignedProxy.present() || assignedProxy.get() != proxyId.get())) { + CODE_PROBE(true, "Native CDC rejects removal through a stale owner"); + throw wrong_shard_server(); + } + + std::set removedTags; + const KeyRange historyRange = cdcTagHistoryRangeFor(streamId); + Key begin = historyRange.begin; + while (begin < historyRange.end) { + RangeResult history = + co_await tr.getRange(KeyRangeRef(begin, historyRange.end), CLIENT_KNOBS->TOO_MANY); + for (const auto& entry : history) { + removedTags.insert(std::get<2>(decodeCDCTagHistoryKey(entry.key))); + } + if (!history.more) { + break; + } + begin = keyAfter(history.back().key); + } + + tr.clear(nameKey); + tr.clear(cdcStreamKeyFor(streamId)); + for (const Tag& tag : removedTags) { + tr.set(cdcRetiredTagPopKeyFor(tag), Value()); + tr.atomicOp(cdcRetiredTagPopVersionKeyFor(tag), + cdcVersionstampedMinVersionValue(), + MutationRef::SetVersionstampedValue); + } + tr.clear(cdcTagHistoryRangeFor(streamId)); + tr.clear(cdcMinVersionKeyFor(streamId)); + tr.clear(cdcProxyRangeFor(streamId)); + if (assignedProxy.present()) { 
+ signalNativeCdcProxyAssignmentChange(&tr); + } + co_await tr.commit(); + NativeCdcRemovedStreamInfo removed; + removed.removalVersion = tr.getCommittedVersion(); + removed.tags.assign(removedTags.begin(), removedTags.end()); + CODE_PROBE(!removed.tags.empty(), "Native CDC removal records final tagged pop work"); + co_return Optional(removed); + } catch (Error& e) { + if (e.code() == error_code_wrong_shard_server) { + throw; + } + err = e; + } + co_await tr.onError(err); + } +} + +Future> listNativeCdcStreams(Database cx) { + std::vector result; + Key begin = cdcStreamNameKeys.begin; + Transaction tr(cx); + + while (begin < cdcStreamNameKeys.end) { + Error err; + try { + tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE); + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + RangeResult names = co_await tr.getRange(KeyRangeRef(begin, cdcStreamNameKeys.end), CLIENT_KNOBS->TOO_MANY); + for (const auto& kv : names) { + const CDCStreamId streamId = decodeCDCStreamNameValue(kv.value); + Optional keys = co_await tr.get(cdcStreamKeyFor(streamId)); + Optional minVersion = co_await tr.get(cdcMinVersionKeyFor(streamId)); + if (keys.present() && minVersion.present()) { + result.push_back(NativeCdcStreamInfo{ decodeCDCStreamNameKey(kv.key), + streamId, + decodeCDCStreamKeysValue(keys.get()), + decodeCDCMinVersionValue(minVersion.get()) }); + } + } + if (!names.more) { + break; + } + begin = keyAfter(names.back().key); + continue; + } catch (Error& e) { + err = e; + } + result.clear(); + begin = cdcStreamNameKeys.begin; + co_await tr.onError(err); + } + co_return result; +} + +Future reassignNativeCdcStreams(Database cx, UID oldProxyId, UID newProxyId) { + if (oldProxyId == newProxyId) { + co_return; + } + + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + + bool changed = false; + Key 
begin = cdcProxyKeys.begin; + while (begin < cdcProxyKeys.end) { + RangeResult assignments = + co_await tr.getRange(KeyRangeRef(begin, cdcProxyKeys.end), CLIENT_KNOBS->TOO_MANY); + for (const auto& assignment : assignments) { + const auto [streamId, proxyId] = decodeCDCProxyKey(assignment.key); + if (proxyId == oldProxyId) { + tr.clear(assignment.key); + tr.set(cdcProxyKeyFor(streamId, newProxyId), Value()); + changed = true; + } + } + if (!assignments.more) { + break; + } + begin = keyAfter(assignments.back().key); + } + + if (changed) { + CODE_PROBE(true, "Native CDC reassigns streams after proxy replacement"); + signalNativeCdcProxyAssignmentChange(&tr); + co_await tr.commit(); + } + co_return; + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } +} + +Future acknowledgeNativeCdcStream(Database cx, + CDCStreamId streamId, + Version consumedThrough, + Version knownAvailableThrough) { + if (streamId == 0 || consumedThrough < 0 || consumedThrough >= std::numeric_limits::max() - 1) { + throw client_invalid_operation(); + } + const Version minUnpoppedVersion = consumedThrough + 1; + + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + + Optional minVersionValue = co_await tr.get(cdcMinVersionKeyFor(streamId)); + if (!minVersionValue.present()) { + throw client_invalid_operation(); + } + + const Version minVersion = decodeCDCMinVersionValue(minVersionValue.get()); + if (minUnpoppedVersion <= minVersion) { + CODE_PROBE(true, "Native CDC preserves a durable duplicate acknowledgement"); + co_return minVersion; + } + + const Version readVersion = co_await tr.getReadVersion(); + if (consumedThrough > readVersion && consumedThrough > knownAvailableThrough) { + CODE_PROBE(true, "Native CDC rejects unproven acknowledgement progress"); + throw client_invalid_operation(); + } + + 
tr.set(cdcMinVersionKeyFor(streamId), cdcMinVersionValue(minUnpoppedVersion)); + co_await tr.commit(); + co_return minUnpoppedVersion; + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } +} + +Future registerNativeCdcStreamClient(Database cx, Key name, KeyRange keys) { + validateNativeCdcEnabled(); + validateNativeCdcStream(name, keys); + const CDCProxyInterface proxy = co_await getAvailableNativeCdcProxy(cx); + co_return co_await registerNativeCdcStream(cx, name, keys, proxy.id()); +} + +Future> listNativeCdcStreamsClient(Database cx) { + Optional previousProxy; + + while (true) { + CDCProxyInterface proxy = co_await getAvailableNativeCdcProxy(cx, previousProxy); + try { + Future proxyChanged = cx->clientInfo->onChange(); + auto result = + co_await race(throwErrorOr(proxy.listStreams.tryGetReply(CDCListStreamsRequest())), proxyChanged); + if (result.index() == 0) { + CDCListStreamsReply reply = std::get<0>(std::move(result)); + std::vector streams; + streams.reserve(reply.streams.size()); + for (const auto& stream : reply.streams) { + streams.push_back(NativeCdcStreamInfo{ + Key(stream.name), stream.streamId, KeyRange(stream.keys), stream.minVersion }); + } + co_return streams; + } + } catch (Error& error) { + if (!retryNativeCdcProxyRequest(error)) { + throw; + } + } + previousProxy = proxy.id(); + co_await delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, cx->taskID); + } +} + +Future removeNativeCdcStreamClient(Database cx, Key name) { + if (name.empty()) { + throw client_invalid_operation(); + } + + while (true) { + Optional streamId = co_await findNativeCdcStreamId(cx, name); + if (!streamId.present()) { + co_return; + } + + Optional proxy = co_await getNativeCdcStreamProxyForRemoval(cx, name, streamId.get()); + if (!proxy.present()) { + co_return; + } + try { + Future proxyChanged = cx->clientInfo->onChange(); + auto result = co_await race( + throwErrorOr(proxy.get().removeStream.tryGetReply(CDCRemoveStreamRequest(name))), proxyChanged); + if 
(result.index() == 0) { + co_return; + } + } catch (Error& error) { + if (!retryNativeCdcProxyRequest(error)) { + throw; + } + } + co_await delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, cx->taskID); + } +} + +Future> createNativeCdcConsumer(Database cx, Key name) { + const CDCStreamId streamId = co_await getNativeCdcStreamId(cx, name); + co_return makeReference(cx, CDCCursor(streamId, invalidVersion)); +} + +Reference resumeNativeCdcConsumer(Database cx, CDCCursor position) { + return makeReference(cx, position); +} + +Future NativeCdcConsumer::consumeImpl(Reference self) { + try { + while (true) { + CDCProxyInterface proxy = co_await getNativeCdcStreamProxy(self->cx, self->currentPosition.streamId); + Future proxyChanged = self->cx->clientInfo->onChange(); + try { + auto result = co_await race( + throwErrorOr(proxy.consume.tryGetReply(CDCConsumeRequest(self->currentPosition))), proxyChanged); + if (result.index() == 0) { + CDCConsumeReply reply = std::get<0>(std::move(result)); + self->knownAvailableThrough = reply.lastConsumedVersion; + self->currentPosition.lastConsumedVersion = reply.lastConsumedVersion; + self->operationOutstanding = false; + co_return reply; + } + CODE_PROBE(true, "Native CDC consume retries after proxy metadata change", probe::decoration::rare); + } catch (Error& error) { + if (!retryNativeCdcProxyRequest(error)) { + throw; + } + CODE_PROBE(true, "Native CDC consume retries after proxy request failure", probe::decoration::rare); + } + co_await delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, self->cx->taskID); + } + } catch (Error&) { + self->operationOutstanding = false; + throw; + } +} + +Future NativeCdcConsumer::consume() { + if (operationOutstanding) { + CODE_PROBE(true, "Native CDC consumer rejects overlapping operations", probe::decoration::rare); + return Future(client_invalid_operation()); + } + operationOutstanding = true; + return consumeImpl(Reference::addRef(this)); +} + +Future NativeCdcConsumer::acknowledgeImpl(Reference self) { 
+ try { + if (self->currentPosition.streamId == 0 || self->currentPosition.lastConsumedVersion < 0 || + self->currentPosition.lastConsumedVersion == std::numeric_limits::max()) { + throw client_invalid_operation(); + } + const Version acknowledgedVersion = self->currentPosition.lastConsumedVersion; + co_await acknowledgeNativeCdcStream( + self->cx, self->currentPosition.streamId, acknowledgedVersion, self->knownAvailableThrough); + + while (true) { + CDCProxyInterface proxy = co_await getNativeCdcStreamProxy(self->cx, self->currentPosition.streamId); + try { + Future proxyChanged = self->cx->clientInfo->onChange(); + auto result = co_await race(throwErrorOr(proxy.ack.tryGetReply( + CDCAckRequest(self->currentPosition.streamId, acknowledgedVersion))), + proxyChanged); + if (result.index() == 0) { + self->operationOutstanding = false; + co_return; + } + } catch (Error& error) { + if (!retryNativeCdcProxyRequest(error)) { + throw; + } + } + co_await delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, self->cx->taskID); + } + } catch (Error&) { + self->operationOutstanding = false; + throw; + } +} + +Future NativeCdcConsumer::acknowledge() { + if (operationOutstanding) { + CODE_PROBE(true, "Native CDC consumer rejects overlapping operations", probe::decoration::rare); + return Future(client_invalid_operation()); + } + operationOutstanding = true; + return acknowledgeImpl(Reference::addRef(this)); +} + +TEST_CASE("/NativeCDC/LifecycleAllocation") { + NativeCdcIdentifierAllocator allocator; + auto [initialId, initialTag] = allocator.allocate(); + ASSERT(initialId == 1); + ASSERT(initialTag == Tag(tagLocalityCDC, 0)); + + allocator.observeStreamId(9); + allocator.observeTag(initialTag); + allocator.observeTag(Tag(tagLocalityCDC, 2)); + auto [nextId, nextTag] = allocator.allocate(); + ASSERT(nextId == 10); + ASSERT(nextTag == Tag(tagLocalityCDC, 1)); + + NativeCdcIdentifierAllocator fullPoolAllocator; + for (uint32_t tagId = 0; tagId < 
static_cast(CLIENT_KNOBS->NATIVE_CDC_TAG_COUNT); ++tagId) { + fullPoolAllocator.observeTag(Tag(tagLocalityCDC, static_cast(tagId))); + } + auto [sharedId, sharedTag] = fullPoolAllocator.allocate(); + ASSERT(sharedId == 1); + ASSERT(sharedTag == Tag(tagLocalityCDC, 0)); + + return Void(); +} diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index 07ca49a33f3..e1eddbcf9c0 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -771,6 +771,201 @@ int8_t decodeTagLocalityListValue(ValueRef const& value) { return s; } +const KeyRangeRef cdcStreamNameKeys("\xff/cdc/name/"_sr, "\xff/cdc/name0"_sr); +const KeyRef cdcMaxStreamIdKey = "\xff/cdc/maxStreamId"_sr; +const KeyRangeRef cdcStreamKeys("\xff/cdc/keys/"_sr, "\xff/cdc/keys0"_sr); +const KeyRangeRef cdcTagHistoryKeys("\xff/cdc/tagHistory/"_sr, "\xff/cdc/tagHistory0"_sr); +const KeyRangeRef cdcMinVersionKeys("\xff\x02/cdc/minVersion/"_sr, "\xff\x02/cdc/minVersion0"_sr); +const KeyRangeRef cdcRetiredTagPopKeys("\xff/cdc/retiredTagPop/"_sr, "\xff/cdc/retiredTagPop0"_sr); +const KeyRangeRef cdcRetiredTagPopVersionKeys("\xff\x02/cdc/retiredTagPopVersion/"_sr, + "\xff\x02/cdc/retiredTagPopVersion0"_sr); +const KeyRangeRef cdcProxyKeys("\xff/cdc/proxies/"_sr, "\xff/cdc/proxies0"_sr); +const KeyRef cdcProxyAssignmentChangeKey = "\xff/cdc/proxyAssignmentChange"_sr; + +Key cdcStreamNameKeyFor(KeyRef const& streamName) { + return streamName.withPrefix(cdcStreamNameKeys.begin); +} + +Key decodeCDCStreamNameKey(KeyRef const& key) { + return key.removePrefix(cdcStreamNameKeys.begin); +} + +Value cdcStreamNameValue(CDCStreamId streamId) { + BinaryWriter wr(IncludeVersion(ProtocolVersion::withNativeCdc())); + wr << streamId; + return wr.toValue(); +} + +CDCStreamId decodeCDCStreamNameValue(ValueRef const& value) { + CDCStreamId streamId; + BinaryReader reader(value, IncludeVersion()); + ASSERT_WE_THINK(reader.protocolVersion().hasNativeCdc()); + reader >> streamId; + return streamId; +} + +Value 
cdcMaxStreamIdValue(CDCStreamId streamId) { + return cdcStreamNameValue(streamId); +} + +CDCStreamId decodeCDCMaxStreamIdValue(ValueRef const& value) { + return decodeCDCStreamNameValue(value); +} + +Key cdcStreamKeyFor(CDCStreamId streamId) { + BinaryWriter wr(Unversioned()); + wr.serializeBytes(cdcStreamKeys.begin); + wr << streamId; + return wr.toValue(); +} + +CDCStreamId decodeCDCStreamKey(KeyRef const& key) { + CDCStreamId streamId; + BinaryReader reader(key.removePrefix(cdcStreamKeys.begin), Unversioned()); + reader >> streamId; + return streamId; +} + +Value cdcStreamKeysValue(KeyRangeRef const& keys) { + BinaryWriter wr(IncludeVersion(ProtocolVersion::withNativeCdc())); + wr << keys; + return wr.toValue(); +} + +KeyRange decodeCDCStreamKeysValue(ValueRef const& value) { + KeyRange keys; + BinaryReader reader(value, IncludeVersion()); + ASSERT_WE_THINK(reader.protocolVersion().hasNativeCdc()); + reader >> keys; + return keys; +} + +static Key cdcTagHistoryPrefixFor(CDCStreamId streamId) { + BinaryWriter wr(Unversioned()); + wr.serializeBytes(cdcTagHistoryKeys.begin); + wr << streamId; + return wr.toValue(); +} + +Key cdcTagHistoryKeyFor(CDCStreamId streamId, Version version, Tag tag) { + BinaryWriter wr(Unversioned()); + wr.serializeBytes(cdcTagHistoryPrefixFor(streamId)); + + Version encodedVersion = bigEndian64(version); + Key versionBytes = makeString(sizeof(encodedVersion)); + memcpy(mutateString(versionBytes), &encodedVersion, sizeof(encodedVersion)); + wr.serializeBytes(versionBytes); + wr << tag; + return wr.toValue(); +} + +KeyRange cdcTagHistoryRangeFor(CDCStreamId streamId) { + return prefixRange(cdcTagHistoryPrefixFor(streamId)); +} + +std::tuple decodeCDCTagHistoryKey(KeyRef const& key) { + CDCStreamId streamId; + Version encodedVersion; + Tag tag; + BinaryReader reader(key.removePrefix(cdcTagHistoryKeys.begin), Unversioned()); + reader >> streamId >> encodedVersion >> tag; + return { streamId, bigEndian64(encodedVersion), tag }; +} + +Key 
cdcMinVersionKeyFor(CDCStreamId streamId) { + BinaryWriter wr(Unversioned()); + wr.serializeBytes(cdcMinVersionKeys.begin); + wr << streamId; + return wr.toValue(); +} + +CDCStreamId decodeCDCMinVersionKey(KeyRef const& key) { + CDCStreamId streamId; + BinaryReader reader(key.removePrefix(cdcMinVersionKeys.begin), Unversioned()); + reader >> streamId; + return streamId; +} + +Value cdcMinVersionValue(Version version) { + BinaryWriter wr(IncludeVersion(ProtocolVersion::withNativeCdc())); + wr << version; + return wr.toValue(); +} + +Value cdcVersionstampedMinVersionValue() { + // Ten placeholder bytes followed by the versionstamp offset at byte zero. + return "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"_sr; +} + +Version decodeCDCMinVersionValue(ValueRef const& value) { + if (value.size() == sizeof(Version) + sizeof(uint16_t)) { + Versionstamp versionstamp; + BinaryReader reader(value, Unversioned()); + reader >> versionstamp; + return versionstamp.version; + } + + Version version; + BinaryReader reader(value, IncludeVersion()); + ASSERT_WE_THINK(reader.protocolVersion().hasNativeCdc()); + reader >> version; + return version; +} + +Key cdcRetiredTagPopKeyFor(Tag tag) { + BinaryWriter wr(Unversioned()); + wr.serializeBytes(cdcRetiredTagPopKeys.begin); + wr << tag; + return wr.toValue(); +} + +Tag decodeCDCRetiredTagPopKey(KeyRef const& key) { + Tag tag; + BinaryReader reader(key.removePrefix(cdcRetiredTagPopKeys.begin), Unversioned()); + reader >> tag; + return tag; +} + +Key cdcRetiredTagPopVersionKeyFor(Tag tag) { + BinaryWriter wr(Unversioned()); + wr.serializeBytes(cdcRetiredTagPopVersionKeys.begin); + wr << tag; + return wr.toValue(); +} + +Tag decodeCDCRetiredTagPopVersionKey(KeyRef const& key) { + Tag tag; + BinaryReader reader(key.removePrefix(cdcRetiredTagPopVersionKeys.begin), Unversioned()); + reader >> tag; + return tag; +} + +static Key cdcProxyPrefixFor(CDCStreamId streamId) { + BinaryWriter wr(Unversioned()); + 
wr.serializeBytes(cdcProxyKeys.begin); + wr << streamId; + return wr.toValue(); +} + +Key cdcProxyKeyFor(CDCStreamId streamId, UID proxyId) { + BinaryWriter wr(Unversioned()); + wr.serializeBytes(cdcProxyPrefixFor(streamId)); + wr << proxyId; + return wr.toValue(); +} + +KeyRange cdcProxyRangeFor(CDCStreamId streamId) { + return prefixRange(cdcProxyPrefixFor(streamId)); +} + +std::pair decodeCDCProxyKey(KeyRef const& key) { + CDCStreamId streamId; + UID proxyId; + BinaryReader reader(key.removePrefix(cdcProxyKeys.begin), Unversioned()); + reader >> streamId >> proxyId; + return { streamId, proxyId }; +} + const KeyRangeRef datacenterReplicasKeys("\xff\x02/datacenterReplicas/"_sr, "\xff\x02/datacenterReplicas0"_sr); const KeyRef datacenterReplicasPrefix = datacenterReplicasKeys.begin; @@ -1648,3 +1843,41 @@ TEST_CASE("noSim/SystemData/DataMoveId") { return Void(); } + +TEST_CASE("/SystemData/NativeCDC") { + const Key name = "orders"_sr; + const CDCStreamId streamId = 42; + const KeyRange keys(KeyRangeRef("a"_sr, "z"_sr)); + const Version minVersion = 123456789; + const Tag tag(tagLocalityCDC, 9); + const UID proxyId(1, 2); + + ASSERT(decodeCDCStreamNameKey(cdcStreamNameKeyFor(name)) == name); + ASSERT(decodeCDCStreamNameValue(cdcStreamNameValue(streamId)) == streamId); + ASSERT(decodeCDCMaxStreamIdValue(cdcMaxStreamIdValue(streamId)) == streamId); + ASSERT(decodeCDCStreamKey(cdcStreamKeyFor(streamId)) == streamId); + ASSERT(decodeCDCStreamKeysValue(cdcStreamKeysValue(keys)) == keys); + ASSERT(decodeCDCMinVersionKey(cdcMinVersionKeyFor(streamId)) == streamId); + ASSERT(decodeCDCMinVersionValue(cdcMinVersionValue(minVersion)) == minVersion); + ASSERT(nonMetadataSystemKeys.contains(cdcMinVersionKeyFor(streamId))); + ASSERT(cdcVersionstampedMinVersionValue().size() == sizeof(Version) + sizeof(uint16_t) + sizeof(int32_t)); + ASSERT(decodeCDCRetiredTagPopKey(cdcRetiredTagPopKeyFor(tag)) == tag); + ASSERT(cdcRetiredTagPopKeys.contains(cdcRetiredTagPopKeyFor(tag))); + 
ASSERT(decodeCDCRetiredTagPopVersionKey(cdcRetiredTagPopVersionKeyFor(tag)) == tag); + ASSERT(cdcRetiredTagPopVersionKeys.contains(cdcRetiredTagPopVersionKeyFor(tag))); + ASSERT(nonMetadataSystemKeys.contains(cdcRetiredTagPopVersionKeyFor(tag))); + + const Key tagHistoryKey = cdcTagHistoryKeyFor(streamId, minVersion, tag); + const auto [decodedStreamId, decodedVersion, decodedTag] = decodeCDCTagHistoryKey(tagHistoryKey); + ASSERT(decodedStreamId == streamId); + ASSERT(decodedVersion == minVersion); + ASSERT(decodedTag == tag); + ASSERT(cdcTagHistoryRangeFor(streamId).contains(tagHistoryKey)); + + const auto [proxyStreamId, decodedProxyId] = decodeCDCProxyKey(cdcProxyKeyFor(streamId, proxyId)); + ASSERT(proxyStreamId == streamId); + ASSERT(decodedProxyId == proxyId); + ASSERT(cdcProxyRangeFor(streamId).contains(cdcProxyKeyFor(streamId, proxyId))); + + return Void(); +} diff --git a/fdbclient/include/fdbclient/CDCProxyInterface.h b/fdbclient/include/fdbclient/CDCProxyInterface.h new file mode 100644 index 00000000000..b15c994bcd0 --- /dev/null +++ b/fdbclient/include/fdbclient/CDCProxyInterface.h @@ -0,0 +1,249 @@ +/* + * CDCProxyInterface.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2026 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef FDBCLIENT_CDCPROXYINTERFACE_H +#define FDBCLIENT_CDCPROXYINTERFACE_H +#pragma once + +#include "fdbclient/CommitTransaction.h" +#include "flow/FileIdentifier.h" +#include "fdbrpc/fdbrpc.h" + +struct CDCCursor { + constexpr static FileIdentifier file_identifier = 10949553; + CDCStreamId streamId = 0; + Version lastConsumedVersion = invalidVersion; + + CDCCursor() = default; + CDCCursor(CDCStreamId streamId, Version lastConsumedVersion) + : streamId(streamId), lastConsumedVersion(lastConsumedVersion) {} + + template + void serialize(Ar& ar) { + serializer(ar, streamId, lastConsumedVersion); + } +}; + +struct VersionedMutationsRef { + constexpr static FileIdentifier file_identifier = 3297577; + Version version = invalidVersion; + VectorRef mutations; + + VersionedMutationsRef() = default; + VersionedMutationsRef(Version version, VectorRef mutations) : version(version), mutations(mutations) {} + + template + void serialize(Ar& ar) { + serializer(ar, version, mutations); + } +}; + +struct CDCStreamInfoRef { + constexpr static FileIdentifier file_identifier = 10228408; + StringRef name; + CDCStreamId streamId = 0; + KeyRangeRef keys; + Version minVersion = invalidVersion; + + CDCStreamInfoRef() = default; + CDCStreamInfoRef(StringRef name, CDCStreamId streamId, KeyRangeRef keys, Version minVersion) + : name(name), streamId(streamId), keys(keys), minVersion(minVersion) {} + + template + void serialize(Ar& ar) { + serializer(ar, name, streamId, keys, minVersion); + } +}; + +struct CDCRegisterStreamReply { + constexpr static FileIdentifier file_identifier = 3217071; + CDCStreamId streamId = 0; + + CDCRegisterStreamReply() = default; + explicit CDCRegisterStreamReply(CDCStreamId streamId) : streamId(streamId) {} + + template + void serialize(Ar& ar) { + serializer(ar, streamId); + } +}; + +struct CDCRegisterStreamRequest { + constexpr static FileIdentifier file_identifier = 1269096; + Key name; + KeyRange keys; + ReplyPromise reply; + + 
CDCRegisterStreamRequest() = default; + CDCRegisterStreamRequest(Key name, KeyRange keys) : name(name), keys(keys) {} + + bool verify() const { return true; } + + template + void serialize(Ar& ar) { + serializer(ar, name, keys, reply); + } +}; + +struct CDCRemoveStreamRequest { + constexpr static FileIdentifier file_identifier = 3683857; + Key name; + ReplyPromise reply; + + CDCRemoveStreamRequest() = default; + explicit CDCRemoveStreamRequest(Key name) : name(name) {} + + bool verify() const { return true; } + + template + void serialize(Ar& ar) { + serializer(ar, name, reply); + } +}; + +struct CDCListStreamsReply { + constexpr static FileIdentifier file_identifier = 7600884; + Arena arena; + VectorRef streams; + + template + void serialize(Ar& ar) { + serializer(ar, streams, arena); + } +}; + +struct CDCListStreamsRequest { + constexpr static FileIdentifier file_identifier = 8134529; + ReplyPromise reply; + + bool verify() const { return true; } + + template + void serialize(Ar& ar) { + serializer(ar, reply); + } +}; + +struct CDCConsumeReply { + constexpr static FileIdentifier file_identifier = 12940542; + Arena arena; + VectorRef mutations; + Version lastConsumedVersion = invalidVersion; + + template + void serialize(Ar& ar) { + serializer(ar, mutations, lastConsumedVersion, arena); + } +}; + +struct CDCConsumeRequest { + constexpr static FileIdentifier file_identifier = 8178243; + CDCCursor cursor; + ReplyPromise reply; + + CDCConsumeRequest() = default; + explicit CDCConsumeRequest(CDCCursor cursor) : cursor(cursor) {} + + bool verify() const { return true; } + + template + void serialize(Ar& ar) { + serializer(ar, cursor, reply); + } +}; + +struct CDCAckRequest { + constexpr static FileIdentifier file_identifier = 15923892; + CDCStreamId streamId = 0; + Version version = invalidVersion; + ReplyPromise reply; + + CDCAckRequest() = default; + CDCAckRequest(CDCStreamId streamId, Version version) : streamId(streamId), version(version) {} + + bool verify() const 
{ return true; } + + template + void serialize(Ar& ar) { + serializer(ar, streamId, version, reply); + } +}; + +struct HaltCDCProxyRequest { + constexpr static FileIdentifier file_identifier = 6992638; + ReplyPromise reply; + + bool verify() const { return true; } + + template + void serialize(Ar& ar) { + serializer(ar, reply); + } +}; + +struct CDCProxyInterface { + constexpr static FileIdentifier file_identifier = 6689609; + enum { LocationAwareLoadBalance = 1 }; + enum { AlwaysFresh = 1 }; + + Optional processId; + PublicRequestStream consume; + PublicRequestStream registerStream; + PublicRequestStream removeStream; + PublicRequestStream listStreams; + PublicRequestStream ack; + RequestStream> waitFailure; + RequestStream haltForTesting; + + UID id() const { return consume.getEndpoint().token; } + std::string toString() const { return id().shortString(); } + bool operator==(CDCProxyInterface const& r) const { return id() == r.id(); } + bool operator!=(CDCProxyInterface const& r) const { return id() != r.id(); } + NetworkAddress address() const { return consume.getEndpoint().getPrimaryAddress(); } + NetworkAddressList addresses() const { return consume.getEndpoint().addresses; } + + template + void serialize(Ar& ar) { + serializer(ar, processId, consume); + if (Ar::isDeserializing) { + registerStream = + PublicRequestStream(consume.getEndpoint().getAdjustedEndpoint(1)); + removeStream = PublicRequestStream(consume.getEndpoint().getAdjustedEndpoint(2)); + listStreams = PublicRequestStream(consume.getEndpoint().getAdjustedEndpoint(3)); + ack = PublicRequestStream(consume.getEndpoint().getAdjustedEndpoint(4)); + waitFailure = RequestStream>(consume.getEndpoint().getAdjustedEndpoint(5)); + haltForTesting = RequestStream(consume.getEndpoint().getAdjustedEndpoint(6)); + } + } + + void initEndpoints() { + std::vector> streams; + streams.push_back(consume.getReceiver(TaskPriority::ReadSocket)); + streams.push_back(registerStream.getReceiver(TaskPriority::ReadSocket)); + 
streams.push_back(removeStream.getReceiver(TaskPriority::ReadSocket)); + streams.push_back(listStreams.getReceiver(TaskPriority::ReadSocket)); + streams.push_back(ack.getReceiver(TaskPriority::ReadSocket)); + streams.push_back(waitFailure.getReceiver()); + streams.push_back(haltForTesting.getReceiver()); + FlowTransport::transport().addEndpoints(streams); + } +}; + +#endif // FDBCLIENT_CDCPROXYINTERFACE_H diff --git a/fdbclient/include/fdbclient/CommitProxyInterface.h b/fdbclient/include/fdbclient/CommitProxyInterface.h index 5fa33fae819..b493007f05d 100644 --- a/fdbclient/include/fdbclient/CommitProxyInterface.h +++ b/fdbclient/include/fdbclient/CommitProxyInterface.h @@ -22,10 +22,12 @@ #define FDBCLIENT_COMMITPROXYINTERFACE_H #pragma once +#include #include #include #include "fdbclient/CommitTransaction.h" +#include "fdbclient/CDCProxyInterface.h" #include "fdbclient/FDBTypes.h" #include "fdbclient/GlobalConfig.h" #include "fdbclient/GrvProxyInterface.h" @@ -111,6 +113,8 @@ struct ClientDBInfo { UID id; // Changes each time anything else changes std::vector grvProxies; std::vector commitProxies; + std::vector cdcProxies; + std::map streamToCDCProxyId; Optional firstCommitProxy; // not serialized, used for commitOnFirstProxy when the commit proxies vector has been shrunk Optional forward; @@ -129,7 +133,16 @@ struct ClientDBInfo { if constexpr (!is_fb_function) { ASSERT(ar.protocolVersion().isValid()); } - serializer(ar, grvProxies, commitProxies, id, forward, history, clusterId, clusterType); + serializer(ar, + grvProxies, + commitProxies, + id, + forward, + history, + clusterId, + clusterType, + cdcProxies, + streamToCDCProxyId); } }; diff --git a/fdbclient/include/fdbclient/FDBTypes.h b/fdbclient/include/fdbclient/FDBTypes.h index ea728b8f724..5e87aa45d48 100644 --- a/fdbclient/include/fdbclient/FDBTypes.h +++ b/fdbclient/include/fdbclient/FDBTypes.h @@ -39,6 +39,7 @@ #include "fdbrpc/Locality.h" typedef int64_t Version; +typedef uint64_t CDCStreamId; typedef 
uint64_t LogEpoch; typedef uint64_t Sequence; typedef StringRef KeyRef; @@ -62,6 +63,7 @@ enum { tagLocalityTxs = -7, tagLocalityBackup = -8, // used by backup role to pop from TLogs tagLocalityRangeBackup = -9, // used by range-partitioned backup workers + tagLocalityCDC = -10, // used by native change data capture streams tagLocalityInvalid = -99 }; // The TLog and LogRouter require these number to be as compact as possible diff --git a/fdbclient/include/fdbclient/Knobs.h b/fdbclient/include/fdbclient/Knobs.h index 493506f69d5..c7d75c8735d 100644 --- a/fdbclient/include/fdbclient/Knobs.h +++ b/fdbclient/include/fdbclient/Knobs.h @@ -90,6 +90,8 @@ class SWIFT_CXX_IMMORTAL_SINGLETON_TYPE ClientKnobs : public KnobsImpl + +#include "fdbclient/CDCProxyInterface.h" +#include "fdbclient/NativeAPI.actor.h" + +struct NativeCdcStreamInfo { + Key name; + CDCStreamId streamId = 0; + KeyRange keys; + Version minVersion = invalidVersion; +}; + +class NativeCdcConsumer : public ReferenceCounted { +public: + NativeCdcConsumer(Database cx, CDCCursor position) : cx(cx), currentPosition(position) {} + + // Operations advance shared delivery state; only one may be outstanding. + Future consume(); + Future acknowledge(); + const CDCCursor& position() const { return currentPosition; } + +private: + static Future consumeImpl(Reference self); + static Future acknowledgeImpl(Reference self); + + Database cx; + CDCCursor currentPosition; + Version knownAvailableThrough = invalidVersion; + bool operationOutstanding = false; +}; + +// Client-facing CDC operations. Registration is feature gated; the remaining +// operations stay available so existing durable streams can be drained after +// the feature is disabled. Requests retry when stream ownership changes. 
+Future registerNativeCdcStreamClient(Database cx, Key name, KeyRange keys); +Future removeNativeCdcStreamClient(Database cx, Key name); +Future> listNativeCdcStreamsClient(Database cx); +// Uses the range registered for this name; consumers do not respecify it. A +// CDCCursor remains a serializable position token and does not hold Database. +Future> createNativeCdcConsumer(Database cx, Key name); +Reference resumeNativeCdcConsumer(Database cx, CDCCursor position); + +#endif // FDBCLIENT_NATIVECDC_H diff --git a/fdbclient/include/fdbclient/NativeCdcInternal.h b/fdbclient/include/fdbclient/NativeCdcInternal.h new file mode 100644 index 00000000000..f3b6a24bbc2 --- /dev/null +++ b/fdbclient/include/fdbclient/NativeCdcInternal.h @@ -0,0 +1,56 @@ +/* + * NativeCdcInternal.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2026 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FDBCLIENT_NATIVECDCINTERNAL_H +#define FDBCLIENT_NATIVECDCINTERNAL_H +#pragma once + +#include + +#include "fdbclient/NativeCdc.h" + +struct NativeCdcRemovedStreamInfo { + Version removalVersion = invalidVersion; + std::vector tags; +}; + +// Durable metadata operations used by CDC server roles and simulation +// coverage. Registration is feature gated; drain and cleanup operations remain +// available for streams persisted before native CDC is disabled. 
+Future registerNativeCdcStream(Database cx, + Key name, + KeyRange keys, + Optional proxyId = Optional()); +// Persists per-tag final-pop watermarks before removing stream metadata. +Future> removeNativeCdcStream(Database cx, + Key name, + Optional proxyId = Optional()); +Future> listNativeCdcStreams(Database cx); +// Atomically moves any streams assigned to a failed proxy to its replacement. +Future reassignNativeCdcStreams(Database cx, UID oldProxyId, UID newProxyId); +// Persists the exclusive unpopped watermark after consuming through a version. +// knownAvailableThrough permits a consumer to acknowledge log data it has +// already received before that version is visible at a transaction read version. +Future acknowledgeNativeCdcStream(Database cx, + CDCStreamId streamId, + Version consumedThrough, + Version knownAvailableThrough = invalidVersion); + +#endif // FDBCLIENT_NATIVECDCINTERNAL_H diff --git a/fdbclient/include/fdbclient/SystemData.h b/fdbclient/include/fdbclient/SystemData.h index 2ad0ede86f1..19c38e26ca3 100644 --- a/fdbclient/include/fdbclient/SystemData.h +++ b/fdbclient/include/fdbclient/SystemData.h @@ -22,6 +22,8 @@ #define FDBCLIENT_SYSTEMDATA_H #pragma once +#include + // Functions and constants documenting the organization of the reserved keyspace in the database beginning with "\xFF" #include "fdbclient/AccumulativeChecksum.h" @@ -264,6 +266,62 @@ Value tagLocalityListValue(int8_t const&); Optional decodeTagLocalityListKey(KeyRef const&); int8_t decodeTagLocalityListValue(ValueRef const&); +// Native CDC stream routing and lifecycle metadata persisted in the transaction state store. 
+// "\xff/cdc/name/[[streamName]]" := "[[CDCStreamId]]" +extern const KeyRangeRef cdcStreamNameKeys; +Key cdcStreamNameKeyFor(KeyRef const& streamName); +Key decodeCDCStreamNameKey(KeyRef const& key); +Value cdcStreamNameValue(CDCStreamId streamId); +CDCStreamId decodeCDCStreamNameValue(ValueRef const& value); + +// "\xff/cdc/maxStreamId" := "[[CDCStreamId]]" +extern const KeyRef cdcMaxStreamIdKey; +Value cdcMaxStreamIdValue(CDCStreamId streamId); +CDCStreamId decodeCDCMaxStreamIdValue(ValueRef const& value); + +// "\xff/cdc/keys/[[CDCStreamId]]" := "[[KeyRange]]" +extern const KeyRangeRef cdcStreamKeys; +Key cdcStreamKeyFor(CDCStreamId streamId); +CDCStreamId decodeCDCStreamKey(KeyRef const& key); +Value cdcStreamKeysValue(KeyRangeRef const& keys); +KeyRange decodeCDCStreamKeysValue(ValueRef const& value); + +// "\xff/cdc/tagHistory/[[CDCStreamId]][[Version]][[Tag]]" := "" +extern const KeyRangeRef cdcTagHistoryKeys; +Key cdcTagHistoryKeyFor(CDCStreamId streamId, Version version, Tag tag); +KeyRange cdcTagHistoryRangeFor(CDCStreamId streamId); +std::tuple decodeCDCTagHistoryKey(KeyRef const& key); + +// Native CDC acknowledgement progress is regular storage-server-backed system data. +// "\xff\x02/cdc/minVersion/[[CDCStreamId]]" := "[[Version]]" +// The initial value is versionstamped at stream registration commit. +extern const KeyRangeRef cdcMinVersionKeys; +Key cdcMinVersionKeyFor(CDCStreamId streamId); +CDCStreamId decodeCDCMinVersionKey(KeyRef const& key); +Value cdcMinVersionValue(Version version); +Value cdcVersionstampedMinVersionValue(); +Version decodeCDCMinVersionValue(ValueRef const& value); + +// "\xff/cdc/retiredTagPop/[[Tag]]" := "" +// Marks tags with durable final-pop work, so recovery keeps a CDC proxy available. 
+extern const KeyRangeRef cdcRetiredTagPopKeys; +Key cdcRetiredTagPopKeyFor(Tag tag); +Tag decodeCDCRetiredTagPopKey(KeyRef const& key); + +// "\xff\x02/cdc/retiredTagPopVersion/[[Tag]]" := "[[Version]]" +// Stores bounded storage-backed final-pop watermarks for removed streams. +extern const KeyRangeRef cdcRetiredTagPopVersionKeys; +Key cdcRetiredTagPopVersionKeyFor(Tag tag); +Tag decodeCDCRetiredTagPopVersionKey(KeyRef const& key); + +// "\xff/cdc/proxies/[[CDCStreamId]][[proxyUID]]" := "" +extern const KeyRangeRef cdcProxyKeys; +// Changed whenever durable CDC stream-to-proxy assignments change. +extern const KeyRef cdcProxyAssignmentChangeKey; +Key cdcProxyKeyFor(CDCStreamId streamId, UID proxyId); +KeyRange cdcProxyRangeFor(CDCStreamId streamId); +std::pair decodeCDCProxyKey(KeyRef const& key); + // "\xff\x02/datacenterReplicas/[[datacenterID]]" := "[[replicas]]" // Provides the number of replicas for the given datacenterID. // Used in the initialization of the Data Distributor. diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index b40708092ad..b0640a2fe22 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -29,6 +29,7 @@ add_subdirectory(core) add_subdirectory(kvstore) add_subdirectory(logsystem) add_subdirectory(mocks3) +add_subdirectory(cdcproxy) add_subdirectory(clustercontroller) add_subdirectory(backupworker) add_subdirectory(commitproxy) @@ -168,6 +169,7 @@ target_link_libraries(fdbserver PRIVATE "$" fdbserver_worker fdbserver_backupworker + fdbserver_cdcproxy fdbserver_clustercontroller fdbserver_commitproxy fdbserver_consistencyscan diff --git a/fdbserver/cdcproxy/CDCProxy.cpp b/fdbserver/cdcproxy/CDCProxy.cpp new file mode 100644 index 00000000000..706b20a24c0 --- /dev/null +++ b/fdbserver/cdcproxy/CDCProxy.cpp @@ -0,0 +1,1035 @@ +/* + * CDCProxy.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2026 Apple Inc. 
and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "fdbclient/Knobs.h" +#include "fdbclient/NativeCdcInternal.h" +#include "fdbclient/SystemData.h" +#include "fdbserver/cdcproxy/CDCProxy.h" +#include "fdbserver/core/Knobs.h" +#include "fdbserver/core/LogProtocolMessage.h" +#include "fdbserver/core/OTELSpanContextMessage.h" +#include "fdbserver/core/ServerDBInfo.h" +#include "fdbserver/core/SpanContextMessage.h" +#include "fdbserver/core/WaitFailure.h" +#include "fdbserver/core/WorkerInterface.actor.h" +#include "fdbserver/logsystem/LogSystemConsumer.h" +#include "fdbserver/logsystem/LogSystemFactory.h" +#include "flow/ActorCollection.h" +#include "flow/CodeProbe.h" +#include "flow/Error.h" +#include "flow/UnitTest.h" +#include "flow/genericactors.actor.h" + +namespace { + +struct CDCStreamReadState { + Optional keys; + Version minVersion = invalidVersion; + std::vector> tagAssignments; +}; + +struct CDCTagInterval { + Tag tag; + Version begin; + Version end; + Version bufferedThrough; + + CDCTagInterval(Tag tag, Version begin, Version end) + : tag(tag), begin(begin), end(end), bufferedThrough(begin - 1) {} +}; + +struct CDCBufferedStream : ReferenceCounted { + CDCStreamId streamId; + Optional keys; + bool active = true; + bool initialized = false; + bool tooOld = false; + Version minVersion = invalidVersion; + Version bufferedThrough = 
invalidVersion; + int64_t bufferedBytes = 0; + int readDemand = 0; + std::vector tagIntervals; + std::deque> mutations; + AsyncTrigger changed; + + explicit CDCBufferedStream(CDCStreamId streamId) : streamId(streamId) {} +}; + +struct CDCBufferedBatch { + int64_t bufferedBytes = 0; + std::deque> mutations; +}; + +struct CDCBufferedTag : ReferenceCounted { + Tag tag; + bool active = true; + std::set streamIds; + AsyncTrigger refresh; + AsyncTrigger stopped; + + explicit CDCBufferedTag(Tag tag) : tag(tag) {} +}; + +struct CDCProxyData { + UID id; + Database cx; + Reference const> dbInfo; + Reference>> logSystem; + std::map> streams; + std::map> tags; + AsyncTrigger popAcknowledgedDataTrigger; + AsyncTrigger peekCapacityContended; + FlowLock bufferLock; + int64_t bufferedBytes = 0; + + CDCProxyData(CDCProxyInterface const& proxy, Reference const> dbInfo) + : id(proxy.id()), cx(openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, LockAware::True)), dbInfo(dbInfo), + logSystem(makeReference>>()), + bufferLock(SERVER_KNOBS->CDC_PROXY_BUFFER_BYTES) {} +}; + +Optional clipCDCMutation(MutationRef const& mutation, KeyRangeRef const& keys) { + if (isSingleKeyMutation((MutationRef::Type)mutation.type)) { + if (keys.contains(mutation.param1)) { + return mutation; + } + } else if (mutation.type == MutationRef::ClearRange) { + KeyRangeRef intersection = keys & KeyRangeRef(mutation.param1, mutation.param2); + if (!intersection.empty()) { + return MutationRef(MutationRef::ClearRange, intersection.begin, intersection.end); + } + } else { + ASSERT(false); + } + return Optional(); +} + +Future readCDCStreamState(Database cx, + CDCStreamId streamId, + UID expectedProxyId, + bool requireKeys) { + if (streamId == 0) { + throw client_invalid_operation(); + } + + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE); + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + + CDCStreamReadState result; + Optional keysValue = 
co_await tr.get(cdcStreamKeyFor(streamId)); + if (keysValue.present()) { + result.keys = decodeCDCStreamKeysValue(keysValue.get()); + } else if (requireKeys) { + throw client_invalid_operation(); + } + + Optional minVersionValue = co_await tr.get(cdcMinVersionKeyFor(streamId)); + if (!minVersionValue.present()) { + throw client_invalid_operation(); + } + result.minVersion = decodeCDCMinVersionValue(minVersionValue.get()); + + RangeResult assignedProxies = co_await tr.getRange(cdcProxyRangeFor(streamId), 2); + if (assignedProxies.size() != 1 || decodeCDCProxyKey(assignedProxies[0].key).second != expectedProxyId) { + CODE_PROBE(true, "CDC proxy rejects request for stream owned elsewhere"); + throw wrong_shard_server(); + } + + std::vector> tagAssignments; + KeyRange tagHistoryRange = cdcTagHistoryRangeFor(streamId); + Key begin = tagHistoryRange.begin; + while (begin < tagHistoryRange.end) { + RangeResult history = + co_await tr.getRange(KeyRangeRef(begin, tagHistoryRange.end), CLIENT_KNOBS->TOO_MANY); + for (KeyValueRef const& kv : history) { + const auto [historyStreamId, version, tag] = decodeCDCTagHistoryKey(kv.key); + ASSERT_WE_THINK(historyStreamId == streamId); + ASSERT_WE_THINK(tag.locality == tagLocalityCDC); + tagAssignments.emplace_back(version, tag); + } + if (!history.more) { + break; + } + begin = keyAfter(history.back().key); + } + if (tagAssignments.empty()) { + throw client_invalid_operation(); + } + + result.tagAssignments = std::move(tagAssignments); + co_return result; + } catch (Error& e) { + if (e.code() == error_code_wrong_shard_server) { + throw; + } + err = e; + } + co_await tr.onError(err); + } +} + +void clearBufferedMutations(CDCProxyData* self, Reference stream) { + if (stream->bufferedBytes > 0) { + ASSERT(self->bufferedBytes >= stream->bufferedBytes); + self->bufferedBytes -= stream->bufferedBytes; + self->bufferLock.release(stream->bufferedBytes); + stream->bufferedBytes = 0; + } + stream->mutations.clear(); +} + +void 
addMutationToBatch(Reference stream, + CDCBufferedBatch* batch, + Version version, + MutationRef const& mutation) { + auto batchVersion = std::find_if(batch->mutations.begin(), batch->mutations.end(), [version](const auto& buffered) { + return buffered.version == version; + }); + if (batchVersion == batch->mutations.end()) { + batch->mutations.emplace_back(); + batchVersion = std::prev(batch->mutations.end()); + batchVersion->version = version; + const bool alreadyBuffered = + std::any_of(stream->mutations.begin(), stream->mutations.end(), [version](const auto& buffered) { + return buffered.version == version; + }); + if (!alreadyBuffered) { + batch->bufferedBytes += sizeof(VersionedMutationsRef); + } + } + batchVersion->mutations.push_back_deep(batchVersion->arena(), mutation); + batch->bufferedBytes += mutation.expectedSize() + sizeof(MutationRef); +} + +void addBufferedBatch(CDCProxyData* self, Reference stream, CDCBufferedBatch batch) { + while (!batch.mutations.empty()) { + Standalone versioned = std::move(batch.mutations.front()); + batch.mutations.pop_front(); + auto location = + std::lower_bound(stream->mutations.begin(), + stream->mutations.end(), + versioned.version, + [](const auto& buffered, Version version) { return buffered.version < version; }); + if (location != stream->mutations.end() && location->version == versioned.version) { + for (const auto& mutation : versioned.mutations) { + location->mutations.push_back_deep(location->arena(), mutation); + } + } else { + stream->mutations.insert(location, std::move(versioned)); + } + } + stream->bufferedBytes += batch.bufferedBytes; + self->bufferedBytes += batch.bufferedBytes; +} + +void updateStreamBufferedThrough(Reference stream) { + Version bufferedThrough = stream->minVersion - 1; + for (const auto& interval : stream->tagIntervals) { + if (interval.begin > bufferedThrough + 1) { + break; + } + if (interval.bufferedThrough < interval.begin) { + break; + } + bufferedThrough = std::max(bufferedThrough, 
interval.bufferedThrough); + if (interval.bufferedThrough < interval.end - 1) { + break; + } + } + if (bufferedThrough > stream->bufferedThrough) { + stream->bufferedThrough = bufferedThrough; + stream->changed.trigger(); + } +} + +void advanceStreamMinVersion(Reference stream, Version minVersion) { + stream->minVersion = std::max(stream->minVersion, minVersion); + for (auto& interval : stream->tagIntervals) { + if (stream->minVersion > interval.begin) { + interval.bufferedThrough = + std::max(interval.bufferedThrough, std::min(stream->minVersion - 1, interval.end - 1)); + } + } + updateStreamBufferedThrough(stream); +} + +void detachStreamFromTags(CDCProxyData* self, Reference stream) { + for (const auto& interval : stream->tagIntervals) { + auto tag = self->tags.find(interval.tag); + if (tag == self->tags.end()) { + continue; + } + Reference bufferedTag = tag->second; + bufferedTag->streamIds.erase(stream->streamId); + if (bufferedTag->streamIds.empty()) { + bufferedTag->active = false; + self->tags.erase(tag); + bufferedTag->stopped.trigger(); + } else { + bufferedTag->refresh.trigger(); + } + } +} + +void refreshStreamTags(CDCProxyData* self, Reference stream) { + for (const auto& interval : stream->tagIntervals) { + auto tag = self->tags.find(interval.tag); + if (tag != self->tags.end()) { + tag->second->refresh.trigger(); + } + } +} + +Optional nextTagReadVersion(CDCProxyData* self, Reference tag) { + Optional begin; + for (const CDCStreamId streamId : tag->streamIds) { + auto stream = self->streams.find(streamId); + if (stream == self->streams.end() || !stream->second->active || !stream->second->initialized || + stream->second->readDemand == 0) { + continue; + } + for (const auto& interval : stream->second->tagIntervals) { + if (interval.tag != tag->tag) { + continue; + } + const Version next = std::max(interval.begin, interval.bufferedThrough + 1); + if (next < interval.end && (!begin.present() || next < begin.get())) { + begin = next; + } + } + } + return 
begin; +} + +void advanceTagBufferedThrough(CDCProxyData* self, Reference tag, Version bufferedThrough) { + const std::vector streamIds(tag->streamIds.begin(), tag->streamIds.end()); + for (const CDCStreamId streamId : streamIds) { + auto stream = self->streams.find(streamId); + if (stream == self->streams.end() || !stream->second->active || stream->second->readDemand == 0) { + continue; + } + for (auto& interval : stream->second->tagIntervals) { + if (interval.tag == tag->tag && bufferedThrough >= interval.begin) { + interval.bufferedThrough = + std::max(interval.bufferedThrough, std::min(bufferedThrough, interval.end - 1)); + } + } + updateStreamBufferedThrough(stream->second); + } +} + +void markPoppedTagStreamsTooOld(CDCProxyData* self, Reference tag, Version popped) { + std::vector> tooOldStreams; + for (const CDCStreamId streamId : tag->streamIds) { + auto stream = self->streams.find(streamId); + if (stream == self->streams.end() || !stream->second->active) { + continue; + } + for (const auto& interval : stream->second->tagIntervals) { + const Version next = std::max(interval.begin, interval.bufferedThrough + 1); + if (interval.tag == tag->tag && next < interval.end && next < popped) { + tooOldStreams.push_back(stream->second); + break; + } + } + } + for (const auto& stream : tooOldStreams) { + CODE_PROBE(true, "CDC proxy detects unread mutations already popped from TLogs", probe::decoration::rare); + TraceEvent("CDCBufferStreamTooOld", self->id) + .detail("StreamId", stream->streamId) + .detail("MinVersion", stream->minVersion) + .detail("BufferedThrough", stream->bufferedThrough) + .detail("Popped", popped) + .detail("Tag", tag->tag); + clearBufferedMutations(self, stream); + stream->tooOld = true; + stream->active = false; + stream->changed.trigger(); + detachStreamFromTags(self, stream); + } +} + +std::map bufferMessages(CDCProxyData* self, + Reference tag, + Reference cursor) { + std::map batches; + while (cursor->hasMessage()) { + const Version 
messageVersion = cursor->version().version; + ArenaReader& reader = *cursor->reader(); + if (LogProtocolMessage::isNextIn(reader)) { + LogProtocolMessage protocolMessage; + reader >> protocolMessage; + cursor->setProtocolVersion(reader.protocolVersion()); + } else if (reader.protocolVersion().hasSpanContext() && SpanContextMessage::isNextIn(reader)) { + SpanContextMessage contextMessage; + reader >> contextMessage; + } else if (reader.protocolVersion().hasOTELSpanContext() && OTELSpanContextMessage::isNextIn(reader)) { + OTELSpanContextMessage contextMessage; + reader >> contextMessage; + } else { + MutationRef mutation; + reader >> mutation; + for (const CDCStreamId streamId : tag->streamIds) { + auto stream = self->streams.find(streamId); + if (stream == self->streams.end() || !stream->second->active || stream->second->readDemand == 0 || + !stream->second->keys.present()) { + continue; + } + const bool coversVersion = + std::any_of(stream->second->tagIntervals.begin(), + stream->second->tagIntervals.end(), + [tag, messageVersion](const auto& interval) { + return interval.tag == tag->tag && interval.begin <= messageVersion && + messageVersion < interval.end && messageVersion > interval.bufferedThrough; + }); + if (!coversVersion) { + continue; + } + Optional clipped = clipCDCMutation(mutation, stream->second->keys.get()); + if (clipped.present()) { + addMutationToBatch(stream->second, &batches[streamId], messageVersion, clipped.get()); + } + } + } + cursor->nextMessage(); + } + return batches; +} + +Future rotateContendedPeek(CDCProxyData* self) { + if (self->bufferLock.waiters() == 0) { + co_await self->peekCapacityContended.onTrigger(); + } + co_await delay(SERVER_KNOBS->BLOCKING_PEEK_TIMEOUT); +} + +Future bufferTag(CDCProxyData* self, Reference tag) { + while (tag->active) { + Optional begin = nextTagReadVersion(self, tag); + if (!begin.present()) { + auto waitForDemand = co_await race(tag->stopped.onTrigger(), tag->refresh.onTrigger()); + if 
(waitForDemand.index() == 0) { + co_return; + } + continue; + } + if (!self->logSystem->get()) { + auto waitForLogSystem = + co_await race(self->logSystem->onChange(), tag->stopped.onTrigger(), tag->refresh.onTrigger()); + if (waitForLogSystem.index() == 1) { + co_return; + } + continue; + } + + Reference cursor = self->logSystem->get()->peekSingle(self->id, begin.get(), tag->tag, {}); + while (tag->active) { + const int64_t peekReservation = + std::min(SERVER_KNOBS->CDC_PROXY_BUFFER_BYTES, SERVER_KNOBS->MAXIMUM_PEEK_BYTES); + ASSERT(peekReservation > 0); + if (self->bufferLock.available() < peekReservation) { + CODE_PROBE(true, "CDC proxy applies shared buffer backpressure", probe::decoration::rare); + self->peekCapacityContended.trigger(); + } + auto capacity = co_await race(self->bufferLock.take(TaskPriority::TLogPeekReply, peekReservation), + self->logSystem->onChange(), + tag->stopped.onTrigger(), + tag->refresh.onTrigger()); + if (capacity.index() == 1 || capacity.index() == 3) { + break; + } + if (capacity.index() == 2) { + co_return; + } + FlowLock::Releaser reservation(self->bufferLock, peekReservation); + // Blocking peeks hold a response reservation. Once another tag queues for capacity, rotate this + // reader after one blocking-peek interval so an idle tag cannot monopolize the shared budget. 
+ auto result = co_await race(cursor->getMore(TaskPriority::TLogPeekReply), + self->logSystem->onChange(), + tag->stopped.onTrigger(), + tag->refresh.onTrigger(), + rotateContendedPeek(self)); + if (result.index() == 1 || result.index() == 3 || result.index() == 4) { + break; + } + if (result.index() == 2) { + co_return; + } + + cursor->setProtocolVersion(g_network->protocolVersion()); + if (cursor->popped() > begin.get()) { + markPoppedTagStreamsTooOld(self, tag, cursor->popped()); + break; + } + + std::map batches = bufferMessages(self, tag, cursor); + int64_t bufferedBytes = 0; + for (const auto& [streamId, batch] : batches) { + bufferedBytes += batch.bufferedBytes; + } + if (bufferedBytes > peekReservation) { + CODE_PROBE(true, "CDC proxy reserves capacity for oversized peek batch", probe::decoration::rare); + TraceEvent(SevWarn, "CDCProxyOversizedPeekBatch", self->id) + .detail("Tag", tag->tag) + .detail("BufferedBytes", bufferedBytes) + .detail("ReservedBytes", peekReservation); + reservation.release(); + auto oversizedCapacity = + co_await race(self->bufferLock.take(TaskPriority::TLogPeekReply, bufferedBytes), + self->logSystem->onChange(), + tag->stopped.onTrigger(), + tag->refresh.onTrigger()); + if (oversizedCapacity.index() == 1 || oversizedCapacity.index() == 3) { + break; + } + if (oversizedCapacity.index() == 2) { + co_return; + } + reservation = FlowLock::Releaser(self->bufferLock, bufferedBytes); + } else { + reservation.release(peekReservation - bufferedBytes); + } + if (!tag->active) { + co_return; + } + + int64_t acceptedBytes = 0; + for (auto& [streamId, batch] : batches) { + auto stream = self->streams.find(streamId); + if (stream != self->streams.end() && stream->second->active && tag->streamIds.contains(streamId)) { + acceptedBytes += batch.bufferedBytes; + addBufferedBatch(self, stream->second, std::move(batch)); + } + } + reservation.release(bufferedBytes - acceptedBytes); + // Buffered mutations own these permits until acknowledgement or 
stream removal. + reservation.remaining = 0; + advanceTagBufferedThrough(self, tag, cursor->version().version - 1); + if (!nextTagReadVersion(self, tag).present()) { + break; + } + if (cursor->isExhausted()) { + break; + } + } + } +} + +Future initializeStream(CDCProxyData* self, Reference stream, ActorCollection* actors) { + try { + const CDCStreamReadState metadata = co_await readCDCStreamState(self->cx, stream->streamId, self->id, true); + stream->keys = metadata.keys; + stream->minVersion = metadata.minVersion; + stream->bufferedThrough = metadata.minVersion - 1; + for (size_t i = 0; i < metadata.tagAssignments.size(); ++i) { + const Version begin = std::max(metadata.minVersion, metadata.tagAssignments[i].first); + const Version end = i + 1 < metadata.tagAssignments.size() ? metadata.tagAssignments[i + 1].first + : std::numeric_limits::max(); + if (begin < end) { + stream->tagIntervals.emplace_back(metadata.tagAssignments[i].second, begin, end); + } + } + stream->initialized = true; + stream->changed.trigger(); + for (const auto& interval : stream->tagIntervals) { + auto tag = self->tags.find(interval.tag); + if (tag == self->tags.end()) { + Reference newTag = makeReference(interval.tag); + tag = self->tags.emplace(interval.tag, newTag).first; + tag->second->streamIds.insert(stream->streamId); + actors->add(bufferTag(self, newTag)); + } else { + CODE_PROBE(true, "CDC proxy shares a tag reader across streams"); + tag->second->streamIds.insert(stream->streamId); + tag->second->refresh.trigger(); + } + } + } catch (Error& e) { + if (e.code() == error_code_client_invalid_operation || e.code() == error_code_wrong_shard_server) { + clearBufferedMutations(self, stream); + stream->active = false; + stream->changed.trigger(); + co_return; + } + throw; + } +} + +// TODO: Persist per-tag safe-pop state or coordinate pops centrally instead of rebuilding minima from all stream +// history on every acknowledgement. 
+Future> readSafePopVersions(Database cx) { /* Returns a map from tag to the lowest recorded minimum version among the streams that have a tag-history entry for that tag. History entries of streams without an entry under cdcMinVersionKeys are skipped. Both scans run in one transaction, which is retried through tr.onError on failure. */ + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE); + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + + std::map minVersions; /* stream id to recorded minimum version */ + Key begin = cdcMinVersionKeys.begin; + while (begin < cdcMinVersionKeys.end) { + RangeResult minima = + co_await tr.getRange(KeyRangeRef(begin, cdcMinVersionKeys.end), CLIENT_KNOBS->TOO_MANY); + for (const auto& kv : minima) { + minVersions[decodeCDCMinVersionKey(kv.key)] = decodeCDCMinVersionValue(kv.value); + } + if (!minima.more) { + break; + } + begin = keyAfter(minima.back().key); /* continue the paged scan after the last key returned */ + } + + std::map safePopVersions; + begin = cdcTagHistoryKeys.begin; + while (begin < cdcTagHistoryKeys.end) { + RangeResult histories = + co_await tr.getRange(KeyRangeRef(begin, cdcTagHistoryKeys.end), CLIENT_KNOBS->TOO_MANY); + for (const auto& kv : histories) { + const auto [streamId, version, tag] = decodeCDCTagHistoryKey(kv.key); + auto minimum = minVersions.find(streamId); + if (minimum == minVersions.end()) { /* no recorded minimum for this stream: ignore its history */ + continue; + } + auto safePop = safePopVersions.find(tag); + if (safePop == safePopVersions.end()) { + safePopVersions[tag] = minimum->second; + } else { + safePop->second = std::min(safePop->second, minimum->second); + } + } + if (!histories.more) { + break; + } + begin = keyAfter(histories.back().key); + } + co_return safePopVersions; + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } +} +/* readRetiredTagPopVersions: reads every entry under cdcRetiredTagPopVersionKeys and returns a map keyed by the decoded tag, with values decoded by decodeCDCMinVersionValue. Retried through tr.onError on failure. */ +Future> readRetiredTagPopVersions(Database cx) { + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE); + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + + std::map retiredTagPopVersions; + Key begin = cdcRetiredTagPopVersionKeys.begin; + while (begin < cdcRetiredTagPopVersionKeys.end) { + RangeResult retired = + co_await tr.getRange(KeyRangeRef(begin, cdcRetiredTagPopVersionKeys.end), CLIENT_KNOBS->TOO_MANY); + for (const auto& kv : retired) { + 
retiredTagPopVersions[decodeCDCRetiredTagPopVersionKey(kv.key)] = + decodeCDCMinVersionValue(kv.value); + } + if (!retired.more) { + break; + } + begin = keyAfter(retired.back().key); + } + co_return retiredTagPopVersions; + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } +} +/* clearCompletedRetiredTagPops: for each (tag, completedVersion) entry, clears both retired-tag keys of that tag, provided a retired pop version is still stored and is not greater than completedVersion. Returns immediately for an empty map. All clears are committed in one transaction, retried through tr.onError. */ +Future clearCompletedRetiredTagPops(Database cx, std::map completedPopVersions) { + if (completedPopVersions.empty()) { + co_return; + } + + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + for (const auto& [tag, completedVersion] : completedPopVersions) { + Optional retiredVersionValue = co_await tr.get(cdcRetiredTagPopVersionKeyFor(tag)); + if (!retiredVersionValue.present() || + decodeCDCMinVersionValue(retiredVersionValue.get()) > completedVersion) { /* already cleared, or the stored retired version is beyond what was popped */ + continue; + } + CODE_PROBE(true, "CDC proxy clears completed retired tag pop metadata"); + tr.clear(cdcRetiredTagPopKeyFor(tag)); + tr.clear(cdcRetiredTagPopVersionKeyFor(tag)); + } + + co_await tr.commit(); + co_return; + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } +} +/* popAcknowledgedData: pops every tag returned by readSafePopVersions to its safe version, then handles retired tags. A retired tag is popped to its retired version, or to the safe version of the same tag when that is lower. Once the retired version itself has been reached, the function waits for the pop to complete and records the tag so that its retired-pop metadata is cleared at the end. */ +Future popAcknowledgedData(CDCProxyData* self) { + const std::map safePopVersions = co_await readSafePopVersions(self->cx); + Reference logSystem = self->logSystem->get(); + for (const auto& [tag, version] : safePopVersions) { + logSystem->pop(version, tag); + } + const std::map retiredTagPopVersions = co_await readRetiredTagPopVersions(self->cx); + std::map completedPopVersions; + for (const auto& [tag, retiredVersion] : retiredTagPopVersions) { + const auto safePop = safePopVersions.find(tag); + const Version version = + safePop == safePopVersions.end() ? 
retiredVersion : std::min(retiredVersion, safePop->second); + CODE_PROBE(safePop != safePopVersions.end() && version < retiredVersion, + "CDC proxy defers retired tag pop behind a live shared stream"); + logSystem->pop(version, tag); + if (version >= retiredVersion) { /* the retired version is not held back by a safe pop version on the same tag */ + co_await logSystem->waitForPopped(retiredVersion, tag); + completedPopVersions[tag] = retiredVersion; + } + } + co_await clearCompletedRetiredTagPops(self->cx, std::move(completedPopVersions)); +} +/* monitorAcknowledgedDataPops: waits for popAcknowledgedDataTrigger, then repeatedly races a popAcknowledgedData pass against the next trigger. When the trigger wins, the loop starts a new pass right away (the unfinished pass is dropped, presumably cancelling it). When the pass finishes first, the loop waits for the next trigger. */ +Future monitorAcknowledgedDataPops(CDCProxyData* self) { + co_await self->popAcknowledgedDataTrigger.onTrigger(); + while (true) { + // Pop completion may wait on an unavailable log generation. A new acknowledgement or log-system + // configuration supersedes that attempt and retries the durable work against current state. + Future retriggered = self->popAcknowledgedDataTrigger.onTrigger(); + auto result = co_await race(popAcknowledgedData(self), retriggered); + if (result.index() == 0) { /* the pass completed before any new trigger: sleep until the next one */ + co_await self->popAcknowledgedDataTrigger.onTrigger(); + } + } +} +/* reconcileStreams: brings self->streams in line with the stream-to-proxy assignments published in dbInfo. Streams assigned to this proxy that are not tracked yet are created and their initialization actor is added to actors. Tracked streams that are no longer assigned to this proxy are deactivated, their waiters are woken, and their tag links and buffered mutations are released before they are erased. */ +void reconcileStreams(CDCProxyData* self, ActorCollection* actors) { + std::set assignedStreams; + for (const auto& [streamId, proxyId] : self->dbInfo->get().client.streamToCDCProxyId) { + if (proxyId == self->id) { + assignedStreams.insert(streamId); + if (!self->streams.contains(streamId)) { + Reference stream = makeReference(streamId); + self->streams.emplace(streamId, stream); + actors->add(initializeStream(self, stream, actors)); + } + } + } + + for (auto it = self->streams.begin(); it != self->streams.end();) { + if (!assignedStreams.contains(it->first)) { + CODE_PROBE(it->second->readDemand > 0, "CDC proxy wakes pending consume when stream is unassigned"); + CODE_PROBE(true, "CDC proxy drops removed or reassigned stream state"); + it->second->active = false; + it->second->changed.trigger(); + detachStreamFromTags(self, it->second); + clearBufferedMutations(self, it->second); + it = self->streams.erase(it); /* erase returns the next iterator, so the loop does not advance here */ + } else { + ++it; + } + } +} + 
+Future consume(CDCProxyData* self, CDCConsumeRequest request) { /* Serves one consume request. Validates the cursor, waits until the stream is initialized and buffered through the first requested version, then replies with deep copies of the buffered mutations from that version through stream->bufferedThrough. transaction_too_old is raised for a stream marked tooOld or a cursor below the stream minimum, and wrong_shard_server for a stream that is unknown or inactive here. Every error except actor_cancelled is sent back on request.reply. */ + try { + if (request.cursor.lastConsumedVersion < invalidVersion || + request.cursor.lastConsumedVersion == std::numeric_limits::max()) { /* presumably rejects the maximum because begin is computed as lastConsumedVersion plus one below */ + throw client_invalid_operation(); + } + + co_await readCDCStreamState(self->cx, request.cursor.streamId, self->id, true); /* result is discarded; presumably called for the errors it raises -- TODO confirm */ + auto found = self->streams.find(request.cursor.streamId); + if (found == self->streams.end()) { + throw wrong_shard_server(); + } + Reference stream = found->second; + while (stream->active && !stream->initialized) { + co_await stream->changed.onTrigger(); + } + if (stream->tooOld) { + throw transaction_too_old(); + } + if (!stream->active) { + throw wrong_shard_server(); + } + + Version begin = request.cursor.lastConsumedVersion == invalidVersion ? stream->minVersion + : request.cursor.lastConsumedVersion + 1; /* a cursor of invalidVersion starts at the stream minimum; otherwise resume after the last consumed version */ + if (begin < stream->minVersion) { + throw transaction_too_old(); + } + + bool issuedReadDemand = false; + if (stream->bufferedThrough < begin) { /* count this request in readDemand and refresh the stream tags while it waits for data */ + ++stream->readDemand; + refreshStreamTags(self, stream); + issuedReadDemand = true; + } + while (stream->active && stream->bufferedThrough < begin) { + co_await stream->changed.onTrigger(); + } + if (issuedReadDemand) { + ASSERT(stream->readDemand > 0); + --stream->readDemand; + refreshStreamTags(self, stream); + } + if (stream->tooOld) { + throw transaction_too_old(); + } + if (!stream->active) { + throw wrong_shard_server(); + } + + CDCConsumeReply reply; + reply.lastConsumedVersion = request.cursor.lastConsumedVersion; /* provisional value; replaced by bufferedThrough before the reply is sent */ + for (const auto& versioned : stream->mutations) { + if (versioned.version >= begin && versioned.version <= stream->bufferedThrough) { + reply.mutations.push_back(reply.arena, VersionedMutationsRef(versioned.version, {})); + for (const auto& mutation : versioned.mutations) { + reply.mutations.back().mutations.push_back_deep(reply.arena, mutation); + } + } + } + reply.lastConsumedVersion = stream->bufferedThrough; + request.reply.send(reply); + } catch (Error& e) { + if 
(e.code() == error_code_actor_cancelled) { + throw; + } + request.reply.sendError(e); + } +} +/* acknowledge: handles one acknowledgement request. Rejects versions that are negative or too close to the maximum, and requests whose version is not below the durable minimum version read from readCDCStreamState. Otherwise advances the in-memory stream to the durable minimum, releases buffered batches below it together with their bufferLock permits, triggers a pop pass and replies. Every error except actor_cancelled is sent back on request.reply. */ +Future acknowledge(CDCProxyData* self, CDCAckRequest request) { + try { + if (request.version < 0 || request.version >= std::numeric_limits::max() - 1) { + throw client_invalid_operation(); + } + const CDCStreamReadState metadata = co_await readCDCStreamState(self->cx, request.streamId, self->id, false); + if (metadata.minVersion <= request.version) { /* the durable minimum must already be past the acknowledged version */ + throw client_invalid_operation(); + } + auto found = self->streams.find(request.streamId); + if (found == self->streams.end()) { + throw wrong_shard_server(); + } + Reference stream = found->second; + while (stream->active && !stream->initialized) { + co_await stream->changed.onTrigger(); + } + if (!stream->active) { + throw wrong_shard_server(); + } + + // The durable acknowledgement can commit before a replacement owner observes the RPC. + // Reconcile the new owner's in-memory frontier to that already verified watermark. + const Version minVersion = metadata.minVersion; + CODE_PROBE(stream->minVersion < minVersion, "CDC proxy reconciles a durable stream acknowledgement"); + advanceStreamMinVersion(stream, minVersion); + while (!stream->mutations.empty() && stream->mutations.front().version < minVersion) { /* drop buffered batches below the acknowledged frontier and return their bytes to bufferLock */ + const int64_t releasedBytes = + sizeof(VersionedMutationsRef) + stream->mutations.front().mutations.expectedSize(); + stream->bufferedBytes -= releasedBytes; + ASSERT(self->bufferedBytes >= releasedBytes); + self->bufferedBytes -= releasedBytes; + self->bufferLock.release(releasedBytes); + stream->mutations.pop_front(); + } + ASSERT(stream->bufferedBytes >= 0); + self->popAcknowledgedDataTrigger.trigger(); + request.reply.send(Void()); + } catch (Error& e) { + if (e.code() == error_code_actor_cancelled) { + throw; + } + request.reply.sendError(e); + } +} +/* registerStream: registers request.name for request.keys through registerNativeCdcStream and replies with the resulting stream id. Every error except actor_cancelled is sent back on request.reply. */ +Future registerStream(CDCProxyData* self, CDCRegisterStreamRequest request) { + try { + const CDCStreamId streamId = co_await registerNativeCdcStream(self->cx, 
request.name, request.keys, self->id); + request.reply.send(CDCRegisterStreamReply(streamId)); + } catch (Error& e) { + if (e.code() == error_code_actor_cancelled) { + throw; + } + request.reply.sendError(e); + } +} +/* removeStream: removes the stream named request.name through removeNativeCdcStream. A pop pass is triggered only when that call reports a value. Every error except actor_cancelled is sent back on request.reply. */ +Future removeStream(CDCProxyData* self, CDCRemoveStreamRequest request) { + try { + Optional removed = co_await removeNativeCdcStream(self->cx, request.name, self->id); + if (removed.present()) { + self->popAcknowledgedDataTrigger.trigger(); + } + request.reply.send(Void()); + } catch (Error& e) { + if (e.code() == error_code_actor_cancelled) { + throw; + } + request.reply.sendError(e); + } +} +/* listStreams: replies with one entry per stream returned by listNativeCdcStreams, copying each name and key range into the reply arena. Every error except actor_cancelled is sent back on request.reply. */ +Future listStreams(CDCProxyData* self, CDCListStreamsRequest request) { + try { + std::vector streams = co_await listNativeCdcStreams(self->cx); + CDCListStreamsReply reply; + for (NativeCdcStreamInfo const& stream : streams) { + reply.streams.push_back(reply.arena, + CDCStreamInfoRef(StringRef(reply.arena, stream.name), + stream.streamId, + KeyRangeRef(reply.arena, stream.keys), + stream.minVersion)); + } + request.reply.send(reply); + } catch (Error& e) { + if (e.code() == error_code_actor_cancelled) { + throw; + } + request.reply.sendError(e); + } +} + +} // namespace + +Future cdcProxyServer(CDCProxyInterface proxy, + uint64_t recoveryCount, + Reference const> dbInfo) { /* Main loop of the CDC proxy role. Sets up the log system consumer, reconciles the assigned streams, starts the pop monitor, then dispatches each incoming request to its own actor. On every dbInfo change it checks whether this proxy is still published, refreshes the log system consumer when tLogs are configured, reconciles streams again and triggers a pop pass. worker_removed ends the role quietly; any other error is traced and rethrown. */ + try { + CDCProxyData self(proxy, dbInfo); + ActorCollection actors(false); + + actors.add(waitFailureServer(proxy.waitFailure.getFuture())); + actors.add(traceRole(Role::CDC_PROXY, proxy.id())); + self.logSystem->set(makeLogSystemConsumerFromServerDBInfo(self.id, dbInfo->get())); + reconcileStreams(&self, &actors); + actors.add(monitorAcknowledgedDataPops(&self)); + self.popAcknowledgedDataTrigger.trigger(); + Future dbInfoChange = dbInfo->onChange(); + bool hasBeenPublished = + std::find(dbInfo->get().client.cdcProxies.begin(), dbInfo->get().client.cdcProxies.end(), proxy) != + dbInfo->get().client.cdcProxies.end(); + + while (true) { + auto result = co_await 
race(proxy.consume.getFuture(), + proxy.ack.getFuture(), + proxy.registerStream.getFuture(), + proxy.removeStream.getFuture(), + proxy.listStreams.getFuture(), + proxy.haltForTesting.getFuture(), + dbInfoChange, + actors.getResult()); + switch (result.index()) { /* the index follows the argument order of the race above */ + case 0: + actors.add(consume(&self, std::get<0>(std::move(result)))); + break; + case 1: + actors.add(acknowledge(&self, std::get<1>(std::move(result)))); + break; + case 2: + actors.add(registerStream(&self, std::get<2>(std::move(result)))); + break; + case 3: + actors.add(removeStream(&self, std::get<3>(std::move(result)))); + break; + case 4: + actors.add(listStreams(&self, std::get<4>(std::move(result)))); + break; + case 5: /* haltForTesting: refused outside simulation, otherwise acknowledged before the role exits */ + if (!g_network->isSimulated()) { + std::get<5>(std::move(result)).reply.sendError(client_invalid_operation()); + break; + } + std::get<5>(std::move(result)).reply.send(Void()); + throw worker_removed(); + case 6: { /* dbInfo changed */ + const bool isPublished = + std::find(dbInfo->get().client.cdcProxies.begin(), dbInfo->get().client.cdcProxies.end(), proxy) != + dbInfo->get().client.cdcProxies.end(); + if (hasBeenPublished && dbInfo->get().recoveryCount >= recoveryCount && !isPublished) { /* was published earlier and is no longer listed: stop serving */ + throw worker_removed(); + } + hasBeenPublished = hasBeenPublished || isPublished; + if (!dbInfo->get().logSystemConfig.tLogs.empty()) { + CODE_PROBE(dbInfo->get().recoveryCount > recoveryCount, + "CDC proxy refreshes its log consumer after recovery"); + self.logSystem->set(makeLogSystemConsumerFromServerDBInfo(self.id, dbInfo->get())); + } + reconcileStreams(&self, &actors); + self.popAcknowledgedDataTrigger.trigger(); + dbInfoChange = dbInfo->onChange(); + break; + } + case 7: /* actors.getResult() became ready; awaiting it presumably rethrows the failure of a child actor */ + co_await actors.getResult(); + break; + default: + ASSERT(false); + } + } + } catch (Error& e) { + TraceEvent("CDCProxyTerminated", proxy.id()).errorUnsuppressed(e); + if (e.code() != error_code_worker_removed) { + throw; + } + } +} + +TEST_CASE("/NativeCDC/ProxyMutationFiltering") { /* Checks clipCDCMutation against the key range from c to m: a set inside the range is kept, a set outside is dropped, a clear overlapping the range start is clipped to begin at c, and a clear entirely outside the range is dropped. */ + const KeyRangeRef keys("c"_sr, "m"_sr); + + 
Optional inRange = clipCDCMutation(MutationRef(MutationRef::SetValue, "d"_sr, "value"_sr), keys); + ASSERT(inRange.present()); + ASSERT(inRange.get().param1 == "d"_sr); + + Optional outOfRange = clipCDCMutation(MutationRef(MutationRef::SetValue, "z"_sr, "value"_sr), keys); + ASSERT(!outOfRange.present()); + + Optional clippedClear = clipCDCMutation(MutationRef(MutationRef::ClearRange, "a"_sr, "f"_sr), keys); + ASSERT(clippedClear.present()); + ASSERT(clippedClear.get().param1 == "c"_sr); + ASSERT(clippedClear.get().param2 == "f"_sr); + + Optional excludedClear = clipCDCMutation(MutationRef(MutationRef::ClearRange, "n"_sr, "z"_sr), keys); + ASSERT(!excludedClear.present()); + + return Void(); +} diff --git a/fdbserver/cdcproxy/CMakeLists.txt b/fdbserver/cdcproxy/CMakeLists.txt new file mode 100644 index 00000000000..ebb290ce37d --- /dev/null +++ b/fdbserver/cdcproxy/CMakeLists.txt @@ -0,0 +1,17 @@ +fdb_find_sources(FDBSERVER_CDCPROXY_SRCS) + +add_flow_target(STATIC_LIBRARY NAME fdbserver_cdcproxy SRCS ${FDBSERVER_CDCPROXY_SRCS}) +add_fdbserver_link_test(fdbserver_cdcproxylinktest + fdbserver_cdcproxy + fdbserver_logsystem + fdbserver_core) +add_fdbserver_unit_test(fdbserver_cdcproxy_test cdcproxy + fdbserver_cdcproxy + fdbserver_logsystem + fdbserver_core) + +configure_fdbserver_common_includes(fdbserver_cdcproxy) +target_include_directories(fdbserver_cdcproxy + PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/include) +target_link_libraries(fdbserver_cdcproxy PUBLIC fdbserver_logsystem fdbserver_core) diff --git a/fdbserver/cdcproxy/include/fdbserver/cdcproxy/CDCProxy.h b/fdbserver/cdcproxy/include/fdbserver/cdcproxy/CDCProxy.h new file mode 100644 index 00000000000..f2756ab27b3 --- /dev/null +++ b/fdbserver/cdcproxy/include/fdbserver/cdcproxy/CDCProxy.h @@ -0,0 +1,30 @@ +/* + * CDCProxy.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2026 Apple Inc. 
and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "fdbclient/CDCProxyInterface.h" +#include "flow/flow.h" + +struct ServerDBInfo; + +Future cdcProxyServer(CDCProxyInterface proxy, + uint64_t recoveryCount, + Reference const> dbInfo); diff --git a/fdbserver/clustercontroller/ClusterController.actor.cpp b/fdbserver/clustercontroller/ClusterController.actor.cpp index 92f7308b295..0621b157242 100644 --- a/fdbserver/clustercontroller/ClusterController.actor.cpp +++ b/fdbserver/clustercontroller/ClusterController.actor.cpp @@ -28,6 +28,7 @@ #include "fdbclient/ClientBooleanParams.h" #include "fdbclient/FDBTypes.h" +#include "fdbclient/NativeCdcInternal.h" #include "fdbclient/SystemData.h" #include "fdbclient/DatabaseContext.h" #include "fdbrpc/FailureMonitor.h" @@ -236,6 +237,12 @@ bool ClusterControllerData::transactionSystemContainsDegradedServers() { } } + for (const auto& proxy : dbi.client.cdcProxies) { + if (proxy.addresses().contains(server)) { + return true; + } + } + for (const auto& resolver : dbi.resolvers) { if (resolver.addresses().contains(server)) { return true; @@ -547,6 +554,154 @@ Future monitorAndRecruitLogRouters(ClusterControllerData* self) { } } +Future> monitorCDCProxies(std::vector const& cdcProxies) { /* Starts one waitFailureClient per proxy, waits until at least one of them completes, and returns the indexes into cdcProxies of every monitor that is ready at that point. Never completes when cdcProxies is empty. */ + std::vector> failures; + for (const auto& proxy : cdcProxies) { + failures.push_back( + waitFailureClient(proxy.waitFailure, + SERVER_KNOBS->TLOG_TIMEOUT, + 
-SERVER_KNOBS->TLOG_TIMEOUT / SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY, + /*trace=*/true, + /*traceMsg=*/"CDCProxyFailed"_sr)); + } + if (failures.empty()) { /* nothing to monitor: wait forever */ + co_await Future(Never()); + UNREACHABLE(); + } + + co_await quorum(failures, 1); + std::vector failedProxies; + for (int i = 0; i < failures.size(); ++i) { + if (failures[i].isReady() || failures[i].isError()) { + failedProxies.push_back(i); + } + } + co_return failedProxies; +} +/* containsCDCProxy: true when some interface in proxies has the id proxyId. */ +bool containsCDCProxy(std::vector const& proxies, UID proxyId) { + return std::any_of( + proxies.begin(), proxies.end(), [proxyId](CDCProxyInterface const& proxy) { return proxy.id() == proxyId; }); +} +/* recruitFailedCDCProxies: for each failed index whose proxy is still listed in self->db.cdcProxies, initializes a replacement CDC proxy and swaps it into the list. It then triggers registration, waits for each replacement to appear in the published client info, and reassigns the streams of the old proxy id to the new one. Returns without further work when recovery data is invalid or the recovery count no longer equals recoveryCount. Throws recruitment_failed when no worker can be found. */ +Future recruitFailedCDCProxies(ClusterControllerData* self, + uint64_t recoveryCount, + std::vector const& monitoredProxies, + std::vector const& failedIndexes) { + if (!self->db.recoveryData.isValid() || self->db.recoveryData->cstate.myDBState.recoveryCount != recoveryCount) { + co_return; + } + + std::vector> replacements; + for (int failedIndex : failedIndexes) { + ASSERT_WE_THINK(failedIndex >= 0 && failedIndex < monitoredProxies.size()); + const CDCProxyInterface& failedProxy = monitoredProxies[failedIndex]; + auto current = std::find(self->db.cdcProxies.begin(), self->db.cdcProxies.end(), failedProxy); + if (current == self->db.cdcProxies.end()) { /* no longer in the current proxy list: nothing to replace */ + continue; + } + + auto worker = self->id_worker.find(failedProxy.processId); + if (worker == self->id_worker.end()) { /* the worker of the failed proxy is unknown: fall back to the first GRV proxy whose worker is known */ + for (const auto& grvProxy : self->db.recoveryData->grvProxies) { + worker = self->id_worker.find(grvProxy.processId); + if (worker != self->id_worker.end()) { + break; + } + } + if (worker == self->id_worker.end()) { + throw recruitment_failed(); + } + } + + InitializeCDCProxyRequest request; + request.recoveryCount = recoveryCount; + CDCProxyInterface replacement = + co_await throwErrorOr(worker->second.details.interf.cdcProxy.getReplyUnlessFailedFor( + request, SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)); + + if 
(!self->db.recoveryData.isValid() || + self->db.recoveryData->cstate.myDBState.recoveryCount != recoveryCount) { /* state is re-validated because it may have changed during the await above */ + co_return; + } + current = std::find(self->db.cdcProxies.begin(), self->db.cdcProxies.end(), failedProxy); + if (current == self->db.cdcProxies.end()) { + continue; + } + *current = replacement; + replacements.emplace_back(failedProxy.id(), replacement.id()); + CODE_PROBE(true, "CDC proxy is recruited after failure"); + TraceEvent("CDCProxyRecruited", self->id) + .detail("OldCDCProxyID", failedProxy.id()) + .detail("NewCDCProxyID", replacement.id()) + .detail("RecoveryCount", recoveryCount); + } + if (replacements.empty()) { + co_return; + } + + // Endpoint publication precedes assignment publication so clients never route + // a stream to a replacement that is not yet discoverable. + self->db.recoveryData->registrationTrigger.trigger(); + for (const auto& [oldProxyId, newProxyId] : replacements) { + while (containsCDCProxy(self->db.cdcProxies, newProxyId) && + !containsCDCProxy(self->db.clientInfo->get().cdcProxies, newProxyId)) { + co_await self->db.clientInfo->onChange(); + } + if (containsCDCProxy(self->db.cdcProxies, newProxyId) && + containsCDCProxy(self->db.clientInfo->get().cdcProxies, newProxyId)) { + // Reassignment remains necessary if recovery changes while the + // replacement endpoint is being published. 
+ co_await reassignNativeCdcStreams(self->db.db, oldProxyId, newProxyId); + } + } +} +/* monitorAndRecruitCDCProxies: loops forever. Each round waits until recovery has reached ACCEPTING_COMMITS with valid recovery data and a non-empty CDC proxy list, snapshots the recovery count and proxy list, and monitors those proxies. A detected failure leads to recruitFailedCDCProxies and a new round. A serverInfo change starts a new round only when the recovery data, recovery count or proxy list differ from the snapshot. Errors other than actor_cancelled are traced and followed by a one second delay before the next round. */ +Future monitorAndRecruitCDCProxies(ClusterControllerData* self) { + while (true) { + while (self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS || + !self->db.recoveryData.isValid() || self->db.cdcProxies.empty()) { + co_await self->db.serverInfo->onChange(); + } + + const uint64_t recoveryCount = self->db.recoveryData->cstate.myDBState.recoveryCount; + const std::vector monitoredProxies = self->db.cdcProxies; + Future> failures = monitorCDCProxies(monitoredProxies); + while (true) { + bool retryAfterFailure = false; + try { + auto result = co_await race(failures, self->db.serverInfo->onChange()); + if (result.index() == 0) { + const std::vector failedIndexes = std::get<0>(std::move(result)); + TraceEvent("CDCProxyFailureDetected", self->id) + .detail("FailedCount", failedIndexes.size()) + .detail("RecoveryCount", recoveryCount); + co_await recruitFailedCDCProxies(self, recoveryCount, monitoredProxies, failedIndexes); + break; + } + if (!self->db.recoveryData.isValid() || + self->db.recoveryData->cstate.myDBState.recoveryCount != recoveryCount || + self->db.cdcProxies != monitoredProxies) { /* the snapshot is stale: restart the outer loop with fresh state */ + break; + } + } catch (Error& e) { + if (e.code() == error_code_actor_cancelled) { + throw; + } + CODE_PROBE(true, "CDC proxy re-recruitment failed"); + TraceEvent(SevWarnAlways, "CDCProxyReRecruitmentFailed", self->id) + .error(e) + .detail("RecoveryCount", recoveryCount); + retryAfterFailure = true; + } + if (retryAfterFailure) { /* the delay is awaited here, outside the catch block */ + co_await delay(1.0); + break; + } + } + } +} + ACTOR Future clusterWatchDatabase(ClusterControllerData* cluster, ClusterControllerData::DBInfo* db, ServerCoordinators coordinators) { @@ -587,6 +742,8 @@ ACTOR Future clusterWatchDatabase(ClusterControllerData* cluster, dbInfo.client = ClientDBInfo(); dbInfo.client.clusterId = db->serverInfo->get().client.clusterId; dbInfo.client.clusterType = db->clusterType; + dbInfo.client.cdcProxies = 
db->cdcProxies; + dbInfo.client.streamToCDCProxyId = db->clientInfo->get().streamToCDCProxyId; TraceEvent("CCWDB", cluster->id) .detail("NewMaster", dbInfo.master.id().toString()) @@ -1177,6 +1334,7 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co .detail("RegistrationCount", req.registrationCount) .detail("CommitProxies", req.commitProxies.size()) .detail("GrvProxies", req.grvProxies.size()) + .detail("CDCProxies", req.cdcProxies.size()) .detail("RecoveryCount", req.recoveryCount) .detail("Stalled", req.recoveryStalled) .detail("OldestBackupEpoch", req.logSystemConfig.oldestBackupEpoch); @@ -1236,7 +1394,7 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co // Construct the client information if (db->clientInfo->get().commitProxies != req.commitProxies || - db->clientInfo->get().grvProxies != req.grvProxies || + db->clientInfo->get().grvProxies != req.grvProxies || db->clientInfo->get().cdcProxies != req.cdcProxies || db->clientInfo->get().clusterId != db->serverInfo->get().client.clusterId || db->clientInfo->get().clusterType != db->clusterType) { TraceEvent("PublishNewClientInfo", self->id) @@ -1246,6 +1404,8 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co .detail("CommitProxies", db->clientInfo->get().commitProxies) .detail("GlobalConfigHistorySize", db->clientInfo->get().history.size()) .detail("ReqCPs", req.commitProxies) + .detail("CDCProxies", db->clientInfo->get().cdcProxies) + .detail("ReqCDCProxies", req.cdcProxies) .detail("ClusterId", db->serverInfo->get().client.clusterId) .detail("ClientClusterId", db->clientInfo->get().clusterId) .detail("ClusterType", db->clientInfo->get().clusterType) @@ -1256,6 +1416,8 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co clientInfo.id = deterministicRandom()->randomUniqueID(); clientInfo.commitProxies = req.commitProxies; clientInfo.grvProxies = req.grvProxies; + 
clientInfo.cdcProxies = req.cdcProxies; + clientInfo.streamToCDCProxyId = db->clientInfo->get().streamToCDCProxyId; clientInfo.history = db->clientInfo->get().history; clientInfo.clusterId = db->serverInfo->get().client.clusterId; clientInfo.clusterType = db->clusterType; @@ -1976,6 +2138,102 @@ Future monitorGlobalConfig(ClusterControllerData::DBInfo* db) { } } +Future monitorCDCProxyAssignments(ClusterControllerData::DBInfo* db) { + while (true) { + ReadYourWritesTransaction tr(db->db); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + + // Install the wakeup before reading published endpoints so an endpoint + // publication during the metadata scan cannot leave stale assignments asleep. + Future endpointChangeFuture = db->clientInfo->onChange(); + std::map streamToCDCProxyId; + const std::vector availableProxies = db->clientInfo->get().cdcProxies; + std::map replacementByFailedProxy; + size_t replacementIndex = 0; + bool repairedAssignment = false; + Key begin = cdcProxyKeys.begin; + while (begin < cdcProxyKeys.end) { + RangeResult assignments = + co_await tr.getRange(KeyRangeRef(begin, cdcProxyKeys.end), CLIENT_KNOBS->TOO_MANY); + for (const auto& assignment : assignments) { + const auto [streamId, proxyId] = decodeCDCProxyKey(assignment.key); + UID resolvedProxyId = proxyId; + const bool hasOwner = + std::any_of(availableProxies.begin(), availableProxies.end(), [proxyId](const auto& proxy) { + return proxy.id() == proxyId; + }); + if (!availableProxies.empty() && !hasOwner) { + auto replacement = replacementByFailedProxy.find(proxyId); + if (replacement == replacementByFailedProxy.end()) { + resolvedProxyId = availableProxies[replacementIndex++ % availableProxies.size()].id(); + replacementByFailedProxy.emplace(proxyId, resolvedProxyId); + } else { + resolvedProxyId = replacement->second; + } + tr.clear(assignment.key); + 
tr.set(cdcProxyKeyFor(streamId, resolvedProxyId), Value()); + repairedAssignment = true; + CODE_PROBE( + true, "CDC stream assignment is repaired after owner loss", probe::decoration::rare); + TraceEvent("CDCProxyAssignmentRepaired") + .detail("StreamId", streamId) + .detail("OldCDCProxyID", proxyId) + .detail("NewCDCProxyID", resolvedProxyId); + } + ASSERT_WE_THINK(streamToCDCProxyId.emplace(streamId, resolvedProxyId).second); + } + if (!assignments.more) { + break; + } + begin = keyAfter(assignments.back().key); + } + + if (!streamToCDCProxyId.empty() && availableProxies.empty()) { + CODE_PROBE( + true, "CDC assignments wait while no proxy endpoints are published", probe::decoration::rare); + Future assignmentChangeFuture = tr.watch(cdcProxyAssignmentChangeKey); + co_await tr.commit(); + co_await (assignmentChangeFuture || endpointChangeFuture); + break; + } + + if (repairedAssignment) { + tr.set(cdcProxyAssignmentChangeKey, + BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), + IncludeVersion(ProtocolVersion::withNativeCdc()))); + co_await tr.commit(); + break; + } + + ClientDBInfo clientInfo = db->clientInfo->get(); + if (clientInfo.streamToCDCProxyId != streamToCDCProxyId) { + clientInfo.id = deterministicRandom()->randomUniqueID(); + clientInfo.streamToCDCProxyId = std::move(streamToCDCProxyId); + + ServerDBInfo serverInfo = db->serverInfo->get(); + serverInfo.id = deterministicRandom()->randomUniqueID(); + serverInfo.infoGeneration = ++db->dbInfoCount; + serverInfo.client = clientInfo; + db->serverInfo->set(serverInfo); + db->clientInfo->set(clientInfo); + } + + Future assignmentChangeFuture = tr.watch(cdcProxyAssignmentChangeKey); + co_await tr.commit(); + co_await (assignmentChangeFuture || endpointChangeFuture); + break; + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } + } +} + Future updatedChangingDatacenters(ClusterControllerData* self) { // do not change the cluster controller until all the processes have had a chance 
to register co_await delay(SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY); @@ -2918,6 +3176,9 @@ ACTOR Future clusterControllerCore(ClusterControllerFullInterface interf, self.addActor.send(monitorServerInfoConfig(&self.db)); self.addActor.send(monitorStorageMetadata(&self)); self.addActor.send(monitorGlobalConfig(&self.db)); + // These actors also drain durable CDC state when new stream registration is disabled. + self.addActor.send(monitorCDCProxyAssignments(&self.db)); + self.addActor.send(monitorAndRecruitCDCProxies(&self)); self.addActor.send(updatedChangingDatacenters(&self)); self.addActor.send(updatedChangedDatacenters(&self)); self.addActor.send(updateDatacenterVersionDifference(&self)); diff --git a/fdbserver/clustercontroller/ClusterController.h b/fdbserver/clustercontroller/ClusterController.h index 3b0018d1ec5..61d9d8d2240 100644 --- a/fdbserver/clustercontroller/ClusterController.h +++ b/fdbserver/clustercontroller/ClusterController.h @@ -143,6 +143,7 @@ class ClusterControllerData { Future clientCounter; int clientCount; ClusterType clusterType = ClusterType::STANDALONE; + std::vector cdcProxies; Reference recoveryData; DBInfo() diff --git a/fdbserver/clustercontroller/ClusterRecovery.cpp b/fdbserver/clustercontroller/ClusterRecovery.cpp index ec4a30e54d2..7315a367141 100644 --- a/fdbserver/clustercontroller/ClusterRecovery.cpp +++ b/fdbserver/clustercontroller/ClusterRecovery.cpp @@ -45,6 +45,7 @@ static std::set const& normalClusterRecoveryErrors() { s.insert(error_code_tlog_failed); s.insert(error_code_commit_proxy_failed); s.insert(error_code_grv_proxy_failed); + s.insert(error_code_cdc_proxy_failed); s.insert(error_code_resolver_failed); s.insert(error_code_backup_worker_failed); s.insert(error_code_recruitment_failed); @@ -231,6 +232,40 @@ Future newGrvProxies(Reference self, RecruitFromConfi self->grvProxies = std::move(newRecruits); } +Future ensureCDCProxies(Reference self, RecruitFromConfigurationReply recr) { /* Makes sure CDC proxies exist for this recovery. When ENABLE_NATIVE_CDC is off and neither cdcStreamKeys nor cdcRetiredTagPopKeys holds data, the proxy list is cleared and nothing is recruited. When proxies already exist they are reused. Otherwise one CDC proxy is initialized on each recruited GRV proxy worker, the list is stored on the controller data and registration is triggered. Initialization errors are transformed to cdc_proxy_failed. */ + const bool 
hasDurableCdcState = !(co_await self->txnStateStore->readRange(cdcStreamKeys)).empty() || + !(co_await self->txnStateStore->readRange(cdcRetiredTagPopKeys)).empty(); + if (!CLIENT_KNOBS->ENABLE_NATIVE_CDC && !hasDurableCdcState) { + CODE_PROBE(true, "Recovery skips CDC proxies when disabled with no durable state"); + self->controllerData->db.cdcProxies.clear(); + co_return; + } + CODE_PROBE(!CLIENT_KNOBS->ENABLE_NATIVE_CDC && hasDurableCdcState, + "Recovery recruits CDC proxies to drain disabled durable state", + probe::decoration::rare); + if (!self->controllerData->db.cdcProxies.empty()) { + CODE_PROBE(true, "Recovery reuses CDC proxies while CDC state remains durable"); + TraceEvent("CDCProxiesReused", self->dbgid).detail("Count", self->controllerData->db.cdcProxies.size()); + co_return; + } + + std::vector> initializationReplies; + for (int i = 0; i < recr.grvProxies.size(); i++) { /* one CDC proxy per recruited GRV proxy worker */ + InitializeCDCProxyRequest req; + req.recoveryCount = self->cstate.myDBState.recoveryCount + 1; /* NOTE(review): presumably the recovery count of the generation being recruited -- confirm */ + TraceEvent("CDCProxyReplies", self->dbgid).detail("WorkerID", recr.grvProxies[i].id()); + initializationReplies.push_back( + transformErrors(throwErrorOr(recr.grvProxies[i].cdcProxy.getReplyUnlessFailedFor( + req, SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY)), + cdc_proxy_failed())); + } + + std::vector newRecruits = co_await getAll(initializationReplies); + TraceEvent("CDCProxyInitializationComplete", self->dbgid).log(); + self->controllerData->db.cdcProxies = std::move(newRecruits); + self->registrationTrigger.trigger(); +} + Future newResolvers(Reference self, RecruitFromConfigurationReply recr) { std::vector> initializationReplies; for (int i = 0; i < recr.resolvers.size(); i++) { @@ -779,6 +814,7 @@ void sendMasterRegistration(ClusterRecoveryData* self, LogSystemConfig const& logSystemConfig, std::vector commitProxies, std::vector grvProxies, + std::vector cdcProxies, std::vector resolvers, DBRecoveryCount recoveryCount, std::vector 
priorCommittedLogServers) { @@ -788,6 +824,7 @@ void sendMasterRegistration(ClusterRecoveryData* self, masterReq.logSystemConfig = logSystemConfig; masterReq.commitProxies = commitProxies; masterReq.grvProxies = grvProxies; + masterReq.cdcProxies = cdcProxies; masterReq.resolvers = resolvers; masterReq.recoveryCount = recoveryCount; if (self->hasConfiguration) @@ -826,6 +863,7 @@ Future updateRegistration(Reference self, ReferenceprovisionalCommitProxies, self->provisionalGrvProxies, + self->controllerData->db.cdcProxies, self->resolvers, self->cstate.myDBState.recoveryCount, self->cstate.prevDBState.getPriorCommittedLogServers()); @@ -835,6 +873,7 @@ Future updateRegistration(Reference self, ReferencecommitProxies, self->grvProxies, + self->controllerData->db.cdcProxies, self->resolvers, self->cstate.myDBState.recoveryCount, std::vector()); @@ -1102,6 +1141,7 @@ Future>> recruitEverything( .detail("Status", RecoveryStatus::names[RecoveryStatus::initializing_transaction_servers]) .detail("CommitProxies", recruits.commitProxies.size()) .detail("GrvProxies", recruits.grvProxies.size()) + .detail("CDCProxies", recruits.grvProxies.size()) .detail("TLogs", recruits.tLogs.size()) .detail("Resolvers", recruits.resolvers.size()) .detail("SatelliteTLogs", recruits.satelliteTLogs.size()) @@ -1122,6 +1162,7 @@ Future>> recruitEverything( Future txnSystemInitialized = traceAfter(newCommitProxies(self, recruits), "CommitProxiesInitialized") && traceAfter(newGrvProxies(self, recruits), "GRVProxiesInitialized") && + traceAfter(ensureCDCProxies(self, recruits), "CDCProxiesAvailable") && traceAfter(newResolvers(self, recruits), "ResolversInitialized") && traceAfter(newTLogServers(self, recruits, oldLogSystem, &confChanges), "TLogServersInitialized"); co_await (txnSystemInitialized || monitorInitializingTxnSystem(self->controllerData->db.unfinishedRecoveries)); @@ -1279,6 +1320,20 @@ Future readTransactionSystemState(Reference self, 
self->allTags.push_back(decodeServerTagValue(kv.value)); } + std::set activeCdcStreams; + RangeResult rawCdcStreams = co_await self->txnStateStore->readRange(cdcStreamKeys); + for (auto& kv : rawCdcStreams) { + activeCdcStreams.insert(decodeCDCStreamKey(kv.key)); + } + + RangeResult rawCdcHistoryTags = co_await self->txnStateStore->readRange(cdcTagHistoryKeys); + for (auto& kv : rawCdcHistoryTags) { + const auto tagHistory = decodeCDCTagHistoryKey(kv.key); + if (activeCdcStreams.contains(std::get<0>(tagHistory))) { + self->allTags.push_back(std::get<2>(tagHistory)); + } + } + uniquify(self->allTags); self->txnStateLogAdapter->setNextVersion( diff --git a/fdbserver/commitproxy/CommitProxyServer.cpp b/fdbserver/commitproxy/CommitProxyServer.cpp index 086b4814e09..aa4a0cb95ee 100644 --- a/fdbserver/commitproxy/CommitProxyServer.cpp +++ b/fdbserver/commitproxy/CommitProxyServer.cpp @@ -695,6 +695,10 @@ std::set CommitBatchContext::getWrittenTagsPreResolution() { if (isSingleKeyMutation((MutationRef::Type)m.type)) { auto& tags = pProxyCommitData->tagsForKey(m.param1); transactionTags.insert(tags.begin(), tags.end()); + if (!pProxyCommitData->cdcRouting.empty()) { + const auto& cdcTags = pProxyCommitData->cdcRouting.tagsForKey(m.param1); + transactionTags.insert(cdcTags.begin(), cdcTags.end()); + } } else if (m.type == MutationRef::ClearRange) { auto range = pProxyCommitData->keyInfo.rangeContaining(m.param1); if (range.end() >= m.param2) { @@ -710,6 +714,10 @@ std::set CommitBatchContext::getWrittenTagsPreResolution() { } } KeyRangeRef clearRange(KeyRangeRef(m.param1, m.param2)); + if (!pProxyCommitData->cdcRouting.empty()) { + const auto cdcTags = pProxyCommitData->cdcRouting.tagsForRange(clearRange); + transactionTags.insert(cdcTags.begin(), cdcTags.end()); + } } else { UNREACHABLE(); } @@ -1391,6 +1399,9 @@ Future assignMutationsToStorageServers(CommitBatchContext* self) { DEBUG_MUTATION("ProxyCommit", self->commitVersion, m, pProxyCommitData->dbgid).detail("To", 
tags); self->toCommit.addTags(tags); + if (!pProxyCommitData->cdcRouting.empty()) { + self->toCommit.addTags(pProxyCommitData->cdcRouting.tagsForKey(m.param1)); + } if (pProxyCommitData->acsBuilder != nullptr) { updateMutationWithAcsAndAddMutationToAcsBuilder( @@ -1482,6 +1493,9 @@ Future assignMutationsToStorageServers(CommitBatchContext* self) { } KeyRangeRef clearRange(KeyRangeRef(m.param1, m.param2)); + if (!pProxyCommitData->cdcRouting.empty()) { + self->toCommit.addTags(pProxyCommitData->cdcRouting.tagsForRange(clearRange)); + } WriteMutationRefVar var = writeMutation(self, &m); // FIXME: Remove assert once ClearRange RAW_ACCESS usecase handling is done ASSERT(std::holds_alternative(var)); @@ -2752,6 +2766,7 @@ Future processCompleteTransactionStateRequest(TransactionStateResolveConte auto lockedKey = pContext->pTxnStateStore->readValue(databaseLockedKey).get(); pContext->pCommitData->locked = lockedKey.present() && !lockedKey.get().empty(); pContext->pCommitData->metadataVersion = pContext->pTxnStateStore->readValue(metadataVersionKey).get(); + pContext->pCommitData->cdcRouting.reload(pContext->pTxnStateStore); pContext->pTxnStateStore->enableSnapshot(); } diff --git a/fdbserver/commitproxy/ProxyCommitData.h b/fdbserver/commitproxy/ProxyCommitData.h index c2c41f99652..dbe1cce69a7 100644 --- a/fdbserver/commitproxy/ProxyCommitData.h +++ b/fdbserver/commitproxy/ProxyCommitData.h @@ -199,6 +199,7 @@ struct ProxyCommitData { Promise validState; // Set once txnStateStore and version are valid double lastVersionTime; KeyRangeMap> vecBackupKeys; + CDCRoutingTable cdcRouting; uint64_t commitVersionRequestNumber; uint64_t mostRecentProcessedRequestNumber; KeyRangeMap>> keyResolvers; @@ -414,6 +415,7 @@ inline ApplyMetadataProxyContext ProxyCommitData::getApplyMetadataProxyContext() return { .dbgid = dbgid, .txnStateStore = txnStateStore, .vecBackupKeys = &vecBackupKeys, + .cdcRouting = &cdcRouting, .keyInfo = &keyInfo, .uid_applyMutationsData = firstProxy ? 
&uid_applyMutationsData : nullptr, .commit = commit, diff --git a/fdbserver/core/ServerKnobs.cpp b/fdbserver/core/ServerKnobs.cpp index 892311c9984..6a96fd2ab68 100644 --- a/fdbserver/core/ServerKnobs.cpp +++ b/fdbserver/core/ServerKnobs.cpp @@ -175,6 +175,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( DESIRED_UPDATE_BYTES, 2*DESIRED_TOTAL_BYTES ); init( UPDATE_DELAY, 0.001 ); init( MAXIMUM_PEEK_BYTES, 10e6 ); + init( CDC_PROXY_BUFFER_BYTES, 1e9 ); if( randomize && BUGGIFY ) CDC_PROXY_BUFFER_BYTES = 10000; init( APPLY_MUTATION_BYTES, 1e6 ); init( BUGGIFY_RECOVER_MEMORY_LIMIT, 1e6 ); init( BUGGIFY_WORKER_REMOVED_MAX_LAG, 30 ); diff --git a/fdbserver/core/WorkerSupport.cpp b/fdbserver/core/WorkerSupport.cpp index 0385e1ddd3a..fd7a58cbebf 100644 --- a/fdbserver/core/WorkerSupport.cpp +++ b/fdbserver/core/WorkerSupport.cpp @@ -19,6 +19,9 @@ template struct NetNotifiedQueue; template class RequestStream; template struct NetNotifiedQueue; +template class RequestStream; +template struct NetNotifiedQueue; + template class RequestStream; template struct NetNotifiedQueue; @@ -230,6 +233,7 @@ const Role Role::TRANSACTION_LOG("TLog", "TL"); const Role Role::SHARED_TRANSACTION_LOG("SharedTLog", "SL", false); const Role Role::COMMIT_PROXY("CommitProxyServer", "CP"); const Role Role::GRV_PROXY("GrvProxyServer", "GP"); +const Role Role::CDC_PROXY("CDCProxyServer", "DP"); const Role Role::MASTER("MasterServer", "MS"); const Role Role::RESOLVER("Resolver", "RV"); const Role Role::CLUSTER_CONTROLLER("ClusterController", "CC"); diff --git a/fdbserver/core/include/fdbserver/core/Knobs.h b/fdbserver/core/include/fdbserver/core/Knobs.h index a86a669bec1..a548ac62c29 100644 --- a/fdbserver/core/include/fdbserver/core/Knobs.h +++ b/fdbserver/core/include/fdbserver/core/Knobs.h @@ -72,6 +72,7 @@ class SWIFT_CXX_IMMORTAL_SINGLETON_TYPE ServerKnobs : public KnobsImpl master; RequestStream commitProxy; RequestStream grvProxy; + RequestStream 
cdcProxy; RequestStream dataDistributor; RequestStream ratekeeper; RequestStream consistencyScan; @@ -87,6 +89,7 @@ struct WorkerInterface { master.getEndpoint(TaskPriority::Worker); commitProxy.getEndpoint(TaskPriority::Worker); grvProxy.getEndpoint(TaskPriority::Worker); + cdcProxy.getEndpoint(TaskPriority::Worker); resolver.getEndpoint(TaskPriority::Worker); logRouter.getEndpoint(TaskPriority::Worker); debugPing.getEndpoint(TaskPriority::Worker); @@ -121,7 +124,8 @@ struct WorkerInterface { execReq, workerSnapReq, backup, - updateServerDBInfo); + updateServerDBInfo, + cdcProxy); } }; @@ -239,6 +243,7 @@ struct RegisterMasterRequest { LogSystemConfig logSystemConfig; std::vector commitProxies; std::vector grvProxies; + std::vector cdcProxies; std::vector resolvers; DBRecoveryCount recoveryCount; int64_t registrationCount; @@ -266,7 +271,8 @@ struct RegisterMasterRequest { configuration, priorCommittedLogServers, recoveryState, - recoveryStalled); + recoveryStalled, + cdcProxies); } }; @@ -731,6 +737,21 @@ struct InitializeGrvProxyRequest { extern template class RequestStream; extern template struct NetNotifiedQueue; +struct InitializeCDCProxyRequest { + constexpr static FileIdentifier file_identifier = 416762; + uint64_t recoveryCount; + ReplyPromise reply; + + template + void serialize(Ar& ar) { + serializer(ar, recoveryCount, reply); + } +}; + +// Instantiated in WorkerSupport.cpp +extern template class RequestStream; +extern template struct NetNotifiedQueue; + struct InitializeDataDistributorRequest { constexpr static FileIdentifier file_identifier = 8858952; UID reqId; @@ -970,6 +991,7 @@ struct Role { static const Role SHARED_TRANSACTION_LOG; static const Role COMMIT_PROXY; static const Role GRV_PROXY; + static const Role CDC_PROXY; static const Role MASTER; static const Role RESOLVER; static const Role CLUSTER_CONTROLLER; diff --git a/fdbserver/logsystem/ApplyMetadataMutation.cpp b/fdbserver/logsystem/ApplyMetadataMutation.cpp index cf001f7ec61..c9b0155a775 
100644 --- a/fdbserver/logsystem/ApplyMetadataMutation.cpp +++ b/fdbserver/logsystem/ApplyMetadataMutation.cpp @@ -31,6 +31,7 @@ #include "fdbserver/logsystem/LogSystem.h" #include "flow/Error.h" #include "flow/Trace.h" +#include "flow/UnitTest.h" Reference getStorageInfo(UID id, std::map>* storageCache, @@ -47,6 +48,75 @@ Reference getStorageInfo(UID id, } return storageInfo; } + +CDCRoutingTable::CDCRoutingTable() { + tagsByRange.insert(allKeys, std::set()); +} + +void CDCRoutingTable::updateRange(CDCStreamId streamId, KeyRangeRef const& keys) { + streams[streamId].keys = KeyRange(keys); +} + +bool CDCRoutingTable::updateTag(CDCStreamId streamId, Version version, Tag tag) { + ASSERT(tag.locality == tagLocalityCDC); + auto& existing = streams[streamId].tag; + if (!existing.present() || version >= existing.get().first) { + existing = std::make_pair(version, tag); + return true; + } + return false; +} + +void CDCRoutingTable::rebuildRanges() { + tagsByRange.insert(allKeys, std::set()); + for (const auto& [streamId, state] : streams) { + if (!state.keys.present() || !state.tag.present()) { + continue; + } + for (auto range : tagsByRange.modify(state.keys.get())) { + range->value().insert(state.tag.get().second); + } + } + tagsByRange.coalesce(allKeys); +} + +void CDCRoutingTable::setRange(CDCStreamId streamId, KeyRangeRef const& keys) { + updateRange(streamId, keys); + rebuildRanges(); +} + +void CDCRoutingTable::setTag(CDCStreamId streamId, Version version, Tag tag) { + if (updateTag(streamId, version, tag)) { + rebuildRanges(); + } +} + +void CDCRoutingTable::reload(IKeyValueStore* txnStateStore) { + streams.clear(); + const RangeResult streamRows = txnStateStore->readRange(cdcStreamKeys).get(); + for (const auto& kv : streamRows) { + updateRange(decodeCDCStreamKey(kv.key), decodeCDCStreamKeysValue(kv.value)); + } + const RangeResult tagHistoryRows = txnStateStore->readRange(cdcTagHistoryKeys).get(); + for (const auto& kv : tagHistoryRows) { + const auto [streamId, 
version, tag] = decodeCDCTagHistoryKey(kv.key); + updateTag(streamId, version, tag); + } + rebuildRanges(); +} + +const std::set& CDCRoutingTable::tagsForKey(KeyRef const& key) const { + return tagsByRange.rangeContaining(key).value(); +} + +std::set CDCRoutingTable::tagsForRange(KeyRangeRef const& keys) const { + std::set tags; + for (auto range : tagsByRange.intersectingRanges(keys)) { + tags.insert(range.value().begin(), range.value().end()); + } + return tags; +} + namespace { // It is incredibly important that any modifications to txnStateStore are done in such a way that the same operations @@ -77,9 +147,9 @@ class ApplyMetadataMutationsImpl { : spanContext(spanContext_), dbgid(proxyMetadata_.dbgid), arena(arena_), mutations(mutations_), txnStateStore(proxyMetadata_.txnStateStore), toCommit(toCommit_), confChange(confChange_), logSystemConsumer(logSystemConsumer_), version(version), popVersion(popVersion_), - vecBackupKeys(proxyMetadata_.vecBackupKeys), keyInfo(proxyMetadata_.keyInfo), - uid_applyMutationsData(proxyMetadata_.uid_applyMutationsData), commit(proxyMetadata_.commit), - cx(proxyMetadata_.cx), committedVersion(proxyMetadata_.committedVersion), + vecBackupKeys(proxyMetadata_.vecBackupKeys), cdcRouting(proxyMetadata_.cdcRouting), + keyInfo(proxyMetadata_.keyInfo), uid_applyMutationsData(proxyMetadata_.uid_applyMutationsData), + commit(proxyMetadata_.commit), cx(proxyMetadata_.cx), committedVersion(proxyMetadata_.committedVersion), storageCache(proxyMetadata_.storageCache), tag_popped(proxyMetadata_.tag_popped), tssMapping(proxyMetadata_.tssMapping), initialCommit(initialCommit_), provisionalCommitProxy(provisionalCommitProxy_), @@ -124,6 +194,7 @@ class ApplyMetadataMutationsImpl { Version version = invalidVersion; Version popVersion = 0; KeyRangeMap>* vecBackupKeys = nullptr; + CDCRoutingTable* cdcRouting = nullptr; KeyRangeMap* keyInfo = nullptr; std::map* uid_applyMutationsData = nullptr; PublicRequestStream commit = PublicRequestStream(); @@ 
-552,6 +623,31 @@ class ApplyMetadataMutationsImpl { .detail("LogRangeEnd", logRangeEnd); } + void checkSetCDCMetadata(MutationRef m) { + if (!cdcStreamNameKeys.contains(m.param1) && !cdcStreamKeys.contains(m.param1) && + !cdcTagHistoryKeys.contains(m.param1) && !cdcRetiredTagPopKeys.contains(m.param1) && + !cdcProxyKeys.contains(m.param1) && m.param1 != cdcMaxStreamIdKey && + m.param1 != cdcProxyAssignmentChangeKey) { + return; + } + if (!initialCommit) { + txnStateStore->set(KeyValueRef(m.param1, m.param2)); + } + if (toCommit && SERVER_KNOBS->ENABLE_VERSION_VECTOR_TLOG_UNICAST && + (cdcStreamKeys.contains(m.param1) || cdcTagHistoryKeys.contains(m.param1))) { + toCommit->setLogsChanged(); + } + if (!cdcRouting) { + return; + } + if (cdcStreamKeys.contains(m.param1)) { + cdcRouting->setRange(decodeCDCStreamKey(m.param1), decodeCDCStreamKeysValue(m.param2)); + } else if (cdcTagHistoryKeys.contains(m.param1)) { + const auto [streamId, tagVersion, tag] = decodeCDCTagHistoryKey(m.param1); + cdcRouting->setTag(streamId, tagVersion, tag); + } + } + void checkSetGlobalKeys(MutationRef m) { if (!m.param1.startsWith(globalKeysPrefix)) { return; @@ -994,6 +1090,34 @@ class ApplyMetadataMutationsImpl { txnStateStore->clear(commonLogRange); } + void checkClearCDCMetadata(KeyRangeRef range) { + if (!cdcStreamNameKeys.intersects(range) && !cdcStreamKeys.intersects(range) && + !cdcTagHistoryKeys.intersects(range) && !cdcRetiredTagPopKeys.intersects(range) && + !cdcProxyKeys.intersects(range) && !range.contains(cdcMaxStreamIdKey)) { + return; + } + // CDC tags may be shared and acknowledgement minima are stored outside transaction state. + // A durable retired-tag watermark lets any CDC proxy finish pops after stream removal. 
+ if (!initialCommit) { + for (const KeyRangeRef cdcRange : + { cdcStreamNameKeys, cdcStreamKeys, cdcTagHistoryKeys, cdcRetiredTagPopKeys, cdcProxyKeys }) { + if (cdcRange.intersects(range)) { + txnStateStore->clear(cdcRange & range); + } + } + if (range.contains(cdcMaxStreamIdKey)) { + txnStateStore->clear(singleKeyRange(cdcMaxStreamIdKey)); + } + } + if (toCommit && SERVER_KNOBS->ENABLE_VERSION_VECTOR_TLOG_UNICAST && + (cdcStreamKeys.intersects(range) || cdcTagHistoryKeys.intersects(range))) { + toCommit->setLogsChanged(); + } + if (cdcRouting && (cdcStreamKeys.intersects(range) || cdcTagHistoryKeys.intersects(range))) { + cdcRouting->reload(txnStateStore); + } + } + void checkClearTssMappingKeys(MutationRef m, KeyRangeRef range) { if (!tssMappingKeys.intersects(range)) { return; @@ -1131,6 +1255,7 @@ class ApplyMetadataMutationsImpl { checkSetApplyMutationsEndRange(m); checkSetApplyMutationsKeyVersionMapRange(m); checkSetLogRangesRange(m); + checkSetCDCMetadata(m); checkSetGlobalKeys(m); checkSetWriteRecoverKey(m); checkSetMinRequiredCommitVersionKey(m); @@ -1149,6 +1274,7 @@ class ApplyMetadataMutationsImpl { checkClearApplyMutationsEndRange(m, range); checkClearApplyMutationKeyVersionMapRange(m, range); checkClearLogRangesRange(range); + checkClearCDCMetadata(range); checkClearTssMappingKeys(m, range); checkClearTssQuarantineKeys(m, range); checkClearVersionEpochKeys(m, range); @@ -1219,7 +1345,10 @@ bool containsMetadataMutation(const VectorRef& mutations) { (m.param1.startsWith(applyMutationsEndRange.begin)) || (m.param1.startsWith(applyMutationsKeyVersionMapRange.begin)) || (m.param1.startsWith(logRangesRange.begin)) || (m.param1.startsWith(serverKeysPrefix)) || - (m.param1.startsWith(keyServersPrefix))) { + (m.param1.startsWith(keyServersPrefix)) || cdcStreamNameKeys.contains(m.param1) || + cdcStreamKeys.contains(m.param1) || cdcTagHistoryKeys.contains(m.param1) || + cdcRetiredTagPopKeys.contains(m.param1) || cdcProxyKeys.contains(m.param1) || + m.param1 
== cdcMaxStreamIdKey || m.param1 == cdcProxyAssignmentChangeKey) { return true; } } else if (m.type == MutationRef::ClearRange && isSystemKey(m.param2)) { @@ -1232,10 +1361,39 @@ bool containsMetadataMutation(const VectorRef& mutations) { (tssQuarantineKeys.intersects(range)) || (range.contains(previousCoordinatorsKey)) || (range.contains(coordinatorsKey)) || (range.contains(databaseLockedKey)) || (range.contains(metadataVersionKey)) || (range.contains(mustContainSystemMutationsKey)) || - (range.contains(writeRecoveryKey)) || (range.intersects(testOnlyTxnStateStorePrefixRange))) { + (range.contains(writeRecoveryKey)) || (range.intersects(testOnlyTxnStateStorePrefixRange)) || + cdcStreamNameKeys.intersects(range) || cdcStreamKeys.intersects(range) || + cdcTagHistoryKeys.intersects(range) || cdcRetiredTagPopKeys.intersects(range) || + cdcProxyKeys.intersects(range) || range.contains(cdcMaxStreamIdKey)) { return true; } } } return false; } + +TEST_CASE("/NativeCDC/RoutingTable") { + CDCRoutingTable table; + const Tag ordersTag(tagLocalityCDC, 1); + const Tag overlappingTag(tagLocalityCDC, 2); + const Tag rotatedOrdersTag(tagLocalityCDC, 3); + + ASSERT(table.tagsForKey("b"_sr).empty()); + ASSERT(table.tagsForRange(KeyRangeRef("b"_sr, "x"_sr)).empty()); + + table.setRange(1, KeyRangeRef("a"_sr, "m"_sr)); + table.setTag(1, 100, ordersTag); + table.setRange(2, KeyRangeRef("g"_sr, "z"_sr)); + table.setTag(2, 100, overlappingTag); + + ASSERT(table.tagsForKey("b"_sr) == std::set{ ordersTag }); + ASSERT(table.tagsForKey("h"_sr) == (std::set{ ordersTag, overlappingTag })); + ASSERT(table.tagsForKey("x"_sr) == std::set{ overlappingTag }); + ASSERT(table.tagsForRange(KeyRangeRef("b"_sr, "x"_sr)) == (std::set{ ordersTag, overlappingTag })); + + table.setTag(1, 200, rotatedOrdersTag); + ASSERT(table.tagsForKey("b"_sr) == std::set{ rotatedOrdersTag }); + ASSERT(table.tagsForKey("h"_sr) == (std::set{ rotatedOrdersTag, overlappingTag })); + + return Void(); +} diff --git 
a/fdbserver/logsystem/LogSet.cpp b/fdbserver/logsystem/LogSet.cpp index e8cff87116f..127c2b7986b 100644 --- a/fdbserver/logsystem/LogSet.cpp +++ b/fdbserver/logsystem/LogSet.cpp @@ -21,6 +21,7 @@ #include "fdbserver/logsystem/LogSystem.h" #include "fdbclient/FDBTypes.h" +#include "flow/CodeProbe.h" std::string LogSet::logRouterString() { std::string result; @@ -62,9 +63,13 @@ std::string LogSet::logServerString() { return result; } -void LogSet::populateSatelliteTagLocations(int logRouterTags, int oldLogRouterTags, int txsTags, int oldTxsTags) { +void LogSet::populateSatelliteTagLocations(int logRouterTags, + int oldLogRouterTags, + int txsTags, + int oldTxsTags, + int cdcTags) { satelliteTagLocations.clear(); - satelliteTagLocations.resize(std::max({ logRouterTags, oldLogRouterTags, txsTags, oldTxsTags }) + 1); + satelliteTagLocations.resize(std::max({ logRouterTags, oldLogRouterTags, txsTags, oldTxsTags, cdcTags }) + 1); std::map server_usedBest; std::set> used_servers; @@ -217,7 +222,8 @@ void LogSet::getPushLocations(VectorRef tags, const Optional>& restrictedLogSet) { if (locality == tagLocalitySatellite) { for (auto& t : tags) { - if (t.locality == tagLocalityTxs || t.locality == tagLocalityLogRouter) { + if (t.locality == tagLocalityTxs || t.locality == tagLocalityLogRouter || t.locality == tagLocalityCDC) { + CODE_PROBE(t.locality == tagLocalityCDC, "CDC mutations are routed to satellite TLogs"); for (int loc : satelliteTagLocations[t.id + 1]) { locations.push_back(locationOffset + loc); } diff --git a/fdbserver/logsystem/LogSystem.cpp b/fdbserver/logsystem/LogSystem.cpp index af2c5b6f524..e92a6ed299d 100644 --- a/fdbserver/logsystem/LogSystem.cpp +++ b/fdbserver/logsystem/LogSystem.cpp @@ -21,8 +21,10 @@ #include "fdbserver/logsystem/LogSystem.h" #include "fdbserver/logsystem/LogSystemConsumer.h" #include "fdbclient/FDBTypes.h" +#include "fdbclient/Knobs.h" #include "fdbserver/core/OTELSpanContextMessage.h" #include "fdbserver/core/SpanContextMessage.h" 
+#include "flow/CodeProbe.h" #include "flow/serialize.h" bool logSystemHasRemoteLogs(LogSystem const& logSystem) { @@ -2556,6 +2558,12 @@ Future> LogSystem::newEpoch(Reference oldLogSyst for (auto& it : oldLogSystem->oldLogData) { maxTxsTags = std::max(maxTxsTags, it.txsTags); } + int maxCdcTags = CLIENT_KNOBS->NATIVE_CDC_TAG_COUNT; + for (Tag tag : allTags) { + if (tag.locality == tagLocalityCDC) { + maxCdcTags = std::max(maxCdcTags, tag.id + 1); + } + } if (region.satelliteTLogReplicationFactor > 0 && configuration.usableRegions > 1) { logSystem->tLogs.push_back(makeReference()); @@ -2584,7 +2592,7 @@ Future> LogSystem::newEpoch(Reference oldLogSyst .size()); // Dummy interfaces, so that logSystem->getPushLocations() below uses the correct size logSystem->tLogs[1]->updateLocalitySet(logSystem->tLogs[1]->tLogLocalities); logSystem->tLogs[1]->populateSatelliteTagLocations( - logSystem->logRouterTags, oldLogSystem->logRouterTags, logSystem->txsTags, maxTxsTags); + logSystem->logRouterTags, oldLogSystem->logRouterTags, logSystem->txsTags, maxTxsTags, maxCdcTags); logSystem->expectedLogSets++; } @@ -2756,6 +2764,18 @@ Future> LogSystem::newEpoch(Reference oldLogSyst std::vector sreqs(recr.satelliteTLogs.size()); std::vector satelliteTags; + for (Tag tag : allTags) { + if (tag.locality == tagLocalityCDC) { + CODE_PROBE(true, "CDC tags are recovered onto satellite TLogs"); + locations.clear(); + logSystem->tLogs[1]->getPushLocations(VectorRef(&tag, 1), locations, 0); + for (int loc : locations) { + sreqs[loc].recoverTags.push_back(tag); + } + satelliteTags.push_back(tag); + } + } + if (logSystem->logRouterTags) { for (int i = 0; i < oldLogSystem->logRouterTags; i++) { Tag tag = Tag(tagLocalityLogRouter, i); diff --git a/fdbserver/logsystem/LogSystemConsumer.cpp b/fdbserver/logsystem/LogSystemConsumer.cpp index 816fe221616..bada5d89ed1 100644 --- a/fdbserver/logsystem/LogSystemConsumer.cpp +++ b/fdbserver/logsystem/LogSystemConsumer.cpp @@ -1,7 +1,20 @@ #include 
"fdbserver/logsystem/LogSystemConsumer.h" +#include #include +#include "flow/genericactors.actor.h" + +namespace { +bool shouldPopFromLogSet(Reference const& logSet, Tag tag, int8_t popLocality) { + // CDC tags are replicated to each TLog set. Once a version is acknowledged, every copy can be discarded; + // leaving remote copies unpopped can retain old log generations across failover. + return logSet->locality == tagLocalitySpecial || logSet->locality == tag.locality || + tag.locality == tagLocalityCDC || + (tag.locality < 0 && ((popLocality == tagLocalityInvalid) == logSet->isLocal)); +} +} // namespace + Reference LogSystemConsumer::peekAll(UID dbgid, Version begin, Version end, @@ -20,7 +33,7 @@ Reference LogSystemConsumer::peekAll(UID dbgid, } if (log->isLocal && !log->logServers.empty() && (log->locality == tagLocalitySpecial || log->locality == tag.locality || tag.locality == tagLocalityTxs || - tag.locality == tagLocalityLogRouter)) { + tag.locality == tagLocalityLogRouter || tag.locality == tagLocalityCDC)) { lastBegin = std::max(lastBegin, log->startVersion); localSets.push_back(log); if (log->locality != tagLocalitySatellite) { @@ -95,7 +108,8 @@ Reference LogSystemConsumer::peekAll(UID dbgid, } if (log->isLocal && !log->logServers.empty() && (log->locality == tagLocalitySpecial || log->locality == tag.locality || - tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter)) { + tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter || + tag.locality == tagLocalityCDC)) { thisBegin = std::max(thisBegin, log->startVersion); localOldSets.push_back(log); if (log->locality != tagLocalitySatellite) { @@ -608,19 +622,25 @@ Reference LogSystemConsumer::peekSingle(UID dbgid, Tag tag, std::vector> history) { auto& ls = *logSystem; + auto peekTag = [&](Tag readTag, Version readBegin, Version readEnd) -> Reference { + if (readTag.locality == tagLocalityCDC) { + return peekAll(dbgid, readBegin, readEnd, readTag, false); + } + return 
peekLocal(dbgid, readTag, readBegin, readEnd, false); + }; while (!history.empty() && begin >= history.back().first) { history.pop_back(); } if (history.empty()) { TraceEvent("TLogPeekSingleNoHistory", dbgid).detail("Tag", tag.toString()).detail("Begin", begin); - return peekLocal(dbgid, tag, begin, ls.getPeekEnd(), false); + return peekTag(tag, begin, ls.getPeekEnd()); } else { std::vector> cursors; std::vector epochEnds; TraceEvent("TLogPeekSingleAddingLocal", dbgid).detail("Tag", tag.toString()).detail("Begin", history[0].first); - cursors.push_back(peekLocal(dbgid, tag, history[0].first, ls.getPeekEnd(), false)); + cursors.push_back(peekTag(tag, history[0].first, ls.getPeekEnd())); for (int i = 0; i < history.size(); i++) { TraceEvent("TLogPeekSingleAddingOld", dbgid) @@ -628,11 +648,9 @@ Reference LogSystemConsumer::peekSingle(UID dbgid, .detail("HistoryTag", history[i].second.toString()) .detail("Begin", i + 1 == history.size() ? begin : std::max(history[i + 1].first, begin)) .detail("End", history[i].first); - cursors.push_back(peekLocal(dbgid, - history[i].second, - i + 1 == history.size() ? begin : std::max(history[i + 1].first, begin), - history[i].first, - false)); + cursors.push_back(peekTag(history[i].second, + i + 1 == history.size() ? 
begin : std::max(history[i + 1].first, begin), + history[i].first)); epochEnds.emplace_back(history[i].first); } @@ -877,8 +895,7 @@ void LogSystemConsumer::pop(Version upTo, Tag tag, Version durableKnownCommitted return; } for (auto& t : ls.tLogs) { - if (t->locality == tagLocalitySpecial || t->locality == tag.locality || - (tag.locality < 0 && ((popLocality == tagLocalityInvalid) == t->isLocal))) { + if (shouldPopFromLogSet(t, tag, popLocality)) { for (auto& log : t->logServers) { Version prev = ls.outstandingPops[std::make_pair(log->get().id(), tag)].first; if (prev < upTo) { @@ -896,6 +913,30 @@ void LogSystemConsumer::pop(Version upTo, Tag tag, Version durableKnownCommitted } } +Future LogSystemConsumer::waitForPopped(Version upTo, Tag tag, int8_t popLocality) { + while (true) { + std::vector> poppedFutures; + for (auto& t : logSystem->tLogs) { + if (shouldPopFromLogSet(t, tag, popLocality)) { + for (auto& log : t->logServers) { + poppedFutures.push_back(LogSystem::getPoppedFromTLog(log, tag)); + } + } + } + if (poppedFutures.empty()) { + co_return; + } + + std::vector poppedVersions = co_await getAll(poppedFutures); + if (std::all_of(poppedVersions.begin(), poppedVersions.end(), [upTo](Version poppedVersion) { + return poppedVersion >= upTo; + })) { + co_return; + } + co_await delay(0.01, TaskPriority::TLogPop); + } +} + Future LogSystemConsumer::getTxsPoppedVersion() { auto& ls = *logSystem; return LogSystem::getPoppedTxs(&ls); diff --git a/fdbserver/logsystem/LogSystemPeekCursor.cpp b/fdbserver/logsystem/LogSystemPeekCursor.cpp index 8dfdbee95ee..dd78ec8d9d1 100644 --- a/fdbserver/logsystem/LogSystemPeekCursor.cpp +++ b/fdbserver/logsystem/LogSystemPeekCursor.cpp @@ -1044,8 +1044,13 @@ SetPeekCursor::SetPeekCursor(std::vector> const& logSets, ? 
canReturnEmptyVersionRange( bestServer, j /*currentServer*/, end, knownLockedTLogIds, bestSet, i /* currentSet */) : false); - auto cursor = makeReference( - logSets[i]->logServers[j], tag, begin, end, true, parallelGetMore, returnEmptyIfStopped); + auto cursor = makeReference(logSets[i]->logServers[j], + tag, + begin, + end, + tag.locality != tagLocalityCDC, + parallelGetMore, + returnEmptyIfStopped); serverCursors[i].push_back(cursor); } maxServers = std::max(maxServers, serverCursors[i].size()); diff --git a/fdbserver/logsystem/include/fdbserver/logsystem/ApplyMetadataMutation.h b/fdbserver/logsystem/include/fdbserver/logsystem/ApplyMetadataMutation.h index aac8c5d5b0e..6916cd8703f 100644 --- a/fdbserver/logsystem/include/fdbserver/logsystem/ApplyMetadataMutation.h +++ b/fdbserver/logsystem/include/fdbserver/logsystem/ApplyMetadataMutation.h @@ -56,10 +56,36 @@ struct ApplyMutationsData { Reference> keyVersion; }; +// Active CDC write routing reconstructed from durable stream and tag-history metadata. 
+class CDCRoutingTable : NonCopyable { + struct StreamState { + Optional keys; + Optional> tag; + }; + + std::map streams; + KeyRangeMap> tagsByRange; + + void updateRange(CDCStreamId streamId, KeyRangeRef const& keys); + bool updateTag(CDCStreamId streamId, Version version, Tag tag); + void rebuildRanges(); + +public: + CDCRoutingTable(); + void setRange(CDCStreamId streamId, KeyRangeRef const& keys); + void setTag(CDCStreamId streamId, Version version, Tag tag); + void reload(IKeyValueStore* txnStateStore); + bool empty() const { return streams.empty(); } + + const std::set& tagsForKey(KeyRef const& key) const; + std::set tagsForRange(KeyRangeRef const& keys) const; +}; + struct ApplyMetadataProxyContext { UID dbgid; IKeyValueStore* txnStateStore = nullptr; KeyRangeMap>* vecBackupKeys = nullptr; + CDCRoutingTable* cdcRouting = nullptr; KeyRangeMap* keyInfo = nullptr; std::map* uid_applyMutationsData = nullptr; PublicRequestStream commit; diff --git a/fdbserver/logsystem/include/fdbserver/logsystem/LogSystemConsumer.h b/fdbserver/logsystem/include/fdbserver/logsystem/LogSystemConsumer.h index f0ca7bb86f7..18b9de6f040 100644 --- a/fdbserver/logsystem/include/fdbserver/logsystem/LogSystemConsumer.h +++ b/fdbserver/logsystem/include/fdbserver/logsystem/LogSystemConsumer.h @@ -66,6 +66,8 @@ struct LogSystemConsumer : ReferenceCounted { void popLogRouter(Version upTo, Tag tag, Version durableKnownCommittedVersion, int8_t popLocality); void popTxs(Version upTo, int8_t popLocality = tagLocalityInvalid); void pop(Version upTo, Tag tag, Version durableKnownCommittedVersion = 0, int8_t popLocality = tagLocalityInvalid); + // Waits until every currently targeted TLog reports that `tag` has been popped through `upTo`. 
+ Future waitForPopped(Version upTo, Tag tag, int8_t popLocality = tagLocalityInvalid); Future getTxsPoppedVersion(); Version getEnd() const; Tag getPseudoPopTag(Tag tag, ProcessClass::ClassType type) const; diff --git a/fdbserver/logsystem/include/fdbserver/logsystem/LogSystemTypes.h b/fdbserver/logsystem/include/fdbserver/logsystem/LogSystemTypes.h index ca4d90f39a7..646b18bbd87 100644 --- a/fdbserver/logsystem/include/fdbserver/logsystem/LogSystemTypes.h +++ b/fdbserver/logsystem/include/fdbserver/logsystem/LogSystemTypes.h @@ -56,7 +56,11 @@ class LogSet : NonCopyable, public ReferenceCounted { bool hasLogRouter(UID id) const; bool hasBackupWorker(UID id) const; std::string logServerString(); - void populateSatelliteTagLocations(int logRouterTags, int oldLogRouterTags, int txsTags, int oldTxsTags); + void populateSatelliteTagLocations(int logRouterTags, + int oldLogRouterTags, + int txsTags, + int oldTxsTags, + int cdcTags); void checkSatelliteTagLocations(); int bestLocationFor(Tag tag); void updateLocalitySet(std::vector const& localities); diff --git a/fdbserver/tlog/TLogServer.cpp b/fdbserver/tlog/TLogServer.cpp index 265cf217fa4..e2b9cba0186 100644 --- a/fdbserver/tlog/TLogServer.cpp +++ b/fdbserver/tlog/TLogServer.cpp @@ -1577,13 +1577,16 @@ void commitMessages(TLogData* self, block.append(block.arena(), msg.message.begin(), msg.message.size()); for (auto tag : msg.tags) { if (logData->locality == tagLocalitySatellite) { - if (!(tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter || tag == txsTag)) { + if (!(tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter || + tag.locality == tagLocalityCDC || tag == txsTag)) { continue; } } else if (!(logData->locality == tagLocalitySpecial || logData->locality == tag.locality || tag.locality < 0)) { continue; } + CODE_PROBE(logData->locality == tagLocalitySatellite && tag.locality == tagLocalityCDC, + "Satellite TLog indexes CDC mutation"); if (tag.locality == 
tagLocalityLogRouter) { if (!logData->logRouterTags) { @@ -1950,8 +1953,8 @@ Future tLogPeekMessages(PromiseType replyPromise, auto tagData = logData->getTagData(reqTag); bool tagRecovered = tagData && !tagData->unpoppedRecovered; if (SERVER_KNOBS->ENABLE_VERSION_VECTOR && poppedVer <= reqBegin && - reqBegin > logData->persistentDataDurableVersion && !reqOnlySpilled && reqTag.locality >= 0 && - !reqReturnIfBlocked && tagRecovered) { + reqBegin > logData->persistentDataDurableVersion && !reqOnlySpilled && + (reqTag.locality >= 0 || reqTag.locality == tagLocalityCDC) && !reqReturnIfBlocked && tagRecovered) { double startTime = now(); co_await waitForMessagesForTag(logData, reqTag, reqBegin, SERVER_KNOBS->BLOCKING_PEEK_TIMEOUT); double latency = now() - startTime; diff --git a/fdbserver/worker/CMakeLists.txt b/fdbserver/worker/CMakeLists.txt index 16a26a701a7..b7e4888340f 100644 --- a/fdbserver/worker/CMakeLists.txt +++ b/fdbserver/worker/CMakeLists.txt @@ -4,6 +4,7 @@ add_flow_target(STATIC_LIBRARY NAME fdbserver_worker SRCS ${FDBSERVER_WORKER_SRC add_fdbserver_link_test(fdbserver_workerlinktest fdbserver_worker fdbserver_backupworker + fdbserver_cdcproxy fdbserver_clustercontroller fdbserver_commitproxy fdbserver_consistencyscan @@ -56,6 +57,7 @@ target_link_libraries(fdbserver_worker fdbctl PRIVATE fdbserver_backupworker + fdbserver_cdcproxy fdbserver_clustercontroller fdbserver_commitproxy fdbserver_consistencyscan diff --git a/fdbserver/worker/worker.actor.cpp b/fdbserver/worker/worker.actor.cpp index cf9c9959600..b10216c73e8 100644 --- a/fdbserver/worker/worker.actor.cpp +++ b/fdbserver/worker/worker.actor.cpp @@ -50,6 +50,7 @@ #include "MetricLogger.actor.h" #include "fdbserver/backupworker/BackupWorker.h" #include "fdbserver/clustercontroller/ClusterController.h" +#include "fdbserver/cdcproxy/CDCProxy.h" #include "fdbserver/commitproxy/CommitProxyServer.h" #include "fdbserver/consistencyscan/ConsistencyScan.h" #include 
"fdbserver/datadistributor/DataDistributor.h" @@ -2787,6 +2788,27 @@ ACTOR Future workerServer(Reference connRecord, forwardError(errors, Role::GRV_PROXY, recruited.id(), grvProxyServer(recruited, req, dbInfo)))); req.reply.send(recruited); } + when(InitializeCDCProxyRequest req = waitNext(interf.cdcProxy.getFuture())) { + LocalLineage _; + CDCProxyInterface recruited; + recruited.processId = locality.processId(); + recruited.initEndpoints(); + + std::map details; + startRole(Role::CDC_PROXY, recruited.id(), interf.id(), details); + + DUMPTOKEN(recruited.consume); + DUMPTOKEN(recruited.ack); + DUMPTOKEN(recruited.waitFailure); + DUMPTOKEN(recruited.haltForTesting); + + errorForwarders.add(zombie(recruited, + forwardError(errors, + Role::CDC_PROXY, + recruited.id(), + cdcProxyServer(recruited, req.recoveryCount, dbInfo)))); + req.reply.send(recruited); + } when(InitializeResolverRequest req = waitNext(interf.resolver.getFuture())) { LocalLineage _; getCurrentLineage()->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Resolver; diff --git a/fdbserver/workloads/NativeCdc.cpp b/fdbserver/workloads/NativeCdc.cpp new file mode 100644 index 00000000000..900c3486591 --- /dev/null +++ b/fdbserver/workloads/NativeCdc.cpp @@ -0,0 +1,657 @@ +/* + * NativeCdc.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2026 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include "fdbclient/CDCProxyInterface.h" +#include "fdbclient/Knobs.h" +#include "fdbclient/ManagementAPI.h" +#include "fdbclient/NativeCdc.h" +#include "fdbclient/NativeCdcInternal.h" +#include "fdbclient/SystemData.h" +#include "fdbserver/core/RecoveryState.h" +#include "fdbserver/core/ServerDBInfo.h" +#include "fdbserver/core/TLogInterface.h" +#include "fdbserver/tester/workloads.h" + +struct NativeCdcWorkload : TestWorkload { + static constexpr auto NAME = "NativeCdc"; + bool sharedTagSafety; + bool verifySatelliteIndexing; + + explicit NativeCdcWorkload(WorkloadContext const& wcx) + : TestWorkload(wcx), sharedTagSafety(getOption(options, "sharedTagSafety"_sr, false)), + verifySatelliteIndexing(getOption(options, "verifySatelliteIndexing"_sr, false)) {} + + void disableFailureInjectionWorkloads(std::set& out) const override { out.insert("all"); } + + Future setup(Database const& cx) override { return Void(); } + + Future start(Database const& cx) override { + if (clientId != 0) { + return Void(); + } + return sharedTagSafety ? 
runSharedTagSafety(cx) : run(cx); + } + + Future check(Database const& cx) override { return true; } + + void getMetrics(std::vector& m) override {} + + Future> getPersistedRoute(Database cx, CDCStreamId streamId) { + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + Optional keys = co_await tr.get(cdcStreamKeyFor(streamId)); + Optional minVersion = co_await tr.get(cdcMinVersionKeyFor(streamId)); + RangeResult history = co_await tr.getRange(cdcTagHistoryRangeFor(streamId), 2); + ASSERT(keys.present()); + ASSERT(minVersion.present()); + ASSERT(history.size() == 1); + const auto [historyStreamId, historyVersion, tag] = decodeCDCTagHistoryKey(history[0].key); + ASSERT(historyStreamId == streamId); + const Version initialMinVersion = decodeCDCMinVersionValue(minVersion.get()); + ASSERT(historyVersion <= initialMinVersion); + co_return std::make_pair(tag, initialMinVersion); + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } + } + + Future getPersistedMinVersion(Database cx, CDCStreamId streamId) { + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + Optional minVersion = co_await tr.get(cdcMinVersionKeyFor(streamId)); + ASSERT(minVersion.present()); + co_return decodeCDCMinVersionValue(minVersion.get()); + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } + } + + Future hasPersistedRetention(Database cx, CDCStreamId streamId) { + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + Optional minVersion = co_await tr.get(cdcMinVersionKeyFor(streamId)); + RangeResult history = co_await tr.getRange(cdcTagHistoryRangeFor(streamId), 1); + co_return minVersion.present() || !history.empty(); + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } + } + + Future getRetiredTagPopVersion(Database cx, Tag tag) { + Transaction tr(cx); + 
while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + Optional marker = co_await tr.get(cdcRetiredTagPopKeyFor(tag)); + Optional version = co_await tr.get(cdcRetiredTagPopVersionKeyFor(tag)); + ASSERT(marker.present()); + ASSERT(version.present()); + co_return decodeCDCMinVersionValue(version.get()); + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } + } + + Future hasRetiredTagPopState(Database cx, Tag tag) { + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + Optional marker = co_await tr.get(cdcRetiredTagPopKeyFor(tag)); + Optional version = co_await tr.get(cdcRetiredTagPopVersionKeyFor(tag)); + ASSERT(marker.present() == version.present()); + co_return marker.present(); + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } + } + + Future waitForNoRetiredTagPopState(Database cx, Tag tag) { + while (co_await hasRetiredTagPopState(cx, tag)) { + co_await delay(0.1); + } + } + + Future appendPersistedTag(Database cx, CDCStreamId streamId, Tag tag) { + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + const Version assignmentVersion = co_await tr.getReadVersion(); + tr.set(cdcTagHistoryKeyFor(streamId, assignmentVersion, tag), Value()); + co_await tr.commit(); + co_return; + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } + } + + Future writeValues(Database cx, std::vector> values) { + Transaction tr(cx); + while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + for (const auto& [key, value] : values) { + tr.set(key, value); + } + co_await tr.commit(); + co_return tr.getCommittedVersion(); + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } + } + + Future getLatestPersistedTag(Database cx, CDCStreamId streamId) { + Transaction tr(cx); + 
while (true) { + Error err; + try { + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + RangeResult history = co_await tr.getRange(cdcTagHistoryRangeFor(streamId), CLIENT_KNOBS->TOO_MANY); + ASSERT(!history.empty()); + const auto historyEntry = decodeCDCTagHistoryKey(history.back().key); + ASSERT(std::get<0>(historyEntry) == streamId); + co_return std::get<2>(historyEntry); + } catch (Error& e) { + err = e; + } + co_await tr.onError(err); + } + } + + Future getCDCProxy() { + while (dbInfo->get().client.cdcProxies.empty()) { + co_await dbInfo->onChange(); + } + co_return dbInfo->get().client.cdcProxies.front(); + } + + Future getCDCProxy(CDCStreamId streamId) { + while (true) { + const ClientDBInfo& client = dbInfo->get().client; + auto assigned = client.streamToCDCProxyId.find(streamId); + if (assigned != client.streamToCDCProxyId.end()) { + for (const auto& proxy : client.cdcProxies) { + if (proxy.id() == assigned->second) { + co_return proxy; + } + } + } + co_await dbInfo->onChange(); + } + } + + Future getReplacementCDCProxy(CDCStreamId streamId, UID failedProxyId) { + while (true) { + const ClientDBInfo& client = dbInfo->get().client; + auto assigned = client.streamToCDCProxyId.find(streamId); + if (assigned != client.streamToCDCProxyId.end() && assigned->second != failedProxyId) { + for (const auto& proxy : client.cdcProxies) { + if (proxy.id() == assigned->second) { + co_return proxy; + } + } + } + co_await dbInfo->onChange(); + } + } + + Future waitForCDCProxyAssignmentRemoval(CDCStreamId streamId) { + while (dbInfo->get().client.streamToCDCProxyId.contains(streamId)) { + co_await dbInfo->onChange(); + } + } + + Future waitForNoCDCProxies() { + while (!dbInfo->get().client.cdcProxies.empty()) { + co_await dbInfo->onChange(); + } + } + + Future changeResolverCount(Database cx, int32_t count) { + Standalone config(format("resolvers=%d", count)); + while (true) { + Optional conf; + ConfigurationResult result = + co_await 
ManagementAPI::changeConfig(cx.getReference(), { config }, conf, true); + if (result == ConfigurationResult::SUCCESS) { + co_return; + } + co_await delay(1.0); + } + } + + Future waitForRecoveryAfter(uint64_t previousRecoveryCount, RecoveryState requiredState) { + while (dbInfo->get().recoveryCount <= previousRecoveryCount || dbInfo->get().recoveryState < requiredState) { + co_await dbInfo->onChange(); + } + } + + Future verifySatelliteCDCWrite(Tag tag, Version version) { + bool foundSatelliteTLog = false; + for (const auto& tlogset : dbInfo->get().logSystemConfig.tLogs) { + if (!tlogset.isLocal || tlogset.locality != tagLocalitySatellite) { + continue; + } + foundSatelliteTLog = true; + for (const auto& tlog : tlogset.tLogs) { + TLogPeekReply reply = co_await timeoutError( + tlog.interf().peekMessages.getReply(TLogPeekRequest(version, tag, true, false)), 30.0); + if (!reply.messages.empty()) { + CODE_PROBE(true, "Native CDC workload reads tagged mutation from a satellite TLog"); + co_return; + } + } + } + ASSERT(foundSatelliteTLog); + ASSERT(false); + co_return; + } + + Future runSharedTagSafety(Database cx) { + const Key firstName = "native-cdc-shared-first"_sr; + const Key secondName = "native-cdc-shared-second"_sr; + const KeyRange keys(KeyRangeRef("shared/"_sr, "shared0"_sr)); + const CDCStreamId firstId = co_await registerNativeCdcStream(cx, firstName, keys); + const CDCStreamId secondId = co_await registerNativeCdcStream(cx, secondName, keys); + const auto firstRoute = co_await getPersistedRoute(cx, firstId); + co_await appendPersistedTag(cx, secondId, firstRoute.first); + ASSERT((co_await getLatestPersistedTag(cx, secondId)) == firstRoute.first); + + ASSERT(co_await registerNativeCdcStreamClient(cx, firstName, keys) == firstId); + ASSERT(co_await registerNativeCdcStreamClient(cx, secondName, keys) == secondId); + ASSERT((co_await getCDCProxy(firstId)).id() == (co_await getCDCProxy(secondId)).id()); + const Version writeVersion = co_await writeValues(cx, { { 
"shared/unread"_sr, "protected-by-minimum"_sr } }); + Reference firstConsumer = co_await createNativeCdcConsumer(cx, firstName); + ASSERT(firstConsumer->position().streamId == firstId); + const double firstConsumeDeadline = now() + 30.0; + while (firstConsumer->position().lastConsumedVersion < writeVersion) { + const Version previous = firstConsumer->position().lastConsumedVersion; + CDCConsumeReply consumed = co_await timeoutError(firstConsumer->consume(), 30.0); + if (consumed.lastConsumedVersion == previous) { + ASSERT(now() < firstConsumeDeadline); + co_await delay(0.1); + continue; + } + ASSERT(consumed.lastConsumedVersion > previous); + } + co_await firstConsumer->acknowledge(); + co_await removeNativeCdcStreamClient(cx, firstName); + co_await waitForCDCProxyAssignmentRemoval(firstId); + + Reference unreadConsumer = co_await createNativeCdcConsumer(cx, secondName); + ASSERT(unreadConsumer->position().streamId == secondId); + bool foundUnread = false; + while (unreadConsumer->position().lastConsumedVersion < writeVersion) { + const Version previous = unreadConsumer->position().lastConsumedVersion; + CDCConsumeReply unread = co_await timeoutError(unreadConsumer->consume(), 30.0); + ASSERT(unread.lastConsumedVersion > previous); + for (const auto& versioned : unread.mutations) { + for (const auto& mutation : versioned.mutations) { + if (mutation.param1 == "shared/unread"_sr) { + foundUnread = true; + } + } + } + } + ASSERT(foundUnread); + co_await unreadConsumer->acknowledge(); + + co_await removeNativeCdcStreamClient(cx, secondName); + co_await waitForCDCProxyAssignmentRemoval(secondId); + } + + Future run(Database cx) { + const Key firstName = "native-cdc-first"_sr; + const Key secondName = "native-cdc-second"_sr; + const KeyRange firstRange(KeyRangeRef("a"_sr, "m"_sr)); + const KeyRange conflictingRange(KeyRangeRef("a"_sr, "z"_sr)); + const KeyRange secondRange(KeyRangeRef("g"_sr, "z"_sr)); + + const CDCStreamId firstId = co_await registerNativeCdcStream(cx, 
firstName, firstRange); + ASSERT(co_await registerNativeCdcStream(cx, firstName, firstRange) == firstId); + + bool conflictingDuplicateRejected = false; + try { + co_await registerNativeCdcStream(cx, firstName, conflictingRange); + } catch (Error& e) { + if (e.code() == error_code_client_invalid_operation) { + conflictingDuplicateRejected = true; + } else { + throw; + } + } + ASSERT(conflictingDuplicateRejected); + + const auto firstRoute = co_await getPersistedRoute(cx, firstId); + ASSERT(firstRoute.first.locality == tagLocalityCDC); + + std::vector streams = co_await listNativeCdcStreams(cx); + ASSERT(streams.size() == 1); + ASSERT(streams[0].name == firstName); + ASSERT(streams[0].streamId == firstId); + ASSERT(streams[0].keys == firstRange); + ASSERT(streams[0].minVersion == firstRoute.second); + + const Version firstConsumedThrough = + co_await writeValues(cx, { { "first/acknowledged"_sr, "acknowledged"_sr } }); + const Version firstAckMinVersion = firstConsumedThrough + 1; + ASSERT(co_await acknowledgeNativeCdcStream(cx, firstId, firstConsumedThrough) == firstAckMinVersion); + ASSERT(co_await acknowledgeNativeCdcStream(cx, firstId, firstRoute.second) == firstAckMinVersion); + streams = co_await listNativeCdcStreams(cx); + ASSERT(streams.size() == 1); + ASSERT(streams[0].minVersion == firstAckMinVersion); + + Optional removedFirst = co_await removeNativeCdcStream(cx, firstName); + ASSERT(removedFirst.present()); + ASSERT((co_await getRetiredTagPopVersion(cx, firstRoute.first)) == removedFirst.get().removalVersion); + ASSERT((co_await listNativeCdcStreams(cx)).empty()); + ASSERT(!(co_await hasPersistedRetention(cx, firstId))); + + bool retiredAcknowledgeRejected = false; + try { + co_await acknowledgeNativeCdcStream(cx, firstId, firstConsumedThrough + 5); + } catch (Error& e) { + retiredAcknowledgeRejected = e.code() == error_code_client_invalid_operation; + } + ASSERT(retiredAcknowledgeRejected); + + const CDCStreamId secondId = co_await 
registerNativeCdcStream(cx, secondName, secondRange); + const auto secondRoute = co_await getPersistedRoute(cx, secondId); + ASSERT(secondId > firstId); + ASSERT(secondRoute.first == firstRoute.first); + + Optional removedSecond = co_await removeNativeCdcStream(cx, secondName); + ASSERT(removedSecond.present()); + ASSERT((co_await getRetiredTagPopVersion(cx, secondRoute.first)) == removedSecond.get().removalVersion); + + const Key liveName = "native-cdc-live"_sr; + const KeyRange liveRange(KeyRangeRef("live/"_sr, "live0"_sr)); + const CDCStreamId liveStreamId = co_await registerNativeCdcStreamClient(cx, liveName, liveRange); + const Tag liveTag = co_await getLatestPersistedTag(cx, liveStreamId); + Reference liveConsumer = co_await createNativeCdcConsumer(cx, liveName); + ASSERT(liveConsumer->position().streamId == liveStreamId); + CDCProxyInterface owner = co_await getCDCProxy(liveStreamId); + + bool futureAcknowledgeRejected = false; + try { + co_await resumeNativeCdcConsumer(cx, CDCCursor(liveStreamId, std::numeric_limits::max() - 2)) + ->acknowledge(); + } catch (Error& e) { + futureAcknowledgeRejected = e.code() == error_code_client_invalid_operation; + } + ASSERT(futureAcknowledgeRejected); + + std::vector listed = co_await listNativeCdcStreamsClient(cx); + ASSERT(listed.size() == 1); + ASSERT(listed[0].name == liveName); + ASSERT(listed[0].streamId == liveStreamId); + ASSERT(listed[0].keys == liveRange); + + const Version writeVersion = + co_await writeValues(cx, { { "live/in"_sr, "captured"_sr }, { "other/out"_sr, "ignored"_sr } }); + if (verifySatelliteIndexing) { + co_await verifySatelliteCDCWrite(liveTag, writeVersion); + } + + for (const auto& nonOwner : dbInfo->get().client.cdcProxies) { + if (nonOwner.id() == owner.id()) { + continue; + } + bool wrongOwnerRejected = false; + try { + co_await nonOwner.consume.getReply(CDCConsumeRequest(CDCCursor(liveStreamId, invalidVersion))); + } catch (Error& e) { + wrongOwnerRejected = e.code() == 
error_code_wrong_shard_server; + } + ASSERT(wrongOwnerRejected); + bool wrongOwnerRemoveRejected = false; + try { + co_await nonOwner.removeStream.getReply(CDCRemoveStreamRequest(liveName)); + } catch (Error& e) { + wrongOwnerRemoveRejected = e.code() == error_code_wrong_shard_server; + } + ASSERT(wrongOwnerRemoveRejected); + break; + } + + bool foundInRangeWrite = false; + bool foundOutOfRangeWrite = false; + const double initialConsumeDeadline = now() + 30.0; + while (liveConsumer->position().lastConsumedVersion < writeVersion) { + const Version previous = liveConsumer->position().lastConsumedVersion; + CDCConsumeReply consumed = co_await timeoutError(liveConsumer->consume(), 30.0); + if (consumed.lastConsumedVersion == previous) { + ASSERT(now() < initialConsumeDeadline); + co_await delay(0.1); + continue; + } + ASSERT(consumed.lastConsumedVersion > previous); + for (const auto& versioned : consumed.mutations) { + for (const auto& mutation : versioned.mutations) { + if (mutation.param1 == "live/in"_sr) { + foundInRangeWrite = true; + } + if (mutation.param1 == "other/out"_sr) { + foundOutOfRangeWrite = true; + } + } + } + } + ASSERT(foundInRangeWrite); + ASSERT(!foundOutOfRangeWrite); + + const uint64_t recoveryCount = dbInfo->get().recoveryCount; + co_await owner.haltForTesting.getReply(HaltCDCProxyRequest()); + CDCProxyInterface replacement = co_await timeoutError(getReplacementCDCProxy(liveStreamId, owner.id()), 30.0); + ASSERT(replacement.id() != owner.id()); + ASSERT(dbInfo->get().recoveryCount == recoveryCount); + + const Version afterFailureVersion = + co_await writeValues(cx, { { "live/after-failure"_sr, "captured-after-failure"_sr } }); + bool foundAfterFailureWrite = false; + const double afterFailureConsumeDeadline = now() + 30.0; + while (liveConsumer->position().lastConsumedVersion < afterFailureVersion) { + const Version previous = liveConsumer->position().lastConsumedVersion; + CDCConsumeReply afterFailure = co_await 
timeoutError(liveConsumer->consume(), 30.0); + if (afterFailure.lastConsumedVersion == previous) { + ASSERT(now() < afterFailureConsumeDeadline); + co_await delay(0.1); + continue; + } + ASSERT(afterFailure.lastConsumedVersion > previous); + for (const auto& versioned : afterFailure.mutations) { + for (const auto& mutation : versioned.mutations) { + if (mutation.param1 == "live/after-failure"_sr) { + foundAfterFailureWrite = true; + } + } + } + } + ASSERT(foundAfterFailureWrite); + + const Version cursorBeforeRecovery = liveConsumer->position().lastConsumedVersion; + co_await liveConsumer->acknowledge(); + ASSERT(co_await getPersistedMinVersion(cx, liveStreamId) == cursorBeforeRecovery + 1); + + const int32_t recoveredResolverCount = (co_await getDatabaseConfiguration(cx)).getDesiredResolvers() + 1; + const UID ownerBeforeRecovery = replacement.id(); + const uint64_t recoveryBeforeChange = dbInfo->get().recoveryCount; + co_await changeResolverCount(cx, recoveredResolverCount); + co_await timeoutError(waitForRecoveryAfter(recoveryBeforeChange, RecoveryState::ACCEPTING_COMMITS), 60.0); + CDCProxyInterface recoveredOwner = co_await getCDCProxy(liveStreamId); + ASSERT(recoveredOwner.id() == ownerBeforeRecovery); + + const Version afterRecoveryVersion = + co_await writeValues(cx, { { "live/after-recovery"_sr, "captured-after-recovery"_sr } }); + if (verifySatelliteIndexing) { + co_await verifySatelliteCDCWrite(liveTag, afterRecoveryVersion); + } + bool foundAfterRecoveryWrite = false; + const double afterRecoveryConsumeDeadline = now() + 30.0; + while (liveConsumer->position().lastConsumedVersion < afterRecoveryVersion) { + const Version previous = liveConsumer->position().lastConsumedVersion; + CDCConsumeReply afterRecovery = co_await timeoutError(liveConsumer->consume(), 30.0); + if (afterRecovery.lastConsumedVersion == previous) { + ASSERT(now() < afterRecoveryConsumeDeadline); + co_await delay(0.1); + continue; + } + ASSERT(afterRecovery.lastConsumedVersion > 
previous); + for (const auto& versioned : afterRecovery.mutations) { + for (const auto& mutation : versioned.mutations) { + if (mutation.param1 == "live/after-recovery"_sr) { + foundAfterRecoveryWrite = true; + } + } + } + } + ASSERT(foundAfterRecoveryWrite); + + co_await liveConsumer->acknowledge(); + ASSERT(co_await getPersistedMinVersion(cx, liveStreamId) == liveConsumer->position().lastConsumedVersion + 1); + + if (g_network->isSimulated()) { + (const_cast(CLIENT_KNOBS))->ENABLE_NATIVE_CDC = false; + + bool disabledRegistrationRejected = false; + try { + co_await registerNativeCdcStreamClient(cx, "native-cdc-disabled-registration"_sr, liveRange); + } catch (Error& e) { + disabledRegistrationRejected = e.code() == error_code_client_invalid_operation; + } + ASSERT(disabledRegistrationRejected); + + listed = co_await listNativeCdcStreamsClient(cx); + ASSERT(listed.size() == 1); + ASSERT(listed[0].streamId == liveStreamId); + ASSERT((co_await createNativeCdcConsumer(cx, liveName))->position().streamId == liveStreamId); + liveConsumer = resumeNativeCdcConsumer(cx, liveConsumer->position()); + + const int32_t disabledResolverCount = (co_await getDatabaseConfiguration(cx)).getDesiredResolvers() + 1; + const uint64_t recoveryBeforeDisabledDrain = dbInfo->get().recoveryCount; + co_await changeResolverCount(cx, disabledResolverCount); + co_await timeoutError(waitForRecoveryAfter(recoveryBeforeDisabledDrain, RecoveryState::ACCEPTING_COMMITS), + 60.0); + recoveredOwner = co_await timeoutError(getCDCProxy(liveStreamId), 30.0); + + const Version afterDisableVersion = + co_await writeValues(cx, { { "live/after-disable"_sr, "captured-after-disable"_sr } }); + bool foundAfterDisableWrite = false; + const double afterDisableConsumeDeadline = now() + 30.0; + while (liveConsumer->position().lastConsumedVersion < afterDisableVersion) { + const Version previous = liveConsumer->position().lastConsumedVersion; + CDCConsumeReply afterDisable = co_await 
timeoutError(liveConsumer->consume(), 30.0); + if (afterDisable.lastConsumedVersion == previous) { + ASSERT(now() < afterDisableConsumeDeadline); + co_await delay(0.1); + continue; + } + ASSERT(afterDisable.lastConsumedVersion > previous); + for (const auto& versioned : afterDisable.mutations) { + for (const auto& mutation : versioned.mutations) { + if (mutation.param1 == "live/after-disable"_sr) { + foundAfterDisableWrite = true; + } + } + } + } + ASSERT(foundAfterDisableWrite); + co_await liveConsumer->acknowledge(); + ASSERT(co_await getPersistedMinVersion(cx, liveStreamId) == + liveConsumer->position().lastConsumedVersion + 1); + } + + Future pendingConsume = recoveredOwner.consume.getReply( + CDCConsumeRequest(CDCCursor(liveStreamId, std::numeric_limits::max() - 2))); + co_await delay(0.1); + co_await removeNativeCdcStreamClient(cx, liveName); + co_await waitForCDCProxyAssignmentRemoval(liveStreamId); + + bool pendingConsumeRejected = false; + try { + co_await timeoutError(pendingConsume, 30.0); + } catch (Error& e) { + pendingConsumeRejected = + e.code() == error_code_wrong_shard_server || e.code() == error_code_client_invalid_operation; + } + ASSERT(pendingConsumeRejected); + + bool retiredConsumeRejected = false; + try { + co_await timeoutError(liveConsumer->consume(), 30.0); + } catch (Error& e) { + retiredConsumeRejected = e.code() == error_code_client_invalid_operation; + } + ASSERT(retiredConsumeRejected); + + bool retiredClientAcknowledgeRejected = false; + try { + co_await timeoutError(liveConsumer->acknowledge(), 30.0); + } catch (Error& e) { + retiredClientAcknowledgeRejected = e.code() == error_code_client_invalid_operation; + } + ASSERT(retiredClientAcknowledgeRejected); + co_await timeoutError(waitForNoRetiredTagPopState(cx, liveTag), 30.0); + + if (g_network->isSimulated()) { + const int32_t disabledResolverCount = (co_await getDatabaseConfiguration(cx)).getDesiredResolvers() + 1; + const uint64_t recoveryBeforeDisable = 
dbInfo->get().recoveryCount; + co_await changeResolverCount(cx, disabledResolverCount); + co_await timeoutError(waitForRecoveryAfter(recoveryBeforeDisable, RecoveryState::ACCEPTING_COMMITS), 60.0); + co_await timeoutError(waitForNoCDCProxies(), 30.0); + } + } +}; + +WorkloadFactory NativeCdcWorkloadFactory; diff --git a/fdbserver/workloads/NativeCdcEndToEnd.cpp b/fdbserver/workloads/NativeCdcEndToEnd.cpp new file mode 100644 index 00000000000..51869d61517 --- /dev/null +++ b/fdbserver/workloads/NativeCdcEndToEnd.cpp @@ -0,0 +1,230 @@ +/* + * NativeCdcEndToEnd.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2026 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include "fdbclient/NativeCdc.h" +#include "fdbserver/tester/workloads.h" +#include "flow/DeterministicRandom.h" + +struct NativeCdcEndToEndWorkload : TestWorkload { + static constexpr auto NAME = "NativeCdcEndToEnd"; + + struct ExpectedWrite { + Version deadline; + bool observed = false; + }; + + struct StreamState { + Key name; + KeyRange keys; + Reference consumer; + std::map, ExpectedWrite> expected; + }; + + int initialStreamCount; + int minStreamCount; + int maxStreamCount; + int keyCount; + int writesPerRound; + int rounds; + double drainProbability; + double delayBetweenRounds; + double operationTimeout; + int nextStreamNumber = 0; + std::vector streams; + + explicit NativeCdcEndToEndWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { + initialStreamCount = getOption(options, "initialStreamCount"_sr, 12); + minStreamCount = getOption(options, "minStreamCount"_sr, 6); + maxStreamCount = getOption(options, "maxStreamCount"_sr, 20); + keyCount = getOption(options, "keyCount"_sr, 16); + writesPerRound = getOption(options, "writesPerRound"_sr, 5); + rounds = getOption(options, "rounds"_sr, 30); + drainProbability = getOption(options, "drainProbability"_sr, 0.25); + delayBetweenRounds = getOption(options, "delayBetweenRounds"_sr, 0.5); + operationTimeout = getOption(options, "operationTimeout"_sr, 120.0); + ASSERT(minStreamCount >= 1); + ASSERT(initialStreamCount >= minStreamCount); + ASSERT(maxStreamCount >= initialStreamCount); + ASSERT(keyCount >= 2); + ASSERT(writesPerRound >= 1 && writesPerRound <= keyCount); + } + + // RandomRangeLock can outlive this bounded CDC workload and mask its progress check. 
+	// Inserts "RandomRangeLock" into `out`, the set of failure-injection workloads to disable.
+	void disableFailureInjectionWorkloads(std::set<std::string>& out) const override { out.insert("RandomRangeLock"); }
+
+	// Creates the initial streams. Only client 0 does work; every other client returns immediately.
+	Future<Void> setup(Database const& cx) override {
+		if (clientId != 0) {
+			return Void();
+		}
+		return initializeStreams(cx);
+	}
+
+	// Runs the write/drain/add/remove rounds. Only client 0 does work.
+	Future<Void> start(Database const& cx) override {
+		if (clientId != 0) {
+			return Void();
+		}
+		return run(cx);
+	}
+
+	// Always passes: all verification is done through ASSERTs while the workload runs.
+	Future<bool> check(Database const& cx) override { return true; }
+
+	// This workload reports no metrics.
+	void getMetrics(std::vector<PerfMetric>& m) override {}
+
+	// Returns the data key for `index`, zero-padded to four digits so keys sort by index
+	// (for indices below 10000).
+	Key keyForIndex(int index) const { return Key(StringRef(format("native-cdc-e2e/data/%04d", index))); }
+
+	// Picks a random range [keyForIndex(begin), keyForIndex(end)) with begin <= keyCount / 2 < end,
+	// so every generated range contains the middle key and all ranges overlap there.
+	KeyRange randomOverlappingRange() const {
+		const int middle = keyCount / 2;
+		const int begin = deterministicRandom()->randomInt(0, middle + 1);
+		const int end = deterministicRandom()->randomInt(middle + 1, keyCount + 1);
+		return KeyRange(KeyRangeRef(keyForIndex(begin), keyForIndex(end)));
+	}
+
+	// Sets every key/value pair in `values` in one transaction, retrying through tr.onError()
+	// until a commit succeeds, and returns the committed version of that successful commit.
+	Future<Version> writeValues(Database cx, std::vector<std::pair<Key, Value>> values) {
+		Transaction tr(cx);
+		while (true) {
+			Error err;
+			try {
+				for (const auto& [key, value] : values) {
+					tr.set(key, value);
+				}
+				co_await tr.commit();
+				co_return tr.getCommittedVersion();
+			} catch (Error& e) {
+				// The error is saved and handled after the catch block, where onError() is awaited.
+				err = e;
+			}
+			co_await tr.onError(err);
+		}
+	}
+
+	// Registers a new uniquely named stream over a random overlapping range, creates its
+	// consumer, and appends it to `streams`. Both calls are bounded by operationTimeout.
+	Future<Void> addStream(Database cx) {
+		StreamState stream;
+		stream.name = Key(StringRef(format("native-cdc-e2e/stream/%04d", nextStreamNumber++)));
+		stream.keys = randomOverlappingRange();
+		co_await timeoutError(registerNativeCdcStreamClient(cx, stream.name, stream.keys), operationTimeout);
+		stream.consumer = co_await timeoutError(createNativeCdcConsumer(cx, stream.name), operationTimeout);
+		streams.push_back(std::move(stream));
+	}
+
+	// Adds `initialStreamCount` streams, one after another.
+	Future<Void> initializeStreams(Database cx) {
+		for (int i = 0; i < initialStreamCount; ++i) {
+			co_await addStream(cx);
+		}
+	}
+
+	// For every current stream, records each written (key, value) that falls inside the stream's
+	// range as an expected write tagged with `committedVersion`. Each (key, value) pair must be
+	// new for that stream.
+	void recordExpectedWrites(std::vector<std::pair<Key, Value>> const& values, Version committedVersion) {
+		for (auto& stream : streams) {
+			for (const auto& [key, value] : values) {
+				if (stream.keys.contains(key)) {
+					const auto inserted =
+					    stream.expected.emplace(std::make_pair(key, value), ExpectedWrite{ committedVersion });
+					ASSERT(inserted.second);
+				}
+			}
+		}
+	}
+
+	// Consumes and acknowledges batches until the consumer's lastConsumedVersion reaches
+	// `throughVersion`. Every mutation must be a SetValue inside the stream's range that matches a
+	// recorded expected write. Afterwards, every expected write whose deadline is at or below
+	// `throughVersion` must have been observed.
+	Future<Void> drainThrough(StreamState* stream, Version throughVersion) {
+		// One deadline for the whole drain; it is only checked when a consume makes no progress.
+		const double deadline = now() + operationTimeout;
+		while (stream->consumer->position().lastConsumedVersion < throughVersion) {
+			const Version previous = stream->consumer->position().lastConsumedVersion;
+			CDCConsumeReply reply = co_await timeoutError(stream->consumer->consume(), operationTimeout);
+			if (reply.lastConsumedVersion == previous) {
+				// No progress: back off briefly and retry until the deadline passes.
+				ASSERT(now() < deadline);
+				co_await delay(0.1);
+				continue;
+			}
+			ASSERT(reply.lastConsumedVersion > previous);
+			for (const auto& versioned : reply.mutations) {
+				// Each batch must lie in (previous, reply.lastConsumedVersion].
+				ASSERT(versioned.version > previous);
+				ASSERT(versioned.version <= reply.lastConsumedVersion);
+				for (const auto& mutation : versioned.mutations) {
+					ASSERT(mutation.type == MutationRef::SetValue);
+					ASSERT(stream->keys.contains(mutation.param1));
+					const auto found =
+					    stream->expected.find(std::make_pair(Key(mutation.param1), Value(mutation.param2)));
+					ASSERT(found != stream->expected.end());
+					found->second.observed = true;
+				}
+			}
+			co_await timeoutError(stream->consumer->acknowledge(), operationTimeout);
+		}
+		for (const auto& expected : stream->expected) {
+			if (expected.second.deadline <= throughVersion) {
+				ASSERT(expected.second.observed);
+			}
+		}
+	}
+
+	// Drains streams[index] through `throughVersion`, removes its registration, and erases it
+	// from `streams`. Index 0 is never removed.
+	Future<Void> removeStream(Database cx, int index, Version throughVersion) {
+		ASSERT(index > 0);
+		co_await drainThrough(&streams[index], throughVersion);
+		co_await timeoutError(removeNativeCdcStreamClient(cx, streams[index].name), operationTimeout);
+		streams.erase(streams.begin() + index);
+	}
+
+	// Main loop. Each round may remove one stream (never streams[0]) and may add one, then writes
+	// `writesPerRound` distinct keys (always including the middle key) and drains a random subset
+	// of streams[1..]. After the last round, all streams are drained and then removed.
+	Future<Void> run(Database cx) {
+		Version mostRecentWrite = invalidVersion;
+		for (int round = 0; round < rounds; ++round) {
+			if (round > 0 && static_cast<int>(streams.size()) > minStreamCount &&
+			    (round % 3 == 0 || deterministicRandom()->random01() < 0.35)) {
+				const int removalIndex =
+				    deterministicRandom()->randomInt(1, static_cast<int>(streams.size()));
+				co_await removeStream(cx, removalIndex, mostRecentWrite);
+			}
+			if (static_cast<int>(streams.size()) < maxStreamCount &&
+			    (round % 2 == 0 || deterministicRandom()->random01() < 0.35)) {
+				co_await addStream(cx);
+			}
+
+			// At most keyCount distinct indices exist, so a larger writesPerRound would make the
+			// selection loop below spin forever without yielding. Fail loudly instead.
+			ASSERT(writesPerRound <= keyCount);
+			std::set<int> chosenKeys{ keyCount / 2 };
+			while (static_cast<int>(chosenKeys.size()) < writesPerRound) {
+				chosenKeys.insert(deterministicRandom()->randomInt(0, keyCount));
+			}
+			std::vector<std::pair<Key, Value>> values;
+			for (int index : chosenKeys) {
+				// The value encodes round and key index, so each (key, value) pair is distinct.
+				values.emplace_back(keyForIndex(index), Value(StringRef(format("round/%04d/key/%04d", round, index))));
+			}
+			mostRecentWrite = co_await writeValues(cx, values);
+			recordExpectedWrites(values, mostRecentWrite);
+
+			// streams[0] intentionally stays behind while other streams are removed.
+			for (int i = 1; i < static_cast<int>(streams.size()); ++i) {
+				if (deterministicRandom()->random01() < drainProbability) {
+					co_await drainThrough(&streams[i], mostRecentWrite);
+				}
+			}
+			co_await delay(delayBetweenRounds);
+		}
+
+		for (auto& stream : streams) {
+			co_await drainThrough(&stream, mostRecentWrite);
+		}
+		while (!streams.empty()) {
+			co_await timeoutError(removeNativeCdcStreamClient(cx, streams.back().name), operationTimeout);
+			streams.pop_back();
+		}
+	}
+};
+
+// NOTE(review): template argument inferred from the factory variable name; confirm it matches the struct name.
+WorkloadFactory<NativeCdcEndToEndWorkload> NativeCdcEndToEndWorkloadFactory;
diff --git a/flow/ProtocolVersion.h.cmake b/flow/ProtocolVersion.h.cmake
index fd8a24c6384..dca92cc298a 100644
--- a/flow/ProtocolVersion.h.cmake
+++ b/flow/ProtocolVersion.h.cmake
@@ -179,6 +179,7 @@ public: // introduced features
 	PROTOCOL_VERSION_FEATURE(@FDB_PV_GC_TXN_GENERATIONS@, GcTxnGenerations);
 	PROTOCOL_VERSION_FEATURE(@FDB_PV_MUTATION_CHECKSUM@, MutationChecksum);
 	PROTOCOL_VERSION_FEATURE(@FDB_PV_RANGE_BACKUP_WORKER@, RangeBackupWorker);
+	PROTOCOL_VERSION_FEATURE(@FDB_PV_NATIVE_CDC@, NativeCdc);
 };
 
 template <>
diff --git a/flow/ProtocolVersions.cmake b/flow/ProtocolVersions.cmake
index 4066e7435a4..10a6eb6a000 100644
--- a/flow/ProtocolVersions.cmake
+++ b/flow/ProtocolVersions.cmake @@ -8,10 +8,10 @@ # used and should not be changed from 0. # xyzdev # vvvv -set(FDB_PV_DEFAULT_VERSION "0x0FDB00B080000000LL") -set(FDB_PV_FUTURE_VERSION "0x0FDB00B081000000LL") +set(FDB_PV_DEFAULT_VERSION "0x0FDB00B081000000LL") +set(FDB_PV_FUTURE_VERSION "0x0FDB00B082000000LL") set(FDB_PV_MIN_COMPATIBLE_VERSION "0x0FDB00B074000000LL") -set(FDB_PV_MIN_INVALID_VERSION "0x0FDB00B082000000LL") +set(FDB_PV_MIN_INVALID_VERSION "0x0FDB00B083000000LL") set(FDB_PV_LEFT_MOST_CHECK "0x0FDB00B100000000LL") set(FDB_PV_LSB_MASK "0xF0FFFFLL") @@ -96,3 +96,4 @@ set(FDB_PV_GC_TXN_GENERATIONS "0x0FDB00B073000000LL") set(FDB_PV_MUTATION_CHECKSUM "0x0FDB00B074000000LL") set(FDB_PV_GRPC_ENDPOINT "0x0FDB00B080000000LL") set(FDB_PV_RANGE_BACKUP_WORKER "0x0FDB00B080000000LL") +set(FDB_PV_NATIVE_CDC "0x0FDB00B081000000LL") diff --git a/flow/include/flow/error_definitions.h b/flow/include/flow/error_definitions.h index fb1a6a52cb7..846fa13567e 100755 --- a/flow/include/flow/error_definitions.h +++ b/flow/include/flow/error_definitions.h @@ -169,6 +169,7 @@ ERROR( bulkload_invalid_configuration, 1250, "BulkLoad requires cluster configur ERROR( transaction_grv_queue_rejected, 1251, "GRV request rejected because estimated queue wait exceeds transaction limit" ) ERROR( finish_move_keys_too_many_retries, 1252, "finishMoveKeys exceeded retry limit" ) ERROR( start_move_keys_too_many_retries, 1253, "startMoveKeys exceeded retry limit" ) +ERROR( cdc_proxy_failed, 1254, "Cluster recovery terminating because a CDCProxy failed" ) // 15xx Platform errors ERROR( platform_error, 1500, "Platform error" ) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 7915120ceea..f13f20e51e6 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -197,6 +197,9 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES fast/RandomSelector.toml) add_fdb_test(TEST_FILES fast/RandomUnitTests.toml) add_fdb_test(TEST_FILES fast/RangeLocking.toml) + add_fdb_test(TEST_FILES 
fast/NativeCdc.toml) + add_fdb_test(TEST_FILES fast/NativeCdcEndToEnd.toml) + add_fdb_test(TEST_FILES fast/NativeCdcSharedTag.toml) add_fdb_test(TEST_FILES fast/RangeLockCycle.toml) add_fdb_test(TEST_FILES fast/ReadHotDetectionCorrectness.toml IGNORE) # TODO re-enable once read hot detection is enabled. add_fdb_test(TEST_FILES fast/ReportConflictingKeys.toml) @@ -227,6 +230,7 @@ if(WITH_PYTHON) if (MULTIREGION_TEST) # ValidateStorage depends on WITH_ROCKSDB + add_fdb_test(TEST_FILES fast/NativeCdcSatellite.toml) add_fdb_test(TEST_FILES slow/DiskFailureCycle.toml) add_fdb_test(TEST_FILES rare/FailoverWithSSLag.toml) add_fdb_test(TEST_FILES rare/DcLag.toml) diff --git a/tests/fast/NativeCdc.toml b/tests/fast/NativeCdc.toml new file mode 100644 index 00000000000..0caf4f92065 --- /dev/null +++ b/tests/fast/NativeCdc.toml @@ -0,0 +1,17 @@ +[configuration] +config = 'single' +singleRegion = true +buggify = false +faultInjection = false + +[[knobs]] +enable_native_cdc = true + +[[test]] +testTitle = 'NativeCdc' +useDB = true +waitForQuiescenceEnd = false +connectionFailuresDisableDuration = 1000000 + + [[test.workload]] + testName = 'NativeCdc' diff --git a/tests/fast/NativeCdcEndToEnd.toml b/tests/fast/NativeCdcEndToEnd.toml new file mode 100644 index 00000000000..3879a54c2b4 --- /dev/null +++ b/tests/fast/NativeCdcEndToEnd.toml @@ -0,0 +1,32 @@ +[[knobs]] +enable_native_cdc = true + +[[test]] +testTitle = 'NativeCdcEndToEnd' +useDB = true +waitForQuiescenceEnd = false +timeout = 600 +connectionFailuresDisableDuration = 1000000 +runFailureWorkloads = false + + [[test.workload]] + testName = 'NativeCdcEndToEnd' + initialStreamCount = 12 + minStreamCount = 6 + maxStreamCount = 20 + keyCount = 16 + writesPerRound = 5 + rounds = 30 + drainProbability = 0.25 + delayBetweenRounds = 0.5 + operationTimeout = 500.0 + + [[test.workload]] + testName = 'Attrition' + machinesToKill = 1 + machinesToLeave = 3 + reboot = true + testDuration = 20.0 + waitForVersion = true + 
allowFaultInjection = false + killDc = false diff --git a/tests/fast/NativeCdcSatellite.toml b/tests/fast/NativeCdcSatellite.toml new file mode 100644 index 00000000000..d8fe0f64c01 --- /dev/null +++ b/tests/fast/NativeCdcSatellite.toml @@ -0,0 +1,23 @@ +[configuration] +config = '''double remote_double usable_regions=2 regions=[{"datacenters":[{"id":"0","priority":2},{"id":"2","priority":1,"satellite":1}],"satellite_redundancy_mode":"one_satellite_single"},{"datacenters":[{"id":"1","priority":1},{"id":"3","priority":1,"satellite":1}],"satellite_redundancy_mode":"one_satellite_single"}]''' +minimumRegions = 2 +singleRegion = true +generateFearless = false +datacenters = 4 +machineCount = 16 +processesPerMachine = 1 +buggify = false +faultInjection = false + +[[knobs]] +enable_native_cdc = true + +[[test]] +testTitle = 'NativeCdcSatellite' +useDB = true +waitForQuiescenceEnd = false +connectionFailuresDisableDuration = 1000000 + + [[test.workload]] + testName = 'NativeCdc' + verifySatelliteIndexing = true diff --git a/tests/fast/NativeCdcSharedTag.toml b/tests/fast/NativeCdcSharedTag.toml new file mode 100644 index 00000000000..8f72da81377 --- /dev/null +++ b/tests/fast/NativeCdcSharedTag.toml @@ -0,0 +1,18 @@ +[configuration] +config = 'single' +singleRegion = true +buggify = false +faultInjection = false + +[[knobs]] +enable_native_cdc = true + +[[test]] +testTitle = 'NativeCdcSharedTag' +useDB = true +waitForQuiescenceEnd = false +connectionFailuresDisableDuration = 1000000 + + [[test.workload]] + testName = 'NativeCdc' + sharedTagSafety = true